From e894a2f3eeaf0c840d368ba4a4e7f036d427fbd2 Mon Sep 17 00:00:00 2001 From: Jasper van Merle Date: Mon, 6 Aug 2018 14:03:55 +0200 Subject: [PATCH 1/7] Add MariaDB documentation Switch to UrlScraper Refactoring --- .gitignore | 1 + .../templates/pages/about_tmpl.coffee | 5 ++ assets/stylesheets/application-dark.css.scss | 1 + assets/stylesheets/application.css.scss | 1 + assets/stylesheets/pages/_mariadb.scss | 7 ++ lib/docs/filters/mariadb/clean_html.rb | 71 +++++++++++++++++++ lib/docs/filters/mariadb/entries.rb | 14 ++++ lib/docs/scrapers/mariadb.rb | 22 ++++++ 8 files changed, 122 insertions(+) create mode 100644 assets/stylesheets/pages/_mariadb.scss create mode 100644 lib/docs/filters/mariadb/clean_html.rb create mode 100644 lib/docs/filters/mariadb/entries.rb create mode 100644 lib/docs/scrapers/mariadb.rb diff --git a/.gitignore b/.gitignore index 8b222826..a2e89741 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ public/fonts public/docs/**/* !public/docs/docs.json !public/docs/**/index.json +log diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee index fef9a024..e9d6a748 100644 --- a/assets/javascripts/templates/pages/about_tmpl.coffee +++ b/assets/javascripts/templates/pages/about_tmpl.coffee @@ -410,6 +410,11 @@ credits = [ '2006-2016 LÖVE Development Team', 'GFDL', 'http://www.gnu.org/copyleft/fdl.html' + ], [ + 'MariaDB', + '2018 MariaDB', + 'CC BY-SA & GFDL', + 'https://mariadb.com/kb/en/library/documentation/+license/' ], [ 'Marionette.js', '2017 Muted Solutions, LLC', diff --git a/assets/stylesheets/application-dark.css.scss b/assets/stylesheets/application-dark.css.scss index 4500f90f..9f6b3a7d 100644 --- a/assets/stylesheets/application-dark.css.scss +++ b/assets/stylesheets/application-dark.css.scss @@ -64,6 +64,7 @@ 'pages/liquid', 'pages/love', 'pages/lua', + 'pages/mariadb', 'pages/mdn', 'pages/meteor', 'pages/modernizr', diff --git a/assets/stylesheets/application.css.scss b/assets/stylesheets/application.css.scss index f7321135..64b85f1f 100644 --- a/assets/stylesheets/application.css.scss +++ b/assets/stylesheets/application.css.scss @@ -64,6 +64,7 @@ 'pages/liquid', 'pages/love', 'pages/lua', + 'pages/mariadb', 'pages/mdn', 'pages/meteor', 'pages/modernizr', diff --git a/assets/stylesheets/pages/_mariadb.scss b/assets/stylesheets/pages/_mariadb.scss new file mode 100644 index 00000000..ef6144ce --- /dev/null +++ b/assets/stylesheets/pages/_mariadb.scss @@ -0,0 +1,7 @@ +._mariadb { + @extend %simple; + + .graybox { + @extend %note; + } +} diff --git a/lib/docs/filters/mariadb/clean_html.rb b/lib/docs/filters/mariadb/clean_html.rb new file mode 100644 index 00000000..ffbbb160 --- /dev/null +++ b/lib/docs/filters/mariadb/clean_html.rb @@ -0,0 +1,71 @@ +require 'net/http' + +module Docs + class Mariadb + class CleanHtmlFilter < Filter + @@known_urls = Hash.new + + def call + # Extract main content + @doc = at_css('#content') + + # Remove navigation at the bottom + css('.simple_section_nav').remove + + # Remove table of contents + css('.table_of_contents').remove + + # Add code highlighting and remove nested tags + css('pre').each do |node| + node.content = node.content + node['data-language'] = 'sql' + end + + # Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page + css('a').each do |node| + url = node['href'] + + if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url) + final_url = get_final_url(url) + + if !final_url.nil? && final_url.start_with?('/kb/en/library/documentation/') + node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index" + end + end + end + + # Remove navigation items containing only numbers + css('.node_comments').each do |node| + if node.content.scan(/\D/).empty? + node.remove + end + end + + # Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables + css('ul.listing').each do |node| + rows = [] + + node.css('li').each do |li| + name = li.at_css('.media-heading').content + description = li.at_css('.blurb').content + url = li.at_css('a')['href'] + rows << "#{name}#{description}" + end + + table = "#{rows.join('')}
TitleDescription
" + node.replace(table) + end + + doc + end + + def get_final_url(url) + unless @@known_urls.has_key?(url) + @@known_urls[url] = Net::HTTP.get_response(URI(url))['location'] + end + + @@known_urls[url] + end + end + end +end diff --git a/lib/docs/filters/mariadb/entries.rb b/lib/docs/filters/mariadb/entries.rb new file mode 100644 index 00000000..32d4f6b2 --- /dev/null +++ b/lib/docs/filters/mariadb/entries.rb @@ -0,0 +1,14 @@ +module Docs + class Mariadb + class EntriesFilter < Docs::EntriesFilter + def get_name + at_css('.container > h1').content.strip + end + + def get_type + link = at_css('#breadcrumbs > a:nth-child(6)') + link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content + end + end + end +end diff --git a/lib/docs/scrapers/mariadb.rb b/lib/docs/scrapers/mariadb.rb new file mode 100644 index 00000000..b4293a9a --- /dev/null +++ b/lib/docs/scrapers/mariadb.rb @@ -0,0 +1,22 @@ +module Docs + class Mariadb < UrlScraper + self.name = 'MariaDB' + self.type = 'mariadb' + self.release = '10.3.8' + self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/' + self.links = { + home: 'https://mariadb.com/', + code: 'https://github.com/MariaDB/server' + } + + html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title' + + options[:download_images] = false + options[:root_title] = 'MariaDB' + + options[:attribution] = <<-HTML + © 2018 MariaDB
+ Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License. + HTML + end +end From 8223081e707d3b26bf494a8c6f36cedd5b50cd88 Mon Sep 17 00:00:00 2001 From: Jasper van Merle Date: Sun, 21 Oct 2018 17:58:50 +0200 Subject: [PATCH 2/7] Fix images --- lib/docs/filters/mariadb/clean_html.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/docs/filters/mariadb/clean_html.rb b/lib/docs/filters/mariadb/clean_html.rb index ffbbb160..88ab4fc5 100644 --- a/lib/docs/filters/mariadb/clean_html.rb +++ b/lib/docs/filters/mariadb/clean_html.rb @@ -34,6 +34,11 @@ module Docs end end + # Fix images + css('img').each do |node| + node['src'] = node['src'].sub('http:', 'https:') + end + # Remove navigation items containing only numbers css('.node_comments').each do |node| if node.content.scan(/\D/).empty? From 93582d3b8268366e7c831573b8989f1805174a01 Mon Sep 17 00:00:00 2001 From: Jasper van Merle Date: Tue, 13 Aug 2019 22:55:05 +0200 Subject: [PATCH 4/7] mariadb: update scraper and filters to work with the official website --- .../templates/pages/about_tmpl.coffee | 2 +- assets/stylesheets/pages/_mariadb.scss | 2 +- lib/docs/filters/mariadb/clean_html.rb | 41 ++++++++----------- lib/docs/filters/mariadb/entries.rb | 16 ++++++-- .../filters/mariadb/erase_invalid_pages.rb | 34 +++++++++++++++ lib/docs/scrapers/mariadb.rb | 25 ++++++++--- 6 files changed, 84 insertions(+), 36 deletions(-) create mode 100644 lib/docs/filters/mariadb/erase_invalid_pages.rb diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee index 98775c57..f00ff2df 100644 --- a/assets/javascripts/templates/pages/about_tmpl.coffee +++ b/assets/javascripts/templates/pages/about_tmpl.coffee @@ -439,7 +439,7 @@ credits = [ 'http://www.gnu.org/copyleft/fdl.html' ], [ 'MariaDB', - '2018 MariaDB', + '2019 MariaDB', 'CC BY-SA & GFDL', 'https://mariadb.com/kb/en/library/documentation/+license/' ], [ diff --git a/assets/stylesheets/pages/_mariadb.scss b/assets/stylesheets/pages/_mariadb.scss index ef6144ce..19d8d639 100644 --- a/assets/stylesheets/pages/_mariadb.scss +++ b/assets/stylesheets/pages/_mariadb.scss @@ -1,7 +1,7 @@ ._mariadb { @extend %simple; - .graybox { + .graybox, .product { @extend %note; } } diff --git a/lib/docs/filters/mariadb/clean_html.rb b/lib/docs/filters/mariadb/clean_html.rb index 88ab4fc5..86b6dc00 100644 --- a/lib/docs/filters/mariadb/clean_html.rb +++ b/lib/docs/filters/mariadb/clean_html.rb @@ -1,11 +1,10 @@ -require 'net/http' - module Docs class Mariadb class CleanHtmlFilter < Filter - @@known_urls = Hash.new - def call + # Return the empty doc if the EraseInvalidPagesFilter detected this page shouldn't be scraped + return doc if doc.inner_html == '' + # Extract main content @doc = at_css('#content') @@ -21,19 +20,6 @@ module Docs node['data-language'] = 'sql' end - # Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page - css('a').each do |node| - url = node['href'] - - if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url) - final_url = get_final_url(url) - - if !final_url.nil? && final_url.start_with?('/kb/en/library/documentation/') - node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index" - end - end - end - # Fix images css('img').each do |node| node['src'] = node['src'].sub('http:', 'https:') @@ -46,11 +32,11 @@ module Docs end end - # Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables + # Convert listings (pages like https://mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables css('ul.listing').each do |node| rows = [] - node.css('li').each do |li| + node.css('li:not(.no_data)').each do |li| name = li.at_css('.media-heading').content description = li.at_css('.blurb').content url = li.at_css('a')['href'] @@ -61,15 +47,20 @@ module Docs node.replace(table) end - doc - end + # Turn note titles into tags + css('.product_title').each do |node| + node.name = 'strong' + end - def get_final_url(url) - unless @@known_urls.has_key?(url) - @@known_urls[url] = Net::HTTP.get_response(URI(url))['location'] + # Remove comments and questions + css('.related_questions, #comments').remove + css('h2').each do |node| + if node.content == 'Comments' + node.remove + end end - @@known_urls[url] + doc end end end diff --git a/lib/docs/filters/mariadb/entries.rb b/lib/docs/filters/mariadb/entries.rb index 32d4f6b2..a3cbeb34 100644 --- a/lib/docs/filters/mariadb/entries.rb +++ b/lib/docs/filters/mariadb/entries.rb @@ -2,12 +2,22 @@ module Docs class Mariadb class EntriesFilter < Docs::EntriesFilter def get_name - at_css('.container > h1').content.strip + return 'Name' if doc.inner_html == '' + + at_css('#content > h1').content.strip end def get_type - link = at_css('#breadcrumbs > a:nth-child(6)') - link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content + return 'Type' if doc.inner_html == '' + + link = at_css('#breadcrumbs > a:nth-child(4)') + link.nil? ? at_css('#breadcrumbs > a:nth-child(3)').content : link.content + end + + def entries + # Don't add an entry for this page if the EraseInvalidPagesFilter detected this page shouldn't be scraped + return [] if doc.inner_html == '' + super end end end diff --git a/lib/docs/filters/mariadb/erase_invalid_pages.rb b/lib/docs/filters/mariadb/erase_invalid_pages.rb new file mode 100644 index 00000000..0987375d --- /dev/null +++ b/lib/docs/filters/mariadb/erase_invalid_pages.rb @@ -0,0 +1,34 @@ +module Docs + class Mariadb + class EraseInvalidPagesFilter < Filter + @@seen_urls = Hash.new + + def call + # The MariaDB documentation uses urls like mariadb.com/kb/en/* + # This means there is no way to detect if a page should be scraped based on it's url + # We run this filter before the internal_urls filter scrapes all internal urls + # If this page should not be scraped, we erase it's contents in here so that the internal urls are not picked up + # The entries filter will make sure that no entry is saved for this page + + if at_css('a.crumb[href="https://mariadb.com/kb/en/documentation/"]').nil? + doc.inner_html = '' + end + + current_page = at_css('a.crumb.node_link') + unless current_page.nil? + url = current_page['href'] + + # Some links lead to the same page + # Only parse the page one time + if @@seen_urls.has_key?(url) + doc.inner_html = '' + end + + @@seen_urls[url] = true + end + + doc + end + end + end +end diff --git a/lib/docs/scrapers/mariadb.rb b/lib/docs/scrapers/mariadb.rb index b4293a9a..859b6bec 100644 --- a/lib/docs/scrapers/mariadb.rb +++ b/lib/docs/scrapers/mariadb.rb @@ -2,21 +2,34 @@ module Docs class Mariadb < UrlScraper self.name = 'MariaDB' self.type = 'mariadb' - self.release = '10.3.8' - self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/' + self.release = '10.4.7' + self.base_url = 'https://mariadb.com/kb/en/' + self.root_path = 'library/documentation/' self.links = { home: 'https://mariadb.com/', code: 'https://github.com/MariaDB/server' } - html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title' + html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages' + html_filters.push 'mariadb/entries', 'mariadb/clean_html' - options[:download_images] = false - options[:root_title] = 'MariaDB' + options[:skip_patterns] = [ + /\+/, + /\/ask\//, + /-release-notes\//, + /-changelog\//, + /^documentation\//, + /^mariadb-server-documentation\//, + ] options[:attribution] = <<-HTML - © 2018 MariaDB
+ © 2019 MariaDB
Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License. HTML + + def get_latest_version(opts) + doc = fetch_doc('https://mariadb.com/downloads/', opts) + doc.at_css('[data-version-id="mariadb_server-versions"] option').content.split('-')[0] + end end end From 5fe2446705831aa0a62c7121187d3604e283d033 Mon Sep 17 00:00:00 2001 From: Jasper van Merle Date: Wed, 28 Aug 2019 18:49:08 +0200 Subject: [PATCH 5/7] mariadb: add icon --- public/icons/docs/mariadb/16.png | Bin 0 -> 936 bytes public/icons/docs/mariadb/16@2x.png | Bin 0 -> 1559 bytes public/icons/docs/mariadb/SOURCE | 1 + 3 files changed, 1 insertion(+) create mode 100644 public/icons/docs/mariadb/16.png create mode 100644 public/icons/docs/mariadb/16@2x.png create mode 100644 public/icons/docs/mariadb/SOURCE diff --git a/public/icons/docs/mariadb/16.png b/public/icons/docs/mariadb/16.png new file mode 100644 index 0000000000000000000000000000000000000000..59a9f586f69cc1df587ace804c9391efab239420 GIT binary patch literal 936 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbKJOS+@4BLl<6e(pbstU$g(vPY0F z14ES>14Ba#1H&(%P{RubhEf9thF1v;3|2E37{m+a>Xjx|% zIpu0tq%vFe8#-jGm?ZItJ2NnBH?+@6Pl{q-sm%>6Z%xe#4am~BNlOb!otzTaoeQfyvm!qlv^ZN)KjGh)L0(i+q9Hx$M1FN?|!ienHd_4m$hN=loX zleE4tZeMA%lS4v(TFT;_xOMq4TZ&?KmqfS5#Hg7j&CQJ4S`-7+n2iO|O2&x>j=9}Q39IsA7iPsa$Hs;Grr0{C8(OCrSSGqUCCp2Y zZitRmG)_`9O4PDSuZ~Wbni5+Y7N>8O%)s-XL94hlA`z&-$uWUJtV7K_Mb#`>MK6_; zufW(L)5$qm-y(HIcJ#rrs9nWThsvXtWk(+jDq66LH{{R1f(DQT_FedU!g8V?SiVv_$D6_=xxH;ed!!c9I;3^b>?NMQuI!IFWCV4&?=08y0tzklba4!kxSX7z zz-n;nq_&3A$C*>6goa!ds7Vtm?X~gMZDClw&2Qg2Iq9C$=N9HCL?on~;e1wDID5vd znb9I*k)J*=?X39|cFk;C*-_P{-}smu_AQvOcye)MrE6;F*6!5?%+0fRx2N|`FVBtL z>)*V7J_mDiy+cAkfk#F}g-c3ENsk7zb52Z+)1ny_K}B9!QB`d$%+6`kmWVJnmz4Rf z>(ME+V_`7mR&dnqt9S==uxg2GL`h0wNvc(HQ7VvPFfuT-)HSfsHL?gXG`BJ_wK6u> rHZZa>Fwiwz$b+IGH$NpatrAm%p&>*=?M&_OKn)C@u6{1-oD!MTP)004R>004l5008;`004mK004C`008P>0026e000+ooVrmw00006 zVoOIv0RI600RN!9r;`8x00(qQO+^Re2pkbD3*4tFN&o-_8%ab#R9M69RcUNgRTw?@ zE%RokGt=oZ-Iy*EXdx6ZEdfN9P{9UBixP1OM1t4|F<=5F5u^5K$ei_^m8t#zH$>XLww;W7dOL1GeIM;p2NK@pHRbqiLe zUB~PMU-tM2X}ejY#@_B?5T&I60T_ZngOC6GTw7QfI~W4;XO2RxUvHThDej8Vs$&TO z5{(pQ4bb4@Ixf3}^@kgcUyKNsLd-yuFMOi_+t=*DGiGsFw4hGA`=3(aih~u8-!iQG z-m-63f;Bc+RsQ9`Zb8bHYOJ+?3@_1VX6W2U%)F9B(X(ov5b$law)}D^VSbW9_56CQ z5CAZs4lMg?HyWrVDply(WV%BR0E7T_ZU*&U=3>*C?>rO6Snt&g$P8%L)}t^(#4R;E zDofj57%v20W66DYDS%QtL#sXfUXx{5#i_%E_iF#s7Ld6u_#o*cR5}moe4KX1WL@jp z1RBjnYTb+nK$Rfyy?Xp>0c=tU6#HU4JAZ?Ig1IOqOrg)^8LD55;-%>( zPBD@`Q^gRV(M#YEnI?ly)4=vQ@0S3})KAcH{yi+pT@jgPJe6h9pPp&b&?!bv9c$p# z7%fW~=8n~Up;Ex+Vh~ix<#j6;IDa@&hCli%VDzkQIGbIHPiq?!QpPQRJ&KoJHS+3M zEl2enx&M&RN08s0ew&M^0QqHOQ3+j?HytPXBVb<6+xY2ZCn-ry$(bDCEim(HCeKiY zJU~YA&maW2($1hYz&0E8!UaM!_^crR@e$0;U4tK|S7K*kATPzJ>^AZ=4v!&~)SkQ_ z%($hC;G~@a7|u31E6pxR3a*v)u6yVnFu!yRFIyehT9Z6J*{E(a@igvkW47lWN8fKk zfa^{MeyPWHg#uSQ7#z64qS;44soKqPtRl_SfDnKoWO#Fn!wd?? zLosb!njagfktc!DkswSoA&8IAs8cM0G`h-3$oV$j-PFZaNQ7@zC<%!{NlPh>qpBK5 zs2a^P)TmMj3sMy(H&sPQ0g7*M-Q{f>n@ys~gryIavgO(zNXerZ*n>+%k_I7xnh~wt5D(A@wqY19b?SWn@wI)@n Date: Thu, 10 Oct 2019 03:42:25 +0200 Subject: [PATCH 6/7] mariadb: update versions and skip_patterns --- lib/docs/scrapers/mariadb.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/docs/scrapers/mariadb.rb b/lib/docs/scrapers/mariadb.rb index 859b6bec..8f8de55c 100644 --- a/lib/docs/scrapers/mariadb.rb +++ b/lib/docs/scrapers/mariadb.rb @@ -2,7 +2,7 @@ module Docs class Mariadb < UrlScraper self.name = 'MariaDB' self.type = 'mariadb' - self.release = '10.4.7' + self.release = '10.4.8' self.base_url = 'https://mariadb.com/kb/en/' self.root_path = 'library/documentation/' self.links = { @@ -20,6 +20,10 @@ module Docs /-changelog\//, /^documentation\//, /^mariadb-server-documentation\//, + /signal-statement\//, + /resignal-statement\//, + /isolation-level\//, + /rollback-statement\// ] options[:attribution] = <<-HTML From 5ed9942380b749bd16ad888f46154a3e07e676be Mon Sep 17 00:00:00 2001 From: Jasper van Merle Date: Thu, 10 Oct 2019 04:09:09 +0200 Subject: [PATCH 7/7] Fix rate limit related errors --- lib/docs/scrapers/mariadb.rb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/docs/scrapers/mariadb.rb b/lib/docs/scrapers/mariadb.rb index 8f8de55c..fedc1e5e 100644 --- a/lib/docs/scrapers/mariadb.rb +++ b/lib/docs/scrapers/mariadb.rb @@ -13,6 +13,7 @@ module Docs html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages' html_filters.push 'mariadb/entries', 'mariadb/clean_html' + options[:rate_limit] = 200 options[:skip_patterns] = [ /\+/, /\/ask\//, @@ -20,10 +21,6 @@ module Docs /-changelog\//, /^documentation\//, /^mariadb-server-documentation\//, - /signal-statement\//, - /resignal-statement\//, - /isolation-level\//, - /rollback-statement\// ] options[:attribution] = <<-HTML