diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee index 98775c57..f00ff2df 100644 --- a/assets/javascripts/templates/pages/about_tmpl.coffee +++ b/assets/javascripts/templates/pages/about_tmpl.coffee @@ -439,7 +439,7 @@ credits = [ 'http://www.gnu.org/copyleft/fdl.html' ], [ 'MariaDB', - '2018 MariaDB', + '2019 MariaDB', 'CC BY-SA & GFDL', 'https://mariadb.com/kb/en/library/documentation/+license/' ], [ diff --git a/assets/stylesheets/pages/_mariadb.scss b/assets/stylesheets/pages/_mariadb.scss index ef6144ce..19d8d639 100644 --- a/assets/stylesheets/pages/_mariadb.scss +++ b/assets/stylesheets/pages/_mariadb.scss @@ -1,7 +1,7 @@ ._mariadb { @extend %simple; - .graybox { + .graybox, .product { @extend %note; } } diff --git a/lib/docs/filters/mariadb/clean_html.rb b/lib/docs/filters/mariadb/clean_html.rb index 88ab4fc5..86b6dc00 100644 --- a/lib/docs/filters/mariadb/clean_html.rb +++ b/lib/docs/filters/mariadb/clean_html.rb @@ -1,11 +1,10 @@ -require 'net/http' - module Docs class Mariadb class CleanHtmlFilter < Filter - @@known_urls = Hash.new - def call + # Return the empty doc if the EraseInvalidPagesFilter detected this page shouldn't be scraped + return doc if doc.inner_html == '' + # Extract main content @doc = at_css('#content') @@ -21,19 +20,6 @@ module Docs node['data-language'] = 'sql' end - # Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page - css('a').each do |node| - url = node['href'] - - if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url) - final_url = get_final_url(url) - - if !final_url.nil? 
&& final_url.start_with?('/kb/en/library/documentation/') - node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index" - end - end - end - # Fix images css('img').each do |node| node['src'] = node['src'].sub('http:', 'https:') @@ -46,11 +32,11 @@ module Docs end end - # Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables + # Convert listings (pages like https://mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables css('ul.listing').each do |node| rows = [] - node.css('li').each do |li| + node.css('li:not(.no_data)').each do |li| name = li.at_css('.media-heading').content description = li.at_css('.blurb').content url = li.at_css('a')['href'] @@ -61,15 +47,20 @@ module Docs node.replace(table) end - doc - end + # Turn note titles into tags + css('.product_title').each do |node| + node.name = 'strong' + end - def get_final_url(url) - unless @@known_urls.has_key?(url) - @@known_urls[url] = Net::HTTP.get_response(URI(url))['location'] + # Remove comments and questions + css('.related_questions, #comments').remove + css('h2').each do |node| + if node.content == 'Comments' + node.remove + end end - @@known_urls[url] + doc end end end diff --git a/lib/docs/filters/mariadb/entries.rb b/lib/docs/filters/mariadb/entries.rb index 32d4f6b2..a3cbeb34 100644 --- a/lib/docs/filters/mariadb/entries.rb +++ b/lib/docs/filters/mariadb/entries.rb @@ -2,12 +2,22 @@ module Docs class Mariadb class EntriesFilter < Docs::EntriesFilter def get_name - at_css('.container > h1').content.strip + return 'Name' if doc.inner_html == '' + + at_css('#content > h1').content.strip end def get_type - link = at_css('#breadcrumbs > a:nth-child(6)') - link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content + return 'Type' if doc.inner_html == '' + + link = at_css('#breadcrumbs > a:nth-child(4)') + link.nil? ? 
at_css('#breadcrumbs > a:nth-child(3)').content : link.content + end + + def entries + # Don't add an entry for this page if the EraseInvalidPagesFilter detected this page shouldn't be scraped + return [] if doc.inner_html == '' + super end end end diff --git a/lib/docs/filters/mariadb/erase_invalid_pages.rb b/lib/docs/filters/mariadb/erase_invalid_pages.rb new file mode 100644 index 00000000..0987375d --- /dev/null +++ b/lib/docs/filters/mariadb/erase_invalid_pages.rb @@ -0,0 +1,34 @@ +module Docs + class Mariadb + class EraseInvalidPagesFilter < Filter + @@seen_urls = Hash.new + + def call + # The MariaDB documentation uses urls like mariadb.com/kb/en/* + # This means there is no way to detect if a page should be scraped based on its url + # We run this filter before the internal_urls filter scrapes all internal urls + # If this page should not be scraped, we erase its contents in here so that the internal urls are not picked up + # The entries filter will make sure that no entry is saved for this page + + if at_css('a.crumb[href="https://mariadb.com/kb/en/documentation/"]').nil? + doc.inner_html = '' + end + + current_page = at_css('a.crumb.node_link') + unless current_page.nil? 
+ url = current_page['href'] + + # Some links lead to the same page + # Only parse the page one time + if @@seen_urls.has_key?(url) + doc.inner_html = '' + end + + @@seen_urls[url] = true + end + + doc + end + end + end +end diff --git a/lib/docs/scrapers/mariadb.rb b/lib/docs/scrapers/mariadb.rb index b4293a9a..859b6bec 100644 --- a/lib/docs/scrapers/mariadb.rb +++ b/lib/docs/scrapers/mariadb.rb @@ -2,21 +2,34 @@ module Docs class Mariadb < UrlScraper self.name = 'MariaDB' self.type = 'mariadb' - self.release = '10.3.8' - self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/' + self.release = '10.4.7' + self.base_url = 'https://mariadb.com/kb/en/' + self.root_path = 'library/documentation/' self.links = { home: 'https://mariadb.com/', code: 'https://github.com/MariaDB/server' } - html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title' + html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages' + html_filters.push 'mariadb/entries', 'mariadb/clean_html' - options[:download_images] = false - options[:root_title] = 'MariaDB' + options[:skip_patterns] = [ + /\+/, + /\/ask\//, + /-release-notes\//, + /-changelog\//, + /^documentation\//, + /^mariadb-server-documentation\//, + ] options[:attribution] = <<-HTML - © 2018 MariaDB
+ © 2019 MariaDB
Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License. HTML + + def get_latest_version(opts) + doc = fetch_doc('https://mariadb.com/downloads/', opts) + doc.at_css('[data-version-id="mariadb_server-versions"] option').content.split('-')[0] + end end end