mariadb: update scraper and filters to work with the official website

6 years ago · 93582d3b82
parent facb0a44d2
commit 93582d3b82
6 changed files with 84 additions and 36 deletions
--- a/assets/javascripts/templates/pages/about_tmpl.coffee
+++ b/assets/javascripts/templates/pages/about_tmpl.coffee
@ -439,7 +439,7 @@ credits = [
    'http://www.gnu.org/copyleft/fdl.html'
  ], [
    'MariaDB',
-    '2018 MariaDB',
+    '2019 MariaDB',
    'CC BY-SA & GFDL',
    'https://mariadb.com/kb/en/library/documentation/+license/'
  ], [
--- a/assets/stylesheets/pages/_mariadb.scss
+++ b/assets/stylesheets/pages/_mariadb.scss
@ -1,7 +1,7 @@
 ._mariadb {
  @extend %simple;

-  .graybox {
+  .graybox, .product {
    @extend %note;
  }
 }
--- a/lib/docs/filters/mariadb/clean_html.rb
+++ b/lib/docs/filters/mariadb/clean_html.rb
@ -1,11 +1,10 @@
-require 'net/http'
-
 module Docs
  class Mariadb
    class CleanHtmlFilter < Filter
-      @@known_urls = Hash.new
-
      def call
+        # Return the empty doc if the EraseInvalidPagesFilter detected this page shouldn't be scraped
+        return doc if doc.inner_html == ''
+
        # Extract main content
        @doc = at_css('#content')

@ -21,19 +20,6 @@ module Docs
          node['data-language'] = 'sql'
        end

-        # Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page
-        css('a').each do |node|
-          url = node['href']
-
-          if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url)
-            final_url = get_final_url(url)
-
-            if !final_url.nil? && final_url.start_with?('/kb/en/library/documentation/')
-              node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index"
-            end
-          end
-        end
-
        # Fix images
        css('img').each do |node|
          node['src'] = node['src'].sub('http:', 'https:')
@ -46,11 +32,11 @@ module Docs
          end
        end

-        # Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
+        # Convert listings (pages like https://mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
        css('ul.listing').each do |node|
          rows = []

-          node.css('li').each do |li|
+          node.css('li:not(.no_data)').each do |li|
            name = li.at_css('.media-heading').content
            description = li.at_css('.blurb').content
            url = li.at_css('a')['href']
@ -61,15 +47,20 @@ module Docs
          node.replace(table)
        end

-        doc
+        # Turn note titles into <strong> tags
+        css('.product_title').each do |node|
+          node.name = 'strong'
        end

-      def get_final_url(url)
-        unless @@known_urls.has_key?(url)
-          @@known_urls[url] = Net::HTTP.get_response(URI(url))['location']
+        # Remove comments and questions
+        css('.related_questions, #comments').remove
+        css('h2').each do |node|
+          if node.content == 'Comments'
+            node.remove
+          end
        end

-        @@known_urls[url]
+        doc
      end
    end
  end
--- a/lib/docs/filters/mariadb/entries.rb
+++ b/lib/docs/filters/mariadb/entries.rb
@ -2,12 +2,22 @@ module Docs
  class Mariadb
    class EntriesFilter < Docs::EntriesFilter
      def get_name
-        at_css('.container > h1').content.strip
+        return 'Name' if doc.inner_html == ''
+
+        at_css('#content > h1').content.strip
      end

      def get_type
-        link = at_css('#breadcrumbs > a:nth-child(6)')
-        link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content
+        return 'Type' if doc.inner_html == ''
+
+        link = at_css('#breadcrumbs > a:nth-child(4)')
+        link.nil? ? at_css('#breadcrumbs > a:nth-child(3)').content : link.content
+      end
+
+      def entries
+        # Don't add an entry for this page if the EraseInvalidPagesFilter detected this page shouldn't be scraped
+        return [] if doc.inner_html == ''
+        super
      end
    end
  end
--- a/lib/docs/filters/mariadb/erase_invalid_pages.rb
+++ b/lib/docs/filters/mariadb/erase_invalid_pages.rb
@ -0,0 +1,34 @@
+module Docs
+  class Mariadb
+    class EraseInvalidPagesFilter < Filter
+      @@seen_urls = Hash.new
+
+      def call
+        # The MariaDB documentation uses URLs like mariadb.com/kb/en/*.
+        # This means there is no way to detect whether a page should be scraped based on its URL.
+        # We run this filter before the internal_urls filter scrapes all internal URLs.
+        # If this page should not be scraped, we erase its contents here so that its internal URLs are not picked up.
+        # The entries filter will make sure that no entry is saved for this page.
+
+        if at_css('a.crumb[href="https://mariadb.com/kb/en/documentation/"]').nil?
+          doc.inner_html = ''
+        end
+
+        current_page = at_css('a.crumb.node_link')
+        unless current_page.nil?
+          url = current_page['href']
+
+          # Some links lead to the same page.
+          # Only parse each page once.
+          if @@seen_urls.has_key?(url)
+            doc.inner_html = ''
+          end
+
+          @@seen_urls[url] = true
+        end
+
+        doc
+      end
+    end
+  end
+end
--- a/lib/docs/scrapers/mariadb.rb
+++ b/lib/docs/scrapers/mariadb.rb
@ -2,21 +2,34 @@ module Docs
  class Mariadb < UrlScraper
    self.name = 'MariaDB'
    self.type = 'mariadb'
-    self.release = '10.3.8'
-    self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/'
+    self.release = '10.4.7'
+    self.base_url = 'https://mariadb.com/kb/en/'
+    self.root_path = 'library/documentation/'
    self.links = {
      home: 'https://mariadb.com/',
      code: 'https://github.com/MariaDB/server'
    }

-    html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title'
+    html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages'
+    html_filters.push 'mariadb/entries', 'mariadb/clean_html'

-    options[:download_images] = false
-    options[:root_title] = 'MariaDB'
+    options[:skip_patterns] = [
+      /\+/,
+      /\/ask\//,
+      /-release-notes\//,
+      /-changelog\//,
+      /^documentation\//,
+      /^mariadb-server-documentation\//,
+    ]

    options[:attribution] = <<-HTML
-      &copy; 2018 MariaDB<br>
+      &copy; 2019 MariaDB<br>
      Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License.
    HTML
+
+    def get_latest_version(opts)
+      doc = fetch_doc('https://mariadb.com/downloads/', opts)
+      doc.at_css('[data-version-id="mariadb_server-versions"] option').content.split('-')[0]
+    end
  end
 end