mariadb: update scraper and filters to work with the official website

pull/841/head
Jasper van Merle 6 years ago
parent facb0a44d2
commit 93582d3b82

@ -439,7 +439,7 @@ credits = [
'http://www.gnu.org/copyleft/fdl.html' 'http://www.gnu.org/copyleft/fdl.html'
], [ ], [
'MariaDB', 'MariaDB',
'2018 MariaDB', '2019 MariaDB',
'CC BY-SA & GFDL', 'CC BY-SA & GFDL',
'https://mariadb.com/kb/en/library/documentation/+license/' 'https://mariadb.com/kb/en/library/documentation/+license/'
], [ ], [

@ -1,7 +1,7 @@
._mariadb { ._mariadb {
@extend %simple; @extend %simple;
.graybox { .graybox, .product {
@extend %note; @extend %note;
} }
} }

@ -1,11 +1,10 @@
require 'net/http'
module Docs module Docs
class Mariadb class Mariadb
class CleanHtmlFilter < Filter class CleanHtmlFilter < Filter
@@known_urls = Hash.new
def call def call
# Return the empty doc if the EraseInvalidPagesFilter detected this page shouldn't be scraped
return doc if doc.inner_html == ''
# Extract main content # Extract main content
@doc = at_css('#content') @doc = at_css('#content')
@ -21,19 +20,6 @@ module Docs
node['data-language'] = 'sql' node['data-language'] = 'sql'
end end
# Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page
css('a').each do |node|
url = node['href']
if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url)
final_url = get_final_url(url)
if !final_url.nil? && final_url.start_with?('/kb/en/library/documentation/')
node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index"
end
end
end
# Fix images # Fix images
css('img').each do |node| css('img').each do |node|
node['src'] = node['src'].sub('http:', 'https:') node['src'] = node['src'].sub('http:', 'https:')
@ -46,11 +32,11 @@ module Docs
end end
end end
# Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables # Convert listings (pages like https://mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
css('ul.listing').each do |node| css('ul.listing').each do |node|
rows = [] rows = []
node.css('li').each do |li| node.css('li:not(.no_data)').each do |li|
name = li.at_css('.media-heading').content name = li.at_css('.media-heading').content
description = li.at_css('.blurb').content description = li.at_css('.blurb').content
url = li.at_css('a')['href'] url = li.at_css('a')['href']
@ -61,15 +47,20 @@ module Docs
node.replace(table) node.replace(table)
end end
doc # Turn note titles into <strong> tags
end css('.product_title').each do |node|
node.name = 'strong'
end
def get_final_url(url) # Remove comments and questions
unless @@known_urls.has_key?(url) css('.related_questions, #comments').remove
@@known_urls[url] = Net::HTTP.get_response(URI(url))['location'] css('h2').each do |node|
if node.content == 'Comments'
node.remove
end
end end
@@known_urls[url] doc
end end
end end
end end

@ -2,12 +2,22 @@ module Docs
class Mariadb class Mariadb
class EntriesFilter < Docs::EntriesFilter class EntriesFilter < Docs::EntriesFilter
def get_name def get_name
at_css('.container > h1').content.strip return 'Name' if doc.inner_html == ''
at_css('#content > h1').content.strip
end end
def get_type def get_type
link = at_css('#breadcrumbs > a:nth-child(6)') return 'Type' if doc.inner_html == ''
link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content
link = at_css('#breadcrumbs > a:nth-child(4)')
link.nil? ? at_css('#breadcrumbs > a:nth-child(3)').content : link.content
end
def entries
# Don't add an entry for this page if the EraseInvalidPagesFilter detected this page shouldn't be scraped
return [] if doc.inner_html == ''
super
end end
end end
end end

@ -0,0 +1,34 @@
module Docs
  class Mariadb
    # Blanks out pages that should not become part of the scraped documentation.
    #
    # The MariaDB documentation uses urls like mariadb.com/kb/en/*, so a page
    # cannot be accepted or rejected based on its url alone. This filter is meant
    # to run before the internal_urls filter: when a page should not be scraped,
    # its contents are erased here so that none of its links are picked up.
    # The entries filter is expected to skip pages whose document is empty.
    class EraseInvalidPagesFilter < Filter
      # Breadcrumb hrefs of the pages handled so far (used as a set).
      @@seen_urls = {}

      def call
        # A page without a breadcrumb pointing at the documentation root is erased
        documentation_crumb = at_css('a.crumb[href="https://mariadb.com/kb/en/documentation/"]')
        doc.inner_html = '' unless documentation_crumb

        # The breadcrumb for the current page; nothing more to do when it is absent
        node_link = at_css('a.crumb.node_link')
        return doc unless node_link

        # Some links lead to the same page, so only the first visit keeps its contents
        href = node_link['href']
        doc.inner_html = '' if @@seen_urls.key?(href)
        @@seen_urls[href] = true

        doc
      end
    end
  end
end

@ -2,21 +2,34 @@ module Docs
class Mariadb < UrlScraper class Mariadb < UrlScraper
self.name = 'MariaDB' self.name = 'MariaDB'
self.type = 'mariadb' self.type = 'mariadb'
self.release = '10.3.8' self.release = '10.4.7'
self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/' self.base_url = 'https://mariadb.com/kb/en/'
self.root_path = 'library/documentation/'
self.links = { self.links = {
home: 'https://mariadb.com/', home: 'https://mariadb.com/',
code: 'https://github.com/MariaDB/server' code: 'https://github.com/MariaDB/server'
} }
html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title' html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages'
html_filters.push 'mariadb/entries', 'mariadb/clean_html'
options[:download_images] = false options[:skip_patterns] = [
options[:root_title] = 'MariaDB' /\+/,
/\/ask\//,
/-release-notes\//,
/-changelog\//,
/^documentation\//,
/^mariadb-server-documentation\//,
]
options[:attribution] = <<-HTML options[:attribution] = <<-HTML
&copy; 2018 MariaDB<br> &copy; 2019 MariaDB<br>
Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License. Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License.
HTML HTML
def get_latest_version(opts)
doc = fetch_doc('https://mariadb.com/downloads/', opts)
doc.at_css('[data-version-id="mariadb_server-versions"] option').content.split('-')[0]
end
end end
end end

Loading…
Cancel
Save