mariadb: update scraper and filters to work with the official website

pull/841/head
Jasper van Merle 6 years ago
parent facb0a44d2
commit 93582d3b82

@ -439,7 +439,7 @@ credits = [
'http://www.gnu.org/copyleft/fdl.html'
], [
'MariaDB',
'2018 MariaDB',
'2019 MariaDB',
'CC BY-SA & GFDL',
'https://mariadb.com/kb/en/library/documentation/+license/'
], [

@ -1,7 +1,7 @@
._mariadb {
@extend %simple;
.graybox {
.graybox, .product {
@extend %note;
}
}

@ -1,11 +1,10 @@
require 'net/http'
module Docs
class Mariadb
class CleanHtmlFilter < Filter
@@known_urls = Hash.new
def call
# Return the empty doc if the EraseInvalidPagesFilter detected this page shouldn't be scraped
return doc if doc.inner_html == ''
# Extract main content
@doc = at_css('#content')
@ -21,19 +20,6 @@ module Docs
node['data-language'] = 'sql'
end
# Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page
css('a').each do |node|
url = node['href']
if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url)
final_url = get_final_url(url)
if !final_url.nil? && final_url.start_with?('/kb/en/library/documentation/')
node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index"
end
end
end
# Fix images
css('img').each do |node|
node['src'] = node['src'].sub('http:', 'https:')
@ -46,11 +32,11 @@ module Docs
end
end
# Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
# Convert listings (pages like https://mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
css('ul.listing').each do |node|
rows = []
node.css('li').each do |li|
node.css('li:not(.no_data)').each do |li|
name = li.at_css('.media-heading').content
description = li.at_css('.blurb').content
url = li.at_css('a')['href']
@ -61,15 +47,20 @@ module Docs
node.replace(table)
end
doc
# Turn note titles into <strong> tags
css('.product_title').each do |node|
node.name = 'strong'
end
def get_final_url(url)
unless @@known_urls.has_key?(url)
@@known_urls[url] = Net::HTTP.get_response(URI(url))['location']
# Remove comments and questions
css('.related_questions, #comments').remove
css('h2').each do |node|
if node.content == 'Comments'
node.remove
end
end
@@known_urls[url]
doc
end
end
end

@ -2,12 +2,22 @@ module Docs
class Mariadb
class EntriesFilter < Docs::EntriesFilter
def get_name
at_css('.container > h1').content.strip
return 'Name' if doc.inner_html == ''
at_css('#content > h1').content.strip
end
def get_type
link = at_css('#breadcrumbs > a:nth-child(6)')
link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content
return 'Type' if doc.inner_html == ''
link = at_css('#breadcrumbs > a:nth-child(4)')
link.nil? ? at_css('#breadcrumbs > a:nth-child(3)').content : link.content
end
# Returns the entries for the current page.
#
# A page whose document has been emptied (see the EraseInvalidPagesFilter)
# contributes no entries at all; every other page defers to the parent
# implementation.
def entries
  if doc.inner_html == ''
    # The page was blanked because it shouldn't be scraped, so don't index it
    []
  else
    super
  end
end
end
end

@ -0,0 +1,34 @@
module Docs
  class Mariadb
    # Empties pages that should not be scraped, and pages that were already
    # processed under another link, so that nothing on them is picked up.
    class EraseInvalidPagesFilter < Filter
      # Breadcrumb hrefs of the pages handled so far (href => true)
      @@seen_urls = {}

      # Blanks the document when the page is not part of the documentation or
      # has been seen before; always returns the (possibly emptied) document.
      def call
        # The MariaDB documentation uses urls like mariadb.com/kb/en/*
        # This means a page's url alone cannot tell us whether it should be scraped
        # This filter runs before the internal_urls filter collects all internal urls
        # If the page should not be scraped, its contents are erased here so that its internal urls are not picked up
        # The entries filter makes sure that no entry is saved for such a page
        documentation_crumb = at_css('a.crumb[href="https://mariadb.com/kb/en/documentation/"]')
        doc.inner_html = '' unless documentation_crumb

        page_crumb = at_css('a.crumb.node_link')
        if page_crumb
          page_url = page_crumb['href']

          # Several links can lead to the same page
          # Only the first visit keeps its contents
          doc.inner_html = '' if @@seen_urls.key?(page_url)
          @@seen_urls[page_url] = true
        end

        doc
      end
    end
  end
end

@ -2,21 +2,34 @@ module Docs
class Mariadb < UrlScraper
self.name = 'MariaDB'
self.type = 'mariadb'
self.release = '10.3.8'
self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/'
self.release = '10.4.7'
self.base_url = 'https://mariadb.com/kb/en/'
self.root_path = 'library/documentation/'
self.links = {
home: 'https://mariadb.com/',
code: 'https://github.com/MariaDB/server'
}
html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title'
html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages'
html_filters.push 'mariadb/entries', 'mariadb/clean_html'
options[:download_images] = false
options[:root_title] = 'MariaDB'
options[:skip_patterns] = [
/\+/,
/\/ask\//,
/-release-notes\//,
/-changelog\//,
/^documentation\//,
/^mariadb-server-documentation\//,
]
options[:attribution] = <<-HTML
&copy; 2018 MariaDB<br>
&copy; 2019 MariaDB<br>
Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License.
HTML
# Reads a MariaDB Server version number from the official downloads page.
#
# opts is handed to fetch_doc unchanged.
# Returns the text of the matched <option> element up to its first '-'
# (presumably the most recent release is listed first; verify against the page).
def get_latest_version(opts)
  downloads_page = fetch_doc('https://mariadb.com/downloads/', opts)
  version_option = downloads_page.at_css('[data-version-id="mariadb_server-versions"] option')
  version_option.content.split('-').first
end
end
end

Loading…
Cancel
Save