mirror of https://github.com/freeCodeCamp/devdocs
parent
facb0a44d2
commit
93582d3b82
@ -1,7 +1,7 @@
|
|||||||
._mariadb {
|
._mariadb {
|
||||||
@extend %simple;
|
@extend %simple;
|
||||||
|
|
||||||
.graybox {
|
.graybox, .product {
|
||||||
@extend %note;
|
@extend %note;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,34 @@
|
|||||||
|
module Docs
  class Mariadb
    # Blanks out pages that must not end up in the scraped MariaDB docs.
    #
    # The MariaDB documentation uses URLs like mariadb.com/kb/en/*, so there is
    # no way to tell from a URL alone whether a page should be scraped.
    # This filter is meant to run before the internal_urls filter collects the
    # internal URLs of a page: when a page should not be scraped, its contents
    # are erased here so that none of its links are picked up.
    # The entries filter is expected to make sure no entry is saved for a page
    # that was erased.
    class EraseInvalidPagesFilter < Filter
      # Breadcrumb link that is present on pages belonging to the documentation.
      DOCUMENTATION_CRUMB = 'a.crumb[href="https://mariadb.com/kb/en/documentation/"]'

      # Breadcrumb link whose href identifies the page currently being filtered.
      CURRENT_PAGE_CRUMB = 'a.crumb.node_link'

      # URLs of the pages that already went through this filter. Kept on the
      # class (as a class-level instance variable rather than a class variable)
      # so that the record is shared by every filter instance without leaking
      # into the rest of the inheritance tree.
      @seen_urls = {}

      class << self
        # @return [Hash{String => true}] hrefs of the pages processed so far
        attr_reader :seen_urls
      end

      # Erases the page when it is not part of the documentation or when a page
      # with the same breadcrumb href was already processed.
      #
      # @return [Object] the (possibly emptied) document
      def call
        # Pages without the documentation breadcrumb are not part of the docs.
        # Once erased there is no breadcrumb left to inspect, so stop here.
        return erase_page unless at_css(DOCUMENTATION_CRUMB)

        current_page = at_css(CURRENT_PAGE_CRUMB)
        return doc unless current_page

        url = current_page['href']
        seen_urls = EraseInvalidPagesFilter.seen_urls

        # Some links lead to the same page; only parse the page one time.
        erase_page if seen_urls.key?(url)
        seen_urls[url] = true

        doc
      end

      private

      # Removes all contents of the current document and returns it.
      def erase_page
        doc.inner_html = ''
        doc
      end
    end
  end
end
|
Loading…
Reference in new issue