Finish InfluxData scraper

pull/359/merge
Thibaut Courouble 9 years ago
parent cd8590f90b
commit ebc6be3215

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

After

Width:  |  Height:  |  Size: 102 KiB

@ -1,7 +1,7 @@
[ [
[ [
"2016-02-28", "2016-02-28",
"New documentations: <a href=\"/codeigniter/\">CodeIgniter</a> and <a href=\"/nginx_lua_module/\">nginx Lua Module</a>" "New documentations: <a href=\"/codeigniter/\">CodeIgniter</a>, <a href=\"/nginx_lua_module/\">nginx Lua Module</a> and <a href=\"/influxdata/\">InfluxData</a>"
], [ ], [
"2016-02-15", "2016-02-15",
"New documentations: <a href=\"/cakephp/\">CakePHP</a>, <a href=\"/chef/\">Chef</a> and <a href=\"/ramda/\">Ramda</a>" "New documentations: <a href=\"/cakephp/\">CakePHP</a>, <a href=\"/chef/\">Chef</a> and <a href=\"/ramda/\">Ramda</a>"

@ -199,6 +199,11 @@ credits = [
'The University of Glasgow', 'The University of Glasgow',
'BSD', 'BSD',
'https://www.haskell.org/ghc/license' 'https://www.haskell.org/ghc/license'
], [
'InfluxData',
'2015 InfluxData, Inc.',
'MIT',
'https://github.com/influxdata/docs.influxdata.com/blob/master/LICENSE'
], [ ], [
'jQuery', 'jQuery',
'Packt Publishing<br>&copy; jQuery Foundation and other contributors', 'Packt Publishing<br>&copy; jQuery Foundation and other contributors',

@ -128,3 +128,4 @@
._icon-chef:before { background-position: -2rem -10rem; } ._icon-chef:before { background-position: -2rem -10rem; }
._icon-ramda:before { background-position: -3rem -10rem; @extend %darkIconFix !optional; } ._icon-ramda:before { background-position: -3rem -10rem; @extend %darkIconFix !optional; }
._icon-codeigniter:before { background-position: -4rem -10rem; @extend %darkIconFix !optional; } ._icon-codeigniter:before { background-position: -4rem -10rem; @extend %darkIconFix !optional; }
._icon-influxdata:before { background-position: -5rem -10rem; @extend %darkIconFix !optional; }

@ -18,6 +18,7 @@
._cordova, ._cordova,
._grunt, ._grunt,
._influxdata,
._less, ._less,
._lodash, ._lodash,
._marionette, ._marionette,

@ -0,0 +1,27 @@
module Docs
  class Influxdata
    # HTML filter that reduces a scraped InfluxData page to its
    # '#page-content' element and flattens wrapper markup inside the document.
    class CleanHtmlFilter < Filter
      # Blanks the root page; for every other page removes contribution
      # blocks and rules, unwraps layout wrappers, moves each code element's
      # class onto its enclosing <pre>, and returns the '#page-content' node.
      def call
        return blank_root_page if root_page?

        content = @doc.at_css('#page-content')

        css('.page--contribute', 'hr').remove

        css('.page--body', '.page--title', 'font').each do |wrapper|
          hoist_children(wrapper)
        end

        css('pre > code').each do |code|
          # Keep the class (presumably the language marker) on the <pre>
          # before the <code> wrapper disappears.
          code.parent['class'] = code['class']
          hoist_children(code)
        end

        content
      end

      private

      # Replaces the root page's markup with a single space and returns the document.
      def blank_root_page
        doc.inner_html = ' '
        doc
      end

      # Moves the node's children in front of it, then drops the emptied node.
      def hoist_children(node)
        node.before(node.children).remove
      end
    end
  end
end

@ -0,0 +1,30 @@
module Docs
  class Influxdata
    # Derives the entry name and type for an InfluxData documentation page.
    class EntriesFilter < Docs::EntriesFilter
      # The entry name is the text of the page-title heading.
      def get_name
        heading = at_css('#page-title h1')
        heading.content
      end

      # Builds the entry type from the current product and, where relevant,
      # the sidebar section the page belongs to.
      def get_type
        product = at_css('.product-switcher--current').content.strip
        return product if %w(Chronograf Telegraf).include?(product)

        # The sidebar link with href "index" is either the section title
        # itself or sits in an element whose previous sibling is the title.
        link = at_css('#product-sidebar a[href="index"]')
        title_node =
          if link.parent['class'] == 'product-sidebar--section-title'
            link
          else
            link.parent.previous_element
          end

        section = title_node.content.strip.gsub(' Reference', '')

        # These sections are filed directly under the product name.
        return product if ['Getting Started', 'Introduction', 'Guides'].include?(section)

        "#{product}: #{section}"
      end

      # No default entry for pages whose subpath ends with the release
      # directory ("v<release>/").
      def include_default_entry?
        release_dir = "v#{Influxdata.release}/"
        !subpath.end_with?(release_dir)
      end
    end
  end
end

@ -1,18 +0,0 @@
module Docs
  class Influxdb
    # HTML filter that narrows an InfluxDB page to '#page-content',
    # repositions its heading and removes the contribution block.
    class CleanHtmlFilter < Filter
      def call
        content = @doc.at_css('#page-content')

        # Place the body's <h1> right after the first child of the content node.
        heading = at_css('.page--body h1')
        content.children.first.add_next_sibling(heading)

        # Drop the contribution block.
        at_css('.page--contribute').remove

        content
      end
    end
  end
end

@ -1,19 +0,0 @@
module Docs
  class Influxdb
    # Derives the entry name and type for an InfluxDB documentation page.
    class EntriesFilter < Docs::EntriesFilter
      # The entry name is the text of the page-title heading.
      def get_name
        heading = at_css('#page-title h1')
        heading.content
      end

      # The type is taken from the URL: the first segment of the path
      # relative to the base URL is treated as the category. This relies on
      # URLs having the shape "category/page" or "category".
      def get_type
        relative_path = current_url.relative_path_from(base_url)
        category = relative_path.split('/').first
        "InfluxDB: #{category.titleize}"
      end
    end
  end
end

@ -0,0 +1,32 @@
module Docs
  # Scraper definition for the InfluxData documentation site, covering the
  # Telegraf, InfluxDB, Chronograf and Kapacitor docs of one pinned release.
  class Influxdata < UrlScraper
    self.name = 'InfluxData'
    self.type = 'influxdata'
    self.release = '0.10'
    self.base_url = 'https://docs.influxdata.com/'

    html_filters.push 'influxdata/entries', 'influxdata/clean_html', 'title'

    options[:trailing_slash] = true
    options[:root_title] = 'InfluxData Documentation'
    options[:title] = false

    # Matches product paths under the pinned release. The release is escaped
    # so that its "." only matches a literal dot instead of any character.
    options[:only_patterns] = [/(telegraf|influxdb|chronograf|kapacitor)\/v#{Regexp.escape(release)}/]

    options[:skip] = [
      "influxdb/v#{release}/sample_data/data_download/",
      "influxdb/v#{release}/tools/grafana/",
      "influxdb/v#{release}/about/"
    ]

    # Both sides are derived from the release so the mapping stays valid
    # when the version is bumped.
    options[:replace_paths] = {
      "influxdb/v#{release}/guides/clustering/" => "influxdb/v#{release}/clustering/"
    }

    # The heredoc body is kept flush-left on purpose: <<- preserves any
    # leading whitespace of the body in the resulting string.
    options[:attribution] = <<-HTML
&copy; 2015 InfluxData, Inc.<br>
Licensed under the MIT license.
    HTML
  end
end

@ -1,15 +0,0 @@
module Docs
  # Scraper definition for the InfluxDB documentation of one pinned release.
  class Influxdb < UrlScraper
    self.name = 'InfluxDB'
    self.type = 'influxdb'
    self.release = '0.10'
    # Derived from the release so the version is stated in one place only.
    self.base_url = "https://docs.influxdata.com/influxdb/v#{release}/"

    html_filters.push 'influxdb/entries', 'influxdb/clean_html'

    # The heredoc body is kept flush-left on purpose: <<- preserves any
    # leading whitespace of the body in the resulting string.
    options[:attribution] = <<-HTML
&copy; 2010&ndash;2015 InfluxData<br>
Licensed under the MIT license.
    HTML
  end
end

Binary file not shown.

After

Width:  |  Height:  |  Size: 623 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Loading…
Cancel
Save