Improve Rust scraper

pull/885/merge
Thibaut Courouble 6 years ago
parent 3dbc6052bf
commit 7b7aa34b70

@ -3,9 +3,18 @@
h4 { @extend %block-label; }
.docblock { margin-left: 1em; }
div.information, div.important-traits {
@extend %note;
> pre { margin: .5rem 0; }
}
div.stability { margin-bottom: 1em; }
em.stab, span.stab { @extend %label; }
em.stab.unstable, span.stab.unstable { @extend %label-orange; }
.since, .out-of-band { float: right; }
.out-of-band { float: right; }
.since, .srclink {
float: right;
margin-left: .5rem;
}
}

@ -2,8 +2,6 @@ module Docs
class Rust
class CleanHtmlFilter < Filter
def call
puts subpath if at_css('#versioninfo')
if slug.start_with?('book') || slug.start_with?('reference')
@doc = at_css('#content main')
elsif slug == 'error-index'
@ -29,12 +27,16 @@ module Docs
css('.rusttest', '.test-arrow', 'hr').remove
css('.docblock.attributes').each do |node|
node.remove if node.content.include?('#[must_use]')
end
css('a.header').each do |node|
node.first_element_child['id'] = node['name'] || node['id']
node.before(node.children).remove
end
css('.docblock > h1').each { |node| node.name = 'h4' }
css('.docblock > h1:not(.section-header)').each { |node| node.name = 'h4' }
css('h2.section-header').each { |node| node.name = 'h3' }
css('h1.section-header').each { |node| node.name = 'h2' }
@ -44,7 +46,7 @@ module Docs
end
end
css('> .impl-items', '> .docblock', 'pre > pre').each do |node|
css('> .impl-items', '> .docblock', 'pre > pre', '.tooltiptext', '.tooltip').each do |node|
node.before(node.children).remove
end
@ -65,6 +67,32 @@ module Docs
# Promote a leading <h2> to <h1>. The comparison must be `==`: a single `=`
# would assign 'h2' to the node, evaluate truthy, and rename every first
# element to <h1> regardless of its original tag.
doc.first_element_child.name = 'h1' if doc.first_element_child.name == 'h2'
at_css('h1').content = 'Rust Documentation' if root_page?
css('.table-display').each do |node|
node.css('td').each do |td|
node.before(td.children)
end
node.remove
end
css('h2 .important-traits', 'h3 .important-traits', 'h4 .important-traits').each do |node|
content = node.at_css('.content.hidden .content')
node.at_css('.content.hidden').replace(content) if content
node.parent.after(node)
end
css('code.content').each do |node|
node.name = 'pre'
node.css('.fmt-newline').each do |line|
line.inner_html = line.inner_html + "\n"
end
node.inner_html = node.inner_html.gsub('<br>', "\n")
node.content = node.content
end
css('.since + .srclink').each do |node|
node.previous_element.before(node)
end
doc
end
end

@ -1,7 +1,7 @@
module Docs
class Rust < UrlScraper
self.type = 'rust'
self.release = '1.28.0'
self.release = '1.29.1'
self.base_url = 'https://doc.rust-lang.org/'
self.root_path = 'book/second-edition/index.html'
self.initial_paths = %w(

Loading…
Cancel
Save