Improve C and C++ scrapers

Fixes #138.
pull/142/head
Thibaut 10 years ago
parent 4849e6813a
commit 31d5b9d1e0

@ -1,7 +1,13 @@
._c {
> h2, > h3 { @extend %block-heading; }
> h4 { @extend %block-label, %label-blue; }
> p > code { @extend %label; }
.fmbox { @extend %note; }
code, .t-mark, .t-mark-rev { @extend %label; }
.t-mark, .t-mark-rev {
white-space: nowrap;
@extend %label-green;
}
.t-dcl-begin pre {
margin: 0;
@ -19,9 +25,19 @@
}
.t-sdsc-nopad dl, .t-sdsc-nopad dd { margin: 0; }
td > h5 {
margin: 0;
line-height: inherit;
td {
> h3, > h5 {
margin: 0;
line-height: inherit;
}
> ul { margin: 0; }
> .t-dsc-member-div > div { // utility/functional
float: left;
+ div { margin-left: .5em; }
}
}
.t-inheritance-diagram {

@ -2,13 +2,11 @@ module Docs
class C
class CleanHtmlFilter < Filter
def call
if root_page?
doc.inner_html = ' '
return doc
end
css('h1').remove if root_page?
css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc', '.t-dsc-sep', '.t-dcl-sep',
'#catlinks', '.ambox-notice', '.mw-cite-backlink', '.t-sdsc-sep:first-child:last-child').remove
css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc',
'.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink',
'.t-sdsc-sep:first-child:last-child', '.t-example-live-link').remove
css('#bodyContent', '.mw-content-ltr', 'span[style]').each do |node|
node.before(node.children).remove
@ -26,10 +24,16 @@ module Docs
node.content = ' ' if node.content.empty?
end
css('tt').each do |node|
css('tt', 'span > span.source-cpp').each do |node|
node.name = 'code'
end
css('div > span.source-cpp').each do |node|
node.name = 'pre'
node.inner_html = node.inner_html.gsub('<br>', "\n")
node.content = node.content
end
css('div > a > img[alt="About this image"]').each do |node|
node.parent.parent.remove
end
@ -38,6 +42,11 @@ module Docs
node['href'] = node['href'].remove('.html')
end
css('h1 ~ .fmbox').each do |node|
node.name = 'div'
node.content = node.content
end
doc
end
end

@ -22,6 +22,8 @@ module Docs
def get_type
if at_css('#firstHeading').content.include?('C++ keyword')
'Keywords'
elsif subpath.start_with?('experimental')
'Experimental libraries'
elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
type.strip!
type.remove! ' library'

@ -14,6 +14,11 @@ module Docs
options[:root_title] = 'C Programming Language'
options[:skip] = %w(language/history.html)
# URL fixer: when a URL contains a percent-encoded "http%3A/" segment,
# everything from the start of the string through that segment is replaced
# with "http://". The string is modified in place.
# NOTE(review): presumably this repairs absolute links that were resolved
# as relative paths under the base URL — confirm against the scraped pages.
options[:fix_urls] = lambda do |url|
  url.sub!(%r{\A.+/http%3A/}, "http://")
  # sub! returns nil when nothing matched, so hand back the string itself.
  url
end
options[:attribution] = <<-HTML
&copy; cppreference.com<br>
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.

@ -22,6 +22,11 @@ module Docs
)
options[:only_patterns] = [/\.html\z/]
# URL fixer: when a URL contains a percent-encoded "http%3A/" segment,
# everything from the start of the string through that segment is replaced
# with "http://". The string is modified in place.
# NOTE(review): presumably this repairs absolute links that were resolved
# as relative paths under the base URL — confirm against the scraped pages.
options[:fix_urls] = lambda do |url|
  url.sub!(%r{\A.+/http%3A/}, "http://")
  # sub! returns nil when nothing matched, so hand back the string itself.
  url
end
options[:attribution] = <<-HTML
&copy; cppreference.com<br>
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.

Loading…
Cancel
Save