Migrate C scraper from FileScraper to UrlScraper

pull/1606/head
Enoc 3 years ago
parent e17bc84ea4
commit fdfcf3d917

@ -39,7 +39,7 @@
'pages/async', 'pages/async',
'pages/bash', 'pages/bash',
'pages/bootstrap', 'pages/bootstrap',
'pages/c', 'pages/cppref',
'pages/cakephp', 'pages/cakephp',
'pages/clojure', 'pages/clojure',
'pages/codeception', 'pages/codeception',

@ -1,116 +0,0 @@
module Docs
  class C
    # HTML filter that strips wiki chrome from a scraped page and normalises
    # the remaining markup (tables, lists, headings, code, paragraphs, images).
    class CleanHtmlFilter < Filter
      # Mutates the parsed document in place.
      #
      # The steps run in a fixed order; later selectors rely on the unwrapping
      # and renaming done by earlier ones.
      #
      # @return the filtered document (`doc`)
      def call
        css('h1').remove if root_page?

        # Large rowspans (> 3) are reset to the number of rows in the
        # enclosing tbody.
        css('.t-dcl-rev-aux td[rowspan]').each do |node|
          rowspan = node['rowspan'].to_i
          node['rowspan'] = node.ancestors('tbody').css('tr').length if rowspan > 3
        end

        # Drop navigation, edit links, separators and other page furniture.
        css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc',
            '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink',
            '.t-sdsc-sep:first-child:last-child', '.t-example-live-link',
            '.t-dcl-rev-num > .t-dcl-rev-aux ~ tr:not(.t-dcl-rev-aux) > td:nth-child(2)').remove

        # Unwrap structural containers: put the children where the node was,
        # then remove the node itself.
        css('#bodyContent', '.mw-content-ltr', 'span[style]', 'div[class^="t-ref"]', '.t-image',
            'th > div', 'td > div', '.t-dsc-see', '.mainpagediv', 'code > b', 'tbody').each do |node|
          node.before(node.children).remove
        end

        # A div that directly contains a list is replaced by its children.
        css('div > ul').each do |node|
          node.parent.before(node.parent.children).remove
        end

        # A <dl> whose only <dd> holds only a <ul>: attach that list to the
        # preceding <ul> when there is one, otherwise put it in place of the <dl>.
        css('dl > dd:first-child:last-child > ul:first-child:last-child').each do |node|
          dl = node.parent.parent
          if dl.previous_element && dl.previous_element.name == 'ul'
            dl.previous_element << node
            dl.remove
          else
            dl.before(node).remove
          end
        end

        # Remaining single-<dd> lists are replaced by the <dd>'s children.
        css('dl > dd:first-child:last-child').each do |node|
          node.parent.before(node.children).remove
        end

        # Merge runs of adjacent sibling lists into the first one.
        css('ul').each do |node|
          while node.next_element && node.next_element.name == 'ul'
            node << node.next_element.children
            node.next_element.remove
          end
        end

        # Move the anchor id from the inner span onto the heading itself.
        css('h2 > span[id]', 'h3 > span[id]', 'h4 > span[id]', 'h5 > span[id]', 'h6 > span[id]').each do |node|
          node.parent['id'] = node['id']
          node.before(node.children).remove
        end

        css('table[style]', 'th[style]', 'td[style]').remove_attr('style')
        css('table[cellpadding]').remove_attr('cellpadding')

        # Header-row cells become <th>; empty ones get a single-character filler.
        css('.t-dsc-hitem > td', '.t-dsc-header > td').each do |node|
          node.name = 'th'
          node.content = ' ' if node.content.empty?
        end

        # Inline code markers become <code>; inner markup is flattened to text
        # unless the node contains a link.
        css('tt', 'span > span.source-cpp', 'span.t-c', 'span.t-lc', 'span.t-dsc-see-tt').each do |node|
          node.name = 'code'
          node.remove_attribute('class')
          node.content = node.content unless node.at_css('a')
        end

        # Block-level source spans become <pre> with <br> turned into newlines
        # and all markup flattened to text.
        css('div > span.source-cpp').each do |node|
          node.name = 'pre'
          node.inner_html = node.inner_html.gsub('<br>', "\n")
          node.content = node.content
        end

        css('div > a > img[alt="About this image"]').each do |node|
          node.parent.parent.remove
        end

        css('area[href]').each do |node|
          node['href'] = node['href'].remove('.html')
        end

        # Paragraphs absorb trailing text/<a>/<code> siblings, are trimmed,
        # get a final '.' when they end in an alphanumeric or ')', and are
        # removed when blank and image-free.
        css('p').each do |node|
          while node.next && (node.next.text? || node.next.name == 'a' || node.next.name == 'code')
            node << node.next
          end
          node.inner_html = node.inner_html.strip
          node << '.' if node.content =~ /[a-zA-Z0-9\)]\z/
          node.remove if node.content.blank? && !node.at_css('img')
        end

        # Tag each <pre> with a language: 'cpp' when its own or its parent's
        # class mentions cpp, otherwise 'c'. Tabs are expanded.
        css('pre').each do |node|
          node['data-language'] = if node['class'].try(:include?, 'cpp') || node.parent['class'].try(:include?, 'cpp')
            'cpp'
          else
            'c'
          end
          node.remove_attribute('class')
          node.content = node.content.gsub("\t", ' ' * 8)
        end

        css('code code', '.mw-geshi').each do |node|
          node.before(node.children).remove
        end

        css('h1 ~ .fmbox').each do |node|
          node.name = 'div'
          node.content = node.content
        end

        # Rewrite SVG image URLs to the upload host. Uses non-destructive `sub`:
        # `sub!` returns nil when nothing matched, which would overwrite every
        # non-matching src with nil. Images without a src are skipped.
        css('img').each do |node|
          src = node['src']
          next unless src
          node['src'] = src.sub(%r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg')
        end

        doc
      end
    end
  end
end

@ -22,6 +22,9 @@ module Docs
end end
def get_type def get_type
return "C keywords" if slug =~ /keyword/
type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content) type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
type.strip! type.strip!
type.remove! ' library' type.remove! ' library'

@ -1,21 +0,0 @@
module Docs
  class C
    # HTML filter that turns highlighted source spans into plain <pre> blocks.
    class FixCodeFilter < Filter
      # Rewrites each `div > span.source-c` / `div > span.source-cpp` so that
      # the enclosing div becomes a <pre> carrying the span's class and the
      # span's text, then replaces the decoded `&nbsp;` character with ' '
      # in every <pre>.
      #
      # @return the filtered document (`doc`)
      def call
        css('div > span.source-c', 'div > span.source-cpp').each do |span|
          # <br> (with optional newline) becomes a newline; a newline sitting
          # right before a closing </p> is dropped.
          markup = span.inner_html.gsub(/<br>\n?/, "\n")
          span.inner_html = markup.gsub("\n</p>\n", "</p>\n")

          wrapper = span.parent
          wrapper.name = 'pre'
          wrapper['class'] = span['class']
          # Assigning text content discards the span and any nested markup.
          wrapper.content = span.content
        end

        # The character that `&nbsp;` decodes to.
        non_breaking_space = Nokogiri::HTML('&nbsp;').text
        css('pre').each do |pre|
          pre.content = pre.content.gsub(non_breaking_space, ' ')
        end

        doc
      end
    end
  end
end

@ -1,11 +0,0 @@
module Docs
  class C
    # Text filter that rewrites absolute documentation URLs in the rendered HTML.
    class FixUrlsFilter < Filter
      # Edits `html` in place:
      #   1. links to the root page (base URL joined with the root path) become
      #      the base URL without its last character;
      #   2. any other link under the base URL loses its `.html` suffix.
      #
      # @return [String] the rewritten `html`
      def call
        html.gsub! File.join(C.base_url, C.root_path), C.base_url[0..-2]
        # Escape the base URL before interpolating it: unescaped, the dots in
        # the host name are regex wildcards and can match unrelated URLs.
        html.gsub! %r{#{Regexp.escape(C.base_url)}([^"']+?)\.html}, "#{C.base_url}\\1"
        html
      end
    end
  end
end

@ -1,42 +0,0 @@
module Docs
  # File-based scraper configuration for the C documentation.
  class C < FileScraper
    self.type = 'c'
    self.base_url = 'http://en.cppreference.com/w/c/'
    self.root_path = 'header.html'

    html_filters.insert_before 'clean_html', 'c/fix_code'
    html_filters.push 'c/entries', 'c/clean_html', 'title'
    text_filters.push 'c/fix_urls'

    options[:decode_and_clean_paths] = true
    options[:container] = '#content'
    options[:title] = false
    options[:root_title] = 'C Programming Language'
    options[:skip] = %w(language/history.html)
    options[:skip_patterns] = [/experimental/]

    # Rewrites URLs in place: everything up to an embedded percent-encoded
    # "http%3A/" segment is replaced by "http://", and upload-host URLs that
    # were nested under the main host are pointed back at the upload host.
    options[:fix_urls] = ->(url) do
      url.sub! %r{\A.+/http%3A/}, 'http://'
      url.sub! 'http://en.cppreference.com/upload.cppreference.com', 'http://upload.cppreference.com'
      url
    end

    # The heredoc body is kept flush-left so the attribution string is unchanged.
    options[:attribution] = <<-HTML
&copy; cppreference.com<br>
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
    HTML

    # Derives a version timestamp from the archives page: takes the first link
    # whose title starts with "File:", reads the first run of digits followed
    # by a dot in its text, and parses that as a YYYYMMDD date.
    #
    # @param opts passed through to `fetch_doc`
    # @return [Integer] the parsed date as seconds since the epoch
    def get_latest_version(opts)
      doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
      link = doc.at_css('a[title^="File:"]')
      date = link.content.scan(/(\d+)\./)[0][0]
      DateTime.strptime(date, '%Y%m%d').to_time.to_i
    end

    private

    # Percent-decodes the path computed by the superclass.
    # `URI.unescape` was removed in Ruby 3.0; the default parser's `unescape`
    # is what it delegated to.
    def file_path_for(*)
      URI::DEFAULT_PARSER.unescape(super)
    end
  end
end

@ -0,0 +1,12 @@
module Docs
  # C documentation scraper; shared behaviour comes from Cppref.
  class C < Cppref
    # Capitalised to match the sibling Cpp scraper, which uses 'C++' as its
    # name and a lowercase slug.
    # NOTE(review): assumes `name` is the display label — confirm.
    self.name = 'C'
    self.slug = 'c'
    self.base_url = 'https://en.cppreference.com/w/c/'

    # The C-specific entries filter runs before the shared cppref clean-up filter.
    html_filters.insert_before 'cppref/clean_html', 'c/entries'

    options[:root_title] = 'C Programming Language'
  end
end

@ -2,7 +2,6 @@ module Docs
class Cpp < Cppref class Cpp < Cppref
self.name = 'C++' self.name = 'C++'
self.slug = 'cpp' self.slug = 'cpp'
self.type = 'c'
self.base_url = 'https://en.cppreference.com/w/cpp/' self.base_url = 'https://en.cppreference.com/w/cpp/'
html_filters.insert_before 'cppref/clean_html', 'cpp/entries' html_filters.insert_before 'cppref/clean_html', 'cpp/entries'
@ -16,13 +15,5 @@ module Docs
regex/regex_token_iterator/operator_cmp.html regex/regex_token_iterator/operator_cmp.html
) )
# Same as get_latest_version in lib/docs/scrapers/c.rb
def get_latest_version(opts)
doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
link = doc.at_css('a[title^="File:"]')
date = link.content.scan(/(\d+)\./)[0][0]
DateTime.strptime(date, '%Y%m%d').to_time.to_i
end
end end
end end

@ -6,7 +6,7 @@ module Docs
html_filters.insert_before 'clean_html', 'cppref/fix_code' html_filters.insert_before 'clean_html', 'cppref/fix_code'
html_filters.push 'cppref/clean_html', 'title' html_filters.push 'cppref/clean_html', 'title'
# 'cpp20/entries',
options[:decode_and_clean_paths] = true options[:decode_and_clean_paths] = true
options[:container] = '#content' options[:container] = '#content'
options[:title] = false options[:title] = false
@ -21,9 +21,13 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0. Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
HTML HTML
# def get_latest_version # Check if the 'headers' page has changed
def get_latest_version(opts)
# end doc = fetch_doc(self.base_url + self.root_path, opts)
date = doc.at_css('#footer-info-lastmod').content
date = date.match(/[[:digit:]]{1,2} .* [[:digit:]]{4}/).to_s
date = DateTime.strptime(date, '%e %B %Y').to_time.to_i
end
end end
end end

Loading…
Cancel
Save