Initial migration of the C++ docs from FileScraper to UrlScraper

pull/1606/head
Enoc 3 years ago
parent b0ab319b98
commit a754d50317

@ -0,0 +1,9 @@
module Docs
  class Cpp20
    # HTML clean-up filter for the C++20 documentation.
    # Currently a pass-through: it performs no clean-up of its own.
    class CleanHtmlFilter < Filter
      # @return the document (`doc`) exactly as received, unmodified
      def call
        doc
      end
    end
  end
end

@ -0,0 +1,82 @@
module Docs
  class Cpp20
    # Derives index entry names and types for a page from its heading
    # (`#firstHeading`), its subpath and the `.t-navbar` element.
    class EntriesFilter < Docs::EntriesFilter
      # Names for which `entries` has already produced a result. Being a class
      # variable, the list is shared by every instance of this filter.
      @@duplicate_names = []

      # Page headings that are replaced by the preprocessor directive they
      # describe (looked up at the end of `format_name`).
      REPLACE_NAMES = {
        'Error directive' => '#error directive',
        'Filename and line information' => '#line directive',
        'Implementation defined behavior control' => '#pragma directive',
        'Replacing text macros' => '#define directive',
        'Source file inclusion' => '#include directive'
      }.freeze

      # Entry name of the page: the formatted heading, cut at the first comma.
      #
      # @return [String]
      def get_name
        heading = at_css('#firstHeading').content.strip
        format_name(heading).split(',').first
      end

      # Entry type of the page, chosen from the heading, then the subpath and
      # finally the text of a navbar element.
      #
      # @return [String, nil] nil when none of the rules match (no navbar text)
      def get_type
        if at_css('#firstHeading').content.include?('C++ keyword')
          'Keywords'
        elsif subpath.start_with?('experimental')
          'Experimental libraries'
        elsif subpath.start_with?('language/')
          'Language'
        elsif subpath.start_with?('freestanding')
          'Utilities'
        elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
          # Reduce the navbar text to a bare, capitalized category name.
          type.strip!
          type.remove! ' library'
          type.remove! ' utilities'
          type.remove! 'C++ '
          type.capitalize!
          type
        end
      end

      # Extra entries for headings that list several comma-separated names;
      # every name after the first one becomes its own entry.
      #
      # @return [Array<Array<String>>] one single-element array per extra name
      def additional_entries
        return [] if root_page? || self.name.start_with?('operators')

        heading = at_css('#firstHeading').content.remove(%r{\(.+?\)})
        # drop(1) always yields an Array; [1..-1] would be nil for an empty heading.
        names = heading.split(', ').drop(1).map(&:strip)
        names.reject! do |candidate|
          candidate.size <= 2 || candidate == '...' || candidate.match?(/\A[<>]/) || candidate.start_with?('operator')
        end
        names.map { |candidate| [format_name(candidate)] }
      end

      # Normalizes a raw heading into an entry name. The argument is modified
      # in place.
      #
      # @param name [String] heading text (mutated)
      # @return [String] the REPLACE_NAMES override when one exists, otherwise
      #   the normalized name
      def format_name(name)
        name.remove! 'C++ concepts: '
        name.remove! 'C++ keywords: '
        name.remove! 'C++ ' unless name == 'C++ language'
        name.remove! %r{\s\(.+\)}
        # "Standard library header <foo>" becomes just "foo".
        name.sub! %r{\AStandard library header <(.+)>\z}, '\1'
        # Drop the first remaining angle-bracketed segment.
        name.sub! %r{(<[^>]+>)}, ''

        # Headings listing several operators are collapsed into "operators (...)".
        if name.include?('operator') && name.include?(',')
          name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators')
          # NOTE(review): as written this swaps one space for one space (a no-op);
          # presumably it was meant to collapse a doubled space - confirm.
          name.sub! ' ', ' '
          # Close a parenthesis opened by the substitution above.
          name << ')' unless name.last == ')' || name.exclude?('(')
          name.sub! '()', ''
          name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50
        end

        REPLACE_NAMES[name] || name
      end

      # Builds the entries for the current page, skipping any page whose name
      # has already been seen.
      #
      # @return [Array] the built entries; an empty array (rather than nil)
      #   for a duplicate page, so the return type is always an Array
      def entries
        # avoid duplicate pages
        return [] if @@duplicate_names.include?(name)

        @@duplicate_names.push(name)
        entries = []
        entries << default_entry if root_page? || include_default_entry?
        entries.concat(additional_entries)
        build_entries(entries)
      end
    end
  end
end

@ -0,0 +1,40 @@
module Docs
  # Scraper configuration for the C++ reference hosted on en.cppreference.com.
  class Cpp20 < UrlScraper
    self.name = 'C++20'
    self.slug = 'cpp20'
    self.type = 'c'
    self.base_url = 'https://en.cppreference.com/w/cpp/'
    self.root_path = 'header'

    # Filter pipeline: 'c/fix_code' goes in ahead of 'clean_html'; the
    # entries, clean-up and title filters are appended at the end.
    html_filters.insert_before 'clean_html', 'c/fix_code'
    html_filters.push 'cpp20/entries', 'c/clean_html', 'title'

    options[:decode_and_clean_paths] = true
    options[:container] = '#content'
    options[:title] = false
    options[:root_title] = 'C++ Programming Language'

    # Individual pages that are not scraped.
    options[:skip] = %w[
      language/extending_std.html
      language/history.html
      regex/ecmascript.html
      regex/regex_token_iterator/operator_cmp.html
    ]

    # Any path matching one of these patterns is not scraped.
    options[:skip_patterns] = [/experimental/]

    options[:attribution] = <<~HTML
      &copy; cppreference.com<br>
      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
    HTML

    # Same as get_latest_version in lib/docs/scrapers/c.rb
    #
    # Reads the first "File:" link on the cppreference archives page and
    # parses the digits preceding a dot in its text as a %Y%m%d date.
    #
    # @param opts options forwarded to fetch_doc
    # @return [Integer] the parsed date as a Unix timestamp
    def get_latest_version(opts)
      archives = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
      file_link = archives.at_css('a[title^="File:"]')
      stamp = file_link.content.scan(/(\d+)\./).first.first
      DateTime.strptime(stamp, '%Y%m%d').to_time.to_i
    end
  end
end
Loading…
Cancel
Save