From a754d50317947e5bdb85434eaf4521a708ae5bb9 Mon Sep 17 00:00:00 2001
From: Enoc
Date: Wed, 1 Sep 2021 01:19:08 -0600
Subject: [PATCH] Initial migration from cpp filescraper to urlscraper

---
 lib/docs/filters/cpp20/clean_html.rb |  12 +++
 lib/docs/filters/cpp20/entries.rb    | 104 ++++++++++++++++++++++++++++
 lib/docs/scrapers/cpp20.rb           |  47 +++++++++++++
 3 files changed, 163 insertions(+)
 create mode 100644 lib/docs/filters/cpp20/clean_html.rb
 create mode 100644 lib/docs/filters/cpp20/entries.rb
 create mode 100644 lib/docs/scrapers/cpp20.rb

diff --git a/lib/docs/filters/cpp20/clean_html.rb b/lib/docs/filters/cpp20/clean_html.rb
new file mode 100644
index 00000000..4328e002
--- /dev/null
+++ b/lib/docs/filters/cpp20/clean_html.rb
@@ -0,0 +1,12 @@
+module Docs
+  class Cpp20
+    # Pass-through HTML filter for the C++20 docs: hands the document back
+    # without modifying it.
+    class CleanHtmlFilter < Filter
+      # Returns +doc+ unchanged.
+      def call
+        doc
+      end
+    end
+  end
+end
diff --git a/lib/docs/filters/cpp20/entries.rb b/lib/docs/filters/cpp20/entries.rb
new file mode 100644
index 00000000..8a5aad22
--- /dev/null
+++ b/lib/docs/filters/cpp20/entries.rb
@@ -0,0 +1,104 @@
+module Docs
+  class Cpp20
+    # Builds the index entries (names and types) for cppreference.com C++ pages
+    # from each page's #firstHeading text and its subpath.
+    class EntriesFilter < Docs::EntriesFilter
+      # Names already handled by #entries. As a class variable the list is
+      # shared by every instance of this filter and is never reset in this file.
+      @@duplicate_names = []
+
+      # Page headings that are replaced by a directive-based entry name.
+      REPLACE_NAMES = {
+        'Error directive' => '#error directive',
+        'Filename and line information' => '#line directive',
+        'Implementation defined behavior control' => '#pragma directive',
+        'Replacing text macros' => '#define directive',
+        'Source file inclusion' => '#include directive'
+      }.freeze
+
+      # Name of the page's main entry: the formatted heading, cut at the first
+      # comma. Names after the comma are handled by #additional_entries.
+      def get_name
+        format_name(heading_text.strip).split(',').first
+      end
+
+      # Type the page is filed under. Keyword pages and a few subpaths get a
+      # fixed type; otherwise the navbar label is used with " library",
+      # " utilities" and "C++ " removed. Returns nil when no rule matches.
+      def get_type
+        if heading_text.include?('C++ keyword')
+          'Keywords'
+        elsif subpath.start_with?('experimental')
+          'Experimental libraries'
+        elsif subpath.start_with?('language/')
+          'Language'
+        elsif subpath.start_with?('freestanding')
+          'Utilities'
+        elsif (label = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content))
+          label.strip.remove(' library').remove(' utilities').remove('C++ ').capitalize
+        end
+      end
+
+      # One extra entry for each further name of a comma-separated heading
+      # (parenthesised parts are removed first). Names of at most two
+      # characters, "...", and names starting with "<", ">" or "operator" are
+      # skipped. Root and "operators..." pages get no extra entries.
+      def additional_entries
+        return [] if root_page? || name.start_with?('operators')
+
+        # drop(1) yields [] for an empty heading, where [1..-1] would yield nil.
+        extra_names = heading_text.remove(%r{\(.+?\)}).split(', ').drop(1).map(&:strip)
+        extra_names.reject! do |extra|
+          extra.size <= 2 || extra == '...' || extra.start_with?('<', '>', 'operator')
+        end
+        extra_names.map { |extra| [format_name(extra)] }
+      end
+
+      # Turns a heading into an entry name: removes the "C++ ..." prefixes and
+      # whitespace-preceded parenthesised parts, reduces "Standard library
+      # header <x>" to "x", removes the first remaining <...> group, condenses
+      # comma-separated operator lists into an "operators" name and finally
+      # applies REPLACE_NAMES. Works on a copy, so the argument is not mutated.
+      def format_name(name)
+        name = name.dup
+        name.remove! 'C++ concepts: '
+        name.remove! 'C++ keywords: '
+        name.remove! 'C++ ' unless name == 'C++ language'
+        name.remove! %r{\s\(.+\)}
+
+        name.sub! %r{\AStandard library header <(.+)>\z}, '\1'
+        name.sub! %r{(<[^>]+>)}, ''
+
+        if name.include?('operator') && name.include?(',')
+          name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators')
+          # Collapse the first doubled space into a single one.
+          name.sub!(/ {2}/, ' ')
+          name << ')' unless name.end_with?(')') || name.exclude?('(')
+          name.sub! '()', ''
+          name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50
+        end
+
+        REPLACE_NAMES[name] || name
+      end
+
+      # Entries for the current page. A page whose name was already handled
+      # yields an empty array, which avoids duplicate pages.
+      def entries
+        return [] if @@duplicate_names.include?(name)
+
+        @@duplicate_names.push(name)
+        page_entries = []
+        page_entries << default_entry if root_page? || include_default_entry?
+        page_entries.concat(additional_entries)
+        build_entries(page_entries)
+      end
+
+      private
+
+      # Raw text of the page's #firstHeading element.
+      def heading_text
+        at_css('#firstHeading').content
+      end
+    end
+  end
+end
diff --git a/lib/docs/scrapers/cpp20.rb b/lib/docs/scrapers/cpp20.rb
new file mode 100644
index 00000000..14034a43
--- /dev/null
+++ b/lib/docs/scrapers/cpp20.rb
@@ -0,0 +1,47 @@
+module Docs
+  # URL-based scraper for the C++ reference pages under
+  # https://en.cppreference.com/w/cpp/.
+  class Cpp20 < UrlScraper
+    self.name = 'C++20'
+    self.slug = 'cpp20'
+    self.type = 'c'
+    self.base_url = 'https://en.cppreference.com/w/cpp/'
+    self.root_path = 'header'
+
+    # The fix_code and clean_html filters pushed here come from the c/
+    # namespace; only the entries filter is specific to this scraper.
+    html_filters.insert_before 'clean_html', 'c/fix_code'
+    html_filters.push 'cpp20/entries', 'c/clean_html', 'title'
+
+    options[:decode_and_clean_paths] = true
+    options[:container] = '#content'
+    options[:title] = false
+    options[:root_title] = 'C++ Programming Language'
+
+    options[:skip] = %w(
+      language/extending_std.html
+      language/history.html
+      regex/ecmascript.html
+      regex/regex_token_iterator/operator_cmp.html
+    )
+
+    options[:skip_patterns] = [/experimental/]
+
+    options[:attribution] = <<-HTML
+      © cppreference.com
+      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
+    HTML
+
+    # Same as get_latest_version in lib/docs/scrapers/c.rb
+    #
+    # Fetches the cppreference archives page, takes the first link whose title
+    # starts with "File:", reads the first run of digits followed by a "." in
+    # its text as a %Y%m%d date and returns that date as a Unix timestamp.
+    def get_latest_version(opts)
+      doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
+      link = doc.at_css('a[title^="File:"]')
+      date = link.content.scan(/(\d+)\./)[0][0]
+      DateTime.strptime(date, '%Y%m%d').to_time.to_i
+    end
+  end
+end