Initial migration from cpp filescraper to urlscraper

4 years ago · a754d50317
parent b0ab319b98
commit a754d50317
3 changed files with 131 additions and 0 deletions
--- a/lib/docs/filters/cpp20/clean_html.rb
+++ b/lib/docs/filters/cpp20/clean_html.rb
@@ -0,0 +1,9 @@
+module Docs
+  class Cpp20
+    # Placeholder HTML-cleaning filter for the C++20 docs: it performs no
+    # clean-up at all.
+    # NOTE(review): the Cpp20 scraper added in this same commit pushes
+    # 'c/clean_html' onto its filter stack — confirm whether this filter is
+    # actually meant to be part of the pipeline.
+    class CleanHtmlFilter < Filter
+      # Returns the document unchanged.
+      def call
+        doc
+      end
+    end
+  end
+end
--- a/lib/docs/filters/cpp20/entries.rb
+++ b/lib/docs/filters/cpp20/entries.rb
@@ -0,0 +1,82 @@
+module Docs
+  class Cpp20
+    class EntriesFilter < Docs::EntriesFilter
+      @@duplicate_names = []
+
+      REPLACE_NAMES = {
+        'Error directive' => '#error directive',
+        'Filename and line information' => '#line directive',
+        'Implementation defined behavior control' => '#pragma directive',
+        'Replacing text macros' => '#define directive',
+        'Source file inclusion' => '#include directive' }
+
+      def get_name
+        name = at_css('#firstHeading').content.strip
+        name = format_name(name)
+        name = name.split(',').first
+        name
+      end
+
+      def get_type
+        if at_css('#firstHeading').content.include?('C++ keyword')
+          'Keywords'
+        elsif subpath.start_with?('experimental')
+          'Experimental libraries'
+        elsif subpath.start_with?('language/')
+          'Language'
+        elsif subpath.start_with?('freestanding')
+          'Utilities'
+        elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
+          type.strip!
+          type.remove! ' library'
+          type.remove! ' utilities'
+          type.remove! 'C++ '
+          type.capitalize!
+          type
+        end
+      end
+
+      def additional_entries
+        return [] if root_page? || self.name.start_with?('operators')
+        names = at_css('#firstHeading').content.remove(%r{\(.+?\)}).split(', ')[1..-1]
+        names.each(&:strip!).reject! do |name|
+          name.size <= 2 || name == '...' || name =~ /\A[<>]/ || name.start_with?('operator')
+        end
+        names.map { |name| [format_name(name)] }
+      end
+
+      def format_name(name)
+        name.remove! 'C++ concepts: '
+        name.remove! 'C++ keywords: '
+        name.remove! 'C++ ' unless name == 'C++ language'
+        name.remove! %r{\s\(.+\)}
+
+        name.sub! %r{\AStandard library header <(.+)>\z}, '\1'
+        name.sub! %r{(<[^>]+>)}, ''
+
+        if name.include?('operator') && name.include?(',')
+          name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators')
+          name.sub! '  ', ' '
+          name << ')' unless name.last == ')' || name.exclude?('(')
+          name.sub! '()', ''
+          name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50
+        end
+
+        REPLACE_NAMES[name] || name
+      end
+
+      def entries
+        entries = []
+
+        # avoid duplicate pages
+        if !(@@duplicate_names.include?(name))
+          @@duplicate_names.push(name)
+          entries << default_entry if root_page? || include_default_entry?
+          entries.concat(additional_entries)
+          build_entries(entries)
+        end
+      end
+
+    end
+  end
+end
--- a/lib/docs/scrapers/cpp20.rb
+++ b/lib/docs/scrapers/cpp20.rb
@@ -0,0 +1,40 @@
+module Docs
+  class Cpp20 < UrlScraper
+    self.name = 'C++20'
+    self.slug = 'cpp20'
+    self.type = 'c'
+    self.base_url = 'https://en.cppreference.com/w/cpp/'
+    self.root_path = 'header'
+
+    html_filters.insert_before 'clean_html', 'c/fix_code'
+    html_filters.push 'cpp20/entries', 'c/clean_html', 'title'
+
+    options[:decode_and_clean_paths] = true
+    options[:container] = '#content'
+    options[:title] = false
+    options[:root_title] = 'C++ Programming Language'
+
+    options[:skip] = %w(
+      language/extending_std.html
+      language/history.html
+      regex/ecmascript.html
+      regex/regex_token_iterator/operator_cmp.html
+    )
+
+    options[:skip_patterns] = [/experimental/]
+
+    options[:attribution] = <<-HTML
+      &copy; cppreference.com<br>
+      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
+    HTML
+
+    # Same as get_latest_version in lib/docs/scrapers/c.rb
+    def get_latest_version(opts)
+      doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
+      link = doc.at_css('a[title^="File:"]')
+      date = link.content.scan(/(\d+)\./)[0][0]
+      DateTime.strptime(date, '%Y%m%d').to_time.to_i
+    end
+
+  end
+end