Migrate C scraper from FileScraper to UrlScraper

pull/1606/head
Enoc 3 years ago
parent e17bc84ea4
commit fdfcf3d917

@@ -39,7 +39,7 @@
'pages/async',
'pages/bash',
'pages/bootstrap',
'pages/c',
'pages/cppref',
'pages/cakephp',
'pages/clojure',
'pages/codeception',

@@ -1,116 +0,0 @@
module Docs
  class C
    class CleanHtmlFilter < Filter
      def call
        css('h1').remove if root_page?

        css('.t-dcl-rev-aux td[rowspan]').each do |node|
          rowspan = node['rowspan'].to_i
          node['rowspan'] = node.ancestors('tbody').css('tr').length if rowspan > 3
        end

        css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc',
            '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink',
            '.t-sdsc-sep:first-child:last-child', '.t-example-live-link',
            '.t-dcl-rev-num > .t-dcl-rev-aux ~ tr:not(.t-dcl-rev-aux) > td:nth-child(2)').remove

        css('#bodyContent', '.mw-content-ltr', 'span[style]', 'div[class^="t-ref"]', '.t-image',
            'th > div', 'td > div', '.t-dsc-see', '.mainpagediv', 'code > b', 'tbody').each do |node|
          node.before(node.children).remove
        end

        css('div > ul').each do |node|
          node.parent.before(node.parent.children).remove
        end

        css('dl > dd:first-child:last-child > ul:first-child:last-child').each do |node|
          dl = node.parent.parent
          if dl.previous_element && dl.previous_element.name == 'ul'
            dl.previous_element << node
            dl.remove
          else
            dl.before(node).remove
          end
        end

        css('dl > dd:first-child:last-child').each do |node|
          node.parent.before(node.children).remove
        end

        css('ul').each do |node|
          while node.next_element && node.next_element.name == 'ul'
            node << node.next_element.children
            node.next_element.remove
          end
        end

        css('h2 > span[id]', 'h3 > span[id]', 'h4 > span[id]', 'h5 > span[id]', 'h6 > span[id]').each do |node|
          node.parent['id'] = node['id']
          node.before(node.children).remove
        end

        css('table[style]', 'th[style]', 'td[style]').remove_attr('style')
        css('table[cellpadding]').remove_attr('cellpadding')

        css('.t-dsc-hitem > td', '.t-dsc-header > td').each do |node|
          node.name = 'th'
          node.content = ' ' if node.content.empty?
        end

        css('tt', 'span > span.source-cpp', 'span.t-c', 'span.t-lc', 'span.t-dsc-see-tt').each do |node|
          node.name = 'code'
          node.remove_attribute('class')
          node.content = node.content unless node.at_css('a')
        end

        css('div > span.source-cpp').each do |node|
          node.name = 'pre'
          node.inner_html = node.inner_html.gsub('<br>', "\n")
          node.content = node.content
        end

        css('div > a > img[alt="About this image"]').each do |node|
          node.parent.parent.remove
        end

        css('area[href]').each do |node|
          node['href'] = node['href'].remove('.html')
        end

        css('p').each do |node|
          while node.next && (node.next.text? || node.next.name == 'a' || node.next.name == 'code')
            node << node.next
          end
          node.inner_html = node.inner_html.strip
          node << '.' if node.content =~ /[a-zA-Z0-9\)]\z/
          node.remove if node.content.blank? && !node.at_css('img')
        end

        css('pre').each do |node|
          node['data-language'] = if node['class'].try(:include?, 'cpp') || node.parent['class'].try(:include?, 'cpp')
            'cpp'
          else
            'c'
          end
          node.remove_attribute('class')
          node.content = node.content.gsub("\t", ' ' * 8)
        end

        css('code code', '.mw-geshi').each do |node|
          node.before(node.children).remove
        end

        css('h1 ~ .fmbox').each do |node|
          node.name = 'div'
          node.content = node.content
        end

        css('img').each do |node|
          node['src'] = node['src'].sub! %r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg'
        end

        doc
      end
    end
  end
end

@@ -22,6 +22,9 @@ module Docs
      end

      def get_type
        return "C keywords" if slug =~ /keyword/
        type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
        type.strip!
        type.remove! ' library'
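The selector in get_type is easier to follow against concrete markup. The following is an illustration only, not part of the commit: a standalone Nokogiri sketch of the same extraction, where the navbar fragment is a simplified, hypothetical stand-in for cppreference's page and String#sub stands in for ActiveSupport's String#remove! used above.

# Illustration only (not part of the commit). Assumes a simplified navbar
# whose fourth child div holds the "<something> library" link.
require 'nokogiri'

html = <<~HTML
  <div class="t-navbar">
    <div>C</div>
    <div>|</div>
    <div>|</div>
    <div><a href="/w/c/string">Strings library</a></div>
  </div>
HTML

doc  = Nokogiri::HTML(html)
type = doc.at_css('.t-navbar > div:nth-child(4) > :first-child').content
type.strip!                      # "Strings library"
type = type.sub(' library', '')  # plain-Ruby stand-in for String#remove!
puts type                        # => Strings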

@@ -1,21 +0,0 @@
module Docs
  class C
    class FixCodeFilter < Filter
      def call
        css('div > span.source-c', 'div > span.source-cpp').each do |node|
          node.inner_html = node.inner_html.gsub(/<br>\n?/, "\n").gsub("\n</p>\n", "</p>\n")
          node.parent.name = 'pre'
          node.parent['class'] = node['class']
          node.parent.content = node.content
        end

        nbsp = Nokogiri::HTML('&nbsp;').text
        css('pre').each do |node|
          node.content = node.content.gsub(nbsp, ' ')
        end

        doc
      end
    end
  end
end

@@ -1,11 +0,0 @@
module Docs
  class C
    class FixUrlsFilter < Filter
      def call
        html.gsub! File.join(C.base_url, C.root_path), C.base_url[0..-2]
        html.gsub! %r{#{C.base_url}([^"']+?)\.html}, "#{C.base_url}\\1"
        html
      end
    end
  end
end
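For reference, the two gsub! calls in this removed filter rewrite internal links as shown in the sketch below. This is an illustration only, run against literal strings, with the base_url and root_path values taken from the removed C scraper further down.

# Illustration only (not part of the commit).
base_url  = 'http://en.cppreference.com/w/c/'   # C.base_url in the removed scraper
root_path = 'header.html'                       # C.root_path in the removed scraper

html = <<~HTML
  <a href="http://en.cppreference.com/w/c/header.html">headers</a>
  <a href="http://en.cppreference.com/w/c/language/functions.html">functions</a>
HTML

html.gsub! File.join(base_url, root_path), base_url[0..-2]   # root page -> bare base URL
html.gsub! %r{#{base_url}([^"']+?)\.html}, "#{base_url}\\1"  # strip .html from other links

puts html
# <a href="http://en.cppreference.com/w/c">headers</a>
# <a href="http://en.cppreference.com/w/c/language/functions">functions</a>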

@@ -1,42 +0,0 @@
module Docs
  class C < FileScraper
    self.type = 'c'
    self.base_url = 'http://en.cppreference.com/w/c/'
    self.root_path = 'header.html'

    html_filters.insert_before 'clean_html', 'c/fix_code'
    html_filters.push 'c/entries', 'c/clean_html', 'title'
    text_filters.push 'c/fix_urls'

    options[:decode_and_clean_paths] = true
    options[:container] = '#content'
    options[:title] = false
    options[:root_title] = 'C Programming Language'
    options[:skip] = %w(language/history.html)
    options[:skip_patterns] = [/experimental/]

    options[:fix_urls] = ->(url) do
      url.sub! %r{\A.+/http%3A/}, 'http://'
      url.sub! 'http://en.cppreference.com/upload.cppreference.com', 'http://upload.cppreference.com'
      url
    end

    options[:attribution] = <<-HTML
      &copy; cppreference.com<br>
      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
    HTML

    def get_latest_version(opts)
      doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
      link = doc.at_css('a[title^="File:"]')
      date = link.content.scan(/(\d+)\./)[0][0]
      DateTime.strptime(date, '%Y%m%d').to_time.to_i
    end

    private

    def file_path_for(*)
      URI.unescape(super)
    end
  end
end

@@ -0,0 +1,12 @@
module Docs
  class C < Cppref
    self.name = 'c'
    self.slug = 'c'
    self.base_url = 'https://en.cppreference.com/w/c/'

    html_filters.insert_before 'cppref/clean_html', 'c/entries'

    options[:root_title] = 'C Programming Language'
  end
end

@@ -2,7 +2,6 @@ module Docs
  class Cpp < Cppref
    self.name = 'C++'
    self.slug = 'cpp'
    self.type = 'c'
    self.base_url = 'https://en.cppreference.com/w/cpp/'

    html_filters.insert_before 'cppref/clean_html', 'cpp/entries'
@@ -16,13 +15,5 @@
      regex/regex_token_iterator/operator_cmp.html
    )

    # Same as get_latest_version in lib/docs/scrapers/c.rb
    def get_latest_version(opts)
      doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
      link = doc.at_css('a[title^="File:"]')
      date = link.content.scan(/(\d+)\./)[0][0]
      DateTime.strptime(date, '%Y%m%d').to_time.to_i
    end
  end
end

@@ -6,7 +6,7 @@ module Docs
    html_filters.insert_before 'clean_html', 'cppref/fix_code'
    html_filters.push 'cppref/clean_html', 'title'
    # 'cpp20/entries',

    options[:decode_and_clean_paths] = true
    options[:container] = '#content'
    options[:title] = false
@@ -21,9 +21,13 @@
      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
    HTML

    # def get_latest_version
    # end
    # Check if the 'headers' page has changed
    def get_latest_version(opts)
      doc = fetch_doc(self.base_url + self.root_path, opts)
      date = doc.at_css('#footer-info-lastmod').content
      date = date.match(/[[:digit:]]{1,2} .* [[:digit:]]{4}/).to_s
      date = DateTime.strptime(date, '%e %B %Y').to_time.to_i
    end
  end
end
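The new get_latest_version in the Cppref base scraper derives the version timestamp from MediaWiki's "last modified" footer rather than the Archives page. Below is a minimal sketch of just the date handling, using a hypothetical footer string; in the scraper the text comes from the #footer-info-lastmod node.

# Illustration only (not part of the commit); the sample footer text is hypothetical.
require 'date'

footer = 'This page was last modified on 31 May 2022, at 12:38.'

date = footer.match(/[[:digit:]]{1,2} .* [[:digit:]]{4}/).to_s
# => "31 May 2022"

DateTime.strptime(date, '%e %B %Y').to_time.to_i
# => 1653955200 (midnight UTC of that date, as a Unix timestamp)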
