Improve HTTP scraper

10 years ago · 1d5b7e3aaf
parent 92d3fd5d0f
commit 1d5b7e3aaf
10 changed files with 123 additions and 52 deletions
--- a/assets/images/icons.png
+++ b/assets/images/icons.png
--- a/assets/images/icons@2x.png
+++ b/assets/images/icons@2x.png
--- a/assets/javascripts/news.json
+++ b/assets/javascripts/news.json
@@ -1,6 +1,7 @@
 [
  [
    "2015-02-22",
+    "Improved <a href=\"/http/\">HTTP</a> documentation",
    "New <a href=\"/minitest/\">Minitest</a> documentation"
  ], [
    "2015-02-16",
--- a/assets/javascripts/templates/pages/about_tmpl.coffee
+++ b/assets/javascripts/templates/pages/about_tmpl.coffee
@@ -160,11 +160,6 @@ credits = [
    'The University of Glasgow',
    'BSD',
    'http://www.haskell.org/ghc/license'
-  ], [
-    'HTTP',
-    '1999 The Internet Society',
-    'Custom',
-    'http://www.w3.org/Protocols/rfc2616/rfc2616-sec21.html#sec21'
  ], [
    'io.js',
    'io.js contributors',
--- a/assets/stylesheets/pages/_rfc.scss
+++ b/assets/stylesheets/pages/_rfc.scss
@@ -1,6 +1,15 @@
-._rfc {
-  padding-left: 1rem;
+._rfc-pre {
+  font-size: .8125rem;
+  min-width: 38rem;
+  @extend %code;

-  > h1, > h2 { margin-left: -1rem; }
  > h2 { @extend %block-heading; }
+  > h3 { @extend %block-label, %label-blue; }
+  > h4 { @extend %block-label; }
+  > h3, > h4 { font-size: .875rem; }
+
+  > h1, > h2, > h3, > h4, > h5 {
+    margin: 0;
+    font-family: $baseFont;
+  }
 }
--- a/lib/docs/filters/http/clean_html.rb
+++ b/lib/docs/filters/http/clean_html.rb
@@ -2,47 +2,39 @@ module Docs
  class Http
    class CleanHtmlFilter < Filter
      def call
-        root_page? ? root : other
-        doc
-      end
-
-      def root
-        # Change title
-        title = at_css 'h2'
-        title.name = 'h1'
-        title.inner_html = 'Hypertext Transfer Protocol &mdash; HTTP/1.1'
-
-        # Remove "..." following each link
-        css('span').each do |node|
-          node.inner_html = node.first_element_child if node.first_element_child
+        if root_page?
+          doc.inner_html = '<h1>Hypertext Transfer Protocol</h1>'
+          return doc
        end
-      end

-      def other
-        at_css('address').remove
+        doc.child.remove while doc.child.name != 'pre'
+
+        css('span.grey', '.invisible', '.noprint', 'a[href^="#page-"]').remove

-        # Change title
-        title = at_css 'h2'
-        title.name = 'h1'
-        title.at_css('a').remove
-        title.content = "HTTP #{title.content}"
+        css('pre').each do |node|
+          content = node.inner_html.remove(/\A(\ *\n)+/).remove(/(\n\ *)+\z/)
+          node.before("\n\n" + content).remove
+        end

-        # Update headings
-        css('h3').each do |node|
-          link = node.at_css('a')
-          node.name = "h#{link.content.count('.') + 1}"
-          node['id'] = link['id']
-          link.remove
+        css('span[class^="h"]').each do |node|
+          i = node['class'][/\Ah(\d)/, 1].to_i
+          next unless i > 0
+          node.name = "h#{i}"
+          node.inner_html = node.inner_html.strip
+          node.next.content = node.next.content.remove(/\A\n/) if node.next.text?
        end

-        # Merge adjacent <pre> tags and remove indentation
-        css('pre').each do |node|
-          while (sibling = node.next_element) && sibling.name == 'pre'
-            node.inner_html += "\n#{sibling.inner_html}"
-            sibling.remove
-          end
-          node.inner_html = node.inner_html.strip_heredoc
+        css('.selflink').each do |node|
+          node.parent['id'] = node['name']
+          node.before(node.children).remove
        end
+
+        html = doc.inner_html.strip
+        html.remove! %r[\.{2,}$]
+        html.gsub! %r[(^\n$){3,}], "\n"
+        doc.inner_html = %(<div class="_rfc-pre">#{html}</div>)
+
+        doc
      end
    end
  end
--- a/lib/docs/filters/http/entries.rb
+++ b/lib/docs/filters/http/entries.rb
@@ -1,19 +1,91 @@
 module Docs
  class Http
    class EntriesFilter < Docs::EntriesFilter
+      def get_name
+        name = at_css('h1').content
+        name.remove! %r{\A.+\:}
+        name.remove! %r{\A.+\-\-}
+        "#{rfc}: #{name.strip}"
+      end
+
      def get_type
-        at_css('h1').content.sub(/\A\s*HTTP\s+(.+)\s+Definitions\s*\z/, '\1').pluralize
+        'RFC'
      end

-      def include_default_entry?
-        false
+      def rfc
+        slug.sub('rfc', 'RFC ')
      end

+      SECTIONS = {
+        'rfc2616' => [
+          [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15],
+          [14],
+          []
+        ],
+        'rfc7230' => [
+          (2..9).to_a,
+          [],
+          []
+        ],
+        'rfc7231' => [
+          [3, 8, 9],
+          [],
+          [4, 5, 6, 7]
+        ],
+        'rfc7232' => [
+          [5, 6, 7, 8],
+          [2, 3, 4],
+          []
+        ],
+        'rfc7233' => [
+          [5, 6],
+          [2, 3, 4],
+          []
+        ],
+        'rfc7234' => [
+          [3, 6, 7, 8],
+          [4, 5],
+          []
+        ],
+        'rfc7235' => [
+          [2, 5, 6],
+          [3, 4],
+          []
+        ]
+      }
+
+      LEVEL_1 = /\A(\d+)\z/
+      LEVEL_2 = /\A(\d+)\.\d+\z/
+      LEVEL_3 = /\A(\d+)\.\d+\.\d+\z/
+
      def additional_entries
        return [] if root_page?
+        type = nil
+
+        css('a[href^="#section-"]').each_with_object([]) do |node, entries|
+          id = node['href'].remove('#')
+          break entries if entries.any? { |e| e[1] == id }
+
+          content = node.next.content.strip
+          content.remove! %r{\s*\.+\d*\z}
+          content.remove! %r{\A[\.\s]+}
+
+          name = "#{content} (#{rfc})"
+          number = node.content.strip
+
+          if number =~ LEVEL_1
+            if SECTIONS[slug][0].include?($1.to_i)
+              entries << [name, id, self.name]
+            end

-        css(type == 'Status Codes' ? 'h3' : 'h2').map do |node|
-          [node.content, node['id']]
+            type = content.sub(/\ Definitions\z/, 's')
+            type = 'Request Header Fields' if type.include?('Header Fields') && type.exclude?('Response')
+            type = 'Response Status Codes' if type.include?('Status Codes')
+            type = self.name unless type.start_with?('Request ') || type.start_with?('Response ')
+          elsif (number =~ LEVEL_2 && SECTIONS[slug][1].include?($1.to_i)) ||
+                (number =~ LEVEL_3 && SECTIONS[slug][2].include?($1.to_i))
+            entries << [name, id, (name =~ /\A\d\d\d/ ? 'Response Status Codes' : type )]
+          end
        end
      end
    end
--- a/lib/docs/scrapers/http.rb
+++ b/lib/docs/scrapers/http.rb
@@ -2,13 +2,15 @@ module Docs
  class Http < UrlScraper
    self.name = 'HTTP'
    self.type = 'rfc'
-    self.base_url = 'http://www.w3.org/Protocols/rfc2616/'
-    self.root_path = 'rfc2616.html'
+    self.base_url = 'https://tools.ietf.org/html/'
+    self.initial_paths = %w(rfc2616 rfc7230 rfc7231
+      rfc7232 rfc7233 rfc7234 rfc7235)

    html_filters.push 'http/clean_html', 'http/entries'

-    options[:only] = %w(rfc2616-sec10.html rfc2616-sec14.html)
-    options[:container] = ->(filter) { '.toc' if filter.root_page? }
-    options[:attribution] = "&copy; 1999 The Internet Society"
+    options[:skip_links] = true
+    options[:attribution] = <<-HTML
+      &copy; document authors. All rights reserved.
+    HTML
  end
 end
--- a/public/icons/docs/http/16.png
+++ b/public/icons/docs/http/16.png
--- a/public/icons/docs/http/16@2x.png
+++ b/public/icons/docs/http/16@2x.png