Improve HTTP scraper

pull/170/head
Thibaut 10 years ago
parent 92d3fd5d0f
commit 1d5b7e3aaf

Binary file not shown.

Before

Width:  |  Height:  |  Size: 28 KiB

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 74 KiB

After

Width:  |  Height:  |  Size: 74 KiB

@ -1,6 +1,7 @@
[
[
"2015-02-22",
"Improved <a href=\"/http/\">HTTP</a> documentation",
"New <a href=\"/minitest/\">Minitest</a> documentation"
], [
"2015-02-16",

@ -160,11 +160,6 @@ credits = [
'The University of Glasgow',
'BSD',
'http://www.haskell.org/ghc/license'
], [
'HTTP',
'1999 The Internet Society',
'Custom',
'http://www.w3.org/Protocols/rfc2616/rfc2616-sec21.html#sec21'
], [
'io.js',
'io.js contributors',

@ -1,6 +1,15 @@
._rfc {
padding-left: 1rem;
._rfc-pre {
font-size: .8125rem;
min-width: 38rem;
@extend %code;
> h1, > h2 { margin-left: -1rem; }
> h2 { @extend %block-heading; }
> h3 { @extend %block-label, %label-blue; }
> h4 { @extend %block-label; }
> h3, > h4 { font-size: .875rem; }
> h1, > h2, > h3, > h4, > h5 {
margin: 0;
font-family: $baseFont;
}
}

@ -2,47 +2,39 @@ module Docs
class Http
class CleanHtmlFilter < Filter
def call
root_page? ? root : other
doc
end
def root
# Change title
title = at_css 'h2'
title.name = 'h1'
title.inner_html = 'Hypertext Transfer Protocol &mdash; HTTP/1.1'
# Remove "..." following each link
css('span').each do |node|
node.inner_html = node.first_element_child if node.first_element_child
if root_page?
doc.inner_html = '<h1>Hypertext Transfer Protocol</h1>'
return doc
end
end
def other
at_css('address').remove
doc.child.remove while doc.child.name != 'pre'
css('span.grey', '.invisible', '.noprint', 'a[href^="#page-"]').remove
# Change title
title = at_css 'h2'
title.name = 'h1'
title.at_css('a').remove
title.content = "HTTP #{title.content}"
css('pre').each do |node|
content = node.inner_html.remove(/\A(\ *\n)+/).remove(/(\n\ *)+\z/)
node.before("\n\n" + content).remove
end
# Update headings
css('h3').each do |node|
link = node.at_css('a')
node.name = "h#{link.content.count('.') + 1}"
node['id'] = link['id']
link.remove
css('span[class^="h"]').each do |node|
i = node['class'][/\Ah(\d)/, 1].to_i
next unless i > 0
node.name = "h#{i}"
node.inner_html = node.inner_html.strip
node.next.content = node.next.content.remove(/\A\n/) if node.next.text?
end
# Merge adjacent <pre> tags and remove indentation
css('pre').each do |node|
while (sibling = node.next_element) && sibling.name == 'pre'
node.inner_html += "\n#{sibling.inner_html}"
sibling.remove
end
node.inner_html = node.inner_html.strip_heredoc
css('.selflink').each do |node|
node.parent['id'] = node['name']
node.before(node.children).remove
end
html = doc.inner_html.strip
html.remove! %r[\.{2,}$]
html.gsub! %r[(^\n$){3,}], "\n"
doc.inner_html = %(<div class="_rfc-pre">#{html}</div>)
doc
end
end
end

@ -1,19 +1,91 @@
module Docs
class Http
class EntriesFilter < Docs::EntriesFilter
def get_name
name = at_css('h1').content
name.remove! %r{\A.+\:}
name.remove! %r{\A.+\-\-}
"#{rfc}: #{name.strip}"
end
def get_type
at_css('h1').content.sub(/\A\s*HTTP\s+(.+)\s+Definitions\s*\z/, '\1').pluralize
'RFC'
end
def include_default_entry?
false
# Builds a display label from the page slug by replacing the first "rfc"
# with "RFC " (so a slug of 'rfc2616' would yield 'RFC 2616').
def rfc
slug.sub('rfc', 'RFC ')
end
# Lists of top-level section numbers, keyed by page slug. Each value holds
# three arrays that additional_entries looks up by position:
#   [0] sections whose own top-level heading ("N") is added as an entry
#   [1] sections whose second-level subsections ("N.M") are added as entries
#   [2] sections whose third-level subsections ("N.M.K") are added as entries
# In all three cases the integers are compared with the leading (top-level)
# number of the section being examined.
SECTIONS = {
'rfc2616' => [
[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15],
[14],
[]
],
'rfc7230' => [
(2..9).to_a,
[],
[]
],
'rfc7231' => [
[3, 8, 9],
[],
[4, 5, 6, 7]
],
'rfc7232' => [
[5, 6, 7, 8],
[2, 3, 4],
[]
],
'rfc7233' => [
[5, 6],
[2, 3, 4],
[]
],
'rfc7234' => [
[3, 6, 7, 8],
[4, 5],
[]
],
'rfc7235' => [
[2, 5, 6],
[3, 4],
[]
]
}
# Section-number patterns, each anchored to the whole string. Group 1 always
# captures the leading (top-level) section number.
# Matches a bare number such as "14".
LEVEL_1 = /\A(\d+)\z/
# Matches a two-part number such as "14.1".
LEVEL_2 = /\A(\d+)\.\d+\z/
# Matches a three-part number such as "14.1.2".
LEVEL_3 = /\A(\d+)\.\d+\.\d+\z/
def additional_entries
return [] if root_page?
type = nil
css('a[href^="#section-"]').each_with_object([]) do |node, entries|
id = node['href'].remove('#')
break entries if entries.any? { |e| e[1] == id }
content = node.next.content.strip
content.remove! %r{\s*\.+\d*\z}
content.remove! %r{\A[\.\s]+}
name = "#{content} (#{rfc})"
number = node.content.strip
if number =~ LEVEL_1
if SECTIONS[slug][0].include?($1.to_i)
entries << [name, id, self.name]
end
css(type == 'Status Codes' ? 'h3' : 'h2').map do |node|
[node.content, node['id']]
type = content.sub(/\ Definitions\z/, 's')
type = 'Request Header Fields' if type.include?('Header Fields') && type.exclude?('Response')
type = 'Response Status Codes' if type.include?('Status Codes')
type = self.name unless type.start_with?('Request ') || type.start_with?('Response ')
elsif (number =~ LEVEL_2 && SECTIONS[slug][1].include?($1.to_i)) ||
(number =~ LEVEL_3 && SECTIONS[slug][2].include?($1.to_i))
entries << [name, id, (name =~ /\A\d\d\d/ ? 'Response Status Codes' : type )]
end
end
end
end

@ -2,13 +2,15 @@ module Docs
class Http < UrlScraper
self.name = 'HTTP'
self.type = 'rfc'
self.base_url = 'http://www.w3.org/Protocols/rfc2616/'
self.root_path = 'rfc2616.html'
self.base_url = 'https://tools.ietf.org/html/'
self.initial_paths = %w(rfc2616 rfc7230 rfc7231
rfc7232 rfc7233 rfc7234 rfc7235)
html_filters.push 'http/clean_html', 'http/entries'
options[:only] = %w(rfc2616-sec10.html rfc2616-sec14.html)
options[:container] = ->(filter) { '.toc' if filter.root_page? }
options[:attribution] = "&copy; 1999 The Internet Society"
options[:skip_links] = true
options[:attribution] = <<-HTML
&copy; document authors. All rights reserved.
HTML
end
end

Binary file not shown.

Before

Width:  |  Height:  |  Size: 737 B

After

Width:  |  Height:  |  Size: 582 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

After

Width:  |  Height:  |  Size: 1.1 KiB

Loading…
Cancel
Save