Improve Erlang scraper

Closes #432.
pull/438/head
Thibaut Courouble 9 years ago
parent 368eda32d6
commit 2346300cee

@ -2,6 +2,7 @@
@extend %simple;
h3.code { @extend %code; }
code.code { @extend %label; }
.note { @extend %note; }
.warning { @extend %note, %note-red; }
.note .label, .warning .label { font-weight: bold; }

@ -45,7 +45,7 @@ module Docs
node.content = content.capitalize if content == content.upcase
end
css('p > span.bold_code:first-child ~ br:last-child').each do |node|
css('p > .bold_code:first-child ~ br:last-child').each do |node|
parent = node.parent
parent.name = 'h3'
parent['class'] = 'code'
@ -54,15 +54,11 @@ module Docs
parent.inner_html = parent.inner_html.strip
end
css('span.code').each do |node|
node.name = 'code'
end
css('pre *:not(a)').each do |node|
css('pre:not(.REFTYPES) *:not(a)', 'a[href^=javascript]').each do |node|
node.before(node.children).remove
end
css('pre').each do |node|
css('pre:not(.REFTYPES)').each do |node|
node.inner_html = node.inner_html.strip_heredoc
end

@ -3,14 +3,32 @@ module Docs
class EntriesFilter < Docs::EntriesFilter
# Builds the entry name from the text of the page's <h1>.
# Pages whose markup contains the "User's Guide" heading get a 'Guide: '
# prefix; a title that is exactly '1 Introduction' gets the type (minus any
# 'Guide: ' prefix) appended in parentheses so it can be told apart.
def get_name
  title = at_css('h1').content.strip

  if doc.inner_html.include?('<strong>User\'s Guide</strong>')
    title.prepend 'Guide: '
  end

  if title == '1 Introduction'
    title << " (#{type.remove('Guide: ')})"
  end

  title
end
# Derives the entry type from `subpath`, with one branch per documentation area
# ('lib/', 'doc/', 'erts'). Evaluates to nil when `subpath` starts with none of
# these prefixes, because the conditional has no else branch.
def get_type
# NOTE(review): the next three lines repeat the 'lib/' branch below and sit
# outside the conditional; they look like the pre-change side of a diff whose
# +/- markers were lost -- confirm before treating them as live code.
type = subpath[/lib\/(.+?)[\-\/]/, 1]
type << "/#{name}" if type == 'stdlib' && entry_nodes.length >= 10
type
if subpath.start_with?('lib/')
# Capture the text between 'lib/' and the first '-' or '/' that follows it.
type = subpath[/lib\/(.+?)[\-\/]/, 1]
# For 'stdlib' pages with at least 10 entry nodes, append the page name to the type.
type << "/#{name}" if type == 'stdlib' && entry_nodes.length >= 10
type
elsif subpath.start_with?('doc/')
# Capture the first path segment after 'doc/' and turn it into a title.
type = subpath[/doc\/(.+?)\//, 1]
type.capitalize!
# sub! replaces only the first underscore.
type.sub! '_', ' '
type.sub! 'Oam', 'OAM'
# remove! is not core Ruby; presumably ActiveSupport's String#remove! -- confirm.
type.remove! ' Guide'
type.prepend 'Guide: '
type
elsif subpath.start_with?('erts')
type = 'ERTS'
# A name starting with a digit is labelled as a guide; otherwise, when the
# page has any entry nodes, the page name is appended to the type.
if name =~ /\A\d/
type.prepend 'Guide: '
elsif entry_nodes.length > 0
type << "/#{name}"
end
type
end
end
def include_default_entry?
@ -18,17 +36,38 @@ module Docs
end
# Builds the page's extra entries as [name, id] pairs taken from `entry_nodes`.
def additional_entries
# NOTE(review): this first `entry_nodes.map do ... [name, id]` run repeats the
# 'lib/' branch below; it looks like the pre-change body from a diff whose +/-
# markers were lost, which is also why the do/end pairs do not balance as
# displayed -- confirm before treating it as live code.
entry_nodes.map do |node|
id = node['name']
name = id.gsub %r{\-(?<arity>.*)\z}, '/\k<arity>'
name.remove! 'Module:'
name.prepend "#{self.name}:"
[name, id]
# No extra entries unless include_default_entry? is true.
return [] unless include_default_entry?
if subpath.start_with?('lib/')
entry_nodes.map do |node|
# The anchor's `name` attribute is used as the entry id.
id = node['name']
# Rewrite everything from the first '-' to the end of the id as '/<rest>'.
name = id.gsub %r{\-(?<arity>.*)\z}, '/\k<arity>'
# remove! is not core Ruby; presumably ActiveSupport's String#remove! -- confirm.
name.remove! 'Module:'
# Prefix with the page's own name followed by ':'.
name.prepend "#{self.name}:"
[name, id]
end
elsif subpath.start_with?('doc/')
# Pages under 'doc/' contribute no additional entries.
[]
elsif subpath.start_with?('erts')
# Pages whose type starts with 'Guide' contribute no additional entries.
return [] if type.start_with?('Guide')
entry_nodes.map do |node|
# The id is the fragment after '#' in the link's href.
id = node['href'][/#(.+)/, 1]
# The entry name comes from the link text rather than from an attribute.
name = node.content.strip
name.remove! 'Module:'
name.prepend "#{self.name}:"
[name, id]
end
end
end
# Returns the anchor nodes that additional entries are built from, memoized in
# @entry_nodes. For subpaths matching neither prefix the conditional yields nil,
# so nothing is memoized by it.
def entry_nodes
# NOTE(review): this unconditional assignment looks like the pre-change line
# from a diff whose +/- markers were lost; as displayed it would always win
# over the conditional assignment below -- confirm.
@entry_nodes ||= css('div.REFBODY + p > a')
@entry_nodes ||= if subpath.start_with?('lib/')
# Anchors that are direct children of a <p> immediately following div.REFBODY.
css('div.REFBODY + p > a')
elsif subpath.start_with?('erts')
# Find the .flipMenu link whose href equals this page's file name without '.html'.
link = at_css(".flipMenu a[href='#{File.basename(subpath, '.html')}']")
# Two levels up from the link; presumably the list that holds it -- confirm against the markup.
list = link.parent.parent
# If that ancestor's class is exactly 'flipMenu' there are no entries;
# otherwise take every anchor in it except the page's own link.
list['class'] == 'flipMenu' ? [] : list.css('a').to_a.tap { |a| a.delete(link); }
end
end
end
end

@ -2,7 +2,17 @@ module Docs
class Erlang
class PreCleanHtmlFilter < Filter
# Normalizes the raw markup before later filters run: removes some .flipMenu
# links, turns .REFTYPES nodes into <pre>, and turns bold_code/code spans into
# <code> with collapsed whitespace. Returns `doc`.
def call
# NOTE(review): this unconditional removal looks like the pre-change line from
# a diff whose +/- markers were lost; the conditional form follows -- confirm.
css('.flipMenu li[title] > a').remove
# Remove anchors that are direct children of titled .flipMenu items, except on
# 'erts' pages (entry_nodes reads .flipMenu links for those).
css('.flipMenu li[title] > a').remove unless subpath.start_with?('erts') # perf
# Rename every .REFTYPES node to a <pre> element.
css('.REFTYPES').each do |node|
node.name = 'pre'
end
# Rename both span variants to <code>, trim the inner HTML and collapse each
# run of whitespace into a single space.
css('span.bold_code', 'span.code').each do |node|
node.name = 'code'
node.inner_html = node.inner_html.strip.gsub(/\s+/, ' ')
end
doc
end
end

@ -10,13 +10,18 @@ module Docs
# Register the Erlang filters: pre_clean_html is placed relative to 'container'
# (presumably right after it -- confirm), then entries and clean_html are pushed
# onto the end of the filter list.
html_filters.insert_after 'container', 'erlang/pre_clean_html'
html_filters.push 'erlang/entries', 'erlang/clean_html'
# NOTE(review): this single-pattern assignment is immediately overwritten below;
# it looks like the pre-change line from a diff whose +/- markers were lost -- confirm.
options[:only_patterns] = [/\Alib/]
# Patterns for the paths to keep (presumably matched against each page path -- confirm):
#   /\Alib/           starts with 'lib'
#   /\Adoc\/\w+\//    starts with 'doc/<word>/'
#   /\Aerts.+\/html/  starts with 'erts' and contains '/html' further on
options[:only_patterns] = [
/\Alib/,
/\Adoc\/\w+\//,
/\Aerts.+\/html/
]
options[:skip_patterns] = [
/pdf/,
/release_notes/,
/result/,
/java/,
/\.erl\z/,
/\/html\/.*_app\.html\z/,
/_examples\.html\z/,
/\Alib\/edoc/,

Loading…
Cancel
Save