From 2346300cee02c20c3ffe4fdb2191dba91cef2663 Mon Sep 17 00:00:00 2001 From: Thibaut Courouble Date: Sun, 10 Jul 2016 15:03:19 -0400 Subject: [PATCH] Improve Erlang scraper Closes #432. --- assets/stylesheets/pages/_erlang.scss | 1 + lib/docs/filters/erlang/clean_html.rb | 10 ++-- lib/docs/filters/erlang/entries.rb | 61 +++++++++++++++++++---- lib/docs/filters/erlang/pre_clean_html.rb | 12 ++++- lib/docs/scrapers/erlang.rb | 7 ++- 5 files changed, 71 insertions(+), 20 deletions(-) diff --git a/assets/stylesheets/pages/_erlang.scss b/assets/stylesheets/pages/_erlang.scss index 317699dd..db4c470d 100644 --- a/assets/stylesheets/pages/_erlang.scss +++ b/assets/stylesheets/pages/_erlang.scss @@ -2,6 +2,7 @@ @extend %simple; h3.code { @extend %code; } + code.code { @extend %label; } .note { @extend %note; } .warning { @extend %note, %note-red; } .note .label, .warning .label { font-weight: bold; } diff --git a/lib/docs/filters/erlang/clean_html.rb b/lib/docs/filters/erlang/clean_html.rb index 47535e33..2634e744 100644 --- a/lib/docs/filters/erlang/clean_html.rb +++ b/lib/docs/filters/erlang/clean_html.rb @@ -45,7 +45,7 @@ module Docs node.content = content.capitalize if content == content.upcase end - css('p > span.bold_code:first-child ~ br:last-child').each do |node| + css('p > .bold_code:first-child ~ br:last-child').each do |node| parent = node.parent parent.name = 'h3' parent['class'] = 'code' @@ -54,15 +54,11 @@ module Docs parent.inner_html = parent.inner_html.strip end - css('span.code').each do |node| - node.name = 'code' - end - - css('pre *:not(a)').each do |node| + css('pre:not(.REFTYPES) *:not(a)', 'a[href^=javascript]').each do |node| node.before(node.children).remove end - css('pre').each do |node| + css('pre:not(.REFTYPES)').each do |node| node.inner_html = node.inner_html.strip_heredoc end diff --git a/lib/docs/filters/erlang/entries.rb b/lib/docs/filters/erlang/entries.rb index 23050193..24abc2a1 100644 --- a/lib/docs/filters/erlang/entries.rb +++ b/lib/docs/filters/erlang/entries.rb @@ -3,14 +3,32 @@ module Docs class EntriesFilter < Docs::EntriesFilter def get_name name = at_css('h1').content.strip - name.prepend 'Guide: ' if doc.inner_html.include?('User\'s Guide') + name << " (#{type.remove('Guide: ')})" if name == '1 Introduction' name end def get_type - type = subpath[/lib\/(.+?)[\-\/]/, 1] - type << "/#{name}" if type == 'stdlib' && entry_nodes.length >= 10 - type + if subpath.start_with?('lib/') + type = subpath[/lib\/(.+?)[\-\/]/, 1] + type << "/#{name}" if type == 'stdlib' && entry_nodes.length >= 10 + type + elsif subpath.start_with?('doc/') + type = subpath[/doc\/(.+?)\//, 1] + type.capitalize! + type.sub! '_', ' ' + type.sub! 'Oam', 'OAM' + type.remove! ' Guide' + type.prepend 'Guide: ' + type + elsif subpath.start_with?('erts') + type = 'ERTS' + if name =~ /\A\d/ + type.prepend 'Guide: ' + elsif entry_nodes.length > 0 + type << "/#{name}" + end + type + end end def include_default_entry? @@ -18,17 +36,38 @@ module Docs end def additional_entries - entry_nodes.map do |node| - id = node['name'] - name = id.gsub %r{\-(?.*)\z}, '/\k' - name.remove! 'Module:' - name.prepend "#{self.name}:" - [name, id] + return [] unless include_default_entry? + + if subpath.start_with?('lib/') + entry_nodes.map do |node| + id = node['name'] + name = id.gsub %r{\-(?.*)\z}, '/\k' + name.remove! 'Module:' + name.prepend "#{self.name}:" + [name, id] + end + elsif subpath.start_with?('doc/') + [] + elsif subpath.start_with?('erts') + return [] if type.start_with?('Guide') + entry_nodes.map do |node| + id = node['href'][/#(.+)/, 1] + name = node.content.strip + name.remove! 'Module:' + name.prepend "#{self.name}:" + [name, id] + end end end def entry_nodes - @entry_nodes ||= css('div.REFBODY + p > a') + @entry_nodes ||= if subpath.start_with?('lib/') + css('div.REFBODY + p > a') + elsif subpath.start_with?('erts') + link = at_css(".flipMenu a[href='#{File.basename(subpath, '.html')}']") + list = link.parent.parent + list['class'] == 'flipMenu' ? [] : list.css('a').to_a.tap { |a| a.delete(link); } + end end end end diff --git a/lib/docs/filters/erlang/pre_clean_html.rb b/lib/docs/filters/erlang/pre_clean_html.rb index 75824b49..8daefd28 100644 --- a/lib/docs/filters/erlang/pre_clean_html.rb +++ b/lib/docs/filters/erlang/pre_clean_html.rb @@ -2,7 +2,17 @@ module Docs class Erlang class PreCleanHtmlFilter < Filter def call - css('.flipMenu li[title] > a').remove + css('.flipMenu li[title] > a').remove unless subpath.start_with?('erts') # perf + + css('.REFTYPES').each do |node| + node.name = 'pre' + end + + css('span.bold_code', 'span.code').each do |node| + node.name = 'code' + node.inner_html = node.inner_html.strip.gsub(/\s+/, ' ') + end + doc end end diff --git a/lib/docs/scrapers/erlang.rb b/lib/docs/scrapers/erlang.rb index 5b19274e..b35cd03e 100644 --- a/lib/docs/scrapers/erlang.rb +++ b/lib/docs/scrapers/erlang.rb @@ -10,13 +10,18 @@ module Docs html_filters.insert_after 'container', 'erlang/pre_clean_html' html_filters.push 'erlang/entries', 'erlang/clean_html' - options[:only_patterns] = [/\Alib/] + options[:only_patterns] = [ + /\Alib/, + /\Adoc\/\w+\//, + /\Aerts.+\/html/ + ] options[:skip_patterns] = [ /pdf/, /release_notes/, /result/, /java/, + /\.erl\z/, /\/html\/.*_app\.html\z/, /_examples\.html\z/, /\Alib\/edoc/,