diff --git a/lib/docs/filters/r/clean_html.rb b/lib/docs/filters/r/clean_html.rb index 28ea571d..57c91ee5 100644 --- a/lib/docs/filters/r/clean_html.rb +++ b/lib/docs/filters/r/clean_html.rb @@ -3,7 +3,13 @@ module Docs class CleanHtmlFilter < Filter def call slug_parts = slug.split('/') - if slug_parts[0] == 'library' + + if root_page? + css('a[href$="/00index"]').each do |pkg| + pkg['href'] = "/r-#{pkg['href'].split('/')[1]}/" + end + + elsif slug_parts[0] == 'library' title = at_css('h2') title.inner_html = "#{slug_parts[3]} #{title.content}" @@ -11,12 +17,31 @@ module Docs summary.remove if summary elsif slug_parts[-2] == 'manual' + css('table.menu, div.header, hr, h2.contents-heading, div.contents, table.index-cp, table.index-vr, table[summary]').remove + + css('h2').each do |node| + node.remove if node.content.end_with? ' index' + end + css('span[id] + h1, span[id] + h2, span[id] + h3, span[id] + h4, span[id] + h5, span[id] + h6').each do |node| - id = node.previous['id'] - node.previous.remove - node['id'] = id.sub(/-1$/, '') if id + # We need the first of the series of span with ids + span = node.previous_element + while span.previous + prev = span.previous_element + break unless prev.name == 'span' and prev['id'] + span.remove + span = prev + end + + node['id'] = span['id'] + span.remove + + css('div.example').each do |node| + node.replace(node.children) + end end - css('table.menu, div.header, hr').remove + + css('h1 + h1').remove css('.footnote h5').each do |node| anchor = node.at_css('a[id]') diff --git a/lib/docs/filters/r/entries.rb b/lib/docs/filters/r/entries.rb index b54c2c21..a9793e07 100644 --- a/lib/docs/filters/r/entries.rb +++ b/lib/docs/filters/r/entries.rb @@ -2,11 +2,16 @@ module Docs class R class EntriesFilter < Docs::EntriesFilter - @@include_manual = false - @@include_misc = false + PKG_INDEX_ENTRIES = Hash.new [] def initialize(*) super + + if slug_parts[-1] == '00Index' + css('tr a').each do |link| + PKG_INDEX_ENTRIES[link['href']] += [link.text] + end + end end def slug_parts @@ -18,11 +23,11 @@ module Docs end def is_manual? - slug_parts[-2] == 'manual' + slug_parts[1] == 'manual' end def get_name - return slug_parts[3] + ' − ' + at_css('h2').content if is_package? + return at_css('h2').content if is_package? title = at_css('h1.settitle') title ? title.content : at_css('h1, h2').content end @@ -30,24 +35,41 @@ module Docs def get_type return slug_parts[1] if is_package? return at_css('h1.settitle').content if is_manual? - 'Miscellaneous' end def include_default_entry? - if is_manual? or slug_parts[-1] == '00Index' or slug_parts[-1] == 'index' - return false - end - is_package? or self.include_misc + is_package? and not slug_parts[-1] == '00Index' + end + + def manual_section(node) + title = node.content.sub /^((Appendix )?[A-Z]|[0-9]+)(\.[0-9]+)* /, '' + title unless ['References', 'Preface', 'Acknowledgements'].include?(title) or title.end_with?(' index') end def additional_entries - return [] unless is_manual? and self.include_manual + if is_package? and slug_parts[-1] != '00Index' + page = slug_parts[-1] + return [page] + PKG_INDEX_ENTRIES.fetch(page, []) + end + + return [] unless is_manual? entries = [] - css('div.contents > ul > li').each do |node| - node.css('a').each do |link| - link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, '' - entries << [link_name, link['href'].split('#')[1], name] + unless slug_parts[-1].downcase == 'r-intro' + # Single top-level category + css('div.contents > ul a').each do |link| + link_name = manual_section(link) + entries << [link_name, link['href'].split('#')[1], name] unless link_name.nil? + end + else + # Split 1st level of manual into different categories + css('div.contents > ul > li').each do |node| + type = manual_section(node.at_css('a')) + next if type.nil? + node.css('> ul a').each do |link| + link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, '' + entries << [link_name, link['href'].split('#')[1], type] + end end end return entries diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb index 9d95fbaa..6a36a843 100644 --- a/lib/docs/scrapers/r.rb +++ b/lib/docs/scrapers/r.rb @@ -21,10 +21,33 @@ module Docs HTML # Never want those + options[:skip_patterns] = [ + /\/DESCRIPTION$/, + /\/NEWS(\.[^\/]*)?$/, + /\/demo$/, + /\.pdf$/ + ] + + ## We want to fix links like so − but only if the targets don’t exist, + ## as these target packages or keywords that do not have their own file, + ## but exist on another page, and we properly record it. + # + #options[:fix_urls] = ->(url) do + # url.sub!(%r'/library/([^/]+)/doc/index.html$') { |m| "/r-#{$1.parameterize.downcase}/" } + # url.sub!(%r'/library/([^/]+)/html/([^/]+).html$') { |m| "/library/#{$1.parameterize.downcase}/html/#{$2.parameterize.downcase}" } + #end + options[:skip] = %w( doc/html/packages-head-utf8.html doc/html/SearchOn.html doc/html/Search.html + doc/html/UserManuals.html + doc/html/faq.html + doc/manual/R-FAQ.html + doc/manual/R-admin.html + doc/manual/R-exts.html + doc/manual/R-ints.html + doc/manual/R-lang.html ) end