diff --git a/lib/docs/filters/r/clean_html.rb b/lib/docs/filters/r/clean_html.rb
index 28ea571d..57c91ee5 100644
--- a/lib/docs/filters/r/clean_html.rb
+++ b/lib/docs/filters/r/clean_html.rb
@@ -3,7 +3,13 @@ module Docs
class CleanHtmlFilter < Filter
def call
slug_parts = slug.split('/')
- if slug_parts[0] == 'library'
+
+ if root_page?
+ css('a[href$="/00index"]').each do |pkg|
+ pkg['href'] = "/r-#{pkg['href'].split('/')[1]}/"
+ end
+
+ elsif slug_parts[0] == 'library'
title = at_css('h2')
title.inner_html = "#{slug_parts[3]}
#{title.content}"
@@ -11,12 +17,31 @@ module Docs
summary.remove if summary
elsif slug_parts[-2] == 'manual'
+ css('table.menu, div.header, hr, h2.contents-heading, div.contents, table.index-cp, table.index-vr, table[summary]').remove
+
+ css('h2').each do |node|
+ node.remove if node.content.end_with? ' index'
+ end
+
css('span[id] + h1, span[id] + h2, span[id] + h3, span[id] + h4, span[id] + h5, span[id] + h6').each do |node|
- id = node.previous['id']
- node.previous.remove
- node['id'] = id.sub(/-1$/, '') if id
+ # We need the first of the series of span with ids
+ span = node.previous_element
+ while span.previous
+ prev = span.previous_element
+ break unless prev.name == 'span' and prev['id']
+ span.remove
+ span = prev
+ end
+
+ node['id'] = span['id']
+ span.remove
+
+ css('div.example').each do |node|
+ node.replace(node.children)
+ end
end
- css('table.menu, div.header, hr').remove
+
+ css('h1 + h1').remove
css('.footnote h5').each do |node|
anchor = node.at_css('a[id]')
diff --git a/lib/docs/filters/r/entries.rb b/lib/docs/filters/r/entries.rb
index b54c2c21..a9793e07 100644
--- a/lib/docs/filters/r/entries.rb
+++ b/lib/docs/filters/r/entries.rb
@@ -2,11 +2,16 @@ module Docs
class R
class EntriesFilter < Docs::EntriesFilter
- @@include_manual = false
- @@include_misc = false
+ PKG_INDEX_ENTRIES = Hash.new([].freeze) # href => link texts; the shared default is frozen, so add with `+=` (as initialize does), never `<<`
def initialize(*)
super
+
+ if slug_parts[-1] == '00Index' # 00Index pages only: collect the link texts found in table rows
+ css('tr a').each do |link|
+ PKG_INDEX_ENTRIES[link['href']] += [link.text] # keyed by the link's href; `+=` assigns a fresh array for that key
+ end
+ end
end
def slug_parts
@@ -18,11 +23,11 @@ module Docs
end
def is_manual?
- slug_parts[-2] == 'manual'
+ slug_parts[1] == 'manual' # a page counts as a manual when its second slug part is 'manual'
end
def get_name
- return slug_parts[3] + ' − ' + at_css('h2').content if is_package?
+ return at_css('h2').content if is_package? # package pages: the <h2> text alone, no longer prefixed with a slug part
title = at_css('h1.settitle')
title ? title.content : at_css('h1, h2').content
end
@@ -30,24 +35,41 @@ module Docs
def get_type
return slug_parts[1] if is_package?
return at_css('h1.settitle').content if is_manual?
- 'Miscellaneous'
end
def include_default_entry?
- if is_manual? or slug_parts[-1] == '00Index' or slug_parts[-1] == 'index'
- return false
- end
- is_package? or self.include_misc
+ is_package? && slug_parts[-1] != '00Index' # default entry only for package pages other than the 00Index listing
+ end
+
+ def manual_section(node) # node's text without its leading section numbering; nil for sections that get no entry
+ title = node.content.sub(/^((Appendix )?[A-Z]|[0-9]+)(\.[0-9]+)* /, '')
+ title unless %w[References Preface Acknowledgements].include?(title) || title.end_with?(' index')
end
def additional_entries
- return [] unless is_manual? and self.include_manual
+ if is_package? and slug_parts[-1] != '00Index'
+ page = slug_parts[-1]
+ return [page] + PKG_INDEX_ENTRIES.fetch(page, [])
+ end
+
+ return [] unless is_manual?
entries = []
- css('div.contents > ul > li').each do |node|
- node.css('a').each do |link|
- link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
- entries << [link_name, link['href'].split('#')[1], name]
+ unless slug_parts[-1].downcase == 'r-intro'
+ # Single top-level category
+ css('div.contents > ul a').each do |link|
+ link_name = manual_section(link)
+ entries << [link_name, link['href'].split('#')[1], name] unless link_name.nil?
+ end
+ else
+ # Split 1st level of manual into different categories
+ css('div.contents > ul > li').each do |node|
+ type = manual_section(node.at_css('a'))
+ next if type.nil?
+ node.css('> ul a').each do |link|
+ link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
+ entries << [link_name, link['href'].split('#')[1], type]
+ end
end
end
return entries
diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb
index 9d95fbaa..6a36a843 100644
--- a/lib/docs/scrapers/r.rb
+++ b/lib/docs/scrapers/r.rb
@@ -21,10 +21,33 @@ module Docs
HTML
# Never want those
+ options[:skip_patterns] = [ # matches paths ending in /DESCRIPTION, /NEWS or /NEWS.<ext>, /demo, or .pdf
+ /\/DESCRIPTION$/,
+ /\/NEWS(\.[^\/]*)?$/,
+ /\/demo$/,
+ /\.pdf$/
+ ]
+
+ ## We want to rewrite links as shown below, but only when their targets don’t exist:
+ ## such links point to packages or keywords that do not have their own file
+ ## but exist on another page, and we record them properly.
+ #
+ #options[:fix_urls] = ->(url) do
+ # url.sub!(%r'/library/([^/]+)/doc/index.html$') { |m| "/r-#{$1.parameterize.downcase}/" }
+ # url.sub!(%r'/library/([^/]+)/html/([^/]+).html$') { |m| "/library/#{$1.parameterize.downcase}/html/#{$2.parameterize.downcase}" }
+ #end
+
options[:skip] = %w(
doc/html/packages-head-utf8.html
doc/html/SearchOn.html
doc/html/Search.html
+ doc/html/UserManuals.html
+ doc/html/faq.html
+ doc/manual/R-FAQ.html
+ doc/manual/R-admin.html
+ doc/manual/R-exts.html
+ doc/manual/R-ints.html
+ doc/manual/R-lang.html
)
end