Reindex R documentation, include 2 manuals

Each page is now indexed by its title (by default) and by each index term declared for it in the index.

Two manuals are included: the data import/export manual as its own single category (as it is rather short), and the R introduction manual with each top-level section as a separate category (as it is quite a bit longer).

Add some manual cleanup.

Some pages still seem to be missing:
- either they belong to non-default packages, i.e. it is expected that they are missing,
- or they correspond to index words without their own package (!)
4 years ago · 94b404450c
parent c3b93377c3
commit 94b404450c
3 changed files with 89 additions and 19 deletions
--- a/lib/docs/filters/r/clean_html.rb
+++ b/lib/docs/filters/r/clean_html.rb
@ -3,7 +3,13 @@ module Docs
    class CleanHtmlFilter < Filter
      def call
        slug_parts = slug.split('/')
-        if slug_parts[0] == 'library'
+
+        if root_page?
+          css('a[href$="/00index"]').each do |pkg|
+            pkg['href'] = "/r-#{pkg['href'].split('/')[1]}/"
+          end
+
+        elsif slug_parts[0] == 'library'
          title = at_css('h2')
          title.inner_html = "<code>#{slug_parts[3]}</code> #{title.content}"

@ -11,12 +17,31 @@ module Docs
          summary.remove if summary

        elsif slug_parts[-2] == 'manual'
+          css('table.menu, div.header, hr, h2.contents-heading, div.contents, table.index-cp, table.index-vr, table[summary]').remove
+
+          css('h2').each do |node|
+            node.remove if node.content.end_with? ' index'
+          end
+
          css('span[id] + h1, span[id] + h2, span[id] + h3, span[id] + h4, span[id] + h5, span[id] + h6').each do |node|
-            id = node.previous['id']
-            node.previous.remove
-            node['id'] = id.sub(/-1$/, '') if id
+            # We need the first of the series of span with ids
+            span = node.previous_element
+            while span.previous
+              prev = span.previous_element
+              break unless prev.name == 'span' and prev['id']
+              span.remove
+              span = prev
+            end
+
+            node['id'] = span['id']
+            span.remove
+
+            css('div.example').each do |node|
+              node.replace(node.children)
+            end
          end
-          css('table.menu, div.header, hr').remove
+
+          css('h1 + h1').remove

          css('.footnote h5').each do |node|
            anchor = node.at_css('a[id]')
--- a/lib/docs/filters/r/entries.rb
+++ b/lib/docs/filters/r/entries.rb
@ -2,11 +2,16 @@ module Docs
  class R
    class EntriesFilter < Docs::EntriesFilter

-      @@include_manual = false
-      @@include_misc = false
+      PKG_INDEX_ENTRIES = Hash.new []

      def initialize(*)
        super
+
+        if slug_parts[-1] == '00Index'
+          css('tr a').each do |link|
+            PKG_INDEX_ENTRIES[link['href']] += [link.text]
+          end
+        end
      end

      def slug_parts
@ -18,11 +23,11 @@ module Docs
      end

      def is_manual?
-        slug_parts[-2] == 'manual'
+        slug_parts[1] == 'manual'
      end

      def get_name
-        return slug_parts[3] + ' − ' + at_css('h2').content if is_package?
+        return at_css('h2').content if is_package?
        title = at_css('h1.settitle')
        title ? title.content : at_css('h1, h2').content
      end
@ -30,24 +35,41 @@ module Docs
      def get_type
        return slug_parts[1] if is_package?
        return at_css('h1.settitle').content if is_manual?
-        'Miscellaneous'
      end

      def include_default_entry?
-        if is_manual? or slug_parts[-1] == '00Index' or slug_parts[-1] == 'index'
-          return false
-        end
-        is_package? or self.include_misc
+        is_package? and not slug_parts[-1] == '00Index'
+      end
+
+      def manual_section(node)
+        title = node.content.sub /^((Appendix )?[A-Z]|[0-9]+)(\.[0-9]+)* /, ''
+        title unless ['References', 'Preface', 'Acknowledgements'].include?(title) or title.end_with?(' index')
      end

      def additional_entries
-        return [] unless is_manual? and self.include_manual
+        if is_package? and slug_parts[-1] != '00Index'
+          page = slug_parts[-1]
+          return [page] + PKG_INDEX_ENTRIES.fetch(page, [])
+        end
+
+        return [] unless is_manual?

        entries = []
-        css('div.contents > ul > li').each do |node|
-          node.css('a').each do |link|
-            link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
-            entries << [link_name, link['href'].split('#')[1], name]
+        unless slug_parts[-1].downcase == 'r-intro'
+          # Single top-level category
+          css('div.contents > ul a').each do |link|
+            link_name = manual_section(link)
+            entries << [link_name, link['href'].split('#')[1], name] unless link_name.nil?
+          end
+        else
+          # Split 1st level of manual into different categories
+          css('div.contents > ul > li').each do |node|
+            type = manual_section(node.at_css('a'))
+            next if type.nil?
+            node.css('> ul a').each do |link|
+              link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
+              entries << [link_name, link['href'].split('#')[1], type]
+            end
          end
        end
        return entries
--- a/lib/docs/scrapers/r.rb
+++ b/lib/docs/scrapers/r.rb
@ -21,10 +21,33 @@ module Docs
    HTML

    # Never want those
+    options[:skip_patterns] = [
+      /\/DESCRIPTION$/,
+      /\/NEWS(\.[^\/]*)?$/,
+      /\/demo$/,
+      /\.pdf$/
+    ]
+
+    ## We want to fix links like so − but only if the targets don’t exist,
+    ## as these target packages or keywords that do not have their own file,
+    ## but exist on another page, and we properly record it.
+    #
+    #options[:fix_urls] = ->(url) do
+    #  url.sub!(%r'/library/([^/]+)/doc/index.html$') { |m| "/r-#{$1.parameterize.downcase}/" }
+    #  url.sub!(%r'/library/([^/]+)/html/([^/]+).html$') { |m| "/library/#{$1.parameterize.downcase}/html/#{$2.parameterize.downcase}" }
+    #end
+
    options[:skip] = %w(
      doc/html/packages-head-utf8.html
      doc/html/SearchOn.html
      doc/html/Search.html
+      doc/html/UserManuals.html
+      doc/html/faq.html
+      doc/manual/R-FAQ.html
+      doc/manual/R-admin.html
+      doc/manual/R-exts.html
+      doc/manual/R-ints.html
+      doc/manual/R-lang.html
    )

  end