From 0b38f339f120c35b94d32da95b2b2e216bf118a9 Mon Sep 17 00:00:00 2001 From: Cimbali Date: Tue, 25 May 2021 16:18:01 +0200 Subject: [PATCH 1/7] Add R documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contains the R base + recommended package help pages converted to HTML. Equivalent to the fullrefman.pdf generated from source, which is also called « The R Reference Index » on https://cran.r-project.org/manuals.html Currently does not include reference manuals and miscellanea (FAQ, etc.) Script building the documentation: ```bash set -e set -o pipefail DEVDOCSROOT=/path/to/devdocs/docs/r RSOURCEDIR=${TMPDIR:-/tmp}/R/latest RBUILDDIR=${TMPDIR:-/tmp}/R/build RLATEST=https://cran.r-project.org/src/base/R-latest.tar.gz R="$RBUILDDIR/bin/R" libdir="$RBUILDDIR/library" docdir=$RBUILDDIR/doc makevars="$RSOURCEDIR/share/make/vars.mk" if [ ! -f "$R" ] ; then if [ ! -d "$RSOURCEDIR" ]; then mkdir -p "$RSOURCEDIR" && curl "$RLATEST" | tar -C "$RSOURCEDIR" -xzf - --strip-components=1 fi [ -d "$RBUILDDIR" ] || mkdir -p "$RBUILDDIR" [ -f "$RBUILDDIR/config.status" ] || (cd "$RBUILDDIR" && "$RSOURCEDIR/configure") make -C "$RBUILDDIR" && make -C "$RBUILDDIR" docs fi mkdir -p "$DEVDOCSROOT/doc" && cp -r "$docdir"/* "$DEVDOCSROOT/doc/" find "$libdir" -type d -name 'html' -printf '%P\n' | while read d; do mkdir -p "$DEVDOCSROOT/library/$d" cp -r "$libdir/$d"/* "$DEVDOCSROOT/library/$d/" done R_PKGS_BASE="`sed -n 's/^R_PKGS_BASE *= *//p' $makevars`" R_PKGS_RECOMMENDED="`sed -n 's/^R_PKGS_RECOMMENDED *= *//p' $makevars`" cat < 716 bytes public/icons/docs/r/16@2x.png | Bin 0 -> 1430 bytes public/icons/docs/r/SOURCE | 1 + 7 files changed, 130 insertions(+) create mode 100644 lib/docs/filters/r/clean_html.rb create mode 100644 lib/docs/filters/r/entries.rb create mode 100644 lib/docs/scrapers/r.rb create mode 100644 public/icons/docs/r/16.png create mode 100644 public/icons/docs/r/16@2x.png create mode 100644 public/icons/docs/r/SOURCE 
diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee index b86236b0..3138546a 100644 --- a/assets/javascripts/templates/pages/about_tmpl.coffee +++ b/assets/javascripts/templates/pages/about_tmpl.coffee @@ -676,6 +676,11 @@ credits = [ '2012-2018 The Qt Company Ltd', 'GFDL', 'https://doc.qt.io/qt-5/licensing.html' + ], [ + 'R', + '1999--2012 R Foundation for Statistical Computing', + 'GPL', + 'https://svn.r-project.org/R/trunk/COPYING' ], [ 'Ramda', '2013-2020 Scott Sauyet and Michael Hurley', diff --git a/lib/docs/filters/r/clean_html.rb b/lib/docs/filters/r/clean_html.rb new file mode 100644 index 00000000..28ea571d --- /dev/null +++ b/lib/docs/filters/r/clean_html.rb @@ -0,0 +1,34 @@ +module Docs + class R + class CleanHtmlFilter < Filter + def call + slug_parts = slug.split('/') + if slug_parts[0] == 'library' + title = at_css('h2') + title.inner_html = "#{slug_parts[3]} #{title.content}" + + summary = at_css('table[summary]') + summary.remove if summary + + elsif slug_parts[-2] == 'manual' + css('span[id] + h1, span[id] + h2, span[id] + h3, span[id] + h4, span[id] + h5, span[id] + h6').each do |node| + id = node.previous['id'] + node.previous.remove + node['id'] = id.sub(/-1$/, '') if id + end + css('table.menu, div.header, hr').remove + + css('.footnote h5').each do |node| + anchor = node.at_css('a[id]') + footnote = node.next_sibling + footnote.inner_html = "#{anchor.text} #{footnote.inner_html}" + footnote['id'] = anchor['id'] + node.remove + end + end + + doc + end + end + end +end diff --git a/lib/docs/filters/r/entries.rb b/lib/docs/filters/r/entries.rb new file mode 100644 index 00000000..b54c2c21 --- /dev/null +++ b/lib/docs/filters/r/entries.rb @@ -0,0 +1,59 @@ +module Docs + class R + class EntriesFilter < Docs::EntriesFilter + + @@include_manual = false + @@include_misc = false + + def initialize(*) + super + end + + def slug_parts + slug.split('/') + end + + def is_package? 
+ slug_parts[0] == 'library' + end + + def is_manual? + slug_parts[-2] == 'manual' + end + + def get_name + return slug_parts[3] + ' − ' + at_css('h2').content if is_package? + title = at_css('h1.settitle') + title ? title.content : at_css('h1, h2').content + end + + def get_type + return slug_parts[1] if is_package? + return at_css('h1.settitle').content if is_manual? + 'Miscellaneous' + end + + def include_default_entry? + if is_manual? or slug_parts[-1] == '00Index' or slug_parts[-1] == 'index' + return false + end + is_package? or self.include_misc + end + + def additional_entries + return [] unless is_manual? and self.include_manual + + entries = [] + css('div.contents > ul > li').each do |node| + node.css('a').each do |link| + link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, '' + entries << [link_name, link['href'].split('#')[1], name] + end + end + return entries + end + + private + end + end +end diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb new file mode 100644 index 00000000..7c577d5a --- /dev/null +++ b/lib/docs/scrapers/r.rb @@ -0,0 +1,31 @@ +module Docs + class R < FileScraper + self.name = 'R' + self.slug = 'r' + self.type = 'simple' + self.release = '4.1.0' + self.links = { + home: 'https://www.r-project.org/', + code: 'https://svn.r-project.org/R/' + } + + self.root_path = 'doc/html/packages.html' + + html_filters.push 'r/entries', 'r/clean_html' + + options[:skip_links] = false + + options[:attribution] = <<-HTML + Copyright (©) 1999--2012 R Foundation for Statistical Computing.
+ Licensed under the GNU General Public License. + HTML + + # Never want those + options[:skip] = %w( + doc/html/packages-head-utf8.html + doc/html/SearchOn.html + doc/html/Search.html + ) + + end +end diff --git a/public/icons/docs/r/16.png b/public/icons/docs/r/16.png new file mode 100644 index 0000000000000000000000000000000000000000..2b0c75dafe883e6fbae065fe359b5d3076fffb06 GIT binary patch literal 716 zcmV;-0yF)IP)z`5f0Jm31#SnphGam}2#Dk>5Pis?y{@k2$!~zc!NJ1WG_H_r zAQ=EOl19P^I!GrO&4`sl65fJrrsRr^~eD}^CGc$>$l2u8ZH%l~=Y&y+tH(oAj-dq0ZSO`ZU0E4?R z<9*q~A)` zFQ0r}k~dq>{-R`=Q$Zkq{$biT+RX&QEFh6cj2_(I&`Qjd0huksUvj&$4aTN}yG403 zwKTgd(V(EBEE;M0<6l_S)Y~lSvLrL9X1Klj)`}yyik_shGvk* y1yT$V-8C~Cxf&nOfa~On8G_A-_{>87f1dAp^6H)r>&2M>0000*`58W}{T z_)ie{E6SmHZAp!WMmlvVnbXt>iK(F`Q`0SVo4R}7=jo4o@BHq!dzTKOe?0JUzxTc8 zp67YadCz$-@WB3OGxz#~s;a7#)YR0hP{_%kgA7S>sK#KBjm^zh>-~Oz>wP7#W5>~l zQ&O%^voPZ!j+Qi3Qj#Q)f)w)vB^@K}LD1|%$fcp?Fdxw-CqLe zYmq7O)VaV;NcvXLFR;$h;^K+tyjctcs(KsKI+$cWfJu}lh)M~^U<)mtpEKvBpV|oQ z*|UEb!Tplb5~!A>N?ENaDwELJ7|_dd`ncaqz1`6Id|@i(?*i1-t+S@ zNx2DBUj#X4%9KezMgtCcqh=MQYyf}&fMECqVt5UO*Ot0rpu@gkD6o87<70x^Ch8GM z`{&J@HP-ChT~jRSd$l`UgrxI6!?A^hh3DOh1~0CuGhl8QBkUjkbg$vs7X&mxSn}SO zhGQ^xugFw@&p8t3iwpyjVA~^F*b*;z*oeCYkE93&N&~p=g990TMZ2C7ti^nXRwKx8 zdr^0YNUSKHJo#|q%HAEg=Q|_v9c%3u5sZeO)N^YMpVZngavY#$A2u{$dgx*R&5 z5}KSxE9sI$i<+ovGSY%1j6%aZ&OmJ1y*6q=ZL*Q_Jm3IycFG9$k?12?V=61R=gHtV zNyanB8yypReERh1|9FEBe0^6P5N_K$gm@7K$H}&m3<5B~TPji?E*n{YS<-4rIZko$ z(f@DT8Xx0S+0bl$K$n$P8MNBZz1V|I&flI7?HjAHuTxu=Lmn?bh4+j8} zoX|W=ow`$k=p5$@)+TK#FE39|T#EKRA1CeCM6Ar~b7Rt&zV+*@)YPR57XyF;08^(< zZIo3p$v@p45gs7)8`Jgh#*JJ2Wo2bfIQTt*pCR??o4J3LPt3f&ljOiq$WB|dXi*d3 zxeYgMs_f-U3T`1aKK^y96C^i%bg=jApv52$c$HvQL(B!VN`m8V*CE|oP?X*9!|hhz z%w@|KUW;PJ)VHEyb4%{fobp~h`<@mxicu8^mi*hof$LS68<9!1OllyZ|Y@$vn4%4$%ztKQ`Zxh zTYNETAKs%Gh1rdegrEi1{@w>Sf{wRb=dDf6xwNscIf|2%h=AYkzX@OifDIcqY|bV_ z&-d_c+Wng=bLNFD)I>oYv)7(FWCW6#;hcATbV0G9t}20F6-3%1um%8i0Y z`hM-X^-~wLIlUP0^-JqoCo#wEC!y5`47>1E=}92c1)a6n)9$gYehb>N_I~OT5?4mu 
zI!S=-E+fWaapB97t$+^1xi^Y4X9bFY4$eSFW{CMROoI*AFa#jkF&E-^ir6+o;jtXq kG5`!{bh$Iw5A43%zu#XxiA?H}RsaA107*qoM6N<$f>}nP=l}o! literal 0 HcmV?d00001 diff --git a/public/icons/docs/r/SOURCE b/public/icons/docs/r/SOURCE new file mode 100644 index 00000000..c02c1e37 --- /dev/null +++ b/public/icons/docs/r/SOURCE @@ -0,0 +1 @@ +https://svn.r-project.org/R/trunk/doc/html/Rlogo.svg From c6cf8d9c6f2d90e58cde7ad71a00513188c7458b Mon Sep 17 00:00:00 2001 From: Cimbali Date: Wed, 26 May 2021 21:07:28 +0100 Subject: [PATCH 2/7] Update assets/javascripts/templates/pages/about_tmpl.coffee Co-authored-by: Simon Legner --- assets/javascripts/templates/pages/about_tmpl.coffee | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee index 3138546a..503faaf0 100644 --- a/assets/javascripts/templates/pages/about_tmpl.coffee +++ b/assets/javascripts/templates/pages/about_tmpl.coffee @@ -678,7 +678,7 @@ credits = [ 'https://doc.qt.io/qt-5/licensing.html' ], [ 'R', - '1999--2012 R Foundation for Statistical Computing', + '1999–2012 R Foundation for Statistical Computing', 'GPL', 'https://svn.r-project.org/R/trunk/COPYING' ], [ From c3b93377c3864d1b85a482d1e70e6bea75f85d73 Mon Sep 17 00:00:00 2001 From: Cimbali Date: Wed, 26 May 2021 21:07:36 +0100 Subject: [PATCH 3/7] Update lib/docs/scrapers/r.rb Co-authored-by: Simon Legner --- lib/docs/scrapers/r.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb index 7c577d5a..9d95fbaa 100644 --- a/lib/docs/scrapers/r.rb +++ b/lib/docs/scrapers/r.rb @@ -16,7 +16,7 @@ module Docs options[:skip_links] = false options[:attribution] = <<-HTML - Copyright (©) 1999--2012 R Foundation for Statistical Computing.
+ Copyright (©) 1999–2012 R Foundation for Statistical Computing.
+ Licensed under the GNU General Public License. HTML From 94b404450cb7af46a904b29003a58bdbe19a6eb4 Mon Sep 17 00:00:00 2001 From: Cimbali Date: Thu, 27 May 2021 16:41:32 +0200 Subject: [PATCH 4/7] Reindex R documentation, include 2 manuals Now each page is indexed by its title (by default), and by each index term declared for it on the index. Two manuals are included: the data import/export manual as its own category (as it is rather short), and each top-level section of the R introduction manual (as it is quite a bit longer). Add some manual cleanup. Some pages still seem missing: - either belonging to non-default packages, i.e. it is normal that they are missing - or corresponding to index words without their own package (!) --- lib/docs/filters/r/clean_html.rb | 35 ++++++++++++++++++---- lib/docs/filters/r/entries.rb | 50 +++++++++++++++++++++++--------- lib/docs/scrapers/r.rb | 23 +++++++++++++++ 3 files changed, 89 insertions(+), 19 deletions(-) diff --git a/lib/docs/filters/r/clean_html.rb b/lib/docs/filters/r/clean_html.rb index 28ea571d..57c91ee5 100644 --- a/lib/docs/filters/r/clean_html.rb +++ b/lib/docs/filters/r/clean_html.rb @@ -3,7 +3,13 @@ module Docs class CleanHtmlFilter < Filter def call slug_parts = slug.split('/') - if slug_parts[0] == 'library' + + if root_page? + css('a[href$="/00index"]').each do |pkg| + pkg['href'] = "/r-#{pkg['href'].split('/')[1]}/" + end + + elsif slug_parts[0] == 'library' title = at_css('h2') title.inner_html = "#{slug_parts[3]} #{title.content}" @@ -11,12 +17,31 @@ module Docs summary.remove if summary elsif slug_parts[-2] == 'manual' + css('table.menu, div.header, hr, h2.contents-heading, div.contents, table.index-cp, table.index-vr, table[summary]').remove + + css('h2').each do |node| + node.remove if node.content.end_with? 
' index' + end + css('span[id] + h1, span[id] + h2, span[id] + h3, span[id] + h4, span[id] + h5, span[id] + h6').each do |node| - id = node.previous['id'] - node.previous.remove - node['id'] = id.sub(/-1$/, '') if id + # We need the first of the series of span with ids + span = node.previous_element + while span.previous + prev = span.previous_element + break unless prev.name == 'span' and prev['id'] + span.remove + span = prev + end + + node['id'] = span['id'] + span.remove + + css('div.example').each do |node| + node.replace(node.children) + end end - css('table.menu, div.header, hr').remove + + css('h1 + h1').remove css('.footnote h5').each do |node| anchor = node.at_css('a[id]') diff --git a/lib/docs/filters/r/entries.rb b/lib/docs/filters/r/entries.rb index b54c2c21..a9793e07 100644 --- a/lib/docs/filters/r/entries.rb +++ b/lib/docs/filters/r/entries.rb @@ -2,11 +2,16 @@ module Docs class R class EntriesFilter < Docs::EntriesFilter - @@include_manual = false - @@include_misc = false + PKG_INDEX_ENTRIES = Hash.new [] def initialize(*) super + + if slug_parts[-1] == '00Index' + css('tr a').each do |link| + PKG_INDEX_ENTRIES[link['href']] += [link.text] + end + end end def slug_parts @@ -18,11 +23,11 @@ module Docs end def is_manual? - slug_parts[-2] == 'manual' + slug_parts[1] == 'manual' end def get_name - return slug_parts[3] + ' − ' + at_css('h2').content if is_package? + return at_css('h2').content if is_package? title = at_css('h1.settitle') title ? title.content : at_css('h1, h2').content end @@ -30,24 +35,41 @@ module Docs def get_type return slug_parts[1] if is_package? return at_css('h1.settitle').content if is_manual? - 'Miscellaneous' end def include_default_entry? - if is_manual? or slug_parts[-1] == '00Index' or slug_parts[-1] == 'index' - return false - end - is_package? or self.include_misc + is_package? 
and not slug_parts[-1] == '00Index' + end + + def manual_section(node) + title = node.content.sub /^((Appendix )?[A-Z]|[0-9]+)(\.[0-9]+)* /, '' + title unless ['References', 'Preface', 'Acknowledgements'].include?(title) or title.end_with?(' index') end def additional_entries - return [] unless is_manual? and self.include_manual + if is_package? and slug_parts[-1] != '00Index' + page = slug_parts[-1] + return [page] + PKG_INDEX_ENTRIES.fetch(page, []) + end + + return [] unless is_manual? entries = [] - css('div.contents > ul > li').each do |node| - node.css('a').each do |link| - link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, '' - entries << [link_name, link['href'].split('#')[1], name] + unless slug_parts[-1].downcase == 'r-intro' + # Single top-level category + css('div.contents > ul a').each do |link| + link_name = manual_section(link) + entries << [link_name, link['href'].split('#')[1], name] unless link_name.nil? + end + else + # Split 1st level of manual into different categories + css('div.contents > ul > li').each do |node| + type = manual_section(node.at_css('a')) + next if type.nil? + node.css('> ul a').each do |link| + link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, '' + entries << [link_name, link['href'].split('#')[1], type] + end end end return entries diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb index 9d95fbaa..6a36a843 100644 --- a/lib/docs/scrapers/r.rb +++ b/lib/docs/scrapers/r.rb @@ -21,10 +21,33 @@ module Docs HTML # Never want those + options[:skip_patterns] = [ + /\/DESCRIPTION$/, + /\/NEWS(\.[^\/]*)?$/, + /\/demo$/, + /\.pdf$/ + ] + + ## We want to fix links like so − but only if the targets don’t exist, + ## as these target packages or keywords that do not have their own file, + ## but exist on another page, and we properly record it. 
+ # + #options[:fix_urls] = ->(url) do + # url.sub!(%r'/library/([^/]+)/doc/index.html$') { |m| "/r-#{$1.parameterize.downcase}/" } + # url.sub!(%r'/library/([^/]+)/html/([^/]+).html$') { |m| "/library/#{$1.parameterize.downcase}/html/#{$2.parameterize.downcase}" } + #end + options[:skip] = %w( doc/html/packages-head-utf8.html doc/html/SearchOn.html doc/html/Search.html + doc/html/UserManuals.html + doc/html/faq.html + doc/manual/R-FAQ.html + doc/manual/R-admin.html + doc/manual/R-exts.html + doc/manual/R-ints.html + doc/manual/R-lang.html ) end From 3194a3f3e5eb5cad3f11ca7e916a053eb60f8492 Mon Sep 17 00:00:00 2001 From: Cimbali Date: Tue, 1 Jun 2021 00:28:42 +0200 Subject: [PATCH 5/7] =?UTF-8?q?Skip=20links=20to=20pages=20with=20?= =?UTF-8?q?=E2=80=9Cvignettes=E2=80=9D=C2=A0demos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/docs/scrapers/r.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb index 6a36a843..e0e43355 100644 --- a/lib/docs/scrapers/r.rb +++ b/lib/docs/scrapers/r.rb @@ -24,6 +24,7 @@ module Docs options[:skip_patterns] = [ /\/DESCRIPTION$/, /\/NEWS(\.[^\/]*)?$/, + /\/doc\/index\.html$/, /\/demo$/, /\.pdf$/ ] From 42731d6407336278e05918b927458e0c8a9bcf84 Mon Sep 17 00:00:00 2001 From: Cimbali Date: Tue, 1 Jun 2021 23:20:52 +0200 Subject: [PATCH 6/7] Clean footer from R packages Only exist in the newer way of building html pages, via configure --- lib/docs/filters/r/clean_html.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/docs/filters/r/clean_html.rb b/lib/docs/filters/r/clean_html.rb index 57c91ee5..62f9d140 100644 --- a/lib/docs/filters/r/clean_html.rb +++ b/lib/docs/filters/r/clean_html.rb @@ -16,6 +16,8 @@ module Docs summary = at_css('table[summary]') summary.remove if summary + css('hr ~ *, hr').remove + elsif slug_parts[-2] == 'manual' css('table.menu, div.header, hr, h2.contents-heading, div.contents, table.index-cp, 
table.index-vr, table[summary]').remove From 005db388cec113f8956c56ef5787d8deb50c02b4 Mon Sep 17 00:00:00 2001 From: Cimbali Date: Wed, 2 Jun 2021 00:20:51 +0200 Subject: [PATCH 7/7] Rewrite links by generating scraper :replace_paths from entries filter --- lib/docs/filters/r/entries.rb | 9 ++++++--- lib/docs/scrapers/r.rb | 14 ++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/docs/filters/r/entries.rb b/lib/docs/filters/r/entries.rb index a9793e07..ed09345d 100644 --- a/lib/docs/filters/r/entries.rb +++ b/lib/docs/filters/r/entries.rb @@ -4,14 +4,17 @@ module Docs PKG_INDEX_ENTRIES = Hash.new [] - def initialize(*) - super - + def call if slug_parts[-1] == '00Index' + dir = File.dirname(result[:subpath]) css('tr a').each do |link| PKG_INDEX_ENTRIES[link['href']] += [link.text] + next if link['href'] == link.text + context[:replace_paths][File.join(dir, "#{link.text}.html")] = File.join(dir, "#{link['href']}.html") end end + + super end def slug_parts diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb index e0e43355..308d1a6b 100644 --- a/lib/docs/scrapers/r.rb +++ b/lib/docs/scrapers/r.rb @@ -29,14 +29,12 @@ module Docs /\.pdf$/ ] - ## We want to fix links like so − but only if the targets don’t exist, - ## as these target packages or keywords that do not have their own file, - ## but exist on another page, and we properly record it. 
- # - #options[:fix_urls] = ->(url) do - # url.sub!(%r'/library/([^/]+)/doc/index.html$') { |m| "/r-#{$1.parameterize.downcase}/" } - # url.sub!(%r'/library/([^/]+)/html/([^/]+).html$') { |m| "/library/#{$1.parameterize.downcase}/html/#{$2.parameterize.downcase}" } - #end + options[:replace_paths] = { + ## We want to fix links like so − but only if the targets don’t exist: + # 'library/MASS/html/cov.mve.html' => 'library/MASS/html/cov.rob.html' + ## Paths for target packages or keywords that do not have their own file + ## are generated in the entries filter from 00Index.html files + } options[:skip] = %w( doc/html/packages-head-utf8.html