From cf852768ed086f24f2514712585f0d54daed8463 Mon Sep 17 00:00:00 2001 From: Peiran Yao Date: Fri, 20 Nov 2020 13:04:54 -0700 Subject: [PATCH 1/3] PyTorch 1.6.0+ doc structure support Document structure has changed since PyTorch 1.6.0. --- lib/docs/filters/pytorch/clean_html.rb | 9 +++++++++ lib/docs/filters/pytorch/entries.rb | 18 ++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/lib/docs/filters/pytorch/clean_html.rb b/lib/docs/filters/pytorch/clean_html.rb index dd19c3e0..465ae3a5 100644 --- a/lib/docs/filters/pytorch/clean_html.rb +++ b/lib/docs/filters/pytorch/clean_html.rb @@ -2,9 +2,18 @@ module Docs class Pytorch class CleanHtmlFilter < Filter def call + breadcrumbs = at_css('.pytorch-breadcrumbs') + type_name = breadcrumbs.css('li')[1].content + @doc = at_css('.pytorch-article') # Show katex-mathml nodes and remove katex-html nodes css('.katex-html').remove + + # pass type_name to following filters as a new node + node = Nokogiri::XML::Node.new 'meta', doc + node.content = type_name + doc.child.before node + doc end end diff --git a/lib/docs/filters/pytorch/entries.rb b/lib/docs/filters/pytorch/entries.rb index f63926b4..ba92e222 100644 --- a/lib/docs/filters/pytorch/entries.rb +++ b/lib/docs/filters/pytorch/entries.rb @@ -2,6 +2,15 @@ module Docs class Pytorch class EntriesFilter < Docs::EntriesFilter def get_name + # retrive the name in breadcrumb from the auxiliary node + name_in_breadcrumb = doc.child.content + doc.child.remove + + # hard-coded name replacements, for better presentation. + name_replacements = { + "Distributed communication package - torch.distributed" => "torch.distributed" + } + # The id of the container `div.section` indicates the page type. # If the id starts with `module-`, then it's an API reference, # otherwise it is a note or design doc. @@ -9,7 +18,9 @@ module Docs if doc.element_children[1]['id']&.starts_with? 
'module-' /\Amodule-(.*)/.match(doc.element_children[1]['id'])[1] else - at_css('h1').content + name_in_breadcrumb = name_in_breadcrumb.delete_suffix(' >') + name_in_breadcrumb = name_replacements.fetch(name_in_breadcrumb, name_in_breadcrumb) + name_in_breadcrumb end end @@ -18,9 +29,8 @@ module Docs end def include_default_entry? - # If the page is not an API reference, we only include it in the index when it - # contains additional entries. See the doc for `get_name`. - doc.element_children[1]['id']&.starts_with? 'module-' + # Only include API references, and ignore notes or design docs + !subpath.start_with? 'generated/' and type.start_with? 'torch' end def additional_entries From 59c6c75519209bf98436200ebc6de9d9aff59716 Mon Sep 17 00:00:00 2001 From: Peiran Yao Date: Fri, 20 Nov 2020 15:00:36 -0700 Subject: [PATCH 2/3] pytorch: reorder filters --- lib/docs/filters/pytorch/clean_html.rb | 9 --------- lib/docs/filters/pytorch/entries.rb | 13 +++++++------ lib/docs/scrapers/pytorch.rb | 2 +- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/lib/docs/filters/pytorch/clean_html.rb b/lib/docs/filters/pytorch/clean_html.rb index 465ae3a5..dd19c3e0 100644 --- a/lib/docs/filters/pytorch/clean_html.rb +++ b/lib/docs/filters/pytorch/clean_html.rb @@ -2,18 +2,9 @@ module Docs class Pytorch class CleanHtmlFilter < Filter def call - breadcrumbs = at_css('.pytorch-breadcrumbs') - type_name = breadcrumbs.css('li')[1].content - @doc = at_css('.pytorch-article') # Show katex-mathml nodes and remove katex-html nodes css('.katex-html').remove - - # pass type_name to following filters as a new node - node = Nokogiri::XML::Node.new 'meta', doc - node.content = type_name - doc.child.before node - doc end end diff --git a/lib/docs/filters/pytorch/entries.rb b/lib/docs/filters/pytorch/entries.rb index ba92e222..4a4580da 100644 --- a/lib/docs/filters/pytorch/entries.rb +++ b/lib/docs/filters/pytorch/entries.rb @@ -2,9 +2,10 @@ module Docs class Pytorch class EntriesFilter < 
Docs::EntriesFilter def get_name - # retrive the name in breadcrumb from the auxiliary node - name_in_breadcrumb = doc.child.content - doc.child.remove + breadcrumbs = at_css('.pytorch-breadcrumbs') + name_in_breadcrumb = breadcrumbs.css('li')[1].content + + article = at_css('.pytorch-article') # hard-coded name replacements, for better presentation. name_replacements = { @@ -14,9 +15,9 @@ module Docs # The id of the container `div.section` indicates the page type. # If the id starts with `module-`, then it's an API reference, # otherwise it is a note or design doc. - # After the `sphinx/clean_html` filter, that id is assigned to the second element. - if doc.element_children[1]['id']&.starts_with? 'module-' - /\Amodule-(.*)/.match(doc.element_children[1]['id'])[1] + article_id = article.at_css('div.section')['id'] + if article_id.starts_with? 'module-' + /\Amodule-(.*)/.match(article_id)[1] else name_in_breadcrumb = name_in_breadcrumb.delete_suffix(' >') name_in_breadcrumb = name_replacements.fetch(name_in_breadcrumb, name_in_breadcrumb) diff --git a/lib/docs/scrapers/pytorch.rb b/lib/docs/scrapers/pytorch.rb index f370502a..29b480d6 100644 --- a/lib/docs/scrapers/pytorch.rb +++ b/lib/docs/scrapers/pytorch.rb @@ -9,7 +9,7 @@ module Docs code: 'https://github.com/pytorch/pytorch' } - html_filters.push 'pytorch/clean_html', 'sphinx/clean_html', 'pytorch/entries' + html_filters.push 'pytorch/entries', 'pytorch/clean_html', 'sphinx/clean_html' options[:skip] = ['cpp_index.html', 'packages.html', 'py-modindex.html', 'genindex.html'] options[:skip_patterns] = [/\Acommunity/, /\A_modules/, /\Anotes/, /\Aorg\/pytorch\//] From 5afcd785d79d94c6f35d57a1a35b9066a85a5eb8 Mon Sep 17 00:00:00 2001 From: Phil Scherer Date: Sat, 21 Nov 2020 01:40:58 +0000 Subject: [PATCH 3/3] PyTorch 1.6+ scraper code cleanup --- lib/docs/filters/pytorch/entries.rb | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/lib/docs/filters/pytorch/entries.rb 
b/lib/docs/filters/pytorch/entries.rb
index 4a4580da..c7168f0e 100644
--- a/lib/docs/filters/pytorch/entries.rb
+++ b/lib/docs/filters/pytorch/entries.rb
@@ -1,26 +1,25 @@
 module Docs
   class Pytorch
     class EntriesFilter < Docs::EntriesFilter
-      def get_name
-        breadcrumbs = at_css('.pytorch-breadcrumbs')
-        name_in_breadcrumb = breadcrumbs.css('li')[1].content
-
-        article = at_css('.pytorch-article')
+      # Hard-coded name replacements, for better presentation.
+      NAME_REPLACEMENTS = {
+        "Distributed communication package - torch.distributed" => "torch.distributed"
+      }.freeze
 
-        # hard-coded name replacements, for better presentation.
-        name_replacements = {
-          "Distributed communication package - torch.distributed" => "torch.distributed"
-        }
+      # Text of each breadcrumb item, without the trailing ' >' separator.
+      def get_breadcrumbs()
+        css('.pytorch-breadcrumbs > li').map { |node| node.content.delete_suffix(' >') }
+      end
 
+      def get_name
         # The id of the container `div.section` indicates the page type.
         # If the id starts with `module-`, then it's an API reference,
         # otherwise it is a note or design doc.
-        article_id = article.at_css('div.section')['id']
+        article_id = at_css('.section')['id']
         if article_id.starts_with? 'module-'
           /\Amodule-(.*)/.match(article_id)[1]
         else
-          name_in_breadcrumb = name_in_breadcrumb.delete_suffix(' >')
-          name_in_breadcrumb = name_replacements.fetch(name_in_breadcrumb, name_in_breadcrumb)
-          name_in_breadcrumb
+          name = get_breadcrumbs()[1]
+          NAME_REPLACEMENTS.fetch(name, name)
         end
       end