From 7c01b590f07b2bb8bb6ee649e82314844804c8fd Mon Sep 17 00:00:00 2001 From: Phil Scherer Date: Sat, 5 Dec 2020 06:34:00 +0000 Subject: [PATCH] Cleanup, version, and improve Relay scraper --- lib/docs/filters/relay/clean_html.rb | 31 +++------------ lib/docs/filters/relay/entries.rb | 58 +++++++++++++--------------- lib/docs/scrapers/relay.rb | 34 ++++++++-------- 3 files changed, 50 insertions(+), 73 deletions(-) diff --git a/lib/docs/filters/relay/clean_html.rb b/lib/docs/filters/relay/clean_html.rb index f18d30ba..e3e7c3a1 100644 --- a/lib/docs/filters/relay/clean_html.rb +++ b/lib/docs/filters/relay/clean_html.rb @@ -2,38 +2,17 @@ module Docs class Relay class CleanHtmlFilter < Filter def call + @doc = at_css('.post') - if slug == 'index' - css('img').remove + header = at_css('h1') + header.parent.before(header).remove - css('.projectTitle').each do |node| - node.name = 'h1' - node.content = 'Relay' - end - - css('pre').remove - - end - - css('.docLastUpdate').remove - - css('.docs-prevnext').remove - - css('.edit-page-link').remove + css('footer').remove css('h2, h3').each do |node| - node.css('a').remove - node['id'] = node.content.gsub(/\s/, '-').downcase + node['id'] = node.at_css('a.anchor')['id'] end - css('.onPageNav').remove - - css('#docsNav').remove - - css('.fixedHeaderContainer').remove - - css('footer').remove - # syntax highlight css('pre').each do |node| node['data-language'] = 'javascript' diff --git a/lib/docs/filters/relay/entries.rb b/lib/docs/filters/relay/entries.rb index 7f7c6859..99f33543 100644 --- a/lib/docs/filters/relay/entries.rb +++ b/lib/docs/filters/relay/entries.rb @@ -1,51 +1,47 @@ module Docs class Relay class EntriesFilter < Docs::EntriesFilter - - def get_name - if slug == 'index' - return 'Relay' + ONLY_SECTIONS = ['API Reference', 'Principles & Architecture'] + ONLY_SLUGS = [] + + def call + if root_page? + css('.navGroup > h3').each do |node| + next if not ONLY_SECTIONS.include? node.content + node.next_element.css('a').each do |anchor| + ONLY_SLUGS << anchor['href'].split('/').last.strip + end + end end + super + end + def get_name at_css('h1').content end def get_type - if slug == 'index' - return 'Relay' - end - at_css('h1').content end - def additional_entries - entries = [] - - if slug == 'index' - return entries - end - - ## avoid adding non-desired entries removing tags - # remove header which contains a

tag - css('.fixedHeaderContainer').remove + def include_default_entry? + ONLY_SLUGS.include? slug + end - # remove table of content whose title is an

tag - css('.toc').remove - ## + def additional_entries + return [] if not include_default_entry? - css('h2, h3').each do |node| - next if node.content.include?('Argument') - entry_name = node.content + css('article h2, article h3').each_with_object [] do |node, entries| + next if node.content.include?('Argument') || + node.content.starts_with?('Example') - if entry_name.include?('(') - entry_name = entry_name.match(/.*\(/)[0] + ')' + name = node.content + if name.include?('(') + name = name.match(/.*\(/)[0] + ')' end - - entry_id = node.content.gsub(/\s/, '-').downcase - entries << [entry_name, entry_id] + id = node.at_css('a.anchor')['id'] + entries << [name, id] end - - entries end end diff --git a/lib/docs/scrapers/relay.rb b/lib/docs/scrapers/relay.rb index 8d01b3bc..0b3f6b8a 100644 --- a/lib/docs/scrapers/relay.rb +++ b/lib/docs/scrapers/relay.rb @@ -1,9 +1,7 @@ module Docs class Relay < UrlScraper self.type = 'simple' - self.release = '10.1.0' - self.base_url = 'https://relay.dev' - self.root_path = 'index.html' + self.root_path = 'introduction-to-relay' self.links = { home: 'https://relay.dev/', code: 'https://github.com/facebook/relay' @@ -11,19 +9,7 @@ module Docs html_filters.push 'relay/entries', 'relay/clean_html' - options[:only] = [ - '/docs/en/graphql-in-relay', - '/docs/en//relay-environment', - '/docs/en/network-layer', - '/docs/en/query-renderer', - '/docs/en/fragment-container', - '/docs/en/refetch-container', - '/docs/en/pagination-container', - '/docs/en/mutations', - '/docs/en/subscriptions', - '/docs/en/relay-store', - '/docs/en/fetch-query' - ] + options[:skip] = %w(videos) options[:attribution] = <<-HTML © 2020–present Facebook Inc.
@@ -34,5 +20,21 @@ module Docs get_latest_github_release('facebook', 'relay', opts) end + version '10' do + self.release = '10.1.0' + self.base_url = "https://relay.dev/docs/en/" + # For some reason, the most-recent version isn't available at a versioned URL + end + + version '9' do + self.release = '9.1.0' + self.base_url = "https://relay.dev/docs/en/v#{self.release}/" + end + + version '8' do + self.release = '8.0.0' + self.base_url = "https://relay.dev/docs/en/v#{self.release}/" + end + end end