From 38e2b107a2eabaa85de44b214554a570aac542d4 Mon Sep 17 00:00:00 2001 From: Enoc Date: Fri, 5 Mar 2021 16:17:27 -0600 Subject: [PATCH] Add external_urls filter This filter traverses all <a> tags and replaces each external URL with a URL pointing to a path within existing documentation. --- docs/filter-reference.md | 1 + docs/scraper-reference.md | 5 ++++ lib/docs/core/filter.rb | 10 +++++++ lib/docs/core/scraper.rb | 2 +- lib/docs/filters/core/external_urls.rb | 38 ++++++++++++++++++++++++++ lib/docs/scrapers/backbone.rb | 4 +++ 6 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 lib/docs/filters/core/external_urls.rb diff --git a/docs/filter-reference.md b/docs/filter-reference.md index 6c138771..bd654f19 100644 --- a/docs/filter-reference.md +++ b/docs/filter-reference.md @@ -84,6 +84,7 @@ The `call` method must return either `doc` or `html`, depending on the type of f * [`AttributionFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/attribution.rb) — appends the license info and link to the original document * [`TitleFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/title.rb) — prepends the document with a title (disabled by default) * [`EntriesFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/entries.rb) — abstract filter for extracting the page's metadata +* [`ExternalUrlsFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/external_urls.rb) — replaces external URLs with relative URLs pointing to existing devdocs documentation. ## Custom filters diff --git a/docs/scraper-reference.md b/docs/scraper-reference.md index fc00876d..e48fbe0f 100644 --- a/docs/scraper-reference.md +++ b/docs/scraper-reference.md @@ -115,6 +115,7 @@ Additionally: * [`TitleFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/title.rb) is a core HTML filter, disabled by default, which prepends the document with a title (`<h1>`). * [`EntriesFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/entries.rb) is an abstract HTML filter that each scraper must implement and responsible for extracting the page's metadata. +* [`ExternalUrlsFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/external_urls.rb) is an HTML filter that replaces external URLs found in `<a>` tags with URLs pointing to existing devdocs documentation. ### Filter options @@ -185,6 +186,10 @@ More information about how filters work is available on the [Filter Reference](. _Note: this filter is disabled by default._ +* [`ExternalUrlsFilter`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/filters/core/external_urls.rb) + + - `:external_urls` [Hash or Proc] If it is a Hash (external host => devdocs documentation name), replaces all URLs found in `<a>` tags whose host matches a key with relative URLs pointing to the corresponding existing devdocs documentation. If it is a Proc, it is called with a URL (string) as argument and should return a relative URL pointing to existing devdocs documentation. See [`backbone.rb`](https://github.com/freeCodeCamp/devdocs/blob/master/lib/docs/scrapers/backbone.rb) for an example. + ## Keeping scrapers up-to-date In order to keep scrapers up-to-date the `get_latest_version(opts)` method should be overridden. If `self.release` is defined, this should return the latest version of the documentation. If `self.release` is not defined, it should return the Epoch time when the documentation was last modified. If the documentation will never change, simply return `1.0.0`. The result of this method is periodically reported in a "Documentation versions report" issue which helps maintainers keep track of outdated documentations. 
diff --git a/lib/docs/core/filter.rb b/lib/docs/core/filter.rb index 5be77883..52b9cfca 100644 --- a/lib/docs/core/filter.rb +++ b/lib/docs/core/filter.rb @@ -96,5 +96,15 @@ module Docs path = path.gsub %r{\+}, '_plus_' path end + + def path_to_root + if subpath == '' + return '../' + else + previous_dirs = subpath.scan(/\//) + return '../' * previous_dirs.length + end + end + end end diff --git a/lib/docs/core/scraper.rb b/lib/docs/core/scraper.rb index 083b0015..4013755a 100644 --- a/lib/docs/core/scraper.rb +++ b/lib/docs/core/scraper.rb @@ -41,7 +41,7 @@ module Docs self.html_filters = FilterStack.new self.text_filters = FilterStack.new - html_filters.push 'apply_base_url', 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths', 'parse_cf_email' + html_filters.push 'apply_base_url', 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths', 'parse_cf_email', 'external_urls' text_filters.push 'images' # ensure the images filter runs after all html filters text_filters.push 'inner_html', 'clean_text', 'attribution' diff --git a/lib/docs/filters/core/external_urls.rb b/lib/docs/filters/core/external_urls.rb new file mode 100644 index 00000000..47fbc220 --- /dev/null +++ b/lib/docs/filters/core/external_urls.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +module Docs + class ExternalUrlsFilter < Filter + + def call + if context[:external_urls] + + root = path_to_root + + css('a').each do |node| + + next unless anchorUrl = node['href'] + + # avoid links already converted to internal links + next if anchorUrl.match?(/\.\./) + + if context[:external_urls].is_a?(Proc) + node['href'] = context[:external_urls].call(anchorUrl) + next + end + + url = URI(anchorUrl) + + context[:external_urls].each do |host, name| + if url.host.to_s.match?(host) + node['href'] = root + name + url.path.to_s + '#' + url.fragment.to_s + end + end + + end + end + + doc + end + + end +end diff --git a/lib/docs/scrapers/backbone.rb 
b/lib/docs/scrapers/backbone.rb index 3a5fdb76..944c69a3 100644 --- a/lib/docs/scrapers/backbone.rb +++ b/lib/docs/scrapers/backbone.rb @@ -21,6 +21,10 @@ module Docs Licensed under the MIT License. HTML + options[:external_urls] = { + 'underscorejs.org' => 'underscore' + } + def get_latest_version(opts) doc = fetch_doc('https://backbonejs.org/', opts) doc.at_css('.version').content[1...-1]