From 4e41ed9f259e4d60e648a6fb10e0b3f2259469a5 Mon Sep 17 00:00:00 2001 From: Thibaut Courouble Date: Sun, 22 Jan 2017 10:26:14 -0500 Subject: [PATCH] Add <base> support --- lib/docs/core/filter.rb | 4 +- lib/docs/core/scraper.rb | 2 +- lib/docs/core/scrapers/url_scraper.rb | 2 +- lib/docs/filters/core/apply_base_url.rb | 21 ++++++ test/lib/docs/core/filter_test.rb | 4 ++ .../docs/filters/core/apply_base_url_test.rb | 69 +++++++++++++++++++ 6 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 lib/docs/filters/core/apply_base_url.rb create mode 100644 test/lib/docs/filters/core/apply_base_url_test.rb diff --git a/lib/docs/core/filter.rb b/lib/docs/core/filter.rb index 82788ba0..8b8278fa 100644 --- a/lib/docs/core/filter.rb +++ b/lib/docs/core/filter.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module Docs class Filter < ::HTML::Pipeline::Filter def css(*args) @@ -73,7 +75,7 @@ module Docs end def relative_url_string?(str) - !fragment_url_string?(str) && str !~ SCHEME_RGX + str !~ SCHEME_RGX && !fragment_url_string?(str) && !data_url_string?(str) end def absolute_url_string?(str) diff --git a/lib/docs/core/scraper.rb b/lib/docs/core/scraper.rb index 63b33e29..bf2af017 100644 --- a/lib/docs/core/scraper.rb +++ b/lib/docs/core/scraper.rb @@ -41,7 +41,7 @@ module Docs self.html_filters = FilterStack.new self.text_filters = FilterStack.new - html_filters.push 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths' + html_filters.push 'apply_base_url', 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths' text_filters.push 'inner_html', 'clean_text', 'attribution' def initialize diff --git a/lib/docs/core/scrapers/url_scraper.rb b/lib/docs/core/scrapers/url_scraper.rb index 12b4a68c..e033c6fa 100644 --- a/lib/docs/core/scrapers/url_scraper.rb +++ b/lib/docs/core/scrapers/url_scraper.rb @@ -129,7 +129,7 @@ module Docs def fetch_redirections result = {} - with_filters 'container', 'normalize_urls', 
'internal_urls' do + with_filters 'apply_base_url', 'container', 'normalize_urls', 'internal_urls' do build_pages do |page| next if page[:response_effective_path] == page[:response_path] result[page[:response_path].downcase] = page[:response_effective_path] diff --git a/lib/docs/filters/core/apply_base_url.rb b/lib/docs/filters/core/apply_base_url.rb new file mode 100644 index 00000000..0a20a95a --- /dev/null +++ b/lib/docs/filters/core/apply_base_url.rb @@ -0,0 +1,21 @@ +module Docs + class ApplyBaseUrlFilter < Filter + URL_ATTRIBUTES = { 'a': 'href', 'img': 'src', 'iframe': 'src' } + SCHEME_RGX = /\A[^:\/?#]+:/ + + def call + base_url = at_css('base').try(:[], 'href') + return doc unless base_url + + URL_ATTRIBUTES.each_pair do |tag, attribute| + css(tag).each do |node| + next unless value = node[attribute] + next if !relative_url_string?(value) || value[0] == '/'.freeze + node[attribute] = "#{base_url}#{node[attribute]}" + end + end + + doc + end + end +end diff --git a/test/lib/docs/core/filter_test.rb b/test/lib/docs/core/filter_test.rb index d98ecdad..77265c22 100644 --- a/test/lib/docs/core/filter_test.rb +++ b/test/lib/docs/core/filter_test.rb @@ -149,6 +149,10 @@ class DocsFilterTest < MiniTest::Spec it "returns false with 'mailto:test@example.com'" do refute filter.relative_url_string?('mailto:test@example.com') end + + it "returns false with ''" do + refute filter.relative_url_string?('') + end end describe "#absolute_url_string?" 
do diff --git a/test/lib/docs/filters/core/apply_base_url_test.rb b/test/lib/docs/filters/core/apply_base_url_test.rb new file mode 100644 index 00000000..e627d959 --- /dev/null +++ b/test/lib/docs/filters/core/apply_base_url_test.rb @@ -0,0 +1,69 @@ +require 'test_helper' +require 'docs' + +class ApplyBaseUrlFilterTest < MiniTest::Spec + include FilterTestHelper + self.filter_class = Docs::ApplyBaseUrlFilter + self.filter_type = 'html' + + context "when there is no " do + it "does nothing" do + @body = make_body nil, link_to('test') + assert_equal link_to('test'), filter_output.at_css('body').inner_html + end + end + + context "when is '/base/'" do + it "rewrites relative urls" do + @body = make_body '/base/', link_to('path#frag') + assert_equal link_to('/base/path#frag'), filter_output.at_css('body').inner_html + end + + it "rewrites relative image urls" do + @body = make_body '/base/', '' + assert_equal '', filter_output.at_css('body').inner_html + end + + it "rewrites relative iframe urls" do + @body = make_body '/base/', '' + assert_equal '', filter_output.at_css('body').inner_html + end + + it "doesn't rewrite absolute urls" do + @body = make_body '/base/', link_to('http://example.com') + assert_equal link_to('http://example.com'), filter_output.at_css('body').inner_html + end + + it "doesn't rewrite protocol-less urls" do + @body = make_body '/base/', link_to('//example.com') + assert_equal link_to('//example.com'), filter_output.at_css('body').inner_html + end + + it "doesn't rewrite root-relative urls" do + @body = make_body '/base/', link_to('/path') + assert_equal link_to('/path'), filter_output.at_css('body').inner_html + end + + it "doesn't rewrite fragment-only urls" do + @body = make_body '/base/', link_to('#test') + assert_equal link_to('#test'), filter_output.at_css('body').inner_html + end + + it "doesn't rewrite email urls" do + @body = make_body '/base/', link_to('mailto:test@example.com') + assert_equal link_to('mailto:test@example.com'), 
filter_output.at_css('body').inner_html + end + + it "doesn't rewrite data urls" do + @body = make_body '/base/', '' + assert_equal '', filter_output.at_css('body').inner_html + end + end + + private + + def make_body(base, body) + base = %() if base + "#{base}#{body}" + end +end