From 018628ea7dd1b052ee512673712ddb581df21f70 Mon Sep 17 00:00:00 2001 From: Thibaut Date: Sun, 5 Apr 2015 17:46:07 -0400 Subject: [PATCH] Add two-pass redirection rewriter ... to avoid having to maintain huge lists of redirects. This works by doing a first pass to detect which internal URL is redirected where, before doing a second (normal) pass that rewrites all these URLs (links) with their final destination. There's a bit of monkey-patching I'm not proud of, but this works(tm). --- lib/docs/core/scraper.rb | 1 + lib/docs/core/scrapers/url_scraper.rb | 61 +++++++++++++++++++ lib/docs/filters/core/normalize_urls.rb | 48 +++++++++++---- lib/docs/subscribers/doc_subscriber.rb | 4 ++ .../docs/filters/core/normalize_urls_test.rb | 26 +++++++- 5 files changed, 128 insertions(+), 12 deletions(-) diff --git a/lib/docs/core/scraper.rb b/lib/docs/core/scraper.rb index 4e0edadb..174f4856 100644 --- a/lib/docs/core/scraper.rb +++ b/lib/docs/core/scraper.rb @@ -100,6 +100,7 @@ module Docs (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/']) end + options.merge!(additional_options) if respond_to?(:additional_options, true) options.freeze end end diff --git a/lib/docs/core/scrapers/url_scraper.rb b/lib/docs/core/scrapers/url_scraper.rb index cb918b1b..97bcb6f4 100644 --- a/lib/docs/core/scrapers/url_scraper.rb +++ b/lib/docs/core/scrapers/url_scraper.rb @@ -28,5 +28,66 @@ module Docs def process_response?(response) response.success? && response.html? && base_url.contains?(response.effective_url) end + + module FixRedirectionsBehavior + def self.included(base) + base.extend ClassMethods + end + + module ClassMethods + attr_accessor :fix_redirections + attr_reader :redirections + + def store_pages(store) + return super unless fix_redirections + instrument 'info.doc', msg: 'Fetching redirections...' + with_redirections do + instrument 'info.doc', msg: 'Building pages...' + super + end + end + + private + + def with_redirections + @redirections = new.fetch_redirections + yield + ensure + @redirections = nil + end + end + + def fetch_redirections + result = {} + with_filters 'container', 'normalize_urls', 'internal_urls' do + build_pages do |page| + next if page[:response_effective_path] == page[:response_path] + result[page[:response_path].downcase] = page[:response_effective_path] + end + end + result + end + + private + + def process_response(response) + super.merge! response_effective_path: response.effective_path, response_path: response.path + end + + def additional_options + { redirections: self.class.redirections } + end + + def with_filters(*filters) + stack = FilterStack.new + stack.push(*filters) + pipeline.instance_variable_set :@filters, stack.to_a.freeze + yield + ensure + @pipeline = nil + end + end + + include FixRedirectionsBehavior end end diff --git a/lib/docs/filters/core/normalize_urls.rb b/lib/docs/filters/core/normalize_urls.rb index 1fc28d72..6fd77129 100644 --- a/lib/docs/filters/core/normalize_urls.rb +++ b/lib/docs/filters/core/normalize_urls.rb @@ -19,8 +19,12 @@ module Docs def normalize_url(str) url = to_absolute_url(str) - fix_url(url) - fix_url_string(url.to_s) + + while new_url = fix_url(url) + url = new_url + end + + url.to_s rescue URI::InvalidURIError '#' end @@ -31,18 +35,40 @@ module Docs end def fix_url(url) - return unless context[:replace_paths] - path = subpath_to(url) + if context[:redirections] + url = URL.parse(url) + path = url.path.downcase - if context[:replace_paths].has_key?(path) - url.path = url.path.sub %r[#{path}\z], context[:replace_paths][path] + if context[:redirections].key?(path) + url.path = context[:redirections][path] + return url + end end - end - def fix_url_string(str) - str = context[:replace_urls][str] || str if context[:replace_urls] - str = context[:fix_urls].call(str) || str if context[:fix_urls] - str + if context[:replace_paths] + url = URL.parse(url) + path = subpath_to(url) + + if context[:replace_paths].key?(path) + url.path = url.path.sub %r[#{path}\z], context[:replace_paths][path] + return url + end + end + + if context[:replace_urls] + url = url.to_s + + if context[:replace_urls].key?(url) + return context[:replace_urls][url] + end + end + + if context[:fix_urls] + url = url.to_s + orig_url = url.dup + new_url = context[:fix_urls].call(url) + return new_url if new_url != orig_url + end end end end diff --git a/lib/docs/subscribers/doc_subscriber.rb b/lib/docs/subscribers/doc_subscriber.rb index 19950836..7c1fa550 100644 --- a/lib/docs/subscribers/doc_subscriber.rb +++ b/lib/docs/subscribers/doc_subscriber.rb @@ -16,6 +16,10 @@ module Docs log_diff before.keys, after.keys end + def info(event) + log event.payload[:msg] + end + private def parse_payload(event) diff --git a/test/lib/docs/filters/core/normalize_urls_test.rb b/test/lib/docs/filters/core/normalize_urls_test.rb index 9698b88a..6bcb678c 100644 --- a/test/lib/docs/filters/core/normalize_urls_test.rb +++ b/test/lib/docs/filters/core/normalize_urls_test.rb @@ -116,7 +116,7 @@ class NormalizeUrlsFilterTest < MiniTest::Spec end it "calls the block with each absolute url" do - context[:fix_urls] = ->(arg) { (@args ||= []).push(arg) } + context[:fix_urls] = ->(arg) { (@args ||= []).push(arg); nil } @body += link_to '/path?#' filter.call assert_equal ['http://example.com/path?#'] * 2, @args @@ -139,4 +139,28 @@ class NormalizeUrlsFilterTest < MiniTest::Spec refute @called end end + + context "when context[:redirections] is a hash" do + before do + @body = link_to 'http://example.com/path?query#frag' + end + + it "replaces the path of matching urls, case-insensitive" do + @body = link_to('http://example.com/PATH?query#frag') + link_to('http://example.com/path/two') + context[:redirections] = { '/path' => '/fixed' } + expected = link_to('http://example.com/fixed?query#frag') + link_to('http://example.com/path/two') + assert_equal expected, filter_output_string + end + + it "does a multi pass with context[:fix_urls]" do + @body = link_to('http://example.com/path') + context[:fix_urls] = ->(url) do + url.sub! 'example.com', 'example.org' + url.sub! '/Fixed', '/fixed' + url + end + context[:redirections] = { '/path' => '/Fixed' } + assert_equal link_to('http://example.org/fixed'), filter_output_string + end + end end