From 46a9ed16f600d6ce26279ca657770a2d5a4960eb Mon Sep 17 00:00:00 2001 From: Thibaut Courouble Date: Sun, 18 Sep 2016 16:39:29 -0400 Subject: [PATCH] Refactor scrapers with multiple base URLs --- lib/docs/core/scraper.rb | 6 ++-- lib/docs/core/scrapers/url_scraper.rb | 46 +++++++++++++++++++++++++++ lib/docs/scrapers/ember.rb | 32 ++----------------- lib/docs/scrapers/meteor.rb | 33 ++----------------- 4 files changed, 53 insertions(+), 64 deletions(-) diff --git a/lib/docs/core/scraper.rb b/lib/docs/core/scraper.rb index d5109332..3e96cc70 100644 --- a/lib/docs/core/scraper.rb +++ b/lib/docs/core/scraper.rb @@ -3,7 +3,7 @@ require 'set' module Docs class Scraper < Doc class << self - attr_accessor :base_url, :root_path, :initial_paths, :initial_urls, :options, :html_filters, :text_filters, :stubs + attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters, :stubs def inherited(subclass) super @@ -16,7 +16,6 @@ module Docs subclass.base_url = base_url subclass.root_path = root_path subclass.initial_paths = initial_paths.dup - subclass.initial_urls = initial_urls.dup subclass.options = options.deep_dup subclass.html_filters = html_filters.inheritable_copy subclass.text_filters = text_filters.inheritable_copy @@ -36,7 +35,6 @@ module Docs include Instrumentable self.initial_paths = [] - self.initial_urls = [] self.options = {} self.stubs = {} @@ -105,7 +103,7 @@ module Docs end def initial_urls - @initial_urls ||= [root_url.to_s].concat(self.class.initial_urls).concat(initial_paths.map(&method(:url_for))).freeze + @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze end def pipeline diff --git a/lib/docs/core/scrapers/url_scraper.rb b/lib/docs/core/scrapers/url_scraper.rb index f5ecd40c..d5082478 100644 --- a/lib/docs/core/scrapers/url_scraper.rb +++ b/lib/docs/core/scrapers/url_scraper.rb @@ -50,6 +50,52 @@ module Docs Capybara end + module MultipleBaseUrls + def self.included(base) + base.extend ClassMethods + end + + module ClassMethods + attr_reader :base_urls + + def base_urls=(urls) + self.base_url = urls.first + @base_urls = urls + end + end + + def initial_urls + super + self.class.base_urls[1..-1] + end + + def base_urls + @base_urls ||= self.class.base_urls.map { |url| URL.parse(url) } + end + + private + + def process_url?(url) + base_urls.any? { |base_url| base_url.contains?(url) } + end + + def process_response(response) + original_scheme = self.base_url.scheme + original_host = self.base_url.host + original_path = self.base_url.path + + effective_base_url = self.base_urls.find { |base_url| base_url.contains?(response.effective_url) } + + self.base_url.scheme = effective_base_url.scheme + self.base_url.host = effective_base_url.host + self.base_url.path = effective_base_url.path + super + ensure + self.base_url.scheme = original_scheme + self.base_url.host = original_host + self.base_url.path = original_path + end + end + module FixRedirectionsBehavior def self.included(base) base.extend ClassMethods diff --git a/lib/docs/scrapers/ember.rb b/lib/docs/scrapers/ember.rb index 37b2708d..8f8573e5 100644 --- a/lib/docs/scrapers/ember.rb +++ b/lib/docs/scrapers/ember.rb @@ -1,16 +1,12 @@ module Docs class Ember < UrlScraper - class << self - attr_accessor :guide_url - end + include MultipleBaseUrls self.name = 'Ember.js' self.slug = 'ember' self.type = 'ember' self.release = '2.7.0' - self.base_url = 'http://emberjs.com/api/' - self.guide_url = "https://guides.emberjs.com/v#{self.release}/" - self.initial_urls = [guide_url] + self.base_urls = ['http://emberjs.com/api/', "https://guides.emberjs.com/v#{self.release}/"] self.links = { home: 'http://emberjs.com/', code: 'https://github.com/emberjs/ember.js' @@ -39,29 +35,5 @@ module Docs © 2016 Yehuda Katz, Tom Dale and Ember.js contributors
Licensed under the MIT License. HTML - - def guide_url - @guide_url ||= URL.parse(self.class.guide_url) - end - - private - - def process_url?(url) - base_url.contains?(url) || guide_url.contains?(url) - end - - def process_response(response) - original_scheme = @base_url.scheme - original_host = @base_url.host - original_path = @base_url.path - @base_url.scheme = response.effective_url.scheme - @base_url.host = response.effective_url.host - @base_url.path = response.effective_url.path[/\A\/v[\d\.]+\//, 0] || '/api/' - super - ensure - @base_url.scheme = original_scheme - @base_url.host = original_host - @base_url.path = original_path - end end end diff --git a/lib/docs/scrapers/meteor.rb b/lib/docs/scrapers/meteor.rb index be5b23c5..5d0cddf9 100644 --- a/lib/docs/scrapers/meteor.rb +++ b/lib/docs/scrapers/meteor.rb @@ -1,8 +1,6 @@ module Docs class Meteor < UrlScraper - class << self - attr_accessor :guide_url - end + include MultipleBaseUrls self.type = 'meteor' self.root_path = 'index.html' @@ -28,37 +26,12 @@ module Docs version '1.4' do self.release = '1.4.0' - self.base_url = 'https://docs.meteor.com/' - self.guide_url = 'https://guide.meteor.com/' - self.initial_urls = [guide_url] + self.base_urls = ['https://docs.meteor.com/', 'https://guide.meteor.com/'] end version '1.3' do self.release = '1.3.5' - self.base_url = "https://docs.meteor.com/v#{self.release}/" - self.guide_url = 'https://guide.meteor.com/v1.3/' - self.initial_urls = [guide_url] - end - - def guide_url - @guide_url ||= URL.parse(self.class.guide_url) - end - - private - - def process_url?(url) - base_url.contains?(url) || guide_url.contains?(url) - end - - def process_response(response) - original_host = @base_url.host - original_path = @base_url.path - @base_url.host = response.effective_url.host - @base_url.path = response.effective_url.path[/\A\/v[\d\.]+\//, 0] || '/' - super - ensure - @base_url.host = original_host - @base_url.path = original_path + self.base_urls = ["https://docs.meteor.com/v#{self.release}/", 'https://guide.meteor.com/v1.3/'] end end end