Refactor scrapers with multiple base URLs

pull/496/head
Thibaut Courouble 9 years ago
parent 3eb5a0caaa
commit 46a9ed16f6

@ -3,7 +3,7 @@ require 'set'
module Docs
class Scraper < Doc
class << self
attr_accessor :base_url, :root_path, :initial_paths, :initial_urls, :options, :html_filters, :text_filters, :stubs
attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters, :stubs
def inherited(subclass)
super
@ -16,7 +16,6 @@ module Docs
subclass.base_url = base_url
subclass.root_path = root_path
subclass.initial_paths = initial_paths.dup
subclass.initial_urls = initial_urls.dup
subclass.options = options.deep_dup
subclass.html_filters = html_filters.inheritable_copy
subclass.text_filters = text_filters.inheritable_copy
@ -36,7 +35,6 @@ module Docs
include Instrumentable
self.initial_paths = []
self.initial_urls = []
self.options = {}
self.stubs = {}
@ -105,7 +103,7 @@ module Docs
end
def initial_urls
@initial_urls ||= [root_url.to_s].concat(self.class.initial_urls).concat(initial_paths.map(&method(:url_for))).freeze
@initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
end
def pipeline

@ -50,6 +50,52 @@ module Docs
Capybara
end
# Mixin for scrapers whose documentation is spread over several base URLs.
#
# An including class declares `self.base_urls = [...]`. The first entry is
# also assigned to the class-level `base_url`; the remaining entries are
# appended to the instance's initial URLs. While a response is processed,
# the instance's `base_url` is temporarily pointed at whichever base URL
# contains the response's effective URL.
module MultipleBaseUrls
  def self.included(base)
    base.extend ClassMethods
  end

  module ClassMethods
    # The list of base URL strings as last assigned through `base_urls=`.
    attr_reader :base_urls

    # Stores the full list and makes its first entry the primary `base_url`.
    def base_urls=(urls)
      self.base_url = urls.first
      @base_urls = urls
    end
  end

  # Initial URLs of the parent implementation followed by every base URL
  # after the first one. The strings are copied so that callers mutating the
  # result cannot alter the class-level configuration.
  def initial_urls
    super + self.class.base_urls.drop(1).map(&:dup)
  end

  # Parsed (URL) form of the class-level base URLs, memoized per instance.
  def base_urls
    @base_urls ||= self.class.base_urls.map { |url| URL.parse(url) }
  end

  private

  # A URL is processed when any of the base URLs contains it.
  def process_url?(url)
    base_urls.any? { |candidate| candidate.contains?(url) }
  end

  # Runs the parent `process_response` with `base_url` temporarily rewritten
  # to the base URL that contains the response's effective URL, then restores
  # the original scheme, host and path, even if the parent raises.
  def process_response(response)
    target = base_urls.find { |candidate| candidate.contains?(response.effective_url) }
    # No base URL matches: leave base_url untouched instead of failing on nil.
    return super unless target

    current = base_url
    saved = [current.scheme, current.host, current.path]

    # The ensure only guards the span in which base_url is actually modified,
    # so a failure while capturing the originals cannot write nils back.
    begin
      current.scheme, current.host, current.path = target.scheme, target.host, target.path
      super
    ensure
      current.scheme, current.host, current.path = saved
    end
  end
end
module FixRedirectionsBehavior
def self.included(base)
base.extend ClassMethods

@ -1,16 +1,12 @@
module Docs
class Ember < UrlScraper
class << self
attr_accessor :guide_url
end
include MultipleBaseUrls
self.name = 'Ember.js'
self.slug = 'ember'
self.type = 'ember'
self.release = '2.7.0'
self.base_url = 'http://emberjs.com/api/'
self.guide_url = "https://guides.emberjs.com/v#{self.release}/"
self.initial_urls = [guide_url]
self.base_urls = ['http://emberjs.com/api/', "https://guides.emberjs.com/v#{self.release}/"]
self.links = {
home: 'http://emberjs.com/',
code: 'https://github.com/emberjs/ember.js'
@ -39,29 +35,5 @@ module Docs
&copy; 2016 Yehuda Katz, Tom Dale and Ember.js contributors<br>
Licensed under the MIT License.
HTML
def guide_url
@guide_url ||= URL.parse(self.class.guide_url)
end
private
def process_url?(url)
base_url.contains?(url) || guide_url.contains?(url)
end
def process_response(response)
original_scheme = @base_url.scheme
original_host = @base_url.host
original_path = @base_url.path
@base_url.scheme = response.effective_url.scheme
@base_url.host = response.effective_url.host
@base_url.path = response.effective_url.path[/\A\/v[\d\.]+\//, 0] || '/api/'
super
ensure
@base_url.scheme = original_scheme
@base_url.host = original_host
@base_url.path = original_path
end
end
end

@ -1,8 +1,6 @@
module Docs
class Meteor < UrlScraper
class << self
attr_accessor :guide_url
end
include MultipleBaseUrls
self.type = 'meteor'
self.root_path = 'index.html'
@ -28,37 +26,12 @@ module Docs
version '1.4' do
self.release = '1.4.0'
self.base_url = 'https://docs.meteor.com/'
self.guide_url = 'https://guide.meteor.com/'
self.initial_urls = [guide_url]
self.base_urls = ['https://docs.meteor.com/', 'https://guide.meteor.com/']
end
version '1.3' do
self.release = '1.3.5'
self.base_url = "https://docs.meteor.com/v#{self.release}/"
self.guide_url = 'https://guide.meteor.com/v1.3/'
self.initial_urls = [guide_url]
end
def guide_url
@guide_url ||= URL.parse(self.class.guide_url)
end
private
def process_url?(url)
base_url.contains?(url) || guide_url.contains?(url)
end
def process_response(response)
original_host = @base_url.host
original_path = @base_url.path
@base_url.host = response.effective_url.host
@base_url.path = response.effective_url.path[/\A\/v[\d\.]+\//, 0] || '/'
super
ensure
@base_url.host = original_host
@base_url.path = original_path
self.base_urls = ["https://docs.meteor.com/v#{self.release}/", 'https://guide.meteor.com/v1.3/']
end
end
end

Loading…
Cancel
Save