Implement initial_paths scraper option

pull/29/head
Thibaut 11 years ago
parent 5e69d6df5e
commit f8298181e9

@ -4,7 +4,7 @@ require 'html/pipeline'
module Docs
class Scraper < Doc
class << self
attr_accessor :base_url, :root_path, :html_filters, :text_filters, :options
attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters
def inherited(subclass)
super
@ -15,6 +15,7 @@ module Docs
end
subclass.root_path = root_path
subclass.initial_paths = initial_paths.dup
subclass.options = options.deep_dup
subclass.html_filters = html_filters.inheritable_copy
subclass.text_filters = text_filters.inheritable_copy
@ -27,11 +28,12 @@ module Docs
include Instrumentable
self.initial_paths = []
self.options = {}
self.html_filters = FilterStack.new
self.text_filters = FilterStack.new
self.options = {}
html_filters.push 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths'
text_filters.push 'inner_html', 'clean_text', 'attribution'
@ -43,10 +45,10 @@ module Docs
end
def build_pages
requested_urls = Set.new [root_url.to_s.downcase]
instrument 'running.scraper', urls: requested_urls.to_a
requested_urls = Set.new initial_urls.map(&:downcase)
instrument 'running.scraper', urls: initial_urls
request_all root_url.to_s do |response|
request_all initial_urls do |response|
next unless data = handle_response(response)
yield data
next unless data[:internal_urls].present?
@ -72,6 +74,14 @@ module Docs
root_path.present? && root_path != '/'
end
def initial_paths
self.class.initial_paths
end
def initial_urls
@initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
end
def pipeline
@pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
pipeline.instrumentation_service = Docs
@ -80,14 +90,15 @@ module Docs
def options
@options ||= self.class.options.deep_dup.tap do |options|
options.merge! base_url: base_url, root_url: root_url, root_path: root_path
options.merge! base_url: base_url, root_url: root_url,
root_path: root_path, initial_paths: initial_paths
if root_path?
(options[:skip] ||= []).concat ['', '/']
end
if options[:only] || options[:only_patterns]
(options[:only] ||= []).concat root_path? ? [root_path] : ['', '/']
(options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
end
options.freeze

@ -6,6 +6,7 @@ class DocsScraperTest < MiniTest::Spec
self.type = 'scraper'
self.base_url = 'http://example.com/'
self.root_path = '/root'
self.initial_paths = ['/initial']
self.html_filters = Docs::FilterStack.new
self.text_filters = Docs::FilterStack.new
end
@ -37,6 +38,12 @@ class DocsScraperTest < MiniTest::Spec
assert_equal Scraper.root_path, subclass.root_path
end
it "duplicates .initial_paths" do
stub(Scraper).initial_paths { ['path'] }
assert_equal Scraper.initial_paths, subclass.initial_paths
refute_same Scraper.initial_paths, subclass.initial_paths
end
it "duplicates .options" do
stub(Scraper).options { { test: [] } }
assert_equal Scraper.options, subclass.options
@ -112,6 +119,29 @@ class DocsScraperTest < MiniTest::Spec
end
end
describe "#initial_urls" do
let :initial_urls do
scraper.initial_urls
end
it "returns a frozen, memoized Array" do
assert_instance_of Array, initial_urls
assert initial_urls.frozen?
assert_same initial_urls, scraper.initial_urls
end
it "includes the #root_url" do
assert_includes initial_urls, scraper.root_url.to_s
end
it "includes the .initial_paths converted to urls" do
stub(Scraper).base_url { 'http://example.com/' }
stub(Scraper).initial_paths { ['one', '/two'] }
assert_includes initial_urls, 'http://example.com/one'
assert_includes initial_urls, 'http://example.com/two'
end
end
describe "#build_page" do
before do
stub(scraper).handle_response
@ -171,8 +201,8 @@ class DocsScraperTest < MiniTest::Spec
Proc.new {}
end
it "requests the root url" do
mock(scraper).request_all(scraper.root_url.to_s)
it "requests the #initial_urls" do
mock(scraper).request_all(scraper.initial_urls)
scraper.build_pages(&block)
end
@ -181,12 +211,14 @@ class DocsScraperTest < MiniTest::Spec
scraper.build_pages(&block)
assert scraper.last_instrumentation
assert_equal 'running.scraper', scraper.last_instrumentation[:event]
assert_includes scraper.last_instrumentation[:payload][:urls], scraper.root_url.to_s
assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls]
end
context "when the response is processable" do
before do
stub(scraper).request_all.with_any_args { |*args| @returned = args.last.call(response) }
stub(scraper).request_all do |urls, block|
urls.each { |url| @next_urls ||= block.call(response) }
end
stub(scraper).handle_response(response) { processed_response }
end
@ -203,7 +235,7 @@ class DocsScraperTest < MiniTest::Spec
it "requests nothing more" do
scraper.build_pages(&block)
assert_nil @returned
assert_nil @next_urls
end
it "doesn't instrument 'queued'" do
@ -223,14 +255,14 @@ class DocsScraperTest < MiniTest::Spec
it "requests the urls" do
scraper.build_pages(&block)
assert_equal urls, @returned
assert_equal urls, @next_urls
end
it "doesn't request the same url twice irrespective of case" do
stub(Scraper).root_path { 'PATH' }
processed_response[:internal_urls] = [scraper.root_url.to_s.swapcase]
processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
scraper.build_pages(&block)
assert_empty @returned
assert_empty @next_urls
end
it "instruments 'queued'" do
@ -280,6 +312,10 @@ class DocsScraperTest < MiniTest::Spec
assert_equal '/root', options[:root_path]
end
it "includes .initial_paths" do
assert_equal ['/initial'], options[:initial_paths]
end
context "when #root_path? is false" do
before do
stub(scraper).root_path? { false }
@ -300,15 +336,27 @@ class DocsScraperTest < MiniTest::Spec
assert_includes options[:only], '/'
end
it "adds .initial_paths" do
assert_includes options[:only], '/initial'
end
it "doesn't modify the array in place" do
assert_equal ['/path'], Scraper.options[:only]
end
end
context "and :only_patterns is an array" do
it "assigns ['', '/'] to :only" do
before do
stub(Scraper).options { { only_patterns: [] } }
assert_equal ['', '/'], options[:only]
end
it "adds ['', '/'] to :only" do
assert_includes options[:only], ''
assert_includes options[:only], '/'
end
it "adds .initial_paths to :only" do
assert_includes options[:only], '/initial'
end
end
end
@ -348,9 +396,9 @@ class DocsScraperTest < MiniTest::Spec
end
context "and :only_patterns is an array" do
it "assigns [.root_path] to :only" do
it "adds .root_path to :only" do
stub(Scraper).options { { only_patterns: [] } }
assert_equal ['/root'], options[:only]
assert_includes options[:only], '/root'
end
end
end

Loading…
Cancel
Save