Refactor Docs::Scraper

pull/29/head
Thibaut 11 years ago
parent b66d6d93d6
commit b92db88506

@ -45,14 +45,14 @@ module Docs
end
def build_pages
requested_urls = Set.new initial_urls.map(&:downcase)
history = Set.new initial_urls.map(&:downcase)
instrument 'running.scraper', urls: initial_urls
request_all initial_urls do |response|
next unless data = handle_response(response)
yield data
next unless data[:internal_urls].present?
next_urls = data[:internal_urls].select { |url| requested_urls.add?(url.downcase) }
next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
instrument 'queued.scraper', urls: next_urls
next_urls
end

@ -8,6 +8,8 @@ module Docs
end
end
alias_method :running, :queued
def ignore_response(event)
msg = "Ignore: #{format_url event.payload[:response].url}"
msg << " [#{event.payload[:response].code}]" if event.payload[:response].respond_to?(:code)

@ -21,10 +21,6 @@ class DocsScraperTest < MiniTest::Spec
Struct.new(:body, :url).new
end
let :processed_response do
Hash.new
end
describe ".inherited" do
let :subclass do
Class.new Scraper
@ -88,33 +84,40 @@ class DocsScraperTest < MiniTest::Spec
end
describe "#root_url" do
let :root_url do
scraper.root_url
end
context "when #root_path? is false" do
before do
stub(scraper).root_path? { false }
end
it "returns a Docs::URL" do
assert_instance_of Docs::URL, scraper.root_url
it "returns a memoized Docs::URL" do
assert_instance_of Docs::URL, root_url
assert_same root_url, scraper.root_url
end
it "returns the normalized base url" do
it "returns the normalized .base_url" do
stub(Scraper).base_url { 'http://example.com' }
assert_equal 'http://example.com/', scraper.root_url.to_s
assert_equal 'http://example.com/', root_url.to_s
end
end
context "when .root_path isn't blank" do
context "when #root_path? is true" do
before do
stub(scraper).root_path? { true }
end
it "returns a Docs::URL" do
assert_instance_of Docs::URL, scraper.root_url
it "returns a memoized Docs::URL" do
assert_instance_of Docs::URL, root_url
assert_same root_url, scraper.root_url
end
it "returns base url + root path" do
it "returns .base_url + .root_path" do
stub(Scraper).base_url { 'http://example.com/path/' }
assert_equal 'http://example.com/path/root', scraper.root_url.to_s
stub(Scraper).root_path { '/root' }
assert_equal 'http://example.com/path/root', root_url.to_s
end
end
end
@ -201,6 +204,10 @@ class DocsScraperTest < MiniTest::Spec
Proc.new {}
end
let :processed_response do
Hash.new
end
it "requests the #initial_urls" do
mock(scraper).request_all(scraper.initial_urls)
scraper.build_pages(&block)
@ -224,11 +231,10 @@ class DocsScraperTest < MiniTest::Spec
it "yields the processed response" do
scraper.build_pages { |arg| @arg = arg }
assert @arg
assert_equal processed_response, @arg
assert_same processed_response, @arg
end
context "when response[:internal_urls] is empty" do
context "when :internal_urls is empty" do
before do
processed_response[:internal_urls] = []
end
@ -244,22 +250,21 @@ class DocsScraperTest < MiniTest::Spec
end
end
context "when response[:internal_urls] isn't empty" do
let :urls do
context "when :internal_urls isn't empty" do
let :internal_urls do
['Url']
end
before do
processed_response[:internal_urls] = urls
processed_response[:internal_urls] = internal_urls
end
it "requests the urls" do
scraper.build_pages(&block)
assert_equal urls, @next_urls
assert_equal internal_urls, @next_urls
end
it "doesn't request the same url twice irrespective of case" do
stub(Scraper).root_path { 'PATH' }
processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
scraper.build_pages(&block)
assert_empty @next_urls
@ -269,7 +274,7 @@ class DocsScraperTest < MiniTest::Spec
scraper.build_pages(&block)
assert scraper.last_instrumentation
assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
assert_equal urls, scraper.last_instrumentation[:payload][:urls]
assert_equal internal_urls, scraper.last_instrumentation[:payload][:urls]
end
end
end
@ -286,78 +291,88 @@ class DocsScraperTest < MiniTest::Spec
describe "#options" do
let :options do
Hash.new
end
let :result do
scraper.options
end
before do
stub(Scraper).options { options }
end
it "returns a frozen, memoized Hash" do
assert_instance_of Hash, options
assert options.frozen?
assert_same options, scraper.options
assert_instance_of Hash, result
assert result.frozen?
assert_same result, scraper.options
end
it "includes .options" do
stub(Scraper).options { { test: true } }
assert options[:test]
options[:test] = true
assert result[:test]
end
it "includes #base_url" do
assert_equal scraper.base_url, options[:base_url]
assert_equal scraper.base_url, result[:base_url]
end
it "includes #root_url" do
assert_equal scraper.root_url, options[:root_url]
end
it "includes .root_path" do
assert_equal '/root', options[:root_path]
assert_equal scraper.root_url, result[:root_url]
end
it "includes .initial_paths" do
assert_equal ['/initial'], options[:initial_paths]
it "includes #root_path" do
assert_equal '/root', result[:root_path]
end
context "when #root_path? is false" do
before do
stub(scraper).root_path? { false }
it "includes #initial_paths" do
assert_equal ['/initial'], result[:initial_paths]
end
it "doesn't modify :skip" do
assert_nil options[:skip]
it "adds #initial_paths to :only when it is an array" do
options[:only] = ['/path']
assert_includes result[:only], options[:only].first
assert_includes result[:only], '/initial'
end
context "and :only is an array" do
before do
stub(Scraper).options { { only: ['/path'] } }
it "adds #initial_paths to :only when :only_patterns is an array" do
options[:only_patterns] = []
assert_includes result[:only], '/initial'
end
it "adds ['', '/']" do
assert_includes options[:only], '/path'
assert_includes options[:only], ''
assert_includes options[:only], '/'
it "doesn't modify :only in place" do
options[:only] = []
result
assert_empty options[:only]
end
it "adds .initial_paths" do
assert_includes options[:only], '/initial'
context "when #root_path? is false" do
before do
stub(scraper).root_path? { false }
end
it "doesn't modify the array in place" do
assert_equal ['/path'], Scraper.options[:only]
end
it "doesn't modify :skip" do
options[:skip] = []
assert_equal options[:skip], result[:skip]
end
context "and :only_patterns is an array" do
before do
stub(Scraper).options { { only_patterns: [] } }
it "adds '' and '/' to :only when it is an array" do
options[:only] = ['/path']
assert_includes result[:only], options[:only].first
assert_includes result[:only], ''
assert_includes result[:only], '/'
end
it "adds ['', '/'] to :only" do
assert_includes options[:only], ''
assert_includes options[:only], '/'
it "adds '' and '/' to :only when :only_patterns is an array" do
options[:only_patterns] = []
assert_includes result[:only], ''
assert_includes result[:only], '/'
end
it "adds .initial_paths to :only" do
assert_includes options[:only], '/initial'
end
it "doesn't modify :only in place" do
options[:only] = []
result
assert_empty options[:only]
end
end
@ -366,40 +381,33 @@ class DocsScraperTest < MiniTest::Spec
stub(scraper).root_path? { true }
end
context "and :skip is nil" do
it "assigns it ['', '/']" do
assert_equal ['', '/'], options[:skip]
end
end
context "and :skip is an array" do
before do
stub(Scraper).options { { skip: ['/path'] } }
it "adds '' and '/' to :skip when it is nil" do
assert_includes result[:skip], ''
assert_includes result[:skip], '/'
end
it "adds ['', '/']" do
assert_includes options[:skip], '/path'
assert_includes options[:skip], ''
assert_includes options[:skip], '/'
it "adds '' and '/' to :skip when it is an array" do
options[:skip] = ['/path']
assert_includes result[:skip], options[:skip].first
assert_includes result[:skip], ''
assert_includes result[:skip], '/'
end
it "doesn't modify the array in place" do
assert_equal ['/path'], Scraper.options[:skip]
end
it "doesn't modify :skip in place" do
options[:skip] = []
result
assert_empty options[:skip]
end
context "and :only is an array" do
it "adds .root_path" do
stub(Scraper).options { { only: [] } }
assert_includes options[:only], '/root'
end
it "adds #root_path to :only when it is an array" do
options[:only] = ['/path']
assert_includes result[:only], options[:only].first
assert_includes result[:only], '/root'
end
context "and :only_patterns is an array" do
it "adds .root_path to :only" do
stub(Scraper).options { { only_patterns: [] } }
assert_includes options[:only], '/root'
end
it "adds #root_path to :only when :only_patterns is an array" do
options[:only_patterns] = []
assert_includes result[:only], '/root'
end
end
end
@ -484,18 +492,22 @@ class DocsScraperTest < MiniTest::Spec
end
describe "#pipeline" do
it "returns an HTML::Pipeline with .filters" do
stub(Scraper).filters { [1] }
assert_instance_of ::HTML::Pipeline, scraper.pipeline
assert_equal Scraper.filters, scraper.pipeline.filters
let :pipeline do
scraper.pipeline
end
it "is memoized" do
assert_same scraper.pipeline, scraper.pipeline
it "returns a memoized HTML::Pipeline" do
assert_instance_of ::HTML::Pipeline, pipeline
assert_same pipeline, scraper.pipeline
end
it "returns a pipeline with the filters stored in .filters" do
stub(Scraper).filters { [1] }
assert_equal Scraper.filters, pipeline.filters
end
it "assigns Docs as the pipeline's instrumentation service" do
assert_equal Docs, scraper.pipeline.instrumentation_service
it "returns a pipeline with Docs as instrumentation service" do
assert_equal Docs, pipeline.instrumentation_service
end
end
end

Loading…
Cancel
Save