Refactor Docs::Scraper

pull/29/head
Thibaut 11 years ago
parent b66d6d93d6
commit b92db88506

@ -45,14 +45,14 @@ module Docs
end end
def build_pages def build_pages
requested_urls = Set.new initial_urls.map(&:downcase) history = Set.new initial_urls.map(&:downcase)
instrument 'running.scraper', urls: initial_urls instrument 'running.scraper', urls: initial_urls
request_all initial_urls do |response| request_all initial_urls do |response|
next unless data = handle_response(response) next unless data = handle_response(response)
yield data yield data
next unless data[:internal_urls].present? next unless data[:internal_urls].present?
next_urls = data[:internal_urls].select { |url| requested_urls.add?(url.downcase) } next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
instrument 'queued.scraper', urls: next_urls instrument 'queued.scraper', urls: next_urls
next_urls next_urls
end end

@ -8,6 +8,8 @@ module Docs
end end
end end
alias_method :running, :queued
def ignore_response(event) def ignore_response(event)
msg = "Ignore: #{format_url event.payload[:response].url}" msg = "Ignore: #{format_url event.payload[:response].url}"
msg << " [#{event.payload[:response].code}]" if event.payload[:response].respond_to?(:code) msg << " [#{event.payload[:response].code}]" if event.payload[:response].respond_to?(:code)

@ -21,10 +21,6 @@ class DocsScraperTest < MiniTest::Spec
Struct.new(:body, :url).new Struct.new(:body, :url).new
end end
let :processed_response do
Hash.new
end
describe ".inherited" do describe ".inherited" do
let :subclass do let :subclass do
Class.new Scraper Class.new Scraper
@ -88,33 +84,40 @@ class DocsScraperTest < MiniTest::Spec
end end
describe "#root_url" do describe "#root_url" do
let :root_url do
scraper.root_url
end
context "when #root_path? is false" do context "when #root_path? is false" do
before do before do
stub(scraper).root_path? { false } stub(scraper).root_path? { false }
end end
it "returns a Docs::URL" do it "returns a memoized Docs::URL" do
assert_instance_of Docs::URL, scraper.root_url assert_instance_of Docs::URL, root_url
assert_same root_url, scraper.root_url
end end
it "returns the normalized base url" do it "returns the normalized .base_url" do
stub(Scraper).base_url { 'http://example.com' } stub(Scraper).base_url { 'http://example.com' }
assert_equal 'http://example.com/', scraper.root_url.to_s assert_equal 'http://example.com/', root_url.to_s
end end
end end
context "when .root_path isn't blank" do context "when #root_path? is true" do
before do before do
stub(scraper).root_path? { true } stub(scraper).root_path? { true }
end end
it "returns a Docs::URL" do it "returns a memoized Docs::URL" do
assert_instance_of Docs::URL, scraper.root_url assert_instance_of Docs::URL, root_url
assert_same root_url, scraper.root_url
end end
it "returns base url + root path" do it "returns .base_url + .root_path" do
stub(Scraper).base_url { 'http://example.com/path/' } stub(Scraper).base_url { 'http://example.com/path/' }
assert_equal 'http://example.com/path/root', scraper.root_url.to_s stub(Scraper).root_path { '/root' }
assert_equal 'http://example.com/path/root', root_url.to_s
end end
end end
end end
@ -201,6 +204,10 @@ class DocsScraperTest < MiniTest::Spec
Proc.new {} Proc.new {}
end end
let :processed_response do
Hash.new
end
it "requests the #initial_urls" do it "requests the #initial_urls" do
mock(scraper).request_all(scraper.initial_urls) mock(scraper).request_all(scraper.initial_urls)
scraper.build_pages(&block) scraper.build_pages(&block)
@ -224,11 +231,10 @@ class DocsScraperTest < MiniTest::Spec
it "yields the processed response" do it "yields the processed response" do
scraper.build_pages { |arg| @arg = arg } scraper.build_pages { |arg| @arg = arg }
assert @arg assert_same processed_response, @arg
assert_equal processed_response, @arg
end end
context "when response[:internal_urls] is empty" do context "when :internal_urls is empty" do
before do before do
processed_response[:internal_urls] = [] processed_response[:internal_urls] = []
end end
@ -244,22 +250,21 @@ class DocsScraperTest < MiniTest::Spec
end end
end end
context "when response[:internal_urls] isn't empty" do context "when :internal_urls isn't empty" do
let :urls do let :internal_urls do
['Url'] ['Url']
end end
before do before do
processed_response[:internal_urls] = urls processed_response[:internal_urls] = internal_urls
end end
it "requests the urls" do it "requests the urls" do
scraper.build_pages(&block) scraper.build_pages(&block)
assert_equal urls, @next_urls assert_equal internal_urls, @next_urls
end end
it "doesn't request the same url twice irrespective of case" do it "doesn't request the same url twice irrespective of case" do
stub(Scraper).root_path { 'PATH' }
processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase) processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
scraper.build_pages(&block) scraper.build_pages(&block)
assert_empty @next_urls assert_empty @next_urls
@ -269,7 +274,7 @@ class DocsScraperTest < MiniTest::Spec
scraper.build_pages(&block) scraper.build_pages(&block)
assert scraper.last_instrumentation assert scraper.last_instrumentation
assert_equal 'queued.scraper', scraper.last_instrumentation[:event] assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
assert_equal urls, scraper.last_instrumentation[:payload][:urls] assert_equal internal_urls, scraper.last_instrumentation[:payload][:urls]
end end
end end
end end
@ -286,34 +291,59 @@ class DocsScraperTest < MiniTest::Spec
describe "#options" do describe "#options" do
let :options do let :options do
Hash.new
end
let :result do
scraper.options scraper.options
end end
before do
stub(Scraper).options { options }
end
it "returns a frozen, memoized Hash" do it "returns a frozen, memoized Hash" do
assert_instance_of Hash, options assert_instance_of Hash, result
assert options.frozen? assert result.frozen?
assert_same options, scraper.options assert_same result, scraper.options
end end
it "includes .options" do it "includes .options" do
stub(Scraper).options { { test: true } } options[:test] = true
assert options[:test] assert result[:test]
end end
it "includes #base_url" do it "includes #base_url" do
assert_equal scraper.base_url, options[:base_url] assert_equal scraper.base_url, result[:base_url]
end end
it "includes #root_url" do it "includes #root_url" do
assert_equal scraper.root_url, options[:root_url] assert_equal scraper.root_url, result[:root_url]
end
it "includes #root_path" do
assert_equal '/root', result[:root_path]
end
it "includes #initial_paths" do
assert_equal ['/initial'], result[:initial_paths]
end
it "adds #initial_paths to :only when it is an array" do
options[:only] = ['/path']
assert_includes result[:only], options[:only].first
assert_includes result[:only], '/initial'
end end
it "includes .root_path" do it "adds #initial_paths to :only when :only_patterns is an array" do
assert_equal '/root', options[:root_path] options[:only_patterns] = []
assert_includes result[:only], '/initial'
end end
it "includes .initial_paths" do it "doesn't modify :only in place" do
assert_equal ['/initial'], options[:initial_paths] options[:only] = []
result
assert_empty options[:only]
end end
context "when #root_path? is false" do context "when #root_path? is false" do
@ -322,42 +352,27 @@ class DocsScraperTest < MiniTest::Spec
end end
it "doesn't modify :skip" do it "doesn't modify :skip" do
assert_nil options[:skip] options[:skip] = []
assert_equal options[:skip], result[:skip]
end end
context "and :only is an array" do it "adds '' and '/' to :only when it is an array" do
before do options[:only] = ['/path']
stub(Scraper).options { { only: ['/path'] } } assert_includes result[:only], options[:only].first
end assert_includes result[:only], ''
assert_includes result[:only], '/'
it "adds ['', '/']" do
assert_includes options[:only], '/path'
assert_includes options[:only], ''
assert_includes options[:only], '/'
end
it "adds .initial_paths" do
assert_includes options[:only], '/initial'
end
it "doesn't modify the array in place" do
assert_equal ['/path'], Scraper.options[:only]
end
end end
context "and :only_patterns is an array" do it "adds '' and '/' to :only when :only_patterns is an array" do
before do options[:only_patterns] = []
stub(Scraper).options { { only_patterns: [] } } assert_includes result[:only], ''
end assert_includes result[:only], '/'
end
it "adds ['', '/'] to :only" do
assert_includes options[:only], ''
assert_includes options[:only], '/'
end
it "adds .initial_paths to :only" do it "doesn't modify :only in place" do
assert_includes options[:only], '/initial' options[:only] = []
end result
assert_empty options[:only]
end end
end end
@ -366,40 +381,33 @@ class DocsScraperTest < MiniTest::Spec
stub(scraper).root_path? { true } stub(scraper).root_path? { true }
end end
context "and :skip is nil" do it "adds '' and '/' to :skip when it is nil" do
it "assigns it ['', '/']" do assert_includes result[:skip], ''
assert_equal ['', '/'], options[:skip] assert_includes result[:skip], '/'
end
end end
context "and :skip is an array" do it "adds '' and '/' to :skip when it is an array" do
before do options[:skip] = ['/path']
stub(Scraper).options { { skip: ['/path'] } } assert_includes result[:skip], options[:skip].first
end assert_includes result[:skip], ''
assert_includes result[:skip], '/'
it "adds ['', '/']" do end
assert_includes options[:skip], '/path'
assert_includes options[:skip], ''
assert_includes options[:skip], '/'
end
it "doesn't modify the array in place" do it "doesn't modify :skip in place" do
assert_equal ['/path'], Scraper.options[:skip] options[:skip] = []
end result
assert_empty options[:skip]
end end
context "and :only is an array" do it "adds #root_path to :only when it is an array" do
it "adds .root_path" do options[:only] = ['/path']
stub(Scraper).options { { only: [] } } assert_includes result[:only], options[:only].first
assert_includes options[:only], '/root' assert_includes result[:only], '/root'
end
end end
context "and :only_patterns is an array" do it "adds #root_path to :only when :only_patterns is an array" do
it "adds .root_path to :only" do options[:only_patterns] = []
stub(Scraper).options { { only_patterns: [] } } assert_includes result[:only], '/root'
assert_includes options[:only], '/root'
end
end end
end end
end end
@ -484,18 +492,22 @@ class DocsScraperTest < MiniTest::Spec
end end
describe "#pipeline" do describe "#pipeline" do
it "returns an HTML::Pipeline with .filters" do let :pipeline do
stub(Scraper).filters { [1] } scraper.pipeline
assert_instance_of ::HTML::Pipeline, scraper.pipeline
assert_equal Scraper.filters, scraper.pipeline.filters
end end
it "is memoized" do it "returns a memoized HTML::Pipeline" do
assert_same scraper.pipeline, scraper.pipeline assert_instance_of ::HTML::Pipeline, pipeline
assert_same pipeline, scraper.pipeline
end
it "returns a pipeline with the filters stored in .filters" do
stub(Scraper).filters { [1] }
assert_equal Scraper.filters, pipeline.filters
end end
it "assigns Docs as the pipeline's instrumentation service" do it "returns a pipeline with Docs as instrumentation service" do
assert_equal Docs, scraper.pipeline.instrumentation_service assert_equal Docs, pipeline.instrumentation_service
end end
end end
end end

Loading…
Cancel
Save