Improve MDN scrapers

Closes #488.
Closes #572.
pull/570/merge
Thibaut Courouble 8 years ago
parent 1d3abd0c6c
commit 476c69e419

@ -126,7 +126,7 @@ module Docs
(options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/']) (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
end end
options.merge!(additional_options) if respond_to?(:additional_options, true) options.merge!(additional_options)
options.freeze options.freeze
end end
end end
@ -197,18 +197,31 @@ module Docs
@pipeline = nil @pipeline = nil
end end
def additional_options
{}
end
module FixInternalUrlsBehavior module FixInternalUrlsBehavior
def self.included(base) def self.included(base)
base.extend ClassMethods base.extend ClassMethods
end end
def self.prepended(base)
class << base
prepend ClassMethods
end
end
module ClassMethods module ClassMethods
attr_reader :internal_urls def internal_urls
@internal_urls
end
def store_pages(store) def store_pages(store)
instrument 'info.doc', msg: 'Building internal urls...' instrument 'info.doc', msg: 'Building internal urls...'
with_internal_urls do with_internal_urls do
instrument 'info.doc', msg: 'Building pages...' puts @internal_urls
instrument 'info.doc', msg: 'Continuing...'
super super
end end
end end
@ -226,7 +239,7 @@ module Docs
def fetch_internal_urls def fetch_internal_urls
result = [] result = []
build_pages do |page| build_pages do |page|
result << base_url.subpath_to(page[:response_url]) if page[:entries].present? result << page[:subpath] if page[:entries].present?
end end
result result
end end
@ -240,16 +253,15 @@ module Docs
def additional_options def additional_options
if self.class.internal_urls if self.class.internal_urls
{ super.merge! \
only: self.class.internal_urls.to_set, only: self.class.internal_urls.to_set,
only_patterns: nil, only_patterns: nil,
skip: nil, skip: nil,
skip_patterns: nil, skip_patterns: nil,
skip_links: nil, skip_links: nil,
fixed_internal_urls: true fixed_internal_urls: true
}
else else
{} super
end end
end end

@ -106,13 +106,21 @@ module Docs
base.extend ClassMethods base.extend ClassMethods
end end
def self.prepended(base)
class << base
prepend ClassMethods
end
end
module ClassMethods module ClassMethods
attr_reader :redirections def redirections
@redirections
end
def store_pages(store) def store_pages(store)
instrument 'info.doc', msg: 'Fetching redirections...' instrument 'info.doc', msg: 'Fetching redirections...'
with_redirections do with_redirections do
instrument 'info.doc', msg: 'Building pages...' instrument 'info.doc', msg: 'Continuing...'
super super
end end
end end
@ -145,7 +153,7 @@ module Docs
end end
def additional_options def additional_options
{ redirections: self.class.redirections } super.merge! redirections: self.class.redirections
end end
end end
end end

@ -3,9 +3,12 @@
module Docs module Docs
class InternalUrlsFilter < Filter class InternalUrlsFilter < Filter
def call def call
result[:subpath] = subpath
unless skip_links? unless skip_links?
follow_links? ? update_and_follow_links : update_links follow_links? ? update_and_follow_links : update_links
end end
doc doc
end end

@ -1,6 +1,7 @@
module Docs module Docs
class Dom < Mdn class Dom < Mdn
include FixRedirectionsBehavior prepend FixInternalUrlsBehavior
prepend FixRedirectionsBehavior
self.name = 'DOM' self.name = 'DOM'
self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/API' self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/API'

@ -1,6 +1,7 @@
module Docs module Docs
class Javascript < Mdn class Javascript < Mdn
include FixRedirectionsBehavior prepend FixInternalUrlsBehavior
prepend FixRedirectionsBehavior
self.name = 'JavaScript' self.name = 'JavaScript'
self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference' self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference'

@ -1,6 +1,7 @@
module Docs module Docs
class Svg < Mdn class Svg < Mdn
include FixRedirectionsBehavior prepend FixInternalUrlsBehavior
prepend FixRedirectionsBehavior
self.name = 'SVG' self.name = 'SVG'
self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/SVG' self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/SVG'

Loading…
Cancel
Save