ImmutableJS Scraper.

Core scraper changes:
*) Add regexp support to stub
*) Add support for fragments in internal_urls, normalize_paths => done externally.
pull/513/head
Yosi Attias 8 years ago
parent f0211029ef
commit b1ecc6c964

@ -51,12 +51,20 @@ module Docs
def initialize_stubs
self.class.stubs.each do |path, block|
Typhoeus.stub(url_for(path)).and_return do
stub_path = nil
case path
when String
stub_path = url_for(path)
when Regexp
stub_path = path
end
Typhoeus.stub(stub_path).and_return do |request|
Typhoeus::Response.new \
effective_url: url_for(path),
effective_url: request.url,
code: 200,
headers: { 'Content-Type' => 'text/html' },
body: self.instance_exec(&block)
body: self.instance_exec(request.url, &block)
end
end
end

@ -0,0 +1,38 @@
module Docs
  class Immutablejs
    # Tidies the scraped ImmutableJS markup: drops React bookkeeping
    # attributes, gives every member label an id, rewrites in-page links to
    # the "Type#member" form and turns code blocks into highlighted <pre>s.
    class CleanHtmlFilter < Filter
      # Hrefs that point outside the document (they carry a URI scheme such as
      # "https:" / "mailto:" or are protocol-relative). Rewriting their
      # slashes would corrupt them, e.g. "https://x/y" -> "https:##x#y".
      EXTERNAL_HREF = %r{\A(?:[a-z][a-z0-9+.\-]*:|//)}i

      # Runs all clean-up steps and returns the resulting document.
      def call
        # Skip the container "div"
        @doc = at_css('div')

        # Remove data-reactid attributes for cleaner html
        css('*[data-reactid]').each do |react_el|
          react_el.delete 'data-reactid'
        end

        # Add id to member label, so we can navigate among them
        css('h3.memberLabel').each do |member_label|
          member_label['id'] = member_label.content.strip.chomp('()')
        end

        css('a').each do |link|
          href = link['href']
          # Anchors without an href have nothing to rewrite (and would raise
          # on nil); external links must be left exactly as they are.
          next if href.nil? || href =~ EXTERNAL_HREF

          # Remove "/" (optionally preceded by "#") from the start, then
          # convert links - from Iterable/butLast to Iterable#butLast
          link['href'] = href.sub(%r{\A#?/}, '').split('/').join('#')
        end

        # Replace code blocks tag code with pre, and add stylings.
        css('code.codeBlock').each do |code_block|
          code_block.name = 'pre'
          code_block['data-language'] = 'javascript'
          code_block['class'] = 'language-javascript'
        end

        doc
      end
    end
  end
end

@ -0,0 +1,31 @@
module Docs
  class Immutablejs
    # Derives index entries from a scraped ImmutableJS page: the page's type
    # header supplies both the entry name and its type, and every member label
    # becomes an additional "Type#member" entry.
    class EntriesFilter < Docs::EntriesFilter
      # Returns the text of the page's type header, or nil when the page has
      # none.
      def name
        type_header_text
      end

      # Returns the text of the page's type header, or nil when the page has
      # none.
      def type
        # TODO: Is this ok? The index page has no type header, so it gets no
        # type of its own.
        type_header_text
      end

      # Returns one [name, fragment] pair per member label, or an empty array
      # when the current URL has no fragment.
      def additional_entries
        return [] if current_url.fragment.nil?

        # Look the type up once instead of once per member.
        entry_type = type

        css('h3.memberLabel').map do |member_label|
          # Strip before chomping so the fragment equals the id that
          # CleanHtmlFilter assigns (content.strip.chomp('()')), even when the
          # label text carries surrounding whitespace.
          label = member_label.content.strip
          ["#{entry_type}##{label}", label.chomp('()')]
        end
      end

      private

      # Text content of the "h1.typeHeader" element, or nil if it is absent.
      def type_header_text
        header = at_css('h1.typeHeader')
        header.content if header
      end
    end
  end
end

@ -0,0 +1,23 @@
module Docs
  class Immutablejs
    # Internal-URL filter that understands the "#/Type" fragment links used by
    # the ImmutableJS documentation.
    class InternalUrlsFilter < Docs::InternalUrlsFilter
      # Records every URL yielded by update_links in result[:internal_urls],
      # then removes duplicates.
      def update_and_follow_links
        urls = result[:internal_urls] = []
        update_links do |url|
          urls << url.to_s
        end
        urls.uniq!
      end

      # Resolves "#/Type" fragment links against the root URL before handing
      # them to the parent implementation. Fragment links with more than one
      # path segment (e.g. "#/Type/member") are rejected with nil.
      #
      # A nil +str+ (an anchor without an href) is passed to the parent
      # untouched instead of raising NoMethodError here.
      def to_internal_url(str)
        if str && str.start_with?('#/')
          # \A / \z anchor the whole string; ^ / $ would accept a matching
          # line anywhere inside a multi-line value.
          return nil unless str =~ %r{\A#/[^/]+\z}
          str = root_url.to_s + str
        end
        super(str)
      end
    end
  end
end

@ -0,0 +1,28 @@
module Docs
  class Immutablejs
    # Path normalisation that treats the URL fragment as the page path.
    class NormalizePathsFilter < Docs::NormalizePathsFilter
      #
      # Matches a url that starts with "#" or ".#",
      # which means it's a fragment url.
      #
      FRAGMENT_REGEX = /^(\.)?#/

      # When the current URL carries a fragment, that fragment (minus a
      # leading "/") is stored as the path before deferring to the parent.
      def path
        fragment = current_url.fragment
        @path = fragment.sub(/^\//, '') if fragment
        super
      end

      # Fragment hrefs have their "#" / ".#" prefix removed; every other href
      # is left to the parent implementation.
      def normalize_href(href)
        if href =~ FRAGMENT_REGEX
          href.gsub(FRAGMENT_REGEX, '')
        else
          super
        end
      end
    end
  end
end

@ -0,0 +1,38 @@
module Docs
  # Scraper definition for the ImmutableJS documentation site.
  class Immutablejs < UrlScraper
    self.name = 'ImmutableJS'
    self.type = 'immutablejs'
    self.release = '3.8.1'
    self.base_url = 'https://facebook.github.io/immutable-js/docs/'

    #
    # Swap the core html filters for our own versions, so that
    # fragment urls can be handled.
    #
    html_filters.replace 'internal_urls', 'immutablejs/internal_urls'
    html_filters.replace 'normalize_paths', 'immutablejs/normalize_paths'

    html_filters.push 'immutablejs/clean_html', 'immutablejs/entries'

    # The heredoc body is kept flush-left so the attribution text stays
    # exactly as it was.
    options[:attribution] = <<-HTML
This documentation is generated from <a href="https://github.com/facebook/immutable-js/blob/master/type-definitions/Immutable.d.ts">Immutable.d.ts</a>.
Pull requests and <a href="https://github.com/facebook/immutable-js/issues">Issues</a> welcome.
    HTML

    # Every request is stubbed: the page is loaded in a browser session and
    # the inner HTML of its ".docContents" element is used as the body.
    stub(/.*/) do |page_url|
      #
      # One capybara session is shared by all pages, since we scrape every
      # one of them; visiting 'about:blank' first resets the previous page.
      #
      browser = (@capybara ||= load_capybara_selenium)
      browser.visit 'about:blank'
      browser.visit page_url
      browser.execute_script 'return document.querySelector(".docContents").innerHTML'
    end
  end
end

Binary file not shown.

After

Width:  |  Height:  |  Size: 758 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 KiB

Loading…
Cancel
Save