Re-implement Angular.js scraper

Fixes #23. Thanks @TheRusskiy and @afram.
pull/62/head
Thibaut 11 years ago
parent d51a6f9042
commit faf6ecf549

@ -31,11 +31,16 @@ group :docs do
gem 'typhoeus'
gem 'nokogiri', '~> 1.6.0'
gem 'html-pipeline'
gem 'progress_bar'
gem 'unix_utils'
gem 'progress_bar', require: false
gem 'unix_utils', require: false
end
# Test-only dependencies; rr is declared with require: false.
group :test do
gem 'minitest'
gem 'rr', require: false
end
# Browser-automation gems are declared only when the SELENIUM environment
# variable is set to '1'.
if ENV['SELENIUM'] == '1'
gem 'capybara'
gem 'selenium-webdriver'
end

@ -75,7 +75,7 @@ app.templates.aboutPage = -> """
credits = [
[ 'Angular.js',
'2010-2013 Google, Inc.',
'2010-2014 Google, Inc.',
'CC BY',
'http://creativecommons.org/licenses/by/3.0/'
], [

@ -24,7 +24,10 @@ newsItem = (date, news) ->
result
app.news = [
[ 1390089600001, # January 19, 2013
[ 1390694400000, # January 26, 2013
""" Updated <a href="/angular/">Angular.js</a> documentation """,
], [
390089600001, # January 19, 2013
""" New <a href="/d3/">D3.js</a> and <a href="/knockout/">Knockout.js</a> documentations """,
], [
1390003200000, # January 18, 2013

@ -2,5 +2,6 @@
class app.views.AngularPage extends app.views.BasePage
afterRender: ->
@highlightCode @findAllByClass('prettyprint'), 'javascript'
for el in @findAllByTag('pre')
@highlightCode el, if el.textContent[0] is '<' then 'markup' else 'javascript'
return

@ -1,8 +1,39 @@
._angular {
> h2 { font-size: 1.125rem; }
h2 { @extend %block-heading; }
//
// Index
//
.nav-header.section {
margin: 1.5em 0 1em -2em;
list-style: none;
font-weight: bold;
text-transform: capitalize;
}
//
// Other
//
h3, h4 { font-size: 1rem; }
.methods {
.alert { @extend %note; }
.alert-success { @extend %note-green; }
.alert-error { @extend %note-red; }
p > code, li > code, td > code { @extend %label; }
.view-source, .improve-docs {
position: relative;
float: right;
line-height: 1.7rem;
padding-left: 1em;
font-size: .875rem;
background: white;
}
.defs {
padding-left: 1rem;
list-style: none;
@ -12,6 +43,12 @@
}
> li + li { margin-top: 2em; }
> li > ul { list-style-type: disc; }
h4 {
margin: 1em 0 .5em;
font-size: 1em;
}
ul { list-style-type: disc; }
}
}

@ -0,0 +1,86 @@
module Docs
  class Angular
    # HTML filter that tidies the markup of scraped Angular.js pages.
    #
    # Internal links are rewritten first, then either the root-page cleanup
    # (#root) or the regular-page cleanup (#other) is applied. Helpers such as
    # css, at_css, doc, slug and root_page? are expected to come from the
    # Filter base class (not shown here).
    class CleanHtmlFilter < Filter
      # Rewrites links, runs the page-specific cleanup and returns the document.
      def call
        # Fix internal links (remove colons)
        css('a[href]').each do |node|
          node['href'] = node['href'].gsub %r{(directive|filter):}, '\1-'
        end

        root_page? ? root : other
        doc
      end

      # Cleanup applied to the root (index) page.
      def root
        css('.pull-right', '.ng-hide').remove

        # Turn "module [...]" <li> into <h2>
        css('.nav-header.module').each do |node|
          node.name = 'h2'
          node.parent.before(node)
        end

        # Remove links to "Directive", "Filter", etc.
        css('a.guide').each do |node|
          node.replace(node.content)
        end
      end

      # Cleanup applied to every page other than the root page.
      def other
        css('#example', '.example', '#description_source', '#description_demo', '[id$="example"]').remove

        if at_css('h1').content.strip.empty?
          # Ensure proper <h1> (e.g. ngResource, AUTO, etc.)
          # Falls back to the slug when the page has no <h2> either.
          at_css('h2').tap do |node|
            at_css('h1').content = node.try(:content) || slug
            node.try(:remove)
          end
        else
          # Clean up .hint in <h1>
          # Relies on Node#before returning its receiver, so the trailing
          # #remove drops the <div> wrapping the hint.
          css('h1 > div > .hint').each do |node|
            node.parent.before("<small>(#{node.content.strip})</small>").remove
          end
        end

        at_css('h1').add_child(css('.view-source', '.improve-docs'))

        # Remove root-level <div>
        while div = at_css('h1 + div')
          div.before(div.children)
          div.remove
        end

        # Remove dead links (e.g. ngRepeat)
        css('a.type-hint').each do |node|
          node.name = 'code'
          node.remove_attribute 'href'
        end

        # Remove some <code> elements
        # The child nodes are moved rather than re-inserting node.content: a
        # String argument is parsed as markup, which would turn escaped HTML
        # inside code samples into real elements.
        css('h1 > code', 'pre > code', 'h6 > code').each do |node|
          node.before(node.children).remove
        end

        # Fix code indentation
        css('code', 'pre').each do |node|
          node.inner_html = node.inner_html.strip_heredoc.strip
        end

        # Make <pre> elements
        css('.in-javascript', '.in-html-template-binding').each do |node|
          node.name = 'pre'
          # Re-assigning the content replaces any child markup with plain text.
          node.content = node.content
        end

        css('ul.methods', 'ul.properties', 'ul.events').add_class('defs')

        # Remove ng-* attributes
        css('*').each do |node|
          node.attributes.each_key do |attribute|
            node.remove_attribute(attribute) if attribute.start_with? 'ng-'
          end
        end
      end
    end
  end
end

@ -0,0 +1,11 @@
module Docs
  class Angular
    # Text filter that rewrites docs.angularjs.org "partials" URLs remaining in
    # the HTML string: "/partials/api/" becomes "/api/" and the ".html"
    # extension is dropped from API page URLs.
    class CleanUrlsFilter < Filter
      # Mutates and returns the HTML string.
      def call
        html.gsub! 'angularjs.org/partials/api/', 'angularjs.org/api/'
        # The captured path is limited to characters that can appear inside a
        # URL so the match cannot run past the end of one link and strip
        # ".html" from unrelated text later on the same line. The dots of the
        # host name are escaped so they only match a literal ".".
        html.gsub! %r{angularjs\.org/api/([^\s"'<>]+?)\.html}, 'angularjs.org/api/\1'
        # gsub! returns nil when nothing changed, hence the explicit return.
        html
      end
    end
  end
end

@ -0,0 +1,40 @@
module Docs
  class Angular
    # Derives index entries (name, type and per-member entries) for a page.
    class EntriesFilter < Docs::EntriesFilter
      # Entry name: the part of the slug after the last colon, without a
      # leading "ng." and suffixed with the subtype for directives and filters.
      def get_name
        base = slug.split(':').last
        base.sub!(%r{\Ang\.}, '')
        return base unless subtype == 'directive' || subtype == 'filter'
        base << " (#{subtype})"
      end

      # Entry type: the first dot-separated segment of the slug; for the "ng"
      # module the pluralized subtype is appended when one is known.
      def get_type
        prefix = slug.split('.').first
        return prefix unless prefix == 'ng' && subtype
        prefix << " #{subtype}s"
      end

      # Text found before " in module" inside the parenthesized part of the
      # <h1>, or nil. Memoized, including a nil result.
      def subtype
        return @subtype if defined? @subtype
        heading = at_css('h1')
        match = heading && heading.content.match(%r{\((.+) in module})
        @subtype = match && match[1]
      end

      # One [name, id] pair per leading <h3> of each item in a "ul.defs" list,
      # prefixed with the first word of this page's entry name.
      def additional_entries
        css('ul.defs').flat_map do |list|
          list.css('> li > h3:first-child').map do |heading|
            label = heading.content.strip
            label.sub! %r{\(.+\)}, '()'
            label.prepend "#{self.name.split.first}."
            [label, heading['id']]
          end
        end
      end
    end
  end
end

@ -1,19 +1,60 @@
module Docs
  # Scraper for the Angular.js API documentation.
  #
  # Pages are fetched from the HTML partials under base_url. The root page is
  # not fetched over HTTP: the side navigation of
  # http://docs.angularjs.org/api/ is rendered in a browser through Capybara's
  # Selenium driver and served to the scraper as a stubbed Typhoeus response.
  class Angular < UrlScraper
    self.name = 'Angular.js'
    self.slug = 'angular'
    self.type = 'angular'
    self.version = '1.2.10'
    self.base_url = 'http://docs.angularjs.org/partials/api/'

    html_filters.insert_before 'normalize_paths', 'angular/clean_html'
    html_filters.push 'angular/entries', 'title'
    text_filters.push 'angular/clean_urls'

    options[:title] = false
    options[:root_title] = 'Angular.js'

    # Rewrites a URL in place so it points at the matching HTML partial
    # (or at the guide, which lives outside the partials directory).
    options[:fix_urls] = ->(url) do
      url.sub! '/partials/api/api/', '/partials/api/'
      url.sub! '/partials/api/guide/', '/guide/'
      # Append ".html" (before any #fragment) unless it is already present.
      url.sub! %r{/partials/api/(.+?)(?<!\.html)(?:\z|(#.*))}, '/partials/api/\1.html\2'
      # NOTE(review): the pattern below is a String, which gsub! matches
      # literally, so this line never changes a real URL. It is left as-is
      # because CleanHtmlFilter and EntriesFilter match on a literal ":" in
      # hrefs and slugs -- confirm the intent before turning it into a Regexp.
      url.gsub! '/partials/api/(.+?)\:', '/partials/api/\1%3A'
      url
    end

    options[:skip] = %w(ng.html)

    options[:attribution] = <<-HTML
&copy; 2010&ndash;2014 Google, Inc.<br>
Licensed under the Creative Commons Attribution License 3.0.
    HTML

    private

    # Stubs the root page first when the root URL itself is being requested.
    def request_one(url)
      stub_root_page if url == root_url.to_s
      super
    end

    # Stubs the root page before delegating; arguments and block are passed
    # on unchanged by the bare super.
    def request_all(urls, &block)
      stub_root_page
      super
    end

    # Registers a Typhoeus stub that answers requests for the root URL with
    # a 200 text/html response containing the browser-rendered body.
    def stub_root_page
      response = Typhoeus::Response.new(
        effective_url: root_url.to_s,
        code: 200,
        headers: { 'Content-Type' => 'text/html' },
        body: get_root_page_body)

      Typhoeus.stub(root_url.to_s).and_return(response)
    end

    # Returns the inner HTML of the ".side-navigation" element of the live
    # API index page, rendered with the Selenium driver.
    def get_root_page_body
      # Loaded lazily: capybara is only needed when this method runs.
      require 'capybara'
      Capybara.current_driver = :selenium
      Capybara.visit('http://docs.angularjs.org/api/')
      Capybara.find('.side-navigation')['innerHTML']
    end
  end
end

Loading…
Cancel
Save