Improve PHPUnit scraper

pull/142/head
Thibaut 10 years ago
parent bc901cc6c8
commit ef4e59a0b0

@ -1,21 +1,18 @@
._phpunit {
h1 {
margin-top: 0;
@extend %lined-heading;
}
> h2 { @extend %block-heading; }
> h3 { @extend %block-label, %label-blue; }
> h4 { font-size: 1em; }
h2.title {
@extend %block-heading;
}
> p > code { @extend %label; }
.programlisting > pre { white-space: normal; }
.warning, .alert {
@extend %note;
.literal {
padding: 2px 4px;
color: #c7254e;
background-color: #f9f2f4;
border-radius: 4px;
> h3 {
margin: 0 0 .5em;
font-size: 1em;
}
}
div.warning { @extend %note, %note-red; }
.alert-danger { @extend %note-red; }
}

@ -7,58 +7,52 @@ module Docs
end
# Assigns new inner HTML to `doc` for the root page.
# NOTE(review): this text is a diff rendering without +/- markers (see the
# `@ -7,58 +7,52 @@` hunk header just above), so the heredoc assignment and the
# `' '` assignment are presumably the removed and the added version of one
# statement rather than two live statements — confirm against the commit.
def root
doc.inner_html = <<-HTML
<p>PHPUnit is a programmer-oriented testing framework for PHP.<br>
It is an instance of the xUnit architecture for unit testing frameworks.</p>
HTML
doc.inner_html = ' '
end
def other
# set root on appendix
@doc = doc.at_css('div.appendix')
@doc = doc.at_css('div.appendix, div.chapter')
# remove attributes 'style'
css('*').remove_attr('style')
css('.example-break', '.table-break').remove
# clean titles
css('div.titlepage').each do |node|
title = node.at_css('.title')
case title.name
when 'h1'
# remove 'Appendix X.' from top title
nodetitle = title.content
title.content = nodetitle.gsub(/Appendix \w+\. /, '')
when 'h2'
# set link anchors in entries (title level 2)
anchor = Nokogiri::XML::Node.new "a", @doc
anchor.content = title.content
anchor['id'] = title.content.downcase.gsub(/[^a-z]/, '')
title.content = ''
anchor.parent = title
end
node.replace title
css('a[id]').each do |node|
next unless node.content.blank?
node.parent['id'] = node['id']
node.remove
end
# set anchor for internal references
css('p.title').each do |node|
anchor = Nokogiri::XML::Node.new "a", @doc
anchor.content = node.content
anchor['id'] = anchor.content[/\w+ [A-z0-9.]+/].downcase.parameterize
node.content = ''
anchor.parent = node
css('.titlepage').each do |node|
title = node.at_css('h1, .title')
title.content = title.content.remove(/(Chapter|Appendix)\s+\w+\.\s+/)
node.before(title).remove
end
# clean internal references
css('a').each do |node|
page = node['href'][/([A-z.-]+)?#/, 1] if node['href']
if page then
page = page + '.html' unless page[/.*\.html/]
if Phpunit.initial_paths.include? page
node['href'] = node['href'].gsub(/#[A-z.-]+/, '#' + node.content.downcase.parameterize)
end
end
css('.section').each do |node|
node.before(node.children).remove
end
css('[style], [border], [valign]').each do |node|
node.remove_attribute('style')
node.remove_attribute('border')
node.remove_attribute('valign')
end
css('.warning h3', '.alert h3').each do |node|
node.remove if node.content == 'Note'
end
css('p > code.literal:first-child:last-child').each do |node|
next if node.previous_sibling && node.previous_sibling.content.present?
next if node.next_sibling && node.next_sibling.content.present?
node.parent.name = 'pre'
node.parent.content = node.content
end
css('pre', '.term').each do |node|
node.content = node.content
end
doc
end
end
end

@ -1,19 +1,24 @@
module Docs
class Phpunit
class EntriesFilter < Docs::EntriesFilter
def additional_entries
entries = []
# Entry name for the page: the text content of its `h1` element.
def get_name
heading = at_css('h1')
heading.content
end
# Entry type: pages named "Assertions" or "Annotations" use their own name as
# the type; every other page is filed under 'Guides'.
def get_type
name.in?(%w(Assertions Annotations)) ? name : 'Guides'
end
if at_css('h1')
type = at_css('h1').content.gsub(/Appendix \w+\. /, '')
def additional_entries
return [] if type == 'Guides'
css('h2').each do |node|
name = node.content
id = name.parameterize
entries << [name, id, type]
end
css('h2').map do |node|
[node.content, node['id']]
end
entries
end
end
end

@ -2,26 +2,19 @@ module Docs
class Phpunit < UrlScraper
self.name = 'PHPUnit'
self.type = 'phpunit'
self.slug = 'phpunit'
self.version = '4.3'
self.base_url = 'https://phpunit.de/manual/4.3/en/'
self.initial_paths = %w(appendixes.assertions.html appendixes.annotations.html)
self.base_url = "https://phpunit.de/manual/#{version}/en/"
self.root_path = 'index.html'
html_filters.push 'phpunit/entries', 'phpunit/clean_html', 'title'
options[:skip_links] = true
html_filters.push 'phpunit/clean_html', 'phpunit/entries', 'title'
options[:root_title] = 'PHPUnit'
options[:title] = false
options[:root_title] = "#{self.name} #{self.version}"
options[:fix_urls] = ->(url) do
if self.initial_paths.include? url[/\/([A-z.-]+)#/, 1]
url = url[/#(.+)/, 1].downcase
url.gsub! /(\w+\.\w+)\.(\w+)/, '\1#\2'
end
url
end
options[:skip] = %w(
appendixes.index.html
appendixes.bibliography.html
appendixes.copyright.html)
options[:attribution] = <<-HTML
&copy; 2005&ndash;2014 Sebastian Bergmann<br>

Loading…
Cancel
Save