From ef4e59a0b0676391752c1e026faf35426f85148a Mon Sep 17 00:00:00 2001 From: Thibaut Date: Sun, 30 Nov 2014 11:02:25 -0500 Subject: [PATCH] Improve PHPUnit scraper --- assets/stylesheets/pages/_phpunit.scss | 25 ++++----- lib/docs/filters/phpunit/clean_html.rb | 76 ++++++++++++-------------- lib/docs/filters/phpunit/entries.rb | 25 +++++---- lib/docs/scrapers/phpunit.rb | 23 +++----- 4 files changed, 69 insertions(+), 80 deletions(-) diff --git a/assets/stylesheets/pages/_phpunit.scss b/assets/stylesheets/pages/_phpunit.scss index 2cfd1935..47130f61 100644 --- a/assets/stylesheets/pages/_phpunit.scss +++ b/assets/stylesheets/pages/_phpunit.scss @@ -1,21 +1,18 @@ ._phpunit { - h1 { - margin-top: 0; - @extend %lined-heading; - } + > h2 { @extend %block-heading; } + > h3 { @extend %block-label, %label-blue; } + > h4 { font-size: 1em; } - h2.title { - @extend %block-heading; - } + > p > code { @extend %label; } - .programlisting > pre { white-space: normal; } + .warning, .alert { + @extend %note; - .literal { - padding: 2px 4px; - color: #c7254e; - background-color: #f9f2f4; - border-radius: 4px; + > h3 { + margin: 0 0 .5em; + font-size: 1em; + } } - div.warning { @extend %note, %note-red; } + .alert-danger { @extend %note-red; } } diff --git a/lib/docs/filters/phpunit/clean_html.rb b/lib/docs/filters/phpunit/clean_html.rb index e07901e8..58fea821 100644 --- a/lib/docs/filters/phpunit/clean_html.rb +++ b/lib/docs/filters/phpunit/clean_html.rb @@ -7,58 +7,52 @@ module Docs end def root - doc.inner_html = <<-HTML -

PHPUnit is a programmer-oriented testing framework for PHP.
- It is an instance of the xUnit architecture for unit testing frameworks.

- HTML + doc.inner_html = ' ' end def other - # set root on appendix - @doc = doc.at_css('div.appendix') + @doc = doc.at_css('div.appendix, div.chapter') - # remove attributes 'style' - css('*').remove_attr('style') + css('.example-break', '.table-break').remove - # clean titles - css('div.titlepage').each do |node| - title = node.at_css('.title') - case title.name - when 'h1' - # remove 'Appendix X.' from top title - nodetitle = title.content - title.content = nodetitle.gsub(/Appendix \w+\. /, '') - when 'h2' - # set link anchors in entries (title level 2) - anchor = Nokogiri::XML::Node.new "a", @doc - anchor.content = title.content - anchor['id'] = title.content.downcase.gsub(/[^a-z]/, '') - title.content = '' - anchor.parent = title - end - node.replace title + css('a[id]').each do |node| + next unless node.content.blank? + node.parent['id'] = node['id'] + node.remove end - # set anchor for internal references - css('p.title').each do |node| - anchor = Nokogiri::XML::Node.new "a", @doc - anchor.content = node.content - anchor['id'] = anchor.content[/\w+ [A-z0-9.]+/].downcase.parameterize - node.content = '' - anchor.parent = node + css('.titlepage').each do |node| + title = node.at_css('h1, .title') + title.content = title.content.remove(/(Chapter|Appendix)\s+\w+\.\s+/) + node.before(title).remove end - # clean internal references - css('a').each do |node| - page = node['href'][/([A-z.-]+)?#/, 1] if node['href'] - if page then - page = page + '.html' unless page[/.*\.html/] - if Phpunit.initial_paths.include? page - node['href'] = node['href'].gsub(/#[A-z.-]+/, '#' + node.content.downcase.parameterize) - end - end + css('.section').each do |node| + node.before(node.children).remove end + css('[style], [border], [valign]').each do |node| + node.remove_attribute('style') + node.remove_attribute('border') + node.remove_attribute('valign') + end + + css('.warning h3', '.alert h3').each do |node| + node.remove if node.content == 'Note' + end + + css('p > code.literal:first-child:last-child').each do |node| + next if node.previous_sibling && node.previous_sibling.content.present? + next if node.next_sibling && node.next_sibling.content.present? + node.parent.name = 'pre' + node.parent.content = node.content + end + + css('pre', '.term').each do |node| + node.content = node.content + end + + doc end end end diff --git a/lib/docs/filters/phpunit/entries.rb b/lib/docs/filters/phpunit/entries.rb index a389b930..545c55b8 100644 --- a/lib/docs/filters/phpunit/entries.rb +++ b/lib/docs/filters/phpunit/entries.rb @@ -1,19 +1,24 @@ module Docs class Phpunit class EntriesFilter < Docs::EntriesFilter - def additional_entries - entries = [] + def get_name + at_css('h1').content + end + + def get_type + if name.in?(%w(Assertions Annotations)) + name + else + 'Guides' + end + end - if at_css('h1') - type = at_css('h1').content.gsub(/Appendix \w+\. /, '') + def additional_entries + return [] if type == 'Guides' - css('h2').each do |node| - name = node.content - id = name.parameterize - entries << [name, id, type] - end + css('h2').map do |node| + [node.content, node['id']] end - entries end end end diff --git a/lib/docs/scrapers/phpunit.rb b/lib/docs/scrapers/phpunit.rb index b846df07..4f1ddfb9 100644 --- a/lib/docs/scrapers/phpunit.rb +++ b/lib/docs/scrapers/phpunit.rb @@ -2,26 +2,19 @@ module Docs class Phpunit < UrlScraper self.name = 'PHPUnit' self.type = 'phpunit' - self.slug = 'phpunit' self.version = '4.3' - self.base_url = 'https://phpunit.de/manual/4.3/en/' - self.initial_paths = %w(appendixes.assertions.html appendixes.annotations.html) + self.base_url = "https://phpunit.de/manual/#{version}/en/" + self.root_path = 'index.html' - html_filters.push 'phpunit/entries', 'phpunit/clean_html', 'title' - - options[:skip_links] = true + html_filters.push 'phpunit/clean_html', 'phpunit/entries', 'title' + options[:root_title] = 'PHPUnit' options[:title] = false - options[:root_title] = "#{self.name} #{self.version}" - - options[:fix_urls] = ->(url) do - if self.initial_paths.include? url[/\/([A-z.-]+)#/, 1] - url = url[/#(.+)/, 1].downcase - url.gsub! /(\w+\.\w+)\.(\w+)/, '\1#\2' - end - url - end + options[:skip] = %w( + appendixes.index.html + appendixes.bibliography.html + appendixes.copyright.html) options[:attribution] = <<-HTML © 2005–2014 Sebastian Bergmann