From 073dbf1ab772ab5c57155a0932a63c8c4d886b58 Mon Sep 17 00:00:00 2001
From: Romeo Van Snick
Date: Wed, 21 May 2014 15:03:11 +0200
Subject: [PATCH] get the thing working for haskell

---
Reviewer notes (text in this section is ignored when the patch is applied):
  * Line breaks and indentation were rebuilt from a copy of this patch whose
    whitespace had been collapsed. The scrapers/haskell.rb hunk needs one
    blank context line to reach its 8 old lines; placing it before the
    closing "end" is a best guess. TODO confirm against blob 5ba2fb83.
  * The attribution heredoc looks like it lost HTML markup (entity, line
    break, link) in that same copy; its wording is left as found.
    TODO confirm against the original commit.
  * Both filters were revised in review, so the blob ids of the two new
    files were dropped and the line counts below were recomputed.

 lib/docs/filters/haskell/clean_html.rb | 34 +++++++++++++++++
 lib/docs/filters/haskell/entries.rb    | 44 ++++++++++++++++++++++
 lib/docs/scrapers/haskell.rb           | 15 +++++++
 3 files changed, 93 insertions(+)
 create mode 100644 lib/docs/filters/haskell/clean_html.rb
 create mode 100644 lib/docs/filters/haskell/entries.rb

diff --git a/lib/docs/filters/haskell/clean_html.rb b/lib/docs/filters/haskell/clean_html.rb
new file mode 100644
--- /dev/null
+++ b/lib/docs/filters/haskell/clean_html.rb
@@ -0,0 +1,34 @@
+module Docs
+  class Haskell
+    # Removes unwanted elements and turns classed nodes into semantic tags.
+    class CleanHtmlFilter < Filter
+      # Rewrites the current document in place and returns it.
+      def call
+        # remove unwanted elements
+        css('#footer', '#package-header', '#module-header', '#synopsis', '.link', '#table-of-contents', '.empty', '.package').remove
+
+        # turn captions into real headers
+        css('.caption').each do |node|
+          node.name = 'h2'
+        end
+
+        # captions inside tables rank one level lower; this pass has to
+        # run after the one above, which also renamed them to h2
+        css('table .caption').each do |node|
+          node.name = 'h3'
+        end
+
+        # turn source listings into pre
+        css('.src').each do |node|
+          node.name = 'pre'
+        end
+
+        # drop every h1 when the page carries the generic libraries heading
+        title = at_css('h1')
+        css('h1').remove if title && title.content == 'Haskell Hierarchical Libraries'
+
+        doc
+      end
+    end
+  end
+end
diff --git a/lib/docs/filters/haskell/entries.rb b/lib/docs/filters/haskell/entries.rb
new file mode 100644
--- /dev/null
+++ b/lib/docs/filters/haskell/entries.rb
@@ -0,0 +1,44 @@
+module Docs
+  class Haskell
+    # Derives a page's entry name and type from its module caption.
+    class EntriesFilter < Docs::EntriesFilter
+      # gets name and type in one fell swoop
+      #
+      # eg.
+      #   Control.Monad > [Monad, Control]
+      #   Control.Concurrent.MVar > [Concurrent.MVar, Control]
+      #   Array > [Array, nil]
+      #
+      # returns a [name, type] pair
+      def get_name_and_type
+        title = at_css('h1')
+        return ['Haskell', nil] if title && title.content == 'Haskell Hierarchical Libraries'
+
+        # find full module identifier
+        caption = at_css('#module-header .caption')
+
+        # no caption found -> placeholder name and type
+        return ['no-name', 'no-type'] unless caption
+
+        # split the module path
+        parts = caption.content.split('.')
+
+        # if only one part, this is the name and there is no type
+        return [parts.first, nil] if parts.length <= 1
+
+        # otherwise the first part is the type and the rest is the name
+        [parts.drop(1).join('.'), parts.first]
+      end
+
+      # get the name
+      def get_name
+        get_name_and_type.first
+      end
+
+      # get the type
+      def get_type
+        get_name_and_type.last
+      end
+    end
+  end
+end
diff --git a/lib/docs/scrapers/haskell.rb b/lib/docs/scrapers/haskell.rb
index 5ba2fb83..7692f8a6 100755
--- a/lib/docs/scrapers/haskell.rb
+++ b/lib/docs/scrapers/haskell.rb
@@ -2,8 +2,23 @@ module Docs
   class Haskell < UrlScraper
     self.name = 'Haskell'
     self.slug = 'haskell'
+    self.type = 'haskell'
     self.version = '7.8.2'
     self.base_url = 'http://www.haskell.org/ghc/docs/7.8.2/html/libraries'
+    self.initial_paths = ['/index.html']
+
+    html_filters.push 'haskell/entries'
+    html_filters.push 'haskell/clean_html'
+    html_filters.push 'title'
+
+
+    options[:container] = '#content'
+    options[:skip_patterns] = [/src/, /index/, /haskell2010/] # skip source listings and index files
+
+    options[:attribution] = <<-HTML
+      © The University Court of the University of Glasgow.
+      All rights reserved. See here for more info
+    HTML
 
   end
 end