Update Haskell scraper

pull/110/head
Thibaut 11 years ago
parent 9a43dfbdb4
commit 1819d71ff7

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

After

Width:  |  Height:  |  Size: 49 KiB

@ -130,6 +130,11 @@ credits = [
'2014 Grunt Team', '2014 Grunt Team',
'MIT', 'MIT',
'https://raw.githubusercontent.com/gruntjs/gruntjs.com/master/LICENSE' 'https://raw.githubusercontent.com/gruntjs/gruntjs.com/master/LICENSE'
], [
'Haskell',
'The University of Glasgow',
'BSD',
'http://www.haskell.org/ghc/license'
], [ ], [
'HTTP', 'HTTP',
'1999 The Internet Society', '1999 The Internet Society',

@ -24,7 +24,10 @@ newsItem = (date, news) ->
result result
app.news = [ app.news = [
[ 1400976000000, # May 25, 2014 [ 1402704000000, # June 14, 2014
""" New <a href="/haskell/">Haskell</a> documentation """,
], [
1400976000000, # May 25, 2014
""" New <a href="/laravel/">Laravel</a> documentation """, """ New <a href="/laravel/">Laravel</a> documentation """,
], [ ], [
1399161600000, # May 4, 2014 1399161600000, # May 4, 2014

@ -35,6 +35,7 @@
'pages/ember', 'pages/ember',
'pages/express', 'pages/express',
'pages/go', 'pages/go',
'pages/haskell',
'pages/jquery', 'pages/jquery',
'pages/knockout', 'pages/knockout',
'pages/git', 'pages/git',

@ -57,3 +57,4 @@
._icon-grunt:before { background-position: -3rem -8rem; } ._icon-grunt:before { background-position: -3rem -8rem; }
._icon-maxcdn:before { background-position: -4rem -8rem; } ._icon-maxcdn:before { background-position: -4rem -8rem; }
._icon-laravel:before { background-position: 0 -9rem; } ._icon-laravel:before { background-position: 0 -9rem; }
._icon-haskell:before { background-position: -1rem -9rem; }

@ -1,121 +1,25 @@
._icon-haskell:before { ._haskell {
background-image: image-url('/icons/docs/haskell/16.png'); > h2 { @extend %block-heading; }
background-size: cover; > h3 { @extend %block-label; }
background-repeat: no-repeat; h4 { font-size: 1em; }
}
.empty-table .empty {
display: none;
}
.arguments td.src {
background: #faf9e2;
width: 30%;
}
th.src,
td.src {
font-family: $monoFont;
font-weight: normal;
font-style: normal;
background: #f8f8f8;
}
caption {
font-weight: bold;
text-align: left;
font-style: italic;
font-size: 1.1em;
}
// remove margin in descript listing
dd > pre {
@extend %pre;
margin: 0;
background: #faf9e2;
border-color: #dddaaa #dddaaa #d7d7a9;
}
// warnings are red
.warning {
@extend %note;
@extend %note-red;
}
// complexity classes are blue boxes
.with-complexity {
display: flex;
display: -webkit-flex;
justify-content: space-between;
-webkit-justify-content: space-between;
align-items: flex-start; .module + .package, p.src > .link { float: right; }
-webkit-align-items: flex-start;
align-content: stretch; .src {
-webkit-align-content: stretch; white-space: normal;
} @extend %code;
}
.complexity { p.src { @extend %block-label, %label-blue; }
@extend %note; dt.src { white-space: normal; }
@extend %note-blue;
margin: 0;
margin-left: 1em;
margin-bottom: 0.75em;
font-style: italic;
white-space: nowrap;
flex-shrink: 0;
-webkit-flex-shrink: 0;
order: 2;
-webkit-order: 2;
}
.complexity + span {
order: 1;
-webkit-order: 1;
}
// add box type to "since: ..."
.added {
@extend %note;
@extend %note-gold;
}
.added-cell {
@extend %note-gold;
}
.fields h3 {
display: none;
}
// separate types more .top > .subs { margin-left: 2em; }
.src { .subs p.src { margin-top: 1em; }
margin-top: 2.5em;
}
h1 + .top .src, dt > code, .complexity, .version { @extend %label; }
h2 + .top .src, .complexity, .version { @extend %label-green; }
h3 + .top .src,
.caption + .top .src {
margin-top: 0;
}
// but not for first type table { margin: 1em 0; }
h1 + .top, td > pre { margin: 0; }
h2 + .top,
h3 + .top,
h4 + .top {
margin-top: 0;
}
// change color of example code .warning { @extend %note; }
.example {
border: 1px solid;
background: #faf9e2;
border-color: #dddaaa #dddaaa #d7d7a9;
} }

@ -2,146 +2,54 @@ module Docs
class Haskell class Haskell
class CleanHtmlFilter < Filter class CleanHtmlFilter < Filter
def call def call
root_page? ? root : other
doc
end
# remove unwanted elements def root
css('#footer', '#package-header', '#module-header', '#synopsis', '.link', '#table-of-contents', '.package').remove css('#description', '#module-list').each do |node|
node.before(node.children).remove
# cpations in tables are h3
css('table .caption').each do |node|
node.name = 'h3'
end
# turn captions into real headers
css('.caption').each do |node|
node.name = 'h1'
end
# section
css('.top > .caption').each do |node|
node.name = 'h2'
end
# subsections
css('.top > .subs > .caption', '.fields > .caption').each do |node|
node.name = 'h3'
end
# subsubsections
css('.top > .subs > .subs > .caption').each do |node|
node.name = 'h4'
end
# ...
css('.top > .subs > .subs > .subs > .caption').each do |node|
node.name = 'h5'
end
# ......
css('.top > .subs > .subs > .subs > .subs > .caption').each do |node|
node.name = 'h6'
end
# all pre's are examples
css('pre').each do |node|
node.add_css_class('example')
end end
end
# turn source listing in to pre def other
css('.src').each do |node| css('h1').each do |node|
if node.name != "td" node.remove if node.content == 'Documentation'
node.name = 'pre'
end
end end
# check if second column of table is totally empty. css('h1, h2, h3, h4').each do |node|
# and remove it if it is node.name = node.name.sub(/\d/) { |i| i.to_i + 1 }
css('table').each do |table|
empty = true
table.css('td + td').each do |snd|
empty = empty && snd['class'] =~ /empty/
end
if empty
# remove empty column
table.css('td + td').remove
end
end end
# move table captions into the tables at_css('#module-header').tap do |node|
css(".caption + table").each do |table| heading = at_css('.caption')
caption = table.previous heading.name = 'h1'
caption.name = "caption" node.before(heading)
caption.parent = table node.before(node.children).remove
end end
css(".caption + .show table").each do |table| css('#synopsis').remove
caption = table.parent.parent.css('.caption')[0]
caption.name = 'caption'
caption.parent = table
end
# better arguments display: css('#interface', 'h2 code').each do |node|
css('.src + .arguments table').each do |table| node.before(node.children).remove
src = table.parent.previous # the function name
row = doc.document.create_element('tr')
table.css('tr')[0].before(row)
src.parent = row
src.name = "th"
src['colspan'] = 2
end end
# remove root page title css('a[name]').each do |node|
if root_page? node['id'] = node['name']
at_css('h1').remove node.remove_attribute('name')
end end
# add id to links (based on name) css('p.caption').each do |node|
css('a').each do |node| node.name = 'h4'
if node['name']
node['id'] = node['name']
end
end
# make code in description into proper pre
css('dd > code').each do |node|
node.name = 'pre'
end end
# add some informational boxes
css('em').each do |node| css('em').each do |node|
if node.content == 'Deprecated.' if node.content.start_with?('O(')
# Make deprecated messages red. node.name = 'span'
node.parent.add_css_class('warning') node['class'] = 'complexity'
elsif node.content =~ /O\(.*\)/ elsif node.content.start_with?('Since')
# this is big_O notation, but only apply the class if this is not node.name = 'span'
# inside running text (it must be at the start of a paragraph) node['class'] = 'version'
# from:
# <p><em>O(n)</em>. Koel ok</p>
# to:
# <p class="with-complexity">
# <span class="complexity">O(n)</span>
# <span>Koel ok</span>
# </p>
if node.previous == nil
node.add_css_class('complexity') # add css class
node.name="span" # just make it div
node.next.content = node.next.content.gsub(/^. /, "") # remove . if directly after em
node.content = node.content.gsub(/\.$/, "") # remove trailing . if it's inside em
# reparent the nodes
cont = doc.document.create_element "p", :class => "with-complexity"
node.parent.previous = cont
par = node.parent
node.parent = cont
par.parent = cont
par.name = "span"
end
elsif node.content =~ /Since: .*/
# add box to 'Since:' annotations
if node.parent.parent.name == "td"
node.parent.parent.add_css_class('added-cell')
else
node.add_css_class('added')
end
end end
end end
@ -150,10 +58,3 @@ module Docs
end end
end end
end end
class Nokogiri::XML::Node
def add_css_class( *classes )
existing = (self['class'] || "").split(/\s+/)
self['class'] = existing.concat(classes).uniq.join(" ")
end
end

@ -1,55 +1,54 @@
module Docs module Docs
class Haskell class Haskell
class EntriesFilter < Docs::EntriesFilter class EntriesFilter < Docs::EntriesFilter
IGNORE_ENTRIES_PATHS = %w(
bytestring-0.10.4.0/Data-ByteString-Lazy.html
bytestring-0.10.4.0/Data-ByteString-Char8.html
bytestring-0.10.4.0/Data-ByteString-Lazy-Char8.html
array-0.5.0.0/Data-Array-IArray.html
containers-0.5.5.1/Data-IntMap-Lazy.html
containers-0.5.5.1/Data-Map-Lazy.html
unix-2.7.0.1/System-Posix-Files-ByteString.html
filepath-1.3.0.2/System-FilePath-Windows.html
transformers-0.3.0.0/Control-Monad-Trans-RWS-Lazy.html
transformers-0.3.0.0/Control-Monad-Trans-Writer-Lazy.html
base-4.7.0.0/GHC-Conc-Sync.html
base-4.7.0.0/GHC-IO-Encoding-UTF32.html
unix-2.7.0.1/System-Posix-Terminal-ByteString.html)
# gets name and type in one fell swoop def get_name
# at_css('#module-header .caption').content.strip
# eg. end
# Control.Monad > [Monad, Control]
# Control.Concurrent.Mvar > [Concurrent.MVar, Control]
# Array > [Array, nil]
def get_name_and_type
if at_css('h1') && at_css('h1').content == 'Haskell Hierarchical Libraries'
puts 'ok'
name = 'Haskell'
type = nil
else
# find full module identifier
caption = at_css('#module-header .caption')
if caption def get_type
# split the module path %w(System.Posix System.Win32 Control.Monad).each do |type|
parts = caption.content.split('.') return type if name.start_with?(type)
end
if parts.length > 1 if name.start_with?('Data')
# if more than one part then the name.split('.')[0..1].join('.')
# first is the type and the rest is the name else
type = parts[0] name.split('.').first
name = parts.drop(1).join('.')
else
# if only one part, this is the name
name = parts[0]
type = nil
end
else
# no caption found -> no type / no name
name = 'no-name'
type = 'no-type'
end
end end
[name, type]
end end
# get the name def additional_entries
def get_name return [] if IGNORE_ENTRIES_PATHS.include?(subpath)
n, t = get_name_and_type()
n css('#synopsis > ul > li').each_with_object [] do |node, entries|
link = node.at_css('a')
next unless link['href'].start_with?('#')
name = node.content.strip
name.remove! %r{\A(?:module|data|newtype|class|type family m|type)\s+}
name.sub! %r{\A\((.+?)\)}, '\1'
name.sub!(/ (?:\:\: (\w+))?.+\z/) { |_| $1 ? " (#{$1})" : '' }
next if name == self.name
entries << [name, link['href'].remove('#')]
end
end end
# get the type def include_default_entry?
def get_type at_css('#synopsis > ul > li')
n, t = get_name_and_type()
t
end end
end end
end end

@ -1,24 +1,32 @@
module Docs module Docs
class Haskell < UrlScraper class Haskell < UrlScraper
self.name = 'Haskell' self.name = 'Haskell'
self.slug = 'haskell'
self.type = 'haskell' self.type = 'haskell'
self.version = '7.8.2' self.version = '7.8.2'
self.base_url = 'http://www.haskell.org/ghc/docs/7.8.2/html/libraries/' self.base_url = 'http://www.haskell.org/ghc/docs/7.8.2/html/libraries/'
self.initial_paths = ['/index.html'] self.root_path = 'index.html'
html_filters.push 'haskell/entries' html_filters.push 'haskell/entries', 'haskell/clean_html'
html_filters.push 'haskell/clean_html'
html_filters.push 'title'
options[:container] = '#content'
options[:container] = '#content' options[:skip] = %w(
options[:skip_patterns] = [/src/, /index/, /haskell2010/, /ghc-/, /Cabal-/] # skip source listings and index files hoopl-3.10.0.1/Compiler-Hoopl-Internals.html
base-4.7.0.0/Control-Exception-Base.html
binary-0.7.1.0/Data-Binary-Get-Internal.html
template-haskell-2.9.0.0/Language-Haskell-TH-Lib.html
haskell98-2.0.0.3/Prelude.html
pretty-1.1.1.1/Text-PrettyPrint.html
base-4.7.0.0/Data-OldTypeable-Internal.html
base-4.7.0.0/Data-Typeable-Internal.html
base-4.7.0.0/GHC-IO-Encoding-Types.html
unix-2.7.0.1/System-Posix-Process-Internals.html)
options[:skip_patterns] = [/src\//, /doc-index/, /haskell2010/, /ghc-/, /Cabal-/]
options[:attribution] = <<-HTML options[:attribution] = <<-HTML
&copy; The University Court of the University of Glasgow.<br> &copy; The University of Glasgow and others<br>
All rights reserved. <a href="http://www.haskell.org/ghc/license">See here for more info</a> Licensed under a BSD-style license (see top of the page).
HTML HTML
end
end
end end

Binary file not shown.

Before

Width:  |  Height:  |  Size: 994 B

After

Width:  |  Height:  |  Size: 624 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 705 B

@ -0,0 +1 @@
http://www.haskell.org/haskellwiki/Thompson-Wheeler_logo
Loading…
Cancel
Save