Finish scikit-learn scraper

pull/511/head
Thibaut Courouble 8 years ago
parent 1dbc03fab4
commit c4a543933d

Binary file not shown.

Before

Width:  |  Height:  |  Size: 50 KiB

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 KiB

After

Width:  |  Height:  |  Size: 130 KiB

@ -7,7 +7,7 @@ class app.collections.Types extends app.Collection
(result[@_groupFor(type)] ||= []).push(type)
result.filter (e) -> e.length > 0
GUIDES_RGX = /(^|\()(guides?|tutorials?|reference|book|getting\ started|manual)($|[\):])/i
GUIDES_RGX = /(^|\()(guides?|tutorials?|reference|book|getting\ started|manual|examples)($|[\):])/i
APPENDIX_RGX = /appendix/i
_groupFor: (type) ->

@ -1,5 +1,8 @@
[
[
"2016-10-10",
"New documentation: <a href=\"/scikit_learn/\">scikit-learn</a>"
], [
"2016-09-18",
"New documentations: <a href=\"/pandas/\">pandas</a> and <a href=\"/twig/\">Twig</a>"
], [

@ -504,6 +504,11 @@ credits = [
'2011 the scikit-image team',
'BSD',
'http://scikit-image.org/docs/dev/license.html'
], [
'scikit-learn',
'2007-2016 The scikit-learn developers',
'BSD',
'https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/COPYING'
], [
'Sinon',
'2010-2016 Christian Johansen',

@ -120,6 +120,7 @@
._icon-scikit_image:before { background-position: -6rem -6rem; }
._icon-twig:before { background-position: -7rem -6rem; }
._icon-pandas:before { background-position: -8rem -6rem; }
._icon-scikit_learn:before { background-position: -9rem -6rem; }
._icon-bottle:before { background-position: 0 -7rem; }
._icon-docker:before { background-position: -1rem -7rem; }
._icon-cakephp:before { background-position: -2rem -7rem; }

@ -6,12 +6,12 @@
dd > dl:not(.docutils) > dt { @extend %block-label; }
dt + dt { margin-top: -.5em; }
.note, .admonition, div.versionadded, div.versionchanged, .deprecated-removed, .deprecated { @extend %note; }
.note, .admonition, div.versionadded, div.versionchanged, .deprecated-removed, .deprecated, .topic { @extend %note; }
.important { @extend %note-orange; }
.warning, .deprecated-removed, .deprecated { @extend %note-red; }
.versionmodified, span.title {
.versionmodified, span.title, .topic-title {
display: block;
font-weight: bold;
}
@ -37,16 +37,6 @@
.admonition-title + dl { padding-top: .5em; }
td > div { margin: 0 !important; }
.row-fluid {
h2 {
background: none;
border: none;
> a {
float: none;
}
}
}
}
._sphinx {

@ -0,0 +1,24 @@
module Docs
  class ScikitLearn
    # HTML filter applied to scikit-learn pages.
    #
    # Only the root page is modified: its <h1> text is set to 'scikit-learn'
    # and every `.row-fluid` node is replaced by a <dl> built from that node's
    # `.span4` descendants. All other pages pass through untouched.
    class CleanHtmlFilter < Filter
      # @return the filter's `doc`, modified in place when on the root page
      def call
        return doc unless root_page?

        at_css('h1').content = 'scikit-learn'

        css('.row-fluid').each do |row|
          # Each `.span4` contributes one term/definition pair: its first
          # element child becomes the <dt>, its last element child the <dd>.
          pairs = row.css('.span4').map do |cell|
            "<dt>#{cell.first_element_child.inner_html}</dt>" \
              "<dd>#{cell.last_element_child.inner_html}</dd>"
          end

          row.replace("<dl>#{pairs.join}</dl>")
        end

        doc
      end
    end
  end
end

@ -7,11 +7,14 @@ module Docs
name = at_css('dt').content.strip
name.sub! %r{\(.*}, '()' # Remove function arguments
name.remove! %r{[\=\[].*} # Remove [source] anchor
# name.remove! %r{\s=.*} # Remove the occasional '=' in class names
name.remove! %r{\A(class(method)?) (sklearn\.)?}
else
# User guide
name = at_css('h1').content.strip
name.remove! %r{\(.*?\)}
name.remove! %r{(?<![A-Z]):.*}
name.prepend 'Tutorial: ' if type == 'Tutorials'
name.prepend 'Example: ' if type == 'Examples'
end
name.remove! "\u{00B6}"
@ -23,14 +26,19 @@ module Docs
if subpath.start_with?('modules/generated')
type = at_css('dt > .descclassname').content.strip
type.remove! 'sklearn.'
type.remove! '.'
type.remove! %r{\.\z}
type
elsif subpath.start_with?('tutorial')
'Tutorials'
elsif subpath.start_with?('auto_examples')
'Examples'
else
'Guide'
end
end
def additional_entries
return [] unless subpath.start_with?('modules/generated')
entries = []
css('.class > dt[id]', '.exception > dt[id]', '.attribute > dt[id]').each do |node|

@ -2,7 +2,7 @@ module Docs
class Sphinx
class CleanHtmlFilter < Filter
def call
css('.headerlink', 'hr', '#contents .topic-title', '#topics .topic-title', 'colgroup').remove
css('.headerlink', 'hr', '#contents .topic-title', '#topics .topic-title', 'colgroup', '.line-block').remove
css('.contents > ul:first-child:last-child.simple > li:first-child:last-child').each do |node|
node.parent.before(node.at_css('> ul')) if node.at_css('> ul')

@ -3,32 +3,23 @@ module Docs
self.name = 'scikit-learn'
self.slug = 'scikit_learn'
self.type = 'sphinx'
self.release = '0.17.1'
self.base_url = "http://scikit-learn.org/0.17/"
self.release = '0.18'
self.base_url = 'http://scikit-learn.org/stable/'
self.root_path = 'documentation.html'
self.initial_paths = %w(
user_guide.html
supervised_learning.html
unsupervised_learning.html
model_selection.html
data_transforms.html)
self.links = {
home: 'http://scikit-learn.org/',
code: 'https://github.com/scikit-learn/scikit-learn'
}
html_filters.push 'scikit_learn/entries', 'sphinx/clean_html'
options[:container] = '.body'
options[:root_title] = self.name
html_filters.push 'scikit_learn/entries', 'scikit_learn/clean_html', 'sphinx/clean_html'
options[:only] = self.initial_paths
options[:only_patterns] = [/\Amodules/, /\Adatasets/]
options[:container] = ->(filter) { filter.root_page? ? '.container-index' : '.body' }
options[:skip] = %w(tutorial/statistical_inference/finding_help.html)
options[:only_patterns] = [/\Amodules/, /\Adatasets/, /\Atutorial/, /\Aauto_examples/]
options[:skip_patterns] = [/\Adatasets\/(?!index)/]
options[:attribution] = <<-HTML
&copy; 2007&ndash;2016 The scikit-learn deveopers<br>
&copy; 2007&ndash;2016 The scikit-learn developers<br>
Licensed under the 3-clause BSD License.
HTML

Binary file not shown.

Before

Width:  |  Height:  |  Size: 270 B

After

Width:  |  Height:  |  Size: 247 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 636 B

After

Width:  |  Height:  |  Size: 392 B

Loading…
Cancel
Save