Finish NumPy scraper

pull/406/head
Thibaut Courouble 9 years ago
parent 547f4f54ed
commit 6d36b339e0

Binary file not shown.

Before

Width:  |  Height:  |  Size: 55 KiB

After

Width:  |  Height:  |  Size: 45 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 149 KiB

After

Width:  |  Height:  |  Size: 118 KiB

@ -1,7 +1,7 @@
[
[
"2016-04-24",
"New documentation: <a href=\"/apache_pig/\">Apache Pig</a>"
"New documentations: <a href=\"/numpy/\">NumPy</a> and <a href=\"/apache_pig/\">Apache Pig</a>"
], [
"2016-04-17",
"New documentation: <a href=\"/perl/\">Perl</a>"

@ -335,6 +335,11 @@ credits = [
'npm, Inc. and Contributors<br>npm is a trademark of npm, Inc.',
'npm',
'https://raw.githubusercontent.com/npm/npm/master/LICENSE'
], [
'NumPy',
'2008-2016 NumPy Developers',
'NumPy',
'https://raw.githubusercontent.com/numpy/numpy/master/LICENSE.txt'
], [
'OpenTSDB',
'2010-2016 The OpenTSDB Authors',

@ -138,3 +138,4 @@
._icon-gcc:before { background-position: -2rem -11rem; }
._icon-perl:before { background-position: -3rem -11rem; }
._icon-apache_pig:before { background-position: -4rem -11rem; }
._icon-numpy:before { background-position: -5rem -11rem; }

@ -24,6 +24,8 @@
}
ul.simple { margin: 1em 0; }
dt > a.external { float: right; }
}
._sphinx {

@ -4,11 +4,58 @@ module Docs
def call
@doc = at_css('#spc-section-body')
css('.headerlink').remove # remove permalinks
css('colgroup').remove
# Add class for correct syntax highlighting
css('pre').each do |pre|
pre['class'] = 'python'
css('.section', 'a > em', 'dt > tt', 'dt > em', 'dt > big', 'tbody').each do |node|
node.before(node.children).remove
end
css('.headerlink').each do |node|
id = node['href'][1..-1]
node.parent['id'] ||= id
doc.at_css("span##{id}").try(:remove)
node.remove
end
css('tt', 'span.pre').each do |node|
node.name = 'code'
node.content = node.content
node.remove_attribute 'class'
end
css('h1', 'h2', 'h3').each do |node|
node.content = node.content
end
css('p.rubric').each do |node|
node.name = 'h4'
end
css('blockquote > div:first-child:last-child').each do |node|
node.parent.before(node.parent.children).remove
node.before(node.children).remove
end
css('.admonition-example').each do |node|
title = node.at_css('.admonition-title')
title.name = 'h4'
title.remove_attribute 'class'
node.before(node.children).remove
end
css('em.xref').each do |node|
node.name = 'code'
end
css('div[class*="highlight-"]').each do |node|
node.content = node.content.strip
node.name = 'pre'
node['data-language'] = node['class'][/highlight\-(\w+)/, 1]
node['class'] = node['data-language'] # tmp
end
css('table[border]').each do |node|
node.remove_attribute 'border'
end
doc

@ -2,38 +2,36 @@ module Docs
class Numpy
class EntriesFilter < Docs::EntriesFilter
def get_name
dt = at_css('dt')
if dt
name = dt.content
name.sub! /\(.*/, '()'
name.sub! /[\=\[].*/, ''
name.remove! 'class '
name.remove! 'classmethod '
name.remove! 'exception '
if dt = at_css('dt')
name = dt.content.strip
name.sub! %r{\(.*}, '()'
name.remove! %r{[\=\[].*}
name.remove! %r{\A(class(method)?|exception) }
name.remove! %r{\s—.*}
else
name = at_css('h1').content.strip
end
name.remove! '¶' # remove permalinks from title
name.remove! "\u{00B6}"
name
end
def get_type
type = name.dup
nav_items = at_css('.nav.nav-pills.pull-left').children
if nav_items[7]
# Infer type from navigation item if possible...
type = nav_items[7].content
nav_items = css('.nav.nav-pills.pull-left > li')
if nav_items[3]
type = nav_items[3].content
elsif nav_items[2] && nav_items[2].content !~ /Manual|Reference/
type = nav_items[2].content
else
# ... or the page is probably an overview, so use its title.
type = at_css('h1').content
type.remove! '¶' # remove permalinks from type
type = at_css('h1').content.strip
type.remove! "\u{00B6}"
# Handle some edge cases that aren't properly categorized in the docs
if type[0..16] == 'numpy.polynomial.'
if type.start_with?('numpy.polynomial.')
type = 'Polynomials'
elsif type[0..11] == 'numpy.ufunc.'
type = 'Universal functions (ufunc)'
elsif type[0..12] == 'numpy.nditer.'
elsif type.start_with?('numpy.ufunc.')
type = 'Universal functions'
elsif type.start_with?('numpy.nditer.')
type = 'Indexing routines'
elsif type == 'numpy.core.defchararray.chararray.argsort'
type = 'String operations'
@ -43,6 +41,13 @@ module Docs
type = 'Polynomials'
end
end
type.remove! ' with automatic domain'
type.remove! %r{\s*\(.*}
type.capitalize!
type.sub! 'c-api', 'C API'
type.sub! 'Numpy', 'NumPy'
type.sub! 'swig', 'Swig'
type
end
end

@ -2,7 +2,8 @@ module Docs
class Numpy < FileScraper
self.name = 'NumPy'
self.type = 'sphinx'
self.root_path = 'routines.html'
self.dir = '/Users/Thibaut/DevDocs/Docs/numpy/reference/'
self.root_path = 'index.html'
self.links = {
home: 'http://www.numpy.org/',
code: 'https://github.com/numpy/numpy'
@ -15,22 +16,19 @@ module Docs
# most pages.
options[:container] = '.main'
# "generated" pages seem to be autogenerated from python docstrings.
# "routines" are mostly lists that help organize the generated pages.
# Everything else is manual-like and probably not desired in Devdocs.
options[:only_patterns] = [
/routines\.?.*\.html/,
/generated.*/]
options[:skip_patterns] = [
/.*(?<!\.html)\z/,
/\Agenerated\/numpy\.chararray\.[\w\-]+.html\z/ # duplicate
]
options[:attribution] = <<-HTML
&copy; Copyright 2008-2015, The Scipy community.<br>
Licensed under a BSD-new License.
&copy; 2008&ndash;2016 NumPy Developers<br>
Licensed under the NumPy License.
HTML
version '1.10' do
self.release = '1.10'
self.dir = '/vagrant/numpy-html/reference/'
# self.base_url = 'http://docs.scipy.org/doc/numpy/reference/'
self.release = '1.10.1'
self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
end
end
end

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.6 KiB

After

Width:  |  Height:  |  Size: 834 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 2.3 KiB

@ -1 +1 @@
https://www.scipy.org/_static/images/numpylogo_med.png
https://github.com/numpy/numpy/blob/master/branding/icons/numpylogoicon.svg

Loading…
Cancel
Save