Finish OpenJDK scraper

pull/621/head
Thibaut Courouble 8 years ago
parent f606c0abc8
commit 2efce74521

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 140 KiB

After

Width:  |  Height:  |  Size: 141 KiB

@ -1,5 +1,8 @@
[
[
"2017-04-30",
"New documentation: <a href=\"/openjdk/\">OpenJDK</a>"
], [
"2017-02-26",
"Refreshed design.",
"Added <a href=\"/settings\">Preferences</a>."

@ -413,6 +413,11 @@ credits = [
'2008-2017 NumPy Developers',
'NumPy',
'https://raw.githubusercontent.com/numpy/numpy/master/LICENSE.txt'
], [
'OpenJDK',
'1993-2017, Oracle and/or its affiliates. All rights reserved.<br>Licensed under the GNU General Public License, version 2, with the Classpath Exception.<br>Various third party code in OpenJDK is licensed under different licenses.<br>Java and OpenJDK are trademarks or registered trademarks of Oracle and/or its affiliates.',
'GPLv2',
'http://openjdk.java.net/legal/gplv2+ce.html'
], [
'OpenTSDB',
'2010-2016 The OpenTSDB Authors',

@ -123,6 +123,11 @@ table {
border-radius: 3px;
}
caption {
font-weight: $boldFontWeight;
padding: 0 .7em .3em;
}
th, td {
vertical-align: top;
padding: .3em .7em;

@ -43,6 +43,7 @@
%icon-clipboard-white { background-position: -1rem -2rem; }
%icon-close-white { background-position: -2rem -2rem; }
._icon-openjdk:before { background-position: -2rem 0; }
._icon-codeceptjs:before { background-position: -3rem 0; }
._icon-codeception:before { background-position: -4rem 0; }
._icon-sqlite:before { background-position: -5rem 0; @extend %darkIconFix !optional; }

@ -1,22 +1,7 @@
// Rules scoped to elements inside a ._openjdk container.
._openjdk {
// Direct-child inheritance list: note styling, no bullets on its items.
> ul.inheritance {
@extend %note, %note-blue;
li {
list-style: none;
}
}
@extend %simple;
// Block lists: no left padding and no bullets on their blockList items.
ul.blockList, ul.blockListLast {
padding-left: 0;
li.blockList {
list-style: none;
}
}
h3 {
@extend %block-heading;
}
h4 {
@extend %block-label, %label-blue;
}
// NOTE(review): ul.inheritance is styled again here in addition to the rule
// at the top of this block — confirm that both sets of rules are intended.
ul.inheritance { list-style: none; }
> ul.inheritance { @extend %note, %note-blue; }
// Nested inheritance lists get no extra margin.
> ul.inheritance ul.inheritance { margin: 0; }
}

@ -42,6 +42,10 @@ module Docs
context[:version]
end
# Returns the value stored under the :release key of +context+.
def release
context[:release]
end
# Returns the result of subpath_to(current_url), computed on first use and
# cached in @subpath for subsequent calls.
def subpath
  return @subpath if @subpath

  @subpath = subpath_to(current_url)
end

@ -116,7 +116,7 @@ module Docs
@options ||= self.class.options.deep_dup.tap do |options|
options.merge! base_url: base_url, root_url: root_url,
root_path: root_path, initial_paths: initial_paths,
version: self.class.version
version: self.class.version, release: self.class.release
if root_path?
(options[:skip] ||= []).concat ['', '/']

@ -1,7 +1,11 @@
# frozen_string_literal: true
module Docs
class Openjdk
class CleanHtmlFilter < Filter
def call
css('.topNav', '.subNav', '.bottomNav', '.legalCopy', 'noscript', '.subTitle').remove
# Preserve internal fragment links
# Transform <a name="foo"><!-- --></a><bar>text</bar>
# into <bar id="foo">text</bar>
@ -12,75 +16,117 @@ module Docs
end
end
# Find the main container
# Root page have three containers, we use the second one
container = at_css('.contentContainer' + (root_page? ? ':nth-of-type(2)' : ''))
# Move description to the container top
if description_link = at_css('a[href$=".description"]')
target = description_link['href'][1..-1]
description_nodes = xpath("//*[@id='#{target}'] | //*[@id='#{target}']/following-sibling::*")
container.prepend_child(description_nodes)
description_nodes.at_css('h2:contains("Description")')&.remove
description_link.parent.remove
end
# Remove superfluous and duplicated content
css('.subTitle', '.docSummary', '.summary caption', 'caption span.tabEnd').remove
css('table[class$="Summary"] > tr > th').each do |th|
th.parent.remove
end
css('h3[id$=".summary"]').each do |header|
# Keep only a minimal list of annotation required/optional elements
# as with "Methods inherited from class"
if header['id'].match? %r{\.element\.summary$}
table_summary = header.next_element
code_summary = header.document.create_element 'code'
table_summary.css('.memberNameLink a').each_with_index do |element, index|
code_summary << header.document.create_text_node(', ') if index > 0
code_summary << element
end
table_summary.replace(code_summary)
# Remove summary element if detail exists
elsif detail_header = at_css("h3[id='#{header['id'].sub('summary','detail')}']")
header.next_element.remove
header.replace(detail_header.parent.children)
end
end
at_css('.details')&.remove unless at_css('.details h3')
css('h3[id$=".summary"]', 'h3[id$=".detail"]', 'caption span').each do |header|
header.name = 'h3' if header.name == 'span'
content = header.content
content.remove! ' Summary'
content.remove! ' Detail'
header.content = content.pluralize
# Remove superfluous content on package pages
css('h2:contains("Package Specification")').each do |node|
node.next.remove while node.next
node.remove
end
css('h4').each do |entry_header|
entry_pre = entry_header.next_element
entry_header.children = entry_pre.children
entry_pre.remove
# Replace summary tables with their detail content
css('h3[id$=".summary"]').each do |node|
id = node['id'].sub('summary', 'detail')
detail = at_css("h3[id='#{id}']") || at_css("h3[id='#{id.remove('optional.').remove('required.')}']")
node.parent.children = detail.parent.children if detail
end
# Keep only header and container
container.prepend_child(at_css('.header'))
@doc = container
css('h3[id$=".summary"]', 'h3[id$=".detail"]').each do |node|
node.content = node.content.remove(' Summary').remove(' Detail').pluralize
end
# Remove packages not belonging to this version
if root_page?
at_css('.overviewSummary caption h3').content =
version + ' ' +
at_css('.overviewSummary caption h3').content
css('.overviewSummary td.colFirst a').each do |node|
unless context[:only_patterns].any? { |pattern| node['href'].match? pattern }
css('.header')[1].remove
css('.contentContainer')[0].remove
css('.contentContainer')[-1].remove
# Remove packages not belonging to this version
css('td.colFirst a').each do |node|
unless context[:only_patterns].any? { |pattern| pattern =~ node['href'] }
node.parent.parent.remove
end
end
at_css('h1').content = "OpenJDK #{release} Documentation" + (version != release ? " (#{version.split(' ').last})" : '')
end
css('table').each do |node|
node.remove_attribute 'summary'
node.remove_attribute 'cellspacing'
node.remove_attribute 'cellpadding'
node.remove_attribute 'border'
end
css('span.deprecatedLabel').each { |node| node.name = 'strong' }
css('.contentContainer', '.docSummary', 'div.header', 'div.description', 'div.summary', 'span', 'tbody').each do |node|
node.before(node.children).remove
end
css('tt').each { |node| node.name = 'code' }
css('div.block').each { |node| node.name = 'p' unless node.at_css('.block, p') }
# Create paragraphs
css('div > p:first-of-type').each do |node|
node.before('<p></p>')
node = node.previous
node.prepend_child(node.previous) while node.previous
end
css('ul > li > table:only-child').each do |node|
node.parent.parent.before(node)
end
css('blockquote > table:only-child', 'blockquote > dl:only-child').each do |node|
node.parent.before(node).remove
end
css('blockquote > pre:only-child').each do |node|
node.content = node.content.strip_heredoc
node.parent.before(node).remove
end
css('blockquote > code').each do |node|
node.parent.name = 'pre'
node.content = node.content.strip.gsub(/\s+/, ' ')
end
css('dt > cite').each do |node| # remove "See The Java™ Language Specification"
node.parent.next_element.remove
node.parent.remove
end
css('dt:contains("See Also")').each do |node|
unless node.next_element.at_css('a')
node.next_element.remove
node.remove
end
end
# Syntax highlighter
css('ul.blockList li.blockList:only-child').each do |node|
node.first_element_child['id'] ||= node.parent['id'] if node.parent['id']
node.parent.before(node.children).remove
end
css('hr + br', 'p + br', 'div + br', 'hr').remove
css('pre').each do |node|
node.content = node.content.strip
node['data-language'] = 'java'
end
css('.title').each do |node|
node.name = 'h1'
end
css('h3, h4').each do |node|
node.name = node.name.sub(/\d/) { |i| i.to_i - 1 }
end
css('*[title]').remove_attr('title')
css('*[class]').each do |node|
node.remove_attribute('class') unless node['class'] == 'inheritance'
end
doc
end
end

@ -1,3 +1,5 @@
# frozen_string_literal: true
module Docs
class Openjdk
class CleanUrlsFilter < Filter
@ -8,10 +10,10 @@ module Docs
# The following code ignores most options that InternalUrlsFilter accepts,
# only the currently used options are considered here.
self.class.parent.versions.each do |v|
if v.options[:only_patterns].any? { |pattern| path.match? pattern } &&
v.options[:skip_patterns].none? { |pattern| path.match? pattern }
node['href'] = "/#{v.slug}/#{path}"
self.class.parent.versions.each do |version|
if version.options[:only_patterns].any? { |pattern| path.match?(pattern) } &&
version.options[:skip_patterns].none? { |pattern| path.match?(pattern) }
node['href'] = "/#{version.slug}/#{path}"
break
end
end

@ -1,3 +1,5 @@
# frozen_string_literal: true
module Docs
class Openjdk
class EntriesFilter < Docs::EntriesFilter
@ -13,24 +15,28 @@ module Docs
end
# Determines the type under which the current page is grouped.
#
# Returns 'Packages' when the slug ends with 'package-summary'. Otherwise the
# type is the stripped text of the last .subTitle inside .header or, when no
# such element exists, the stripped .title text with 'Package ' removed. Only
# the first three dot-separated segments of that text are kept.
def get_type
  return 'Packages' if slug.end_with?('package-summary')

  subtitle = at_css('.header > .subTitle:last-of-type')
  type =
    if subtitle
      subtitle.content.strip
    else
      at_css('.header > .title').content.strip.remove 'Package '
    end

  # e.g. "java.util.concurrent.atomic" is grouped under "java.util.concurrent"
  type.split('.')[0..2].join('.')
end
# Builds the index entries for the members listed in the page's summary
# sections.
#
# Summary anchors whose name contains 'nested', 'constructor', 'field' or
# 'constant' are ignored. For every remaining '.memberNameLink a' link the
# entry name is the stripped text of the link's grandparent with the first
# non-empty parenthesised argument list collapsed to '()'. Only the first
# entry found for a given name is kept, i.e. overloaded methods are skipped
# in the index.
#
# Returns an array of [qualified_name, id] pairs, where qualified_name is
# "<name>.<member>" and id is the link's href with everything up to and
# including the last '#' removed.
def additional_entries
  skipped_kinds = %w[nested constructor field constant]

  css('a[name$=".summary"]').each_with_object({}) do |summary, entries|
    next if skipped_kinds.any? { |kind| summary['name'].include?(kind) }

    summary.parent.css('.memberNameLink a').each do |link|
      member = link.parent.parent.content.strip.sub(%r{\(.+?\)}m, '()')
      entries[member] ||= ["#{name}.#{member}", link['href'].remove(%r{.*#})]
    end
  end.values
end

@ -3,17 +3,12 @@ module Docs
self.name = 'OpenJDK'
self.type = 'openjdk'
self.root_path = 'overview-summary.html'
self.links = {
home: 'http://openjdk.java.net/',
code: 'http://hg.openjdk.java.net/jdk8u'
}
self.release = '8'
# Downloaded from packages.debian.org/sid/openjdk-8-doc
# extracting subdirectoy /usr/share/doc/openjdk-8-jre-headless/api
self.dir = '/Users/Thibaut/DevDocs/Docs/Java'
# Extracting subdirectory /usr/share/doc/openjdk-8-jre-headless/api
self.dir = '/Users/Thibaut/DevDocs/Docs/OpenJDK'
html_filters.push 'openjdk/entries', 'openjdk/clean_html'
html_filters.insert_after 'internal_urls', 'openjdk/clean_urls'
html_filters.push 'openjdk/entries', 'openjdk/clean_html'
options[:skip_patterns] = [
/compact[123]-/,
@ -25,11 +20,15 @@ module Docs
options[:attribution] = <<-HTML
&copy; 1993&ndash;2017, Oracle and/or its affiliates. All rights reserved.<br>
Use is subject to <a href="http://download.oracle.com/otndocs/jcp/java_se-8-mrel-spec/license.html">license terms</a>.<br>
We are not endorsed by or affiliated with Oracle.
Documentation extracted from Debian's OpenJDK Development Kit package.<br>
Licensed under the GNU General Public License, version 2, with the Classpath Exception.<br>
Various third party code in OpenJDK is licensed under different licenses (see Debian package).<br>
Java and OpenJDK are trademarks or registered trademarks of Oracle and/or its affiliates.
HTML
version 'Core' do
version '8' do
self.release = '8'
options[:only_patterns] = [
/\Ajava\/beans\//,
/\Ajava\/io\//,
@ -55,13 +54,17 @@ module Docs
/\Ajavax\/tools\//]
end
version 'GUI' do
version '8 GUI' do
self.release = '8'
options[:only_patterns] = [
/\Ajava\/awt\//,
/\Ajavax\/swing\//]
end
version 'Web' do
version '8 Web' do
self.release = '8'
options[:only_patterns] = [
/\Ajava\/applet\//,
/\Ajava\/rmi\//,

Binary file not shown.

After

Width:  |  Height:  |  Size: 327 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 393 B

Loading…
Cancel
Save