scala: finish scraper and filters

pull/705/head
Jasper van Merle 6 years ago
parent 74323fda3e
commit 6614375671

@ -2,97 +2,107 @@ module Docs
class Scala
class CleanHtmlFilter < Filter
# Filter entry point: narrows the document to the `#content` element, runs the
# shared clean-up and the title step, then the page-specific clean-up, and
# returns `doc`.
def call
  @doc = at_css('#content')

  always
  add_title

  # The page whose slug is 'index' has its own clean-up; all others share one.
  slug == 'index' ? root : other

  doc
end
# Clean-up applied to every page before the page-specific steps.
# NOTE(review): this span appears to interleave the pre- and post-change lines
# of a diff (several statements occur twice in an old and a new form, and one
# `do` has no matching `end`), so read it as a diff rendering rather than as a
# single runnable method body.
def always
# Remove deprecated sections: any `.members` block whose <h3> text mentions "deprecate".
css('.members').each do |members|
header = members.at_css('h3')
members.remove if header.text.downcase.include? 'deprecate'
end
# Some of this is just for 2.12
# These are things that provide interactive features, which are not supported yet.
css('#subpackage-spacer, #search, #mbrsel, .diagram-btn').remove
css('#footer').remove
css('.toggleContainer').remove
css('#mbrsel, #footer').remove
css('.diagram-container').remove
# Turn each toggle's title <span> and its `.hiddenContent` into a <dt>/<dd>
# pair and, when the page has an `.attributes` element, re-parent both under it.
css('.toggleContainer > .toggle').each do |node|
title = node.at_css('span')
next if title.nil?
content = node.at_css('.hiddenContent')
next if content.nil?
title.name = 'dt'
content.remove_attribute('class')
content.remove_attribute('style')
content.name = 'dd'
attributes = at_css('.attributes')
unless attributes.nil?
title.parent = attributes
content.parent = attributes
end
end
# Re-emit the `#signature` element as an <h2> with the same id and inner HTML.
signature = at_css('#signature')
signature.replace %Q|
<h2 id="signature">#{signature.inner_html}</h2>
|
signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
# Headings directly under `div.members` become <h2>.
css('div.members > h3').each do |node|
change_tag! 'h2', node
node.name = 'h2'
end
# Flatten each member list: every <li> gets a leading <h3> (its id is the part
# of the <li>'s `name` attribute after the last '#') that receives the
# modifier/kind and symbol nodes; then the <li>, and finally the <ol>, are
# swapped for their children.
css('div.members > ol').each do |list|
list.css('li').each do |li|
h3 = doc.document.create_element 'h3'
h3['id'] = li['name'].rpartition('#').last unless li['name'].nil?
li.prepend_child h3
li.css('.shortcomment').remove
modifier = li.at_css('.modifier_kind')
modifier.parent = h3 if modifier
modifier.parent = h3 unless modifier.nil?
kind = li.at_css('.modifier_kind .kind')
# Trailing space keeps the kind separated from whatever follows it in the heading.
kind.content = kind.content + ' ' unless kind.nil?
symbol = li.at_css('.symbol')
symbol.parent = h3 if symbol
symbol.parent = h3 unless symbol.nil?
li.swap li.children
end
list.swap list.children
end
# Tag <pre> blocks inside the comment sections as Scala code.
pres = css('.fullcomment pre, .fullcommenttop pre')
pres.each do |pre|
css('.fullcomment pre, .fullcommenttop pre').each do |pre|
pre['data-language'] = 'scala'
# NOTE(review): re-assigning content from itself presumably flattens nested
# markup to plain text — confirm against the Nokogiri documentation.
pre.content = pre.content
end
pres.add_class 'language-scala'
doc
end
# Clean-up specific to the index page; returns `doc`.
def root
  # '#filter' holds the filters to search through the types and packages,
  # '#library' holds the icons at the top.
  %w[#filter #library].each { |selector| css(selector).remove }
  doc
end
# Clean-up for every page other than the index page.
# NOTE(review): this span appears to mix pre- and post-change lines of a diff;
# `class_to_add` is assigned here but not used anywhere inside this method.
def other
# Sections of the documentation which do not seem useful
%w(#inheritedMembers #groupedMembers .permalink .hiddenContent .material-icons).each do |selector|
css(selector).remove
end
# This is the kind of thing we have, class, object, trait
# NOTE(review): raises NoMethodError when the page has no `.modifier_kind .kind` node.
kind = at_css('.modifier_kind .kind').content
# this image replacement doesn't do anything on 2.12 docs
# The first <img> on the page, if any, is replaced by a <span> carrying the kind text.
img = at_css('img')
img.replace %Q|<span class="img_kind">#{kind}</span>| unless img.nil?
class_to_add = kind == 'object' ? 'value': 'type'
# Things that are not shown on the site, like deprecated members
css('li[visbl=prt]').remove
end
# Replaces the `#definition` header with an <h1> title, followed by the
# `.morelinks` block wrapped in an <h3> when one exists.
# NOTE(review): this span appears to interleave pre- and post-change lines of a
# diff — an older `change_tag!` helper and a `private` marker sit in the middle
# of the statements — so it does not parse as one method as shown.
def add_title
css('.permalink').remove
# for 2.10, 2.11, the kind class is associated to the body. we have to
# add it somewhere, so we do that with the #definition.
definition = css('#definition')
definition.css('.big_circle').remove
definition.add_class class_to_add
definition = at_css('#definition')
# Pages without a `#definition` element get no title.
return if definition.nil?
# this is something that is not shown on the site, such as deprecated members
css('li[visbl=prt]').remove
# Map the one-letter text of the `.big-circle` badge to a full kind name.
type_full_name = {a: 'Annotation', c: 'Class', t: 'Trait', o: 'Object', p: 'Package'}
type = type_full_name[definition.at_css('.big-circle').text.to_sym]
# The heading text is HTML-escaped because it is interpolated into markup below.
name = CGI.escapeHTML definition.at_css('h1').text
doc
end
# The owner (package) text falls back to '' when `#owner` is missing.
package = definition.at_css('#owner').text rescue ''
# Append the joining dot only when both a name and a package are present.
package = package + '.' unless name.empty? || package.empty?
private
other = definition.at_css('.morelinks').dup
other_content = other ? "<h3>#{other.to_html}</h3>" : ''
def change_tag!(new_tag, node)
node.replace %Q|<#{new_tag}>#{node.inner_html}</#{new_tag}>|
# The root page gets a fixed title; other pages get "<kind> <package.><name>".
title_content = root_page? ? 'Package root' : "#{type} #{package}#{name}".strip
title = "<h1>#{title_content}</h1>"
definition.replace title + other_content
end
end
end

@ -1,32 +0,0 @@
module Docs
  class Scala
    # Rewrites the `#definition` header of a page into a plain
    # `<h1><small>kind package.</small>name</h1>` title, followed by the
    # `.morelinks` block wrapped in an `<h3>` when the page has one.
    class CleanHtml210Filter < Filter
      # Replaces `#definition` with the simplified title markup.
      #
      # Missing sub-elements (`.img_kind`, `h1`, `#owner`, `.morelinks`) are
      # treated as empty instead of raising, so a page with an unusual header
      # still gets filtered.
      #
      # @return the filtered document (`doc`); the page is left untouched when
      #   it has no `#definition` element.
      def call
        definition = at_css('#definition')
        return doc unless definition

        type = text_at(definition, '.img_kind')
        name = text_at(definition, 'h1').strip
        package = text_at(definition, '#owner')

        # Join package and name with a dot only when there is a package to
        # join and the name is neither empty nor a "root..." name; otherwise
        # a missing `#owner` would leave a stray "." in front of the name.
        package += '.' unless name.empty? || package.empty? || name.start_with?('root')

        more_links = definition.at_css('.morelinks')
        other_content = more_links ? "<h3>#{more_links.to_html}</h3>" : ''

        definition.replace "\n<h1><small>#{type} #{package}</small>#{name}</h1>\n#{other_content}\n"

        doc
      end

      private

      # Text of the first descendant of `node` matching `selector`, or '' when
      # nothing matches.
      def text_at(node, selector)
        match = node.at_css(selector)
        match ? match.text : ''
      end

      # Replaces `node` with a `new_tag` element that has the same inner HTML.
      def change_tag!(new_tag, node)
        node.replace %Q|<#{new_tag}>#{node.inner_html}</#{new_tag}>|
      end
    end
  end
end

@ -1,36 +0,0 @@
module Docs
  class Scala
    # Rewrites the `#definition` header of a page into a plain
    # `<h1><small>kind package.</small>name</h1>` title, followed by the
    # `.morelinks` block wrapped in an `<h3>` when the page has one.
    class CleanHtml212Filter < Filter
      # One-letter text of the `.big-circle` badge => kind name used in the title.
      TYPE_FULL_NAMES = { c: 'class', t: 'trait', o: 'object', p: 'package' }.freeze

      # Removes `.permalink` elements and replaces `#definition` with the
      # simplified title markup.
      #
      # Missing sub-elements (`.big-circle`, `h1`, `#owner`, `.morelinks`) are
      # treated as empty instead of raising; an unknown badge letter yields an
      # empty kind.
      #
      # @return the filtered document (`doc`); apart from the permalink
      #   removal the page is left untouched when it has no `#definition`.
      def call
        css('.permalink').remove

        definition = at_css('#definition')
        return doc unless definition

        type = TYPE_FULL_NAMES[text_at(definition, '.big-circle').to_sym]
        name = text_at(definition, 'h1')
        package = text_at(definition, '#owner')

        # Join package and name with a dot only when both are present.
        package += '.' unless name.empty? || package.empty?

        more_links = definition.at_css('.morelinks')
        other_content = more_links ? "<h3>#{more_links.to_html}</h3>" : ''

        definition.replace "\n<h1><small>#{type} #{package}</small>#{name}</h1>\n#{other_content}\n"

        doc
      end

      private

      # Text of the first descendant of `node` matching `selector`, or '' when
      # nothing matches.
      def text_at(node, selector)
        match = node.at_css(selector)
        match ? match.text : ''
      end

      # Replaces `node` with a `new_tag` element that has the same inner HTML.
      def change_tag!(new_tag, node)
        node.replace %Q|<#{new_tag}>#{node.inner_html}</#{new_tag}>|
      end
    end
  end
end

@ -1,14 +1,30 @@
module Docs
class Scala
class EntriesFilter < Docs::EntriesFilter
# Encoded operator tokens that can appear in a slug, mapped to the symbol shown
# in the entry name. Frozen so the shared table cannot be mutated at runtime.
REPLACEMENTS = {
  '$eq' => '=',
  '$colon' => ':',
  '$less' => '<'
}.freeze
# Builds the entry name for the current page.
#
# Package pages use the text of `#definition h1` with every run of non-word
# characters removed, or "package" when there is no such heading. (This branch
# is mainly for scala 2.12 docs, which have their package listing as
# index.html.)
#
# Every other page uses the last segment of the slug, decoded in three steps:
#   1. operator tokens listed in REPLACEMENTS become their symbols;
#   2. `$$` becomes `$.` (ParentObject$$ChildObject$ -> ParentObject$.ChildObject$);
#   3. a `$` sitting between two characters becomes `.`.
#
# Step 1 has to run first: the `$` that starts an operator token also looks
# like a separator to step 3, so decoding afterwards would turn
# `$colon$colon` into ":.colon" instead of "::".
def get_name
  if is_package?
    heading = at_css('#definition h1')
    return heading ? heading.text.gsub(/\W+/, '') : 'package'
  end

  name = slug.split('/').last

  REPLACEMENTS.each { |token, symbol| name = name.gsub(token, symbol) }

  # Some objects have inner objects, show ParentObject$.ChildObject$ instead of ParentObject$$ChildObject$
  name = name.gsub('$$', '$.')

  # If a dollar sign is used as separator between two characters, replace it with a dot
  name.gsub(/([^$.])\$([^$.])/, '\1.\2')
end
@ -26,6 +42,31 @@ module Docs
true
end
# Collects one `[entry_name, anchor_id]` pair per member listed on the page.
#
# Only `.members li` nodes whose `name` attribute starts with "<type>.<name>"
# (dollar signs removed) are considered. Packages, deprecated members and
# members whose displayed name is missing or consists only of '#' are skipped.
def additional_entries
  prefix = "#{type}.#{name}".remove('$')

  css(%(.members li[name^="#{prefix}"])).each_with_object([]) do |member_node, found|
    # Ignore packages
    kind_node = member_node.at_css('.modifier_kind > .kind')
    next if kind_node && kind_node.content == 'package'

    # Ignore deprecated members
    next if member_node.at_css('.symbol > .name.deprecated')

    anchor = member_node['name'].rpartition('#').last

    # Ignore members only existing of hashtags, we can't link to that
    label_node = member_node.at_css('.name')
    next unless label_node
    next if label_node.content.strip.remove('#').blank?

    found << ["#{name}.#{label_node.content}()", anchor]
  end
end
private
# For the package name, we use the slug rather than parsing the package
@ -40,7 +81,6 @@ module Docs
end
# Returns what `package_drop_last` yields for the dot-separated segments of
# `package_name`, or '_root_' when that result is empty.
def parent_package
  parent = package_drop_last(package_name.split('.'))
  parent.empty? ? '_root_' : parent
end

@ -1,80 +1,60 @@
module Docs
class Scala < FileScraper
# Scraper configuration for the Scala API documentation: doc name and type,
# project links, and one `version` block per documentation set (release,
# base URL, root page, attribution and the HTML filters to run).
# NOTE(review): this span appears to interleave pre- and post-change lines of a
# diff (for example `self.name` is assigned twice and several `version ... do`
# openers have no matching `end`), so read it as a diff rendering rather than
# as runnable code.
include FixInternalUrlsBehavior
self.name = 'scala'
self.name = 'Scala'
self.type = 'scala'
self.links = {
home: 'http://www.scala-lang.org/',
code: 'https://github.com/scala/scala'
}
version '2.12 Library' do
self.release = '2.12.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/Scala212/api/scala-library' # https://downloads.lightbend.com/scala/2.12.3/scala-docs-2.12.3.zip
self.base_url = 'http://www.scala-lang.org/api/2.12.3/'
options[:container] = '#content-container'
options[:attribution] = <<-HTML
&copy; 2002-2019 EPFL, with contributions from Lightbend.
HTML
# https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
# Extract api/scala-library into docs/scala~2.13_library
version '2.13 Library' do
self.release = '2.13.0'
self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
self.root_path = 'index.html'
options[:attribution] = <<-HTML
Scala programming documentation. Copyright (c) 2003-2017 <a
href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
href="http://www.lightbend.com" target="_blank">Lightbend</a>.
HTML
html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_212'
html_filters.push 'scala/entries', 'scala/clean_html'
end
version '2.12 Reflection' do
self.release = '2.12.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/Scala212/api/scala-reflect' # https://downloads.lightbend.com/scala/2.12.3/scala-docs-2.12.3.zip
self.base_url = 'http://www.scala-lang.org/api/2.12.3/scala-reflect/'
# https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
# Extract api/scala-reflect into docs/scala~2.13_reflection
version '2.13 Reflection' do
self.release = '2.13.0'
self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
self.root_path = 'index.html'
options[:attribution] = <<-HTML
Scala programming documentation. Copyright (c) 2003-2017 <a
href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
href="http://www.lightbend.com" target="_blank">Lightbend</a>.
HTML
html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_212'
html_filters.push 'scala/entries', 'scala/clean_html'
end
version '2.11 Library' do
self.release = '2.11.8'
self.dir = '/Users/Thibaut/DevDocs/Docs/Scala211/api/scala-library' # https://downloads.lightbend.com/scala/2.11.8/scala-docs-2.11.8.zip
self.base_url = 'http://www.scala-lang.org/api/2.11.8/'
self.root_path = 'package.html'
# NOTE(review): the '.' in /^index.html/ is unescaped, so it matches any character there.
options[:skip_patterns] = [/^index.html/, /index\/index-/]
options[:attribution] = <<-HTML
Scala programming documentation. Copyright (c) 2003-2016 <a
href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
href="http://www.lightbend.com" target="_blank">Lightbend</a>.
HTML
html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
# https://downloads.lightbend.com/scala/2.12.6/scala-docs-2.12.6.zip
# Extract api/scala-library into docs/scala~2.12_library
version '2.12 Library' do
self.release = '2.12.6'
self.base_url = 'https://www.scala-lang.org/api/2.12.6/'
self.root_path = 'index.html'
html_filters.push 'scala/entries', 'scala/clean_html'
end
version '2.11 Reflection' do
self.release = '2.11.8'
self.dir = '/Users/Thibaut/DevDocs/Docs/Scala211/api/scala-reflect' # https://downloads.lightbend.com/scala/2.11.8/scala-docs-2.11.8.zip
self.base_url = 'http://www.scala-lang.org/api/2.11.8/scala-reflect/'
self.root_path = 'package.html'
options[:skip_patterns] = [/^index.html/, /index\/index-/]
options[:attribution] = <<-HTML
Scala programming documentation. Copyright (c) 2003-2016 <a
href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
href="http://www.lightbend.com" target="_blank">Lightbend</a>.
HTML
html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
# https://downloads.lightbend.com/scala/2.12.6/scala-docs-2.12.6.zip
# Extract api/scala-reflect into docs/scala~2.12_reflection
version '2.12 Reflection' do
self.release = '2.12.6'
self.base_url = 'https://www.scala-lang.org/api/2.12.6/scala-reflect/'
self.root_path = 'index.html'
html_filters.push 'scala/entries', 'scala/clean_html'
end
version '2.10' do
self.release = '2.10.6'
self.dir = '/Users/Thibaut/DevDocs/Docs/Scala210' # https://downloads.lightbend.com/scala/2.10.6/scala-docs-2.10.6.zip
self.base_url = 'http://www.scala-lang.org/api/2.10.6/'
self.root_path = 'package.html'
options[:skip_patterns] = [/^index.html/, /index\/index-/]
options[:attribution] = <<-HTML
Scala programming documentation. Copyright (c) 2003-2013 <a
href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
href="http://typesafe.com" target="_blank">Typesafe</a>.
HTML
html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
# Returns the content of the `#doc-version` element of the page fetched from
# the `api/current/` URL; `opts` is passed through to `fetch_doc` unchanged.
def get_latest_version(opts)
  fetch_doc('https://www.scala-lang.org/api/current/', opts)
    .at_css('#doc-version')
    .content
end
end
end

Loading…
Cancel
Save