Add a scraper for Scala 3

pull/1708/head
Nicolas Ettlin 3 years ago committed by Simon Legner
parent 7a9b9fe5be
commit afe3a26c7a

@ -794,9 +794,9 @@ credits = [
'https://raw.githubusercontent.com/sass/sass/stable/MIT-LICENSE'
], [
'Scala',
'2002-2019 EPFL, with contributions from Lightbend',
'2002-2022 EPFL, with contributions from Lightbend',
'Apache',
'https://raw.githubusercontent.com/scala/scala-lang/master/license.md'
'https://www.scala-lang.org/license/'
], [
'scikit-image',
'2019 the scikit-image team',

@ -1,4 +1,38 @@
// Rules scoped to the `._scala` root class.
// NOTE(review): %simple, %label-red, %pre, %box, %user-select-none and the
// `print` mixin are defined elsewhere in the project — confirm their content there.
._scala {
@extend %simple;
.deprecated { @extend %label-red; }
// Lists of related types: preformatted look, but the text is allowed to wrap.
.related-types {
@extend %pre;
margin-top: 0;
white-space: normal;
}
// Centered box of links; not rendered when printing.
.links {
@extend %box;
margin-left: -1rem;
text-align: center;
padding: .5em;
a { padding: .4em }
@include print {
display: none;
}
}
// Small right-floated link styled with the link color and underlined on
// hover; not rendered when printing.
.source-link {
float: right;
font-size: .75rem;
color: var(--linkColor);
cursor: pointer;
@extend %user-select-none;
&:hover { text-decoration: underline; }
@include print {
display: none;
}
}
}

@ -0,0 +1,181 @@
# frozen_string_literal: true
module Docs
class Scala
class CleanHtmlV3Filter < Filter
# Filter entry point: removes unneeded elements, reformats the title, the top
# links, the metadata and the members, then flattens the member lists into
# the content node.
#
# Returns `doc` after it has been narrowed to `#content > div`.
def call
  # Remove unneeded elements
  css('.documentableFilter, .documentableAnchor, .documentableBrief').remove

  format_title
  format_top_links
  format_metadata
  format_members

  # Simplify the HTML structure: narrow the document to the content wrapper
  # and re-parent every child of a member list directly under it.
  @doc = at_css('#content > div')
  css('.documentableList > *').each { |element| element.parent = doc }

  # Drop the members wrapper when there is one; a page without it must not
  # abort the filter with a NoMethodError on nil.
  at_css('.membersList')&.remove

  doc
end
# Rewrites the page heading.
#
# * Replaces `.cover-header` (when present) with a single <h1> of the form
#   "<Kind> <package>.<name>", or "Package root" on the root page.
# * Replaces `.signature` (when present) with <h2 id="signature">, turning its
#   `.annotations` child into a <small> element.
def format_title
  # Add the kind of page to the title
  cover_header = at_css('.cover-header')
  unless cover_header.nil?
    types = {
      cl: 'Class',
      ob: 'Object',
      tr: 'Trait',
      en: 'Enum',
      ty: 'Type',
      pa: 'Package',
    }

    # Strip the 'micon ' prefix and the '-wc' suffix from the icon's class
    # attribute so that only the kind code (a key of `types`) remains.
    icon = cover_header.at_css('.micon')
    type_id = icon['class'].gsub('micon ', '').gsub('-wc', '')
    type = types[type_id.to_sym]

    name = CGI.escapeHTML(cover_header.at_css('h1').text)

    # The text of the third breadcrumb link is used as the package prefix.
    # It is escaped like `name` because it is interpolated into HTML below,
    # and a missing link yields an empty prefix instead of a crash.
    package_link = at_css('.breadcrumbs a:nth-of-type(3)')
    package = package_link.nil? ? '' : CGI.escapeHTML(package_link.text)
    package += '.' unless name.empty? || package.empty?

    title = root_page? ? 'Package root' : "#{type} #{package}#{name}".strip
    cover_header.replace "<h1>#{title}</h1>"
  end

  # Signature
  signature = at_css('.signature')
  return if signature.nil?

  signature_annotations = signature.at_css('.annotations')
  signature_annotations.name = 'small' unless signature_annotations.nil?
  signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
end
# Moves the "Companion:" and "Source:" entries out of the `.attributes`
# definition list and renders them as a `.links` box right after the <h1>.
def format_top_links
  # Companion page
  links = extract_attribute_links('Companion:') { |link| "Companion #{link.content}" }
  # Source code
  links.concat(extract_attribute_links('Source:') { |_link| 'Source code' })

  # Format the links
  title = at_css('h1')
  title.add_next_sibling("<div class=\"links\">#{links.join(' • ')}</div>")
end

# Removes every <dt>/<dd> pair of `.attributes` whose <dt> text equals `label`
# and returns the HTML of the links found in the <dd> elements.
#
# The block receives each link node and returns its new text content.
# Pairs whose <dd> holds no link are left untouched.
def extract_attribute_links(label)
  attributes = at_css('.attributes')
  return [] if attributes.nil?

  attributes.css('dt').each_with_object([]) do |dt, links|
    next unless dt.content.strip == label

    # next_element (unlike next_sibling) skips whitespace text nodes that may
    # sit between the <dt> and its <dd>.
    dd = dt.next_element
    link = dd && dd.at_css('a')
    next if link.nil?

    link.content = yield(link)
    links << link.to_html
    dt.remove
    dd.remove
  end
end
# Folds the `.tabs.single` widget into the `.attributes` definition list:
# inserts a "Metadata" <h3> before the list, appends one <dt>/<dd> pair per
# tab (except the tab named "Graph") and finally removes the `.tabs` element.
def format_metadata
# Metadata (attributes)
css('.tabs.single .monospace').each do |node|
node['class'] = 'related-types'
# Nodes with more than 15 children are wrapped in a <details> element whose
# summary shows the child count.
if node.children.count > 15
node.replace "<details>
<summary>#{node.children.count} types</summary>
#{node.to_html}
</details>"
end
end
attributes = at_css('.attributes')
attributes.add_previous_sibling('<h3>Metadata</h3>')
tabs_names = css('.tabs.single .names .tab')
tabs_contents = css('.tabs.single .contents .tab')
# Tab names and tab contents are paired by position.
# NOTE(review): assumes both node sets have the same length — confirm against the generated HTML.
tabs_names.zip(tabs_contents).each do |name, contents|
next if name.content == "Graph"
attributes.add_child("<dt>#{name.content}</dt>")
attributes.add_child("<dd>#{contents.inner_html.strip}</dd>")
end
at_css('.tabs').remove
end
# Normalizes the member sections of the page:
#
# * demotes `.cover h2` to <h3>, removes every other <h2> except #signature,
#   and promotes group headers to plain-text <h2> elements;
# * turns each `.documentableElement` into an <h3> heading (carrying the
#   element's id, annotations and a "Source" link) followed by its content;
# * removes member lists whose group header mentions "deprecate";
# * marks <pre><code> blocks as Scala and unwraps the inner <code>.
def format_members
  # Headings
  css('.cover h2').each { |node| node.name = 'h3' }
  css('h2:not(#signature)').remove
  css(
    '.membersList h3',
    # Custom group headers for which Scaladoc generates invalid HTML
    '.documentableList > h3:empty + p'
  ).each do |node|
    node.name = 'h2'
    # Re-assigning the text content replaces any nested markup with plain text
    node.content = node.content
  end

  # Methods
  css('.documentableElement').each do |element|
    header = element.at_css('.header')
    header.name = 'h3'

    # Move the id from the wrapper (replaced below) to the heading
    id = element['id']
    element.remove_attribute('id')
    header['id'] = id

    # Same nil handling as the signature annotations in format_title
    annotations = element.at_css('.annotations')
    unless annotations.nil?
      annotations.name = 'small'
      header.prepend_child(annotations)
    end

    # View source
    element.css('dt').each do |dt|
      next if dt.content.strip != 'Source:'

      # next_element (unlike next_sibling) skips whitespace text nodes
      dd = dt.next_element
      source_link = dd && dd.at_css('a')
      next if source_link.nil?

      source_link.content = 'Source'
      source_link['class'] = 'source-link'
      header.prepend_child(source_link)
      dt.remove
      dd.remove
    end

    # Remove the unnecessary wrapper element
    element.replace(element.inner_html)
  end

  # Remove deprecated sections (lists without a group header are kept)
  css('.documentableList').each do |list|
    header = list.at_css('.groupHeader')
    list.remove if header && header.text.downcase.include?('deprecate')
  end

  # Code blocks
  css('pre > code').each do |code|
    pre = code.parent
    pre['data-language'] = 'scala'
    pre.inner_html = code.inner_html
  end
end
end
end
end

@ -0,0 +1,104 @@
# frozen_string_literal: true
module Docs
class Scala
class EntriesV3Filter < Docs::EntriesFilter
# Encoded operator fragments found in slugs, mapped to the symbol they stand
# for. Frozen so the shared constant cannot be mutated at runtime.
REPLACEMENTS = {
  '$eq' => '=',
  '$colon' => ':',
  '$less' => '<',
}.freeze
# Returns the entry name.
#
# For a package page this is the text of `.cover-header h1`. Otherwise it is
# derived from the last slug segment: `$$` becomes `$.`, the encoded operators
# in REPLACEMENTS are decoded, and every `$` standing between two characters
# that are neither `$` nor `.` becomes a dot.
def get_name
  return at_css('.cover-header h1').text if is_package?

  name = slug.split('/').last

  # Some objects have inner objects, show ParentObject$.ChildObject$ instead of ParentObject$$ChildObject$
  name = name.gsub('$$', '$.')

  REPLACEMENTS.each { |encoded, symbol| name = name.gsub(encoded, symbol) }

  # If a dollar sign is used as separator between two characters, replace it
  # with a dot. Lookarounds are used so that the neighbouring characters are
  # not consumed: consecutive separators such as A$B$C are all replaced.
  name.gsub(/(?<=[^$.])\$(?=[^$.])/, '.')
end
# Returns the group this entry is listed under: a package page is grouped
# under its parent package, any other page under its own package name.
def get_type
  is_package? ? parent_package : package_name
end
# Always true.
# NOTE(review): presumably consulted by the entries framework to decide
# whether the page itself gets an entry — confirm in Docs::EntriesFilter.
def include_default_entry?
  true
end
# Builds one `[title, id]` pair per documented member of the page.
#
# Members without an id, members containing a `.deprecated` node and members
# whose title was already emitted (function overloading) are skipped.
def additional_entries
  # Hash used as a set: constant-time duplicate detection
  seen_titles = {}

  css('.documentableElement').each_with_object([]) do |node, entries|
    # Ignore elements without IDs
    id = node['id']
    next if id.nil?

    # Ignore deprecated members
    next unless node.at_css('.deprecated').nil?

    member_name = node.at_css('.documentableName').content
    title = "#{name}.#{member_name}"

    # Add () to methods that take parameters, i.e. methods who have (…)
    # in their signature, ignoring occurrences of (implicit …) and (using …)
    signature = node.at_css('.signature').content
    title += '()' if signature.match?(/\((?!implicit)(?!using ).*\)/)

    next if seen_titles.key?(title) # Ignore duplicates (function overloading)

    seen_titles[title] = true
    entries << [title, id]
  end
end
private
# The package name is taken from the slug rather than parsed from the HTML,
# because companion object classes may be broken out into their own entries
# (by the source documentation). When that happens, these classes
# (like `scala.reflect.api.Annotations.Annotation`) should be grouped under
# the package name, and not under the fully-qualified name, which would
# include the companion object.
#
# Falls back to 'scala' when the slug has no package part.
def package_name
  package = package_drop_last(slug_parts)
  return 'scala' if package.empty?

  package
end
# Returns the package name without its last dot-separated segment, or
# 'scala' when nothing is left.
def parent_package
  segments = package_name.split('.')
  enclosing = package_drop_last(segments)
  return 'scala' if enclosing.empty?

  enclosing
end
# Joins all elements of `parts` except the last one with dots.
def package_drop_last(parts)
  leading = parts[0..-2]
  leading.join('.')
end
# Returns the slug split into its '/'-separated segments.
def slug_parts
  segments = slug.split('/')
  segments
end
# True when the cover header contains an icon with both the `micon` and `pa`
# classes, false otherwise.
def is_package?
  package_icon = at_css('.cover-header .micon.pa')
  !package_icon.nil?
end
end
end
end

@ -3,22 +3,41 @@ module Docs
self.name = 'Scala'
self.type = 'scala'
self.links = {
home: 'http://www.scala-lang.org/',
home: 'https://www.scala-lang.org/',
code: 'https://github.com/scala/scala'
}
options[:container] = '#content-container'
options[:attribution] = <<-HTML
&copy; 2002-2019 EPFL, with contributions from Lightbend.<br>
&copy; 2002-2022 EPFL, with contributions from Lightbend.<br>
Licensed under the Apache License, Version 2.0.
HTML
# For Scala 3, there is no official download link for the documentation
# (see https://contributors.scala-lang.org/t/5537).
#
# We currently need to build the docs ourselves. To do so:
# 1. Make sure that Scala 3 and sbt are installed
# (https://www.scala-lang.org/download/scala3.html)
# 2. Clone the Scala 3 (Dotty) repository (https://github.com/lampepfl/dotty)
# 3. From the Dotty folder, run this command in the terminal:
# $ sbt scaladoc/generateScalaDocumentation
# 4. Extract scaladoc/output/scala3/api/ into docs/scala~3.1
version '3.1' do
self.release = '3.1.1'
self.base_url = 'https://scala-lang.org/api/3.1.1/'
self.root_path = 'index.html'
# options[:container] = '#main-content'
html_filters.push 'scala/entries_v3', 'scala/clean_html_v3'
end
# https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
# Extract api/scala-library into docs/scala~2.13_library
version '2.13 Library' do
self.release = '2.13.0'
self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end
@ -29,6 +48,7 @@ module Docs
self.release = '2.13.0'
self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end
@ -39,6 +59,7 @@ module Docs
self.release = '2.12.9'
self.base_url = 'https://www.scala-lang.org/api/2.12.9/'
self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end
@ -49,13 +70,14 @@ module Docs
self.release = '2.12.9'
self.base_url = 'https://www.scala-lang.org/api/2.12.9/scala-reflect/'
self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end
def get_latest_version(opts)
doc = fetch_doc('https://www.scala-lang.org/api/current/', opts)
doc.at_css('#doc-version').content
doc = fetch_doc('https://www.scala-lang.org/api/3.x/', opts)
doc.at_css('.projectVersion').content
end
end
end

Loading…
Cancel
Save