Merge pull request #1708 from Nicolapps/scala-3-scraper

Add a scraper for Scala 3
pull/1712/head
Simon Legner 3 years ago committed by GitHub
commit 39d3696efe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -794,9 +794,9 @@ credits = [
'https://raw.githubusercontent.com/sass/sass/stable/MIT-LICENSE' 'https://raw.githubusercontent.com/sass/sass/stable/MIT-LICENSE'
], [ ], [
'Scala', 'Scala',
'2002-2019 EPFL, with contributions from Lightbend', '2002-2022 EPFL, with contributions from Lightbend',
'Apache', 'Apache',
'https://raw.githubusercontent.com/scala/scala-lang/master/license.md' 'https://www.scala-lang.org/license/'
], [ ], [
'scikit-image', 'scikit-image',
'2019 the scikit-image team', '2019 the scikit-image team',

@ -1,5 +1,5 @@
/* PrismJS 1.26.0 /* PrismJS 1.27.0
https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markup-templating+matlab+nginx+nim+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+shell-session+sql+typescript+yaml+zig */ https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markup-templating+matlab+nginx+nim+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+scala+shell-session+sql+typescript+yaml+zig */
/// <reference lib="WebWorker"/> /// <reference lib="WebWorker"/>
var _self = (typeof window !== 'undefined') var _self = (typeof window !== 'undefined')
@ -4660,6 +4660,56 @@ Prism.languages.insertBefore('scss', 'function', {
Prism.languages.scss['atrule'].inside.rest = Prism.languages.scss; Prism.languages.scss['atrule'].inside.rest = Prism.languages.scss;
// Scala grammar: created by extending the Java grammar with the token
// definitions below.
Prism.languages.scala = Prism.languages.extend('java', {
// """…""" literals (the body may span several lines); aliased to 'string'.
'triple-quoted-string': {
pattern: /"""[\s\S]*?"""/,
greedy: true,
alias: 'string'
},
// Single-line literals delimited by " or ', with backslash escapes.
'string': {
pattern: /("|')(?:\\.|(?!\1)[^\\\r\n])*\1/,
greedy: true
},
// The <- and => operators plus the listed reserved words.
'keyword': /<-|=>|\b(?:abstract|case|catch|class|def|do|else|extends|final|finally|for|forSome|if|implicit|import|lazy|match|new|null|object|override|package|private|protected|return|sealed|self|super|this|throw|trait|try|type|val|var|while|with|yield)\b/,
// Hexadecimal or decimal literals with an optional exponent and an optional
// d/f/l suffix; matching is case-insensitive.
'number': /\b0x(?:[\da-f]*\.)?[\da-f]+|(?:\b\d+(?:\.\d*)?|\B\.\d+)(?:e\d+)?[dfl]?/i,
// A fixed list of type names (Any, Int, String, Unit, …).
'builtin': /\b(?:Any|AnyRef|AnyVal|Boolean|Byte|Char|Double|Float|Int|Long|Nothing|Short|String|Unit)\b/,
// A single quote followed by an identifier-like run, e.g. 'name.
'symbol': /'[^\d\s\\]\w*/
});
// Insert the 'string-interpolation' token before 'triple-quoted-string' in
// the Scala grammar.
Prism.languages.insertBefore('scala', 'triple-quoted-string', {
'string-interpolation': {
// An identifier immediately followed by a """…""" or "…" literal. Inside the
// literal, ${…} may contain one further level of nested braces.
pattern: /\b[a-z]\w*(?:"""(?:[^$]|\$(?:[^{]|\{(?:[^{}]|\{[^{}]*\})*\}))*?"""|"(?:[^$"\r\n]|\$(?:[^{]|\{(?:[^{}]|\{[^{}]*\})*\}))*")/i,
greedy: true,
inside: {
// The identifier in front of the opening quote; aliased to 'function'.
'id': {
pattern: /^\w+/,
greedy: true,
alias: 'function'
},
// The sequences \$" , $$ and $" ; aliased to 'symbol'.
'escape': {
pattern: /\\\$"|\$[$"]/,
greedy: true,
alias: 'symbol'
},
// $name or ${…}; the delimiters are punctuation and the rest is tokenized
// with the Scala grammar itself.
'interpolation': {
pattern: /\$(?:\w+|\{(?:[^{}]|\{[^{}]*\})*\})/,
greedy: true,
inside: {
'punctuation': /^\$\{?|\}$/,
'expression': {
pattern: /[\s\S]+/,
inside: Prism.languages.scala
}
}
},
// Whatever was not matched by the tokens above is plain string content.
'string': /[\s\S]+/
}
}
});
// Drop the 'class-name' and 'function' tokens from the Scala grammar
// (presumably inherited from the Java grammar it extends — confirm).
delete Prism.languages.scala['class-name'];
delete Prism.languages.scala['function'];
(function (Prism) { (function (Prism) {
// CAREFUL! // CAREFUL!

@ -1,4 +1,43 @@
._scala { ._scala {
@extend %simple; @extend %simple;
.deprecated { @extend %label-red; } .deprecated { @extend %label-red; }
// No outer margin on definition lists and code blocks inside attribute tables
.attributes dl,
.attributes pre {
margin: 0;
}
// Lists of related types: styled like a code block, but allowed to wrap
.related-types {
@extend %pre;
margin: 0;
white-space: normal;
}
// Box of links shown below the page title; not shown when printing
.links {
@extend %box;
margin-left: -1rem;
text-align: center;
padding: .5em;
a { padding: .4em }
@include print {
display: none;
}
}
// Small "source" link floated to the right of a member header; not shown
// when printing
.source-link {
float: right;
font-size: .75rem;
color: var(--linkColor);
cursor: pointer;
@extend %user-select-none;
&:hover { text-decoration: underline; }
@include print {
display: none;
}
}
} }

@ -1,6 +1,6 @@
module Docs module Docs
class Scala class Scala
class CleanHtmlFilter < Filter class CleanHtmlV2Filter < Filter
def call def call
@doc = at_css('#content') @doc = at_css('#content')

@ -0,0 +1,253 @@
# frozen_string_literal: true
module Docs
class Scala
class CleanHtmlV3Filter < Filter
# Entry point of the filter: runs every clean-up step on the page and
# returns the resulting document.
def call
  # Strip the elements that are not needed in the output
  css('.documentableFilter, .documentableAnchor, .documentableBrief').remove

  format_title
  format_signature
  format_top_links
  format_metadata

  if slug != 'index'
    format_members
  else
    # The long descriptions on the main page are redundant
    css('.contents').remove
  end

  simplify_html

  doc
end
private
# Formats the title of the page.
#
# Replaces the `.cover-header` element with an <h1> reading
# "<Kind> <package>.<name>" (or "Package root" on the root page). Does
# nothing when the page has no `.cover-header`. A missing icon, heading or
# package breadcrumb is treated as an empty value instead of raising.
def format_title
  cover_header = at_css('.cover-header')
  return if cover_header.nil?

  # Kind of page, keyed by the abbreviation found in the icon's class attribute
  kinds = {
    'cl' => 'Class',
    'ob' => 'Object',
    'tr' => 'Trait',
    'en' => 'Enum',
    'ty' => 'Type',
    'pa' => 'Package'
  }
  icon = cover_header.at_css('.micon')
  # Drop the generic "micon " class and the "-wc" suffix so that only the
  # abbreviation is left
  kind_id = icon.nil? ? '' : icon['class'].to_s.gsub('micon ', '').gsub('-wc', '')
  kind = kinds[kind_id]

  heading = cover_header.at_css('h1')
  name = CGI.escapeHTML(heading.nil? ? '' : heading.text)

  # Add the package name (escaped as well, since it ends up in the HTML)
  breadcrumb = at_css('.breadcrumbs a:nth-of-type(3)')
  package = CGI.escapeHTML(breadcrumb.nil? ? '' : breadcrumb.text)
  package += '.' unless name.empty? || package.empty?

  # Replace the title
  title = root_page? ? 'Package root' : "#{kind} #{package}#{name}".strip
  cover_header.replace "<h1>#{title}</h1>"
end
# Formats the signature block at the top of the page.
#
# Turns the first `.signature` element into `<h2 id="signature">` and renders
# its annotations as <small>. Does nothing when the page has no signature.
def format_signature
  signature = at_css('.signature')
  return if signature.nil?

  signature_annotations = signature.at_css('.annotations')
  signature_annotations.name = 'small' unless signature_annotations.nil?

  signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
end
# Formats the top links (companion page, source code).
#
# Moves the "Companion:" and "Source:" entries out of the first `.attributes`
# list and into a `<div class="links">` inserted right after the page's <h1>.
# Nothing is inserted when no link was found or when the page has no <h1>.
def format_top_links
  attributes = at_css('.attributes')
  links = []
  unless attributes.nil?
    # Companion page (e.g. List ↔ List$)
    links.concat(extract_attribute_links(attributes, 'Companion:') { |text| "Companion #{text}" })
    # Source code
    links.concat(extract_attribute_links(attributes, 'Source:') { |_text| 'Source code' })
  end

  # Format the links
  title = at_css('h1')
  return if title.nil? || links.empty?

  title.add_next_sibling("<div class=\"links\">#{links.join(' • ')}</div>")
end

# Removes every <dt>/<dd> pair of `attributes` whose <dt> reads `label` and
# returns the HTML of the links found in those <dd>s. The block receives the
# current text of each link and returns its new text.
def extract_attribute_links(attributes, label)
  attributes.css('dt').each_with_object([]) do |dt, links|
    next if dt.content.strip != label

    # next_element skips whitespace text nodes between the <dt> and its <dd>
    dd = dt.next_element
    dd = nil unless !dd.nil? && dd.name == 'dd'

    link = dd.nil? ? nil : dd.at_css('a')
    unless link.nil?
      link.content = yield(link.content)
      links << link.to_html
    end

    dt.remove
    dd.remove unless dd.nil?
  end
end
# Metadata about the whole file (e.g. supertypes): restyles the lists of
# related types, moves every tab except "Graph" into the attributes list,
# and converts that list into a table.
def format_metadata
  # Format the values
  css('.tabs.single .monospace').each do |list|
    list.css('> div').each { |entry| entry['class'] = 'member' }
    list['class'] = 'related-types'

    # Lists that are too large are hidden behind a <details> toggle
    type_count = list.children.count
    next unless type_count > 15

    list.replace "<details>\n<summary>#{type_count} types</summary>\n#{list.to_html}\n</details>"
  end

  attributes = at_css('.attributes')

  # Change the HTML structure: each tab becomes a <dt>/<dd> pair
  labels = css('.tabs.single .names .tab')
  panels = css('.tabs.single .contents .tab')
  labels.zip(panels).each do |label, panel|
    unless label.content == 'Graph'
      attributes.add_child("<dt>#{label.content}</dt>")
      attributes.add_child("<dd>#{panel.inner_html.strip}</dd>")
    end
  end

  convert_dl_to_table(attributes)

  # The tabs that belong to the members list are kept
  tabs = at_css('.tabs')
  tabs.remove if tabs && tabs.parent['class'] != 'membersList'
end
# Format the members (methods, values…): normalizes the section headings,
# reformats every `.documentableElement`, drops the groups whose header
# mentions a deprecation, and tags the code blocks as Scala.
def format_members
  # Section headings
  css('.cover h2').each { |node| node.name = 'h3' }
  css('h2:not(#signature)').remove
  css(
    '.membersList h3',
    # Custom group headers for which Scaladoc generates invalid HTML
    # (<h3><p>…</p></h3>)
    '.documentableList > h3:empty + p'
  ).each do |node|
    node.name = 'h2'
    # Re-assigning the text content replaces any nested markup by plain text
    node.content = node.content
  end

  # Individual members
  css('.documentableElement').each { |element| format_member(element) }

  # Remove deprecated sections
  css('.documentableList').each do |list|
    header = list.at_css('.groupHeader')
    list.remove if !header.nil? && header.text.downcase.include?('deprecate')
  end

  # Code blocks
  css('pre > code').each do |code|
    pre = code.parent
    pre['data-language'] = 'scala'
    pre.inner_html = code.inner_html
  end
end

# Reformats a single `.documentableElement`: its header becomes an <h3>
# carrying the element's id, the annotations and the source link are moved
# into that header, the attributes become a table, and the wrapper element
# is replaced by its own content. Elements without a `.header` are left
# untouched.
def format_member(element)
  header = element.at_css('.header')
  return if header.nil?

  header.name = 'h3'

  id = element['id']
  element.remove_attribute('id')
  header['id'] = id unless id.nil?

  annotations = element.at_css('.annotations')
  unless annotations.nil?
    annotations.name = 'small'
    header.prepend_child(annotations)
  end

  # View source
  element.css('dt').each do |dt|
    next if dt.content.strip != 'Source:'

    # next_element skips whitespace text nodes between the <dt> and its <dd>
    dd = dt.next_element
    dd = nil unless !dd.nil? && dd.name == 'dd'

    source_link = dd.nil? ? nil : dd.at_css('a')
    unless source_link.nil?
      source_link.content = 'Source'
      source_link['class'] = 'source-link'
      header.prepend_child(source_link)
    end

    dt.remove
    dd.remove unless dd.nil?
  end

  # Format attributes as a table
  dl = element.at_css('.attributes')
  convert_dl_to_table(dl) unless dl.nil?

  # Remove the unnecessary wrapper element
  element.replace(element.inner_html)
end
# Simplify the HTML structure by removing useless elements. From here on
# the document is the first `#content > div` element.
def simplify_html
  # Keep only the useful part of the document
  @doc = at_css('#content > div')

  # Move the members out of their lists, then drop the members list itself
  css('.documentableList > *').each { |member| member.parent = doc }
  at_css('.membersList').remove

  # Drop classes that are of no use
  css('.header, .groupHeader, .cover, .documentableName').each do |node|
    node.remove_attribute('class')
  end

  # Drop attributes that are of no use
  css('[t]').each { |node| node.remove_attribute('t') }

  # Unwrap elements that only act as wrappers
  css('.docs, .doc, .memberDocumentation, span, div:not([class])').each do |wrapper|
    wrapper.replace(wrapper.children)
  end
end
# Replaces a definition list by a `<table class="attributes">`.
#
# Every direct <dt> child of `dl` becomes a row: the <dt> content (without a
# trailing colon) goes into a <th> and the content of the <dd> that follows
# it goes into a <td>. A <dt> that is not followed by a <dd> produces a
# single <th> spanning both columns.
def convert_dl_to_table(dl)
  table = Nokogiri::XML::Node.new('table', doc)
  table['class'] = 'attributes'

  dl.css('> dt').each do |dt|
    dd = dt.next_element
    has_dd = !dd.nil? && dd.name == 'dd'

    tr = Nokogiri::XML::Node.new('tr', doc)
    colspan = has_dd ? '' : ' colspan="2"' # handle <dt> without following <dd>
    tr.add_child("<th#{colspan}>#{dt.inner_html.sub(/:$/, '')}</th>")
    tr.add_child("<td>#{dd.inner_html}</td>") if has_dd
    table.add_child(tr)
  end

  dl.replace(table)
end
end
end
end

@ -1,6 +1,6 @@
module Docs module Docs
class Scala class Scala
class EntriesFilter < Docs::EntriesFilter class EntriesV2Filter < Docs::EntriesFilter
REPLACEMENTS = { REPLACEMENTS = {
'$eq' => '=', '$eq' => '=',
'$colon' => ':', '$colon' => ':',
@ -75,12 +75,12 @@ module Docs
# include the companion object. # include the companion object.
def package_name def package_name
name = package_drop_last(slug_parts) name = package_drop_last(slug_parts)
name.empty? ? '_root_' : name name.empty? ? 'scala' : name
end end
def parent_package def parent_package
parent = package_drop_last(package_name.split('.')) parent = package_drop_last(package_name.split('.'))
parent.empty? ? '_root_' : parent parent.empty? ? 'scala' : parent
end end
def package_drop_last(parts) def package_drop_last(parts)

@ -0,0 +1,105 @@
# frozen_string_literal: true
module Docs
class Scala
class EntriesV3Filter < Docs::EntriesFilter
# Encoded operator fragments found in slugs, mapped to the characters they
# are replaced with when building entry names.
REPLACEMENTS = {
  '$eq' => '=',
  '$colon' => ':',
  '$less' => '<',
}.freeze
# Name of the entry: the page heading for packages, otherwise the last slug
# segment with its encoded characters turned back into readable ones.
def get_name
  return at_css('.cover-header h1').text if is_package?

  # Some objects have inner objects, show ParentObject$.ChildObject$ instead of ParentObject$$ChildObject$
  encoded = slug.split('/').last.gsub('$$', '$.')

  decoded = REPLACEMENTS.reduce(encoded) do |current, (fragment, character)|
    current.gsub(fragment, character)
  end

  # If a dollar sign is used as separator between two characters, replace it with a dot
  decoded.gsub(/([^$.])\$([^$.])/, '\1.\2')
end
# Group of the entry: a package is grouped under its parent package, any
# other page under its own package.
def get_type
  is_package? ? parent_package : package_name
end
# Package pages get no default entry; every other page does.
def include_default_entry?
  package_icon = at_css('.cover-header .micon.pa')
  !package_icon
end
# Builds one entry per documented member of the page.
#
# Returns an array of [title, id] pairs in document order. Members without an
# id, deprecated members and members without a name element are skipped. When
# several members produce the same title (function overloading), only the
# first one is kept.
def additional_entries
  # Keyed by title so that duplicates are detected without rescanning a list
  entries = {}

  css('.documentableElement').each do |node|
    # Ignore elements without IDs
    id = node['id']
    next if id.nil?

    # Ignore deprecated members
    next unless node.at_css('.deprecated').nil?

    name_node = node.at_css('.documentableName')
    next if name_node.nil?

    title = "#{name}.#{name_node.content}"

    # Add () to methods that take parameters, i.e. methods who have (…)
    # in their signature, ignoring occurrences of (implicit …) and (using …)
    signature_node = node.at_css('.signature')
    signature = signature_node.nil? ? '' : signature_node.content
    title += '()' if signature.match?(/\((?!implicit)(?!using ).*\)/)

    # Ignore duplicates (function overloading): the first occurrence wins
    entries[title] ||= id
  end

  entries.to_a
end
private
# The package name is derived from the slug instead of being parsed from the
# HTML, because the source documentation may break companion object classes
# out into their own entries. Such classes (like
# `scala.reflect.api.Annotations.Annotation`) should be grouped under the
# package name, not under the fully-qualified name, which would include the
# companion object.
def package_name
  package = package_drop_last(slug_parts)
  return 'scala' if package.empty?

  package
end
# Package that contains the current package; 'scala' when dropping the last
# segment leaves nothing.
def parent_package
  segments = package_name.split('.')
  enclosing = package_drop_last(segments)
  return 'scala' if enclosing.empty?

  enclosing
end
# Joins all segments except the last one with dots.
def package_drop_last(parts)
  *leading, _last = parts
  leading.join('.')
end
# Segments of the slug, split on '/'.
def slug_parts
  path = slug
  path.split('/')
end
# Whether the cover header carries a package icon (`.micon.pa`).
def is_package?
  package_icon = at_css('.cover-header .micon.pa')
  package_icon ? true : false
end
end
end
end

@ -3,24 +3,50 @@ module Docs
self.name = 'Scala' self.name = 'Scala'
self.type = 'scala' self.type = 'scala'
self.links = { self.links = {
home: 'http://www.scala-lang.org/', home: 'https://www.scala-lang.org/',
code: 'https://github.com/scala/scala' code: 'https://github.com/scala/scala'
} }
options[:container] = '#content-container'
options[:attribution] = <<-HTML options[:attribution] = <<-HTML
&copy; 2002-2019 EPFL, with contributions from Lightbend.<br> &copy; 2002-2022 EPFL, with contributions from Lightbend.<br>
Licensed under the Apache License, Version 2.0. Licensed under the Apache License, Version 2.0.
HTML HTML
# Scala 3 has no official download link for its documentation
# (see https://contributors.scala-lang.org/t/5537), so for now the docs have
# to be built locally:
# 1. Install Scala 3 and sbt
#    (https://www.scala-lang.org/download/scala3.html)
# 2. Clone the Scala 3 (Dotty) repository (https://github.com/lampepfl/dotty)
# 3. From the Dotty folder, run this command in the terminal:
#    $ sbt scaladoc/generateScalaDocumentation
# 4. Extract scaladoc/output/scala3/api/ into docs/scala~3.1
version '3.1' do
  self.release = '3.1.1'
  self.base_url = 'https://scala-lang.org/api/3.1.1/'
  self.root_path = 'index.html'

  options[:skip_patterns] = [
    # Class names which include “#” cause issues with the scraper
    %r{%23},
    # Local links to the Java documentation, created by a Scaladoc bug
    %r{java/lang}
  ]

  html_filters.push('scala/entries_v3', 'scala/clean_html_v3')
end
# https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
# Extract api/scala-library into docs/scala~2.13_library # Extract api/scala-library into docs/scala~2.13_library
version '2.13 Library' do version '2.13 Library' do
self.release = '2.13.0' self.release = '2.13.0'
self.base_url = 'https://www.scala-lang.org/api/2.13.0/' self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
self.root_path = 'index.html' self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries', 'scala/clean_html' html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end end
# https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
@ -29,8 +55,9 @@ module Docs
self.release = '2.13.0' self.release = '2.13.0'
self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/' self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
self.root_path = 'index.html' self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries', 'scala/clean_html' html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end end
# https://downloads.lightbend.com/scala/2.12.9/scala-docs-2.12.9.zip # https://downloads.lightbend.com/scala/2.12.9/scala-docs-2.12.9.zip
@ -39,8 +66,9 @@ module Docs
self.release = '2.12.9' self.release = '2.12.9'
self.base_url = 'https://www.scala-lang.org/api/2.12.9/' self.base_url = 'https://www.scala-lang.org/api/2.12.9/'
self.root_path = 'index.html' self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries', 'scala/clean_html' html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end end
# https://downloads.lightbend.com/scala/2.12.9/scala-docs-2.12.9.zip # https://downloads.lightbend.com/scala/2.12.9/scala-docs-2.12.9.zip
@ -49,13 +77,14 @@ module Docs
self.release = '2.12.9' self.release = '2.12.9'
self.base_url = 'https://www.scala-lang.org/api/2.12.9/scala-reflect/' self.base_url = 'https://www.scala-lang.org/api/2.12.9/scala-reflect/'
self.root_path = 'index.html' self.root_path = 'index.html'
options[:container] = '#content-container'
html_filters.push 'scala/entries', 'scala/clean_html' html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
end end
def get_latest_version(opts) def get_latest_version(opts)
doc = fetch_doc('https://www.scala-lang.org/api/current/', opts) doc = fetch_doc('https://www.scala-lang.org/api/3.x/', opts)
doc.at_css('#doc-version').content doc.at_css('.projectVersion').content
end end
end end
end end

Loading…
Cancel
Save