From afe3a26c7a7388e8d8b44eba1ded347b5a90bfdb Mon Sep 17 00:00:00 2001
From: Nicolas Ettlin
Date: Mon, 14 Feb 2022 21:58:52 +0100
Subject: [PATCH] Add a scraper for Scala 3

---
 .../templates/pages/about_tmpl.coffee         |   4 +-
 assets/stylesheets/pages/_scala.scss          |  34 ++++
 lib/docs/filters/scala/clean_html_v3.rb       | 192 ++++++++++++++++++
 lib/docs/filters/scala/entries_v3.rb          | 116 +++++++++++
 lib/docs/scrapers/scala.rb                    |  32 +++-
 5 files changed, 371 insertions(+), 7 deletions(-)
 create mode 100644 lib/docs/filters/scala/clean_html_v3.rb
 create mode 100644 lib/docs/filters/scala/entries_v3.rb

diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee
index 4eaa4f6d..ec59971d 100644
--- a/assets/javascripts/templates/pages/about_tmpl.coffee
+++ b/assets/javascripts/templates/pages/about_tmpl.coffee
@@ -794,9 +794,9 @@ credits = [
     'https://raw.githubusercontent.com/sass/sass/stable/MIT-LICENSE'
   ], [
     'Scala',
-    '2002-2019 EPFL, with contributions from Lightbend',
+    '2002-2022 EPFL, with contributions from Lightbend',
     'Apache',
-    'https://raw.githubusercontent.com/scala/scala-lang/master/license.md'
+    'https://www.scala-lang.org/license/'
   ], [
     'scikit-image',
     '2019 the scikit-image team',
diff --git a/assets/stylesheets/pages/_scala.scss b/assets/stylesheets/pages/_scala.scss
index b2beb118..6ddb5dc0 100644
--- a/assets/stylesheets/pages/_scala.scss
+++ b/assets/stylesheets/pages/_scala.scss
@@ -1,4 +1,38 @@
 ._scala {
   @extend %simple;
+
   .deprecated { @extend %label-red; }
+
+  .related-types {
+    @extend %pre;
+    margin-top: 0;
+    white-space: normal;
+  }
+
+  .links {
+    @extend %box;
+    margin-left: -1rem;
+    text-align: center;
+    padding: .5em;
+
+    a { padding: .4em }
+
+    @include print {
+      display: none;
+    }
+  }
+
+  .source-link {
+    float: right;
+    font-size: .75rem;
+    color: var(--linkColor);
+    cursor: pointer;
+    @extend %user-select-none;
+
+    &:hover { text-decoration: underline; }
+
+    @include print {
+      display: none;
+    }
+  }
 }
diff --git a/lib/docs/filters/scala/clean_html_v3.rb b/lib/docs/filters/scala/clean_html_v3.rb
new file mode 100644
index 00000000..f2d4c793
--- /dev/null
+++ b/lib/docs/filters/scala/clean_html_v3.rb
@@ -0,0 +1,192 @@
+# frozen_string_literal: true
+
+module Docs
+  class Scala
+    # Cleans up the HTML of the Scala 3 API pages generated by Scaladoc.
+    class CleanHtmlV3Filter < Filter
+      # Runs every formatting step, then flattens the member lists into the
+      # main content node, which becomes the document that is returned.
+      def call
+        # Remove unneeded elements
+        css('.documentableFilter, .documentableAnchor, .documentableBrief').remove
+
+        format_title
+        format_top_links
+        format_metadata
+        format_members
+
+        # Simplify the HTML structure
+        @doc = at_css('#content > div')
+        css('.documentableList > *').each do |element|
+          element.parent = doc
+        end
+        at_css('.membersList').remove
+
+        doc
+      end
+
+      # Replaces the cover header with an <h1> that names the kind of page
+      # (class, object, trait, ...) and turns the signature into an <h2>.
+      def format_title
+        # Add the kind of page to the title
+        cover_header = at_css('.cover-header')
+        unless cover_header.nil?
+          icon = cover_header.at_css('.micon')
+          types = {
+            cl: 'Class',
+            ob: 'Object',
+            tr: 'Trait',
+            en: 'Enum',
+            ty: 'Type',
+            pa: 'Package',
+          }
+          type_id = icon['class']
+          type_id.remove!('micon ')
+          type_id.remove!('-wc')
+          type = types[type_id.to_sym]
+          name = CGI.escapeHTML cover_header.at_css('h1').text
+
+          package = at_css('.breadcrumbs a:nth-of-type(3)').text
+          package = package + '.' unless name.empty? || package.empty?
+
+          title = root_page? ? 'Package root' : "#{type} #{package}#{name}".strip
+          cover_header.replace "<h1>#{title}</h1>"
+        end
+
+        # Signature
+        signature = at_css('.signature')
+        signature_annotations = signature.at_css('.annotations')
+        signature_annotations.name = 'small' unless signature_annotations.nil?
+        signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
+      end
+
+      # Moves the "Companion:" and "Source:" attributes into a row of links
+      # inserted right after the page title.
+      def format_top_links
+        # Companion page
+        links = []
+        at_css('.attributes').css('dt').each do |dt|
+          next if dt.content.strip != 'Companion:'
+          dd = dt.next_sibling
+
+          companion_link = dd.at_css('a')
+          companion_link.content = "Companion #{companion_link.content}"
+          links.append(companion_link.to_html)
+
+          dt.remove
+          dd.remove
+        end
+
+        # Source code
+        at_css('.attributes').css('dt').each do |dt|
+          next if dt.content.strip != 'Source:'
+          dd = dt.next_sibling
+
+          source_link = dd.at_css('a')
+          source_link.content = 'Source code'
+          links.append(source_link.to_html)
+
+          dt.remove
+          dd.remove
+        end
+
+        # Format the links
+        title = at_css('h1')
+        title.add_next_sibling("<div class=\"links\">#{links.join(' • ')}</div>")
+      end
+
+      # Collapses long lists of related types and appends the contents of the
+      # tabs (except "Graph") to the attributes under a "Metadata" heading.
+      def format_metadata
+        # Metadata (attributes)
+        css('.tabs.single .monospace').each do |node|
+          node['class'] = 'related-types'
+
+          if node.children.count > 15
+            node.replace "<details>
+              <summary>#{node.children.count} types</summary>
+              #{node.to_html}
+            </details>"
+          end
+        end
+
+        attributes = at_css('.attributes')
+        attributes.add_previous_sibling('<h3>Metadata</h3>')
+
+        tabs_names = css('.tabs.single .names .tab')
+        tabs_contents = css('.tabs.single .contents .tab')
+        tabs_names.zip(tabs_contents).each do |name, contents|
+          next if name.content == "Graph"
+
+          attributes.add_child("<dt>#{name.content}</dt>")
+          attributes.add_child("<dd>#{contents.inner_html.strip}</dd>")
+        end
+        at_css('.tabs').remove
+      end
+
+      # Normalizes headings, turns each member header into an <h3> carrying the
+      # member id, drops deprecated sections and tags code blocks as Scala.
+      def format_members
+        # Headings
+        css('.cover h2').each do |node|
+          node.name = 'h3'
+        end
+        css('h2:not(#signature)').remove
+        css(
+          '.membersList h3',
+
+          # Custom group headers for which Scaladoc generates invalid HTML
+          '.documentableList > h3:empty + p'
+        ).each do |node|
+          node.name = 'h2'
+          node.content = node.content
+        end
+
+        # Methods
+        css('.documentableElement').each do |element|
+          header = element.at_css('.header')
+          header.name = 'h3'
+
+          id = element['id']
+          element.remove_attribute('id')
+          header['id'] = id
+
+          annotations = element.at_css('.annotations')
+          annotations.name = 'small' unless annotations.nil?
+          header.prepend_child(annotations) unless annotations.nil?
+
+          # View source
+          element.css('dt').each do |dt|
+            next if dt.content.strip != 'Source:'
+            dd = dt.next_sibling
+
+            source_link = dd.at_css('a')
+            source_link.content = 'Source'
+            source_link['class'] = 'source-link'
+            header.prepend_child(source_link)
+
+            dt.remove
+            dd.remove
+          end
+
+          # Remove the unnecessary wrapper element
+          element.replace(element.inner_html)
+        end
+
+        # Remove deprecated sections
+        css('.documentableList').each do |list|
+          header = list.at_css('.groupHeader')
+          list.remove if header && header.text.downcase.include?('deprecate')
+        end
+
+        # Code blocks
+        css('pre > code').each do |code|
+          pre = code.parent
+          pre['data-language'] = 'scala'
+          pre.inner_html = code.inner_html
+        end
+      end
+
+    end
+  end
+end
diff --git a/lib/docs/filters/scala/entries_v3.rb b/lib/docs/filters/scala/entries_v3.rb
new file mode 100644
index 00000000..03a22adb
--- /dev/null
+++ b/lib/docs/filters/scala/entries_v3.rb
@@ -0,0 +1,116 @@
+# frozen_string_literal: true
+
+module Docs
+  class Scala
+    # Builds the index entries for the Scala 3 API pages.
+    class EntriesV3Filter < Docs::EntriesFilter
+      # Encoded symbol names found in slugs, mapped to the symbol they stand for.
+      REPLACEMENTS = {
+        '$eq' => '=',
+        '$colon' => ':',
+        '$less' => '<',
+      }.freeze
+
+      # Packages take their name from the page title; every other page is named
+      # after the last segment of its slug, with encoded symbols decoded.
+      def get_name
+        if is_package?
+          at_css('.cover-header h1').text
+        else
+          name = slug.split('/').last
+
+          # Some objects have inner objects, show ParentObject$.ChildObject$ instead of ParentObject$$ChildObject$
+          name = name.gsub('$$', '$.')
+
+          REPLACEMENTS.each do |key, value|
+            name = name.gsub(key, value)
+          end
+
+          # If a dollar sign is used as separator between two characters, replace it with a dot
+          name.gsub(/([^$.])\$([^$.])/, '\1.\2')
+        end
+      end
+
+      # Entries are grouped by package.
+      def get_type
+        # if this entry is for a package, we group the package under the parent package
+        if is_package?
+          parent_package
+        # otherwise, group it under the regular package name
+        else
+          package_name
+        end
+      end
+
+      # Every page gets an entry of its own.
+      def include_default_entry?
+        true
+      end
+
+      # Returns one [title, id] pair per documented member, skipping members
+      # without an id, deprecated members and overloads sharing a title.
+      def additional_entries
+        entries = []
+        titles = []
+
+        css(".documentableElement").each do |node|
+          # Ignore elements without IDs
+          id = node['id']
+          next if id.nil?
+
+          # Ignore deprecated and inherited members
+          next unless node.at_css('.deprecated').nil?
+
+          member_name = node.at_css('.documentableName').content
+          title = "#{name}.#{member_name}"
+
+          # Add () to methods that take parameters, i.e. methods that have (…)
+          # in their signature, ignoring occurrences of (implicit …) and (using …)
+          signature = node.at_css('.signature').content
+          title += '()' if signature =~ /\((?!implicit)(?!using ).*\)/
+
+          next if titles.include?(title) # Ignore duplicates (function overloading)
+
+          entries << [title, id]
+          titles.push(title)
+        end
+
+        entries
+      end
+
+      private
+
+      # For the package name, we use the slug rather than parsing the package
+      # name from the HTML because companion object classes may be broken out into
+      # their own entries (by the source documentation). When that happens,
+      # we want to group these classes (like `scala.reflect.api.Annotations.Annotation`)
+      # under the package name, and not the fully-qualified name which would
+      # include the companion object.
+      def package_name
+        name = package_drop_last(slug_parts)
+        name.empty? ? 'scala' : name
+      end
+
+      # The package name without its last segment, or 'scala' when nothing is left.
+      def parent_package
+        parent = package_drop_last(package_name.split('.'))
+        parent.empty? ? 'scala' : parent
+      end
+
+      # Joins all but the last element of +parts+ with dots.
+      def package_drop_last(parts)
+        parts[0...-1].join('.')
+      end
+
+      # The slug split on '/'.
+      def slug_parts
+        slug.split('/')
+      end
+
+      # True when the cover header contains a `.micon.pa` element.
+      def is_package?
+        !at_css('.cover-header .micon.pa').nil?
+      end
+    end
+  end
+end
diff --git a/lib/docs/scrapers/scala.rb b/lib/docs/scrapers/scala.rb
index 112a696e..39e95121 100644
--- a/lib/docs/scrapers/scala.rb
+++ b/lib/docs/scrapers/scala.rb
@@ -3,22 +3,41 @@ module Docs
     self.name = 'Scala'
     self.type = 'scala'
     self.links = {
-      home: 'http://www.scala-lang.org/',
+      home: 'https://www.scala-lang.org/',
       code: 'https://github.com/scala/scala'
     }
 
-    options[:container] = '#content-container'
     options[:attribution] = <<-HTML
-      &copy; 2002-2019 EPFL, with contributions from Lightbend.<br>
+      &copy; 2002-2022 EPFL, with contributions from Lightbend.<br>
       Licensed under the Apache License, Version 2.0.
     HTML
 
+    # For Scala 3, there is no official download link for the documentation
+    # (see https://contributors.scala-lang.org/t/5537).
+    #
+    # We currently need to build the docs ourselves. To do so:
+    # 1. Make sure that Scala 3 and sbt are installed
+    #    (https://www.scala-lang.org/download/scala3.html)
+    # 2. Clone the Scala 3 (Dotty) repository (https://github.com/lampepfl/dotty)
+    # 3. From the Dotty folder, run this command in the terminal:
+    #    $ sbt scaladoc/generateScalaDocumentation
+    # 4. Extract scaladoc/output/scala3/api/ into docs/scala~3.1
+    version '3.1' do
+      self.release = '3.1.1'
+      self.base_url = 'https://scala-lang.org/api/3.1.1/'
+      self.root_path = 'index.html'
+      # options[:container] = '#main-content'
+
+      html_filters.push 'scala/entries_v3', 'scala/clean_html_v3'
+    end
+
     # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
     # Extract api/scala-library into docs/scala~2.13_library
     version '2.13 Library' do
       self.release = '2.13.0'
       self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
@@ -29,6 +48,7 @@ module Docs
       self.release = '2.13.0'
       self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
@@ -39,6 +59,7 @@ module Docs
       self.release = '2.12.9'
       self.base_url = 'https://www.scala-lang.org/api/2.12.9/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
@@ -49,13 +70,14 @@ module Docs
       self.release = '2.12.9'
       self.base_url = 'https://www.scala-lang.org/api/2.12.9/scala-reflect/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
 
     def get_latest_version(opts)
-      doc = fetch_doc('https://www.scala-lang.org/api/current/', opts)
-      doc.at_css('#doc-version').content
+      doc = fetch_doc('https://www.scala-lang.org/api/3.x/', opts)
+      doc.at_css('.projectVersion').content
     end
   end
 end