diff --git a/docs/Scraper-Reference.md b/docs/Scraper-Reference.md
index 60d377d8..a6736a3e 100644
--- a/docs/Scraper-Reference.md
+++ b/docs/Scraper-Reference.md
@@ -187,7 +187,7 @@ More information about how filters work is available on the [Filter Reference](.
## Keeping scrapers up-to-date
-In order to keep scrapers up-to-date the `get_latest_version(options, &block)` method should be overridden by all scrapers that define the `self.release` attribute. This method should return the latest version of the documentation that is being scraped. The result of this method is periodically reported in a "Documentation versions report" issue which helps maintainers keep track of outdated documentations.
+To keep scrapers up-to-date, the `get_latest_version(options, &block)` method should be overridden by every scraper. If `self.release` is defined, the method should return the latest version of the documentation, in the same format as `self.release`. If `self.release` is not defined, it should return the Unix epoch timestamp (in seconds) of when the documentation was last modified. If the documentation will never change, simply return `1.0.0`. The result of this method is periodically reported in a "Documentation versions report" issue, which helps maintainers keep track of outdated documentation.
To make life easier, there are a few utility methods that you can use in `get_latest_version`:
* `fetch(url, options, &block)`
diff --git a/lib/docs/core/doc.rb b/lib/docs/core/doc.rb
index cb1cd209..062ac9e7 100644
--- a/lib/docs/core/doc.rb
+++ b/lib/docs/core/doc.rb
@@ -152,7 +152,6 @@ module Docs
end
end
-
def initialize
raise NotImplementedError, "#{self.class} is an abstract class and cannot be instantiated." if self.class.abstract
end
@@ -164,5 +163,108 @@ module Docs
def build_pages(&block)
raise NotImplementedError
end
+
+ def get_scraper_version(opts, &block)
+ if self.class.method_defined?(:options) and !options[:release].nil?
+ block.call options[:release]
+ else
+ # If options[:release] does not exist, we return the Epoch timestamp of when the doc was last modified in DevDocs production
+ fetch_json('https://devdocs.io/docs.json', opts) do |json|
+ items = json.select {|item| item['name'] == self.class.name}
+ items = items.map {|item| item['mtime']}
+ block.call items.max
+ end
+ end
+ end
+
+ # Should pass the latest upstream version of this documentation to the given block.
+ # If options[:release] is defined, the version should be in the same format as options[:release].
+ # If options[:release] is not defined, it should be the Epoch timestamp of when the documentation was last updated.
+ # If the docs will never change, simply use '1.0.0'. This base implementation always raises NotImplementedError.
+ def get_latest_version(options, &block)
+ raise NotImplementedError
+ end
+
+ # Returns whether or not this scraper is outdated.
+ #
+ # The default implementation assumes the documentation uses a semver(-like) approach when it comes to versions.
+ # Patch updates are ignored because there are usually little to no documentation changes in bug-fix-only releases.
+ #
+ # Scrapers of documentations that do not use this versioning approach should override this method.
+ #
+ # Examples of the default implementation:
+ # 1 -> 2 = outdated
+ # 1.1 -> 1.2 = outdated
+ # 1.1.1 -> 1.1.2 = not outdated
+ def is_outdated(scraper_version, latest_version)
+ scraper_parts = scraper_version.to_s.split(/\./).map(&:to_i)
+ latest_parts = latest_version.to_s.split(/\./).map(&:to_i)
+
+ # Only check the first two parts, the third part is for patch updates
+ [0, 1].each do |i|
+ break if i >= scraper_parts.length or i >= latest_parts.length
+ return true if latest_parts[i] > scraper_parts[i]
+ return false if latest_parts[i] < scraper_parts[i]
+ end
+
+ false
+ end
+
+ private
+
+ #
+ # Utility methods for get_latest_version
+ #
+
+ # Requests url and passes the response body to the block, or nil when the request did not succeed.
+ def fetch(url, options, &block)
+   # The GitHub token is only attached to requests that go to the GitHub API
+   use_token = options.key?(:github_token) && url.start_with?('https://api.github.com/')
+   headers = use_token ? { 'Authorization' => "token #{options[:github_token]}" } : {}
+
+   options[:logger].debug("Fetching #{url}")
+
+   Request.run(url, { headers: headers }) do |response|
+     unless response.success?
+       options[:logger].error("Couldn't fetch #{url} (response code #{response.code})")
+       next block.call(nil)
+     end
+
+     block.call response.body
+   end
+ end
+
+ def fetch_doc(url, options, &block)
+ fetch(url, options) do |body|
+ block.call Nokogiri::HTML.parse(body, nil, 'UTF-8')
+ end
+ end
+
+ def fetch_json(url, options, &block)
+ fetch(url, options) do |body|
+ block.call JSON.parse(body)
+ end
+ end
+
+ def get_npm_version(package, options, &block)
+ fetch_json("https://registry.npmjs.com/#{package}", options) do |json|
+ block.call json['dist-tags']['latest']
+ end
+ end
+
+ def get_latest_github_release(owner, repo, options, &block)
+ fetch_json("https://api.github.com/repos/#{owner}/#{repo}/releases/latest", options, &block)
+ end
+
+ def get_github_tags(owner, repo, options, &block)
+ fetch_json("https://api.github.com/repos/#{owner}/#{repo}/tags", options, &block)
+ end
+
+ def get_github_file_contents(owner, repo, path, options, &block)
+ fetch_json("https://api.github.com/repos/#{owner}/#{repo}/contents/#{path}", options) do |json|
+ block.call(Base64.decode64(json['content']))
+ end
+ end
end
end
diff --git a/lib/docs/core/scraper.rb b/lib/docs/core/scraper.rb
index b124c6db..083b0015 100644
--- a/lib/docs/core/scraper.rb
+++ b/lib/docs/core/scraper.rb
@@ -132,35 +132,6 @@ module Docs
end
end
- def get_latest_version(options, &block)
- raise NotImplementedError
- end
-
- # Returns whether or not this scraper is outdated.
- #
- # The default implementation assumes the documentation uses a semver(-like) approach when it comes to versions.
- # Patch updates are ignored because there are usually little to no documentation changes in bug-fix-only releases.
- #
- # Scrapers of documentations that do not use this versioning approach should override this method.
- #
- # Examples of the default implementation:
- # 1 -> 2 = outdated
- # 1.1 -> 1.2 = outdated
- # 1.1.1 -> 1.1.2 = not outdated
- def is_outdated(scraper_version, latest_version)
- scraper_parts = scraper_version.split(/\./).map(&:to_i)
- latest_parts = latest_version.split(/\./).map(&:to_i)
-
- # Only check the first two parts, the third part is for patch updates
- [0, 1].each do |i|
- break if i >= scraper_parts.length or i >= latest_parts.length
- return true if latest_parts[i] > scraper_parts[i]
- return false if latest_parts[i] < scraper_parts[i]
- end
-
- false
- end
-
private
def request_one(url)
@@ -231,62 +202,6 @@ module Docs
{}
end
- #
- # Utility methods for get_latest_version
- #
-
- def fetch(url, options, &block)
- headers = {}
-
- if options.key?(:github_token) and url.start_with?('https://api.github.com/')
- headers['Authorization'] = "token #{options[:github_token]}"
- end
-
- options[:logger].debug("Fetching #{url}")
-
- Request.run(url, { headers: headers }) do |response|
- if response.success?
- block.call response.body
- else
- options[:logger].error("Couldn't fetch #{url} (response code #{response.code})")
- block.call nil
- end
- end
- end
-
- def fetch_doc(url, options, &block)
- fetch(url, options) do |body|
- block.call Nokogiri::HTML.parse body, nil, 'UTF-8'
- end
- end
-
- def fetch_json(url, options, &block)
- fetch(url, options) do |body|
- json = JSON.parse(body)
- block.call json
- end
- end
-
- def get_npm_version(package, options, &block)
- fetch_json("https://registry.npmjs.com/#{package}", options) do |json|
- block.call json['dist-tags']['latest']
- end
- end
-
- def get_latest_github_release(owner, repo, options, &block)
- fetch_json("https://api.github.com/repos/#{owner}/#{repo}/releases/latest", options, &block)
- end
-
- def get_github_tags(owner, repo, options, &block)
- fetch_json("https://api.github.com/repos/#{owner}/#{repo}/tags", options, &block)
- end
-
- def get_github_file_contents(owner, repo, path, options, &block)
- fetch_json("https://api.github.com/repos/#{owner}/#{repo}/contents/#{path}", options) do |json|
- block.call(Base64.decode64(json['content']))
- end
- end
-
module FixInternalUrlsBehavior
def self.included(base)
base.extend ClassMethods
diff --git a/lib/docs/scrapers/c.rb b/lib/docs/scrapers/c.rb
index f9289617..0ab0ac39 100644
--- a/lib/docs/scrapers/c.rb
+++ b/lib/docs/scrapers/c.rb
@@ -26,6 +26,14 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
HTML
+ def get_latest_version(options, &block)
+ fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', options) do |doc|
+ link = doc.at_css('a[title^="File:"]')
+ date = link.content.scan(/(\d+)\./)[0][0]
+ block.call DateTime.strptime(date, '%Y%m%d').to_time.to_i
+ end
+ end
+
private
def file_path_for(*)
diff --git a/lib/docs/scrapers/chef.rb b/lib/docs/scrapers/chef.rb
index 337d1202..f9a248bd 100644
--- a/lib/docs/scrapers/chef.rb
+++ b/lib/docs/scrapers/chef.rb
@@ -49,9 +49,8 @@ module Docs
end
def get_latest_version(options, &block)
- fetch_doc('https://docs-archive.chef.io/', options) do |doc|
- cell = doc.at_css('.main-archives > tr:nth-child(2) > td:nth-child(2)')
- block.call cell.content.sub(/Chef Client /, '')
+ fetch_doc('https://downloads.chef.io/chef', options) do |doc|
+ block.call doc.at_css('h1.product-heading > span').content.strip
end
end
end
diff --git a/lib/docs/scrapers/cpp.rb b/lib/docs/scrapers/cpp.rb
index 374f6883..d26eae6a 100644
--- a/lib/docs/scrapers/cpp.rb
+++ b/lib/docs/scrapers/cpp.rb
@@ -34,6 +34,15 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
HTML
+ # Same as get_latest_version in lib/docs/scrapers/c.rb
+ def get_latest_version(options, &block)
+ fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', options) do |doc|
+ link = doc.at_css('a[title^="File:"]')
+ date = link.content.scan(/(\d+)\./)[0][0]
+ block.call DateTime.strptime(date, '%Y%m%d').to_time.to_i
+ end
+ end
+
private
def file_path_for(*)
diff --git a/lib/docs/scrapers/haskell.rb b/lib/docs/scrapers/haskell.rb
index 383e1990..fc848a7a 100755
--- a/lib/docs/scrapers/haskell.rb
+++ b/lib/docs/scrapers/haskell.rb
@@ -10,7 +10,7 @@ module Docs
html_filters.push 'haskell/entries', 'haskell/clean_html'
- options[:container] = ->(filter) { filter.subpath.start_with?('users_guide') ? '.body' : '#content' }
+ options[:container] = ->(filter) {filter.subpath.start_with?('users_guide') ? '.body' : '#content'}
options[:only_patterns] = [/\Alibraries\//, /\Ausers_guide\//]
options[:skip_patterns] = [
@@ -70,9 +70,10 @@ module Docs
end
def get_latest_version(options, &block)
- fetch_doc('https://downloads.haskell.org/~ghc/latest/docs/html/users_guide/', options) do |doc|
- label = doc.at_css('.related > ul > li:last-child').content
- block.call label.scan(/([0-9.]+)/)[0][0]
+ fetch_doc('https://downloads.haskell.org/~ghc/latest/docs/html/', options) do |doc|
+ links = doc.css('a').to_a
+ versions = links.map {|link| link['href'].scan(/ghc-([0-9.]+)/)}
+ block.call versions.find {|version| !version.empty?}[0][0]
end
end
end
diff --git a/lib/docs/scrapers/http.rb b/lib/docs/scrapers/http.rb
index 60f15f75..90e6f5c1 100644
--- a/lib/docs/scrapers/http.rb
+++ b/lib/docs/scrapers/http.rb
@@ -7,6 +7,8 @@ module Docs
html_filters.push 'http/clean_html', 'http/entries', 'title'
+ options[:mdn_tag] = 'HTTP'
+
options[:root_title] = 'HTTP'
options[:title] = ->(filter) { filter.current_url.host == 'tools.ietf.org' ? false : filter.default_title }
options[:container] = ->(filter) { filter.current_url.host == 'tools.ietf.org' ? '.content' : nil }
diff --git a/lib/docs/scrapers/markdown.rb b/lib/docs/scrapers/markdown.rb
index 87e9c957..3400e270 100644
--- a/lib/docs/scrapers/markdown.rb
+++ b/lib/docs/scrapers/markdown.rb
@@ -13,5 +13,9 @@ module Docs
© 2004 John Gruber
Licensed under the BSD License.
HTML
+
+ def get_latest_version(options, &block)
+ block.call '1.0.0'
+ end
end
end
diff --git a/lib/docs/scrapers/mdn/css.rb b/lib/docs/scrapers/mdn/css.rb
index 4c44f1f1..abb69b3a 100644
--- a/lib/docs/scrapers/mdn/css.rb
+++ b/lib/docs/scrapers/mdn/css.rb
@@ -6,6 +6,8 @@ module Docs
html_filters.push 'css/clean_html', 'css/entries', 'title'
+ options[:mdn_tag] = 'CSS'
+
options[:root_title] = 'CSS'
options[:skip] = %w(/CSS3 /Media/Visual /paged_media /Media/TV /Media/Tactile)
diff --git a/lib/docs/scrapers/mdn/dom.rb b/lib/docs/scrapers/mdn/dom.rb
index bbf95b20..a2202929 100644
--- a/lib/docs/scrapers/mdn/dom.rb
+++ b/lib/docs/scrapers/mdn/dom.rb
@@ -8,6 +8,8 @@ module Docs
html_filters.push 'dom/clean_html', 'dom/entries', 'title'
+ options[:mdn_tag] = 'XSLT_Reference' # NOTE(review): same tag as the XSLT scraper (xslt_xpath.rb); looks copy-pasted, confirm the intended MDN tag for DOM
+
options[:root_title] = 'DOM'
options[:skip] = %w(
diff --git a/lib/docs/scrapers/mdn/dom_events.rb b/lib/docs/scrapers/mdn/dom_events.rb
index fcbdc08f..258fbcd4 100644
--- a/lib/docs/scrapers/mdn/dom_events.rb
+++ b/lib/docs/scrapers/mdn/dom_events.rb
@@ -9,6 +9,8 @@ module Docs
html_filters.insert_after 'clean_html', 'dom_events/clean_html'
html_filters.push 'dom_events/entries', 'title'
+ options[:mdn_tag] = 'events'
+
options[:root_title] = 'DOM Events'
options[:skip] = %w(/MozOrientation)
diff --git a/lib/docs/scrapers/mdn/html.rb b/lib/docs/scrapers/mdn/html.rb
index 4b28cefd..f38432f1 100644
--- a/lib/docs/scrapers/mdn/html.rb
+++ b/lib/docs/scrapers/mdn/html.rb
@@ -7,6 +7,8 @@ module Docs
html_filters.push 'html/clean_html', 'html/entries', 'title'
+ options[:mdn_tag] = 'HTML'
+
options[:root_title] = 'HTML'
options[:title] = ->(filter) do
diff --git a/lib/docs/scrapers/mdn/javascript.rb b/lib/docs/scrapers/mdn/javascript.rb
index 935df61c..cea55fc8 100644
--- a/lib/docs/scrapers/mdn/javascript.rb
+++ b/lib/docs/scrapers/mdn/javascript.rb
@@ -8,6 +8,8 @@ module Docs
html_filters.push 'javascript/clean_html', 'javascript/entries', 'title'
+ options[:mdn_tag] = 'JavaScript'
+
options[:root_title] = 'JavaScript'
# Don't want
diff --git a/lib/docs/scrapers/mdn/mdn.rb b/lib/docs/scrapers/mdn/mdn.rb
index 2ebd38aa..ccb27af9 100644
--- a/lib/docs/scrapers/mdn/mdn.rb
+++ b/lib/docs/scrapers/mdn/mdn.rb
@@ -21,6 +21,12 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike License v2.5 or later.
HTML
+ def get_latest_version(opts, &block)
+ fetch_json("https://developer.mozilla.org/en-US/docs/feeds/json/tag/#{options[:mdn_tag]}", opts) do |json|
+ block.call DateTime.parse(json[0]['pubdate']).to_time.to_i
+ end
+ end
+
private
def process_response?(response)
diff --git a/lib/docs/scrapers/mdn/svg.rb b/lib/docs/scrapers/mdn/svg.rb
index db9de7a1..66baf60d 100644
--- a/lib/docs/scrapers/mdn/svg.rb
+++ b/lib/docs/scrapers/mdn/svg.rb
@@ -8,6 +8,8 @@ module Docs
html_filters.push 'svg/clean_html', 'svg/entries', 'title'
+ options[:mdn_tag] = 'XSLT_Reference' # NOTE(review): same tag as the XSLT scraper (xslt_xpath.rb); looks copy-pasted, confirm the intended MDN tag for SVG
+
options[:root_title] = 'SVG'
options[:title] = ->(filter) do
diff --git a/lib/docs/scrapers/mdn/xslt_xpath.rb b/lib/docs/scrapers/mdn/xslt_xpath.rb
index 5d812dd4..9bf01c01 100644
--- a/lib/docs/scrapers/mdn/xslt_xpath.rb
+++ b/lib/docs/scrapers/mdn/xslt_xpath.rb
@@ -8,6 +8,8 @@ module Docs
html_filters.push 'xslt_xpath/clean_html', 'xslt_xpath/entries', 'title'
+ options[:mdn_tag] = 'XSLT_Reference'
+
options[:root_title] = 'XSLT'
options[:only_patterns] = [/\A\/XSLT/, /\A\/XPath/]
diff --git a/lib/docs/scrapers/openjdk.rb b/lib/docs/scrapers/openjdk.rb
index e833c7c4..a56a2928 100644
--- a/lib/docs/scrapers/openjdk.rb
+++ b/lib/docs/scrapers/openjdk.rb
@@ -105,7 +105,7 @@ module Docs
end
end
- block.call latest_version.to_s
+ block.call latest_version
end
end
end
diff --git a/lib/docs/scrapers/support_tables.rb b/lib/docs/scrapers/support_tables.rb
index 9a550c1e..90cd7f4f 100644
--- a/lib/docs/scrapers/support_tables.rb
+++ b/lib/docs/scrapers/support_tables.rb
@@ -178,5 +178,12 @@ module Docs