Check for updates based on modified date for scrapers without release

pull/986/head
Jasper van Merle 6 years ago
parent 6ee1693134
commit cd43632a2c

@ -187,7 +187,7 @@ More information about how filters work is available on the [Filter Reference](.
## Keeping scrapers up-to-date
In order to keep scrapers up-to-date the `get_latest_version(options, &block)` method should be overridden by all scrapers that define the `self.release` attribute. This method should return the latest version of the documentation that is being scraped. The result of this method is periodically reported in a "Documentation versions report" issue which helps maintainers keep track of outdated documentations.
In order to keep scrapers up-to-date, the `get_latest_version(options, &block)` method should be overridden. If `self.release` is defined, this should return the latest version of the documentation. If `self.release` is not defined, it should return the Unix epoch timestamp (in seconds) of when the documentation was last modified. If the documentation will never change, simply return `1.0.0`. The result of this method is periodically reported in a "Documentation versions report" issue, which helps maintainers keep track of outdated documentation.
To make life easier, there are a few utility methods that you can use in `get_latest_version`:
* `fetch(url, options, &block)`

@ -152,7 +152,6 @@ module Docs
end
end
# Refuses to build an instance when the receiver's class reports itself as abstract.
#
# Raises NotImplementedError if `self.class.abstract` is truthy; otherwise does nothing.
def initialize
  return unless self.class.abstract

  raise NotImplementedError, "#{self.class} is an abstract class and cannot be instantiated."
end
@ -164,5 +163,108 @@ module Docs
# Abstract hook: the base implementation only raises, so subclasses are expected to override it.
#
# Raises NotImplementedError, naming the class that is missing the override.
def build_pages(&block)
  raise NotImplementedError, "#{self.class} must implement #build_pages"
end
# Passes the version this scraper currently targets to the block.
#
# When the class exposes `options` and `options[:release]` is set, that release is used.
# Otherwise the value is the largest `mtime` listed for this class name in
# https://devdocs.io/docs.json (nil when no entry matches).
#
# opts  - options hash forwarded to fetch_json
# block - receives the resolved version
def get_scraper_version(opts, &block)
  release = options[:release] if self.class.method_defined?(:options)
  return block.call(release) unless release.nil?

  # No release available: fall back to the Epoch timestamp of when the doc was last modified in DevDocs production
  fetch_json('https://devdocs.io/docs.json', opts) do |json|
    mtimes = json.each_with_object([]) do |item, found|
      found << item['mtime'] if item['name'] == self.class.name
    end
    block.call mtimes.max
  end
end
# Should return the latest version of this documentation.
# If options[:release] is defined, it should be in the same format.
# If options[:release] is not defined, it should return the Epoch timestamp of when the documentation was last updated.
# If the docs will never change, simply return '1.0.0'.
#
# The base implementation only raises; the message names the class that lacks an override.
#
# Raises NotImplementedError.
def get_latest_version(options, &block)
  raise NotImplementedError, "#{self.class} must implement #get_latest_version"
end
# Tells whether the scraped version lags behind the latest upstream version.
#
# Both values are stringified, split on dots and compared numerically part by part.
# Only the first two parts (major and minor) are considered; a difference that shows up
# only in the third part (patch) never counts as outdated. If either version runs out of
# parts before a difference is found, the scraper is considered up-to-date.
#
# Scrapers whose documentation does not follow a semver(-like) scheme should override this.
#
# Examples:
# 1 -> 2 = outdated
# 1.1 -> 1.2 = outdated
# 1.1.1 -> 1.1.2 = not outdated
def is_outdated(scraper_version, latest_version)
  current = scraper_version.to_s.split(/\./).map(&:to_i)
  newest = latest_version.to_s.split(/\./).map(&:to_i)

  # Pair up major/minor; a missing counterpart on the latest side ends the comparison
  current.first(2).zip(newest.first(2)).each do |mine, theirs|
    break if theirs.nil?
    return theirs > mine unless theirs == mine
  end

  false
end
private
#
# Utility methods for get_latest_version
#
# Requests `url` and hands the response body to the block, or nil when the request fails.
#
# An Authorization header carrying options[:github_token] is added only when that key is
# present and the URL points at https://api.github.com/.
#
# url     - address to request
# options - hash; reads :logger (debug/error) and optionally :github_token
# block   - receives the body String on success, nil otherwise
def fetch(url, options, &block)
  send_token = options.key?(:github_token) && url.start_with?('https://api.github.com/')
  headers = send_token ? { 'Authorization' => "token #{options[:github_token]}" } : {}

  options[:logger].debug("Fetching #{url}")

  Request.run(url, { headers: headers }) do |response|
    unless response.success?
      options[:logger].error("Couldn't fetch #{url} (response code #{response.code})")
      next block.call(nil)
    end

    block.call response.body
  end
end
# Fetches `url` and passes the body, parsed as a UTF-8 HTML document by Nokogiri, to the block.
#
# url     - address to request
# options - hash forwarded to fetch
# block   - receives the parsed document
def fetch_doc(url, options, &block)
  fetch(url, options) do |html|
    document = Nokogiri::HTML.parse(html, nil, 'UTF-8')
    block.call document
  end
end
# Fetches `url` and passes the body, parsed as JSON, to the block.
#
# url     - address to request
# options - hash forwarded to fetch
# block   - receives the parsed JSON value
def fetch_json(url, options, &block)
  fetch(url, options) do |raw|
    parsed = JSON.parse(raw)
    block.call parsed
  end
end
# Looks up `package` on registry.npmjs.com and passes its `latest` dist-tag to the block.
#
# package - npm package name, interpolated into the registry URL
# options - hash forwarded to fetch_json
# block   - receives the value of json['dist-tags']['latest']
def get_npm_version(package, options, &block)
  registry_url = "https://registry.npmjs.com/#{package}"

  fetch_json(registry_url, options) do |metadata|
    dist_tags = metadata['dist-tags']
    block.call dist_tags['latest']
  end
end
# Passes the parsed JSON of the GitHub "latest release" endpoint for owner/repo to the block.
#
# owner, repo - interpolated into the api.github.com URL
# options     - hash forwarded to fetch_json
def get_latest_github_release(owner, repo, options, &block)
  release_url = "https://api.github.com/repos/#{owner}/#{repo}/releases/latest"
  fetch_json(release_url, options, &block)
end
# Passes the parsed JSON of the GitHub tags endpoint for owner/repo to the block.
#
# owner, repo - interpolated into the api.github.com URL
# options     - hash forwarded to fetch_json
def get_github_tags(owner, repo, options, &block)
  tags_url = "https://api.github.com/repos/#{owner}/#{repo}/tags"
  fetch_json(tags_url, options, &block)
end
# Requests a file through the GitHub contents endpoint and passes its Base64-decoded
# `content` field to the block.
#
# owner, repo, path - interpolated into the api.github.com URL
# options           - hash forwarded to fetch_json
# block             - receives the decoded String
def get_github_file_contents(owner, repo, path, options, &block)
  contents_url = "https://api.github.com/repos/#{owner}/#{repo}/contents/#{path}"

  fetch_json(contents_url, options) do |json|
    decoded = Base64.decode64(json['content'])
    block.call(decoded)
  end
end
end
end

@ -132,35 +132,6 @@ module Docs
end
end
# Abstract hook for reporting the latest upstream version; the base implementation only raises.
#
# Raises NotImplementedError, naming the class that lacks an override.
def get_latest_version(options, &block)
  raise NotImplementedError, "#{self.class} must implement #get_latest_version"
end
# Returns whether or not this scraper is outdated.
#
# The default implementation assumes the documentation uses a semver(-like) approach when it comes to versions.
# Patch updates are ignored because there are usually little to no documentation changes in bug-fix-only releases.
#
# Both arguments are converted with `to_s` first, so non-String versions (for example
# Integer Epoch timestamps) are compared instead of raising NoMethodError on `split`.
#
# Scrapers of documentations that do not use this versioning approach should override this method.
#
# Examples of the default implementation:
# 1 -> 2 = outdated
# 1.1 -> 1.2 = outdated
# 1.1.1 -> 1.1.2 = not outdated
def is_outdated(scraper_version, latest_version)
  scraper_parts = scraper_version.to_s.split(/\./).map(&:to_i)
  latest_parts = latest_version.to_s.split(/\./).map(&:to_i)

  # Only check the first two parts, the third part is for patch updates
  [0, 1].each do |i|
    break if i >= scraper_parts.length || i >= latest_parts.length
    return true if latest_parts[i] > scraper_parts[i]
    return false if latest_parts[i] < scraper_parts[i]
  end

  false
end
private
def request_one(url)
@ -231,62 +202,6 @@ module Docs
{}
end
#
# Utility methods for get_latest_version
#
# Requests `url` and hands the response body to the block, or nil when the request fails.
#
# An Authorization header carrying options[:github_token] is added only when that key is
# present and the URL points at https://api.github.com/.
#
# url     - address to request
# options - hash; reads :logger (debug/error) and optionally :github_token
# block   - receives the body String on success, nil otherwise
def fetch(url, options, &block)
  send_token = options.key?(:github_token) && url.start_with?('https://api.github.com/')
  headers = send_token ? { 'Authorization' => "token #{options[:github_token]}" } : {}

  options[:logger].debug("Fetching #{url}")

  Request.run(url, { headers: headers }) do |response|
    unless response.success?
      options[:logger].error("Couldn't fetch #{url} (response code #{response.code})")
      next block.call(nil)
    end

    block.call response.body
  end
end
# Fetches `url` and passes the body, parsed as a UTF-8 HTML document by Nokogiri, to the block.
#
# url     - address to request
# options - hash forwarded to fetch
# block   - receives the parsed document
def fetch_doc(url, options, &block)
  fetch(url, options) do |html|
    document = Nokogiri::HTML.parse(html, nil, 'UTF-8')
    block.call(document)
  end
end
# Fetches `url` and passes the body, parsed as JSON, to the block.
#
# url     - address to request
# options - hash forwarded to fetch
# block   - receives the parsed JSON value
def fetch_json(url, options, &block)
  fetch(url, options) { |raw| block.call(JSON.parse(raw)) }
end
# Looks up `package` on registry.npmjs.com and passes its `latest` dist-tag to the block.
#
# package - npm package name, interpolated into the registry URL
# options - hash forwarded to fetch_json
# block   - receives the value of json['dist-tags']['latest']
def get_npm_version(package, options, &block)
  registry_url = "https://registry.npmjs.com/#{package}"

  fetch_json(registry_url, options) do |metadata|
    dist_tags = metadata['dist-tags']
    block.call dist_tags['latest']
  end
end
# Passes the parsed JSON of the GitHub "latest release" endpoint for owner/repo to the block.
#
# owner, repo - interpolated into the api.github.com URL
# options     - hash forwarded to fetch_json
def get_latest_github_release(owner, repo, options, &block)
  release_url = "https://api.github.com/repos/#{owner}/#{repo}/releases/latest"
  fetch_json(release_url, options, &block)
end
# Passes the parsed JSON of the GitHub tags endpoint for owner/repo to the block.
#
# owner, repo - interpolated into the api.github.com URL
# options     - hash forwarded to fetch_json
def get_github_tags(owner, repo, options, &block)
  tags_url = "https://api.github.com/repos/#{owner}/#{repo}/tags"
  fetch_json(tags_url, options, &block)
end
# Requests a file through the GitHub contents endpoint and passes its Base64-decoded
# `content` field to the block.
#
# owner, repo, path - interpolated into the api.github.com URL
# options           - hash forwarded to fetch_json
# block             - receives the decoded String
def get_github_file_contents(owner, repo, path, options, &block)
  contents_url = "https://api.github.com/repos/#{owner}/#{repo}/contents/#{path}"

  fetch_json(contents_url, options) do |json|
    decoded = Base64.decode64(json['content'])
    block.call(decoded)
  end
end
module FixInternalUrlsBehavior
def self.included(base)
base.extend ClassMethods

@ -26,6 +26,14 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
HTML
# Passes an Epoch timestamp derived from the cppreference archives page to the block.
#
# The first link whose title starts with "File:" is located; the first run of digits
# followed by a dot in its text is parsed as a %Y%m%d date and converted to seconds.
#
# options - hash forwarded to fetch_doc
# block   - receives the Integer timestamp
def get_latest_version(options, &block)
  fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', options) do |doc|
    archive_name = doc.at_css('a[title^="File:"]').content
    stamp = archive_name.scan(/(\d+)\./)[0][0]
    archived_on = DateTime.strptime(stamp, '%Y%m%d')
    block.call archived_on.to_time.to_i
  end
end
private
def file_path_for(*)

@ -49,9 +49,8 @@ module Docs
end
# Passes the latest Chef version, scraped from an HTML page, to the block.
#
# NOTE(review): two `fetch_doc` calls open here but only one block `end` follows. This looks
# like the previous lookup (docs-archive.chef.io) and its replacement (downloads.chef.io)
# shown together — confirm which variant is live before editing.
def get_latest_version(options, &block)
# Variant 1: read the second cell of the second row of `.main-archives` and strip the "Chef Client " prefix.
fetch_doc('https://docs-archive.chef.io/', options) do |doc|
cell = doc.at_css('.main-archives > tr:nth-child(2) > td:nth-child(2)')
block.call cell.content.sub(/Chef Client /, '')
# Variant 2: read the text of the span inside `h1.product-heading`, trimmed of surrounding whitespace.
fetch_doc('https://downloads.chef.io/chef', options) do |doc|
block.call doc.at_css('h1.product-heading > span').content.strip
end
end
end

@ -34,6 +34,15 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
HTML
# Same as get_latest_version in lib/docs/scrapers/c.rb
#
# Passes an Epoch timestamp derived from the cppreference archives page to the block:
# the first link whose title starts with "File:" is located, and the first run of digits
# followed by a dot in its text is parsed as a %Y%m%d date.
def get_latest_version(options, &block)
  fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', options) do |doc|
    archive_name = doc.at_css('a[title^="File:"]').content
    stamp = archive_name.scan(/(\d+)\./)[0][0]
    archived_on = DateTime.strptime(stamp, '%Y%m%d')
    block.call archived_on.to_time.to_i
  end
end
private
def file_path_for(*)

@ -10,7 +10,7 @@ module Docs
html_filters.push 'haskell/entries', 'haskell/clean_html'
options[:container] = ->(filter) { filter.subpath.start_with?('users_guide') ? '.body' : '#content' }
options[:container] = ->(filter) {filter.subpath.start_with?('users_guide') ? '.body' : '#content'}
options[:only_patterns] = [/\Alibraries\//, /\Ausers_guide\//]
options[:skip_patterns] = [
@ -70,9 +70,10 @@ module Docs
end
# Passes the latest GHC version, scraped from downloads.haskell.org, to the block.
#
# NOTE(review): two `fetch_doc` calls open here but only one block `end` follows. This looks
# like the previous lookup (users_guide page) and its replacement (docs index page) shown
# together — confirm which variant is live before editing.
def get_latest_version(options, &block)
# Variant 1: take the first run of digits/dots from the last item of the `.related` list.
fetch_doc('https://downloads.haskell.org/~ghc/latest/docs/html/users_guide/', options) do |doc|
label = doc.at_css('.related > ul > li:last-child').content
block.call label.scan(/([0-9.]+)/)[0][0]
# Variant 2: scan every link's href for `ghc-<version>` and report the first match found.
fetch_doc('https://downloads.haskell.org/~ghc/latest/docs/html/', options) do |doc|
links = doc.css('a').to_a
versions = links.map {|link| link['href'].scan(/ghc-([0-9.]+)/)}
block.call versions.find {|version| !version.empty?}[0][0]
end
end
end

@ -7,6 +7,8 @@ module Docs
html_filters.push 'http/clean_html', 'http/entries', 'title'
options[:mdn_tag] = 'HTTP'
options[:root_title] = 'HTTP'
options[:title] = ->(filter) { filter.current_url.host == 'tools.ietf.org' ? false : filter.default_title }
options[:container] = ->(filter) { filter.current_url.host == 'tools.ietf.org' ? '.content' : nil }

@ -13,5 +13,9 @@ module Docs
&copy; 2004 John Gruber<br>
Licensed under the BSD License.
HTML
# Reports the constant version '1.0.0' to the block; no request is made.
#
# options - unused
# block   - receives the String '1.0.0'
def get_latest_version(options, &block)
  fixed_version = '1.0.0'
  block.call(fixed_version)
end
end
end

@ -6,6 +6,8 @@ module Docs
html_filters.push 'css/clean_html', 'css/entries', 'title'
options[:mdn_tag] = 'CSS'
options[:root_title] = 'CSS'
options[:skip] = %w(/CSS3 /Media/Visual /paged_media /Media/TV /Media/Tactile)

@ -8,6 +8,8 @@ module Docs
# Append the DOM-specific filters followed by the generic 'title' filter.
html_filters.push 'dom/clean_html', 'dom/entries', 'title'
# NOTE(review): the tag is 'XSLT_Reference' although this scraper's root title is 'DOM'.
# Confirm this is the intended MDN tag and not a copy-paste leftover.
options[:mdn_tag] = 'XSLT_Reference'
options[:root_title] = 'DOM'
options[:skip] = %w(

@ -9,6 +9,8 @@ module Docs
html_filters.insert_after 'clean_html', 'dom_events/clean_html'
html_filters.push 'dom_events/entries', 'title'
options[:mdn_tag] = 'events'
options[:root_title] = 'DOM Events'
options[:skip] = %w(/MozOrientation)

@ -7,6 +7,8 @@ module Docs
html_filters.push 'html/clean_html', 'html/entries', 'title'
options[:mdn_tag] = 'HTML'
options[:root_title] = 'HTML'
options[:title] = ->(filter) do

@ -8,6 +8,8 @@ module Docs
html_filters.push 'javascript/clean_html', 'javascript/entries', 'title'
options[:mdn_tag] = 'JavaScript'
options[:root_title] = 'JavaScript'
# Don't want

@ -21,6 +21,12 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike License v2.5 or later.
HTML
# Passes the publication time of the first entry in the MDN tag feed to the block,
# as an Epoch timestamp in seconds.
#
# The feed URL is built from options[:mdn_tag].
#
# opts  - hash forwarded to fetch_json
# block - receives the Integer timestamp
def get_latest_version(opts, &block)
  feed_url = "https://developer.mozilla.org/en-US/docs/feeds/json/tag/#{options[:mdn_tag]}"

  fetch_json(feed_url, opts) do |entries|
    first_entry = entries[0]
    published = DateTime.parse(first_entry['pubdate'])
    block.call published.to_time.to_i
  end
end
private
def process_response?(response)

@ -8,6 +8,8 @@ module Docs
# Append the SVG-specific filters followed by the generic 'title' filter.
html_filters.push 'svg/clean_html', 'svg/entries', 'title'
# NOTE(review): the tag is 'XSLT_Reference' although this scraper's root title is 'SVG'.
# Confirm this is the intended MDN tag and not a copy-paste leftover.
options[:mdn_tag] = 'XSLT_Reference'
options[:root_title] = 'SVG'
options[:title] = ->(filter) do

@ -8,6 +8,8 @@ module Docs
html_filters.push 'xslt_xpath/clean_html', 'xslt_xpath/entries', 'title'
options[:mdn_tag] = 'XSLT_Reference'
options[:root_title] = 'XSLT'
options[:only_patterns] = [/\A\/XSLT/, /\A\/XPath/]

@ -105,7 +105,7 @@ module Docs
end
end
block.call latest_version.to_s
block.call latest_version
end
end
end

@ -178,5 +178,12 @@ module Docs
</p>
</div>
HTML
# Passes the time in the feed's first <updated> element to the block,
# as an Epoch timestamp in seconds.
#
# options - hash forwarded to fetch
# block   - receives the Integer timestamp
def get_latest_version(options, &block)
  fetch('https://feeds.feedburner.com/WhenCanIUse?format=xml', options) do |feed|
    updated_matches = feed.scan(/<updated>([^<]+)<\/updated>/)
    updated_at = updated_matches[0][0]
    block.call DateTime.parse(updated_at).to_time.to_i
  end
end
end
end

@ -1,7 +1,7 @@
class UpdatesCLI < Thor
# The GitHub user that is allowed to upload reports
# TODO: Update this before creating a PR
UPLOAD_USER = 'jmerle'
UPLOAD_USER = 'devdocs-bot'
# The repository to create an issue in when uploading the results
# TODO: Update this before creating a PR
@ -59,25 +59,19 @@ class UpdatesCLI < Thor
private
def check_doc(doc, opts)
# Newer scraper versions always come before older scraper versions
# Therefore, the first item's release value is the latest scraper version
#
# For example, a scraper could scrape 3 versions: 10, 11 and 12
# doc.versions.first would be the scraper for version 12
instance = doc.versions.first.new
scraper_version = instance.class.method_defined?(:options) ? instance.options[:release] : nil
return error_result(doc, '`options[:release]` does not exist') if scraper_version.nil?
logger.debug("Checking #{doc.name}")
instance.get_latest_version(opts) do |latest_version|
return {
name: doc.name,
scraper_version: scraper_version,
latest_version: latest_version,
is_outdated: instance.is_outdated(scraper_version, latest_version)
}
instance = doc.versions.first.new
instance.get_scraper_version(opts) do |scraper_version|
instance.get_latest_version(opts) do |latest_version|
return {
name: doc.name,
scraper_version: format_version(scraper_version),
latest_version: format_version(latest_version),
is_outdated: instance.is_outdated(scraper_version, latest_version)
}
end
end
rescue NotImplementedError
logger.warn("Couldn't check #{doc.name}, get_latest_version is not implemented")
@ -87,6 +81,15 @@ class UpdatesCLI < Thor
raise
end
# Renders a version for display in the report.
#
# A value made up solely of digits that is at least 1e9 is treated as an Epoch timestamp
# and formatted like "June 7, 2019"; anything else is returned as its string form.
#
# version - any object responding to to_s
#
# Returns a String.
def format_version(version)
  text = version.to_s

  # Numeric and greater than or equal to 1e9: probably a timestamp
  looks_like_timestamp = !text.match(/^(\d)+$/).nil? && text.to_i >= 1e9
  return text unless looks_like_timestamp

  DateTime.strptime(text, '%s').strftime('%B %-d, %Y')
end
def error_result(doc, reason)
{
name: doc.name,
@ -199,14 +202,15 @@ class UpdatesCLI < Thor
]
results_str = results.select {|result| !result.nil?}.join("\n\n")
travis_str = ENV['TRAVIS'].nil? ? '' : "\n\nThis issue was created by Travis CI build [##{ENV['TRAVIS_BUILD_NUMBER']}](#{ENV['TRAVIS_BUILD_WEB_URL']})."
title = "Documentation versions report for #{Date.today.strftime('%B')} 2019"
title = "Documentation versions report for #{Date.today.strftime('%B %Y')}"
body = <<-MARKDOWN
## What is this?
This is an automatically created issue which contains information about the version status of the documentations available on DevDocs. The results of this report can be used by maintainers when updating outdated documentations.
Maintainers can close this issue when all documentations are up-to-date. This issue is automatically closed when the next report is created.
Maintainers can close this issue when all documentations are up-to-date. This issue is automatically closed when the next report is created.#{travis_str}
## Results

Loading…
Cancel
Save