Simplify file scraper setup; scrape files in the "docs/[slug]" directory

pull/944/head
Thibaut Courouble 6 years ago
parent 91753ce1f8
commit bf003669ba

1
.gitignore vendored

@ -8,3 +8,4 @@ public/fonts
public/docs/**/*
!public/docs/docs.json
!public/docs/**/index.json
/docs/

@ -29,6 +29,7 @@ module Docs
self.rescue_errors = false
class DocNotFound < NameError; end
class SetupError < StandardError; end
def self.all
Dir["#{root_path}/docs/scrapers/**/*.rb"].

@ -95,6 +95,9 @@ module Docs
false
end
end
rescue Docs::SetupError => error
puts "ERROR: #{error.message}"
false
end
def store_pages(store)
@ -118,6 +121,9 @@ module Docs
false
end
end
rescue Docs::SetupError => error
puts "ERROR: #{error.message}"
false
end
private

@ -1,14 +1,13 @@
module Docs
class FileScraper < Scraper
SOURCE_DIRECTORY = File.expand_path '../../../../../docs', __FILE__
Response = Struct.new :body, :url
class << self
attr_accessor :dir
def inherited(subclass)
super
subclass.base_url = base_url
subclass.dir = dir
end
end
@ -16,13 +15,25 @@ module Docs
html_filters.push 'clean_local_urls'
# Path of the directory this scraper reads its documentation files from:
# SOURCE_DIRECTORY joined with the scraper class's `path`.
# The value is computed on first use and cached in @source_directory.
def source_directory
  return @source_directory if @source_directory

  @source_directory = File.join(SOURCE_DIRECTORY, self.class.path)
end
private
# Verifies that the directory returned by #source_directory exists on disk.
#
# Returns nil when the directory is present.
# Raises SetupError, naming the scraper class and the expected directory,
# when it is missing.
def assert_source_directory_exists
  # Dir.exist? is used because Dir.exists? is deprecated and no longer
  # available on Ruby 3.2+.
  return if Dir.exist?(source_directory)

  raise SetupError, "The #{self.class.name} scraper requires the original documentation files to be stored in the \"#{source_directory}\" directory."
end
def request_one(url)
Response.new read_file(file_path_for(url)), URL.parse(url)
assert_source_directory_exists
Response.new read_file(url_to_path(url)), URL.parse(url)
end
def request_all(urls)
assert_source_directory_exists
queue = [urls].flatten
until queue.empty?
result = yield request_one(queue.shift)
@ -34,12 +45,12 @@ module Docs
response.body.present?
end
def file_path_for(url)
File.join self.class.dir, url.remove(base_url.to_s)
def url_to_path(url)
url.remove(base_url.to_s)
end
def read_file(path)
File.read(path)
File.read(File.join(source_directory, path))
rescue
instrument 'warn.doc', msg: "Failed to open file: #{path}"
nil

@ -1,7 +1,6 @@
module Docs
class C < FileScraper
self.type = 'c'
self.dir = '/Users/Thibaut/DevDocs/Docs/c'
self.base_url = 'http://en.cppreference.com/w/c/'
self.root_path = 'header.html'

@ -3,7 +3,6 @@ module Docs
self.name = 'C++'
self.slug = 'cpp'
self.type = 'c'
self.dir = '/Users/Thibaut/DevDocs/Docs/cpp'
self.base_url = 'http://en.cppreference.com/w/cpp/'
self.root_path = 'header.html'

@ -24,13 +24,11 @@ module Docs
version '2' do
self.release = '2.0.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/Dart2'
self.base_url = "https://api.dartlang.org/stable/#{release}/"
end
version '1' do
self.release = '1.24.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/Dart1'
self.base_url = "https://api.dartlang.org/stable/#{release}/"
end
end

@ -36,37 +36,31 @@ module Docs
version '2.1' do
self.release = '2.1.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/Django21'
self.base_url = 'https://docs.djangoproject.com/en/2.1/'
end
version '2.0' do
self.release = '2.0.7'
self.dir = '/Users/Thibaut/DevDocs/Docs/Django20'
self.base_url = 'https://docs.djangoproject.com/en/2.0/'
end
version '1.11' do
self.release = '1.11.9'
self.dir = '/Users/Thibaut/DevDocs/Docs/Django111'
self.base_url = 'https://docs.djangoproject.com/en/1.11/'
end
version '1.10' do
self.release = '1.10.8'
self.dir = '/Users/Thibaut/DevDocs/Docs/Django110'
self.base_url = 'https://docs.djangoproject.com/en/1.10/'
end
version '1.9' do
self.release = '1.9.13'
self.dir = '/Users/Thibaut/DevDocs/Docs/Django19'
self.base_url = 'https://docs.djangoproject.com/en/1.9/'
end
version '1.8' do
self.release = '1.8.18'
self.dir = '/Users/Thibaut/DevDocs/Docs/Django18'
self.base_url = 'https://docs.djangoproject.com/en/1.8/'
end
end

@ -42,22 +42,18 @@ module Docs
version '21' do
self.release = '21.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang21'
end
version '20' do
self.release = '20.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang20'
end
version '19' do
self.release = '19.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang19'
end
version '18' do
self.release = '18.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang18'
end
end
end

@ -48,13 +48,11 @@ module Docs
version '7' do
self.release = '7.3.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcc7'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
end
version '7 CPP' do
self.release = '7.3.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp7'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
options[:replace_paths] = CPP_PATHS
@ -62,7 +60,6 @@ module Docs
version '6' do
self.release = '6.4.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcc6'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
options[:root_title] = 'Using the GNU Compiler Collection (GCC)'
@ -70,7 +67,6 @@ module Docs
version '6 CPP' do
self.release = '6.4.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp6'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
options[:replace_paths] = CPP_PATHS
@ -78,7 +74,6 @@ module Docs
version '5' do
self.release = '5.4.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcc5'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
options[:root_title] = 'Using the GNU Compiler Collection (GCC)'
@ -86,7 +81,6 @@ module Docs
version '5 CPP' do
self.release = '5.4.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp5'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
options[:replace_paths] = CPP_PATHS
@ -94,7 +88,6 @@ module Docs
version '4' do
self.release = '4.9.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcc4'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
options[:root_title] = 'Using the GNU Compiler Collection (GCC)'
@ -102,7 +95,6 @@ module Docs
version '4 CPP' do
self.release = '4.9.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp4'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
options[:replace_paths] = CPP_PATHS

@ -8,25 +8,21 @@ module Docs
version '7' do
self.release = '7.3.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran7'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
end
version '6' do
self.release = '6.4.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran6'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
end
version '5' do
self.release = '5.4.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran5'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
end
version '4' do
self.release = '4.9.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran4'
self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
end
end

@ -3,7 +3,6 @@ module Docs
self.name = 'Nokogiri'
self.slug = 'nokogiri'
self.release = '1.8.1'
self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Nokogiri'
html_filters.replace 'rdoc/entries', 'nokogiri2/entries'

@ -2,7 +2,6 @@ module Docs
class Numpy < FileScraper
self.name = 'NumPy'
self.type = 'sphinx'
self.dir = '/Users/Thibaut/DevDocs/Docs/numpy/reference/'
self.root_path = 'index.html'
self.links = {
home: 'http://www.numpy.org/',

@ -1,11 +1,10 @@
module Docs
class Openjdk < FileScraper
# Downloaded from packages.debian.org/sid/openjdk-8-doc
# Extracting subdirectory /usr/share/doc/openjdk-8-jre-headless/api
self.name = 'OpenJDK'
self.type = 'openjdk'
self.root_path = 'overview-summary.html'
# Downloaded from packages.debian.org/sid/openjdk-8-doc
# Extracting subdirectory /usr/share/doc/openjdk-8-jre-headless/api
self.dir = '/Users/Thibaut/DevDocs/Docs/OpenJDK'
html_filters.insert_after 'internal_urls', 'openjdk/clean_urls'
html_filters.push 'openjdk/entries', 'openjdk/clean_html'

@ -2,7 +2,6 @@ module Docs
class Perl < FileScraper
self.name = 'Perl'
self.type = 'perl'
self.dir = '/Users/Thibaut/DevDocs/Docs/Perl'
self.root_path = 'index.html'
self.links = {
home: 'https://www.perl.org/'

@ -1,5 +1,7 @@
module Docs
class Php < FileScraper
# Downloaded from php.net/download-docs.php
include FixInternalUrlsBehavior
self.name = 'PHP'
@ -23,9 +25,6 @@ module Docs
code: 'https://git.php.net/?p=php-src.git;a=summary'
}
# Downloaded from php.net/download-docs.php
self.dir = '/Users/Thibaut/DevDocs/Docs/PHP'
html_filters.push 'php/internal_urls', 'php/entries', 'php/clean_html', 'title'
text_filters.push 'php/fix_urls'

@ -23,33 +23,29 @@ module Docs
Licensed under the PSF License.
HTML
version '3.7' do
version '3.7' do # docs.python.org/3.7/download.html
self.release = '3.7.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/Python37' # docs.python.org/3.7/download.html
self.base_url = 'https://docs.python.org/3.7/'
html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
end
version '3.6' do
version '3.6' do # docs.python.org/3.6/download.html
self.release = '3.6.6'
self.dir = '/Users/Thibaut/DevDocs/Docs/Python36' # docs.python.org/3.6/download.html
self.base_url = 'https://docs.python.org/3.6/'
html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
end
version '3.5' do
version '3.5' do # docs.python.org/3.5/download.html
self.release = '3.5.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/Python35' # docs.python.org/3.5/download.html
self.base_url = 'https://docs.python.org/3.5/'
html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
end
version '2.7' do
version '2.7' do # docs.python.org/2.7/download.html
self.release = '2.7.13'
self.dir = '/Users/Thibaut/DevDocs/Docs/Python27' # docs.python.org/2.7/download.html
self.base_url = 'https://docs.python.org/2.7/'
html_filters.push 'python/entries_v2', 'sphinx/clean_html', 'python/clean_html'

@ -1,9 +1,9 @@
module Docs
class Minitest < Rdoc
# Run "rake docs" in the gem directory
self.name = 'Ruby / Minitest'
self.slug = 'minitest'
self.release = '5.10.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Minitest' # rake docs
self.links = {
code: 'https://github.com/seattlerb/minitest'
}

@ -4,7 +4,6 @@ module Docs
self.name = 'Ruby on Rails'
self.slug = 'rails'
self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Rails'
self.initial_paths = %w(guides/index.html)
self.links = {
home: 'http://rubyonrails.org/',

@ -78,22 +78,18 @@ module Docs
version '2.5' do
self.release = '2.5.0'
self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby25'
end
version '2.4' do
self.release = '2.4.3'
self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby24'
end
version '2.3' do
self.release = '2.3.6'
self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby23'
end
version '2.2' do
self.release = '2.2.9'
self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby22'
end
end
end

@ -3,7 +3,6 @@ module Docs
self.name = 'SQLite'
self.type = 'sqlite'
self.release = '3.25.2'
self.dir = '/Users/Thibaut/DevDocs/Docs/sqlite/'
self.base_url = 'https://sqlite.org/'
self.root_path = 'docs.html'
self.initial_paths = %w(keyword_index.html)

@ -2,16 +2,23 @@ require 'test_helper'
require 'docs'
class FileScraperTest < MiniTest::Spec
ROOT_PATH = File.expand_path('../../../../../../', __FILE__)
class Scraper < Docs::FileScraper
self.dir = '/'
self.html_filters = Docs::FilterStack.new
self.text_filters = Docs::FilterStack.new
version 'version' do; end
end
let :scraper do
Scraper.new
end
let :versioned_scraper do
Scraper.versions.first.new
end
let :response do
OpenStruct.new body: 'body', url: Docs::URL.parse(Scraper.base_url)
end
@ -22,9 +29,16 @@ class FileScraperTest < MiniTest::Spec
end
end
describe "#source_directory" do
it "returns the directory at docs/[slug]" do
assert_equal File.join(ROOT_PATH, 'docs', 'scraper'), scraper.source_directory
assert_equal File.join(ROOT_PATH, 'docs', 'scraper~version'), versioned_scraper.source_directory
end
end
describe "#request_one" do
let :path do
File.join(Scraper.dir, 'path')
'path'
end
let :result do
@ -35,20 +49,34 @@ class FileScraperTest < MiniTest::Spec
stub(scraper).read_file
end
it "reads a file" do
mock(scraper).read_file(path)
result
context "when the source directory doesn't exist" do
it "raises an error" do
assert_raises Docs::SetupError do
result
end
end
end
describe "the returned response object" do
it "has a #body" do
stub(scraper).read_file { 'body' }
assert_equal 'body', result.body
context "when the source directory exists" do
before do
stub(scraper).assert_source_directory_exists
end
it "reads a file" do
mock(scraper).read_file(path)
result
end
it "has a #url" do
assert_equal path, result.url.to_s
assert_instance_of Docs::URL, result.url
describe "the returned response object" do
it "has a #body" do
stub(scraper).read_file { 'body' }
assert_equal 'body', result.body
end
it "has a #url" do
assert_equal path, result.url.to_s
assert_instance_of Docs::URL, result.url
end
end
end
end
@ -58,49 +86,63 @@ class FileScraperTest < MiniTest::Spec
%w(one two)
end
it "requests the given url" do
mock(scraper).request_one('url')
scraper.send(:request_all, 'url') {}
end
it "requests the given urls" do
requests = []
stub(scraper).request_one { |url| requests << url; nil }
scraper.send(:request_all, urls) {}
assert_equal urls, requests
end
it "yields the responses" do
responses = []
stub(scraper).request_one { |url| urls.index(url) }
scraper.send(:request_all, urls) { |response| responses << response; nil }
assert_equal (0...urls.length).to_a, responses
context "when the source directory doesn't exist" do
it "raises an error" do
assert_raises Docs::SetupError do
scraper.send(:request_all, urls) {}
end
end
end
context "when the block returns an array" do
let :next_urls do
%w(three four)
context "when the source directory exists" do
before do
stub(scraper).assert_source_directory_exists
end
let :all_urls do
urls + %w(three four)
it "requests the given url" do
mock(scraper).request_one('url')
scraper.send(:request_all, 'url') {}
end
it "requests the returned urls" do
it "requests the given urls" do
requests = []
stub(scraper).request_one { |url| requests << url; url }
scraper.send(:request_all, urls) { [next_urls.shift].compact }
assert_equal all_urls, requests
stub(scraper).request_one { |url| requests << url; nil }
scraper.send(:request_all, urls) {}
assert_equal urls, requests
end
it "yields their responses" do
it "yields the responses" do
responses = []
stub(scraper).request_one { |url| all_urls.index(url) }
scraper.send :request_all, urls do |response|
responses << response
[next_urls.shift].compact
stub(scraper).request_one { |url| urls.index(url) }
scraper.send(:request_all, urls) { |response| responses << response; nil }
assert_equal (0...urls.length).to_a, responses
end
context "when the block returns an array" do
let :next_urls do
%w(three four)
end
let :all_urls do
urls + %w(three four)
end
it "requests the returned urls" do
requests = []
stub(scraper).request_one { |url| requests << url; url }
scraper.send(:request_all, urls) { [next_urls.shift].compact }
assert_equal all_urls, requests
end
it "yields their responses" do
responses = []
stub(scraper).request_one { |url| all_urls.index(url) }
scraper.send :request_all, urls do |response|
responses << response
[next_urls.shift].compact
end
assert_equal (0...all_urls.length).to_a, responses
end
assert_equal (0...all_urls.length).to_a, responses
end
end
end
@ -126,13 +168,13 @@ class FileScraperTest < MiniTest::Spec
scraper.send :read_file, 'file'
end
it "returns the file's content when the file exists" do
stub(File).read('file') { 'content' }
it "returns the file's content when the file exists in the source directory" do
stub(File).read(File.join(ROOT_PATH, 'docs', 'scraper', 'file')) { 'content' }
assert_equal 'content', result
end
it "returns nil when the file doesn't exist" do
stub(File).read('file') { raise }
stub(File).read(File.join(ROOT_PATH, 'docs', 'scraper', 'file')) { raise }
assert_nil result
end
end

Loading…
Cancel
Save