From ebfe3a12080af85ae743ba73a84911c0830ca476 Mon Sep 17 00:00:00 2001 From: ShaneQful Date: Tue, 17 Nov 2015 22:13:33 +0000 Subject: [PATCH] Changed scraper to get list of urls programmatically & improved the get type method in the entries filter --- lib/docs/filters/dojo/entries.rb | 4 +- lib/docs/scrapers/dojo.rb | 195 ++++--------------------------- 2 files changed, 26 insertions(+), 173 deletions(-) diff --git a/lib/docs/filters/dojo/entries.rb b/lib/docs/filters/dojo/entries.rb index b02ea0ef..01a63be0 100644 --- a/lib/docs/filters/dojo/entries.rb +++ b/lib/docs/filters/dojo/entries.rb @@ -6,7 +6,9 @@ module Docs end def get_type - name + list_of_names = name.split(/\/|\./) + list_of_names.pop + list_of_names.join("/") end end end diff --git a/lib/docs/scrapers/dojo.rb b/lib/docs/scrapers/dojo.rb index 207a0ff7..d7ba842c 100644 --- a/lib/docs/scrapers/dojo.rb +++ b/lib/docs/scrapers/dojo.rb @@ -1,179 +1,13 @@ module Docs class Dojo < UrlScraper + include StubRootPage self.name = 'Dojo' self.slug = 'dojo' self.type = 'dojo' self.version = '1.10' self.base_url = 'http://dojotoolkit.org/api/1.10/' - # This is a cut down list of the actually paths taken from the tree.json api on the dojo site - # Dojo used javascript and xhr requests to allow users to browse it's documentation so it can't - # be scrapped by just following links from the base page. This list was generating with a little - # bash and then cut down in order to remove a lot of the more unused documentation e.g. kernel, - # main, dnd and some others - self.initial_paths = %w( - dojo/AdapterRegistry - dojo/aspect - dojo/back - dojo/_base/array - dojo/_base/browser - dojo/_base/Color - dojo/_base/Color.named - dojo/_base/config - dojo/_base/config.modulePaths - dojo/_base/connect - dojo/_base/declare - dojo/_base/Deferred - dojo/_base/event - dojo/_base/fx - dojo/_base/html - dojo/_base/json - dojo/_base/kernel - dojo/_base/lang - dojo/_base/loader - dojo/_base/NodeList - dojo/_base/query - dojo/_base/sniff - dojo/_base/unload - dojo/_base/window - dojo/_base/window.doc - dojo/_base/window.global - dojo/_base/xhr - dojo/_base/xhr.contentHandlers - dojo/behavior - dojo/cache - dojo/cldr/monetary - dojo/cldr/supplemental - dojo/colors - dojo/cookie - dojo/currency - dojo/data/api/Identity - dojo/data/api/Item - dojo/data/api/Notification - dojo/data/api/Read - dojo/data/api/Request - dojo/data/api/Write - dojo/data/ItemFileReadStore - dojo/data/ItemFileWriteStore - dojo/data/ObjectStore - dojo/data/util/filter - dojo/data/util/simpleFetch - dojo/data/util/sorter - dojo/date - dojo/date/locale - dojo/date/stamp - dojo/debounce - dojo/Deferred - dojo/DeferredList - dojo/dom - dojo/dom-attr - dojo/dom-class - dojo/dom-construct - dojo/dom-form - dojo/dom-geometry - dojo/dom-prop - dojo/dom-prop.names - dojo/domReady - dojo/dom-style - dojo/errors/CancelError - dojo/errors/create - dojo/errors/RequestError - dojo/errors/RequestTimeoutError - dojo/Evented - dojo/fx - dojo/fx/easing - dojo/fx.easing - dojo/fx/Toggler - dojo/fx.Toggler - dojo/gears - dojo/gears.available - dojo/has - dojo/hash - dojo/hccss - dojo/html - dojo/html._ContentSetter - dojo/i18n - dojo/i18n.cache - dojo/io/iframe - dojo/io-query - dojo/io/script - dojo/json - dojo/keys - dojo/loadInit - dojo/main - dojo/mouse - dojo/node - dojo/NodeList - dojo/NodeList-data - dojo/NodeList-dom - dojo/NodeList-fx - dojo/NodeList-html - dojo/NodeList-manipulate - dojo/NodeList._nodeDataCache - dojo/NodeList-traverse - dojo/number - dojo/on - dojo/on/asyncEventListener - dojo/on/debounce - dojo/on/throttle - dojo/parser - dojo/promise/all - dojo/promise/first - dojo/promise/instrumentation - dojo/promise/Promise - dojo/promise/tracer - dojo/query - dojo/ready - dojo/regexp - dojo/request - dojo/request/default - dojo/request/handlers - dojo/request/iframe - dojo/request/node - dojo/request/notify - dojo/request/registry - dojo/request/script - dojo/request/util - dojo/request/watch - dojo/request/xhr - dojo/require - dojo/robot - dojo/robot._runsemaphore - dojo/robotx - dojo/robotx._runsemaphore - dojo/router - dojo/router/RouterBase - dojo/rpc/JsonpService - dojo/rpc/JsonService - dojo/rpc/RpcService - dojo/selector/acme - dojo/selector/lite - dojo/selector/_loader - dojo/sniff - dojo/Stateful - dojo/store/api/Store - dojo/store/api/Store.PutDirectives - dojo/store/api/Store.QueryOptions - dojo/store/api/Store.QueryResults - dojo/store/api/Store.SortInformation - dojo/store/api/Store.Transaction - dojo/store/Cache - dojo/store/DataStore - dojo/store/JsonRest - dojo/store/Memory - dojo/store/Observable - dojo/store/util/QueryResults - dojo/store/util/SimpleQueryEngine - dojo/string - dojo/text - dojo/throttle - dojo/topic - dojo/touch - dojo/uacss - dojo/when - dojo/window) - # Add the rest of the url to the path - self.initial_paths = self.initial_paths.map { |l| l + ".html?xhr=true" } + # Dojo expects all the requests to be xhrs or it redirects you back to the docs home page # where it uses js to call the backend based on the URL so you get the appropriate documentation self.headers = { 'User-Agent' => 'devdocs.io' , 'X-Requested-With' => 'XMLHttpRequest' } @@ -185,10 +19,27 @@ module Docs html_filters.push 'dojo/clean_html', 'dojo/entries' # Don't use default selector on xhrs as no body or html document exists - options[:container] = ->(filter) { filter.root_page? ? '#content' : false } - options[:follow_links] = false - options[:skip_links] = true - options[:only] = self.initial_paths + options[:container] = false + + def root_page_body + require 'json' + require 'set' + response = Typhoeus::Request.new("dojotoolkit.org/api/1.10/tree.json", + headers: { 'User-Agent' => 'devdocs.io' , 'X-Requested-With' => 'XMLHttpRequest' }).run + treeJSON = JSON.parse(response.response_body) + treeJSON = treeJSON["children"].bsearch { |framework| framework["name"] == "dojo" } + @url_set = Set.new + def get_url_list treeJSON + @url_set.add(self.class.base_url + treeJSON["fullname"] + ".html?xhr=true") + if (treeJSON["children"]) + treeJSON["children"].each do |child| + get_url_list child + end + end + end + get_url_list treeJSON + @url_set.map { |l| "#{l}"}.join "
" + end options[:attribution] = <<-HTML The Dojo Toolkit is Copyright © 2005–2013