From cda737ceec8992a1af0307b7f80278a1d6af7125 Mon Sep 17 00:00:00 2001
From: Mathieu PATUREL
Date: Fri, 26 Apr 2019 20:04:13 +1000
Subject: [PATCH] basic scraping working

---
 .gitignore                          |  1 +
 lib/docs/filters/trio/clean_html.rb | 35 +++++++++++++++
 lib/docs/filters/trio/entries.rb    | 31 +++++++++++++
 lib/docs/scrapers/trio.rb           | 29 ++++++++++++
 public/docs/docs.json               | 70 ++++++++++++++++++++++++++++-
 5 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 lib/docs/filters/trio/clean_html.rb
 create mode 100644 lib/docs/filters/trio/entries.rb
 create mode 100644 lib/docs/scrapers/trio.rb

diff --git a/.gitignore b/.gitignore
index 1060fcf0..27f04dd4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ public/fonts
 public/docs/**/*
 docs/**/*
 !docs/*.md
+vendor
diff --git a/lib/docs/filters/trio/clean_html.rb b/lib/docs/filters/trio/clean_html.rb
new file mode 100644
index 00000000..5c2ef228
--- /dev/null
+++ b/lib/docs/filters/trio/clean_html.rb
@@ -0,0 +1,35 @@
+module Docs
+  class Trio
+    # Trims a Trio documentation page down to its main content.
+    class CleanHtmlFilter < Filter
+      # Narrows the document to the `div[role="main"]` container, unwraps
+      # section wrappers, removes `.headerlink` anchors and turns every `dt`
+      # into an `h3` heading.
+      #
+      # Returns the cleaned document node.
+      def call
+        @doc = at_css('div[role="main"]')
+
+        # Replace wrapper elements with their children to flatten the markup.
+        css('.section, [itemprop=articleBody]').each do |node|
+          node.replace node.children
+        end
+
+        css('.headerlink').remove
+
+        css('dt').each do |node|
+          new_node = doc.document.create_element "h3"
+          # Carry the id over: the entries filter records this id per entry.
+          new_node['id'] = node['id'] if node['id']
+          # The `.headerlink` anchors are already gone, so only surrounding
+          # whitespace is trimmed; dropping the last character here would cut
+          # real text.
+          new_node.content = node.inner_text.strip
+          node.replace new_node
+        end
+
+        doc
+      end
+    end
+  end
+end
diff --git a/lib/docs/filters/trio/entries.rb b/lib/docs/filters/trio/entries.rb
new file mode 100644
index 00000000..64387d6e
--- /dev/null
+++ b/lib/docs/filters/trio/entries.rb
@@ -0,0 +1,31 @@
+module Docs
+  class Trio
+    # Builds the index entries for a Trio documentation page.
+    class EntriesFilter < Docs::EntriesFilter
+      # Page name: the `h1` text without its last character.
+      # NOTE(review): presumably that character is the permalink marker,
+      # which is still present when this filter runs - confirm.
+      def get_name
+        at_css('h1').text[0...-1]
+      end
+
+      # The type is the same string as the page name.
+      def get_type
+        get_name
+      end
+
+      # One [name, id] pair per `.descname` node: the name is the text of the
+      # preceding sibling (if any) followed by the node's own text, the id
+      # comes from the parent element.
+      def additional_entries
+        css('.descname').each_with_object [] do |node, entries|
+          # `previous` is nil when the node has no preceding sibling.
+          prefix = node.previous ? node.previous.text : ''
+          name = prefix + node.text
+          id = node.parent['id']
+          entries << [name, id]
+        end
+      end
+    end
+  end
+end
diff --git a/lib/docs/scrapers/trio.rb b/lib/docs/scrapers/trio.rb
new file mode 100644
index 00000000..cf90f77d
--- /dev/null
+++ b/lib/docs/scrapers/trio.rb
@@ -0,0 +1,29 @@
+module Docs
+  # Scraper for the Trio documentation hosted on Read the Docs.
+  class Trio < UrlScraper
+    self.type = 'simple'
+    # NOTE(review): base_url follows `latest` while release is pinned to
+    # 0.11 - confirm the two still match when scraping.
+    self.release = '0.11'
+    self.base_url = 'https://trio.readthedocs.io/en/latest/'
+    self.root_path = 'index.html'
+    self.links = {
+      home: 'https://trio.readthedocs.io/',
+      code: 'https://github.com/python-trio/trio'
+    }
+
+    html_filters.push 'trio/entries', 'trio/clean_html'
+
+    # TODO: add the copyright and license notice; the attribution is empty.
+    options[:attribution] = <<-HTML
+    HTML
+
+    # Limit scraping to the reference pages.
+    options[:only_patterns] = [
+      /reference-core/,
+      /reference-io/,
+      /reference-testing/,
+      /reference-hazmat/,
+    ]
+  end
+end
diff --git a/public/docs/docs.json b/public/docs/docs.json
index 0637a088..eff8727c 100644
--- a/public/docs/docs.json
+++ b/public/docs/docs.json
@@ -1 +1,69 @@
-[]
\ No newline at end of file
+[
+  {
+    "name": "Chef",
+    "slug": "chef~12",
+    "type": "sphinx_simple",
+    "links": {
+      "home": "https://www.chef.io/",
+      "code": "https://github.com/chef/chef"
+    },
+    "version": "12",
+    "release": "12.13",
+    "mtime": 1556264506,
+    "db_size": 7170006
+  },
+  {
+    "name": "CSS",
+    "slug": "css",
+    "type": "mdn",
+    "mtime": 1543099045,
+    "db_size": 12415944
+  },
+  {
+    "name": "DOM",
+    "slug": "dom",
+    "type": "mdn",
+    "mtime": 1543157862,
+    "db_size": 33998524
+  },
+  {
+    "name": "DOM Events",
+    "slug": "dom_events",
+    "type": "mdn",
+    "mtime": 1543099589,
+    "db_size": 1752500
+  },
+  {
+    "name": "HTML",
+    "slug": "html",
+    "type": "mdn",
+    "mtime": 1543097764,
+    "db_size": 4141596
+  },
+  {
+    "name": "HTTP",
+    "slug": "http",
+    "type": "mdn",
+    "mtime": 1543099392,
+    "db_size": 4731727
+  },
+  {
+    "name": "JavaScript",
+    "slug": "javascript",
+    "type": "mdn",
+    "mtime": 1543098529,
+    "db_size": 6462141
+  },
+  {
+    "name": "Trio",
+    "slug": "trio",
+    "type": "simple",
+    "links": {
+      "home": "https://trio.readthedocs.io/",
+      "code": "https://github.com/python-trio/trio"
+    },
+    "release": "0.11",
+    "mtime": 1556272773,
+    "db_size": 736670
+  }
+]
\ No newline at end of file