ImmutableJS Scraper.

Core scraper changes: *) Add regexp support to stub *) Add support for fragments in internal_urls and normalize_paths => done externally.
8 years ago · b1ecc6c964
parent f0211029ef
commit b1ecc6c964
8 changed files with 169 additions and 3 deletions
--- a/lib/docs/core/scraper.rb
+++ b/lib/docs/core/scraper.rb
@ -51,12 +51,20 @@ module Docs

    def initialize_stubs
      self.class.stubs.each do |path, block|
-        Typhoeus.stub(url_for(path)).and_return do
+        stub_path = nil
+        case path
+        when String
+          stub_path = url_for(path)
+        when Regexp
+          stub_path = path
+        end
+
+        Typhoeus.stub(stub_path).and_return do |request|
          Typhoeus::Response.new \
-            effective_url: url_for(path),
+            effective_url: request.url,
            code: 200,
            headers: { 'Content-Type' => 'text/html' },
-            body: self.instance_exec(&block)
+            body: self.instance_exec(request.url, &block)
        end
      end
    end
--- a/lib/docs/filters/immutablejs/clean_html.rb
+++ b/lib/docs/filters/immutablejs/clean_html.rb
@ -0,0 +1,38 @@
+module Docs
+  class Immutablejs
+    class CleanHtmlFilter < Filter
+      def call
+        # Skip the container "div"
+        @doc = at_css('div')
+
+        # Remove data-reactid attributes for cleaner html
+        css('*[data-reactid]').each do |reactEl|
+          reactEl.delete 'data-reactid'
+        end
+
+        # Add id to member label, so we can navigate among them
+        css('h3.memberLabel').each do |memberLabel|
+          memberLabel['id'] = memberLabel.content.strip.chomp('()')
+        end
+
+
+        css('a').each do |link|
+          # Remove "/" from the start
+          link['href'] = link['href'].gsub(/^(#)?\//, '')
+
+          # We need to convert links - from Iterable/butLast to Iterable#butLast
+          link['href'] = link['href'].split('/').join('#')
+        end
+
+        # Replace code blocks tag code with pre, and add stylings.
+        css('code.codeBlock').each do |codeBlock|
+          codeBlock.name = 'pre'
+          codeBlock['data-language'] = 'javascript'
+          codeBlock['class'] = 'language-javascript'
+        end
+
+        doc
+      end
+    end
+  end
+end
--- a/lib/docs/filters/immutablejs/entries.rb
+++ b/lib/docs/filters/immutablejs/entries.rb
@ -0,0 +1,31 @@
+module Docs
+  class Immutablejs
+    class EntriesFilter < Docs::EntriesFilter
+      def name
+        typeHeader = at_css('h1.typeHeader')
+        return typeHeader.content if typeHeader
+      end
+
+      def type
+        typeHeader = at_css('h1.typeHeader')
+        return typeHeader.content if typeHeader
+
+        # TODO: Is this ok? This the index page.. I don't think it should have it's own type..
+        nil
+      end
+
+
+      def additional_entries
+        if current_url.fragment.nil?
+          return []
+        end
+
+        css('h3.memberLabel').map do |memberLabel|
+          entry_name = "#{type}##{memberLabel.content}"
+          [entry_name, memberLabel.content.chomp('()')]
+        end
+      end
+
+    end
+  end
+end
--- a/lib/docs/filters/immutablejs/internal_urls.rb
+++ b/lib/docs/filters/immutablejs/internal_urls.rb
@ -0,0 +1,23 @@
+module Docs
+  class Immutablejs
+    class InternalUrlsFilter < Docs::InternalUrlsFilter
+      def update_and_follow_links
+        urls = result[:internal_urls] = []
+        update_links do |url|
+          urls << url.to_s
+        end
+        urls.uniq!
+      end
+
+      def to_internal_url(str)
+        if str.start_with? "#/"
+          return nil if not str =~ /^#\/[^\/]+$/
+          str = root_url.to_s + str
+        end
+
+        super(str)
+      end
+
+    end
+  end
+end
--- a/lib/docs/filters/immutablejs/normalize_paths.rb
+++ b/lib/docs/filters/immutablejs/normalize_paths.rb
@ -0,0 +1,28 @@
+module Docs
+  class Immutablejs
+    class NormalizePathsFilter < Docs::NormalizePathsFilter
+      #
+      # Checks if the given url starts with:
+      # "#" or ".#", with means it's a fragment url
+      #
+      FRAGMENT_REGEX = /^(\.)?#/
+
+      def path
+        #
+        # If we have fragment, we want to use as our path.
+        #
+        if current_url.fragment
+          # Remove "/" from the start
+          @path = current_url.fragment.sub(/^\//, '')
+        end
+
+        super
+      end
+
+      def normalize_href href
+        return href.gsub(FRAGMENT_REGEX, '') if href =~ FRAGMENT_REGEX
+        super
+      end
+    end
+  end
+end
--- a/lib/docs/scrapers/immutablejs.rb
+++ b/lib/docs/scrapers/immutablejs.rb
@ -0,0 +1,38 @@
+module Docs
+  class Immutablejs < UrlScraper
+    self.name = "ImmutableJS"
+    self.type = "immutablejs"
+    self.release = "3.8.1"
+    self.base_url = "https://facebook.github.io/immutable-js/docs/"
+
+
+    #
+    # Replacins core html filters with our own, so we can handle fragments in
+    #
+    html_filters.replace 'internal_urls', 'immutablejs/internal_urls'
+    html_filters.replace 'normalize_paths', 'immutablejs/normalize_paths'
+
+    html_filters.push  'immutablejs/clean_html', 'immutablejs/entries'
+
+
+    options[:attribution] = <<-HTML
+      This documentation is generated from <a href="https://github.com/facebook/immutable-js/blob/master/type-definitions/Immutable.d.ts">Immutable.d.ts</a>.
+      Pull requests and <a href="https://github.com/facebook/immutable-js/issues">Issues</a> welcome.
+    HTML
+
+    stub(/.*/) do |url|
+      #
+      # Reuse capybara sessions, since we scrape all pages..
+      # by visiting 'about:blank' we reset the oldest session.
+      #
+      @capybara ||= load_capybara_selenium
+      @capybara.visit 'about:blank'
+      @capybara.visit url
+
+      @capybara.execute_script 'return document.querySelector(".docContents").innerHTML'
+    end
+
+
+
+  end
+end
--- a/public/icons/docs/immutablejs/16.png
+++ b/public/icons/docs/immutablejs/16.png
--- a/public/icons/docs/immutablejs/16@2x.png
+++ b/public/icons/docs/immutablejs/16@2x.png