From 4772fc41646fe695e90f248db9ec2e82fde6fcb9 Mon Sep 17 00:00:00 2001
From: naquad
Date: Fri, 24 Jan 2014 15:57:18 +0200
Subject: [PATCH] major downloader and devhelp convertor fixes

---
 lib/devhelp.rb    |   6 +-
 lib/downloader.rb | 142 +++++++++++++++++++++++++++++++---------------
 2 files changed, 100 insertions(+), 48 deletions(-)

diff --git a/lib/devhelp.rb b/lib/devhelp.rb
index 5d43f92e..42a13acc 100644
--- a/lib/devhelp.rb
+++ b/lib/devhelp.rb
@@ -25,7 +25,7 @@ class DevHelp
   end
 
   def normalize_url(link)
-    link.gsub(/^([^.]+?)(?=$|#)/, '\1.html\2')
+    link.gsub(/^([^#?]+)(?:\.html)?/, '\1.html')
   end
 
   def build_devhelp(doc, structure)
@@ -43,6 +43,7 @@ class DevHelp
 
   def for_docs(*docs)
     docs.flatten.each(&method(:for_doc))
+    self
   end
 
   def cp_r(src, dst)
@@ -191,7 +192,8 @@ class DevHelp
       select(&method(:is_document?)).
       each {|d| downloader.process_page(nil, d)}
 
-    downloader.run
+    downloader.wait
+    self
   end
 
   def is_document?(p)
diff --git a/lib/downloader.rb b/lib/downloader.rb
index 696ddd70..8938ca19 100644
--- a/lib/downloader.rb
+++ b/lib/downloader.rb
@@ -1,66 +1,126 @@
-require 'typhoeus'
 require 'nokogiri'
-require 'delegate'
+require 'thread'
 require 'fileutils'
+require 'monitor'
 require 'cgi'
 
-class Downloader < SimpleDelegator
-  include Typhoeus
+# Typhoeus has odd behavior either on my platform
+# or at all. Hence rolling out own parallel downloader.
 
-  MAX_QUEUE_SIZE = 20
+# Ruby 2.1.0 has broken Thread::SizedQueue
+# https://bugs.ruby-lang.org/issues/9342
+class SizedQueue
+  attr_accessor :max
 
-  def initialize(*args)
-    super(Hydra.new(*args))
-    @counter = 0
+  def initialize(max)
+    @max = max
+
+    @queue = []
+    @queue.extend(MonitorMixin)
+    @qe = @queue.new_cond
+    @qd = @queue.new_cond
   end
 
-  def processor(&block)
-    @processor = block
+  def size
+    @queue.synchronize do
+      @queue.size
+    end
   end
 
-  def queue_size
-    queued_requests.size
+  def enqueue(what)
+    @queue.synchronize do
+      @qd.wait_while { @queue.size >= @max }
+      @queue.push what
+      @qe.signal
+    end
   end
 
-  def file(src, dst, &block)
-    file = nil
+  def dequeue
+    @queue.synchronize do
+      @qe.wait_while { @queue.empty? }
+      ret = @queue.pop
+      @qd.signal
+      ret
+    end
+  end
 
-    request = Request.new(src)
+  alias :<< :enqueue
+  alias :push :enqueue
+  alias :unshift :enqueue
 
-    request.on_headers do |response|
-      if response.response_code == 200
-        dname = File.dirname(dst)
-        FileUtils.mkdir_p(dname) unless File.directory?(dname)
-        file = open(dst, 'wb')
-      else
-        failed(src, dst, response)
-      end
-    end
+  alias :>> :dequeue
+  alias :pop :dequeue
+  alias :shift :dequeue
+end
+
+class Downloader
+  def initialize(threads = 10, max = 10)
+    @counter = 0
+    @req = SizedQueue.new(max)
+    @callbacks = Thread::Queue.new
+    @threads = []
 
-    request.on_body do |chunk|
-      file.write(chunk) if file
+    @threads << Thread.new(&method(:runner))
+
+    block = method(:downloader)
+    threads.times do
+      @threads << Thread.new(&block)
     end
-
-    request.on_complete do |response|
-      if file
-        file.close
-        block.call(dst) if block
+  end
+
+  def downloader
+    while r = @req.pop
+      begin
+        open(r[0]) do |f|
+          dname = File.dirname(r[1])
+          FileUtils.mkdir_p(dname) unless File.directory?(dname)
+          File.write(r[1], f.read)
+          @callbacks << r[1..-1] if r[2]
+        end
+      rescue SocketError, OpenURI::HTTPError, Errno::EACCESS, Errno::EEXISTS => e
+        puts "#{r[1]} failed to download: #{e.message}"
      end
    end
+  end
 
+  def runner
+    while t = @callbacks.pop
+      t[1].call(t[0])
+    end
+  end
+
+  def processor(&block)
+    @processor = block
+  end
+
+  def file(src, dst, &block)
+    @req << [src, dst, block]
     dst
   end
 
-  def queue(*args, &block)
-    run while queue_size > MAX_QUEUE_SIZE
-    __getobj__.queue *args, &block
+  def wait
+    @callbacks << nil
+    (@threads.size - 1).times { @req << nil }
+    @threads.each(&:join)
   end
 
   def page(src, target)
     file(src, target) { process_page(src, target) }
   end
 
+  def guess_filename(base_dir, href)
+    tpath = File.join(base_dir, href)
+
+    [
+      [tpath                   , href                   ],
+      ["#{tpath}.html"         , "#{href}.html"         ],
+      ["#{tpath.downcase}"     , "#{href.downcase}"     ],
+      ["#{tpath.downcase}.html", "#{href.downcase}.html"]
+    ].each {|(x, y)| return y if File.exists?(x)}
+
+    href
+  end
+
   def process_page(src, path)
     doc = Nokogiri::HTML.parse(File.read(path), 'UTF-8')
     rdir = path.gsub(%r{\.[^./]*$}, '') + '_files'
@@ -88,13 +148,7 @@ class Downloader < SimpleDelegator
       href = a['href']
       next if href =~ %r{^(?:[^:]+:|[#?]|$)}
       href = CGI.unescape(href)
-
-      np = File.join(base_dir, href)
-      if File.exists?("#{np}.html")
-        href << '.html'
-      end
-
-      a['href'] = href
+      a['href'] = guess_filename(base_dir, href)
     end
 
     doc.css('style').each do |style|
@@ -167,8 +221,4 @@ class Downloader < SimpleDelegator
       prefix += 1
     end
   end
-
-  def failed(src, dst, response)
-    puts "#{src} -> #{dst} failed: #{response.status_message}"
-  end
 end
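
Usage sketch (illustrative only, not part of the patch): the devhelp
convertor drives the reworked Downloader by handing it already-downloaded
pages and then waiting for the worker threads to drain the queues. A
minimal driver looks roughly like the following; the 'docs/index.html'
path is a placeholder assumption, not a file taken from the repository.

    require_relative 'lib/downloader'

    downloader = Downloader.new                      # 10 worker threads, request queue capped at 10 entries
    downloader.process_page(nil, 'docs/index.html')  # rewrite links in a local page and queue its assets
    downloader.wait                                  # push the nil stop markers and join every thread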