major downloader and devhelp converter fixes

pull/50/head
naquad 11 years ago
parent 94ff7609bc
commit 4772fc4164

@@ -25,7 +25,7 @@ class DevHelp
end
def normalize_url(link)
link.gsub(/^([^.]+?)(?=$|#)/, '\1.html\2')
link.gsub(/^([^#?]+?)(?:\.html)?(?=[#?]|$)/, '\1.html')
end
def build_devhelp(doc, structure)
@@ -43,6 +43,7 @@ class DevHelp
def for_docs(*docs)
docs.flatten.each(&method(:for_doc))
self
end
def cp_r(src, dst)
@@ -191,7 +192,8 @@ class DevHelp
select(&method(:is_document?)).
each {|d| downloader.process_page(nil, d)}
downloader.run
downloader.wait
self
end
def is_document?(p)
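The rewritten normalize_url appends ".html" to the path part of a link while leaving any fragment or query string untouched, and without doubling an existing ".html" suffix. A quick illustration with made-up hrefs (not part of the commit):

re = /^([^#?]+?)(?:\.html)?(?=[#?]|$)/
'GtkWidget'.gsub(re, '\1.html')         # => "GtkWidget.html"
'GtkWidget.html'.gsub(re, '\1.html')    # => "GtkWidget.html"
'GtkWidget#props'.gsub(re, '\1.html')   # => "GtkWidget.html#props"
'api?lang=c'.gsub(re, '\1.html')        # => "api.html?lang=c"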

@@ -1,66 +1,126 @@
require 'typhoeus'
require 'nokogiri'
require 'delegate'
require 'thread'
require 'fileutils'
require 'monitor'
require 'open-uri'
require 'cgi'
class Downloader < SimpleDelegator
include Typhoeus
# Typhoeus has odd behavior either on my platform
# or in general, hence rolling our own parallel downloader.
MAX_QUEUE_SIZE = 20
# Ruby 2.1.0 has broken Thread::SizedQueue
# https://bugs.ruby-lang.org/issues/9342
class SizedQueue
attr_accessor :max
def initialize(*args)
super(Hydra.new(*args))
@counter = 0
def initialize(max)
@max = max
@queue = []
@queue.extend(MonitorMixin)
@qe = @queue.new_cond
@qd = @queue.new_cond
end
def processor(&block)
@processor = block
def size
@queue.synchronize do
@queue.size
end
end
def queue_size
queued_requests.size
def enqueue(what)
@queue.synchronize do
@qd.wait_while { @queue.size >= @max }
@queue.push what
@qe.signal
end
end
def file(src, dst, &block)
file = nil
def dequeue
@queue.synchronize do
@qe.wait_while { @queue.empty? }
ret = @queue.pop
@qd.signal
ret
end
end
request = Request.new(src)
alias :<< :enqueue
alias :push :enqueue
alias :unshift :enqueue
request.on_headers do |response|
if response.response_code == 200
dname = File.dirname(dst)
FileUtils.mkdir_p(dname) unless File.directory?(dname)
file = open(dst, 'wb')
else
failed(src, dst, response)
alias :>> :dequeue
alias :pop :dequeue
alias :shift :dequeue
end
class Downloader
def initialize(threads = 10, max = 10)
@counter = 0
@req = SizedQueue.new(max)
@callbacks = Thread::Queue.new
@threads = []
@threads << Thread.new(&method(:runner))
block = method(:downloader)
threads.times do
@threads << Thread.new(&block)
end
end
request.on_body do |chunk|
file.write(chunk) if file
def downloader
while r = @req.pop
begin
open(r[0]) do |f|
dname = File.dirname(r[1])
FileUtils.mkdir_p(dname) unless File.directory?(dname)
File.write(r[1], f.read)
@callbacks << r[1..-1] if r[2]
end
rescue SocketError, OpenURI::HTTPError, Errno::EACCES, Errno::EEXIST => e
puts "#{r[1]} failed to download: #{e.message}"
end
end
end
request.on_complete do |response|
if file
file.close
block.call(dst) if block
def runner
while t = @callbacks.pop
t[1].call(t[0])
end
end
queue request
def processor(&block)
@processor = block
end
def file(src, dst, &block)
@req << [src, dst, block]
dst
end
def queue(*args, &block)
run while queue_size > MAX_QUEUE_SIZE
__getobj__.queue *args, &block
def wait
@callbacks << nil
(@threads.size - 1).times { @req << nil }
@threads.each(&:join)
end
def page(src, target)
file(src, target) { process_page(src, target) }
end
def guess_filename(base_dir, href)
tpath = File.join(base_dir, href)
[
[tpath , href ],
["#{tpath}.html" , "#{href}.html" ],
["#{tpath.downcase}" , "#{href.downcase}" ],
["#{tpath.downcase}.html" , "#{href.downcase}.html"]
].each {|(x, y)| return y if File.exists?(x)}
href
end
def process_page(src, path)
doc = Nokogiri::HTML.parse(File.read(path), nil, 'UTF-8')
rdir = path.gsub(%r{\.[^./]*$}, '') + '_files'
@@ -88,13 +148,7 @@ class Downloader < SimpleDelegator
href = a['href']
next if href =~ %r{^(?:[^:]+:|[#?]|$)}
href = CGI.unescape(href)
np = File.join(base_dir, href)
if File.exists?("#{np}.html")
href << '.html'
end
a['href'] = href
a['href'] = guess_filename(base_dir, href)
end
doc.css('style').each do |style|
@@ -167,8 +221,4 @@ class Downloader < SimpleDelegator
prefix += 1
end
end
def failed(src, dst, response)
puts "#{src} -> #{dst} failed: #{response.status_message}"
end
end
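The hand-rolled SizedQueue is a bounded FIFO guarded by MonitorMixin condition variables, written to dodge the Ruby 2.1.0 Thread::SizedQueue bug linked in the comment. A minimal standalone producer/consumer sketch of how it behaves (values and sizes are arbitrary):

q = SizedQueue.new(2)            # enqueue blocks once 2 items are waiting

consumer = Thread.new do
  while (item = q.pop)           # dequeue blocks while the queue is empty;
    puts "processing #{item}"    # a pushed nil serves as the stop marker here
  end
end

5.times { |i| q << i }           # the producer is throttled to the consumer's pace
q << nil
consumer.join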
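And a rough sketch of how the reworked Downloader is meant to be driven, going by the for_docs change in the DevHelp hunk above; the URL and paths are made-up placeholders:

dl = Downloader.new(4, 20)       # 4 download threads, request queue capped at 20

# file() enqueues a plain download; the optional block is run on the
# callback thread once the file has been written to disk.
dl.file('http://example.org/style.css', 'out/style.css')

# page() is file() plus a process_page callback that rewrites links in
# the saved HTML.
dl.page('http://example.org/index.html', 'out/index.html')

dl.wait                          # hand out stop markers and join every thread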
