major downloader and devhelp convertor fixes

pull/50/head
naquad 11 years ago
parent 94ff7609bc
commit 4772fc4164

@ -25,7 +25,7 @@ class DevHelp
end end
def normalize_url(link) def normalize_url(link)
link.gsub(/^([^.]+?)(?=$|#)/, '\1.html\2') link.gsub(/^([^#?]+)(?:\.html)?/, '\1.html')
end end
def build_devhelp(doc, structure) def build_devhelp(doc, structure)
@ -43,6 +43,7 @@ class DevHelp
def for_docs(*docs) def for_docs(*docs)
docs.flatten.each(&method(:for_doc)) docs.flatten.each(&method(:for_doc))
self
end end
def cp_r(src, dst) def cp_r(src, dst)
@ -191,7 +192,8 @@ class DevHelp
select(&method(:is_document?)). select(&method(:is_document?)).
each {|d| downloader.process_page(nil, d)} each {|d| downloader.process_page(nil, d)}
downloader.run downloader.wait
self
end end
def is_document?(p) def is_document?(p)

@ -1,66 +1,126 @@
require 'typhoeus'
require 'nokogiri' require 'nokogiri'
require 'delegate' require 'thread'
require 'fileutils' require 'fileutils'
require 'monitor'
require 'cgi' require 'cgi'
class Downloader < SimpleDelegator # Typhoeus has odd behavior either on my platform
include Typhoeus # or at all. Hence rolling out own parallel downloader.
MAX_QUEUE_SIZE = 20 # Ruby 2.1.0 has broken Thread::SizedQueue
# https://bugs.ruby-lang.org/issues/9342
class SizedQueue
attr_accessor :max
def initialize(*args) def initialize(max)
super(Hydra.new(*args)) @max = max
@counter = 0
@queue = []
@queue.extend(MonitorMixin)
@qe = @queue.new_cond
@qd = @queue.new_cond
end end
def processor(&block) def size
@processor = block @queue.synchronize do
@queue.size
end
end end
def queue_size def enqueue(what)
queued_requests.size @queue.synchronize do
@qd.wait_while { @queue.size >= @max }
@queue.push what
@qe.signal
end
end end
def file(src, dst, &block) def dequeue
file = nil @queue.synchronize do
@qe.wait_while { @queue.empty? }
ret = @queue.pop
@qd.signal
ret
end
end
request = Request.new(src) alias :<< :enqueue
alias :push :enqueue
alias :unshift :enqueue
request.on_headers do |response| alias :>> :dequeue
if response.response_code == 200 alias :pop :dequeue
dname = File.dirname(dst) alias :shift :dequeue
FileUtils.mkdir_p(dname) unless File.directory?(dname) end
file = open(dst, 'wb')
else class Downloader
failed(src, dst, response) def initialize(threads = 10, max = 10)
end @counter = 0
end @req = SizedQueue.new(max)
@callbacks = Thread::Queue.new
@threads = []
request.on_body do |chunk| @threads << Thread.new(&method(:runner))
file.write(chunk) if file
block = method(:downloader)
threads.times do
@threads << Thread.new(&block)
end end
end
request.on_complete do |response|
if file def downloader
file.close while r = @req.pop
block.call(dst) if block begin
open(r[0]) do |f|
dname = File.dirname(r[1])
FileUtils.mkdir_p(dname) unless File.directory?(dname)
File.write(r[1], f.read)
@callbacks << r[1..-1] if r[2]
end
rescue SocketError, OpenURI::HTTPError, Errno::EACCESS, Errno::EEXISTS => e
puts "#{r[1]} failed to download: #{e.message}"
end end
end end
end
queue request def runner
while t = @callbacks.pop
t[1].call(t[0])
end
end
# Remembers the given block in @processor and returns it
# (nil when called without a block).
def processor(&handler)
  @processor = handler
end
def file(src, dst, &block)
@req << [src, dst, block]
dst dst
end end
def queue(*args, &block) def wait
run while queue_size > MAX_QUEUE_SIZE @callbacks << nil
__getobj__.queue *args, &block (@threads.size - 1).times { @req << nil }
@threads.each(&:join)
end end
def page(src, target) def page(src, target)
file(src, target) { process_page(src, target) } file(src, target) { process_page(src, target) }
end end
# Resolves a link target against the files actually present under base_dir.
#
# Candidates are probed in this order and the first one that exists wins:
#   1. href as given
#   2. href + ".html"
#   3. href downcased
#   4. href downcased + ".html"
#
# Only the href part is downcased; base_dir is used verbatim so the probe
# stays inside the real directory on case-sensitive filesystems.
#
# @param base_dir [String] directory the link is relative to
# @param href [String] link target (relative path)
# @return [String] the matching href variant, or href unchanged when none
#   of the candidates exists
def guess_filename(base_dir, href)
  lowered = href.downcase
  candidates = [href, "#{href}.html", lowered, "#{lowered}.html"]

  # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
  candidates.find { |name| File.exist?(File.join(base_dir, name)) } || href
end
def process_page(src, path) def process_page(src, path)
doc = Nokogiri::HTML.parse(File.read(path), 'UTF-8') doc = Nokogiri::HTML.parse(File.read(path), 'UTF-8')
rdir = path.gsub(%r{\.[^./]*$}, '') + '_files' rdir = path.gsub(%r{\.[^./]*$}, '') + '_files'
@ -88,13 +148,7 @@ class Downloader < SimpleDelegator
href = a['href'] href = a['href']
next if href =~ %r{^(?:[^:]+:|[#?]|$)} next if href =~ %r{^(?:[^:]+:|[#?]|$)}
href = CGI.unescape(href) href = CGI.unescape(href)
a['href'] = guess_filename(base_dir, href)
np = File.join(base_dir, href)
if File.exists?("#{np}.html")
href << '.html'
end
a['href'] = href
end end
doc.css('style').each do |style| doc.css('style').each do |style|
@ -167,8 +221,4 @@ class Downloader < SimpleDelegator
prefix += 1 prefix += 1
end end
end end
def failed(src, dst, response)
puts "#{src} -> #{dst} failed: #{response.status_message}"
end
end end

Loading…
Cancel
Save