Improve PHP scraper

Fixes #20
pull/29/head
Thibaut 11 years ago
parent 9007f81151
commit cad60c6170

@ -1,136 +1,112 @@
module Docs module Docs
class Php class Php
class EntriesFilter < Docs::EntriesFilter class EntriesFilter < Docs::EntriesFilter
TYPES = { TYPE_BY_NAME_STARTS_WITH = {
# [name-begin-with] => [type] 'ArrayObject' => 'SPL',
'AMQP' => 'AMQP', 'Cond' => 'pthreads',
'APCIterator' => 'APC', 'CURL' => 'cURL',
'CURL' => 'cURL', 'Date' => 'Date/Time',
'Date' => 'Date and Time', 'ErrorException' => 'Predefined Exceptions',
'DirectoryIterator' => 'Standard PHP Library', 'Exception' => 'Predefined Exceptions',
'Directory' => 'Directories', 'Json' => 'JSON',
'DOM' => 'DOM', 'Http' => 'HTTP',
'Gearman' => 'Gearman', 'Mutex' => 'pthreads',
'Gmagick' => 'Gmagick', 'php_user_filter' => 'Stream',
'Http' => 'HTTP', 'Reflector' => 'Reflection',
'Imagick' => 'Imagick', 'Soap' => 'SOAP',
'Collator' => 'Internationalization', 'SplFile' => 'SPL/File',
'NumberFormatter' => 'Internationalization', 'SplTempFile' => 'SPL/File',
'Locale' => 'Internationalization', 'Spl' => 'SPL',
'MessageFormatter' => 'Internationalization', 'Stackable' => 'pthreads',
'Normalizer' => 'Internationalization', 'streamWrapper' => 'Stream',
'Intl' => 'Internationalization', 'Thread' => 'pthreads',
'intl' => 'Internationalization', 'tidy' => 'Tidy',
'ResourceBundle' => 'Internationalization', 'Worker' => 'pthreads',
'Spoofchecker' => 'Internationalization', 'XsltProcessor' => 'XSLT',
'Transliterator' => 'Internationalization', 'ZipArchive' => 'Zip' }
'UConverter' => 'Internationalization',
'grapheme' => 'Internationalization', %w(APC AMQP Directory DOM Gearman Gmagick Imagick mysqli OAuth PDO
'idn' => 'Internationalization', Reflection Session SimpleXML Solr Sphinx SQLite3 Varnish XSLT Yaf).each do |str|
'Json' => 'JSON', TYPE_BY_NAME_STARTS_WITH[str] = str
'mysqli' => 'mysqli', end
'OAuth' => 'OAuth',
'PDO' => 'PDO', %w(ArrayAccess Closure Generator Iterator IteratorAggregate Serializable Traversable).each do |str|
'Thread' => 'pthreads', TYPE_BY_NAME_STARTS_WITH[str] = 'Predefined Interfaces and Classes'
'Worker' => 'pthreads', end
'Stackable' => 'pthreads',
'Mutex' => 'pthreads', %w(Collator grapheme idn Intl intl Locale MessageFormatter Normalizer
'Cond' => 'pthreads', NumberFormatter ResourceBundle Spoofchecker Transliterator UConverter).each do |str|
'Exception' => 'Predefined Exceptions', TYPE_BY_NAME_STARTS_WITH[str] = 'Internationalization'
'ErrorException' => 'Predefined Exceptions', end
'QuickHash' => 'QuickHash',
'Reflection' => 'Reflection', %w(Countable OuterIterator RecursiveIterator SeekableIterator ).each do |str|
'Reflector' => 'Reflection', TYPE_BY_NAME_STARTS_WITH[str] = 'SPL/Interfaces'
'Session' => 'Sessions', end
'SimpleXML' => 'SimpleXML',
'Soap' => 'SOAP',
'Solr' => 'Solr',
'Sphinx' => 'Sphinx',
'Spl' => 'Standard PHP Library',
'ArrayObject' => 'Standard PHP Library',
'Countable' => 'Standard PHP Library',
'SQLite3' => 'SQLite3',
'streamWrapper' => 'Streams',
'php_user_filter' => 'Streams',
'tidy' => 'Tidy',
'V8Js' => 'V8js',
'Varnish' => 'Varnish',
'Weakref' => 'Weak References',
'WeakRef' => 'Weak References',
'WeakMap' => 'Weak References',
'XSLTProcessor' => 'XSLT',
'XsltProcessor' => 'XSLT',
'Yaf' => 'Yaf',
'ZipArchive' => 'Zip' }
REPLACE_TYPES = { REPLACE_TYPES = {
# [original-type] => [new-type] 'Exceptions' => 'SPL/Exceptions',
'Array' => 'Arrays', 'GD and Image' => 'Image',
'Bzip2' => 'bzip2', 'Gmagick' => 'Image/GraphicsMagick',
'Classes/Object' => 'Classes and Objects', 'Imagick' => 'Image/ImageMagick',
'Date/Time' => 'Date and Time', 'Interfaces' => 'SPL/Interfaces',
'Directory' => 'Directories', 'Iterators' => 'SPL/Iterators',
'Exceptions' => 'Standard PHP Library', 'mysqli' => 'Database/MySQL',
'Function handling' => 'Function Handling', 'PostgreSQL' => 'Database/PostgreSQL',
'GD and Image' => 'GD',
'Gettext' => 'gettext',
'Inotify' => 'inotify',
'Interfaces' => 'Standard PHP Library',
'Iterators' => 'Standard PHP Library',
'Libevent' => 'libevent',
'Mailparse' => 'Mail',
'Misc.' => 'Miscellaneous',
'Multibyte String' => 'Multibyte Strings',
'PCRE' => 'Regular Expressions',
'PHP Options/Info' => 'Options and Info',
'POSIX Regex' => 'Regular Expressions',
'Program execution' => 'Program Execution',
'Session' => 'Sessions', 'Session' => 'Sessions',
'Session PgSQL' => 'PostgreSQL', 'Session PgSQL' => 'Database/PostgreSQL',
'SPL' => 'Standard PHP Library', 'SQLite3' => 'Database/SQLite',
'Statistic' => 'Statistics', 'SQLSRV' => 'Database/SQL Server',
'Stream' => 'Streams', 'Stream' => 'Streams',
'String' => 'Strings', 'Yaml' => 'YAML' }
'Variable handling' => 'Variable Handling',
'XMLReader' => 'XML Reader',
'XMLWriter' => 'XML Writer',
'Yaml' => 'YAML',
'Zlib' => 'zlib' }
IGNORE_SLUGS = %w(reserved.exceptions reserved.interfaces
reserved.variables)
def include_default_entry? TYPE_GROUPS = {
!(slug.start_with?('book') || IGNORE_SLUGS.include?(slug)) 'Classes and Functions' => ['Classes/Object', 'Function handling', 'Predefined Interfaces and Classes', 'runkit'],
end 'Encoding' => ['Gettext', 'iconv', 'Multibyte String'],
'Compression' => ['Bzip2', 'Zip', 'Zlib'],
'Cryptography' => ['Hash', 'Mcrypt', 'OpenSSL', 'Password Hashing'],
'Database' => ['DBA', 'ODBC', 'PDO'],
'Date and Time' => ['Calendar', 'Date/Time'],
'Errors' => ['Error Handling', 'Predefined Exceptions'],
'File System' => ['Directory', 'Fileinfo', 'Filesystem', 'Inotify'],
'HTML' => ['DOM', 'Tidy'],
'Language' => ['Control Structures', 'Misc.', 'PHP Options/Info', 'Predefined Variables'],
'Mail' => ['Mail', 'Mailparse'],
'Mathematics' => ['BC Math', 'Math', 'Statistic'],
'Networking' => ['GeoIP', 'Network', 'Output Control', 'SSH2', 'Socket', 'URL'],
'Process Control' => ['Eio', 'Libevent', 'POSIX', 'Program execution', 'pthreads'],
'String' => ['Ctype', 'PCRE', 'POSIX Regex', 'Taint'],
'Variables' => ['Filter', 'Variable handling'],
'XML' => ['libxml', 'SimpleXML', 'XML Parser', 'XML-RPC', 'XMLReader', 'XMLWriter', 'XSLT'] }
def get_name def get_name
return 'IntlException' if slug == 'class.intlexception'
name = css('> .sect1 > .title', 'h1', 'h2').first.content name = css('> .sect1 > .title', 'h1', 'h2').first.content
name.sub! 'The ', ''
if name == 'Exception class for intl errors' name.sub! ' class', ' (class)'
'IntlException' name.sub! ' interface', ' (interface)'
else name
name.sub! 'The ', ''
name.sub! ' class', ' (class)'
name.sub! ' interface', ' (interface)'
name
end
end end
def get_type def get_type
if key = TYPES.keys.detect { |t| name.start_with?(t) } type = at_css('.up').content.strip
TYPES[key] type = 'SPL/Iterators' if type.end_with? 'Iterator'
else type.sub! ' Functions', ''
type = at_css('.up').content.strip
type.sub! ' Functions', '' TYPE_BY_NAME_STARTS_WITH.each_pair do |key, value|
type.sub! ' Obsolete Aliases and', '' break type = value if name.start_with?(key)
end
if type.end_with? 'Iterator' TYPE_GROUPS.each_pair do |replacement, types|
'Standard PHP Library' types.each do |t|
else return replacement if type == t
REPLACE_TYPES[type] || type
end end
end end
REPLACE_TYPES[type] || type
end
def include_default_entry?
Php::INDEX_PATHS.exclude?(subpath) && doc.at_css('.reference', '.refentry', '.sect1')
end end
end end
end end

@ -0,0 +1,19 @@
module Docs
  class Php
    # Filter that, for pages whose subpath starts with "book." or "class.",
    # stores a list of link targets under result[:internal_urls].
    #
    # `subpath`, `result`, `css` and `doc` are not defined here; they are
    # presumably provided by the Filter superclass -- confirm against it.
    class InternalUrlsFilter < Filter
      # Populates result[:internal_urls] on "book." / "class." pages and
      # always returns the document unchanged.
      def call
        result[:internal_urls] = internal_urls if subpath.start_with?('book.', 'class.')
        doc
      end

      # Collects the href of every anchor matched by '.book a' or
      # '.chunklist a' that satisfies all of:
      #   * it has an href attribute,
      #   * the href does not contain 'ref.pdo-',
      #   * the node returned by link.next answers true to text?.
      #
      # Returns an Array of href strings in document order.
      def internal_urls
        css('.book a', '.chunklist a').each_with_object([]) do |link, urls|
          href = link['href']
          # Anchors without an href (e.g. named anchors) used to raise
          # NoMethodError on nil; they carry no URL, so skip them.
          next if href.nil? || href.include?('ref.pdo-')

          # `try` tolerates a link with no following node (link.next == nil).
          urls << href if link.next.try(:text?)
        end
      end
    end
  end
end

@ -1,111 +1,62 @@
module Docs module Docs
class Php < FileScraper class Php < FileScraper
# WARNING: if you are the kind of developer who likes to automate things,
# this scraper will hurt your feelings.
self.name = 'PHP' self.name = 'PHP'
self.type = 'php' self.type = 'php'
self.version = 'up to 5.5.6' self.version = 'up to 5.5.6'
self.base_url = 'http://www.php.net/manual/en/' self.base_url = 'http://www.php.net/manual/en/'
self.root_path = 'extensions.alphabetical.html' self.root_path = 'index.html'
# Downloaded from php.net/download-docs.php # Downloaded from php.net/download-docs.php
self.dir = '/Users/Thibaut/DevDocs/Docs/PHP' self.dir = '/Users/Thibaut/DevDocs/Docs/PHP'
html_filters.push 'php/entries', 'php/clean_html', 'title' html_filters.push 'php/internal_urls', 'php/entries', 'php/clean_html', 'title'
text_filters.push 'php/fix_urls' text_filters.push 'php/fix_urls'
options[:title] = false options[:title] = false
options[:root_title] = 'PHP: Hypertext Preprocessor' options[:root_title] = 'PHP: Hypertext Preprocessor'
options[:only] = [] # using a whitelist INDEX_PATHS = %w(
index.html
options[:only_patterns] = [/\Afunction\.\w+\.html\z/, funcref.html
/\Areserved\.exceptions/, /\Areserved\.interfaces/, refs.database.html
/\Areserved\.variables/, /\Acontrol\-structures/] set.mysqlinfo.html
language.control-structures.html
reserved.exceptions.html
reserved.interfaces.html
reserved.variables.html)
# TODO: MongoDB, Phar options[:skip_links] = ->(filter) do
BOOKS = %w(amqp apache apc array bc bzip2 calendar classkit classobj com INDEX_PATHS.exclude?(filter.subpath)
ctype curl datetime dba dir dom eio errorfunc exec fileinfo filesystem end
filter ftp funchand gearman geoip gettext gmagick hash http iconv iisfunc
image imagick imap info inotify intl json ldap libevent libxml mail
mailparse math mbstring mcrypt memcached misc mysqli network oauth
openssl outcontrol password pcre pdo pgsql posix pthreads quickhash
readline regex runkit reflection session session-pgsql simplexml soap
sockets solr sphinx spl spl-types sqlite3 sqlsrv ssh2 stats stream
strings taint tidy url v8js var varnish weakref xml xmlreader xmlrpc
xmlwriter xsl yaf yaml zip zlib uodbc)
options[:only].concat BOOKS.map { |s| "book.#{s}.html" }
options[:only_patterns].concat BOOKS.map { |s| /\Afunction\.#{s}(?:\.|\-)/ }
CLASSES = %w(apciterator curlfile dateinterval dateperiod collator options[:only] = INDEX_PATHS.dup
numberformatter locale normalizer messageformatter resourcebundle
spoofchecker transliterator uconverter memcached thread worker stackable
mutex cond runkit reflector sessionhandler sessionhandlerinterface
sphinxclient countable arrayobject streamwrapper xmlreader xsltprocessor
ziparchive exception errorexception)
options[:only].concat CLASSES.map { |s| "class.#{s}.html" }
options[:only_patterns].concat CLASSES.map { |s| /\A#{s}\./ }
FUNCTION_PREFIXES = %w(assert base base64 cal call chunk class cli options[:only_patterns] = [
connection convert count create date debug define disk dns easter ereg /\Aclass\./,
eregi error event file finfo forward func gc gd get grapheme halt header /\Afunction\./,
headers highlight html http idn iis in inet ini is iterator magic mb md5 /\Acontrol-structures/,
mdecrypt memory mime move mt nl ob output parse pg php preg print proc /\Areserved\.exceptions/,
quoted realpath register restore set sha1 shell show stream socket spl /\Areserved\.interfaces/,
str sys tidy time timezone unregister use utf8 variant xml) /\Areserved\.variables/]
options[:only_patterns].concat FUNCTION_PREFIXES.map { |s| /\Afunction\.#{s}\-/ }
FUNCTIONS = %w(trigger-error user-error require-once include-once) BOOKS = %w(amqp apache apc array bc bzip2 calendar classobj ctype curl
options[:only].concat FUNCTIONS.map { |s| "function.#{s}.html" } datetime dba dir dom eio errorfunc exec fileinfo filesystem filter ftp
funchand gearman geoip gettext gmagick hash http iconv iisfunc image
imagick imap info inotify intl json ldap libevent libxml mail mailparse
math mbstring mcrypt memcached misc mysqli network oauth openssl
outcontrol password pcre pdo pgsql posix pthreads regex runkit reflection
session session-pgsql simplexml soap sockets solr sphinx spl spl-types
sqlite3 sqlsrv ssh2 stats stream strings taint tidy uodbc url var varnish
xml xmlreader xmlrpc xmlwriter xsl yaf yaml zip zlib)
options[:only_patterns].concat [ options[:only].concat BOOKS.map { |s| "book.#{s}.html" }
/function\.\w+\-exists\.html\z/,
/\A\w+iterator\./,
/\Afunction\.bz\w+\.html\z/,
/\Aclass\.\w+iterator\.html\z/,
/\Aclass\.\w+exception\.html\z/,
/\Aclass\.amqp/, /\Aamqp/,
/\Aclass\.datetime/, /\Adatetime/,
/\Aclass\.dom/, /\Adom/,
/\Aclass\.gearman/, /\Agearman/,
/\Aclass\.gmagick/, /\Agmagick/,
/\Aclass\.http/, /\Ahttp/,
/\Aclass\.imagick/, /\Aimagick/,
/\Aclass\.intl/, /\Aintl/,
/\Aclass\.json/, /\Ajson/,
/\Aclass\.mysqli/, /\Amysqli/,
/\Aclass\.oauth/, /\Aoauth/,
/\Aclass\.pdo/, /\Apdo/,
/\Aclass\.quickhash/, /\Aquickhash/,
/\Aclass\.reflection/, /\Areflection/,
/\Aclass\.simplexml/, /\Asimplexml/,
/\Aclass\.soap/, /\Asoap/,
/\Aclass\.solr/, /\Asolr/,
/\Aclass\.spl/, /\Aspl/,
/\Aclass\.sqlite3/, /\Asqlite3/,
/\Aclass\.tidy/, /\Atidy/,
/\Aclass\.v8js/, /\Av8js/,
/\Aclass\.varnish/, /\Avarnish/,
/\Aclass\.weak/, /\Aweak/,
/\Aclass\.yaf\-/, /\Ayaf\-/]
options[:skip_patterns] = [/example/, /quickstart/, /\.setup\.html\z/, options[:skip] = %w(
/\.overview\.html\z/, /\.requirements\.html\z/, /\.installation\.html\z/, control-structures.intro.html
/\.install\.html\z/, /\.configuration\.html\z/, /\.resources\.html\z/, control-structures.alternative-syntax.html
/\.constants\.html\z/, /\Amysqlinfo/, /\Adatetime\.formats/] function.mssql-select-db.html)
options[:skip] = %w(control-structures.intro.html options[:skip_patterns] = [/mysqlnd/]
control-structures.alternative-syntax.html memcached.expiration.html
memcached.callbacks.html memcached.callbacks.result.html
memcached.callbacks.read-through.html memcached.sessions.html
mysqli.persistconns.html mysqli.notes.html mysqli.summary.html
pdo.connections.html pdo.transactions.html pdo.prepared-statements.html
pdo.error-handling.html pdo.lobs.htm pdo.drivers.html
reflection.extending.html http.request.options.html
class.lapackexception.html class.snmpexception.html function.mhash.html
spl.datastructures.html spl.iterators.html spl.interfaces.html
spl.exceptions.html spl.files.html spl.misc.html)
options[:attribution] = <<-HTML options[:attribution] = <<-HTML
&copy; 1997&ndash;2013 The PHP Documentation Group<br> &copy; 1997&ndash;2013 The PHP Documentation Group<br>

Loading…
Cancel
Save