Improve PHP scraper

Fixes #20
pull/29/head
Thibaut 11 years ago
parent 9007f81151
commit cad60c6170

@ -1,137 +1,113 @@
module Docs
class Php
class EntriesFilter < Docs::EntriesFilter
TYPES = {
# [name-begin-with] => [type]
'AMQP' => 'AMQP',
'APCIterator' => 'APC',
TYPE_BY_NAME_STARTS_WITH = {
'ArrayObject' => 'SPL',
'Cond' => 'pthreads',
'CURL' => 'cURL',
'Date' => 'Date and Time',
'DirectoryIterator' => 'Standard PHP Library',
'Directory' => 'Directories',
'DOM' => 'DOM',
'Gearman' => 'Gearman',
'Gmagick' => 'Gmagick',
'Http' => 'HTTP',
'Imagick' => 'Imagick',
'Collator' => 'Internationalization',
'NumberFormatter' => 'Internationalization',
'Locale' => 'Internationalization',
'MessageFormatter' => 'Internationalization',
'Normalizer' => 'Internationalization',
'Intl' => 'Internationalization',
'intl' => 'Internationalization',
'ResourceBundle' => 'Internationalization',
'Spoofchecker' => 'Internationalization',
'Transliterator' => 'Internationalization',
'UConverter' => 'Internationalization',
'grapheme' => 'Internationalization',
'idn' => 'Internationalization',
'Date' => 'Date/Time',
'ErrorException' => 'Predefined Exceptions',
'Exception' => 'Predefined Exceptions',
'Json' => 'JSON',
'mysqli' => 'mysqli',
'OAuth' => 'OAuth',
'PDO' => 'PDO',
'Thread' => 'pthreads',
'Worker' => 'pthreads',
'Stackable' => 'pthreads',
'Http' => 'HTTP',
'Mutex' => 'pthreads',
'Cond' => 'pthreads',
'Exception' => 'Predefined Exceptions',
'ErrorException' => 'Predefined Exceptions',
'QuickHash' => 'QuickHash',
'Reflection' => 'Reflection',
'php_user_filter' => 'Stream',
'Reflector' => 'Reflection',
'Session' => 'Sessions',
'SimpleXML' => 'SimpleXML',
'Soap' => 'SOAP',
'Solr' => 'Solr',
'Sphinx' => 'Sphinx',
'Spl' => 'Standard PHP Library',
'ArrayObject' => 'Standard PHP Library',
'Countable' => 'Standard PHP Library',
'SQLite3' => 'SQLite3',
'streamWrapper' => 'Streams',
'php_user_filter' => 'Streams',
'SplFile' => 'SPL/File',
'SplTempFile' => 'SPL/File',
'Spl' => 'SPL',
'Stackable' => 'pthreads',
'streamWrapper' => 'Stream',
'Thread' => 'pthreads',
'tidy' => 'Tidy',
'V8Js' => 'V8js',
'Varnish' => 'Varnish',
'Weakref' => 'Weak References',
'WeakRef' => 'Weak References',
'WeakMap' => 'Weak References',
'XSLTProcessor' => 'XSLT',
'Worker' => 'pthreads',
'XsltProcessor' => 'XSLT',
'Yaf' => 'Yaf',
'ZipArchive' => 'Zip' }
# Register additional name prefixes in TYPE_BY_NAME_STARTS_WITH. Entries are
# appended in the order written here, which is the order the table is later
# walked when matching an entry name against its prefixes.

# Prefixes whose type is the prefix itself.
%w(
  APC AMQP Directory DOM Gearman Gmagick Imagick mysqli OAuth PDO
  Reflection Session SimpleXML Solr Sphinx SQLite3 Varnish XSLT Yaf
).each_with_object(TYPE_BY_NAME_STARTS_WITH) do |prefix, table|
  table[prefix] = prefix
end

# Prefixes filed under "Predefined Interfaces and Classes".
%w(
  ArrayAccess Closure Generator Iterator IteratorAggregate Serializable Traversable
).each_with_object(TYPE_BY_NAME_STARTS_WITH) do |prefix, table|
  table[prefix] = 'Predefined Interfaces and Classes'
end

# Prefixes filed under "Internationalization".
%w(
  Collator grapheme idn Intl intl Locale MessageFormatter Normalizer
  NumberFormatter ResourceBundle Spoofchecker Transliterator UConverter
).each_with_object(TYPE_BY_NAME_STARTS_WITH) do |prefix, table|
  table[prefix] = 'Internationalization'
end

# Prefixes filed under "SPL/Interfaces".
%w(
  Countable OuterIterator RecursiveIterator SeekableIterator
).each_with_object(TYPE_BY_NAME_STARTS_WITH) do |prefix, table|
  table[prefix] = 'SPL/Interfaces'
end
REPLACE_TYPES = {
# [original-type] => [new-type]
'Array' => 'Arrays',
'Bzip2' => 'bzip2',
'Classes/Object' => 'Classes and Objects',
'Date/Time' => 'Date and Time',
'Directory' => 'Directories',
'Exceptions' => 'Standard PHP Library',
'Function handling' => 'Function Handling',
'GD and Image' => 'GD',
'Gettext' => 'gettext',
'Inotify' => 'inotify',
'Interfaces' => 'Standard PHP Library',
'Iterators' => 'Standard PHP Library',
'Libevent' => 'libevent',
'Mailparse' => 'Mail',
'Misc.' => 'Miscellaneous',
'Multibyte String' => 'Multibyte Strings',
'PCRE' => 'Regular Expressions',
'PHP Options/Info' => 'Options and Info',
'POSIX Regex' => 'Regular Expressions',
'Program execution' => 'Program Execution',
'Exceptions' => 'SPL/Exceptions',
'GD and Image' => 'Image',
'Gmagick' => 'Image/GraphicsMagick',
'Imagick' => 'Image/ImageMagick',
'Interfaces' => 'SPL/Interfaces',
'Iterators' => 'SPL/Iterators',
'mysqli' => 'Database/MySQL',
'PostgreSQL' => 'Database/PostgreSQL',
'Session' => 'Sessions',
'Session PgSQL' => 'PostgreSQL',
'SPL' => 'Standard PHP Library',
'Statistic' => 'Statistics',
'Session PgSQL' => 'Database/PostgreSQL',
'SQLite3' => 'Database/SQLite',
'SQLSRV' => 'Database/SQL Server',
'Stream' => 'Streams',
'String' => 'Strings',
'Variable handling' => 'Variable Handling',
'XMLReader' => 'XML Reader',
'XMLWriter' => 'XML Writer',
'Yaml' => 'YAML',
'Zlib' => 'zlib' }
'Yaml' => 'YAML' }
# Slugs for which no default entry is produced.
IGNORE_SLUGS = %w(
  reserved.exceptions
  reserved.interfaces
  reserved.variables
)

# Returns true unless the slug starts with "book" or is listed in
# IGNORE_SLUGS.
def include_default_entry?
  !slug.start_with?('book') && !IGNORE_SLUGS.include?(slug)
end
# Maps a group name to the list of type names that are collapsed into it:
# when a computed type equals one of the listed values, the group name (the
# key) is used in its place.
TYPE_GROUPS = {
  'Classes and Functions' => [
    'Classes/Object',
    'Function handling',
    'Predefined Interfaces and Classes',
    'runkit'
  ],
  'Encoding'        => ['Gettext', 'iconv', 'Multibyte String'],
  'Compression'     => %w(Bzip2 Zip Zlib),
  'Cryptography'    => ['Hash', 'Mcrypt', 'OpenSSL', 'Password Hashing'],
  'Database'        => %w(DBA ODBC PDO),
  'Date and Time'   => %w(Calendar Date/Time),
  'Errors'          => ['Error Handling', 'Predefined Exceptions'],
  'File System'     => %w(Directory Fileinfo Filesystem Inotify),
  'HTML'            => %w(DOM Tidy),
  'Language'        => [
    'Control Structures',
    'Misc.',
    'PHP Options/Info',
    'Predefined Variables'
  ],
  'Mail'            => %w(Mail Mailparse),
  'Mathematics'     => ['BC Math', 'Math', 'Statistic'],
  'Networking'      => ['GeoIP', 'Network', 'Output Control', 'SSH2', 'Socket', 'URL'],
  'Process Control' => ['Eio', 'Libevent', 'POSIX', 'Program execution', 'pthreads'],
  'String'          => ['Ctype', 'PCRE', 'POSIX Regex', 'Taint'],
  'Variables'       => ['Filter', 'Variable handling'],
  'XML'             => [
    'libxml',
    'SimpleXML',
    'XML Parser',
    'XML-RPC',
    'XMLReader',
    'XMLWriter',
    'XSLT'
  ]
}
# Builds the entry name from the first matching heading of the page.
#
# The intl exception page is special-cased (by slug, and by its heading text)
# to "IntlException". For every other page a leading "The " is dropped and
# " class" / " interface" are turned into " (class)" / " (interface)"; each
# substitution is applied once, in that order.
def get_name
  return 'IntlException' if slug == 'class.intlexception'

  title = css('> .sect1 > .title', 'h1', 'h2').first.content
  return 'IntlException' if title == 'Exception class for intl errors'

  substitutions = {
    'The '       => '',
    ' class'     => ' (class)',
    ' interface' => ' (interface)'
  }
  substitutions.each { |from, to| title.sub!(from, to) }
  title
end
def get_type
if key = TYPES.keys.detect { |t| name.start_with?(t) }
TYPES[key]
else
type = at_css('.up').content.strip
type = 'SPL/Iterators' if type.end_with? 'Iterator'
type.sub! ' Functions', ''
type.sub! ' Obsolete Aliases and', ''
if type.end_with? 'Iterator'
'Standard PHP Library'
else
REPLACE_TYPES[type] || type
TYPE_BY_NAME_STARTS_WITH.each_pair do |key, value|
break type = value if name.start_with?(key)
end
TYPE_GROUPS.each_pair do |replacement, types|
types.each do |t|
return replacement if type == t
end
end
REPLACE_TYPES[type] || type
end
# Falsy for pages listed in Php::INDEX_PATHS; otherwise returns whatever
# doc.at_css finds for '.reference', '.refentry' or '.sect1' (nil when the
# page has none of them).
def include_default_entry?
  return false if Php::INDEX_PATHS.include?(subpath)

  doc.at_css('.reference', '.refentry', '.sect1')
end
end
end
end

@ -0,0 +1,19 @@
module Docs
  class Php
    # Filter that, for pages whose subpath starts with "book." or "class.",
    # stores a list of link targets under result[:internal_urls]. The
    # document itself is passed through unchanged.
    class InternalUrlsFilter < Filter
      # Sets result[:internal_urls] on book/class pages and returns doc.
      def call
        result[:internal_urls] = internal_urls if subpath.start_with?('book.', 'class.')
        doc
      end

      # Returns the href values (in document order) of the anchors matched by
      # '.book a' and '.chunklist a' that satisfy all of the following:
      # - the anchor has an href attribute (anchors without one are skipped
      #   instead of raising on nil),
      # - the href does not contain 'ref.pdo-',
      # - the anchor's `next` node exists and answers true to `text?`.
      def internal_urls
        css('.book a', '.chunklist a').each_with_object([]) do |link, urls|
          href = link['href']
          next if href.nil? || href.include?('ref.pdo-')

          urls << href if link.next.try(:text?)
        end
      end
    end
  end
end

@ -1,111 +1,62 @@
module Docs
class Php < FileScraper
# WARNING: if you are the kind of developer who likes to automate things,
# this scraper will hurt your feelings.
self.name = 'PHP'
self.type = 'php'
self.version = 'up to 5.5.6'
self.base_url = 'http://www.php.net/manual/en/'
self.root_path = 'extensions.alphabetical.html'
self.root_path = 'index.html'
# Downloaded from php.net/download-docs.php
self.dir = '/Users/Thibaut/DevDocs/Docs/PHP'
html_filters.push 'php/entries', 'php/clean_html', 'title'
html_filters.push 'php/internal_urls', 'php/entries', 'php/clean_html', 'title'
text_filters.push 'php/fix_urls'
options[:title] = false
options[:root_title] = 'PHP: Hypertext Preprocessor'
options[:only] = [] # using a whitelist
options[:only_patterns] = [/\Afunction\.\w+\.html\z/,
/\Areserved\.exceptions/, /\Areserved\.interfaces/,
/\Areserved\.variables/, /\Acontrol\-structures/]
INDEX_PATHS = %w(
index.html
funcref.html
refs.database.html
set.mysqlinfo.html
language.control-structures.html
reserved.exceptions.html
reserved.interfaces.html
reserved.variables.html)
# TODO: MongoDB, Phar
BOOKS = %w(amqp apache apc array bc bzip2 calendar classkit classobj com
ctype curl datetime dba dir dom eio errorfunc exec fileinfo filesystem
filter ftp funchand gearman geoip gettext gmagick hash http iconv iisfunc
image imagick imap info inotify intl json ldap libevent libxml mail
mailparse math mbstring mcrypt memcached misc mysqli network oauth
openssl outcontrol password pcre pdo pgsql posix pthreads quickhash
readline regex runkit reflection session session-pgsql simplexml soap
sockets solr sphinx spl spl-types sqlite3 sqlsrv ssh2 stats stream
strings taint tidy url v8js var varnish weakref xml xmlreader xmlrpc
xmlwriter xsl yaf yaml zip zlib uodbc)
options[:only].concat BOOKS.map { |s| "book.#{s}.html" }
options[:only_patterns].concat BOOKS.map { |s| /\Afunction\.#{s}(?:\.|\-)/ }
options[:skip_links] = ->(filter) do
INDEX_PATHS.exclude?(filter.subpath)
end
CLASSES = %w(apciterator curlfile dateinterval dateperiod collator
numberformatter locale normalizer messageformatter resourcebundle
spoofchecker transliterator uconverter memcached thread worker stackable
mutex cond runkit reflector sessionhandler sessionhandlerinterface
sphinxclient countable arrayobject streamwrapper xmlreader xsltprocessor
ziparchive exception errorexception)
options[:only].concat CLASSES.map { |s| "class.#{s}.html" }
options[:only_patterns].concat CLASSES.map { |s| /\A#{s}\./ }
options[:only] = INDEX_PATHS.dup
FUNCTION_PREFIXES = %w(assert base base64 cal call chunk class cli
connection convert count create date debug define disk dns easter ereg
eregi error event file finfo forward func gc gd get grapheme halt header
headers highlight html http idn iis in inet ini is iterator magic mb md5
mdecrypt memory mime move mt nl ob output parse pg php preg print proc
quoted realpath register restore set sha1 shell show stream socket spl
str sys tidy time timezone unregister use utf8 variant xml)
options[:only_patterns].concat FUNCTION_PREFIXES.map { |s| /\Afunction\.#{s}\-/ }
options[:only_patterns] = [
/\Aclass\./,
/\Afunction\./,
/\Acontrol-structures/,
/\Areserved\.exceptions/,
/\Areserved\.interfaces/,
/\Areserved\.variables/]
FUNCTIONS = %w(trigger-error user-error require-once include-once)
options[:only].concat FUNCTIONS.map { |s| "function.#{s}.html" }
BOOKS = %w(amqp apache apc array bc bzip2 calendar classobj ctype curl
datetime dba dir dom eio errorfunc exec fileinfo filesystem filter ftp
funchand gearman geoip gettext gmagick hash http iconv iisfunc image
imagick imap info inotify intl json ldap libevent libxml mail mailparse
math mbstring mcrypt memcached misc mysqli network oauth openssl
outcontrol password pcre pdo pgsql posix pthreads regex runkit reflection
session session-pgsql simplexml soap sockets solr sphinx spl spl-types
sqlite3 sqlsrv ssh2 stats stream strings taint tidy uodbc url var varnish
xml xmlreader xmlrpc xmlwriter xsl yaf yaml zip zlib)
options[:only_patterns].concat [
/function\.\w+\-exists\.html\z/,
/\A\w+iterator\./,
/\Afunction\.bz\w+\.html\z/,
/\Aclass\.\w+iterator\.html\z/,
/\Aclass\.\w+exception\.html\z/,
/\Aclass\.amqp/, /\Aamqp/,
/\Aclass\.datetime/, /\Adatetime/,
/\Aclass\.dom/, /\Adom/,
/\Aclass\.gearman/, /\Agearman/,
/\Aclass\.gmagick/, /\Agmagick/,
/\Aclass\.http/, /\Ahttp/,
/\Aclass\.imagick/, /\Aimagick/,
/\Aclass\.intl/, /\Aintl/,
/\Aclass\.json/, /\Ajson/,
/\Aclass\.mysqli/, /\Amysqli/,
/\Aclass\.oauth/, /\Aoauth/,
/\Aclass\.pdo/, /\Apdo/,
/\Aclass\.quickhash/, /\Aquickhash/,
/\Aclass\.reflection/, /\Areflection/,
/\Aclass\.simplexml/, /\Asimplexml/,
/\Aclass\.soap/, /\Asoap/,
/\Aclass\.solr/, /\Asolr/,
/\Aclass\.spl/, /\Aspl/,
/\Aclass\.sqlite3/, /\Asqlite3/,
/\Aclass\.tidy/, /\Atidy/,
/\Aclass\.v8js/, /\Av8js/,
/\Aclass\.varnish/, /\Avarnish/,
/\Aclass\.weak/, /\Aweak/,
/\Aclass\.yaf\-/, /\Ayaf\-/]
options[:only].concat BOOKS.map { |s| "book.#{s}.html" }
options[:skip_patterns] = [/example/, /quickstart/, /\.setup\.html\z/,
/\.overview\.html\z/, /\.requirements\.html\z/, /\.installation\.html\z/,
/\.install\.html\z/, /\.configuration\.html\z/, /\.resources\.html\z/,
/\.constants\.html\z/, /\Amysqlinfo/, /\Adatetime\.formats/]
options[:skip] = %w(
control-structures.intro.html
control-structures.alternative-syntax.html
function.mssql-select-db.html)
options[:skip] = %w(control-structures.intro.html
control-structures.alternative-syntax.html memcached.expiration.html
memcached.callbacks.html memcached.callbacks.result.html
memcached.callbacks.read-through.html memcached.sessions.html
mysqli.persistconns.html mysqli.notes.html mysqli.summary.html
pdo.connections.html pdo.transactions.html pdo.prepared-statements.html
pdo.error-handling.html pdo.lobs.htm pdo.drivers.html
reflection.extending.html http.request.options.html
class.lapackexception.html class.snmpexception.html function.mhash.html
spl.datastructures.html spl.iterators.html spl.interfaces.html
spl.exceptions.html spl.files.html spl.misc.html)
options[:skip_patterns] = [/mysqlnd/]
options[:attribution] = <<-HTML
&copy; 1997&ndash;2013 The PHP Documentation Group<br>

Loading…
Cancel
Save