From cad60c6170844c1aa678eeb1cce2d7392f414c0f Mon Sep 17 00:00:00 2001 From: Thibaut Date: Fri, 6 Dec 2013 22:17:55 +0000 Subject: [PATCH] Improve PHP scraper Fixes #20 --- lib/docs/filters/php/entries.rb | 208 ++++++++++++-------------- lib/docs/filters/php/internal_urls.rb | 19 +++ lib/docs/scrapers/php.rb | 123 +++++---------- 3 files changed, 148 insertions(+), 202 deletions(-) create mode 100644 lib/docs/filters/php/internal_urls.rb diff --git a/lib/docs/filters/php/entries.rb b/lib/docs/filters/php/entries.rb index d5110a9c..94d0bfee 100644 --- a/lib/docs/filters/php/entries.rb +++ b/lib/docs/filters/php/entries.rb @@ -1,136 +1,112 @@ module Docs class Php class EntriesFilter < Docs::EntriesFilter - TYPES = { - # [name-begin-with] => [type] - 'AMQP' => 'AMQP', - 'APCIterator' => 'APC', - 'CURL' => 'cURL', - 'Date' => 'Date and Time', - 'DirectoryIterator' => 'Standard PHP Library', - 'Directory' => 'Directories', - 'DOM' => 'DOM', - 'Gearman' => 'Gearman', - 'Gmagick' => 'Gmagick', - 'Http' => 'HTTP', - 'Imagick' => 'Imagick', - 'Collator' => 'Internationalization', - 'NumberFormatter' => 'Internationalization', - 'Locale' => 'Internationalization', - 'MessageFormatter' => 'Internationalization', - 'Normalizer' => 'Internationalization', - 'Intl' => 'Internationalization', - 'intl' => 'Internationalization', - 'ResourceBundle' => 'Internationalization', - 'Spoofchecker' => 'Internationalization', - 'Transliterator' => 'Internationalization', - 'UConverter' => 'Internationalization', - 'grapheme' => 'Internationalization', - 'idn' => 'Internationalization', - 'Json' => 'JSON', - 'mysqli' => 'mysqli', - 'OAuth' => 'OAuth', - 'PDO' => 'PDO', - 'Thread' => 'pthreads', - 'Worker' => 'pthreads', - 'Stackable' => 'pthreads', - 'Mutex' => 'pthreads', - 'Cond' => 'pthreads', - 'Exception' => 'Predefined Exceptions', - 'ErrorException' => 'Predefined Exceptions', - 'QuickHash' => 'QuickHash', - 'Reflection' => 'Reflection', - 'Reflector' => 'Reflection', - 'Session' => 'Sessions', - 'SimpleXML' => 'SimpleXML', - 'Soap' => 'SOAP', - 'Solr' => 'Solr', - 'Sphinx' => 'Sphinx', - 'Spl' => 'Standard PHP Library', - 'ArrayObject' => 'Standard PHP Library', - 'Countable' => 'Standard PHP Library', - 'SQLite3' => 'SQLite3', - 'streamWrapper' => 'Streams', - 'php_user_filter' => 'Streams', - 'tidy' => 'Tidy', - 'V8Js' => 'V8js', - 'Varnish' => 'Varnish', - 'Weakref' => 'Weak References', - 'WeakRef' => 'Weak References', - 'WeakMap' => 'Weak References', - 'XSLTProcessor' => 'XSLT', - 'XsltProcessor' => 'XSLT', - 'Yaf' => 'Yaf', - 'ZipArchive' => 'Zip' } + TYPE_BY_NAME_STARTS_WITH = { + 'ArrayObject' => 'SPL', + 'Cond' => 'pthreads', + 'CURL' => 'cURL', + 'Date' => 'Date/Time', + 'ErrorException' => 'Predefined Exceptions', + 'Exception' => 'Predefined Exceptions', + 'Json' => 'JSON', + 'Http' => 'HTTP', + 'Mutex' => 'pthreads', + 'php_user_filter' => 'Stream', + 'Reflector' => 'Reflection', + 'Soap' => 'SOAP', + 'SplFile' => 'SPL/File', + 'SplTempFile' => 'SPL/File', + 'Spl' => 'SPL', + 'Stackable' => 'pthreads', + 'streamWrapper' => 'Stream', + 'Thread' => 'pthreads', + 'tidy' => 'Tidy', + 'Worker' => 'pthreads', + 'XsltProcessor' => 'XSLT', + 'ZipArchive' => 'Zip' } + + %w(APC AMQP Directory DOM Gearman Gmagick Imagick mysqli OAuth PDO + Reflection Session SimpleXML Solr Sphinx SQLite3 Varnish XSLT Yaf).each do |str| + TYPE_BY_NAME_STARTS_WITH[str] = str + end + + %w(ArrayAccess Closure Generator Iterator IteratorAggregate Serializable Traversable).each do |str| + TYPE_BY_NAME_STARTS_WITH[str] = 'Predefined Interfaces and Classes' + end + + %w(Collator grapheme idn Intl intl Locale MessageFormatter Normalizer + NumberFormatter ResourceBundle Spoofchecker Transliterator UConverter).each do |str| + TYPE_BY_NAME_STARTS_WITH[str] = 'Internationalization' + end + + %w(Countable OuterIterator RecursiveIterator SeekableIterator ).each do |str| + TYPE_BY_NAME_STARTS_WITH[str] = 'SPL/Interfaces' + end REPLACE_TYPES = { - # [original-type] => [new-type] - 'Array' => 'Arrays', - 'Bzip2' => 'bzip2', - 'Classes/Object' => 'Classes and Objects', - 'Date/Time' => 'Date and Time', - 'Directory' => 'Directories', - 'Exceptions' => 'Standard PHP Library', - 'Function handling' => 'Function Handling', - 'GD and Image' => 'GD', - 'Gettext' => 'gettext', - 'Inotify' => 'inotify', - 'Interfaces' => 'Standard PHP Library', - 'Iterators' => 'Standard PHP Library', - 'Libevent' => 'libevent', - 'Mailparse' => 'Mail', - 'Misc.' => 'Miscellaneous', - 'Multibyte String' => 'Multibyte Strings', - 'PCRE' => 'Regular Expressions', - 'PHP Options/Info' => 'Options and Info', - 'POSIX Regex' => 'Regular Expressions', - 'Program execution' => 'Program Execution', + 'Exceptions' => 'SPL/Exceptions', + 'GD and Image' => 'Image', + 'Gmagick' => 'Image/GraphicsMagick', + 'Imagick' => 'Image/ImageMagick', + 'Interfaces' => 'SPL/Interfaces', + 'Iterators' => 'SPL/Iterators', + 'mysqli' => 'Database/MySQL', + 'PostgreSQL' => 'Database/PostgreSQL', 'Session' => 'Sessions', - 'Session PgSQL' => 'PostgreSQL', - 'SPL' => 'Standard PHP Library', - 'Statistic' => 'Statistics', + 'Session PgSQL' => 'Database/PostgreSQL', + 'SQLite3' => 'Database/SQLite', + 'SQLSRV' => 'Database/SQL Server', 'Stream' => 'Streams', - 'String' => 'Strings', - 'Variable handling' => 'Variable Handling', - 'XMLReader' => 'XML Reader', - 'XMLWriter' => 'XML Writer', - 'Yaml' => 'YAML', - 'Zlib' => 'zlib' } - - IGNORE_SLUGS = %w(reserved.exceptions reserved.interfaces - reserved.variables) + 'Yaml' => 'YAML' } - def include_default_entry? - !(slug.start_with?('book') || IGNORE_SLUGS.include?(slug)) - end + TYPE_GROUPS = { + 'Classes and Functions' => ['Classes/Object', 'Function handling', 'Predefined Interfaces and Classes', 'runkit'], + 'Encoding' => ['Gettext', 'iconv', 'Multibyte String'], + 'Compression' => ['Bzip2', 'Zip', 'Zlib'], + 'Cryptography' => ['Hash', 'Mcrypt', 'OpenSSL', 'Password Hashing'], + 'Database' => ['DBA', 'ODBC', 'PDO'], + 'Date and Time' => ['Calendar', 'Date/Time'], + 'Errors' => ['Error Handling', 'Predefined Exceptions'], + 'File System' => ['Directory', 'Fileinfo', 'Filesystem', 'Inotify'], + 'HTML' => ['DOM', 'Tidy'], + 'Language' => ['Control Structures', 'Misc.', 'PHP Options/Info', 'Predefined Variables'], + 'Mail' => ['Mail', 'Mailparse'], + 'Mathematics' => ['BC Math', 'Math', 'Statistic'], + 'Networking' => ['GeoIP', 'Network', 'Output Control', 'SSH2', 'Socket', 'URL'], + 'Process Control' => ['Eio', 'Libevent', 'POSIX', 'Program execution', 'pthreads'], + 'String' => ['Ctype', 'PCRE', 'POSIX Regex', 'Taint'], + 'Variables' => ['Filter', 'Variable handling'], + 'XML' => ['libxml', 'SimpleXML', 'XML Parser', 'XML-RPC', 'XMLReader', 'XMLWriter', 'XSLT'] } def get_name + return 'IntlException' if slug == 'class.intlexception' name = css('> .sect1 > .title', 'h1', 'h2').first.content - - if name == 'Exception class for intl errors' - 'IntlException' - else - name.sub! 'The ', '' - name.sub! ' class', ' (class)' - name.sub! ' interface', ' (interface)' - name - end + name.sub! 'The ', '' + name.sub! ' class', ' (class)' + name.sub! ' interface', ' (interface)' + name end def get_type - if key = TYPES.keys.detect { |t| name.start_with?(t) } - TYPES[key] - else - type = at_css('.up').content.strip - type.sub! ' Functions', '' - type.sub! ' Obsolete Aliases and', '' + type = at_css('.up').content.strip + type = 'SPL/Iterators' if type.end_with? 'Iterator' + type.sub! ' Functions', '' + + TYPE_BY_NAME_STARTS_WITH.each_pair do |key, value| + break type = value if name.start_with?(key) + end - if type.end_with? 'Iterator' - 'Standard PHP Library' - else - REPLACE_TYPES[type] || type + TYPE_GROUPS.each_pair do |replacement, types| + types.each do |t| + return replacement if type == t end end + + REPLACE_TYPES[type] || type + end + + def include_default_entry? + Php::INDEX_PATHS.exclude?(subpath) && doc.at_css('.reference', '.refentry', '.sect1') end end end diff --git a/lib/docs/filters/php/internal_urls.rb b/lib/docs/filters/php/internal_urls.rb new file mode 100644 index 00000000..d5dc384b --- /dev/null +++ b/lib/docs/filters/php/internal_urls.rb @@ -0,0 +1,19 @@ +module Docs + class Php + class InternalUrlsFilter < Filter + def call + if subpath.start_with?('book.') || subpath.start_with?('class.') + result[:internal_urls] = internal_urls + end + doc + end + + def internal_urls + css('.book a', '.chunklist a').inject [] do |urls, link| + urls << link['href'] if link.next.try(:text?) && link['href'].exclude?('ref.pdo-') + urls + end + end + end + end +end diff --git a/lib/docs/scrapers/php.rb b/lib/docs/scrapers/php.rb index 9584b785..bc1a9936 100644 --- a/lib/docs/scrapers/php.rb +++ b/lib/docs/scrapers/php.rb @@ -1,111 +1,62 @@ module Docs class Php < FileScraper - # WARNING: if you are the kind of developer who likes to automate things, - # this scraper will hurt your feelings. - self.name = 'PHP' self.type = 'php' self.version = 'up to 5.5.6' self.base_url = 'http://www.php.net/manual/en/' - self.root_path = 'extensions.alphabetical.html' + self.root_path = 'index.html' # Downloaded from php.net/download-docs.php self.dir = '/Users/Thibaut/DevDocs/Docs/PHP' - html_filters.push 'php/entries', 'php/clean_html', 'title' + html_filters.push 'php/internal_urls', 'php/entries', 'php/clean_html', 'title' text_filters.push 'php/fix_urls' options[:title] = false options[:root_title] = 'PHP: Hypertext Preprocessor' - options[:only] = [] # using a whitelist - - options[:only_patterns] = [/\Afunction\.\w+\.html\z/, - /\Areserved\.exceptions/, /\Areserved\.interfaces/, - /\Areserved\.variables/, /\Acontrol\-structures/] + INDEX_PATHS = %w( + index.html + funcref.html + refs.database.html + set.mysqlinfo.html + language.control-structures.html + reserved.exceptions.html + reserved.interfaces.html + reserved.variables.html) - # TODO: MongoDB, Phar - BOOKS = %w(amqp apache apc array bc bzip2 calendar classkit classobj com - ctype curl datetime dba dir dom eio errorfunc exec fileinfo filesystem - filter ftp funchand gearman geoip gettext gmagick hash http iconv iisfunc - image imagick imap info inotify intl json ldap libevent libxml mail - mailparse math mbstring mcrypt memcached misc mysqli network oauth - openssl outcontrol password pcre pdo pgsql posix pthreads quickhash - readline regex runkit reflection session session-pgsql simplexml soap - sockets solr sphinx spl spl-types sqlite3 sqlsrv ssh2 stats stream - strings taint tidy url v8js var varnish weakref xml xmlreader xmlrpc - xmlwriter xsl yaf yaml zip zlib uodbc) - options[:only].concat BOOKS.map { |s| "book.#{s}.html" } - options[:only_patterns].concat BOOKS.map { |s| /\Afunction\.#{s}(?:\.|\-)/ } + options[:skip_links] = ->(filter) do + INDEX_PATHS.exclude?(filter.subpath) + end - CLASSES = %w(apciterator curlfile dateinterval dateperiod collator - numberformatter locale normalizer messageformatter resourcebundle - spoofchecker transliterator uconverter memcached thread worker stackable - mutex cond runkit reflector sessionhandler sessionhandlerinterface - sphinxclient countable arrayobject streamwrapper xmlreader xsltprocessor - ziparchive exception errorexception) - options[:only].concat CLASSES.map { |s| "class.#{s}.html" } - options[:only_patterns].concat CLASSES.map { |s| /\A#{s}\./ } + options[:only] = INDEX_PATHS.dup - FUNCTION_PREFIXES = %w(assert base base64 cal call chunk class cli - connection convert count create date debug define disk dns easter ereg - eregi error event file finfo forward func gc gd get grapheme halt header - headers highlight html http idn iis in inet ini is iterator magic mb md5 - mdecrypt memory mime move mt nl ob output parse pg php preg print proc - quoted realpath register restore set sha1 shell show stream socket spl - str sys tidy time timezone unregister use utf8 variant xml) - options[:only_patterns].concat FUNCTION_PREFIXES.map { |s| /\Afunction\.#{s}\-/ } + options[:only_patterns] = [ + /\Aclass\./, + /\Afunction\./, + /\Acontrol-structures/, + /\Areserved\.exceptions/, + /\Areserved\.interfaces/, + /\Areserved\.variables/] - FUNCTIONS = %w(trigger-error user-error require-once include-once) - options[:only].concat FUNCTIONS.map { |s| "function.#{s}.html" } + BOOKS = %w(amqp apache apc array bc bzip2 calendar classobj ctype curl + datetime dba dir dom eio errorfunc exec fileinfo filesystem filter ftp + funchand gearman geoip gettext gmagick hash http iconv iisfunc image + imagick imap info inotify intl json ldap libevent libxml mail mailparse + math mbstring mcrypt memcached misc mysqli network oauth openssl + outcontrol password pcre pdo pgsql posix pthreads regex runkit reflection + session session-pgsql simplexml soap sockets solr sphinx spl spl-types + sqlite3 sqlsrv ssh2 stats stream strings taint tidy uodbc url var varnish + xml xmlreader xmlrpc xmlwriter xsl yaf yaml zip zlib) - options[:only_patterns].concat [ - /function\.\w+\-exists\.html\z/, - /\A\w+iterator\./, - /\Afunction\.bz\w+\.html\z/, - /\Aclass\.\w+iterator\.html\z/, - /\Aclass\.\w+exception\.html\z/, - /\Aclass\.amqp/, /\Aamqp/, - /\Aclass\.datetime/, /\Adatetime/, - /\Aclass\.dom/, /\Adom/, - /\Aclass\.gearman/, /\Agearman/, - /\Aclass\.gmagick/, /\Agmagick/, - /\Aclass\.http/, /\Ahttp/, - /\Aclass\.imagick/, /\Aimagick/, - /\Aclass\.intl/, /\Aintl/, - /\Aclass\.json/, /\Ajson/, - /\Aclass\.mysqli/, /\Amysqli/, - /\Aclass\.oauth/, /\Aoauth/, - /\Aclass\.pdo/, /\Apdo/, - /\Aclass\.quickhash/, /\Aquickhash/, - /\Aclass\.reflection/, /\Areflection/, - /\Aclass\.simplexml/, /\Asimplexml/, - /\Aclass\.soap/, /\Asoap/, - /\Aclass\.solr/, /\Asolr/, - /\Aclass\.spl/, /\Aspl/, - /\Aclass\.sqlite3/, /\Asqlite3/, - /\Aclass\.tidy/, /\Atidy/, - /\Aclass\.v8js/, /\Av8js/, - /\Aclass\.varnish/, /\Avarnish/, - /\Aclass\.weak/, /\Aweak/, - /\Aclass\.yaf\-/, /\Ayaf\-/] + options[:only].concat BOOKS.map { |s| "book.#{s}.html" } - options[:skip_patterns] = [/example/, /quickstart/, /\.setup\.html\z/, - /\.overview\.html\z/, /\.requirements\.html\z/, /\.installation\.html\z/, - /\.install\.html\z/, /\.configuration\.html\z/, /\.resources\.html\z/, - /\.constants\.html\z/, /\Amysqlinfo/, /\Adatetime\.formats/] + options[:skip] = %w( + control-structures.intro.html + control-structures.alternative-syntax.html + function.mssql-select-db.html) - options[:skip] = %w(control-structures.intro.html - control-structures.alternative-syntax.html memcached.expiration.html - memcached.callbacks.html memcached.callbacks.result.html - memcached.callbacks.read-through.html memcached.sessions.html - mysqli.persistconns.html mysqli.notes.html mysqli.summary.html - pdo.connections.html pdo.transactions.html pdo.prepared-statements.html - pdo.error-handling.html pdo.lobs.htm pdo.drivers.html - reflection.extending.html http.request.options.html - class.lapackexception.html class.snmpexception.html function.mhash.html - spl.datastructures.html spl.iterators.html spl.interfaces.html - spl.exceptions.html spl.files.html spl.misc.html) + options[:skip_patterns] = [/mysqlnd/] options[:attribution] = <<-HTML © 1997–2013 The PHP Documentation Group