From df1f203310038b4e00a432013de08f9e55abbfd6 Mon Sep 17 00:00:00 2001 From: Thibaut Date: Mon, 29 Dec 2014 18:33:49 -0500 Subject: [PATCH] Update and improve PostgreSQL documentation (9.4) --- lib/docs/filters/postgresql/clean_html.rb | 4 + lib/docs/filters/postgresql/entries.rb | 150 ++++++++++-------- .../{clean_nav.rb => extract_metadata.rb} | 3 +- lib/docs/scrapers/postgresql.rb | 45 ++++-- 4 files changed, 125 insertions(+), 77 deletions(-) rename lib/docs/filters/postgresql/{clean_nav.rb => extract_metadata.rb} (87%) diff --git a/lib/docs/filters/postgresql/clean_html.rb b/lib/docs/filters/postgresql/clean_html.rb index 8c9a9f45..a30e4543 100644 --- a/lib/docs/filters/postgresql/clean_html.rb +++ b/lib/docs/filters/postgresql/clean_html.rb @@ -11,6 +11,10 @@ module Docs end def other + @doc = at_css('#docContent') + + css('.NAVHEADER', '.NAVFOOTER').remove + css('a[name]').each do |node| node.parent['id'] = node['name'] node.before(node.children).remove diff --git a/lib/docs/filters/postgresql/entries.rb b/lib/docs/filters/postgresql/entries.rb index b8904484..c39ee4a2 100644 --- a/lib/docs/filters/postgresql/entries.rb +++ b/lib/docs/filters/postgresql/entries.rb @@ -11,98 +11,144 @@ module Docs 'System Administration Functions' => 'Administration Functions', 'System Information Functions' => 'Information Functions' } - def get_name - name = at_css('h1').content - clean_heading_name(name) + PREPEND_TYPES = [ + 'Type Conversion', + 'Full Text Search', + 'Performance Tips', + 'Server Configuration', + 'Monitoring' ] + + REPLACE_TYPES = { + 'Routine Database Maintenance Tasks' => 'Maintenance', + 'High Availability, Load Balancing, and Replication' => 'High Availability', + 'Monitoring Database Activity' => 'Monitoring', + 'Monitoring Disk Usage' => 'Monitoring', + 'Reliability and the Write-Ahead Log' => 'Write-Ahead Log' } + + def base_name + @base_name ||= clean_heading_name(at_css('h1').content) + end - if %w(Overview Introduction).include?(name) + def get_name + if %w(Overview Introduction).include?(base_name) result[:pg_chapter_name] + elsif PREPEND_TYPES.include?(type) + "#{type}: #{base_name}" else - name.remove! ' (Common Table Expressions)' - REPLACE_NAMES[name] || name + REPLACE_NAMES[base_name] || base_name end end - def clean_heading_name(name) - name.remove! %r{\A[\d\.\s]+} - name.remove! 'Using ' - name.remove! %r{\AThe } - name - end - def get_type return if initial_page? if result[:pg_up_path] == 'sql-commands.html' 'Commands' - elsif result[:pg_up_path].start_with? 'reference-' + elsif result[:pg_up_path].start_with?('reference-') 'Applications' elsif type = result[:pg_chapter_name] - if type.start_with?('Func') && (match = name.match(/\A(?!Form|Seq|Set|Enum)(.+) Func/)) + if type.start_with?('Func') && (match = base_name.match(/\A(?!Form|Seq|Set|Enum)(.+) Func/)) "Functions: #{match[1]}" else - type.remove 'SQL ' + type.remove! 'SQL ' + REPLACE_TYPES[type] || type end end end def additional_entries return [] if skip_additional_entries? - return get_config_entries if config_page? + return config_additional_entries if type && type.include?('Configuration') + return data_types_additional_entries if type == 'Data Types' return get_heading_entries('h3[id]') if slug == 'functions-xml' - if type == 'Data Types' - return get_custom_entries case slug - when 'rangetypes' then 'li > p > .TYPE:first-child' - when 'datatype-textsearch' then '.SECT2 > .TYPE' - else '.CALSTABLE td:first-child > .TYPE' end - end - entries = get_heading_entries('h2[id]') - if slug == 'queries-union' + case slug + when 'queries-union' entries.concat get_custom_entries('p > .LITERAL:first-child') - elsif slug == 'queries-table-expressions' + when 'queries-table-expressions' entries.concat get_heading_entries('h3[id]') entries.concat get_custom_entries('dt > .LITERAL:first-child') - elsif slug == 'functions-logical' + when 'functions-logical' entries.concat get_custom_entries('> table td:first-child > code') - elsif slug == 'functions-formatting' + when 'functions-formatting' entries.concat get_custom_entries('#FUNCTIONS-FORMATTING-TABLE td:first-child > code') - elsif slug == 'functions-admin' + when 'functions-admin' entries.concat get_custom_entries('.TABLE td:first-child > code') - elsif slug == 'functions-string' + when 'functions-string' entries.concat get_custom_entries('> div[id^="FUNC"] td:first-child > code') - elsif type && type.start_with?('Functions') - entries.concat get_custom_entries('> .TABLE td:first-child > code:first-child') - entries.concat get_comparison_entries if slug == 'functions-comparison' + else + if type && type.start_with?('Functions') + entries.concat get_custom_entries('> .TABLE td:first-child > code:first-child') + entries.concat %w(IS NULL BETWEEN DISTINCT\ FROM).map { |name| ["#{self.name}: #{name}"] } if slug == 'functions-comparison' + end end entries end - def get_config_entries + def config_additional_entries css('.VARIABLELIST dt[id]').map do |node| name = node.at_css('.VARNAME').content ["Config: #{name}", node['id']] end end + def data_types_additional_entries + selector = case slug + when 'rangetypes' + 'li > p > .TYPE:first-child' + when 'datatype-textsearch' + '.SECT2 > .TYPE' + else + '.CALSTABLE td:first-child > .TYPE' + end + get_custom_entries(selector) + end + + def include_default_entry? + !initial_page? && !at_css('.TOC') + end + + SKIP_ENTRIES_SLUGS = [ + 'config-setting', + 'applevel-consistency' ] + + SKIP_ENTRIES_TYPES = [ + 'Localization', + 'Type Conversion', + 'Full Text Search', + 'Performance Tips', + 'Client Authentication', + 'Managing Databases', + 'Maintenance', + 'Backup and Restore', + 'High Availability', + 'Monitoring' ] + + def skip_additional_entries? + SKIP_ENTRIES_SLUGS.include?(slug) || SKIP_ENTRIES_TYPES.include?(type) + end + + def clean_heading_name(name) + name.remove! %r{\A[\d\.\s]+} + name.remove! 'Using ' + name.remove! %r{\AThe } + name.remove! ' (Common Table Expressions)' + name + end + def get_heading_entries(selector) - css(selector).inject [] do |entries, node| + css(selector).each_with_object([]) do |node, entries| name = node.content clean_heading_name(name) - - unless skip_heading?(name) - entries << ["#{additional_entry_prefix}: #{name}", node['id']] - end - - entries + entries << ["#{additional_entry_prefix}: #{name}", node['id']] unless skip_heading?(name) end end def get_custom_entries(selector) - css(selector).inject [] do |entries, node| + css(selector).each_with_object([]) do |node, entries| name = node.content name.remove! %r{\(.*?\)}m name.remove! %r{\[.*?\]}m @@ -117,14 +163,6 @@ module Docs node['id'] = id entries << [name, id] end - - entries - end - end - - def get_comparison_entries - %w(IS NULL BETWEEN DISTINCT\ FROM).map do |name| - ["#{self.name}: #{name}"] end end @@ -132,22 +170,10 @@ module Docs type.dup.gsub!('Functions: ', '') || self.name end - def skip_additional_entries? - slug == 'config-setting' || %w(Concurrency\ Control Localization).include?(type) - end - def skip_heading?(name) %w(Usage\ Patterns Portability Caveats Overview).include?(name) || (type.start_with?('Functions') && slug != 'functions-xml' && name.split.first.upcase!) end - - def include_default_entry? - !(initial_page? || at_css('.TOC') || config_page?) - end - - def config_page? - slug.start_with? 'runtime-config' - end end end end diff --git a/lib/docs/filters/postgresql/clean_nav.rb b/lib/docs/filters/postgresql/extract_metadata.rb similarity index 87% rename from lib/docs/filters/postgresql/clean_nav.rb rename to lib/docs/filters/postgresql/extract_metadata.rb index 0f6c7090..50e15d87 100644 --- a/lib/docs/filters/postgresql/clean_nav.rb +++ b/lib/docs/filters/postgresql/extract_metadata.rb @@ -1,10 +1,9 @@ module Docs class Postgresql - class CleanNavFilter < Filter + class ExtractMetadataFilter < Filter def call extract_up_path extract_chapter - css('.NAVHEADER', '.NAVFOOTER').remove doc end diff --git a/lib/docs/scrapers/postgresql.rb b/lib/docs/scrapers/postgresql.rb index 909036fd..fdfe07b8 100644 --- a/lib/docs/scrapers/postgresql.rb +++ b/lib/docs/scrapers/postgresql.rb @@ -1,14 +1,13 @@ module Docs - class Postgresql < FileScraper + class Postgresql < UrlScraper self.name = 'PostgreSQL' self.type = 'postgres' - self.version = 'up to 9.3.2' - self.dir = '/Users/Thibaut/DevDocs/Docs/PostgreSQL' - self.base_url = 'http://www.postgresql.org/docs/9.3/static/' + self.version = '9.4' + self.base_url = "http://www.postgresql.org/docs/#{version}/static/" self.root_path = 'reference.html' - self.initial_paths = %w(sql.html runtime-config.html charset.html) + self.initial_paths = %w(sql.html admin.html) - html_filters.insert_before 'normalize_urls', 'postgresql/clean_nav' + html_filters.insert_before 'normalize_urls', 'postgresql/extract_metadata' html_filters.push 'postgresql/clean_html', 'postgresql/entries', 'title' options[:title] = false @@ -19,7 +18,6 @@ module Docs arrays.html rowtypes.html rangetypes.html - mvcc-intro.html transaction-iso.html explicit-locking.html applevel-consistency.html @@ -27,7 +25,15 @@ module Docs config-setting.html locale.html collation.html - multibyte.html) + multibyte.html + using-explain.html + planner-stats.html + explicit-joins.html + populate.html + non-durability.html + logfile-maintenance.html + continuous-archiving.html + dynamic-trace.html) options[:only_patterns] = [ /\Asql\-/, @@ -37,18 +43,31 @@ module Docs /\Aqueries\-/, /\Adatatype\-/, /\Afunctions\-/, + /\Atypeconv\-/, + /\Atextsearch\-/, + /\Amvcc\-/, /\Aindexes\-/, - /\Aruntime\-config\-/] + /\Aruntime\-config\-/, + /\Aauth\-/, + /\Aclient\-authentication/, + /\Amanage\-ag/, + /\Aroutine/, + /\Abackup\-/, + /\Amonitoring\-/, + /\Awal\-/, + /\Adisk/, + /role/, + /recovery/, + /standby/] options[:skip] = %w( ddl-others.html - runtime-config-custom.html - runtime-config-short.html functions-event-triggers.html - functions-trigger.html) + functions-trigger.html + textsearch-migration.html) options[:attribution] = <<-HTML - © 1996–2013 The PostgreSQL Global Development Group
+ © 1996–2014 The PostgreSQL Global Development Group
Licensed under the PostgreSQL License. HTML end