From eaec6ec43ff3136df6ba83a7abcb55ad28da492f Mon Sep 17 00:00:00 2001
From: Scott Goley
Date: Fri, 8 Nov 2024 23:05:14 -0500
Subject: [PATCH] duckdb docs (v1.1) - scrape v1

---
 lib/docs/filters/duckdb/clean_html.rb | 52 +++++++++++++++++++
 lib/docs/filters/duckdb/entries.rb    | 48 ++++++++++++++++++
 lib/docs/scrapers/duckdb.rb           | 74 +++++++++++++++++++++++++++
 3 files changed, 174 insertions(+)
 create mode 100644 lib/docs/filters/duckdb/clean_html.rb
 create mode 100644 lib/docs/filters/duckdb/entries.rb
 create mode 100644 lib/docs/scrapers/duckdb.rb

diff --git a/lib/docs/filters/duckdb/clean_html.rb b/lib/docs/filters/duckdb/clean_html.rb
new file mode 100644
index 00000000..ae518c7b
--- /dev/null
+++ b/lib/docs/filters/duckdb/clean_html.rb
@@ -0,0 +1,52 @@
+module Docs
+  class Duckdb
+    # Reduces a scraped DuckDB documentation page to its main element and
+    # strips navigation chrome, presentational attributes, scripts and empty
+    # wrappers.
+    class CleanHtmlFilter < Filter
+      # Selectors for site navigation/header chrome that is removed outright.
+      CHROME_SELECTORS = %w(
+        .headerline .landingmenu .search_icon #sidebar
+        .pagemeta .toc_menu .section-nav
+      ).freeze
+
+      # Returns the cleaned document.
+      def call
+        # Narrow the document to the main element only when it exists;
+        # assigning first would overwrite the parsed document with nil.
+        main = at_css('main')
+        return doc if main.nil?
+        @doc = main
+
+        css(*CHROME_SELECTORS).remove
+
+        # Drop scripts before looking for empty wrappers so that an element
+        # holding nothing but a script is treated as empty.
+        css('script').remove
+
+        css('pre').each do |node|
+          # 'language-sql' also contains 'sql', so one substring check covers
+          # both class spellings.
+          if node['class']&.include?('sql') || node.at_css('code.sql')
+            node['data-language'] = 'sql'
+          end
+          # Replace the block's content with its stripped text.
+          node.content = node.content.strip
+        end
+
+        # Remove inline styles and classes, keeping classes that mention
+        # "highlight".
+        css('div, span, p').each do |node|
+          node.remove_attribute('style')
+          node.remove_attribute('class') unless node['class']&.include?('highlight')
+        end
+
+        css('div, span').each do |node|
+          node.remove if node.content.strip.empty?
+        end
+
+        doc
+      end
+    end
+  end
+end
diff --git a/lib/docs/filters/duckdb/entries.rb b/lib/docs/filters/duckdb/entries.rb
new file mode 100644
index 00000000..ea929022
--- /dev/null
+++ b/lib/docs/filters/duckdb/entries.rb
@@ -0,0 +1,48 @@
+module Docs
+  class Duckdb
+    # Derives index entries (name, type and per-heading anchors) for DuckDB
+    # documentation pages.
+    class EntriesFilter < Docs::EntriesFilter
+      # Entry type for each top-level documentation section, keyed by the
+      # first segment of the page's subpath.
+      TYPE_BY_SECTION = {
+        'sql' => 'SQL Reference',
+        'api' => 'Client APIs',
+        'guides' => 'How-to Guides',
+        'data' => 'Data Import',
+        'operations_manual' => 'Operations Manual',
+        'dev' => 'Development',
+        'internals' => 'Internals',
+        'extensions' => 'Extensions',
+        'archive' => 'Archive'
+      }.freeze
+
+      # Type used for pages outside every known section.
+      DEFAULT_TYPE = 'Documentation'.freeze
+
+      # Returns the text of the first h1, or 'DuckDB' when the page has none.
+      def get_name
+        at_css('h1')&.content || 'DuckDB'
+      end
+
+      # Returns the type for the section the page lives in, falling back to
+      # DEFAULT_TYPE.
+      def get_type
+        section = subpath.to_s[%r{\A([^/]+)/}, 1]
+        TYPE_BY_SECTION.fetch(section, DEFAULT_TYPE)
+      end
+
+      # Returns one [name, fragment id, type] entry per h2/h3 that has an
+      # id attribute. Headings whose text is blank are skipped.
+      def additional_entries
+        entry_type = get_type # identical for every heading on the page
+        css('h2[id]', 'h3[id]').each_with_object([]) do |node, entries|
+          # Collapse line breaks and tabs inside the heading into single spaces.
+          name = node.content.strip.gsub(/[\r\n\t]/, ' ').squeeze(' ')
+          next if name.empty?
+          entries << [name, node['id'], entry_type]
+        end
+      end
+    end
+  end
+end
diff --git a/lib/docs/scrapers/duckdb.rb b/lib/docs/scrapers/duckdb.rb
new file mode 100644
index 00000000..a160b3ef
--- /dev/null
+++ b/lib/docs/scrapers/duckdb.rb
@@ -0,0 +1,74 @@
+module Docs
+  # Scraper definition for the DuckDB documentation.
+  class Duckdb < UrlScraper
+    self.name = 'DuckDB'
+    self.type = 'duckdb'
+    self.root_path = 'index.html'
+    self.links = {
+      home: 'https://duckdb.org/',
+      code: 'https://github.com/duckdb/duckdb'
+    }
+
+    html_filters.push 'duckdb/entries', 'duckdb/clean_html'
+
+    options[:container] = '.documentation'
+
+    options[:skip_patterns] = [
+      /installation/,
+      /archive/,
+      /reference/,
+    ]
+
+    # NOTE(review): these entries carry a docs/ prefix although base_url
+    # already ends in docs/ -- confirm they match the paths they are meant to.
+    options[:skip] = %w(
+      docs/archive/
+      docs/installation/
+      docs/api/
+    )
+
+    options[:attribution] = <<-HTML
+      © Copyright 2018–2024 Stichting DuckDB Foundation
+      Licensed under the MIT License.
+    HTML
+
+    version '1.1' do
+      self.release = '1.1.x'
+      # Scrape the published site rather than a developer's local server.
+      self.base_url = 'https://duckdb.org/docs/'
+    end
+
+    # version '1.0' do
+    #   self.release = '1.0.x'
+    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #   html_filters.push 'duckdb/clean_html'
+    # end
+
+    # version '0.9' do
+    #   self.release = '0.9.x'
+    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #   html_filters.push 'duckdb/clean_html'
+    # end
+
+    # version '0.8' do
+    #   self.release = '0.8.x'
+    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #   html_filters.push 'duckdb/clean_html'
+    # end
+
+    # version '0.7' do
+    #   self.release = '0.7.x'
+    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #   html_filters.push 'duckdb/clean_html'
+    # end
+
+    # Returns the latest released DuckDB version as reported by GitHub.
+    def get_latest_version(opts)
+      get_latest_github_release('duckdb', 'duckdb', opts)
+    end
+  end
+end