duckdb docs (v1.1) - scrape v1

pull/2371/head
Scott Goley 2 months ago
parent 5e989570b3
commit eaec6ec43f

@ -0,0 +1,41 @@
module Docs
  class Duckdb
    # Reduces a scraped DuckDB documentation page to its <main> content and
    # strips navigation chrome, scripts, presentational attributes and empty
    # wrapper elements.
    class CleanHtmlFilter < Filter
      # Selectors for site chrome that should not appear in the output.
      CHROME_SELECTORS = %w[
        .headerline
        .landingmenu
        .search_icon
        #sidebar
        .pagemeta
        .toc_menu
        .section-nav
      ].freeze

      # Runs the clean-up steps in order.
      #
      # @return the document being filtered; when the page has no <main>
      #   element the document is returned unmodified.
      def call
        # Only swap the document once <main> is known to exist. Assigning the
        # result of at_css directly would overwrite @doc with nil on pages
        # without <main>, discarding the parsed document before it is returned.
        main = at_css('main')
        return doc if main.nil?

        @doc = main

        css(*CHROME_SELECTORS).remove
        # Scripts go before the empty-element pass so a wrapper that holds
        # nothing but a script is recognised as empty.
        css('script').remove

        tag_code_blocks
        strip_presentational_attributes
        remove_empty_wrappers

        doc
      end

      private

      # Marks SQL code blocks with data-language and collapses each <pre> to
      # its stripped text content (dropping any inline markup).
      def tag_code_blocks
        css('pre').each do |node|
          node['data-language'] = 'sql' if sql_block?(node)
          node.content = node.content.strip
        end
      end

      # True when the <pre> itself, a nested <code class="sql">, or an
      # enclosing element with class "language-sql" identifies the block as SQL.
      def sql_block?(node)
        return true if node['class']&.include?('sql')
        return true if node.at_css('code.sql')

        node.ancestors('.language-sql').any?
      end

      # Drops inline styles everywhere and class names except on elements whose
      # class mentions "highlight".
      def strip_presentational_attributes
        css('div, span, p').each do |node|
          node.remove_attribute('style')
          node.remove_attribute('class') unless node['class'].to_s.include?('highlight')
        end
      end

      # Removes <div>/<span> elements that have neither text nor child
      # elements. Wrappers around non-text content (e.g. an <img>) are kept.
      # Iterating in reverse visits descendants before their ancestors, so a
      # wrapper emptied by this pass is removed as well.
      # NOTE(review): relies on css returning nodes in document order - confirm.
      def remove_empty_wrappers
        css('div, span').reverse_each do |node|
          next unless node.content.strip.empty? && node.element_children.empty?

          node.remove
        end
      end
    end
  end
end

@ -0,0 +1,45 @@
module Docs
  class Duckdb
    # Derives index entries (name, type and per-heading sub-entries) for a
    # DuckDB documentation page.
    class EntriesFilter < Docs::EntriesFilter
      # Entry type keyed by the first segment of the page's subpath.
      TYPE_BY_SECTION = {
        'sql' => 'SQL Reference',
        'api' => 'Client APIs',
        'guides' => 'How-to Guides',
        'data' => 'Data Import',
        'operations_manual' => 'Operations Manual',
        'dev' => 'Development',
        'internals' => 'Internals',
        'extensions' => 'Extensions',
        'archive' => 'Archive'
      }.freeze

      # Type used when the subpath is not under a known section.
      DEFAULT_TYPE = 'Documentation'

      # Name used when the page has no usable <h1>.
      DEFAULT_NAME = 'DuckDB'

      # @return [String] the stripped text of the first <h1>, or DEFAULT_NAME
      #   when the heading is missing or blank.
      def get_name
        title = at_css('h1')&.content&.strip
        title.nil? || title.empty? ? DEFAULT_NAME : title
      end

      # @return [String] the entry type for the section the page lives in,
      #   chosen from the leading "<section>/" segment of subpath.
      def get_type
        section = subpath.to_s[%r{\A([^/]+)/}, 1]
        TYPE_BY_SECTION.fetch(section, DEFAULT_TYPE)
      end

      # Builds one entry per <h2>/<h3> that carries an id attribute.
      # Headings with a blank name or an empty id are skipped because they
      # cannot produce a usable, linkable entry.
      #
      # @return [Array<Array(String, String, String)>] [name, fragment id, type]
      def additional_entries
        type = get_type # identical for every heading on the page

        css('h2[id]', 'h3[id]').each_with_object([]) do |node, entries|
          # Collapse line breaks/tabs inside the heading into single spaces.
          name = node.content.strip.gsub(/[\r\n\t]/, ' ').squeeze(' ')
          id = node['id'].to_s
          next if name.empty? || id.empty?

          entries << [name, id, type]
        end
      end
    end
  end
end

@ -0,0 +1,69 @@
module Docs
  # Scraper definition for the DuckDB documentation.
  class Duckdb < UrlScraper
    self.name = 'DuckDB'
    self.type = 'duckdb'
    self.root_path = 'index.html'
    self.links = {
      home: 'https://duckdb.org/',
      code: 'https://github.com/duckdb/duckdb'
    }

    html_filters.push 'duckdb/entries', 'duckdb/clean_html'

    options[:container] = '.documentation'

    # NOTE(review): these patterns are unanchored, so they match the word
    # anywhere in a path (e.g. /reference/ also matches "sql/.../reference_x").
    # Confirm that is intended.
    options[:skip_patterns] = [
      /installation/,
      /archive/,
      /reference/
    ]

    # NOTE(review): these paths carry a "docs/" prefix although base_url
    # already ends in "docs/"; if skip paths are resolved relative to base_url
    # they will never match. "docs/api/" also conflicts with the 'Client APIs'
    # type the entries filter assigns to api/ pages. Verify before relying on
    # this list.
    options[:skip] = %w(
      docs/archive/
      docs/installation/
      docs/api/
    )

    options[:attribution] = <<-HTML
      &copy; Copyright 2018&ndash;2024 Stichting DuckDB Foundation<br>
      Licensed under the MIT License.
    HTML

    version '1.1' do
      self.release = '1.1.x'
      # Scrape the public documentation site rather than a developer's local
      # build, so the scraper works on any machine.
      self.base_url = 'https://duckdb.org/docs/'
    end

    # Archived versions, currently disabled. If re-enabled, the extra
    # html_filters.push is likely redundant since 'duckdb/clean_html' is
    # already pushed above - confirm before uncommenting.
    # version '1.0' do
    # self.release = '1.0.x'
    # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    # html_filters.push 'duckdb/clean_html'
    # end
    # version '0.9' do
    # self.release = '0.9.x'
    # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    # html_filters.push 'duckdb/clean_html'
    # end
    # version '0.8' do
    # self.release = '0.8.x'
    # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    # html_filters.push 'duckdb/clean_html'
    # end
    # version '0.7' do
    # self.release = '0.7.x'
    # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    # html_filters.push 'duckdb/clean_html'
    # end

    # Looks up the latest released version from the duckdb/duckdb GitHub tags.
    def get_latest_version(opts)
      get_github_tags('duckdb', 'duckdb', opts)
    end
  end
end
Loading…
Cancel
Save