diff --git a/lib/docs/filters/duckdb/clean_html.rb b/lib/docs/filters/duckdb/clean_html.rb
new file mode 100644
index 00000000..ae518c7b
--- /dev/null
+++ b/lib/docs/filters/duckdb/clean_html.rb
@@ -0,0 +1,41 @@
+module Docs
+ class Duckdb
+ class CleanHtmlFilter < Filter
+ def call
+ # First extract the main content
+ @doc = at_css('main')
+ return doc if @doc.nil?
+
+ # Remove navigation and header elements
+ css('.headerline', '.landingmenu', '.search_icon', '#sidebar', '.pagemeta', '.toc_menu', '.section-nav').remove
+
+ # Clean up code blocks
+ css('pre').each do |node|
+ # Detect language from class or parent div
+ if node['class']&.include?('sql') || node.at_css('code.sql')
+ node['data-language'] = 'sql'
+ elsif node['class']&.include?('language-sql')
+ node['data-language'] = 'sql'
+ end
+ node.content = node.content.strip
+ end
+
+ # Remove unnecessary attributes but keep essential ones
+ css('div, span, p').each do |node|
+ node.remove_attribute('style')
+ node.remove_attribute('class') unless node['class'] =~ /highlight/
+ end
+
+ # Remove empty elements
+ css('div, span').each do |node|
+ node.remove if node.content.strip.empty?
+ end
+
+ # Remove script tags
+ css('script').remove
+
+ doc
+ end
+ end
+ end
+end
\ No newline at end of file
diff --git a/lib/docs/filters/duckdb/entries.rb b/lib/docs/filters/duckdb/entries.rb
new file mode 100644
index 00000000..ea929022
--- /dev/null
+++ b/lib/docs/filters/duckdb/entries.rb
@@ -0,0 +1,45 @@
+module Docs
+ class Duckdb
+ class EntriesFilter < Docs::EntriesFilter
+ def get_name
+ at_css('h1')&.content || 'DuckDB'
+ end
+
+ def get_type
+ case subpath
+ when /\Asql\//
+ 'SQL Reference'
+ when /\Aapi\//
+ 'Client APIs'
+ when /\Aguides\//
+ 'How-to Guides'
+ when /\Adata\//
+ 'Data Import'
+ when /\Aoperations_manual\//
+ 'Operations Manual'
+ when /\Adev\//
+ 'Development'
+ when /\Ainternals\//
+ 'Internals'
+ when /\Aextensions\//
+ 'Extensions'
+ when /\Aarchive\//
+ 'Archive'
+ else
+ 'Documentation'
+ end
+ end
+
+ def additional_entries
+ entries = []
+ css('h2[id]', 'h3[id]').each do |node|
+ name = node.content.strip
+ # Clean up the name
+ name = name.gsub(/[\r\n\t]/, ' ').squeeze(' ')
+ entries << [name, node['id'], get_type]
+ end
+ entries
+ end
+ end
+ end
+end
\ No newline at end of file
diff --git a/lib/docs/scrapers/duckdb.rb b/lib/docs/scrapers/duckdb.rb
new file mode 100644
index 00000000..a160b3ef
--- /dev/null
+++ b/lib/docs/scrapers/duckdb.rb
@@ -0,0 +1,69 @@
+module Docs
+ class Duckdb < UrlScraper
+ self.name = 'DuckDB'
+ self.type = 'duckdb'
+ self.root_path = 'index.html'
+ self.links = {
+ home: 'https://duckdb.org/',
+ code: 'https://github.com/duckdb/duckdb'
+ }
+
+ html_filters.push 'duckdb/entries', 'duckdb/clean_html'
+
+ options[:container] = '.documentation'
+
+ options[:skip_patterns] = [
+ /installation/,
+ /archive/,
+ /reference/,
+ ]
+
+ options[:skip] = %w(
+ docs/archive/
+ docs/installation/
+ docs/api/
+ )
+
+ options[:attribution] = <<-HTML
+ © Copyright 2018–2024 Stichting DuckDB Foundation
+ Licensed under the MIT License.
+ HTML
+
+ version '1.1' do
+ self.release = '1.1.x'
+ self.base_url = 'http://localhost:8000/docs/'
+ end
+
+ # version '1.0' do
+ # self.release = '1.0.x'
+ # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+ # html_filters.push 'duckdb/clean_html'
+ # end
+
+ # version '0.9' do
+ # self.release = '0.9.x'
+ # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+ # html_filters.push 'duckdb/clean_html'
+ # end
+
+ # version '0.8' do
+ # self.release = '0.8.x'
+ # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+ # html_filters.push 'duckdb/clean_html'
+ # end
+
+ # version '0.7' do
+ # self.release = '0.7.x'
+ # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+ # html_filters.push 'duckdb/clean_html'
+ # end
+
+ def get_latest_version(opts)
+ get_github_tags('duckdb', 'duckdb', opts)
+ end
+ end
+end