From eaec6ec43ff3136df6ba83a7abcb55ad28da492f Mon Sep 17 00:00:00 2001 From: Scott Goley Date: Fri, 8 Nov 2024 23:05:14 -0500 Subject: [PATCH 1/3] duckdb docs (v1.1) - scrape v1 --- lib/docs/filters/duckdb/clean_html.rb | 41 ++++++++++++++++ lib/docs/filters/duckdb/entries.rb | 45 +++++++++++++++++ lib/docs/scrapers/duckdb.rb | 69 +++++++++++++++++++++++++++ 3 files changed, 155 insertions(+) create mode 100644 lib/docs/filters/duckdb/clean_html.rb create mode 100644 lib/docs/filters/duckdb/entries.rb create mode 100644 lib/docs/scrapers/duckdb.rb diff --git a/lib/docs/filters/duckdb/clean_html.rb b/lib/docs/filters/duckdb/clean_html.rb new file mode 100644 index 00000000..ae518c7b --- /dev/null +++ b/lib/docs/filters/duckdb/clean_html.rb @@ -0,0 +1,41 @@ +module Docs + class Duckdb + class CleanHtmlFilter < Filter + def call + # First extract the main content + @doc = at_css('main') + return doc if @doc.nil? + + # Remove navigation and header elements + css('.headerline', '.landingmenu', '.search_icon', '#sidebar', '.pagemeta', '.toc_menu', '.section-nav').remove + + # Clean up code blocks + css('pre').each do |node| + # Detect language from class or parent div + if node['class']&.include?('sql') || node.at_css('code.sql') + node['data-language'] = 'sql' + elsif node['class']&.include?('language-sql') + node['data-language'] = 'sql' + end + node.content = node.content.strip + end + + # Remove unnecessary attributes but keep essential ones + css('div, span, p').each do |node| + node.remove_attribute('style') + node.remove_attribute('class') unless node['class'] =~ /highlight/ + end + + # Remove empty elements + css('div, span').each do |node| + node.remove if node.content.strip.empty? + end + + # Remove script tags + css('script').remove + + doc + end + end + end +end \ No newline at end of file diff --git a/lib/docs/filters/duckdb/entries.rb b/lib/docs/filters/duckdb/entries.rb new file mode 100644 index 00000000..ea929022 --- /dev/null +++ b/lib/docs/filters/duckdb/entries.rb @@ -0,0 +1,45 @@ +module Docs + class Duckdb + class EntriesFilter < Docs::EntriesFilter + def get_name + at_css('h1')&.content || 'DuckDB' + end + + def get_type + case subpath + when /\Asql\// + 'SQL Reference' + when /\Aapi\// + 'Client APIs' + when /\Aguides\// + 'How-to Guides' + when /\Adata\// + 'Data Import' + when /\Aoperations_manual\// + 'Operations Manual' + when /\Adev\// + 'Development' + when /\Ainternals\// + 'Internals' + when /\Aextensions\// + 'Extensions' + when /\Aarchive\// + 'Archive' + else + 'Documentation' + end + end + + def additional_entries + entries = [] + css('h2[id]', 'h3[id]').each do |node| + name = node.content.strip + # Clean up the name + name = name.gsub(/[\r\n\t]/, ' ').squeeze(' ') + entries << [name, node['id'], get_type] + end + entries + end + end + end +end \ No newline at end of file diff --git a/lib/docs/scrapers/duckdb.rb b/lib/docs/scrapers/duckdb.rb new file mode 100644 index 00000000..a160b3ef --- /dev/null +++ b/lib/docs/scrapers/duckdb.rb @@ -0,0 +1,69 @@ +module Docs + class Duckdb < UrlScraper + self.name = 'DuckDB' + self.type = 'duckdb' + self.root_path = 'index.html' + self.links = { + home: 'https://duckdb.org/', + code: 'https://github.com/duckdb/duckdb' + } + + html_filters.push 'duckdb/entries', 'duckdb/clean_html' + + options[:container] = '.documentation' + + options[:skip_patterns] = [ + /installation/, + /archive/, + /reference/, + ] + + options[:skip] = %w( + docs/archive/ + docs/installation/ + docs/api/ + ) + + options[:attribution] = <<-HTML + © Copyright 2018–2024 Stichting DuckDB Foundation
+ Licensed under the MIT License. + HTML + + version '1.1' do + self.release = '1.1.x' + self.base_url = 'http://localhost:8000/docs/' + end + + # version '1.0' do + # self.release = '1.0.x' + # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" + + # html_filters.push 'duckdb/clean_html' + # end + + # version '0.9' do + # self.release = '0.9.x' + # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" + + # html_filters.push 'duckdb/clean_html' + # end + + # version '0.8' do + # self.release = '0.8.x' + # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" + + # html_filters.push 'duckdb/clean_html' + # end + + # version '0.7' do + # self.release = '0.7.x' + # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" + + # html_filters.push 'duckdb/clean_html' + # end + + def get_latest_version(opts) + get_github_tags('duckdb', 'duckdb', opts) + end + end +end From 5800216f1b3f8fe94a399373c91ef5938e939c4d Mon Sep 17 00:00:00 2001 From: Scott Goley Date: Thu, 21 Nov 2024 21:49:50 -0500 Subject: [PATCH 2/3] +duckdb icons & source --- public/icons/docs/duckdb/16.png | Bin 0 -> 902 bytes public/icons/docs/duckdb/16@2x.png | Bin 0 -> 1566 bytes public/icons/docs/duckdb/SOURCE | 1 + 3 files changed, 1 insertion(+) create mode 100644 public/icons/docs/duckdb/16.png create mode 100644 public/icons/docs/duckdb/16@2x.png create mode 100644 public/icons/docs/duckdb/SOURCE diff --git a/public/icons/docs/duckdb/16.png b/public/icons/docs/duckdb/16.png new file mode 100644 index 0000000000000000000000000000000000000000..855df72e07579463f4069f526928fcecf748a151 GIT binary patch literal 902 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`jKx9jP7LeL$-D$|SkfJR9T^xl z_H+M9WCij$3p^r=85sBugD~Uq{1qucL9r6oh?3y^w370~qEv=}#LT=BJwMkF1yeo4 z@6F#Q166EEjqptK^weVD0CHFvq!?Kl7=bJ=AeM%*L2l7tWCn{f0ojI(ObmQLItqw0 z+gZTk89+7&Bmgl;Ka57Pl7X3lVFEh?3sBy`$k>2!0mMv@de#LHb0z`VAixAPg$b-O z$kGDHg6c9fFaXIWFHt)Z&px?XxX*I=yK25|t6ceG z?w|R+_x!`!dEfsr{n^-YL;GBZ_oI-~+}XnNN4O26VxJ$G=IT0qhqhg(=tfPh&_DOz z+j0HAaosSrICcLH{U9ZMulj@4-*&3Y2y?!iDAK&hULv79Soz?!zgE@UN0PPH-l}X5 z)zNsAH2=eGPW2a}%?|}~-K{v|_H&u9bM!ZtdDqf%C;Xbi=ioTe&v&NGYMD{i6y3=1 z{1(5^>;0U}kJcPpk+Ma;X`1bZkKEIyFRWC*qOB9JFyr*}$!n|+zTL%flKXB8+ph;t z3Nuz*Z!uXUyK3Dujf@_?H+z|u`>)vJbUD-MCT~HLbtlVL@$NBFx2hEt-dNY63vgE5~-L4Ur z8I~`5_%&Lhil@Kr!A5*q#W&)P>an6gt-4d{@+Rf0sz0 zTn{V#F((_;Vv_`?NPe%R9JT<}&n~R7u{i6`*5y*jTQ9*^;FAb8u2f=E-s$jDr*Pct<@KoQmE zmHJSf0k{4ENea3g#!eU?ni+T<9sk%cFngIjnpfb_5ES&Pcz2nzte4!Hue8zSP#=^R z%xRBW3i&;(tb=XJ0sy-5c7J2@T92>8n+eV1Khng%nIMz<2ri6mN+OD3e^tecJ{ zm#^&D+);stG4)qH#<2r^zdmNl-(mXc%Kl{Dido7+S5D%Q74 zYnLY?Rru0&>^ziaVpKXB8Z)14?$q3s991H-5m$S`V*Z@v*`#9%?i)-h1(P@#zS3#0 z;2e*eeU9m`fOtQJd#BvL%{IkUUCVd=gzu+6<8Ns)Gu1^7=Fl;8j-Awr< z143WL~iXso%!iju!kfy#A`)baZ+#f6B|(Y9O}==OZ1z z`_u6umt*Z;bzHQ2{r$3Z9IAwI-02#`XY1~XG1k#GV%P@OPjP0R^DvF^`;!a<{9d=Z z6Yy#vs=1#PH5uxsaLXp0e0NLExj2)y358H!^dXin5l)X4H->Qf7O$6OnpV6|F&XY1 z72)9vjyy(V7F#KIshH*8ldU_L7#c*6c`I$VR2-|GE=nMB8JffkO;`cjj*8=}%a_H{ z+t(Iw5rS77^{em4hbI@GR9fl-1AAvOKN^ZCSi(pznYq!#lvBXZ@A<=%b3o(!G&ig z4^Ji(n11@>ZpSJ?J!LdEPlY3snz5Ds>}Id)(b&XwM*kTS!|KdPYzTK1{S;*XAGPbSpb`dt1Of`y*V literal 0 HcmV?d00001 diff --git a/public/icons/docs/duckdb/SOURCE b/public/icons/docs/duckdb/SOURCE new file mode 100644 index 00000000..286d1738 --- /dev/null +++ b/public/icons/docs/duckdb/SOURCE @@ -0,0 +1 @@ +https://github.com/duckdb/duckdb/tree/main/logo \ No newline at end of file From deedda316149dbde221788c8142499cdb43933f2 Mon Sep 17 00:00:00 2001 From: Simon Legner Date: Sat, 23 Nov 2024 14:44:32 +0100 Subject: [PATCH 3/3] Update DuckDB documentation (1.1.3) --- assets/javascripts/news.json | 4 +++ lib/docs/filters/duckdb/attribution.rb | 12 ++++++++ lib/docs/filters/duckdb/clean_html.rb | 21 +++++++------ lib/docs/filters/duckdb/entries.rb | 2 +- lib/docs/scrapers/duckdb.rb | 41 +++++--------------------- 5 files changed, 34 insertions(+), 46 deletions(-) create mode 100644 lib/docs/filters/duckdb/attribution.rb diff --git a/assets/javascripts/news.json b/assets/javascripts/news.json index e56bd8dd..052f4918 100644 --- a/assets/javascripts/news.json +++ b/assets/javascripts/news.json @@ -1,4 +1,8 @@ [ + [ + "2024-11-23", + "New documentation: DuckDB" + ], [ "2024-08-20", "New documentation: Linux man pages" diff --git a/lib/docs/filters/duckdb/attribution.rb b/lib/docs/filters/duckdb/attribution.rb new file mode 100644 index 00000000..7591fdb8 --- /dev/null +++ b/lib/docs/filters/duckdb/attribution.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Docs + class Duckdb + class AttributionFilter < Docs::AttributionFilter + def attribution_link + url = current_url.to_s.sub! 'http://localhost:8000', 'https://duckdb.org' + %(#{url}) + end + end + end +end diff --git a/lib/docs/filters/duckdb/clean_html.rb b/lib/docs/filters/duckdb/clean_html.rb index ae518c7b..d739275e 100644 --- a/lib/docs/filters/duckdb/clean_html.rb +++ b/lib/docs/filters/duckdb/clean_html.rb @@ -3,27 +3,26 @@ module Docs class CleanHtmlFilter < Filter def call # First extract the main content - @doc = at_css('main') + @doc = at_css('#main_content_wrap', 'main') return doc if @doc.nil? + doc.prepend_child at_css('.title').remove + at_css('.title').name = 'h1' + # Remove navigation and header elements - css('.headerline', '.landingmenu', '.search_icon', '#sidebar', '.pagemeta', '.toc_menu', '.section-nav').remove + css('.headerline', '.headlinebar', '.landingmenu', '.search_icon', '#sidebar', '.pagemeta', '.toc_menu', '.section-nav').remove # Clean up code blocks - css('pre').each do |node| - # Detect language from class or parent div - if node['class']&.include?('sql') || node.at_css('code.sql') - node['data-language'] = 'sql' - elsif node['class']&.include?('language-sql') - node['data-language'] = 'sql' - end + css('div.highlighter-rouge').each do |node| + node['data-language'] = node['class'][/language-(\w+)/, 1] if node['class'] node.content = node.content.strip + node.name = 'pre' end - # Remove unnecessary attributes but keep essential ones + # Remove unnecessary attributes css('div, span, p').each do |node| node.remove_attribute('style') - node.remove_attribute('class') unless node['class'] =~ /highlight/ + node.remove_attribute('class') end # Remove empty elements diff --git a/lib/docs/filters/duckdb/entries.rb b/lib/docs/filters/duckdb/entries.rb index ea929022..cb98768a 100644 --- a/lib/docs/filters/duckdb/entries.rb +++ b/lib/docs/filters/duckdb/entries.rb @@ -2,7 +2,7 @@ module Docs class Duckdb class EntriesFilter < Docs::EntriesFilter def get_name - at_css('h1')&.content || 'DuckDB' + at_css('h1', '.title').content end def get_type diff --git a/lib/docs/scrapers/duckdb.rb b/lib/docs/scrapers/duckdb.rb index a160b3ef..98fb16ed 100644 --- a/lib/docs/scrapers/duckdb.rb +++ b/lib/docs/scrapers/duckdb.rb @@ -8,7 +8,13 @@ module Docs code: 'https://github.com/duckdb/duckdb' } + # https://duckdb.org/docs/guides/offline-copy.html + # curl -O https://duckdb.org/duckdb-docs.zip; bsdtar xf duckdb-docs.zip; cd duckdb-docs; python -m http.server + self.release = '1.1.3' + self.base_url = 'http://localhost:8000/docs/' + html_filters.push 'duckdb/entries', 'duckdb/clean_html' + text_filters.replace 'attribution', 'duckdb/attribution' options[:container] = '.documentation' @@ -29,41 +35,8 @@ module Docs Licensed under the MIT License. HTML - version '1.1' do - self.release = '1.1.x' - self.base_url = 'http://localhost:8000/docs/' - end - - # version '1.0' do - # self.release = '1.0.x' - # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" - - # html_filters.push 'duckdb/clean_html' - # end - - # version '0.9' do - # self.release = '0.9.x' - # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" - - # html_filters.push 'duckdb/clean_html' - # end - - # version '0.8' do - # self.release = '0.8.x' - # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" - - # html_filters.push 'duckdb/clean_html' - # end - - # version '0.7' do - # self.release = '0.7.x' - # self.base_url = "https://duckdb.org/docs/archive/#{self.version}/" - - # html_filters.push 'duckdb/clean_html' - # end - def get_latest_version(opts) - get_github_tags('duckdb', 'duckdb', opts) + get_github_tags('duckdb', 'duckdb', opts)[0]['name'] end end end