From a754d50317947e5bdb85434eaf4521a708ae5bb9 Mon Sep 17 00:00:00 2001
From: Enoc
Date: Wed, 1 Sep 2021 01:19:08 -0600
Subject: [PATCH 1/5] Initial migration from cpp filescraper to urlscraper
---
lib/docs/filters/cpp20/clean_html.rb | 9 +++
lib/docs/filters/cpp20/entries.rb | 82 ++++++++++++++++++++++++++++
lib/docs/scrapers/cpp20.rb | 40 ++++++++++++++
3 files changed, 131 insertions(+)
create mode 100644 lib/docs/filters/cpp20/clean_html.rb
create mode 100644 lib/docs/filters/cpp20/entries.rb
create mode 100644 lib/docs/scrapers/cpp20.rb
diff --git a/lib/docs/filters/cpp20/clean_html.rb b/lib/docs/filters/cpp20/clean_html.rb
new file mode 100644
index 00000000..4328e002
--- /dev/null
+++ b/lib/docs/filters/cpp20/clean_html.rb
@@ -0,0 +1,9 @@
+module Docs
+ class Cpp20
+ class CleanHtmlFilter < Filter
+ def call
+ doc
+ end
+ end
+ end
+end
diff --git a/lib/docs/filters/cpp20/entries.rb b/lib/docs/filters/cpp20/entries.rb
new file mode 100644
index 00000000..8a5aad22
--- /dev/null
+++ b/lib/docs/filters/cpp20/entries.rb
@@ -0,0 +1,82 @@
+module Docs
+ class Cpp20
+ class EntriesFilter < Docs::EntriesFilter
+ @@duplicate_names = []
+
+ REPLACE_NAMES = {
+ 'Error directive' => '#error directive',
+ 'Filename and line information' => '#line directive',
+ 'Implementation defined behavior control' => '#pragma directive',
+ 'Replacing text macros' => '#define directive',
+ 'Source file inclusion' => '#include directive' }
+
+ def get_name
+ name = at_css('#firstHeading').content.strip
+ name = format_name(name)
+ name = name.split(',').first
+ name
+ end
+
+ def get_type
+ if at_css('#firstHeading').content.include?('C++ keyword')
+ 'Keywords'
+ elsif subpath.start_with?('experimental')
+ 'Experimental libraries'
+ elsif subpath.start_with?('language/')
+ 'Language'
+ elsif subpath.start_with?('freestanding')
+ 'Utilities'
+ elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
+ type.strip!
+ type.remove! ' library'
+ type.remove! ' utilities'
+ type.remove! 'C++ '
+ type.capitalize!
+ type
+ end
+ end
+
+ def additional_entries
+ return [] if root_page? || self.name.start_with?('operators')
+ names = at_css('#firstHeading').content.remove(%r{\(.+?\)}).split(', ')[1..-1]
+ names.each(&:strip!).reject! do |name|
+ name.size <= 2 || name == '...' || name =~ /\A[<>]/ || name.start_with?('operator')
+ end
+ names.map { |name| [format_name(name)] }
+ end
+
+ def format_name(name)
+ name.remove! 'C++ concepts: '
+ name.remove! 'C++ keywords: '
+ name.remove! 'C++ ' unless name == 'C++ language'
+ name.remove! %r{\s\(.+\)}
+
+ name.sub! %r{\AStandard library header <(.+)>\z}, '\1'
+ name.sub! %r{(<[^>]+>)}, ''
+
+ if name.include?('operator') && name.include?(',')
+ name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators')
+ name.sub! ' ', ' '
+ name << ')' unless name.last == ')' || name.exclude?('(')
+ name.sub! '()', ''
+ name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50
+ end
+
+ REPLACE_NAMES[name] || name
+ end
+
+ def entries
+ entries = []
+
+ # avoid duplicate pages
+ if !(@@duplicate_names.include?(name))
+ @@duplicate_names.push(name)
+ entries << default_entry if root_page? || include_default_entry?
+ entries.concat(additional_entries)
+ build_entries(entries)
+ end
+ end
+
+ end
+ end
+end
diff --git a/lib/docs/scrapers/cpp20.rb b/lib/docs/scrapers/cpp20.rb
new file mode 100644
index 00000000..14034a43
--- /dev/null
+++ b/lib/docs/scrapers/cpp20.rb
@@ -0,0 +1,40 @@
+module Docs
+ class Cpp20 < UrlScraper
+ self.name = 'C++20'
+ self.slug = 'cpp20'
+ self.type = 'c'
+ self.base_url = 'https://en.cppreference.com/w/cpp/'
+ self.root_path = 'header'
+
+ html_filters.insert_before 'clean_html', 'c/fix_code'
+ html_filters.push 'cpp20/entries', 'c/clean_html', 'title'
+
+ options[:decode_and_clean_paths] = true
+ options[:container] = '#content'
+ options[:title] = false
+ options[:root_title] = 'C++ Programming Language'
+
+ options[:skip] = %w(
+ language/extending_std.html
+ language/history.html
+ regex/ecmascript.html
+ regex/regex_token_iterator/operator_cmp.html
+ )
+
+ options[:skip_patterns] = [/experimental/]
+
+ options[:attribution] = <<-HTML
+ © cppreference.com
+ Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
+ HTML
+
+ # Same as get_latest_version in lib/docs/scrapers/c.rb
+ def get_latest_version(opts)
+ doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
+ link = doc.at_css('a[title^="File:"]')
+ date = link.content.scan(/(\d+)\./)[0][0]
+ DateTime.strptime(date, '%Y%m%d').to_time.to_i
+ end
+
+ end
+end
From e17bc84ea4c2a675fb16282339610dcab1506ce0 Mon Sep 17 00:00:00 2001
From: Enoc
Date: Thu, 9 Sep 2021 00:03:46 -0600
Subject: [PATCH 2/5] migrate cpp scraper from filescraper to urlscraper
---
lib/docs/filters/cpp/entries.rb | 20 ++-
lib/docs/filters/cpp/fix_urls.rb | 11 --
lib/docs/filters/cpp20/clean_html.rb | 9 --
lib/docs/filters/cpp20/entries.rb | 82 -------------
lib/docs/filters/cppref/clean_html.rb | 116 ++++++++++++++++++
lib/docs/filters/cppref/fix_code.rb | 21 ++++
lib/docs/scrapers/cpp.rb | 52 --------
lib/docs/scrapers/{cpp20.rb => cppref/cpp.rb} | 20 +--
lib/docs/scrapers/cppref/cppref.rb | 29 +++++
9 files changed, 189 insertions(+), 171 deletions(-)
delete mode 100644 lib/docs/filters/cpp/fix_urls.rb
delete mode 100644 lib/docs/filters/cpp20/clean_html.rb
delete mode 100644 lib/docs/filters/cpp20/entries.rb
create mode 100644 lib/docs/filters/cppref/clean_html.rb
create mode 100644 lib/docs/filters/cppref/fix_code.rb
delete mode 100644 lib/docs/scrapers/cpp.rb
rename lib/docs/scrapers/{cpp20.rb => cppref/cpp.rb} (54%)
create mode 100644 lib/docs/scrapers/cppref/cppref.rb
diff --git a/lib/docs/filters/cpp/entries.rb b/lib/docs/filters/cpp/entries.rb
index b0700139..d13526d3 100644
--- a/lib/docs/filters/cpp/entries.rb
+++ b/lib/docs/filters/cpp/entries.rb
@@ -1,6 +1,8 @@
module Docs
class Cpp
class EntriesFilter < Docs::EntriesFilter
+ @@duplicate_names = []
+
REPLACE_NAMES = {
'Error directive' => '#error directive',
'Filename and line information' => '#line directive',
@@ -11,7 +13,8 @@ module Docs
def get_name
name = at_css('#firstHeading').content.strip
name = format_name(name)
- name.split(',').first
+ name = name.split(',').first
+ name
end
def get_type
@@ -61,6 +64,21 @@ module Docs
REPLACE_NAMES[name] || name
end
+
+ # Avoid duplicate pages. cppreference groups several functions that behave
+ # similarly but have different names on one shared page, so only the first
+ # page seen with a given name produces entries; later ones are skipped.
+ def entries
+ entries = []
+
+ if !(@@duplicate_names.include?(name))
+ @@duplicate_names.push(name)
+ entries << default_entry if root_page? || include_default_entry?
+ entries.concat(additional_entries)
+ build_entries(entries)
+ end
+ end
+
end
end
end
diff --git a/lib/docs/filters/cpp/fix_urls.rb b/lib/docs/filters/cpp/fix_urls.rb
deleted file mode 100644
index 8e8d67c6..00000000
--- a/lib/docs/filters/cpp/fix_urls.rb
+++ /dev/null
@@ -1,11 +0,0 @@
-module Docs
- class Cpp
- class FixUrlsFilter < Filter
- def call
- html.gsub! File.join(Cpp.base_url, Cpp.root_path), Cpp.base_url[0..-2]
- html.gsub! %r{#{Cpp.base_url}([^"']+?)\.html}, "#{Cpp.base_url}\\1"
- html
- end
- end
- end
-end
diff --git a/lib/docs/filters/cpp20/clean_html.rb b/lib/docs/filters/cpp20/clean_html.rb
deleted file mode 100644
index 4328e002..00000000
--- a/lib/docs/filters/cpp20/clean_html.rb
+++ /dev/null
@@ -1,9 +0,0 @@
-module Docs
- class Cpp20
- class CleanHtmlFilter < Filter
- def call
- doc
- end
- end
- end
-end
diff --git a/lib/docs/filters/cpp20/entries.rb b/lib/docs/filters/cpp20/entries.rb
deleted file mode 100644
index 8a5aad22..00000000
--- a/lib/docs/filters/cpp20/entries.rb
+++ /dev/null
@@ -1,82 +0,0 @@
-module Docs
- class Cpp20
- class EntriesFilter < Docs::EntriesFilter
- @@duplicate_names = []
-
- REPLACE_NAMES = {
- 'Error directive' => '#error directive',
- 'Filename and line information' => '#line directive',
- 'Implementation defined behavior control' => '#pragma directive',
- 'Replacing text macros' => '#define directive',
- 'Source file inclusion' => '#include directive' }
-
- def get_name
- name = at_css('#firstHeading').content.strip
- name = format_name(name)
- name = name.split(',').first
- name
- end
-
- def get_type
- if at_css('#firstHeading').content.include?('C++ keyword')
- 'Keywords'
- elsif subpath.start_with?('experimental')
- 'Experimental libraries'
- elsif subpath.start_with?('language/')
- 'Language'
- elsif subpath.start_with?('freestanding')
- 'Utilities'
- elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
- type.strip!
- type.remove! ' library'
- type.remove! ' utilities'
- type.remove! 'C++ '
- type.capitalize!
- type
- end
- end
-
- def additional_entries
- return [] if root_page? || self.name.start_with?('operators')
- names = at_css('#firstHeading').content.remove(%r{\(.+?\)}).split(', ')[1..-1]
- names.each(&:strip!).reject! do |name|
- name.size <= 2 || name == '...' || name =~ /\A[<>]/ || name.start_with?('operator')
- end
- names.map { |name| [format_name(name)] }
- end
-
- def format_name(name)
- name.remove! 'C++ concepts: '
- name.remove! 'C++ keywords: '
- name.remove! 'C++ ' unless name == 'C++ language'
- name.remove! %r{\s\(.+\)}
-
- name.sub! %r{\AStandard library header <(.+)>\z}, '\1'
- name.sub! %r{(<[^>]+>)}, ''
-
- if name.include?('operator') && name.include?(',')
- name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators')
- name.sub! ' ', ' '
- name << ')' unless name.last == ')' || name.exclude?('(')
- name.sub! '()', ''
- name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50
- end
-
- REPLACE_NAMES[name] || name
- end
-
- def entries
- entries = []
-
- # avoid duplicate pages
- if !(@@duplicate_names.include?(name))
- @@duplicate_names.push(name)
- entries << default_entry if root_page? || include_default_entry?
- entries.concat(additional_entries)
- build_entries(entries)
- end
- end
-
- end
- end
-end
diff --git a/lib/docs/filters/cppref/clean_html.rb b/lib/docs/filters/cppref/clean_html.rb
new file mode 100644
index 00000000..ced30f4d
--- /dev/null
+++ b/lib/docs/filters/cppref/clean_html.rb
@@ -0,0 +1,116 @@
+module Docs
+ class Cppref
+ class CleanHtmlFilter < Filter
+ def call
+ css('h1').remove if root_page?
+
+ css('.t-dcl-rev-aux td[rowspan]').each do |node|
+ rowspan = node['rowspan'].to_i
+ node['rowspan'] = node.ancestors('tbody').css('tr').length if rowspan > 3
+ end
+
+ css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc',
+ '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink',
+ '.t-sdsc-sep:first-child:last-child', '.t-example-live-link',
+ '.t-dcl-rev-num > .t-dcl-rev-aux ~ tr:not(.t-dcl-rev-aux) > td:nth-child(2)').remove
+
+ css('#bodyContent', '.mw-content-ltr', 'span[style]', 'div[class^="t-ref"]', '.t-image',
+ 'th > div', 'td > div', '.t-dsc-see', '.mainpagediv', 'code > b', 'tbody').each do |node|
+ node.before(node.children).remove
+ end
+
+ css('div > ul').each do |node|
+ node.parent.before(node.parent.children).remove
+ end
+
+ css('dl > dd:first-child:last-child > ul:first-child:last-child').each do |node|
+ dl = node.parent.parent
+ if dl.previous_element && dl.previous_element.name == 'ul'
+ dl.previous_element << node
+ dl.remove
+ else
+ dl.before(node).remove
+ end
+ end
+
+ css('dl > dd:first-child:last-child').each do |node|
+ node.parent.before(node.children).remove
+ end
+
+ css('ul').each do |node|
+ while node.next_element && node.next_element.name == 'ul'
+ node << node.next_element.children
+ node.next_element.remove
+ end
+ end
+
+ css('h2 > span[id]', 'h3 > span[id]', 'h4 > span[id]', 'h5 > span[id]', 'h6 > span[id]').each do |node|
+ node.parent['id'] = node['id']
+ node.before(node.children).remove
+ end
+
+ css('table[style]', 'th[style]', 'td[style]').remove_attr('style')
+ css('table[cellpadding]').remove_attr('cellpadding')
+
+ css('.t-dsc-hitem > td', '.t-dsc-header > td').each do |node|
+ node.name = 'th'
+ node.content = ' ' if node.content.empty?
+ end
+
+ css('tt', 'span > span.source-cpp', 'span.t-c', 'span.t-lc', 'span.t-dsc-see-tt').each do |node|
+ node.name = 'code'
+ node.remove_attribute('class')
+ node.content = node.content unless node.at_css('a')
+ end
+
+ css('div > span.source-cpp').each do |node|
+ node.name = 'pre'
+ node.inner_html = node.inner_html.gsub('
', "\n")
+ node.content = node.content
+ end
+
+ css('div > a > img[alt="About this image"]').each do |node|
+ node.parent.parent.remove
+ end
+
+ css('area[href]').each do |node|
+ node['href'] = node['href'].remove('.html')
+ end
+
+ css('p').each do |node|
+ while node.next && (node.next.text? || node.next.name == 'a' || node.next.name == 'code')
+ node << node.next
+ end
+ node.inner_html = node.inner_html.strip
+ node << '.' if node.content =~ /[a-zA-Z0-9\)]\z/
+ node.remove if node.content.blank? && !node.at_css('img')
+ end
+
+ css('pre').each do |node|
+ node['data-language'] = if node['class'].try(:include?, 'cpp') || node.parent['class'].try(:include?, 'cpp')
+ 'cpp'
+ else
+ 'c'
+ end
+ node.remove_attribute('class')
+ node.content = node.content.gsub("\t", ' ' * 8)
+ end
+
+ css('code code', '.mw-geshi').each do |node|
+ node.before(node.children).remove
+ end
+
+ css('h1 ~ .fmbox').each do |node|
+ node.name = 'div'
+ node.content = node.content
+ end
+
+ css('img').each do |node|
+ node['src'] = node['src'].sub! %r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg'
+ end
+
+ doc
+ end
+ end
+ end
+end
diff --git a/lib/docs/filters/cppref/fix_code.rb b/lib/docs/filters/cppref/fix_code.rb
new file mode 100644
index 00000000..c80a7426
--- /dev/null
+++ b/lib/docs/filters/cppref/fix_code.rb
@@ -0,0 +1,21 @@
+module Docs
+ class Cppref
+ class FixCodeFilter < Filter
+ def call
+ css('div > span.source-c', 'div > span.source-cpp').each do |node|
+ node.inner_html = node.inner_html.gsub(/
\n?/, "\n").gsub("\n
\n", "\n")
+ node.parent.name = 'pre'
+ node.parent['class'] = node['class']
+ node.parent.content = node.content
+ end
+
+ nbsp = Nokogiri::HTML(' ').text
+ css('pre').each do |node|
+ node.content = node.content.gsub(nbsp, ' ')
+ end
+
+ doc
+ end
+ end
+ end
+end
diff --git a/lib/docs/scrapers/cpp.rb b/lib/docs/scrapers/cpp.rb
deleted file mode 100644
index f0ad2b16..00000000
--- a/lib/docs/scrapers/cpp.rb
+++ /dev/null
@@ -1,52 +0,0 @@
-module Docs
- class Cpp < FileScraper
- self.name = 'C++'
- self.slug = 'cpp'
- self.type = 'c'
- self.base_url = 'http://en.cppreference.com/w/cpp/'
- self.root_path = 'header.html'
-
- html_filters.insert_before 'clean_html', 'c/fix_code'
- html_filters.push 'cpp/entries', 'c/clean_html', 'title'
- text_filters.push 'cpp/fix_urls'
-
- options[:decode_and_clean_paths] = true
- options[:container] = '#content'
- options[:title] = false
- options[:root_title] = 'C++ Programming Language'
- options[:skip] = %w(
- language/extending_std.html
- language/history.html
- regex/ecmascript.html
- regex/regex_token_iterator/operator_cmp.html
- )
- options[:skip_patterns] = [/experimental/]
- options[:only_patterns] = [/\.html\z/]
-
- options[:fix_urls] = ->(url) do
- url = CGI.unescape(url)
- url.sub! %r{\A.+/http%3A/}, 'http://'
- url.sub! 'http://en.cppreference.com/upload.cppreference.com', 'http://upload.cppreference.com'
- url
- end
-
- options[:attribution] = <<-HTML
- © cppreference.com
- Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
- HTML
-
- # Same as get_latest_version in lib/docs/scrapers/c.rb
- def get_latest_version(opts)
- doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
- link = doc.at_css('a[title^="File:"]')
- date = link.content.scan(/(\d+)\./)[0][0]
- DateTime.strptime(date, '%Y%m%d').to_time.to_i
- end
-
- private
-
- def file_path_for(*)
- URI.unescape(super)
- end
- end
-end
diff --git a/lib/docs/scrapers/cpp20.rb b/lib/docs/scrapers/cppref/cpp.rb
similarity index 54%
rename from lib/docs/scrapers/cpp20.rb
rename to lib/docs/scrapers/cppref/cpp.rb
index 14034a43..bfc87c62 100644
--- a/lib/docs/scrapers/cpp20.rb
+++ b/lib/docs/scrapers/cppref/cpp.rb
@@ -1,17 +1,12 @@
module Docs
- class Cpp20 < UrlScraper
- self.name = 'C++20'
- self.slug = 'cpp20'
+ class Cpp < Cppref
+ self.name = 'C++'
+ self.slug = 'cpp'
self.type = 'c'
self.base_url = 'https://en.cppreference.com/w/cpp/'
- self.root_path = 'header'
- html_filters.insert_before 'clean_html', 'c/fix_code'
- html_filters.push 'cpp20/entries', 'c/clean_html', 'title'
+ html_filters.insert_before 'cppref/clean_html', 'cpp/entries'
- options[:decode_and_clean_paths] = true
- options[:container] = '#content'
- options[:title] = false
options[:root_title] = 'C++ Programming Language'
options[:skip] = %w(
@@ -21,13 +16,6 @@ module Docs
regex/regex_token_iterator/operator_cmp.html
)
- options[:skip_patterns] = [/experimental/]
-
- options[:attribution] = <<-HTML
- © cppreference.com
- Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
- HTML
-
# Same as get_latest_version in lib/docs/scrapers/c.rb
def get_latest_version(opts)
doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
diff --git a/lib/docs/scrapers/cppref/cppref.rb b/lib/docs/scrapers/cppref/cppref.rb
new file mode 100644
index 00000000..b91751ef
--- /dev/null
+++ b/lib/docs/scrapers/cppref/cppref.rb
@@ -0,0 +1,29 @@
+module Docs
+ class Cppref < UrlScraper
+ self.abstract = true
+ self.type = 'cppref'
+ self.root_path = 'header'
+
+ html_filters.insert_before 'clean_html', 'cppref/fix_code'
+ html_filters.push 'cppref/clean_html', 'title'
+ # 'cpp20/entries',
+ options[:decode_and_clean_paths] = true
+ options[:container] = '#content'
+ options[:title] = false
+ options[:skip] = %w(language/history.html)
+
+ options[:skip_patterns] = [
+ /experimental/
+ ]
+
+ options[:attribution] = <<-HTML
+ © cppreference.com
+ Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
+ HTML
+
+ # def get_latest_version
+
+ # end
+
+ end
+end
From fdfcf3d9174d7e386af42412c3a2394d6e1eafca Mon Sep 17 00:00:00 2001
From: Enoc
Date: Fri, 10 Sep 2021 11:14:54 -0600
Subject: [PATCH 3/5] Migrate c scraper from filescraper to urlscraper
---
assets/stylesheets/application.css.scss | 2 +-
.../pages/{_c.scss => _cppref.scss} | 0
lib/docs/filters/c/clean_html.rb | 116 ------------------
lib/docs/filters/c/entries.rb | 3 +
lib/docs/filters/c/fix_code.rb | 21 ----
lib/docs/filters/c/fix_urls.rb | 11 --
lib/docs/scrapers/c.rb | 42 -------
lib/docs/scrapers/cppref/c.rb | 12 ++
lib/docs/scrapers/cppref/cpp.rb | 9 --
lib/docs/scrapers/cppref/cppref.rb | 12 +-
10 files changed, 24 insertions(+), 204 deletions(-)
rename assets/stylesheets/pages/{_c.scss => _cppref.scss} (100%)
delete mode 100644 lib/docs/filters/c/clean_html.rb
delete mode 100644 lib/docs/filters/c/fix_code.rb
delete mode 100644 lib/docs/filters/c/fix_urls.rb
delete mode 100644 lib/docs/scrapers/c.rb
create mode 100644 lib/docs/scrapers/cppref/c.rb
diff --git a/assets/stylesheets/application.css.scss b/assets/stylesheets/application.css.scss
index 0243afeb..542e1510 100644
--- a/assets/stylesheets/application.css.scss
+++ b/assets/stylesheets/application.css.scss
@@ -39,7 +39,7 @@
'pages/async',
'pages/bash',
'pages/bootstrap',
- 'pages/c',
+ 'pages/cppref',
'pages/cakephp',
'pages/clojure',
'pages/codeception',
diff --git a/assets/stylesheets/pages/_c.scss b/assets/stylesheets/pages/_cppref.scss
similarity index 100%
rename from assets/stylesheets/pages/_c.scss
rename to assets/stylesheets/pages/_cppref.scss
diff --git a/lib/docs/filters/c/clean_html.rb b/lib/docs/filters/c/clean_html.rb
deleted file mode 100644
index 0665a941..00000000
--- a/lib/docs/filters/c/clean_html.rb
+++ /dev/null
@@ -1,116 +0,0 @@
-module Docs
- class C
- class CleanHtmlFilter < Filter
- def call
- css('h1').remove if root_page?
-
- css('.t-dcl-rev-aux td[rowspan]').each do |node|
- rowspan = node['rowspan'].to_i
- node['rowspan'] = node.ancestors('tbody').css('tr').length if rowspan > 3
- end
-
- css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc',
- '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink',
- '.t-sdsc-sep:first-child:last-child', '.t-example-live-link',
- '.t-dcl-rev-num > .t-dcl-rev-aux ~ tr:not(.t-dcl-rev-aux) > td:nth-child(2)').remove
-
- css('#bodyContent', '.mw-content-ltr', 'span[style]', 'div[class^="t-ref"]', '.t-image',
- 'th > div', 'td > div', '.t-dsc-see', '.mainpagediv', 'code > b', 'tbody').each do |node|
- node.before(node.children).remove
- end
-
- css('div > ul').each do |node|
- node.parent.before(node.parent.children).remove
- end
-
- css('dl > dd:first-child:last-child > ul:first-child:last-child').each do |node|
- dl = node.parent.parent
- if dl.previous_element && dl.previous_element.name == 'ul'
- dl.previous_element << node
- dl.remove
- else
- dl.before(node).remove
- end
- end
-
- css('dl > dd:first-child:last-child').each do |node|
- node.parent.before(node.children).remove
- end
-
- css('ul').each do |node|
- while node.next_element && node.next_element.name == 'ul'
- node << node.next_element.children
- node.next_element.remove
- end
- end
-
- css('h2 > span[id]', 'h3 > span[id]', 'h4 > span[id]', 'h5 > span[id]', 'h6 > span[id]').each do |node|
- node.parent['id'] = node['id']
- node.before(node.children).remove
- end
-
- css('table[style]', 'th[style]', 'td[style]').remove_attr('style')
- css('table[cellpadding]').remove_attr('cellpadding')
-
- css('.t-dsc-hitem > td', '.t-dsc-header > td').each do |node|
- node.name = 'th'
- node.content = ' ' if node.content.empty?
- end
-
- css('tt', 'span > span.source-cpp', 'span.t-c', 'span.t-lc', 'span.t-dsc-see-tt').each do |node|
- node.name = 'code'
- node.remove_attribute('class')
- node.content = node.content unless node.at_css('a')
- end
-
- css('div > span.source-cpp').each do |node|
- node.name = 'pre'
- node.inner_html = node.inner_html.gsub('
', "\n")
- node.content = node.content
- end
-
- css('div > a > img[alt="About this image"]').each do |node|
- node.parent.parent.remove
- end
-
- css('area[href]').each do |node|
- node['href'] = node['href'].remove('.html')
- end
-
- css('p').each do |node|
- while node.next && (node.next.text? || node.next.name == 'a' || node.next.name == 'code')
- node << node.next
- end
- node.inner_html = node.inner_html.strip
- node << '.' if node.content =~ /[a-zA-Z0-9\)]\z/
- node.remove if node.content.blank? && !node.at_css('img')
- end
-
- css('pre').each do |node|
- node['data-language'] = if node['class'].try(:include?, 'cpp') || node.parent['class'].try(:include?, 'cpp')
- 'cpp'
- else
- 'c'
- end
- node.remove_attribute('class')
- node.content = node.content.gsub("\t", ' ' * 8)
- end
-
- css('code code', '.mw-geshi').each do |node|
- node.before(node.children).remove
- end
-
- css('h1 ~ .fmbox').each do |node|
- node.name = 'div'
- node.content = node.content
- end
-
- css('img').each do |node|
- node['src'] = node['src'].sub! %r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg'
- end
-
- doc
- end
- end
- end
-end
diff --git a/lib/docs/filters/c/entries.rb b/lib/docs/filters/c/entries.rb
index 6c9f1565..63cfec61 100644
--- a/lib/docs/filters/c/entries.rb
+++ b/lib/docs/filters/c/entries.rb
@@ -22,6 +22,9 @@ module Docs
end
def get_type
+
+ return "C keywords" if slug =~ /keyword/
+
type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
type.strip!
type.remove! ' library'
diff --git a/lib/docs/filters/c/fix_code.rb b/lib/docs/filters/c/fix_code.rb
deleted file mode 100644
index a7e764f0..00000000
--- a/lib/docs/filters/c/fix_code.rb
+++ /dev/null
@@ -1,21 +0,0 @@
-module Docs
- class C
- class FixCodeFilter < Filter
- def call
- css('div > span.source-c', 'div > span.source-cpp').each do |node|
- node.inner_html = node.inner_html.gsub(/
\n?/, "\n").gsub("\n\n", "\n")
- node.parent.name = 'pre'
- node.parent['class'] = node['class']
- node.parent.content = node.content
- end
-
- nbsp = Nokogiri::HTML(' ').text
- css('pre').each do |node|
- node.content = node.content.gsub(nbsp, ' ')
- end
-
- doc
- end
- end
- end
-end
diff --git a/lib/docs/filters/c/fix_urls.rb b/lib/docs/filters/c/fix_urls.rb
deleted file mode 100644
index a7d15d94..00000000
--- a/lib/docs/filters/c/fix_urls.rb
+++ /dev/null
@@ -1,11 +0,0 @@
-module Docs
- class C
- class FixUrlsFilter < Filter
- def call
- html.gsub! File.join(C.base_url, C.root_path), C.base_url[0..-2]
- html.gsub! %r{#{C.base_url}([^"']+?)\.html}, "#{C.base_url}\\1"
- html
- end
- end
- end
-end
diff --git a/lib/docs/scrapers/c.rb b/lib/docs/scrapers/c.rb
deleted file mode 100644
index ec99f704..00000000
--- a/lib/docs/scrapers/c.rb
+++ /dev/null
@@ -1,42 +0,0 @@
-module Docs
- class C < FileScraper
- self.type = 'c'
- self.base_url = 'http://en.cppreference.com/w/c/'
- self.root_path = 'header.html'
-
- html_filters.insert_before 'clean_html', 'c/fix_code'
- html_filters.push 'c/entries', 'c/clean_html', 'title'
- text_filters.push 'c/fix_urls'
-
- options[:decode_and_clean_paths] = true
- options[:container] = '#content'
- options[:title] = false
- options[:root_title] = 'C Programming Language'
- options[:skip] = %w(language/history.html)
- options[:skip_patterns] = [/experimental/]
-
- options[:fix_urls] = ->(url) do
- url.sub! %r{\A.+/http%3A/}, 'http://'
- url.sub! 'http://en.cppreference.com/upload.cppreference.com', 'http://upload.cppreference.com'
- url
- end
-
- options[:attribution] = <<-HTML
- © cppreference.com
- Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
- HTML
-
- def get_latest_version(opts)
- doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
- link = doc.at_css('a[title^="File:"]')
- date = link.content.scan(/(\d+)\./)[0][0]
- DateTime.strptime(date, '%Y%m%d').to_time.to_i
- end
-
- private
-
- def file_path_for(*)
- URI.unescape(super)
- end
- end
-end
diff --git a/lib/docs/scrapers/cppref/c.rb b/lib/docs/scrapers/cppref/c.rb
new file mode 100644
index 00000000..faa48fb3
--- /dev/null
+++ b/lib/docs/scrapers/cppref/c.rb
@@ -0,0 +1,12 @@
+module Docs
+ class C < Cppref
+ self.name = 'c'
+ self.slug = 'c'
+ self.base_url = 'https://en.cppreference.com/w/c/'
+
+ html_filters.insert_before 'cppref/clean_html', 'c/entries'
+
+ options[:root_title] = 'C Programming Language'
+
+ end
+end
diff --git a/lib/docs/scrapers/cppref/cpp.rb b/lib/docs/scrapers/cppref/cpp.rb
index bfc87c62..4f259729 100644
--- a/lib/docs/scrapers/cppref/cpp.rb
+++ b/lib/docs/scrapers/cppref/cpp.rb
@@ -2,7 +2,6 @@ module Docs
class Cpp < Cppref
self.name = 'C++'
self.slug = 'cpp'
- self.type = 'c'
self.base_url = 'https://en.cppreference.com/w/cpp/'
html_filters.insert_before 'cppref/clean_html', 'cpp/entries'
@@ -16,13 +15,5 @@ module Docs
regex/regex_token_iterator/operator_cmp.html
)
- # Same as get_latest_version in lib/docs/scrapers/c.rb
- def get_latest_version(opts)
- doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
- link = doc.at_css('a[title^="File:"]')
- date = link.content.scan(/(\d+)\./)[0][0]
- DateTime.strptime(date, '%Y%m%d').to_time.to_i
- end
-
end
end
diff --git a/lib/docs/scrapers/cppref/cppref.rb b/lib/docs/scrapers/cppref/cppref.rb
index b91751ef..85bbc771 100644
--- a/lib/docs/scrapers/cppref/cppref.rb
+++ b/lib/docs/scrapers/cppref/cppref.rb
@@ -6,7 +6,7 @@ module Docs
html_filters.insert_before 'clean_html', 'cppref/fix_code'
html_filters.push 'cppref/clean_html', 'title'
- # 'cpp20/entries',
+
options[:decode_and_clean_paths] = true
options[:container] = '#content'
options[:title] = false
@@ -21,9 +21,13 @@ module Docs
Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
HTML
- # def get_latest_version
-
- # end
+ # Use the root ('header') page's last-modified date to detect changes
+ def get_latest_version(opts)
+ doc = fetch_doc(self.base_url + self.root_path, opts)
+ date = doc.at_css('#footer-info-lastmod').content
+ date = date.match(/[[:digit:]]{1,2} .* [[:digit:]]{4}/).to_s
+ date = DateTime.strptime(date, '%e %B %Y').to_time.to_i
+ end
end
end
From ef449857d534403caa2656eec5744b3e8e1830cf Mon Sep 17 00:00:00 2001
From: Enoc
Date: Wed, 13 Oct 2021 00:14:41 -0600
Subject: [PATCH 4/5] C/Cpp: improve format of fractions
---
lib/docs/filters/cppref/clean_html.rb | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/lib/docs/filters/cppref/clean_html.rb b/lib/docs/filters/cppref/clean_html.rb
index ced30f4d..07acfd45 100644
--- a/lib/docs/filters/cppref/clean_html.rb
+++ b/lib/docs/filters/cppref/clean_html.rb
@@ -109,6 +109,20 @@ module Docs
node['src'] = node['src'].sub! %r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg'
end
+ # Temporary solution due to the lack of MathJax/MathML support
+ css('.t-mfrac').each do |node|
+ fraction = Nokogiri::XML::Node.new('span', doc)
+
+ node.css('td').each do |node|
+ fraction.add_child("#{node.content}")
+ end
+
+ fraction.last_element_child().before("/")
+
+ node.before(fraction)
+ node.remove
+ end
+
doc
end
end
From bffc1948624da57f0ae49a15f5c868ac89f8e488 Mon Sep 17 00:00:00 2001
From: Simon Legner
Date: Fri, 3 Jun 2022 00:01:32 +0200
Subject: [PATCH 5/5] fix(scss): .cppref
---
assets/stylesheets/pages/_cppref.scss | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/assets/stylesheets/pages/_cppref.scss b/assets/stylesheets/pages/_cppref.scss
index 9aeafdcb..8af559bb 100644
--- a/assets/stylesheets/pages/_cppref.scss
+++ b/assets/stylesheets/pages/_cppref.scss
@@ -1,4 +1,4 @@
-._c {
+._cppref {
> h2, > h3 { @extend %block-heading; }
> h4 { @extend %block-label, %label-blue; }
.fmbox { @extend %note; }