From a754d50317947e5bdb85434eaf4521a708ae5bb9 Mon Sep 17 00:00:00 2001 From: Enoc Date: Wed, 1 Sep 2021 01:19:08 -0600 Subject: [PATCH 1/5] Initial migration from cpp filescraper to urlscraper --- lib/docs/filters/cpp20/clean_html.rb | 9 +++ lib/docs/filters/cpp20/entries.rb | 82 ++++++++++++++++++++++++++++ lib/docs/scrapers/cpp20.rb | 40 ++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 lib/docs/filters/cpp20/clean_html.rb create mode 100644 lib/docs/filters/cpp20/entries.rb create mode 100644 lib/docs/scrapers/cpp20.rb diff --git a/lib/docs/filters/cpp20/clean_html.rb b/lib/docs/filters/cpp20/clean_html.rb new file mode 100644 index 00000000..4328e002 --- /dev/null +++ b/lib/docs/filters/cpp20/clean_html.rb @@ -0,0 +1,9 @@ +module Docs + class Cpp20 + class CleanHtmlFilter < Filter + def call + doc + end + end + end +end diff --git a/lib/docs/filters/cpp20/entries.rb b/lib/docs/filters/cpp20/entries.rb new file mode 100644 index 00000000..8a5aad22 --- /dev/null +++ b/lib/docs/filters/cpp20/entries.rb @@ -0,0 +1,82 @@ +module Docs + class Cpp20 + class EntriesFilter < Docs::EntriesFilter + @@duplicate_names = [] + + REPLACE_NAMES = { + 'Error directive' => '#error directive', + 'Filename and line information' => '#line directive', + 'Implementation defined behavior control' => '#pragma directive', + 'Replacing text macros' => '#define directive', + 'Source file inclusion' => '#include directive' } + + def get_name + name = at_css('#firstHeading').content.strip + name = format_name(name) + name = name.split(',').first + name + end + + def get_type + if at_css('#firstHeading').content.include?('C++ keyword') + 'Keywords' + elsif subpath.start_with?('experimental') + 'Experimental libraries' + elsif subpath.start_with?('language/') + 'Language' + elsif subpath.start_with?('freestanding') + 'Utilities' + elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content) + type.strip! + type.remove! ' library' + type.remove! 
' utilities' + type.remove! 'C++ ' + type.capitalize! + type + end + end + + def additional_entries + return [] if root_page? || self.name.start_with?('operators') + names = at_css('#firstHeading').content.remove(%r{\(.+?\)}).split(', ')[1..-1] + names.each(&:strip!).reject! do |name| + name.size <= 2 || name == '...' || name =~ /\A[<>]/ || name.start_with?('operator') + end + names.map { |name| [format_name(name)] } + end + + def format_name(name) + name.remove! 'C++ concepts: ' + name.remove! 'C++ keywords: ' + name.remove! 'C++ ' unless name == 'C++ language' + name.remove! %r{\s\(.+\)} + + name.sub! %r{\AStandard library header <(.+)>\z}, '\1' + name.sub! %r{(<[^>]+>)}, '' + + if name.include?('operator') && name.include?(',') + name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators') + name.sub! ' ', ' ' + name << ')' unless name.last == ')' || name.exclude?('(') + name.sub! '()', '' + name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50 + end + + REPLACE_NAMES[name] || name + end + + def entries + entries = [] + + # avoid duplicate pages + if !(@@duplicate_names.include?(name)) + @@duplicate_names.push(name) + entries << default_entry if root_page? || include_default_entry? 
+ entries.concat(additional_entries) + build_entries(entries) + end + end + + end + end +end diff --git a/lib/docs/scrapers/cpp20.rb b/lib/docs/scrapers/cpp20.rb new file mode 100644 index 00000000..14034a43 --- /dev/null +++ b/lib/docs/scrapers/cpp20.rb @@ -0,0 +1,40 @@ +module Docs + class Cpp20 < UrlScraper + self.name = 'C++20' + self.slug = 'cpp20' + self.type = 'c' + self.base_url = 'https://en.cppreference.com/w/cpp/' + self.root_path = 'header' + + html_filters.insert_before 'clean_html', 'c/fix_code' + html_filters.push 'cpp20/entries', 'c/clean_html', 'title' + + options[:decode_and_clean_paths] = true + options[:container] = '#content' + options[:title] = false + options[:root_title] = 'C++ Programming Language' + + options[:skip] = %w( + language/extending_std.html + language/history.html + regex/ecmascript.html + regex/regex_token_iterator/operator_cmp.html + ) + + options[:skip_patterns] = [/experimental/] + + options[:attribution] = <<-HTML + © cppreference.com
+ Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0. + HTML + + # Same as get_latest_version in lib/docs/scrapers/c.rb + def get_latest_version(opts) + doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts) + link = doc.at_css('a[title^="File:"]') + date = link.content.scan(/(\d+)\./)[0][0] + DateTime.strptime(date, '%Y%m%d').to_time.to_i + end + + end +end From e17bc84ea4c2a675fb16282339610dcab1506ce0 Mon Sep 17 00:00:00 2001 From: Enoc Date: Thu, 9 Sep 2021 00:03:46 -0600 Subject: [PATCH 2/5] migrate cpp scraper from filescraper to urlscraper --- lib/docs/filters/cpp/entries.rb | 20 ++- lib/docs/filters/cpp/fix_urls.rb | 11 -- lib/docs/filters/cpp20/clean_html.rb | 9 -- lib/docs/filters/cpp20/entries.rb | 82 ------------- lib/docs/filters/cppref/clean_html.rb | 116 ++++++++++++++++++ lib/docs/filters/cppref/fix_code.rb | 21 ++++ lib/docs/scrapers/cpp.rb | 52 -------- lib/docs/scrapers/{cpp20.rb => cppref/cpp.rb} | 20 +-- lib/docs/scrapers/cppref/cppref.rb | 29 +++++ 9 files changed, 189 insertions(+), 171 deletions(-) delete mode 100644 lib/docs/filters/cpp/fix_urls.rb delete mode 100644 lib/docs/filters/cpp20/clean_html.rb delete mode 100644 lib/docs/filters/cpp20/entries.rb create mode 100644 lib/docs/filters/cppref/clean_html.rb create mode 100644 lib/docs/filters/cppref/fix_code.rb delete mode 100644 lib/docs/scrapers/cpp.rb rename lib/docs/scrapers/{cpp20.rb => cppref/cpp.rb} (54%) create mode 100644 lib/docs/scrapers/cppref/cppref.rb diff --git a/lib/docs/filters/cpp/entries.rb b/lib/docs/filters/cpp/entries.rb index b0700139..d13526d3 100644 --- a/lib/docs/filters/cpp/entries.rb +++ b/lib/docs/filters/cpp/entries.rb @@ -1,6 +1,8 @@ module Docs class Cpp class EntriesFilter < Docs::EntriesFilter + @@duplicate_names = [] + REPLACE_NAMES = { 'Error directive' => '#error directive', 'Filename and line information' => '#line directive', @@ -11,7 +13,8 @@ module Docs def get_name name = 
at_css('#firstHeading').content.strip name = format_name(name) - name.split(',').first + name = name.split(',').first + name end def get_type @@ -61,6 +64,21 @@ module Docs REPLACE_NAMES[name] || name end + + # Avoid duplicate pages. These duplicate pages are the same page for + # multiple functions that are organized on the same page because they provide + # similar behavior but have different names. + def entries + entries = [] + + if !(@@duplicate_names.include?(name)) + @@duplicate_names.push(name) + entries << default_entry if root_page? || include_default_entry? + entries.concat(additional_entries) + build_entries(entries) + end + end + end end end diff --git a/lib/docs/filters/cpp/fix_urls.rb b/lib/docs/filters/cpp/fix_urls.rb deleted file mode 100644 index 8e8d67c6..00000000 --- a/lib/docs/filters/cpp/fix_urls.rb +++ /dev/null @@ -1,11 +0,0 @@ -module Docs - class Cpp - class FixUrlsFilter < Filter - def call - html.gsub! File.join(Cpp.base_url, Cpp.root_path), Cpp.base_url[0..-2] - html.gsub! 
%r{#{Cpp.base_url}([^"']+?)\.html}, "#{Cpp.base_url}\\1" - html - end - end - end -end diff --git a/lib/docs/filters/cpp20/clean_html.rb b/lib/docs/filters/cpp20/clean_html.rb deleted file mode 100644 index 4328e002..00000000 --- a/lib/docs/filters/cpp20/clean_html.rb +++ /dev/null @@ -1,9 +0,0 @@ -module Docs - class Cpp20 - class CleanHtmlFilter < Filter - def call - doc - end - end - end -end diff --git a/lib/docs/filters/cpp20/entries.rb b/lib/docs/filters/cpp20/entries.rb deleted file mode 100644 index 8a5aad22..00000000 --- a/lib/docs/filters/cpp20/entries.rb +++ /dev/null @@ -1,82 +0,0 @@ -module Docs - class Cpp20 - class EntriesFilter < Docs::EntriesFilter - @@duplicate_names = [] - - REPLACE_NAMES = { - 'Error directive' => '#error directive', - 'Filename and line information' => '#line directive', - 'Implementation defined behavior control' => '#pragma directive', - 'Replacing text macros' => '#define directive', - 'Source file inclusion' => '#include directive' } - - def get_name - name = at_css('#firstHeading').content.strip - name = format_name(name) - name = name.split(',').first - name - end - - def get_type - if at_css('#firstHeading').content.include?('C++ keyword') - 'Keywords' - elsif subpath.start_with?('experimental') - 'Experimental libraries' - elsif subpath.start_with?('language/') - 'Language' - elsif subpath.start_with?('freestanding') - 'Utilities' - elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content) - type.strip! - type.remove! ' library' - type.remove! ' utilities' - type.remove! 'C++ ' - type.capitalize! - type - end - end - - def additional_entries - return [] if root_page? || self.name.start_with?('operators') - names = at_css('#firstHeading').content.remove(%r{\(.+?\)}).split(', ')[1..-1] - names.each(&:strip!).reject! do |name| - name.size <= 2 || name == '...' 
|| name =~ /\A[<>]/ || name.start_with?('operator') - end - names.map { |name| [format_name(name)] } - end - - def format_name(name) - name.remove! 'C++ concepts: ' - name.remove! 'C++ keywords: ' - name.remove! 'C++ ' unless name == 'C++ language' - name.remove! %r{\s\(.+\)} - - name.sub! %r{\AStandard library header <(.+)>\z}, '\1' - name.sub! %r{(<[^>]+>)}, '' - - if name.include?('operator') && name.include?(',') - name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators') - name.sub! ' ', ' ' - name << ')' unless name.last == ')' || name.exclude?('(') - name.sub! '()', '' - name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50 - end - - REPLACE_NAMES[name] || name - end - - def entries - entries = [] - - # avoid duplicate pages - if !(@@duplicate_names.include?(name)) - @@duplicate_names.push(name) - entries << default_entry if root_page? || include_default_entry? - entries.concat(additional_entries) - build_entries(entries) - end - end - - end - end -end diff --git a/lib/docs/filters/cppref/clean_html.rb b/lib/docs/filters/cppref/clean_html.rb new file mode 100644 index 00000000..ced30f4d --- /dev/null +++ b/lib/docs/filters/cppref/clean_html.rb @@ -0,0 +1,116 @@ +module Docs + class Cppref + class CleanHtmlFilter < Filter + def call + css('h1').remove if root_page? 
+ + css('.t-dcl-rev-aux td[rowspan]').each do |node| + rowspan = node['rowspan'].to_i + node['rowspan'] = node.ancestors('tbody').css('tr').length if rowspan > 3 + end + + css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc', + '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink', + '.t-sdsc-sep:first-child:last-child', '.t-example-live-link', + '.t-dcl-rev-num > .t-dcl-rev-aux ~ tr:not(.t-dcl-rev-aux) > td:nth-child(2)').remove + + css('#bodyContent', '.mw-content-ltr', 'span[style]', 'div[class^="t-ref"]', '.t-image', + 'th > div', 'td > div', '.t-dsc-see', '.mainpagediv', 'code > b', 'tbody').each do |node| + node.before(node.children).remove + end + + css('div > ul').each do |node| + node.parent.before(node.parent.children).remove + end + + css('dl > dd:first-child:last-child > ul:first-child:last-child').each do |node| + dl = node.parent.parent + if dl.previous_element && dl.previous_element.name == 'ul' + dl.previous_element << node + dl.remove + else + dl.before(node).remove + end + end + + css('dl > dd:first-child:last-child').each do |node| + node.parent.before(node.children).remove + end + + css('ul').each do |node| + while node.next_element && node.next_element.name == 'ul' + node << node.next_element.children + node.next_element.remove + end + end + + css('h2 > span[id]', 'h3 > span[id]', 'h4 > span[id]', 'h5 > span[id]', 'h6 > span[id]').each do |node| + node.parent['id'] = node['id'] + node.before(node.children).remove + end + + css('table[style]', 'th[style]', 'td[style]').remove_attr('style') + css('table[cellpadding]').remove_attr('cellpadding') + + css('.t-dsc-hitem > td', '.t-dsc-header > td').each do |node| + node.name = 'th' + node.content = ' ' if node.content.empty? 
+ end + + css('tt', 'span > span.source-cpp', 'span.t-c', 'span.t-lc', 'span.t-dsc-see-tt').each do |node| + node.name = 'code' + node.remove_attribute('class') + node.content = node.content unless node.at_css('a') + end + + css('div > span.source-cpp').each do |node| + node.name = 'pre' + node.inner_html = node.inner_html.gsub('
', "\n") + node.content = node.content + end + + css('div > a > img[alt="About this image"]').each do |node| + node.parent.parent.remove + end + + css('area[href]').each do |node| + node['href'] = node['href'].remove('.html') + end + + css('p').each do |node| + while node.next && (node.next.text? || node.next.name == 'a' || node.next.name == 'code') + node << node.next + end + node.inner_html = node.inner_html.strip + node << '.' if node.content =~ /[a-zA-Z0-9\)]\z/ + node.remove if node.content.blank? && !node.at_css('img') + end + + css('pre').each do |node| + node['data-language'] = if node['class'].try(:include?, 'cpp') || node.parent['class'].try(:include?, 'cpp') + 'cpp' + else + 'c' + end + node.remove_attribute('class') + node.content = node.content.gsub("\t", ' ' * 8) + end + + css('code code', '.mw-geshi').each do |node| + node.before(node.children).remove + end + + css('h1 ~ .fmbox').each do |node| + node.name = 'div' + node.content = node.content + end + + css('img').each do |node| + node['src'] = node['src'].sub! %r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg' + end + + doc + end + end + end +end diff --git a/lib/docs/filters/cppref/fix_code.rb b/lib/docs/filters/cppref/fix_code.rb new file mode 100644 index 00000000..c80a7426 --- /dev/null +++ b/lib/docs/filters/cppref/fix_code.rb @@ -0,0 +1,21 @@ +module Docs + class Cppref + class FixCodeFilter < Filter + def call + css('div > span.source-c', 'div > span.source-cpp').each do |node| + node.inner_html = node.inner_html.gsub(/
\n?/, "\n").gsub("\n

\n", "

\n") + node.parent.name = 'pre' + node.parent['class'] = node['class'] + node.parent.content = node.content + end + + nbsp = Nokogiri::HTML(' ').text + css('pre').each do |node| + node.content = node.content.gsub(nbsp, ' ') + end + + doc + end + end + end +end diff --git a/lib/docs/scrapers/cpp.rb b/lib/docs/scrapers/cpp.rb deleted file mode 100644 index f0ad2b16..00000000 --- a/lib/docs/scrapers/cpp.rb +++ /dev/null @@ -1,52 +0,0 @@ -module Docs - class Cpp < FileScraper - self.name = 'C++' - self.slug = 'cpp' - self.type = 'c' - self.base_url = 'http://en.cppreference.com/w/cpp/' - self.root_path = 'header.html' - - html_filters.insert_before 'clean_html', 'c/fix_code' - html_filters.push 'cpp/entries', 'c/clean_html', 'title' - text_filters.push 'cpp/fix_urls' - - options[:decode_and_clean_paths] = true - options[:container] = '#content' - options[:title] = false - options[:root_title] = 'C++ Programming Language' - options[:skip] = %w( - language/extending_std.html - language/history.html - regex/ecmascript.html - regex/regex_token_iterator/operator_cmp.html - ) - options[:skip_patterns] = [/experimental/] - options[:only_patterns] = [/\.html\z/] - - options[:fix_urls] = ->(url) do - url = CGI.unescape(url) - url.sub! %r{\A.+/http%3A/}, 'http://' - url.sub! 'http://en.cppreference.com/upload.cppreference.com', 'http://upload.cppreference.com' - url - end - - options[:attribution] = <<-HTML - © cppreference.com
- Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0. - HTML - - # Same as get_latest_version in lib/docs/scrapers/c.rb - def get_latest_version(opts) - doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts) - link = doc.at_css('a[title^="File:"]') - date = link.content.scan(/(\d+)\./)[0][0] - DateTime.strptime(date, '%Y%m%d').to_time.to_i - end - - private - - def file_path_for(*) - URI.unescape(super) - end - end -end diff --git a/lib/docs/scrapers/cpp20.rb b/lib/docs/scrapers/cppref/cpp.rb similarity index 54% rename from lib/docs/scrapers/cpp20.rb rename to lib/docs/scrapers/cppref/cpp.rb index 14034a43..bfc87c62 100644 --- a/lib/docs/scrapers/cpp20.rb +++ b/lib/docs/scrapers/cppref/cpp.rb @@ -1,17 +1,12 @@ module Docs - class Cpp20 < UrlScraper - self.name = 'C++20' - self.slug = 'cpp20' + class Cpp < Cppref + self.name = 'C++' + self.slug = 'cpp' self.type = 'c' self.base_url = 'https://en.cppreference.com/w/cpp/' - self.root_path = 'header' - html_filters.insert_before 'clean_html', 'c/fix_code' - html_filters.push 'cpp20/entries', 'c/clean_html', 'title' + html_filters.insert_before 'cppref/clean_html', 'cpp/entries' - options[:decode_and_clean_paths] = true - options[:container] = '#content' - options[:title] = false options[:root_title] = 'C++ Programming Language' options[:skip] = %w( @@ -21,13 +16,6 @@ module Docs regex/regex_token_iterator/operator_cmp.html ) - options[:skip_patterns] = [/experimental/] - - options[:attribution] = <<-HTML - © cppreference.com
- Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0. - HTML - # Same as get_latest_version in lib/docs/scrapers/c.rb def get_latest_version(opts) doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts) diff --git a/lib/docs/scrapers/cppref/cppref.rb b/lib/docs/scrapers/cppref/cppref.rb new file mode 100644 index 00000000..b91751ef --- /dev/null +++ b/lib/docs/scrapers/cppref/cppref.rb @@ -0,0 +1,29 @@ +module Docs + class Cppref < UrlScraper + self.abstract = true + self.type = 'cppref' + self.root_path = 'header' + + html_filters.insert_before 'clean_html', 'cppref/fix_code' + html_filters.push 'cppref/clean_html', 'title' + # 'cpp20/entries', + options[:decode_and_clean_paths] = true + options[:container] = '#content' + options[:title] = false + options[:skip] = %w(language/history.html) + + options[:skip_patterns] = [ + /experimental/ + ] + + options[:attribution] = <<-HTML + © cppreference.com
+ Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0. + HTML + + # def get_latest_version + + # end + + end +end From fdfcf3d9174d7e386af42412c3a2394d6e1eafca Mon Sep 17 00:00:00 2001 From: Enoc Date: Fri, 10 Sep 2021 11:14:54 -0600 Subject: [PATCH 3/5] Migrate c scraper from filescraper to urlscraper --- assets/stylesheets/application.css.scss | 2 +- .../pages/{_c.scss => _cppref.scss} | 0 lib/docs/filters/c/clean_html.rb | 116 ------------------ lib/docs/filters/c/entries.rb | 3 + lib/docs/filters/c/fix_code.rb | 21 ---- lib/docs/filters/c/fix_urls.rb | 11 -- lib/docs/scrapers/c.rb | 42 ------- lib/docs/scrapers/cppref/c.rb | 12 ++ lib/docs/scrapers/cppref/cpp.rb | 9 -- lib/docs/scrapers/cppref/cppref.rb | 12 +- 10 files changed, 24 insertions(+), 204 deletions(-) rename assets/stylesheets/pages/{_c.scss => _cppref.scss} (100%) delete mode 100644 lib/docs/filters/c/clean_html.rb delete mode 100644 lib/docs/filters/c/fix_code.rb delete mode 100644 lib/docs/filters/c/fix_urls.rb delete mode 100644 lib/docs/scrapers/c.rb create mode 100644 lib/docs/scrapers/cppref/c.rb diff --git a/assets/stylesheets/application.css.scss b/assets/stylesheets/application.css.scss index 0243afeb..542e1510 100644 --- a/assets/stylesheets/application.css.scss +++ b/assets/stylesheets/application.css.scss @@ -39,7 +39,7 @@ 'pages/async', 'pages/bash', 'pages/bootstrap', - 'pages/c', + 'pages/cppref', 'pages/cakephp', 'pages/clojure', 'pages/codeception', diff --git a/assets/stylesheets/pages/_c.scss b/assets/stylesheets/pages/_cppref.scss similarity index 100% rename from assets/stylesheets/pages/_c.scss rename to assets/stylesheets/pages/_cppref.scss diff --git a/lib/docs/filters/c/clean_html.rb b/lib/docs/filters/c/clean_html.rb deleted file mode 100644 index 0665a941..00000000 --- a/lib/docs/filters/c/clean_html.rb +++ /dev/null @@ -1,116 +0,0 @@ -module Docs - class C - class CleanHtmlFilter < Filter - def call - css('h1').remove if root_page? 
- - css('.t-dcl-rev-aux td[rowspan]').each do |node| - rowspan = node['rowspan'].to_i - node['rowspan'] = node.ancestors('tbody').css('tr').length if rowspan > 3 - end - - css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc', - '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink', - '.t-sdsc-sep:first-child:last-child', '.t-example-live-link', - '.t-dcl-rev-num > .t-dcl-rev-aux ~ tr:not(.t-dcl-rev-aux) > td:nth-child(2)').remove - - css('#bodyContent', '.mw-content-ltr', 'span[style]', 'div[class^="t-ref"]', '.t-image', - 'th > div', 'td > div', '.t-dsc-see', '.mainpagediv', 'code > b', 'tbody').each do |node| - node.before(node.children).remove - end - - css('div > ul').each do |node| - node.parent.before(node.parent.children).remove - end - - css('dl > dd:first-child:last-child > ul:first-child:last-child').each do |node| - dl = node.parent.parent - if dl.previous_element && dl.previous_element.name == 'ul' - dl.previous_element << node - dl.remove - else - dl.before(node).remove - end - end - - css('dl > dd:first-child:last-child').each do |node| - node.parent.before(node.children).remove - end - - css('ul').each do |node| - while node.next_element && node.next_element.name == 'ul' - node << node.next_element.children - node.next_element.remove - end - end - - css('h2 > span[id]', 'h3 > span[id]', 'h4 > span[id]', 'h5 > span[id]', 'h6 > span[id]').each do |node| - node.parent['id'] = node['id'] - node.before(node.children).remove - end - - css('table[style]', 'th[style]', 'td[style]').remove_attr('style') - css('table[cellpadding]').remove_attr('cellpadding') - - css('.t-dsc-hitem > td', '.t-dsc-header > td').each do |node| - node.name = 'th' - node.content = ' ' if node.content.empty? 
- end - - css('tt', 'span > span.source-cpp', 'span.t-c', 'span.t-lc', 'span.t-dsc-see-tt').each do |node| - node.name = 'code' - node.remove_attribute('class') - node.content = node.content unless node.at_css('a') - end - - css('div > span.source-cpp').each do |node| - node.name = 'pre' - node.inner_html = node.inner_html.gsub('
', "\n") - node.content = node.content - end - - css('div > a > img[alt="About this image"]').each do |node| - node.parent.parent.remove - end - - css('area[href]').each do |node| - node['href'] = node['href'].remove('.html') - end - - css('p').each do |node| - while node.next && (node.next.text? || node.next.name == 'a' || node.next.name == 'code') - node << node.next - end - node.inner_html = node.inner_html.strip - node << '.' if node.content =~ /[a-zA-Z0-9\)]\z/ - node.remove if node.content.blank? && !node.at_css('img') - end - - css('pre').each do |node| - node['data-language'] = if node['class'].try(:include?, 'cpp') || node.parent['class'].try(:include?, 'cpp') - 'cpp' - else - 'c' - end - node.remove_attribute('class') - node.content = node.content.gsub("\t", ' ' * 8) - end - - css('code code', '.mw-geshi').each do |node| - node.before(node.children).remove - end - - css('h1 ~ .fmbox').each do |node| - node.name = 'div' - node.content = node.content - end - - css('img').each do |node| - node['src'] = node['src'].sub! %r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg' - end - - doc - end - end - end -end diff --git a/lib/docs/filters/c/entries.rb b/lib/docs/filters/c/entries.rb index 6c9f1565..63cfec61 100644 --- a/lib/docs/filters/c/entries.rb +++ b/lib/docs/filters/c/entries.rb @@ -22,6 +22,9 @@ module Docs end def get_type + + return "C keywords" if slug =~ /keyword/ + type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content) type.strip! type.remove! ' library' diff --git a/lib/docs/filters/c/fix_code.rb b/lib/docs/filters/c/fix_code.rb deleted file mode 100644 index a7e764f0..00000000 --- a/lib/docs/filters/c/fix_code.rb +++ /dev/null @@ -1,21 +0,0 @@ -module Docs - class C - class FixCodeFilter < Filter - def call - css('div > span.source-c', 'div > span.source-cpp').each do |node| - node.inner_html = node.inner_html.gsub(/
\n?/, "\n").gsub("\n

\n", "

\n") - node.parent.name = 'pre' - node.parent['class'] = node['class'] - node.parent.content = node.content - end - - nbsp = Nokogiri::HTML(' ').text - css('pre').each do |node| - node.content = node.content.gsub(nbsp, ' ') - end - - doc - end - end - end -end diff --git a/lib/docs/filters/c/fix_urls.rb b/lib/docs/filters/c/fix_urls.rb deleted file mode 100644 index a7d15d94..00000000 --- a/lib/docs/filters/c/fix_urls.rb +++ /dev/null @@ -1,11 +0,0 @@ -module Docs - class C - class FixUrlsFilter < Filter - def call - html.gsub! File.join(C.base_url, C.root_path), C.base_url[0..-2] - html.gsub! %r{#{C.base_url}([^"']+?)\.html}, "#{C.base_url}\\1" - html - end - end - end -end diff --git a/lib/docs/scrapers/c.rb b/lib/docs/scrapers/c.rb deleted file mode 100644 index ec99f704..00000000 --- a/lib/docs/scrapers/c.rb +++ /dev/null @@ -1,42 +0,0 @@ -module Docs - class C < FileScraper - self.type = 'c' - self.base_url = 'http://en.cppreference.com/w/c/' - self.root_path = 'header.html' - - html_filters.insert_before 'clean_html', 'c/fix_code' - html_filters.push 'c/entries', 'c/clean_html', 'title' - text_filters.push 'c/fix_urls' - - options[:decode_and_clean_paths] = true - options[:container] = '#content' - options[:title] = false - options[:root_title] = 'C Programming Language' - options[:skip] = %w(language/history.html) - options[:skip_patterns] = [/experimental/] - - options[:fix_urls] = ->(url) do - url.sub! %r{\A.+/http%3A/}, 'http://' - url.sub! 'http://en.cppreference.com/upload.cppreference.com', 'http://upload.cppreference.com' - url - end - - options[:attribution] = <<-HTML - © cppreference.com
- Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0. - HTML - - def get_latest_version(opts) - doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts) - link = doc.at_css('a[title^="File:"]') - date = link.content.scan(/(\d+)\./)[0][0] - DateTime.strptime(date, '%Y%m%d').to_time.to_i - end - - private - - def file_path_for(*) - URI.unescape(super) - end - end -end diff --git a/lib/docs/scrapers/cppref/c.rb b/lib/docs/scrapers/cppref/c.rb new file mode 100644 index 00000000..faa48fb3 --- /dev/null +++ b/lib/docs/scrapers/cppref/c.rb @@ -0,0 +1,12 @@ +module Docs + class C < Cppref + self.name = 'c' + self.slug = 'c' + self.base_url = 'https://en.cppreference.com/w/c/' + + html_filters.insert_before 'cppref/clean_html', 'c/entries' + + options[:root_title] = 'C Programming Language' + + end +end diff --git a/lib/docs/scrapers/cppref/cpp.rb b/lib/docs/scrapers/cppref/cpp.rb index bfc87c62..4f259729 100644 --- a/lib/docs/scrapers/cppref/cpp.rb +++ b/lib/docs/scrapers/cppref/cpp.rb @@ -2,7 +2,6 @@ module Docs class Cpp < Cppref self.name = 'C++' self.slug = 'cpp' - self.type = 'c' self.base_url = 'https://en.cppreference.com/w/cpp/' html_filters.insert_before 'cppref/clean_html', 'cpp/entries' @@ -16,13 +15,5 @@ module Docs regex/regex_token_iterator/operator_cmp.html ) - # Same as get_latest_version in lib/docs/scrapers/c.rb - def get_latest_version(opts) - doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts) - link = doc.at_css('a[title^="File:"]') - date = link.content.scan(/(\d+)\./)[0][0] - DateTime.strptime(date, '%Y%m%d').to_time.to_i - end - end end diff --git a/lib/docs/scrapers/cppref/cppref.rb b/lib/docs/scrapers/cppref/cppref.rb index b91751ef..85bbc771 100644 --- a/lib/docs/scrapers/cppref/cppref.rb +++ b/lib/docs/scrapers/cppref/cppref.rb @@ -6,7 +6,7 @@ module Docs html_filters.insert_before 'clean_html', 'cppref/fix_code' html_filters.push 'cppref/clean_html', 'title' - # 
'cpp20/entries', + options[:decode_and_clean_paths] = true options[:container] = '#content' options[:title] = false @@ -21,9 +21,13 @@ module Docs Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0. HTML - # def get_latest_version - - # end + # Check if the 'headers' page has changed + def get_latest_version(opts) + doc = fetch_doc(self.base_url + self.root_path, opts) + date = doc.at_css('#footer-info-lastmod').content + date = date.match(/[[:digit:]]{1,2} .* [[:digit:]]{4}/).to_s + date = DateTime.strptime(date, '%e %B %Y').to_time.to_i + end end end From ef449857d534403caa2656eec5744b3e8e1830cf Mon Sep 17 00:00:00 2001 From: Enoc Date: Wed, 13 Oct 2021 00:14:41 -0600 Subject: [PATCH 4/5] C/Cpp: improve format of fractions --- lib/docs/filters/cppref/clean_html.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/docs/filters/cppref/clean_html.rb b/lib/docs/filters/cppref/clean_html.rb index ced30f4d..07acfd45 100644 --- a/lib/docs/filters/cppref/clean_html.rb +++ b/lib/docs/filters/cppref/clean_html.rb @@ -109,6 +109,20 @@ module Docs node['src'] = node['src'].sub! 
%r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg' end + # temporary solution due to the lack of mathjax/mathml support + css('.t-mfrac').each do |node| + fraction = Nokogiri::XML::Node.new('span', doc) + + node.css('td').each do |node| + fraction.add_child("#{node.content}") + end + + fraction.last_element_child().before("/") + + node.before(fraction) + node.remove + end + doc end end From bffc1948624da57f0ae49a15f5c868ac89f8e488 Mon Sep 17 00:00:00 2001 From: Simon Legner Date: Fri, 3 Jun 2022 00:01:32 +0200 Subject: [PATCH 5/5] fix(scss): .cppref --- assets/stylesheets/pages/_cppref.scss | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/stylesheets/pages/_cppref.scss b/assets/stylesheets/pages/_cppref.scss index 9aeafdcb..8af559bb 100644 --- a/assets/stylesheets/pages/_cppref.scss +++ b/assets/stylesheets/pages/_cppref.scss @@ -1,4 +1,4 @@ -._c { +._cppref { > h2, > h3 { @extend %block-heading; } > h4 { @extend %block-label, %label-blue; } .fmbox { @extend %note; }