diff --git a/lib/docs/filters/yarn/clean_html_berry.rb b/lib/docs/filters/yarn/clean_html_berry.rb
index 96b3ee53..8a28ce25 100644
--- a/lib/docs/filters/yarn/clean_html_berry.rb
+++ b/lib/docs/filters/yarn/clean_html_berry.rb
@@ -2,45 +2,23 @@ module Docs
   class Yarn
     class CleanHtmlBerryFilter < Filter
       def call
-        if slug.empty?
-          @doc = at_css('main')
-          css(
-            (['div:first-child'] * 3).join('>'), # Tagline
-            'img',
-            'hr', # Footer
-            'hr + div', # Footer
-          ).remove
-
-          css('a').each do |link|
-            link.name = 'div'
-            link.css('h3').each do |node|
-              node.replace("#{node.content}")
-            end
-          end
-
-          return doc
-        end
-
-        @doc = at_css('article')
-        # Heading & edit link
-        css('h1', 'h1 + a').remove unless slug.start_with?('configuration')
-
-        if slug.start_with?('cli')
-          css('.header-code').each do |node|
-            node.name = 'span'
-          end
-        end
-
-        if slug.start_with?('configuration')
-          css('h1', 'h2').each do |node|
-            node.name = node.name.sub(/\d/) { |i| i.to_i + 1 }
-          end
-        end
+        @doc = at_css('main .container div.theme-doc-markdown.markdown')
 
         css('*').each do |node|
           node.remove_attribute('style')
         end
 
+        # Flatten highlighted code blocks to plain text, recording the language
+        # (when the class attribute names one) in data-language.
+        css('pre').each do |node|
+          # A <pre> without a class attribute yields nil here, hence to_s.
+          lang = node['class'].to_s[/language-(\w+)/, 1]
+          node['data-language'] = lang if lang
+          # Leave blocks without .token-line children untouched instead of emptying them.
+          lines = node.css('.token-line')
+          node.content = lines.map(&:content).join("\n") unless lines.empty?
+        end
+
         doc
       end
     end
diff --git a/lib/docs/filters/yarn/entries_berry.rb b/lib/docs/filters/yarn/entries_berry.rb
index 44c1e18e..6b99bfa6 100644
--- a/lib/docs/filters/yarn/entries_berry.rb
+++ b/lib/docs/filters/yarn/entries_berry.rb
@@ -2,26 +2,11 @@ module Docs
   class Yarn
     class EntriesBerryFilter < Docs::EntriesFilter
       def get_name
-        if slug.start_with?('configuration')
-          filename = at_css('main .active code')
-          content = filename.content
-          return filename.parent.content.sub content, " (#{content})"
-        end
-
-        name = at_css('h1').content
-
-        if slug.start_with?('getting-started')
-          active_link = at_css('main .active')
-          links = active_link.parent.children.to_a
-          name.prepend "#{links.index(active_link) + 1}. "
-        end
-
-        name
+        at_css('main header h1').content
       end
 
       def get_type
-        return 'CLI' if slug.start_with?('sdks', 'pnpify')
-        at_css('header .active').content
+        at_css('nav.navbar a.navbar__item.navbar__link.navbar__link--active').content
       end
     end
   end
diff --git a/lib/docs/scrapers/yarn.rb b/lib/docs/scrapers/yarn.rb
index 8cc49260..9d20bc8c 100644
--- a/lib/docs/scrapers/yarn.rb
+++ b/lib/docs/scrapers/yarn.rb
@@ -13,15 +13,16 @@ module Docs
     HTML
 
     version 'Berry' do
-      self.release = '3.1.1'
+      self.release = '4.5.1'
       self.base_url = 'https://yarnpkg.com/'
       self.links = {
         home: 'https://yarnpkg.com/',
         code: 'https://github.com/yarnpkg/berry'
       }
-      html_filters.push 'yarn/entries_berry', 'yarn/clean_html_berry', 'title'
-      options[:skip] = ['features', 'cli', 'configuration', 'advanced']
-      options[:skip_patterns] = [/\Aapi/, /\Apackage/]
+      self.root_path = 'getting-started'
+      html_filters.push 'yarn/entries_berry', 'yarn/clean_html_berry'
+      options[:skip] = ['cli', 'cli/builder', 'cli/pnpify', 'cli/sdks', 'protocols']
+      options[:skip_patterns] = [/\Aapi/, /\Ablog/, /\Apackage/, /\Aassets/]
     end
 
     version 'Classic' do
@@ -38,5 +39,14 @@ module Docs
     def get_latest_version(opts)
       get_latest_github_release('yarnpkg', 'berry', opts)[/[\d.]+/]
     end
+
+    private
+
+    # Some pages contain null bytes and cause the parser to fail,
+    # so strip them from the body in place before handing off to super.
+    def parse(response)
+      response.body.delete!("\0")
+      super
+    end
   end
 end