Clean up and improve TypeScript scraper

pull/1406/head
Phil Scherer 4 years ago
parent 37d5fd3bbb
commit 7e36848e45

@ -2,20 +2,23 @@ module Docs
class Typescript class Typescript
class CleanHtmlFilter < Filter class CleanHtmlFilter < Filter
def call def call
root_page? ? root : other
doc
end
# Top menu bar def root
css('#top-menu').remove header = at_css('h1')
css('.skip-to-main').remove header.parent.before(header).remove
# Sidebar css('h4').each do |node|
css('#sidebar').remove node.name = 'h2'
end
end
# Pound symbol before each title def other
css('.anchor').remove @doc = at_css('article > .whitespace > .markdown')
css('#handbook-content > h2').each do |node| css('.anchor').remove
node.name = 'h1'
end
css('a:contains("Try")').remove css('a:contains("Try")').remove
css('pre').each do |node| css('pre').each do |node|
@ -23,17 +26,6 @@ module Docs
node['data-language'] = 'typescript' node['data-language'] = 'typescript'
node.remove_attribute('class') node.remove_attribute('class')
end end
# 'Next' title area
css('.whitespace-tight').remove
# Right side floating box
css('.handbook-toc').remove
css('#site-footer').remove
doc
end end
end end
end end

@ -3,7 +3,6 @@ module Docs
class EntriesFilter < Docs::EntriesFilter class EntriesFilter < Docs::EntriesFilter
def get_name def get_name
return 'Typescript' if current_url == root_url
return at_css('h2').content return at_css('h2').content
end end
@ -12,13 +11,9 @@ module Docs
end end
def additional_entries def additional_entries
entries = [] css('h2').each_with_object [] do |node,entries|
entries << [node.content, node['id'], name]
css('h2').each do |node|
entries << [node.content, node['id'], name]
end end
entries
end end
end end

@ -3,23 +3,25 @@ module Docs
self.name = 'TypeScript' self.name = 'TypeScript'
self.type = 'simple' self.type = 'simple'
self.release = '4.1.2' self.release = '4.1.2'
self.base_url = 'https://www.typescriptlang.org/docs/handbook' self.base_url = 'https://www.typescriptlang.org/docs/handbook/'
self.root_path = 'index.html' self.root_path = 'index.html'
self.links = { self.links = {
home: 'https://www.typescriptlang.org', home: 'https://www.typescriptlang.org',
code: 'https://github.com/Microsoft/TypeScript' code: 'https://github.com/Microsoft/TypeScript'
} }
html_filters.push 'typescript/entries', 'typescript/clean_html' html_filters.push 'typescript/entries', 'typescript/clean_html', 'title'
options[:container] = 'main'
options[:skip] = [ options[:skip] = [
'/react-&-webpack.html', 'react-&-webpack.html',
'/asp-net-core.html', 'asp-net-core.html',
'/gulp.html', 'gulp.html',
'/dom-manipulation.html', 'dom-manipulation.html',
'/migrating-from-javascript.html', 'migrating-from-javascript.html',
'/babel-with-typescript.html', 'babel-with-typescript.html',
'/intro.html' 'intro.html'
] ]
options[:skip_patterns] = [ options[:skip_patterns] = [

Loading…
Cancel
Save