Clean up and improve TypeScript scraper

pull/1406/head
Phil Scherer 4 years ago
parent 37d5fd3bbb
commit 7e36848e45

@ -2,20 +2,23 @@ module Docs
class Typescript
class CleanHtmlFilter < Filter
def call
root_page? ? root : other
doc
end
# Top menu bar
css('#top-menu').remove
css('.skip-to-main').remove
def root
header = at_css('h1')
header.parent.before(header).remove
# Sidebar
css('#sidebar').remove
css('h4').each do |node|
node.name = 'h2'
end
end
# Pound symbol before each title
css('.anchor').remove
def other
@doc = at_css('article > .whitespace > .markdown')
css('#handbook-content > h2').each do |node|
node.name = 'h1'
end
css('.anchor').remove
css('a:contains("Try")').remove
css('pre').each do |node|
@ -23,17 +26,6 @@ module Docs
node['data-language'] = 'typescript'
node.remove_attribute('class')
end
# 'Next' title area
css('.whitespace-tight').remove
# Right side floating box
css('.handbook-toc').remove
css('#site-footer').remove
doc
end
end
end

@ -3,7 +3,6 @@ module Docs
class EntriesFilter < Docs::EntriesFilter
def get_name
return 'Typescript' if current_url == root_url
return at_css('h2').content
end
@ -12,13 +11,9 @@ module Docs
end
def additional_entries
entries = []
css('h2').each do |node|
entries << [node.content, node['id'], name]
css('h2').each_with_object [] do |node,entries|
entries << [node.content, node['id'], name]
end
entries
end
end

@ -3,23 +3,25 @@ module Docs
self.name = 'TypeScript'
self.type = 'simple'
self.release = '4.1.2'
self.base_url = 'https://www.typescriptlang.org/docs/handbook'
self.base_url = 'https://www.typescriptlang.org/docs/handbook/'
self.root_path = 'index.html'
self.links = {
home: 'https://www.typescriptlang.org',
code: 'https://github.com/Microsoft/TypeScript'
}
html_filters.push 'typescript/entries', 'typescript/clean_html'
html_filters.push 'typescript/entries', 'typescript/clean_html', 'title'
options[:container] = 'main'
options[:skip] = [
'/react-&-webpack.html',
'/asp-net-core.html',
'/gulp.html',
'/dom-manipulation.html',
'/migrating-from-javascript.html',
'/babel-with-typescript.html',
'/intro.html'
'react-&-webpack.html',
'asp-net-core.html',
'gulp.html',
'dom-manipulation.html',
'migrating-from-javascript.html',
'babel-with-typescript.html',
'intro.html'
]
options[:skip_patterns] = [

Loading…
Cancel
Save