From e4ac0e875718c5b868cc4206b609c93119a3cd62 Mon Sep 17 00:00:00 2001 From: Thibaut Date: Sun, 14 Dec 2014 17:02:59 -0500 Subject: [PATCH] Improve MDN scrapers --- lib/docs/filters/dom/entries.rb | 18 +++++++++++++----- lib/docs/filters/javascript/entries.rb | 5 +++-- lib/docs/filters/mdn/clean_html.rb | 6 ++++++ lib/docs/scrapers/mdn/dom.rb | 2 ++ lib/docs/scrapers/mdn/html.rb | 1 + lib/docs/scrapers/mdn/javascript.rb | 4 +++- 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/lib/docs/filters/dom/entries.rb b/lib/docs/filters/dom/entries.rb index af168e56..887182a8 100644 --- a/lib/docs/filters/dom/entries.rb +++ b/lib/docs/filters/dom/entries.rb @@ -11,10 +11,12 @@ module Docs 'Geolocation' => 'Geolocation', 'Media Capture' => 'Media', 'Media Source' => 'Media', - 'MediaStream' => 'MediaRecorder', + 'MediaStream' => 'Media', 'Navigation Timing' => 'Navigation Timing', 'Network Information' => 'Network Information', + 'Service Workers' => 'Service Workers', 'Web Audio' => 'Web Audio', + 'Web Storage' => 'Web Storage', 'Web Workers' => 'Web Workers', 'WebRTC' => 'WebRTC' } @@ -44,6 +46,7 @@ module Docs 'Range' => 'Range', 'RTC' => 'WebRTC', 'Selection' => 'Selection', + 'Storage' => 'Web Storage', 'StyleSheet' => 'CSS', 'Stylesheet' => 'CSS', 'SVG' => 'SVG', @@ -55,10 +58,15 @@ module Docs 'XMLHttpRequest' => 'XMLHTTPRequest' } TYPE_BY_NAME_INCLUDES = { - 'IndexedDB' => 'IndexedDB', - 'udio' => 'Web Audio', - 'WebGL' => 'Canvas', - 'Worker' => 'Web Workers' } + 'ImageData' => 'Canvas', + 'IndexedDB' => 'IndexedDB', + 'MediaStream' => 'Media', + 'Path2D' => 'Canvas', + 'ServiceWorker' => 'Service Workers', + 'TextMetrics' => 'Canvas', + 'udio' => 'Web Audio', + 'WebGL' => 'Canvas', + 'Worker' => 'Web Workers' } TYPE_BY_NAME_MATCHES = {} diff --git a/lib/docs/filters/javascript/entries.rb b/lib/docs/filters/javascript/entries.rb index 5744c482..b8ed5eb1 100644 --- a/lib/docs/filters/javascript/entries.rb +++ b/lib/docs/filters/javascript/entries.rb @@ -34,7 +34,7 @@ module Docs 'Statements' elsif slug.start_with? 'Operators' 'Operators' - elsif slug.start_with?('Functions_and_function_scope') || slug.start_with?('Functions') + elsif slug.start_with?('Functions_and_function_scope') || slug.start_with?('Functions') || slug.include?('GeneratorFunction') 'Function' elsif slug.start_with? 'Global_Objects' object, method = *slug.remove('Global_Objects/').split('/') @@ -59,7 +59,8 @@ module Docs return true unless node && node.parent == doc && !node.previous_element !node.content.include?('not on a standards track') && - !node.content.include?('removed from the Web') + !node.content.include?('removed from the Web') && + !node.content.include?('could be removed at any time') end end end diff --git a/lib/docs/filters/mdn/clean_html.rb b/lib/docs/filters/mdn/clean_html.rb index 7e8058e3..1bc61e48 100644 --- a/lib/docs/filters/mdn/clean_html.rb +++ b/lib/docs/filters/mdn/clean_html.rb @@ -16,6 +16,12 @@ module Docs node.name = 'th' end + css('nobr').each do |node| + node.before(node.children).remove + end + + css('h2[style]', 'pre[style]').remove_attr('style') + doc end end diff --git a/lib/docs/scrapers/mdn/dom.rb b/lib/docs/scrapers/mdn/dom.rb index 3db5d0b3..e308335a 100644 --- a/lib/docs/scrapers/mdn/dom.rb +++ b/lib/docs/scrapers/mdn/dom.rb @@ -118,6 +118,8 @@ module Docs url.sub! "#{Dom.base_url}/Selection/", "#{Dom.base_url}/Selection." url.sub! "#{Dom.base_url}/windowTimers", "#{Dom.base_url}/window" url.sub! "#{Dom.base_url}/windowEventHandlers", "#{Dom.base_url}/window" + url.sub! %r{\/windowLocalStorage(\.localStorage)?}i, "/window.localStorage" + url.sub! %r{\/windowSessionStorage(\.sessionStorage)?}i, "/window.sessionStorage" url.sub! "#{Dom.base_url}/Screen.", "#{Dom.base_url}/window.screen" url end diff --git a/lib/docs/scrapers/mdn/html.rb b/lib/docs/scrapers/mdn/html.rb index 129c597e..7acc1068 100644 --- a/lib/docs/scrapers/mdn/html.rb +++ b/lib/docs/scrapers/mdn/html.rb @@ -21,6 +21,7 @@ module Docs end end + options[:skip] = ['/Element/shadow'] options[:only_patterns] = [/\A\/Element/] options[:replace_paths] = { diff --git a/lib/docs/scrapers/mdn/javascript.rb b/lib/docs/scrapers/mdn/javascript.rb index 2bc5bfd4..3ba4f8b1 100644 --- a/lib/docs/scrapers/mdn/javascript.rb +++ b/lib/docs/scrapers/mdn/javascript.rb @@ -25,7 +25,9 @@ module Docs /Functions/rest_parameters /Methods_Index /Properties_Index - /Strict_mode/Transitioning_to_strict_mode) + /Strict_mode/Transitioning_to_strict_mode + /Operators/Legacy_generator_function + /Statements/Legacy_generator_function) # Duplicates options[:skip].concat %w(