Improve DOM scraper

pull/200/head
Thibaut 10 years ago
parent 018628ea7d
commit 60f6c284a0

@ -5,8 +5,11 @@ module Docs
'Battery Status' => 'Battery Status', 'Battery Status' => 'Battery Status',
'Canvas ' => 'Canvas', 'Canvas ' => 'Canvas',
'CSS Object Model' => 'CSS', 'CSS Object Model' => 'CSS',
'Cryptography' => 'Web Cryptography',
'Device Orientation' => 'Device Orientation', 'Device Orientation' => 'Device Orientation',
'Encoding' => 'Encoding', 'Encoding' => 'Encoding',
'Encrypted Media Extensions' => 'Encrypted Media',
'Fetch' => 'Fetch',
'File API' => 'File', 'File API' => 'File',
'Geolocation' => 'Geolocation', 'Geolocation' => 'Geolocation',
'Media Capture' => 'Media', 'Media Capture' => 'Media',
@ -14,30 +17,35 @@ module Docs
'MediaStream' => 'Media', 'MediaStream' => 'Media',
'Navigation Timing' => 'Navigation Timing', 'Navigation Timing' => 'Navigation Timing',
'Network Information' => 'Network Information', 'Network Information' => 'Network Information',
'Push API' => 'Push',
'Service Workers' => 'Service Workers', 'Service Workers' => 'Service Workers',
'Web Animations' => 'Animation',
'Web Audio' => 'Web Audio', 'Web Audio' => 'Web Audio',
'Web Messaging' => 'Web Messaging',
'Web Storage' => 'Web Storage', 'Web Storage' => 'Web Storage',
'Web Workers' => 'Web Workers', 'Web Workers' => 'Web Workers',
'WebRTC' => 'WebRTC' } 'WebRTC' => 'WebRTC' }
TYPE_BY_NAME_STARTS_WITH = { TYPE_BY_NAME_STARTS_WITH = {
'Audio' => 'Web Audio', 'Audio' => 'Web Audio',
'Broadcast' => 'Broadcast Channel',
'Canvas' => 'Canvas', 'Canvas' => 'Canvas',
'ChildNode' => 'Node',
'Console' => 'Console',
'CSS' => 'CSS', 'CSS' => 'CSS',
'ChildNode' => 'Node',
'console' => 'Console',
'document' => 'Document', 'document' => 'Document',
'DocumentFragment' => 'DocumentFragment', 'DocumentFragment' => 'DocumentFragment',
'DOM' => 'DOM', 'DOM' => 'DOM',
'element' => 'Element', 'element' => 'Element',
'event' => 'Event', 'event' => 'Event',
'Event' => 'Event', 'Event' => 'Event',
'Fetch' => 'Fetch',
'File' => 'File', 'File' => 'File',
'GlobalEventHandlers' => 'GlobalEventHandlers', 'GlobalEventHandlers' => 'GlobalEventHandlers',
'history' => 'History', 'history' => 'History',
'HTML' => 'Elements', 'HTML' => 'Elements',
'IDB' => 'IndexedDB', 'IDB' => 'IndexedDB',
'Location' => 'Location', 'location' => 'Location',
'navigator' => 'Navigator', 'navigator' => 'Navigator',
'MediaQuery' => 'MediaQuery', 'MediaQuery' => 'MediaQuery',
'Node' => 'Node', 'Node' => 'Node',
@ -45,6 +53,7 @@ module Docs
'ParentNode' => 'Node', 'ParentNode' => 'Node',
'Range' => 'Range', 'Range' => 'Range',
'RTC' => 'WebRTC', 'RTC' => 'WebRTC',
'screen' => 'Screen',
'Selection' => 'Selection', 'Selection' => 'Selection',
'Storage' => 'Web Storage', 'Storage' => 'Web Storage',
'StyleSheet' => 'CSS', 'StyleSheet' => 'CSS',
@ -54,13 +63,17 @@ module Docs
'TreeWalker' => 'TreeWalker', 'TreeWalker' => 'TreeWalker',
'Uint' => 'Typed Arrays', 'Uint' => 'Typed Arrays',
'URL' => 'URL', 'URL' => 'URL',
'window' => 'window', 'window' => 'Window',
'Window' => 'Window',
'XMLHttpRequest' => 'XMLHTTPRequest' } 'XMLHttpRequest' => 'XMLHTTPRequest' }
TYPE_BY_NAME_INCLUDES = { TYPE_BY_NAME_INCLUDES = {
'ChildNode' => 'Node',
'Crypto' => 'Web Cryptography',
'ImageData' => 'Canvas', 'ImageData' => 'Canvas',
'IndexedDB' => 'IndexedDB', 'IndexedDB' => 'IndexedDB',
'MediaStream' => 'Media', 'MediaStream' => 'Media',
'NodeList' => 'Node',
'Path2D' => 'Canvas', 'Path2D' => 'Canvas',
'ServiceWorker' => 'Service Workers', 'ServiceWorker' => 'Service Workers',
'TextMetrics' => 'Canvas', 'TextMetrics' => 'Canvas',
@ -129,10 +142,19 @@ module Docs
end end
end end
# Phrases looked for in a page's `.overheadIndicator` banner. When the banner
# contains any of them, `include_default_entry?` returns false.
# Frozen so the shared list cannot be mutated at runtime.
SKIP_CONTENT = [
  'not on a standards track',
  'removed from the Web',
  'not on a current W3C standards track',
  'This feature is not built into all browsers',
  'not currently supported in any browser'
].freeze

# Tells whether the current page should get its default entry.
#
# Returns true when the page type is 'Console', or when the document has no
# `.overheadIndicator` element. Otherwise returns true only if the banner text
# contains none of the SKIP_CONTENT phrases.
#
# @return [Boolean]
def include_default_entry?
  # Console pages are always kept, whatever banner they carry.
  return true if type == 'Console'

  indicator = doc.at_css('.overheadIndicator')
  return true unless indicator

  banner = indicator.content
  SKIP_CONTENT.none? { |phrase| banner.include?(phrase) }
end
end end
end end

@ -2,53 +2,34 @@ module Docs
class Dom < Mdn class Dom < Mdn
self.name = 'DOM' self.name = 'DOM'
self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/API' self.base_url = 'https://developer.mozilla.org/en-US/docs/Web/API'
self.fix_redirections = true
html_filters.push 'dom/clean_html', 'dom/entries', 'title' html_filters.push 'dom/clean_html', 'dom/entries', 'title'
options[:root_title] = 'DOM' options[:root_title] = 'DOM'
# Don't want
options[:skip] = %w( options[:skip] = %w(
/App /Reference
/CallEvent /Index
/CanvasPixelArray /Document_Object_Model
/ChromeWorker /document/createProcessingInstruction
/document.createProcessingInstruction /document/documentURIObject
/document.documentURIObject /document/loadOverlay
/document.loadOverlay /document/tooltipNode
/document.tooltipNode
/DOMErrorHandler /DOMErrorHandler
/DOMLocator /DOMLocator
/DOMObject /DOMObject
/DOMStringList /DOMStringList
/Event/Comparison_of_Event_Targets /Event/Comparison_of_Event_Targets
/FMRadio
/IDBDatabaseException /IDBDatabaseException
/IndexedDB_API/Using_JavaScript_Generators_in_Firefox /IndexedDB_API/Using_JavaScript_Generators_in_Firefox
/NamedNodeMap
/Node.baseURIObject
/Node.nodePrincipal
/Notation /Notation
/PowerManager
/PushManager
/ProcessingInstruction /ProcessingInstruction
/TCPServerSocket
/TCPSocket
/TypeInfo /TypeInfo
/Using_the_Browser_API /window/getAttention
/Web_Video_Text_Tracks_Format /window/messageManager
/WifiManager /window/updateCommands
/window.controllers /window/pkcs11)
/window.crypto
/window.getAttention
/window.messageManager
/window.navigator.addIdleObserver
/window.navigator.getDeviceStorage
/window.navigator.getDeviceStorages
/window.navigator.removeIdleObserver
/window.navigator.requestWakeLock
/window.updateCommands
/window.pkcs11)
options[:skip_patterns] = [ options[:skip_patterns] = [
/NS/, /NS/,
@ -58,69 +39,42 @@ module Docs
/webkit/i, /webkit/i,
/gamepad/i, /gamepad/i,
/UserData/, /UserData/,
/Bluetooth/,
/FMRadio/i,
/XDomainRequest/i,
/\A\/Camera/, /\A\/Camera/,
/\A\/Data_Store_API/, /\A\/Data_Store_API/,
/\A\/DataStore/, /\A\/DataStore/,
/\A\/DeviceStorage/, /\A\/DeviceStorage/,
/\A\/DocumentTouch/, /\A\/DocumentTouch/,
/\A\/document\.xml/, /\A\/document\/xml/,
/\A\/XMLDocument/, /\A\/XMLDocument/,
/\A\/DOMCursor/, /\A\/DOMCursor/,
/\A\/DOMRequest/, /\A\/DOMRequest/,
/\A\/element\.on/,
/\A\/Entity/, /\A\/Entity/,
/\A\/navigator\.id/i,
/\A\/Settings/, /\A\/Settings/,
/\A\/Telephony/, /\A\/Telephony/,
/Bluetooth/, /\A\/Window\/\w+bar/i,
/\A\/Window\.\w+bar/i,
/\A\/Apps/, /\A\/Apps/,
/\A\/Contact/, /\A\/Contact/,
/\A\/L10n/, /\A\/L10n/,
/\A\/Permission/] /\A\/Permission/]
# Broken / Empty
options[:skip].concat %w(
/Attr.isId
/document.nodePrincipal
/Event/UIEvent
/Extensions
/StyleSheetList
/SVGPoint
/Window.dispatchEvent
/Window.restore
/Window.routeEvent
/Window.QueryInterface)
# Duplicates
options[:skip].concat %w(
/Reference
/Index
/form.elements
/select.type
/table.rows
/XMLHttpRequest/FormData
/Performance.now
/Document_Object_Model)
# Rewrites a URL in place so that it uses the path spelling listed below.
#
# Returns nil without touching the URL when it contains '_' or '?';
# otherwise returns the (possibly modified) url string. Every rule replaces
# only the first occurrence, and rules run in the order listed.
options[:fix_urls] = ->(url) do
  return if url.include?('_') || url.include?('?')

  api_root = Dom.base_url

  # Legacy /DOM/ prefixes are moved under the Web/API root.
  [
    'https://developer.mozilla.org/en-US/docs/DOM/',
    'https://developer.mozilla.org/en/DOM/'
  ].each { |legacy_prefix| url.sub! legacy_prefix, "#{api_root}/" }

  # [path segment as found, path segment to use instead]
  [
    ['Console',      'console'],
    ['Document/',    'document/'],
    ['Element',      'element'],
    ['History',      'history'],
    ['Location',     'location'],
    ['Navigator',    'navigator'],
    ['Screen',       'screen'],
    ['Window/',      'window/'],
    ['notification', 'Notification'],
    ['range',        'Range'],
    ['event',        'Event']
  ].each { |found, wanted| url.sub! "#{api_root}/#{found}", "#{api_root}/#{wanted}" }

  url
end
end end

@ -11,7 +11,7 @@ module Docs
options[:trailing_slash] = false options[:trailing_slash] = false
options[:skip_link] = ->(link) { link['title'].try(:include?, 'hasn\'t been written yet'.freeze) } options[:skip_link] = ->(link) { link['title'].try(:include?, 'written'.freeze) }
options[:attribution] = <<-HTML options[:attribution] = <<-HTML
&copy; 2015 Mozilla Contributors<br> &copy; 2015 Mozilla Contributors<br>

Loading…
Cancel
Save