From b70d884598e1e14b99190e1e5c349553ff59849b Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Mon, 7 May 2018 19:45:46 -0400 Subject: Initial updates for Reader View. --- toolkit/components/reader/AboutReader.jsm | 314 ++++---- toolkit/components/reader/JSDOMParser.js | 46 +- toolkit/components/reader/Readability.js | 853 +++++++++------------ toolkit/components/reader/ReaderMode.jsm | 312 +++++--- toolkit/components/reader/ReaderWorker.js | 7 +- toolkit/components/reader/content/aboutReader.html | 73 +- toolkit/components/reader/content/aboutReader.js | 2 +- 7 files changed, 821 insertions(+), 786 deletions(-) (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/AboutReader.jsm b/toolkit/components/reader/AboutReader.jsm index 1fb9db123..ae1ff9d60 100644 --- a/toolkit/components/reader/AboutReader.jsm +++ b/toolkit/components/reader/AboutReader.jsm @@ -15,8 +15,7 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "AsyncPrefs", "resource://gre/modules/AsyncPrefs.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "NarrateControls", "resource://gre/modules/narrate/NarrateControls.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "Rect", "resource://gre/modules/Geometry.jsm"); -XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm"); -XPCOMUtils.defineLazyModuleGetter(this, "UITelemetry", "resource://gre/modules/UITelemetry.jsm"); +XPCOMUtils.defineLazyModuleGetter(this, "PluralForm", "resource://gre/modules/PluralForm.jsm"); var gStrings = Services.strings.createBundle("chrome://global/locale/aboutReader.properties"); @@ -45,45 +44,49 @@ var AboutReader = function(mm, win, articlePromise) { .getInterface(Ci.nsIDOMWindowUtils).currentInnerWindowID; this._article = null; + this._languagePromise = new Promise(resolve => { + this._foundLanguage = resolve; + }); if (articlePromise) { this._articlePromise = articlePromise; } - this._headerElementRef = Cu.getWeakReference(doc.getElementById("reader-header")); - this._domainElementRef = Cu.getWeakReference(doc.getElementById("reader-domain")); - this._titleElementRef = Cu.getWeakReference(doc.getElementById("reader-title")); - this._creditsElementRef = Cu.getWeakReference(doc.getElementById("reader-credits")); - this._contentElementRef = Cu.getWeakReference(doc.getElementById("moz-reader-content")); - this._toolbarElementRef = Cu.getWeakReference(doc.getElementById("reader-toolbar")); - this._messageElementRef = Cu.getWeakReference(doc.getElementById("reader-message")); + this._headerElementRef = Cu.getWeakReference(doc.querySelector(".reader-header")); + this._domainElementRef = Cu.getWeakReference(doc.querySelector(".reader-domain")); + this._titleElementRef = Cu.getWeakReference(doc.querySelector(".reader-title")); + this._readTimeElementRef = Cu.getWeakReference(doc.querySelector(".reader-estimated-time")); + this._creditsElementRef = Cu.getWeakReference(doc.querySelector(".reader-credits")); + this._contentElementRef = Cu.getWeakReference(doc.querySelector(".moz-reader-content")); + this._toolbarElementRef = Cu.getWeakReference(doc.querySelector(".reader-toolbar")); + this._messageElementRef = Cu.getWeakReference(doc.querySelector(".reader-message")); + this._containerElementRef = Cu.getWeakReference(doc.querySelector(".container")); this._scrollOffset = win.pageYOffset; - doc.addEventListener("click", this, false); + doc.addEventListener("click", this); - win.addEventListener("pagehide", this, false); - win.addEventListener("scroll", this, false); - win.addEventListener("resize", this, false); + win.addEventListener("pagehide", this); + win.addEventListener("scroll", this); + win.addEventListener("resize", this); Services.obs.addObserver(this, "inner-window-destroyed", false); - doc.addEventListener("visibilitychange", this, false); + doc.addEventListener("visibilitychange", this); this._setupStyleDropdown(); this._setupButton("close-button", this._onReaderClose.bind(this), "aboutReader.toolbar.close"); - const gIsFirefoxDesktop = Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}"; - if (gIsFirefoxDesktop) { - // we're ready for any external setup, send a signal for that. - this._mm.sendAsyncMessage("Reader:OnSetup"); - } + // we're ready for any external setup, send a signal for that. + this._mm.sendAsyncMessage("Reader:OnSetup"); let colorSchemeValues = JSON.parse(Services.prefs.getCharPref("reader.color_scheme.values")); let colorSchemeOptions = colorSchemeValues.map((value) => { - return { name: gStrings.GetStringFromName("aboutReader.colorScheme." + value), - value: value, - itemClass: value + "-button" }; + return { + name: gStrings.GetStringFromName("aboutReader.colorScheme." + value), + value, + itemClass: value + "-button" + }; }); let colorScheme = Services.prefs.getCharPref("reader.color_scheme"); @@ -114,7 +117,7 @@ var AboutReader = function(mm, win, articlePromise) { this._setupLineHeightButtons(); if (win.speechSynthesis && Services.prefs.getBoolPref("narrate.enabled")) { - new NarrateControls(mm, win); + new NarrateControls(mm, win, this._languagePromise); } this._loadArticle(); @@ -146,6 +149,10 @@ AboutReader.prototype = { return this._titleElementRef.get(); }, + get _readTimeElement() { + return this._readTimeElementRef.get(); + }, + get _creditsElement() { return this._creditsElementRef.get(); }, @@ -162,6 +169,10 @@ AboutReader.prototype = { return this._messageElementRef.get(); }, + get _containerElement() { + return this._containerElementRef.get(); + }, + get _isToolbarVertical() { if (this._toolbarVertical !== undefined) { return this._toolbarVertical; @@ -178,7 +189,7 @@ AboutReader.prototype = { return _viewId; }, - receiveMessage: function (message) { + receiveMessage (message) { switch (message.name) { // Triggered by Android user pressing BACK while the banner font-dropdown is open. case "Reader:CloseDropdown": { @@ -191,14 +202,14 @@ AboutReader.prototype = { if (message.data.id && message.data.image && !this._doc.getElementById(message.data.id)) { let btn = this._doc.createElement("button"); - btn.setAttribute("class", "button"); + btn.setAttribute("class", "button " + message.data.id); btn.setAttribute("style", "background-image: url('" + message.data.image + "')"); btn.setAttribute("id", message.data.id); if (message.data.title) btn.setAttribute("title", message.data.title); if (message.data.text) btn.textContent = message.data.text; - let tb = this._doc.getElementById("reader-toolbar"); + let tb = this._toolbarElement; tb.appendChild(btn); this._setupButton(message.data.id, button => { this._mm.sendAsyncMessage("Reader:Clicked-" + button.getAttribute("id"), { article: this._article }); @@ -220,18 +231,21 @@ AboutReader.prototype = { } }, - handleEvent: function(aEvent) { + handleEvent(aEvent) { if (!aEvent.isTrusted) return; switch (aEvent.type) { case "click": let target = aEvent.target; - if (target.classList.contains('dropdown-toggle')) { + if (target.classList.contains("dropdown-toggle")) { this._toggleDropdownClicked(aEvent); - } else if (!target.closest('.dropdown-popup')) { + } else if (!target.closest(".dropdown-popup")) { this._closeDropdowns(); } + if (target.tagName == "A" && !target.classList.contains("reader-domain")) { + this._linkClicked(aEvent); + } break; case "scroll": this._closeDropdowns(true); @@ -243,7 +257,7 @@ AboutReader.prototype = { this._updateImageMargins(); if (this._isToolbarVertical) { this._win.setTimeout(() => { - for (let dropdown of this._doc.querySelectorAll('.dropdown.open')) { + for (let dropdown of this._doc.querySelectorAll(".dropdown.open")) { this._updatePopupPosition(dropdown); } }, 0); @@ -271,12 +285,12 @@ AboutReader.prototype = { } }, - observe: function(subject, topic, data) { + observe(subject, topic, data) { if (subject.QueryInterface(Ci.nsISupportsPRUint64).data != this._innerWindowId) { return; } - Services.obs.removeObserver(this, "inner-window-destroyed", false); + Services.obs.removeObserver(this, "inner-window-destroyed"); this._mm.removeMessageListener("Reader:CloseDropdown", this); this._mm.removeMessageListener("Reader:AddButton", this); @@ -284,12 +298,12 @@ AboutReader.prototype = { this._windowUnloaded = true; }, - _onReaderClose: function() { + _onReaderClose() { ReaderMode.leaveReaderMode(this._mm.docShell, this._win); }, - _setFontSize: function(newFontSize) { - let containerClasses = this._doc.getElementById("container").classList; + _setFontSize(newFontSize) { + let containerClasses = this._containerElement.classList; if (this._fontSize > 0) containerClasses.remove("font-size" + this._fontSize); @@ -299,19 +313,19 @@ AboutReader.prototype = { return AsyncPrefs.set("reader.font_size", this._fontSize); }, - _setupFontSizeButtons: function() { + _setupFontSizeButtons() { const FONT_SIZE_MIN = 1; const FONT_SIZE_MAX = 9; // Sample text shown in Android UI. - let sampleText = this._doc.getElementById("font-size-sample"); + let sampleText = this._doc.querySelector(".font-size-sample"); sampleText.textContent = gStrings.GetStringFromName("aboutReader.fontTypeSample"); let currentSize = Services.prefs.getIntPref("reader.font_size"); currentSize = Math.max(FONT_SIZE_MIN, Math.min(FONT_SIZE_MAX, currentSize)); - let plusButton = this._doc.getElementById("font-size-plus"); - let minusButton = this._doc.getElementById("font-size-minus"); + let plusButton = this._doc.querySelector(".plus-button"); + let minusButton = this._doc.querySelector(".minus-button"); function updateControls() { if (currentSize === FONT_SIZE_MIN) { @@ -360,8 +374,8 @@ AboutReader.prototype = { }, true); }, - _setContentWidth: function(newContentWidth) { - let containerClasses = this._doc.getElementById("container").classList; + _setContentWidth(newContentWidth) { + let containerClasses = this._containerElement.classList; if (this._contentWidth > 0) containerClasses.remove("content-width" + this._contentWidth); @@ -371,15 +385,15 @@ AboutReader.prototype = { return AsyncPrefs.set("reader.content_width", this._contentWidth); }, - _setupContentWidthButtons: function() { + _setupContentWidthButtons() { const CONTENT_WIDTH_MIN = 1; const CONTENT_WIDTH_MAX = 9; let currentContentWidth = Services.prefs.getIntPref("reader.content_width"); currentContentWidth = Math.max(CONTENT_WIDTH_MIN, Math.min(CONTENT_WIDTH_MAX, currentContentWidth)); - let plusButton = this._doc.getElementById("content-width-plus"); - let minusButton = this._doc.getElementById("content-width-minus"); + let plusButton = this._doc.querySelector(".content-width-plus-button"); + let minusButton = this._doc.querySelector(".content-width-minus-button"); function updateControls() { if (currentContentWidth === CONTENT_WIDTH_MIN) { @@ -428,8 +442,8 @@ AboutReader.prototype = { }, true); }, - _setLineHeight: function(newLineHeight) { - let contentClasses = this._doc.getElementById("moz-reader-content").classList; + _setLineHeight(newLineHeight) { + let contentClasses = this._contentElement.classList; if (this._lineHeight > 0) contentClasses.remove("line-height" + this._lineHeight); @@ -439,15 +453,15 @@ AboutReader.prototype = { return AsyncPrefs.set("reader.line_height", this._lineHeight); }, - _setupLineHeightButtons: function() { + _setupLineHeightButtons() { const LINE_HEIGHT_MIN = 1; const LINE_HEIGHT_MAX = 9; let currentLineHeight = Services.prefs.getIntPref("reader.line_height"); currentLineHeight = Math.max(LINE_HEIGHT_MIN, Math.min(LINE_HEIGHT_MAX, currentLineHeight)); - let plusButton = this._doc.getElementById("line-height-plus"); - let minusButton = this._doc.getElementById("line-height-minus"); + let plusButton = this._doc.querySelector(".line-height-plus-button"); + let minusButton = this._doc.querySelector(".line-height-minus-button"); function updateControls() { if (currentLineHeight === LINE_HEIGHT_MIN) { @@ -496,7 +510,7 @@ AboutReader.prototype = { }, true); }, - _handleDeviceLight: function(newLux) { + _handleDeviceLight(newLux) { // Desired size of the this._luxValues array. let luxValuesSize = 10; // Add new lux value at the front of the array. @@ -513,7 +527,7 @@ AboutReader.prototype = { return; } // Holds the average of the lux values collected in this._luxValues. - let averageLuxValue = this._totalLux/luxValuesSize; + let averageLuxValue = this._totalLux / luxValuesSize; this._updateColorScheme(averageLuxValue); // Pop the oldest value off the array. @@ -522,7 +536,7 @@ AboutReader.prototype = { this._totalLux -= oldLux; }, - _handleVisibilityChange: function() { + _handleVisibilityChange() { let colorScheme = Services.prefs.getCharPref("reader.color_scheme"); if (colorScheme != "auto") { return; @@ -533,19 +547,19 @@ AboutReader.prototype = { }, // Setup or teardown the ambient light tracking system. - _enableAmbientLighting: function(enable) { + _enableAmbientLighting(enable) { if (enable) { - this._win.addEventListener("devicelight", this, false); + this._win.addEventListener("devicelight", this); this._luxValues = []; this._totalLux = 0; } else { - this._win.removeEventListener("devicelight", this, false); + this._win.removeEventListener("devicelight", this); delete this._luxValues; delete this._totalLux; } }, - _updateColorScheme: function(luxValue) { + _updateColorScheme(luxValue) { // Upper bound value for "dark" color scheme beyond which it changes to "light". let upperBoundDark = 50; // Lower bound value for "light" color scheme beyond which it changes to "dark". @@ -564,7 +578,7 @@ AboutReader.prototype = { this._setColorScheme("light"); }, - _setColorScheme: function(newColorScheme) { + _setColorScheme(newColorScheme) { // "auto" is not a real color scheme if (this._colorScheme === newColorScheme || newColorScheme === "auto") return; @@ -580,14 +594,14 @@ AboutReader.prototype = { // Pref values include "dark", "light", and "auto", which automatically switches // between light and dark color schemes based on the ambient light level. - _setColorSchemePref: function(colorSchemePref) { + _setColorSchemePref(colorSchemePref) { this._enableAmbientLighting(colorSchemePref === "auto"); this._setColorScheme(colorSchemePref); AsyncPrefs.set("reader.color_scheme", colorSchemePref); }, - _setFontType: function(newFontType) { + _setFontType(newFontType) { if (this._fontType === newFontType) return; @@ -602,20 +616,38 @@ AboutReader.prototype = { AsyncPrefs.set("reader.font_type", this._fontType); }, - _setSystemUIVisibility: function(visible) { + _setSystemUIVisibility(visible) { this._mm.sendAsyncMessage("Reader:SystemUIVisibility", { visible: visible }); }, - _loadArticle: Task.async(function* () { + _setToolbarVisibility(visible) { + let tb = this._toolbarElement; + + if (visible) { + if (tb.style.opacity != "1") { + tb.removeAttribute("hidden"); + tb.style.opacity = "1"; + } + } else if (tb.style.opacity != "0") { + tb.addEventListener("transitionend", evt => { + if (tb.style.opacity == "0") { + tb.setAttribute("hidden", ""); + } + }, { once: true }); + tb.style.opacity = "0"; + } + }, + + async _loadArticle() { let url = this._getOriginalUrl(); this._showProgressDelayed(); let article; if (this._articlePromise) { - article = yield this._articlePromise; + article = await this._articlePromise; } else { try { - article = yield this._getArticle(url); + article = await this._getArticle(url); } catch (e) { if (e && e.newURL) { let readerURL = "about:reader?url=" + encodeURIComponent(e.newURL); @@ -638,24 +670,26 @@ AboutReader.prototype = { } this._showContent(article); - }), - - _getArticle: function(url) { - return new Promise((resolve, reject) => { - let listener = (message) => { - this._mm.removeMessageListener("Reader:ArticleData", listener); - if (message.data.newURL) { - reject({ newURL: message.data.newURL }); - return; - } - resolve(message.data.article); - }; - this._mm.addMessageListener("Reader:ArticleData", listener); - this._mm.sendAsyncMessage("Reader:ArticleGet", { url: url }); - }); }, - _requestFavicon: function() { + _getArticle(url) { + // return new Promise((resolve, reject) => { + // let listener = (message) => { + // this._mm.removeMessageListener("Reader:ArticleData", listener); + // if (message.data.newURL) { + // reject({ newURL: message.data.newURL }); + // return; + // } + // resolve(message.data.article); + // }; + // this._mm.addMessageListener("Reader:ArticleData", listener); + // this._mm.sendAsyncMessage("Reader:ArticleGet", { url }); + // }); + //} + return ReaderMode.downloadAndParseDocument(url); + }, + + _requestFavicon() { let handleFaviconReturn = (message) => { this._mm.removeMessageListener("Reader:FaviconReturn", handleFaviconReturn); this._loadFavicon(message.data.url, message.data.faviconUrl); @@ -665,20 +699,20 @@ AboutReader.prototype = { this._mm.sendAsyncMessage("Reader:FaviconRequest", { url: this._article.url }); }, - _loadFavicon: function(url, faviconUrl) { + _loadFavicon(url, faviconUrl) { if (this._article.url !== url) return; let doc = this._doc; - let link = doc.createElement('link'); - link.rel = 'shortcut icon'; + let link = doc.createElement("link"); + link.rel = "shortcut icon"; link.href = faviconUrl; - doc.getElementsByTagName('head')[0].appendChild(link); + doc.getElementsByTagName("head")[0].appendChild(link); }, - _updateImageMargins: function() { + _updateImageMargins() { let windowWidth = this._win.innerWidth; let bodyWidth = this._doc.body.clientWidth; @@ -691,7 +725,7 @@ AboutReader.prototype = { } // If the image is at least half as wide as the body, center it on desktop. - if (img.naturalWidth >= bodyWidth/2) { + if (img.naturalWidth >= bodyWidth / 2) { img.setAttribute("moz-reader-center", true); } else { img.removeAttribute("moz-reader-center"); @@ -713,30 +747,32 @@ AboutReader.prototype = { }, _maybeSetTextDirection: function Read_maybeSetTextDirection(article) { - if (!article.dir) - return; + if (article.dir) { + // Set "dir" attribute on content + this._contentElement.setAttribute("dir", article.dir); + this._headerElement.setAttribute("dir", article.dir); + + // The native locale could be set differently than the article's text direction. + var localeDirection = Services.locale.isAppLocaleRTL ? "rtl" : "ltr"; + this._readTimeElement.setAttribute("dir", localeDirection); + this._readTimeElement.style.textAlign = article.dir == "rtl" ? "right" : "left"; + } + }, + + _formatReadTime(slowEstimate, fastEstimate) { + let displayStringKey = "aboutReader.estimatedReadTimeRange1"; - // Set "dir" attribute on content - this._contentElement.setAttribute("dir", article.dir); - this._headerElement.setAttribute("dir", article.dir); - }, - - _fixLocalLinks() { - // We need to do this because preprocessing the content through nsIParserUtils - // gives back a DOM with a element. That influences how these URLs get - // resolved, making them no longer match the document URI (which is - // about:reader?url=...). To fix this, make all the hash URIs absolute. This - // is hacky, but the alternative of removing the base element has potential - // security implications if Readability has not successfully made all the URLs - // absolute, so we pick just fixing these in-document links explicitly. - let localLinks = this._contentElement.querySelectorAll("a[href^='#']"); - for (let localLink of localLinks) { - // Have to get the attribute because .href provides an absolute URI. - localLink.href = this._doc.documentURI + localLink.getAttribute("href"); + // only show one reading estimate when they are the same value + if (slowEstimate == fastEstimate) { + displayStringKey = "aboutReader.estimatedReadTimeValue1"; } + + return PluralForm.get(slowEstimate, gStrings.GetStringFromName(displayStringKey)) + .replace("#1", fastEstimate) + .replace("#2", slowEstimate); }, - _showError: function() { + _showError() { this._headerElement.style.display = "none"; this._contentElement.style.display = "none"; @@ -746,11 +782,16 @@ AboutReader.prototype = { this._doc.title = errorMessage; + this._doc.documentElement.dataset.isError = true; + this._error = true; + + this._doc.dispatchEvent( + new this._win.CustomEvent("AboutReaderContentError", { bubbles: true, cancelable: false })); }, // This function is the JS version of Java's StringUtils.stripCommonSubdomains. - _stripHost: function(host) { + _stripHost(host) { if (!host) return host; @@ -766,17 +807,18 @@ AboutReader.prototype = { return host.substring(start); }, - _showContent: function(article) { + _showContent(article) { this._messageElement.style.display = "none"; this._article = article; this._domainElement.href = article.url; - let articleUri = Services.io.newURI(article.url, null, null); + let articleUri = Services.io.newURI(article.url); this._domainElement.textContent = this._stripHost(articleUri.host); this._creditsElement.textContent = article.byline; this._titleElement.textContent = article.title; + this._readTimeElement.textContent = this._formatReadTime(article.readingTimeMinsSlow, article.readingTimeMinsFast); this._doc.title = article.title; this._headerElement.style.display = "block"; @@ -787,8 +829,8 @@ AboutReader.prototype = { false, articleUri, this._contentElement); this._contentElement.innerHTML = ""; this._contentElement.appendChild(contentFragment); - this._fixLocalLinks(); this._maybeSetTextDirection(article); + this._foundLanguage(article.language); this._contentElement.style.display = "block"; this._updateImageMargins(); @@ -804,13 +846,13 @@ AboutReader.prototype = { new this._win.CustomEvent("AboutReaderContentReady", { bubbles: true, cancelable: false })); }, - _hideContent: function() { + _hideContent() { this._headerElement.style.display = "none"; this._contentElement.style.display = "none"; }, - _showProgressDelayed: function() { - this._win.setTimeout(function() { + _showProgressDelayed() { + this._win.setTimeout(() => { // No need to show progress if the article has been loaded, // if the window has been unloaded, or if there was an error // trying to load the article. @@ -823,20 +865,20 @@ AboutReader.prototype = { this._messageElement.textContent = gStrings.GetStringFromName("aboutReader.loading2"); this._messageElement.style.display = "block"; - }.bind(this), 300); + }, 300); }, /** * Returns the original article URL for this about:reader view. */ - _getOriginalUrl: function(win) { + _getOriginalUrl(win) { let url = win ? win.location.href : this._win.location.href; return ReaderMode.getOriginalUrl(url) || url; }, - _setupSegmentedButton: function(id, options, initialValue, callback) { + _setupSegmentedButton(id, options, initialValue, callback) { let doc = this._doc; - let segmentedButton = doc.getElementById(id); + let segmentedButton = doc.getElementsByClassName(id)[0]; for (let i = 0; i < options.length; i++) { let option = options[i]; @@ -867,10 +909,6 @@ AboutReader.prototype = { aEvent.stopPropagation(); - // Just pass the ID of the button as an extra and hope the ID doesn't change - // unless the context changes - UITelemetry.addEvent("action.1", "button", null, id); - let items = segmentedButton.children; for (let j = items.length - 1; j >= 0; j--) { items[j].classList.remove("selected"); @@ -878,19 +916,19 @@ AboutReader.prototype = { item.classList.add("selected"); callback(option.value); - }.bind(this), true); + }, true); if (option.value === initialValue) item.classList.add("selected"); } }, - _setupButton: function(id, callback, titleEntity, textEntity) { + _setupButton(id, callback, titleEntity, textEntity) { if (titleEntity) { this._setButtonTip(id, titleEntity); } - let button = this._doc.getElementById(id); + let button = this._doc.getElementsByClassName(id)[0]; if (textEntity) { button.textContent = gStrings.GetStringFromName(textEntity); } @@ -910,17 +948,17 @@ AboutReader.prototype = { * and dynamically as button state changes. * @param Localizable string providing UI element usage tip. */ - _setButtonTip: function(id, titleEntity) { - let button = this._doc.getElementById(id); + _setButtonTip(id, titleEntity) { + let button = this._doc.getElementsByClassName(id)[0]; button.setAttribute("title", gStrings.GetStringFromName(titleEntity)); }, - _setupStyleDropdown: function() { - let dropdownToggle = this._doc.querySelector("#style-dropdown .dropdown-toggle"); + _setupStyleDropdown() { + let dropdownToggle = this._doc.querySelector(".style-dropdown .dropdown-toggle"); dropdownToggle.setAttribute("title", gStrings.GetStringFromName("aboutReader.toolbar.typeControls")); }, - _updatePopupPosition: function(dropdown) { + _updatePopupPosition(dropdown) { let dropdownToggle = dropdown.querySelector(".dropdown-toggle"); let dropdownPopup = dropdown.querySelector(".dropdown-popup"); @@ -931,8 +969,8 @@ AboutReader.prototype = { dropdownPopup.style.top = popupTop + "px"; }, - _toggleDropdownClicked: function(event) { - let dropdown = event.target.closest('.dropdown'); + _toggleDropdownClicked(event) { + let dropdown = event.target.closest(".dropdown"); if (!dropdown) return; @@ -952,7 +990,7 @@ AboutReader.prototype = { /* * If the ReaderView banner font-dropdown is closed, open it. */ - _openDropdown: function(dropdown) { + _openDropdown(dropdown) { if (dropdown.classList.contains("open")) { return; } @@ -969,7 +1007,7 @@ AboutReader.prototype = { * dropdowns because the page is scrolling, allow popups to stay open with * the keep-open class. */ - _closeDropdowns: function(scrolling) { + _closeDropdowns(scrolling) { let selector = ".dropdown.open"; if (scrolling) { selector += ":not(.keep-open)"; @@ -986,6 +1024,18 @@ AboutReader.prototype = { } }, + /* + * Override link handling for same-page references so we don't exit Reader View. + */ + _linkClicked(event) { + var originalUrl = Services.io.newURI(this._getOriginalUrl(), null, null); + var targetUrl = Services.io.newURI(event.target.href, null, null); + if (originalUrl.specIgnoringRef == targetUrl.specIgnoringRef) { + event.preventDefault(); + this._goToReference(targetUrl.ref); + } + }, + /* * Scroll reader view to a reference */ diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js index 853649775..38f59c4ea 100644 --- a/toolkit/components/reader/JSDOMParser.js +++ b/toolkit/components/reader/JSDOMParser.js @@ -1017,46 +1017,6 @@ } }, - readScript: function (node) { - while (this.currentChar < this.html.length) { - var c = this.nextChar(); - var nextC = this.peekNext(); - if (c === "<") { - if (nextC === "!" || nextC === "?") { - // We're still before the ! or ? that is starting this comment: - this.currentChar++; - node.appendChild(this.discardNextComment()); - continue; - } - if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") { - // Go back before the '<' so we find the end tag. - this.currentChar--; - // Done with this script tag, the caller will close: - return; - } - } - // Either c wasn't a '<' or it was but we couldn't find either a comment - // or a closing script tag, so we should just parse as text until the next one - // comes along: - - var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE; - var textNode = haveTextNode ? node.lastChild : new Text(); - var n = this.html.indexOf("<", this.currentChar); - // Decrement this to include the current character *afterwards* so we don't get stuck - // looking for the same < all the time. - this.currentChar--; - if (n === -1) { - textNode.innerHTML += this.html.substring(this.currentChar, this.html.length); - this.currentChar = this.html.length; - } else { - textNode.innerHTML += this.html.substring(this.currentChar, n); - this.currentChar = n; - } - if (!haveTextNode) - node.appendChild(textNode); - } - }, - discardNextComment: function() { if (this.match("--")) { this.discardTo("-->"); @@ -1131,11 +1091,7 @@ // If this isn't a void Element, read its child nodes if (!closed) { - if (localName == "script") { - this.readScript(node); - } else { - this.readChildren(node); - } + this.readChildren(node); var closingTag = ""; if (!this.match(closingTag)) { this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length)); diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 491461a8e..04949dc61 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -38,32 +38,22 @@ function Readability(uri, doc, options) { this._uri = uri; this._doc = doc; - this._biggestFrame = false; + this._articleTitle = null; this._articleByline = null; this._articleDir = null; - // Configureable options + // Configurable options this._debug = !!options.debug; this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; - this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES; + this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD; + this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); // Start with all flags set this._flags = this.FLAG_STRIP_UNLIKELYS | this.FLAG_WEIGHT_CLASSES | this.FLAG_CLEAN_CONDITIONALLY; - // The list of pages we've parsed in this call of readability, - // for autopaging. As a key store for easier searching. - this._parsedPages = {}; - - // A list of the ETag headers of pages we've parsed, in case they happen to match, - // we'll know it's a duplicate. - this._pageETags = {}; - - // Make an AJAX request for each page and append it to the document. - this._curPageNum = 1; - var logEl; // Control whether log messages are sent to the console @@ -82,12 +72,12 @@ function Readability(uri, doc, options) { return rv + elDesc; }; this.log = function () { - if (typeof dump !== undefined) { + if (typeof dump !== "undefined") { var msg = Array.prototype.map.call(arguments, function(x) { return (x && x.nodeName) ? logEl(x) : x; }).join(" "); dump("Reader: (Readability) " + msg + "\n"); - } else if (typeof console !== undefined) { + } else if (typeof console !== "undefined") { var args = ["Reader: (Readability) "].concat(arguments); console.log.apply(console, args); } @@ -109,20 +99,19 @@ Readability.prototype = { // tight the competition is among candidates. DEFAULT_N_TOP_CANDIDATES: 5, - // The maximum number of pages to loop through before we call - // it quits and just show a link. - DEFAULT_MAX_PAGES: 5, - // Element tags to score by default. DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), + // The default number of words an article must have in order to return a result + DEFAULT_WORD_THRESHOLD: 500, + // All of the regular expressions in use within readability. // Defined up here so we don't instantiate them repeatedly in loops. REGEXPS: { - unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i, + unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|main|shadow/i, positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, @@ -138,6 +127,13 @@ Readability.prototype = { ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], + PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], + + DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], + + // These are the classes that readability sets itself. + CLASSES_TO_PRESERVE: [ "readability-styled", "page" ], + /** * Run any post-process modifications to article content as necessary. * @@ -147,6 +143,9 @@ Readability.prototype = { _postProcessContent: function(articleContent) { // Readability cannot open relative uris so we convert them to absolute uris. this._fixRelativeUris(articleContent); + + // Remove classes. + this._cleanClasses(articleContent); }, /** @@ -155,8 +154,8 @@ Readability.prototype = { * * If function is not passed, removes all the nodes in node list. * - * @param NodeList nodeList The no - * @param Function filterFn + * @param NodeList nodeList The nodes to operate on + * @param Function filterFn the function to use as a filter * @return void */ _removeNodes: function(nodeList, filterFn) { @@ -171,6 +170,20 @@ Readability.prototype = { } }, + /** + * Iterates over a NodeList, and calls _setNodeTag for each node. + * + * @param NodeList nodeList The nodes to operate on + * @param String newTagName the new tag name to use + * @return void + */ + _replaceNodeTags: function(nodeList, newTagName) { + for (var i = nodeList.length - 1; i >= 0; i--) { + var node = nodeList[i]; + this._setNodeTag(node, newTagName); + } + }, + /** * Iterate over a NodeList, which doesn't natively fully implement the Array * interface. @@ -180,10 +193,9 @@ Readability.prototype = { * * @param NodeList nodeList The NodeList. * @param Function fn The iterate function. - * @param Boolean backward Whether to use backward iteration. * @return void */ - _forEachNode: function(nodeList, fn, backward) { + _forEachNode: function(nodeList, fn) { Array.prototype.forEach.call(nodeList, fn, this); }, @@ -227,6 +239,34 @@ Readability.prototype = { })); }, + /** + * Removes the class="" attribute from every element in the given + * subtree, except those that match CLASSES_TO_PRESERVE and + * the classesToPreserve array from the options object. + * + * @param Element + * @return void + */ + _cleanClasses: function(node) { + var classesToPreserve = this._classesToPreserve; + var className = (node.getAttribute("class") || "") + .split(/\s+/) + .filter(function(cls) { + return classesToPreserve.indexOf(cls) != -1; + }) + .join(" "); + + if (className) { + node.setAttribute("class", className); + } else { + node.removeAttribute("class"); + } + + for (node = node.firstElementChild; node; node = node.nextElementSibling) { + this._cleanClasses(node); + } + }, + /** * Converts each and uri in the given element to an absolute URI, * ignoring #ref URIs. @@ -307,11 +347,20 @@ Readability.prototype = { curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); } catch (e) {/* ignore exceptions setting the title. */} - if (curTitle.match(/ [\|\-] /)) { - curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1'); + var titleHadHierarchicalSeparators = false; + function wordCount(str) { + return str.split(/\s+/).length; + } + + // If there's a separator in the title, first remove the final part + if ((/ [\|\-\\\/>»] /).test(curTitle)) { + titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); + curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1'); - if (curTitle.split(' ').length < 3) - curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1'); + // If the resulting title is too short (3 words or fewer), remove + // the first part instead: + if (wordCount(curTitle) < 3) + curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1'); } else if (curTitle.indexOf(': ') !== -1) { // Check if we have an heading containing this exact string, so we // could assume it's the full title. @@ -328,8 +377,13 @@ Readability.prototype = { curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1); // If the title is now too short, try the first colon instead: - if (curTitle.split(' ').length < 3) + if (wordCount(curTitle) < 3) { curTitle = origTitle.substring(origTitle.indexOf(':') + 1); + // But if we have too many words before the colon there's something weird + // with the titles and the H tags so let's just use the original title instead + } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) { + curTitle = origTitle; + } } } else if (curTitle.length > 150 || curTitle.length < 15) { var hOnes = doc.getElementsByTagName('h1'); @@ -339,9 +393,16 @@ Readability.prototype = { } curTitle = curTitle.trim(); - - if (curTitle.split(' ').length <= 4) + // If we now have 4 words or fewer as our title, and either no + // 'hierarchical' separators (\, /, > or ») were found in the original + // title or we decreased the number of words by more than 1 word, use + // the original title. + var curTitleWordCount = wordCount(curTitle); + if (curTitleWordCount <= 4 && + (!titleHadHierarchicalSeparators || + curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { curTitle = origTitle; + } return curTitle; }, @@ -362,9 +423,7 @@ Readability.prototype = { this._replaceBrs(doc.body); } - this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) { - this._setNodeTag(fontNode, "SPAN"); - }); + this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN"); }, /** @@ -464,19 +523,49 @@ Readability.prototype = { _prepArticle: function(articleContent) { this._cleanStyles(articleContent); + // Check for data tables before we continue, to avoid removing items in + // those tables, which will often be isolated even though they're + // visually linked to other content-ful elements (text, images, etc.). + this._markDataTables(articleContent); + // Clean out junk from the article content this._cleanConditionally(articleContent, "form"); + this._cleanConditionally(articleContent, "fieldset"); this._clean(articleContent, "object"); this._clean(articleContent, "embed"); this._clean(articleContent, "h1"); this._clean(articleContent, "footer"); - // If there is only one h2, they are probably using it as a header - // and not a subheader, so remove it since we already have a header. - if (articleContent.getElementsByTagName('h2').length === 1) - this._clean(articleContent, "h2"); + // Clean out elements have "share" in their id/class combinations from final top candidates, + // which means we don't remove the top candidates even they have "share". + this._forEachNode(articleContent.children, function(topCandidate) { + this._cleanMatchedNodes(topCandidate, /share/); + }); + + // If there is only one h2 and its text content substantially equals article title, + // they are probably using it as a header and not a subheader, + // so remove it since we already extract the title separately. + var h2 = articleContent.getElementsByTagName('h2'); + if (h2.length === 1) { + var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length; + if (Math.abs(lengthSimilarRate) < 0.5) { + var titlesMatch = false; + if (lengthSimilarRate > 0) { + titlesMatch = h2[0].textContent.includes(this._articleTitle); + } else { + titlesMatch = this._articleTitle.includes(h2[0].textContent); + } + if (titlesMatch) { + this._clean(articleContent, "h2"); + } + } + } this._clean(articleContent, "iframe"); + this._clean(articleContent, "input"); + this._clean(articleContent, "textarea"); + this._clean(articleContent, "select"); + this._clean(articleContent, "button"); this._cleanHeaders(articleContent); // Do these last as the previous stuff may have removed junk @@ -662,9 +751,6 @@ Readability.prototype = { var pageCacheHtml = page.innerHTML; - // Check if any "dir" is set on the toplevel document element - this._articleDir = doc.documentElement.getAttribute("dir"); - while (true) { var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); @@ -695,6 +781,15 @@ Readability.prototype = { } } + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). + if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || + node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || + node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && + this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } + if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { elementsToScore.push(node); } @@ -709,13 +804,14 @@ Readability.prototype = { var newNode = node.children[0]; node.parentNode.replaceChild(newNode, node); node = newNode; + elementsToScore.push(node); } else if (!this._hasChildBlockElement(node)) { node = this._setNodeTag(node, "P"); elementsToScore.push(node); } else { // EXPERIMENTAL this._forEachNode(node.childNodes, function(childNode) { - if (childNode.nodeType === Node.TEXT_NODE) { + if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) { var p = doc.createElement('p'); p.textContent = childNode.textContent; p.style.display = 'inline'; @@ -812,6 +908,7 @@ Readability.prototype = { var topCandidate = topCandidates[0] || null; var neededToCreateTopCandidate = false; + var parentOfTopCandidate; // If we still have no top candidate, just use the body as a last resort. // We also have to copy the body node so it is something we can modify. @@ -831,6 +928,33 @@ Readability.prototype = { this._initializeNode(topCandidate); } else if (topCandidate) { + // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array + // and whose scores are quite closed with current `topCandidate` node. + var alternativeCandidateAncestors = []; + for (var i = 1; i < topCandidates.length; i++) { + if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { + alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); + } + } + var MINIMUM_TOPCANDIDATES = 3; + if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName !== "BODY") { + var listsContainingThisAncestor = 0; + for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { + listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); + } + if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { + topCandidate = parentOfTopCandidate; + break; + } + parentOfTopCandidate = parentOfTopCandidate.parentNode; + } + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } + // Because of our bonus system, parents of candidates might have scores // themselves. They get half of the node. There won't be nodes with higher // scores than our topCandidate, but if we see the score going *up* in the first @@ -838,11 +962,15 @@ Readability.prototype = { // lurking in other places that we want to unify in. The sibling stuff // below does some of that - but only if we've looked high enough up the DOM // tree. - var parentOfTopCandidate = topCandidate.parentNode; + parentOfTopCandidate = topCandidate.parentNode; var lastScore = topCandidate.readability.contentScore; // The scores shouldn't get too low. var scoreThreshold = lastScore / 3; - while (parentOfTopCandidate && parentOfTopCandidate.readability) { + while (parentOfTopCandidate.tagName !== "BODY") { + if (!parentOfTopCandidate.readability) { + parentOfTopCandidate = parentOfTopCandidate.parentNode; + continue; + } var parentScore = parentOfTopCandidate.readability.contentScore; if (parentScore < scoreThreshold) break; @@ -854,6 +982,17 @@ Readability.prototype = { lastScore = parentOfTopCandidate.readability.contentScore; parentOfTopCandidate = parentOfTopCandidate.parentNode; } + + // If the top candidate is the only child, use parent instead. This will help sibling + // joining logic when adjacent content is actually located in parent's sibling node. + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { + topCandidate = parentOfTopCandidate; + parentOfTopCandidate = topCandidate.parentNode; + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } } // Now that we have the top candidate, look through its siblings for content @@ -864,7 +1003,9 @@ Readability.prototype = { articleContent.id = "readability-content"; var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); - var siblings = topCandidate.parentNode.children; + // Keep potential top candidate's parent node to try to get text direction of it later. + parentOfTopCandidate = topCandidate.parentNode; + var siblings = parentOfTopCandidate.children; for (var s = 0, sl = siblings.length; s < sl; s++) { var sibling = siblings[s]; @@ -927,24 +1068,22 @@ Readability.prototype = { if (this._debug) this.log("Article content post-prep: " + articleContent.innerHTML); - if (this._curPageNum === 1) { - if (neededToCreateTopCandidate) { - // We already created a fake div thing, and there wouldn't have been any siblings left - // for the previous loop, so there's no point trying to create a new div, and then - // move all the children over. Just assign IDs and class names here. No need to append - // because that already happened anyway. - topCandidate.id = "readability-page-1"; - topCandidate.className = "page"; - } else { - var div = doc.createElement("DIV"); - div.id = "readability-page-1"; - div.className = "page"; - var children = articleContent.childNodes; - while (children.length) { - div.appendChild(children[0]); - } - articleContent.appendChild(div); + if (neededToCreateTopCandidate) { + // We already created a fake div thing, and there wouldn't have been any siblings left + // for the previous loop, so there's no point trying to create a new div, and then + // move all the children over. Just assign IDs and class names here. No need to append + // because that already happened anyway. + topCandidate.id = "readability-page-1"; + topCandidate.className = "page"; + } else { + var div = doc.createElement("DIV"); + div.id = "readability-page-1"; + div.className = "page"; + var children = articleContent.childNodes; + while (children.length) { + div.appendChild(children[0]); } + articleContent.appendChild(div); } if (this._debug) @@ -955,7 +1094,7 @@ Readability.prototype = { // grabArticle with different flags set. This gives us a higher likelihood of // finding the content, and the sieve approach gives us a higher likelihood of // finding the -right- content. - if (this._getInnerText(articleContent, true).length < 500) { + if (this._getInnerText(articleContent, true).length < this._wordThreshold) { page.innerHTML = pageCacheHtml; if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { @@ -968,6 +1107,18 @@ Readability.prototype = { return null; } } else { + // Find out text direction from ancestors of final top candidate. + var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); + this._someNode(ancestors, function(ancestor) { + if (!ancestor.tagName) + return false; + var articleDir = ancestor.getAttribute("dir"); + if (articleDir) { + this._articleDir = articleDir; + return true; + } + return false; + }); return articleContent; } } @@ -1044,12 +1195,15 @@ Readability.prototype = { metadata.excerpt = values["twitter:description"]; } - if ("og:title" in values) { - // Use facebook open graph title. - metadata.title = values["og:title"]; - } else if ("twitter:title" in values) { - // Use twitter cards title. - metadata.title = values["twitter:title"]; + metadata.title = this._getArticleTitle(); + if (!metadata.title) { + if ("og:title" in values) { + // Use facebook open graph title. + metadata.title = values["og:title"]; + } else if ("twitter:title" in values) { + // Use twitter cards title. + metadata.title = values["twitter:title"]; + } } return metadata; @@ -1089,6 +1243,13 @@ Readability.prototype = { }); }, + _isElementWithoutContent: function(node) { + return node.nodeType === Node.ELEMENT_NODE && + node.textContent.trim().length == 0 && + (node.children.length == 0 || + node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); + }, + /** * Determine whether element has any children block level elements. * @@ -1139,26 +1300,25 @@ Readability.prototype = { * @return void **/ _cleanStyles: function(e) { - e = e || this._doc; - if (!e) + if (!e || e.tagName.toLowerCase() === 'svg') return; - var cur = e.firstChild; - // Remove any root styles, if we're able. - if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') - e.removeAttribute('style'); - - // Go until there are no more child nodes - while (cur !== null) { - if (cur.nodeType === cur.ELEMENT_NODE) { - // Remove style attribute(s) : - if (cur.className !== "readability-styled") - cur.removeAttribute("style"); + if (e.className !== 'readability-styled') { + // Remove `style` and deprecated presentational attributes + for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { + e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); + } - this._cleanStyles(cur); + if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { + e.removeAttribute('width'); + e.removeAttribute('height'); } + } - cur = cur.nextSibling; + var cur = e.firstElementChild; + while (cur !== null) { + this._cleanStyles(cur); + cur = cur.nextElementSibling; } }, @@ -1184,368 +1344,6 @@ Readability.prototype = { return linkLength / textLength; }, - /** - * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. - * - * @author Dan Lacy - * @return string the base url - **/ - _findBaseUrl: function() { - var uri = this._uri; - var noUrlParams = uri.path.split("?")[0]; - var urlSlashes = noUrlParams.split("/").reverse(); - var cleanedSegments = []; - var possibleType = ""; - - for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { - var segment = urlSlashes[i]; - - // Split off and save anything that looks like a file type. - if (segment.indexOf(".") !== -1) { - possibleType = segment.split(".")[1]; - - // If the type isn't alpha-only, it's probably not actually a file extension. - if (!possibleType.match(/[^a-zA-Z]/)) - segment = segment.split(".")[0]; - } - - // EW-CMS specific segment replacement. Ugly. - // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html - if (segment.indexOf(',00') !== -1) - segment = segment.replace(',00', ''); - - // If our first or second segment has anything looking like a page number, remove it. - if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) - segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); - - var del = false; - - // If this is purely a number, and it's the first or second segment, - // it's probably a page number. Remove it. - if (i < 2 && segment.match(/^\d{1,2}$/)) - del = true; - - // If this is the first segment and it's just "index", remove it. - if (i === 0 && segment.toLowerCase() === "index") - del = true; - - // If our first or second segment is smaller than 3 characters, - // and the first segment was purely alphas, remove it. - if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) - del = true; - - // If it's not marked for deletion, push it to cleanedSegments. - if (!del) - cleanedSegments.push(segment); - } - - // This is our final, cleaned, base article URL. - return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); - }, - - /** - * Look for any paging links that may occur within the document. - * - * @param body - * @return object (array) - **/ - _findNextPageLink: function(elem) { - var uri = this._uri; - var possiblePages = {}; - var allLinks = elem.getElementsByTagName('a'); - var articleBaseUrl = this._findBaseUrl(); - - // Loop through all links, looking for hints that they may be next-page links. - // Things like having "page" in their textContent, className or id, or being a child - // of a node with a page-y className or id. - // - // Also possible: levenshtein distance? longest common subsequence? - // - // After we do that, assign each page a score, and - for (var i = 0, il = allLinks.length; i < il; i += 1) { - var link = allLinks[i]; - var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); - - // If we've already seen this page, ignore it. - if (linkHref === "" || - linkHref === articleBaseUrl || - linkHref === uri.spec || - linkHref in this._parsedPages) { - continue; - } - - // If it's on a different domain, skip it. - if (uri.host !== linkHref.split(/\/+/g)[1]) - continue; - - var linkText = this._getInnerText(link); - - // If the linkText looks like it's not the next page, skip it. - if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25) - continue; - - // If the leftovers of the URL after removing the base URL don't contain - // any digits, it's certainly not a next page link. - var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); - if (!linkHrefLeftover.match(/\d/)) - continue; - - if (!(linkHref in possiblePages)) { - possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; - } else { - possiblePages[linkHref].linkText += ' | ' + linkText; - } - - var linkObj = possiblePages[linkHref]; - - // If the articleBaseUrl isn't part of this URL, penalize this link. It could - // still be the link, but the odds are lower. - // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html - if (linkHref.indexOf(articleBaseUrl) !== 0) - linkObj.score -= 25; - - var linkData = linkText + ' ' + link.className + ' ' + link.id; - if (linkData.match(this.REGEXPS.nextLink)) - linkObj.score += 50; - - if (linkData.match(/pag(e|ing|inat)/i)) - linkObj.score += 25; - - if (linkData.match(/(first|last)/i)) { - // -65 is enough to negate any bonuses gotten from a > or » in the text, - // If we already matched on "next", last is probably fine. - // If we didn't, then it's bad. Penalize. - if (!linkObj.linkText.match(this.REGEXPS.nextLink)) - linkObj.score -= 65; - } - - if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous)) - linkObj.score -= 50; - - if (linkData.match(this.REGEXPS.prevLink)) - linkObj.score -= 200; - - // If a parentNode contains page or paging or paginat - var parentNode = link.parentNode; - var positiveNodeMatch = false; - var negativeNodeMatch = false; - - while (parentNode) { - var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; - - if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { - positiveNodeMatch = true; - linkObj.score += 25; - } - - if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) { - // If this is just something like "footer", give it a negative. - // If it's something like "body-and-footer", leave it be. - if (!parentNodeClassAndId.match(this.REGEXPS.positive)) { - linkObj.score -= 25; - negativeNodeMatch = true; - } - } - - parentNode = parentNode.parentNode; - } - - // If the URL looks like it has paging in it, add to the score. - // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 - if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) - linkObj.score += 25; - - // If the URL contains negative values, give a slight decrease. - if (linkHref.match(this.REGEXPS.extraneous)) - linkObj.score -= 15; - - /** - * Minor punishment to anything that doesn't match our current URL. - * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. - * Dan, can you show me a counterexample where this is necessary? - * if (linkHref.indexOf(window.location.href) !== 0) { - * linkObj.score -= 1; - * } - **/ - - // If the link text can be parsed as a number, give it a minor bonus, with a slight - // bias towards lower numbered pages. This is so that pages that might not have 'next' - // in their text can still get scored, and sorted properly by score. - var linkTextAsNumber = parseInt(linkText, 10); - if (linkTextAsNumber) { - // Punish 1 since we're either already there, or it's probably - // before what we want anyways. - if (linkTextAsNumber === 1) { - linkObj.score -= 10; - } else { - linkObj.score += Math.max(0, 10 - linkTextAsNumber); - } - } - } - - // Loop thrugh all of our possible pages from above and find our top - // candidate for the next page URL. Require at least a score of 50, which - // is a relatively high confidence that this page is the next link. - var topPage = null; - for (var page in possiblePages) { - if (possiblePages.hasOwnProperty(page)) { - if (possiblePages[page].score >= 50 && - (!topPage || topPage.score < possiblePages[page].score)) - topPage = possiblePages[page]; - } - } - - var nextHref = null; - if (topPage) { - nextHref = topPage.href.replace(/\/$/, ''); - - this.log('NEXT PAGE IS ' + nextHref); - this._parsedPages[nextHref] = true; - } - return nextHref; - }, - - _successfulRequest: function(request) { - return (request.status >= 200 && request.status < 300) || - request.status === 304 || - (request.status === 0 && request.responseText); - }, - - _ajax: function(url, options) { - var request = new XMLHttpRequest(); - - function respondToReadyState(readyState) { - if (request.readyState === 4) { - if (this._successfulRequest(request)) { - if (options.success) - options.success(request); - } else if (options.error) { - options.error(request); - } - } - } - - if (typeof options === 'undefined') - options = {}; - - request.onreadystatechange = respondToReadyState; - - request.open('get', url, true); - request.setRequestHeader('Accept', 'text/html'); - - try { - request.send(options.postBody); - } catch (e) { - if (options.error) - options.error(); - } - - return request; - }, - - _appendNextPage: function(nextPageLink) { - var doc = this._doc; - this._curPageNum += 1; - - var articlePage = doc.createElement("DIV"); - articlePage.id = 'readability-page-' + this._curPageNum; - articlePage.className = 'page'; - articlePage.innerHTML = '

§

'; - - doc.getElementById("readability-content").appendChild(articlePage); - - if (this._curPageNum > this._maxPages) { - var nextPageMarkup = "
View Next Page
"; - articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; - return; - } - - // Now that we've built the article page DOM element, get the page content - // asynchronously and load the cleaned content into the div we created for it. - (function(pageUrl, thisPage) { - this._ajax(pageUrl, { - success: function(r) { - - // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. - var eTag = r.getResponseHeader('ETag'); - if (eTag) { - if (eTag in this._pageETags) { - this.log("Exact duplicate page found via ETag. Aborting."); - articlePage.style.display = 'none'; - return; - } - this._pageETags[eTag] = 1; - } - - // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. - var page = doc.createElement("DIV"); - - // Do some preprocessing to our HTML to make it ready for appending. - // - Remove any script tags. Swap and reswap newlines with a unicode - // character because multiline regex doesn't work in javascript. - // - Turn any noscript tags into divs so that we can parse them. This - // allows us to find any next page links hidden via javascript. - // - Turn all double br's into p's - was handled by prepDocument in the original view. - // Maybe in the future abstract out prepDocument to work for both the original document - // and AJAX-added pages. - var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/.*?<\/script>/gi, ''); - responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/.*?<\/script>/gi, ''); - responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div'); - responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>'); - - page.innerHTML = responseHtml; - this._replaceBrs(page); - - // Reset all flags for the next page, as they will search through it and - // disable as necessary at the end of grabArticle. - this._flags = 0x1 | 0x2 | 0x4; - - var secondNextPageLink = this._findNextPageLink(page); - - // NOTE: if we end up supporting _appendNextPage(), we'll need to - // change this call to be async - var content = this._grabArticle(page); - - if (!content) { - this.log("No content found in page to append. Aborting."); - return; - } - - // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. - // Compare it against all of the the previous document's we've gotten. If the previous - // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. - var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; - if (firstP && firstP.innerHTML.length > 100) { - for (var i = 1; i <= this._curPageNum; i += 1) { - var rPage = doc.getElementById('readability-page-' + i); - if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { - this.log('Duplicate of page ' + i + ' - skipping.'); - articlePage.style.display = 'none'; - this._parsedPages[pageUrl] = true; - return; - } - } - } - - this._removeScripts(content); - - thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; - - // After the page has rendered, post process the content. This delay is necessary because, - // in webkit at least, offsetWidth is not set in time to determine image width. We have to - // wait a little bit for reflow to finish before we can fix floating images. - setTimeout((function() { - this._postProcessContent(thisPage); - }).bind(this), 500); - - - if (secondNextPageLink) - this._appendNextPage(secondNextPageLink); - } - }); - }).bind(this)(nextPageLink, articlePage); - }, - /** * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. @@ -1617,16 +1415,17 @@ Readability.prototype = { * @param HTMLElement node * @param String tagName * @param Number maxDepth + * @param Function filterFn a filter to invoke to determine whether this node 'counts' * @return Boolean */ - _hasAncestorTag: function(node, tagName, maxDepth) { + _hasAncestorTag: function(node, tagName, maxDepth, filterFn) { maxDepth = maxDepth || 3; tagName = tagName.toUpperCase(); var depth = 0; while (node.parentNode) { - if (depth > maxDepth) + if (maxDepth > 0 && depth > maxDepth) return false; - if (node.parentNode.tagName === tagName) + if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode))) return true; node = node.parentNode; depth++; @@ -1634,6 +1433,93 @@ Readability.prototype = { return false; }, + /** + * Return an object indicating how many rows and columns this table has. + */ + _getRowAndColumnCount: function(table) { + var rows = 0; + var columns = 0; + var trs = table.getElementsByTagName("tr"); + for (var i = 0; i < trs.length; i++) { + var rowspan = trs[i].getAttribute("rowspan") || 0; + if (rowspan) { + rowspan = parseInt(rowspan, 10); + } + rows += (rowspan || 1); + + // Now look for column-related info + var columnsInThisRow = 0; + var cells = trs[i].getElementsByTagName("td"); + for (var j = 0; j < cells.length; j++) { + var colspan = cells[j].getAttribute("colspan") || 0; + if (colspan) { + colspan = parseInt(colspan, 10); + } + columnsInThisRow += (colspan || 1); + } + columns = Math.max(columns, columnsInThisRow); + } + return {rows: rows, columns: columns}; + }, + + /** + * Look for 'data' (as opposed to 'layout') tables, for which we use + * similar checks as + * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920 + */ + _markDataTables: function(root) { + var tables = root.getElementsByTagName("table"); + for (var i = 0; i < tables.length; i++) { + var table = tables[i]; + var role = table.getAttribute("role"); + if (role == "presentation") { + table._readabilityDataTable = false; + continue; + } + var datatable = table.getAttribute("datatable"); + if (datatable == "0") { + table._readabilityDataTable = false; + continue; + } + var summary = table.getAttribute("summary"); + if (summary) { + table._readabilityDataTable = true; + continue; + } + + var caption = table.getElementsByTagName("caption")[0]; + if (caption && caption.childNodes.length > 0) { + table._readabilityDataTable = true; + continue; + } + + // If the table has a descendant with any of these tags, consider a data table: + var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"]; + var descendantExists = function(tag) { + return !!table.getElementsByTagName(tag)[0]; + }; + if (dataTableDescendants.some(descendantExists)) { + this.log("Data table because found data-y descendant"); + table._readabilityDataTable = true; + continue; + } + + // Nested tables indicate a layout table: + if (table.getElementsByTagName("table")[0]) { + table._readabilityDataTable = false; + continue; + } + + var sizeInfo = this._getRowAndColumnCount(table); + if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) { + table._readabilityDataTable = true; + continue; + } + // Now just go by size entirely: + table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10; + } + }, + /** * Clean an element of all tags of type "tag" if they look fishy. * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. @@ -1652,6 +1538,15 @@ Readability.prototype = { // // TODO: Consider taking into account original contentScore here. this._removeNodes(e.getElementsByTagName(tag), function(node) { + // First check if we're in a data table, in which case don't remove us. + var isDataTable = function(t) { + return t._readabilityDataTable; + }; + + if (this._hasAncestorTag(node, "table", -1, isDataTable)) { + return false; + } + var weight = this._getClassWeight(node); var contentScore = 0; @@ -1667,7 +1562,7 @@ Readability.prototype = { // ominous signs, remove the element. var p = node.getElementsByTagName("p").length; var img = node.getElementsByTagName("img").length; - var li = node.getElementsByTagName("li").length-100; + var li = node.getElementsByTagName("li").length - 100; var input = node.getElementsByTagName("input").length; var embedCount = 0; @@ -1681,11 +1576,10 @@ Readability.prototype = { var contentLength = this._getInnerText(node).length; var haveToRemove = - // Make an exception for elements with no p's and exactly 1 img. - (img > p && !this._hasAncestorTag(node, "figure")) || + (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || (!isList && li > p) || (input > Math.floor(p/3)) || - (!isList && contentLength < 25 && (img === 0 || img > 2)) || + (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || (!isList && weight < 25 && linkDensity > 0.2) || (weight >= 25 && linkDensity > 0.5) || ((embedCount === 1 && contentLength < 75) || embedCount > 1); @@ -1695,6 +1589,25 @@ Readability.prototype = { }); }, + /** + * Clean out elements whose id/class combinations match specific string. + * + * @param Element + * @param RegExp match id/class combination. + * @return void + **/ + _cleanMatchedNodes: function(e, regex) { + var endOfSearchMarkerNode = this._getNextNode(e, true); + var next = this._getNextNode(e); + while (next && next != endOfSearchMarkerNode) { + if (regex.test(next.className + " " + next.id)) { + next = this._removeAndGetNext(next); + } else { + next = this._getNextNode(next); + } + } + }, + /** * Clean out spurious headers from an Element. Checks things like classnames and link density. * @@ -1713,10 +1626,6 @@ Readability.prototype = { return (this._flags & flag) > 0; }, - _addFlag: function(flag) { - this._flags = this._flags | flag; - }, - _removeFlag: function(flag) { this._flags = this._flags & ~flag; }, @@ -1807,20 +1716,10 @@ Readability.prototype = { // Remove script tags from the document. this._removeScripts(this._doc); - // FIXME: Disabled multi-page article support for now as it - // needs more work on infrastructure. - - // Make sure this document is added to the list of parsed pages first, - // so we don't double up on the first page. - // this._parsedPages[uri.spec.replace(/\/$/, '')] = true; - - // Pull out any possible next page link first. - // var nextPageLink = this._findNextPageLink(doc.body); - this._prepDocument(); var metadata = this._getArticleMetadata(); - var articleTitle = metadata.title || this._getArticleTitle(); + this._articleTitle = metadata.title; var articleContent = this._grabArticle(); if (!articleContent) @@ -1830,14 +1729,6 @@ Readability.prototype = { this._postProcessContent(articleContent); - // if (nextPageLink) { - // // Append any additional pages after a small timeout so that people - // // can start reading without having to wait for this to finish processing. - // setTimeout((function() { - // this._appendNextPage(nextPageLink); - // }).bind(this), 500); - // } - // If we haven't found an excerpt in the article's metadata, use the article's // first paragraph as the excerpt. This is used for displaying a preview of // the article's content. @@ -1851,7 +1742,7 @@ Readability.prototype = { var textContent = articleContent.textContent; return { uri: this._uri, - title: articleTitle, + title: this._articleTitle, byline: metadata.byline || this._articleByline, dir: this._articleDir, content: articleContent.innerHTML, @@ -1861,3 +1752,7 @@ Readability.prototype = { }; } }; + +if (typeof module === "object") { + module.exports = Readability; +} diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index 033a02489..e9eb83154 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -8,15 +8,18 @@ this.EXPORTED_SYMBOLS = ["ReaderMode"]; const { classes: Cc, interfaces: Ci, utils: Cu } = Components; -// Constants for telemetry. -const DOWNLOAD_SUCCESS = 0; -const DOWNLOAD_ERROR_XHR = 1; -const DOWNLOAD_ERROR_NO_DOC = 2; - -const PARSE_SUCCESS = 0; -const PARSE_ERROR_TOO_MANY_ELEMENTS = 1; -const PARSE_ERROR_WORKER = 2; -const PARSE_ERROR_NO_ARTICLE = 3; +// Class names to preserve in the readerized output. We preserve these class +// names so that rules in aboutReader.css can match them. +const CLASSES_TO_PRESERVE = [ + "caption", + "hidden", + "invisble", + "sr-only", + "visually-hidden", + "visuallyhidden", + "wp-caption", + "wp-caption-text", +]; Cu.import("resource://gre/modules/Services.jsm"); Cu.import("resource://gre/modules/XPCOMUtils.jsm"); @@ -24,17 +27,15 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm"); Cu.importGlobalProperties(["XMLHttpRequest"]); XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js"); -XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm"); +XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm"); -XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm"); -XPCOMUtils.defineLazyModuleGetter(this, "TelemetryStopwatch", "resource://gre/modules/TelemetryStopwatch.jsm"); XPCOMUtils.defineLazyGetter(this, "Readability", function() { let scope = {}; scope.dump = this.dump; Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope); - return scope["Readability"]; + return scope.Readability; }); this.ReaderMode = { @@ -61,21 +62,13 @@ this.ReaderMode = { return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); }, - get isOnLowMemoryPlatform() { - let memory = Cc["@mozilla.org/xpcom/memory-service;1"].getService(Ci.nsIMemory); - delete this.isOnLowMemoryPlatform; - return this.isOnLowMemoryPlatform = memory.isLowMemoryPlatform(); - }, - - _getStateForParseOnLoad: function () { + _getStateForParseOnLoad() { let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled"); let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled"); - // For low-memory devices, don't allow reader mode since it takes up a lot of memory. - // See https://bugzilla.mozilla.org/show_bug.cgi?id=792603 for details. - return isForceEnabled || (isEnabled && !this.isOnLowMemoryPlatform); + return isForceEnabled || isEnabled; }, - observe: function(aMessage, aTopic, aData) { + observe(aMessage, aTopic, aData) { switch (aTopic) { case "nsPref:changed": if (aData.startsWith("reader.parse-on-load.")) { @@ -91,7 +84,7 @@ this.ReaderMode = { * Enter the reader mode by going forward one step in history if applicable, * if not, append the about:reader page in the history instead. */ - enterReaderMode: function(docShell, win) { + enterReaderMode(docShell, win) { let url = win.document.location.href; let readerURL = "about:reader?url=" + encodeURIComponent(url); let webNav = docShell.QueryInterface(Ci.nsIWebNavigation); @@ -112,7 +105,7 @@ this.ReaderMode = { * Exit the reader mode by going back one step in history if applicable, * if not, append the original page in the history instead. */ - leaveReaderMode: function(docShell, win) { + leaveReaderMode(docShell, win) { let url = win.document.location.href; let originalURL = this.getOriginalUrl(url); let webNav = docShell.QueryInterface(Ci.nsIWebNavigation); @@ -136,14 +129,14 @@ this.ReaderMode = { * @return The original URL for the article, or null if we did not find * a properly formatted about:reader URL. */ - getOriginalUrl: function(url) { + getOriginalUrl(url) { if (!url.startsWith("about:reader?")) { return null; } let outerHash = ""; try { - let uriObj = Services.io.newURI(url, null, null); + let uriObj = Services.io.newURI(url); url = uriObj.specIgnoringRef; outerHash = uriObj.ref; } catch (ex) { /* ignore, use the raw string */ } @@ -155,27 +148,45 @@ this.ReaderMode = { let originalUrl = searchParams.get("url"); if (outerHash) { try { - let uriObj = Services.io.newURI(originalUrl, null, null); - uriObj = Services.io.newURI('#' + outerHash, null, uriObj); + let uriObj = Services.io.newURI(originalUrl); + uriObj = Services.io.newURI("#" + outerHash, null, uriObj); originalUrl = uriObj.spec; } catch (ex) {} } return originalUrl; }, + getOriginalUrlObjectForDisplay(url) { + let originalUrl = this.getOriginalUrl(url); + if (originalUrl) { + let uriObj; + try { + uriObj = Services.uriFixup.createFixupURI(originalUrl, Services.uriFixup.FIXUP_FLAG_NONE); + } catch (ex) { + return null; + } + try { + return Services.uriFixup.createExposableURI(uriObj); + } catch (ex) { + return null; + } + } + return null; + }, + /** * Decides whether or not a document is reader-able without parsing the whole thing. * * @param doc A document to parse. * @return boolean Whether or not we should show the reader mode button. */ - isProbablyReaderable: function(doc) { + isProbablyReaderable(doc) { // Only care about 'real' HTML documents: if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) { return false; } - let uri = Services.io.newURI(doc.location.href, null, null); + let uri = Services.io.newURI(doc.location.href); if (!this._shouldCheckUri(uri)) { return false; } @@ -187,12 +198,12 @@ this.ReaderMode = { return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils)); }, - isNodeVisible: function(utils, node) { + isNodeVisible(utils, node) { let bounds = utils.getBoundsWithoutFlushing(node); return bounds.height > 0 && bounds.width > 0; }, - getUtilsForWin: function(win) { + getUtilsForWin(win) { return win.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils); }, @@ -204,16 +215,14 @@ this.ReaderMode = { * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ - parseDocument: Task.async(function* (doc) { - let documentURI = Services.io.newURI(doc.documentURI, null, null); - let baseURI = Services.io.newURI(doc.baseURI, null, null); - if (!this._shouldCheckUri(documentURI) || !this._shouldCheckUri(baseURI, true)) { + parseDocument(doc) { + if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } - return yield this._readerParse(baseURI, doc); - }), + return this._readerParse(doc); + }, /** * Downloads and parses a document from a URL. @@ -222,19 +231,28 @@ this.ReaderMode = { * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ - downloadAndParseDocument: Task.async(function* (url) { - let doc = yield this._downloadDocument(url); - let uri = Services.io.newURI(doc.baseURI, null, null); - if (!this._shouldCheckUri(uri, true)) { + async downloadAndParseDocument(url) { + let doc = await this._downloadDocument(url); + if (!doc) { + return null; + } + if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } - return yield this._readerParse(uri, doc); - }), + return await this._readerParse(doc); + }, - _downloadDocument: function (url) { - let histogram = Services.telemetry.getHistogramById("READER_MODE_DOWNLOAD_RESULT"); + _downloadDocument(url) { + try { + if (!this._shouldCheckUri(Services.io.newURI(url))) { + return null; + } + } catch (ex) { + Cu.reportError(new Error(`Couldn't create URI from ${url} to download: ${ex}`)); + return null; + } return new Promise((resolve, reject) => { let xhr = new XMLHttpRequest(); xhr.open("GET", url, true); @@ -243,14 +261,12 @@ this.ReaderMode = { xhr.onload = evt => { if (xhr.status !== 200) { reject("Reader mode XHR failed with status: " + xhr.status); - histogram.add(DOWNLOAD_ERROR_XHR); return; } let doc = xhr.responseXML; if (!doc) { reject("Reader mode XHR didn't return a document"); - histogram.add(DOWNLOAD_ERROR_NO_DOC); return; } @@ -261,7 +277,7 @@ this.ReaderMode = { if (content) { let urlIndex = content.toUpperCase().indexOf("URL="); if (urlIndex > -1) { - let baseURI = Services.io.newURI(url, null, null); + let baseURI = Services.io.newURI(url); let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI); let newURL = newURI.spec; let ssm = Services.scriptSecurityManager; @@ -290,10 +306,10 @@ this.ReaderMode = { // Convert these to real URIs to make sure the escaping (or lack // thereof) is identical: try { - responseURL = Services.io.newURI(responseURL, null, null).specIgnoringRef; + responseURL = Services.io.newURI(responseURL).specIgnoringRef; } catch (ex) { /* Ignore errors - we'll use what we had before */ } try { - givenURL = Services.io.newURI(givenURL, null, null).specIgnoringRef; + givenURL = Services.io.newURI(givenURL).specIgnoringRef; } catch (ex) { /* Ignore errors - we'll use what we had before */ } if (responseURL != givenURL) { @@ -303,7 +319,6 @@ this.ReaderMode = { return; } resolve(doc); - histogram.add(DOWNLOAD_SUCCESS); }; xhr.send(); }); @@ -318,17 +333,17 @@ this.ReaderMode = { * @resolves JS object representing the article, or null if no article is found. * @rejects OS.File.Error */ - getArticleFromCache: Task.async(function* (url) { + async getArticleFromCache(url) { let path = this._toHashedPath(url); try { - let array = yield OS.File.read(path); + let array = await OS.File.read(path); return JSON.parse(new TextDecoder().decode(array)); } catch (e) { if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile) throw e; return null; } - }), + }, /** * Stores an article in the cache. @@ -338,14 +353,14 @@ this.ReaderMode = { * @resolves When the article is stored. * @rejects OS.File.Error */ - storeArticleInCache: Task.async(function* (article) { + async storeArticleInCache(article) { let array = new TextEncoder().encode(JSON.stringify(article)); let path = this._toHashedPath(article.url); - yield this._ensureCacheDir(); + await this._ensureCacheDir(); return OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" }) .then(success => { OS.File.stat(path).then(info => { - return Messaging.sendRequest({ + return EventDispatcher.instance.sendRequest({ type: "Reader:AddedToCache", url: article.url, size: info.size, @@ -353,7 +368,7 @@ this.ReaderMode = { }); }); }); - }), + }, /** * Removes an article from the cache given an article URI. @@ -363,26 +378,29 @@ this.ReaderMode = { * @resolves When the article is removed. * @rejects OS.File.Error */ - removeArticleFromCache: Task.async(function* (url) { + async removeArticleFromCache(url) { let path = this._toHashedPath(url); - yield OS.File.remove(path); - }), + await OS.File.remove(path); + }, - log: function(msg) { + log(msg) { if (this.DEBUG) dump("Reader: " + msg); }, _blockedHosts: [ - "mail.google.com", + "amazon.com", + "basilisk-browser.org", "github.com", + "mail.google.com", + "palemoon.org", "pinterest.com", "reddit.com", "twitter.com", "youtube.com", ], - _shouldCheckUri: function (uri, isBaseUri = false) { + _shouldCheckUri(uri, isBaseUri = false) { if (!(uri.schemeIs("http") || uri.schemeIs("https"))) { this.log("Not parsing URI scheme: " + uri.scheme); return false; @@ -412,59 +430,77 @@ this.ReaderMode = { * Attempts to parse a document into an article. Heavy lifting happens * in readerWorker.js. * - * @param uri The base URI of the article. * @param doc The document to parse. * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ - _readerParse: Task.async(function* (uri, doc) { - let histogram = Services.telemetry.getHistogramById("READER_MODE_PARSE_RESULT"); + async _readerParse(doc) { if (this.parseNodeLimit) { let numTags = doc.getElementsByTagName("*").length; if (numTags > this.parseNodeLimit) { - this.log("Aborting parse for " + uri.spec + "; " + numTags + " elements found"); - histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS); + this.log("Aborting parse for " + doc.baseURIObject.spec + "; " + numTags + " elements found"); return null; } } + // Fetch this here before we send `doc` off to the worker thread, as later on the + // document might be nuked but we will still want the URI. + let {documentURI} = doc; + let uriParam = { - spec: uri.spec, - host: uri.host, - prePath: uri.prePath, - scheme: uri.scheme, - pathBase: Services.io.newURI(".", null, uri).spec + spec: doc.baseURIObject.spec, + host: doc.baseURIObject.host, + prePath: doc.baseURIObject.prePath, + scheme: doc.baseURIObject.scheme, + pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec + }; + + let langAttributes = { + charset: doc.characterSet, + lang: doc.documentElement.lang }; let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"]. createInstance(Ci.nsIDOMSerializer); let serializedDoc = serializer.serializeToString(doc); + let options = { + classesToPreserve: CLASSES_TO_PRESERVE, + }; + let article = null; try { - article = yield ReaderWorker.post("parseDocument", [uriParam, serializedDoc]); + article = await ReaderWorker.post("parseDocument", [uriParam, serializedDoc, options]); } catch (e) { Cu.reportError("Error in ReaderWorker: " + e); - histogram.add(PARSE_ERROR_WORKER); } + // Explicitly null out doc to make it clear it might not be available from this + // point on. + doc = null; + if (!article) { this.log("Worker did not return an article"); - histogram.add(PARSE_ERROR_NO_ARTICLE); return null; } - // Readability returns a URI object, but we only care about the URL. - article.url = article.uri.spec; + // Readability returns a URI object based on the baseURI, but we only care + // about the original document's URL from now on. This also avoids spoofing + // attempts where the baseURI doesn't match the domain of the documentURI + article.url = documentURI; delete article.uri; let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks; article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils) .convertToPlainText(article.title, flags, 0); - histogram.add(PARSE_SUCCESS); + await this._assignLanguage(article, langAttributes); + this._maybeAssignTextDirection(article); + + this._assignReadTime(article); + return article; - }), + }, get _cryptoHash() { delete this._cryptoHash; @@ -485,7 +521,7 @@ this.ReaderMode = { * @param url The article URL. This should have referrers removed. * @return The file path to the cached article. */ - _toHashedPath: function (url) { + _toHashedPath(url) { let value = this._unicodeConverter.convertToByteArray(url); this._cryptoHash.init(this._cryptoHash.MD5); this._cryptoHash.update(value, value.length); @@ -502,7 +538,7 @@ this.ReaderMode = { * @resolves When the cache directory exists. * @rejects OS.File.Error */ - _ensureCacheDir: function () { + _ensureCacheDir() { let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache"); return OS.File.exists(dir).then(exists => { if (!exists) { @@ -510,5 +546,107 @@ this.ReaderMode = { } return undefined; }); - } + }, + + /** + * Sets a global language string value if possible. If langauge detection is + * available, use that. Otherwise, revert to a simpler mechanism using the + * document's lang attribute or charset. + * + * @return Promise + * @resolves when the language is detected + */ + _assignLanguage(article, attributes) { + try { + Cu.import("resource://modules/translation/LanguageDetector.jsm"); + return LanguageDetector.detectLanguage(article.textContent).then(result => { + article.language = result.confident ? result.language : null; + }); + } catch(ex) { + return new Promise((resolve) => { + resolve(this._assignSimpleLanguage(attributes)); + }).then(result => { + article.language = result; + }); + } + }, + + _assignSimpleLanguage(attributes) { + var lang = attributes.lang.substring(0,2); + if (lang) { + return lang; + } + + // If there is no lang attribute, try the charset. + // We can only use this for charsets that are specific to one language. + const charsetLang = new Map([ + [ "us-ascii", "en" ], + [ "iso-8859-6", "ar" ], + [ "iso-8859-7", "el" ], + [ "iso-8859-8", "he" ], + [ "iso-8859-9", "tr" ], + [ "iso-8859-11", "th" ], + [ "jis_x0201", "ja" ], + [ "shift_jis", "ja" ], + [ "euc-jp", "ja" ] + ]); + + return charsetLang.get(attributes.charset); + }, + + _maybeAssignTextDirection(article) { + // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved. + if (!article.dir && ["ar", "fa", "he", "ug", "ur"].includes(article.language)) { + article.dir = "rtl"; + } + }, + + /** + * Assigns the estimated reading time range of the article to the article object. + * + * @param article the article object to assign the reading time estimate to. + */ + _assignReadTime(article) { + let lang = article.language || "en"; + const readingSpeed = this._getReadingSpeedForLanguage(lang); + const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance; + const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance; + const length = article.length; + + article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow); + article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh); + }, + + /** + * Returns the reading speed of a selection of languages with likely variance. + * + * Reading speed estimated from a study done on reading speeds in various languages. + * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061 + * + * @return object with characters per minute and variance. Defaults to English + * if no suitable language is found in the collection. + */ + _getReadingSpeedForLanguage(lang) { + const readingSpeed = new Map([ + [ "en", {cpm: 987, variance: 118 } ], + [ "ar", {cpm: 612, variance: 88 } ], + [ "de", {cpm: 920, variance: 86 } ], + [ "es", {cpm: 1025, variance: 127 } ], + [ "fi", {cpm: 1078, variance: 121 } ], + [ "fr", {cpm: 998, variance: 126 } ], + [ "he", {cpm: 833, variance: 130 } ], + [ "it", {cpm: 950, variance: 140 } ], + [ "jw", {cpm: 357, variance: 56 } ], + [ "nl", {cpm: 978, variance: 143 } ], + [ "pl", {cpm: 916, variance: 126 } ], + [ "pt", {cpm: 913, variance: 145 } ], + [ "ru", {cpm: 986, variance: 175 } ], + [ "sk", {cpm: 885, variance: 145 } ], + [ "sv", {cpm: 917, variance: 156 } ], + [ "tr", {cpm: 1054, variance: 156 } ], + [ "zh", {cpm: 255, variance: 29 } ], + ]); + + return readingSpeed.get(lang) || readingSpeed.get("en"); + }, }; diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js index 20023d4e0..9ae589d7d 100644 --- a/toolkit/components/reader/ReaderWorker.js +++ b/toolkit/components/reader/ReaderWorker.js @@ -2,6 +2,8 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */ +/* eslint-env mozilla/chrome-worker */ + "use strict"; /** @@ -40,11 +42,12 @@ var Agent = { * * @param {object} uri URI data for the document. * @param {string} serializedDoc The serialized document. + * @param {object} options Options object to pass to Readability. * * @return {object} Article object returned from Readability. */ - parseDocument: function (uri, serializedDoc) { + parseDocument(uri, serializedDoc, options) { let doc = new JSDOMParser().parse(serializedDoc); - return new Readability(uri, doc).parse(); + return new Readability(uri, doc, options).parse(); }, }; diff --git a/toolkit/components/reader/content/aboutReader.html b/toolkit/components/reader/content/aboutReader.html index b9c1139f6..1aa644474 100644 --- a/toolkit/components/reader/content/aboutReader.html +++ b/toolkit/components/reader/content/aboutReader.html @@ -7,63 +7,56 @@ - + -
-
- - +
+
+
-

-
+

+
+
+
+
- -
+
- -
+
-
    - -
  • -