From b70d884598e1e14b99190e1e5c349553ff59849b Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Mon, 7 May 2018 19:45:46 -0400 Subject: Initial updates for Reader View. --- toolkit/components/reader/ReaderMode.jsm | 312 ++++++++++++++++++++++--------- 1 file changed, 225 insertions(+), 87 deletions(-) (limited to 'toolkit/components/reader/ReaderMode.jsm') diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index 033a02489..e9eb83154 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -8,15 +8,18 @@ this.EXPORTED_SYMBOLS = ["ReaderMode"]; const { classes: Cc, interfaces: Ci, utils: Cu } = Components; -// Constants for telemetry. -const DOWNLOAD_SUCCESS = 0; -const DOWNLOAD_ERROR_XHR = 1; -const DOWNLOAD_ERROR_NO_DOC = 2; - -const PARSE_SUCCESS = 0; -const PARSE_ERROR_TOO_MANY_ELEMENTS = 1; -const PARSE_ERROR_WORKER = 2; -const PARSE_ERROR_NO_ARTICLE = 3; +// Class names to preserve in the readerized output. We preserve these class +// names so that rules in aboutReader.css can match them. +const CLASSES_TO_PRESERVE = [ + "caption", + "hidden", + "invisble", + "sr-only", + "visually-hidden", + "visuallyhidden", + "wp-caption", + "wp-caption-text", +]; Cu.import("resource://gre/modules/Services.jsm"); Cu.import("resource://gre/modules/XPCOMUtils.jsm"); @@ -24,17 +27,15 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm"); Cu.importGlobalProperties(["XMLHttpRequest"]); XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js"); -XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm"); +XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm"); -XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm"); -XPCOMUtils.defineLazyModuleGetter(this, "TelemetryStopwatch", "resource://gre/modules/TelemetryStopwatch.jsm"); XPCOMUtils.defineLazyGetter(this, "Readability", function() { let scope = {}; scope.dump = this.dump; Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope); - return scope["Readability"]; + return scope.Readability; }); this.ReaderMode = { @@ -61,21 +62,13 @@ this.ReaderMode = { return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); }, - get isOnLowMemoryPlatform() { - let memory = Cc["@mozilla.org/xpcom/memory-service;1"].getService(Ci.nsIMemory); - delete this.isOnLowMemoryPlatform; - return this.isOnLowMemoryPlatform = memory.isLowMemoryPlatform(); - }, - - _getStateForParseOnLoad: function () { + _getStateForParseOnLoad() { let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled"); let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled"); - // For low-memory devices, don't allow reader mode since it takes up a lot of memory. - // See https://bugzilla.mozilla.org/show_bug.cgi?id=792603 for details. - return isForceEnabled || (isEnabled && !this.isOnLowMemoryPlatform); + return isForceEnabled || isEnabled; }, - observe: function(aMessage, aTopic, aData) { + observe(aMessage, aTopic, aData) { switch (aTopic) { case "nsPref:changed": if (aData.startsWith("reader.parse-on-load.")) { @@ -91,7 +84,7 @@ this.ReaderMode = { * Enter the reader mode by going forward one step in history if applicable, * if not, append the about:reader page in the history instead. */ - enterReaderMode: function(docShell, win) { + enterReaderMode(docShell, win) { let url = win.document.location.href; let readerURL = "about:reader?url=" + encodeURIComponent(url); let webNav = docShell.QueryInterface(Ci.nsIWebNavigation); @@ -112,7 +105,7 @@ this.ReaderMode = { * Exit the reader mode by going back one step in history if applicable, * if not, append the original page in the history instead. */ - leaveReaderMode: function(docShell, win) { + leaveReaderMode(docShell, win) { let url = win.document.location.href; let originalURL = this.getOriginalUrl(url); let webNav = docShell.QueryInterface(Ci.nsIWebNavigation); @@ -136,14 +129,14 @@ this.ReaderMode = { * @return The original URL for the article, or null if we did not find * a properly formatted about:reader URL. */ - getOriginalUrl: function(url) { + getOriginalUrl(url) { if (!url.startsWith("about:reader?")) { return null; } let outerHash = ""; try { - let uriObj = Services.io.newURI(url, null, null); + let uriObj = Services.io.newURI(url); url = uriObj.specIgnoringRef; outerHash = uriObj.ref; } catch (ex) { /* ignore, use the raw string */ } @@ -155,27 +148,45 @@ this.ReaderMode = { let originalUrl = searchParams.get("url"); if (outerHash) { try { - let uriObj = Services.io.newURI(originalUrl, null, null); - uriObj = Services.io.newURI('#' + outerHash, null, uriObj); + let uriObj = Services.io.newURI(originalUrl); + uriObj = Services.io.newURI("#" + outerHash, null, uriObj); originalUrl = uriObj.spec; } catch (ex) {} } return originalUrl; }, + getOriginalUrlObjectForDisplay(url) { + let originalUrl = this.getOriginalUrl(url); + if (originalUrl) { + let uriObj; + try { + uriObj = Services.uriFixup.createFixupURI(originalUrl, Services.uriFixup.FIXUP_FLAG_NONE); + } catch (ex) { + return null; + } + try { + return Services.uriFixup.createExposableURI(uriObj); + } catch (ex) { + return null; + } + } + return null; + }, + /** * Decides whether or not a document is reader-able without parsing the whole thing. * * @param doc A document to parse. * @return boolean Whether or not we should show the reader mode button. */ - isProbablyReaderable: function(doc) { + isProbablyReaderable(doc) { // Only care about 'real' HTML documents: if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) { return false; } - let uri = Services.io.newURI(doc.location.href, null, null); + let uri = Services.io.newURI(doc.location.href); if (!this._shouldCheckUri(uri)) { return false; } @@ -187,12 +198,12 @@ this.ReaderMode = { return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils)); }, - isNodeVisible: function(utils, node) { + isNodeVisible(utils, node) { let bounds = utils.getBoundsWithoutFlushing(node); return bounds.height > 0 && bounds.width > 0; }, - getUtilsForWin: function(win) { + getUtilsForWin(win) { return win.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils); }, @@ -204,16 +215,14 @@ this.ReaderMode = { * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ - parseDocument: Task.async(function* (doc) { - let documentURI = Services.io.newURI(doc.documentURI, null, null); - let baseURI = Services.io.newURI(doc.baseURI, null, null); - if (!this._shouldCheckUri(documentURI) || !this._shouldCheckUri(baseURI, true)) { + parseDocument(doc) { + if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } - return yield this._readerParse(baseURI, doc); - }), + return this._readerParse(doc); + }, /** * Downloads and parses a document from a URL. @@ -222,19 +231,28 @@ this.ReaderMode = { * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ - downloadAndParseDocument: Task.async(function* (url) { - let doc = yield this._downloadDocument(url); - let uri = Services.io.newURI(doc.baseURI, null, null); - if (!this._shouldCheckUri(uri, true)) { + async downloadAndParseDocument(url) { + let doc = await this._downloadDocument(url); + if (!doc) { + return null; + } + if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } - return yield this._readerParse(uri, doc); - }), + return await this._readerParse(doc); + }, - _downloadDocument: function (url) { - let histogram = Services.telemetry.getHistogramById("READER_MODE_DOWNLOAD_RESULT"); + _downloadDocument(url) { + try { + if (!this._shouldCheckUri(Services.io.newURI(url))) { + return null; + } + } catch (ex) { + Cu.reportError(new Error(`Couldn't create URI from ${url} to download: ${ex}`)); + return null; + } return new Promise((resolve, reject) => { let xhr = new XMLHttpRequest(); xhr.open("GET", url, true); @@ -243,14 +261,12 @@ this.ReaderMode = { xhr.onload = evt => { if (xhr.status !== 200) { reject("Reader mode XHR failed with status: " + xhr.status); - histogram.add(DOWNLOAD_ERROR_XHR); return; } let doc = xhr.responseXML; if (!doc) { reject("Reader mode XHR didn't return a document"); - histogram.add(DOWNLOAD_ERROR_NO_DOC); return; } @@ -261,7 +277,7 @@ this.ReaderMode = { if (content) { let urlIndex = content.toUpperCase().indexOf("URL="); if (urlIndex > -1) { - let baseURI = Services.io.newURI(url, null, null); + let baseURI = Services.io.newURI(url); let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI); let newURL = newURI.spec; let ssm = Services.scriptSecurityManager; @@ -290,10 +306,10 @@ this.ReaderMode = { // Convert these to real URIs to make sure the escaping (or lack // thereof) is identical: try { - responseURL = Services.io.newURI(responseURL, null, null).specIgnoringRef; + responseURL = Services.io.newURI(responseURL).specIgnoringRef; } catch (ex) { /* Ignore errors - we'll use what we had before */ } try { - givenURL = Services.io.newURI(givenURL, null, null).specIgnoringRef; + givenURL = Services.io.newURI(givenURL).specIgnoringRef; } catch (ex) { /* Ignore errors - we'll use what we had before */ } if (responseURL != givenURL) { @@ -303,7 +319,6 @@ this.ReaderMode = { return; } resolve(doc); - histogram.add(DOWNLOAD_SUCCESS); }; xhr.send(); }); @@ -318,17 +333,17 @@ this.ReaderMode = { * @resolves JS object representing the article, or null if no article is found. * @rejects OS.File.Error */ - getArticleFromCache: Task.async(function* (url) { + async getArticleFromCache(url) { let path = this._toHashedPath(url); try { - let array = yield OS.File.read(path); + let array = await OS.File.read(path); return JSON.parse(new TextDecoder().decode(array)); } catch (e) { if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile) throw e; return null; } - }), + }, /** * Stores an article in the cache. @@ -338,14 +353,14 @@ this.ReaderMode = { * @resolves When the article is stored. * @rejects OS.File.Error */ - storeArticleInCache: Task.async(function* (article) { + async storeArticleInCache(article) { let array = new TextEncoder().encode(JSON.stringify(article)); let path = this._toHashedPath(article.url); - yield this._ensureCacheDir(); + await this._ensureCacheDir(); return OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" }) .then(success => { OS.File.stat(path).then(info => { - return Messaging.sendRequest({ + return EventDispatcher.instance.sendRequest({ type: "Reader:AddedToCache", url: article.url, size: info.size, @@ -353,7 +368,7 @@ this.ReaderMode = { }); }); }); - }), + }, /** * Removes an article from the cache given an article URI. @@ -363,26 +378,29 @@ this.ReaderMode = { * @resolves When the article is removed. * @rejects OS.File.Error */ - removeArticleFromCache: Task.async(function* (url) { + async removeArticleFromCache(url) { let path = this._toHashedPath(url); - yield OS.File.remove(path); - }), + await OS.File.remove(path); + }, - log: function(msg) { + log(msg) { if (this.DEBUG) dump("Reader: " + msg); }, _blockedHosts: [ - "mail.google.com", + "amazon.com", + "basilisk-browser.org", "github.com", + "mail.google.com", + "palemoon.org", "pinterest.com", "reddit.com", "twitter.com", "youtube.com", ], - _shouldCheckUri: function (uri, isBaseUri = false) { + _shouldCheckUri(uri, isBaseUri = false) { if (!(uri.schemeIs("http") || uri.schemeIs("https"))) { this.log("Not parsing URI scheme: " + uri.scheme); return false; @@ -412,59 +430,77 @@ this.ReaderMode = { * Attempts to parse a document into an article. Heavy lifting happens * in readerWorker.js. * - * @param uri The base URI of the article. * @param doc The document to parse. * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ - _readerParse: Task.async(function* (uri, doc) { - let histogram = Services.telemetry.getHistogramById("READER_MODE_PARSE_RESULT"); + async _readerParse(doc) { if (this.parseNodeLimit) { let numTags = doc.getElementsByTagName("*").length; if (numTags > this.parseNodeLimit) { - this.log("Aborting parse for " + uri.spec + "; " + numTags + " elements found"); - histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS); + this.log("Aborting parse for " + doc.baseURIObject.spec + "; " + numTags + " elements found"); return null; } } + // Fetch this here before we send `doc` off to the worker thread, as later on the + // document might be nuked but we will still want the URI. + let {documentURI} = doc; + let uriParam = { - spec: uri.spec, - host: uri.host, - prePath: uri.prePath, - scheme: uri.scheme, - pathBase: Services.io.newURI(".", null, uri).spec + spec: doc.baseURIObject.spec, + host: doc.baseURIObject.host, + prePath: doc.baseURIObject.prePath, + scheme: doc.baseURIObject.scheme, + pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec + }; + + let langAttributes = { + charset: doc.characterSet, + lang: doc.documentElement.lang }; let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"]. createInstance(Ci.nsIDOMSerializer); let serializedDoc = serializer.serializeToString(doc); + let options = { + classesToPreserve: CLASSES_TO_PRESERVE, + }; + let article = null; try { - article = yield ReaderWorker.post("parseDocument", [uriParam, serializedDoc]); + article = await ReaderWorker.post("parseDocument", [uriParam, serializedDoc, options]); } catch (e) { Cu.reportError("Error in ReaderWorker: " + e); - histogram.add(PARSE_ERROR_WORKER); } + // Explicitly null out doc to make it clear it might not be available from this + // point on. + doc = null; + if (!article) { this.log("Worker did not return an article"); - histogram.add(PARSE_ERROR_NO_ARTICLE); return null; } - // Readability returns a URI object, but we only care about the URL. - article.url = article.uri.spec; + // Readability returns a URI object based on the baseURI, but we only care + // about the original document's URL from now on. This also avoids spoofing + // attempts where the baseURI doesn't match the domain of the documentURI + article.url = documentURI; delete article.uri; let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks; article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils) .convertToPlainText(article.title, flags, 0); - histogram.add(PARSE_SUCCESS); + await this._assignLanguage(article, langAttributes); + this._maybeAssignTextDirection(article); + + this._assignReadTime(article); + return article; - }), + }, get _cryptoHash() { delete this._cryptoHash; @@ -485,7 +521,7 @@ this.ReaderMode = { * @param url The article URL. This should have referrers removed. * @return The file path to the cached article. */ - _toHashedPath: function (url) { + _toHashedPath(url) { let value = this._unicodeConverter.convertToByteArray(url); this._cryptoHash.init(this._cryptoHash.MD5); this._cryptoHash.update(value, value.length); @@ -502,7 +538,7 @@ this.ReaderMode = { * @resolves When the cache directory exists. * @rejects OS.File.Error */ - _ensureCacheDir: function () { + _ensureCacheDir() { let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache"); return OS.File.exists(dir).then(exists => { if (!exists) { @@ -510,5 +546,107 @@ this.ReaderMode = { } return undefined; }); - } + }, + + /** + * Sets a global language string value if possible. If langauge detection is + * available, use that. Otherwise, revert to a simpler mechanism using the + * document's lang attribute or charset. + * + * @return Promise + * @resolves when the language is detected + */ + _assignLanguage(article, attributes) { + try { + Cu.import("resource://modules/translation/LanguageDetector.jsm"); + return LanguageDetector.detectLanguage(article.textContent).then(result => { + article.language = result.confident ? result.language : null; + }); + } catch(ex) { + return new Promise((resolve) => { + resolve(this._assignSimpleLanguage(attributes)); + }).then(result => { + article.language = result; + }); + } + }, + + _assignSimpleLanguage(attributes) { + var lang = attributes.lang.substring(0,2); + if (lang) { + return lang; + } + + // If there is no lang attribute, try the charset. + // We can only use this for charsets that are specific to one language. + const charsetLang = new Map([ + [ "us-ascii", "en" ], + [ "iso-8859-6", "ar" ], + [ "iso-8859-7", "el" ], + [ "iso-8859-8", "he" ], + [ "iso-8859-9", "tr" ], + [ "iso-8859-11", "th" ], + [ "jis_x0201", "ja" ], + [ "shift_jis", "ja" ], + [ "euc-jp", "ja" ] + ]); + + return charsetLang.get(attributes.charset); + }, + + _maybeAssignTextDirection(article) { + // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved. + if (!article.dir && ["ar", "fa", "he", "ug", "ur"].includes(article.language)) { + article.dir = "rtl"; + } + }, + + /** + * Assigns the estimated reading time range of the article to the article object. + * + * @param article the article object to assign the reading time estimate to. + */ + _assignReadTime(article) { + let lang = article.language || "en"; + const readingSpeed = this._getReadingSpeedForLanguage(lang); + const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance; + const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance; + const length = article.length; + + article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow); + article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh); + }, + + /** + * Returns the reading speed of a selection of languages with likely variance. + * + * Reading speed estimated from a study done on reading speeds in various languages. + * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061 + * + * @return object with characters per minute and variance. Defaults to English + * if no suitable language is found in the collection. + */ + _getReadingSpeedForLanguage(lang) { + const readingSpeed = new Map([ + [ "en", {cpm: 987, variance: 118 } ], + [ "ar", {cpm: 612, variance: 88 } ], + [ "de", {cpm: 920, variance: 86 } ], + [ "es", {cpm: 1025, variance: 127 } ], + [ "fi", {cpm: 1078, variance: 121 } ], + [ "fr", {cpm: 998, variance: 126 } ], + [ "he", {cpm: 833, variance: 130 } ], + [ "it", {cpm: 950, variance: 140 } ], + [ "jw", {cpm: 357, variance: 56 } ], + [ "nl", {cpm: 978, variance: 143 } ], + [ "pl", {cpm: 916, variance: 126 } ], + [ "pt", {cpm: 913, variance: 145 } ], + [ "ru", {cpm: 986, variance: 175 } ], + [ "sk", {cpm: 885, variance: 145 } ], + [ "sv", {cpm: 917, variance: 156 } ], + [ "tr", {cpm: 1054, variance: 156 } ], + [ "zh", {cpm: 255, variance: 29 } ], + ]); + + return readingSpeed.get(lang) || readingSpeed.get("en"); + }, }; -- cgit v1.2.3