summaryrefslogtreecommitdiffstats
path: root/toolkit/components/reader/ReaderMode.jsm
diff options
context:
space:
mode:
Diffstat (limited to 'toolkit/components/reader/ReaderMode.jsm')
-rw-r--r--toolkit/components/reader/ReaderMode.jsm312
1 files changed, 225 insertions, 87 deletions
diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm
index 033a02489..e9eb83154 100644
--- a/toolkit/components/reader/ReaderMode.jsm
+++ b/toolkit/components/reader/ReaderMode.jsm
@@ -8,15 +8,18 @@ this.EXPORTED_SYMBOLS = ["ReaderMode"];
const { classes: Cc, interfaces: Ci, utils: Cu } = Components;
-// Constants for telemetry.
-const DOWNLOAD_SUCCESS = 0;
-const DOWNLOAD_ERROR_XHR = 1;
-const DOWNLOAD_ERROR_NO_DOC = 2;
-
-const PARSE_SUCCESS = 0;
-const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
-const PARSE_ERROR_WORKER = 2;
-const PARSE_ERROR_NO_ARTICLE = 3;
+// Class names to preserve in the readerized output. We preserve these class
+// names so that rules in aboutReader.css can match them.
+const CLASSES_TO_PRESERVE = [
+ "caption",
+ "hidden",
+ "invisble",
+ "sr-only",
+ "visually-hidden",
+ "visuallyhidden",
+ "wp-caption",
+ "wp-caption-text",
+];
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/XPCOMUtils.jsm");
@@ -24,17 +27,15 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm");
Cu.importGlobalProperties(["XMLHttpRequest"]);
XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js");
-XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "TelemetryStopwatch", "resource://gre/modules/TelemetryStopwatch.jsm");
XPCOMUtils.defineLazyGetter(this, "Readability", function() {
let scope = {};
scope.dump = this.dump;
Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope);
- return scope["Readability"];
+ return scope.Readability;
});
this.ReaderMode = {
@@ -61,21 +62,13 @@ this.ReaderMode = {
return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad();
},
- get isOnLowMemoryPlatform() {
- let memory = Cc["@mozilla.org/xpcom/memory-service;1"].getService(Ci.nsIMemory);
- delete this.isOnLowMemoryPlatform;
- return this.isOnLowMemoryPlatform = memory.isLowMemoryPlatform();
- },
-
- _getStateForParseOnLoad: function () {
+ _getStateForParseOnLoad() {
let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled");
let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled");
- // For low-memory devices, don't allow reader mode since it takes up a lot of memory.
- // See https://bugzilla.mozilla.org/show_bug.cgi?id=792603 for details.
- return isForceEnabled || (isEnabled && !this.isOnLowMemoryPlatform);
+ return isForceEnabled || isEnabled;
},
- observe: function(aMessage, aTopic, aData) {
+ observe(aMessage, aTopic, aData) {
switch (aTopic) {
case "nsPref:changed":
if (aData.startsWith("reader.parse-on-load.")) {
@@ -91,7 +84,7 @@ this.ReaderMode = {
* Enter the reader mode by going forward one step in history if applicable,
* if not, append the about:reader page in the history instead.
*/
- enterReaderMode: function(docShell, win) {
+ enterReaderMode(docShell, win) {
let url = win.document.location.href;
let readerURL = "about:reader?url=" + encodeURIComponent(url);
let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
@@ -112,7 +105,7 @@ this.ReaderMode = {
* Exit the reader mode by going back one step in history if applicable,
* if not, append the original page in the history instead.
*/
- leaveReaderMode: function(docShell, win) {
+ leaveReaderMode(docShell, win) {
let url = win.document.location.href;
let originalURL = this.getOriginalUrl(url);
let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
@@ -136,14 +129,14 @@ this.ReaderMode = {
* @return The original URL for the article, or null if we did not find
* a properly formatted about:reader URL.
*/
- getOriginalUrl: function(url) {
+ getOriginalUrl(url) {
if (!url.startsWith("about:reader?")) {
return null;
}
let outerHash = "";
try {
- let uriObj = Services.io.newURI(url, null, null);
+ let uriObj = Services.io.newURI(url);
url = uriObj.specIgnoringRef;
outerHash = uriObj.ref;
} catch (ex) { /* ignore, use the raw string */ }
@@ -155,27 +148,45 @@ this.ReaderMode = {
let originalUrl = searchParams.get("url");
if (outerHash) {
try {
- let uriObj = Services.io.newURI(originalUrl, null, null);
- uriObj = Services.io.newURI('#' + outerHash, null, uriObj);
+ let uriObj = Services.io.newURI(originalUrl);
+ uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
originalUrl = uriObj.spec;
} catch (ex) {}
}
return originalUrl;
},
+ getOriginalUrlObjectForDisplay(url) {
+ let originalUrl = this.getOriginalUrl(url);
+ if (originalUrl) {
+ let uriObj;
+ try {
+ uriObj = Services.uriFixup.createFixupURI(originalUrl, Services.uriFixup.FIXUP_FLAG_NONE);
+ } catch (ex) {
+ return null;
+ }
+ try {
+ return Services.uriFixup.createExposableURI(uriObj);
+ } catch (ex) {
+ return null;
+ }
+ }
+ return null;
+ },
+
/**
* Decides whether or not a document is reader-able without parsing the whole thing.
*
* @param doc A document to parse.
* @return boolean Whether or not we should show the reader mode button.
*/
- isProbablyReaderable: function(doc) {
+ isProbablyReaderable(doc) {
// Only care about 'real' HTML documents:
if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) {
return false;
}
- let uri = Services.io.newURI(doc.location.href, null, null);
+ let uri = Services.io.newURI(doc.location.href);
if (!this._shouldCheckUri(uri)) {
return false;
}
@@ -187,12 +198,12 @@ this.ReaderMode = {
return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils));
},
- isNodeVisible: function(utils, node) {
+ isNodeVisible(utils, node) {
let bounds = utils.getBoundsWithoutFlushing(node);
return bounds.height > 0 && bounds.width > 0;
},
- getUtilsForWin: function(win) {
+ getUtilsForWin(win) {
return win.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils);
},
@@ -204,16 +215,14 @@ this.ReaderMode = {
* @return {Promise}
* @resolves JS object representing the article, or null if no article is found.
*/
- parseDocument: Task.async(function* (doc) {
- let documentURI = Services.io.newURI(doc.documentURI, null, null);
- let baseURI = Services.io.newURI(doc.baseURI, null, null);
- if (!this._shouldCheckUri(documentURI) || !this._shouldCheckUri(baseURI, true)) {
+ parseDocument(doc) {
+ if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
this.log("Reader mode disabled for URI");
return null;
}
- return yield this._readerParse(baseURI, doc);
- }),
+ return this._readerParse(doc);
+ },
/**
* Downloads and parses a document from a URL.
@@ -222,19 +231,28 @@ this.ReaderMode = {
* @return {Promise}
* @resolves JS object representing the article, or null if no article is found.
*/
- downloadAndParseDocument: Task.async(function* (url) {
- let doc = yield this._downloadDocument(url);
- let uri = Services.io.newURI(doc.baseURI, null, null);
- if (!this._shouldCheckUri(uri, true)) {
+ async downloadAndParseDocument(url) {
+ let doc = await this._downloadDocument(url);
+ if (!doc) {
+ return null;
+ }
+ if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
this.log("Reader mode disabled for URI");
return null;
}
- return yield this._readerParse(uri, doc);
- }),
+ return await this._readerParse(doc);
+ },
- _downloadDocument: function (url) {
- let histogram = Services.telemetry.getHistogramById("READER_MODE_DOWNLOAD_RESULT");
+ _downloadDocument(url) {
+ try {
+ if (!this._shouldCheckUri(Services.io.newURI(url))) {
+ return null;
+ }
+ } catch (ex) {
+ Cu.reportError(new Error(`Couldn't create URI from ${url} to download: ${ex}`));
+ return null;
+ }
return new Promise((resolve, reject) => {
let xhr = new XMLHttpRequest();
xhr.open("GET", url, true);
@@ -243,14 +261,12 @@ this.ReaderMode = {
xhr.onload = evt => {
if (xhr.status !== 200) {
reject("Reader mode XHR failed with status: " + xhr.status);
- histogram.add(DOWNLOAD_ERROR_XHR);
return;
}
let doc = xhr.responseXML;
if (!doc) {
reject("Reader mode XHR didn't return a document");
- histogram.add(DOWNLOAD_ERROR_NO_DOC);
return;
}
@@ -261,7 +277,7 @@ this.ReaderMode = {
if (content) {
let urlIndex = content.toUpperCase().indexOf("URL=");
if (urlIndex > -1) {
- let baseURI = Services.io.newURI(url, null, null);
+ let baseURI = Services.io.newURI(url);
let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI);
let newURL = newURI.spec;
let ssm = Services.scriptSecurityManager;
@@ -290,10 +306,10 @@ this.ReaderMode = {
// Convert these to real URIs to make sure the escaping (or lack
// thereof) is identical:
try {
- responseURL = Services.io.newURI(responseURL, null, null).specIgnoringRef;
+ responseURL = Services.io.newURI(responseURL).specIgnoringRef;
} catch (ex) { /* Ignore errors - we'll use what we had before */ }
try {
- givenURL = Services.io.newURI(givenURL, null, null).specIgnoringRef;
+ givenURL = Services.io.newURI(givenURL).specIgnoringRef;
} catch (ex) { /* Ignore errors - we'll use what we had before */ }
if (responseURL != givenURL) {
@@ -303,7 +319,6 @@ this.ReaderMode = {
return;
}
resolve(doc);
- histogram.add(DOWNLOAD_SUCCESS);
};
xhr.send();
});
@@ -318,17 +333,17 @@ this.ReaderMode = {
* @resolves JS object representing the article, or null if no article is found.
* @rejects OS.File.Error
*/
- getArticleFromCache: Task.async(function* (url) {
+ async getArticleFromCache(url) {
let path = this._toHashedPath(url);
try {
- let array = yield OS.File.read(path);
+ let array = await OS.File.read(path);
return JSON.parse(new TextDecoder().decode(array));
} catch (e) {
if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile)
throw e;
return null;
}
- }),
+ },
/**
* Stores an article in the cache.
@@ -338,14 +353,14 @@ this.ReaderMode = {
* @resolves When the article is stored.
* @rejects OS.File.Error
*/
- storeArticleInCache: Task.async(function* (article) {
+ async storeArticleInCache(article) {
let array = new TextEncoder().encode(JSON.stringify(article));
let path = this._toHashedPath(article.url);
- yield this._ensureCacheDir();
+ await this._ensureCacheDir();
return OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" })
.then(success => {
OS.File.stat(path).then(info => {
- return Messaging.sendRequest({
+ return EventDispatcher.instance.sendRequest({
type: "Reader:AddedToCache",
url: article.url,
size: info.size,
@@ -353,7 +368,7 @@ this.ReaderMode = {
});
});
});
- }),
+ },
/**
* Removes an article from the cache given an article URI.
@@ -363,26 +378,29 @@ this.ReaderMode = {
* @resolves When the article is removed.
* @rejects OS.File.Error
*/
- removeArticleFromCache: Task.async(function* (url) {
+ async removeArticleFromCache(url) {
let path = this._toHashedPath(url);
- yield OS.File.remove(path);
- }),
+ await OS.File.remove(path);
+ },
- log: function(msg) {
+ log(msg) {
if (this.DEBUG)
dump("Reader: " + msg);
},
_blockedHosts: [
- "mail.google.com",
+ "amazon.com",
+ "basilisk-browser.org",
"github.com",
+ "mail.google.com",
+ "palemoon.org",
"pinterest.com",
"reddit.com",
"twitter.com",
"youtube.com",
],
- _shouldCheckUri: function (uri, isBaseUri = false) {
+ _shouldCheckUri(uri, isBaseUri = false) {
if (!(uri.schemeIs("http") || uri.schemeIs("https"))) {
this.log("Not parsing URI scheme: " + uri.scheme);
return false;
@@ -412,59 +430,77 @@ this.ReaderMode = {
* Attempts to parse a document into an article. Heavy lifting happens
* in readerWorker.js.
*
- * @param uri The base URI of the article.
* @param doc The document to parse.
* @return {Promise}
* @resolves JS object representing the article, or null if no article is found.
*/
- _readerParse: Task.async(function* (uri, doc) {
- let histogram = Services.telemetry.getHistogramById("READER_MODE_PARSE_RESULT");
+ async _readerParse(doc) {
if (this.parseNodeLimit) {
let numTags = doc.getElementsByTagName("*").length;
if (numTags > this.parseNodeLimit) {
- this.log("Aborting parse for " + uri.spec + "; " + numTags + " elements found");
- histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
+ this.log("Aborting parse for " + doc.baseURIObject.spec + "; " + numTags + " elements found");
return null;
}
}
+ // Fetch this here before we send `doc` off to the worker thread, as later on the
+ // document might be nuked but we will still want the URI.
+ let {documentURI} = doc;
+
let uriParam = {
- spec: uri.spec,
- host: uri.host,
- prePath: uri.prePath,
- scheme: uri.scheme,
- pathBase: Services.io.newURI(".", null, uri).spec
+ spec: doc.baseURIObject.spec,
+ host: doc.baseURIObject.host,
+ prePath: doc.baseURIObject.prePath,
+ scheme: doc.baseURIObject.scheme,
+ pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec
+ };
+
+ let langAttributes = {
+ charset: doc.characterSet,
+ lang: doc.documentElement.lang
};
let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"].
createInstance(Ci.nsIDOMSerializer);
let serializedDoc = serializer.serializeToString(doc);
+ let options = {
+ classesToPreserve: CLASSES_TO_PRESERVE,
+ };
+
let article = null;
try {
- article = yield ReaderWorker.post("parseDocument", [uriParam, serializedDoc]);
+ article = await ReaderWorker.post("parseDocument", [uriParam, serializedDoc, options]);
} catch (e) {
Cu.reportError("Error in ReaderWorker: " + e);
- histogram.add(PARSE_ERROR_WORKER);
}
+ // Explicitly null out doc to make it clear it might not be available from this
+ // point on.
+ doc = null;
+
if (!article) {
this.log("Worker did not return an article");
- histogram.add(PARSE_ERROR_NO_ARTICLE);
return null;
}
- // Readability returns a URI object, but we only care about the URL.
- article.url = article.uri.spec;
+ // Readability returns a URI object based on the baseURI, but we only care
+ // about the original document's URL from now on. This also avoids spoofing
+ // attempts where the baseURI doesn't match the domain of the documentURI
+ article.url = documentURI;
delete article.uri;
let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils)
.convertToPlainText(article.title, flags, 0);
- histogram.add(PARSE_SUCCESS);
+ await this._assignLanguage(article, langAttributes);
+ this._maybeAssignTextDirection(article);
+
+ this._assignReadTime(article);
+
return article;
- }),
+ },
get _cryptoHash() {
delete this._cryptoHash;
@@ -485,7 +521,7 @@ this.ReaderMode = {
* @param url The article URL. This should have referrers removed.
* @return The file path to the cached article.
*/
- _toHashedPath: function (url) {
+ _toHashedPath(url) {
let value = this._unicodeConverter.convertToByteArray(url);
this._cryptoHash.init(this._cryptoHash.MD5);
this._cryptoHash.update(value, value.length);
@@ -502,7 +538,7 @@ this.ReaderMode = {
* @resolves When the cache directory exists.
* @rejects OS.File.Error
*/
- _ensureCacheDir: function () {
+ _ensureCacheDir() {
let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache");
return OS.File.exists(dir).then(exists => {
if (!exists) {
@@ -510,5 +546,107 @@ this.ReaderMode = {
}
return undefined;
});
- }
+ },
+
+ /**
+ * Sets a global language string value if possible. If langauge detection is
+ * available, use that. Otherwise, revert to a simpler mechanism using the
+ * document's lang attribute or charset.
+ *
+ * @return Promise
+ * @resolves when the language is detected
+ */
+ _assignLanguage(article, attributes) {
+ try {
+ Cu.import("resource://modules/translation/LanguageDetector.jsm");
+ return LanguageDetector.detectLanguage(article.textContent).then(result => {
+ article.language = result.confident ? result.language : null;
+ });
+ } catch(ex) {
+ return new Promise((resolve) => {
+ resolve(this._assignSimpleLanguage(attributes));
+ }).then(result => {
+ article.language = result;
+ });
+ }
+ },
+
+ _assignSimpleLanguage(attributes) {
+ var lang = attributes.lang.substring(0,2);
+ if (lang) {
+ return lang;
+ }
+
+ // If there is no lang attribute, try the charset.
+ // We can only use this for charsets that are specific to one language.
+ const charsetLang = new Map([
+ [ "us-ascii", "en" ],
+ [ "iso-8859-6", "ar" ],
+ [ "iso-8859-7", "el" ],
+ [ "iso-8859-8", "he" ],
+ [ "iso-8859-9", "tr" ],
+ [ "iso-8859-11", "th" ],
+ [ "jis_x0201", "ja" ],
+ [ "shift_jis", "ja" ],
+ [ "euc-jp", "ja" ]
+ ]);
+
+ return charsetLang.get(attributes.charset);
+ },
+
+ _maybeAssignTextDirection(article) {
+ // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
+ if (!article.dir && ["ar", "fa", "he", "ug", "ur"].includes(article.language)) {
+ article.dir = "rtl";
+ }
+ },
+
+ /**
+ * Assigns the estimated reading time range of the article to the article object.
+ *
+ * @param article the article object to assign the reading time estimate to.
+ */
+ _assignReadTime(article) {
+ let lang = article.language || "en";
+ const readingSpeed = this._getReadingSpeedForLanguage(lang);
+ const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance;
+ const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance;
+ const length = article.length;
+
+ article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow);
+ article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh);
+ },
+
+ /**
+ * Returns the reading speed of a selection of languages with likely variance.
+ *
+ * Reading speed estimated from a study done on reading speeds in various languages.
+ * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
+ *
+ * @return object with characters per minute and variance. Defaults to English
+ * if no suitable language is found in the collection.
+ */
+ _getReadingSpeedForLanguage(lang) {
+ const readingSpeed = new Map([
+ [ "en", {cpm: 987, variance: 118 } ],
+ [ "ar", {cpm: 612, variance: 88 } ],
+ [ "de", {cpm: 920, variance: 86 } ],
+ [ "es", {cpm: 1025, variance: 127 } ],
+ [ "fi", {cpm: 1078, variance: 121 } ],
+ [ "fr", {cpm: 998, variance: 126 } ],
+ [ "he", {cpm: 833, variance: 130 } ],
+ [ "it", {cpm: 950, variance: 140 } ],
+ [ "jw", {cpm: 357, variance: 56 } ],
+ [ "nl", {cpm: 978, variance: 143 } ],
+ [ "pl", {cpm: 916, variance: 126 } ],
+ [ "pt", {cpm: 913, variance: 145 } ],
+ [ "ru", {cpm: 986, variance: 175 } ],
+ [ "sk", {cpm: 885, variance: 145 } ],
+ [ "sv", {cpm: 917, variance: 156 } ],
+ [ "tr", {cpm: 1054, variance: 156 } ],
+ [ "zh", {cpm: 255, variance: 29 } ],
+ ]);
+
+ return readingSpeed.get(lang) || readingSpeed.get("en");
+ },
};