diff options
Diffstat (limited to 'toolkit/components/reader/ReaderMode.jsm')
-rw-r--r-- | toolkit/components/reader/ReaderMode.jsm | 514 |
1 files changed, 514 insertions, 0 deletions
// Provenance (diff header this file was extracted from):
//   diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm
//   new file mode 100644
//   index 000000000..033a02489
//   --- /dev/null
//   +++ b/toolkit/components/reader/ReaderMode.jsm
//   @@ -0,0 +1,514 @@
// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";

this.EXPORTED_SYMBOLS = ["ReaderMode"];

const { classes: Cc, interfaces: Ci, utils: Cu } = Components;

// Constants for telemetry.
const DOWNLOAD_SUCCESS = 0;
const DOWNLOAD_ERROR_XHR = 1;
const DOWNLOAD_ERROR_NO_DOC = 2;

const PARSE_SUCCESS = 0;
const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
const PARSE_ERROR_WORKER = 2;
const PARSE_ERROR_NO_ARTICLE = 3;

Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/XPCOMUtils.jsm");

Cu.importGlobalProperties(["XMLHttpRequest"]);

XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js");
XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "TelemetryStopwatch", "resource://gre/modules/TelemetryStopwatch.jsm");

// Readability.js is loaded as a sub-script into a private scope on first use;
// the scope is given this module's dump() before the script is loaded.
XPCOMUtils.defineLazyGetter(this, "Readability", function() {
  let scope = {};
  scope.dump = this.dump;
  Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope);
  return scope["Readability"];
});

this.ReaderMode = {
  // Version of the cache schema.
  CACHE_VERSION: 1,

  // Set to a truthy value to make log() write its messages through dump().
  DEBUG: 0,

  /**
   * Don't try to parse the page if it has too many elements (for memory and
   * performance reasons).
   *
   * Lazily reads the "reader.parse-node-limit" pref on first access, starts
   * observing it, and then replaces itself with the cached value. The value
   * is mirrored to `parseNodeLimit`, the property name the original code
   * cached under.
   */
  get maxElemsToParse() {
    Services.prefs.addObserver("reader.parse-node-limit", this, false);
    return this._cacheParseNodeLimit(Services.prefs.getIntPref("reader.parse-node-limit"));
  },

  /**
   * Stores the node limit as plain data properties, replacing the lazy
   * `maxElemsToParse` getter if it is still in place.
   *
   * @param limit Integer value of the "reader.parse-node-limit" pref.
   * @return The same limit, for convenient chaining.
   */
  _cacheParseNodeLimit: function (limit) {
    // The accessor has no setter, so it has to be removed before assigning.
    delete this.maxElemsToParse;
    this.maxElemsToParse = limit;
    this.parseNodeLimit = limit;
    return limit;
  },

  /**
   * Whether pages should be parsed on load. Lazily computed on first access
   * and kept up to date by observing the "reader.parse-on-load." pref branch.
   */
  get isEnabledForParseOnLoad() {
    delete this.isEnabledForParseOnLoad;

    // Listen for future pref changes.
    Services.prefs.addObserver("reader.parse-on-load.", this, false);

    return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad();
  },

  /**
   * Whether the memory service reports a low-memory platform. Queried once
   * and then cached as a plain property.
   */
  get isOnLowMemoryPlatform() {
    let memory = Cc["@mozilla.org/xpcom/memory-service;1"].getService(Ci.nsIMemory);
    delete this.isOnLowMemoryPlatform;
    return this.isOnLowMemoryPlatform = memory.isLowMemoryPlatform();
  },

  /**
   * Computes the parse-on-load state from the current pref values.
   *
   * @return boolean True when force-enabled, or when enabled and not running
   *         on a low-memory platform.
   */
  _getStateForParseOnLoad: function () {
    let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled");
    let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled");
    // For low-memory devices, don't allow reader mode since it takes up a lot of memory.
    // See https://bugzilla.mozilla.org/show_bug.cgi?id=792603 for details.
    return isForceEnabled || (isEnabled && !this.isOnLowMemoryPlatform);
  },

  /**
   * Observer callback; refreshes the cached pref-derived values when one of
   * the observed prefs changes.
   */
  observe: function(aMessage, aTopic, aData) {
    switch (aTopic) {
      case "nsPref:changed":
        if (aData.startsWith("reader.parse-on-load.")) {
          this.isEnabledForParseOnLoad = this._getStateForParseOnLoad();
        } else if (aData === "reader.parse-node-limit") {
          this._cacheParseNodeLimit(Services.prefs.getIntPref(aData));
        }
        break;
    }
  },

  /**
   * Enter the reader mode by going forward one step in history if applicable,
   * if not, append the about:reader page in the history instead.
   */
  enterReaderMode: function(docShell, win) {
    let url = win.document.location.href;
    let readerURL = "about:reader?url=" + encodeURIComponent(url);
    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
    let sh = webNav.sessionHistory;
    if (webNav.canGoForward) {
      let forwardEntry = sh.getEntryAtIndex(sh.index + 1, false);
      let forwardURL = forwardEntry.URI.spec;
      // readerURL is always a non-empty string, so a plain comparison suffices.
      if (forwardURL && forwardURL == readerURL) {
        webNav.goForward();
        return;
      }
    }

    win.document.location = readerURL;
  },

  /**
   * Exit the reader mode by going back one step in history if applicable,
   * if not, append the original page in the history instead.
   *
   * If the current URL is not a well-formed about:reader URL and there is no
   * history entry to go back to, nothing happens.
   */
  leaveReaderMode: function(docShell, win) {
    let url = win.document.location.href;
    let originalURL = this.getOriginalUrl(url);
    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
    let sh = webNav.sessionHistory;
    if (webNav.canGoBack) {
      let prevEntry = sh.getEntryAtIndex(sh.index - 1, false);
      let prevURL = prevEntry.URI.spec;
      if (prevURL && (prevURL == originalURL || !originalURL)) {
        webNav.goBack();
        return;
      }
    }

    // getOriginalUrl() returns null for malformed about:reader URLs; assigning
    // null to the location would navigate to the literal string "null".
    if (!originalURL) {
      this.log("Cannot leave reader mode, no original URL in: " + url);
      return;
    }

    win.document.location = originalURL;
  },

  /**
   * Returns original URL from an about:reader URL.
   *
   * @param url An about:reader URL.
   * @return The original URL for the article, or null if we did not find
   *         a properly formatted about:reader URL.
   */
  getOriginalUrl: function(url) {
    if (!url.startsWith("about:reader?")) {
      return null;
    }

    // A ref on the about:reader URL itself is split off here and re-applied
    // to the article URL below.
    let outerHash = "";
    try {
      let uriObj = Services.io.newURI(url, null, null);
      url = uriObj.specIgnoringRef;
      outerHash = uriObj.ref;
    } catch (ex) { /* ignore, use the raw string */ }

    let searchParams = new URLSearchParams(url.substring("about:reader?".length));
    if (!searchParams.has("url")) {
      return null;
    }
    let originalUrl = searchParams.get("url");
    if (outerHash) {
      try {
        let uriObj = Services.io.newURI(originalUrl, null, null);
        uriObj = Services.io.newURI('#' + outerHash, null, uriObj);
        originalUrl = uriObj.spec;
      } catch (ex) { /* best effort: keep the article URL without the ref */ }
    }
    return originalUrl;
  },

  /**
   * Decides whether or not a document is reader-able without parsing the whole thing.
   *
   * @param doc A document to parse.
   * @return boolean Whether or not we should show the reader mode button.
   */
  isProbablyReaderable: function(doc) {
    // Only care about 'real' HTML documents:
    if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) {
      return false;
    }

    let uri = Services.io.newURI(doc.location.href, null, null);
    if (!this._shouldCheckUri(uri)) {
      return false;
    }

    let utils = this.getUtilsForWin(doc.defaultView);
    // We pass in a helper function to determine if a node is visible, because
    // it uses gecko APIs that the engine-agnostic readability code can't rely
    // upon.
    return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils));
  },

  /**
   * @param utils The nsIDOMWindowUtils of the node's window.
   * @param node  The node to test.
   * @return boolean True when the node's bounds have a non-zero width and height.
   */
  isNodeVisible: function(utils, node) {
    let bounds = utils.getBoundsWithoutFlushing(node);
    return bounds.height > 0 && bounds.width > 0;
  },

  /**
   * @param win A window.
   * @return The nsIDOMWindowUtils interface of that window.
   */
  getUtilsForWin: function(win) {
    return win.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils);
  },

  /**
   * Gets an article from a loaded browser's document. This method will not attempt
   * to parse certain URIs (e.g. about: URIs).
   *
   * @param doc A document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  parseDocument: Task.async(function* (doc) {
    let documentURI = Services.io.newURI(doc.documentURI, null, null);
    let baseURI = Services.io.newURI(doc.baseURI, null, null);
    if (!this._shouldCheckUri(documentURI) || !this._shouldCheckUri(baseURI, true)) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return yield this._readerParse(baseURI, doc);
  }),

  /**
   * Downloads and parses a document from a URL.
   *
   * @param url URL to download and parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  downloadAndParseDocument: Task.async(function* (url) {
    let doc = yield this._downloadDocument(url);
    let uri = Services.io.newURI(doc.baseURI, null, null);
    if (!this._shouldCheckUri(uri, true)) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return yield this._readerParse(uri, doc);
  }),

  /**
   * Looks for a <meta http-equiv=refresh> tag carrying a "URL=" target and
   * resolves that target against the requested URL.
   *
   * @param doc The downloaded document.
   * @param url The URL the document was requested from.
   * @return An object { baseURI, newURI }, or null when there is no refresh
   *         target or it cannot be turned into a URI.
   */
  _getMetaRefresh: function (doc, url) {
    let meta = doc.querySelector("meta[http-equiv=refresh]");
    let content = meta && meta.getAttribute("content");
    if (!content) {
      return null;
    }

    let urlIndex = content.toUpperCase().indexOf("URL=");
    if (urlIndex == -1) {
      return null;
    }

    try {
      let baseURI = Services.io.newURI(url, null, null);
      let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI);
      return { baseURI, newURI };
    } catch (ex) {
      // newURI() throws on malformed input. Letting that escape the XHR load
      // handler would leave the download promise pending forever.
      this.log("Ignoring malformed meta refresh target: " + ex);
      return null;
    }
  },

  /**
   * Downloads a document over XHR.
   *
   * @param url URL to download.
   * @return {Promise}
   * @resolves The downloaded document.
   * @rejects An error message string, or an object { newURL } when the caller
   *          should retry with a different URL (meta refresh or redirect).
   */
  _downloadDocument: function (url) {
    let histogram = Services.telemetry.getHistogramById("READER_MODE_DOWNLOAD_RESULT");
    return new Promise((resolve, reject) => {
      let xhr = new XMLHttpRequest();
      xhr.open("GET", url, true);
      xhr.onerror = evt => {
        // Record the failure and always reject with something descriptive.
        histogram.add(DOWNLOAD_ERROR_XHR);
        reject(evt.error || "Reader mode XHR failed with a network error");
      };
      xhr.responseType = "document";
      xhr.onload = evt => {
        if (xhr.status !== 200) {
          reject("Reader mode XHR failed with status: " + xhr.status);
          histogram.add(DOWNLOAD_ERROR_XHR);
          return;
        }

        let doc = xhr.responseXML;
        if (!doc) {
          reject("Reader mode XHR didn't return a document");
          histogram.add(DOWNLOAD_ERROR_NO_DOC);
          return;
        }

        // Manually follow a meta refresh tag if one exists.
        let refresh = this._getMetaRefresh(doc, url);
        if (refresh) {
          let newURL = refresh.newURI.spec;
          let ssm = Services.scriptSecurityManager;
          let flags = ssm.LOAD_IS_AUTOMATIC_DOCUMENT_REPLACEMENT |
                      ssm.DISALLOW_INHERIT_PRINCIPAL;
          try {
            ssm.checkLoadURIStrWithPrincipal(doc.nodePrincipal, newURL, flags);
          } catch (ex) {
            let errorMsg = "Reader mode disallowed meta refresh (reason: " + ex + ").";

            if (Services.prefs.getBoolPref("reader.errors.includeURLs")) {
              errorMsg += " Refresh target URI: '" + newURL + "'.";
            }
            reject(errorMsg);
            return;
          }
          // Otherwise, pass an object indicating our new URL:
          if (!refresh.baseURI.equalsExceptRef(refresh.newURI)) {
            reject({newURL});
            return;
          }
        }

        let responseURL = xhr.responseURL;
        let givenURL = url;
        // Convert these to real URIs to make sure the escaping (or lack
        // thereof) is identical:
        try {
          responseURL = Services.io.newURI(responseURL, null, null).specIgnoringRef;
        } catch (ex) { /* Ignore errors - we'll use what we had before */ }
        try {
          givenURL = Services.io.newURI(givenURL, null, null).specIgnoringRef;
        } catch (ex) { /* Ignore errors - we'll use what we had before */ }

        if (responseURL != givenURL) {
          // We were redirected without a meta refresh tag.
          // Force redirect to the correct place:
          reject({newURL: xhr.responseURL});
          return;
        }
        resolve(doc);
        histogram.add(DOWNLOAD_SUCCESS);
      };
      xhr.send();
    });
  },

  /**
   * Retrieves an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   * @rejects OS.File.Error
   */
  getArticleFromCache: Task.async(function* (url) {
    let path = this._toHashedPath(url);
    try {
      let array = yield OS.File.read(path);
      return JSON.parse(new TextDecoder().decode(array));
    } catch (e) {
      // A missing cache file just means "not cached"; anything else is an error.
      if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile) {
        throw e;
      }
      return null;
    }
  }),

  /**
   * Stores an article in the cache.
   *
   * After the file is written, a "Reader:AddedToCache" request carrying the
   * article URL, file size and path is sent through Messaging. That
   * notification is best effort: a failure there is reported but does not
   * fail the store.
   *
   * @param article JS object representing article.
   * @return {Promise}
   * @resolves When the article is stored.
   * @rejects OS.File.Error
   */
  storeArticleInCache: Task.async(function* (article) {
    let array = new TextEncoder().encode(JSON.stringify(article));
    let path = this._toHashedPath(article.url);
    yield this._ensureCacheDir();
    yield OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" });

    try {
      let info = yield OS.File.stat(path);
      yield Messaging.sendRequest({
        type: "Reader:AddedToCache",
        url: article.url,
        size: info.size,
        path: path,
      });
    } catch (ex) {
      Cu.reportError("Reader: could not announce cached article: " + ex);
    }
  }),

  /**
   * Removes an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves When the article is removed.
   * @rejects OS.File.Error
   */
  removeArticleFromCache: Task.async(function* (url) {
    let path = this._toHashedPath(url);
    yield OS.File.remove(path);
  }),

  /**
   * Writes a debug message through dump() when DEBUG is truthy.
   *
   * @param msg The message to log.
   */
  log: function(msg) {
    if (this.DEBUG) {
      dump("Reader: " + msg + "\n");
    }
  },

  // Hosts (including their subdomains) that are never offered reader mode.
  _blockedHosts: [
    "mail.google.com",
    "github.com",
    "pinterest.com",
    "reddit.com",
    "twitter.com",
    "youtube.com",
  ],

  /**
   * Decides whether a URI is eligible for reader mode.
   *
   * @param uri       The nsIURI to check.
   * @param isBaseUri When true, only the scheme and URL well-formedness are
   *                  checked; the blocked-host and home-page checks are skipped.
   * @return boolean True when the URI may be parsed.
   */
  _shouldCheckUri: function (uri, isBaseUri = false) {
    if (!(uri.schemeIs("http") || uri.schemeIs("https"))) {
      this.log("Not parsing URI scheme: " + uri.scheme);
      return false;
    }

    try {
      uri.QueryInterface(Ci.nsIURL);
    } catch (ex) {
      // If this doesn't work, presumably the URL is not well-formed or something
      return false;
    }

    if (isBaseUri) {
      return true;
    }

    // Sadly, some high-profile pages have false positives, so bail early for those.
    // Match on a domain-label boundary so "notgithub.com" is not treated as "github.com".
    let asciiHost = uri.asciiHost;
    let isBlocked = this._blockedHosts.some(blockedHost =>
      asciiHost == blockedHost || asciiHost.endsWith("." + blockedHost));
    if (isBlocked) {
      return false;
    }

    if (!uri.filePath || uri.filePath == "/") {
      this.log("Not parsing home page: " + uri.spec);
      return false;
    }

    return true;
  },

  /**
   * Attempts to parse a document into an article. Heavy lifting happens
   * in readerWorker.js.
   *
   * @param uri The base URI of the article.
   * @param doc The document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  _readerParse: Task.async(function* (uri, doc) {
    let histogram = Services.telemetry.getHistogramById("READER_MODE_PARSE_RESULT");

    // Read the limit through the lazy getter so it is actually initialised;
    // a limit of 0 disables the check.
    let nodeLimit = this.maxElemsToParse;
    if (nodeLimit) {
      let numTags = doc.getElementsByTagName("*").length;
      if (numTags > nodeLimit) {
        this.log("Aborting parse for " + uri.spec + "; " + numTags + " elements found");
        histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
        return null;
      }
    }

    // Plain-object copy of the URI parts that is posted to the worker.
    let uriParam = {
      spec: uri.spec,
      host: uri.host,
      prePath: uri.prePath,
      scheme: uri.scheme,
      pathBase: Services.io.newURI(".", null, uri).spec
    };

    let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"].
                     createInstance(Ci.nsIDOMSerializer);
    let serializedDoc = serializer.serializeToString(doc);

    let article = null;
    try {
      article = yield ReaderWorker.post("parseDocument", [uriParam, serializedDoc]);
    } catch (e) {
      Cu.reportError("Error in ReaderWorker: " + e);
      histogram.add(PARSE_ERROR_WORKER);
      // Return here so the failure is not also counted as "no article".
      return null;
    }

    if (!article) {
      this.log("Worker did not return an article");
      histogram.add(PARSE_ERROR_NO_ARTICLE);
      return null;
    }

    // Readability returns a URI object, but we only care about the URL.
    article.url = article.uri.spec;
    delete article.uri;

    let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
    article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils)
                                                    .convertToPlainText(article.title, flags, 0);

    histogram.add(PARSE_SUCCESS);
    return article;
  }),

  // Lazily created nsICryptoHash instance, reused for every hashed path.
  get _cryptoHash() {
    delete this._cryptoHash;
    return this._cryptoHash = Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash);
  },

  // Lazily created nsIScriptableUnicodeConverter with its charset set to "utf8".
  get _unicodeConverter() {
    delete this._unicodeConverter;
    this._unicodeConverter = Cc["@mozilla.org/intl/scriptableunicodeconverter"]
                              .createInstance(Ci.nsIScriptableUnicodeConverter);
    this._unicodeConverter.charset = "utf8";
    return this._unicodeConverter;
  },

  // Directory inside the profile that holds the cached articles.
  get _cacheDirPath() {
    return OS.Path.join(OS.Constants.Path.profileDir, "readercache");
  },

  /**
   * Calculate the hashed path for a stripped article URL.
   *
   * @param url The article URL. This should have referrers removed.
   * @return The file path to the cached article.
   */
  _toHashedPath: function (url) {
    let value = this._unicodeConverter.convertToByteArray(url);
    this._cryptoHash.init(this._cryptoHash.MD5);
    this._cryptoHash.update(value, value.length);

    let hash = CommonUtils.encodeBase32(this._cryptoHash.finish(false));
    // Strip the trailing "=" padding. Unlike substring(0, indexOf("=")), this
    // cannot produce an empty name when there is no padding.
    let fileName = hash.replace(/=+$/, "") + ".json";
    return OS.Path.join(this._cacheDirPath, fileName);
  },

  /**
   * Ensures the cache directory exists.
   *
   * @return Promise
   * @resolves When the cache directory exists.
   * @rejects OS.File.Error
   */
  _ensureCacheDir: function () {
    // makeDir() tolerates an existing directory, which avoids the race of
    // checking for existence first and creating afterwards.
    return OS.File.makeDir(this._cacheDirPath, { ignoreExisting: true });
  }
};