From b70d884598e1e14b99190e1e5c349553ff59849b Mon Sep 17 00:00:00 2001
From: Ascrod <32915892+Ascrod@users.noreply.github.com>
Date: Mon, 7 May 2018 19:45:46 -0400
Subject: Initial updates for Reader View.

---
 toolkit/components/reader/ReaderMode.jsm | 312 ++++++++++++++++++++++---------
 1 file changed, 225 insertions(+), 87 deletions(-)

(limited to 'toolkit/components/reader/ReaderMode.jsm')

diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm
index 033a02489..e9eb83154 100644
--- a/toolkit/components/reader/ReaderMode.jsm
+++ b/toolkit/components/reader/ReaderMode.jsm
@@ -8,15 +8,18 @@ this.EXPORTED_SYMBOLS = ["ReaderMode"];
 
 const { classes: Cc, interfaces: Ci, utils: Cu } = Components;
 
-// Constants for telemetry.
-const DOWNLOAD_SUCCESS = 0;
-const DOWNLOAD_ERROR_XHR = 1;
-const DOWNLOAD_ERROR_NO_DOC = 2;
-
-const PARSE_SUCCESS = 0;
-const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
-const PARSE_ERROR_WORKER = 2;
-const PARSE_ERROR_NO_ARTICLE = 3;
+// Class names to preserve in the readerized output. We preserve these class
+// names so that rules in aboutReader.css can match them.
+const CLASSES_TO_PRESERVE = [
+  "caption",
+  "hidden",
+  "invisible",
+  "sr-only",
+  "visually-hidden",
+  "visuallyhidden",
+  "wp-caption",
+  "wp-caption-text",
+];
 
 Cu.import("resource://gre/modules/Services.jsm");
 Cu.import("resource://gre/modules/XPCOMUtils.jsm");
@@ -24,17 +27,15 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm");
 Cu.importGlobalProperties(["XMLHttpRequest"]);
 
 XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js");
-XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm");
 XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm");
 XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "TelemetryStopwatch", "resource://gre/modules/TelemetryStopwatch.jsm");
 
 XPCOMUtils.defineLazyGetter(this, "Readability", function() {
   let scope = {};
   scope.dump = this.dump;
   Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope);
-  return scope["Readability"];
+  return scope.Readability;
 });
 
 this.ReaderMode = {
@@ -61,21 +62,13 @@ this.ReaderMode = {
     return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad();
   },
 
-  get isOnLowMemoryPlatform() {
-    let memory = Cc["@mozilla.org/xpcom/memory-service;1"].getService(Ci.nsIMemory);
-    delete this.isOnLowMemoryPlatform;
-    return this.isOnLowMemoryPlatform = memory.isLowMemoryPlatform();
-  },
-
-  _getStateForParseOnLoad: function () {
+  _getStateForParseOnLoad() {
     let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled");
     let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled");
-    // For low-memory devices, don't allow reader mode since it takes up a lot of memory.
-    // See https://bugzilla.mozilla.org/show_bug.cgi?id=792603 for details.
-    return isForceEnabled || (isEnabled && !this.isOnLowMemoryPlatform);
+    return isForceEnabled || isEnabled;
   },
 
-  observe: function(aMessage, aTopic, aData) {
+  observe(aMessage, aTopic, aData) {
     switch (aTopic) {
       case "nsPref:changed":
         if (aData.startsWith("reader.parse-on-load.")) {
@@ -91,7 +84,7 @@ this.ReaderMode = {
    * Enter the reader mode by going forward one step in history if applicable,
    * if not, append the about:reader page in the history instead.
    */
-  enterReaderMode: function(docShell, win) {
+  enterReaderMode(docShell, win) {
     let url = win.document.location.href;
     let readerURL = "about:reader?url=" + encodeURIComponent(url);
     let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
@@ -112,7 +105,7 @@ this.ReaderMode = {
    * Exit the reader mode by going back one step in history if applicable,
    * if not, append the original page in the history instead.
    */
-  leaveReaderMode: function(docShell, win) {
+  leaveReaderMode(docShell, win) {
     let url = win.document.location.href;
     let originalURL = this.getOriginalUrl(url);
     let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
@@ -136,14 +129,14 @@ this.ReaderMode = {
    * @return The original URL for the article, or null if we did not find
    *         a properly formatted about:reader URL.
    */
-  getOriginalUrl: function(url) {
+  getOriginalUrl(url) {
     if (!url.startsWith("about:reader?")) {
       return null;
     }
 
     let outerHash = "";
     try {
-      let uriObj = Services.io.newURI(url, null, null);
+      let uriObj = Services.io.newURI(url);
       url = uriObj.specIgnoringRef;
       outerHash = uriObj.ref;
     } catch (ex) { /* ignore, use the raw string */ }
@@ -155,27 +148,45 @@ this.ReaderMode = {
     let originalUrl = searchParams.get("url");
     if (outerHash) {
       try {
-        let uriObj = Services.io.newURI(originalUrl, null, null);
-        uriObj = Services.io.newURI('#' + outerHash, null, uriObj);
+        let uriObj = Services.io.newURI(originalUrl);
+        uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
         originalUrl = uriObj.spec;
       } catch (ex) {}
     }
     return originalUrl;
   },
 
+  /**
+   * Returns an exposable URI object for the original article URL carried by an
+   * about:reader URL, or null when there is none or a URI fixup call throws.
+   */
+  getOriginalUrlObjectForDisplay(url) {
+    const originalUrl = this.getOriginalUrl(url);
+    if (!originalUrl) {
+      return null;
+    }
+    try {
+      const { uriFixup } = Services;
+      const fixedURI = uriFixup.createFixupURI(originalUrl, uriFixup.FIXUP_FLAG_NONE);
+      return uriFixup.createExposableURI(fixedURI);
+    } catch (ex) {
+      return null;
+    }
+  },
+
   /**
    * Decides whether or not a document is reader-able without parsing the whole thing.
    *
    * @param doc A document to parse.
    * @return boolean Whether or not we should show the reader mode button.
    */
-  isProbablyReaderable: function(doc) {
+  isProbablyReaderable(doc) {
     // Only care about 'real' HTML documents:
     if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) {
       return false;
     }
 
-    let uri = Services.io.newURI(doc.location.href, null, null);
+    let uri = Services.io.newURI(doc.location.href);
     if (!this._shouldCheckUri(uri)) {
       return false;
     }
@@ -187,12 +198,12 @@ this.ReaderMode = {
     return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils));
   },
 
-  isNodeVisible: function(utils, node) {
+  isNodeVisible(utils, node) {
     let bounds = utils.getBoundsWithoutFlushing(node);
     return bounds.height > 0 && bounds.width > 0;
   },
 
-  getUtilsForWin: function(win) {
+  getUtilsForWin(win) {
     return win.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils);
   },
 
@@ -204,16 +215,14 @@ this.ReaderMode = {
    * @return {Promise}
    * @resolves JS object representing the article, or null if no article is found.
    */
-  parseDocument: Task.async(function* (doc) {
-    let documentURI = Services.io.newURI(doc.documentURI, null, null);
-    let baseURI = Services.io.newURI(doc.baseURI, null, null);
-    if (!this._shouldCheckUri(documentURI) || !this._shouldCheckUri(baseURI, true)) {
+  async parseDocument(doc) { // async so the disabled-URI case also yields a Promise, as documented
+    if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
       this.log("Reader mode disabled for URI");
       return null;
     }
 
-    return yield this._readerParse(baseURI, doc);
-  }),
+    return this._readerParse(doc);
+  },
 
   /**
    * Downloads and parses a document from a URL.
@@ -222,19 +231,28 @@ this.ReaderMode = {
    * @return {Promise}
    * @resolves JS object representing the article, or null if no article is found.
    */
-  downloadAndParseDocument: Task.async(function* (url) {
-    let doc = yield this._downloadDocument(url);
-    let uri = Services.io.newURI(doc.baseURI, null, null);
-    if (!this._shouldCheckUri(uri, true)) {
+  async downloadAndParseDocument(url) {
+    let doc = await this._downloadDocument(url);
+    if (!doc) {
+      return null;
+    }
+    if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
       this.log("Reader mode disabled for URI");
       return null;
     }
 
-    return yield this._readerParse(uri, doc);
-  }),
+    return await this._readerParse(doc);
+  },
 
-  _downloadDocument: function (url) {
-    let histogram = Services.telemetry.getHistogramById("READER_MODE_DOWNLOAD_RESULT");
+  _downloadDocument(url) {
+    try {
+      if (!this._shouldCheckUri(Services.io.newURI(url))) {
+        return null;
+      }
+    } catch (ex) {
+      Cu.reportError(new Error(`Couldn't create URI from ${url} to download: ${ex}`));
+      return null;
+    }
     return new Promise((resolve, reject) => {
       let xhr = new XMLHttpRequest();
       xhr.open("GET", url, true);
@@ -243,14 +261,12 @@ this.ReaderMode = {
       xhr.onload = evt => {
         if (xhr.status !== 200) {
           reject("Reader mode XHR failed with status: " + xhr.status);
-          histogram.add(DOWNLOAD_ERROR_XHR);
           return;
         }
 
         let doc = xhr.responseXML;
         if (!doc) {
           reject("Reader mode XHR didn't return a document");
-          histogram.add(DOWNLOAD_ERROR_NO_DOC);
           return;
         }
 
@@ -261,7 +277,7 @@ this.ReaderMode = {
           if (content) {
             let urlIndex = content.toUpperCase().indexOf("URL=");
             if (urlIndex > -1) {
-              let baseURI = Services.io.newURI(url, null, null);
+              let baseURI = Services.io.newURI(url);
               let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI);
               let newURL = newURI.spec;
               let ssm = Services.scriptSecurityManager;
@@ -290,10 +306,10 @@ this.ReaderMode = {
         // Convert these to real URIs to make sure the escaping (or lack
         // thereof) is identical:
         try {
-          responseURL = Services.io.newURI(responseURL, null, null).specIgnoringRef;
+          responseURL = Services.io.newURI(responseURL).specIgnoringRef;
         } catch (ex) { /* Ignore errors - we'll use what we had before */ }
         try {
-          givenURL = Services.io.newURI(givenURL, null, null).specIgnoringRef;
+          givenURL = Services.io.newURI(givenURL).specIgnoringRef;
         } catch (ex) { /* Ignore errors - we'll use what we had before */ }
 
         if (responseURL != givenURL) {
@@ -303,7 +319,6 @@ this.ReaderMode = {
           return;
         }
         resolve(doc);
-        histogram.add(DOWNLOAD_SUCCESS);
       };
       xhr.send();
     });
@@ -318,17 +333,17 @@ this.ReaderMode = {
    * @resolves JS object representing the article, or null if no article is found.
    * @rejects OS.File.Error
    */
-  getArticleFromCache: Task.async(function* (url) {
+  async getArticleFromCache(url) {
     let path = this._toHashedPath(url);
     try {
-      let array = yield OS.File.read(path);
+      let array = await OS.File.read(path);
       return JSON.parse(new TextDecoder().decode(array));
     } catch (e) {
       if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile)
         throw e;
       return null;
     }
-  }),
+  },
 
   /**
    * Stores an article in the cache.
@@ -338,14 +353,14 @@ this.ReaderMode = {
    * @resolves When the article is stored.
    * @rejects OS.File.Error
    */
-  storeArticleInCache: Task.async(function* (article) {
+  async storeArticleInCache(article) {
     let array = new TextEncoder().encode(JSON.stringify(article));
     let path = this._toHashedPath(article.url);
-    yield this._ensureCacheDir();
+    await this._ensureCacheDir();
     return OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" })
       .then(success => {
         OS.File.stat(path).then(info => {
-          return Messaging.sendRequest({
+          return EventDispatcher.instance.sendRequest({
             type: "Reader:AddedToCache",
             url: article.url,
             size: info.size,
@@ -353,7 +368,7 @@ this.ReaderMode = {
           });
         });
       });
-  }),
+  },
 
   /**
    * Removes an article from the cache given an article URI.
@@ -363,26 +378,29 @@ this.ReaderMode = {
    * @resolves When the article is removed.
    * @rejects OS.File.Error
    */
-  removeArticleFromCache: Task.async(function* (url) {
+  async removeArticleFromCache(url) {
     let path = this._toHashedPath(url);
-    yield OS.File.remove(path);
-  }),
+    await OS.File.remove(path);
+  },
 
-  log: function(msg) {
+  log(msg) {
     if (this.DEBUG)
       dump("Reader: " + msg);
   },
 
   _blockedHosts: [
-    "mail.google.com",
+    "amazon.com",
+    "basilisk-browser.org",
     "github.com",
+    "mail.google.com",
+    "palemoon.org",
     "pinterest.com",
     "reddit.com",
     "twitter.com",
     "youtube.com",
   ],
 
-  _shouldCheckUri: function (uri, isBaseUri = false) {
+  _shouldCheckUri(uri, isBaseUri = false) {
     if (!(uri.schemeIs("http") || uri.schemeIs("https"))) {
       this.log("Not parsing URI scheme: " + uri.scheme);
       return false;
@@ -412,59 +430,77 @@ this.ReaderMode = {
    * Attempts to parse a document into an article. Heavy lifting happens
    * in readerWorker.js.
    *
-   * @param uri The base URI of the article.
    * @param doc The document to parse.
    * @return {Promise}
    * @resolves JS object representing the article, or null if no article is found.
    */
-  _readerParse: Task.async(function* (uri, doc) {
-    let histogram = Services.telemetry.getHistogramById("READER_MODE_PARSE_RESULT");
+  async _readerParse(doc) {
     if (this.parseNodeLimit) {
       let numTags = doc.getElementsByTagName("*").length;
       if (numTags > this.parseNodeLimit) {
-        this.log("Aborting parse for " + uri.spec + "; " + numTags + " elements found");
-        histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
+        this.log("Aborting parse for " + doc.baseURIObject.spec + "; " + numTags + " elements found");
         return null;
       }
     }
 
+    // Fetch this here before we send `doc` off to the worker thread, as later on the
+    // document might be nuked but we will still want the URI.
+    let {documentURI} = doc;
+
     let uriParam = {
-      spec: uri.spec,
-      host: uri.host,
-      prePath: uri.prePath,
-      scheme: uri.scheme,
-      pathBase: Services.io.newURI(".", null, uri).spec
+      spec: doc.baseURIObject.spec,
+      host: doc.baseURIObject.host,
+      prePath: doc.baseURIObject.prePath,
+      scheme: doc.baseURIObject.scheme,
+      pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec
+    };
+
+    let langAttributes = {
+      charset: doc.characterSet,
+      lang: doc.documentElement.lang
     };
 
     let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"].
                      createInstance(Ci.nsIDOMSerializer);
     let serializedDoc = serializer.serializeToString(doc);
 
+    let options = {
+      classesToPreserve: CLASSES_TO_PRESERVE,
+    };
+
     let article = null;
     try {
-      article = yield ReaderWorker.post("parseDocument", [uriParam, serializedDoc]);
+      article = await ReaderWorker.post("parseDocument", [uriParam, serializedDoc, options]);
     } catch (e) {
       Cu.reportError("Error in ReaderWorker: " + e);
-      histogram.add(PARSE_ERROR_WORKER);
     }
 
+    // Explicitly null out doc to make it clear it might not be available from this
+    // point on.
+    doc = null;
+
     if (!article) {
       this.log("Worker did not return an article");
-      histogram.add(PARSE_ERROR_NO_ARTICLE);
       return null;
     }
 
-    // Readability returns a URI object, but we only care about the URL.
-    article.url = article.uri.spec;
+    // Readability returns a URI object based on the baseURI, but we only care
+    // about the original document's URL from now on. This also avoids spoofing
+    // attempts where the baseURI doesn't match the domain of the documentURI
+    article.url = documentURI;
     delete article.uri;
 
     let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
     article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils)
                                                     .convertToPlainText(article.title, flags, 0);
 
-    histogram.add(PARSE_SUCCESS);
+    await this._assignLanguage(article, langAttributes);
+    this._maybeAssignTextDirection(article);
+
+    this._assignReadTime(article);
+
     return article;
-  }),
+  },
 
   get _cryptoHash() {
     delete this._cryptoHash;
@@ -485,7 +521,7 @@ this.ReaderMode = {
    * @param url The article URL. This should have referrers removed.
    * @return The file path to the cached article.
    */
-  _toHashedPath: function (url) {
+  _toHashedPath(url) {
     let value = this._unicodeConverter.convertToByteArray(url);
     this._cryptoHash.init(this._cryptoHash.MD5);
     this._cryptoHash.update(value, value.length);
@@ -502,7 +538,7 @@ this.ReaderMode = {
    * @resolves When the cache directory exists.
    * @rejects OS.File.Error
    */
-  _ensureCacheDir: function () {
+  _ensureCacheDir() {
     let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache");
     return OS.File.exists(dir).then(exists => {
       if (!exists) {
@@ -510,5 +546,107 @@ this.ReaderMode = {
       }
       return undefined;
     });
-  }
+  },
+
+  /**
+   * Sets article.language. If language detection is available, use that. Otherwise
+   * (the detector fails to load or its promise rejects), fall back to the simpler
+   * mechanism using the document's lang attribute or charset (`attributes`).
+   *
+   * @return Promise
+   * @resolves when the language has been assigned
+   */
+  _assignLanguage(article, attributes) {
+    let detection;
+    try {
+      // NOTE(review): only two slashes follow "resource:" here — confirm this URL loads.
+      Cu.import("resource://modules/translation/LanguageDetector.jsm");
+      detection = LanguageDetector.detectLanguage(article.textContent)
+        .then(result => result.confident ? result.language : null);
+    } catch (ex) {
+      detection = Promise.reject(ex);
+    }
+    return detection
+      .catch(() => this._assignSimpleLanguage(attributes))
+      .then(language => { article.language = language; });
+  },
+
+  // Fallback language guess from `attributes` ({lang, charset}): the first two characters
+  // of `lang` if non-empty, else the language implied by the charset, else undefined.
+  _assignSimpleLanguage(attributes) {
+    let lang = (attributes.lang || "").substring(0, 2);
+    if (lang) {
+      return lang;
+    }
+    // If there is no lang attribute, try the charset. Only charsets specific to one
+    // language are listed; keys are lowercase, so the reported label is lowercased too.
+    const charsetLang = new Map([
+      [ "us-ascii",    "en" ],
+      [ "iso-8859-6",  "ar" ],
+      [ "iso-8859-7",  "el" ],
+      [ "iso-8859-8",  "he" ],
+      [ "iso-8859-9",  "tr" ],
+      [ "iso-8859-11", "th" ],
+      [ "jis_x0201",   "ja" ],
+      [ "shift_jis",   "ja" ],
+      [ "euc-jp",      "ja" ]
+    ]);
+    return charsetLang.get((attributes.charset || "").toLowerCase());
+  },
+
+  // Marks the article as right-to-left when it has no dir yet and its language is a listed RTL code.
+  _maybeAssignTextDirection(article) {
+    // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
+    const rtlLanguages = ["ar", "fa", "he", "ug", "ur"];
+    if (!article.dir && rtlLanguages.includes(article.language)) { article.dir = "rtl"; }
+  },
+
+  /**
+   * Stores a reading time estimate on the article as readingTimeMinsSlow and
+   * readingTimeMinsFast, both in whole minutes rounded up.
+   * @param article the article object to assign the reading time estimate to.
+   */
+  _assignReadTime(article) {
+    // Without a language on the article, estimate using the English reading speed.
+    const { cpm, variance } = this._getReadingSpeedForLanguage(article.language || "en");
+    const { length } = article;
+    // The slow estimate uses the low end of the speed range, the fast one the high end.
+    const slowCharsPerMinute = cpm - variance;
+    const fastCharsPerMinute = cpm + variance;
+    article.readingTimeMinsSlow = Math.ceil(length / slowCharsPerMinute);
+    article.readingTimeMinsFast = Math.ceil(length / fastCharsPerMinute);
+  },
+
+  /**
+   * Returns the reading speed of a selection of languages with likely variance.
+   *
+   * Reading speed estimated from a study done on reading speeds in various languages.
+   * The study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
+   * @param lang language code to look up, e.g. "en" (ISO 639-1 codes are used as keys).
+   * @return object with characters per minute and variance. Defaults to English
+   *         if no suitable language is found in the collection.
+   */
+  _getReadingSpeedForLanguage(lang) {
+    const readingSpeed = new Map([
+      [ "en", {cpm: 987,  variance: 118 } ],
+      [ "ar", {cpm: 612,  variance: 88 } ],
+      [ "de", {cpm: 920,  variance: 86 } ],
+      [ "es", {cpm: 1025, variance: 127 } ],
+      [ "fi", {cpm: 1078, variance: 121 } ],
+      [ "fr", {cpm: 998,  variance: 126 } ],
+      [ "he", {cpm: 833,  variance: 130 } ],
+      [ "it", {cpm: 950,  variance: 140 } ],
+      [ "ja", {cpm: 357,  variance: 56 } ], // Japanese
+      [ "nl", {cpm: 978,  variance: 143 } ],
+      [ "pl", {cpm: 916,  variance: 126 } ],
+      [ "pt", {cpm: 913,  variance: 145 } ],
+      [ "ru", {cpm: 986,  variance: 175 } ],
+      [ "sl", {cpm: 885,  variance: 145 } ], // Slovenian
+      [ "sv", {cpm: 917,  variance: 156 } ],
+      [ "tr", {cpm: 1054, variance: 156 } ],
+      [ "zh", {cpm: 255,  variance: 29 } ],
+    ]);
+
+    return readingSpeed.get(lang) || readingSpeed.get("en");
+  },
 };
-- 
cgit v1.2.3