Update Readability from mozilla-central release branch (FF 60.0).

author: Ascrod <32915892+Ascrod@users.noreply.github.com> 2018-06-01 21:32:59 -0400
committer: Ascrod <32915892+Ascrod@users.noreply.github.com> 2018-06-02 15:49:59 -0400
commit: 984dad43ae49ba1cfa236af5567bf4c934e59263 (patch)
tree: 9a0e801344d8588d3b3f0d034313449c8a00e79f /toolkit/components
parent: 7d9e2a108478d1fc79aa25b2ac56b62249e31dc5 (diff)
download: UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar
UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar.gz
UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar.lz
UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar.xz
UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.zip
3 files changed, 57 insertions, 33 deletions
diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js
index 38f59c4ea..dd9d37230 100644
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -560,7 +560,8 @@
     },
   };
 
-  var Document = function () {
+  var Document = function (url) {
+    this.documentURI = url;
     this.styleSheets = [];
     this.childNodes = [];
     this.children = [];
@@ -600,6 +601,20 @@
       node.textContent = text;
       return node;
     },
+
+    get baseURI() {
+      if (!this.hasOwnProperty("_baseURI")) {
+        this._baseURI = this.documentURI;
+        var baseElements = this.getElementsByTagName("base");
+        var href = baseElements[0] && baseElements[0].getAttribute("href");
+        if (href) {
+          try {
+            this._baseURI = (new URL(href, this._baseURI)).href;
+          } catch (ex) {/* Just fall back to documentURI */}
+        }
+      }
+      return this._baseURI;
+    },
   };
 
   var Element = function (tag) {
@@ -1118,9 +1133,9 @@
     /**
      * Parses an HTML string and returns a JS implementation of the Document.
      */
-    parse: function (html) {
+    parse: function (html, url) {
       this.html = html;
-      var doc = this.doc = new Document();
+      var doc = this.doc = new Document(url);
       this.readChildren(doc);
 
       // If this is an HTML document, remove root-level children except for the
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 04949dc61..064d2ae88 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -41,6 +41,7 @@ function Readability(uri, doc, options) {
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
+  this._attempts = [];
 
   // Configurable options
   this._debug = !!options.debug;
@@ -275,34 +276,20 @@ Readability.prototype = {
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
-    var scheme = this._uri.scheme;
-    var prePath = this._uri.prePath;
-    var pathBase = this._uri.pathBase;
-
+    var baseURI = this._doc.baseURI;
+    var documentURI = this._doc.documentURI;
     function toAbsoluteURI(uri) {
-      // If this is already an absolute URI, return it.
-      if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
-        return uri;
-
-      // Scheme-rooted relative URI.
-      if (uri.substr(0, 2) == "//")
-        return scheme + "://" + uri.substr(2);
-
-      // Prepath-rooted relative URI.
-      if (uri[0] == "/")
-        return prePath + uri;
-
-      // Dotslash relative URI.
-      if (uri.indexOf("./") === 0)
-        return pathBase + uri.slice(2);
-
-      // Ignore hash URIs:
-      if (uri[0] == "#")
+      // Leave hash links alone if the base URI matches the document URI:
+      if (baseURI == documentURI && uri.charAt(0) == "#") {
         return uri;
-
-      // Standard relative URI; add entire path. pathBase already includes a
-      // trailing "/".
-      return pathBase + uri;
+      }
+      // Otherwise, resolve against base URI:
+      try {
+        return new URL(uri, baseURI).href;
+      } catch (ex) {
+        // Something went wrong, just return the original:
+      }
+      return uri;
     }
 
     var links = articleContent.getElementsByTagName("a");
@@ -535,6 +522,7 @@ Readability.prototype = {
     this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
     this._clean(articleContent, "footer");
+    this._clean(articleContent, "link");
 
     // Clean out elements have "share" in their id/class combinations from final top candidates,
     // which means we don't remove the top candidates even they have "share".
@@ -1089,24 +1077,45 @@ Readability.prototype = {
       if (this._debug)
         this.log("Article content after paging: " + articleContent.innerHTML);
 
+      var parseSuccessful = true;
+
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
-      if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
+      var textLength = this._getInnerText(articleContent, true).length;
+      if (textLength < this._wordThreshold) {
+        parseSuccessful = false;
         page.innerHTML = pageCacheHtml;
 
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else {
-          return null;
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
+          // No luck after removing flags, just return the longest text we found during the different loops.
+          this._attempts.sort(function (a, b) {
+            return b.textLength - a.textLength; // longest first; sort() needs a signed number, not a boolean
+          });
+
+          // But first check if we actually have something
+          if (!this._attempts[0].textLength) {
+            return null;
+          }
+
+          articleContent = this._attempts[0].articleContent;
+          parseSuccessful = true;
         }
-      } else {
+      }
+
+      if (parseSuccessful) {
         // Find out text direction from ancestors of final top candidate.
         var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
         this._someNode(ancestors, function(ancestor) {
diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js
index 9ae589d7d..69426788b 100644
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -47,7 +47,7 @@ var Agent = {
    * @return {object} Article object returned from Readability.
    */
   parseDocument(uri, serializedDoc, options) {
-    let doc = new JSDOMParser().parse(serializedDoc);
+    let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
     return new Readability(uri, doc, options).parse();
   },
 };
author	Ascrod <32915892+Ascrod@users.noreply.github.com>	2018-06-01 21:32:59 -0400
committer	Ascrod <32915892+Ascrod@users.noreply.github.com>	2018-06-02 15:49:59 -0400
commit	984dad43ae49ba1cfa236af5567bf4c934e59263 (patch)
tree	9a0e801344d8588d3b3f0d034313449c8a00e79f /toolkit/components
parent	7d9e2a108478d1fc79aa25b2ac56b62249e31dc5 (diff)
download	UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar.gz UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar.lz UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.tar.xz UXP-984dad43ae49ba1cfa236af5567bf4c934e59263.zip