From 984dad43ae49ba1cfa236af5567bf4c934e59263 Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Fri, 1 Jun 2018 21:32:59 -0400 Subject: Update Readability from mozilla-central release branch (FF 60.0). --- toolkit/components/reader/JSDOMParser.js | 21 ++++++++-- toolkit/components/reader/Readability.js | 67 ++++++++++++++++++------------- toolkit/components/reader/ReaderWorker.js | 2 +- 3 files changed, 57 insertions(+), 33 deletions(-) (limited to 'toolkit/components') diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js index 38f59c4ea..dd9d37230 100644 --- a/toolkit/components/reader/JSDOMParser.js +++ b/toolkit/components/reader/JSDOMParser.js @@ -560,7 +560,8 @@ }, }; - var Document = function () { + var Document = function (url) { + this.documentURI = url; this.styleSheets = []; this.childNodes = []; this.children = []; @@ -600,6 +601,20 @@ node.textContent = text; return node; }, + + get baseURI() { + if (!this.hasOwnProperty("_baseURI")) { + this._baseURI = this.documentURI; + var baseElements = this.getElementsByTagName("base"); + var href = baseElements[0] && baseElements[0].getAttribute("href"); + if (href) { + try { + this._baseURI = (new URL(href, this._baseURI)).href; + } catch (ex) {/* Just fall back to documentURI */} + } + } + return this._baseURI; + }, }; var Element = function (tag) { @@ -1118,9 +1133,9 @@ /** * Parses an HTML string and returns a JS implementation of the Document. */ - parse: function (html) { + parse: function (html, url) { this.html = html; - var doc = this.doc = new Document(); + var doc = this.doc = new Document(url); this.readChildren(doc); // If this is an HTML document, remove root-level children except for the diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 04949dc61..064d2ae88 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -41,6 +41,7 @@ function Readability(uri, doc, options) { this._articleTitle = null; this._articleByline = null; this._articleDir = null; + this._attempts = []; // Configurable options this._debug = !!options.debug; @@ -275,34 +276,20 @@ Readability.prototype = { * @return void */ _fixRelativeUris: function(articleContent) { - var scheme = this._uri.scheme; - var prePath = this._uri.prePath; - var pathBase = this._uri.pathBase; - + var baseURI = this._doc.baseURI; + var documentURI = this._doc.documentURI; function toAbsoluteURI(uri) { - // If this is already an absolute URI, return it. - if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) - return uri; - - // Scheme-rooted relative URI. - if (uri.substr(0, 2) == "//") - return scheme + "://" + uri.substr(2); - - // Prepath-rooted relative URI. - if (uri[0] == "/") - return prePath + uri; - - // Dotslash relative URI. - if (uri.indexOf("./") === 0) - return pathBase + uri.slice(2); - - // Ignore hash URIs: - if (uri[0] == "#") + // Leave hash links alone if the base URI matches the document URI: + if (baseURI == documentURI && uri.charAt(0) == "#") { return uri; - - // Standard relative URI; add entire path. pathBase already includes a - // trailing "/". - return pathBase + uri; + } + // Otherwise, resolve against base URI: + try { + return new URL(uri, baseURI).href; + } catch (ex) { + // Something went wrong, just return the original: + } + return uri; } var links = articleContent.getElementsByTagName("a"); @@ -535,6 +522,7 @@ Readability.prototype = { this._clean(articleContent, "embed"); this._clean(articleContent, "h1"); this._clean(articleContent, "footer"); + this._clean(articleContent, "link"); // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". @@ -1089,24 +1077,45 @@ Readability.prototype = { if (this._debug) this.log("Article content after paging: " + articleContent.innerHTML); + var parseSuccessful = true; + // Now that we've gone through the full algorithm, check to see if // we got any meaningful content. If we didn't, we may need to re-run // grabArticle with different flags set. This gives us a higher likelihood of // finding the content, and the sieve approach gives us a higher likelihood of // finding the -right- content. - if (this._getInnerText(articleContent, true).length < this._wordThreshold) { + var textLength = this._getInnerText(articleContent, true).length; + if (textLength < this._wordThreshold) { + parseSuccessful = false; page.innerHTML = pageCacheHtml; if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { this._removeFlag(this.FLAG_STRIP_UNLIKELYS); + this._attempts.push({articleContent: articleContent, textLength: textLength}); } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { this._removeFlag(this.FLAG_WEIGHT_CLASSES); + this._attempts.push({articleContent: articleContent, textLength: textLength}); } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); + this._attempts.push({articleContent: articleContent, textLength: textLength}); } else { - return null; + this._attempts.push({articleContent: articleContent, textLength: textLength}); + // No luck after removing flags, just return the longest text we found during the different loops + this._attempts.sort(function (a, b) { + return a.textLength < b.textLength; + }); + + // But first check if we actually have something + if (!this._attempts[0].textLength) { + return null; + } + + articleContent = this._attempts[0].articleContent; + parseSuccessful = true; } - } else { + } + + if (parseSuccessful) { // Find out text direction from ancestors of final top candidate. var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); this._someNode(ancestors, function(ancestor) { diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js index 9ae589d7d..69426788b 100644 --- a/toolkit/components/reader/ReaderWorker.js +++ b/toolkit/components/reader/ReaderWorker.js @@ -47,7 +47,7 @@ var Agent = { * @return {object} Article object returned from Readability. */ parseDocument(uri, serializedDoc, options) { - let doc = new JSDOMParser().parse(serializedDoc); + let doc = new JSDOMParser().parse(serializedDoc, uri.spec); return new Readability(uri, doc, options).parse(); }, }; -- cgit v1.2.3