summary | refs | log | tree | commit | diff | stats
path: root/toolkit/components/reader
diff options
context:
space:
mode:
author: wolfbeast <mcwerewolf@gmail.com> 2018-07-18 08:24:24 +0200
committer: wolfbeast <mcwerewolf@gmail.com> 2018-07-18 08:24:24 +0200
commit: fc61780b35af913801d72086456f493f63197da6 (patch)
tree: f85891288a7bd988da9f0f15ae64e5c63f00d493 /toolkit/components/reader
parent: 69f7f9e5f1475891ce11cc4f431692f965b0cd30 (diff)
parent: 50d3e596bbe89c95615f96eb71f6bc5be737a1db (diff)
download: UXP-9ccb235f04529c1ec345d87dad6521cb567d20bb.tar
UXP-9ccb235f04529c1ec345d87dad6521cb567d20bb.tar.gz
UXP-9ccb235f04529c1ec345d87dad6521cb567d20bb.tar.lz
UXP-9ccb235f04529c1ec345d87dad6521cb567d20bb.tar.xz
UXP-9ccb235f04529c1ec345d87dad6521cb567d20bb.zip
Merge commit '50d3e596bbe89c95615f96eb71f6bc5be737a1db' into Basilisk-releasev2018.07.18
# Conflicts:
#   browser/app/profile/firefox.js
#   browser/components/preferences/jar.mn
Diffstat (limited to 'toolkit/components/reader')
-rw-r--r-- toolkit/components/reader/AboutReader.jsm 31
-rw-r--r-- toolkit/components/reader/JSDOMParser.js 21
-rw-r--r-- toolkit/components/reader/Readability.js 67
-rw-r--r-- toolkit/components/reader/ReaderWorker.js 2
4 files changed, 73 insertions, 48 deletions
diff --git a/toolkit/components/reader/AboutReader.jsm b/toolkit/components/reader/AboutReader.jsm
index fb82e5789..c5d04476d 100644
--- a/toolkit/components/reader/AboutReader.jsm
+++ b/toolkit/components/reader/AboutReader.jsm
@@ -200,9 +200,6 @@ AboutReader.prototype = {
} else if (!target.closest(".dropdown-popup")) {
this._closeDropdowns();
}
- if (target.tagName == "A" && !target.classList.contains("reader-domain")) {
- this._linkClicked(aEvent);
- }
break;
case "scroll":
this._closeDropdowns(true);
@@ -720,6 +717,21 @@ AboutReader.prototype = {
}
},
+ _fixLocalLinks() {
+ // We need to do this because preprocessing the content through nsIParserUtils
+ // gives back a DOM with a <base> element. That influences how these URLs get
+ // resolved, making them no longer match the document URI (which is
+ // about:reader?url=...). To fix this, make all the hash URIs absolute. This
+ // is hacky, but the alternative of removing the base element has potential
+ // security implications if Readability has not successfully made all the URLs
+ // absolute, so we pick just fixing these in-document links explicitly.
+ let localLinks = this._contentElement.querySelectorAll("a[href^='#']");
+ for (let localLink of localLinks) {
+ // Have to get the attribute because .href provides an absolute URI.
+ localLink.href = this._doc.documentURI + localLink.getAttribute("href");
+ }
+ },
+
_formatReadTime(slowEstimate, fastEstimate) {
let displayStringKey = "aboutReader.estimatedReadTimeRange1";
@@ -790,6 +802,7 @@ AboutReader.prototype = {
false, articleUri, this._contentElement);
this._contentElement.innerHTML = "";
this._contentElement.appendChild(contentFragment);
+ this._fixLocalLinks();
this._maybeSetTextDirection(article);
this._foundLanguage(article.language);
@@ -978,18 +991,6 @@ AboutReader.prototype = {
},
/*
- * Override link handling for same-page references so we don't exit Reader View.
- */
- _linkClicked(event) {
- var originalUrl = Services.io.newURI(this._getOriginalUrl(), null, null);
- var targetUrl = Services.io.newURI(event.target.href, null, null);
- if (originalUrl.specIgnoringRef == targetUrl.specIgnoringRef) {
- event.preventDefault();
- this._goToReference(targetUrl.ref);
- }
- },
-
- /*
* Scroll reader view to a reference
*/
_goToReference(ref) {
diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js
index 38f59c4ea..dd9d37230 100644
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -560,7 +560,8 @@
},
};
- var Document = function () {
+ var Document = function (url) {
+ this.documentURI = url;
this.styleSheets = [];
this.childNodes = [];
this.children = [];
@@ -600,6 +601,20 @@
node.textContent = text;
return node;
},
+
+ get baseURI() {
+ if (!this.hasOwnProperty("_baseURI")) {
+ this._baseURI = this.documentURI;
+ var baseElements = this.getElementsByTagName("base");
+ var href = baseElements[0] && baseElements[0].getAttribute("href");
+ if (href) {
+ try {
+ this._baseURI = (new URL(href, this._baseURI)).href;
+ } catch (ex) {/* Just fall back to documentURI */}
+ }
+ }
+ return this._baseURI;
+ },
};
var Element = function (tag) {
@@ -1118,9 +1133,9 @@
/**
* Parses an HTML string and returns a JS implementation of the Document.
*/
- parse: function (html) {
+ parse: function (html, url) {
this.html = html;
- var doc = this.doc = new Document();
+ var doc = this.doc = new Document(url);
this.readChildren(doc);
// If this is an HTML document, remove root-level children except for the
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 04949dc61..064d2ae88 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -41,6 +41,7 @@ function Readability(uri, doc, options) {
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
+ this._attempts = [];
// Configurable options
this._debug = !!options.debug;
@@ -275,34 +276,20 @@ Readability.prototype = {
* @return void
*/
_fixRelativeUris: function(articleContent) {
- var scheme = this._uri.scheme;
- var prePath = this._uri.prePath;
- var pathBase = this._uri.pathBase;
-
+ var baseURI = this._doc.baseURI;
+ var documentURI = this._doc.documentURI;
function toAbsoluteURI(uri) {
- // If this is already an absolute URI, return it.
- if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
- return uri;
-
- // Scheme-rooted relative URI.
- if (uri.substr(0, 2) == "//")
- return scheme + "://" + uri.substr(2);
-
- // Prepath-rooted relative URI.
- if (uri[0] == "/")
- return prePath + uri;
-
- // Dotslash relative URI.
- if (uri.indexOf("./") === 0)
- return pathBase + uri.slice(2);
-
- // Ignore hash URIs:
- if (uri[0] == "#")
+ // Leave hash links alone if the base URI matches the document URI:
+ if (baseURI == documentURI && uri.charAt(0) == "#") {
return uri;
-
- // Standard relative URI; add entire path. pathBase already includes a
- // trailing "/".
- return pathBase + uri;
+ }
+ // Otherwise, resolve against base URI:
+ try {
+ return new URL(uri, baseURI).href;
+ } catch (ex) {
+ // Something went wrong, just return the original:
+ }
+ return uri;
}
var links = articleContent.getElementsByTagName("a");
@@ -535,6 +522,7 @@ Readability.prototype = {
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
+ this._clean(articleContent, "link");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
@@ -1089,24 +1077,45 @@ Readability.prototype = {
if (this._debug)
this.log("Article content after paging: " + articleContent.innerHTML);
+ var parseSuccessful = true;
+
// Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
- if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
+ var textLength = this._getInnerText(articleContent, true).length;
+ if (textLength < this._wordThreshold) {
+ parseSuccessful = false;
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else {
- return null;
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
+ // No luck after removing flags, just return the longest text we found during the different loops
+ this._attempts.sort(function (a, b) {
+ return a.textLength < b.textLength;
+ });
+
+ // But first check if we actually have something
+ if (!this._attempts[0].textLength) {
+ return null;
+ }
+
+ articleContent = this._attempts[0].articleContent;
+ parseSuccessful = true;
}
- } else {
+ }
+
+ if (parseSuccessful) {
// Find out text direction from ancestors of final top candidate.
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
this._someNode(ancestors, function(ancestor) {
diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js
index 9ae589d7d..69426788b 100644
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -47,7 +47,7 @@ var Agent = {
* @return {object} Article object returned from Readability.
*/
parseDocument(uri, serializedDoc, options) {
- let doc = new JSDOMParser().parse(serializedDoc);
+ let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
return new Readability(uri, doc, options).parse();
},
};