summaryrefslogtreecommitdiffstats
path: root/toolkit/components/reader
diff options
context:
space:
mode:
Diffstat (limited to 'toolkit/components/reader')
-rw-r--r--toolkit/components/reader/JSDOMParser.js21
-rw-r--r--toolkit/components/reader/Readability.js67
-rw-r--r--toolkit/components/reader/ReaderWorker.js2
3 files changed, 57 insertions, 33 deletions
diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js
index 38f59c4ea..dd9d37230 100644
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -560,7 +560,8 @@
},
};
- var Document = function () {
+ var Document = function (url) {
+ this.documentURI = url;
this.styleSheets = [];
this.childNodes = [];
this.children = [];
@@ -600,6 +601,20 @@
node.textContent = text;
return node;
},
+
+ get baseURI() {
+ if (!this.hasOwnProperty("_baseURI")) {
+ this._baseURI = this.documentURI;
+ var baseElements = this.getElementsByTagName("base");
+ var href = baseElements[0] && baseElements[0].getAttribute("href");
+ if (href) {
+ try {
+ this._baseURI = (new URL(href, this._baseURI)).href;
+ } catch (ex) {/* Just fall back to documentURI */}
+ }
+ }
+ return this._baseURI;
+ },
};
var Element = function (tag) {
@@ -1118,9 +1133,9 @@
/**
* Parses an HTML string and returns a JS implementation of the Document.
*/
- parse: function (html) {
+ parse: function (html, url) {
this.html = html;
- var doc = this.doc = new Document();
+ var doc = this.doc = new Document(url);
this.readChildren(doc);
// If this is an HTML document, remove root-level children except for the
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 04949dc61..064d2ae88 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -41,6 +41,7 @@ function Readability(uri, doc, options) {
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
+ this._attempts = [];
// Configurable options
this._debug = !!options.debug;
@@ -275,34 +276,20 @@ Readability.prototype = {
* @return void
*/
_fixRelativeUris: function(articleContent) {
- var scheme = this._uri.scheme;
- var prePath = this._uri.prePath;
- var pathBase = this._uri.pathBase;
-
+ var baseURI = this._doc.baseURI;
+ var documentURI = this._doc.documentURI;
function toAbsoluteURI(uri) {
- // If this is already an absolute URI, return it.
- if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
- return uri;
-
- // Scheme-rooted relative URI.
- if (uri.substr(0, 2) == "//")
- return scheme + "://" + uri.substr(2);
-
- // Prepath-rooted relative URI.
- if (uri[0] == "/")
- return prePath + uri;
-
- // Dotslash relative URI.
- if (uri.indexOf("./") === 0)
- return pathBase + uri.slice(2);
-
- // Ignore hash URIs:
- if (uri[0] == "#")
+ // Leave hash links alone if the base URI matches the document URI:
+ if (baseURI == documentURI && uri.charAt(0) == "#") {
return uri;
-
- // Standard relative URI; add entire path. pathBase already includes a
- // trailing "/".
- return pathBase + uri;
+ }
+ // Otherwise, resolve against base URI:
+ try {
+ return new URL(uri, baseURI).href;
+ } catch (ex) {
+ // Something went wrong, just return the original:
+ }
+ return uri;
}
var links = articleContent.getElementsByTagName("a");
@@ -535,6 +522,7 @@ Readability.prototype = {
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
+ this._clean(articleContent, "link");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
@@ -1089,24 +1077,45 @@ Readability.prototype = {
if (this._debug)
this.log("Article content after paging: " + articleContent.innerHTML);
+ var parseSuccessful = true;
+
// Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
- if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
+ var textLength = this._getInnerText(articleContent, true).length;
+ if (textLength < this._wordThreshold) {
+ parseSuccessful = false;
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else {
- return null;
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
+ // No luck after removing flags, just return the longest text we found during the different loops
+ this._attempts.sort(function (a, b) {
+ return a.textLength < b.textLength;
+ });
+
+ // But first check if we actually have something
+ if (!this._attempts[0].textLength) {
+ return null;
+ }
+
+ articleContent = this._attempts[0].articleContent;
+ parseSuccessful = true;
}
- } else {
+ }
+
+ if (parseSuccessful) {
// Find out text direction from ancestors of final top candidate.
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
this._someNode(ancestors, function(ancestor) {
diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js
index 9ae589d7d..69426788b 100644
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -47,7 +47,7 @@ var Agent = {
* @return {object} Article object returned from Readability.
*/
parseDocument(uri, serializedDoc, options) {
- let doc = new JSDOMParser().parse(serializedDoc);
+ let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
return new Readability(uri, doc, options).parse();
},
};