summary | refs | log | tree | commit | diff | stats
path: root/toolkit/components/reader/Readability.js
diff options
context:
space:
mode:
author: wolfbeast <mcwerewolf@gmail.com> 2018-10-24 11:13:33 +0200
committer: wolfbeast <mcwerewolf@gmail.com> 2018-10-24 11:13:33 +0200
commit: 579881ffb4aa0b145c793825cd1b0628e7cd6cdc (patch)
tree: 65d59fc6b73f120ac1bc2214d4a8442421076b04 /toolkit/components/reader/Readability.js
parent: a02c44648a3f7d6f3904eebba026ce5e6f781bef (diff)
parent: f71c04d814416ebf52dd54109f2d04f1cbd397c0 (diff)
download: UXP-579881ffb4aa0b145c793825cd1b0628e7cd6cdc.tar
UXP-579881ffb4aa0b145c793825cd1b0628e7cd6cdc.tar.gz
UXP-579881ffb4aa0b145c793825cd1b0628e7cd6cdc.tar.lz
UXP-579881ffb4aa0b145c793825cd1b0628e7cd6cdc.tar.xz
UXP-579881ffb4aa0b145c793825cd1b0628e7cd6cdc.zip
Merge branch 'master' into Sync-weave
Diffstat (limited to 'toolkit/components/reader/Readability.js')
-rw-r--r-- toolkit/components/reader/Readability.js 214
1 files changed, 134 insertions, 80 deletions
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 064d2ae88..c2bba0cd3 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -29,14 +29,19 @@
/**
* Public constructor.
- * @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
-function Readability(uri, doc, options) {
+function Readability(doc, options) {
+ // In some older versions, people passed a URI as the first argument. Cope:
+ if (options && options.documentElement) {
+ doc = options;
+ options = arguments[2];
+ } else if (!doc || !doc.documentElement) {
+ throw new Error("First argument to Readability constructor should be a document object.");
+ }
options = options || {};
- this._uri = uri;
this._doc = doc;
this._articleTitle = null;
this._articleByline = null;
@@ -47,7 +52,7 @@ function Readability(uri, doc, options) {
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
- this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
+ this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
@@ -93,6 +98,10 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
+ // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
+ ELEMENT_NODE: 1,
+ TEXT_NODE: 3,
+
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
@@ -103,13 +112,13 @@ Readability.prototype = {
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
- // The default number of words an article must have in order to return a result
- DEFAULT_WORD_THRESHOLD: 500,
+ // The default number of chars an article must have in order to return a result
+ DEFAULT_CHAR_THRESHOLD: 500,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+ unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
@@ -132,8 +141,19 @@ Readability.prototype = {
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+ // The commented out elements qualify as phrasing content but tend to be
+ // removed by readability when put into paragraphs, so we ignore them here.
+ PHRASING_ELEMS: [ // tag names as reported by node.tagName; searched by _isPhrasingContent
+ // "CANVAS", "IFRAME", "SVG", "VIDEO",
+ "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
+ "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
+ "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
+ "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
+ "SUP", "TEXTAREA", "TIME", "VAR", "WBR" // A, DEL and INS are not listed: _isPhrasingContent tests them separately
+ ],
+
// These are the classes that readability sets itself.
- CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
+ CLASSES_TO_PRESERVE: [ "page" ],
/**
* Run any post-process modifications to article content as necessary.
@@ -216,6 +236,21 @@ Readability.prototype = {
},
/**
+ * Iterate over a NodeList, return true if all of the provided iterate
+ * function calls return true, false otherwise.
+ *
+ * For convenience, the current object context is applied to the
+ * provided iterate function.
+ *
+ * Delegates to Array.prototype.every, so the iterate function is invoked as
+ * fn(node, index, nodeList), iteration stops at the first falsy result,
+ * and an empty NodeList yields true.
+ *
+ * @param NodeList nodeList The NodeList.
+ * @param Function fn The iterate function.
+ * @return Boolean
+ */
+ _everyNode: function(nodeList, fn) {
+ return Array.prototype.every.call(nodeList, fn, this);
+ },
+
+ /**
* Concat all nodelists passed as arguments.
*
* @return ...NodeList
@@ -327,7 +362,7 @@ Readability.prototype = {
var origTitle = "";
try {
- curTitle = origTitle = doc.title;
+ curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
@@ -355,8 +390,9 @@ Readability.prototype = {
doc.getElementsByTagName('h1'),
doc.getElementsByTagName('h2')
);
+ var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
- return heading.textContent === curTitle;
+ return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
@@ -421,7 +457,7 @@ Readability.prototype = {
_nextElement: function (node) {
var next = node;
while (next
- && (next.nodeType != Node.ELEMENT_NODE)
+ && (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
@@ -464,16 +500,22 @@ Readability.prototype = {
while (next) {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == "BR") {
- var nextElem = this._nextElement(next);
+ var nextElem = this._nextElement(next.nextSibling);
if (nextElem && nextElem.tagName == "BR")
break;
}
+ if (!this._isPhrasingContent(next)) break;
+
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
p.appendChild(next);
next = sibling;
}
+
+ while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+
+ if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
}
});
},
@@ -523,6 +565,7 @@ Readability.prototype = {
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
+ this._clean(articleContent, "aside");
// Clean out elements that have "share" in their id/class combinations from the final top candidates,
// which means we don't remove the top candidates even if they have "share".
@@ -579,6 +622,19 @@ Readability.prototype = {
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
});
+
+ // Remove single-cell tables
+ this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
+ var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
+ if (this._hasSingleTagInsideElement(tbody, "TR")) {
+ var row = tbody.firstElementChild;
+ if (this._hasSingleTagInsideElement(row, "TD")) {
+ var cell = row.firstElementChild;
+ cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
+ table.parentNode.replaceChild(cell, table);
+ }
+ }
+ });
},
/**
@@ -658,37 +714,6 @@ Readability.prototype = {
return node && node.nextElementSibling;
},
- /**
- * Like _getNextNode, but for DOM implementations with no
- * firstElementChild/nextElementSibling functionality...
- */
- _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
- function nextSiblingEl(n) {
- do {
- n = n.nextSibling;
- } while (n && n.nodeType !== n.ELEMENT_NODE);
- return n;
- }
- // First check for kids if those aren't being ignored
- if (!ignoreSelfAndKids && node.children[0]) {
- return node.children[0];
- }
- // Then for siblings...
- var next = nextSiblingEl(node);
- if (next) {
- return next;
- }
- // And finally, move up the parent chain *and* find a sibling
- // (because this is depth-first traversal, we will have already
- // seen the parent nodes themselves).
- do {
- node = node.parentNode;
- if (node)
- next = nextSiblingEl(node);
- } while (node && !next);
- return node && next;
- },
-
_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
@@ -751,6 +776,12 @@ Readability.prototype = {
while (node) {
var matchString = node.className + " " + node.id;
+ if (!this._isProbablyVisible(node)) {
+ this.log("Removing hidden node - " + matchString);
+ node = this._removeAndGetNext(node);
+ continue;
+ }
+
// Check to see if this node is a byline, and remove it if it is.
if (this._checkByline(node, matchString)) {
node = this._removeAndGetNext(node);
@@ -784,11 +815,31 @@ Readability.prototype = {
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
+ // Put phrasing content into paragraphs.
+ var p = null;
+ var childNode = node.firstChild;
+ while (childNode) {
+ var nextSibling = childNode.nextSibling;
+ if (this._isPhrasingContent(childNode)) {
+ if (p !== null) {
+ p.appendChild(childNode);
+ } else if (!this._isWhitespace(childNode)) {
+ p = doc.createElement('p');
+ node.replaceChild(p, childNode);
+ p.appendChild(childNode);
+ }
+ } else if (p !== null) {
+ while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+ p = null;
+ }
+ childNode = nextSibling;
+ }
+
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs which are, in practice, paragraphs.
- if (this._hasSinglePInsideElement(node)) {
+ if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
@@ -796,17 +847,6 @@ Readability.prototype = {
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
- } else {
- // EXPERIMENTAL
- this._forEachNode(node.childNodes, function(childNode) {
- if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
- var p = doc.createElement('p');
- p.textContent = childNode.textContent;
- p.style.display = 'inline';
- p.className = 'readability-styled';
- node.replaceChild(p, childNode);
- }
- });
}
}
node = this._getNextNode(node);
@@ -846,7 +886,7 @@ Readability.prototype = {
// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
- if (!ancestor.tagName)
+ if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
return;
if (typeof(ancestor.readability) === 'undefined') {
@@ -1085,7 +1125,7 @@ Readability.prototype = {
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
var textLength = this._getInnerText(articleContent, true).length;
- if (textLength < this._wordThreshold) {
+ if (textLength < this._charThreshold) {
parseSuccessful = false;
page.innerHTML = pageCacheHtml;
@@ -1233,27 +1273,28 @@ Readability.prototype = {
},
/**
- * Check if this node has only whitespace and a single P element
+ * Check if this node has only whitespace and a single element with given tag
* Returns false if the DIV node contains non-empty text nodes
- * or if it contains no P or more than 1 element.
+ * or if it contains no element with given tag or more than 1 element.
*
* @param Element
+ * @param string tag of child element
**/
- _hasSinglePInsideElement: function(element) {
- // There should be exactly 1 element child which is a P:
- if (element.children.length != 1 || element.children[0].tagName !== "P") {
+ _hasSingleTagInsideElement: function(element, tag) {
+ // There should be exactly 1 element child with given tag
+ if (element.children.length != 1 || element.children[0].tagName !== tag) {
return false;
}
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
- return node.nodeType === Node.TEXT_NODE &&
+ return node.nodeType === this.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
_isElementWithoutContent: function(node) {
- return node.nodeType === Node.ELEMENT_NODE &&
+ return node.nodeType === this.ELEMENT_NODE &&
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
@@ -1271,6 +1312,21 @@ Readability.prototype = {
});
},
+ /***
+ * Determine if a node qualifies as phrasing content.
+ * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
+ **/
+ _isPhrasingContent: function(node) {
+ return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
+ ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
+ this._everyNode(node.childNodes, this._isPhrasingContent));
+ },
+
+ _isWhitespace: function(node) {
+ return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
+ (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
+ },
+
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
@@ -1312,16 +1368,14 @@ Readability.prototype = {
if (!e || e.tagName.toLowerCase() === 'svg')
return;
- if (e.className !== 'readability-styled') {
- // Remove `style` and deprecated presentational attributes
- for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
- e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
- }
+ // Remove `style` and deprecated presentational attributes
+ for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+ e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
+ }
- if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
- e.removeAttribute('width');
- e.removeAttribute('height');
- }
+ if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+ e.removeAttribute('width');
+ e.removeAttribute('height');
}
var cur = e.firstElementChild;
@@ -1639,6 +1693,10 @@ Readability.prototype = {
this._flags = this._flags & ~flag;
},
+ _isProbablyVisible: function(node) {
+ return node.style.display != "none" && !node.hasAttribute("hidden");
+ },
+
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
@@ -1663,9 +1721,9 @@ Readability.prototype = {
nodes = [].concat.apply(Array.from(set), nodes);
}
- // FIXME we should have a fallback for helperIsVisible, but this is
- // problematic because of jsdom's elem.style handling - see
- // https://github.com/mozilla/readability/pull/186 for context.
+ if (!helperIsVisible) {
+ helperIsVisible = this._isProbablyVisible;
+ }
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
@@ -1719,9 +1777,6 @@ Readability.prototype = {
}
}
- if (typeof this._doc.documentElement.firstElementChild === "undefined") {
- this._getNextNode = this._getNextNodeNoElementProperties;
- }
// Remove script tags from the document.
this._removeScripts(this._doc);
@@ -1750,7 +1805,6 @@ Readability.prototype = {
var textContent = articleContent.textContent;
return {
- uri: this._uri,
title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,