From f696173b4af510cf72885282efcfdd3c4005fbb1 Mon Sep 17 00:00:00 2001
From: Ascrod <32915892+Ascrod@users.noreply.github.com>
Date: Mon, 15 Oct 2018 13:52:23 -0400
Subject: Update Readability from mozilla-central release branch (FF 62.0.3).
Tag #361.
---
toolkit/components/reader/Readability.js | 214 +++++++++++++++++++------------
1 file changed, 134 insertions(+), 80 deletions(-)
(limited to 'toolkit/components/reader/Readability.js')
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 064d2ae88..c2bba0cd3 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -29,14 +29,19 @@
/**
* Public constructor.
- * @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
-function Readability(uri, doc, options) {
+function Readability(doc, options) {
+ // In some older versions, people passed a URI as the first argument. Cope:
+ if (options && options.documentElement) {
+ doc = options;
+ options = arguments[2];
+ } else if (!doc || !doc.documentElement) {
+ throw new Error("First argument to Readability constructor should be a document object.");
+ }
options = options || {};
- this._uri = uri;
this._doc = doc;
this._articleTitle = null;
this._articleByline = null;
@@ -47,7 +52,7 @@ function Readability(uri, doc, options) {
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
- this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
+ this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
@@ -93,6 +98,10 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
+ // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
+ ELEMENT_NODE: 1,
+ TEXT_NODE: 3,
+
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
@@ -103,13 +112,13 @@ Readability.prototype = {
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
- // The default number of words an article must have in order to return a result
- DEFAULT_WORD_THRESHOLD: 500,
+ // The default number of chars an article must have in order to return a result
+ DEFAULT_CHAR_THRESHOLD: 500,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+ unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
@@ -132,8 +141,19 @@ Readability.prototype = {
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+ // The commented out elements qualify as phrasing content but tend to be
+ // removed by readability when put into paragraphs, so we ignore them here.
+ PHRASING_ELEMS: [
+ // "CANVAS", "IFRAME", "SVG", "VIDEO",
+ "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
+ "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
+ "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
+ "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
+ "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
+ ],
+
// These are the classes that readability sets itself.
- CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
+ CLASSES_TO_PRESERVE: [ "page" ],
/**
* Run any post-process modifications to article content as necessary.
@@ -215,6 +235,21 @@ Readability.prototype = {
return Array.prototype.some.call(nodeList, fn, this);
},
+ /**
+ * Iterate over a NodeList, return true if all of the provided iterate
+ * function calls return true, false otherwise.
+ *
+ * For convenience, the current object context is applied to the
+ * provided iterate function.
+ *
+ * @param NodeList nodeList The NodeList.
+ * @param Function fn The iterate function.
+ * @return Boolean
+ */
+ _everyNode: function(nodeList, fn) {
+ return Array.prototype.every.call(nodeList, fn, this);
+ },
+
/**
* Concat all nodelists passed as arguments.
*
@@ -327,7 +362,7 @@ Readability.prototype = {
var origTitle = "";
try {
- curTitle = origTitle = doc.title;
+ curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
@@ -355,8 +390,9 @@ Readability.prototype = {
doc.getElementsByTagName('h1'),
doc.getElementsByTagName('h2')
);
+ var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
- return heading.textContent === curTitle;
+ return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
@@ -421,7 +457,7 @@ Readability.prototype = {
_nextElement: function (node) {
var next = node;
while (next
- && (next.nodeType != Node.ELEMENT_NODE)
+ && (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
@@ -464,16 +500,22 @@ Readability.prototype = {
while (next) {
// If we've hit another
, we're done adding children to this
. if (next.tagName == "BR") { - var nextElem = this._nextElement(next); + var nextElem = this._nextElement(next.nextSibling); if (nextElem && nextElem.tagName == "BR") break; } + if (!this._isPhrasingContent(next)) break; + // Otherwise, make this node a child of the new
. var sibling = next.nextSibling; p.appendChild(next); next = sibling; } + + while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); + + if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV"); } }); }, @@ -523,6 +565,7 @@ Readability.prototype = { this._clean(articleContent, "h1"); this._clean(articleContent, "footer"); this._clean(articleContent, "link"); + this._clean(articleContent, "aside"); // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". @@ -579,6 +622,19 @@ Readability.prototype = { if (next && next.tagName == "P") br.parentNode.removeChild(br); }); + + // Remove single-cell tables + this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { + var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; + if (this._hasSingleTagInsideElement(tbody, "TR")) { + var row = tbody.firstElementChild; + if (this._hasSingleTagInsideElement(row, "TD")) { + var cell = row.firstElementChild; + cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); + table.parentNode.replaceChild(cell, table); + } + } + }); }, /** @@ -658,37 +714,6 @@ Readability.prototype = { return node && node.nextElementSibling; }, - /** - * Like _getNextNode, but for DOM implementations with no - * firstElementChild/nextElementSibling functionality... - */ - _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) { - function nextSiblingEl(n) { - do { - n = n.nextSibling; - } while (n && n.nodeType !== n.ELEMENT_NODE); - return n; - } - // First check for kids if those aren't being ignored - if (!ignoreSelfAndKids && node.children[0]) { - return node.children[0]; - } - // Then for siblings... - var next = nextSiblingEl(node); - if (next) { - return next; - } - // And finally, move up the parent chain *and* find a sibling - // (because this is depth-first traversal, we will have already - // seen the parent nodes themselves). - do { - node = node.parentNode; - if (node) - next = nextSiblingEl(node); - } while (node && !next); - return node && next; - }, - _checkByline: function(node, matchString) { if (this._articleByline) { return false; @@ -751,6 +776,12 @@ Readability.prototype = { while (node) { var matchString = node.className + " " + node.id; + if (!this._isProbablyVisible(node)) { + this.log("Removing hidden node - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + // Check to see if this node is a byline, and remove it if it is. if (this._checkByline(node, matchString)) { node = this._removeAndGetNext(node); @@ -784,11 +815,31 @@ Readability.prototype = { // Turn all divs that don't have children block level elements into p's if (node.tagName === "DIV") { + // Put phrasing content into paragraphs. + var p = null; + var childNode = node.firstChild; + while (childNode) { + var nextSibling = childNode.nextSibling; + if (this._isPhrasingContent(childNode)) { + if (p !== null) { + p.appendChild(childNode); + } else if (!this._isWhitespace(childNode)) { + p = doc.createElement('p'); + node.replaceChild(p, childNode); + p.appendChild(childNode); + } + } else if (p !== null) { + while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); + p = null; + } + childNode = nextSibling; + } + // Sites like http://mobile.slate.com encloses each paragraph with a DIV // element. DIVs with only a P element inside and no text content can be // safely converted into plain P elements to avoid confusing the scoring // algorithm with DIVs with are, in practice, paragraphs. - if (this._hasSinglePInsideElement(node)) { + if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { var newNode = node.children[0]; node.parentNode.replaceChild(newNode, node); node = newNode; @@ -796,17 +847,6 @@ Readability.prototype = { } else if (!this._hasChildBlockElement(node)) { node = this._setNodeTag(node, "P"); elementsToScore.push(node); - } else { - // EXPERIMENTAL - this._forEachNode(node.childNodes, function(childNode) { - if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) { - var p = doc.createElement('p'); - p.textContent = childNode.textContent; - p.style.display = 'inline'; - p.className = 'readability-styled'; - node.replaceChild(p, childNode); - } - }); } } node = this._getNextNode(node); @@ -846,7 +886,7 @@ Readability.prototype = { // Initialize and score ancestors. this._forEachNode(ancestors, function(ancestor, level) { - if (!ancestor.tagName) + if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined') return; if (typeof(ancestor.readability) === 'undefined') { @@ -1085,7 +1125,7 @@ Readability.prototype = { // finding the content, and the sieve approach gives us a higher likelihood of // finding the -right- content. var textLength = this._getInnerText(articleContent, true).length; - if (textLength < this._wordThreshold) { + if (textLength < this._charThreshold) { parseSuccessful = false; page.innerHTML = pageCacheHtml; @@ -1233,27 +1273,28 @@ Readability.prototype = { }, /** - * Check if this node has only whitespace and a single P element + * Check if this node has only whitespace and a single element with given tag * Returns false if the DIV node contains non-empty text nodes - * or if it contains no P or more than 1 element. + * or if it contains no element with given tag or more than 1 element. * * @param Element + * @param string tag of child element **/ - _hasSinglePInsideElement: function(element) { - // There should be exactly 1 element child which is a P: - if (element.children.length != 1 || element.children[0].tagName !== "P") { + _hasSingleTagInsideElement: function(element, tag) { + // There should be exactly 1 element child with given tag + if (element.children.length != 1 || element.children[0].tagName !== tag) { return false; } // And there should be no text nodes with real content return !this._someNode(element.childNodes, function(node) { - return node.nodeType === Node.TEXT_NODE && + return node.nodeType === this.TEXT_NODE && this.REGEXPS.hasContent.test(node.textContent); }); }, _isElementWithoutContent: function(node) { - return node.nodeType === Node.ELEMENT_NODE && + return node.nodeType === this.ELEMENT_NODE && node.textContent.trim().length == 0 && (node.children.length == 0 || node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); @@ -1271,6 +1312,21 @@ Readability.prototype = { }); }, + /*** + * Determine if a node qualifies as phrasing content. + * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content + **/ + _isPhrasingContent: function(node) { + return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || + ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") && + this._everyNode(node.childNodes, this._isPhrasingContent)); + }, + + _isWhitespace: function(node) { + return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) || + (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR"); + }, + /** * Get the inner text of a node - cross browser compatibly. * This also strips out any excess whitespace to be found. @@ -1312,16 +1368,14 @@ Readability.prototype = { if (!e || e.tagName.toLowerCase() === 'svg') return; - if (e.className !== 'readability-styled') { - // Remove `style` and deprecated presentational attributes - for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { - e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); - } + // Remove `style` and deprecated presentational attributes + for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { + e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); + } - if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { - e.removeAttribute('width'); - e.removeAttribute('height'); - } + if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { + e.removeAttribute('width'); + e.removeAttribute('height'); } var cur = e.firstElementChild; @@ -1639,6 +1693,10 @@ Readability.prototype = { this._flags = this._flags & ~flag; }, + _isProbablyVisible: function(node) { + return node.style.display != "none" && !node.hasAttribute("hidden"); + }, + /** * Decides whether or not the document is reader-able without parsing the whole thing. * @@ -1663,9 +1721,9 @@ Readability.prototype = { nodes = [].concat.apply(Array.from(set), nodes); } - // FIXME we should have a fallback for helperIsVisible, but this is - // problematic because of jsdom's elem.style handling - see - // https://github.com/mozilla/readability/pull/186 for context. + if (!helperIsVisible) { + helperIsVisible = this._isProbablyVisible; + } var score = 0; // This is a little cheeky, we use the accumulator 'score' to decide what to return from @@ -1719,9 +1777,6 @@ Readability.prototype = { } } - if (typeof this._doc.documentElement.firstElementChild === "undefined") { - this._getNextNode = this._getNextNodeNoElementProperties; - } // Remove script tags from the document. this._removeScripts(this._doc); @@ -1750,7 +1805,6 @@ Readability.prototype = { var textContent = articleContent.textContent; return { - uri: this._uri, title: this._articleTitle, byline: metadata.byline || this._articleByline, dir: this._articleDir, -- cgit v1.2.3