From f696173b4af510cf72885282efcfdd3c4005fbb1 Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Mon, 15 Oct 2018 13:52:23 -0400 Subject: Update Readability from mozilla-central release branch (FF 62.0.3). Tag #361. --- toolkit/components/reader/JSDOMParser.js | 32 ++++- toolkit/components/reader/Readability.js | 214 +++++++++++++++++++----------- toolkit/components/reader/ReaderMode.jsm | 2 +- toolkit/components/reader/ReaderWorker.js | 2 +- 4 files changed, 165 insertions(+), 85 deletions(-) diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js index dd9d37230..debdb08eb 100644 --- a/toolkit/components/reader/JSDOMParser.js +++ b/toolkit/components/reader/JSDOMParser.js @@ -618,6 +618,13 @@ }; var Element = function (tag) { + // We use this to find the closing tag. + this._matchingTag = tag; + // We're explicitly a non-namespace aware parser, we just pretend it's all HTML. + var lastColonIndex = tag.lastIndexOf(":"); + if (lastColonIndex != -1) { + tag = tag.substring(lastColonIndex + 1); + } this.attributes = []; this.childNodes = []; this.children = []; @@ -785,7 +792,13 @@ break; } } - } + }, + + hasAttribute: function (name) { + return this.attributes.some(function (attr) { + return attr.name == name; + }); + }, }; var Style = function (node) { @@ -1062,9 +1075,10 @@ return null; // Read any text as Text node + var textNode; if (c !== "<") { --this.currentChar; - var textNode = new Text(); + textNode = new Text(); var n = this.html.indexOf("<", this.currentChar); if (n === -1) { textNode.innerHTML = this.html.substring(this.currentChar, this.html.length); @@ -1076,6 +1090,18 @@ return textNode; } + if (this.match("![CDATA[")) { + var endChar = this.html.indexOf("]]>", this.currentChar); + if (endChar === -1) { + this.error("unclosed CDATA section"); + return null; + } + textNode = new Text(); + textNode.textContent = this.html.substring(this.currentChar, endChar); + this.currentChar = endChar + ("]]>").length; + return textNode; + } + c = this.peekNext(); // Read Comment node. Normally, Comment nodes know their inner @@ -1107,7 +1133,7 @@ // If this isn't a void Element, read its child nodes if (!closed) { this.readChildren(node); - var closingTag = ""; + var closingTag = ""; if (!this.match(closingTag)) { this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length)); return null; diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 064d2ae88..c2bba0cd3 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -29,14 +29,19 @@ /** * Public constructor. - * @param {Object} uri The URI descriptor object. * @param {HTMLDocument} doc The document to parse. * @param {Object} options The options object. */ -function Readability(uri, doc, options) { +function Readability(doc, options) { + // In some older versions, people passed a URI as the first argument. Cope: + if (options && options.documentElement) { + doc = options; + options = arguments[2]; + } else if (!doc || !doc.documentElement) { + throw new Error("First argument to Readability constructor should be a document object."); + } options = options || {}; - this._uri = uri; this._doc = doc; this._articleTitle = null; this._articleByline = null; @@ -47,7 +52,7 @@ function Readability(uri, doc, options) { this._debug = !!options.debug; this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; - this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD; + this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); // Start with all flags set @@ -93,6 +98,10 @@ Readability.prototype = { FLAG_WEIGHT_CLASSES: 0x2, FLAG_CLEAN_CONDITIONALLY: 0x4, + // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType + ELEMENT_NODE: 1, + TEXT_NODE: 3, + // Max number of nodes supported by this parser. Default: 0 (no limit) DEFAULT_MAX_ELEMS_TO_PARSE: 0, @@ -103,13 +112,13 @@ Readability.prototype = { // Element tags to score by default. DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), - // The default number of words an article must have in order to return a result - DEFAULT_WORD_THRESHOLD: 500, + // The default number of chars an article must have in order to return a result + DEFAULT_CHAR_THRESHOLD: 500, // All of the regular expressions in use within readability. // Defined up here so we don't instantiate them repeatedly in loops. REGEXPS: { - unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|main|shadow/i, positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, @@ -132,8 +141,19 @@ Readability.prototype = { DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. + PHRASING_ELEMS: [ + // "CANVAS", "IFRAME", "SVG", "VIDEO", + "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", + "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", + "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", + "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", + "SUP", "TEXTAREA", "TIME", "VAR", "WBR" + ], + // These are the classes that readability sets itself. - CLASSES_TO_PRESERVE: [ "readability-styled", "page" ], + CLASSES_TO_PRESERVE: [ "page" ], /** * Run any post-process modifications to article content as necessary. @@ -215,6 +235,21 @@ Readability.prototype = { return Array.prototype.some.call(nodeList, fn, this); }, + /** + * Iterate over a NodeList, return true if all of the provided iterate + * function calls return true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return Boolean + */ + _everyNode: function(nodeList, fn) { + return Array.prototype.every.call(nodeList, fn, this); + }, + /** * Concat all nodelists passed as arguments. * @@ -327,7 +362,7 @@ Readability.prototype = { var origTitle = ""; try { - curTitle = origTitle = doc.title; + curTitle = origTitle = doc.title.trim(); // If they had an element with id "title" in their HTML if (typeof curTitle !== "string") @@ -355,8 +390,9 @@ Readability.prototype = { doc.getElementsByTagName('h1'), doc.getElementsByTagName('h2') ); + var trimmedTitle = curTitle.trim(); var match = this._someNode(headings, function(heading) { - return heading.textContent === curTitle; + return heading.textContent.trim() === trimmedTitle; }); // If we don't, let's extract the title out of the original title string. @@ -421,7 +457,7 @@ Readability.prototype = { _nextElement: function (node) { var next = node; while (next - && (next.nodeType != Node.ELEMENT_NODE) + && (next.nodeType != this.ELEMENT_NODE) && this.REGEXPS.whitespace.test(next.textContent)) { next = next.nextSibling; } @@ -464,16 +500,22 @@ Readability.prototype = { while (next) { // If we've hit another

, we're done adding children to this

. if (next.tagName == "BR") { - var nextElem = this._nextElement(next); + var nextElem = this._nextElement(next.nextSibling); if (nextElem && nextElem.tagName == "BR") break; } + if (!this._isPhrasingContent(next)) break; + // Otherwise, make this node a child of the new

. var sibling = next.nextSibling; p.appendChild(next); next = sibling; } + + while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); + + if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV"); } }); }, @@ -523,6 +565,7 @@ Readability.prototype = { this._clean(articleContent, "h1"); this._clean(articleContent, "footer"); this._clean(articleContent, "link"); + this._clean(articleContent, "aside"); // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". @@ -579,6 +622,19 @@ Readability.prototype = { if (next && next.tagName == "P") br.parentNode.removeChild(br); }); + + // Remove single-cell tables + this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { + var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; + if (this._hasSingleTagInsideElement(tbody, "TR")) { + var row = tbody.firstElementChild; + if (this._hasSingleTagInsideElement(row, "TD")) { + var cell = row.firstElementChild; + cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); + table.parentNode.replaceChild(cell, table); + } + } + }); }, /** @@ -658,37 +714,6 @@ Readability.prototype = { return node && node.nextElementSibling; }, - /** - * Like _getNextNode, but for DOM implementations with no - * firstElementChild/nextElementSibling functionality... - */ - _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) { - function nextSiblingEl(n) { - do { - n = n.nextSibling; - } while (n && n.nodeType !== n.ELEMENT_NODE); - return n; - } - // First check for kids if those aren't being ignored - if (!ignoreSelfAndKids && node.children[0]) { - return node.children[0]; - } - // Then for siblings... - var next = nextSiblingEl(node); - if (next) { - return next; - } - // And finally, move up the parent chain *and* find a sibling - // (because this is depth-first traversal, we will have already - // seen the parent nodes themselves). - do { - node = node.parentNode; - if (node) - next = nextSiblingEl(node); - } while (node && !next); - return node && next; - }, - _checkByline: function(node, matchString) { if (this._articleByline) { return false; @@ -751,6 +776,12 @@ Readability.prototype = { while (node) { var matchString = node.className + " " + node.id; + if (!this._isProbablyVisible(node)) { + this.log("Removing hidden node - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + // Check to see if this node is a byline, and remove it if it is. if (this._checkByline(node, matchString)) { node = this._removeAndGetNext(node); @@ -784,11 +815,31 @@ Readability.prototype = { // Turn all divs that don't have children block level elements into p's if (node.tagName === "DIV") { + // Put phrasing content into paragraphs. + var p = null; + var childNode = node.firstChild; + while (childNode) { + var nextSibling = childNode.nextSibling; + if (this._isPhrasingContent(childNode)) { + if (p !== null) { + p.appendChild(childNode); + } else if (!this._isWhitespace(childNode)) { + p = doc.createElement('p'); + node.replaceChild(p, childNode); + p.appendChild(childNode); + } + } else if (p !== null) { + while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); + p = null; + } + childNode = nextSibling; + } + // Sites like http://mobile.slate.com encloses each paragraph with a DIV // element. DIVs with only a P element inside and no text content can be // safely converted into plain P elements to avoid confusing the scoring // algorithm with DIVs with are, in practice, paragraphs. - if (this._hasSinglePInsideElement(node)) { + if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { var newNode = node.children[0]; node.parentNode.replaceChild(newNode, node); node = newNode; @@ -796,17 +847,6 @@ Readability.prototype = { } else if (!this._hasChildBlockElement(node)) { node = this._setNodeTag(node, "P"); elementsToScore.push(node); - } else { - // EXPERIMENTAL - this._forEachNode(node.childNodes, function(childNode) { - if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) { - var p = doc.createElement('p'); - p.textContent = childNode.textContent; - p.style.display = 'inline'; - p.className = 'readability-styled'; - node.replaceChild(p, childNode); - } - }); } } node = this._getNextNode(node); @@ -846,7 +886,7 @@ Readability.prototype = { // Initialize and score ancestors. this._forEachNode(ancestors, function(ancestor, level) { - if (!ancestor.tagName) + if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined') return; if (typeof(ancestor.readability) === 'undefined') { @@ -1085,7 +1125,7 @@ Readability.prototype = { // finding the content, and the sieve approach gives us a higher likelihood of // finding the -right- content. var textLength = this._getInnerText(articleContent, true).length; - if (textLength < this._wordThreshold) { + if (textLength < this._charThreshold) { parseSuccessful = false; page.innerHTML = pageCacheHtml; @@ -1233,27 +1273,28 @@ Readability.prototype = { }, /** - * Check if this node has only whitespace and a single P element + * Check if this node has only whitespace and a single element with given tag * Returns false if the DIV node contains non-empty text nodes - * or if it contains no P or more than 1 element. + * or if it contains no element with given tag or more than 1 element. * * @param Element + * @param string tag of child element **/ - _hasSinglePInsideElement: function(element) { - // There should be exactly 1 element child which is a P: - if (element.children.length != 1 || element.children[0].tagName !== "P") { + _hasSingleTagInsideElement: function(element, tag) { + // There should be exactly 1 element child with given tag + if (element.children.length != 1 || element.children[0].tagName !== tag) { return false; } // And there should be no text nodes with real content return !this._someNode(element.childNodes, function(node) { - return node.nodeType === Node.TEXT_NODE && + return node.nodeType === this.TEXT_NODE && this.REGEXPS.hasContent.test(node.textContent); }); }, _isElementWithoutContent: function(node) { - return node.nodeType === Node.ELEMENT_NODE && + return node.nodeType === this.ELEMENT_NODE && node.textContent.trim().length == 0 && (node.children.length == 0 || node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); @@ -1271,6 +1312,21 @@ Readability.prototype = { }); }, + /*** + * Determine if a node qualifies as phrasing content. + * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content + **/ + _isPhrasingContent: function(node) { + return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || + ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") && + this._everyNode(node.childNodes, this._isPhrasingContent)); + }, + + _isWhitespace: function(node) { + return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) || + (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR"); + }, + /** * Get the inner text of a node - cross browser compatibly. * This also strips out any excess whitespace to be found. @@ -1312,16 +1368,14 @@ Readability.prototype = { if (!e || e.tagName.toLowerCase() === 'svg') return; - if (e.className !== 'readability-styled') { - // Remove `style` and deprecated presentational attributes - for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { - e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); - } + // Remove `style` and deprecated presentational attributes + for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { + e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); + } - if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { - e.removeAttribute('width'); - e.removeAttribute('height'); - } + if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { + e.removeAttribute('width'); + e.removeAttribute('height'); } var cur = e.firstElementChild; @@ -1639,6 +1693,10 @@ Readability.prototype = { this._flags = this._flags & ~flag; }, + _isProbablyVisible: function(node) { + return node.style.display != "none" && !node.hasAttribute("hidden"); + }, + /** * Decides whether or not the document is reader-able without parsing the whole thing. * @@ -1663,9 +1721,9 @@ Readability.prototype = { nodes = [].concat.apply(Array.from(set), nodes); } - // FIXME we should have a fallback for helperIsVisible, but this is - // problematic because of jsdom's elem.style handling - see - // https://github.com/mozilla/readability/pull/186 for context. + if (!helperIsVisible) { + helperIsVisible = this._isProbablyVisible; + } var score = 0; // This is a little cheeky, we use the accumulator 'score' to decide what to return from @@ -1719,9 +1777,6 @@ Readability.prototype = { } } - if (typeof this._doc.documentElement.firstElementChild === "undefined") { - this._getNextNode = this._getNextNodeNoElementProperties; - } // Remove script tags from the document. this._removeScripts(this._doc); @@ -1750,7 +1805,6 @@ Readability.prototype = { var textContent = articleContent.textContent; return { - uri: this._uri, title: this._articleTitle, byline: metadata.byline || this._articleByline, dir: this._articleDir, diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index e9eb83154..218e12d60 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -195,7 +195,7 @@ this.ReaderMode = { // We pass in a helper function to determine if a node is visible, because // it uses gecko APIs that the engine-agnostic readability code can't rely // upon. - return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils)); + return new Readability(doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils)); }, isNodeVisible(utils, node) { diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js index 69426788b..9cc684e9b 100644 --- a/toolkit/components/reader/ReaderWorker.js +++ b/toolkit/components/reader/ReaderWorker.js @@ -48,6 +48,6 @@ var Agent = { */ parseDocument(uri, serializedDoc, options) { let doc = new JSDOMParser().parse(serializedDoc, uri.spec); - return new Readability(uri, doc, options).parse(); + return new Readability(doc, options).parse(); }, }; -- cgit v1.2.3