diff options
Diffstat (limited to 'toolkit/components/reader/Readability.js')
-rw-r--r-- | toolkit/components/reader/Readability.js | 390 |
1 files changed, 342 insertions, 48 deletions
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 69fb53f86..4a3689885 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -43,6 +43,7 @@ function Readability(doc, options) { options = options || {}; this._doc = doc; + this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; this._articleTitle = null; this._articleByline = null; this._articleDir = null; @@ -55,6 +56,7 @@ function Readability(doc, options) { this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._keepClasses = !!options.keepClasses; // Start with all flags set this._flags = this.FLAG_STRIP_UNLIKELYS | @@ -121,20 +123,23 @@ Readability.prototype = { REGEXPS: { // NOTE: These two regular expressions are duplicated in // Readability-readerable.js. Please keep both copies in sync. - unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, - okMaybeItsACandidate: /and|article|body|column|main|shadow/i, + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, normalize: /\s{2,}/g, videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, + shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, whitespace: /^\s*$/, hasContent: /\S$/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i }, DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], @@ -159,6 +164,15 @@ Readability.prototype = { // These are the classes that readability sets itself. CLASSES_TO_PRESERVE: [ "page" ], + // These are the list of HTML entities that need to be escaped. + HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + /** * Run any post-process modifications to article content as necessary. * @@ -169,8 +183,10 @@ Readability.prototype = { // Readability cannot open relative uris so we convert them to absolute uris. this._fixRelativeUris(articleContent); - // Remove classes. - this._cleanClasses(articleContent); + if (!this._keepClasses) { + // Remove classes. + this._cleanClasses(articleContent); + } }, /** @@ -184,6 +200,10 @@ Readability.prototype = { * @return void */ _removeNodes: function(nodeList, filterFn) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _removeNodes"); + } for (var i = nodeList.length - 1; i >= 0; i--) { var node = nodeList[i]; var parentNode = node.parentNode; @@ -203,6 +223,10 @@ Readability.prototype = { * @return void */ _replaceNodeTags: function(nodeList, newTagName) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _replaceNodeTags"); + } for (var i = nodeList.length - 1; i >= 0; i--) { var node = nodeList[i]; this._setNodeTag(node, newTagName); @@ -322,6 +346,7 @@ Readability.prototype = { if (baseURI == documentURI && uri.charAt(0) == "#") { return uri; } + // Otherwise, resolve against base URI: try { return new URL(uri, baseURI).href; @@ -335,22 +360,50 @@ Readability.prototype = { this._forEachNode(links, function(link) { var href = link.getAttribute("href"); if (href) { - // Replace links with javascript: URIs with text content, since + // Remove links with javascript: URIs, since // they won't work after scripts have been removed from the page. if (href.indexOf("javascript:") === 0) { - var text = this._doc.createTextNode(link.textContent); - link.parentNode.replaceChild(text, link); + // if the link only contains simple text content, it can be converted to a text node + if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + var text = this._doc.createTextNode(link.textContent); + link.parentNode.replaceChild(text, link); + } else { + // if the link has multiple children, they should all be preserved + var container = this._doc.createElement("span"); + while (link.childNodes.length > 0) { + container.appendChild(link.childNodes[0]); + } + link.parentNode.replaceChild(container, link); + } } else { link.setAttribute("href", toAbsoluteURI(href)); } } }); - var imgs = this._getAllNodesWithTag(articleContent, ["img"]); - this._forEachNode(imgs, function(img) { - var src = img.getAttribute("src"); + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source" + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + if (src) { - img.setAttribute("src", toAbsoluteURI(src)); + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); } }); }, @@ -444,13 +497,13 @@ Readability.prototype = { var doc = this._doc; // Remove all style tags in head - this._removeNodes(doc.getElementsByTagName("style")); + this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); if (doc.body) { this._replaceBrs(doc.body); } - this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN"); + this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); }, /** @@ -530,7 +583,7 @@ Readability.prototype = { _setNodeTag: function (node, tag) { this.log("_setNodeTag", node, tag); - if (node.__JSDOMParser__) { + if (this._docJSDOMParser) { node.localName = tag.toLowerCase(); node.tagName = tag.toUpperCase(); return node; @@ -545,7 +598,16 @@ Readability.prototype = { replacement.readability = node.readability; for (var i = 0; i < node.attributes.length; i++) { - replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + try { + replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + } catch (ex) { + /* it's possible for setAttribute() to throw if the attribute name + * isn't a valid XML Name. Such attributes can however be parsed from + * source in HTML docs, see https://github.com/whatwg/html/issues/4275, + * so we can hit them here and then throw. We don't care about such + * attributes so we ignore them. + */ + } } return replacement; }, @@ -565,6 +627,8 @@ Readability.prototype = { // visually linked to other content-ful elements (text, images, etc.). this._markDataTables(articleContent); + this._fixLazyImages(articleContent); + // Clean out junk from the article content this._cleanConditionally(articleContent, "form"); this._cleanConditionally(articleContent, "fieldset"); @@ -575,10 +639,15 @@ Readability.prototype = { this._clean(articleContent, "link"); this._clean(articleContent, "aside"); - // Clean out elements have "share" in their id/class combinations from final top candidates, + // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". - this._forEachNode(articleContent.children, function(topCandidate) { - this._cleanMatchedNodes(topCandidate, /share/); + + var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; + + this._forEachNode(articleContent.children, function (topCandidate) { + this._cleanMatchedNodes(topCandidate, function (node, matchString) { + return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; + }); }); // If there is only one h2 and its text content substantially equals article title, @@ -614,7 +683,7 @@ Readability.prototype = { this._cleanConditionally(articleContent, "div"); // Remove extra paragraphs - this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) { + this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { var imgCount = paragraph.getElementsByTagName("img").length; var embedCount = paragraph.getElementsByTagName("embed").length; var objectCount = paragraph.getElementsByTagName("object").length; @@ -729,9 +798,10 @@ Readability.prototype = { if (node.getAttribute !== undefined) { var rel = node.getAttribute("rel"); + var itemprop = node.getAttribute("itemprop"); } - if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { + if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { this._articleByline = node.textContent.trim(); return true; } @@ -800,12 +870,19 @@ Readability.prototype = { if (stripUnlikelyCandidates) { if (this.REGEXPS.unlikelyCandidates.test(matchString) && !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && node.tagName !== "BODY" && node.tagName !== "A") { this.log("Removing unlikely candidate - " + matchString); node = this._removeAndGetNext(node); continue; } + + if (node.getAttribute("role") == "complementary") { + this.log("Removing complementary content - " + matchString); + node = this._removeAndGetNext(node); + continue; + } } // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). @@ -1200,6 +1277,26 @@ Readability.prototype = { }, /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param str {string} - a string to unescape. + * @return string without HTML entity. + */ + _unescapeHtmlEntities: function(str) { + if (!str) { + return str; + } + + var htmlEscapeMap = this.HTML_ESCAPE_MAP; + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { + return htmlEscapeMap[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); + return String.fromCharCode(num); + }); + }, + + /** * Attempts to get excerpt and byline metadata for the article. * * @return Object with optional "excerpt" and "byline" properties @@ -1220,6 +1317,9 @@ Readability.prototype = { var elementName = element.getAttribute("name"); var elementProperty = element.getAttribute("property"); var content = element.getAttribute("content"); + if (!content) { + return; + } var matches = null; var name = null; @@ -1276,21 +1376,123 @@ Readability.prototype = { // get site name metadata.siteName = values["og:site_name"]; + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadata.title = this._unescapeHtmlEntities(metadata.title); + metadata.byline = this._unescapeHtmlEntities(metadata.byline); + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); + metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); + return metadata; }, /** + * Check if node is image, or if node contains exactly only one image + * whether as a direct child or as its descendants. + * + * @param Element + **/ + _isSingleImage: function(node) { + if (node.tagName === "IMG") { + return true; + } + + if (node.children.length !== 1 || node.textContent.trim() !== "") { + return false; + } + + return this._isSingleImage(node.children[0]); + }, + + /** + * Find all <noscript> that are located after <img> nodes, and which contain only one + * <img> element. Replace the first image with the image from inside the <noscript> tag, + * and remove the <noscript> tag. This improves the quality of the images we use on + * some sites (e.g. Medium). + * + * @param Element + **/ + _unwrapNoscriptImages: function(doc) { + // Find img without source or attributes that might contains image, and remove it. + // This is done to prevent a placeholder img is replaced by img from noscript in next step. + var imgs = Array.from(doc.getElementsByTagName("img")); + this._forEachNode(imgs, function(img) { + for (var i = 0; i < img.attributes.length; i++) { + var attr = img.attributes[i]; + switch (attr.name) { + case "src": + case "srcset": + case "data-src": + case "data-srcset": + return; + } + + if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { + return; + } + } + + img.parentNode.removeChild(img); + }); + + // Next find noscript and try to extract its image + var noscripts = Array.from(doc.getElementsByTagName("noscript")); + this._forEachNode(noscripts, function(noscript) { + // Parse content of noscript and make sure it only contains image + var tmp = doc.createElement("div"); + tmp.innerHTML = noscript.innerHTML; + if (!this._isSingleImage(tmp)) { + return; + } + + // If noscript has previous sibling and it only contains image, + // replace it with noscript content. However we also keep old + // attributes that might contains image. + var prevElement = noscript.previousElementSibling; + if (prevElement && this._isSingleImage(prevElement)) { + var prevImg = prevElement; + if (prevImg.tagName !== "IMG") { + prevImg = prevElement.getElementsByTagName("img")[0]; + } + + var newImg = tmp.getElementsByTagName("img")[0]; + for (var i = 0; i < prevImg.attributes.length; i++) { + var attr = prevImg.attributes[i]; + if (attr.value === "") { + continue; + } + + if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) { + if (newImg.getAttribute(attr.name) === attr.value) { + continue; + } + + var attrName = attr.name; + if (newImg.hasAttribute(attrName)) { + attrName = "data-old-" + attrName; + } + + newImg.setAttribute(attrName, attr.value); + } + } + + noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement); + } + }); + }, + + /** * Removes script tags from the document. * * @param Element **/ _removeScripts: function(doc) { - this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) { + this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) { scriptNode.nodeValue = ""; scriptNode.removeAttribute("src"); return true; }); - this._removeNodes(doc.getElementsByTagName("noscript")); + this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"])); }, /** @@ -1473,20 +1675,20 @@ Readability.prototype = { _clean: function(e, tag) { var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; - this._removeNodes(e.getElementsByTagName(tag), function(element) { + this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) { // Allow youtube and vimeo videos through as people usually want to see those. if (isEmbed) { - var attributeValues = [].map.call(element.attributes, function(attr) { - return attr.value; - }).join("|"); - // First, check the elements attributes to see if any of them contain youtube or vimeo - if (this.REGEXPS.videos.test(attributeValues)) - return false; + for (var i = 0; i < element.attributes.length; i++) { + if (this.REGEXPS.videos.test(element.attributes[i].value)) { + return false; + } + } - // Then check the elements inside this element for the same. - if (this.REGEXPS.videos.test(element.innerHTML)) + // For embed with <object> tag, check inner HTML as well. + if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) { return false; + } } return true; @@ -1604,6 +1806,76 @@ Readability.prototype = { } }, + /* convert images and figures that have properties like data-src into images that can be loaded without JS */ + _fixLazyImages: function (root) { + this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) { + // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. + // So, here we check if the data uri is too short, just might as well remove it. + if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) { + // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. + var parts = this.REGEXPS.b64DataUrl.exec(elem.src); + if (parts[1] === "image/svg+xml") { + return; + } + + // Make sure this element has other attributes which contains image. + // If it doesn't, then this src is important and shouldn't be removed. + var srcCouldBeRemoved = false; + for (var i = 0; i < elem.attributes.length; i++) { + var attr = elem.attributes[i]; + if (attr.name === "src") { + continue; + } + + if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { + srcCouldBeRemoved = true; + break; + } + } + + // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) + // it will be too small, therefore it might be placeholder image. + if (srcCouldBeRemoved) { + var b64starts = elem.src.search(/base64\s*/i) + 7; + var b64length = elem.src.length - b64starts; + if (b64length < 133) { + elem.removeAttribute("src"); + } + } + } + + // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 + if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) { + return; + } + + for (var j = 0; j < elem.attributes.length; j++) { + attr = elem.attributes[j]; + if (attr.name === "src" || attr.name === "srcset") { + continue; + } + var copyTo = null; + if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { + copyTo = "srcset"; + } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { + copyTo = "src"; + } + if (copyTo) { + //if this is an img or picture, set the attribute directly + if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { + elem.setAttribute(copyTo, attr.value); + } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) { + //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure + //see the nytimes-3 testcase for an example + var img = this._doc.createElement("img"); + img.setAttribute(copyTo, attr.value); + elem.appendChild(img); + } + } + } + }); + }, + /** * Clean an element of all tags of type "tag" if they look fishy. * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. @@ -1621,12 +1893,17 @@ Readability.prototype = { // without effecting the traversal. // // TODO: Consider taking into account original contentScore here. - this._removeNodes(e.getElementsByTagName(tag), function(node) { - // First check if we're in a data table, in which case don't remove us. + this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) { + // First check if this node IS data table, in which case don't remove it. var isDataTable = function(t) { return t._readabilityDataTable; }; + if (tag === "table" && isDataTable(node)) { + return false; + } + + // Next check if we're inside a data table, in which case don't remove it as well. if (this._hasAncestorTag(node, "table", -1, isDataTable)) { return false; } @@ -1650,10 +1927,22 @@ Readability.prototype = { var input = node.getElementsByTagName("input").length; var embedCount = 0; - var embeds = node.getElementsByTagName("embed"); - for (var ei = 0, il = embeds.length; ei < il; ei += 1) { - if (!this.REGEXPS.videos.test(embeds[ei].src)) - embedCount += 1; + var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]); + + for (var i = 0; i < embeds.length; i++) { + // If this embed has attribute that matches video regex, don't delete it. + for (var j = 0; j < embeds[i].attributes.length; j++) { + if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) { + return false; + } + } + + // For embed with <object> tag, check inner HTML as well. + if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) { + return false; + } + + embedCount++; } var linkDensity = this._getLinkDensity(node); @@ -1674,17 +1963,17 @@ Readability.prototype = { }, /** - * Clean out elements whose id/class combinations match specific string. + * Clean out elements that match the specified conditions * * @param Element - * @param RegExp match id/class combination. + * @param Function determines whether a node should be removed * @return void **/ - _cleanMatchedNodes: function(e, regex) { + _cleanMatchedNodes: function(e, filter) { var endOfSearchMarkerNode = this._getNextNode(e, true); var next = this._getNextNode(e); while (next && next != endOfSearchMarkerNode) { - if (regex.test(next.className + " " + next.id)) { + if (filter.call(this, next, next.className + " " + next.id)) { next = this._removeAndGetNext(next); } else { next = this._getNextNode(next); @@ -1699,11 +1988,9 @@ Readability.prototype = { * @return void **/ _cleanHeaders: function(e) { - for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { - this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) { - return this._getClassWeight(header) < 0; - }); - } + this._removeNodes(this._getAllNodesWithTag(e, ["h1", "h2"]), function (header) { + return this._getClassWeight(header) < 0; + }); }, _flagIsActive: function(flag) { @@ -1715,7 +2002,11 @@ Readability.prototype = { }, _isProbablyVisible: function(node) { - return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden"); + // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. + return (!node.style || node.style.display != "none") + && !node.hasAttribute("hidden") + //check for "fallback-image" so that wikimedia math images are displayed + && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); }, /** @@ -1739,6 +2030,9 @@ Readability.prototype = { } } + // Unwrap image from noscript + this._unwrapNoscriptImages(this._doc); + // Remove script tags from the document. this._removeScripts(this._doc); |