From ccc4363462053edfc9cf616afa7f86b3244aaff6 Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Tue, 23 Apr 2019 17:41:09 -0400 Subject: Update Readability from mozilla-central release branch (FF 66.0.3). Tag #361. --- toolkit/components/reader/JSDOMParser.js | 8 +- .../components/reader/Readability-readerable.js | 104 ++++++++ toolkit/components/reader/Readability.js | 289 +++++++++------------ 3 files changed, 235 insertions(+), 166 deletions(-) create mode 100644 toolkit/components/reader/Readability-readerable.js (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js index debdb08eb..ab2f503e1 100644 --- a/toolkit/components/reader/JSDOMParser.js +++ b/toolkit/components/reader/JSDOMParser.js @@ -691,7 +691,7 @@ // the attribute value will be HTML escaped. var val = attr.value; var quote = (val.indexOf('"') === -1 ? '"' : "'"); - arr.push(" " + attr.name + '=' + quote + val + quote); + arr.push(" " + attr.name + "=" + quote + val + quote); } if (child.localName in voidElems && !child.childNodes.length) { @@ -970,7 +970,7 @@ strBuf.push(c); c = this.nextChar(); } - var tag = strBuf.join(''); + var tag = strBuf.join(""); if (!tag) return false; @@ -981,7 +981,9 @@ while (c !== "/" && c !== ">") { if (c === undefined) return false; - while (whitespace.indexOf(this.html[this.currentChar++]) != -1); + while (whitespace.indexOf(this.html[this.currentChar++]) != -1) { + // Advance cursor to first non-whitespace char. + } this.currentChar--; c = this.nextChar(); if (c !== "/" && c !== ">") { diff --git a/toolkit/components/reader/Readability-readerable.js b/toolkit/components/reader/Readability-readerable.js new file mode 100644 index 000000000..d0e1b8164 --- /dev/null +++ b/toolkit/components/reader/Readability-readerable.js @@ -0,0 +1,104 @@ +/* eslint-env es6:false */ +/* globals exports */ +/* + * DO NOT MODIFY THIS FILE DIRECTLY! 
+ * + * This is a shared library that is maintained in an external repo: + * https://github.com/mozilla/readability + */ + +/* + * Copyright (c) 2010 Arc90 Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This code is heavily based on Arc90's readability.js (1.7.1) script + * available at: http://code.google.com/p/arc90labs-readability + */ + +var REGEXPS = { + // NOTE: These two regular expressions are duplicated in + // Readability.js. Please keep both copies in sync. + unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|main|shadow/i, +}; + +function isNodeVisible(node) { + // Have to null-check node.style to deal with SVG and MathML nodes. + return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden"); +} + +/** + * Decides whether or not the document is reader-able without parsing the whole thing. + * + * @return boolean Whether or not we suspect Readability.parse() will suceeed at returning an article object. + */ +function isProbablyReaderable(doc, isVisible) { + if (!isVisible) { + isVisible = isNodeVisible; + } + + var nodes = doc.querySelectorAll("p, pre"); + + // Get
<div> nodes which have <br> node(s) and append them into the `nodes` variable. + // Some articles' DOM structures might look like + // <div>
+ // Sentences<br>
+ // <br>
+ // Sentences<br>
+ // </div>
+ var brNodes = doc.querySelectorAll("div > br"); + if (brNodes.length) { + var set = new Set(nodes); + [].forEach.call(brNodes, function(node) { + set.add(node.parentNode); + }); + nodes = Array.from(set); + } + + var score = 0; + // This is a little cheeky, we use the accumulator 'score' to decide what to return from + // this callback: + return [].some.call(nodes, function(node) { + if (!isVisible(node)) + return false; + + var matchString = node.className + " " + node.id; + if (REGEXPS.unlikelyCandidates.test(matchString) && + !REGEXPS.okMaybeItsACandidate.test(matchString)) { + return false; + } + + if (node.matches("li p")) { + return false; + } + + var textContentLength = node.textContent.trim().length; + if (textContentLength < 140) { + return false; + } + + score += Math.sqrt(textContentLength - 140); + + if (score > 20) { + return true; + } + return false; + }); +} + +if (typeof exports === "object") { + exports.isProbablyReaderable = isProbablyReaderable; +} diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index c2bba0cd3..69fb53f86 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -46,6 +46,7 @@ function Readability(doc, options) { this._articleTitle = null; this._articleByline = null; this._articleDir = null; + this._articleSiteName = null; this._attempts = []; // Configurable options @@ -118,15 +119,18 @@ Readability.prototype = { // All of the regular expressions in use within readability. // Defined up here so we don't instantiate them repeatedly in loops. REGEXPS: { + // NOTE: These two regular expressions are duplicated in + // Readability-readerable.js. Please keep both copies in sync. 
unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|main|shadow/i, + positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, normalize: /\s{2,}/g, - videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, + videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, whitespace: /^\s*$/, @@ -267,7 +271,7 @@ Readability.prototype = { _getAllNodesWithTag: function(node, tagNames) { if (node.querySelectorAll) { - return node.querySelectorAll(tagNames.join(',')); + return node.querySelectorAll(tagNames.join(",")); } return [].concat.apply([], tagNames.map(function(tag) { var collection = node.getElementsByTagName(tag); @@ -327,7 +331,7 @@ Readability.prototype = { return uri; } - var links = articleContent.getElementsByTagName("a"); + var links = this._getAllNodesWithTag(articleContent, ["a"]); this._forEachNode(links, function(link) { var href = link.getAttribute("href"); if (href) { @@ -342,7 +346,7 @@ Readability.prototype = { } }); - var imgs = articleContent.getElementsByTagName("img"); + var imgs = this._getAllNodesWithTag(articleContent, ["img"]); this._forEachNode(imgs, 
function(img) { var src = img.getAttribute("src"); if (src) { @@ -366,7 +370,7 @@ Readability.prototype = { // If they had an element with id "title" in their HTML if (typeof curTitle !== "string") - curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); + curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); } catch (e) {/* ignore exceptions setting the title. */} var titleHadHierarchicalSeparators = false; @@ -377,18 +381,18 @@ Readability.prototype = { // If there's a separator in the title, first remove the final part if ((/ [\|\-\\\/>»] /).test(curTitle)) { titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); - curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1'); + curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); // If the resulting title is too short (3 words or fewer), remove // the first part instead: if (wordCount(curTitle) < 3) - curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1'); - } else if (curTitle.indexOf(': ') !== -1) { + curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); + } else if (curTitle.indexOf(": ") !== -1) { // Check if we have an heading containing this exact string, so we // could assume it's the full title. var headings = this._concatNodeLists( - doc.getElementsByTagName('h1'), - doc.getElementsByTagName('h2') + doc.getElementsByTagName("h1"), + doc.getElementsByTagName("h2") ); var trimmedTitle = curTitle.trim(); var match = this._someNode(headings, function(heading) { @@ -397,25 +401,25 @@ Readability.prototype = { // If we don't, let's extract the title out of the original title string. 
if (!match) { - curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1); + curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); // If the title is now too short, try the first colon instead: if (wordCount(curTitle) < 3) { - curTitle = origTitle.substring(origTitle.indexOf(':') + 1); + curTitle = origTitle.substring(origTitle.indexOf(":") + 1); // But if we have too many words before the colon there's something weird // with the titles and the H tags so let's just use the original title instead - } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) { + } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { curTitle = origTitle; } } } else if (curTitle.length > 150 || curTitle.length < 15) { - var hOnes = doc.getElementsByTagName('h1'); + var hOnes = doc.getElementsByTagName("h1"); if (hOnes.length === 1) curTitle = this._getInnerText(hOnes[0]); } - curTitle = curTitle.trim(); + curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); // If we now have 4 words or fewer as our title, and either no // 'hierarchical' separators (\, /, > or ») were found in the original // title or we decreased the number of words by more than 1 word, use @@ -505,7 +509,8 @@ Readability.prototype = { break; } - if (!this._isPhrasingContent(next)) break; + if (!this._isPhrasingContent(next)) + break; // Otherwise, make this node a child of the new
<p>
. var sibling = next.nextSibling; @@ -513,9 +518,12 @@ Readability.prototype = { next = sibling; } - while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } - if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV"); + if (p.parentNode.tagName === "P") + this._setNodeTag(p.parentNode, "DIV"); } }); }, @@ -576,7 +584,7 @@ Readability.prototype = { // If there is only one h2 and its text content substantially equals article title, // they are probably using it as a header and not a subheader, // so remove it since we already extract the title separately. - var h2 = articleContent.getElementsByTagName('h2'); + var h2 = articleContent.getElementsByTagName("h2"); if (h2.length === 1) { var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length; if (Math.abs(lengthSimilarRate) < 0.5) { @@ -606,12 +614,12 @@ Readability.prototype = { this._cleanConditionally(articleContent, "div"); // Remove extra paragraphs - this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) { - var imgCount = paragraph.getElementsByTagName('img').length; - var embedCount = paragraph.getElementsByTagName('embed').length; - var objectCount = paragraph.getElementsByTagName('object').length; + this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) { + var imgCount = paragraph.getElementsByTagName("img").length; + var embedCount = paragraph.getElementsByTagName("embed").length; + var objectCount = paragraph.getElementsByTagName("object").length; // At this point, nasty iframes have been removed, only remain embedded video ones. 
- var iframeCount = paragraph.getElementsByTagName('iframe').length; + var iframeCount = paragraph.getElementsByTagName("iframe").length; var totalCount = imgCount + embedCount + objectCount + iframeCount; return totalCount === 0 && !this._getInnerText(paragraph, false); @@ -648,34 +656,34 @@ Readability.prototype = { node.readability = {"contentScore": 0}; switch (node.tagName) { - case 'DIV': + case "DIV": node.readability.contentScore += 5; break; - case 'PRE': - case 'TD': - case 'BLOCKQUOTE': + case "PRE": + case "TD": + case "BLOCKQUOTE": node.readability.contentScore += 3; break; - case 'ADDRESS': - case 'OL': - case 'UL': - case 'DL': - case 'DD': - case 'DT': - case 'LI': - case 'FORM': + case "ADDRESS": + case "OL": + case "UL": + case "DL": + case "DD": + case "DT": + case "LI": + case "FORM": node.readability.contentScore -= 3; break; - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - case 'TH': + case "H1": + case "H2": + case "H3": + case "H4": + case "H5": + case "H6": + case "TH": node.readability.contentScore -= 5; break; } @@ -824,12 +832,14 @@ Readability.prototype = { if (p !== null) { p.appendChild(childNode); } else if (!this._isWhitespace(childNode)) { - p = doc.createElement('p'); + p = doc.createElement("p"); node.replaceChild(p, childNode); p.appendChild(childNode); } } else if (p !== null) { - while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild); + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } p = null; } childNode = nextSibling; @@ -860,7 +870,7 @@ Readability.prototype = { **/ var candidates = []; this._forEachNode(elementsToScore, function(elementToScore) { - if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined') + if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") return; // If this paragraph is less than 25 characters, don't even count it. 
@@ -879,17 +889,17 @@ Readability.prototype = { contentScore += 1; // Add points for any commas within this paragraph. - contentScore += innerText.split(',').length; + contentScore += innerText.split(",").length; // For every 100 characters in this paragraph, add another point. Up to 3 points. contentScore += Math.min(Math.floor(innerText.length / 100), 3); // Initialize and score ancestors. this._forEachNode(ancestors, function(ancestor, level) { - if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined') + if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") return; - if (typeof(ancestor.readability) === 'undefined') { + if (typeof(ancestor.readability) === "undefined") { this._initializeNode(ancestor); candidates.push(ancestor); } @@ -920,7 +930,7 @@ Readability.prototype = { var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); candidate.readability.contentScore = candidateScore; - this.log('Candidate:', candidate, "with score " + candidateScore); + this.log("Candidate:", candidate, "with score " + candidateScore); for (var t = 0; t < this._nbTopCandidates; t++) { var aTopCandidate = topCandidates[t]; @@ -1039,8 +1049,8 @@ Readability.prototype = { var sibling = siblings[s]; var append = false; - this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ''); - this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown'); + this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); + this.log("Sibling has score", sibling.readability ? 
sibling.readability.contentScore : "Unknown"); if (sibling === topCandidate) { append = true; @@ -1074,7 +1084,7 @@ Readability.prototype = { if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { // We have a node that isn't a common block level element, like a form or td tag. // Turn it into a div so it doesn't get filtered out later by accident. - this.log("Altering sibling:", sibling, 'to div.'); + this.log("Altering sibling:", sibling, "to div."); sibling = this._setNodeTag(sibling, "DIV"); } @@ -1142,7 +1152,7 @@ Readability.prototype = { this._attempts.push({articleContent: articleContent, textLength: textLength}); // No luck after removing flags, just return the longest text we found during the different loops this._attempts.sort(function (a, b) { - return a.textLength < b.textLength; + return b.textLength - a.textLength; }); // But first check if we actually have something @@ -1182,7 +1192,7 @@ Readability.prototype = { * @return Boolean - whether the input string is a byline. */ _isValidByline: function(byline) { - if (typeof byline == 'string' || byline instanceof String) { + if (typeof byline == "string" || byline instanceof String) { byline = byline.trim(); return (byline.length > 0) && (byline.length < 100); } @@ -1199,62 +1209,73 @@ Readability.prototype = { var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); - // Match "description", or Twitter's "twitter:description" (Cards) - // in name attribute. - var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi; + // property is a space-separated list of values + var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; - // Match Facebook's Open Graph title & description properties. 
- var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi; + // name is a single value + var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; // Find description tags. this._forEachNode(metaElements, function(element) { var elementName = element.getAttribute("name"); var elementProperty = element.getAttribute("property"); + var content = element.getAttribute("content"); + var matches = null; + var name = null; - if ([elementName, elementProperty].indexOf("author") !== -1) { - metadata.byline = element.getAttribute("content"); - return; + if (elementProperty) { + matches = elementProperty.match(propertyPattern); + if (matches) { + for (var i = matches.length - 1; i >= 0; i--) { + // Convert to lowercase, and remove any whitespace + // so we can match below. + name = matches[i].toLowerCase().replace(/\s/g, ""); + // multiple authors + values[name] = content.trim(); + } + } } - - var name = null; - if (namePattern.test(elementName)) { + if (!matches && elementName && namePattern.test(elementName)) { name = elementName; - } else if (propertyPattern.test(elementProperty)) { - name = elementProperty; - } - - if (name) { - var content = element.getAttribute("content"); if (content) { - // Convert to lowercase and remove any whitespace - // so we can match below. - name = name.toLowerCase().replace(/\s/g, ''); + // Convert to lowercase, remove any whitespace, and convert dots + // to colons so we can match below. + name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); values[name] = content.trim(); } } }); - if ("description" in values) { - metadata.excerpt = values["description"]; - } else if ("og:description" in values) { - // Use facebook open graph description. - metadata.excerpt = values["og:description"]; - } else if ("twitter:description" in values) { - // Use twitter cards description. 
- metadata.excerpt = values["twitter:description"]; - } + // get title + metadata.title = values["dc:title"] || + values["dcterm:title"] || + values["og:title"] || + values["weibo:article:title"] || + values["weibo:webpage:title"] || + values["title"] || + values["twitter:title"]; - metadata.title = this._getArticleTitle(); if (!metadata.title) { - if ("og:title" in values) { - // Use facebook open graph title. - metadata.title = values["og:title"]; - } else if ("twitter:title" in values) { - // Use twitter cards title. - metadata.title = values["twitter:title"]; - } + metadata.title = this._getArticleTitle(); } + // get author + metadata.byline = values["dc:creator"] || + values["dcterm:creator"] || + values["author"]; + + // get description + metadata.excerpt = values["dc:description"] || + values["dcterm:description"] || + values["og:description"] || + values["weibo:article:description"] || + values["weibo:webpage:description"] || + values["description"] || + values["twitter:description"]; + + // get site name + metadata.siteName = values["og:site_name"]; + return metadata; }, @@ -1264,12 +1285,12 @@ Readability.prototype = { * @param Element **/ _removeScripts: function(doc) { - this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) { + this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) { scriptNode.nodeValue = ""; - scriptNode.removeAttribute('src'); + scriptNode.removeAttribute("src"); return true; }); - this._removeNodes(doc.getElementsByTagName('noscript')); + this._removeNodes(doc.getElementsByTagName("noscript")); }, /** @@ -1336,7 +1357,7 @@ Readability.prototype = { * @return string **/ _getInnerText: function(e, normalizeSpaces) { - normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; + normalizeSpaces = (typeof normalizeSpaces === "undefined") ? 
true : normalizeSpaces; var textContent = e.textContent.trim(); if (normalizeSpaces) { @@ -1365,7 +1386,7 @@ Readability.prototype = { * @return void **/ _cleanStyles: function(e) { - if (!e || e.tagName.toLowerCase() === 'svg') + if (!e || e.tagName.toLowerCase() === "svg") return; // Remove `style` and deprecated presentational attributes @@ -1374,8 +1395,8 @@ Readability.prototype = { } if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { - e.removeAttribute('width'); - e.removeAttribute('height'); + e.removeAttribute("width"); + e.removeAttribute("height"); } var cur = e.firstElementChild; @@ -1421,7 +1442,7 @@ Readability.prototype = { var weight = 0; // Look for a special classname - if (typeof(e.className) === 'string' && e.className !== '') { + if (typeof(e.className) === "string" && e.className !== "") { if (this.REGEXPS.negative.test(e.className)) weight -= 25; @@ -1430,7 +1451,7 @@ Readability.prototype = { } // Look for a special ID - if (typeof(e.id) === 'string' && e.id !== '') { + if (typeof(e.id) === "string" && e.id !== "") { if (this.REGEXPS.negative.test(e.id)) weight -= 25; @@ -1619,7 +1640,7 @@ Readability.prototype = { return true; } - if (this._getCharCount(node, ',') < 10) { + if (this._getCharCount(node, ",") < 10) { // If there are not very many commas, and the number of // non-paragraph elements is more than paragraphs or other // ominous signs, remove the element. 
@@ -1679,7 +1700,7 @@ Readability.prototype = { **/ _cleanHeaders: function(e) { for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { - this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) { + this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) { return this._getClassWeight(header) < 0; }); } @@ -1694,66 +1715,7 @@ Readability.prototype = { }, _isProbablyVisible: function(node) { - return node.style.display != "none" && !node.hasAttribute("hidden"); - }, - - /** - * Decides whether or not the document is reader-able without parsing the whole thing. - * - * @return boolean Whether or not we suspect parse() will suceeed at returning an article object. - */ - isProbablyReaderable: function(helperIsVisible) { - var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]); - - // Get

<div> nodes which have <br> node(s) and append them into the `nodes` variable. - // Some articles' DOM structures might look like - // <div>
- // Sentences<br>
- // <br>
- // Sentences<br>
- // </div>
- var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]); - if (brNodes.length) { - var set = new Set(); - [].forEach.call(brNodes, function(node) { - set.add(node.parentNode); - }); - nodes = [].concat.apply(Array.from(set), nodes); - } - - if (!helperIsVisible) { - helperIsVisible = this._isProbablyVisible; - } - - var score = 0; - // This is a little cheeky, we use the accumulator 'score' to decide what to return from - // this callback: - return this._someNode(nodes, function(node) { - if (helperIsVisible && !helperIsVisible(node)) - return false; - var matchString = node.className + " " + node.id; - - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString)) { - return false; - } - - if (node.matches && node.matches("li p")) { - return false; - } - - var textContentLength = node.textContent.trim().length; - if (textContentLength < 140) { - return false; - } - - score += Math.sqrt(textContentLength - 140); - - if (score > 20) { - return true; - } - return false; - }); + return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden"); }, /** @@ -1812,6 +1774,7 @@ Readability.prototype = { textContent: textContent, length: textContent.length, excerpt: metadata.excerpt, + siteName: metadata.siteName || this._articleSiteName }; } }; -- cgit v1.2.3 From 945b03265637bc8bf5bcd960909d1e8a4ddd5440 Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Thu, 25 Apr 2019 20:08:17 -0400 Subject: Update surrounding code for new Readerable module. Tag #361. 
--- toolkit/components/reader/ReaderMode.jsm | 114 ++----------------------------- toolkit/components/reader/Readerable.js | 114 +++++++++++++++++++++++++++++++ toolkit/components/reader/Readerable.jsm | 10 +++ toolkit/components/reader/moz.build | 4 ++ 4 files changed, 134 insertions(+), 108 deletions(-) create mode 100644 toolkit/components/reader/Readerable.js create mode 100644 toolkit/components/reader/Readerable.jsm (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index 218e12d60..ebc333495 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -30,13 +30,7 @@ XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-comm XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm"); - -XPCOMUtils.defineLazyGetter(this, "Readability", function() { - let scope = {}; - scope.dump = this.dump; - Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope); - return scope.Readability; -}); +XPCOMUtils.defineLazyModuleGetter(this, "Readerable", "resource://gre/modules/Readerable.jsm"); this.ReaderMode = { // Version of the cache schema. @@ -53,33 +47,6 @@ this.ReaderMode = { return this.parseNodeLimit = Services.prefs.getIntPref("reader.parse-node-limit"); }, - get isEnabledForParseOnLoad() { - delete this.isEnabledForParseOnLoad; - - // Listen for future pref changes. 
- Services.prefs.addObserver("reader.parse-on-load.", this, false); - - return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); - }, - - _getStateForParseOnLoad() { - let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled"); - let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled"); - return isForceEnabled || isEnabled; - }, - - observe(aMessage, aTopic, aData) { - switch (aTopic) { - case "nsPref:changed": - if (aData.startsWith("reader.parse-on-load.")) { - this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); - } else if (aData === "reader.parse-node-limit") { - this.parseNodeLimit = Services.prefs.getIntPref(aData); - } - break; - } - }, - /** * Enter the reader mode by going forward one step in history if applicable, * if not, append the about:reader page in the history instead. @@ -174,39 +141,6 @@ this.ReaderMode = { return null; }, - /** - * Decides whether or not a document is reader-able without parsing the whole thing. - * - * @param doc A document to parse. - * @return boolean Whether or not we should show the reader mode button. - */ - isProbablyReaderable(doc) { - // Only care about 'real' HTML documents: - if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) { - return false; - } - - let uri = Services.io.newURI(doc.location.href); - if (!this._shouldCheckUri(uri)) { - return false; - } - - let utils = this.getUtilsForWin(doc.defaultView); - // We pass in a helper function to determine if a node is visible, because - // it uses gecko APIs that the engine-agnostic readability code can't rely - // upon. 
- return new Readability(doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils)); - }, - - isNodeVisible(utils, node) { - let bounds = utils.getBoundsWithoutFlushing(node); - return bounds.height > 0 && bounds.width > 0; - }, - - getUtilsForWin(win) { - return win.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils); - }, - /** * Gets an article from a loaded browser's document. This method will not attempt * to parse certain URIs (e.g. about: URIs). @@ -216,7 +150,8 @@ this.ReaderMode = { * @resolves JS object representing the article, or null if no article is found. */ parseDocument(doc) { - if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { + if (!Readerable.shouldCheckUri(doc.documentURIObject) || + !Readerable.shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } @@ -236,7 +171,8 @@ this.ReaderMode = { if (!doc) { return null; } - if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) { + if (!Readerable.shouldCheckUri(doc.documentURIObject) || + !Readerable.shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } @@ -246,7 +182,7 @@ this.ReaderMode = { _downloadDocument(url) { try { - if (!this._shouldCheckUri(Services.io.newURI(url))) { + if (!Readerable.shouldCheckUri(Services.io.newURI(url))) { return null; } } catch (ex) { @@ -388,44 +324,6 @@ this.ReaderMode = { dump("Reader: " + msg); }, - _blockedHosts: [ - "amazon.com", - "basilisk-browser.org", - "github.com", - "mail.google.com", - "palemoon.org", - "pinterest.com", - "reddit.com", - "twitter.com", - "youtube.com", - ], - - _shouldCheckUri(uri, isBaseUri = false) { - if (!(uri.schemeIs("http") || uri.schemeIs("https"))) { - this.log("Not parsing URI scheme: " + uri.scheme); - return false; - } - - try { - uri.QueryInterface(Ci.nsIURL); - } catch (ex) { - // If this doesn't work, 
presumably the URL is not well-formed or something - return false; - } - // Sadly, some high-profile pages have false positives, so bail early for those: - let asciiHost = uri.asciiHost; - if (!isBaseUri && this._blockedHosts.some(blockedHost => asciiHost.endsWith(blockedHost))) { - return false; - } - - if (!isBaseUri && (!uri.filePath || uri.filePath == "/")) { - this.log("Not parsing home page: " + uri.spec); - return false; - } - - return true; - }, - /** * Attempts to parse a document into an article. Heavy lifting happens * in readerWorker.js. diff --git a/toolkit/components/reader/Readerable.js b/toolkit/components/reader/Readerable.js new file mode 100644 index 000000000..71c23eb5b --- /dev/null +++ b/toolkit/components/reader/Readerable.js @@ -0,0 +1,114 @@ +// -*- indent-tabs-mode: nil; js-indent-level: 2 -*- +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ +"use strict"; + +// This file and Readability-readerable.js are merged together into +// Readerable.jsm. + +/* exported Readerable */ +/* import-globals-from Readability-readerable.js */ + +const { classes: Cc, interfaces: Ci, utils: Cu } = Components; + +Cu.import("resource://gre/modules/Services.jsm"); +Cu.import("resource://gre/modules/XPCOMUtils.jsm"); + +function isNodeVisible(node) { + return node.clientHeight > 0 && node.clientWidth > 0; +} + +var Readerable = { + DEBUG: 0, + + get isEnabledForParseOnLoad() { + delete this.isEnabledForParseOnLoad; + + // Listen for future pref changes. 
+ Services.prefs.addObserver("reader.parse-on-load.", this, false); + + return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); + }, + + _getStateForParseOnLoad() { + let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled"); + let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled"); + return isForceEnabled || isEnabled; + }, + + observe(aMessage, aTopic, aData) { + switch (aTopic) { + case "nsPref:changed": + if (aData.startsWith("reader.parse-on-load.")) { + this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); + } else if (aData === "reader.parse-node-limit") { + this.parseNodeLimit = Services.prefs.getIntPref(aData); + } + break; + } + }, + + log(msg) { + if (this.DEBUG) + dump("Reader: " + msg); + }, + + /** + * Decides whether or not a document is reader-able without parsing the whole thing. + * + * @param doc A document to parse. + * @return boolean Whether or not we should show the reader mode button. + */ + isProbablyReaderable(doc) { + // Only care about 'real' HTML documents: + if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) { + return false; + } + + let uri = Services.io.newURI(doc.location.href); + if (!this.shouldCheckUri(uri)) { + return false; + } + + return isProbablyReaderable(doc, isNodeVisible); + }, + + _blockedHosts: [ + "amazon.com", + "basilisk-browser.org", + "github.com", + "mail.google.com", + "palemoon.org", + "pinterest.com", + "reddit.com", + "twitter.com", + "youtube.com", + ], + + shouldCheckUri(uri, isBaseUri = false) { + if (!(uri.schemeIs("http") || uri.schemeIs("https"))) { + this.log("Not parsing URI scheme: " + uri.scheme); + return false; + } + + try { + uri.QueryInterface(Ci.nsIURL); + } catch (ex) { + // If this doesn't work, presumably the URL is not well-formed or something + return false; + } + // Sadly, some high-profile pages have false positives, so bail early for those: + let asciiHost = uri.asciiHost; + if 
(!isBaseUri && this._blockedHosts.some(blockedHost => asciiHost.endsWith(blockedHost))) { + return false; + } + + if (!isBaseUri && (!uri.filePath || uri.filePath == "/")) { + this.log("Not parsing home page: " + uri.spec); + return false; + } + + return true; + }, +}; diff --git a/toolkit/components/reader/Readerable.jsm b/toolkit/components/reader/Readerable.jsm new file mode 100644 index 000000000..2268487e4 --- /dev/null +++ b/toolkit/components/reader/Readerable.jsm @@ -0,0 +1,10 @@ +// -*- indent-tabs-mode: nil; js-indent-level: 2 -*- +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ +"use strict"; + +var EXPORTED_SYMBOLS = ["Readerable"]; + +#include Readability-readerable.js +#include Readerable.js diff --git a/toolkit/components/reader/moz.build b/toolkit/components/reader/moz.build index 6863d6542..d49bda14f 100644 --- a/toolkit/components/reader/moz.build +++ b/toolkit/components/reader/moz.build @@ -11,6 +11,10 @@ EXTRA_JS_MODULES += [ 'ReaderMode.jsm' ] +EXTRA_PP_JS_MODULES += [ + 'Readerable.jsm' +] + EXTRA_JS_MODULES.reader = [ 'JSDOMParser.js', 'Readability.js', -- cgit v1.2.3 From 952e65590368931781327d1662061d46fa15ddd2 Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Thu, 25 Apr 2019 20:31:18 -0400 Subject: Replace explicit pref observer with lazy preference getters. Tag #361. 
--- toolkit/components/reader/ReaderMode.jsm | 14 +++++--------- toolkit/components/reader/Readerable.js | 30 ++++++------------------------ 2 files changed, 11 insertions(+), 33 deletions(-) (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index ebc333495..030205446 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -38,15 +38,6 @@ this.ReaderMode = { DEBUG: 0, - // Don't try to parse the page if it has too many elements (for memory and - // performance reasons) - get maxElemsToParse() { - delete this.parseNodeLimit; - - Services.prefs.addObserver("reader.parse-node-limit", this, false); - return this.parseNodeLimit = Services.prefs.getIntPref("reader.parse-node-limit"); - }, - /** * Enter the reader mode by going forward one step in history if applicable, * if not, append the about:reader page in the history instead. @@ -548,3 +539,8 @@ this.ReaderMode = { return readingSpeed.get(lang) || readingSpeed.get("en"); }, }; + +// Don't try to parse the page if it has too many elements (for memory and +// performance reasons) +XPCOMUtils.defineLazyPreferenceGetter( + ReaderMode, "maxElemsToParse", "reader.parse-node-limit", 0); diff --git a/toolkit/components/reader/Readerable.js b/toolkit/components/reader/Readerable.js index 71c23eb5b..cee8adc08 100644 --- a/toolkit/components/reader/Readerable.js +++ b/toolkit/components/reader/Readerable.js @@ -23,30 +23,7 @@ var Readerable = { DEBUG: 0, get isEnabledForParseOnLoad() { - delete this.isEnabledForParseOnLoad; - - // Listen for future pref changes. 
- Services.prefs.addObserver("reader.parse-on-load.", this, false); - - return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); - }, - - _getStateForParseOnLoad() { - let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled"); - let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled"); - return isForceEnabled || isEnabled; - }, - - observe(aMessage, aTopic, aData) { - switch (aTopic) { - case "nsPref:changed": - if (aData.startsWith("reader.parse-on-load.")) { - this.isEnabledForParseOnLoad = this._getStateForParseOnLoad(); - } else if (aData === "reader.parse-node-limit") { - this.parseNodeLimit = Services.prefs.getIntPref(aData); - } - break; - } + return this.isEnabled || this.isForceEnabled; }, log(msg) { @@ -112,3 +89,8 @@ var Readerable = { return true; }, }; + +XPCOMUtils.defineLazyPreferenceGetter( + Readerable, "isEnabled", "reader.parse-on-load.enabled", true); +XPCOMUtils.defineLazyPreferenceGetter( + Readerable, "isForceEnabled", "reader.parse-on-load.force-enabled", false); -- cgit v1.2.3 From 2db53003e902dcd7a927083648435b49cc61a49b Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Thu, 25 Apr 2019 21:13:35 -0400 Subject: Fix parse node limit preference. Tag #361. 
--- toolkit/components/reader/ReaderMode.jsm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index 030205446..b9b31e29f 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -543,4 +543,4 @@ this.ReaderMode = { // Don't try to parse the page if it has too many elements (for memory and // performance reasons) XPCOMUtils.defineLazyPreferenceGetter( - ReaderMode, "maxElemsToParse", "reader.parse-node-limit", 0); + ReaderMode, "parseNodeLimit", "reader.parse-node-limit", 0); -- cgit v1.2.3 From 742f5aa24d7f65b14a5c32e2685a89d84613e970 Mon Sep 17 00:00:00 2001 From: Ascrod <32915892+Ascrod@users.noreply.github.com> Date: Sat, 27 Apr 2019 08:49:17 -0400 Subject: Bug 1399616 - add WP emoji styling to reader mode. --- toolkit/components/reader/ReaderMode.jsm | 2 ++ 1 file changed, 2 insertions(+) (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm index b9b31e29f..5ba898aec 100644 --- a/toolkit/components/reader/ReaderMode.jsm +++ b/toolkit/components/reader/ReaderMode.jsm @@ -12,6 +12,7 @@ const { classes: Cc, interfaces: Ci, utils: Cu } = Components; // names so that rules in aboutReader.css can match them. 
const CLASSES_TO_PRESERVE = [ "caption", + "emoji", "hidden", "invisble", "sr-only", @@ -19,6 +20,7 @@ const CLASSES_TO_PRESERVE = [ "visuallyhidden", "wp-caption", "wp-caption-text", + "wp-smiley", ]; Cu.import("resource://gre/modules/Services.jsm"); -- cgit v1.2.3 From 0612246b04c12c10929f809c3c5a56da0780f0c1 Mon Sep 17 00:00:00 2001 From: monikamaheshwari Date: Sat, 27 Apr 2019 09:15:22 -0400 Subject: Bug 1422680 - simplify aboutReader.css font-size classes using CSS variables r=Gijs --- toolkit/components/reader/AboutReader.jsm | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/AboutReader.jsm b/toolkit/components/reader/AboutReader.jsm index 6ec959eba..4cc9d6750 100644 --- a/toolkit/components/reader/AboutReader.jsm +++ b/toolkit/components/reader/AboutReader.jsm @@ -276,13 +276,10 @@ AboutReader.prototype = { }, _setFontSize(newFontSize) { - let containerClasses = this._containerElement.classList; - - if (this._fontSize > 0) - containerClasses.remove("font-size" + this._fontSize); - this._fontSize = newFontSize; - containerClasses.add("font-size" + this._fontSize); + let size = (10 + 2 * this._fontSize) + "px"; + + this._containerElement.style.setProperty("--font-size", size); return AsyncPrefs.set("reader.font_size", this._fontSize); }, -- cgit v1.2.3 From b430def773d2ee2fb027d1798aced0c7566c6ed4 Mon Sep 17 00:00:00 2001 From: Sonali9 Date: Sat, 27 Apr 2019 09:18:44 -0400 Subject: Bug 1151735 - Hide font panel when text is being selected r=Gijs --- toolkit/components/reader/AboutReader.jsm | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/AboutReader.jsm b/toolkit/components/reader/AboutReader.jsm index 4cc9d6750..5defa623e 100644 --- a/toolkit/components/reader/AboutReader.jsm +++ b/toolkit/components/reader/AboutReader.jsm @@ -58,6 +58,7 @@ var AboutReader = function(win, 
articlePromise) { this._scrollOffset = win.pageYOffset; + doc.addEventListener("mousedown", this); doc.addEventListener("click", this); win.addEventListener("pagehide", this); @@ -191,13 +192,16 @@ AboutReader.prototype = { if (!aEvent.isTrusted) return; + let target = aEvent.target; switch (aEvent.type) { + case "mousedown": + if (!target.closest(".dropdown-popup")) { + this._closeDropdowns(); + } + break; case "click": - let target = aEvent.target; if (target.classList.contains("dropdown-toggle")) { this._toggleDropdownClicked(aEvent); - } else if (!target.closest(".dropdown-popup")) { - this._closeDropdowns(); } break; case "scroll": -- cgit v1.2.3 From 5c350eebcda338152572aea64400a07db5ce2ff4 Mon Sep 17 00:00:00 2001 From: monikamaheshwari Date: Sat, 27 Apr 2019 09:35:59 -0400 Subject: Bug 1429442 - Buttons in "type control" popup in reader mode should have tooltips r=Gijs,MarcoZ --- toolkit/components/reader/AboutReader.jsm | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/AboutReader.jsm b/toolkit/components/reader/AboutReader.jsm index 5defa623e..9d9362a0c 100644 --- a/toolkit/components/reader/AboutReader.jsm +++ b/toolkit/components/reader/AboutReader.jsm @@ -120,6 +120,25 @@ var AboutReader = function(win, articlePromise) { } this._loadArticle(); + + let dropdown = this._toolbarElement; + + let elemL10nMap = { + ".minus-button": "minus", + ".plus-button": "plus", + ".content-width-minus-button": "contentwidthminus", + ".content-width-plus-button": "contentwidthplus", + ".line-height-minus-button": "lineheightminus", + ".line-height-plus-button": "lineheightplus", + ".light-button": "colorschemelight", + ".dark-button": "colorschemedark", + ".sepia-button": "colorschemesepia", + }; + + for (let [selector, stringID] of Object.entries(elemL10nMap)) { + dropdown.querySelector(selector).setAttribute("title", + gStrings.GetStringFromName("aboutReader.toolbar." 
+ stringID)); + } }; AboutReader.prototype = { -- cgit v1.2.3