summary refs log tree commit diff stats
path: root/toolkit/components/reader/Readability.js
diff options
context:
space:
mode:
author: Ascrod <32915892+Ascrod@users.noreply.github.com> 2019-04-23 17:41:09 -0400
committer: Ascrod <32915892+Ascrod@users.noreply.github.com> 2019-04-27 07:43:11 -0400
commit: ccc4363462053edfc9cf616afa7f86b3244aaff6 (patch)
tree: 5e8ddd9f992b55fba9c4544f2510876c51577551 /toolkit/components/reader/Readability.js
parent: e0116ac2b78eb4e621a4d0769e01f8358a6d661c (diff)
download: UXP-ccc4363462053edfc9cf616afa7f86b3244aaff6.tar
UXP-ccc4363462053edfc9cf616afa7f86b3244aaff6.tar.gz
UXP-ccc4363462053edfc9cf616afa7f86b3244aaff6.tar.lz
UXP-ccc4363462053edfc9cf616afa7f86b3244aaff6.tar.xz
UXP-ccc4363462053edfc9cf616afa7f86b3244aaff6.zip
Update Readability from mozilla-central release branch (FF 66.0.3). Tag #361.
Diffstat (limited to 'toolkit/components/reader/Readability.js')
-rw-r--r-- toolkit/components/reader/Readability.js | 289
1 files changed, 126 insertions, 163 deletions
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index c2bba0cd3..69fb53f86 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -46,6 +46,7 @@ function Readability(doc, options) {
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
+ this._articleSiteName = null;
this._attempts = [];
// Configurable options
@@ -118,15 +119,18 @@ Readability.prototype = {
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
+ // NOTE: These two regular expressions are duplicated in
+ // Readability-readerable.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
+
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
- videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+ videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/,
@@ -267,7 +271,7 @@ Readability.prototype = {
_getAllNodesWithTag: function(node, tagNames) {
if (node.querySelectorAll) {
- return node.querySelectorAll(tagNames.join(','));
+ return node.querySelectorAll(tagNames.join(","));
}
return [].concat.apply([], tagNames.map(function(tag) {
var collection = node.getElementsByTagName(tag);
@@ -327,7 +331,7 @@ Readability.prototype = {
return uri;
}
- var links = articleContent.getElementsByTagName("a");
+ var links = this._getAllNodesWithTag(articleContent, ["a"]);
this._forEachNode(links, function(link) {
var href = link.getAttribute("href");
if (href) {
@@ -342,7 +346,7 @@ Readability.prototype = {
}
});
- var imgs = articleContent.getElementsByTagName("img");
+ var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
this._forEachNode(imgs, function(img) {
var src = img.getAttribute("src");
if (src) {
@@ -366,7 +370,7 @@ Readability.prototype = {
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
- curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
+ curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
} catch (e) {/* ignore exceptions setting the title. */}
var titleHadHierarchicalSeparators = false;
@@ -377,18 +381,18 @@ Readability.prototype = {
// If there's a separator in the title, first remove the final part
if ((/ [\|\-\\\/>»] /).test(curTitle)) {
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
- curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
+ curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
// If the resulting title is too short (3 words or fewer), remove
// the first part instead:
if (wordCount(curTitle) < 3)
- curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
- } else if (curTitle.indexOf(': ') !== -1) {
+ curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
+ } else if (curTitle.indexOf(": ") !== -1) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
var headings = this._concatNodeLists(
- doc.getElementsByTagName('h1'),
- doc.getElementsByTagName('h2')
+ doc.getElementsByTagName("h1"),
+ doc.getElementsByTagName("h2")
);
var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
@@ -397,25 +401,25 @@ Readability.prototype = {
// If we don't, let's extract the title out of the original title string.
if (!match) {
- curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
+ curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
// If the title is now too short, try the first colon instead:
if (wordCount(curTitle) < 3) {
- curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
+ curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
// But if we have too many words before the colon there's something weird
// with the titles and the H tags so let's just use the original title instead
- } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
+ } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
curTitle = origTitle;
}
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
- var hOnes = doc.getElementsByTagName('h1');
+ var hOnes = doc.getElementsByTagName("h1");
if (hOnes.length === 1)
curTitle = this._getInnerText(hOnes[0]);
}
- curTitle = curTitle.trim();
+ curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
// If we now have 4 words or fewer as our title, and either no
// 'hierarchical' separators (\, /, > or ») were found in the original
// title or we decreased the number of words by more than 1 word, use
@@ -505,7 +509,8 @@ Readability.prototype = {
break;
}
- if (!this._isPhrasingContent(next)) break;
+ if (!this._isPhrasingContent(next))
+ break;
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
@@ -513,9 +518,12 @@ Readability.prototype = {
next = sibling;
}
- while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+ while (p.lastChild && this._isWhitespace(p.lastChild)) {
+ p.removeChild(p.lastChild);
+ }
- if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
+ if (p.parentNode.tagName === "P")
+ this._setNodeTag(p.parentNode, "DIV");
}
});
},
@@ -576,7 +584,7 @@ Readability.prototype = {
// If there is only one h2 and its text content substantially equals article title,
// they are probably using it as a header and not a subheader,
// so remove it since we already extract the title separately.
- var h2 = articleContent.getElementsByTagName('h2');
+ var h2 = articleContent.getElementsByTagName("h2");
if (h2.length === 1) {
var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
if (Math.abs(lengthSimilarRate) < 0.5) {
@@ -606,12 +614,12 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs
- this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
- var imgCount = paragraph.getElementsByTagName('img').length;
- var embedCount = paragraph.getElementsByTagName('embed').length;
- var objectCount = paragraph.getElementsByTagName('object').length;
+ this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
+ var imgCount = paragraph.getElementsByTagName("img").length;
+ var embedCount = paragraph.getElementsByTagName("embed").length;
+ var objectCount = paragraph.getElementsByTagName("object").length;
// At this point, nasty iframes have been removed, only remain embedded video ones.
- var iframeCount = paragraph.getElementsByTagName('iframe').length;
+ var iframeCount = paragraph.getElementsByTagName("iframe").length;
var totalCount = imgCount + embedCount + objectCount + iframeCount;
return totalCount === 0 && !this._getInnerText(paragraph, false);
@@ -648,34 +656,34 @@ Readability.prototype = {
node.readability = {"contentScore": 0};
switch (node.tagName) {
- case 'DIV':
+ case "DIV":
node.readability.contentScore += 5;
break;
- case 'PRE':
- case 'TD':
- case 'BLOCKQUOTE':
+ case "PRE":
+ case "TD":
+ case "BLOCKQUOTE":
node.readability.contentScore += 3;
break;
- case 'ADDRESS':
- case 'OL':
- case 'UL':
- case 'DL':
- case 'DD':
- case 'DT':
- case 'LI':
- case 'FORM':
+ case "ADDRESS":
+ case "OL":
+ case "UL":
+ case "DL":
+ case "DD":
+ case "DT":
+ case "LI":
+ case "FORM":
node.readability.contentScore -= 3;
break;
- case 'H1':
- case 'H2':
- case 'H3':
- case 'H4':
- case 'H5':
- case 'H6':
- case 'TH':
+ case "H1":
+ case "H2":
+ case "H3":
+ case "H4":
+ case "H5":
+ case "H6":
+ case "TH":
node.readability.contentScore -= 5;
break;
}
@@ -824,12 +832,14 @@ Readability.prototype = {
if (p !== null) {
p.appendChild(childNode);
} else if (!this._isWhitespace(childNode)) {
- p = doc.createElement('p');
+ p = doc.createElement("p");
node.replaceChild(p, childNode);
p.appendChild(childNode);
}
} else if (p !== null) {
- while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+ while (p.lastChild && this._isWhitespace(p.lastChild)) {
+ p.removeChild(p.lastChild);
+ }
p = null;
}
childNode = nextSibling;
@@ -860,7 +870,7 @@ Readability.prototype = {
**/
var candidates = [];
this._forEachNode(elementsToScore, function(elementToScore) {
- if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
+ if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
return;
// If this paragraph is less than 25 characters, don't even count it.
@@ -879,17 +889,17 @@ Readability.prototype = {
contentScore += 1;
// Add points for any commas within this paragraph.
- contentScore += innerText.split(',').length;
+ contentScore += innerText.split(",").length;
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
- if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
+ if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
return;
- if (typeof(ancestor.readability) === 'undefined') {
+ if (typeof(ancestor.readability) === "undefined") {
this._initializeNode(ancestor);
candidates.push(ancestor);
}
@@ -920,7 +930,7 @@ Readability.prototype = {
var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
candidate.readability.contentScore = candidateScore;
- this.log('Candidate:', candidate, "with score " + candidateScore);
+ this.log("Candidate:", candidate, "with score " + candidateScore);
for (var t = 0; t < this._nbTopCandidates; t++) {
var aTopCandidate = topCandidates[t];
@@ -1039,8 +1049,8 @@ Readability.prototype = {
var sibling = siblings[s];
var append = false;
- this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : '');
- this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');
+ this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
+ this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
if (sibling === topCandidate) {
append = true;
@@ -1074,7 +1084,7 @@ Readability.prototype = {
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
- this.log("Altering sibling:", sibling, 'to div.');
+ this.log("Altering sibling:", sibling, "to div.");
sibling = this._setNodeTag(sibling, "DIV");
}
@@ -1142,7 +1152,7 @@ Readability.prototype = {
this._attempts.push({articleContent: articleContent, textLength: textLength});
// No luck after removing flags, just return the longest text we found during the different loops
this._attempts.sort(function (a, b) {
- return a.textLength < b.textLength;
+ return b.textLength - a.textLength;
});
// But first check if we actually have something
@@ -1182,7 +1192,7 @@ Readability.prototype = {
* @return Boolean - whether the input string is a byline.
*/
_isValidByline: function(byline) {
- if (typeof byline == 'string' || byline instanceof String) {
+ if (typeof byline == "string" || byline instanceof String) {
byline = byline.trim();
return (byline.length > 0) && (byline.length < 100);
}
@@ -1199,62 +1209,73 @@ Readability.prototype = {
var values = {};
var metaElements = this._doc.getElementsByTagName("meta");
- // Match "description", or Twitter's "twitter:description" (Cards)
- // in name attribute.
- var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi;
+ // property is a space-separated list of values
+ var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi;
- // Match Facebook's Open Graph title & description properties.
- var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
+ // name is a single value
+ var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
// Find description tags.
this._forEachNode(metaElements, function(element) {
var elementName = element.getAttribute("name");
var elementProperty = element.getAttribute("property");
+ var content = element.getAttribute("content");
+ var matches = null;
+ var name = null;
- if ([elementName, elementProperty].indexOf("author") !== -1) {
- metadata.byline = element.getAttribute("content");
- return;
+ if (elementProperty) {
+ matches = elementProperty.match(propertyPattern);
+ if (matches) {
+ for (var i = matches.length - 1; i >= 0; i--) {
+ // Convert to lowercase, and remove any whitespace
+ // so we can match below.
+ name = matches[i].toLowerCase().replace(/\s/g, "");
+ // multiple authors
+ values[name] = content.trim();
+ }
+ }
}
-
- var name = null;
- if (namePattern.test(elementName)) {
+ if (!matches && elementName && namePattern.test(elementName)) {
name = elementName;
- } else if (propertyPattern.test(elementProperty)) {
- name = elementProperty;
- }
-
- if (name) {
- var content = element.getAttribute("content");
if (content) {
- // Convert to lowercase and remove any whitespace
- // so we can match below.
- name = name.toLowerCase().replace(/\s/g, '');
+ // Convert to lowercase, remove any whitespace, and convert dots
+ // to colons so we can match below.
+ name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
values[name] = content.trim();
}
}
});
- if ("description" in values) {
- metadata.excerpt = values["description"];
- } else if ("og:description" in values) {
- // Use facebook open graph description.
- metadata.excerpt = values["og:description"];
- } else if ("twitter:description" in values) {
- // Use twitter cards description.
- metadata.excerpt = values["twitter:description"];
- }
+ // get title
+ metadata.title = values["dc:title"] ||
+ values["dcterm:title"] ||
+ values["og:title"] ||
+ values["weibo:article:title"] ||
+ values["weibo:webpage:title"] ||
+ values["title"] ||
+ values["twitter:title"];
- metadata.title = this._getArticleTitle();
if (!metadata.title) {
- if ("og:title" in values) {
- // Use facebook open graph title.
- metadata.title = values["og:title"];
- } else if ("twitter:title" in values) {
- // Use twitter cards title.
- metadata.title = values["twitter:title"];
- }
+ metadata.title = this._getArticleTitle();
}
+ // get author
+ metadata.byline = values["dc:creator"] ||
+ values["dcterm:creator"] ||
+ values["author"];
+
+ // get description
+ metadata.excerpt = values["dc:description"] ||
+ values["dcterm:description"] ||
+ values["og:description"] ||
+ values["weibo:article:description"] ||
+ values["weibo:webpage:description"] ||
+ values["description"] ||
+ values["twitter:description"];
+
+ // get site name
+ metadata.siteName = values["og:site_name"];
+
return metadata;
},
@@ -1264,12 +1285,12 @@ Readability.prototype = {
* @param Element
**/
_removeScripts: function(doc) {
- this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
+ this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
scriptNode.nodeValue = "";
- scriptNode.removeAttribute('src');
+ scriptNode.removeAttribute("src");
return true;
});
- this._removeNodes(doc.getElementsByTagName('noscript'));
+ this._removeNodes(doc.getElementsByTagName("noscript"));
},
/**
@@ -1336,7 +1357,7 @@ Readability.prototype = {
* @return string
**/
_getInnerText: function(e, normalizeSpaces) {
- normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
+ normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
var textContent = e.textContent.trim();
if (normalizeSpaces) {
@@ -1365,7 +1386,7 @@ Readability.prototype = {
* @return void
**/
_cleanStyles: function(e) {
- if (!e || e.tagName.toLowerCase() === 'svg')
+ if (!e || e.tagName.toLowerCase() === "svg")
return;
// Remove `style` and deprecated presentational attributes
@@ -1374,8 +1395,8 @@ Readability.prototype = {
}
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
- e.removeAttribute('width');
- e.removeAttribute('height');
+ e.removeAttribute("width");
+ e.removeAttribute("height");
}
var cur = e.firstElementChild;
@@ -1421,7 +1442,7 @@ Readability.prototype = {
var weight = 0;
// Look for a special classname
- if (typeof(e.className) === 'string' && e.className !== '') {
+ if (typeof(e.className) === "string" && e.className !== "") {
if (this.REGEXPS.negative.test(e.className))
weight -= 25;
@@ -1430,7 +1451,7 @@ Readability.prototype = {
}
// Look for a special ID
- if (typeof(e.id) === 'string' && e.id !== '') {
+ if (typeof(e.id) === "string" && e.id !== "") {
if (this.REGEXPS.negative.test(e.id))
weight -= 25;
@@ -1619,7 +1640,7 @@ Readability.prototype = {
return true;
}
- if (this._getCharCount(node, ',') < 10) {
+ if (this._getCharCount(node, ",") < 10) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
@@ -1679,7 +1700,7 @@ Readability.prototype = {
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
- this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
+ this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
return this._getClassWeight(header) < 0;
});
}
@@ -1694,66 +1715,7 @@ Readability.prototype = {
},
_isProbablyVisible: function(node) {
- return node.style.display != "none" && !node.hasAttribute("hidden");
- },
-
- /**
- * Decides whether or not the document is reader-able without parsing the whole thing.
- *
- * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
- */
- isProbablyReaderable: function(helperIsVisible) {
- var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
-
- // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
- // Some articles' DOM structures might look like
- // <div>
- // Sentences<br>
- // <br>
- // Sentences<br>
- // </div>
- var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
- if (brNodes.length) {
- var set = new Set();
- [].forEach.call(brNodes, function(node) {
- set.add(node.parentNode);
- });
- nodes = [].concat.apply(Array.from(set), nodes);
- }
-
- if (!helperIsVisible) {
- helperIsVisible = this._isProbablyVisible;
- }
-
- var score = 0;
- // This is a little cheeky, we use the accumulator 'score' to decide what to return from
- // this callback:
- return this._someNode(nodes, function(node) {
- if (helperIsVisible && !helperIsVisible(node))
- return false;
- var matchString = node.className + " " + node.id;
-
- if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
- !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
- return false;
- }
-
- if (node.matches && node.matches("li p")) {
- return false;
- }
-
- var textContentLength = node.textContent.trim().length;
- if (textContentLength < 140) {
- return false;
- }
-
- score += Math.sqrt(textContentLength - 140);
-
- if (score > 20) {
- return true;
- }
- return false;
- });
+ return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
},
/**
@@ -1812,6 +1774,7 @@ Readability.prototype = {
textContent: textContent,
length: textContent.length,
excerpt: metadata.excerpt,
+ siteName: metadata.siteName || this._articleSiteName
};
}
};