summaryrefslogtreecommitdiffstats
path: root/toolkit/components/reader/Readability.js
diff options
context:
space:
mode:
Diffstat (limited to 'toolkit/components/reader/Readability.js')
-rw-r--r--toolkit/components/reader/Readability.js853
1 files changed, 374 insertions, 479 deletions
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 491461a8e..04949dc61 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -38,32 +38,22 @@ function Readability(uri, doc, options) {
this._uri = uri;
this._doc = doc;
- this._biggestFrame = false;
+ this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
- // Configureable options
+ // Configurable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
- this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+ this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
+ this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
- // The list of pages we've parsed in this call of readability,
- // for autopaging. As a key store for easier searching.
- this._parsedPages = {};
-
- // A list of the ETag headers of pages we've parsed, in case they happen to match,
- // we'll know it's a duplicate.
- this._pageETags = {};
-
- // Make an AJAX request for each page and append it to the document.
- this._curPageNum = 1;
-
var logEl;
// Control whether log messages are sent to the console
@@ -82,12 +72,12 @@ function Readability(uri, doc, options) {
return rv + elDesc;
};
this.log = function () {
- if (typeof dump !== undefined) {
+ if (typeof dump !== "undefined") {
var msg = Array.prototype.map.call(arguments, function(x) {
return (x && x.nodeName) ? logEl(x) : x;
}).join(" ");
dump("Reader: (Readability) " + msg + "\n");
- } else if (typeof console !== undefined) {
+ } else if (typeof console !== "undefined") {
var args = ["Reader: (Readability) "].concat(arguments);
console.log.apply(console, args);
}
@@ -109,20 +99,19 @@ Readability.prototype = {
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
- // The maximum number of pages to loop through before we call
- // it quits and just show a link.
- DEFAULT_MAX_PAGES: 5,
-
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
+ // The default number of words an article must have in order to return a result
+ DEFAULT_WORD_THRESHOLD: 500,
+
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
+ unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
- negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+ negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
@@ -138,6 +127,13 @@ Readability.prototype = {
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
+ PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
+
+ DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+
+ // These are the classes that readability sets itself.
+ CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
+
/**
* Run any post-process modifications to article content as necessary.
*
@@ -147,6 +143,9 @@ Readability.prototype = {
_postProcessContent: function(articleContent) {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);
+
+ // Remove classes.
+ this._cleanClasses(articleContent);
},
/**
@@ -155,8 +154,8 @@ Readability.prototype = {
*
* If function is not passed, removes all the nodes in node list.
*
- * @param NodeList nodeList The no
- * @param Function filterFn
+ * @param NodeList nodeList The nodes to operate on
+ * @param Function filterFn the function to use as a filter
* @return void
*/
_removeNodes: function(nodeList, filterFn) {
@@ -172,6 +171,20 @@ Readability.prototype = {
},
/**
+ * Iterates over a NodeList, and calls _setNodeTag for each node.
+ *
+ * @param NodeList nodeList The nodes to operate on
+ * @param String newTagName the new tag name to use
+ * @return void
+ */
+ _replaceNodeTags: function(nodeList, newTagName) {
+ for (var i = nodeList.length - 1; i >= 0; i--) {
+ var node = nodeList[i];
+ this._setNodeTag(node, newTagName);
+ }
+ },
+
+ /**
* Iterate over a NodeList, which doesn't natively fully implement the Array
* interface.
*
@@ -180,10 +193,9 @@ Readability.prototype = {
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
- * @param Boolean backward Whether to use backward iteration.
* @return void
*/
- _forEachNode: function(nodeList, fn, backward) {
+ _forEachNode: function(nodeList, fn) {
Array.prototype.forEach.call(nodeList, fn, this);
},
@@ -228,6 +240,34 @@ Readability.prototype = {
},
/**
+ * Removes the class="" attribute from every element in the given
+ * subtree, except those that match CLASSES_TO_PRESERVE and
+ * the classesToPreserve array from the options object.
+ *
+ * @param Element
+ * @return void
+ */
+ _cleanClasses: function(node) {
+ var classesToPreserve = this._classesToPreserve;
+ var className = (node.getAttribute("class") || "")
+ .split(/\s+/)
+ .filter(function(cls) {
+ return classesToPreserve.indexOf(cls) != -1;
+ })
+ .join(" ");
+
+ if (className) {
+ node.setAttribute("class", className);
+ } else {
+ node.removeAttribute("class");
+ }
+
+ for (node = node.firstElementChild; node; node = node.nextElementSibling) {
+ this._cleanClasses(node);
+ }
+ },
+
+ /**
* Converts each <a> and <img> uri in the given element to an absolute URI,
* ignoring #ref URIs.
*
@@ -307,11 +347,20 @@ Readability.prototype = {
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
} catch (e) {/* ignore exceptions setting the title. */}
- if (curTitle.match(/ [\|\-] /)) {
- curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
+ var titleHadHierarchicalSeparators = false;
+ function wordCount(str) {
+ return str.split(/\s+/).length;
+ }
+
+ // If there's a separator in the title, first remove the final part
+ if ((/ [\|\-\\\/>»] /).test(curTitle)) {
+ titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
+ curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
- if (curTitle.split(' ').length < 3)
- curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
+ // If the resulting title is too short (3 words or fewer), remove
+ // the first part instead:
+ if (wordCount(curTitle) < 3)
+ curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
} else if (curTitle.indexOf(': ') !== -1) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
@@ -328,8 +377,13 @@ Readability.prototype = {
curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
// If the title is now too short, try the first colon instead:
- if (curTitle.split(' ').length < 3)
+ if (wordCount(curTitle) < 3) {
curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
+ // But if we have too many words before the colon there's something weird
+ // with the titles and the H tags so let's just use the original title instead
+ } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
+ curTitle = origTitle;
+ }
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
var hOnes = doc.getElementsByTagName('h1');
@@ -339,9 +393,16 @@ Readability.prototype = {
}
curTitle = curTitle.trim();
-
- if (curTitle.split(' ').length <= 4)
+ // If we now have 4 words or fewer as our title, and either no
+ // 'hierarchical' separators (\, /, > or ») were found in the original
+ // title or we decreased the number of words by more than 1 word, use
+ // the original title.
+ var curTitleWordCount = wordCount(curTitle);
+ if (curTitleWordCount <= 4 &&
+ (!titleHadHierarchicalSeparators ||
+ curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
curTitle = origTitle;
+ }
return curTitle;
},
@@ -362,9 +423,7 @@ Readability.prototype = {
this._replaceBrs(doc.body);
}
- this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
- this._setNodeTag(fontNode, "SPAN");
- });
+ this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN");
},
/**
@@ -464,19 +523,49 @@ Readability.prototype = {
_prepArticle: function(articleContent) {
this._cleanStyles(articleContent);
+ // Check for data tables before we continue, to avoid removing items in
+ // those tables, which will often be isolated even though they're
+ // visually linked to other content-ful elements (text, images, etc.).
+ this._markDataTables(articleContent);
+
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
+ this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
- // If there is only one h2, they are probably using it as a header
- // and not a subheader, so remove it since we already have a header.
- if (articleContent.getElementsByTagName('h2').length === 1)
- this._clean(articleContent, "h2");
+ // Clean out elements have "share" in their id/class combinations from final top candidates,
+ // which means we don't remove the top candidates even they have "share".
+ this._forEachNode(articleContent.children, function(topCandidate) {
+ this._cleanMatchedNodes(topCandidate, /share/);
+ });
+
+ // If there is only one h2 and its text content substantially equals article title,
+ // they are probably using it as a header and not a subheader,
+ // so remove it since we already extract the title separately.
+ var h2 = articleContent.getElementsByTagName('h2');
+ if (h2.length === 1) {
+ var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
+ if (Math.abs(lengthSimilarRate) < 0.5) {
+ var titlesMatch = false;
+ if (lengthSimilarRate > 0) {
+ titlesMatch = h2[0].textContent.includes(this._articleTitle);
+ } else {
+ titlesMatch = this._articleTitle.includes(h2[0].textContent);
+ }
+ if (titlesMatch) {
+ this._clean(articleContent, "h2");
+ }
+ }
+ }
this._clean(articleContent, "iframe");
+ this._clean(articleContent, "input");
+ this._clean(articleContent, "textarea");
+ this._clean(articleContent, "select");
+ this._clean(articleContent, "button");
this._cleanHeaders(articleContent);
// Do these last as the previous stuff may have removed junk
@@ -662,9 +751,6 @@ Readability.prototype = {
var pageCacheHtml = page.innerHTML;
- // Check if any "dir" is set on the toplevel document element
- this._articleDir = doc.documentElement.getAttribute("dir");
-
while (true) {
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
@@ -695,6 +781,15 @@ Readability.prototype = {
}
}
+ // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
+ if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
+ node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
+ node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
+ this._isElementWithoutContent(node)) {
+ node = this._removeAndGetNext(node);
+ continue;
+ }
+
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
elementsToScore.push(node);
}
@@ -709,13 +804,14 @@ Readability.prototype = {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
+ elementsToScore.push(node);
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
} else {
// EXPERIMENTAL
this._forEachNode(node.childNodes, function(childNode) {
- if (childNode.nodeType === Node.TEXT_NODE) {
+ if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
@@ -812,6 +908,7 @@ Readability.prototype = {
var topCandidate = topCandidates[0] || null;
var neededToCreateTopCandidate = false;
+ var parentOfTopCandidate;
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
@@ -831,6 +928,33 @@ Readability.prototype = {
this._initializeNode(topCandidate);
} else if (topCandidate) {
+ // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
+ // and whose scores are quite closed with current `topCandidate` node.
+ var alternativeCandidateAncestors = [];
+ for (var i = 1; i < topCandidates.length; i++) {
+ if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
+ alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
+ }
+ }
+ var MINIMUM_TOPCANDIDATES = 3;
+ if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
+ parentOfTopCandidate = topCandidate.parentNode;
+ while (parentOfTopCandidate.tagName !== "BODY") {
+ var listsContainingThisAncestor = 0;
+ for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
+ listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
+ }
+ if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
+ topCandidate = parentOfTopCandidate;
+ break;
+ }
+ parentOfTopCandidate = parentOfTopCandidate.parentNode;
+ }
+ }
+ if (!topCandidate.readability) {
+ this._initializeNode(topCandidate);
+ }
+
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
@@ -838,11 +962,15 @@ Readability.prototype = {
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
- var parentOfTopCandidate = topCandidate.parentNode;
+ parentOfTopCandidate = topCandidate.parentNode;
var lastScore = topCandidate.readability.contentScore;
// The scores shouldn't get too low.
var scoreThreshold = lastScore / 3;
- while (parentOfTopCandidate && parentOfTopCandidate.readability) {
+ while (parentOfTopCandidate.tagName !== "BODY") {
+ if (!parentOfTopCandidate.readability) {
+ parentOfTopCandidate = parentOfTopCandidate.parentNode;
+ continue;
+ }
var parentScore = parentOfTopCandidate.readability.contentScore;
if (parentScore < scoreThreshold)
break;
@@ -854,6 +982,17 @@ Readability.prototype = {
lastScore = parentOfTopCandidate.readability.contentScore;
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
+
+ // If the top candidate is the only child, use parent instead. This will help sibling
+ // joining logic when adjacent content is actually located in parent's sibling node.
+ parentOfTopCandidate = topCandidate.parentNode;
+ while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
+ topCandidate = parentOfTopCandidate;
+ parentOfTopCandidate = topCandidate.parentNode;
+ }
+ if (!topCandidate.readability) {
+ this._initializeNode(topCandidate);
+ }
}
// Now that we have the top candidate, look through its siblings for content
@@ -864,7 +1003,9 @@ Readability.prototype = {
articleContent.id = "readability-content";
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
- var siblings = topCandidate.parentNode.children;
+ // Keep potential top candidate's parent node to try to get text direction of it later.
+ parentOfTopCandidate = topCandidate.parentNode;
+ var siblings = parentOfTopCandidate.children;
for (var s = 0, sl = siblings.length; s < sl; s++) {
var sibling = siblings[s];
@@ -927,24 +1068,22 @@ Readability.prototype = {
if (this._debug)
this.log("Article content post-prep: " + articleContent.innerHTML);
- if (this._curPageNum === 1) {
- if (neededToCreateTopCandidate) {
- // We already created a fake div thing, and there wouldn't have been any siblings left
- // for the previous loop, so there's no point trying to create a new div, and then
- // move all the children over. Just assign IDs and class names here. No need to append
- // because that already happened anyway.
- topCandidate.id = "readability-page-1";
- topCandidate.className = "page";
- } else {
- var div = doc.createElement("DIV");
- div.id = "readability-page-1";
- div.className = "page";
- var children = articleContent.childNodes;
- while (children.length) {
- div.appendChild(children[0]);
- }
- articleContent.appendChild(div);
+ if (neededToCreateTopCandidate) {
+ // We already created a fake div thing, and there wouldn't have been any siblings left
+ // for the previous loop, so there's no point trying to create a new div, and then
+ // move all the children over. Just assign IDs and class names here. No need to append
+ // because that already happened anyway.
+ topCandidate.id = "readability-page-1";
+ topCandidate.className = "page";
+ } else {
+ var div = doc.createElement("DIV");
+ div.id = "readability-page-1";
+ div.className = "page";
+ var children = articleContent.childNodes;
+ while (children.length) {
+ div.appendChild(children[0]);
}
+ articleContent.appendChild(div);
}
if (this._debug)
@@ -955,7 +1094,7 @@ Readability.prototype = {
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
- if (this._getInnerText(articleContent, true).length < 500) {
+ if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
@@ -968,6 +1107,18 @@ Readability.prototype = {
return null;
}
} else {
+ // Find out text direction from ancestors of final top candidate.
+ var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
+ this._someNode(ancestors, function(ancestor) {
+ if (!ancestor.tagName)
+ return false;
+ var articleDir = ancestor.getAttribute("dir");
+ if (articleDir) {
+ this._articleDir = articleDir;
+ return true;
+ }
+ return false;
+ });
return articleContent;
}
}
@@ -1044,12 +1195,15 @@ Readability.prototype = {
metadata.excerpt = values["twitter:description"];
}
- if ("og:title" in values) {
- // Use facebook open graph title.
- metadata.title = values["og:title"];
- } else if ("twitter:title" in values) {
- // Use twitter cards title.
- metadata.title = values["twitter:title"];
+ metadata.title = this._getArticleTitle();
+ if (!metadata.title) {
+ if ("og:title" in values) {
+ // Use facebook open graph title.
+ metadata.title = values["og:title"];
+ } else if ("twitter:title" in values) {
+ // Use twitter cards title.
+ metadata.title = values["twitter:title"];
+ }
}
return metadata;
@@ -1089,6 +1243,13 @@ Readability.prototype = {
});
},
+ _isElementWithoutContent: function(node) {
+ return node.nodeType === Node.ELEMENT_NODE &&
+ node.textContent.trim().length == 0 &&
+ (node.children.length == 0 ||
+ node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
+ },
+
/**
* Determine whether element has any children block level elements.
*
@@ -1139,26 +1300,25 @@ Readability.prototype = {
* @return void
**/
_cleanStyles: function(e) {
- e = e || this._doc;
- if (!e)
+ if (!e || e.tagName.toLowerCase() === 'svg')
return;
- var cur = e.firstChild;
- // Remove any root styles, if we're able.
- if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
- e.removeAttribute('style');
-
- // Go until there are no more child nodes
- while (cur !== null) {
- if (cur.nodeType === cur.ELEMENT_NODE) {
- // Remove style attribute(s) :
- if (cur.className !== "readability-styled")
- cur.removeAttribute("style");
+ if (e.className !== 'readability-styled') {
+ // Remove `style` and deprecated presentational attributes
+ for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+ e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
+ }
- this._cleanStyles(cur);
+ if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+ e.removeAttribute('width');
+ e.removeAttribute('height');
}
+ }
- cur = cur.nextSibling;
+ var cur = e.firstElementChild;
+ while (cur !== null) {
+ this._cleanStyles(cur);
+ cur = cur.nextElementSibling;
}
},
@@ -1185,368 +1345,6 @@ Readability.prototype = {
},
/**
- * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
- *
- * @author Dan Lacy
- * @return string the base url
- **/
- _findBaseUrl: function() {
- var uri = this._uri;
- var noUrlParams = uri.path.split("?")[0];
- var urlSlashes = noUrlParams.split("/").reverse();
- var cleanedSegments = [];
- var possibleType = "";
-
- for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
- var segment = urlSlashes[i];
-
- // Split off and save anything that looks like a file type.
- if (segment.indexOf(".") !== -1) {
- possibleType = segment.split(".")[1];
-
- // If the type isn't alpha-only, it's probably not actually a file extension.
- if (!possibleType.match(/[^a-zA-Z]/))
- segment = segment.split(".")[0];
- }
-
- // EW-CMS specific segment replacement. Ugly.
- // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
- if (segment.indexOf(',00') !== -1)
- segment = segment.replace(',00', '');
-
- // If our first or second segment has anything looking like a page number, remove it.
- if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
- segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
-
- var del = false;
-
- // If this is purely a number, and it's the first or second segment,
- // it's probably a page number. Remove it.
- if (i < 2 && segment.match(/^\d{1,2}$/))
- del = true;
-
- // If this is the first segment and it's just "index", remove it.
- if (i === 0 && segment.toLowerCase() === "index")
- del = true;
-
- // If our first or second segment is smaller than 3 characters,
- // and the first segment was purely alphas, remove it.
- if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
- del = true;
-
- // If it's not marked for deletion, push it to cleanedSegments.
- if (!del)
- cleanedSegments.push(segment);
- }
-
- // This is our final, cleaned, base article URL.
- return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
- },
-
- /**
- * Look for any paging links that may occur within the document.
- *
- * @param body
- * @return object (array)
- **/
- _findNextPageLink: function(elem) {
- var uri = this._uri;
- var possiblePages = {};
- var allLinks = elem.getElementsByTagName('a');
- var articleBaseUrl = this._findBaseUrl();
-
- // Loop through all links, looking for hints that they may be next-page links.
- // Things like having "page" in their textContent, className or id, or being a child
- // of a node with a page-y className or id.
- //
- // Also possible: levenshtein distance? longest common subsequence?
- //
- // After we do that, assign each page a score, and
- for (var i = 0, il = allLinks.length; i < il; i += 1) {
- var link = allLinks[i];
- var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
-
- // If we've already seen this page, ignore it.
- if (linkHref === "" ||
- linkHref === articleBaseUrl ||
- linkHref === uri.spec ||
- linkHref in this._parsedPages) {
- continue;
- }
-
- // If it's on a different domain, skip it.
- if (uri.host !== linkHref.split(/\/+/g)[1])
- continue;
-
- var linkText = this._getInnerText(link);
-
- // If the linkText looks like it's not the next page, skip it.
- if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
- continue;
-
- // If the leftovers of the URL after removing the base URL don't contain
- // any digits, it's certainly not a next page link.
- var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
- if (!linkHrefLeftover.match(/\d/))
- continue;
-
- if (!(linkHref in possiblePages)) {
- possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
- } else {
- possiblePages[linkHref].linkText += ' | ' + linkText;
- }
-
- var linkObj = possiblePages[linkHref];
-
- // If the articleBaseUrl isn't part of this URL, penalize this link. It could
- // still be the link, but the odds are lower.
- // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
- if (linkHref.indexOf(articleBaseUrl) !== 0)
- linkObj.score -= 25;
-
- var linkData = linkText + ' ' + link.className + ' ' + link.id;
- if (linkData.match(this.REGEXPS.nextLink))
- linkObj.score += 50;
-
- if (linkData.match(/pag(e|ing|inat)/i))
- linkObj.score += 25;
-
- if (linkData.match(/(first|last)/i)) {
- // -65 is enough to negate any bonuses gotten from a > or » in the text,
- // If we already matched on "next", last is probably fine.
- // If we didn't, then it's bad. Penalize.
- if (!linkObj.linkText.match(this.REGEXPS.nextLink))
- linkObj.score -= 65;
- }
-
- if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
- linkObj.score -= 50;
-
- if (linkData.match(this.REGEXPS.prevLink))
- linkObj.score -= 200;
-
- // If a parentNode contains page or paging or paginat
- var parentNode = link.parentNode;
- var positiveNodeMatch = false;
- var negativeNodeMatch = false;
-
- while (parentNode) {
- var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
-
- if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
- positiveNodeMatch = true;
- linkObj.score += 25;
- }
-
- if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
- // If this is just something like "footer", give it a negative.
- // If it's something like "body-and-footer", leave it be.
- if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
- linkObj.score -= 25;
- negativeNodeMatch = true;
- }
- }
-
- parentNode = parentNode.parentNode;
- }
-
- // If the URL looks like it has paging in it, add to the score.
- // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
- if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
- linkObj.score += 25;
-
- // If the URL contains negative values, give a slight decrease.
- if (linkHref.match(this.REGEXPS.extraneous))
- linkObj.score -= 15;
-
- /**
- * Minor punishment to anything that doesn't match our current URL.
- * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
- * Dan, can you show me a counterexample where this is necessary?
- * if (linkHref.indexOf(window.location.href) !== 0) {
- * linkObj.score -= 1;
- * }
- **/
-
- // If the link text can be parsed as a number, give it a minor bonus, with a slight
- // bias towards lower numbered pages. This is so that pages that might not have 'next'
- // in their text can still get scored, and sorted properly by score.
- var linkTextAsNumber = parseInt(linkText, 10);
- if (linkTextAsNumber) {
- // Punish 1 since we're either already there, or it's probably
- // before what we want anyways.
- if (linkTextAsNumber === 1) {
- linkObj.score -= 10;
- } else {
- linkObj.score += Math.max(0, 10 - linkTextAsNumber);
- }
- }
- }
-
- // Loop thrugh all of our possible pages from above and find our top
- // candidate for the next page URL. Require at least a score of 50, which
- // is a relatively high confidence that this page is the next link.
- var topPage = null;
- for (var page in possiblePages) {
- if (possiblePages.hasOwnProperty(page)) {
- if (possiblePages[page].score >= 50 &&
- (!topPage || topPage.score < possiblePages[page].score))
- topPage = possiblePages[page];
- }
- }
-
- var nextHref = null;
- if (topPage) {
- nextHref = topPage.href.replace(/\/$/, '');
-
- this.log('NEXT PAGE IS ' + nextHref);
- this._parsedPages[nextHref] = true;
- }
- return nextHref;
- },
-
- _successfulRequest: function(request) {
- return (request.status >= 200 && request.status < 300) ||
- request.status === 304 ||
- (request.status === 0 && request.responseText);
- },
-
- _ajax: function(url, options) {
- var request = new XMLHttpRequest();
-
- function respondToReadyState(readyState) {
- if (request.readyState === 4) {
- if (this._successfulRequest(request)) {
- if (options.success)
- options.success(request);
- } else if (options.error) {
- options.error(request);
- }
- }
- }
-
- if (typeof options === 'undefined')
- options = {};
-
- request.onreadystatechange = respondToReadyState;
-
- request.open('get', url, true);
- request.setRequestHeader('Accept', 'text/html');
-
- try {
- request.send(options.postBody);
- } catch (e) {
- if (options.error)
- options.error();
- }
-
- return request;
- },
-
- _appendNextPage: function(nextPageLink) {
- var doc = this._doc;
- this._curPageNum += 1;
-
- var articlePage = doc.createElement("DIV");
- articlePage.id = 'readability-page-' + this._curPageNum;
- articlePage.className = 'page';
- articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
-
- doc.getElementById("readability-content").appendChild(articlePage);
-
- if (this._curPageNum > this._maxPages) {
- var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
- articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
- return;
- }
-
- // Now that we've built the article page DOM element, get the page content
- // asynchronously and load the cleaned content into the div we created for it.
- (function(pageUrl, thisPage) {
- this._ajax(pageUrl, {
- success: function(r) {
-
- // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
- var eTag = r.getResponseHeader('ETag');
- if (eTag) {
- if (eTag in this._pageETags) {
- this.log("Exact duplicate page found via ETag. Aborting.");
- articlePage.style.display = 'none';
- return;
- }
- this._pageETags[eTag] = 1;
- }
-
- // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
- var page = doc.createElement("DIV");
-
- // Do some preprocessing to our HTML to make it ready for appending.
- // - Remove any script tags. Swap and reswap newlines with a unicode
- // character because multiline regex doesn't work in javascript.
- // - Turn any noscript tags into divs so that we can parse them. This
- // allows us to find any next page links hidden via javascript.
- // - Turn all double br's into p's - was handled by prepDocument in the original view.
- // Maybe in the future abstract out prepDocument to work for both the original document
- // and AJAX-added pages.
- var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
- responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
-
- page.innerHTML = responseHtml;
- this._replaceBrs(page);
-
- // Reset all flags for the next page, as they will search through it and
- // disable as necessary at the end of grabArticle.
- this._flags = 0x1 | 0x2 | 0x4;
-
- var secondNextPageLink = this._findNextPageLink(page);
-
- // NOTE: if we end up supporting _appendNextPage(), we'll need to
- // change this call to be async
- var content = this._grabArticle(page);
-
- if (!content) {
- this.log("No content found in page to append. Aborting.");
- return;
- }
-
- // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
- // Compare it against all of the the previous document's we've gotten. If the previous
- // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
- var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
- if (firstP && firstP.innerHTML.length > 100) {
- for (var i = 1; i <= this._curPageNum; i += 1) {
- var rPage = doc.getElementById('readability-page-' + i);
- if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
- this.log('Duplicate of page ' + i + ' - skipping.');
- articlePage.style.display = 'none';
- this._parsedPages[pageUrl] = true;
- return;
- }
- }
- }
-
- this._removeScripts(content);
-
- thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
-
- // After the page has rendered, post process the content. This delay is necessary because,
- // in webkit at least, offsetWidth is not set in time to determine image width. We have to
- // wait a little bit for reflow to finish before we can fix floating images.
- setTimeout((function() {
- this._postProcessContent(thisPage);
- }).bind(this), 500);
-
-
- if (secondNextPageLink)
- this._appendNextPage(secondNextPageLink);
- }
- });
- }).bind(this)(nextPageLink, articlePage);
- },
-
- /**
* Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
*
@@ -1617,16 +1415,17 @@ Readability.prototype = {
* @param HTMLElement node
* @param String tagName
* @param Number maxDepth
+ * @param Function filterFn a filter to invoke to determine whether this node 'counts'
* @return Boolean
*/
- _hasAncestorTag: function(node, tagName, maxDepth) {
+ _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
maxDepth = maxDepth || 3;
tagName = tagName.toUpperCase();
var depth = 0;
while (node.parentNode) {
- if (depth > maxDepth)
+ if (maxDepth > 0 && depth > maxDepth)
return false;
- if (node.parentNode.tagName === tagName)
+ if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
return true;
node = node.parentNode;
depth++;
@@ -1635,6 +1434,93 @@ Readability.prototype = {
},
/**
+ * Return an object indicating how many rows and columns this table has.
+ */
+ _getRowAndColumnCount: function(table) {
+ var rows = 0;
+ var columns = 0;
+ var trs = table.getElementsByTagName("tr");
+ for (var i = 0; i < trs.length; i++) {
+ var rowspan = trs[i].getAttribute("rowspan") || 0;
+ if (rowspan) {
+ rowspan = parseInt(rowspan, 10);
+ }
+ rows += (rowspan || 1);
+
+ // Now look for column-related info
+ var columnsInThisRow = 0;
+ var cells = trs[i].getElementsByTagName("td");
+ for (var j = 0; j < cells.length; j++) {
+ var colspan = cells[j].getAttribute("colspan") || 0;
+ if (colspan) {
+ colspan = parseInt(colspan, 10);
+ }
+ columnsInThisRow += (colspan || 1);
+ }
+ columns = Math.max(columns, columnsInThisRow);
+ }
+ return {rows: rows, columns: columns};
+ },
+
+ /**
+ * Look for 'data' (as opposed to 'layout') tables, for which we use
+ * similar checks as
+ * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+ */
+ _markDataTables: function(root) {
+ var tables = root.getElementsByTagName("table");
+ for (var i = 0; i < tables.length; i++) {
+ var table = tables[i];
+ var role = table.getAttribute("role");
+ if (role == "presentation") {
+ table._readabilityDataTable = false;
+ continue;
+ }
+ var datatable = table.getAttribute("datatable");
+ if (datatable == "0") {
+ table._readabilityDataTable = false;
+ continue;
+ }
+ var summary = table.getAttribute("summary");
+ if (summary) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ var caption = table.getElementsByTagName("caption")[0];
+ if (caption && caption.childNodes.length > 0) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ // If the table has a descendant with any of these tags, consider a data table:
+ var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
+ var descendantExists = function(tag) {
+ return !!table.getElementsByTagName(tag)[0];
+ };
+ if (dataTableDescendants.some(descendantExists)) {
+ this.log("Data table because found data-y descendant");
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ // Nested tables indicate a layout table:
+ if (table.getElementsByTagName("table")[0]) {
+ table._readabilityDataTable = false;
+ continue;
+ }
+
+ var sizeInfo = this._getRowAndColumnCount(table);
+ if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+ // Now just go by size entirely:
+ table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
+ }
+ },
+
+ /**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
*
@@ -1652,6 +1538,15 @@ Readability.prototype = {
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(e.getElementsByTagName(tag), function(node) {
+ // First check if we're in a data table, in which case don't remove us.
+ var isDataTable = function(t) {
+ return t._readabilityDataTable;
+ };
+
+ if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
+ return false;
+ }
+
var weight = this._getClassWeight(node);
var contentScore = 0;
@@ -1667,7 +1562,7 @@ Readability.prototype = {
// ominous signs, remove the element.
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
- var li = node.getElementsByTagName("li").length-100;
+ var li = node.getElementsByTagName("li").length - 100;
var input = node.getElementsByTagName("input").length;
var embedCount = 0;
@@ -1681,11 +1576,10 @@ Readability.prototype = {
var contentLength = this._getInnerText(node).length;
var haveToRemove =
- // Make an exception for elements with no p's and exactly 1 img.
- (img > p && !this._hasAncestorTag(node, "figure")) ||
+ (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
- (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
+ (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
@@ -1696,6 +1590,25 @@ Readability.prototype = {
},
/**
+ * Clean out elements whose id/class combinations match specific string.
+ *
+ * @param Element
+ * @param RegExp match id/class combination.
+ * @return void
+ **/
+ _cleanMatchedNodes: function(e, regex) {
+ var endOfSearchMarkerNode = this._getNextNode(e, true);
+ var next = this._getNextNode(e);
+ while (next && next != endOfSearchMarkerNode) {
+ if (regex.test(next.className + " " + next.id)) {
+ next = this._removeAndGetNext(next);
+ } else {
+ next = this._getNextNode(next);
+ }
+ }
+ },
+
+ /**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
* @param Element
@@ -1713,10 +1626,6 @@ Readability.prototype = {
return (this._flags & flag) > 0;
},
- _addFlag: function(flag) {
- this._flags = this._flags | flag;
- },
-
_removeFlag: function(flag) {
this._flags = this._flags & ~flag;
},
@@ -1807,20 +1716,10 @@ Readability.prototype = {
// Remove script tags from the document.
this._removeScripts(this._doc);
- // FIXME: Disabled multi-page article support for now as it
- // needs more work on infrastructure.
-
- // Make sure this document is added to the list of parsed pages first,
- // so we don't double up on the first page.
- // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
-
- // Pull out any possible next page link first.
- // var nextPageLink = this._findNextPageLink(doc.body);
-
this._prepDocument();
var metadata = this._getArticleMetadata();
- var articleTitle = metadata.title || this._getArticleTitle();
+ this._articleTitle = metadata.title;
var articleContent = this._grabArticle();
if (!articleContent)
@@ -1830,14 +1729,6 @@ Readability.prototype = {
this._postProcessContent(articleContent);
- // if (nextPageLink) {
- // // Append any additional pages after a small timeout so that people
- // // can start reading without having to wait for this to finish processing.
- // setTimeout((function() {
- // this._appendNextPage(nextPageLink);
- // }).bind(this), 500);
- // }
-
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
// the article's content.
@@ -1851,7 +1742,7 @@ Readability.prototype = {
var textContent = articleContent.textContent;
return {
uri: this._uri,
- title: articleTitle,
+ title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,
@@ -1861,3 +1752,7 @@ Readability.prototype = {
};
}
};
+
+if (typeof module === "object") {
+ module.exports = Readability;
+}