1 files changed, 374 insertions, 479 deletions
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 491461a8e..04949dc61 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -38,32 +38,22 @@ function Readability(uri, doc, options) {
 
   this._uri = uri;
   this._doc = doc;
-  this._biggestFrame = false;
+  this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
 
-  // Configureable options
+  // Configurable options
   this._debug = !!options.debug;
   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
-  this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+  this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
+  this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
 
   // Start with all flags set
   this._flags = this.FLAG_STRIP_UNLIKELYS |
                 this.FLAG_WEIGHT_CLASSES |
                 this.FLAG_CLEAN_CONDITIONALLY;
 
-  // The list of pages we've parsed in this call of readability,
-  // for autopaging. As a key store for easier searching.
-  this._parsedPages = {};
-
-  // A list of the ETag headers of pages we've parsed, in case they happen to match,
-  // we'll know it's a duplicate.
-  this._pageETags = {};
-
-  // Make an AJAX request for each page and append it to the document.
-  this._curPageNum = 1;
-
   var logEl;
 
   // Control whether log messages are sent to the console
@@ -82,12 +72,12 @@ function Readability(uri, doc, options) {
       return rv + elDesc;
     };
     this.log = function () {
-      if (typeof dump !== undefined) {
+      if (typeof dump !== "undefined") {
         var msg = Array.prototype.map.call(arguments, function(x) {
           return (x && x.nodeName) ? logEl(x) : x;
         }).join(" ");
         dump("Reader: (Readability) " + msg + "\n");
-      } else if (typeof console !== undefined) {
+      } else if (typeof console !== "undefined") {
         var args = ["Reader: (Readability) "].concat(arguments);
         console.log.apply(console, args);
       }
@@ -109,20 +99,19 @@ Readability.prototype = {
   // tight the competition is among candidates.
   DEFAULT_N_TOP_CANDIDATES: 5,
 
-  // The maximum number of pages to loop through before we call
-  // it quits and just show a link.
-  DEFAULT_MAX_PAGES: 5,
-
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
+  // The default number of characters an article must have in order to return a result
+  DEFAULT_WORD_THRESHOLD: 500,
+
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
+    unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
-    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby|p-author/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
@@ -138,6 +127,13 @@ Readability.prototype = {
 
   ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
 
+  PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
+
+  DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+
+  // These are the classes that readability sets itself.
+  CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
+
   /**
    * Run any post-process modifications to article content as necessary.
    *
@@ -147,6 +143,9 @@ Readability.prototype = {
   _postProcessContent: function(articleContent) {
     // Readability cannot open relative uris so we convert them to absolute uris.
     this._fixRelativeUris(articleContent);
+
+    // Remove classes.
+    this._cleanClasses(articleContent);
   },
 
   /**
@@ -155,8 +154,8 @@ Readability.prototype = {
    *
    * If function is not passed, removes all the nodes in node list.
    *
-   * @param NodeList nodeList The no
-   * @param Function filterFn
+   * @param NodeList nodeList The nodes to operate on
+   * @param Function filterFn the function to use as a filter
    * @return void
    */
   _removeNodes: function(nodeList, filterFn) {
@@ -172,6 +171,20 @@ Readability.prototype = {
   },
 
   /**
+   * Iterates over a NodeList, and calls _setNodeTag for each node.
+   *
+   * @param NodeList nodeList The nodes to operate on
+   * @param String newTagName the new tag name to use
+   * @return void
+   */
+  _replaceNodeTags: function(nodeList, newTagName) {
+    for (var i = nodeList.length - 1; i >= 0; i--) {
+      var node = nodeList[i];
+      this._setNodeTag(node, newTagName);
+    }
+  },
+
+  /**
    * Iterate over a NodeList, which doesn't natively fully implement the Array
    * interface.
    *
@@ -180,10 +193,9 @@ Readability.prototype = {
    *
    * @param  NodeList nodeList The NodeList.
    * @param  Function fn       The iterate function.
-   * @param  Boolean  backward Whether to use backward iteration.
    * @return void
    */
-  _forEachNode: function(nodeList, fn, backward) {
+  _forEachNode: function(nodeList, fn) {
     Array.prototype.forEach.call(nodeList, fn, this);
   },
 
@@ -228,6 +240,34 @@ Readability.prototype = {
   },
 
   /**
+   * Removes the class="" attribute from every element in the given
+   * subtree, except those that match CLASSES_TO_PRESERVE and
+   * the classesToPreserve array from the options object.
+   *
+   * @param Element node The root element of the subtree to clean
+   * @return void
+   */
+  _cleanClasses: function(node) {
+    var classesToPreserve = this._classesToPreserve;
+    var className = (node.getAttribute("class") || "")
+      .split(/\s+/)
+      .filter(function(cls) {
+        return classesToPreserve.indexOf(cls) != -1;
+      })
+      .join(" ");
+
+    if (className) {
+      node.setAttribute("class", className);
+    } else {
+      node.removeAttribute("class");
+    }
+
+    for (node = node.firstElementChild; node; node = node.nextElementSibling) {
+      this._cleanClasses(node);
+    }
+  },
+
+  /**
    * Converts each <a> and <img> uri in the given element to an absolute URI,
    * ignoring #ref URIs.
    *
@@ -307,11 +347,20 @@ Readability.prototype = {
         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
     } catch (e) {/* ignore exceptions setting the title. */}
 
-    if (curTitle.match(/ [\|\-] /)) {
-      curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
+    var titleHadHierarchicalSeparators = false;
+    function wordCount(str) {
+      return str.split(/\s+/).length;
+    }
+
+    // If there's a separator in the title, first remove the final part
+    if ((/ [\|\-\\\/>»] /).test(curTitle)) {
+      titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
+      curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
 
-      if (curTitle.split(' ').length < 3)
-        curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
+      // If the resulting title is too short (fewer than 3 words), remove
+      // the first part instead:
+      if (wordCount(curTitle) < 3)
+        curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
     } else if (curTitle.indexOf(': ') !== -1) {
       // Check if we have an heading containing this exact string, so we
       // could assume it's the full title.
@@ -328,8 +377,13 @@ Readability.prototype = {
         curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
 
         // If the title is now too short, try the first colon instead:
-        if (curTitle.split(' ').length < 3)
+        if (wordCount(curTitle) < 3) {
           curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
+          // But if we have too many words before the colon there's something weird
+          // with the titles and the H tags so let's just use the original title instead
+        } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
+          curTitle = origTitle;
+        }
       }
     } else if (curTitle.length > 150 || curTitle.length < 15) {
       var hOnes = doc.getElementsByTagName('h1');
@@ -339,9 +393,16 @@ Readability.prototype = {
     }
 
     curTitle = curTitle.trim();
-
-    if (curTitle.split(' ').length <= 4)
+    // If we now have 4 words or fewer as our title, and either no
+    // 'hierarchical' separators (\, /, > or ») were found in the original
+    // title or the word count did not drop by exactly 1 word compared to
+    // the original title (separators stripped), use the original title.
+    var curTitleWordCount = wordCount(curTitle);
+    if (curTitleWordCount <= 4 &&
+        (!titleHadHierarchicalSeparators ||
+         curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
       curTitle = origTitle;
+    }
 
     return curTitle;
   },
@@ -362,9 +423,7 @@ Readability.prototype = {
       this._replaceBrs(doc.body);
     }
 
-    this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
-      this._setNodeTag(fontNode, "SPAN");
-    });
+    this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN");
   },
 
   /**
@@ -464,19 +523,49 @@ Readability.prototype = {
   _prepArticle: function(articleContent) {
     this._cleanStyles(articleContent);
 
+    // Check for data tables before we continue, to avoid removing items in
+    // those tables, which will often be isolated even though they're
+    // visually linked to other content-ful elements (text, images, etc.).
+    this._markDataTables(articleContent);
+
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
+    this._cleanConditionally(articleContent, "fieldset");
     this._clean(articleContent, "object");
     this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
     this._clean(articleContent, "footer");
 
-    // If there is only one h2, they are probably using it as a header
-    // and not a subheader, so remove it since we already have a header.
-    if (articleContent.getElementsByTagName('h2').length === 1)
-      this._clean(articleContent, "h2");
+    // Clean out elements that have "share" in their id/class combinations from the final top candidates;
+    // the top candidates themselves are not removed, even if they have "share".
+    this._forEachNode(articleContent.children, function(topCandidate) {
+      this._cleanMatchedNodes(topCandidate, /share/);
+    });
+
+    // If there is only one h2 and its text content substantially equals article title,
+    // they are probably using it as a header and not a subheader,
+    // so remove it since we already extract the title separately.
+    var h2 = articleContent.getElementsByTagName('h2');
+    if (h2.length === 1) {
+      var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
+      if (Math.abs(lengthSimilarRate) < 0.5) {
+        var titlesMatch = false;
+        if (lengthSimilarRate > 0) {
+          titlesMatch = h2[0].textContent.includes(this._articleTitle);
+        } else {
+          titlesMatch = this._articleTitle.includes(h2[0].textContent);
+        }
+        if (titlesMatch) {
+          this._clean(articleContent, "h2");
+        }
+      }
+    }
 
     this._clean(articleContent, "iframe");
+    this._clean(articleContent, "input");
+    this._clean(articleContent, "textarea");
+    this._clean(articleContent, "select");
+    this._clean(articleContent, "button");
     this._cleanHeaders(articleContent);
 
     // Do these last as the previous stuff may have removed junk
@@ -662,9 +751,6 @@ Readability.prototype = {
 
     var pageCacheHtml = page.innerHTML;
 
-    // Check if any "dir" is set on the toplevel document element
-    this._articleDir = doc.documentElement.getAttribute("dir");
-
     while (true) {
       var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
 
@@ -695,6 +781,15 @@ Readability.prototype = {
           }
         }
 
+        // Remove DIV, SECTION, HEADER and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
+        if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
+             node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
+             node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
+            this._isElementWithoutContent(node)) {
+          node = this._removeAndGetNext(node);
+          continue;
+        }
+
         if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
           elementsToScore.push(node);
         }
@@ -709,13 +804,14 @@ Readability.prototype = {
             var newNode = node.children[0];
             node.parentNode.replaceChild(newNode, node);
             node = newNode;
+            elementsToScore.push(node);
           } else if (!this._hasChildBlockElement(node)) {
             node = this._setNodeTag(node, "P");
             elementsToScore.push(node);
           } else {
             // EXPERIMENTAL
             this._forEachNode(node.childNodes, function(childNode) {
-              if (childNode.nodeType === Node.TEXT_NODE) {
+              if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
                 var p = doc.createElement('p');
                 p.textContent = childNode.textContent;
                 p.style.display = 'inline';
@@ -812,6 +908,7 @@ Readability.prototype = {
 
       var topCandidate = topCandidates[0] || null;
       var neededToCreateTopCandidate = false;
+      var parentOfTopCandidate;
 
       // If we still have no top candidate, just use the body as a last resort.
       // We also have to copy the body node so it is something we can modify.
@@ -831,6 +928,33 @@ Readability.prototype = {
 
         this._initializeNode(topCandidate);
       } else if (topCandidate) {
+        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
+        // and whose scores are quite close to the current `topCandidate` node.
+        var alternativeCandidateAncestors = [];
+        for (var i = 1; i < topCandidates.length; i++) {
+          if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
+            alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
+          }
+        }
+        var MINIMUM_TOPCANDIDATES = 3;
+        if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
+          parentOfTopCandidate = topCandidate.parentNode;
+          while (parentOfTopCandidate.tagName !== "BODY") {
+            var listsContainingThisAncestor = 0;
+            for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
+              listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
+            }
+            if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
+              topCandidate = parentOfTopCandidate;
+              break;
+            }
+            parentOfTopCandidate = parentOfTopCandidate.parentNode;
+          }
+        }
+        if (!topCandidate.readability) {
+          this._initializeNode(topCandidate);
+        }
+
         // Because of our bonus system, parents of candidates might have scores
         // themselves. They get half of the node. There won't be nodes with higher
         // scores than our topCandidate, but if we see the score going *up* in the first
@@ -838,11 +962,15 @@ Readability.prototype = {
         // lurking in other places that we want to unify in. The sibling stuff
         // below does some of that - but only if we've looked high enough up the DOM
         // tree.
-        var parentOfTopCandidate = topCandidate.parentNode;
+        parentOfTopCandidate = topCandidate.parentNode;
         var lastScore = topCandidate.readability.contentScore;
         // The scores shouldn't get too low.
         var scoreThreshold = lastScore / 3;
-        while (parentOfTopCandidate && parentOfTopCandidate.readability) {
+        while (parentOfTopCandidate.tagName !== "BODY") {
+          if (!parentOfTopCandidate.readability) {
+            parentOfTopCandidate = parentOfTopCandidate.parentNode;
+            continue;
+          }
           var parentScore = parentOfTopCandidate.readability.contentScore;
           if (parentScore < scoreThreshold)
             break;
@@ -854,6 +982,17 @@ Readability.prototype = {
           lastScore = parentOfTopCandidate.readability.contentScore;
           parentOfTopCandidate = parentOfTopCandidate.parentNode;
         }
+
+        // If the top candidate is the only child, use parent instead. This will help sibling
+        // joining logic when adjacent content is actually located in parent's sibling node.
+        parentOfTopCandidate = topCandidate.parentNode;
+        while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
+          topCandidate = parentOfTopCandidate;
+          parentOfTopCandidate = topCandidate.parentNode;
+        }
+        if (!topCandidate.readability) {
+          this._initializeNode(topCandidate);
+        }
       }
 
       // Now that we have the top candidate, look through its siblings for content
@@ -864,7 +1003,9 @@ Readability.prototype = {
         articleContent.id = "readability-content";
 
       var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
-      var siblings = topCandidate.parentNode.children;
+      // Keep potential top candidate's parent node to try to get text direction of it later.
+      parentOfTopCandidate = topCandidate.parentNode;
+      var siblings = parentOfTopCandidate.children;
 
       for (var s = 0, sl = siblings.length; s < sl; s++) {
         var sibling = siblings[s];
@@ -927,24 +1068,22 @@ Readability.prototype = {
       if (this._debug)
         this.log("Article content post-prep: " + articleContent.innerHTML);
 
-      if (this._curPageNum === 1) {
-        if (neededToCreateTopCandidate) {
-          // We already created a fake div thing, and there wouldn't have been any siblings left
-          // for the previous loop, so there's no point trying to create a new div, and then
-          // move all the children over. Just assign IDs and class names here. No need to append
-          // because that already happened anyway.
-          topCandidate.id = "readability-page-1";
-          topCandidate.className = "page";
-        } else {
-          var div = doc.createElement("DIV");
-          div.id = "readability-page-1";
-          div.className = "page";
-          var children = articleContent.childNodes;
-          while (children.length) {
-            div.appendChild(children[0]);
-          }
-          articleContent.appendChild(div);
+      if (neededToCreateTopCandidate) {
+        // We already created a fake div thing, and there wouldn't have been any siblings left
+        // for the previous loop, so there's no point trying to create a new div, and then
+        // move all the children over. Just assign IDs and class names here. No need to append
+        // because that already happened anyway.
+        topCandidate.id = "readability-page-1";
+        topCandidate.className = "page";
+      } else {
+        var div = doc.createElement("DIV");
+        div.id = "readability-page-1";
+        div.className = "page";
+        var children = articleContent.childNodes;
+        while (children.length) {
+          div.appendChild(children[0]);
         }
+        articleContent.appendChild(div);
       }
 
       if (this._debug)
@@ -955,7 +1094,7 @@ Readability.prototype = {
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
-      if (this._getInnerText(articleContent, true).length < 500) {
+      if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
         page.innerHTML = pageCacheHtml;
 
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
@@ -968,6 +1107,18 @@ Readability.prototype = {
           return null;
         }
       } else {
+        // Find out text direction from ancestors of final top candidate.
+        var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
+        this._someNode(ancestors, function(ancestor) {
+          if (!ancestor.tagName)
+            return false;
+          var articleDir = ancestor.getAttribute("dir");
+          if (articleDir) {
+            this._articleDir = articleDir;
+            return true;
+          }
+          return false;
+        });
         return articleContent;
       }
     }
@@ -1044,12 +1195,15 @@ Readability.prototype = {
       metadata.excerpt = values["twitter:description"];
     }
 
-    if ("og:title" in values) {
-      // Use facebook open graph title.
-      metadata.title = values["og:title"];
-    } else if ("twitter:title" in values) {
-      // Use twitter cards title.
-      metadata.title = values["twitter:title"];
+    metadata.title = this._getArticleTitle();
+    if (!metadata.title) {
+      if ("og:title" in values) {
+        // Use facebook open graph title.
+        metadata.title = values["og:title"];
+      } else if ("twitter:title" in values) {
+        // Use twitter cards title.
+        metadata.title = values["twitter:title"];
+      }
     }
 
     return metadata;
@@ -1089,6 +1243,13 @@ Readability.prototype = {
     });
   },
 
+  _isElementWithoutContent: function(node) {
+    return node.nodeType === Node.ELEMENT_NODE &&
+      node.textContent.trim().length == 0 &&
+      (node.children.length == 0 ||
+       node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
+  },
+
   /**
    * Determine whether element has any children block level elements.
    *
@@ -1139,26 +1300,25 @@ Readability.prototype = {
    * @return void
   **/
   _cleanStyles: function(e) {
-    e = e || this._doc;
-    if (!e)
+    if (!e || e.tagName.toLowerCase() === 'svg')
       return;
-    var cur = e.firstChild;
 
-    // Remove any root styles, if we're able.
-    if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
-      e.removeAttribute('style');
-
-    // Go until there are no more child nodes
-    while (cur !== null) {
-      if (cur.nodeType === cur.ELEMENT_NODE) {
-        // Remove style attribute(s) :
-        if (cur.className !== "readability-styled")
-          cur.removeAttribute("style");
+    if (e.className !== 'readability-styled') {
+      // Remove `style` and deprecated presentational attributes
+      for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+        e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
+      }
 
-        this._cleanStyles(cur);
+      if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+        e.removeAttribute('width');
+        e.removeAttribute('height');
       }
+    }
 
-      cur = cur.nextSibling;
+    var cur = e.firstElementChild;
+    while (cur !== null) {
+      this._cleanStyles(cur);
+      cur = cur.nextElementSibling;
     }
   },
 
@@ -1185,368 +1345,6 @@ Readability.prototype = {
   },
 
   /**
-   * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
-   *
-   * @author Dan Lacy
-   * @return string the base url
-  **/
-  _findBaseUrl: function() {
-    var uri = this._uri;
-    var noUrlParams = uri.path.split("?")[0];
-    var urlSlashes = noUrlParams.split("/").reverse();
-    var cleanedSegments = [];
-    var possibleType = "";
-
-    for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
-      var segment = urlSlashes[i];
-
-      // Split off and save anything that looks like a file type.
-      if (segment.indexOf(".") !== -1) {
-        possibleType = segment.split(".")[1];
-
-        // If the type isn't alpha-only, it's probably not actually a file extension.
-        if (!possibleType.match(/[^a-zA-Z]/))
-          segment = segment.split(".")[0];
-      }
-
-      // EW-CMS specific segment replacement. Ugly.
-      // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
-      if (segment.indexOf(',00') !== -1)
-        segment = segment.replace(',00', '');
-
-      // If our first or second segment has anything looking like a page number, remove it.
-      if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
-        segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
-
-      var del = false;
-
-      // If this is purely a number, and it's the first or second segment,
-      // it's probably a page number. Remove it.
-      if (i < 2 && segment.match(/^\d{1,2}$/))
-        del = true;
-
-      // If this is the first segment and it's just "index", remove it.
-      if (i === 0 && segment.toLowerCase() === "index")
-        del = true;
-
-      // If our first or second segment is smaller than 3 characters,
-      // and the first segment was purely alphas, remove it.
-      if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
-        del = true;
-
-      // If it's not marked for deletion, push it to cleanedSegments.
-      if (!del)
-        cleanedSegments.push(segment);
-    }
-
-    // This is our final, cleaned, base article URL.
-    return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
-  },
-
-  /**
-   * Look for any paging links that may occur within the document.
-   *
-   * @param body
-   * @return object (array)
-  **/
-  _findNextPageLink: function(elem) {
-    var uri = this._uri;
-    var possiblePages = {};
-    var allLinks = elem.getElementsByTagName('a');
-    var articleBaseUrl = this._findBaseUrl();
-
-    // Loop through all links, looking for hints that they may be next-page links.
-    // Things like having "page" in their textContent, className or id, or being a child
-    // of a node with a page-y className or id.
-    //
-    // Also possible: levenshtein distance? longest common subsequence?
-    //
-    // After we do that, assign each page a score, and
-    for (var i = 0, il = allLinks.length; i < il; i += 1) {
-      var link = allLinks[i];
-      var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
-
-      // If we've already seen this page, ignore it.
-      if (linkHref === "" ||
-        linkHref === articleBaseUrl ||
-        linkHref === uri.spec ||
-        linkHref in this._parsedPages) {
-        continue;
-      }
-
-      // If it's on a different domain, skip it.
-      if (uri.host !== linkHref.split(/\/+/g)[1])
-        continue;
-
-      var linkText = this._getInnerText(link);
-
-      // If the linkText looks like it's not the next page, skip it.
-      if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
-        continue;
-
-      // If the leftovers of the URL after removing the base URL don't contain
-      // any digits, it's certainly not a next page link.
-      var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
-      if (!linkHrefLeftover.match(/\d/))
-        continue;
-
-      if (!(linkHref in possiblePages)) {
-        possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
-      } else {
-        possiblePages[linkHref].linkText += ' | ' + linkText;
-      }
-
-      var linkObj = possiblePages[linkHref];
-
-      // If the articleBaseUrl isn't part of this URL, penalize this link. It could
-      // still be the link, but the odds are lower.
-      // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
-      if (linkHref.indexOf(articleBaseUrl) !== 0)
-        linkObj.score -= 25;
-
-      var linkData = linkText + ' ' + link.className + ' ' + link.id;
-      if (linkData.match(this.REGEXPS.nextLink))
-        linkObj.score += 50;
-
-      if (linkData.match(/pag(e|ing|inat)/i))
-        linkObj.score += 25;
-
-      if (linkData.match(/(first|last)/i)) {
-        // -65 is enough to negate any bonuses gotten from a > or » in the text,
-        // If we already matched on "next", last is probably fine.
-        // If we didn't, then it's bad. Penalize.
-        if (!linkObj.linkText.match(this.REGEXPS.nextLink))
-          linkObj.score -= 65;
-      }
-
-      if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
-        linkObj.score -= 50;
-
-      if (linkData.match(this.REGEXPS.prevLink))
-        linkObj.score -= 200;
-
-      // If a parentNode contains page or paging or paginat
-      var parentNode = link.parentNode;
-      var positiveNodeMatch = false;
-      var negativeNodeMatch = false;
-
-      while (parentNode) {
-        var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
-
-        if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
-          positiveNodeMatch = true;
-          linkObj.score += 25;
-        }
-
-        if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
-          // If this is just something like "footer", give it a negative.
-          // If it's something like "body-and-footer", leave it be.
-          if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
-            linkObj.score -= 25;
-            negativeNodeMatch = true;
-          }
-        }
-
-        parentNode = parentNode.parentNode;
-      }
-
-      // If the URL looks like it has paging in it, add to the score.
-      // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
-      if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
-        linkObj.score += 25;
-
-      // If the URL contains negative values, give a slight decrease.
-      if (linkHref.match(this.REGEXPS.extraneous))
-        linkObj.score -= 15;
-
-      /**
-       * Minor punishment to anything that doesn't match our current URL.
-       * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
-       *     Dan, can you show me a counterexample where this is necessary?
-       * if (linkHref.indexOf(window.location.href) !== 0) {
-       *  linkObj.score -= 1;
-       * }
-      **/
-
-      // If the link text can be parsed as a number, give it a minor bonus, with a slight
-      // bias towards lower numbered pages. This is so that pages that might not have 'next'
-      // in their text can still get scored, and sorted properly by score.
-      var linkTextAsNumber = parseInt(linkText, 10);
-      if (linkTextAsNumber) {
-        // Punish 1 since we're either already there, or it's probably
-        // before what we want anyways.
-        if (linkTextAsNumber === 1) {
-          linkObj.score -= 10;
-        } else {
-          linkObj.score += Math.max(0, 10 - linkTextAsNumber);
-        }
-      }
-    }
-
-    // Loop thrugh all of our possible pages from above and find our top
-    // candidate for the next page URL. Require at least a score of 50, which
-    // is a relatively high confidence that this page is the next link.
-    var topPage = null;
-    for (var page in possiblePages) {
-      if (possiblePages.hasOwnProperty(page)) {
-        if (possiblePages[page].score >= 50 &&
-          (!topPage || topPage.score < possiblePages[page].score))
-          topPage = possiblePages[page];
-      }
-    }
-
-    var nextHref = null;
-    if (topPage) {
-      nextHref = topPage.href.replace(/\/$/, '');
-
-      this.log('NEXT PAGE IS ' + nextHref);
-      this._parsedPages[nextHref] = true;
-    }
-    return nextHref;
-  },
-
-  _successfulRequest: function(request) {
-    return (request.status >= 200 && request.status < 300) ||
-        request.status === 304 ||
-         (request.status === 0 && request.responseText);
-  },
-
-  _ajax: function(url, options) {
-    var request = new XMLHttpRequest();
-
-    function respondToReadyState(readyState) {
-      if (request.readyState === 4) {
-        if (this._successfulRequest(request)) {
-          if (options.success)
-            options.success(request);
-        } else if (options.error) {
-          options.error(request);
-        }
-      }
-    }
-
-    if (typeof options === 'undefined')
-      options = {};
-
-    request.onreadystatechange = respondToReadyState;
-
-    request.open('get', url, true);
-    request.setRequestHeader('Accept', 'text/html');
-
-    try {
-      request.send(options.postBody);
-    } catch (e) {
-      if (options.error)
-        options.error();
-    }
-
-    return request;
-  },
-
-  _appendNextPage: function(nextPageLink) {
-    var doc = this._doc;
-    this._curPageNum += 1;
-
-    var articlePage = doc.createElement("DIV");
-    articlePage.id = 'readability-page-' + this._curPageNum;
-    articlePage.className = 'page';
-    articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
-
-    doc.getElementById("readability-content").appendChild(articlePage);
-
-    if (this._curPageNum > this._maxPages) {
-      var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
-      articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
-      return;
-    }
-
-    // Now that we've built the article page DOM element, get the page content
-    // asynchronously and load the cleaned content into the div we created for it.
-    (function(pageUrl, thisPage) {
-      this._ajax(pageUrl, {
-        success: function(r) {
-
-          // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
-          var eTag = r.getResponseHeader('ETag');
-          if (eTag) {
-            if (eTag in this._pageETags) {
-              this.log("Exact duplicate page found via ETag. Aborting.");
-              articlePage.style.display = 'none';
-              return;
-            }
-            this._pageETags[eTag] = 1;
-          }
-
-          // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
-          var page = doc.createElement("DIV");
-
-          // Do some preprocessing to our HTML to make it ready for appending.
-          // - Remove any script tags. Swap and reswap newlines with a unicode
-          //   character because multiline regex doesn't work in javascript.
-          // - Turn any noscript tags into divs so that we can parse them. This
-          //   allows us to find any next page links hidden via javascript.
-          // - Turn all double br's into p's - was handled by prepDocument in the original view.
-          //   Maybe in the future abstract out prepDocument to work for both the original document
-          //   and AJAX-added pages.
-          var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
-          responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
-
-          page.innerHTML = responseHtml;
-          this._replaceBrs(page);
-
-          // Reset all flags for the next page, as they will search through it and
-          // disable as necessary at the end of grabArticle.
-          this._flags = 0x1 | 0x2 | 0x4;
-
-          var secondNextPageLink = this._findNextPageLink(page);
-
-          // NOTE: if we end up supporting _appendNextPage(), we'll need to
-          // change this call to be async
-          var content = this._grabArticle(page);
-
-          if (!content) {
-            this.log("No content found in page to append. Aborting.");
-            return;
-          }
-
-          // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
-          // Compare it against all of the the previous document's we've gotten. If the previous
-          // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
-          var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
-          if (firstP && firstP.innerHTML.length > 100) {
-            for (var i = 1; i <= this._curPageNum; i += 1) {
-              var rPage = doc.getElementById('readability-page-' + i);
-              if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
-                this.log('Duplicate of page ' + i + ' - skipping.');
-                articlePage.style.display = 'none';
-                this._parsedPages[pageUrl] = true;
-                return;
-              }
-            }
-          }
-
-          this._removeScripts(content);
-
-          thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
-
-          // After the page has rendered, post process the content. This delay is necessary because,
-          // in webkit at least, offsetWidth is not set in time to determine image width. We have to
-          // wait a little bit for reflow to finish before we can fix floating images.
-          setTimeout((function() {
-            this._postProcessContent(thisPage);
-          }).bind(this), 500);
-
-
-          if (secondNextPageLink)
-            this._appendNextPage(secondNextPageLink);
-        }
-      });
-    }).bind(this)(nextPageLink, articlePage);
-  },
-
-  /**
    * Get an elements class/id weight. Uses regular expressions to tell if this
    * element looks good or bad.
    *
@@ -1617,16 +1415,17 @@ Readability.prototype = {
    * @param  HTMLElement node
    * @param  String      tagName
    * @param  Number      maxDepth
+   * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
    * @return Boolean
    */
-  _hasAncestorTag: function(node, tagName, maxDepth) {
+  _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
     maxDepth = maxDepth || 3;
     tagName = tagName.toUpperCase();
     var depth = 0;
     while (node.parentNode) {
-      if (depth > maxDepth)
+      if (maxDepth > 0 && depth > maxDepth)
         return false;
-      if (node.parentNode.tagName === tagName)
+      if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
         return true;
       node = node.parentNode;
       depth++;
@@ -1635,6 +1434,93 @@ Readability.prototype = {
   },
 
   /**
+   * Return an object indicating how many rows and columns this table has.
+   */
+  _getRowAndColumnCount: function(table) {
+    var rows = 0;
+    var columns = 0;
+    var trs = table.getElementsByTagName("tr");
+    for (var i = 0; i < trs.length; i++) {
+      var rowspan = trs[i].getAttribute("rowspan") || 0;
+      if (rowspan) {
+        rowspan = parseInt(rowspan, 10);
+      }
+      rows += (rowspan || 1);
+
+      // Now look for column-related info
+      var columnsInThisRow = 0;
+      var cells = trs[i].getElementsByTagName("td");
+      for (var j = 0; j < cells.length; j++) {
+        var colspan = cells[j].getAttribute("colspan") || 0;
+        if (colspan) {
+          colspan = parseInt(colspan, 10);
+        }
+        columnsInThisRow += (colspan || 1);
+      }
+      columns = Math.max(columns, columnsInThisRow);
+    }
+    return {rows: rows, columns: columns};
+  },
+
+  /**
+   * Look for 'data' (as opposed to 'layout') tables, for which we use
+   * similar checks as
+   * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+   */
+  _markDataTables: function(root) {
+    var tables = root.getElementsByTagName("table");
+    for (var i = 0; i < tables.length; i++) {
+      var table = tables[i];
+      var role = table.getAttribute("role");
+      if (role == "presentation") {
+        table._readabilityDataTable = false;
+        continue;
+      }
+      var datatable = table.getAttribute("datatable");
+      if (datatable == "0") {
+        table._readabilityDataTable = false;
+        continue;
+      }
+      var summary = table.getAttribute("summary");
+      if (summary) {
+        table._readabilityDataTable = true;
+        continue;
+      }
+
+      var caption = table.getElementsByTagName("caption")[0];
+      if (caption && caption.childNodes.length > 0) {
+        table._readabilityDataTable = true;
+        continue;
+      }
+
+      // If the table has a descendant with any of these tags, consider it a data table:
+      var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
+      var descendantExists = function(tag) {
+        return !!table.getElementsByTagName(tag)[0];
+      };
+      if (dataTableDescendants.some(descendantExists)) {
+        this.log("Data table because found data-y descendant");
+        table._readabilityDataTable = true;
+        continue;
+      }
+
+      // Nested tables indicate a layout table:
+      if (table.getElementsByTagName("table")[0]) {
+        table._readabilityDataTable = false;
+        continue;
+      }
+
+      var sizeInfo = this._getRowAndColumnCount(table);
+      if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
+        table._readabilityDataTable = true;
+        continue;
+      }
+      // Now just go by size entirely:
+      table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
+    }
+  },
+
+  /**
    * Clean an element of all tags of type "tag" if they look fishy.
    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
    *
@@ -1652,6 +1538,15 @@ Readability.prototype = {
     //
     // TODO: Consider taking into account original contentScore here.
     this._removeNodes(e.getElementsByTagName(tag), function(node) {
+      // First check if we're in a data table, in which case don't remove us.
+      var isDataTable = function(t) {
+        return t._readabilityDataTable;
+      };
+
+      if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
+        return false;
+      }
+
       var weight = this._getClassWeight(node);
       var contentScore = 0;
 
@@ -1667,7 +1562,7 @@ Readability.prototype = {
         // ominous signs, remove the element.
         var p = node.getElementsByTagName("p").length;
         var img = node.getElementsByTagName("img").length;
-        var li = node.getElementsByTagName("li").length-100;
+        var li = node.getElementsByTagName("li").length - 100;
         var input = node.getElementsByTagName("input").length;
 
         var embedCount = 0;
@@ -1681,11 +1576,10 @@ Readability.prototype = {
         var contentLength = this._getInnerText(node).length;
 
         var haveToRemove =
-          // Make an exception for elements with no p's and exactly 1 img.
-          (img > p && !this._hasAncestorTag(node, "figure")) ||
+          (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
           (!isList && li > p) ||
           (input > Math.floor(p/3)) ||
-          (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
+          (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
           (!isList && weight < 25 && linkDensity > 0.2) ||
           (weight >= 25 && linkDensity > 0.5) ||
           ((embedCount === 1 && contentLength < 75) || embedCount > 1);
@@ -1696,6 +1590,25 @@ Readability.prototype = {
   },
 
   /**
+   * Clean out elements whose class/id combination matches the given regular expression.
+   *
+   * @param Element
+   * @param RegExp match id/class combination.
+   * @return void
+   **/
+  _cleanMatchedNodes: function(e, regex) {
+    var endOfSearchMarkerNode = this._getNextNode(e, true);
+    var next = this._getNextNode(e);
+    while (next && next != endOfSearchMarkerNode) {
+      if (regex.test(next.className + " " + next.id)) {
+        next = this._removeAndGetNext(next);
+      } else {
+        next = this._getNextNode(next);
+      }
+    }
+  },
+
+  /**
    * Clean out spurious headers from an Element. Checks things like classnames and link density.
    *
    * @param Element
@@ -1713,10 +1626,6 @@ Readability.prototype = {
     return (this._flags & flag) > 0;
   },
 
-  _addFlag: function(flag) {
-    this._flags = this._flags | flag;
-  },
-
   _removeFlag: function(flag) {
     this._flags = this._flags & ~flag;
   },
@@ -1807,20 +1716,10 @@ Readability.prototype = {
     // Remove script tags from the document.
     this._removeScripts(this._doc);
 
-    // FIXME: Disabled multi-page article support for now as it
-    // needs more work on infrastructure.
-
-    // Make sure this document is added to the list of parsed pages first,
-    // so we don't double up on the first page.
-    // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
-
-    // Pull out any possible next page link first.
-    // var nextPageLink = this._findNextPageLink(doc.body);
-
     this._prepDocument();
 
     var metadata = this._getArticleMetadata();
-    var articleTitle = metadata.title || this._getArticleTitle();
+    this._articleTitle = metadata.title;
 
     var articleContent = this._grabArticle();
     if (!articleContent)
@@ -1830,14 +1729,6 @@ Readability.prototype = {
 
     this._postProcessContent(articleContent);
 
-    // if (nextPageLink) {
-    //   // Append any additional pages after a small timeout so that people
-    //   // can start reading without having to wait for this to finish processing.
-    //   setTimeout((function() {
-    //     this._appendNextPage(nextPageLink);
-    //   }).bind(this), 500);
-    // }
-
     // If we haven't found an excerpt in the article's metadata, use the article's
     // first paragraph as the excerpt. This is used for displaying a preview of
     // the article's content.
@@ -1851,7 +1742,7 @@ Readability.prototype = {
     var textContent = articleContent.textContent;
     return {
       uri: this._uri,
-      title: articleTitle,
+      title: this._articleTitle,
       byline: metadata.byline || this._articleByline,
       dir: this._articleDir,
       content: articleContent.innerHTML,
@@ -1861,3 +1752,7 @@ Readability.prototype = {
     };
   }
 };
+
+if (typeof module === "object") {
+  module.exports = Readability;
+}