summaryrefslogtreecommitdiffstats
path: root/toolkit/components/reader
diff options
context:
space:
mode:
authorMoonchild <mcwerewolf@gmail.com>2018-05-16 17:10:38 +0200
committerGitHub <noreply@github.com>2018-05-16 17:10:38 +0200
commit90942a2af0cabb9345cf04fa6113e12197504fcf (patch)
treee16c71be5a1343abe0489863f84ed271b6ebd3d7 /toolkit/components/reader
parent819ca50f163a9113772a7dbfd617d97151893337 (diff)
parent9ef464a5ac0a17135a0f7b4fef070bb4f7fbe44c (diff)
downloadUXP-90942a2af0cabb9345cf04fa6113e12197504fcf.tar
UXP-90942a2af0cabb9345cf04fa6113e12197504fcf.tar.gz
UXP-90942a2af0cabb9345cf04fa6113e12197504fcf.tar.lz
UXP-90942a2af0cabb9345cf04fa6113e12197504fcf.tar.xz
UXP-90942a2af0cabb9345cf04fa6113e12197504fcf.zip
Merge pull request #367 from Ascrod/readerview
Reader and Narrator Updates
Diffstat (limited to 'toolkit/components/reader')
-rw-r--r--toolkit/components/reader/AboutReader.jsm409
-rw-r--r--toolkit/components/reader/JSDOMParser.js46
-rw-r--r--toolkit/components/reader/Readability.js853
-rw-r--r--toolkit/components/reader/ReaderMode.jsm312
-rw-r--r--toolkit/components/reader/ReaderWorker.js7
-rw-r--r--toolkit/components/reader/content/aboutReader.html73
-rw-r--r--toolkit/components/reader/content/aboutReader.js2
7 files changed, 845 insertions, 857 deletions
diff --git a/toolkit/components/reader/AboutReader.jsm b/toolkit/components/reader/AboutReader.jsm
index 1fb9db123..fb82e5789 100644
--- a/toolkit/components/reader/AboutReader.jsm
+++ b/toolkit/components/reader/AboutReader.jsm
@@ -15,12 +15,12 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "AsyncPrefs", "resource://gre/modules/AsyncPrefs.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "NarrateControls", "resource://gre/modules/narrate/NarrateControls.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "Rect", "resource://gre/modules/Geometry.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "UITelemetry", "resource://gre/modules/UITelemetry.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "PluralForm", "resource://gre/modules/PluralForm.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "PlacesUtils", "resource://gre/modules/PlacesUtils.jsm");
var gStrings = Services.strings.createBundle("chrome://global/locale/aboutReader.properties");
-var AboutReader = function(mm, win, articlePromise) {
+var AboutReader = function(win, articlePromise) {
let url = this._getOriginalUrl(win);
if (!(url.startsWith("http://") || url.startsWith("https://"))) {
let errorMsg = "Only http:// and https:// URLs can be loaded in about:reader.";
@@ -33,57 +33,59 @@ var AboutReader = function(mm, win, articlePromise) {
let doc = win.document;
- this._mm = mm;
- this._mm.addMessageListener("Reader:CloseDropdown", this);
- this._mm.addMessageListener("Reader:AddButton", this);
- this._mm.addMessageListener("Reader:RemoveButton", this);
- this._mm.addMessageListener("Reader:GetStoredArticleData", this);
-
this._docRef = Cu.getWeakReference(doc);
this._winRef = Cu.getWeakReference(win);
this._innerWindowId = win.QueryInterface(Ci.nsIInterfaceRequestor)
.getInterface(Ci.nsIDOMWindowUtils).currentInnerWindowID;
this._article = null;
+ this._languagePromise = new Promise(resolve => {
+ this._foundLanguage = resolve;
+ });
if (articlePromise) {
this._articlePromise = articlePromise;
}
- this._headerElementRef = Cu.getWeakReference(doc.getElementById("reader-header"));
- this._domainElementRef = Cu.getWeakReference(doc.getElementById("reader-domain"));
- this._titleElementRef = Cu.getWeakReference(doc.getElementById("reader-title"));
- this._creditsElementRef = Cu.getWeakReference(doc.getElementById("reader-credits"));
- this._contentElementRef = Cu.getWeakReference(doc.getElementById("moz-reader-content"));
- this._toolbarElementRef = Cu.getWeakReference(doc.getElementById("reader-toolbar"));
- this._messageElementRef = Cu.getWeakReference(doc.getElementById("reader-message"));
+ this._headerElementRef = Cu.getWeakReference(doc.querySelector(".reader-header"));
+ this._domainElementRef = Cu.getWeakReference(doc.querySelector(".reader-domain"));
+ this._titleElementRef = Cu.getWeakReference(doc.querySelector(".reader-title"));
+ this._readTimeElementRef = Cu.getWeakReference(doc.querySelector(".reader-estimated-time"));
+ this._creditsElementRef = Cu.getWeakReference(doc.querySelector(".reader-credits"));
+ this._contentElementRef = Cu.getWeakReference(doc.querySelector(".moz-reader-content"));
+ this._toolbarElementRef = Cu.getWeakReference(doc.querySelector(".reader-toolbar"));
+ this._messageElementRef = Cu.getWeakReference(doc.querySelector(".reader-message"));
+ this._containerElementRef = Cu.getWeakReference(doc.querySelector(".container"));
this._scrollOffset = win.pageYOffset;
- doc.addEventListener("click", this, false);
+ doc.addEventListener("click", this);
+
+ win.addEventListener("pagehide", this);
+ win.addEventListener("scroll", this);
+ win.addEventListener("resize", this);
- win.addEventListener("pagehide", this, false);
- win.addEventListener("scroll", this, false);
- win.addEventListener("resize", this, false);
+ win.addEventListener("AboutReaderAddButton", this, false, true);
+ win.addEventListener("AboutReaderRemoveButton", this, false, true);
Services.obs.addObserver(this, "inner-window-destroyed", false);
- doc.addEventListener("visibilitychange", this, false);
+ doc.addEventListener("visibilitychange", this);
this._setupStyleDropdown();
this._setupButton("close-button", this._onReaderClose.bind(this), "aboutReader.toolbar.close");
- const gIsFirefoxDesktop = Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}";
- if (gIsFirefoxDesktop) {
- // we're ready for any external setup, send a signal for that.
- this._mm.sendAsyncMessage("Reader:OnSetup");
- }
+ // we're ready for any external setup, send a signal for that.
+ doc.dispatchEvent(
+ new win.CustomEvent("AboutReaderOnSetup", { bubbles: true, cancelable: false }));
let colorSchemeValues = JSON.parse(Services.prefs.getCharPref("reader.color_scheme.values"));
let colorSchemeOptions = colorSchemeValues.map((value) => {
- return { name: gStrings.GetStringFromName("aboutReader.colorScheme." + value),
- value: value,
- itemClass: value + "-button" };
+ return {
+ name: gStrings.GetStringFromName("aboutReader.colorScheme." + value),
+ value,
+ itemClass: value + "-button"
+ };
});
let colorScheme = Services.prefs.getCharPref("reader.color_scheme");
@@ -114,7 +116,7 @@ var AboutReader = function(mm, win, articlePromise) {
this._setupLineHeightButtons();
if (win.speechSynthesis && Services.prefs.getBoolPref("narrate.enabled")) {
- new NarrateControls(mm, win);
+ new NarrateControls(win, this._languagePromise);
}
this._loadArticle();
@@ -146,6 +148,10 @@ AboutReader.prototype = {
return this._titleElementRef.get();
},
+ get _readTimeElement() {
+ return this._readTimeElementRef.get();
+ },
+
get _creditsElement() {
return this._creditsElementRef.get();
},
@@ -162,6 +168,10 @@ AboutReader.prototype = {
return this._messageElementRef.get();
},
+ get _containerElement() {
+ return this._containerElementRef.get();
+ },
+
get _isToolbarVertical() {
if (this._toolbarVertical !== undefined) {
return this._toolbarVertical;
@@ -178,72 +188,31 @@ AboutReader.prototype = {
return _viewId;
},
- receiveMessage: function (message) {
- switch (message.name) {
- // Triggered by Android user pressing BACK while the banner font-dropdown is open.
- case "Reader:CloseDropdown": {
- // Just close it.
- this._closeDropdowns();
- break;
- }
-
- case "Reader:AddButton": {
- if (message.data.id && message.data.image &&
- !this._doc.getElementById(message.data.id)) {
- let btn = this._doc.createElement("button");
- btn.setAttribute("class", "button");
- btn.setAttribute("style", "background-image: url('" + message.data.image + "')");
- btn.setAttribute("id", message.data.id);
- if (message.data.title)
- btn.setAttribute("title", message.data.title);
- if (message.data.text)
- btn.textContent = message.data.text;
- let tb = this._doc.getElementById("reader-toolbar");
- tb.appendChild(btn);
- this._setupButton(message.data.id, button => {
- this._mm.sendAsyncMessage("Reader:Clicked-" + button.getAttribute("id"), { article: this._article });
- });
- }
- break;
- }
- case "Reader:RemoveButton": {
- if (message.data.id) {
- let btn = this._doc.getElementById(message.data.id);
- if (btn)
- btn.remove();
- }
- break;
- }
- case "Reader:GetStoredArticleData": {
- this._mm.sendAsyncMessage("Reader:StoredArticleData", { article: this._article });
- }
- }
- },
-
- handleEvent: function(aEvent) {
+ handleEvent(aEvent) {
if (!aEvent.isTrusted)
return;
switch (aEvent.type) {
case "click":
let target = aEvent.target;
- if (target.classList.contains('dropdown-toggle')) {
+ if (target.classList.contains("dropdown-toggle")) {
this._toggleDropdownClicked(aEvent);
- } else if (!target.closest('.dropdown-popup')) {
+ } else if (!target.closest(".dropdown-popup")) {
this._closeDropdowns();
}
+ if (target.tagName == "A" && !target.classList.contains("reader-domain")) {
+ this._linkClicked(aEvent);
+ }
break;
case "scroll":
this._closeDropdowns(true);
- let isScrollingUp = this._scrollOffset > aEvent.pageY;
- this._setSystemUIVisibility(isScrollingUp);
this._scrollOffset = aEvent.pageY;
break;
case "resize":
this._updateImageMargins();
if (this._isToolbarVertical) {
this._win.setTimeout(() => {
- for (let dropdown of this._doc.querySelectorAll('.dropdown.open')) {
+ for (let dropdown of this._doc.querySelectorAll(".dropdown.open")) {
this._updatePopupPosition(dropdown);
}
}, 0);
@@ -261,35 +230,57 @@ AboutReader.prototype = {
case "pagehide":
// Close the Banners Font-dropdown, cleanup Android BackPressListener.
this._closeDropdowns();
-
- this._mm.removeMessageListener("Reader:CloseDropdown", this);
- this._mm.removeMessageListener("Reader:AddButton", this);
- this._mm.removeMessageListener("Reader:RemoveButton", this);
- this._mm.removeMessageListener("Reader:GetStoredArticleData", this);
this._windowUnloaded = true;
break;
+
+ case "AboutReaderAddButton": {
+ if (aEvent.detail.id && aEvent.detail.image &&
+ !this._doc.getElementById(aEvent.detail.id)) {
+ let btn = this._doc.createElement("button");
+ btn.setAttribute("class", "button " + aEvent.detail.id);
+ btn.setAttribute("style", "background-image: url('" + aEvent.detail.image + "')");
+ btn.setAttribute("id", aEvent.detail.id);
+ if (aEvent.detail.title)
+ btn.setAttribute("title", aEvent.detail.title);
+ if (aEvent.detail.text)
+ btn.textContent = aEvent.detail.text;
+ let tb = this._toolbarElement;
+ tb.appendChild(btn);
+ this._setupButton(aEvent.detail.id, button => {
+ var data = { article: this._article };
+ this._doc.dispatchEvent(
+ new this._win.CustomEvent("AboutReaderButtonClicked-" + button.getAttribute("id"), {detail: data, bubbles: true, cancelable: false}));
+ });
+ }
+ break;
+ }
+
+ case "AboutReaderRemoveButton": {
+ if (aEvent.detail.id) {
+ let btn = this._doc.getElementById(aEvent.detail.id);
+ if (btn)
+ btn.remove();
+ }
+ break;
+ }
}
},
- observe: function(subject, topic, data) {
+ observe(subject, topic, data) {
if (subject.QueryInterface(Ci.nsISupportsPRUint64).data != this._innerWindowId) {
return;
}
- Services.obs.removeObserver(this, "inner-window-destroyed", false);
-
- this._mm.removeMessageListener("Reader:CloseDropdown", this);
- this._mm.removeMessageListener("Reader:AddButton", this);
- this._mm.removeMessageListener("Reader:RemoveButton", this);
+ Services.obs.removeObserver(this, "inner-window-destroyed");
this._windowUnloaded = true;
},
- _onReaderClose: function() {
- ReaderMode.leaveReaderMode(this._mm.docShell, this._win);
+ _onReaderClose() {
+ ReaderMode.leaveReaderMode(this._win.document.docShell, this._win);
},
- _setFontSize: function(newFontSize) {
- let containerClasses = this._doc.getElementById("container").classList;
+ _setFontSize(newFontSize) {
+ let containerClasses = this._containerElement.classList;
if (this._fontSize > 0)
containerClasses.remove("font-size" + this._fontSize);
@@ -299,19 +290,19 @@ AboutReader.prototype = {
return AsyncPrefs.set("reader.font_size", this._fontSize);
},
- _setupFontSizeButtons: function() {
+ _setupFontSizeButtons() {
const FONT_SIZE_MIN = 1;
const FONT_SIZE_MAX = 9;
// Sample text shown in Android UI.
- let sampleText = this._doc.getElementById("font-size-sample");
+ let sampleText = this._doc.querySelector(".font-size-sample");
sampleText.textContent = gStrings.GetStringFromName("aboutReader.fontTypeSample");
let currentSize = Services.prefs.getIntPref("reader.font_size");
currentSize = Math.max(FONT_SIZE_MIN, Math.min(FONT_SIZE_MAX, currentSize));
- let plusButton = this._doc.getElementById("font-size-plus");
- let minusButton = this._doc.getElementById("font-size-minus");
+ let plusButton = this._doc.querySelector(".plus-button");
+ let minusButton = this._doc.querySelector(".minus-button");
function updateControls() {
if (currentSize === FONT_SIZE_MIN) {
@@ -360,8 +351,8 @@ AboutReader.prototype = {
}, true);
},
- _setContentWidth: function(newContentWidth) {
- let containerClasses = this._doc.getElementById("container").classList;
+ _setContentWidth(newContentWidth) {
+ let containerClasses = this._containerElement.classList;
if (this._contentWidth > 0)
containerClasses.remove("content-width" + this._contentWidth);
@@ -371,15 +362,15 @@ AboutReader.prototype = {
return AsyncPrefs.set("reader.content_width", this._contentWidth);
},
- _setupContentWidthButtons: function() {
+ _setupContentWidthButtons() {
const CONTENT_WIDTH_MIN = 1;
const CONTENT_WIDTH_MAX = 9;
let currentContentWidth = Services.prefs.getIntPref("reader.content_width");
currentContentWidth = Math.max(CONTENT_WIDTH_MIN, Math.min(CONTENT_WIDTH_MAX, currentContentWidth));
- let plusButton = this._doc.getElementById("content-width-plus");
- let minusButton = this._doc.getElementById("content-width-minus");
+ let plusButton = this._doc.querySelector(".content-width-plus-button");
+ let minusButton = this._doc.querySelector(".content-width-minus-button");
function updateControls() {
if (currentContentWidth === CONTENT_WIDTH_MIN) {
@@ -428,8 +419,8 @@ AboutReader.prototype = {
}, true);
},
- _setLineHeight: function(newLineHeight) {
- let contentClasses = this._doc.getElementById("moz-reader-content").classList;
+ _setLineHeight(newLineHeight) {
+ let contentClasses = this._contentElement.classList;
if (this._lineHeight > 0)
contentClasses.remove("line-height" + this._lineHeight);
@@ -439,15 +430,15 @@ AboutReader.prototype = {
return AsyncPrefs.set("reader.line_height", this._lineHeight);
},
- _setupLineHeightButtons: function() {
+ _setupLineHeightButtons() {
const LINE_HEIGHT_MIN = 1;
const LINE_HEIGHT_MAX = 9;
let currentLineHeight = Services.prefs.getIntPref("reader.line_height");
currentLineHeight = Math.max(LINE_HEIGHT_MIN, Math.min(LINE_HEIGHT_MAX, currentLineHeight));
- let plusButton = this._doc.getElementById("line-height-plus");
- let minusButton = this._doc.getElementById("line-height-minus");
+ let plusButton = this._doc.querySelector(".line-height-plus-button");
+ let minusButton = this._doc.querySelector(".line-height-minus-button");
function updateControls() {
if (currentLineHeight === LINE_HEIGHT_MIN) {
@@ -496,7 +487,7 @@ AboutReader.prototype = {
}, true);
},
- _handleDeviceLight: function(newLux) {
+ _handleDeviceLight(newLux) {
// Desired size of the this._luxValues array.
let luxValuesSize = 10;
// Add new lux value at the front of the array.
@@ -513,7 +504,7 @@ AboutReader.prototype = {
return;
}
// Holds the average of the lux values collected in this._luxValues.
- let averageLuxValue = this._totalLux/luxValuesSize;
+ let averageLuxValue = this._totalLux / luxValuesSize;
this._updateColorScheme(averageLuxValue);
// Pop the oldest value off the array.
@@ -522,7 +513,7 @@ AboutReader.prototype = {
this._totalLux -= oldLux;
},
- _handleVisibilityChange: function() {
+ _handleVisibilityChange() {
let colorScheme = Services.prefs.getCharPref("reader.color_scheme");
if (colorScheme != "auto") {
return;
@@ -533,19 +524,19 @@ AboutReader.prototype = {
},
// Setup or teardown the ambient light tracking system.
- _enableAmbientLighting: function(enable) {
+ _enableAmbientLighting(enable) {
if (enable) {
- this._win.addEventListener("devicelight", this, false);
+ this._win.addEventListener("devicelight", this);
this._luxValues = [];
this._totalLux = 0;
} else {
- this._win.removeEventListener("devicelight", this, false);
+ this._win.removeEventListener("devicelight", this);
delete this._luxValues;
delete this._totalLux;
}
},
- _updateColorScheme: function(luxValue) {
+ _updateColorScheme(luxValue) {
// Upper bound value for "dark" color scheme beyond which it changes to "light".
let upperBoundDark = 50;
// Lower bound value for "light" color scheme beyond which it changes to "dark".
@@ -564,7 +555,7 @@ AboutReader.prototype = {
this._setColorScheme("light");
},
- _setColorScheme: function(newColorScheme) {
+ _setColorScheme(newColorScheme) {
// "auto" is not a real color scheme
if (this._colorScheme === newColorScheme || newColorScheme === "auto")
return;
@@ -580,14 +571,14 @@ AboutReader.prototype = {
// Pref values include "dark", "light", and "auto", which automatically switches
// between light and dark color schemes based on the ambient light level.
- _setColorSchemePref: function(colorSchemePref) {
+ _setColorSchemePref(colorSchemePref) {
this._enableAmbientLighting(colorSchemePref === "auto");
this._setColorScheme(colorSchemePref);
AsyncPrefs.set("reader.color_scheme", colorSchemePref);
},
- _setFontType: function(newFontType) {
+ _setFontType(newFontType) {
if (this._fontType === newFontType)
return;
@@ -602,20 +593,34 @@ AboutReader.prototype = {
AsyncPrefs.set("reader.font_type", this._fontType);
},
- _setSystemUIVisibility: function(visible) {
- this._mm.sendAsyncMessage("Reader:SystemUIVisibility", { visible: visible });
+ _setToolbarVisibility(visible) {
+ let tb = this._toolbarElement;
+
+ if (visible) {
+ if (tb.style.opacity != "1") {
+ tb.removeAttribute("hidden");
+ tb.style.opacity = "1";
+ }
+ } else if (tb.style.opacity != "0") {
+ tb.addEventListener("transitionend", evt => {
+ if (tb.style.opacity == "0") {
+ tb.setAttribute("hidden", "");
+ }
+ }, { once: true });
+ tb.style.opacity = "0";
+ }
},
- _loadArticle: Task.async(function* () {
+ async _loadArticle() {
let url = this._getOriginalUrl();
this._showProgressDelayed();
let article;
if (this._articlePromise) {
- article = yield this._articlePromise;
+ article = await this._articlePromise;
} else {
try {
- article = yield this._getArticle(url);
+ article = await this._getArticle(url);
} catch (e) {
if (e && e.newURL) {
let readerURL = "about:reader?url=" + encodeURIComponent(e.newURL);
@@ -638,47 +643,37 @@ AboutReader.prototype = {
}
this._showContent(article);
- }),
-
- _getArticle: function(url) {
- return new Promise((resolve, reject) => {
- let listener = (message) => {
- this._mm.removeMessageListener("Reader:ArticleData", listener);
- if (message.data.newURL) {
- reject({ newURL: message.data.newURL });
- return;
- }
- resolve(message.data.article);
- };
- this._mm.addMessageListener("Reader:ArticleData", listener);
- this._mm.sendAsyncMessage("Reader:ArticleGet", { url: url });
- });
},
- _requestFavicon: function() {
- let handleFaviconReturn = (message) => {
- this._mm.removeMessageListener("Reader:FaviconReturn", handleFaviconReturn);
- this._loadFavicon(message.data.url, message.data.faviconUrl);
- };
+ _getArticle(url) {
+ return ReaderMode.downloadAndParseDocument(url);
+ },
- this._mm.addMessageListener("Reader:FaviconReturn", handleFaviconReturn);
- this._mm.sendAsyncMessage("Reader:FaviconRequest", { url: this._article.url });
+ _requestFavicon() {
+ let faviconUrl = PlacesUtils.promiseFaviconLinkUrl(this._article.url);
+ var self = this;
+ faviconUrl.then(function onResolution(favicon) {
+ self._loadFavicon(self._article.url, favicon.path.replace(/^favicon:/, ""));
+ },
+ function onRejection(reason) {
+ Cu.reportError("Error requesting favicon URL for about:reader content: " + reason);
+ }).catch(Cu.reportError);
},
- _loadFavicon: function(url, faviconUrl) {
+ _loadFavicon(url, faviconUrl) {
if (this._article.url !== url)
return;
let doc = this._doc;
- let link = doc.createElement('link');
- link.rel = 'shortcut icon';
+ let link = doc.createElement("link");
+ link.rel = "shortcut icon";
link.href = faviconUrl;
- doc.getElementsByTagName('head')[0].appendChild(link);
+ doc.getElementsByTagName("head")[0].appendChild(link);
},
- _updateImageMargins: function() {
+ _updateImageMargins() {
let windowWidth = this._win.innerWidth;
let bodyWidth = this._doc.body.clientWidth;
@@ -691,7 +686,7 @@ AboutReader.prototype = {
}
// If the image is at least half as wide as the body, center it on desktop.
- if (img.naturalWidth >= bodyWidth/2) {
+ if (img.naturalWidth >= bodyWidth / 2) {
img.setAttribute("moz-reader-center", true);
} else {
img.removeAttribute("moz-reader-center");
@@ -713,30 +708,32 @@ AboutReader.prototype = {
},
_maybeSetTextDirection: function Read_maybeSetTextDirection(article) {
- if (!article.dir)
- return;
+ if (article.dir) {
+ // Set "dir" attribute on content
+ this._contentElement.setAttribute("dir", article.dir);
+ this._headerElement.setAttribute("dir", article.dir);
+
+ // The native locale could be set differently than the article's text direction.
+ var localeDirection = Services.locale.isAppLocaleRTL ? "rtl" : "ltr";
+ this._readTimeElement.setAttribute("dir", localeDirection);
+ this._readTimeElement.style.textAlign = article.dir == "rtl" ? "right" : "left";
+ }
+ },
- // Set "dir" attribute on content
- this._contentElement.setAttribute("dir", article.dir);
- this._headerElement.setAttribute("dir", article.dir);
- },
-
- _fixLocalLinks() {
- // We need to do this because preprocessing the content through nsIParserUtils
- // gives back a DOM with a <base> element. That influences how these URLs get
- // resolved, making them no longer match the document URI (which is
- // about:reader?url=...). To fix this, make all the hash URIs absolute. This
- // is hacky, but the alternative of removing the base element has potential
- // security implications if Readability has not successfully made all the URLs
- // absolute, so we pick just fixing these in-document links explicitly.
- let localLinks = this._contentElement.querySelectorAll("a[href^='#']");
- for (let localLink of localLinks) {
- // Have to get the attribute because .href provides an absolute URI.
- localLink.href = this._doc.documentURI + localLink.getAttribute("href");
+ _formatReadTime(slowEstimate, fastEstimate) {
+ let displayStringKey = "aboutReader.estimatedReadTimeRange1";
+
+ // only show one reading estimate when they are the same value
+ if (slowEstimate == fastEstimate) {
+ displayStringKey = "aboutReader.estimatedReadTimeValue1";
}
+
+ return PluralForm.get(slowEstimate, gStrings.GetStringFromName(displayStringKey))
+ .replace("#1", fastEstimate)
+ .replace("#2", slowEstimate);
},
- _showError: function() {
+ _showError() {
this._headerElement.style.display = "none";
this._contentElement.style.display = "none";
@@ -746,11 +743,16 @@ AboutReader.prototype = {
this._doc.title = errorMessage;
+ this._doc.documentElement.dataset.isError = true;
+
this._error = true;
+
+ this._doc.dispatchEvent(
+ new this._win.CustomEvent("AboutReaderContentError", { bubbles: true, cancelable: false }));
},
// This function is the JS version of Java's StringUtils.stripCommonSubdomains.
- _stripHost: function(host) {
+ _stripHost(host) {
if (!host)
return host;
@@ -766,17 +768,18 @@ AboutReader.prototype = {
return host.substring(start);
},
- _showContent: function(article) {
+ _showContent(article) {
this._messageElement.style.display = "none";
this._article = article;
this._domainElement.href = article.url;
- let articleUri = Services.io.newURI(article.url, null, null);
+ let articleUri = Services.io.newURI(article.url);
this._domainElement.textContent = this._stripHost(articleUri.host);
this._creditsElement.textContent = article.byline;
this._titleElement.textContent = article.title;
+ this._readTimeElement.textContent = this._formatReadTime(article.readingTimeMinsSlow, article.readingTimeMinsFast);
this._doc.title = article.title;
this._headerElement.style.display = "block";
@@ -787,8 +790,8 @@ AboutReader.prototype = {
false, articleUri, this._contentElement);
this._contentElement.innerHTML = "";
this._contentElement.appendChild(contentFragment);
- this._fixLocalLinks();
this._maybeSetTextDirection(article);
+ this._foundLanguage(article.language);
this._contentElement.style.display = "block";
this._updateImageMargins();
@@ -804,13 +807,13 @@ AboutReader.prototype = {
new this._win.CustomEvent("AboutReaderContentReady", { bubbles: true, cancelable: false }));
},
- _hideContent: function() {
+ _hideContent() {
this._headerElement.style.display = "none";
this._contentElement.style.display = "none";
},
- _showProgressDelayed: function() {
- this._win.setTimeout(function() {
+ _showProgressDelayed() {
+ this._win.setTimeout(() => {
// No need to show progress if the article has been loaded,
// if the window has been unloaded, or if there was an error
// trying to load the article.
@@ -823,20 +826,20 @@ AboutReader.prototype = {
this._messageElement.textContent = gStrings.GetStringFromName("aboutReader.loading2");
this._messageElement.style.display = "block";
- }.bind(this), 300);
+ }, 300);
},
/**
* Returns the original article URL for this about:reader view.
*/
- _getOriginalUrl: function(win) {
+ _getOriginalUrl(win) {
let url = win ? win.location.href : this._win.location.href;
return ReaderMode.getOriginalUrl(url) || url;
},
- _setupSegmentedButton: function(id, options, initialValue, callback) {
+ _setupSegmentedButton(id, options, initialValue, callback) {
let doc = this._doc;
- let segmentedButton = doc.getElementById(id);
+ let segmentedButton = doc.getElementsByClassName(id)[0];
for (let i = 0; i < options.length; i++) {
let option = options[i];
@@ -867,10 +870,6 @@ AboutReader.prototype = {
aEvent.stopPropagation();
- // Just pass the ID of the button as an extra and hope the ID doesn't change
- // unless the context changes
- UITelemetry.addEvent("action.1", "button", null, id);
-
let items = segmentedButton.children;
for (let j = items.length - 1; j >= 0; j--) {
items[j].classList.remove("selected");
@@ -878,19 +877,19 @@ AboutReader.prototype = {
item.classList.add("selected");
callback(option.value);
- }.bind(this), true);
+ }, true);
if (option.value === initialValue)
item.classList.add("selected");
}
},
- _setupButton: function(id, callback, titleEntity, textEntity) {
+ _setupButton(id, callback, titleEntity, textEntity) {
if (titleEntity) {
this._setButtonTip(id, titleEntity);
}
- let button = this._doc.getElementById(id);
+ let button = this._doc.getElementsByClassName(id)[0];
if (textEntity) {
button.textContent = gStrings.GetStringFromName(textEntity);
}
@@ -910,17 +909,17 @@ AboutReader.prototype = {
* and dynamically as button state changes.
* @param Localizable string providing UI element usage tip.
*/
- _setButtonTip: function(id, titleEntity) {
- let button = this._doc.getElementById(id);
+ _setButtonTip(id, titleEntity) {
+ let button = this._doc.getElementsByClassName(id)[0];
button.setAttribute("title", gStrings.GetStringFromName(titleEntity));
},
- _setupStyleDropdown: function() {
- let dropdownToggle = this._doc.querySelector("#style-dropdown .dropdown-toggle");
+ _setupStyleDropdown() {
+ let dropdownToggle = this._doc.querySelector(".style-dropdown .dropdown-toggle");
dropdownToggle.setAttribute("title", gStrings.GetStringFromName("aboutReader.toolbar.typeControls"));
},
- _updatePopupPosition: function(dropdown) {
+ _updatePopupPosition(dropdown) {
let dropdownToggle = dropdown.querySelector(".dropdown-toggle");
let dropdownPopup = dropdown.querySelector(".dropdown-popup");
@@ -931,8 +930,8 @@ AboutReader.prototype = {
dropdownPopup.style.top = popupTop + "px";
},
- _toggleDropdownClicked: function(event) {
- let dropdown = event.target.closest('.dropdown');
+ _toggleDropdownClicked(event) {
+ let dropdown = event.target.closest(".dropdown");
if (!dropdown)
return;
@@ -952,16 +951,13 @@ AboutReader.prototype = {
/*
* If the ReaderView banner font-dropdown is closed, open it.
*/
- _openDropdown: function(dropdown) {
+ _openDropdown(dropdown) {
if (dropdown.classList.contains("open")) {
return;
}
this._closeDropdowns();
-
- // Trigger BackPressListener initialization in Android.
dropdown.classList.add("open");
- this._mm.sendAsyncMessage("Reader:DropdownOpened", this.viewId);
},
/*
@@ -969,7 +965,7 @@ AboutReader.prototype = {
* dropdowns because the page is scrolling, allow popups to stay open with
* the keep-open class.
*/
- _closeDropdowns: function(scrolling) {
+ _closeDropdowns(scrolling) {
let selector = ".dropdown.open";
if (scrolling) {
selector += ":not(.keep-open)";
@@ -979,10 +975,17 @@ AboutReader.prototype = {
for (let dropdown of openDropdowns) {
dropdown.classList.remove("open");
}
+ },
- // Trigger BackPressListener cleanup in Android.
- if (openDropdowns.length) {
- this._mm.sendAsyncMessage("Reader:DropdownClosed", this.viewId);
+ /*
+ * Override link handling for same-page references so we don't exit Reader View.
+ */
+ _linkClicked(event) {
+ var originalUrl = Services.io.newURI(this._getOriginalUrl(), null, null);
+ var targetUrl = Services.io.newURI(event.target.href, null, null);
+ if (originalUrl.specIgnoringRef == targetUrl.specIgnoringRef) {
+ event.preventDefault();
+ this._goToReference(targetUrl.ref);
}
},
diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js
index 853649775..38f59c4ea 100644
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -1017,46 +1017,6 @@
}
},
- readScript: function (node) {
- while (this.currentChar < this.html.length) {
- var c = this.nextChar();
- var nextC = this.peekNext();
- if (c === "<") {
- if (nextC === "!" || nextC === "?") {
- // We're still before the ! or ? that is starting this comment:
- this.currentChar++;
- node.appendChild(this.discardNextComment());
- continue;
- }
- if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
- // Go back before the '<' so we find the end tag.
- this.currentChar--;
- // Done with this script tag, the caller will close:
- return;
- }
- }
- // Either c wasn't a '<' or it was but we couldn't find either a comment
- // or a closing script tag, so we should just parse as text until the next one
- // comes along:
-
- var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
- var textNode = haveTextNode ? node.lastChild : new Text();
- var n = this.html.indexOf("<", this.currentChar);
- // Decrement this to include the current character *afterwards* so we don't get stuck
- // looking for the same < all the time.
- this.currentChar--;
- if (n === -1) {
- textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
- this.currentChar = this.html.length;
- } else {
- textNode.innerHTML += this.html.substring(this.currentChar, n);
- this.currentChar = n;
- }
- if (!haveTextNode)
- node.appendChild(textNode);
- }
- },
-
discardNextComment: function() {
if (this.match("--")) {
this.discardTo("-->");
@@ -1131,11 +1091,7 @@
// If this isn't a void Element, read its child nodes
if (!closed) {
- if (localName == "script") {
- this.readScript(node);
- } else {
- this.readChildren(node);
- }
+ this.readChildren(node);
var closingTag = "</" + localName + ">";
if (!this.match(closingTag)) {
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 491461a8e..04949dc61 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -38,32 +38,22 @@ function Readability(uri, doc, options) {
this._uri = uri;
this._doc = doc;
- this._biggestFrame = false;
+ this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
- // Configureable options
+ // Configurable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
- this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+ this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
+ this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
- // The list of pages we've parsed in this call of readability,
- // for autopaging. As a key store for easier searching.
- this._parsedPages = {};
-
- // A list of the ETag headers of pages we've parsed, in case they happen to match,
- // we'll know it's a duplicate.
- this._pageETags = {};
-
- // Make an AJAX request for each page and append it to the document.
- this._curPageNum = 1;
-
var logEl;
// Control whether log messages are sent to the console
@@ -82,12 +72,12 @@ function Readability(uri, doc, options) {
return rv + elDesc;
};
this.log = function () {
- if (typeof dump !== undefined) {
+ if (typeof dump !== "undefined") {
var msg = Array.prototype.map.call(arguments, function(x) {
return (x && x.nodeName) ? logEl(x) : x;
}).join(" ");
dump("Reader: (Readability) " + msg + "\n");
- } else if (typeof console !== undefined) {
+ } else if (typeof console !== "undefined") {
var args = ["Reader: (Readability) "].concat(arguments);
console.log.apply(console, args);
}
@@ -109,20 +99,19 @@ Readability.prototype = {
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
- // The maximum number of pages to loop through before we call
- // it quits and just show a link.
- DEFAULT_MAX_PAGES: 5,
-
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
+ // The default number of words an article must have in order to return a result
+ DEFAULT_WORD_THRESHOLD: 500,
+
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
+ unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
- negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+ negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
@@ -138,6 +127,13 @@ Readability.prototype = {
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
+ PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
+
+ DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+
+ // These are the classes that readability sets itself.
+ CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
+
/**
* Run any post-process modifications to article content as necessary.
*
@@ -147,6 +143,9 @@ Readability.prototype = {
_postProcessContent: function(articleContent) {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);
+
+ // Remove classes.
+ this._cleanClasses(articleContent);
},
/**
@@ -155,8 +154,8 @@ Readability.prototype = {
*
* If function is not passed, removes all the nodes in node list.
*
- * @param NodeList nodeList The no
- * @param Function filterFn
+ * @param NodeList nodeList The nodes to operate on
+ * @param Function filterFn the function to use as a filter
* @return void
*/
_removeNodes: function(nodeList, filterFn) {
@@ -172,6 +171,20 @@ Readability.prototype = {
},
/**
+ * Iterates over a NodeList, and calls _setNodeTag for each node.
+ *
+ * @param NodeList nodeList The nodes to operate on
+ * @param String newTagName the new tag name to use
+ * @return void
+ */
+ _replaceNodeTags: function(nodeList, newTagName) {
+ for (var i = nodeList.length - 1; i >= 0; i--) {
+ var node = nodeList[i];
+ this._setNodeTag(node, newTagName);
+ }
+ },
+
+ /**
* Iterate over a NodeList, which doesn't natively fully implement the Array
* interface.
*
@@ -180,10 +193,9 @@ Readability.prototype = {
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
- * @param Boolean backward Whether to use backward iteration.
* @return void
*/
- _forEachNode: function(nodeList, fn, backward) {
+ _forEachNode: function(nodeList, fn) {
Array.prototype.forEach.call(nodeList, fn, this);
},
@@ -228,6 +240,34 @@ Readability.prototype = {
},
/**
+ * Removes the class="" attribute from every element in the given
+ * subtree, except those that match CLASSES_TO_PRESERVE and
+ * the classesToPreserve array from the options object.
+ *
+ * @param Element
+ * @return void
+ */
+ _cleanClasses: function(node) {
+ var classesToPreserve = this._classesToPreserve;
+ var className = (node.getAttribute("class") || "")
+ .split(/\s+/)
+ .filter(function(cls) {
+ return classesToPreserve.indexOf(cls) != -1;
+ })
+ .join(" ");
+
+ if (className) {
+ node.setAttribute("class", className);
+ } else {
+ node.removeAttribute("class");
+ }
+
+ for (node = node.firstElementChild; node; node = node.nextElementSibling) {
+ this._cleanClasses(node);
+ }
+ },
+
+ /**
* Converts each <a> and <img> uri in the given element to an absolute URI,
* ignoring #ref URIs.
*
@@ -307,11 +347,20 @@ Readability.prototype = {
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
} catch (e) {/* ignore exceptions setting the title. */}
- if (curTitle.match(/ [\|\-] /)) {
- curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
+ var titleHadHierarchicalSeparators = false;
+ function wordCount(str) {
+ return str.split(/\s+/).length;
+ }
+
+ // If there's a separator in the title, first remove the final part
+ if ((/ [\|\-\\\/>»] /).test(curTitle)) {
+ titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
+ curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
- if (curTitle.split(' ').length < 3)
- curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
+ // If the resulting title is too short (3 words or fewer), remove
+ // the first part instead:
+ if (wordCount(curTitle) < 3)
+ curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
} else if (curTitle.indexOf(': ') !== -1) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
@@ -328,8 +377,13 @@ Readability.prototype = {
curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
// If the title is now too short, try the first colon instead:
- if (curTitle.split(' ').length < 3)
+ if (wordCount(curTitle) < 3) {
curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
+ // But if we have too many words before the colon there's something weird
+ // with the titles and the H tags so let's just use the original title instead
+ } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
+ curTitle = origTitle;
+ }
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
var hOnes = doc.getElementsByTagName('h1');
@@ -339,9 +393,16 @@ Readability.prototype = {
}
curTitle = curTitle.trim();
-
- if (curTitle.split(' ').length <= 4)
+ // If we now have 4 words or fewer as our title, and either no
+ // 'hierarchical' separators (\, /, > or ») were found in the original
+ // title or we decreased the number of words by more than 1 word, use
+ // the original title.
+ var curTitleWordCount = wordCount(curTitle);
+ if (curTitleWordCount <= 4 &&
+ (!titleHadHierarchicalSeparators ||
+ curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
curTitle = origTitle;
+ }
return curTitle;
},
@@ -362,9 +423,7 @@ Readability.prototype = {
this._replaceBrs(doc.body);
}
- this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
- this._setNodeTag(fontNode, "SPAN");
- });
+ this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN");
},
/**
@@ -464,19 +523,49 @@ Readability.prototype = {
_prepArticle: function(articleContent) {
this._cleanStyles(articleContent);
+ // Check for data tables before we continue, to avoid removing items in
+ // those tables, which will often be isolated even though they're
+ // visually linked to other content-ful elements (text, images, etc.).
+ this._markDataTables(articleContent);
+
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
+ this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
- // If there is only one h2, they are probably using it as a header
- // and not a subheader, so remove it since we already have a header.
- if (articleContent.getElementsByTagName('h2').length === 1)
- this._clean(articleContent, "h2");
+ // Clean out elements have "share" in their id/class combinations from final top candidates,
+ // which means we don't remove the top candidates even they have "share".
+ this._forEachNode(articleContent.children, function(topCandidate) {
+ this._cleanMatchedNodes(topCandidate, /share/);
+ });
+
+ // If there is only one h2 and its text content substantially equals article title,
+ // they are probably using it as a header and not a subheader,
+ // so remove it since we already extract the title separately.
+ var h2 = articleContent.getElementsByTagName('h2');
+ if (h2.length === 1) {
+ var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
+ if (Math.abs(lengthSimilarRate) < 0.5) {
+ var titlesMatch = false;
+ if (lengthSimilarRate > 0) {
+ titlesMatch = h2[0].textContent.includes(this._articleTitle);
+ } else {
+ titlesMatch = this._articleTitle.includes(h2[0].textContent);
+ }
+ if (titlesMatch) {
+ this._clean(articleContent, "h2");
+ }
+ }
+ }
this._clean(articleContent, "iframe");
+ this._clean(articleContent, "input");
+ this._clean(articleContent, "textarea");
+ this._clean(articleContent, "select");
+ this._clean(articleContent, "button");
this._cleanHeaders(articleContent);
// Do these last as the previous stuff may have removed junk
@@ -662,9 +751,6 @@ Readability.prototype = {
var pageCacheHtml = page.innerHTML;
- // Check if any "dir" is set on the toplevel document element
- this._articleDir = doc.documentElement.getAttribute("dir");
-
while (true) {
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
@@ -695,6 +781,15 @@ Readability.prototype = {
}
}
+ // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
+ if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
+ node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
+ node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
+ this._isElementWithoutContent(node)) {
+ node = this._removeAndGetNext(node);
+ continue;
+ }
+
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
elementsToScore.push(node);
}
@@ -709,13 +804,14 @@ Readability.prototype = {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
+ elementsToScore.push(node);
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
} else {
// EXPERIMENTAL
this._forEachNode(node.childNodes, function(childNode) {
- if (childNode.nodeType === Node.TEXT_NODE) {
+ if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
@@ -812,6 +908,7 @@ Readability.prototype = {
var topCandidate = topCandidates[0] || null;
var neededToCreateTopCandidate = false;
+ var parentOfTopCandidate;
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
@@ -831,6 +928,33 @@ Readability.prototype = {
this._initializeNode(topCandidate);
} else if (topCandidate) {
+ // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
+ // and whose scores are quite closed with current `topCandidate` node.
+ var alternativeCandidateAncestors = [];
+ for (var i = 1; i < topCandidates.length; i++) {
+ if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
+ alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
+ }
+ }
+ var MINIMUM_TOPCANDIDATES = 3;
+ if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
+ parentOfTopCandidate = topCandidate.parentNode;
+ while (parentOfTopCandidate.tagName !== "BODY") {
+ var listsContainingThisAncestor = 0;
+ for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
+ listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
+ }
+ if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
+ topCandidate = parentOfTopCandidate;
+ break;
+ }
+ parentOfTopCandidate = parentOfTopCandidate.parentNode;
+ }
+ }
+ if (!topCandidate.readability) {
+ this._initializeNode(topCandidate);
+ }
+
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
@@ -838,11 +962,15 @@ Readability.prototype = {
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
- var parentOfTopCandidate = topCandidate.parentNode;
+ parentOfTopCandidate = topCandidate.parentNode;
var lastScore = topCandidate.readability.contentScore;
// The scores shouldn't get too low.
var scoreThreshold = lastScore / 3;
- while (parentOfTopCandidate && parentOfTopCandidate.readability) {
+ while (parentOfTopCandidate.tagName !== "BODY") {
+ if (!parentOfTopCandidate.readability) {
+ parentOfTopCandidate = parentOfTopCandidate.parentNode;
+ continue;
+ }
var parentScore = parentOfTopCandidate.readability.contentScore;
if (parentScore < scoreThreshold)
break;
@@ -854,6 +982,17 @@ Readability.prototype = {
lastScore = parentOfTopCandidate.readability.contentScore;
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
+
+ // If the top candidate is the only child, use parent instead. This will help sibling
+ // joining logic when adjacent content is actually located in parent's sibling node.
+ parentOfTopCandidate = topCandidate.parentNode;
+ while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
+ topCandidate = parentOfTopCandidate;
+ parentOfTopCandidate = topCandidate.parentNode;
+ }
+ if (!topCandidate.readability) {
+ this._initializeNode(topCandidate);
+ }
}
// Now that we have the top candidate, look through its siblings for content
@@ -864,7 +1003,9 @@ Readability.prototype = {
articleContent.id = "readability-content";
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
- var siblings = topCandidate.parentNode.children;
+ // Keep potential top candidate's parent node to try to get text direction of it later.
+ parentOfTopCandidate = topCandidate.parentNode;
+ var siblings = parentOfTopCandidate.children;
for (var s = 0, sl = siblings.length; s < sl; s++) {
var sibling = siblings[s];
@@ -927,24 +1068,22 @@ Readability.prototype = {
if (this._debug)
this.log("Article content post-prep: " + articleContent.innerHTML);
- if (this._curPageNum === 1) {
- if (neededToCreateTopCandidate) {
- // We already created a fake div thing, and there wouldn't have been any siblings left
- // for the previous loop, so there's no point trying to create a new div, and then
- // move all the children over. Just assign IDs and class names here. No need to append
- // because that already happened anyway.
- topCandidate.id = "readability-page-1";
- topCandidate.className = "page";
- } else {
- var div = doc.createElement("DIV");
- div.id = "readability-page-1";
- div.className = "page";
- var children = articleContent.childNodes;
- while (children.length) {
- div.appendChild(children[0]);
- }
- articleContent.appendChild(div);
+ if (neededToCreateTopCandidate) {
+ // We already created a fake div thing, and there wouldn't have been any siblings left
+ // for the previous loop, so there's no point trying to create a new div, and then
+ // move all the children over. Just assign IDs and class names here. No need to append
+ // because that already happened anyway.
+ topCandidate.id = "readability-page-1";
+ topCandidate.className = "page";
+ } else {
+ var div = doc.createElement("DIV");
+ div.id = "readability-page-1";
+ div.className = "page";
+ var children = articleContent.childNodes;
+ while (children.length) {
+ div.appendChild(children[0]);
}
+ articleContent.appendChild(div);
}
if (this._debug)
@@ -955,7 +1094,7 @@ Readability.prototype = {
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
- if (this._getInnerText(articleContent, true).length < 500) {
+ if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
@@ -968,6 +1107,18 @@ Readability.prototype = {
return null;
}
} else {
+ // Find out text direction from ancestors of final top candidate.
+ var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
+ this._someNode(ancestors, function(ancestor) {
+ if (!ancestor.tagName)
+ return false;
+ var articleDir = ancestor.getAttribute("dir");
+ if (articleDir) {
+ this._articleDir = articleDir;
+ return true;
+ }
+ return false;
+ });
return articleContent;
}
}
@@ -1044,12 +1195,15 @@ Readability.prototype = {
metadata.excerpt = values["twitter:description"];
}
- if ("og:title" in values) {
- // Use facebook open graph title.
- metadata.title = values["og:title"];
- } else if ("twitter:title" in values) {
- // Use twitter cards title.
- metadata.title = values["twitter:title"];
+ metadata.title = this._getArticleTitle();
+ if (!metadata.title) {
+ if ("og:title" in values) {
+ // Use facebook open graph title.
+ metadata.title = values["og:title"];
+ } else if ("twitter:title" in values) {
+ // Use twitter cards title.
+ metadata.title = values["twitter:title"];
+ }
}
return metadata;
@@ -1089,6 +1243,13 @@ Readability.prototype = {
});
},
+ _isElementWithoutContent: function(node) {
+ return node.nodeType === Node.ELEMENT_NODE &&
+ node.textContent.trim().length == 0 &&
+ (node.children.length == 0 ||
+ node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
+ },
+
/**
* Determine whether element has any children block level elements.
*
@@ -1139,26 +1300,25 @@ Readability.prototype = {
* @return void
**/
_cleanStyles: function(e) {
- e = e || this._doc;
- if (!e)
+ if (!e || e.tagName.toLowerCase() === 'svg')
return;
- var cur = e.firstChild;
- // Remove any root styles, if we're able.
- if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
- e.removeAttribute('style');
-
- // Go until there are no more child nodes
- while (cur !== null) {
- if (cur.nodeType === cur.ELEMENT_NODE) {
- // Remove style attribute(s) :
- if (cur.className !== "readability-styled")
- cur.removeAttribute("style");
+ if (e.className !== 'readability-styled') {
+ // Remove `style` and deprecated presentational attributes
+ for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+ e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
+ }
- this._cleanStyles(cur);
+ if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+ e.removeAttribute('width');
+ e.removeAttribute('height');
}
+ }
- cur = cur.nextSibling;
+ var cur = e.firstElementChild;
+ while (cur !== null) {
+ this._cleanStyles(cur);
+ cur = cur.nextElementSibling;
}
},
@@ -1185,368 +1345,6 @@ Readability.prototype = {
},
/**
- * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
- *
- * @author Dan Lacy
- * @return string the base url
- **/
- _findBaseUrl: function() {
- var uri = this._uri;
- var noUrlParams = uri.path.split("?")[0];
- var urlSlashes = noUrlParams.split("/").reverse();
- var cleanedSegments = [];
- var possibleType = "";
-
- for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
- var segment = urlSlashes[i];
-
- // Split off and save anything that looks like a file type.
- if (segment.indexOf(".") !== -1) {
- possibleType = segment.split(".")[1];
-
- // If the type isn't alpha-only, it's probably not actually a file extension.
- if (!possibleType.match(/[^a-zA-Z]/))
- segment = segment.split(".")[0];
- }
-
- // EW-CMS specific segment replacement. Ugly.
- // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
- if (segment.indexOf(',00') !== -1)
- segment = segment.replace(',00', '');
-
- // If our first or second segment has anything looking like a page number, remove it.
- if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
- segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
-
- var del = false;
-
- // If this is purely a number, and it's the first or second segment,
- // it's probably a page number. Remove it.
- if (i < 2 && segment.match(/^\d{1,2}$/))
- del = true;
-
- // If this is the first segment and it's just "index", remove it.
- if (i === 0 && segment.toLowerCase() === "index")
- del = true;
-
- // If our first or second segment is smaller than 3 characters,
- // and the first segment was purely alphas, remove it.
- if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
- del = true;
-
- // If it's not marked for deletion, push it to cleanedSegments.
- if (!del)
- cleanedSegments.push(segment);
- }
-
- // This is our final, cleaned, base article URL.
- return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
- },
-
- /**
- * Look for any paging links that may occur within the document.
- *
- * @param body
- * @return object (array)
- **/
- _findNextPageLink: function(elem) {
- var uri = this._uri;
- var possiblePages = {};
- var allLinks = elem.getElementsByTagName('a');
- var articleBaseUrl = this._findBaseUrl();
-
- // Loop through all links, looking for hints that they may be next-page links.
- // Things like having "page" in their textContent, className or id, or being a child
- // of a node with a page-y className or id.
- //
- // Also possible: levenshtein distance? longest common subsequence?
- //
- // After we do that, assign each page a score, and
- for (var i = 0, il = allLinks.length; i < il; i += 1) {
- var link = allLinks[i];
- var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
-
- // If we've already seen this page, ignore it.
- if (linkHref === "" ||
- linkHref === articleBaseUrl ||
- linkHref === uri.spec ||
- linkHref in this._parsedPages) {
- continue;
- }
-
- // If it's on a different domain, skip it.
- if (uri.host !== linkHref.split(/\/+/g)[1])
- continue;
-
- var linkText = this._getInnerText(link);
-
- // If the linkText looks like it's not the next page, skip it.
- if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
- continue;
-
- // If the leftovers of the URL after removing the base URL don't contain
- // any digits, it's certainly not a next page link.
- var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
- if (!linkHrefLeftover.match(/\d/))
- continue;
-
- if (!(linkHref in possiblePages)) {
- possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
- } else {
- possiblePages[linkHref].linkText += ' | ' + linkText;
- }
-
- var linkObj = possiblePages[linkHref];
-
- // If the articleBaseUrl isn't part of this URL, penalize this link. It could
- // still be the link, but the odds are lower.
- // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
- if (linkHref.indexOf(articleBaseUrl) !== 0)
- linkObj.score -= 25;
-
- var linkData = linkText + ' ' + link.className + ' ' + link.id;
- if (linkData.match(this.REGEXPS.nextLink))
- linkObj.score += 50;
-
- if (linkData.match(/pag(e|ing|inat)/i))
- linkObj.score += 25;
-
- if (linkData.match(/(first|last)/i)) {
- // -65 is enough to negate any bonuses gotten from a > or » in the text,
- // If we already matched on "next", last is probably fine.
- // If we didn't, then it's bad. Penalize.
- if (!linkObj.linkText.match(this.REGEXPS.nextLink))
- linkObj.score -= 65;
- }
-
- if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
- linkObj.score -= 50;
-
- if (linkData.match(this.REGEXPS.prevLink))
- linkObj.score -= 200;
-
- // If a parentNode contains page or paging or paginat
- var parentNode = link.parentNode;
- var positiveNodeMatch = false;
- var negativeNodeMatch = false;
-
- while (parentNode) {
- var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
-
- if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
- positiveNodeMatch = true;
- linkObj.score += 25;
- }
-
- if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
- // If this is just something like "footer", give it a negative.
- // If it's something like "body-and-footer", leave it be.
- if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
- linkObj.score -= 25;
- negativeNodeMatch = true;
- }
- }
-
- parentNode = parentNode.parentNode;
- }
-
- // If the URL looks like it has paging in it, add to the score.
- // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
- if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
- linkObj.score += 25;
-
- // If the URL contains negative values, give a slight decrease.
- if (linkHref.match(this.REGEXPS.extraneous))
- linkObj.score -= 15;
-
- /**
- * Minor punishment to anything that doesn't match our current URL.
- * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
- * Dan, can you show me a counterexample where this is necessary?
- * if (linkHref.indexOf(window.location.href) !== 0) {
- * linkObj.score -= 1;
- * }
- **/
-
- // If the link text can be parsed as a number, give it a minor bonus, with a slight
- // bias towards lower numbered pages. This is so that pages that might not have 'next'
- // in their text can still get scored, and sorted properly by score.
- var linkTextAsNumber = parseInt(linkText, 10);
- if (linkTextAsNumber) {
- // Punish 1 since we're either already there, or it's probably
- // before what we want anyways.
- if (linkTextAsNumber === 1) {
- linkObj.score -= 10;
- } else {
- linkObj.score += Math.max(0, 10 - linkTextAsNumber);
- }
- }
- }
-
- // Loop thrugh all of our possible pages from above and find our top
- // candidate for the next page URL. Require at least a score of 50, which
- // is a relatively high confidence that this page is the next link.
- var topPage = null;
- for (var page in possiblePages) {
- if (possiblePages.hasOwnProperty(page)) {
- if (possiblePages[page].score >= 50 &&
- (!topPage || topPage.score < possiblePages[page].score))
- topPage = possiblePages[page];
- }
- }
-
- var nextHref = null;
- if (topPage) {
- nextHref = topPage.href.replace(/\/$/, '');
-
- this.log('NEXT PAGE IS ' + nextHref);
- this._parsedPages[nextHref] = true;
- }
- return nextHref;
- },
-
- _successfulRequest: function(request) {
- return (request.status >= 200 && request.status < 300) ||
- request.status === 304 ||
- (request.status === 0 && request.responseText);
- },
-
- _ajax: function(url, options) {
- var request = new XMLHttpRequest();
-
- function respondToReadyState(readyState) {
- if (request.readyState === 4) {
- if (this._successfulRequest(request)) {
- if (options.success)
- options.success(request);
- } else if (options.error) {
- options.error(request);
- }
- }
- }
-
- if (typeof options === 'undefined')
- options = {};
-
- request.onreadystatechange = respondToReadyState;
-
- request.open('get', url, true);
- request.setRequestHeader('Accept', 'text/html');
-
- try {
- request.send(options.postBody);
- } catch (e) {
- if (options.error)
- options.error();
- }
-
- return request;
- },
-
- _appendNextPage: function(nextPageLink) {
- var doc = this._doc;
- this._curPageNum += 1;
-
- var articlePage = doc.createElement("DIV");
- articlePage.id = 'readability-page-' + this._curPageNum;
- articlePage.className = 'page';
- articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
-
- doc.getElementById("readability-content").appendChild(articlePage);
-
- if (this._curPageNum > this._maxPages) {
- var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
- articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
- return;
- }
-
- // Now that we've built the article page DOM element, get the page content
- // asynchronously and load the cleaned content into the div we created for it.
- (function(pageUrl, thisPage) {
- this._ajax(pageUrl, {
- success: function(r) {
-
- // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
- var eTag = r.getResponseHeader('ETag');
- if (eTag) {
- if (eTag in this._pageETags) {
- this.log("Exact duplicate page found via ETag. Aborting.");
- articlePage.style.display = 'none';
- return;
- }
- this._pageETags[eTag] = 1;
- }
-
- // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
- var page = doc.createElement("DIV");
-
- // Do some preprocessing to our HTML to make it ready for appending.
- // - Remove any script tags. Swap and reswap newlines with a unicode
- // character because multiline regex doesn't work in javascript.
- // - Turn any noscript tags into divs so that we can parse them. This
- // allows us to find any next page links hidden via javascript.
- // - Turn all double br's into p's - was handled by prepDocument in the original view.
- // Maybe in the future abstract out prepDocument to work for both the original document
- // and AJAX-added pages.
- var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
- responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
-
- page.innerHTML = responseHtml;
- this._replaceBrs(page);
-
- // Reset all flags for the next page, as they will search through it and
- // disable as necessary at the end of grabArticle.
- this._flags = 0x1 | 0x2 | 0x4;
-
- var secondNextPageLink = this._findNextPageLink(page);
-
- // NOTE: if we end up supporting _appendNextPage(), we'll need to
- // change this call to be async
- var content = this._grabArticle(page);
-
- if (!content) {
- this.log("No content found in page to append. Aborting.");
- return;
- }
-
- // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
- // Compare it against all of the the previous document's we've gotten. If the previous
- // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
- var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
- if (firstP && firstP.innerHTML.length > 100) {
- for (var i = 1; i <= this._curPageNum; i += 1) {
- var rPage = doc.getElementById('readability-page-' + i);
- if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
- this.log('Duplicate of page ' + i + ' - skipping.');
- articlePage.style.display = 'none';
- this._parsedPages[pageUrl] = true;
- return;
- }
- }
- }
-
- this._removeScripts(content);
-
- thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
-
- // After the page has rendered, post process the content. This delay is necessary because,
- // in webkit at least, offsetWidth is not set in time to determine image width. We have to
- // wait a little bit for reflow to finish before we can fix floating images.
- setTimeout((function() {
- this._postProcessContent(thisPage);
- }).bind(this), 500);
-
-
- if (secondNextPageLink)
- this._appendNextPage(secondNextPageLink);
- }
- });
- }).bind(this)(nextPageLink, articlePage);
- },
-
- /**
* Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
*
@@ -1617,16 +1415,17 @@ Readability.prototype = {
* @param HTMLElement node
* @param String tagName
* @param Number maxDepth
+ * @param Function filterFn a filter to invoke to determine whether this node 'counts'
* @return Boolean
*/
- _hasAncestorTag: function(node, tagName, maxDepth) {
+ _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
maxDepth = maxDepth || 3;
tagName = tagName.toUpperCase();
var depth = 0;
while (node.parentNode) {
- if (depth > maxDepth)
+ if (maxDepth > 0 && depth > maxDepth)
return false;
- if (node.parentNode.tagName === tagName)
+ if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
return true;
node = node.parentNode;
depth++;
@@ -1635,6 +1434,93 @@ Readability.prototype = {
},
/**
+ * Return an object indicating how many rows and columns this table has.
+ */
+ _getRowAndColumnCount: function(table) {
+ var rows = 0;
+ var columns = 0;
+ var trs = table.getElementsByTagName("tr");
+ for (var i = 0; i < trs.length; i++) {
+ var rowspan = trs[i].getAttribute("rowspan") || 0;
+ if (rowspan) {
+ rowspan = parseInt(rowspan, 10);
+ }
+ rows += (rowspan || 1);
+
+ // Now look for column-related info
+ var columnsInThisRow = 0;
+ var cells = trs[i].getElementsByTagName("td");
+ for (var j = 0; j < cells.length; j++) {
+ var colspan = cells[j].getAttribute("colspan") || 0;
+ if (colspan) {
+ colspan = parseInt(colspan, 10);
+ }
+ columnsInThisRow += (colspan || 1);
+ }
+ columns = Math.max(columns, columnsInThisRow);
+ }
+ return {rows: rows, columns: columns};
+ },
+
+ /**
+ * Look for 'data' (as opposed to 'layout') tables, for which we use
+ * similar checks as
+ * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+ */
+ _markDataTables: function(root) {
+ var tables = root.getElementsByTagName("table");
+ for (var i = 0; i < tables.length; i++) {
+ var table = tables[i];
+ var role = table.getAttribute("role");
+ if (role == "presentation") {
+ table._readabilityDataTable = false;
+ continue;
+ }
+ var datatable = table.getAttribute("datatable");
+ if (datatable == "0") {
+ table._readabilityDataTable = false;
+ continue;
+ }
+ var summary = table.getAttribute("summary");
+ if (summary) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ var caption = table.getElementsByTagName("caption")[0];
+ if (caption && caption.childNodes.length > 0) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ // If the table has a descendant with any of these tags, consider a data table:
+ var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
+ var descendantExists = function(tag) {
+ return !!table.getElementsByTagName(tag)[0];
+ };
+ if (dataTableDescendants.some(descendantExists)) {
+ this.log("Data table because found data-y descendant");
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ // Nested tables indicate a layout table:
+ if (table.getElementsByTagName("table")[0]) {
+ table._readabilityDataTable = false;
+ continue;
+ }
+
+ var sizeInfo = this._getRowAndColumnCount(table);
+ if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+ // Now just go by size entirely:
+ table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
+ }
+ },
+
+ /**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
*
@@ -1652,6 +1538,15 @@ Readability.prototype = {
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(e.getElementsByTagName(tag), function(node) {
+ // First check if we're in a data table, in which case don't remove us.
+ var isDataTable = function(t) {
+ return t._readabilityDataTable;
+ };
+
+ if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
+ return false;
+ }
+
var weight = this._getClassWeight(node);
var contentScore = 0;
@@ -1667,7 +1562,7 @@ Readability.prototype = {
// ominous signs, remove the element.
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
- var li = node.getElementsByTagName("li").length-100;
+ var li = node.getElementsByTagName("li").length - 100;
var input = node.getElementsByTagName("input").length;
var embedCount = 0;
@@ -1681,11 +1576,10 @@ Readability.prototype = {
var contentLength = this._getInnerText(node).length;
var haveToRemove =
- // Make an exception for elements with no p's and exactly 1 img.
- (img > p && !this._hasAncestorTag(node, "figure")) ||
+ (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
- (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
+ (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
@@ -1696,6 +1590,25 @@ Readability.prototype = {
},
/**
+ * Clean out elements whose id/class combinations match specific string.
+ *
+ * @param Element
+ * @param RegExp match id/class combination.
+ * @return void
+ **/
+ _cleanMatchedNodes: function(e, regex) {
+ var endOfSearchMarkerNode = this._getNextNode(e, true);
+ var next = this._getNextNode(e);
+ while (next && next != endOfSearchMarkerNode) {
+ if (regex.test(next.className + " " + next.id)) {
+ next = this._removeAndGetNext(next);
+ } else {
+ next = this._getNextNode(next);
+ }
+ }
+ },
+
+ /**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
* @param Element
@@ -1713,10 +1626,6 @@ Readability.prototype = {
return (this._flags & flag) > 0;
},
- _addFlag: function(flag) {
- this._flags = this._flags | flag;
- },
-
_removeFlag: function(flag) {
this._flags = this._flags & ~flag;
},
@@ -1807,20 +1716,10 @@ Readability.prototype = {
// Remove script tags from the document.
this._removeScripts(this._doc);
- // FIXME: Disabled multi-page article support for now as it
- // needs more work on infrastructure.
-
- // Make sure this document is added to the list of parsed pages first,
- // so we don't double up on the first page.
- // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
-
- // Pull out any possible next page link first.
- // var nextPageLink = this._findNextPageLink(doc.body);
-
this._prepDocument();
var metadata = this._getArticleMetadata();
- var articleTitle = metadata.title || this._getArticleTitle();
+ this._articleTitle = metadata.title;
var articleContent = this._grabArticle();
if (!articleContent)
@@ -1830,14 +1729,6 @@ Readability.prototype = {
this._postProcessContent(articleContent);
- // if (nextPageLink) {
- // // Append any additional pages after a small timeout so that people
- // // can start reading without having to wait for this to finish processing.
- // setTimeout((function() {
- // this._appendNextPage(nextPageLink);
- // }).bind(this), 500);
- // }
-
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
// the article's content.
@@ -1851,7 +1742,7 @@ Readability.prototype = {
var textContent = articleContent.textContent;
return {
uri: this._uri,
- title: articleTitle,
+ title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,
@@ -1861,3 +1752,7 @@ Readability.prototype = {
};
}
};
+
+if (typeof module === "object") {
+ module.exports = Readability;
+}
diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm
index 033a02489..e9eb83154 100644
--- a/toolkit/components/reader/ReaderMode.jsm
+++ b/toolkit/components/reader/ReaderMode.jsm
@@ -8,15 +8,18 @@ this.EXPORTED_SYMBOLS = ["ReaderMode"];
const { classes: Cc, interfaces: Ci, utils: Cu } = Components;
-// Constants for telemetry.
-const DOWNLOAD_SUCCESS = 0;
-const DOWNLOAD_ERROR_XHR = 1;
-const DOWNLOAD_ERROR_NO_DOC = 2;
-
-const PARSE_SUCCESS = 0;
-const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
-const PARSE_ERROR_WORKER = 2;
-const PARSE_ERROR_NO_ARTICLE = 3;
+// Class names to preserve in the readerized output. We preserve these class
+// names so that rules in aboutReader.css can match them.
+const CLASSES_TO_PRESERVE = [
+ "caption",
+ "hidden",
+ "invisble",
+ "sr-only",
+ "visually-hidden",
+ "visuallyhidden",
+ "wp-caption",
+ "wp-caption-text",
+];
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/XPCOMUtils.jsm");
@@ -24,17 +27,15 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm");
Cu.importGlobalProperties(["XMLHttpRequest"]);
XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js");
-XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "TelemetryStopwatch", "resource://gre/modules/TelemetryStopwatch.jsm");
XPCOMUtils.defineLazyGetter(this, "Readability", function() {
let scope = {};
scope.dump = this.dump;
Services.scriptloader.loadSubScript("resource://gre/modules/reader/Readability.js", scope);
- return scope["Readability"];
+ return scope.Readability;
});
this.ReaderMode = {
@@ -61,21 +62,13 @@ this.ReaderMode = {
return this.isEnabledForParseOnLoad = this._getStateForParseOnLoad();
},
- get isOnLowMemoryPlatform() {
- let memory = Cc["@mozilla.org/xpcom/memory-service;1"].getService(Ci.nsIMemory);
- delete this.isOnLowMemoryPlatform;
- return this.isOnLowMemoryPlatform = memory.isLowMemoryPlatform();
- },
-
- _getStateForParseOnLoad: function () {
+ _getStateForParseOnLoad() {
let isEnabled = Services.prefs.getBoolPref("reader.parse-on-load.enabled");
let isForceEnabled = Services.prefs.getBoolPref("reader.parse-on-load.force-enabled");
- // For low-memory devices, don't allow reader mode since it takes up a lot of memory.
- // See https://bugzilla.mozilla.org/show_bug.cgi?id=792603 for details.
- return isForceEnabled || (isEnabled && !this.isOnLowMemoryPlatform);
+ return isForceEnabled || isEnabled;
},
- observe: function(aMessage, aTopic, aData) {
+ observe(aMessage, aTopic, aData) {
switch (aTopic) {
case "nsPref:changed":
if (aData.startsWith("reader.parse-on-load.")) {
@@ -91,7 +84,7 @@ this.ReaderMode = {
* Enter the reader mode by going forward one step in history if applicable,
* if not, append the about:reader page in the history instead.
*/
- enterReaderMode: function(docShell, win) {
+ enterReaderMode(docShell, win) {
let url = win.document.location.href;
let readerURL = "about:reader?url=" + encodeURIComponent(url);
let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
@@ -112,7 +105,7 @@ this.ReaderMode = {
* Exit the reader mode by going back one step in history if applicable,
* if not, append the original page in the history instead.
*/
- leaveReaderMode: function(docShell, win) {
+ leaveReaderMode(docShell, win) {
let url = win.document.location.href;
let originalURL = this.getOriginalUrl(url);
let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
@@ -136,14 +129,14 @@ this.ReaderMode = {
* @return The original URL for the article, or null if we did not find
* a properly formatted about:reader URL.
*/
- getOriginalUrl: function(url) {
+ getOriginalUrl(url) {
if (!url.startsWith("about:reader?")) {
return null;
}
let outerHash = "";
try {
- let uriObj = Services.io.newURI(url, null, null);
+ let uriObj = Services.io.newURI(url);
url = uriObj.specIgnoringRef;
outerHash = uriObj.ref;
} catch (ex) { /* ignore, use the raw string */ }
@@ -155,27 +148,45 @@ this.ReaderMode = {
let originalUrl = searchParams.get("url");
if (outerHash) {
try {
- let uriObj = Services.io.newURI(originalUrl, null, null);
- uriObj = Services.io.newURI('#' + outerHash, null, uriObj);
+ let uriObj = Services.io.newURI(originalUrl);
+ uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
originalUrl = uriObj.spec;
} catch (ex) {}
}
return originalUrl;
},
+ getOriginalUrlObjectForDisplay(url) {
+ let originalUrl = this.getOriginalUrl(url);
+ if (originalUrl) {
+ let uriObj;
+ try {
+ uriObj = Services.uriFixup.createFixupURI(originalUrl, Services.uriFixup.FIXUP_FLAG_NONE);
+ } catch (ex) {
+ return null;
+ }
+ try {
+ return Services.uriFixup.createExposableURI(uriObj);
+ } catch (ex) {
+ return null;
+ }
+ }
+ return null;
+ },
+
/**
* Decides whether or not a document is reader-able without parsing the whole thing.
*
* @param doc A document to parse.
* @return boolean Whether or not we should show the reader mode button.
*/
- isProbablyReaderable: function(doc) {
+ isProbablyReaderable(doc) {
// Only care about 'real' HTML documents:
if (doc.mozSyntheticDocument || !(doc instanceof doc.defaultView.HTMLDocument)) {
return false;
}
- let uri = Services.io.newURI(doc.location.href, null, null);
+ let uri = Services.io.newURI(doc.location.href);
if (!this._shouldCheckUri(uri)) {
return false;
}
@@ -187,12 +198,12 @@ this.ReaderMode = {
return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils));
},
- isNodeVisible: function(utils, node) {
+ isNodeVisible(utils, node) {
let bounds = utils.getBoundsWithoutFlushing(node);
return bounds.height > 0 && bounds.width > 0;
},
- getUtilsForWin: function(win) {
+ getUtilsForWin(win) {
return win.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIDOMWindowUtils);
},
@@ -204,16 +215,14 @@ this.ReaderMode = {
* @return {Promise}
* @resolves JS object representing the article, or null if no article is found.
*/
- parseDocument: Task.async(function* (doc) {
- let documentURI = Services.io.newURI(doc.documentURI, null, null);
- let baseURI = Services.io.newURI(doc.baseURI, null, null);
- if (!this._shouldCheckUri(documentURI) || !this._shouldCheckUri(baseURI, true)) {
+ parseDocument(doc) {
+ if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
this.log("Reader mode disabled for URI");
return null;
}
- return yield this._readerParse(baseURI, doc);
- }),
+ return this._readerParse(doc);
+ },
/**
* Downloads and parses a document from a URL.
@@ -222,19 +231,28 @@ this.ReaderMode = {
* @return {Promise}
* @resolves JS object representing the article, or null if no article is found.
*/
- downloadAndParseDocument: Task.async(function* (url) {
- let doc = yield this._downloadDocument(url);
- let uri = Services.io.newURI(doc.baseURI, null, null);
- if (!this._shouldCheckUri(uri, true)) {
+ async downloadAndParseDocument(url) {
+ let doc = await this._downloadDocument(url);
+ if (!doc) {
+ return null;
+ }
+ if (!this._shouldCheckUri(doc.documentURIObject) || !this._shouldCheckUri(doc.baseURIObject, true)) {
this.log("Reader mode disabled for URI");
return null;
}
- return yield this._readerParse(uri, doc);
- }),
+ return await this._readerParse(doc);
+ },
- _downloadDocument: function (url) {
- let histogram = Services.telemetry.getHistogramById("READER_MODE_DOWNLOAD_RESULT");
+ _downloadDocument(url) {
+ try {
+ if (!this._shouldCheckUri(Services.io.newURI(url))) {
+ return null;
+ }
+ } catch (ex) {
+ Cu.reportError(new Error(`Couldn't create URI from ${url} to download: ${ex}`));
+ return null;
+ }
return new Promise((resolve, reject) => {
let xhr = new XMLHttpRequest();
xhr.open("GET", url, true);
@@ -243,14 +261,12 @@ this.ReaderMode = {
xhr.onload = evt => {
if (xhr.status !== 200) {
reject("Reader mode XHR failed with status: " + xhr.status);
- histogram.add(DOWNLOAD_ERROR_XHR);
return;
}
let doc = xhr.responseXML;
if (!doc) {
reject("Reader mode XHR didn't return a document");
- histogram.add(DOWNLOAD_ERROR_NO_DOC);
return;
}
@@ -261,7 +277,7 @@ this.ReaderMode = {
if (content) {
let urlIndex = content.toUpperCase().indexOf("URL=");
if (urlIndex > -1) {
- let baseURI = Services.io.newURI(url, null, null);
+ let baseURI = Services.io.newURI(url);
let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI);
let newURL = newURI.spec;
let ssm = Services.scriptSecurityManager;
@@ -290,10 +306,10 @@ this.ReaderMode = {
// Convert these to real URIs to make sure the escaping (or lack
// thereof) is identical:
try {
- responseURL = Services.io.newURI(responseURL, null, null).specIgnoringRef;
+ responseURL = Services.io.newURI(responseURL).specIgnoringRef;
} catch (ex) { /* Ignore errors - we'll use what we had before */ }
try {
- givenURL = Services.io.newURI(givenURL, null, null).specIgnoringRef;
+ givenURL = Services.io.newURI(givenURL).specIgnoringRef;
} catch (ex) { /* Ignore errors - we'll use what we had before */ }
if (responseURL != givenURL) {
@@ -303,7 +319,6 @@ this.ReaderMode = {
return;
}
resolve(doc);
- histogram.add(DOWNLOAD_SUCCESS);
};
xhr.send();
});
@@ -318,17 +333,17 @@ this.ReaderMode = {
* @resolves JS object representing the article, or null if no article is found.
* @rejects OS.File.Error
*/
- getArticleFromCache: Task.async(function* (url) {
+ async getArticleFromCache(url) {
let path = this._toHashedPath(url);
try {
- let array = yield OS.File.read(path);
+ let array = await OS.File.read(path);
return JSON.parse(new TextDecoder().decode(array));
} catch (e) {
if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile)
throw e;
return null;
}
- }),
+ },
/**
* Stores an article in the cache.
@@ -338,14 +353,14 @@ this.ReaderMode = {
* @resolves When the article is stored.
* @rejects OS.File.Error
*/
- storeArticleInCache: Task.async(function* (article) {
+ async storeArticleInCache(article) {
let array = new TextEncoder().encode(JSON.stringify(article));
let path = this._toHashedPath(article.url);
- yield this._ensureCacheDir();
+ await this._ensureCacheDir();
return OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" })
.then(success => {
OS.File.stat(path).then(info => {
- return Messaging.sendRequest({
+ return EventDispatcher.instance.sendRequest({
type: "Reader:AddedToCache",
url: article.url,
size: info.size,
@@ -353,7 +368,7 @@ this.ReaderMode = {
});
});
});
- }),
+ },
/**
* Removes an article from the cache given an article URI.
@@ -363,26 +378,29 @@ this.ReaderMode = {
* @resolves When the article is removed.
* @rejects OS.File.Error
*/
- removeArticleFromCache: Task.async(function* (url) {
+ async removeArticleFromCache(url) {
let path = this._toHashedPath(url);
- yield OS.File.remove(path);
- }),
+ await OS.File.remove(path);
+ },
- log: function(msg) {
+ log(msg) {
if (this.DEBUG)
dump("Reader: " + msg);
},
_blockedHosts: [
- "mail.google.com",
+ "amazon.com",
+ "basilisk-browser.org",
"github.com",
+ "mail.google.com",
+ "palemoon.org",
"pinterest.com",
"reddit.com",
"twitter.com",
"youtube.com",
],
- _shouldCheckUri: function (uri, isBaseUri = false) {
+ _shouldCheckUri(uri, isBaseUri = false) {
if (!(uri.schemeIs("http") || uri.schemeIs("https"))) {
this.log("Not parsing URI scheme: " + uri.scheme);
return false;
@@ -412,59 +430,77 @@ this.ReaderMode = {
* Attempts to parse a document into an article. Heavy lifting happens
* in readerWorker.js.
*
- * @param uri The base URI of the article.
* @param doc The document to parse.
* @return {Promise}
* @resolves JS object representing the article, or null if no article is found.
*/
- _readerParse: Task.async(function* (uri, doc) {
- let histogram = Services.telemetry.getHistogramById("READER_MODE_PARSE_RESULT");
+ async _readerParse(doc) {
if (this.parseNodeLimit) {
let numTags = doc.getElementsByTagName("*").length;
if (numTags > this.parseNodeLimit) {
- this.log("Aborting parse for " + uri.spec + "; " + numTags + " elements found");
- histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
+ this.log("Aborting parse for " + doc.baseURIObject.spec + "; " + numTags + " elements found");
return null;
}
}
+ // Fetch this here before we send `doc` off to the worker thread, as later on the
+ // document might be nuked but we will still want the URI.
+ let {documentURI} = doc;
+
let uriParam = {
- spec: uri.spec,
- host: uri.host,
- prePath: uri.prePath,
- scheme: uri.scheme,
- pathBase: Services.io.newURI(".", null, uri).spec
+ spec: doc.baseURIObject.spec,
+ host: doc.baseURIObject.host,
+ prePath: doc.baseURIObject.prePath,
+ scheme: doc.baseURIObject.scheme,
+ pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec
+ };
+
+ let langAttributes = {
+ charset: doc.characterSet,
+ lang: doc.documentElement.lang
};
let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"].
createInstance(Ci.nsIDOMSerializer);
let serializedDoc = serializer.serializeToString(doc);
+ let options = {
+ classesToPreserve: CLASSES_TO_PRESERVE,
+ };
+
let article = null;
try {
- article = yield ReaderWorker.post("parseDocument", [uriParam, serializedDoc]);
+ article = await ReaderWorker.post("parseDocument", [uriParam, serializedDoc, options]);
} catch (e) {
Cu.reportError("Error in ReaderWorker: " + e);
- histogram.add(PARSE_ERROR_WORKER);
}
+ // Explicitly null out doc to make it clear it might not be available from this
+ // point on.
+ doc = null;
+
if (!article) {
this.log("Worker did not return an article");
- histogram.add(PARSE_ERROR_NO_ARTICLE);
return null;
}
- // Readability returns a URI object, but we only care about the URL.
- article.url = article.uri.spec;
+ // Readability returns a URI object based on the baseURI, but we only care
+ // about the original document's URL from now on. This also avoids spoofing
+ // attempts where the baseURI doesn't match the domain of the documentURI
+ article.url = documentURI;
delete article.uri;
let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils)
.convertToPlainText(article.title, flags, 0);
- histogram.add(PARSE_SUCCESS);
+ await this._assignLanguage(article, langAttributes);
+ this._maybeAssignTextDirection(article);
+
+ this._assignReadTime(article);
+
return article;
- }),
+ },
get _cryptoHash() {
delete this._cryptoHash;
@@ -485,7 +521,7 @@ this.ReaderMode = {
* @param url The article URL. This should have referrers removed.
* @return The file path to the cached article.
*/
- _toHashedPath: function (url) {
+ _toHashedPath(url) {
let value = this._unicodeConverter.convertToByteArray(url);
this._cryptoHash.init(this._cryptoHash.MD5);
this._cryptoHash.update(value, value.length);
@@ -502,7 +538,7 @@ this.ReaderMode = {
* @resolves When the cache directory exists.
* @rejects OS.File.Error
*/
- _ensureCacheDir: function () {
+ _ensureCacheDir() {
let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache");
return OS.File.exists(dir).then(exists => {
if (!exists) {
@@ -510,5 +546,107 @@ this.ReaderMode = {
}
return undefined;
});
- }
+ },
+
+ /**
+ * Sets a global language string value if possible. If langauge detection is
+ * available, use that. Otherwise, revert to a simpler mechanism using the
+ * document's lang attribute or charset.
+ *
+ * @return Promise
+ * @resolves when the language is detected
+ */
+ _assignLanguage(article, attributes) {
+ try {
+ Cu.import("resource://modules/translation/LanguageDetector.jsm");
+ return LanguageDetector.detectLanguage(article.textContent).then(result => {
+ article.language = result.confident ? result.language : null;
+ });
+ } catch(ex) {
+ return new Promise((resolve) => {
+ resolve(this._assignSimpleLanguage(attributes));
+ }).then(result => {
+ article.language = result;
+ });
+ }
+ },
+
+ _assignSimpleLanguage(attributes) {
+ var lang = attributes.lang.substring(0,2);
+ if (lang) {
+ return lang;
+ }
+
+ // If there is no lang attribute, try the charset.
+ // We can only use this for charsets that are specific to one language.
+ const charsetLang = new Map([
+ [ "us-ascii", "en" ],
+ [ "iso-8859-6", "ar" ],
+ [ "iso-8859-7", "el" ],
+ [ "iso-8859-8", "he" ],
+ [ "iso-8859-9", "tr" ],
+ [ "iso-8859-11", "th" ],
+ [ "jis_x0201", "ja" ],
+ [ "shift_jis", "ja" ],
+ [ "euc-jp", "ja" ]
+ ]);
+
+ return charsetLang.get(attributes.charset);
+ },
+
+ _maybeAssignTextDirection(article) {
+ // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
+ if (!article.dir && ["ar", "fa", "he", "ug", "ur"].includes(article.language)) {
+ article.dir = "rtl";
+ }
+ },
+
+ /**
+ * Assigns the estimated reading time range of the article to the article object.
+ *
+ * @param article the article object to assign the reading time estimate to.
+ */
+ _assignReadTime(article) {
+ let lang = article.language || "en";
+ const readingSpeed = this._getReadingSpeedForLanguage(lang);
+ const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance;
+ const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance;
+ const length = article.length;
+
+ article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow);
+ article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh);
+ },
+
+ /**
+ * Returns the reading speed of a selection of languages with likely variance.
+ *
+ * Reading speed estimated from a study done on reading speeds in various languages.
+ * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
+ *
+ * @return object with characters per minute and variance. Defaults to English
+ * if no suitable language is found in the collection.
+ */
+ _getReadingSpeedForLanguage(lang) {
+ const readingSpeed = new Map([
+ [ "en", {cpm: 987, variance: 118 } ],
+ [ "ar", {cpm: 612, variance: 88 } ],
+ [ "de", {cpm: 920, variance: 86 } ],
+ [ "es", {cpm: 1025, variance: 127 } ],
+ [ "fi", {cpm: 1078, variance: 121 } ],
+ [ "fr", {cpm: 998, variance: 126 } ],
+ [ "he", {cpm: 833, variance: 130 } ],
+ [ "it", {cpm: 950, variance: 140 } ],
+ [ "jw", {cpm: 357, variance: 56 } ],
+ [ "nl", {cpm: 978, variance: 143 } ],
+ [ "pl", {cpm: 916, variance: 126 } ],
+ [ "pt", {cpm: 913, variance: 145 } ],
+ [ "ru", {cpm: 986, variance: 175 } ],
+ [ "sk", {cpm: 885, variance: 145 } ],
+ [ "sv", {cpm: 917, variance: 156 } ],
+ [ "tr", {cpm: 1054, variance: 156 } ],
+ [ "zh", {cpm: 255, variance: 29 } ],
+ ]);
+
+ return readingSpeed.get(lang) || readingSpeed.get("en");
+ },
};
diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js
index 20023d4e0..9ae589d7d 100644
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -2,6 +2,8 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
+/* eslint-env mozilla/chrome-worker */
+
"use strict";
/**
@@ -40,11 +42,12 @@ var Agent = {
*
* @param {object} uri URI data for the document.
* @param {string} serializedDoc The serialized document.
+ * @param {object} options Options object to pass to Readability.
*
* @return {object} Article object returned from Readability.
*/
- parseDocument: function (uri, serializedDoc) {
+ parseDocument(uri, serializedDoc, options) {
let doc = new JSDOMParser().parse(serializedDoc);
- return new Readability(uri, doc).parse();
+ return new Readability(uri, doc, options).parse();
},
};
diff --git a/toolkit/components/reader/content/aboutReader.html b/toolkit/components/reader/content/aboutReader.html
index b9c1139f6..1aa644474 100644
--- a/toolkit/components/reader/content/aboutReader.html
+++ b/toolkit/components/reader/content/aboutReader.html
@@ -7,63 +7,56 @@
<link rel="stylesheet" href="chrome://global/skin/aboutReader.css" type="text/css"/>
- <script type="text/javascript;version=1.8" src="chrome://global/content/reader/aboutReader.js"></script>
+ <script type="text/javascript" src="chrome://global/content/reader/aboutReader.js"></script>
</head>
<body>
- <div id="container" class="container">
- <div id="reader-header" class="header">
- <style scoped>
- @import url("chrome://global/skin/aboutReaderControls.css");
- </style>
- <a id="reader-domain" class="domain"></a>
+ <div class="container">
+ <div class="header reader-header">
+ <a class="domain reader-domain"></a>
<div class="domain-border"></div>
- <h1 id="reader-title"></h1>
- <div id="reader-credits" class="credits"></div>
+ <h1 class="reader-title"></h1>
+ <div class="credits reader-credits"></div>
+ <div class="meta-data">
+ <div class="reader-estimated-time"></div>
+ </div>
</div>
+ <hr>
+
<div class="content">
- <style scoped>
- @import url("chrome://global/skin/aboutReaderContent.css");
- </style>
- <div id="moz-reader-content"></div>
+ <div class="moz-reader-content"></div>
</div>
<div>
- <style scoped>
- @import url("chrome://global/skin/aboutReaderControls.css");
- </style>
- <div id="reader-message"></div>
+ <div class="reader-message"></div>
</div>
</div>
- <ul id="reader-toolbar" class="toolbar">
- <style scoped>
- @import url("chrome://global/skin/aboutReaderControls.css");
- </style>
- <li><button id="close-button" class="button close-button"/></li>
- <ul id="style-dropdown" class="dropdown">
+ <ul class="toolbar reader-toolbar">
+ <li><button class="button close-button"/></li>
+ <ul class="dropdown style-dropdown">
<li><button class="dropdown-toggle button style-button"/></li>
- <li id="reader-popup" class="dropdown-popup">
- <div id="font-type-buttons"></div>
- <hr></hr>
- <div id="font-size-buttons">
- <button id="font-size-minus" class="minus-button"/>
- <button id="font-size-sample"/>
- <button id="font-size-plus" class="plus-button"/>
+ <li class="dropdown-popup">
+ <div class="font-type-buttons"></div>
+ <hr>
+ <div class="font-size-buttons">
+ <button class="minus-button"/>
+ <button class="font-size-sample"/>
+ <button class="plus-button"/>
</div>
- <hr></hr>
- <div id="content-width-buttons">
- <button id="content-width-minus" class="content-width-minus-button"/>
- <button id="content-width-plus" class="content-width-plus-button"/>
+ <hr>
+ <div class="content-width-buttons">
+ <button class="content-width-minus-button"/>
+ <button class="content-width-plus-button"/>
</div>
- <hr></hr>
- <div id="line-height-buttons">
- <button id="line-height-minus" class="line-height-minus-button"/>
- <button id="line-height-plus" class="line-height-plus-button"/>
+ <hr>
+ <div class="line-height-buttons">
+ <button class="line-height-minus-button"/>
+ <button class="line-height-plus-button"/>
</div>
- <hr></hr>
- <div id="color-scheme-buttons"></div>
+ <hr>
+ <div class="color-scheme-buttons"></div>
<div class="dropdown-arrow"/>
</li>
</ul>
diff --git a/toolkit/components/reader/content/aboutReader.js b/toolkit/components/reader/content/aboutReader.js
index 17133e69d..6c963382e 100644
--- a/toolkit/components/reader/content/aboutReader.js
+++ b/toolkit/components/reader/content/aboutReader.js
@@ -4,6 +4,6 @@
"use strict";
-window.addEventListener("DOMContentLoaded", function () {
+window.addEventListener("DOMContentLoaded", function() {
document.dispatchEvent(new CustomEvent("AboutReaderContentLoaded", { bubbles: true }));
});