diff options
Diffstat (limited to 'toolkit')
-rw-r--r-- | toolkit/components/reader/JSDOMParser.js | 21 | ||||
-rw-r--r-- | toolkit/components/reader/Readability.js | 67 | ||||
-rw-r--r-- | toolkit/components/reader/ReaderWorker.js | 2 | ||||
-rw-r--r-- | toolkit/mozapps/installer/packager.py | 22 |
4 files changed, 68 insertions, 44 deletions
diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js index 38f59c4ea..dd9d37230 100644 --- a/toolkit/components/reader/JSDOMParser.js +++ b/toolkit/components/reader/JSDOMParser.js @@ -560,7 +560,8 @@ }, }; - var Document = function () { + var Document = function (url) { + this.documentURI = url; this.styleSheets = []; this.childNodes = []; this.children = []; @@ -600,6 +601,20 @@ node.textContent = text; return node; }, + + get baseURI() { + if (!this.hasOwnProperty("_baseURI")) { + this._baseURI = this.documentURI; + var baseElements = this.getElementsByTagName("base"); + var href = baseElements[0] && baseElements[0].getAttribute("href"); + if (href) { + try { + this._baseURI = (new URL(href, this._baseURI)).href; + } catch (ex) {/* Just fall back to documentURI */} + } + } + return this._baseURI; + }, }; var Element = function (tag) { @@ -1118,9 +1133,9 @@ /** * Parses an HTML string and returns a JS implementation of the Document. */ - parse: function (html) { + parse: function (html, url) { this.html = html; - var doc = this.doc = new Document(); + var doc = this.doc = new Document(url); this.readChildren(doc); // If this is an HTML document, remove root-level children except for the diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 04949dc61..064d2ae88 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -41,6 +41,7 @@ function Readability(uri, doc, options) { this._articleTitle = null; this._articleByline = null; this._articleDir = null; + this._attempts = []; // Configurable options this._debug = !!options.debug; @@ -275,34 +276,20 @@ Readability.prototype = { * @return void */ _fixRelativeUris: function(articleContent) { - var scheme = this._uri.scheme; - var prePath = this._uri.prePath; - var pathBase = this._uri.pathBase; - + var baseURI = this._doc.baseURI; + var documentURI = this._doc.documentURI; function toAbsoluteURI(uri) { - // If this is already an absolute URI, return it. - if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) - return uri; - - // Scheme-rooted relative URI. - if (uri.substr(0, 2) == "//") - return scheme + "://" + uri.substr(2); - - // Prepath-rooted relative URI. - if (uri[0] == "/") - return prePath + uri; - - // Dotslash relative URI. - if (uri.indexOf("./") === 0) - return pathBase + uri.slice(2); - - // Ignore hash URIs: - if (uri[0] == "#") + // Leave hash links alone if the base URI matches the document URI: + if (baseURI == documentURI && uri.charAt(0) == "#") { return uri; - - // Standard relative URI; add entire path. pathBase already includes a - // trailing "/". - return pathBase + uri; + } + // Otherwise, resolve against base URI: + try { + return new URL(uri, baseURI).href; + } catch (ex) { + // Something went wrong, just return the original: + } + return uri; } var links = articleContent.getElementsByTagName("a"); @@ -535,6 +522,7 @@ Readability.prototype = { this._clean(articleContent, "embed"); this._clean(articleContent, "h1"); this._clean(articleContent, "footer"); + this._clean(articleContent, "link"); // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". @@ -1089,24 +1077,45 @@ Readability.prototype = { if (this._debug) this.log("Article content after paging: " + articleContent.innerHTML); + var parseSuccessful = true; + // Now that we've gone through the full algorithm, check to see if // we got any meaningful content. If we didn't, we may need to re-run // grabArticle with different flags set. This gives us a higher likelihood of // finding the content, and the sieve approach gives us a higher likelihood of // finding the -right- content. - if (this._getInnerText(articleContent, true).length < this._wordThreshold) { + var textLength = this._getInnerText(articleContent, true).length; + if (textLength < this._wordThreshold) { + parseSuccessful = false; page.innerHTML = pageCacheHtml; if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { this._removeFlag(this.FLAG_STRIP_UNLIKELYS); + this._attempts.push({articleContent: articleContent, textLength: textLength}); } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { this._removeFlag(this.FLAG_WEIGHT_CLASSES); + this._attempts.push({articleContent: articleContent, textLength: textLength}); } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); + this._attempts.push({articleContent: articleContent, textLength: textLength}); } else { - return null; + this._attempts.push({articleContent: articleContent, textLength: textLength}); + // No luck after removing flags, just return the longest text we found during the different loops + this._attempts.sort(function (a, b) { + return a.textLength < b.textLength; + }); + + // But first check if we actually have something + if (!this._attempts[0].textLength) { + return null; + } + + articleContent = this._attempts[0].articleContent; + parseSuccessful = true; } - } else { + } + + if (parseSuccessful) { // Find out text direction from ancestors of final top candidate. var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); this._someNode(ancestors, function(ancestor) { diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js index 9ae589d7d..69426788b 100644 --- a/toolkit/components/reader/ReaderWorker.js +++ b/toolkit/components/reader/ReaderWorker.js @@ -47,7 +47,7 @@ var Agent = { * @return {object} Article object returned from Readability. */ parseDocument(uri, serializedDoc, options) { - let doc = new JSDOMParser().parse(serializedDoc); + let doc = new JSDOMParser().parse(serializedDoc, uri.spec); return new Readability(uri, doc, options).parse(); }, }; diff --git a/toolkit/mozapps/installer/packager.py b/toolkit/mozapps/installer/packager.py index 40f31646a..1a144823c 100644 --- a/toolkit/mozapps/installer/packager.py +++ b/toolkit/mozapps/installer/packager.py @@ -156,16 +156,15 @@ def precompile_cache(registry, source_path, gre_path, app_path): extra_env['TSAN_OPTIONS'] = 'report_bugs=0' if buildconfig.substs.get('MOZ_ASAN'): extra_env['ASAN_OPTIONS'] = 'detect_leaks=0' - if buildconfig.substs.get('MOZ_DISABLE_PRECOMPILED_STARTUPCACHE') != '1': - if launcher.launch(['xpcshell', '-g', gre_path, '-a', app_path, - '-f', os.path.join(os.path.dirname(__file__), - 'precompile_cache.js'), - '-e', 'precompile_startupcache("resource://%s/");' - % resource], - extra_linker_path=gre_path, - extra_env=extra_env): - errors.fatal('Error while running startup cache precompilation') - return + if launcher.launch(['xpcshell', '-g', gre_path, '-a', app_path, + '-f', os.path.join(os.path.dirname(__file__), + 'precompile_cache.js'), + '-e', 'precompile_startupcache("resource://%s/");' + % resource], + extra_linker_path=gre_path, + extra_env=extra_env): + errors.fatal('Error while running startup cache precompilation') + return from mozpack.mozjar import JarReader jar = JarReader(cache) resource = '/resource/%s/' % resource @@ -392,7 +391,8 @@ def main(): # Fill startup cache if isinstance(formatter, OmniJarFormatter) and launcher.can_launch() \ - and buildconfig.substs['MOZ_DISABLE_STARTUPCACHE'] != '1': + and buildconfig.substs['MOZ_DISABLE_STARTUPCACHE'] != '1' \ + and buildconfig.substs['MOZ_DISABLE_PRECOMPILED_STARTUPCACHE'] != '1': gre_path = None def get_bases(): for b in sink.packager.get_bases(addons=False): |