diff options
Diffstat (limited to 'mobile/android/modules/WebsiteMetadata.jsm')
-rw-r--r-- | mobile/android/modules/WebsiteMetadata.jsm | 475 |
1 files changed, 0 insertions, 475 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

'use strict';

const { classes: Cc, interfaces: Ci, utils: Cu } = Components;

this.EXPORTED_SYMBOLS = ["WebsiteMetadata"];

Cu.import("resource://gre/modules/XPCOMUtils.jsm");

XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");

/**
 * Return true if at least one extracted metadata value is defined.
 *
 * getMetadata() creates a key for every rule it evaluates, even when nothing
 * matched, so counting keys cannot tell "nothing found" from "something found".
 */
function hasExtractedMetadata(metadata) {
  return Object.keys(metadata).some(key => metadata[key] !== undefined);
}

var WebsiteMetadata = {
  /**
   * Asynchronously parse the document to extract metadata. A 'Website:Metadata'
   * event with the metadata will be sent, unless nothing could be extracted.
   *
   * @param {Document} doc - The document to inspect; only the image_url
   *     rules are evaluated.
   */
  parseAsynchronously: function(doc) {
    Task.spawn(function() {
      const location = doc.location.href;
      const metadata = getMetadata(doc, location, {
        image_url: metadataRules['image_url']
      });

      // No metadata was extracted, so don't bother sending it.
      if (!hasExtractedMetadata(metadata)) {
        return;
      }

      Messaging.sendRequest({
        type: 'Website:Metadata',
        location: location,
        metadata: metadata,
      });
    }).catch(e => Cu.reportError(e)); // Surface failures instead of dropping the rejection.
  }
};

// #################################################################################################
// # Modified version of makeUrlAbsolute() to not import url parser library (and dependencies)
// #################################################################################################

/**
 * Resolve a (possibly relative) URL by assigning it to the href of an anchor
 * element created in the context document and reading the href back.
 *
 * @param {{doc: Document}} context - Extraction context; only `doc` is used.
 * @param {string} relative - The URL to resolve.
 * @returns {string} The value read back from the anchor's href.
 */
function makeUrlAbsolute(context, relative) {
  const anchor = context.doc.createElement('a');
  anchor.href = relative;
  return anchor.href;
}

// #################################################################################################
// # page-metadata-parser
// # https://github.com/mozilla/page-metadata-parser/
// # 61c58cbd0f0bf2153df832a388a79c66b288b98c
// #################################################################################################

/**
 * Build an extractor for one metadata field.
 *
 * @param {string} name - Flavor name under which matches are recorded.
 * @param {Array} rules - [selector, handler] pairs, highest priority first.
 * @param {Array<function>} [processors] - Applied in order to the extracted
 *     value; each receives (value, context).
 * @returns {function} (doc, context) => extracted value, or undefined when
 *     nothing usable was found.
 */
function buildRuleset(name, rules, processors) {
  // Reverse so the array index can serve as the score: the first rule listed
  // ends up with the highest score.
  const reversedRules = Array.from(rules).reverse();
  const builtRuleset = ruleset(...reversedRules.map(([query, handler], order) => rule(
    dom(query),
    node => [{
      score: order,
      flavor: name,
      notes: handler(node),
    }]
  )));

  return (doc, context) => {
    const kb = builtRuleset.score(doc);
    const maxNode = kb.max(name);
    if (!maxNode) {
      return undefined;
    }

    let value = maxNode.flavors.get(name);

    // A matching element may lack the attribute (getAttribute() returns null)
    // or carry an empty one. Running processors on that would produce a bogus
    // ".../null" URL, the page URL itself, or a TypeError in split().
    if (value === null || value === undefined || value === '') {
      return undefined;
    }

    if (processors) {
      for (const processor of processors) {
        value = processor(value, context);
      }
    }

    if (!value) {
      return undefined;
    }
    return value.trim ? value.trim() : value;
  };
}

// Handlers shared by the rules below.
const contentAttr = node => node.element.getAttribute('content');
const hrefAttr = node => node.element.getAttribute('href');

const metadataRules = {
  description: {
    rules: [
      ['meta[property="og:description"]', contentAttr],
      ['meta[name="description"]', contentAttr],
    ],
  },

  icon_url: {
    rules: [
      ['link[rel="apple-touch-icon"]', hrefAttr],
      ['link[rel="apple-touch-icon-precomposed"]', hrefAttr],
      ['link[rel="icon"]', hrefAttr],
      ['link[rel="fluid-icon"]', hrefAttr],
      ['link[rel="shortcut icon"]', hrefAttr],
      ['link[rel="Shortcut Icon"]', hrefAttr],
      ['link[rel="mask-icon"]', hrefAttr],
    ],
    processors: [
      (icon_url, context) => makeUrlAbsolute(context, icon_url)
    ]
  },

  image_url: {
    rules: [
      ['meta[property="og:image:secure_url"]', contentAttr],
      ['meta[property="og:image:url"]', contentAttr],
      ['meta[property="og:image"]', contentAttr],
      ['meta[property="twitter:image"]', contentAttr],
      ['meta[name="thumbnail"]', contentAttr],
    ],
    processors: [
      (image_url, context) => makeUrlAbsolute(context, image_url)
    ],
  },

  keywords: {
    rules: [
      ['meta[name="keywords"]', contentAttr],
    ],
    processors: [
      (keywords) => keywords.split(',').map((keyword) => keyword.trim()),
    ]
  },

  title: {
    rules: [
      ['meta[property="og:title"]', contentAttr],
      ['meta[property="twitter:title"]', contentAttr],
      ['meta[name="hdl"]', contentAttr],
      ['title', node => node.element.text],
    ],
  },

  type: {
    rules: [
      ['meta[property="og:type"]', contentAttr],
    ],
  },

  url: {
    rules: [
      ['meta[property="og:url"]', contentAttr],
      ['link[rel="canonical"]', hrefAttr],
    ],
  },
};

/**
 * Extract metadata from a document.
 *
 * @param {Document} doc - Document to query.
 * @param {string} url - URL of the document, passed to processors via context.
 * @param {Object} [rules] - Rule definitions keyed by metadata name; defaults
 *     to metadataRules. An entry without a `rules` array is treated as a
 *     nested group and processed recursively.
 * @returns {Object} One key per rule entry; the value is undefined when
 *     nothing was found for that entry.
 */
function getMetadata(doc, url, rules) {
  const metadata = {};
  const context = {url, doc};
  const ruleSet = rules || metadataRules;

  for (const metadataKey of Object.keys(ruleSet)) {
    const metadataRule = ruleSet[metadataKey];

    if (Array.isArray(metadataRule.rules)) {
      const builtRule = buildRuleset(metadataKey, metadataRule.rules, metadataRule.processors);
      metadata[metadataKey] = builtRule(doc, context);
    } else {
      metadata[metadataKey] = getMetadata(doc, url, metadataRule);
    }
  }

  return metadata;
}

// #################################################################################################
// # Fathom dependencies resolved
// #################################################################################################

// const {forEach} = require('wu');
// Call fn on every item of an iterable. Note the argument order: fn first.
function forEach(fn, obj) {
  for (const x of obj) {
    fn(x);
  }
}

/**
 * Return the item of an iterable whose key compares best.
 *
 * @param {Iterable} iterable - Items to examine.
 * @param {function} by - Maps an item to its comparison key.
 * @param {function} isBetter - (key, bestKeySoFar) => true if key wins.
 * @returns {*} The first item whose key no later key beats.
 * @throws {Error} If the iterable is empty.
 */
function best(iterable, by, isBetter) {
  let bestSoFar, bestKeySoFar;
  let isFirst = true;
  forEach(
    function (item) {
      const key = by(item);
      // Check isFirst first so the comparator never sees an undefined key.
      if (isFirst || isBetter(key, bestKeySoFar)) {
        bestSoFar = item;
        bestKeySoFar = key;
        isFirst = false;
      }
    },
    iterable);
  if (isFirst) {
    throw new Error('Tried to call best() on empty iterable');
  }
  return bestSoFar;
}

// const {max} = require('./utils');
/**
 * Return the item with the greatest key.
 *
 * @param {Iterable} iterable - Items to examine.
 * @param {function} [by] - Maps an item to its key; defaults to the item itself.
 * @throws {Error} If the iterable is empty.
 */
function max(iterable, by = x => x) {
  return best(iterable, by, (a, b) => a > b);
}

// #################################################################################################
// # Fathom
// # https://github.com/mozilla/fathom
// # cac59e470816f17fc1efd4a34437b585e3e451cd
// #################################################################################################

// Get a key of a map, first setting it to a default value if it's missing.
function getDefault(map, key, defaultMaker) {
  if (map.has(key)) {
    return map.get(key);
  }
  const defaultValue = defaultMaker();
  map.set(key, defaultValue);
  return defaultValue;
}


// Construct a filtration network of rules.
/**
 * Build a ruleset from rules created by rule().
 *
 * @param {...Object} rules - Each rule is filed under rule.source.inputFlavor.
 * @returns {{score: function}} An object whose score(tree) method runs the
 *     rules over a DOM tree and returns the resulting knowledgebase.
 */
function ruleset(...rules) {
    const rulesByInputFlavor = new Map();  // Map{someInputFlavor -> [rule, ...]}

    // File each rule under its input flavor:
    forEach(rule => getDefault(rulesByInputFlavor, rule.source.inputFlavor, () => []).push(rule),
            rules);

    return {
        // Iterate over a DOM tree or subtree, building up a knowledgebase, a
        // data structure holding scores and annotations for interesting
        // elements. Return the knowledgebase.
        //
        // This is the "rank" portion of the rank-and-yank algorithm.
        score: function (tree) {
            const kb = knowledgebase();

            // Introduce the whole DOM into the KB as flavor 'dom' to get
            // things started:
            const nonterminals = [[{tree}, 'dom']];  // [[node, flavor], [node, flavor], ...]

            // While there are new facts, run the applicable rules over them to
            // generate even newer facts. Repeat until everything's fully
            // digested. Rules run in no particular guaranteed order.
            while (nonterminals.length) {
                const [inNode, inFlavor] = nonterminals.pop();
                for (const rule of getDefault(rulesByInputFlavor, inFlavor, () => [])) {
                    const outFacts = resultsOf(rule, inNode, inFlavor, kb);
                    for (const fact of outFacts) {
                        const outNode = kb.nodeForElement(fact.element);

                        // No matter whether or not this flavor has been
                        // emitted before for this node, we multiply the score.
                        // We want to be able to add rules that refine the
                        // scoring of a node, without having to rewire the path
                        // of flavors that winds through the ruleset.
                        //
                        // 1 score per Node is plenty. That simplifies our
                        // data, our rankers, our flavor system (since we don't
                        // need to represent score axes), and our engine. If
                        // somebody wants more score axes, they can fake it
                        // themselves with notes, thus paying only for what
                        // they eat. (We can even provide functions that help
                        // with that.) Most rulesets will probably be concerned
                        // with scoring only 1 thing at a time anyway. So,
                        // rankers return a score multiplier + 0 or more new
                        // flavors with optional notes. Facts can never be
                        // deleted from the KB by rankers (or order would start
                        // to matter); after all, they're *facts*.
                        outNode.score *= fact.score;

                        // Add a new annotation to a node--but only if there
                        // wasn't one of the given flavor already there;
                        // otherwise there's no point.
                        //
                        // You might argue that we might want to modify an
                        // existing note here, but that would be a bad
                        // idea. Notes of a given flavor should be
                        // considered immutable once laid down. Otherwise, the
                        // order of execution of same-flavored rules could
                        // matter, hurting pluggability. Emit a new flavor and
                        // a new note if you want to do that.
                        //
                        // Also, choosing not to add a new fact to nonterminals
                        // when we're not adding a new flavor saves the work of
                        // running the rules against it, which would be
                        // entirely redundant and perform no new work (unless
                        // the rankers were nondeterministic, but don't do
                        // that).
                        if (!outNode.flavors.has(fact.flavor)) {
                            outNode.flavors.set(fact.flavor, fact.notes);
                            kb.indexNodeByFlavor(outNode, fact.flavor);  // TODO: better encapsulation rather than indexing explicitly
                            nonterminals.push([outNode, fact.flavor]);
                        }
                    }
                }
            }
            return kb;
        }
    };
}


// Construct a container for storing and querying facts, where a fact has a
// flavor (used to dispatch further rules upon), a corresponding DOM element, a
// score, and some other arbitrary notes opaque to fathom.
function knowledgebase() {
    const nodesByFlavor = new Map();  // Map{'texty' -> [NodeA],
                                      //     'spiffy' -> [NodeA, NodeB]}
                                      // NodeA = {element: <someElement>,
                                      //
                                      //          // Global nodewide score. Add
                                      //          // custom ones with notes if
                                      //          // you want.
                                      //          score: 8,
                                      //
                                      //          // Flavors is a map of flavor names to notes:
                                      //          flavors: Map{'texty' -> {ownText: 'blah',
                                      //                                   someOtherNote: 'foo',
                                      //                                   someCustomScore: 10},
                                      //                       // This is an empty note:
                                      //                       'fluffy' -> undefined}}
    const nodesByElement = new Map();

    return {
        // Return the "node" (our own data structure that we control) that
        // corresponds to a given DOM element, creating one if necessary.
        nodeForElement: function (element) {
            return getDefault(nodesByElement,
                              element,
                              () => ({element,
                                      score: 1,
                                      flavors: new Map()}));
        },

        // Return the highest-scored node of the given flavor, undefined if
        // there is none.
        max: function (flavor) {
            const nodes = nodesByFlavor.get(flavor);
            // nodesOfFlavor() can leave an empty list behind for a flavor that
            // was only queried; max() throws on an empty iterable, so treat
            // that the same as "no such flavor".
            if (nodes === undefined || nodes.length === 0) {
                return undefined;
            }
            return max(nodes, node => node.score);
        },

        // Let the KB know that a new flavor has been added to an element.
        indexNodeByFlavor: function (node, flavor) {
            getDefault(nodesByFlavor, flavor, () => []).push(node);
        },

        // Return the list of nodes indexed under the given flavor, creating
        // (and remembering) an empty list if there is none yet.
        nodesOfFlavor: function (flavor) {
            return getDefault(nodesByFlavor, flavor, () => []);
        }
    };
}


// Apply a rule (as returned by a call to rule()) to a fact, and return the
// new facts that result.
function resultsOf(rule, node, flavor, kb) {
    // If more types of rule pop up someday, do fancier dispatching here.
    return rule.source.flavor === 'flavor' ? resultsOfFlavorRule(rule, node, flavor) : resultsOfDomRule(rule, node, kb);
}


// Pull the DOM tree off the special property of the root "dom" fact, and query
// against it.
/**
 * Run a dom() rule: query the tree held by the starting "dom" fact with the
 * rule's selector and yield the facts the ranker produces for each match.
 *
 * @param {Object} rule - Rule whose source carries a `selector`.
 * @param {Object} specialDomNode - Starting node; its `tree` property is queried.
 * @param {Object} kb - Knowledgebase; used to look up the node for each match.
 * @throws {Error} If a fact returned by the ranker has no flavor.
 */
function *resultsOfDomRule(rule, specialDomNode, kb) {
    // Use the special "tree" property of the special starting node:
    const matches = specialDomNode.tree.querySelectorAll(rule.source.selector);

    for (let i = 0; i < matches.length; i++) {  // matches is a NodeList, which doesn't conform to iterator protocol
        const element = matches[i];
        const newFacts = explicitFacts(rule.ranker(kb.nodeForElement(element)));
        for (const fact of newFacts) {
            // If the ranker didn't name an element, it means the matched one.
            if (fact.element === undefined) {
                fact.element = element;
            }
            if (fact.flavor === undefined) {
                throw new Error('Rankers of dom() rules must return a flavor in each fact. Otherwise, there is no way for that fact to be used later.');
            }
            yield fact;
        }
    }
}


/**
 * Run a flavor() rule against a single node and yield the resulting facts.
 *
 * @param {Object} rule - Rule whose ranker is called with the node.
 * @param {Object} node - Knowledgebase node the rule is applied to.
 * @param {string} flavor - Input flavor; used for facts that don't name one.
 */
function *resultsOfFlavorRule(rule, node, flavor) {
    const newFacts = explicitFacts(rule.ranker(node));

    for (const fact of newFacts) {
        // If the ranker didn't specify a different element, assume it's
        // talking about the one we passed in:
        if (fact.element === undefined) {
            fact.element = node.element;
        }
        if (fact.flavor === undefined) {
            fact.flavor = flavor;
        }
        yield fact;
    }
}


// Take the possibly abbreviated output of a ranker function, and make it
// explicitly an iterable with a defined score.
//
// Rankers can return undefined or null, which means "no facts", a single
// fact, or an array of facts. Facts lacking a score get score 1 (assigned on
// the fact object itself).
function *explicitFacts(rankerResult) {
    // `== null` covers both undefined and null, so a ranker that returns null
    // yields nothing instead of crashing on `null.score`.
    if (rankerResult == null) {
        return;
    }
    const facts = Array.isArray(rankerResult) ? rankerResult : [rankerResult];
    for (const fact of facts) {
        if (fact.score === undefined) {
            fact.score = 1;
        }
        yield fact;
    }
}


// TODO: For the moment, a lot of responsibility is on the rankers to return a
// pretty big data structure of up to 4 properties. This is a bit verbose for
// an arrow function (as I hope we can use most of the time) and the usual case
// will probably be returning just a score multiplier. Make that case more
// concise.

// TODO: It is likely that rankers should receive the notes of their input type
// as a 2nd arg, for brevity.


// Build a condition that finds its matches by running a DOM selector against
// the original DOM tree.
//
// For consistency, Nodes will still be delivered to the transformers, but
// they'll have empty flavors and score = 1.
//
// Condition constructors like dom() and flavor() build stupid, introspectable
// objects that the query engine can read. They don't actually do the query
// themselves. That way, the query planner can be smarter than them, figuring
// out which indices to use based on all of them. (We'll probably keep a heap
// by each dimension's score and a hash by flavor name, for starters.) Someday,
// fancy things like this may be possible: rule(and(tag('p'), klass('snork')),
// ...)
function dom(selector) {
    const kind = 'dom';
    return {flavor: kind, inputFlavor: kind, selector: selector};
}


// Build a condition that selects knowledgebase nodes by their flavor.
function flavor(inputFlavor) {
    const condition = {flavor: 'flavor'};
    condition.inputFlavor = inputFlavor;
    return condition;
}


// Pair a condition (from dom() or flavor()) with the ranker to run on its matches.
function rule(source, ranker) {
    return {source: source, ranker: ranker};
}