toolkit/modules/PageMetadata.jsm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

"use strict";

this.EXPORTED_SYMBOLS = ["PageMetadata"];

const {classes: Cc, interfaces: Ci, utils: Cu, results: Cr} = Components;

Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/XPCOMUtils.jsm");
Cu.import("resource://gre/modules/microformat-shiv.js");

XPCOMUtils.defineLazyServiceGetter(this, "UnescapeService",
                                   "@mozilla.org/feed-unescapehtml;1",
                                   "nsIScriptableUnescapeHTML");


/**
 * Maximum number of images to discover in the document, when no preview images
 * are explicitly specified by the metadata.
 * @type {Number}
 */
const DISCOVER_IMAGES_MAX  = 5;


/**
 * Extract metadata and microformats from a HTML document.
 * @type {Object}
 */
this.PageMetadata = {
  /**
   * Get all metadata from an HTML document. This includes:
   * - URL
   * - title
   * - Metadata specified in <meta> tags, including OpenGraph data
   * - Links specified in <link> tags (short, canonical, preview images, alternative)
   * - Content that can be found in the page content that we consider useful metadata
   * - Microformats
   *
   * @param {Document} document - Document to extract data from.
   * @param {Element} [target] - Optional element to restrict microformats lookup to.
   * @returns {Object} Object containing the various metadata, normalized to
   *                   merge some common alternative names for metadata.
   */
  getData(document, target = null) {
    let result = {
      url: this._validateURL(document, document.documentURI),
      title: document.title,
      previews: [],
    };

    // if pushState was used to change the url, most likely all meta data is
    // invalid. This is the case with several major sites that rely on
    // pushState. In that case, we'll only return uri and title. If document is
    // via XHR or something, there is no view or history.
    if (document.defaultView) {
      let docshell = document.defaultView.QueryInterface(Ci.nsIInterfaceRequestor)
                                         .getInterface(Ci.nsIWebNavigation)
                                         .QueryInterface(Ci.nsIDocShell);
      let shentry = {};
      if (docshell.getCurrentSHEntry(shentry) &&
          shentry.value && shentry.value.URIWasModified) {
        return result;
      }
    }

    this._getMetaData(document, result);
    this._getLinkData(document, result);
    this._getPageData(document, result);
    result.microformats = this.getMicroformats(document, target);

    return result;
  },

  getMicroformats(document, target = null) {
    if (target) {
      return Microformats.getParent(target, {node: document});
    }
    return Microformats.get({node: document});
  },

  /**
   * Get metadata as defined in <meta> tags.
   * This adds properties to an existing result object.
   *
   * @param {Document} document - Document to extract data from.
   * @param {Object}  result - Existing result object to add properties to.
   */
  _getMetaData(document, result) {
    // Query for standardized meta data.
    let elements = document.querySelectorAll("head > meta[property], head > meta[name]");
    if (elements.length < 1) {
      return;
    }

    for (let element of elements) {
      let value = element.getAttribute("content")
      if (!value) {
        continue;
      }
      value = UnescapeService.unescape(value.trim());

      let key = element.getAttribute("property") || element.getAttribute("name");
      if (!key) {
        continue;
      }

      // There are a wide array of possible meta tags, expressing articles,
      // products, etc. so all meta tags are passed through but we touch up the
      // most common attributes.
      result[key] = value;

      switch (key) {
        case "title":
        case "og:title": {
          // Only set the title if one hasn't already been obtained (e.g. from the
          // document title element).
          if (!result.title) {
            result.title = value;
          }
          break;
        }

        case "description":
        case "og:description": {
          result.description = value;
          break;
        }

        case "og:site_name": {
          result.siteName = value;
          break;
        }

        case "medium":
        case "og:type": {
          result.medium = value;
          break;
        }

        case "og:video": {
          let url = this._validateURL(document, value);
          if (url) {
            result.source = url;
          }
          break;
        }

        case "og:url": {
          let url = this._validateURL(document, value);
          if (url) {
            result.url = url;
          }
          break;
        }

        case "og:image": {
          let url = this._validateURL(document, value);
          if (url) {
            result.previews.push(url);
          }
          break;
        }
      }
    }
  },

  /**
   * Get metadata as defined in <link> tags.
   * This adds properties to an existing result object.
   *
   * @param {Document} document - Document to extract data from.
   * @param {Object}  result - Existing result object to add properties to.
   */
  _getLinkData: function(document, result) {
    let elements = document.querySelectorAll("head > link[rel], head > link[id]");

    for (let element of elements) {
      let url = element.getAttribute("href");
      if (!url) {
        continue;
      }
      url = this._validateURL(document, UnescapeService.unescape(url.trim()));

      let key = element.getAttribute("rel") || element.getAttribute("id");
      if (!key) {
        continue;
      }

      switch (key) {
        case "shorturl":
        case "shortlink": {
          result.shortUrl = url;
          break;
        }

        case "canonicalurl":
        case "canonical": {
          result.url = url;
          break;
        }

        case "image_src": {
          result.previews.push(url);
          break;
        }

        case "alternate": {
          // Expressly for oembed support but we're liberal here and will let
          // other alternate links through. oembed defines an href, supplied by
          // the site, where you can fetch additional meta data about a page.
          // We'll let the client fetch the oembed data themselves, but they
          // need the data from this link.
          if (!result.alternate) {
            result.alternate = [];
          }

          result.alternate.push({
            type: element.getAttribute("type"),
            href: element.getAttribute("href"),
            title: element.getAttribute("title")
          });
        }
      }
    }
  },

  /**
   * Scrape thought the page content for additional content that may be used to
   * suppliment explicitly defined metadata. This includes:
   * - First few images, when no preview image metadata is explicitly defined.
   *
   * This adds properties to an existing result object.
   *
   * @param {Document} document - Document to extract data from.
   * @param {Object}  result - Existing result object to add properties to.
   */
  _getPageData(document, result) {
    if (result.previews.length < 1) {
      result.previews = this._getImageUrls(document);
    }
  },

  /**
   * Find the first few images in a document, for use as preview images.
   * Will return upto DISCOVER_IMAGES_MAX number of images.
   *
   * @note This is not very clever. It does not (yet) check if any of the
   *       images may be appropriate as a preview image.
   *
   * @param {Document} document - Document to extract data from.
   * @return {[string]} Array of URLs.
   */
  _getImageUrls(document) {
    let result = [];
    let elements = document.querySelectorAll("img");

    for (let element of elements) {
      let src = element.getAttribute("src");
      if (src) {
        result.push(this._validateURL(document, UnescapeService.unescape(src)));

        // We don't want a billion images.
        // TODO: Move this magic number to a const.
        if (result.length > DISCOVER_IMAGES_MAX) {
          break;
        }
      }
    }

    return result;
  },

  /**
   * Validate a URL. This involves resolving the URL if it's relative to the
   * document location, ensuring it's using an expected scheme, and stripping
   * the userPass portion of the URL.
   *
   * @param {Document} document - Document to use as the root location for a relative URL.
   * @param {string} url - URL to validate.
   * @return {string} Result URL.
   */
  _validateURL(document, url) {
    let docURI = Services.io.newURI(document.documentURI, null, null);
    let uri = Services.io.newURI(docURI.resolve(url), null, null);

    if (["http", "https"].indexOf(uri.scheme) < 0) {
      return null;
    }

    uri.userPass = "";

    return uri.spec;
  },
};