Issue #1258 - Part 1: Import mailnews, ldap, and mork from comm-esr52.9.1

author: Matt A. Tobin <email@mattatobin.com> 2019-11-03 00:17:46 -0400
committer: Matt A. Tobin <email@mattatobin.com> 2019-11-03 00:17:46 -0400
commit: 302bf1b523012e11b60425d6eee1221ebc2724eb (patch)
tree: b191a895f8716efcbe42f454f37597a545a6f421 /mailnews/extensions/newsblog/content/feed-parser.js
parent: 21b3f6247403c06f85e1f45d219f87549862198f (diff)
download: UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.gz
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.lz
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.xz
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.zip
1 files changed, 1034 insertions, 0 deletions
diff --git a/mailnews/extensions/newsblog/content/feed-parser.js b/mailnews/extensions/newsblog/content/feed-parser.js
new file mode 100644
index 000000000..660333422
--- /dev/null
+++ b/mailnews/extensions/newsblog/content/feed-parser.js
@@ -0,0 +1,1034 @@
+/* -*- Mode: JavaScript; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// The feed parser depends on FeedItem.js, Feed.js.
+function FeedParser() {
+  this.mSerializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"].
+                     createInstance(Ci.nsIDOMSerializer);
+}
+
+FeedParser.prototype =
+{
+  // parseFeed() returns an array of parsed items ready for processing.  It is
+  // currently a synchronous operation.  If there is an error parsing the feed,
+  // parseFeed returns an empty feed in addition to calling aFeed.onParseError.
+  parseFeed: function (aFeed, aDOM)
+  {
+    if (!(aDOM instanceof Ci.nsIDOMXMLDocument))
+    {
+      // No xml doc.
+      return aFeed.onParseError(aFeed);
+    }
+
+    let doc = aDOM.documentElement;
+    if (doc.namespaceURI == FeedUtils.MOZ_PARSERERROR_NS)
+    {
+      // Gecko caught a basic parsing error.
+      let errStr = doc.firstChild.textContent + "\n" +
+                   doc.firstElementChild.textContent;
+      FeedUtils.log.info("FeedParser.parseFeed: - " + errStr);
+      return aFeed.onParseError(aFeed);
+    }
+    else if (aDOM.querySelector("redirect"))
+    {
+      // Check for RSS2.0 redirect document.
+      let channel = aDOM.querySelector("redirect");
+      if (this.isPermanentRedirect(aFeed, channel, null, null))
+        return;
+
+      return aFeed.onParseError(aFeed);
+    }
+    else if (doc.namespaceURI == FeedUtils.RDF_SYNTAX_NS &&
+             doc.getElementsByTagNameNS(FeedUtils.RSS_NS, "channel")[0])
+    {
+      aFeed.mFeedType = "RSS_1.xRDF"
+      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
+                          aFeed.mFeedType +" : " +aFeed.url);
+      // aSource can be misencoded (XMLHttpRequest converts to UTF-8 by default),
+      // but the DOM is almost always right because it uses the hints in the
+      // XML file.  This is slower, but not noticably so.  Mozilla doesn't have
+      // the XMLHttpRequest.responseBody property that IE has, which provides
+      // access to the unencoded response.
+      let xmlString = this.mSerializer.serializeToString(doc);
+      return this.parseAsRSS1(aFeed, xmlString, aFeed.request.channel.URI);
+    }
+    else if (doc.namespaceURI == FeedUtils.ATOM_03_NS)
+    {
+      aFeed.mFeedType = "ATOM_0.3"
+      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
+                          aFeed.mFeedType +" : " +aFeed.url);
+      return this.parseAsAtom(aFeed, aDOM);
+    }
+    else if (doc.namespaceURI == FeedUtils.ATOM_IETF_NS)
+    {
+      aFeed.mFeedType = "ATOM_IETF"
+      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
+                          aFeed.mFeedType +" : " +aFeed.url);
+      return this.parseAsAtomIETF(aFeed, aDOM);
+    }
+    else if (doc.getElementsByTagNameNS(FeedUtils.RSS_090_NS, "channel")[0])
+    {
+      aFeed.mFeedType = "RSS_0.90"
+      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
+                          aFeed.mFeedType +" : " +aFeed.url);
+      return this.parseAsRSS2(aFeed, aDOM);
+    }
+    else
+    {
+      // Parse as RSS 0.9x.  In theory even RSS 1.0 feeds could be parsed by
+      // the 0.9x parser if the RSS namespace were the default.
+      let rssVer = doc.localName == "rss" ? doc.getAttribute("version") : null;
+      if (rssVer)
+        aFeed.mFeedType = "RSS_" + rssVer;
+      else
+        aFeed.mFeedType = "RSS_0.9x?";
+      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
+                          aFeed.mFeedType +" : " +aFeed.url);
+      return this.parseAsRSS2(aFeed, aDOM);
+    }
+  },
+
+  parseAsRSS2: function (aFeed, aDOM)
+  {
+    // Get the first channel (assuming there is only one per RSS File).
+    let parsedItems = new Array();
+
+    let channel = aDOM.querySelector("channel");
+    if (!channel)
+      return aFeed.onParseError(aFeed);
+
+    // Usually the empty string, unless this is RSS .90.
+    let nsURI = channel.namespaceURI || "";
+    FeedUtils.log.debug("FeedParser.parseAsRSS2: channel nsURI - " + nsURI);
+
+    if (this.isPermanentRedirect(aFeed, null, channel, null))
+      return;
+
+    let tags = this.childrenByTagNameNS(channel, nsURI, "title");
+    aFeed.title = aFeed.title || this.getNodeValue(tags ? tags[0] : null);
+    tags = this.childrenByTagNameNS(channel, nsURI, "description");
+    aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null);
+    tags = this.childrenByTagNameNS(channel, nsURI, "link");
+    aFeed.link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
+
+    if (!(aFeed.title || aFeed.description) || !aFeed.link)
+    {
+      FeedUtils.log.error("FeedParser.parseAsRSS2: missing mandatory element " +
+                          "<title> and <description>, or <link>");
+      return aFeed.onParseError(aFeed);
+    }
+
+    if (!aFeed.parseItems)
+      return parsedItems;
+
+    aFeed.invalidateItems();
+    // XXX use getElementsByTagNameNS for now; childrenByTagNameNS would be
+    // better, but RSS .90 is still with us.
+    let itemNodes = aDOM.getElementsByTagNameNS(nsURI, "item");
+    itemNodes = itemNodes ? itemNodes : [];
+    FeedUtils.log.debug("FeedParser.parseAsRSS2: items to parse - " +
+                        itemNodes.length);
+
+    for (let itemNode of itemNodes)
+    {
+      if (!itemNode.childElementCount)
+        continue;
+      let item = new FeedItem();
+      item.feed = aFeed;
+      item.enclosures = [];
+      item.keywords = [];
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origLink");
+      let link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
+      if (!link)
+      {
+        tags = this.childrenByTagNameNS(itemNode, nsURI, "link");
+        link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
+      }
+      tags = this.childrenByTagNameNS(itemNode, nsURI, "guid");
+      let guidNode = tags ? tags[0] : null;
+
+      let guid;
+      let isPermaLink = false;
+      if (guidNode)
+      {
+        guid = this.getNodeValue(guidNode);
+        // isPermaLink is true if the value is "true" or if the attribute is
+        // not present; all other values, including "false" and "False" and
+        // for that matter "TRuE" and "meatcake" are false.
+        if (!guidNode.hasAttribute("isPermaLink") ||
+            guidNode.getAttribute("isPermaLink") == "true")
+          isPermaLink = true;
+        // If attribute isPermaLink is missing, it is good to check the validity
+        // of <guid> value as an URL to avoid linking to non-URL strings.
+        if (!guidNode.hasAttribute("isPermaLink"))
+        {
+          try
+          {
+            Services.io.newURI(guid, null, null);
+            if (Services.io.extractScheme(guid) == "tag")
+              isPermaLink = false;
+          }
+          catch (ex)
+          {
+            isPermaLink = false;
+          }
+        }
+
+        item.id = guid;
+      }
+
+      let guidLink = this.validLink(guid);
+      item.url = isPermaLink && guidLink ? guidLink : link ? link : null;
+      tags = this.childrenByTagNameNS(itemNode, nsURI, "description");
+      item.description = this.getNodeValueFormatted(tags ? tags[0] : null);
+      tags = this.childrenByTagNameNS(itemNode, nsURI, "title");
+      item.title = this.getNodeValue(tags ? tags[0] : null);
+      if (!(item.title || item.description))
+      {
+        FeedUtils.log.info("FeedParser.parseAsRSS2: <item> missing mandatory " +
+                           "element, either <title> or <description>; skipping");
+        continue;
+      }
+
+      if (!item.id)
+      {
+        // At this point, if there is no guid, uniqueness cannot be guaranteed
+        // by any of link or date (optional) or title (optional unless there
+        // is no description). Use a big chunk of description; minimize dupes
+        // with url and title if present.
+        item.id = (item.url || item.feed.url) + "#" + item.title + "#" +
+                  (this.stripTags(item.description ?
+                                    item.description.substr(0, 150) : null) ||
+                   item.title);
+        item.id = item.id.replace(/[\n\r\t\s]+/g, " ");
+      }
+
+      // Escape html entities in <title>, which are unescaped as textContent
+      // values. If the title is used as content, it will remain escaped; if
+      // it is used as the title, it will be unescaped upon store. Bug 1240603.
+      // The <description> tag must follow escaping examples found in
+      // http://www.rssboard.org/rss-encoding-examples, i.e. single escape angle
+      // brackets for tags, which are removed if used as title, and double
+      // escape entities for presentation in title.
+      // Better: always use <title>. Best: use Atom.
+      if (!item.title)
+        item.title = this.stripTags(item.description).substr(0, 150);
+      else
+        item.title = item.htmlEscape(item.title);
+
+      tags = this.childrenByTagNameNS(itemNode, nsURI, "author");
+      if (!tags)
+        tags = this.childrenByTagNameNS(itemNode, FeedUtils.DC_NS, "creator");
+      item.author = this.getNodeValue(tags ? tags[0] : null) ||
+                    aFeed.title ||
+                    item.author;
+
+      tags = this.childrenByTagNameNS(itemNode, nsURI, "pubDate");
+      if (!tags || !this.getNodeValue(tags[0]))
+        tags = this.childrenByTagNameNS(itemNode, FeedUtils.DC_NS, "date");
+      item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;
+
+      // If the date is invalid, users will see the beginning of the epoch
+      // unless we reset it here, so they'll see the current time instead.
+      // This is typical aggregator behavior.
+      if (item.date)
+      {
+        item.date = item.date.trim();
+        if (!FeedUtils.isValidRFC822Date(item.date))
+        {
+          // XXX Use this on the other formats as well.
+          item.date = this.dateRescue(item.date);
+        }
+      }
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.RSS_CONTENT_NS, "encoded");
+      item.content = this.getNodeValueFormatted(tags ? tags[0] : null);
+
+      // Handle <enclosures> and <media:content>, which may be in a
+      // <media:group> (if present).
+      tags = this.childrenByTagNameNS(itemNode, nsURI, "enclosure");
+      let encUrls = [];
+      if (tags)
+        for (let tag of tags)
+        {
+          let url = this.validLink(tag.getAttribute("url"));
+          if (url && encUrls.indexOf(url) == -1)
+          {
+            let type = this.removeUnprintableASCII(tag.getAttribute("type"));
+            let length = this.removeUnprintableASCII(tag.getAttribute("length"));
+            item.enclosures.push(new FeedEnclosure(url, type, length));
+            encUrls.push(url);
+          }
+        }
+
+      tags = itemNode.getElementsByTagNameNS(FeedUtils.MRSS_NS, "content");
+      if (tags)
+        for (let tag of tags)
+        {
+          let url = this.validLink(tag.getAttribute("url"));
+          if (url && encUrls.indexOf(url) == -1)
+          {
+            let type = this.removeUnprintableASCII(tag.getAttribute("type"));
+            let fileSize = this.removeUnprintableASCII(tag.getAttribute("fileSize"));
+            item.enclosures.push(new FeedEnclosure(url, type, fileSize));
+          }
+        }
+
+      // The <origEnclosureLink> tag has no specification, especially regarding
+      // whether more than one tag is allowed and, if so, how tags would
+      // relate to previously declared (and well specified) enclosure urls.
+      // The common usage is to include 1 origEnclosureLink, in addition to
+      // the specified enclosure tags for 1 enclosure. Thus, we will replace the
+      // first enclosure's, if found, url with the first <origEnclosureLink>
+      // url only or else add the <origEnclosureLink> url.
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origEnclosureLink");
+      let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null));
+      if (origEncUrl)
+      {
+        if (item.enclosures.length)
+          item.enclosures[0].mURL = origEncUrl;
+        else
+          item.enclosures.push(new FeedEnclosure(origEncUrl));
+      }
+
+      // Support <category> and autotagging.
+      tags = this.childrenByTagNameNS(itemNode, nsURI, "category");
+      if (tags)
+      {
+        for (let tag of tags)
+        {
+          let term = this.getNodeValue(tag);
+          term = term ? this.xmlUnescape(term.replace(/,/g, ";")) : null;
+          if (term && item.keywords.indexOf(term) == -1)
+            item.keywords.push(term);
+        }
+      }
+
+      parsedItems.push(item);
+    }
+
+    return parsedItems;
+  },
+
+  parseAsRSS1 : function(aFeed, aSource, aBaseURI)
+  {
+    let parsedItems = new Array();
+
+    // RSS 1.0 is valid RDF, so use the RDF parser/service to extract data.
+    // Create a new RDF data source and parse the feed into it.
+    let ds = Cc["@mozilla.org/rdf/datasource;1?name=in-memory-datasource"].
+             createInstance(Ci.nsIRDFDataSource);
+
+    let rdfparser = Cc["@mozilla.org/rdf/xml-parser;1"].
+                    createInstance(Ci.nsIRDFXMLParser);
+    rdfparser.parseString(ds, aBaseURI, aSource);
+
+    // Get information about the feed as a whole.
+    let channel = ds.GetSource(FeedUtils.RDF_TYPE, FeedUtils.RSS_CHANNEL, true);
+    if (!channel)
+      return aFeed.onParseError(aFeed);
+
+    if (this.isPermanentRedirect(aFeed, null, channel, ds))
+      return;
+
+    aFeed.title = aFeed.title ||
+                  this.getRDFTargetValue(ds, channel, FeedUtils.RSS_TITLE) ||
+                  aFeed.url;
+    aFeed.description = this.getRDFTargetValueFormatted(ds, channel, FeedUtils.RSS_DESCRIPTION) ||
+                        "";
+    aFeed.link = this.validLink(this.getRDFTargetValue(ds, channel, FeedUtils.RSS_LINK)) ||
+                 aFeed.url;
+
+    if (!(aFeed.title || aFeed.description) || !aFeed.link)
+    {
+      FeedUtils.log.error("FeedParser.parseAsRSS1: missing mandatory element " +
+                          "<title> and <description>, or <link>");
+      return aFeed.onParseError(aFeed);
+    }
+
+    if (!aFeed.parseItems)
+      return parsedItems;
+
+    aFeed.invalidateItems();
+
+    // Ignore the <items> list and just get the <item>s.
+    let items = ds.GetSources(FeedUtils.RDF_TYPE, FeedUtils.RSS_ITEM, true);
+
+    let index = 0;
+    while (items.hasMoreElements())
+    {
+      let itemResource = items.getNext().QueryInterface(Ci.nsIRDFResource);
+      let item = new FeedItem();
+      item.feed = aFeed;
+
+      // Prefer the value of the link tag to the item URI since the URI could be
+      // a relative URN.
+      let uri = itemResource.ValueUTF8;
+      let link = this.validLink(this.getRDFTargetValue(ds, itemResource, FeedUtils.RSS_LINK));
+      item.url = link || uri;
+      item.description = this.getRDFTargetValueFormatted(ds, itemResource,
+                                                         FeedUtils.RSS_DESCRIPTION);
+      item.title = this.getRDFTargetValue(ds, itemResource, FeedUtils.RSS_TITLE) ||
+                   this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_SUBJECT) ||
+                   (item.description ?
+                     (this.stripTags(item.description).substr(0, 150)) : null);
+      if (!item.url || !item.title)
+      {
+        FeedUtils.log.info("FeedParser.parseAsRSS1: <item> missing mandatory " +
+                           "element <item rdf:about> and <link>, or <title> and " +
+                           "no <description>; skipping");
+        continue;
+      }
+
+      item.id = item.url;
+      item.url = this.validLink(item.url);
+
+      item.author = this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_CREATOR) ||
+                    this.getRDFTargetValue(ds, channel, FeedUtils.DC_CREATOR) ||
+                    aFeed.title ||
+                    item.author;
+      item.date = this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_DATE) ||
+                  item.date;
+      item.content = this.getRDFTargetValueFormatted(ds, itemResource,
+                                                     FeedUtils.RSS_CONTENT_ENCODED);
+
+      parsedItems[index++] = item;
+    }
+    FeedUtils.log.debug("FeedParser.parseAsRSS1: items parsed - " + index);
+
+    return parsedItems;
+  },
+
+  parseAsAtom: function(aFeed, aDOM)
+  {
+    let parsedItems = new Array();
+
+    // Get the first channel (assuming there is only one per Atom File).
+    let channel = aDOM.querySelector("feed");
+    if (!channel)
+      return aFeed.onParseError(aFeed);
+
+    if (this.isPermanentRedirect(aFeed, null, channel, null))
+      return;
+
+    let tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "title");
+    aFeed.title = aFeed.title ||
+                  this.stripTags(this.getNodeValue(tags ? tags[0] : null));
+    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "tagline");
+    aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null);
+    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "link");
+    aFeed.link = this.validLink(this.findAtomLink("alternate", tags));
+
+    if (!aFeed.title)
+    {
+      FeedUtils.log.error("FeedParser.parseAsAtom: missing mandatory element " +
+                          "<title>");
+      return aFeed.onParseError(aFeed);
+    }
+
+    if (!aFeed.parseItems)
+      return parsedItems;
+
+    aFeed.invalidateItems();
+    let items = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "entry");
+    items = items ? items : [];
+    FeedUtils.log.debug("FeedParser.parseAsAtom: items to parse - " +
+                        items.length);
+
+    for (let itemNode of items)
+    {
+      if (!itemNode.childElementCount)
+        continue;
+      let item = new FeedItem();
+      item.feed = aFeed;
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "link");
+      item.url = this.validLink(this.findAtomLink("alternate", tags));
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "id");
+      item.id = this.getNodeValue(tags ? tags[0] : null);
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "summary");
+      item.description = this.getNodeValueFormatted(tags ? tags[0] : null);
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "title");
+      item.title = this.getNodeValue(tags ? tags[0] : null) ||
+                   (item.description ? item.description.substr(0, 150) : null);
+      if (!item.title || !item.id)
+      {
+        // We're lenient about other mandatory tags, but insist on these.
+        FeedUtils.log.info("FeedParser.parseAsAtom: <entry> missing mandatory " +
+                           "element <id>, or <title> and no <summary>; skipping");
+        continue;
+      }
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "author");
+      if (!tags)
+        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "contributor");
+      if (!tags)
+        tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "author");
+
+      let authorEl = tags ? tags[0] : null;
+
+      let author = "";
+      if (authorEl)
+      {
+        tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_03_NS, "name");
+        let name = this.getNodeValue(tags ? tags[0] : null);
+        tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_03_NS, "email");
+        let email = this.getNodeValue(tags ? tags[0] : null);
+        if (name)
+          author = name + (email ? " <" + email + ">" : "");
+        else if (email)
+          author = email;
+      }
+
+      item.author = author || item.author || aFeed.title;
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "modified");
+      if (!tags || !this.getNodeValue(tags[0]))
+        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "issued");
+      if (!tags || !this.getNodeValue(tags[0]))
+        tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "created");
+
+      item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;
+
+      // XXX We should get the xml:base attribute from the content tag as well
+      // and use it as the base HREF of the message.
+      // XXX Atom feeds can have multiple content elements; we should differentiate
+      // between them and pick the best one.
+      // Some Atom feeds wrap the content in a CTYPE declaration; others use
+      // a namespace to identify the tags as HTML; and a few are buggy and put
+      // HTML tags in without declaring their namespace so they look like Atom.
+      // We deal with the first two but not the third.
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "content");
+      let contentNode = tags ? tags[0] : null;
+
+      let content;
+      if (contentNode)
+      {
+        content = "";
+        for (let j = 0; j < contentNode.childNodes.length; j++)
+        {
+          let node = contentNode.childNodes.item(j);
+          if (node.nodeType == node.CDATA_SECTION_NODE)
+            content += node.data;
+          else
+            content += this.mSerializer.serializeToString(node);
+        }
+      
+        if (contentNode.getAttribute("mode") == "escaped")
+        {
+          content = content.replace(/&lt;/g, "<");
+          content = content.replace(/&gt;/g, ">");
+          content = content.replace(/&amp;/g, "&");
+        }
+
+        if (content == "")
+          content = null;
+      }
+
+      item.content = content;
+      parsedItems.push(item);
+    }
+
+    return parsedItems;
+  },
+
+  parseAsAtomIETF: function(aFeed, aDOM)
+  {
+    let parsedItems = new Array();
+
+    // Get the first channel (assuming there is only one per Atom File).
+    let channel = this.childrenByTagNameNS(aDOM, FeedUtils.ATOM_IETF_NS, "feed")[0];
+    if (!channel)
+      return aFeed.onParseError(aFeed);
+
+    if (this.isPermanentRedirect(aFeed, null, channel, null))
+      return;
+
+    let tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "title");
+    aFeed.title = aFeed.title ||
+                  this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null));
+
+    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "subtitle");
+    aFeed.description = this.serializeTextConstruct(tags ? tags[0] : null);
+
+    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "link");
+    aFeed.link = this.findAtomLink("alternate", tags);
+    aFeed.link = this.validLink(aFeed.link);
+
+    if (!aFeed.title)
+    {
+      FeedUtils.log.error("FeedParser.parseAsAtomIETF: missing mandatory element " +
+                          "<title>");
+      return aFeed.onParseError(aFeed);
+    }
+
+    if (!aFeed.parseItems)
+      return parsedItems;
+
+    aFeed.invalidateItems();
+    let items = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "entry");
+    items = items ? items : [];
+    FeedUtils.log.debug("FeedParser.parseAsAtomIETF: items to parse - " +
+                        items.length);
+
+    for (let itemNode of items)
+    {
+      if (!itemNode.childElementCount)
+        continue;
+      let item = new FeedItem();
+      item.feed = aFeed;
+      item.enclosures = [];
+      item.keywords = [];
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origLink");
+      item.url = this.validLink(this.getNodeValue(tags ? tags[0] : null));
+      if (!item.url)
+      {
+        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "link");
+        item.url = this.validLink(this.findAtomLink("alternate", tags)) ||
+                   aFeed.link;
+      }
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "id");
+      item.id = this.getNodeValue(tags ? tags[0] : null);
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "summary");
+      item.description = this.serializeTextConstruct(tags ? tags[0] : null);
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "title");
+      item.title = this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null) ||
+                                  (item.description ?
+                                     item.description.substr(0, 150) : null));
+      if (!item.title || !item.id)
+      {
+        // We're lenient about other mandatory tags, but insist on these.
+        FeedUtils.log.info("FeedParser.parseAsAtomIETF: <entry> missing mandatory " +
+                           "element <id>, or <title> and no <summary>; skipping");
+        continue;
+      }
+
+      // XXX Support multiple authors.
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "source");
+      let source = tags ? tags[0] : null;
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "author");
+      if (!tags)
+        tags = this.childrenByTagNameNS(source, FeedUtils.ATOM_IETF_NS, "author");
+      if (!tags)
+        tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "author");
+
+      let authorEl = tags ? tags[0] : null;
+
+      let author = "";
+      if (authorEl)
+      {
+        tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_IETF_NS, "name");
+        let name = this.getNodeValue(tags ? tags[0] : null);
+        tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_IETF_NS, "email");
+        let email = this.getNodeValue(tags ? tags[0] : null);
+        if (name)
+          author = name + (email ? " <" + email + ">" : "");
+        else if (email)
+          author = email;
+      }
+
+      item.author = author || item.author || aFeed.title;
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "updated");
+      if (!tags || !this.getNodeValue(tags[0]))
+        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "published");
+      if (!tags || !this.getNodeValue(tags[0]))
+        tags = this.childrenByTagNameNS(source, FeedUtils.ATOM_IETF_NS, "published");
+      item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "content");
+      item.content = this.serializeTextConstruct(tags ? tags[0] : null);
+
+      if (item.content)
+        item.xmlContentBase = tags ? tags[0].baseURI : null;
+      else if (item.description)
+      {
+        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "summary");
+        item.xmlContentBase = tags ? tags[0].baseURI : null;
+      }
+      else
+        item.xmlContentBase = itemNode.baseURI;
+
+      item.xmlContentBase = this.validLink(item.xmlContentBase);
+
+      // Handle <link rel="enclosure"> (if present).
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "link");
+      let encUrls = [];
+      if (tags)
+        for (let tag of tags)
+        {
+          let url = tag.getAttribute("rel") == "enclosure" ?
+                      (tag.getAttribute("href") || "").trim() : null;
+          url = this.validLink(url);
+          if (url && encUrls.indexOf(url) == -1)
+          {
+            let type = this.removeUnprintableASCII(tag.getAttribute("type"));
+            let length = this.removeUnprintableASCII(tag.getAttribute("length"));
+            let title = this.removeUnprintableASCII(tag.getAttribute("title"));
+            item.enclosures.push(new FeedEnclosure(url, type, length, title));
+            encUrls.push(url);
+          }
+        }
+
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origEnclosureLink");
+      let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null));
+      if (origEncUrl)
+      {
+        if (item.enclosures.length)
+          item.enclosures[0].mURL = origEncUrl;
+        else
+          item.enclosures.push(new FeedEnclosure(origEncUrl));
+      }
+
+      // Handle atom threading extension, RFC4685.  There may be 1 or more tags,
+      // and each must contain a ref attribute with 1 Message-Id equivalent
+      // value.  This is the only attr of interest in the spec for presentation.
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_THREAD_NS, "in-reply-to");
+      if (tags)
+      {
+        for (let tag of tags)
+        {
+          let ref = this.removeUnprintableASCII(tag.getAttribute("ref"));
+          if (ref)
+            item.inReplyTo += item.normalizeMessageID(ref) + " ";
+        }
+        item.inReplyTo = item.inReplyTo.trimRight();
+      }
+
+      // Support <category> and autotagging.
+      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "category");
+      if (tags)
+      {
+        for (let tag of tags)
+        {
+          let term = this.removeUnprintableASCII(tag.getAttribute("term"));
+          term = term ? this.xmlUnescape(term.replace(/,/g, ";")).trim() : null;
+          if (term && item.keywords.indexOf(term) == -1)
+            item.keywords.push(term);
+        }
+      }
+
+      parsedItems.push(item);
+    }
+
+    return parsedItems;
+  },
+
+  isPermanentRedirect: function(aFeed, aRedirDocChannel, aFeedChannel, aDS)
+  {
+    // If subscribing to a new feed, do not check redirect tags.
+    if (!aFeed.downloadCallback || aFeed.downloadCallback.mSubscribeMode)
+      return false;
+
+    let tags, tagName, newUrl;
+    let oldUrl = aFeed.url;
+
+    // Check for RSS2.0 redirect document <newLocation> tag.
+    if (aRedirDocChannel)
+    {
+      tagName = "newLocation";
+      tags = this.childrenByTagNameNS(aRedirDocChannel, "", tagName);
+      newUrl = this.getNodeValue(tags ? tags[0] : null);
+    }
+
+    // Check for <itunes:new-feed-url> tag.
+    if (aFeedChannel)
+    {
+      tagName = "new-feed-url";
+      if (aDS)
+      {
+        tags = FeedUtils.rdf.GetResource(FeedUtils.ITUNES_NS + tagName);
+        newUrl = this.getRDFTargetValue(aDS, aFeedChannel, tags);
+      }
+      else
+      {
+        tags = this.childrenByTagNameNS(aFeedChannel, FeedUtils.ITUNES_NS, tagName);
+        newUrl = this.getNodeValue(tags ? tags[0] : null);
+      }
+      tagName = "itunes:" + tagName;
+    }
+
+    if (newUrl && newUrl != oldUrl && FeedUtils.isValidScheme(newUrl) &&
+        FeedUtils.changeUrlForFeed(aFeed, newUrl))
+    {
+      FeedUtils.log.info("FeedParser.isPermanentRedirect: found <" + tagName +
+                         "> tag; updated feed url from: " + oldUrl + " to: " + newUrl +
+                         " in folder: " + FeedUtils.getFolderPrettyPath(aFeed.folder));
+      aFeed.onUrlChange(aFeed, oldUrl);
+      return true;
+    }
+
+    return false;
+  },
+
+  serializeTextConstruct: function(textElement)
+  {
+    let content = "";
+    if (textElement)
+    {
+      let textType = textElement.getAttribute("type");
+
+      // Atom spec says consider it "text" if not present.
+      if (!textType)
+        textType = "text";
+
+      // There could be some strange content type we don't handle.
+      if (textType != "text" && textType != "html" && textType != "xhtml")
+        return null;
+
+      for (let j = 0; j < textElement.childNodes.length; j++)
+      {
+        let node = textElement.childNodes.item(j);
+        if (node.nodeType == node.CDATA_SECTION_NODE)
+          content += this.xmlEscape(node.data);
+        else
+          content += this.mSerializer.serializeToString(node);
+      }
+
+      if (textType == "html")
+        content = this.xmlUnescape(content);
+
+      content = content.trim();
+    }
+
+    // Other parts of the code depend on this being null if there's no content.
+    return content ? content : null;
+  },
+
+  getRDFTargetValue: function(ds, source, property)
+  {
+    let nodeValue = this.getRDFTargetValueRaw(ds, source, property);
+    if (!nodeValue)
+      return null;
+
+    nodeValue = nodeValue.replace(/[\n\r\t]+/g, " ");
+    return this.removeUnprintableASCII(nodeValue);
+
+  },
+
+  getRDFTargetValueFormatted: function(ds, source, property)
+  {
+    let nodeValue = this.getRDFTargetValueRaw(ds, source, property);
+    if (!nodeValue)
+      return null;
+
+    return this.removeUnprintableASCIIexCRLFTAB(nodeValue);
+
+  },
+
+  getRDFTargetValueRaw: function(ds, source, property)
+  {
+    let node = ds.GetTarget(source, property, true);
+    if (node)
+    {
+      try
+      {
+        node = node.QueryInterface(Ci.nsIRDFLiteral);
+        if (node)
+          return node.Value.trim();
+      }
+      catch (e)
+      {
+        // If the RDF was bogus, do nothing.  Rethrow if it's some other problem.
+        if (!((e instanceof Ci.nsIXPCException) &&
+              e.result == Cr.NS_ERROR_NO_INTERFACE))
+          throw new Error("FeedParser.getRDFTargetValue: " + e);
+      }
+    }
+
+    return null;
+  },
+
+  /**
+   * Return a cleaned up node value. This is intended for values that are not
+   * multiline and not formatted. A sequence of tab or newline is converted to
+   * a space and unprintable ascii is removed.
+   *
+   * @param {Node} node  - A DOM node.
+   * @return {String}    - A clean string value or null.
+   */
+  getNodeValue: function(node)
+  {
+    let nodeValue = this.getNodeValueRaw(node);
+    if (!nodeValue)
+      return null;
+
+    nodeValue = nodeValue.replace(/[\n\r\t]+/g, " ");
+    return this.removeUnprintableASCII(nodeValue);
+  },
+
+  /**
+   * Return a cleaned up formatted node value, meaning CR/LF/TAB are retained
+   * while all other unprintable ascii is removed. This is intended for values
+   * that are multiline and formatted, such as content or description tags.
+   *
+   * @param {Node} node  - A DOM node.
+   * @return {String}    - A clean string value or null.
+   */
+  getNodeValueFormatted: function(node)
+  {
+    let nodeValue = this.getNodeValueRaw(node);
+    if (!nodeValue)
+      return null;
+
+    return this.removeUnprintableASCIIexCRLFTAB(nodeValue);
+  },
+
+  /**
+   * Return a raw node value, as received. This should be sanitized as
+   * appropriate.
+   *
+   * @param {Node} node  - A DOM node.
+   * @return {String}    - A string value or null.
+   */
+  getNodeValueRaw: function(node)
+  {
+    if (node && node.textContent)
+      return node.textContent.trim();
+
+    if (node && node.firstChild)
+    {
+      let ret = "";
+      for (let child = node.firstChild; child; child = child.nextSibling)
+      {
+        let value = this.getNodeValueRaw(child);
+        if (value)
+          ret += value;
+      }
+
+      if (ret)
+        return ret.trim();
+    }
+
+    return null;
+  },
+
+  // Finds elements that are direct children of the first arg.
+  childrenByTagNameNS: function(aElement, aNamespace, aTagName)
+  {
+    if (!aElement)
+      return null;
+
+    let matches = aElement.getElementsByTagNameNS(aNamespace, aTagName);
+    let matchingChildren = new Array();
+    for (let match of matches)
+    {
+      if (match.parentNode == aElement)
+        matchingChildren.push(match)
+    }
+
+    return matchingChildren.length ? matchingChildren : null;
+  },
+
+  /**
+   * Ensure <link> type tags start with http[s]://, ftp:// or magnet:
+   * for values stored in mail headers (content-base and remote enclosures),
+   * particularly to prevent data: uris, javascript, and other spoofing.
+   *
+   * @param {String} link - An intended http url string.
+   * @return {String}     - A clean string starting with http, ftp or magnet,
+   *                        else null.
+   */
+  validLink: function(link)
+  {
+    if (/^((https?|ftp):\/\/|magnet:)/.test(link))
+      return this.removeUnprintableASCII(link.trim());
+
+    return null;
+  },
+
+  findAtomLink: function(linkRel, linkElements)
+  {
+    if (!linkElements)
+      return null;
+
+    // XXX Need to check for MIME type and hreflang.
+    for (let alink of linkElements) {
+      if (alink &&
+          // If there's a link rel.
+          ((alink.getAttribute("rel") && alink.getAttribute("rel") == linkRel) ||
+           // If there isn't, assume 'alternate'.
+           (!alink.getAttribute("rel") && (linkRel == "alternate"))) &&
+          alink.getAttribute("href"))
+      {
+        // Atom links are interpreted relative to xml:base.
+        try {
+          return Services.io.newURI(alink.baseURI, null, null).
+                             resolve(alink.getAttribute("href"));
+        }
+        catch (ex) {}
+      }
+    }
+
+    return null;
+  },
+
+  /**
+   * Remove unprintable ascii, particularly CR/LF, for non formatted tag values.
+   *
+   * @param {String} s - String to clean.
+   * @return {String}
+   */
+  removeUnprintableASCII: function(s)
+  {
+    return s ? s.replace(/[\x00-\x1F\x7F]+/g, "") : "";
+  },
+
+  /**
+   * Remove unprintable ascii, except CR/LF/TAB, for formatted tag values.
+   *
+   * @param {String} s - String to clean.
+   * @return {String}
+   */
+  removeUnprintableASCIIexCRLFTAB: function(s)
+  {
+    return s ? s.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+/g, "") : "";
+  },
+
+  stripTags: function(someHTML)
+  {
+    return someHTML ? someHTML.replace(/<[^>]+>/g, "") : someHTML;
+  },
+
+  xmlUnescape: function(s)
+  {
+    s = s.replace(/&lt;/g, "<");
+    s = s.replace(/&gt;/g, ">");
+    s = s.replace(/&amp;/g, "&");
+    return s;
+  },
+
+  xmlEscape: function(s)
+  {
+    s = s.replace(/&/g, "&amp;");
+    s = s.replace(/>/g, "&gt;");
+    s = s.replace(/</g, "&lt;");
+    return s;
+  },
+
+  dateRescue: function(dateString)
+  {
+    // Deal with various kinds of invalid dates.
+    if (!isNaN(parseInt(dateString)))
+    {
+      // It's an integer, so maybe it's a timestamp.
+      let d = new Date(parseInt(dateString) * 1000);
+      let now = new Date();
+      let yeardiff = now.getFullYear() - d.getFullYear();
+      FeedUtils.log.trace("FeedParser.dateRescue: Rescue Timestamp date - " +
+                          d.toString() + " ,year diff - " + yeardiff);
+      if (yeardiff >= 0 && yeardiff < 3)
+        // It's quite likely the correct date.
+        return d.toString();
+    }
+
+    // Could be an ISO8601/W3C date.  If not, get the current time.
+    return FeedUtils.getValidRFC5322Date(dateString);
+  }
+};
author	Matt A. Tobin <email@mattatobin.com>	2019-11-03 00:17:46 -0400
committer	Matt A. Tobin <email@mattatobin.com>	2019-11-03 00:17:46 -0400
commit	302bf1b523012e11b60425d6eee1221ebc2724eb (patch)
tree	b191a895f8716efcbe42f454f37597a545a6f421 /mailnews/extensions/newsblog/content/feed-parser.js
parent	21b3f6247403c06f85e1f45d219f87549862198f (diff)
download	UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.gz UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.lz UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.xz UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.zip