/* -*- Mode: JavaScript; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// The feed parser depends on FeedItem.js, Feed.js.

/**
 * Parser for RSS 0.9x/1.0/2.0 and Atom 0.3/IETF feed documents. Each instance
 * owns an XML serializer used to turn DOM nodes back into markup strings.
 */
function FeedParser() {
  this.mSerializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"].
                     createInstance(Ci.nsIDOMSerializer);
}

FeedParser.prototype = {
  /**
   * parseFeed() returns an array of parsed items ready for processing. It is
   * currently a synchronous operation. If there is an error parsing the feed,
   * parseFeed returns whatever aFeed.onParseError(aFeed) returns. If a
   * permanent redirect is detected and applied, it returns undefined.
   *
   * The feed flavour is detected from the document element's namespace (or
   * the presence of a <redirect>/<channel> element) and recorded in
   * aFeed.mFeedType before dispatching to the matching parseAs*() method.
   *
   * @param {Object} aFeed   - The feed object being updated.
   * @param {Document} aDOM  - The parsed XML document for the feed.
   * @returns {Array|*}      - Array of parsed items, or the error/redirect
   *                           results described above.
   */
  parseFeed: function(aFeed, aDOM) {
    if (!(aDOM instanceof Ci.nsIDOMXMLDocument)) {
      // No xml doc.
      return aFeed.onParseError(aFeed);
    }

    let doc = aDOM.documentElement;
    if (doc.namespaceURI == FeedUtils.MOZ_PARSERERROR_NS) {
      // Gecko caught a basic parsing error.
      let errStr = doc.firstChild.textContent + "\n" +
                   doc.firstElementChild.textContent;
      FeedUtils.log.info("FeedParser.parseFeed: - " + errStr);
      return aFeed.onParseError(aFeed);
    }

    // Check for RSS2.0 redirect document.
    let redirect = aDOM.querySelector("redirect");
    if (redirect) {
      if (this.isPermanentRedirect(aFeed, redirect, null, null))
        return;

      return aFeed.onParseError(aFeed);
    }

    if (doc.namespaceURI == FeedUtils.RDF_SYNTAX_NS &&
        doc.getElementsByTagNameNS(FeedUtils.RSS_NS, "channel")[0]) {
      aFeed.mFeedType = "RSS_1.xRDF";
      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
                          aFeed.mFeedType +" : " +aFeed.url);
      // aSource can be misencoded (XMLHttpRequest converts to UTF-8 by default),
      // but the DOM is almost always right because it uses the hints in the
      // XML file. This is slower, but not noticably so. Mozilla doesn't have
      // the XMLHttpRequest.responseBody property that IE has, which provides
      // access to the unencoded response.
      let xmlString = this.mSerializer.serializeToString(doc);
      return this.parseAsRSS1(aFeed, xmlString, aFeed.request.channel.URI);
    }

    if (doc.namespaceURI == FeedUtils.ATOM_03_NS) {
      aFeed.mFeedType = "ATOM_0.3";
      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
                          aFeed.mFeedType +" : " +aFeed.url);
      return this.parseAsAtom(aFeed, aDOM);
    }

    if (doc.namespaceURI == FeedUtils.ATOM_IETF_NS) {
      aFeed.mFeedType = "ATOM_IETF";
      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
                          aFeed.mFeedType +" : " +aFeed.url);
      return this.parseAsAtomIETF(aFeed, aDOM);
    }

    if (doc.getElementsByTagNameNS(FeedUtils.RSS_090_NS, "channel")[0]) {
      aFeed.mFeedType = "RSS_0.90";
      FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
                          aFeed.mFeedType +" : " +aFeed.url);
      return this.parseAsRSS2(aFeed, aDOM);
    }

    // Parse as RSS 0.9x. In theory even RSS 1.0 feeds could be parsed by
    // the 0.9x parser if the RSS namespace were the default.
    let rssVer = doc.localName == "rss" ? doc.getAttribute("version") : null;
    if (rssVer)
      aFeed.mFeedType = "RSS_" + rssVer;
    else
      aFeed.mFeedType = "RSS_0.9x?";
    FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
                        aFeed.mFeedType +" : " +aFeed.url);
    return this.parseAsRSS2(aFeed, aDOM);
  },

  /**
   * Parse an RSS 0.9x/2.0 document. Fills in aFeed.title, aFeed.description
   * and aFeed.link from the first <channel>, then builds one FeedItem per
   * <item> element.
   *
   * @param {Object} aFeed   - The feed object being updated.
   * @param {Document} aDOM  - The parsed XML document.
   * @returns {Array|*}      - Array of FeedItem; the result of
   *                           aFeed.onParseError(aFeed) on error; undefined
   *                           if a permanent redirect was applied.
   */
  parseAsRSS2: function(aFeed, aDOM) {
    // Get the first channel (assuming there is only one per RSS File).
    let parsedItems = [];

    let channel = aDOM.querySelector("channel");
    if (!channel)
      return aFeed.onParseError(aFeed);

    // Usually the empty string, unless this is RSS .90.
    let nsURI = channel.namespaceURI || "";
    FeedUtils.log.debug("FeedParser.parseAsRSS2: channel nsURI - " + nsURI);

    if (this.isPermanentRedirect(aFeed, null, channel, null))
      return;

    let tags = this.childrenByTagNameNS(channel, nsURI, "title");
    aFeed.title = aFeed.title || this.getNodeValue(tags ? tags[0] : null);
    tags = this.childrenByTagNameNS(channel, nsURI, "description");
    aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null);
    tags = this.childrenByTagNameNS(channel, nsURI, "link");
    aFeed.link = this.validLink(this.getNodeValue(tags ? tags[0] : null));

    if (!(aFeed.title || aFeed.description) || !aFeed.link) {
      FeedUtils.log.error("FeedParser.parseAsRSS2: missing mandatory element " +
                          "<title> and <description>, or <link>");
      return aFeed.onParseError(aFeed);
    }

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();
    // XXX use getElementsByTagNameNS for now; childrenByTagNameNS would be
    // better, but RSS .90 is still with us.
    let itemNodes = aDOM.getElementsByTagNameNS(nsURI, "item");
    itemNodes = itemNodes ? itemNodes : [];
    FeedUtils.log.debug("FeedParser.parseAsRSS2: items to parse - " +
                        itemNodes.length);

    for (let itemNode of itemNodes) {
      if (!itemNode.childElementCount)
        continue;

      let item = new FeedItem();
      item.feed = aFeed;
      item.enclosures = [];
      item.keywords = [];

      // Prefer the feedburner original link over the (possibly rewritten)
      // <link>.
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS,
                                      "origLink");
      let link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
      if (!link) {
        tags = this.childrenByTagNameNS(itemNode, nsURI, "link");
        link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
      }

      tags = this.childrenByTagNameNS(itemNode, nsURI, "guid");
      let guidNode = tags ? tags[0] : null;

      let guid;
      let isPermaLink = false;
      if (guidNode) {
        guid = this.getNodeValue(guidNode);
        // isPermaLink is true if the value is "true" or if the attribute is
        // not present; all other values, including "false" and "False" and
        // for that matter "TRuE" and "meatcake" are false.
        if (!guidNode.hasAttribute("isPermaLink") ||
            guidNode.getAttribute("isPermaLink") == "true")
          isPermaLink = true;
        // If attribute isPermaLink is missing, it is good to check the validity
        // of <guid> value as an URL to avoid linking to non-URL strings.
        if (!guidNode.hasAttribute("isPermaLink")) {
          try {
            Services.io.newURI(guid, null, null);
            if (Services.io.extractScheme(guid) == "tag")
              isPermaLink = false;
          } catch (ex) {
            isPermaLink = false;
          }
        }

        item.id = guid;
      }

      let guidLink = this.validLink(guid);
      item.url = isPermaLink && guidLink ? guidLink : link ? link : null;
      tags = this.childrenByTagNameNS(itemNode, nsURI, "description");
      item.description = this.getNodeValueFormatted(tags ? tags[0] : null);
      tags = this.childrenByTagNameNS(itemNode, nsURI, "title");
      item.title = this.getNodeValue(tags ? tags[0] : null);
      if (!(item.title || item.description)) {
        FeedUtils.log.info("FeedParser.parseAsRSS2: <item> missing mandatory " +
                           "element, either <title> or <description>; skipping");
        continue;
      }

      if (!item.id) {
        // At this point, if there is no guid, uniqueness cannot be guaranteed
        // by any of link or date (optional) or title (optional unless there
        // is no description). Use a big chunk of description; minimize dupes
        // with url and title if present.
        item.id = (item.url || item.feed.url) + "#" + item.title + "#" +
                  (this.stripTags(item.description ?
                                    item.description.substr(0, 150) : null) ||
                   item.title);
        item.id = item.id.replace(/[\n\r\t\s]+/g, " ");
      }

      // Escape html entities in <title>, which are unescaped as textContent
      // values. If the title is used as content, it will remain escaped; if
      // it is used as the title, it will be unescaped upon store. Bug 1240603.
      // The <description> tag must follow escaping examples found in
      // http://www.rssboard.org/rss-encoding-examples, i.e. single escape angle
      // brackets for tags, which are removed if used as title, and double
      // escape entities for presentation in title.
      // Better: always use <title>. Best: use Atom.
      if (!item.title)
        item.title = this.stripTags(item.description).substr(0, 150);
      else
        item.title = item.htmlEscape(item.title);

      tags = this.childrenByTagNameNS(itemNode, nsURI, "author");
      if (!tags)
        tags = this.childrenByTagNameNS(itemNode, FeedUtils.DC_NS, "creator");
      let author = this.getNodeValue(tags ? tags[0] : null) ||
                   aFeed.title;
      author = this.cleanAuthorName(author);
      item.author = author ? ["<" + author + ">"] : item.author;

      tags = this.childrenByTagNameNS(itemNode, nsURI, "pubDate");
      if (!tags || !this.getNodeValue(tags[0]))
        tags = this.childrenByTagNameNS(itemNode, FeedUtils.DC_NS, "date");
      item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;

      // If the date is invalid, users will see the beginning of the epoch
      // unless we reset it here, so they'll see the current time instead.
      // This is typical aggregator behavior.
      if (item.date) {
        item.date = item.date.trim();
        if (!FeedUtils.isValidRFC822Date(item.date)) {
          // XXX Use this on the other formats as well.
          item.date = this.dateRescue(item.date);
        }
      }

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.RSS_CONTENT_NS,
                                      "encoded");
      item.content = this.getNodeValueFormatted(tags ? tags[0] : null);

      // Handle <enclosures> and <media:content>, which may be in a
      // <media:group> (if present).
      tags = this.childrenByTagNameNS(itemNode, nsURI, "enclosure");
      let encUrls = [];
      if (tags) {
        for (let tag of tags) {
          let url = this.validLink(tag.getAttribute("url"));
          if (url && encUrls.indexOf(url) == -1) {
            let type = this.removeUnprintableASCII(tag.getAttribute("type"));
            let length = this.removeUnprintableASCII(tag.getAttribute("length"));
            item.enclosures.push(new FeedEnclosure(url, type, length));
            encUrls.push(url);
          }
        }
      }

      tags = itemNode.getElementsByTagNameNS(FeedUtils.MRSS_NS, "content");
      if (tags) {
        for (let tag of tags) {
          let url = this.validLink(tag.getAttribute("url"));
          if (url && encUrls.indexOf(url) == -1) {
            let type = this.removeUnprintableASCII(tag.getAttribute("type"));
            let fileSize = this.removeUnprintableASCII(tag.getAttribute("fileSize"));
            item.enclosures.push(new FeedEnclosure(url, type, fileSize));
            // Record the url so a repeated <media:content> does not add a
            // duplicate enclosure.
            encUrls.push(url);
          }
        }
      }

      // The <origEnclosureLink> tag has no specification, especially regarding
      // whether more than one tag is allowed and, if so, how tags would
      // relate to previously declared (and well specified) enclosure urls.
      // The common usage is to include 1 origEnclosureLink, in addition to
      // the specified enclosure tags for 1 enclosure. Thus, we will replace the
      // first enclosure's, if found, url with the first <origEnclosureLink>
      // url only or else add the <origEnclosureLink> url.
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS,
                                      "origEnclosureLink");
      let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null));
      if (origEncUrl) {
        if (item.enclosures.length)
          item.enclosures[0].mURL = origEncUrl;
        else
          item.enclosures.push(new FeedEnclosure(origEncUrl));
      }

      // Support <category> and autotagging.
      tags = this.childrenByTagNameNS(itemNode, nsURI, "category");
      if (tags) {
        for (let tag of tags) {
          let term = this.getNodeValue(tag);
          term = term ? this.xmlUnescape(term.replace(/,/g, ";")) : null;
          if (term && item.keywords.indexOf(term) == -1)
            item.keywords.push(term);
        }
      }

      parsedItems.push(item);
    }

    return parsedItems;
  },

  /**
   * Parse an RSS 1.0 (RDF) document using the RDF parser/service.
   *
   * @param {Object} aFeed    - The feed object being updated.
   * @param {String} aSource  - Serialized XML of the feed document.
   * @param {nsIURI} aBaseURI - Base URI passed to the RDF/XML parser.
   * @returns {Array|*}       - Array of FeedItem; the result of
   *                            aFeed.onParseError(aFeed) on error; undefined
   *                            if a permanent redirect was applied.
   */
  parseAsRSS1: function(aFeed, aSource, aBaseURI) {
    let parsedItems = [];

    // RSS 1.0 is valid RDF, so use the RDF parser/service to extract data.
    // Create a new RDF data source and parse the feed into it.
    let ds = Cc["@mozilla.org/rdf/datasource;1?name=in-memory-datasource"].
             createInstance(Ci.nsIRDFDataSource);

    let rdfparser = Cc["@mozilla.org/rdf/xml-parser;1"].
                    createInstance(Ci.nsIRDFXMLParser);
    rdfparser.parseString(ds, aBaseURI, aSource);

    // Get information about the feed as a whole.
    let channel = ds.GetSource(FeedUtils.RDF_TYPE, FeedUtils.RSS_CHANNEL, true);
    if (!channel)
      return aFeed.onParseError(aFeed);

    if (this.isPermanentRedirect(aFeed, null, channel, ds))
      return;

    aFeed.title = aFeed.title ||
                  this.getRDFTargetValue(ds, channel, FeedUtils.RSS_TITLE) ||
                  aFeed.url;
    aFeed.description = this.getRDFTargetValueFormatted(ds, channel,
                                                        FeedUtils.RSS_DESCRIPTION) ||
                        "";
    aFeed.link = this.validLink(this.getRDFTargetValue(ds, channel,
                                                       FeedUtils.RSS_LINK)) ||
                 aFeed.url;

    if (!(aFeed.title || aFeed.description) || !aFeed.link) {
      FeedUtils.log.error("FeedParser.parseAsRSS1: missing mandatory element " +
                          "<title> and <description>, or <link>");
      return aFeed.onParseError(aFeed);
    }

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();

    // Ignore the <items> list and just get the <item>s.
    let items = ds.GetSources(FeedUtils.RDF_TYPE, FeedUtils.RSS_ITEM, true);

    while (items.hasMoreElements()) {
      let itemResource = items.getNext().QueryInterface(Ci.nsIRDFResource);
      let item = new FeedItem();
      item.feed = aFeed;

      // Prefer the value of the link tag to the item URI since the URI could be
      // a relative URN.
      let uri = itemResource.ValueUTF8;
      let link = this.validLink(this.getRDFTargetValue(ds, itemResource,
                                                       FeedUtils.RSS_LINK));
      item.url = link || uri;
      item.description = this.getRDFTargetValueFormatted(ds, itemResource,
                                                         FeedUtils.RSS_DESCRIPTION);
      item.title = this.getRDFTargetValue(ds, itemResource, FeedUtils.RSS_TITLE) ||
                   this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_SUBJECT) ||
                   (item.description ?
                     (this.stripTags(item.description).substr(0, 150)) : null);
      if (!item.url || !item.title) {
        FeedUtils.log.info("FeedParser.parseAsRSS1: <item> missing mandatory " +
                           "element <item rdf:about> and <link>, or <title> and " +
                           "no <description>; skipping");
        continue;
      }

      // The id keeps the raw value; the url must additionally pass validLink.
      item.id = item.url;
      item.url = this.validLink(item.url);

      let author = this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_CREATOR) ||
                   this.getRDFTargetValue(ds, channel, FeedUtils.DC_CREATOR) ||
                   aFeed.title;
      author = this.cleanAuthorName(author);
      item.author = author ? ["<" + author + ">"] : item.author;

      item.date = this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_DATE) ||
                  item.date;
      item.content = this.getRDFTargetValueFormatted(ds, itemResource,
                                                     FeedUtils.RSS_CONTENT_ENCODED);

      parsedItems.push(item);
    }

    FeedUtils.log.debug("FeedParser.parseAsRSS1: items parsed - " +
                        parsedItems.length);

    return parsedItems;
  },

  /**
   * Parse an Atom 0.3 document.
   *
   * @param {Object} aFeed   - The feed object being updated.
   * @param {Document} aDOM  - The parsed XML document.
   * @returns {Array|*}      - Array of FeedItem; the result of
   *                           aFeed.onParseError(aFeed) on error; undefined
   *                           if a permanent redirect was applied.
   */
  parseAsAtom: function(aFeed, aDOM) {
    let parsedItems = [];

    // Get the first channel (assuming there is only one per Atom File).
    let channel = aDOM.querySelector("feed");
    if (!channel)
      return aFeed.onParseError(aFeed);

    if (this.isPermanentRedirect(aFeed, null, channel, null))
      return;

    let tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "title");
    aFeed.title = aFeed.title ||
                  this.stripTags(this.getNodeValue(tags ? tags[0] : null));
    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "tagline");
    aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null);
    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "link");
    aFeed.link = this.validLink(this.findAtomLink("alternate", tags));

    if (!aFeed.title) {
      FeedUtils.log.error("FeedParser.parseAsAtom: missing mandatory element " +
                          "<title>");
      return aFeed.onParseError(aFeed);
    }

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();
    let items = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "entry");
    items = items ? items : [];
    FeedUtils.log.debug("FeedParser.parseAsAtom: items to parse - " +
                        items.length);

    for (let itemNode of items) {
      if (!itemNode.childElementCount)
        continue;

      let item = new FeedItem();
      item.feed = aFeed;

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "link");
      item.url = this.validLink(this.findAtomLink("alternate", tags));

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "id");
      item.id = this.getNodeValue(tags ? tags[0] : null);
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "summary");
      item.description = this.getNodeValueFormatted(tags ? tags[0] : null);
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "title");
      item.title = this.getNodeValue(tags ? tags[0] : null) ||
                   (item.description ? item.description.substr(0, 150) : null);
      if (!item.title || !item.id) {
        // We're lenient about other mandatory tags, but insist on these.
        FeedUtils.log.info("FeedParser.parseAsAtom: <entry> missing mandatory " +
                           "element <id>, or <title> and no <summary>; skipping");
        continue;
      }

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "author");
      if (!tags)
        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS,
                                        "contributor");
      if (!tags)
        tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "author");

      let authorEl = tags ? tags[0] : null;

      let author = "";
      if (authorEl) {
        tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_03_NS, "name");
        let name = this.getNodeValue(tags ? tags[0] : null);
        tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_03_NS, "email");
        let email = this.getNodeValue(tags ? tags[0] : null);
        if (name)
          author = name + (email ? " <" + email + ">" : "");
        else if (email)
          author = email;
      }

      item.author = author || item.author || aFeed.title;

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "modified");
      if (!tags || !this.getNodeValue(tags[0]))
        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "issued");
      if (!tags || !this.getNodeValue(tags[0]))
        tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "created");

      item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;

      // XXX We should get the xml:base attribute from the content tag as well
      // and use it as the base HREF of the message.
      // XXX Atom feeds can have multiple content elements; we should differentiate
      // between them and pick the best one.
      // Some Atom feeds wrap the content in a CTYPE declaration; others use
      // a namespace to identify the tags as HTML; and a few are buggy and put
      // HTML tags in without declaring their namespace so they look like Atom.
      // We deal with the first two but not the third.
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "content");
      let contentNode = tags ? tags[0] : null;

      let content;
      if (contentNode) {
        content = "";
        for (let j = 0; j < contentNode.childNodes.length; j++) {
          let node = contentNode.childNodes.item(j);
          if (node.nodeType == node.CDATA_SECTION_NODE)
            content += node.data;
          else
            content += this.mSerializer.serializeToString(node);
        }

        if (contentNode.getAttribute("mode") == "escaped") {
          // Undo one level of XML escaping; "&amp;" goes last so that a
          // double-escaped entity such as "&amp;lt;" is not decoded twice.
          content = content.replace(/&lt;/g, "<");
          content = content.replace(/&gt;/g, ">");
          content = content.replace(/&amp;/g, "&");
        }

        if (content == "")
          content = null;
      }

      item.content = content;
      parsedItems.push(item);
    }

    return parsedItems;
  },

  /**
   * Parse an IETF Atom (Atom 1.0 namespace) document.
   *
   * @param {Object} aFeed   - The feed object being updated.
   * @param {Document} aDOM  - The parsed XML document.
   * @returns {Array|*}      - Array of FeedItem; the result of
   *                           aFeed.onParseError(aFeed) on error; undefined
   *                           if a permanent redirect was applied.
   */
  parseAsAtomIETF: function(aFeed, aDOM) {
    let parsedItems = [];

    // Get the first channel (assuming there is only one per Atom File).
    // childrenByTagNameNS returns null when nothing matches, so guard the
    // index instead of dereferencing it directly.
    let tags = this.childrenByTagNameNS(aDOM, FeedUtils.ATOM_IETF_NS, "feed");
    let channel = tags ? tags[0] : null;
    if (!channel)
      return aFeed.onParseError(aFeed);

    if (this.isPermanentRedirect(aFeed, null, channel, null))
      return;

    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "title");
    aFeed.title = aFeed.title ||
                  this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null));

    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "subtitle");
    aFeed.description = this.serializeTextConstruct(tags ? tags[0] : null);

    tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "link");
    aFeed.link = this.findAtomLink("alternate", tags);
    aFeed.link = this.validLink(aFeed.link);

    if (!aFeed.title) {
      FeedUtils.log.error("FeedParser.parseAsAtomIETF: missing mandatory element " +
                          "<title>");
      return aFeed.onParseError(aFeed);
    }

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();
    let items = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "entry");
    items = items ? items : [];
    FeedUtils.log.debug("FeedParser.parseAsAtomIETF: items to parse - " +
                        items.length);

    for (let itemNode of items) {
      if (!itemNode.childElementCount)
        continue;

      let item = new FeedItem();
      item.feed = aFeed;
      item.enclosures = [];
      item.keywords = [];

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS,
                                      "origLink");
      item.url = this.validLink(this.getNodeValue(tags ? tags[0] : null));
      if (!item.url) {
        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "link");
        item.url = this.validLink(this.findAtomLink("alternate", tags)) ||
                   aFeed.link;
      }

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "id");
      item.id = this.getNodeValue(tags ? tags[0] : null);
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "summary");
      item.description = this.serializeTextConstruct(tags ? tags[0] : null);
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "title");
      item.title = this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null) ||
                                  (item.description ?
                                     item.description.substr(0, 150) : null));
      if (!item.title || !item.id) {
        // We're lenient about other mandatory tags, but insist on these.
        FeedUtils.log.info("FeedParser.parseAsAtomIETF: <entry> missing mandatory " +
                           "element <id>, or <title> and no <summary>; skipping");
        continue;
      }

      // Support multiple authors.
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "source");
      let source = tags ? tags[0] : null;

      // Author lookup order: entry, then entry's <source>, then the feed.
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "author");
      if (!tags)
        tags = this.childrenByTagNameNS(source, FeedUtils.ATOM_IETF_NS, "author");
      if (!tags)
        tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "author");

      let authorTags = tags || [];
      let authors = [];
      for (let authorTag of authorTags) {
        let author = "";
        tags = this.childrenByTagNameNS(authorTag, FeedUtils.ATOM_IETF_NS, "name");
        let name = this.getNodeValue(tags ? tags[0] : null);
        tags = this.childrenByTagNameNS(authorTag, FeedUtils.ATOM_IETF_NS, "email");
        let email = this.getNodeValue(tags ? tags[0] : null);
        if (name) {
          name = this.cleanAuthorName(name);
          if (email) {
            if (!email.match(/^<.*>$/)) {
              email = " <" + email + ">";
            }
            author = name + email;
          } else {
            author = "<" + name + ">";
          }
        } else if (email) {
          author = email;
        }

        if (author) {
          authors.push(author);
        }
      }

      if (authors.length == 0) {
        tags = this.childrenByTagNameNS(channel, FeedUtils.DC_NS, "publisher");
        let author = this.getNodeValue(tags ? tags[0] : null) || aFeed.title;
        author = this.cleanAuthorName(author);
        item.author = author ? ["<" + author + ">"] : item.author;
      } else {
        item.author = authors;
      }
      FeedUtils.log.trace("FeedParser.parseAsAtomIETF: author(s) - " +
                          item.author);

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "updated");
      if (!tags || !this.getNodeValue(tags[0]))
        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS,
                                        "published");
      if (!tags || !this.getNodeValue(tags[0]))
        tags = this.childrenByTagNameNS(source, FeedUtils.ATOM_IETF_NS,
                                        "published");
      item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "content");
      item.content = this.serializeTextConstruct(tags ? tags[0] : null);

      // The content base comes from whichever element supplies the body:
      // <content>, else <summary>, else the entry itself.
      if (item.content) {
        item.xmlContentBase = tags ? tags[0].baseURI : null;
      } else if (item.description) {
        tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS,
                                        "summary");
        item.xmlContentBase = tags ? tags[0].baseURI : null;
      } else {
        item.xmlContentBase = itemNode.baseURI;
      }

      item.xmlContentBase = this.validLink(item.xmlContentBase);

      // Handle <link rel="enclosure"> (if present).
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "link");
      let encUrls = [];
      if (tags) {
        for (let tag of tags) {
          let url = tag.getAttribute("rel") == "enclosure" ?
                      (tag.getAttribute("href") || "").trim() : null;
          url = this.validLink(url);
          if (url && encUrls.indexOf(url) == -1) {
            let type = this.removeUnprintableASCII(tag.getAttribute("type"));
            let length = this.removeUnprintableASCII(tag.getAttribute("length"));
            let title = this.removeUnprintableASCII(tag.getAttribute("title"));
            item.enclosures.push(new FeedEnclosure(url, type, length, title));
            encUrls.push(url);
          }
        }
      }

      tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS,
                                      "origEnclosureLink");
      let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null));
      if (origEncUrl) {
        if (item.enclosures.length)
          item.enclosures[0].mURL = origEncUrl;
        else
          item.enclosures.push(new FeedEnclosure(origEncUrl));
      }

      // Handle atom threading extension, RFC4685. There may be 1 or more tags,
      // and each must contain a ref attribute with 1 Message-Id equivalent
      // value. This is the only attr of interest in the spec for presentation.
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_THREAD_NS,
                                      "in-reply-to");
      if (tags) {
        for (let tag of tags) {
          let ref = this.removeUnprintableASCII(tag.getAttribute("ref"));
          if (ref)
            item.inReplyTo += item.normalizeMessageID(ref) + " ";
        }
        item.inReplyTo = item.inReplyTo.trimRight();
      }

      // Support <category> and autotagging.
      tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "category");
      if (tags) {
        for (let tag of tags) {
          let term = this.removeUnprintableASCII(tag.getAttribute("term"));
          term = term ? this.xmlUnescape(term.replace(/,/g, ";")).trim() : null;
          if (term && item.keywords.indexOf(term) == -1)
            item.keywords.push(term);
        }
      }

      parsedItems.push(item);
    }

    return parsedItems;
  },

  /**
   * Check a feed or redirect document for a new feed url and, if one is found
   * and accepted by FeedUtils.changeUrlForFeed, notify the feed of the change.
   *
   * @param {Object} aFeed            - The feed object.
   * @param {Node} aRedirDocChannel   - <redirect> element of an RSS2.0
   *                                    redirect document, or null.
   * @param {Node|Object} aFeedChannel - Feed channel element (or RDF channel
   *                                    resource when aDS is given), or null.
   * @param {Object} aDS              - RDF datasource for RSS 1.0, or null.
   * @returns {Boolean}               - true if the feed url was changed.
   */
  isPermanentRedirect: function(aFeed, aRedirDocChannel, aFeedChannel, aDS) {
    // If subscribing to a new feed, do not check redirect tags.
    if (!aFeed.downloadCallback || aFeed.downloadCallback.mSubscribeMode)
      return false;

    let tags, tagName, newUrl;
    let oldUrl = aFeed.url;

    // Check for RSS2.0 redirect document <newLocation> tag.
    if (aRedirDocChannel) {
      tagName = "newLocation";
      tags = this.childrenByTagNameNS(aRedirDocChannel, "", tagName);
      newUrl = this.getNodeValue(tags ? tags[0] : null);
    }

    // Check for <itunes:new-feed-url> tag.
    if (aFeedChannel) {
      tagName = "new-feed-url";
      if (aDS) {
        tags = FeedUtils.rdf.GetResource(FeedUtils.ITUNES_NS + tagName);
        newUrl = this.getRDFTargetValue(aDS, aFeedChannel, tags);
      } else {
        tags = this.childrenByTagNameNS(aFeedChannel, FeedUtils.ITUNES_NS,
                                        tagName);
        newUrl = this.getNodeValue(tags ? tags[0] : null);
      }
      tagName = "itunes:" + tagName;
    }

    if (newUrl && newUrl != oldUrl && FeedUtils.isValidScheme(newUrl) &&
        FeedUtils.changeUrlForFeed(aFeed, newUrl)) {
      FeedUtils.log.info("FeedParser.isPermanentRedirect: found <" + tagName +
                         "> tag; updated feed url from: " + oldUrl + " to: " +
                         newUrl + " in folder: " +
                         FeedUtils.getFolderPrettyPath(aFeed.folder));
      aFeed.onUrlChange(aFeed, oldUrl);
      return true;
    }

    return false;
  },

  /**
   * Serialize the children of an Atom text construct element to a string.
   * CDATA children are XML-escaped, other children are serialized as markup,
   * and for type "html" the result is unescaped once.
   *
   * @param {Element} textElement - The text construct element, or null.
   * @returns {String}            - Trimmed content; null if there is none or
   *                                the type is not text, html or xhtml.
   */
  serializeTextConstruct: function(textElement) {
    let content = "";
    if (textElement) {
      let textType = textElement.getAttribute("type");

      // Atom spec says consider it "text" if not present.
      if (!textType)
        textType = "text";

      // There could be some strange content type we don't handle.
      if (textType != "text" && textType != "html" && textType != "xhtml")
        return null;

      for (let j = 0; j < textElement.childNodes.length; j++) {
        let node = textElement.childNodes.item(j);
        if (node.nodeType == node.CDATA_SECTION_NODE)
          content += this.xmlEscape(node.data);
        else
          content += this.mSerializer.serializeToString(node);
      }

      if (textType == "html")
        content = this.xmlUnescape(content);

      content = content.trim();
    }

    // Other parts of the code depend on this being null if there's no content.
    return content ? content : null;
  },

  /**
   * Return a cleaned up author name value.
   *
   * @param {String} authorString - A string.
   * @returns {String}            - A clean string value.
   */
  cleanAuthorName: function(authorString) {
    if (!authorString) {
      return "";
    }
    FeedUtils.log.trace("FeedParser.cleanAuthor: author1 - " + authorString);
    let author = authorString.replace(/[\n\r\t]+/g, " ")
                             .replace(/"/g, '\\"')
                             .trim();
    // If the name contains special chars, quote it.
    if (author.match(/[<>@,"]/)) {
      author = '"' + author + '"';
    }
    FeedUtils.log.trace("FeedParser.cleanAuthor: author2 - " + author);

    return author;
  },

  /**
   * Return a cleaned up single-line RDF literal value: runs of tab/newline
   * become a space and unprintable ascii is removed.
   *
   * @param {Object} ds       - The RDF datasource.
   * @param {Object} source   - The RDF source resource.
   * @param {Object} property - The RDF property resource.
   * @returns {String}        - A clean string value or null.
   */
  getRDFTargetValue: function(ds, source, property) {
    let nodeValue = this.getRDFTargetValueRaw(ds, source, property);
    if (!nodeValue)
      return null;

    nodeValue = nodeValue.replace(/[\n\r\t]+/g, " ");
    return this.removeUnprintableASCII(nodeValue);
  },

  /**
   * Return a cleaned up formatted RDF literal value: CR/LF/TAB are retained
   * while all other unprintable ascii is removed.
   *
   * @param {Object} ds       - The RDF datasource.
   * @param {Object} source   - The RDF source resource.
   * @param {Object} property - The RDF property resource.
   * @returns {String}        - A clean string value or null.
   */
  getRDFTargetValueFormatted: function(ds, source, property) {
    let nodeValue = this.getRDFTargetValueRaw(ds, source, property);
    if (!nodeValue)
      return null;

    return this.removeUnprintableASCIIexCRLFTAB(nodeValue);
  },

  /**
   * Return the trimmed literal value of an RDF target, as received.
   *
   * @param {Object} ds       - The RDF datasource.
   * @param {Object} source   - The RDF source resource.
   * @param {Object} property - The RDF property resource.
   * @returns {String}        - A string value, or null if there is no target
   *                            or the target is not a literal.
   * @throws {Error}          - On any failure other than the target not
   *                            implementing nsIRDFLiteral.
   */
  getRDFTargetValueRaw: function(ds, source, property) {
    let node = ds.GetTarget(source, property, true);
    if (node) {
      try {
        node = node.QueryInterface(Ci.nsIRDFLiteral);
        if (node)
          return node.Value.trim();
      } catch (e) {
        // If the RDF was bogus, do nothing. Rethrow if it's some other problem.
        if (!((e instanceof Ci.nsIXPCException) &&
              e.result == Cr.NS_ERROR_NO_INTERFACE))
          throw new Error("FeedParser.getRDFTargetValue: " + e);
      }
    }

    return null;
  },

  /**
   * Return a cleaned up node value. This is intended for values that are not
   * multiline and not formatted. A sequence of tab or newline is converted to
   * a space and unprintable ascii is removed.
   *
   * @param {Node} node - A DOM node.
   * @return {String}   - A clean string value or null.
   */
  getNodeValue: function(node) {
    let nodeValue = this.getNodeValueRaw(node);
    if (!nodeValue)
      return null;

    nodeValue = nodeValue.replace(/[\n\r\t]+/g, " ");
    return this.removeUnprintableASCII(nodeValue);
  },

  /**
   * Return a cleaned up formatted node value, meaning CR/LF/TAB are retained
   * while all other unprintable ascii is removed. This is intended for values
   * that are multiline and formatted, such as content or description tags.
   *
   * @param {Node} node - A DOM node.
   * @return {String}   - A clean string value or null.
   */
  getNodeValueFormatted: function(node) {
    let nodeValue = this.getNodeValueRaw(node);
    if (!nodeValue)
      return null;

    return this.removeUnprintableASCIIexCRLFTAB(nodeValue);
  },

  /**
   * Return a raw node value, as received. This should be sanitized as
   * appropriate.
   *
   * @param {Node} node - A DOM node.
   * @return {String}   - A string value or null.
   */
  getNodeValueRaw: function(node) {
    if (node && node.textContent)
      return node.textContent.trim();

    // No textContent; fall back to concatenating the children's values.
    if (node && node.firstChild) {
      let ret = "";
      for (let child = node.firstChild; child; child = child.nextSibling) {
        let value = this.getNodeValueRaw(child);
        if (value)
          ret += value;
      }

      if (ret)
        return ret.trim();
    }

    return null;
  },

  /**
   * Finds elements that are direct children of the first arg.
   *
   * @param {Node} aElement     - The parent node, or null.
   * @param {String} aNamespace - Namespace URI to match.
   * @param {String} aTagName   - Local tag name to match.
   * @returns {Array}           - Non-empty array of matching direct children,
   *                              or null if aElement is null or none match.
   */
  childrenByTagNameNS: function(aElement, aNamespace, aTagName) {
    if (!aElement)
      return null;

    let matches = aElement.getElementsByTagNameNS(aNamespace, aTagName);
    let matchingChildren = [];
    for (let match of matches) {
      if (match.parentNode == aElement)
        matchingChildren.push(match);
    }

    return matchingChildren.length ? matchingChildren : null;
  },

  /**
   * Ensure <link> type tags start with http[s]://, ftp:// or magnet:
   * for values stored in mail headers (content-base and remote enclosures),
   * particularly to prevent data: uris, javascript, and other spoofing.
   *
   * @param {String} link - An intended http url string.
   * @return {String}     - A clean string starting with http, ftp or magnet,
   *                        else null.
   */
  validLink: function(link) {
    if (typeof link != "string")
      return null;

    // Trim before testing, so surrounding whitespace (e.g. from an attribute
    // value) does not cause an otherwise acceptable url to be rejected.
    let candidate = link.trim();
    if (/^((https?|ftp):\/\/|magnet:)/.test(candidate))
      return this.removeUnprintableASCII(candidate);

    return null;
  },

  /**
   * Find the first Atom <link> with the requested rel (a missing rel counts
   * as "alternate") and a non-empty href, resolved against the link's base.
   *
   * @param {String} linkRel     - The rel value to look for.
   * @param {Array} linkElements - Candidate <link> elements, or null.
   * @returns {String}           - The resolved url, or null if none is found.
   */
  findAtomLink: function(linkRel, linkElements) {
    if (!linkElements)
      return null;

    // XXX Need to check for MIME type and hreflang.
    for (let alink of linkElements) {
      if (alink &&
          // If there's a link rel.
          ((alink.getAttribute("rel") && alink.getAttribute("rel") == linkRel) ||
           // If there isn't, assume 'alternate'.
           (!alink.getAttribute("rel") && (linkRel == "alternate"))) &&
          alink.getAttribute("href")) {
        // Atom links are interpreted relative to xml:base.
        try {
          return Services.io.newURI(alink.baseURI, null, null).
                             resolve(alink.getAttribute("href"));
        } catch (ex) {
          // Best effort: an unresolvable link is skipped and the next
          // candidate is tried.
        }
      }
    }

    return null;
  },

  /**
   * Remove unprintable ascii, particularly CR/LF, for non formatted tag values.
   *
   * @param {String} s - String to clean.
   * @return {String}  - The cleaned string; "" for a falsy input.
   */
  removeUnprintableASCII: function(s) {
    return s ? s.replace(/[\x00-\x1F\x7F]+/g, "") : "";
  },

  /**
   * Remove unprintable ascii, except CR/LF/TAB, for formatted tag values.
   *
   * @param {String} s - String to clean.
   * @return {String}  - The cleaned string; "" for a falsy input.
   */
  removeUnprintableASCIIexCRLFTAB: function(s) {
    return s ? s.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+/g, "") : "";
  },

  /**
   * Remove anything that looks like a markup tag (<...>) from a string.
   *
   * @param {String} someHTML - String to strip.
   * @returns {String}        - The stripped string; a falsy input is
   *                            returned unchanged.
   */
  stripTags: function(someHTML) {
    return someHTML ? someHTML.replace(/<[^>]+>/g, "") : someHTML;
  },

  /**
   * Decode one level of the XML entities &lt;, &gt; and &amp;.
   *
   * @param {String} s - String to unescape.
   * @returns {String} - The unescaped string.
   */
  xmlUnescape: function(s) {
    // "&amp;" must be decoded last, otherwise "&amp;lt;" would be decoded
    // twice and become "<".
    s = s.replace(/&lt;/g, "<");
    s = s.replace(/&gt;/g, ">");
    s = s.replace(/&amp;/g, "&");
    return s;
  },

  /**
   * Encode &, > and < as XML entities.
   *
   * @param {String} s - String to escape.
   * @returns {String} - The escaped string.
   */
  xmlEscape: function(s) {
    // "&" must be encoded first, otherwise the ampersands of the entities
    // produced below would themselves be escaped.
    s = s.replace(/&/g, "&amp;");
    s = s.replace(/>/g, "&gt;");
    s = s.replace(/</g, "&lt;");
    return s;
  },

  /**
   * Try to turn an invalid date string into a usable one.
   *
   * @param {String} dateString - The date value found in the feed.
   * @returns {String}          - The Date string for a plausible unix
   *                              timestamp, else whatever
   *                              FeedUtils.getValidRFC5322Date returns.
   */
  dateRescue: function(dateString) {
    // Deal with various kinds of invalid dates.
    let seconds = parseInt(dateString, 10);
    if (!isNaN(seconds)) {
      // It's an integer, so maybe it's a timestamp.
      let d = new Date(seconds * 1000);
      let now = new Date();
      let yeardiff = now.getFullYear() - d.getFullYear();
      FeedUtils.log.trace("FeedParser.dateRescue: Rescue Timestamp date - " +
                          d.toString() + " ,year diff - " + yeardiff);
      if (yeardiff >= 0 && yeardiff < 3)
        // It's quite likely the correct date.
        return d.toString();
    }

    // Could be an ISO8601/W3C date. If not, get the current time.
    return FeedUtils.getValidRFC5322Date(dateString);
  }
};