summaryrefslogtreecommitdiffstats
path: root/application/basilisk/components/feeds/nsFeedSniffer.cpp
diff options
context:
space:
mode:
authorwolfbeast <mcwerewolf@gmail.com>2018-06-04 13:17:38 +0200
committerwolfbeast <mcwerewolf@gmail.com>2018-06-04 13:17:38 +0200
commita1be17c1cea81ebb1e8b131a662c698d78f3f7f2 (patch)
treea92f7de513be600cc07bac458183e9af40e00c06 /application/basilisk/components/feeds/nsFeedSniffer.cpp
parentbf11fdd304898ac675e39b01b280d39550e419d0 (diff)
downloadUXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.gz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.lz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.xz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.zip
Issue #303 Part 1: Move basilisk files from /browser to /application/basilisk
Diffstat (limited to 'application/basilisk/components/feeds/nsFeedSniffer.cpp')
-rw-r--r--application/basilisk/components/feeds/nsFeedSniffer.cpp363
1 files changed, 363 insertions, 0 deletions
diff --git a/application/basilisk/components/feeds/nsFeedSniffer.cpp b/application/basilisk/components/feeds/nsFeedSniffer.cpp
new file mode 100644
index 000000000..f314d3d3b
--- /dev/null
+++ b/application/basilisk/components/feeds/nsFeedSniffer.cpp
@@ -0,0 +1,363 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsFeedSniffer.h"
+
+
+#include "nsNetCID.h"
+#include "nsXPCOM.h"
+#include "nsCOMPtr.h"
+#include "nsStringStream.h"
+
+#include "nsBrowserCompsCID.h"
+
+#include "nsICategoryManager.h"
+#include "nsIServiceManager.h"
+#include "nsComponentManagerUtils.h"
+#include "nsServiceManagerUtils.h"
+
+#include "nsIStreamConverterService.h"
+#include "nsIStreamConverter.h"
+
+#include "nsIStreamListener.h"
+
+#include "nsIHttpChannel.h"
+#include "nsIMIMEHeaderParam.h"
+
+#include "nsMimeTypes.h"
+#include "nsIURI.h"
+#include <algorithm>
+
+#define TYPE_ATOM "application/atom+xml"
+#define TYPE_RSS "application/rss+xml"
+#define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"
+
+#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+#define NS_RSS "http://purl.org/rss/1.0/"
+
+#define MAX_BYTES 512u
+
+NS_IMPL_ISUPPORTS(nsFeedSniffer,
+ nsIContentSniffer,
+ nsIStreamListener,
+ nsIRequestObserver)
+
+nsresult
+nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
+ const uint8_t* data,
+ uint32_t length)
+{
+ nsresult rv = NS_OK;
+
+ mDecodedData = "";
+ nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
+ if (!httpChannel)
+ return NS_ERROR_NO_INTERFACE;
+
+ nsAutoCString contentEncoding;
+ httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
+ contentEncoding);
+ if (!contentEncoding.IsEmpty()) {
+ nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
+ if (converterService) {
+ ToLowerCase(contentEncoding);
+
+ nsCOMPtr<nsIStreamListener> converter;
+ rv = converterService->AsyncConvertData(contentEncoding.get(),
+ "uncompressed", this, nullptr,
+ getter_AddRefs(converter));
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ converter->OnStartRequest(request, nullptr);
+
+ nsCOMPtr<nsIStringInputStream> rawStream =
+ do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
+ if (!rawStream)
+ return NS_ERROR_FAILURE;
+
+ rv = rawStream->SetData((const char*)data, length);
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length);
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ converter->OnStopRequest(request, nullptr, NS_OK);
+ }
+ }
+ return rv;
+}
+
+template<int N>
+static bool
+StringBeginsWithLowercaseLiteral(nsAString& aString,
+ const char (&aSubstring)[N])
+{
+ return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
+}
+
+bool
+HasAttachmentDisposition(nsIHttpChannel* httpChannel)
+{
+ if (!httpChannel)
+ return false;
+
+ uint32_t disp;
+ nsresult rv = httpChannel->GetContentDisposition(&disp);
+
+ if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
+ return true;
+
+ return false;
+}
+
+/**
+ * @return the first occurrence of a character within a string buffer,
+ * or nullptr if not found
+ */
+static const char*
+FindChar(char c, const char *begin, const char *end)
+{
+ for (; begin < end; ++begin) {
+ if (*begin == c)
+ return begin;
+ }
+ return nullptr;
+}
+
+/**
+ *
+ * Determine if a substring is the "documentElement" in the document.
+ *
+ * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
+ * element within the XML DOM, i.e. the root container element. Otherwise,
+ * it's possible that someone embedded one of these tags inside a document of
+ * another type, e.g. a HTML document, and we don't want to show the preview
+ * page if the document isn't actually a feed.
+ *
+ * @param start
+ * The beginning of the data being sniffed
+ * @param end
+ * The end of the data being sniffed, right before the substring that
+ * was found.
+ * @returns true if the found substring is the documentElement, false
+ * otherwise.
+ */
+static bool
+IsDocumentElement(const char *start, const char* end)
+{
+ // For every tag in the buffer, check to see if it's a PI, Doctype or
+ // comment, our desired substring or something invalid.
+ while ( (start = FindChar('<', start, end)) ) {
+ ++start;
+ if (start >= end)
+ return false;
+
+ // Check to see if the character following the '<' is either '?' or '!'
+ // (processing instruction or doctype or comment)... these are valid nodes
+ // to have in the prologue.
+ if (*start != '?' && *start != '!')
+ return false;
+
+ // Now advance the iterator until the '>' (We do this because we don't want
+ // to sniff indicator substrings that are embedded within other nodes, e.g.
+ // comments: <!-- <rdf:RDF .. > -->
+ start = FindChar('>', start, end);
+ if (!start)
+ return false;
+
+ ++start;
+ }
+ return true;
+}
+
+/**
+ * Determines whether or not a string exists as the root element in an XML data
+ * string buffer.
+ * @param dataString
+ * The data being sniffed
+ * @param substring
+ * The substring being tested for existence and root-ness.
+ * @returns true if the substring exists and is the documentElement, false
+ * otherwise.
+ */
+static bool
+ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
+{
+ int32_t offset = dataString.Find(substring);
+ if (offset == -1)
+ return false;
+
+ const char *begin = dataString.BeginReading();
+
+ // Only do the validation when we find the substring.
+ return IsDocumentElement(begin, begin + offset);
+}
+
+NS_IMETHODIMP
+nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
+ const uint8_t* data,
+ uint32_t length,
+ nsACString& sniffedType)
+{
+ nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
+ if (!channel)
+ return NS_ERROR_NO_INTERFACE;
+
+ // Check that this is a GET request, since you can't subscribe to a POST...
+ nsAutoCString method;
+ channel->GetRequestMethod(method);
+ if (!method.EqualsLiteral("GET")) {
+ sniffedType.Truncate();
+ return NS_OK;
+ }
+
+ // We need to find out if this is a load of a view-source document. In this
+ // case we do not want to override the content type, since the source display
+ // does not need to be converted from feed format to XUL. More importantly,
+ // we don't want to change the content type from something
+ // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
+ // etc) to something that only the application fe knows about (maybe.feed)
+ // thus deactivating syntax highlighting.
+ nsCOMPtr<nsIURI> originalURI;
+ channel->GetOriginalURI(getter_AddRefs(originalURI));
+
+ nsAutoCString scheme;
+ originalURI->GetScheme(scheme);
+ if (scheme.EqualsLiteral("view-source")) {
+ sniffedType.Truncate();
+ return NS_OK;
+ }
+
+ // Check the Content-Type to see if it is set correctly. If it is set to
+ // something specific that we think is a reliable indication of a feed, don't
+ // bother sniffing since we assume the site maintainer knows what they're
+ // doing.
+ nsAutoCString contentType;
+ channel->GetContentType(contentType);
+ bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
+ contentType.EqualsLiteral(TYPE_ATOM);
+
+ // Check to see if this was a feed request from the location bar or from
+ // the feed: protocol. This is also a reliable indication.
+ // The value of the header doesn't matter.
+ if (!noSniff) {
+ nsAutoCString sniffHeader;
+ nsresult foundHeader =
+ channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
+ sniffHeader);
+ noSniff = NS_SUCCEEDED(foundHeader);
+ }
+
+ if (noSniff) {
+ // check for an attachment after we have a likely feed.
+ if(HasAttachmentDisposition(channel)) {
+ sniffedType.Truncate();
+ return NS_OK;
+ }
+
+ // set the feed header as a response header, since we have good metadata
+ // telling us that the feed is supposed to be RSS or Atom
+ channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
+ NS_LITERAL_CSTRING("1"), false);
+ sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
+ return NS_OK;
+ }
+
+ // Don't sniff arbitrary types. Limit sniffing to situations that
+ // we think can reasonably arise.
+ if (!contentType.EqualsLiteral(TEXT_HTML) &&
+ !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
+ // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
+ // and check for text/xml and application/xml by hand instead?
+ contentType.Find("xml") == -1) {
+ sniffedType.Truncate();
+ return NS_OK;
+ }
+
+ // Now we need to potentially decompress data served with
+ // Content-Encoding: gzip
+ nsresult rv = ConvertEncodedData(request, data, length);
+ if (NS_FAILED(rv))
+ return rv;
+
+ // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
+ // false positives by accidentally reading document content, e.g. a "how to
+ // make a feed" page.
+ const char* testData;
+ if (mDecodedData.IsEmpty()) {
+ testData = (const char*)data;
+ length = std::min(length, MAX_BYTES);
+ } else {
+ testData = mDecodedData.get();
+ length = std::min(mDecodedData.Length(), MAX_BYTES);
+ }
+
+ // The strategy here is based on that described in:
+ // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
+ // for interoperarbility purposes.
+
+ // Thus begins the actual sniffing.
+ nsDependentCSubstring dataString((const char*)testData, length);
+
+ bool isFeed = false;
+
+ // RSS 0.91/0.92/2.0
+ isFeed = ContainsTopLevelSubstring(dataString, "<rss");
+
+ // Atom 1.0
+ if (!isFeed)
+ isFeed = ContainsTopLevelSubstring(dataString, "<feed");
+
+ // RSS 1.0
+ if (!isFeed) {
+ isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
+ dataString.Find(NS_RDF) != -1 &&
+ dataString.Find(NS_RSS) != -1;
+ }
+
+ // If we sniffed a feed, coerce our internal type
+ if (isFeed && !HasAttachmentDisposition(channel))
+ sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
+ else
+ sniffedType.Truncate();
+ return NS_OK;
+}
+
+NS_IMETHODIMP
+nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
+{
+ return NS_OK;
+}
+
+nsresult
+nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
+ void* closure,
+ const char* rawSegment,
+ uint32_t toOffset,
+ uint32_t count,
+ uint32_t* writeCount)
+{
+ nsCString* decodedData = static_cast<nsCString*>(closure);
+ decodedData->Append(rawSegment, count);
+ *writeCount = count;
+ return NS_OK;
+}
+
+NS_IMETHODIMP
+nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
+ nsIInputStream* stream, uint64_t offset,
+ uint32_t count)
+{
+ uint32_t read;
+ return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
+ &read);
+}
+
+NS_IMETHODIMP
+nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
+ nsresult status)
+{
+ return NS_OK;
+}