summaryrefslogtreecommitdiffstats
path: root/parser/html/nsHtml5StreamParser.h
diff options
context:
space:
mode:
Diffstat (limited to 'parser/html/nsHtml5StreamParser.h')
-rw-r--r--parser/html/nsHtml5StreamParser.h579
1 files changed, 579 insertions, 0 deletions
diff --git a/parser/html/nsHtml5StreamParser.h b/parser/html/nsHtml5StreamParser.h
new file mode 100644
index 000000000..9a38ba067
--- /dev/null
+++ b/parser/html/nsHtml5StreamParser.h
@@ -0,0 +1,579 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsHtml5StreamParser_h
+#define nsHtml5StreamParser_h
+
+#include "nsAutoPtr.h"
+#include "nsCOMPtr.h"
+#include "nsICharsetDetectionObserver.h"
+#include "nsHtml5MetaScanner.h"
+#include "nsIUnicodeDecoder.h"
+#include "nsHtml5TreeOpExecutor.h"
+#include "nsHtml5OwningUTF16Buffer.h"
+#include "nsIInputStream.h"
+#include "mozilla/Mutex.h"
+#include "mozilla/UniquePtr.h"
+#include "nsHtml5AtomTable.h"
+#include "nsHtml5Speculation.h"
+#include "nsITimer.h"
+#include "nsICharsetDetector.h"
+
+class nsHtml5Parser;
+
+#define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024
+#define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
+
+/**
+ * The mode a stream parser operates in. The nsHtml5StreamParser
+ * constructor takes a value of this type and the class keeps one in mMode.
+ */
+enum eParserMode {
+ /**
+ * Parse a document normally as HTML.
+ */
+ NORMAL,
+
+ /**
+ * View document as HTML source.
+ */
+ VIEW_SOURCE_HTML,
+
+ /**
+ * View document as XML source.
+ */
+ VIEW_SOURCE_XML,
+
+ /**
+ * View document as plain text source.
+ */
+ VIEW_SOURCE_PLAIN,
+
+ /**
+ * View document as plain text.
+ */
+ PLAIN_TEXT,
+
+ /**
+ * Load as data (XHR).
+ */
+ LOAD_AS_DATA
+};
+
+/**
+ * States of byte order mark (BOM) sniffing. The current state is kept in
+ * nsHtml5StreamParser::mBomState.
+ */
+enum eBomState {
+ /**
+ * BOM sniffing hasn't started.
+ */
+ BOM_SNIFFING_NOT_STARTED = 0,
+
+ /**
+ * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
+ * seen.
+ */
+ SEEN_UTF_16_LE_FIRST_BYTE = 1,
+
+ /**
+ * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
+ * seen.
+ */
+ SEEN_UTF_16_BE_FIRST_BYTE = 2,
+
+ /**
+ * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
+ * seen.
+ */
+ SEEN_UTF_8_FIRST_BYTE = 3,
+
+ /**
+ * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
+ * have been seen.
+ */
+ SEEN_UTF_8_SECOND_BYTE = 4,
+
+ /**
+ * BOM sniffing was started but is now over for whatever reason.
+ */
+ BOM_SNIFFING_OVER = 5
+};
+
+/**
+ * Life cycle of the network stream, as tracked in
+ * nsHtml5StreamParser::mStreamState.
+ */
+enum eHtml5StreamState {
+ STREAM_NOT_STARTED = 0, // SetDocumentCharset() asserts this state
+ STREAM_BEING_READ = 1, // presumably entered on OnStartRequest -- confirm in the .cpp
+ STREAM_ENDED = 2 // presumably entered on OnStopRequest -- confirm in the .cpp
+};
+
+class nsHtml5StreamParser : public nsICharsetDetectionObserver {
+
+ friend class nsHtml5RequestStopper;
+ friend class nsHtml5DataAvailable;
+ friend class nsHtml5StreamParserContinuation;
+ friend class nsHtml5TimerKungFu;
+
+ public:
+ NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
+ NS_DECL_CYCLE_COLLECTING_ISUPPORTS
+ NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
+ nsICharsetDetectionObserver)
+
+ static void InitializeStatics();
+
+ nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
+ nsHtml5Parser* aOwner,
+ eParserMode aMode);
+
+ // Methods that nsHtml5StreamListener calls
+ nsresult CheckListenerChain();
+
+ nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
+
+ nsresult OnDataAvailable(nsIRequest* aRequest,
+ nsISupports* aContext,
+ nsIInputStream* aInStream,
+ uint64_t aSourceOffset,
+ uint32_t aLength);
+
+ nsresult OnStopRequest(nsIRequest* aRequest,
+ nsISupports* aContext,
+ nsresult status);
+
+ // nsICharsetDetectionObserver
+ /**
+ * Chardet calls this to report the detection result
+ */
+ NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override;
+
+ // EncodingDeclarationHandler
+ // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
+ /**
+ * Tree builder uses this to report a late <meta charset>
+ */
+ bool internalEncodingDeclaration(nsString* aEncoding);
+
+ // Not from an external interface
+
+ /**
+ * Call this method once you've created a parser, and want to instruct it
+ * about what charset to load.
+ *
+ * Asserts that the stream has not started yet (mStreamState is
+ * STREAM_NOT_STARTED) and that the caller is on the main thread.
+ *
+ * @param aCharset the charset of a document; copied into mCharset
+ * @param aSource the source of the charset; stored in mCharsetSource
+ */
+ inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
+ NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
+ "SetDocumentCharset called too late.");
+ NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
+ mCharset = aCharset;
+ mCharsetSource = aSource;
+ }
+
+ /**
+ * Stores the given request observer in mObserver, replacing any previous
+ * one. Asserts that the caller is on the main thread.
+ *
+ * @param aObserver the observer to keep a reference to
+ */
+ inline void SetObserver(nsIRequestObserver* aObserver) {
+ NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
+ mObserver = aObserver;
+ }
+
+ nsresult GetChannel(nsIChannel** aChannel);
+
+ /**
+ * The owner parser must call this after script execution
+ * when no scripts are executing and the document.written
+ * buffer has been exhausted.
+ */
+ void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
+ nsHtml5TreeBuilder* aTreeBuilder,
+ bool aLastWasCR);
+
+ /**
+ * Continues the stream parser if the charset switch failed.
+ */
+ void ContinueAfterFailedCharsetSwitch();
+
+ /**
+ * Marks the stream parser as terminated. mTerminated is set while holding
+ * mTerminatedMutex, the same mutex IsTerminated() and
+ * IsTerminatedOrInterrupted() take to read it.
+ */
+ void Terminate()
+ {
+ mozilla::MutexAutoLock autoLock(mTerminatedMutex);
+ mTerminated = true;
+ }
+
+ void DropTimer();
+
+ /**
+ * Sets mCharset and mCharsetSource appropriately for the XML View Source
+ * case if aEncoding names a supported rough ASCII superset and sets
+ * the mCharset and mCharsetSource to the UTF-8 default otherwise.
+ */
+ void SetEncodingFromExpat(const char16_t* aEncoding);
+
+ /**
+ * Sets the URL for View Source title in case this parser ends up being
+ * used for View Source. If aURL is a view-source: URL, takes the inner
+ * URL. data: URLs are shown with an ellipsis instead of the actual data.
+ */
+ void SetViewSourceTitle(nsIURI* aURL);
+
+ private:
+ virtual ~nsHtml5StreamParser();
+
+#ifdef DEBUG
+ /**
+ * Debug-only helper that asks mThread whether the caller is currently
+ * running on it.
+ *
+ * @return the answer written by mThread->IsOnCurrentThread(), or false if
+ * that call did not write an answer
+ */
+ bool IsParserThread() {
+ // Start from false so that a failed query cannot hand back an
+ // indeterminate value to the thread assertions that use this helper.
+ bool ret = false;
+ mThread->IsOnCurrentThread(&ret);
+ return ret;
+ }
+#endif
+
+ void MarkAsBroken(nsresult aRv);
+
+ /**
+ * Marks the stream parser as interrupted. If you ever add calls to this
+ * method, be sure to review Uninterrupt usage very, very carefully to
+ * avoid having a previous in-flight runnable cancel your Interrupt()
+ * call on the other thread too soon.
+ *
+ * The flag is set while holding mTerminatedMutex, the same mutex
+ * IsTerminatedOrInterrupted() takes to read it.
+ */
+ void Interrupt()
+ {
+ mozilla::MutexAutoLock autoLock(mTerminatedMutex);
+ mInterrupted = true;
+ }
+
+ /**
+ * Clears the interrupted mark set by Interrupt(). Asserts that the caller
+ * is on the parser thread and that the calling thread already owns
+ * mTokenizerMutex.
+ */
+ void Uninterrupt()
+ {
+ NS_ASSERTION(IsParserThread(), "Wrong thread!");
+ mTokenizerMutex.AssertCurrentThreadOwns();
+ // Not acquiring mTerminatedMutex because mTokenizerMutex is already
+ // held at this point and is already stronger.
+ mInterrupted = false;
+ }
+
+ /**
+ * Flushes the tree ops from the tree builder and disarms the flush
+ * timer.
+ */
+ void FlushTreeOpsAndDisarmTimer();
+
+ void ParseAvailableData();
+
+ void DoStopRequest();
+
+ void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
+
+ static nsresult CopySegmentsToParser(nsIInputStream *aInStream,
+ void *aClosure,
+ const char *aFromSegment,
+ uint32_t aToOffset,
+ uint32_t aCount,
+ uint32_t *aWriteCount);
+
+ /**
+ * Reads mTerminated and mInterrupted under mTerminatedMutex.
+ *
+ * @return true if either flag is set
+ */
+ bool IsTerminatedOrInterrupted()
+ {
+ mozilla::MutexAutoLock autoLock(mTerminatedMutex);
+ return mTerminated || mInterrupted;
+ }
+
+ /**
+ * Reads mTerminated under mTerminatedMutex.
+ *
+ * @return true once Terminate() has set the flag
+ */
+ bool IsTerminated()
+ {
+ mozilla::MutexAutoLock autoLock(mTerminatedMutex);
+ return mTerminated;
+ }
+
+ /**
+ * Reports whether mUnicodeDecoder currently holds a decoder.
+ *
+ * @return true if mUnicodeDecoder is non-null, false otherwise
+ */
+ inline bool HasDecoder()
+ {
+ return static_cast<bool>(mUnicodeDecoder);
+ }
+
+ /**
+ * Push bytes from network when there is no Unicode decoder yet
+ */
+ nsresult SniffStreamBytes(const uint8_t* aFromSegment,
+ uint32_t aCount,
+ uint32_t* aWriteCount);
+
+ /**
+ * Push bytes from network when there is a Unicode decoder already
+ */
+ nsresult WriteStreamBytes(const uint8_t* aFromSegment,
+ uint32_t aCount,
+ uint32_t* aWriteCount);
+
+ /**
+ * Check whether every other byte in the sniffing buffer is zero.
+ */
+ void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
+ uint32_t aCountToSniffingLimit);
+
+ /**
+ * <meta charset> scan failed. Try chardet if applicable. After this, the
+ * parser will have some encoding, even if only a last-resort fallback.
+ *
+ * @param aFromSegment The current network buffer or null if the sniffing
+ * buffer is being flushed due to network stream ending.
+ * @param aCount The number of bytes in aFromSegment (ignored if
+ * aFromSegment is null)
+ * @param aWriteCount Return value for how many bytes got read from the
+ * buffer.
+ * @param aCountToSniffingLimit The number of unfilled slots in
+ * mSniffingBuffer
+ */
+ nsresult FinalizeSniffing(const uint8_t* aFromSegment,
+ uint32_t aCount,
+ uint32_t* aWriteCount,
+ uint32_t aCountToSniffingLimit);
+
+ /**
+ * Set up the Unicode decoder and write the sniffing buffer into it
+ * followed by the current network buffer.
+ *
+ * @param aFromSegment The current network buffer or null if the sniffing
+ * buffer is being flushed due to network stream ending.
+ * @param aCount The number of bytes in aFromSegment (ignored if
+ * aFromSegment is null)
+ * @param aWriteCount Return value for how many bytes got read from the
+ * buffer.
+ */
+ nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
+ uint32_t aCount,
+ uint32_t* aWriteCount);
+
+ /**
+ * Initialize the Unicode decoder, mark the BOM as the source and
+ * drop the sniffer.
+ *
+ * @param aDecoderCharsetName The name for the decoder's charset
+ * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
+ * been swallowed)
+ */
+ nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);
+
+ /**
+ * Become confident or resolve an encoding name to its preferred form.
+ * @param aEncoding the value of an internal encoding decl. Acts as an
+ * out param, too, when the method returns true.
+ * @return true if the parser needs to start using the new value of
+ * aEncoding and false if the parser became confident or if
+ * the encoding name did not specify a usable encoding
+ */
+ bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
+
+ /**
+ * Callback for mFlushTimer.
+ */
+ static void TimerCallback(nsITimer* aTimer, void* aClosure);
+
+ /**
+ * Parser thread entry point for (maybe) flushing the ops and posting
+ * a flush runnable back on the main thread.
+ */
+ void TimerFlush();
+
+ /**
+ * Called when speculation fails. Records one more failure in
+ * mSpeculationFailureCount, the count IsSpeculationEnabled() consults.
+ */
+ void MaybeDisableFutureSpeculation()
+ {
+ mSpeculationFailureCount += 1;
+ }
+
+ /**
+ * Tells whether speculation should still be attempted, i.e. whether fewer
+ * than 100 speculation failures have been counted so far. The limit of
+ * 100 is picked pretty randomly: not too small (so most pages are not
+ * affected) but small enough that pathological cases don't end up with
+ * failed speculations over and over.
+ *
+ * @return true while mSpeculationFailureCount is below the limit
+ */
+ bool IsSpeculationEnabled()
+ {
+ const uint32_t kMaxSpeculationFailures = 100;
+ return mSpeculationFailureCount < kMaxSpeculationFailures;
+ }
+
+ nsCOMPtr<nsIRequest> mRequest;
+ nsCOMPtr<nsIRequestObserver> mObserver;
+
+ /**
+ * The document title to use if this turns out to be a View Source parser.
+ */
+ nsCString mViewSourceTitle;
+
+ /**
+ * The Unicode decoder
+ */
+ nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
+
+ /**
+ * The buffer for sniffing the character encoding
+ */
+ mozilla::UniquePtr<uint8_t[]> mSniffingBuffer;
+
+ /**
+ * The number of meaningful bytes in mSniffingBuffer
+ */
+ uint32_t mSniffingLength;
+
+ /**
+ * BOM sniffing state
+ */
+ eBomState mBomState;
+
+ /**
+ * <meta> prescan implementation
+ */
+ nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
+
+ // encoding-related stuff
+ /**
+ * The source (confidence) of the character encoding in use
+ */
+ int32_t mCharsetSource;
+
+ /**
+ * The character encoding in use
+ */
+ nsCString mCharset;
+
+ /**
+ * Whether reparse is forbidden
+ */
+ bool mReparseForbidden;
+
+ // Portable parser objects
+ /**
+ * The first buffer in the pending UTF-16 buffer queue
+ */
+ RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
+
+ /**
+ * The last buffer in the pending UTF-16 buffer queue
+ */
+ nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to
+ // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
+
+ /**
+ * The tree operation executor
+ */
+ nsHtml5TreeOpExecutor* mExecutor;
+
+ /**
+ * The HTML5 tree builder
+ */
+ nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
+
+ /**
+ * The HTML5 tokenizer
+ */
+ nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
+
+ /**
+ * Makes sure the main thread can't mess the tokenizer state while it's
+ * tokenizing. This mutex also protects the current speculation.
+ */
+ mozilla::Mutex mTokenizerMutex;
+
+ /**
+ * The scoped atom table
+ */
+ nsHtml5AtomTable mAtomTable;
+
+ /**
+ * The owner parser.
+ */
+ RefPtr<nsHtml5Parser> mOwner;
+
+ /**
+ * Whether the last character tokenized was a carriage return (for CRLF)
+ */
+ bool mLastWasCR;
+
+ /**
+ * For tracking stream life cycle
+ */
+ eHtml5StreamState mStreamState;
+
+ /**
+ * Whether we are speculating.
+ */
+ bool mSpeculating;
+
+ /**
+ * Whether the tokenizer has reached EOF. (Reset when the stream is rewound.)
+ */
+ bool mAtEOF;
+
+ /**
+ * The speculations. The mutex protects the nsTArray itself.
+ * To access the queue of current speculation, mTokenizerMutex must be
+ * obtained.
+ * The current speculation is the last element
+ */
+ nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations;
+ mozilla::Mutex mSpeculationMutex;
+
+ /**
+ * Number of times speculation has failed for this parser.
+ */
+ uint32_t mSpeculationFailureCount;
+
+ /**
+ * True to terminate early; protected by mTerminatedMutex.
+ */
+ bool mTerminated;
+ /**
+ * True while the parser is marked as interrupted. Set by Interrupt()
+ * under mTerminatedMutex; cleared by Uninterrupt(), which asserts that
+ * mTokenizerMutex is held and does not take mTerminatedMutex.
+ */
+ bool mInterrupted;
+ /**
+ * Guards mTerminated, and mInterrupted except as noted above.
+ */
+ mozilla::Mutex mTerminatedMutex;
+
+ /**
+ * The thread this stream parser runs on.
+ */
+ nsCOMPtr<nsIThread> mThread;
+
+ nsCOMPtr<nsIRunnable> mExecutorFlusher;
+
+ nsCOMPtr<nsIRunnable> mLoadFlusher;
+
+ /**
+ * The chardet instance if chardet is enabled.
+ */
+ nsCOMPtr<nsICharsetDetector> mChardet;
+
+ /**
+ * If false, don't push data to chardet.
+ */
+ bool mFeedChardet;
+
+ /**
+ * Whether the initial charset source was kCharsetFromParentFrame
+ */
+ bool mInitialEncodingWasFromParentFrame;
+
+ /**
+ * Timer for flushing tree ops once in a while when not speculating.
+ */
+ nsCOMPtr<nsITimer> mFlushTimer;
+
+ /**
+ * Keeps track whether mFlushTimer has been armed. Unfortunately,
+ * nsITimer doesn't enable querying this from the timer itself.
+ */
+ bool mFlushTimerArmed;
+
+ /**
+ * False initially and true after the timer has fired at least once.
+ */
+ bool mFlushTimerEverFired;
+
+ /**
+ * Whether the parser is doing a normal parse, view source or plain text.
+ */
+ eParserMode mMode;
+
+ /**
+ * The pref html5.flushtimer.initialdelay: Time in milliseconds between
+ * the time a network buffer is seen and the timer firing when the
+ * timer hasn't fired previously in this parse.
+ */
+ static int32_t sTimerInitialDelay;
+
+ /**
+ * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
+ * the time a network buffer is seen and the timer firing when the
+ * timer has already fired previously in this parse.
+ */
+ static int32_t sTimerSubsequentDelay;
+};
+
+#endif // nsHtml5StreamParser_h