summaryrefslogtreecommitdiffstats
path: root/parser/html/nsHtml5StreamParser.h
blob: 2560f84abba4524f56ed0395bc157413f6a7e5f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef nsHtml5StreamParser_h
#define nsHtml5StreamParser_h

#include "nsAutoPtr.h"
#include "nsCOMPtr.h"
#include "nsICharsetDetectionObserver.h"
#include "nsHtml5MetaScanner.h"
#include "nsIUnicodeDecoder.h"
#include "nsHtml5TreeOpExecutor.h"
#include "nsHtml5OwningUTF16Buffer.h"
#include "nsIInputStream.h"
#include "mozilla/Mutex.h"
#include "mozilla/UniquePtr.h"
#include "nsHtml5AtomTable.h"
#include "nsHtml5Speculation.h"
#include "nsITimer.h"
#include "nsICharsetDetector.h"

// Forward declaration: this header only names nsHtml5Parser as a pointer /
// RefPtr target (constructor argument and the mOwner member).
class nsHtml5Parser;

// Size of each buffer in the pending UTF-16 buffer queue; the mLastBuffer
// member is documented as always pointing to a buffer of this size.
// NOTE(review): presumably counted in UTF-16 code units -- confirm in the .cpp.
#define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024
// Size used for encoding sniffing.
// NOTE(review): presumably the capacity in bytes of mSniffingBuffer -- confirm
// against the allocation in the .cpp.
#define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024

/**
 * The kind of parse a stream parser performs (stored in
 * nsHtml5StreamParser::mMode). The numeric values are spelled out to match
 * the other enums in this header; they are the same values the compiler
 * assigned implicitly before.
 */
enum eParserMode {
  /**
   * Ordinary parse of the document as HTML.
   */
  NORMAL = 0,

  /**
   * Show the document as HTML source (View Source).
   */
  VIEW_SOURCE_HTML = 1,

  /**
   * Show the document as XML source (View Source).
   */
  VIEW_SOURCE_XML = 2,

  /**
   * Show the document as plain text source (View Source).
   */
  VIEW_SOURCE_PLAIN = 3,

  /**
   * Show the document as plain text.
   */
  PLAIN_TEXT = 4,

  /**
   * Load the document as data (XHR).
   */
  LOAD_AS_DATA = 5
};

/**
 * Progress of byte order mark (BOM) sniffing at the start of the stream
 * (stored in nsHtml5StreamParser::mBomState).
 *
 * For reference, the BOM byte sequences defined by Unicode are FF FE
 * (UTF-16LE), FE FF (UTF-16BE) and EF BB BF (UTF-8).
 */
enum eBomState {
  /**
   * No byte has been examined for a BOM yet.
   */
  BOM_SNIFFING_NOT_STARTED = 0,

  /**
   * Sniffing in progress: the first byte of a UTF-16LE BOM has been seen.
   */
  SEEN_UTF_16_LE_FIRST_BYTE = 1,

  /**
   * Sniffing in progress: the first byte of a UTF-16BE BOM has been seen.
   */
  SEEN_UTF_16_BE_FIRST_BYTE = 2,

  /**
   * Sniffing in progress: the first byte of a UTF-8 BOM has been seen.
   */
  SEEN_UTF_8_FIRST_BYTE = 3,

  /**
   * Sniffing in progress: the first two bytes of a UTF-8 BOM have been
   * seen.
   */
  SEEN_UTF_8_SECOND_BYTE = 4,

  /**
   * Sniffing had started and has since finished, for whatever reason.
   */
  BOM_SNIFFING_OVER = 5
};

/**
 * Life cycle of the incoming stream (stored in
 * nsHtml5StreamParser::mStreamState).
 */
enum eHtml5StreamState {
  /**
   * Initial state. SetDocumentCharset() asserts that the stream is still in
   * this state when it is called.
   */
  STREAM_NOT_STARTED = 0,

  /**
   * The stream is being read.
   * NOTE(review): presumably entered from OnStartRequest -- confirm in the
   * .cpp.
   */
  STREAM_BEING_READ = 1,

  /**
   * The stream has ended.
   * NOTE(review): presumably entered when the stop request is handled --
   * confirm in the .cpp.
   */
  STREAM_ENDED = 2
};

/**
 * The stream-facing half of the HTML5 parser.
 *
 * nsHtml5StreamListener forwards the network callbacks (OnStartRequest,
 * OnDataAvailable, OnStopRequest) to this class. The class determines the
 * character encoding (BOM sniffing, <meta> prescan, optionally chardet),
 * decodes the bytes with a Unicode decoder and feeds the resulting UTF-16
 * buffers to a tokenizer / tree builder pair, with support for speculative
 * parsing.
 *
 * The object is touched from two threads: the main thread (several setters
 * assert NS_IsMainThread()) and the parser thread held in mThread. The
 * mutex members document which state each of them protects.
 */
class nsHtml5StreamParser : public nsICharsetDetectionObserver {

  friend class nsHtml5RequestStopper;
  friend class nsHtml5DataAvailable;
  friend class nsHtml5StreamParserContinuation;
  friend class nsHtml5TimerKungFu;

  public:
    NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
    NS_DECL_CYCLE_COLLECTING_ISUPPORTS
    NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
                                             nsICharsetDetectionObserver)

    /**
     * One-time static initialization.
     * NOTE(review): presumably reads the flush timer prefs into
     * sTimerInitialDelay / sTimerSubsequentDelay -- confirm in the .cpp.
     */
    static void InitializeStatics();

    /**
     * @param aExecutor the tree operation executor (stored in mExecutor)
     * @param aOwner    the owner parser (stored in mOwner)
     * @param aMode     the kind of parse to perform (stored in mMode)
     */
    nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
                        nsHtml5Parser* aOwner,
                        eParserMode aMode);

    // Methods that nsHtml5StreamListener calls
    nsresult CheckListenerChain();

    nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);

    nsresult OnDataAvailable(nsIRequest* aRequest,
                             nsISupports* aContext,
                             nsIInputStream* aInStream,
                             uint64_t aSourceOffset,
                             uint32_t aLength);

    nsresult OnStopRequest(nsIRequest* aRequest,
                           nsISupports* aContext,
                           nsresult status);

    // nsICharsetDetectionObserver
    /**
     * Chardet calls this to report the detection result
     */
    NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override;

    // EncodingDeclarationHandler
    // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
    /**
     * Tree builder uses this to report a late <meta charset>
     */
    bool internalEncodingDeclaration(nsHtml5String aEncoding);

    // Not from an external interface

    /**
     *  Call this method once you've created a parser, and want to instruct it
     *  about what charset to load. Must be called on the main thread before
     *  the stream has started (both conditions are asserted).
     *
     *  @param   aCharset the charset of a document
     *  @param   aSource the source of the charset
     */
    inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
      NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
                      "SetDocumentCharset called too late.");
      NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
      mCharset = aCharset;
      mCharsetSource = aSource;
    }

    /**
     * Stores the request observer in mObserver. Main thread only (asserted).
     */
    inline void SetObserver(nsIRequestObserver* aObserver) {
      NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
      mObserver = aObserver;
    }

    nsresult GetChannel(nsIChannel** aChannel);

    /**
     * The owner parser must call this after script execution
     * when no scripts are executing and the document.written
     * buffer has been exhausted.
     */
    void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
                              nsHtml5TreeBuilder* aTreeBuilder,
                              bool aLastWasCR);

    /**
     * Continues the stream parser if the charset switch failed.
     */
    void ContinueAfterFailedCharsetSwitch();

    /**
     * Sets the mTerminated flag while holding mTerminatedMutex. The flag is
     * observed through IsTerminated() / IsTerminatedOrInterrupted().
     */
    void Terminate()
    {
      mozilla::MutexAutoLock autoLock(mTerminatedMutex);
      mTerminated = true;
    }

    void DropTimer();

    /**
     * Sets mCharset and mCharsetSource appropriately for the XML View Source
     * case if aEncoding names a supported rough ASCII superset and sets
     * the mCharset and mCharsetSource to the UTF-8 default otherwise.
     */
    void SetEncodingFromExpat(const char16_t* aEncoding);

    /**
     * Sets the URL for View Source title in case this parser ends up being
     * used for View Source. If aURL is a view-source: URL, takes the inner
     * URL. data: URLs are shown with an ellipsis instead of the actual data.
     */
    void SetViewSourceTitle(nsIURI* aURL);

  private:
    virtual ~nsHtml5StreamParser();

#ifdef DEBUG
    /**
     * Debug-only helper for thread assertions: reports whether the calling
     * thread is mThread.
     */
    bool IsParserThread() {
      // Start from false so that a failing IsOnCurrentThread() call, which
      // may leave the out parameter untouched, cannot make this function
      // return an indeterminate value. A failure then makes the
      // "Wrong thread!" assertions fire instead of silently passing.
      bool onParserThread = false;
      mThread->IsOnCurrentThread(&onParserThread);
      return onParserThread;
    }
#endif

    void MarkAsBroken(nsresult aRv);

    /**
     * Marks the stream parser as interrupted. If you ever add calls to this
     * method, be sure to review Uninterrupt usage very, very carefully to
     * avoid having a previous in-flight runnable cancel your Interrupt()
     * call on the other thread too soon.
     */
    void Interrupt()
    {
      mozilla::MutexAutoLock autoLock(mTerminatedMutex);
      mInterrupted = true;
    }

    /**
     * Clears the interrupted flag. Must be called on the parser thread with
     * mTokenizerMutex held (both conditions are asserted).
     */
    void Uninterrupt()
    {
      NS_ASSERTION(IsParserThread(), "Wrong thread!");
      mTokenizerMutex.AssertCurrentThreadOwns();
      // Not acquiring mTerminatedMutex because mTokenizerMutex is already
      // held at this point and is already stronger.
      // NOTE(review): Interrupt() and IsTerminatedOrInterrupted() access
      // mInterrupted under mTerminatedMutex only, so this write is ordered
      // against them only if those callers also hold mTokenizerMutex --
      // confirm in the .cpp.
      mInterrupted = false;
    }

    /**
     * Flushes the tree ops from the tree builder and disarms the flush
     * timer.
     */
    void FlushTreeOpsAndDisarmTimer();

    void ParseAvailableData();

    void DoStopRequest();

    void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);

    /**
     * Segment callback for reading from an nsIInputStream.
     * NOTE(review): the signature matches a ReadSegments() writer function
     * with aClosure presumably being the stream parser -- confirm in the
     * .cpp.
     */
    static nsresult CopySegmentsToParser(nsIInputStream *aInStream,
                                         void *aClosure,
                                         const char *aFromSegment,
                                         uint32_t aToOffset,
                                         uint32_t aCount,
                                         uint32_t *aWriteCount);

    /**
     * Returns true if either mTerminated or mInterrupted is set; both flags
     * are read while holding mTerminatedMutex.
     */
    bool IsTerminatedOrInterrupted()
    {
      mozilla::MutexAutoLock autoLock(mTerminatedMutex);
      return mTerminated || mInterrupted;
    }

    /**
     * Returns mTerminated, read while holding mTerminatedMutex.
     */
    bool IsTerminated()
    {
      mozilla::MutexAutoLock autoLock(mTerminatedMutex);
      return mTerminated;
    }

    /**
     * True when there is a Unicode decoder already
     */
    inline bool HasDecoder()
    {
      return !!mUnicodeDecoder;
    }

    /**
     * Push bytes from network when there is no Unicode decoder yet
     */
    nsresult SniffStreamBytes(const uint8_t* aFromSegment,
                              uint32_t aCount,
                              uint32_t* aWriteCount);

    /**
     * Push bytes from network when there is a Unicode decoder already
     */
    nsresult WriteStreamBytes(const uint8_t* aFromSegment,
                              uint32_t aCount,
                              uint32_t* aWriteCount);

    /**
     * Check whether every other byte in the sniffing buffer is zero.
     */
    void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
                                     uint32_t aCountToSniffingLimit);

    /**
     * <meta charset> scan failed. Try chardet if applicable. After this, the
     * parser will have some encoding, even if only a last-resort fallback.
     *
     * @param aFromSegment The current network buffer or null if the sniffing
     *                     buffer is being flushed due to network stream ending.
     * @param aCount       The number of bytes in aFromSegment (ignored if
     *                     aFromSegment is null)
     * @param aWriteCount  Return value for how many bytes got read from the
     *                     buffer.
     * @param aCountToSniffingLimit The number of unfilled slots in
     *                              mSniffingBuffer
     */
    nsresult FinalizeSniffing(const uint8_t* aFromSegment,
                              uint32_t aCount,
                              uint32_t* aWriteCount,
                              uint32_t aCountToSniffingLimit);

    /**
     * Set up the Unicode decoder and write the sniffing buffer into it
     * followed by the current network buffer.
     *
     * @param aFromSegment The current network buffer or null if the sniffing
     *                     buffer is being flushed due to network stream ending.
     * @param aCount       The number of bytes in aFromSegment (ignored if
     *                     aFromSegment is null)
     * @param aWriteCount  Return value for how many bytes got read from the
     *                     buffer.
     */
    nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
                                                                  uint32_t aCount,
                                                                  uint32_t* aWriteCount);

    /**
     * Initialize the Unicode decoder, mark the BOM as the source and
     * drop the sniffer.
     *
     * @param aDecoderCharsetName The name for the decoder's charset
     *                            (UTF-16BE, UTF-16LE or UTF-8; the BOM has
     *                            been swallowed)
     */
    nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);

    /**
     * Become confident or resolve an encoding name to its preferred form.
     * @param aEncoding the value of an internal encoding decl. Acts as an
     *                  out param, too, when the method returns true.
     * @return true if the parser needs to start using the new value of
     *         aEncoding and false if the parser became confident or if
     *         the encoding name did not specify a usable encoding
     */
    bool PreferredForInternalEncodingDecl(nsACString& aEncoding);

    /**
     * Callback for mFlushTimer.
     */
    static void TimerCallback(nsITimer* aTimer, void* aClosure);

    /**
     * Parser thread entry point for (maybe) flushing the ops and posting
     * a flush runnable back on the main thread.
     */
    void TimerFlush();

    /**
     * Called when speculation fails. Counts the failure; once enough
     * failures have accumulated, IsSpeculationEnabled() starts returning
     * false.
     */
    void MaybeDisableFutureSpeculation()
    {
        mSpeculationFailureCount++;
    }

    /**
     * Used to check whether we're getting too many speculation failures and
     * should just stop trying.  The 100 is picked pretty randomly to be not too
     * small (so most pages are not affected) but small enough that we don't end
     * up with failed speculations over and over in pathological cases.
     */
    bool IsSpeculationEnabled()
    {
        return mSpeculationFailureCount < 100;
    }

    nsCOMPtr<nsIRequest>          mRequest;

    /**
     * The request observer installed with SetObserver().
     */
    nsCOMPtr<nsIRequestObserver>  mObserver;

    /**
     * The document title to use if this turns out to be a View Source parser.
     */
    nsCString                     mViewSourceTitle;

    /**
     * The Unicode decoder
     */
    nsCOMPtr<nsIUnicodeDecoder>   mUnicodeDecoder;

    /**
     * The buffer for sniffing the character encoding
     */
    mozilla::UniquePtr<uint8_t[]> mSniffingBuffer;

    /**
     * The number of meaningful bytes in mSniffingBuffer
     */
    uint32_t                      mSniffingLength;

    /**
     * BOM sniffing state
     */
    eBomState                     mBomState;

    /**
     * <meta> prescan implementation
     */
    nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;

    // encoding-related stuff
    /**
     * The source (confidence) of the character encoding in use
     */
    int32_t                       mCharsetSource;

    /**
     * The character encoding in use
     */
    nsCString                     mCharset;

    /**
     * Whether reparse is forbidden
     */
    bool                          mReparseForbidden;

    // Portable parser objects
    /**
     * The first buffer in the pending UTF-16 buffer queue
     */
    RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;

    /**
     * The last buffer in the pending UTF-16 buffer queue
     */
    nsHtml5OwningUTF16Buffer*     mLastBuffer; // weak ref; always points to
                      // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE

    /**
     * The tree operation executor
     */
    nsHtml5TreeOpExecutor*        mExecutor;

    /**
     * The HTML5 tree builder
     */
    nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;

    /**
     * The HTML5 tokenizer
     */
    nsAutoPtr<nsHtml5Tokenizer>   mTokenizer;

    /**
     * Makes sure the main thread can't mess the tokenizer state while it's
     * tokenizing. This mutex also protects the current speculation.
     */
    mozilla::Mutex                mTokenizerMutex;

    /**
     * The scoped atom table
     */
    nsHtml5AtomTable              mAtomTable;

    /**
     * The owner parser.
     */
    RefPtr<nsHtml5Parser>       mOwner;

    /**
     * Whether the last character tokenized was a carriage return (for CRLF)
     */
    bool                          mLastWasCR;

    /**
     * For tracking stream life cycle
     */
    eHtml5StreamState             mStreamState;

    /**
     * Whether we are speculating.
     */
    bool                          mSpeculating;

    /**
     * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
     */
    bool                          mAtEOF;

    /**
     * The speculations. The mutex protects the nsTArray itself.
     * To access the queue of current speculation, mTokenizerMutex must be
     * obtained.
     * The current speculation is the last element
     */
    nsTArray<nsAutoPtr<nsHtml5Speculation> >  mSpeculations;
    mozilla::Mutex                            mSpeculationMutex;

    /**
     * Number of times speculation has failed for this parser.
     */
    uint32_t                      mSpeculationFailureCount;

    /**
     * True to terminate early; protected by mTerminatedMutex
     */
    bool                          mTerminated;

    /**
     * True while the parser is marked as interrupted. Set by Interrupt()
     * under mTerminatedMutex and cleared by Uninterrupt() under
     * mTokenizerMutex.
     */
    bool                          mInterrupted;
    mozilla::Mutex                mTerminatedMutex;

    /**
     * The thread this stream parser runs on.
     */
    nsCOMPtr<nsIThread>           mThread;

    nsCOMPtr<nsIRunnable>         mExecutorFlusher;

    nsCOMPtr<nsIRunnable>         mLoadFlusher;

    /**
     * The chardet instance if chardet is enabled.
     */
    nsCOMPtr<nsICharsetDetector>  mChardet;

    /**
     * If false, don't push data to chardet.
     */
    bool                          mFeedChardet;

    /**
     * Whether the initial charset source was kCharsetFromParentFrame
     */
    bool                          mInitialEncodingWasFromParentFrame;

    /**
     * Timer for flushing tree ops once in a while when not speculating.
     */
    nsCOMPtr<nsITimer>            mFlushTimer;

    /**
     * Keeps track whether mFlushTimer has been armed. Unfortunately,
     * nsITimer doesn't enable querying this from the timer itself.
     */
    bool                          mFlushTimerArmed;

    /**
     * False initially and true after the timer has fired at least once.
     */
    bool                          mFlushTimerEverFired;

    /**
     * Whether the parser is doing a normal parse, view source or plain text.
     */
    eParserMode                   mMode;

    /**
     * The pref html5.flushtimer.initialdelay: Time in milliseconds between
     * the time a network buffer is seen and the timer firing when the
     * timer hasn't fired previously in this parse.
     */
    static int32_t                sTimerInitialDelay;

    /**
     * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
     * the time a network buffer is seen and the timer firing when the
     * timer has already fired previously in this parse.
     */
    static int32_t                sTimerSubsequentDelay;
};

#endif // nsHtml5StreamParser_h