Add m-esr52 at 52.6.0

author: Matt A. Tobin <mattatobin@localhost.localdomain> 2018-02-02 04:16:08 -0500
committer: Matt A. Tobin <mattatobin@localhost.localdomain> 2018-02-02 04:16:08 -0500
commit: 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree: 10027f336435511475e392454359edea8e25895d /extensions/universalchardet/src/base/nsUniversalDetector.cpp
parent: 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
download: UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip
1 files changed, 240 insertions, 0 deletions
diff --git a/extensions/universalchardet/src/base/nsUniversalDetector.cpp b/extensions/universalchardet/src/base/nsUniversalDetector.cpp
new file mode 100644
index 000000000..d272827b8
--- /dev/null
+++ b/extensions/universalchardet/src/base/nsUniversalDetector.cpp
@@ -0,0 +1,240 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nscore.h"
+
+#include "nsUniversalDetector.h"
+
+#include "nsMBCSGroupProber.h"
+#include "nsEscCharsetProber.h"
+#include "nsLatin1Prober.h"
+
+// Builds a detector in its initial state: nothing detected yet, input
+// treated as pure ASCII, and every prober pointer null.
+nsUniversalDetector::nsUniversalDetector()
+{
+  // Prober slots start out empty.
+  for (uint32_t slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot) {
+    mCharSetProbers[slot] = nullptr;
+  }
+  mEscCharSetProber = nullptr;
+
+  // Detection progress flags and result.
+  mStart = true;
+  mGotData = false;
+  mDone = false;
+  mDetectedCharset = nullptr;
+  mBestGuess = -1;   // illegal value as signal
+
+  // Byte-scanner state.
+  mInputState = ePureAscii;
+  mInTag = false;
+  mLastChar = '\0';
+}
+
+// Deletes every prober pointer held by the detector. Slots that were never
+// filled are null, and deleting a null pointer does nothing.
+nsUniversalDetector::~nsUniversalDetector()
+{
+  for (uint32_t slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot) {
+    delete mCharSetProbers[slot];
+  }
+
+  delete mEscCharSetProber;
+}
+
+// Puts the detector back into its initial detection state so it can be fed
+// a new stream. Probers that already exist are reset and kept, not deleted.
+void
+nsUniversalDetector::Reset()
+{
+  // Detection progress flags and result.
+  mStart = true;
+  mGotData = false;
+  mDone = false;
+  mDetectedCharset = nullptr;
+  mBestGuess = -1;   // illegal value as signal
+
+  // Byte-scanner state.
+  mInputState = ePureAscii;
+  mInTag = false;
+  mLastChar = '\0';
+
+  // Rewind whichever probers have been created so far.
+  if (mEscCharSetProber != nullptr) {
+    mEscCharSetProber->Reset();
+  }
+
+  for (uint32_t slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot) {
+    if (mCharSetProbers[slot] != nullptr) {
+      mCharSetProbers[slot]->Reset();
+    }
+  }
+}
+
+//---------------------------------------------------------------------
+#define SHORTCUT_THRESHOLD      (float)0.95
+#define MINIMUM_THRESHOLD      (float)0.20
+
+// Feeds the next aLen bytes at aBuf to the detector.
+//
+// Processing steps:
+//  1. The first non-empty chunk is checked for a UTF-8 or UTF-16 byte-order
+//     mark; a match settles the charset and ends detection.
+//  2. Every byte is scanned to update mInputState:
+//       - a byte with the high bit set (other than 0xA0) switches to
+//         eHighbyte, drops the escape prober and creates the probers in
+//         slots 0 and 2 if they do not exist yet;
+//       - an ESC byte seen while still in ePureAscii switches to eEscAscii.
+//  3. The whole chunk is handed to the prober(s) matching the current state;
+//     a prober answering eFoundIt settles the charset and ends detection.
+//
+// Once mDone is set, further calls return immediately.
+//
+// NOTE: only the chunk that starts the data is examined for a BOM, so a BOM
+// split across two chunks is not recognised.
+//
+// Returns NS_OK, or NS_ERROR_OUT_OF_MEMORY if a prober allocation yields null.
+nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen)
+{
+  if (mDone)
+    return NS_OK;
+
+  if (aLen > 0)
+    mGotData = true;
+
+  //If the data starts with BOM, we know it is UTF.
+  //An empty chunk is not the start of the data: leave mStart set so that the
+  //first chunk that actually carries bytes is the one checked for a BOM.
+  if (mStart && aLen > 0)
+  {
+    mStart = false;
+    if (aLen >= 2) {
+      switch (aBuf[0]) {
+      case '\xEF':
+        if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) {
+          // EF BB BF  UTF-8 encoded BOM
+          mDetectedCharset = "UTF-8";
+        }
+        break;
+      case '\xFE':
+        if ('\xFF' == aBuf[1]) {
+          // FE FF  UTF-16, big endian BOM
+          mDetectedCharset = "UTF-16BE";
+        }
+        break;
+      case '\xFF':
+        if ('\xFE' == aBuf[1]) {
+          // FF FE  UTF-16, little endian BOM
+          mDetectedCharset = "UTF-16LE";
+        }
+        break;
+      }  // switch
+    }
+
+    if (mDetectedCharset)
+    {
+      mDone = true;
+      return NS_OK;
+    }
+  }
+
+  uint32_t i;
+  for (i = 0; i < aLen; i++)
+  {
+    //other than 0xa0, if every other character is ascii, the page is ascii
+    //(many ascii-only pages contain NBSP, so 0xA0 alone does not count)
+    if ((aBuf[i] & '\x80') && (aBuf[i] != '\xA0'))
+    {
+      //we got a non-ascii byte (high-byte)
+      if (mInputState != eHighbyte)
+      {
+        //adjust state
+        mInputState = eHighbyte;
+
+        //kill mEscCharSetProber if it is active
+        if (mEscCharSetProber) {
+          delete mEscCharSetProber;
+          mEscCharSetProber = nullptr;
+        }
+
+        //start the multibyte prober (slot 0) and the Latin-1 prober (slot 2)
+        if (nullptr == mCharSetProbers[0])
+        {
+          mCharSetProbers[0] = new nsMBCSGroupProber();
+          if (nullptr == mCharSetProbers[0])
+            return NS_ERROR_OUT_OF_MEMORY;
+        }
+        if (nullptr == mCharSetProbers[2])
+        {
+          mCharSetProbers[2] = new nsLatin1Prober;
+          if (nullptr == mCharSetProbers[2])
+            return NS_ERROR_OUT_OF_MEMORY;
+        }
+      }
+    }
+    else
+    {
+      //ok, just pure ascii so far
+      if ((ePureAscii == mInputState) && (aBuf[i] == '\033'))
+      {
+        //found escape character
+        mInputState = eEscAscii;
+      }
+      mLastChar = aBuf[i];
+    }
+  }
+
+  nsProbingState st;
+  switch (mInputState)
+  {
+  case eEscAscii:
+    //the escape prober is created on first use and sees the whole chunk
+    if (nullptr == mEscCharSetProber) {
+      mEscCharSetProber = new nsEscCharSetProber();
+      if (nullptr == mEscCharSetProber)
+        return NS_ERROR_OUT_OF_MEMORY;
+    }
+    st = mEscCharSetProber->HandleData(aBuf, aLen);
+    if (st == eFoundIt)
+    {
+      mDone = true;
+      mDetectedCharset = mEscCharSetProber->GetCharSetName();
+    }
+    break;
+  case eHighbyte:
+    //every existing prober sees the chunk; the first one that is sure wins
+    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
+    {
+      if (mCharSetProbers[i])
+      {
+        st = mCharSetProbers[i]->HandleData(aBuf, aLen);
+        if (st == eFoundIt)
+        {
+          mDone = true;
+          mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
+          return NS_OK;
+        }
+      }
+    }
+    break;
+
+  default:  //pure ascii
+    ;//do nothing here
+  }
+  return NS_OK;
+}
+
+
+//---------------------------------------------------------------------
+// Signals the end of the input and reports a charset through Report() when
+// one can be named:
+//   - a charset already settled by HandleData is reported as is;
+//   - otherwise, for high-byte input, the existing prober with the highest
+//     confidence is reported, but only if it exceeds MINIMUM_THRESHOLD.
+// Nothing is reported if no data was ever received, or for input that is
+// pure ASCII or in the escape-sequence state without a settled charset.
+void nsUniversalDetector::DataEnd()
+{
+  // The caller sometimes calls DataEnd before anything has been sent to the
+  // detector; there is nothing to report in that case.
+  if (!mGotData)
+    return;
+
+  // A charset settled during HandleData is final.
+  if (mDetectedCharset) {
+    mDone = true;
+    Report(mDetectedCharset);
+    return;
+  }
+
+  // Only the high-byte state has prober confidences to compare.
+  if (mInputState != eHighbyte)
+    return;
+
+  int32_t bestIndex = 0;
+  float bestConfidence = 0.0f;
+  for (int32_t idx = 0; idx < NUM_OF_CHARSET_PROBERS; ++idx) {
+    if (!mCharSetProbers[idx])
+      continue;
+
+    const float confidence = mCharSetProbers[idx]->GetConfidence();
+    if (confidence > bestConfidence) {
+      bestConfidence = confidence;
+      bestIndex = idx;
+    }
+  }
+
+  // Staying silent when we are not confident is in fact a negative answer.
+  if (bestConfidence > MINIMUM_THRESHOLD)
+    Report(mCharSetProbers[bestIndex]->GetCharSetName());
+}
author	Matt A. Tobin <mattatobin@localhost.localdomain>	2018-02-02 04:16:08 -0500
committer	Matt A. Tobin <mattatobin@localhost.localdomain>	2018-02-02 04:16:08 -0500
commit	5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree	10027f336435511475e392454359edea8e25895d /extensions/universalchardet/src/base/nsUniversalDetector.cpp
parent	49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
download	UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip