summaryrefslogtreecommitdiffstats
path: root/intl/chardet
diff options
context:
space:
mode:
Diffstat (limited to 'intl/chardet')
-rw-r--r--intl/chardet/moz.build19
-rw-r--r--intl/chardet/nsCharDetConstructors.h26
-rw-r--r--intl/chardet/nsChardetModule.cpp45
-rw-r--r--intl/chardet/nsCyrillicClass.h60
-rw-r--r--intl/chardet/nsCyrillicDetector.cpp160
-rw-r--r--intl/chardet/nsCyrillicDetector.h153
-rw-r--r--intl/chardet/nsCyrillicProb.h282
-rw-r--r--intl/chardet/nsDetectionConfident.h43
-rw-r--r--intl/chardet/nsICharsetDetectionObserver.h28
-rw-r--r--intl/chardet/nsICharsetDetector.h51
-rw-r--r--intl/chardet/nsIStringCharsetDetector.h44
-rw-r--r--intl/chardet/tools/GenCyrillicClass.cpp135
-rw-r--r--intl/chardet/tools/charfreq.pl50
-rw-r--r--intl/chardet/tools/charfreqtostat.pl95
-rwxr-xr-xintl/chardet/tools/gen.cmd18
-rw-r--r--intl/chardet/tools/genbig5.pl42
-rw-r--r--intl/chardet/tools/gencp1252.pl55
-rw-r--r--intl/chardet/tools/gencyrillic.pl65
-rw-r--r--intl/chardet/tools/geneucjp.pl47
-rw-r--r--intl/chardet/tools/geneuckr.pl42
-rw-r--r--intl/chardet/tools/geneuctw.pl49
-rw-r--r--intl/chardet/tools/gengb18030.pl44
-rw-r--r--intl/chardet/tools/gengb2312.pl41
-rw-r--r--intl/chardet/tools/genhz.pl57
-rw-r--r--intl/chardet/tools/geniso2022cn.pl58
-rw-r--r--intl/chardet/tools/geniso2022jp.pl49
-rw-r--r--intl/chardet/tools/geniso2022kr.pl55
-rw-r--r--intl/chardet/tools/gensjis.pl46
-rw-r--r--intl/chardet/tools/genutf8.pl189
-rw-r--r--intl/chardet/tools/genverifier.pm175
30 files changed, 2223 insertions, 0 deletions
diff --git a/intl/chardet/moz.build b/intl/chardet/moz.build
new file mode 100644
index 000000000..4d66274e5
--- /dev/null
+++ b/intl/chardet/moz.build
@@ -0,0 +1,19 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+EXPORTS += [
+ 'nsDetectionConfident.h',
+ 'nsICharsetDetectionObserver.h',
+ 'nsICharsetDetector.h',
+ 'nsIStringCharsetDetector.h',
+]
+
+UNIFIED_SOURCES += [
+ 'nsChardetModule.cpp',
+ 'nsCyrillicDetector.cpp',
+]
+
+FINAL_LIBRARY = 'xul'
diff --git a/intl/chardet/nsCharDetConstructors.h b/intl/chardet/nsCharDetConstructors.h
new file mode 100644
index 000000000..caff08976
--- /dev/null
+++ b/intl/chardet/nsCharDetConstructors.h
@@ -0,0 +1,26 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * Header file to be included by module -
+ * warning: defines a whole bunch of static functions
+ */
+
+#ifndef nsCharDetConstructors_h__
+#define nsCharDetConstructors_h__
+
+// chardet
+#include "nsISupports.h"
+#include "nsICharsetDetector.h"
+#include "nsICharsetDetectionObserver.h"
+#include "nsIStringCharsetDetector.h"
+#include "nsCyrillicDetector.h"
+
+NS_GENERIC_FACTORY_CONSTRUCTOR(nsRUProbDetector)
+NS_GENERIC_FACTORY_CONSTRUCTOR(nsUKProbDetector)
+NS_GENERIC_FACTORY_CONSTRUCTOR(nsRUStringProbDetector)
+NS_GENERIC_FACTORY_CONSTRUCTOR(nsUKStringProbDetector)
+
+#endif
diff --git a/intl/chardet/nsChardetModule.cpp b/intl/chardet/nsChardetModule.cpp
new file mode 100644
index 000000000..77dd4ecfe
--- /dev/null
+++ b/intl/chardet/nsChardetModule.cpp
@@ -0,0 +1,45 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/ModuleUtils.h"
+
+#include "nsCharDetConstructors.h"
+
+NS_DEFINE_NAMED_CID(NS_RU_PROBDETECTOR_CID);
+NS_DEFINE_NAMED_CID(NS_UK_PROBDETECTOR_CID);
+NS_DEFINE_NAMED_CID(NS_RU_STRING_PROBDETECTOR_CID);
+NS_DEFINE_NAMED_CID(NS_UK_STRING_PROBDETECTOR_CID);
+
+static const mozilla::Module::CIDEntry kChardetCIDs[] = {
+ { &kNS_RU_PROBDETECTOR_CID, false, nullptr, nsRUProbDetectorConstructor },
+ { &kNS_UK_PROBDETECTOR_CID, false, nullptr, nsUKProbDetectorConstructor },
+ { &kNS_RU_STRING_PROBDETECTOR_CID, false, nullptr, nsRUStringProbDetectorConstructor },
+ { &kNS_UK_STRING_PROBDETECTOR_CID, false, nullptr, nsUKStringProbDetectorConstructor },
+ { nullptr }
+};
+
+static const mozilla::Module::ContractIDEntry kChardetContracts[] = {
+ { NS_CHARSET_DETECTOR_CONTRACTID_BASE "ruprob", &kNS_RU_PROBDETECTOR_CID },
+ { NS_CHARSET_DETECTOR_CONTRACTID_BASE "ukprob", &kNS_UK_PROBDETECTOR_CID },
+ { NS_STRCDETECTOR_CONTRACTID_BASE "ruprob", &kNS_RU_STRING_PROBDETECTOR_CID },
+ { NS_STRCDETECTOR_CONTRACTID_BASE "ukprob", &kNS_UK_STRING_PROBDETECTOR_CID },
+ { nullptr }
+};
+
+static const mozilla::Module::CategoryEntry kChardetCategories[] = {
+ { NS_CHARSET_DETECTOR_CATEGORY, "off", "off" },
+ { NS_CHARSET_DETECTOR_CATEGORY, "ruprob", NS_CHARSET_DETECTOR_CONTRACTID_BASE "ruprob" },
+ { NS_CHARSET_DETECTOR_CATEGORY, "ukprob", NS_CHARSET_DETECTOR_CONTRACTID_BASE "ukprob" },
+ { nullptr }
+};
+
+static const mozilla::Module kChardetModule = {
+ mozilla::Module::kVersion,
+ kChardetCIDs,
+ kChardetContracts,
+ kChardetCategories
+};
+
+NSMODULE_DEFN(nsChardetModule) = &kChardetModule;
diff --git a/intl/chardet/nsCyrillicClass.h b/intl/chardet/nsCyrillicClass.h
new file mode 100644
index 000000000..c03ac1c43
--- /dev/null
+++ b/intl/chardet/nsCyrillicClass.h
@@ -0,0 +1,60 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsCyrillicClass_h__
+#define nsCyrillicClass_h__
+/* PLEASE DO NOT EDIT THIS FILE DIRECTLY. THIS FILE IS GENERATED BY
+ GenCyrllicClass found in mozilla/intl/chardet/tools
+ */
+static const uint8_t KOI8Map [128] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+};
+static const uint8_t CP1251Map [128] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
+};
+static const uint8_t IBM866Map [128] = {
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+static const uint8_t ISO88595Map [128] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+static const uint8_t MacCyrillicMap [128] = {
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18,
+ 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17,
+ 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 0,
+};
+#endif
diff --git a/intl/chardet/nsCyrillicDetector.cpp b/intl/chardet/nsCyrillicDetector.cpp
new file mode 100644
index 000000000..feebeed65
--- /dev/null
+++ b/intl/chardet/nsCyrillicDetector.cpp
@@ -0,0 +1,160 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#include "nscore.h"
+#include "nsCyrillicProb.h"
+#include <stdio.h>
+
+#include "nsCOMPtr.h"
+#include "nsISupports.h"
+#include "nsICharsetDetector.h"
+#include "nsICharsetDetectionObserver.h"
+#include "nsIStringCharsetDetector.h"
+#include "nsCyrillicDetector.h"
+
+//----------------------------------------------------------------------
+// Interface nsISupports [implementation]
+NS_IMPL_ISUPPORTS(nsCyrXPCOMDetector, nsICharsetDetector)
+NS_IMPL_ISUPPORTS(nsCyrXPCOMStringDetector, nsIStringCharsetDetector)
+
+/**
+ * Scores one buffer of bytes against every candidate charset.
+ *
+ * For each byte and each candidate, the byte is mapped to a letter class
+ * through that candidate's class table (bytes without the high bit set are
+ * class 0), and the gCyrillicProb weight of the (previous class, current
+ * class) pair is added to the candidate's score. Does nothing once mDone is
+ * set.
+ *
+ * @param aBuf bytes to examine
+ * @param aLen number of bytes in aBuf
+ */
+void nsCyrillicDetector::HandleData(const char* aBuf, uint32_t aLen)
+{
+  if (mDone) {
+    return;
+  }
+
+  for (uint32_t pos = 0; pos < aLen; ++pos) {
+    const char ch = aBuf[pos];
+    // The high-bit test does not depend on the candidate, so do it once.
+    const bool isHighByte = (0x80 & ch) != 0;
+    for (unsigned cand = 0; cand < mItems; ++cand) {
+      uint8_t cls = 0;
+      if (isHighByte) {
+        cls = mCyrillicClass[cand][ch & 0x7F];
+      }
+      NS_ASSERTION( cls <= 32 , "illegal character class");
+      mProb[cand] += gCyrillicProb[mLastCls[cand]][cls];
+      mLastCls[cand] = cls;
+    }
+  }
+
+  // The decision is currently attempted right after each block, so in
+  // practice it is based on the first block that produces a score.
+  DataEnd();
+}
+
+//---------------------------------------------------------------------
+#define THRESHOLD_RATIO 1.5f
+/**
+ * Picks the candidate with the highest accumulated score and hands its
+ * charset name to Report().
+ *
+ * Returns without reporting when a result was already reported (mDone) or
+ * when every score is still zero. After reporting, mDone is set so later
+ * calls are no-ops.
+ */
+void nsCyrillicDetector::DataEnd()
+{
+  if (mDone) {
+    return;
+  }
+
+  uint32_t max = 0;
+  uint8_t maxIdx = 0;
+  for (uint8_t j = 0; j < mItems; j++) {
+    if (mProb[j] > max) {
+      max = mProb[j];
+      maxIdx = j;
+    }
+  }
+
+  if (0 == max) { // all scores are zero, e.g. we didn't get any 8 bits data
+    return;
+  }
+
+#ifdef DEBUG
+  // mProb holds uint32_t scores, so they must be printed as unsigned.
+  for (uint8_t j = 0; j < mItems; j++) {
+    printf("Charset %s->\t%u\n", mCharsets[j], static_cast<unsigned>(mProb[j]));
+  }
+#endif
+  this->Report(mCharsets[maxIdx]);
+  mDone = true;
+}
+
+//---------------------------------------------------------------------
+/**
+ * Forwards the candidate tables to nsCyrillicDetector and starts with no
+ * observer attached.
+ */
+nsCyrXPCOMDetector::nsCyrXPCOMDetector(uint8_t aItems,
+                                       const uint8_t** aCyrillicClass,
+                                       const char** aCharsets)
+  : nsCyrillicDetector(aItems, aCyrillicClass, aCharsets)
+  , mObserver(nullptr)
+{
+}
+
+//---------------------------------------------------------------------
+/** Nothing to release by hand; the members clean up after themselves. */
+nsCyrXPCOMDetector::~nsCyrXPCOMDetector() = default;
+
+//---------------------------------------------------------------------
+/**
+ * Stores the observer that Report() will notify.
+ *
+ * @param aObserver receiver of the detection result; must not be null
+ * @return NS_ERROR_ILLEGAL_VALUE for a null observer, NS_OK otherwise
+ */
+NS_IMETHODIMP nsCyrXPCOMDetector::Init(
+  nsICharsetDetectionObserver* aObserver)
+{
+  NS_ASSERTION(mObserver == nullptr , "Init twice");
+
+  if (!aObserver) {
+    return NS_ERROR_ILLEGAL_VALUE;
+  }
+
+  mObserver = aObserver;
+  return NS_OK;
+}
+
+//----------------------------------------------------------
+/**
+ * Feeds one block of bytes to the detector.
+ *
+ * @param aBuf        bytes to examine; must not be null
+ * @param aLen        number of bytes in aBuf
+ * @param oDontFeedMe receives true once a charset has been reported, because
+ *                    HandleData() ignores all further input from then on;
+ *                    false while the detector can still use more data
+ * @return NS_ERROR_ILLEGAL_VALUE if aBuf or oDontFeedMe is null, else NS_OK
+ */
+NS_IMETHODIMP nsCyrXPCOMDetector::DoIt(
+  const char* aBuf, uint32_t aLen, bool* oDontFeedMe)
+{
+  NS_ASSERTION(mObserver != nullptr , "have not init yet");
+
+  if((nullptr == aBuf) || (nullptr == oDontFeedMe))
+    return NS_ERROR_ILLEGAL_VALUE;
+
+  this->HandleData(aBuf, aLen);
+  // mDone is set by DataEnd() right after the result was reported; telling
+  // the caller lets it stop feeding blocks that would be ignored anyway.
+  *oDontFeedMe = mDone;
+  return NS_OK;
+}
+
+//----------------------------------------------------------
+/**
+ * Signals the end of input: runs DataEnd() so a pending result can still be
+ * reported. Always returns NS_OK.
+ */
+NS_IMETHODIMP nsCyrXPCOMDetector::Done()
+{
+  NS_ASSERTION(mObserver != nullptr , "have not init yet");
+
+  DataEnd();
+  return NS_OK;
+}
+
+//----------------------------------------------------------
+/**
+ * Delivers the detected charset to the observer with eBestAnswer confidence.
+ *
+ * @param aCharset name of the winning charset
+ */
+void nsCyrXPCOMDetector::Report(const char* aCharset)
+{
+  NS_ASSERTION(mObserver != nullptr , "have not init yet");
+  // The assertion above only guards debug builds; without this check a
+  // detector used before Init() would dereference a null observer.
+  if (!mObserver) {
+    return;
+  }
+  mObserver->Notify(aCharset, eBestAnswer);
+}
+
+//---------------------------------------------------------------------
+/**
+ * Forwards the candidate tables to nsCyrillicDetector and starts with no
+ * result, so mResult never holds an indeterminate pointer.
+ */
+nsCyrXPCOMStringDetector::nsCyrXPCOMStringDetector(uint8_t aItems,
+                                                   const uint8_t** aCyrillicClass,
+                                                   const char** aCharsets)
+  : nsCyrillicDetector(aItems, aCyrillicClass, aCharsets)
+{
+  mResult = nullptr;
+}
+
+//---------------------------------------------------------------------
+/** Nothing to release by hand; the members clean up after themselves. */
+nsCyrXPCOMStringDetector::~nsCyrXPCOMStringDetector() = default;
+
+//---------------------------------------------------------------------
+/**
+ * Remembers the detected charset so DoIt() can hand it back to its caller.
+ *
+ * @param aCharset name of the winning charset
+ */
+void nsCyrXPCOMStringDetector::Report(const char* aCharset)
+{
+  this->mResult = aCharset;
+}
+
+//---------------------------------------------------------------------
+/**
+ * Runs a one-shot detection over a single buffer.
+ *
+ * @param aBuf     bytes to examine; may be null only when aLen is 0
+ * @param aLen     number of bytes in aBuf
+ * @param oCharset receives the detected charset name, or nullptr when no
+ *                 candidate scored; must not be null
+ * @param oConf    receives eBestAnswer when a charset was found and
+ *                 eNoAnswerYet otherwise
+ * @return NS_ERROR_ILLEGAL_VALUE for a null oCharset or a null non-empty
+ *         buffer, NS_OK otherwise
+ *
+ * NOTE(review): only mResult and mDone are reset here. The scores and
+ * last-class state live in private members of nsCyrillicDetector and are not
+ * reset in this function, so they appear to carry over between calls on the
+ * same instance -- confirm whether that is intended.
+ */
+NS_IMETHODIMP nsCyrXPCOMStringDetector::DoIt(const char* aBuf, uint32_t aLen,
+                     const char** oCharset, nsDetectionConfident &oConf)
+{
+  // Same argument validation as nsCyrXPCOMDetector::DoIt.
+  if ((nullptr == oCharset) || ((nullptr == aBuf) && (aLen > 0)))
+    return NS_ERROR_ILLEGAL_VALUE;
+
+  mResult = nullptr;
+  mDone = false;
+  this->HandleData(aBuf, aLen);
+  this->DataEnd();
+  *oCharset = mResult;
+  // Do not claim a best answer when no charset was actually reported.
+  oConf = mResult ? eBestAnswer : eNoAnswerYet;
+  return NS_OK;
+}
+
+
diff --git a/intl/chardet/nsCyrillicDetector.h b/intl/chardet/nsCyrillicDetector.h
new file mode 100644
index 000000000..014db9ec7
--- /dev/null
+++ b/intl/chardet/nsCyrillicDetector.h
@@ -0,0 +1,153 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsCyrillicDetector_h__
+#define nsCyrillicDetector_h__
+
+#include "nsCyrillicClass.h"
+
+
+
+
+// {2002F781-3960-11d3-B3C3-00805F8A6670}
+#define NS_RU_PROBDETECTOR_CID \
+{ 0x2002f781, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+
+// {2002F782-3960-11d3-B3C3-00805F8A6670}
+#define NS_UK_PROBDETECTOR_CID \
+{ 0x2002f782, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+// {2002F783-3960-11d3-B3C3-00805F8A6670}
+#define NS_RU_STRING_PROBDETECTOR_CID \
+{ 0x2002f783, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+// {2002F784-3960-11d3-B3C3-00805F8A6670}
+#define NS_UK_STRING_PROBDETECTOR_CID \
+{ 0x2002f784, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+static const uint8_t *gCyrillicCls[5] =
+{
+ CP1251Map,
+ KOI8Map,
+ ISO88595Map,
+ MacCyrillicMap,
+ IBM866Map
+};
+
+static const char * gRussian[5] = {
+ "windows-1251",
+ "KOI8-R",
+ "ISO-8859-5",
+ "x-mac-cyrillic",
+ "IBM866"
+};
+
+static const char * gUkrainian[5] = {
+ "windows-1251",
+ "KOI8-U",
+ "ISO-8859-5",
+ "x-mac-cyrillic",
+ "IBM866"
+};
+
+#define NUM_CYR_CHARSET 5
+
+/**
+ * Scoring engine shared by the Cyrillic charset detectors.
+ *
+ * It keeps one running score per candidate charset and leaves the delivery
+ * of the winning charset name to subclasses through Report().
+ */
+class nsCyrillicDetector
+{
+  public:
+    /**
+     * @param aItems         number of candidates; values above
+     *                       NUM_CYR_CHARSET are clamped because the score
+     *                       arrays have exactly that many slots
+     * @param aCyrillicClass one byte-to-class table per candidate
+     * @param aCharsets      one charset name per candidate
+     */
+    nsCyrillicDetector(uint8_t aItems,
+                       const uint8_t ** aCyrillicClass,
+                       const char **aCharsets)
+      : mDone(false)
+      , mItems(aItems > NUM_CYR_CHARSET ? NUM_CYR_CHARSET : aItems)
+      , mCyrillicClass(aCyrillicClass)
+      , mCharsets(aCharsets)
+    {
+      // Zero every slot, not just the first mItems, so no element is ever
+      // left indeterminate.
+      for (unsigned i = 0; i < NUM_CYR_CHARSET; i++) {
+        mProb[i] = 0;
+        mLastCls[i] = 0;
+      }
+    }
+    virtual ~nsCyrillicDetector() {}
+    // Adds the bytes of one buffer to the per-candidate scores.
+    virtual void HandleData(const char* aBuf, uint32_t aLen);
+    // Reports the best-scoring candidate, if any, and marks detection done.
+    virtual void DataEnd();
+  protected:
+    // Receives the name of the winning charset.
+    virtual void Report(const char* aCharset) = 0;
+    bool mDone;                          // true once a result was reported
+
+  private:
+    uint8_t mItems;                      // number of candidates in use
+    const uint8_t ** mCyrillicClass;     // per-candidate byte-to-class tables
+    const char** mCharsets;              // per-candidate charset names
+    uint32_t mProb[NUM_CYR_CHARSET];     // running score per candidate
+    uint8_t mLastCls[NUM_CYR_CHARSET];   // class of the previous byte
+};
+
+/**
+ * XPCOM wrapper that exposes nsCyrillicDetector through nsICharsetDetector;
+ * the result is delivered to the observer passed to Init().
+ */
+class nsCyrXPCOMDetector : public nsCyrillicDetector,
+                           public nsICharsetDetector
+{
+public:
+  // nsISupports interface
+  NS_DECL_ISUPPORTS
+
+  nsCyrXPCOMDetector(uint8_t aItems,
+                     const uint8_t** aCyrillicClass,
+                     const char** aCharsets);
+
+  // nsICharsetDetector interface
+  NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver) override;
+  NS_IMETHOD DoIt(const char* aBuf, uint32_t aLen, bool* oDontFeedMe) override;
+  NS_IMETHOD Done() override;
+
+protected:
+  virtual ~nsCyrXPCOMDetector();
+  void Report(const char* aCharset) override;
+
+private:
+  // Receiver of the detection result, set by Init().
+  nsCOMPtr<nsICharsetDetectionObserver> mObserver;
+};
+
+/**
+ * XPCOM wrapper that exposes nsCyrillicDetector through
+ * nsIStringCharsetDetector: DoIt() examines one buffer and returns the
+ * result directly instead of notifying an observer.
+ */
+class nsCyrXPCOMStringDetector :
+  public nsCyrillicDetector,
+  public nsIStringCharsetDetector
+{
+  public:
+    // nsISupports interface
+    NS_DECL_ISUPPORTS
+    nsCyrXPCOMStringDetector(uint8_t aItems,
+                             const uint8_t ** aCyrillicClass,
+                             const char **aCharsets);
+    NS_IMETHOD DoIt(const char* aBuf, uint32_t aLen,
+                    const char** oCharset, nsDetectionConfident &oConf) override;
+  protected:
+    virtual ~nsCyrXPCOMStringDetector();
+    virtual void Report(const char* aCharset) override;
+  private:
+    // Charset name captured by Report(); nullptr until a result exists.
+    // (The observer member that used to sit here was never referenced.)
+    const char* mResult = nullptr;
+};
+
+/** Stream detector wired to gCyrillicCls and the gRussian name table. */
+class nsRUProbDetector : public nsCyrXPCOMDetector
+{
+public:
+  nsRUProbDetector()
+    : nsCyrXPCOMDetector(NUM_CYR_CHARSET, gCyrillicCls, gRussian)
+  {
+  }
+};
+
+/** String detector wired to gCyrillicCls and the gRussian name table. */
+class nsRUStringProbDetector : public nsCyrXPCOMStringDetector
+{
+public:
+  nsRUStringProbDetector()
+    : nsCyrXPCOMStringDetector(NUM_CYR_CHARSET, gCyrillicCls, gRussian)
+  {
+  }
+};
+
+/** Stream detector wired to gCyrillicCls and the gUkrainian name table. */
+class nsUKProbDetector : public nsCyrXPCOMDetector
+{
+public:
+  nsUKProbDetector()
+    : nsCyrXPCOMDetector(NUM_CYR_CHARSET, gCyrillicCls, gUkrainian)
+  {
+  }
+};
+
+/** String detector wired to gCyrillicCls and the gUkrainian name table. */
+class nsUKStringProbDetector : public nsCyrXPCOMStringDetector
+{
+public:
+  nsUKStringProbDetector()
+    : nsCyrXPCOMStringDetector(NUM_CYR_CHARSET, gCyrillicCls, gUkrainian)
+  {
+  }
+};
+
+#endif
diff --git a/intl/chardet/nsCyrillicProb.h b/intl/chardet/nsCyrillicProb.h
new file mode 100644
index 000000000..c84c09f95
--- /dev/null
+++ b/intl/chardet/nsCyrillicProb.h
@@ -0,0 +1,282 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsCyrillicProb_h___h__
+#define nsCyrillicProb_h___h__
+/*
+ DO NOT EDIT THIS FILE !!!
+ This file is generated by the perl script in
+ mozilla/intl/chardet/tools/gencyrillic.pl
+
+ To ues that script, you need to grab StatKoi.pm file from
+ the "Cyrillic Software Suite" written by John Neystdt.
+ http://www.neystadt.org/cyrillic (You can also find it from CPAN)
+ */
+const uint16_t gCyrillicProb[33][33] = {{
+0,
+0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+1, 0, 62, 8, 237, 0, 0, 0,
+0, 0, 0, 0, 2, 0, 1, 0,
+0, 0, 50, 9, 1342, 0, 5, 10,
+0, 0, 16, 2, 0, 2041, 505, 0,
+
+},
+{
+0,
+1197, 0, 891, 3797, 594, 2064, 112, 646,
+1039, 166, 152, 3162, 10935, 3465, 10268, 5,
+277, 1744, 3706, 5043, 8884, 79, 716, 4563,
+0, 0, 3090, 205, 9, 591, 1515, 0,
+
+},
+{
+0,
+206, 1117, 0, 0, 0, 652, 0, 0,
+92, 194, 0, 4, 924, 25, 204, 2334,
+2, 836, 832, 403, 0, 365, 63, 1,
+0, 1257, 5, 9, 0, 358, 0, 629,
+
+},
+{
+0,
+0, 935, 0, 0, 0, 1695, 0, 0,
+0, 5193, 0, 5, 1, 1, 0, 461,
+0, 0, 0, 0, 0, 216, 0, 9,
+0, 47, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 4049, 20, 22, 27, 8713, 0, 49,
+0, 1530, 0, 660, 1182, 138, 1459, 5347,
+1488, 344, 741, 1738, 63, 1460, 206, 242,
+19, 743, 26, 51, 0, 0, 33, 90,
+
+},
+{
+0,
+141, 635, 516, 183, 8332, 911, 108, 2694,
+255, 76, 2958, 2366, 8125, 3209, 19276, 285,
+346, 483, 6823, 5705, 6596, 45, 1286, 525,
+0, 0, 1093, 414, 15, 286, 767, 0,
+
+},
+{
+0,
+0, 272, 0, 0, 0, 376, 50, 0,
+0, 803, 0, 0, 15, 2, 28, 591,
+0, 0, 6, 2, 24, 19, 0, 0,
+7, 31, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 4191, 0, 0, 68, 162, 0, 0,
+0, 1248, 0, 8, 369, 0, 12, 15161,
+0, 0, 678, 0, 2, 337, 0, 0,
+0, 0, 0, 19, 0, 0, 11, 0,
+
+},
+{
+0,
+0, 102, 0, 0, 0, 5, 0, 15,
+0, 27, 0, 6, 2, 1, 92, 2227,
+0, 0, 101, 161, 7, 15, 0, 2,
+0, 0, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+1245, 609, 755, 2134, 1161, 4628, 120, 151,
+2180, 5903, 3242, 2804, 3261, 4656, 3708, 1658,
+104, 7815, 882, 3354, 3398, 16, 169, 1769,
+0, 0, 5064, 96, 0, 48, 1628, 0,
+
+},
+{
+0,
+0, 0, 0, 0, 1, 3, 3, 0,
+0, 0, 0, 6, 0, 12, 96, 67,
+1, 0, 0, 2066, 11, 0, 0, 0,
+0, 0, 0, 20, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 4402, 0, 677, 0, 782, 0, 2,
+0, 2724, 0, 10, 876, 0, 35, 6609,
+0, 0, 651, 1323, 1558, 1049, 416, 225,
+0, 0, 2, 13, 0, 0, 0, 0,
+
+},
+{
+0,
+741, 5440, 0, 0, 1, 6066, 0, 89,
+0, 9040, 0, 153, 97, 4, 949, 9899,
+0, 2830, 0, 8, 16, 2139, 434, 0,
+7487, 157, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 2073, 13, 0, 0, 4818, 0, 0,
+0, 3684, 0, 30, 89, 1094, 204, 4078,
+119, 61, 1, 68, 0, 1684, 0, 68,
+10, 1424, 0, 0, 0, 14, 6, 0,
+
+},
+{
+0,
+18, 16528, 0, 176, 474, 5075, 174, 31,
+0, 14151, 0, 840, 0, 0, 8956, 14457,
+0, 911, 0, 1150, 1893, 711, 8, 199,
+271, 9281, 192, 0, 0, 2, 84, 0,
+
+},
+{
+0,
+23, 27, 4868, 799, 7820, 1391, 145, 13562,
+909, 1551, 5834, 1881, 4400, 6329, 2878, 1911,
+3632, 2374, 7308, 8626, 6679, 161, 2573, 15172,
+0, 0, 1322, 778, 34, 129, 944, 0,
+
+},
+{
+0,
+0, 671, 0, 12, 0, 2500, 1, 0,
+0, 409, 0, 26, 3612, 0, 38, 8786,
+268, 87, 13327, 13, 15, 471, 0, 0,
+7, 266, 0, 0, 0, 0, 2, 0,
+
+},
+{
+0,
+847, 0, 3, 184, 878, 1070, 0, 19,
+482, 0, 90, 18, 26, 765, 151, 0,
+0, 18, 20, 81, 2587, 0, 51, 766,
+0, 0, 1224, 0, 0, 2209, 20, 0,
+
+},
+{
+0,
+2, 10059, 62, 17, 21, 11067, 6, 2653,
+30, 7582, 0, 122, 14, 638, 490, 6767,
+9, 1045, 431, 1139, 683, 2482, 326, 496,
+156, 938, 0, 254, 0, 0, 30, 0,
+
+},
+{
+0,
+17, 1493, 218, 3, 213, 633, 26, 3,
+590, 2176, 0, 3716, 3732, 938, 693, 4388,
+1639, 4197, 1185, 2118, 21815, 2792, 0, 1033,
+154, 239, 0, 25, 0, 0, 522, 3,
+
+},
+{
+0,
+0, 9785, 0, 27, 197, 8202, 0, 12,
+24, 5253, 0, 433, 12, 53, 2577, 9712,
+25, 122, 3392, 4966, 4, 836, 0, 8956,
+4693, 1483, 5, 3, 0, 0, 270, 3,
+
+},
+{
+0,
+1930, 104, 260, 18, 1452, 325, 6, 1192,
+51, 6, 0, 1098, 301, 1778, 398, 0,
+2263, 7, 254, 2808, 452, 0, 743, 140,
+0, 0, 45, 559, 0, 1336, 2289, 0,
+
+},
+{
+0,
+0, 796, 390, 0, 1303, 3459, 1, 11,
+0, 632, 0, 37, 0, 0, 620, 0,
+15, 0, 1, 0, 0, 25, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 7418, 0, 51, 10, 5465, 0, 1,
+51, 2962, 0, 999, 3853, 82, 1048, 7277,
+241, 370, 394, 280, 286, 1126, 0, 183,
+24, 3182, 197, 286, 0, 28, 0, 4,
+
+},
+{
+0,
+395, 0, 6, 22, 0, 496, 9, 113,
+0, 700, 0, 171, 0, 78, 3296, 0,
+0, 1501, 0, 1379, 193, 0, 0, 0,
+0, 0, 487, 165, 0, 1633, 30, 0,
+
+},
+{
+0,
+0, 0, 36, 0, 272, 2847, 0, 27,
+4998, 1, 1192, 33, 224, 2657, 219, 0,
+363, 29, 273, 205, 503, 0, 0, 400,
+0, 0, 38, 255, 0, 0, 305, 0,
+
+},
+{
+0,
+0, 7005, 32, 32, 869, 400, 0, 37,
+0, 999, 0, 46, 204, 739, 1570, 1076,
+0, 112, 89, 0, 1, 430, 1, 1191,
+3, 368, 0, 0, 0, 0, 2, 77,
+
+},
+{
+0,
+0, 200, 0, 0, 0, 2054, 0, 0,
+0, 397, 0, 19, 438, 0, 108, 0,
+0, 0, 4, 0, 112, 3, 0, 0,
+4, 0, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 0, 0, 0, 0, 0, 29, 0,
+0, 0, 0, 311, 16, 19, 11, 0,
+2, 0, 10, 3, 1382, 0, 0, 10,
+0, 0, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 297, 0, 0, 0, 4290, 0, 0,
+0, 3968, 0, 0, 0, 0, 33, 0,
+0, 0, 1, 0, 0, 70, 0, 0,
+15, 0, 0, 0, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 2304, 0, 0, 0, 4731, 0, 0,
+0, 1873, 0, 198, 33, 0, 921, 0,
+0, 0, 191, 0, 114, 134, 0, 2,
+12, 0, 0, 7, 0, 0, 0, 0,
+
+},
+{
+0,
+0, 0, 0, 0, 0, 599, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,
+0, 207, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,
+
+},
+};
+#endif
diff --git a/intl/chardet/nsDetectionConfident.h b/intl/chardet/nsDetectionConfident.h
new file mode 100644
index 000000000..c1eb6e17c
--- /dev/null
+++ b/intl/chardet/nsDetectionConfident.h
@@ -0,0 +1,43 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsDetetctionConfident_h__
+#define nsDetetctionConfident_h__
+
+/*
+ This type is used to indicate how confident the detection module is about
+ the returned result.
+
+ eNoAnswerYet is used to indicate that the detector has not found an
+ answer yet based on the data it has received.
+ eBestAnswer is used to indicate that the answer the detector returned
+ is the best one within the knowledge of the detector.
+ In other words, the tests for all other candidates failed.
+
+ For example, the (Shift_JIS/EUC-JP/ISO-2022-JP) detection
+ module may return this with answer "Shift_JIS" if it receives
+ bytes > 0x80 (which make the ISO-2022-JP test fail) and byte
+ 0x82 (which makes the EUC-JP test fail)
+
+ eSureAnswer is used to indicate that the detector is 100% sure about the
+ answer.
+ Example 1: the Shift_JIS/ISO-2022-JP/EUC-JP detector returns
+ this w/ ISO-2022-JP when it hits one of the following ESC seq
+ ESC ( J
+ ESC $ @
+ ESC $ B
+ Example 2: the detector which can detect UCS2 returns w/ UCS2
+ when the first 2 bytes are the BOM mark.
+ Example 3: the Korean detector returns ISO-2022-KR when it
+ hits ESC $ ) C
+
+ */
+typedef enum {
+ eNoAnswerYet = 0,
+ eBestAnswer,
+ eSureAnswer,
+ eNoAnswerMatch
+} nsDetectionConfident;
+
+#endif /* nsDetetctionConfident_h__ */
diff --git a/intl/chardet/nsICharsetDetectionObserver.h b/intl/chardet/nsICharsetDetectionObserver.h
new file mode 100644
index 000000000..1877e2ba5
--- /dev/null
+++ b/intl/chardet/nsICharsetDetectionObserver.h
@@ -0,0 +1,28 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsICDETObserver_h__
+#define nsICDETObserver_h__
+
+#include "nsISupports.h"
+#include "nsDetectionConfident.h"
+
+// {12BB8F12-2389-11d3-B3BF-00805F8A6670}
+#define NS_ICHARSETDETECTIONOBSERVER_IID \
+{ 0x12bb8f12, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+/*
+ Callback interface through which a charset detector reports its answer:
+ Notify() receives the charset name (aCharset) together with the
+ confidence of that answer (aConf).
+ */
+class nsICharsetDetectionObserver : public nsISupports {
+public:
+ NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICHARSETDETECTIONOBSERVER_IID)
+ NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) = 0;
+};
+
+NS_DEFINE_STATIC_IID_ACCESSOR(nsICharsetDetectionObserver,
+ NS_ICHARSETDETECTIONOBSERVER_IID)
+
+#endif /* nsICDETObserver_h__ */
diff --git a/intl/chardet/nsICharsetDetector.h b/intl/chardet/nsICharsetDetector.h
new file mode 100644
index 000000000..2215fa0f0
--- /dev/null
+++ b/intl/chardet/nsICharsetDetector.h
@@ -0,0 +1,51 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsICharsetDetector_h__
+#define nsICharsetDetector_h__
+
+#include "nsISupports.h"
+
+class nsICharsetDetectionObserver;
+
+// {12BB8F14-2389-11d3-B3BF-00805F8A6670}
+#define NS_ICHARSETDETECTOR_IID \
+{ 0x12bb8f14, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+#define NS_CHARSET_DETECTOR_CONTRACTID_BASE "@mozilla.org/intl/charsetdetect;1?type="
+#define NS_CHARSET_DETECTOR_CATEGORY "charset-detectors"
+
+class nsICharsetDetector : public nsISupports {
+public:
+ NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICHARSETDETECTOR_IID)
+
+ /*
+ Set up the observer so the detector knows whom to notify of the answer.
+ */
+ NS_IMETHOD Init(nsICharsetDetectionObserver* observer) = 0;
+
+ /*
+ Feed a block of bytes to the detector.
+ It will call the Notify function of the nsICharsetDetectionObserver if it
+ finds out the answer.
+ aBytesArray - array of bytes
+ aLen - length of aBytesArray
+ oDontFeedMe - returns true if the detector does not need any following
+ block, false if it needs more bytes.
+ This is used to enhance performance.
+ */
+ NS_IMETHOD DoIt(const char* aBytesArray, uint32_t aLen, bool* oDontFeedMe) = 0;
+
+ /*
+ Tells the detector that the input has ended; this is its last chance to
+ make a decision.
+ */
+ NS_IMETHOD Done() = 0;
+
+};
+
+NS_DEFINE_STATIC_IID_ACCESSOR(nsICharsetDetector,
+ NS_ICHARSETDETECTOR_IID)
+
+#endif /* nsICharsetDetector_h__ */
diff --git a/intl/chardet/nsIStringCharsetDetector.h b/intl/chardet/nsIStringCharsetDetector.h
new file mode 100644
index 000000000..9abd85df5
--- /dev/null
+++ b/intl/chardet/nsIStringCharsetDetector.h
@@ -0,0 +1,44 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsIStringCharsetDetector_h__
+#define nsIStringCharsetDetector_h__
+
+#include "nsISupports.h"
+#include "nsDetectionConfident.h"
+
+// {12BB8F15-2389-11d3-B3BF-00805F8A6670}
+#define NS_ISTRINGCHARSETDETECTOR_IID \
+{ 0x12bb8f15, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+
+#define NS_STRCDETECTOR_CONTRACTID_BASE "@mozilla.org/intl/stringcharsetdetect;1?type="
+
+/*
+ This interface is similar to nsICharsetDetector.
+ The difference is that it is for line-based detection instead of
+ block-based detection: the answer is returned directly from DoIt.
+ */
+
+
+class nsIStringCharsetDetector : public nsISupports {
+public:
+
+ NS_DECLARE_STATIC_IID_ACCESSOR(NS_ISTRINGCHARSETDETECTOR_IID)
+ /*
+ Perform the charset detection.
+
+ aBytesArray - the bytes
+ aLen - the length of the bytes
+ oCharset - the charset answer
+ oConfident - the confidence of the answer
+ */
+ NS_IMETHOD DoIt(const char* aBytesArray, uint32_t aLen,
+ const char** oCharset, nsDetectionConfident &oConfident) = 0;
+};
+
+NS_DEFINE_STATIC_IID_ACCESSOR(nsIStringCharsetDetector,
+ NS_ISTRINGCHARSETDETECTOR_IID)
+
+#endif /* nsIStringCharsetDetector_h__ */
diff --git a/intl/chardet/tools/GenCyrillicClass.cpp b/intl/chardet/tools/GenCyrillicClass.cpp
new file mode 100644
index 000000000..180651a49
--- /dev/null
+++ b/intl/chardet/tools/GenCyrillicClass.cpp
@@ -0,0 +1,135 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#include "nsICharsetConverterManager.h"
+#include <iostream.h>
+#include "nsISupports.h"
+#include "nsIComponentManager.h"
+#include "nsIServiceManager.h"
+#include "nsIUnicodeDecoder.h"
+#include "nsIUnicodeEncoder.h"
+#include "nsCRT.h"
+#include <stdio.h>
+#include <stdlib.h>
+#if defined(XP_WIN)
+#include <io.h>
+#endif
+#ifdef XP_UNIX
+#include <unistd.h>
+#endif
+
+//---------------------------------------------------------------------------
+/**
+ * Writes the opening include guard and the "generated file" banner of
+ * nsCyrillicClass.h to stdout.
+ */
+void header()
+{
+  // The text is stored as const char (a string literal may not be bound to a
+  // plain char*) and written with fputs so that it can never be interpreted
+  // as a printf format string.
+  static const char kHeader[] =
+"#ifndef nsCyrillicClass_h__\n"
+"#define nsCyrillicClass_h__\n"
+"/* PLEASE DO NOT EDIT THIS FILE DIRECTLY. THIS FILE IS GENERATED BY \n"
+" GenCyrllicClass found in mozilla/intl/chardet/tools\n"
+" */\n";
+  fputs(kHeader, stdout);
+}
+//---------------------------------------------------------------------------
+/** Writes the closing line of the include guard to stdout. */
+void footer()
+{
+  fputs("#endif\n", stdout);
+}
+//---------------------------------------------------------------------------
+/** Writes the mode line and the MPL 2.0 license header to stdout. */
+void npl()
+{
+  // Stored as const char and written with fputs: the text is plain output,
+  // not a printf format string.
+  static const char kLicense[] =
+"/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */\n"
+"/* This Source Code Form is subject to the terms of the Mozilla Public\n"
+" * License, v. 2.0. If a copy of the MPL was not distributed with this\n"
+" * file, You can obtain one at http://mozilla.org/MPL/2.0/. */\n";
+  fputs(kLicense, stdout);
+}
+//---------------------------------------------------------------------------
+static nsIUnicodeEncoder* gKOI8REncoder = nullptr; // KOI8-R encoder used by CyrillicClass(); obtained and released in main()
+static nsICharsetConverterManager* gCCM = nullptr; // converter manager used by genCyrillicClass() to look up decoders
+
+//---------------------------------------------------------------------------
+/**
+ * Computes the letter class of one byte in the charset handled by decoder.
+ *
+ * The byte is decoded to a single UTF-16 unit, upper-cased, and re-encoded
+ * with the KOI8-R encoder. A single resulting byte in 0xE0..0xFF yields
+ * class 1..32 (byte - 0xDF); every other outcome yields class 0.
+ *
+ * @param decoder decoder for the charset being classified
+ * @param byte    the byte to classify
+ * @return the class, 0..32
+ */
+uint8_t CyrillicClass(nsIUnicodeDecoder* decoder, uint8_t byte)
+{
+  char16_t unichar[2];
+  uint8_t koi8[2];
+  int32_t srcLen = 1;
+  int32_t uniLen = 1;
+
+  nsresult rv = decoder->Convert((char*)&byte, &srcLen, unichar, &uniLen);
+  if (NS_FAILED(rv) || (1 != uniLen)) {
+    return 0;
+  }
+
+  unichar[0] = nsCRT::ToUpper(unichar[0]);
+  int32_t koi8Len = 1;
+  rv = gKOI8REncoder->Convert(unichar, &uniLen, (char*)koi8, &koi8Len);
+  if (NS_FAILED(rv) || (1 != koi8Len)) {
+    return 0;
+  }
+
+  if (koi8[0] < 0xe0) {
+    return 0;
+  }
+  return koi8[0] - (uint8_t)0xdf;
+}
+//---------------------------------------------------------------------------
+/**
+ * Prints the 128-entry class table for one charset as C source.
+ *
+ * The table covers bytes 0x80..0xFF, sixteen entries per output row, and is
+ * named "<name>Map". If no decoder can be obtained for the charset, a
+ * message is printed instead and nothing else is emitted.
+ *
+ * @param name    identifier prefix of the generated array
+ * @param charset charset name used to look up the decoder
+ */
+void genCyrillicClass(const char* name, const char* charset)
+{
+  nsAutoString charsetName(charset);
+  nsIUnicodeDecoder* decoder = nullptr;
+  nsresult rv = gCCM->GetUnicodeDecoder(&charsetName, &decoder);
+  if (NS_FAILED(rv)) {
+    printf("cannot locate %s Decoder\n", charset);
+    return;
+  }
+
+  printf("static const uint8_t %sMap [128] = {\n", name);
+  for (unsigned row = 0x80; row < 0x100; row += 0x10) {
+    for (unsigned col = 0; col <= 0x0f; ++col) {
+      uint8_t cls = CyrillicClass(decoder, static_cast<uint8_t>(row + col));
+      printf(" %2d, ", cls);
+    }
+    printf("\n");
+  }
+  printf("};\n");
+  NS_IF_RELEASE(decoder);
+}
+//---------------------------------------------------------------------------
+
+
+/**
+ * Generates nsCyrillicClass.h on stdout: license header, include guard, and
+ * one class table per supported Cyrillic charset.
+ *
+ * @return 0 on success, -1 when the converter manager or the KOI8-R encoder
+ *         cannot be obtained
+ */
+int main(int argc, char** argv) {
+  nsresult res = NS_OK;
+
+  nsCOMPtr<nsICharsetConverterManager> ccm = do_GetService(kCharsetConverterManagerCID, &res);
+
+  // Either a failure code or a null service means we cannot continue.
+  if(NS_FAILED(res) || !ccm)
+  {
+    printf("cannot locate CharsetConverterManager\n");
+    return(-1);
+  }
+  // genCyrillicClass() reads the file-level gCCM, so publish the service
+  // there; ccm keeps it alive until main returns.
+  gCCM = ccm;
+
+  nsAutoString koi8r("KOI8-R");
+  res = gCCM->GetUnicodeEncoder(&koi8r,&gKOI8REncoder);
+  if(NS_FAILED(res) || (nullptr == gKOI8REncoder))
+  {
+    printf("cannot locate KOI8-R Encoder\n");
+    gCCM = nullptr;
+    return(-1);
+  }
+
+
+  npl();
+  header();
+
+  genCyrillicClass("KOI8", "KOI8-R");
+  genCyrillicClass("CP1251", "windows-1251");
+  genCyrillicClass("IBM866", "IBM866");
+  genCyrillicClass("ISO88595", "ISO-8859-5");
+  genCyrillicClass("MacCyrillic", "x-mac-cyrillic");
+  footer();
+  NS_IF_RELEASE(gKOI8REncoder);
+  // Do not leave a dangling raw pointer behind once ccm goes away.
+  gCCM = nullptr;
+  return(0);
+}
diff --git a/intl/chardet/tools/charfreq.pl b/intl/chardet/tools/charfreq.pl
new file mode 100644
index 000000000..4232d4765
--- /dev/null
+++ b/intl/chardet/tools/charfreq.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Usage: charfreq.pl <data-file> < text
+#
+# Loads "<character> <count>" lines from <data-file> into a frequency table,
+# scans STDIN for two-byte characters whose bytes are both above 0xa0, bumps
+# their counts, and finally prints the whole table sorted by character.
+open (STAT,$ARGV[0]) || die " cannot open data file $ARGV[0]\n";
+# Frequency table: two-byte character => number of occurrences.
+%count = ();
+while(<STAT>)
+{
+  @k = split(/\s+/, $_);
+  $count{$k[0]} = $k[1];
+}
+close(STAT);
+while(<STDIN>)
+{
+  # One element per non-whitespace character of the line.
+  @ck = split /\s*/, $_;
+  # Scanner state, reset for every line:
+  #   0 - outside a character
+  #   1 - saw an unusable byte; nothing more is counted on this line
+  #   2 - holding a candidate first byte in $fb
+  $s = 0;
+  $fb = 0;
+  # NOTE(review): $#ck is the LAST INDEX, so the loop below never looks at
+  # the final element of @ck - confirm whether that is intended.
+  $cl = $#ck;
+  $j = 0;
+  while($j < $cl) {
+    $cc = unpack("C", $ck[$j]);
+    if(0 == $s ) {
+      if($cc > 0x80) {
+        if($cc > 0xa0) {
+          $fb = $ck[$j];
+          $s = 2;
+        } else {
+          $s = 1;
+        }
+      }
+    } elsif (1 == $s) {
+      # stay in state 1 until the end of the line
+    } else {
+      if($cc > 0xa0) {
+        $fb .= $ck[$j];
+        $count{$fb}++;
+        # Progress line printed for every counted character.
+        print $fb . " " .$count{$fb} . "\n";
+        $s = 0;
+      } else {
+        $s = 1;
+      }
+    }
+    $j = $j + 1;
+  }
+}
+foreach $c (sort(keys( %count )))
+{
+  print $c . " ". $count{$c} . "\n";
+}
diff --git a/intl/chardet/tools/charfreqtostat.pl b/intl/chardet/tools/charfreqtostat.pl
new file mode 100644
index 000000000..04af0c82c
--- /dev/null
+++ b/intl/chardet/tools/charfreqtostat.pl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+# Returns the editor mode line and the MPL 2.0 license block that is placed at
+# the top of the generated output.
+sub GenNPL {
+  return << "END_NPL";
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+END_NPL
+}
+
+print GenNPL();
+# Sum of the counts of every accepted character.
+$total=0;
+# Occurrence counts indexed by (byte - 0xA1): @h for lead bytes, @l for trail
+# bytes.
+@h = ();
+@l = ();
+
+# Every input line is "<character> <count>"; only characters whose first two
+# bytes are both >= 0xA1 contribute.
+while(<STDIN>)
+{
+  @k = split(/\s+/, $_);
+  @i = unpack("CCCC", $k[0]);
+  if((0xA1 <= $i[0]) && (0xA1 <= $i[1])){
+    $total += $k[1];
+    $v = $i[0] - 0x00A1;
+    $h[$v] += $k[1];
+    $u = $i[1] - 0x00A1;
+    $l[$u] += $k[1];
+  }
+}
+
+# Every frequency below is a count divided by $total; fail with a clear
+# message instead of an "Illegal division by zero" when nothing was accepted.
+if($total == 0) {
+  die "no character with both bytes >= 0xA1 found on STDIN\n";
+}
+
+# Relative frequency of each lead/trail byte value 0xA1..0xFE (94 values) and
+# the sums needed for the means.
+$ffh = 0.0;
+$ffl = 0.0;
+for($i=0x00A1;$i< 0x00FF ; $i++)
+{
+  $fh[$i - 0x00a1] = $h[$i- 0x00a1] / $total;
+  $ffh += $fh[$i - 0x00a1];
+
+  $fl[$i - 0x00a1] = $l[$i- 0x00a1] / $total;
+  $ffl += $fl[$i - 0x00a1];
+}
+$mh = $ffh / 94.0;
+$ml = $ffl / 94.0;
+
+# Standard deviation of the lead and trail byte frequencies.
+$sumh=0.0;
+$suml=0.0;
+for($i=0x00A1;$i< 0x00FF ; $i++)
+{
+  $sh = $fh[$i - 0x00a1] - $mh;
+  $sh *= $sh;
+  $sumh += $sh;
+
+  $sl = $fl[$i - 0x00a1] - $ml;
+  $sl *= $sl;
+  $suml += $sl;
+}
+$sumh /= 94.0;
+$suml /= 94.0;
+$stdh = sqrt($sumh);
+$stdl = sqrt($suml);
+
+# The two weights are each deviation's share of their sum.
+$stdsum = $stdh + $stdl;
+if($stdsum == 0) {
+  die "lead and trail byte standard deviations are both zero; weights are undefined\n";
+}
+
+print "{\n";
+print " {\n";
+for($i=0x00A1;$i< 0x00FF ; $i++)
+{
+  # The last initializer of the list carries no trailing comma.
+  if($i == 0xfe) {
+    printf(" %.6ff \/\/ FreqH[%2x]\n", $fh[$i - 0x00a1] , $i);
+  } else {
+    printf(" %.6ff, \/\/ FreqH[%2x]\n", $fh[$i - 0x00a1] , $i);
+  }
+}
+print " },\n";
+printf ("%.6ff, \/\/ Lead Byte StdDev\n", $stdh);
+printf ("%.6ff, \/\/ Lead Byte Mean\n", $mh);
+printf ("%.6ff, \/\/ Lead Byte Weight\n", $stdh / $stdsum);
+print " {\n";
+for($i=0x00A1;$i< 0x00FF ; $i++)
+{
+  if($i == 0xfe) {
+    printf(" %.6ff \/\/ FreqL[%2x]\n", $fl[$i - 0x00a1] , $i);
+  } else {
+    printf(" %.6ff, \/\/ FreqL[%2x]\n", $fl[$i - 0x00a1] , $i);
+  }
+}
+print " },\n";
+printf ("%.6ff, \/\/ Trail Byte StdDev\n", $stdl);
+printf ("%.6ff, \/\/ Trail Byte Mean\n", $ml);
+printf ("%.6ff \/\/ Trial Byte Weight\n", $stdl / $stdsum);
+print "};\n";
diff --git a/intl/chardet/tools/gen.cmd b/intl/chardet/tools/gen.cmd
new file mode 100755
index 000000000..56ca34bc9
--- /dev/null
+++ b/intl/chardet/tools/gen.cmd
@@ -0,0 +1,18 @@
+REM This Source Code Form is subject to the terms of the Mozilla Public
+REM License, v. 2.0. If a copy of the MPL was not distributed with this
+REM file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+REM Regenerates the verifier headers. Every generator prints a complete
+REM header on stdout, which is redirected into ..\src.
+perl gencp1252.pl > ..\src\nsCP1252Verifier.h
+perl geneucjp.pl > ..\src\nsEUCJPVerifier.h
+perl geniso2022jp.pl > ..\src\nsISO2022JPVerifier.h
+perl gensjis.pl > ..\src\nsSJISVerifier.h
+perl genutf8.pl > ..\src\nsUTF8Verifier.h
+perl geneuckr.pl > ..\src\nsEUCKRVerifier.h
+perl gengb2312.pl > ..\src\nsGB2312Verifier.h
+perl genbig5.pl > ..\src\nsBIG5Verifier.h
+perl geneuctw.pl > ..\src\nsEUCTWVerifier.h
+REM genucs2be.pl and genucs2le.pl are not present in this directory. Running
+REM them can only fail, and the redirection would still truncate the target
+REM headers to empty files, so both invocations are disabled.
+REM perl genucs2be.pl > ..\src\nsUCS2BEVerifier.h
+REM perl genucs2le.pl > ..\src\nsUCS2LEVerifier.h
+perl genhz.pl > ..\src\nsHZVerifier.h
+perl geniso2022kr.pl > ..\src\nsISO2022KRVerifier.h
+perl geniso2022cn.pl > ..\src\nsISO2022CNVerifier.h
+REM NOTE(review): gengb18030.pl lives in this directory but is not invoked
+REM here - confirm whether its header is meant to be regenerated as well.
diff --git a/intl/chardet/tools/genbig5.pl b/intl/chardet/tools/genbig5.pl
new file mode 100644
index 000000000..8e3a777cb
--- /dev/null
+++ b/intl/chardet/tools/genbig5.pl
@@ -0,0 +1,42 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@big5_cls);
+my(@big5_st);
+my($big5_ver);
+
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST row whose range
+# contains the byte, so a narrow range must be listed before any wider range
+# that overlaps it (0x0e-0x0f and 0x1b come before 0x01-0x3f).
+@big5_cls = (
+ [ 0x00 , 0x00 , 1 ],
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0x01 , 0x3f , 1 ],
+ [ 0x40 , 0x7e , 2 ],
+ [ 0x7f , 0x7f , 1 ],
+ [ 0xff , 0xff , 0 ],
+ [ 0x80 , 0xa0 , 4 ],
+ [ 0xa1 , 0xfe , 3 ],
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@big5_st = (
+# 0 1 2 3 4
+ 1, 0, 0, 3, 1, # state 0
+ 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 0, 0, 0, # state 3
+);
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$big5_ver = genverifier::GenVerifier("BIG5", "Big5", \@big5_cls, 5, \@big5_st);
+print $big5_ver;
+
+
+
diff --git a/intl/chardet/tools/gencp1252.pl b/intl/chardet/tools/gencp1252.pl
new file mode 100644
index 000000000..debc53ca5
--- /dev/null
+++ b/intl/chardet/tools/gencp1252.pl
@@ -0,0 +1,55 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@cp1252_cls);
+my(@cp1252_st);
+my($cp1252_ver);
+
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST matching row, so the
+# catch-all 0x00-0xff row must stay last: it assigns class 2 to every byte
+# not claimed by an earlier row.
+@cp1252_cls = (
+ [ 0x00 , 0x00 , 1 ],
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0x81 , 0x81 , 0 ],
+ [ 0x8d , 0x8d , 0 ],
+ [ 0x8f , 0x8f , 0 ],
+ [ 0x90 , 0x90 , 0 ],
+ [ 0x9d , 0x9d , 0 ],
+ [ 0xc0 , 0xd6 , 1 ],
+ [ 0xd8 , 0xf6 , 1 ],
+ [ 0xf8 , 0xff , 1 ],
+ [ 0x8a , 0x8a , 1 ],
+ [ 0x8c , 0x8c , 1 ],
+ [ 0x8e , 0x8e , 1 ],
+ [ 0x9a , 0x9a , 1 ],
+ [ 0x9c , 0x9c , 1 ],
+ [ 0x9e , 0x9e , 1 ],
+ [ 0x9f , 0x9f , 1 ],
+ [ 0x00 , 0xff , 2 ],
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@cp1252_st = (
+# 0 1 2
+ 1, 3, 0, # Start State - 0
+ 1, 1, 1, # Error State - 1
+ 2, 2, 2, # ItsMe State - 2
+ 1, 4, 0, # State - 3
+ 1, 5, 4, # State - 4
+ 1, 1, 4, # State - 5
+);
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$cp1252_ver = genverifier::GenVerifier("CP1252", "windows-1252",
+ \@cp1252_cls, 3, \@cp1252_st);
+print $cp1252_ver;
+
+
+
diff --git a/intl/chardet/tools/gencyrillic.pl b/intl/chardet/tools/gencyrillic.pl
new file mode 100644
index 000000000..51bd6e456
--- /dev/null
+++ b/intl/chardet/tools/gencyrillic.pl
@@ -0,0 +1,65 @@
+#!/usr/local/bin/perl
+
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use StatKoi '.' ;
+
+# Writes the 33x33 table gCyrillicProb to ../src/nsCyrillicProb.h. Row 0 and
+# column 0 are all zero; the remaining 32x32 cells hold the StatKoi value for
+# the byte pair (0xc0..0xdf, 0xc0..0xdf), or 0 when the pair has no entry.
+$outfile = "../src/nsCyrillicProb.h";
+# Name the file that was actually attempted and include the system error.
+open(FILE, "> $outfile") or die "cannot open $outfile: $!";
+
+# NOTE(review): the include guard written below is nsCyrillicDetector_h__
+# although the output file is nsCyrillicProb.h - confirm it cannot collide
+# with the guard of nsCyrillicDetector.h.
+print FILE <<EOF;
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsCyrillicDetector_h__
+#define nsCyrillicDetector_h__
+/*
+ DO NOT EDIT THIS FILE !!!
+ This file is generated by the perl script in
+ mozilla/intl/chardet/tools/gencyrillic.pl
+
+ To ues that script, you need to grab StatKoi.pm file from
+ the "Cyrillic Software Suite" written by John Neystdt.
+ http://www.neystadt.org/cyrillic (You can also find it from CPAN)
+ */
+EOF
+$table = \%Lingua::DetectCharset::StatKoi::StatsTableKoi;
+print FILE "const uint16_t gCyrillicProb[33][33] = {";
+# Row 0: thirty-three zeros.
+ print FILE "{ \n";
+ print FILE "0,\n";
+ for($j = 0xc0; $j < 0xe0; $j++)
+ {
+ print FILE "0, \t";
+ # Break the output line after every eighth value.
+ if( 7 == ( $j % 8) )
+ {
+ print FILE "\n";
+ }
+ }
+ print FILE "\n}, \n";
+# Rows 1..32: one per first byte 0xc0..0xdf.
+for($i = 0xc0; $i < 0xe0; $i++)
+{
+ print FILE "{ \n";
+ # Column 0 is always zero.
+ print FILE "0,\n";
+ for($j = 0xc0; $j < 0xe0; $j++)
+ {
+ $key = chr($i) . chr($j);
+ if(exists($table->{$key}))
+ {
+ $v = $table->{$key};
+ } else {
+ $v = 0;
+ }
+ print FILE $v . ", \t";
+ if( 7 == ( $j % 8) )
+ {
+ print FILE "\n";
+ }
+ }
+ print FILE "\n}, \n";
+}
+print FILE "};\n";
+print FILE "#endif\n";
+# Flush and report write errors instead of relying on exit-time cleanup.
+close(FILE) or die "cannot write $outfile: $!";
diff --git a/intl/chardet/tools/geneucjp.pl b/intl/chardet/tools/geneucjp.pl
new file mode 100644
index 000000000..692be15ab
--- /dev/null
+++ b/intl/chardet/tools/geneucjp.pl
@@ -0,0 +1,47 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@eucjp_cls);
+my(@eucjp_st);
+my($eucjp_ver);
+
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST matching row, so the
+# final 0x80-0xff row only applies to bytes not claimed by an earlier row.
+@eucjp_cls = (
+ [ 0x0e , 0x0f , 5 ],
+ [ 0xe0 , 0xfe , 0 ],
+ [ 0x8e , 0x8e , 1 ],
+ [ 0xa1 , 0xdf , 2 ],
+ [ 0x8f , 0x8f , 3 ],
+ [ 0x01 , 0x1a , 4 ],
+ [ 0x1c , 0x7f , 4 ],
+ [ 0x00 , 0x00 , 4 ],
+ [ 0x1b , 0x1b , 5 ],
+ [ 0x80 , 0x8d , 5 ],
+ [ 0xa0 , 0xa0 , 5 ],
+ [ 0x80 , 0xff , 5 ]
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@eucjp_st = (
+# 0 1 2 3 4 5
+ 3, 4, 3, 5, 0, 1, # state 0
+ 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 0, 1, 0, 1, 1, 1, # state 3
+ 1, 1, 0, 1, 1, 1, # state 4
+ 3, 1, 3, 1, 1, 1, # state 5
+);
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$eucjp_ver = genverifier::GenVerifier("EUCJP", "EUC-JP", \@eucjp_cls, 6, \@eucjp_st);
+print $eucjp_ver;
+
+
+
diff --git a/intl/chardet/tools/geneuckr.pl b/intl/chardet/tools/geneuckr.pl
new file mode 100644
index 000000000..007810a6a
--- /dev/null
+++ b/intl/chardet/tools/geneuckr.pl
@@ -0,0 +1,42 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@euckr_cls);
+my(@euckr_st);
+my($euckr_ver);
+
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST matching row, so the
+# narrow class-3 ranges (0xad-0xaf, 0xc9) must precede the wide 0xa1-0xfe row.
+@euckr_cls = (
+ [ 0x00 , 0x00 , 1 ],
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0x01 , 0x7f , 1 ],
+ [ 0x80 , 0xa0 , 0 ],
+ [ 0xff , 0xff , 0 ],
+ [ 0xad , 0xaf , 3 ],
+ [ 0xc9 , 0xc9 , 3 ],
+ [ 0xa1 , 0xfe , 2 ],
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@euckr_st = (
+# 0 1 2 3
+ 1, 0, 3, 1, # state 0
+ 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 0, 0, # state 3
+);
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$euckr_ver = genverifier::GenVerifier("EUCKR", "EUC-KR", \@euckr_cls, 4, \@euckr_st);
+print $euckr_ver;
+
+
+
diff --git a/intl/chardet/tools/geneuctw.pl b/intl/chardet/tools/geneuctw.pl
new file mode 100644
index 000000000..88453155e
--- /dev/null
+++ b/intl/chardet/tools/geneuctw.pl
@@ -0,0 +1,49 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@euctw_cls);
+my(@euctw_st);
+my($euctw_ver);
+
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST matching row, so
+# 0x8e (class 6) must be listed before the 0x80-0xa0 row that contains it.
+@euctw_cls = (
+ [ 0x00 , 0x00 , 2 ],
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0x01 , 0x7f , 2 ],
+ [ 0x8e , 0x8e , 6 ],
+ [ 0x80 , 0xa0 , 0 ],
+ [ 0xff , 0xff , 0 ],
+ [ 0xa1 , 0xa1 , 3 ],
+ [ 0xa2 , 0xa7 , 4 ],
+ [ 0xa8 , 0xa9 , 5 ],
+ [ 0xaa , 0xc1 , 1 ],
+ [ 0xc2 , 0xc2 , 3 ],
+ [ 0xc3 , 0xc3 , 1 ],
+ [ 0xc4 , 0xfe , 3 ],
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@euctw_st = (
+# 0 1 2 3 4 5 6
+ 1, 1, 0, 3, 3, 3, 4, # state 0
+ 1, 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 0, 1, 0, 0, 0, 1, # state 3
+ 1, 1, 1, 1, 5, 1, 1, # state 4
+ 1, 0, 1, 0, 0, 0, 1, # state 5
+);
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$euctw_ver = genverifier::GenVerifier("EUCTW", "x-euc-tw", \@euctw_cls, 7, \@euctw_st);
+print $euctw_ver;
+
+
+
diff --git a/intl/chardet/tools/gengb18030.pl b/intl/chardet/tools/gengb18030.pl
new file mode 100644
index 000000000..654710b2c
--- /dev/null
+++ b/intl/chardet/tools/gengb18030.pl
@@ -0,0 +1,44 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@gb18030_cls);
+my(@gb18030_st);
+my($gb18030_ver);
+
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST matching row, so the
+# rows for 0x0e-0x0f, 0x1b and 0x30-0x39 must precede the wide 0x00-0x3f row.
+@gb18030_cls = (
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0x30 , 0x39 , 3 ],
+ [ 0x00 , 0x3f , 1 ],
+ [ 0x40 , 0x7e , 2 ],
+ [ 0x7f , 0x7f , 4 ],
+ [ 0x80 , 0x80 , 5 ],
+ [ 0x81 , 0xfe , 6 ],
+ [ 0xff , 0xff , 0 ],
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@gb18030_st = (
+# 0 1 2 3 4 5 6
+ 1, 0, 0, 0, 0, 0, 3, # state 0
+ 1, 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 0, 4, 1, 0, 0, # state 3, multibytes, 1st byte identified
+ 1, 1, 1, 1, 1, 1, 5, # state 4, multibytes, 2nd byte identified
+ 1, 1, 1, 2, 1, 1, 1, # state 5, multibytes, 3rd byte identified
+);
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$gb18030_ver = genverifier::GenVerifier("gb18030", "gb18030", \@gb18030_cls, 7, \@gb18030_st);
+print $gb18030_ver;
+
+
+
diff --git a/intl/chardet/tools/gengb2312.pl b/intl/chardet/tools/gengb2312.pl
new file mode 100644
index 000000000..57d86926b
--- /dev/null
+++ b/intl/chardet/tools/gengb2312.pl
@@ -0,0 +1,41 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@gb2312_cls);
+my(@gb2312_st);
+my($gb2312_ver);
+
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST matching row, so the
+# 0xaa-0xaf row (class 3) must precede the wide 0xa1-0xfe row.
+@gb2312_cls = (
+ [ 0x00 , 0x00 , 1 ],
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0x01 , 0x7f , 1 ],
+ [ 0x80 , 0xa0 , 0 ],
+ [ 0xff , 0xff , 0 ],
+ [ 0xaa , 0xaf , 3 ],
+ [ 0xa1 , 0xfe , 2 ],
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@gb2312_st = (
+# 0 1 2 3
+ 1, 0, 3, 1, # state 0
+ 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 0, 0, # state 3
+);
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$gb2312_ver = genverifier::GenVerifier("GB2312", "GB2312", \@gb2312_cls, 4, \@gb2312_st);
+print $gb2312_ver;
+
+
+
diff --git a/intl/chardet/tools/genhz.pl b/intl/chardet/tools/genhz.pl
new file mode 100644
index 000000000..c58eb4675
--- /dev/null
+++ b/intl/chardet/tools/genhz.pl
@@ -0,0 +1,57 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@hz_cls);
+my(@hz_st);
+my($hz_ver);
+
+
+#
+# Class legend (class 0 is every other byte):
+#
+# > 0x80 - 1
+# ~ - 2
+# LF - 3
+# { - 4
+# } - 5
+#
+# Each row of the table is [ first byte, last byte, class ]. genverifier's
+# GetClass returns the class of the FIRST matching row, so the single-byte
+# rows for '~', LF, '{' and '}' must precede the wide 0x1c-0x7f row; LF is
+# the exception and is shadowed by the 0x01-0x1a row listed before it.
+# NOTE(review): because of that ordering 0x0a is classified 0, not 3 -
+# confirm whether the LF row was meant to come first.
+@hz_cls = (
+ [ 0x01 , 0x1a , 0 ],
+ [ 0x7e , 0x7e , 2 ],
+ [ 0x0a , 0x0a , 3 ],
+ [ 0x7b , 0x7b , 4 ],
+ [ 0x7d , 0x7d , 5 ],
+ [ 0x1c , 0x7f , 0 ],
+ [ 0x0e , 0x0f , 1 ],
+ [ 0x1b , 0x1b , 1 ],
+ [ 0x00 , 0x00 , 1 ],
+ [ 0x80 , 0xff , 1 ]
+);
+
+
+#
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+#
+package genverifier;
+@hz_st = (
+# 0 1 2 3 4 5
+ 0, 1, 3, 0, 0, 0, # Start State - 0
+ 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 0, 0, 4, 1, # state 3 - got ~
+ 5, 1, 6, 1, 5, 5, # state 4 - got ~ {
+ 4, 1, 4, 1, 4, 4, # state 5 - got ~ { X
+ 4, 1, 4, 1, 4, 2, # state 6 - got ~ { [X X]* ~
+);
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$hz_ver = genverifier::GenVerifier("HZ", "HZ-GB-2312",
+ \@hz_cls, 6, \@hz_st);
+print $hz_ver;
+
+
+
diff --git a/intl/chardet/tools/geniso2022cn.pl b/intl/chardet/tools/geniso2022cn.pl
new file mode 100644
index 000000000..c4a43caae
--- /dev/null
+++ b/intl/chardet/tools/geniso2022cn.pl
@@ -0,0 +1,58 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@iso2022cn_cls);
+my(@iso2022cn_st);
+my($iso2022cn_ver);
+
+
+#
+# Class legend (class 0 is every other 7-bit byte):
+#
+# ESC - 1
+# > 0x80 - 2
+# $ - 3
+# ) - 4
+# * - 5
+# A G - 6
+# H - 7
+# N O - 8
+#
+# Each row is [ first byte, last byte, class ]. genverifier's GetClass returns
+# the class of the FIRST matching row, so the single-character rows must come
+# before the wide 0x1c-0x7f row.
+#
+# The table has to produce every class of the legend: the state table below
+# only reaches ItsMe through classes 6, 7 and 8, and leaves "got ESC" through
+# class 3 ('$'). The earlier table mapped ')' to 3 and 'C' to 4 and produced
+# no class above 4, so the verifier could never recognise the encoding.
+@iso2022cn_cls = (
+ [ 0x01 , 0x1a , 0 ],
+ [ 0x24 , 0x24 , 3 ], # '$'
+ [ 0x29 , 0x29 , 4 ], # ')'
+ [ 0x2a , 0x2a , 5 ], # '*'
+ [ 0x41 , 0x41 , 6 ], # 'A'
+ [ 0x47 , 0x47 , 6 ], # 'G'
+ [ 0x48 , 0x48 , 7 ], # 'H'
+ [ 0x4e , 0x4f , 8 ], # 'N' 'O'
+ [ 0x1c , 0x7f , 0 ],
+ [ 0x1b , 0x1b , 1 ],
+ [ 0x00 , 0x00 , 2 ],
+ [ 0x80 , 0xff , 2 ]
+);
+
+
+#
+# ESC$((([)][AG])|([*]H))|[NO])
+#
+# State transitions: one row per state, one column per byte class. genverifier
+# renders 0 as eStart, 1 as eError and 2 as eItsMe; any other value is emitted
+# as a plain state number.
+#
+package genverifier;
+@iso2022cn_st = (
+# 0 1 2 3 4 5 6 7 8
+ 0, 3, 1, 0, 0, 0, 0, 0, 0, # Start State - 0
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 1, 4, 1, 1, 1, 1, 2, # state 3 - got ESC
+ 1, 1, 1, 1, 5, 6, 1, 1, 1, # state 4 - got ESC $
+ 1, 1, 1, 1, 1, 1, 2, 1, 1, # state 5 - got ESC $ )
+ 1, 1, 1, 1, 1, 1, 1, 2, 1, # state 6 - got ESC $ *
+);
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$iso2022cn_ver = genverifier::GenVerifier("ISO2022CN", "ISO-2022-CN",
+ \@iso2022cn_cls, 9, \@iso2022cn_st);
+print $iso2022cn_ver;
+
+
+
diff --git a/intl/chardet/tools/geniso2022jp.pl b/intl/chardet/tools/geniso2022jp.pl
new file mode 100644
index 000000000..4408fbeb0
--- /dev/null
+++ b/intl/chardet/tools/geniso2022jp.pl
@@ -0,0 +1,49 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@iso2022jp_cls);
+my(@iso2022jp_st);
+my($iso2022jp_ver);
+
+# Class legend (0 is every other 7-bit byte, 2 is 0x00, 0x0e-0x0f and 0x80-0xff):
+# 1:ESC 3:'(' 4:'B' 5:'J' 6:'@' 7:'$' 8:'D' 9:'I'
+# Each row is [ first byte, last byte, class ]. genverifier's GetClass returns
+# the class of the FIRST matching row, so the single-character rows must come
+# before the wide 0x01-0x1a and 0x1c-0x7f rows.
+@iso2022jp_cls = (
+ [ 0x0e , 0x0f , 2 ],
+ [ 0x28 , 0x28 , 3 ],
+ [ 0x42 , 0x42 , 4 ],
+ [ 0x4a , 0x4a , 5 ],
+ [ 0x40 , 0x40 , 6 ],
+ [ 0x24 , 0x24 , 7 ],
+ [ 0x44 , 0x44 , 8 ],
+ [ 0x49 , 0x49 , 9 ],
+ [ 0x01 , 0x1a , 0 ],
+ [ 0x1c , 0x7f , 0 ],
+ [ 0x1b , 0x1b , 1 ],
+ [ 0x00 , 0x00 , 2 ],
+ [ 0x80 , 0xff , 2 ]
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@iso2022jp_st = (
+# 0 1 2 3 4 5 6 7 8 9
+ 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, # Start State - 0
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 1, 5, 1, 1, 1, 4, 1, 1, # got ESC
+ 1, 1, 1, 6, 2, 1, 2, 1, 1, 1, # got ESC $
+ 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, # got ESC (
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, # got ESC $ (
+);
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$iso2022jp_ver = genverifier::GenVerifier("ISO2022JP", "ISO-2022-JP",
+ \@iso2022jp_cls, 10, \@iso2022jp_st);
+print $iso2022jp_ver;
+
+
+
diff --git a/intl/chardet/tools/geniso2022kr.pl b/intl/chardet/tools/geniso2022kr.pl
new file mode 100644
index 000000000..f56bcf9fb
--- /dev/null
+++ b/intl/chardet/tools/geniso2022kr.pl
@@ -0,0 +1,55 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@iso2022kr_cls);
+my(@iso2022kr_st);
+my($iso2022kr_ver);
+
+
+#
+# Class legend (class 0 is every other 7-bit byte):
+#
+# ESC - 1
+# > 0x80 - 2
+# $ - 3
+# ) - 4
+# C - 5
+#
+# Each row is [ first byte, last byte, class ]. genverifier's GetClass returns
+# the class of the FIRST matching row, so the single-character rows must come
+# before the wide 0x1c-0x7f row.
+@iso2022kr_cls = (
+ [ 0x01 , 0x1a , 0 ],
+ [ 0x24 , 0x24 , 3 ],
+ [ 0x29 , 0x29 , 4 ],
+ [ 0x43 , 0x43 , 5 ],
+ [ 0x1c , 0x7f , 0 ],
+ [ 0x1b , 0x1b , 1 ],
+ [ 0x00 , 0x00 , 2 ],
+ [ 0x80 , 0xff , 2 ]
+);
+
+
+#
+# ESC$)C
+#
+# State transitions: one row per state, one column per byte class. genverifier
+# renders 0 as eStart, 1 as eError and 2 as eItsMe; any other value is emitted
+# as a plain state number.
+#
+package genverifier;
+@iso2022kr_st = (
+# 0 1 2 3 4 5
+ 0, 3, 1, 0, 0, 0, # Start State - 0
+ 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 1, 4, 1, 1, # state 3 - got ESC
+ 1, 1, 1, 1, 5, 1, # state 4 - got ESC $
+ 1, 1, 1, 1, 1, 2, # state 5 - got ESC $ )
+);
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$iso2022kr_ver = genverifier::GenVerifier("ISO2022KR", "ISO-2022-KR",
+ \@iso2022kr_cls, 6, \@iso2022kr_st);
+print $iso2022kr_ver;
+
+
+
diff --git a/intl/chardet/tools/gensjis.pl b/intl/chardet/tools/gensjis.pl
new file mode 100644
index 000000000..20966d03e
--- /dev/null
+++ b/intl/chardet/tools/gensjis.pl
@@ -0,0 +1,46 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@sjis_cls);
+my(@sjis_st);
+my($sjis_ver);
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the FIRST matching row, so the
+# narrow rows (for example 0x85-0x86) take precedence over the wider rows
+# listed after them (0x80-0x9f).
+@sjis_cls = (
+ [ 0x00 , 0x00 , 0 ],
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0xfd , 0xff , 0 ],
+ [ 0x85 , 0x86 , 3 ],
+ [ 0xeb , 0xec , 5 ],
+ [ 0x01 , 0x1a , 1 ],
+ [ 0x1c , 0x3f , 1 ],
+ [ 0x7f , 0x7f , 1 ],
+ [ 0x40 , 0x7e , 2 ],
+ [ 0xa1 , 0xdf , 2 ],
+ [ 0x80 , 0x9f , 3 ],
+ [ 0xa0 , 0xa0 , 4 ],
+ [ 0xe0 , 0xea , 3 ],
+ [ 0xed , 0xfc , 4 ],
+);
+
+package genverifier;
+# State transitions: one row per state, one column per byte class (numbered
+# in the header comment). genverifier renders 0 as eStart, 1 as eError and
+# 2 as eItsMe; any other value is emitted as a plain state number.
+@sjis_st = (
+# 0 1 2 3 4 5
+ 1, 0, 0, 3, 1, 1, # Start State - 0
+ 1, 1, 1, 1, 1, 1, # Error State - 1
+ 2, 2, 2, 2, 2, 2, # ItsMe State - 2
+ 1, 1, 0, 0, 0, 0, # State - 3
+);
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$sjis_ver = genverifier::GenVerifier("SJIS", "Shift_JIS", \@sjis_cls, 6, \@sjis_st);
+print $sjis_ver;
+
+
+
diff --git a/intl/chardet/tools/genutf8.pl b/intl/chardet/tools/genutf8.pl
new file mode 100644
index 000000000..437dd535b
--- /dev/null
+++ b/intl/chardet/tools/genutf8.pl
@@ -0,0 +1,189 @@
+#!/usr/local/bin/perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+use strict;
+require "genverifier.pm";
+use genverifier;
+
+
+# Lexicals for the byte-class table, the state table and the generated text.
+my(@utf8_cls);
+my(@utf8_st);
+my($utf8_ver);
+
+#
+#
+# UTF8 encode the UCS4 into 1 to 4 bytes
+#
+# 1 byte 00 00 00 00 00 00 00 7f
+# 2 bytes 00 00 00 80 00 00 07 ff
+# 3 bytes 00 00 08 00 00 00 ff ff
+# 4 bytes 00 01 00 00 00 10 ff ff
+#
+# However, since Surrogate area should not be encoded into UTF8 as
+# a Surrogate pair, we can remove the surrogate area from UTF8
+#
+# 1 byte 00 00 00 00 00 00 00 7f
+# 2 bytes 00 00 00 80 00 00 07 ff
+# 3 bytes 00 00 08 00 00 00 d7 ff
+# 00 00 e0 00 00 00 ff ff
+# 4 bytes 00 01 00 00 00 10 ff ff
+#
+# Now we break them into 6 bits group for 2-4 bytes UTF8
+#
+# 1 byte 00 7f
+# 2 bytes 02 00 1f 3f
+# 3 bytes 00 20 00 0d 1f 3f
+# 0e 00 00 0f 3f 3f
+# 4 bytes 00 10 00 00 04 0f 3f 3f
+#
+# Break down more
+#
+# 1 byte 00 7f
+# 2 bytes 02 00 1f 3f
+# 3 bytes 00 20 00 00 3f 3f
+# 01 00 00 0c 3f 3f
+# 0d 00 00 0d 1f 3f
+# 0e 00 00 0f 3f 3f
+# 4 bytes 00 10 00 00 00 3f 3f 3f
+# 01 00 00 00 03 3f 3f 3f
+# 04 00 00 00 04 0f 3f 3f
+#
+# Now, add
+# c0 to the lead byte of 2 bytes UTF8
+# e0 to the lead byte of 3 bytes UTF8
+# f0 to the lead byte of 4 bytes UTF8
+# 80 to the trail bytes
+#
+# 1 byte 00 7f
+# 2 bytes c2 80 df bf
+# 3 bytes e0 a0 80 e0 bf bf
+# e1 80 80 ec bf bf
+# ed 80 80 ed 9f bf
+# ee 80 80 ef bf bf
+# 4 bytes f0 90 80 80 f0 bf bf bf
+# f1 80 80 80 f3 bf bf bf
+# f4 80 80 80 f4 8f bf bf
+#
+#
+# Now we can construct our state diagram
+#
+# 0:0x0e,0x0f,0x1b->Error
+# 0:[0-0x7f]->0
+# 0:[c2-df]->3
+# 0:e0->4
+# 0:[e1-ec, ee-ef]->5
+# 0:ed->6
+# 0:f0->7
+# 0:[f1-f3]->8
+# 0:f4->9
+# 0:*->Error
+# 3:[80-bf]->0
+# 3:*->Error
+# 4:[a0-bf]->3
+# 4:*->Error
+# 5:[80-bf]->3
+# 5:*->Error
+# 6:[80-9f]->3
+# 6:*->Error
+# 7:[90-bf]->5
+# 7:*->Error
+# 8:[80-bf]->5
+# 8:*->Error
+# 9:[80-8f]->5
+# 9:*->Error
+#
+# Now, we classified chars into class
+#
+# 0e,0f,1b:k0
+# 00-0d,10-1a,1c-7f:k1
+# 80-8f:k2
+# 90-9f:k3
+# a0-bf:k4
+# c0-c1:k0
+# c2-df:k5
+# e0:k6
+# e1-ec:k7
+# ed:k8
+# ee-ef:k7
+# f0:k9
+# f1-f3:k10
+# f4:k11
+# f5-ff:k0
+#
+# Now, let's put them into array form
+
+# Byte classification. Each row is [ first byte, last byte, class ].
+# genverifier's GetClass returns the class of the first row whose range
+# contains the byte; the ranges below do not overlap.
+@utf8_cls = (
+ [ 0x00 , 0x00 , 1 ],
+ [ 0x0e , 0x0f , 0 ],
+ [ 0x1b , 0x1b , 0 ],
+ [ 0x01 , 0x0d , 1 ],
+ [ 0x10 , 0x1a , 1 ],
+ [ 0x1c , 0x7f , 1 ],
+ [ 0x80 , 0x8f , 2 ],
+ [ 0x90 , 0x9f , 3 ],
+ [ 0xa0 , 0xbf , 4 ],
+ [ 0xc0 , 0xc1 , 0 ],
+ [ 0xc2 , 0xdf , 5 ],
+ [ 0xe0 , 0xe0 , 6 ],
+ [ 0xe1 , 0xec , 7 ],
+ [ 0xed , 0xed , 8 ],
+ [ 0xee , 0xef , 7 ],
+ [ 0xf0 , 0xf0 , 9 ],
+ [ 0xf1 , 0xf3 , 10 ],
+ [ 0xf4 , 0xf4 , 11 ],
+ [ 0xf5 , 0xff , 0 ],
+);
+#
+# Now, we write the state diagram in class
+#
+# 0:k0->Error
+# 0:k1->0
+# 0:k5->3
+# 0:k6->4
+# 0:k7->5
+# 0:k8->6
+# 0:k9->7
+# 0:k10->8
+# 0:k11->9
+# 0:*->Error
+# 3:k2,k3,k4->0
+# 3:*->Error
+# 4:k4->3
+# 4:*->Error
+# 5:k2,k3,k4->3
+# 5:*->Error
+# 6:k2,k3->3
+# 6:*->Error
+# 7:k3,k4->5
+# 7:*->Error
+# 8:k2,k3,k4->5
+# 8:*->Error
+# 9:k2->5
+# 9:*->Error
+#
+# Now, let's put them into array
+#
+package genverifier;
+# State transitions: one row per state, one column per byte class k0..k11.
+# genverifier renders 0 as eStart, 1 as eError and 2 as eItsMe; any other
+# value is emitted as a plain state number.
+#
+# State 7 follows lead byte f0, whose second byte must be 90-bf, i.e. class
+# k3 or k4. The row previously accepted k2 and k3 (80-9f), which let the
+# overlong form f0 80-8f through and rejected the valid f0 a0-bf.
+@utf8_st = (
+# 0 1 2 3 4 5 6 7 8 9 10 11
+ 1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe
+ 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3
+ 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4
+ 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5
+ 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6
+ 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7
+ 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8
+ 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9
+);
+
+
+
+# Arguments: table name prefix, charset name, class table, number of classes,
+# state table. The generated header text is printed on stdout.
+$utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st);
+print $utf8_ver;
+
+
+
diff --git a/intl/chardet/tools/genverifier.pm b/intl/chardet/tools/genverifier.pm
new file mode 100644
index 000000000..8ccfef4d6
--- /dev/null
+++ b/intl/chardet/tools/genverifier.pm
@@ -0,0 +1,175 @@
+#!/usr/local/bin/perl
+
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Shared code generator: turns a byte-class table and a state table into the
+# text of a C header that defines the packed tables and an SMModel.
+package genverifier;
+use strict;
+use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
+
+use Exporter;
+$VERSION = 1.00;
+@ISA = qw(Exporter);
+
+# GenVerifier is the only symbol exported by default.
+@EXPORT = qw(
+ GenVerifier
+ );
+@EXPORT_OK = qw();
+
+# Returns the editor mode line and the MPL 2.0 license block that opens every
+# generated header.
+sub GenNPL {
+  return << "END_MPL";
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+END_MPL
+}
+
+##--------------------------------------------------------------
+# GetClass($char, $clstbl)
+#   $char   - byte value to classify
+#   $clstbl - reference to an array of [ first, last, class ] rows
+# Returns the class of the first row whose inclusive range contains $char.
+# When no row matches, a warning is written to STDERR and the sub falls
+# through without an explicit return value.
+sub GetClass {
+  my($char, $clstbl) = @_;
+  my($l);
+  # Stop at the last real row. Indexing one position past the end would
+  # autovivify an empty row in the caller's table and compare $char against
+  # undefined bounds.
+  for($l = 0; $l < @$clstbl; $l++) {
+    if(($clstbl->[$l][0] <= $char) && ($char <= $clstbl->[$l][1]))
+    {
+      return $clstbl->[$l][2];
+    }
+  }
+  # The generator scripts print the generated header on STDOUT, so the
+  # diagnostic must not be mixed into that stream.
+  print STDERR "WARNING- there are no class for $char\n";
+};
+##--------------------------------------------------------------
+# Returns the package initializer that refers to the "<name>_cls" table.
+sub GenClassPkg {
+  my($prefix, $width) = @_;
+  my $suffix = "_cls";
+  return GenPkg($prefix, $width, $suffix);
+}
+##--------------------------------------------------------------
+# Returns the package initializer that refers to the "<name>_st" table.
+sub GenStatePkg {
+  my($prefix, $width) = @_;
+  my $suffix = "_st";
+  return GenPkg($prefix, $width, $suffix);
+};
+##--------------------------------------------------------------
+# GenPkg($name, $bits, $tbl)
+# Returns a brace-enclosed initializer listing the four "<constant><bits>bits"
+# names followed by the table identifier "<name><tbl>".
+sub GenPkg {
+  my($name, $bits, $tbl) = @_;
+  my @members = map { $_ . $bits . "bits" } qw(eIdxSft eSftMsk eBitSft eUnitMsk);
+  push @members, $name . $tbl;
+  return " {" . join(", ", @members) . " }";
+};
+##--------------------------------------------------------------
+# Gen4BitsClass($name, $clstbl)
+# Returns the C definition of "<name>_cls": 32 PCK4BITS(...) initializers,
+# each holding the classes (looked up with GetClass) of eight consecutive byte
+# values, followed by a comment giving that byte range in hex.
+sub Gen4BitsClass {
+  my($name, $clstbl) = @_;
+  my $text = "static const uint32_t " . $name . "_cls [ 256 / 8 ] = {\n";
+  for (my $base = 0; $base < 0x100; $base += 8) {
+    my @cells = map {
+      my $cls = &GetClass($_, $clstbl);
+      sprintf("%2d", $cls);
+    } ($base .. $base + 7);
+    # The last initializer of the array is not followed by a comma.
+    my $closer = ($base + 8 >= 0x100) ? ") " : "),";
+    $text .= "PCK4BITS(" . join(",", @cells) . $closer;
+    $text .= sprintf(" // %02x - %02x\n", $base, ($base + 7));
+  }
+  $text .= "};\n";
+  return $text;
+};
+##--------------------------------------------------------------
+# GenVerifier($name, $charset, $cls, $numcls, $st)
+#   $name    - prefix of the generated identifiers
+#   $charset - charset name stored in the model
+#   $cls     - reference to the byte-class table
+#   $numcls  - number of classes
+#   $st      - reference to the state table
+# Returns the full text of the generated header: license, note, include line,
+# the two packed tables and the "<name>SMModel" definition.
+sub GenVerifier {
+  my($name, $charset, $cls, $numcls, $st) = @_;
+  my @parts = (
+    GenNPL(),
+    GenNote(),
+    GenHeader(),
+    Gen4BitsClass($name, $cls),
+    "\n\n",
+    Gen4BitsState($name, $st),
+    "\n\n",
+    "const SMModel " . $name . "SMModel = {\n",
+    GenClassPkg($name, 4),
+    ",\n",
+    " " . $numcls,
+    ",\n",
+    GenStatePkg($name, 4),
+    ",\n",
+    " " . "CHAR_LEN_TABLE(" . $name . "CharLenTable),\n",
+    ' "' . $charset . '",' . "\n",
+    "};\n",
+  );
+  return join("", @parts);
+
+};
+##--------------------------------------------------------------
+# Gen4BitsState($name, $sttbl)
+# Returns the C definition of "<name>_st". The state table is emitted eight
+# entries per PCK4BITS(...) initializer; its length is rounded up to a multiple
+# of eight, and entries past the end of the table compare equal to 0 and are
+# therefore emitted as eStart. Values 0, 1 and 2 are written as eStart, eError
+# and eItsMe; every other value is written as a number.
+sub Gen4BitsState {
+  my($name, $sttbl) = @_;
+  my $padded = (((@$sttbl-1) >> 3) + 1) << 3;
+  my $text = "static const uint32_t " . $name . "_st [ " . ($padded >> 3) . "] = {\n";
+  for (my $base = 0; $base < $padded; $base += 8) {
+    my @cells = map {
+      my $state = $sttbl->[$_];
+      (0 == $state) ? "eStart"
+        : (1 == $state) ? "eError"
+        : (2 == $state) ? "eItsMe"
+        : sprintf(" %d", $state);
+    } ($base .. $base + 7);
+    # The last initializer of the array is not followed by a comma.
+    my $closer = ($base + 8 >= $padded) ? ") " : "),";
+    $text .= "PCK4BITS(" . join(",", @cells) . $closer;
+    $text .= sprintf(" // %02x - %02x\n", $base, ($base + 7));
+  }
+  $text .= "};\n";
+  return $text;
+};
+##--------------------------------------------------------------
+
+# Returns the "do not edit" banner placed in every generated header.
+sub GenNote {
+  return << "END_NOTE";
+/*
+ * DO NOT EDIT THIS DOCUMENT MANUALLY !!!
+ * THIS FILE IS AUTOMATICALLY GENERATED BY THE TOOLS UNDER
+ * mozilla/intl/chardet/tools/
+ * Please contact ftang\@netscape.com or mozilla-i18n\@mozilla.org
+ * if you have any question. Thanks
+ */
+END_NOTE
+}
+
+##--------------------------------------------------------------
+# Returns the #include line required by the generated tables.
+sub GenHeader {
+  return << "END_HEADER";
+#include "nsVerifier.h"
+END_HEADER
+}
+##--------------------------------------------------------------
+1; # this should be the last line