diff options
Diffstat (limited to 'intl/chardet')
30 files changed, 2223 insertions, 0 deletions
diff --git a/intl/chardet/moz.build b/intl/chardet/moz.build new file mode 100644 index 000000000..4d66274e5 --- /dev/null +++ b/intl/chardet/moz.build @@ -0,0 +1,19 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +EXPORTS += [ + 'nsDetectionConfident.h', + 'nsICharsetDetectionObserver.h', + 'nsICharsetDetector.h', + 'nsIStringCharsetDetector.h', +] + +UNIFIED_SOURCES += [ + 'nsChardetModule.cpp', + 'nsCyrillicDetector.cpp', +] + +FINAL_LIBRARY = 'xul' diff --git a/intl/chardet/nsCharDetConstructors.h b/intl/chardet/nsCharDetConstructors.h new file mode 100644 index 000000000..caff08976 --- /dev/null +++ b/intl/chardet/nsCharDetConstructors.h @@ -0,0 +1,26 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +/* + * Header file to be included by module - + * warning: defines a whole bunch of static functions + */ + +#ifndef nsCharDetConstructors_h__ +#define nsCharDetConstructors_h__ + +// chardet +#include "nsISupports.h" +#include "nsICharsetDetector.h" +#include "nsICharsetDetectionObserver.h" +#include "nsIStringCharsetDetector.h" +#include "nsCyrillicDetector.h" + +NS_GENERIC_FACTORY_CONSTRUCTOR(nsRUProbDetector) +NS_GENERIC_FACTORY_CONSTRUCTOR(nsUKProbDetector) +NS_GENERIC_FACTORY_CONSTRUCTOR(nsRUStringProbDetector) +NS_GENERIC_FACTORY_CONSTRUCTOR(nsUKStringProbDetector) + +#endif diff --git a/intl/chardet/nsChardetModule.cpp b/intl/chardet/nsChardetModule.cpp new file mode 100644 index 000000000..77dd4ecfe --- /dev/null +++ b/intl/chardet/nsChardetModule.cpp @@ -0,0 +1,45 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/ModuleUtils.h" + +#include "nsCharDetConstructors.h" + +NS_DEFINE_NAMED_CID(NS_RU_PROBDETECTOR_CID); +NS_DEFINE_NAMED_CID(NS_UK_PROBDETECTOR_CID); +NS_DEFINE_NAMED_CID(NS_RU_STRING_PROBDETECTOR_CID); +NS_DEFINE_NAMED_CID(NS_UK_STRING_PROBDETECTOR_CID); + +static const mozilla::Module::CIDEntry kChardetCIDs[] = { + { &kNS_RU_PROBDETECTOR_CID, false, nullptr, nsRUProbDetectorConstructor }, + { &kNS_UK_PROBDETECTOR_CID, false, nullptr, nsUKProbDetectorConstructor }, + { &kNS_RU_STRING_PROBDETECTOR_CID, false, nullptr, nsRUStringProbDetectorConstructor }, + { &kNS_UK_STRING_PROBDETECTOR_CID, false, nullptr, nsUKStringProbDetectorConstructor }, + { nullptr } +}; + +static const mozilla::Module::ContractIDEntry kChardetContracts[] = { + { NS_CHARSET_DETECTOR_CONTRACTID_BASE "ruprob", &kNS_RU_PROBDETECTOR_CID }, + { NS_CHARSET_DETECTOR_CONTRACTID_BASE "ukprob", &kNS_UK_PROBDETECTOR_CID }, + { NS_STRCDETECTOR_CONTRACTID_BASE "ruprob", &kNS_RU_STRING_PROBDETECTOR_CID }, + { NS_STRCDETECTOR_CONTRACTID_BASE "ukprob", &kNS_UK_STRING_PROBDETECTOR_CID }, + { nullptr } +}; + +static const mozilla::Module::CategoryEntry kChardetCategories[] = { + { NS_CHARSET_DETECTOR_CATEGORY, "off", "off" }, + { NS_CHARSET_DETECTOR_CATEGORY, "ruprob", NS_CHARSET_DETECTOR_CONTRACTID_BASE "ruprob" }, + { NS_CHARSET_DETECTOR_CATEGORY, "ukprob", NS_CHARSET_DETECTOR_CONTRACTID_BASE "ukprob" }, + { nullptr } +}; + +static const mozilla::Module kChardetModule = { + mozilla::Module::kVersion, + kChardetCIDs, + kChardetContracts, + kChardetCategories +}; + +NSMODULE_DEFN(nsChardetModule) = &kChardetModule; diff --git a/intl/chardet/nsCyrillicClass.h b/intl/chardet/nsCyrillicClass.h new file mode 100644 index 000000000..c03ac1c43 --- /dev/null +++ b/intl/chardet/nsCyrillicClass.h @@ -0,0 +1,60 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsCyrillicClass_h__ +#define nsCyrillicClass_h__ +/* PLEASE DO NOT EDIT THIS FILE DIRECTLY. THIS FILE IS GENERATED BY + GenCyrllicClass found in mozilla/intl/chardet/tools + */ +static const uint8_t KOI8Map [128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +}; +static const uint8_t CP1251Map [128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, + 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, +}; +static const uint8_t IBM866Map [128] = { + 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, + 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +static const uint8_t ISO88595Map [128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 3, 24, 8, 5, 6, 23, 27, 
10, 11, 12, 13, 14, 15, 16, 17, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, + 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +static const uint8_t MacCyrillicMap [128] = { + 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, + 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, + 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 0, +}; +#endif diff --git a/intl/chardet/nsCyrillicDetector.cpp b/intl/chardet/nsCyrillicDetector.cpp new file mode 100644 index 000000000..feebeed65 --- /dev/null +++ b/intl/chardet/nsCyrillicDetector.cpp @@ -0,0 +1,160 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "nscore.h" +#include "nsCyrillicProb.h" +#include <stdio.h> + +#include "nsCOMPtr.h" +#include "nsISupports.h" +#include "nsICharsetDetector.h" +#include "nsICharsetDetectionObserver.h" +#include "nsIStringCharsetDetector.h" +#include "nsCyrillicDetector.h" + +//---------------------------------------------------------------------- +// Interface nsISupports [implementation] +NS_IMPL_ISUPPORTS(nsCyrXPCOMDetector, nsICharsetDetector) +NS_IMPL_ISUPPORTS(nsCyrXPCOMStringDetector, nsIStringCharsetDetector) + +void nsCyrillicDetector::HandleData(const char* aBuf, uint32_t aLen) +{ + uint8_t cls; + const char* b; + uint32_t i; + if(mDone) + return; + for(i=0, b=aBuf;i<aLen;i++,b++) + { + for(unsigned j=0;j<mItems;j++) + { + if( 0x80 & *b) + cls = mCyrillicClass[j][(*b) & 0x7F]; + else + cls = 0; + NS_ASSERTION( cls <= 32 , "illegal character class"); + mProb[j] += gCyrillicProb[mLastCls[j]][cls]; + mLastCls[j] = cls; + } + } + // We now only based on the first block we receive + DataEnd(); +} + +//--------------------------------------------------------------------- +#define THRESHOLD_RATIO 1.5f +void nsCyrillicDetector::DataEnd() +{ + uint32_t max=0; + uint8_t maxIdx=0; + uint8_t j; + if(mDone) + return; + for(j=0;j<mItems;j++) { + if(mProb[j] > max) + { + max = mProb[j]; + maxIdx= j; + } + } + + if( 0 == max ) // if we didn't get any 8 bits data + return; + +#ifdef DEBUG + for(j=0;j<mItems;j++) + printf("Charset %s->\t%d\n", mCharsets[j], mProb[j]); +#endif + this->Report(mCharsets[maxIdx]); + mDone = true; +} + +//--------------------------------------------------------------------- +nsCyrXPCOMDetector:: nsCyrXPCOMDetector(uint8_t aItems, + const uint8_t ** aCyrillicClass, + const char **aCharsets) + : nsCyrillicDetector(aItems, aCyrillicClass, aCharsets) +{ + mObserver = nullptr; +} + +//--------------------------------------------------------------------- +nsCyrXPCOMDetector::~nsCyrXPCOMDetector() +{ +} + 
+//--------------------------------------------------------------------- +NS_IMETHODIMP nsCyrXPCOMDetector::Init( + nsICharsetDetectionObserver* aObserver) +{ + NS_ASSERTION(mObserver == nullptr , "Init twice"); + if(nullptr == aObserver) + return NS_ERROR_ILLEGAL_VALUE; + + mObserver = aObserver; + return NS_OK; +} + +//---------------------------------------------------------- +NS_IMETHODIMP nsCyrXPCOMDetector::DoIt( + const char* aBuf, uint32_t aLen, bool* oDontFeedMe) +{ + NS_ASSERTION(mObserver != nullptr , "have not init yet"); + + if((nullptr == aBuf) || (nullptr == oDontFeedMe)) + return NS_ERROR_ILLEGAL_VALUE; + + this->HandleData(aBuf, aLen); + *oDontFeedMe = false; + return NS_OK; +} + +//---------------------------------------------------------- +NS_IMETHODIMP nsCyrXPCOMDetector::Done() +{ + NS_ASSERTION(mObserver != nullptr , "have not init yet"); + this->DataEnd(); + return NS_OK; +} + +//---------------------------------------------------------- +void nsCyrXPCOMDetector::Report(const char* aCharset) +{ + NS_ASSERTION(mObserver != nullptr , "have not init yet"); + mObserver->Notify(aCharset, eBestAnswer); +} + +//--------------------------------------------------------------------- +nsCyrXPCOMStringDetector:: nsCyrXPCOMStringDetector(uint8_t aItems, + const uint8_t ** aCyrillicClass, + const char **aCharsets) + : nsCyrillicDetector(aItems, aCyrillicClass, aCharsets) +{ +} + +//--------------------------------------------------------------------- +nsCyrXPCOMStringDetector::~nsCyrXPCOMStringDetector() +{ +} + +//--------------------------------------------------------------------- +void nsCyrXPCOMStringDetector::Report(const char *aCharset) +{ + mResult = aCharset; +} + +//--------------------------------------------------------------------- +NS_IMETHODIMP nsCyrXPCOMStringDetector::DoIt(const char* aBuf, uint32_t aLen, + const char** oCharset, nsDetectionConfident &oConf) +{ + mResult = nullptr; + mDone = false; + this->HandleData(aBuf, aLen); + 
this->DataEnd(); + *oCharset=mResult; + oConf = eBestAnswer; + return NS_OK; +} + + diff --git a/intl/chardet/nsCyrillicDetector.h b/intl/chardet/nsCyrillicDetector.h new file mode 100644 index 000000000..014db9ec7 --- /dev/null +++ b/intl/chardet/nsCyrillicDetector.h @@ -0,0 +1,153 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsCyrillicDetector_h__ +#define nsCyrillicDetector_h__ + +#include "nsCyrillicClass.h" + + + + +// {2002F781-3960-11d3-B3C3-00805F8A6670} +#define NS_RU_PROBDETECTOR_CID \ +{ 0x2002f781, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + + +// {2002F782-3960-11d3-B3C3-00805F8A6670} +#define NS_UK_PROBDETECTOR_CID \ +{ 0x2002f782, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + +// {2002F783-3960-11d3-B3C3-00805F8A6670} +#define NS_RU_STRING_PROBDETECTOR_CID \ +{ 0x2002f783, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + +// {2002F784-3960-11d3-B3C3-00805F8A6670} +#define NS_UK_STRING_PROBDETECTOR_CID \ +{ 0x2002f784, 0x3960, 0x11d3, { 0xb3, 0xc3, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + +static const uint8_t *gCyrillicCls[5] = +{ + CP1251Map, + KOI8Map, + ISO88595Map, + MacCyrillicMap, + IBM866Map +}; + +static const char * gRussian[5] = { + "windows-1251", + "KOI8-R", + "ISO-8859-5", + "x-mac-cyrillic", + "IBM866" +}; + +static const char * gUkrainian[5] = { + "windows-1251", + "KOI8-U", + "ISO-8859-5", + "x-mac-cyrillic", + "IBM866" +}; + +#define NUM_CYR_CHARSET 5 + +class nsCyrillicDetector +{ + public: + nsCyrillicDetector(uint8_t aItems, + const uint8_t ** aCyrillicClass, + const char **aCharsets) { + mItems = aItems; + mCyrillicClass = aCyrillicClass; + mCharsets = aCharsets; + for(unsigned i=0;i<mItems;i++) + mProb[i] = 
mLastCls[i] =0; + mDone = false; + } + virtual ~nsCyrillicDetector() {} + virtual void HandleData(const char* aBuf, uint32_t aLen); + virtual void DataEnd(); + protected: + virtual void Report(const char* aCharset) = 0; + bool mDone; + + private: + uint8_t mItems; + const uint8_t ** mCyrillicClass; + const char** mCharsets; + uint32_t mProb[NUM_CYR_CHARSET]; + uint8_t mLastCls[NUM_CYR_CHARSET]; +}; + +class nsCyrXPCOMDetector : + public nsCyrillicDetector, + public nsICharsetDetector +{ + public: + // nsISupports interface + NS_DECL_ISUPPORTS + nsCyrXPCOMDetector(uint8_t aItems, + const uint8_t ** aCyrillicClass, + const char **aCharsets); + NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver) override; + NS_IMETHOD DoIt(const char* aBuf, uint32_t aLen, bool *oDontFeedMe) override; + NS_IMETHOD Done() override; + protected: + virtual ~nsCyrXPCOMDetector(); + virtual void Report(const char* aCharset) override; + private: + nsCOMPtr<nsICharsetDetectionObserver> mObserver; +}; + +class nsCyrXPCOMStringDetector : + public nsCyrillicDetector, + public nsIStringCharsetDetector +{ + public: + // nsISupports interface + NS_DECL_ISUPPORTS + nsCyrXPCOMStringDetector(uint8_t aItems, + const uint8_t ** aCyrillicClass, + const char **aCharsets); + NS_IMETHOD DoIt(const char* aBuf, uint32_t aLen, + const char** oCharset, nsDetectionConfident &oConf) override; + protected: + virtual ~nsCyrXPCOMStringDetector(); + virtual void Report(const char* aCharset) override; + private: + nsCOMPtr<nsICharsetDetectionObserver> mObserver; + const char* mResult; +}; + +class nsRUProbDetector : public nsCyrXPCOMDetector +{ + public: + nsRUProbDetector() + : nsCyrXPCOMDetector(5, gCyrillicCls, gRussian) {} +}; + +class nsRUStringProbDetector : public nsCyrXPCOMStringDetector +{ + public: + nsRUStringProbDetector() + : nsCyrXPCOMStringDetector(5, gCyrillicCls, gRussian) {} +}; + +class nsUKProbDetector : public nsCyrXPCOMDetector +{ + public: + nsUKProbDetector() + : nsCyrXPCOMDetector(5, 
gCyrillicCls, gUkrainian) {} +}; + +class nsUKStringProbDetector : public nsCyrXPCOMStringDetector +{ + public: + nsUKStringProbDetector() + : nsCyrXPCOMStringDetector(5, gCyrillicCls, gUkrainian) {} +}; + +#endif diff --git a/intl/chardet/nsCyrillicProb.h b/intl/chardet/nsCyrillicProb.h new file mode 100644 index 000000000..c84c09f95 --- /dev/null +++ b/intl/chardet/nsCyrillicProb.h @@ -0,0 +1,282 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsCyrillicProb_h___h__ +#define nsCyrillicProb_h___h__ +/* + DO NOT EDIT THIS FILE !!! + This file is generated by the perl script in + mozilla/intl/chardet/tools/gencyrillic.pl + + To ues that script, you need to grab StatKoi.pm file from + the "Cyrillic Software Suite" written by John Neystdt. + http://www.neystadt.org/cyrillic (You can also find it from CPAN) + */ +const uint16_t gCyrillicProb[33][33] = {{ +0, +0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +1, 0, 62, 8, 237, 0, 0, 0, +0, 0, 0, 0, 2, 0, 1, 0, +0, 0, 50, 9, 1342, 0, 5, 10, +0, 0, 16, 2, 0, 2041, 505, 0, + +}, +{ +0, +1197, 0, 891, 3797, 594, 2064, 112, 646, +1039, 166, 152, 3162, 10935, 3465, 10268, 5, +277, 1744, 3706, 5043, 8884, 79, 716, 4563, +0, 0, 3090, 205, 9, 591, 1515, 0, + +}, +{ +0, +206, 1117, 0, 0, 0, 652, 0, 0, +92, 194, 0, 4, 924, 25, 204, 2334, +2, 836, 832, 403, 0, 365, 63, 1, +0, 1257, 5, 9, 0, 358, 0, 629, + +}, +{ +0, +0, 935, 0, 0, 0, 1695, 0, 0, +0, 5193, 0, 5, 1, 1, 0, 461, +0, 0, 0, 0, 0, 216, 0, 9, +0, 47, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +0, 4049, 20, 22, 27, 8713, 0, 49, +0, 1530, 0, 660, 1182, 138, 1459, 5347, +1488, 344, 741, 1738, 63, 1460, 206, 242, +19, 743, 26, 51, 0, 0, 33, 90, + +}, +{ +0, +141, 635, 516, 183, 
8332, 911, 108, 2694, +255, 76, 2958, 2366, 8125, 3209, 19276, 285, +346, 483, 6823, 5705, 6596, 45, 1286, 525, +0, 0, 1093, 414, 15, 286, 767, 0, + +}, +{ +0, +0, 272, 0, 0, 0, 376, 50, 0, +0, 803, 0, 0, 15, 2, 28, 591, +0, 0, 6, 2, 24, 19, 0, 0, +7, 31, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +0, 4191, 0, 0, 68, 162, 0, 0, +0, 1248, 0, 8, 369, 0, 12, 15161, +0, 0, 678, 0, 2, 337, 0, 0, +0, 0, 0, 19, 0, 0, 11, 0, + +}, +{ +0, +0, 102, 0, 0, 0, 5, 0, 15, +0, 27, 0, 6, 2, 1, 92, 2227, +0, 0, 101, 161, 7, 15, 0, 2, +0, 0, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +1245, 609, 755, 2134, 1161, 4628, 120, 151, +2180, 5903, 3242, 2804, 3261, 4656, 3708, 1658, +104, 7815, 882, 3354, 3398, 16, 169, 1769, +0, 0, 5064, 96, 0, 48, 1628, 0, + +}, +{ +0, +0, 0, 0, 0, 1, 3, 3, 0, +0, 0, 0, 6, 0, 12, 96, 67, +1, 0, 0, 2066, 11, 0, 0, 0, +0, 0, 0, 20, 0, 0, 0, 0, + +}, +{ +0, +0, 4402, 0, 677, 0, 782, 0, 2, +0, 2724, 0, 10, 876, 0, 35, 6609, +0, 0, 651, 1323, 1558, 1049, 416, 225, +0, 0, 2, 13, 0, 0, 0, 0, + +}, +{ +0, +741, 5440, 0, 0, 1, 6066, 0, 89, +0, 9040, 0, 153, 97, 4, 949, 9899, +0, 2830, 0, 8, 16, 2139, 434, 0, +7487, 157, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +0, 2073, 13, 0, 0, 4818, 0, 0, +0, 3684, 0, 30, 89, 1094, 204, 4078, +119, 61, 1, 68, 0, 1684, 0, 68, +10, 1424, 0, 0, 0, 14, 6, 0, + +}, +{ +0, +18, 16528, 0, 176, 474, 5075, 174, 31, +0, 14151, 0, 840, 0, 0, 8956, 14457, +0, 911, 0, 1150, 1893, 711, 8, 199, +271, 9281, 192, 0, 0, 2, 84, 0, + +}, +{ +0, +23, 27, 4868, 799, 7820, 1391, 145, 13562, +909, 1551, 5834, 1881, 4400, 6329, 2878, 1911, +3632, 2374, 7308, 8626, 6679, 161, 2573, 15172, +0, 0, 1322, 778, 34, 129, 944, 0, + +}, +{ +0, +0, 671, 0, 12, 0, 2500, 1, 0, +0, 409, 0, 26, 3612, 0, 38, 8786, +268, 87, 13327, 13, 15, 471, 0, 0, +7, 266, 0, 0, 0, 0, 2, 0, + +}, +{ +0, +847, 0, 3, 184, 878, 1070, 0, 19, +482, 0, 90, 18, 26, 765, 151, 0, +0, 18, 20, 81, 2587, 0, 51, 766, +0, 0, 1224, 0, 0, 2209, 20, 0, + +}, +{ +0, +2, 10059, 62, 17, 21, 11067, 6, 2653, +30, 7582, 0, 122, 14, 638, 
490, 6767, +9, 1045, 431, 1139, 683, 2482, 326, 496, +156, 938, 0, 254, 0, 0, 30, 0, + +}, +{ +0, +17, 1493, 218, 3, 213, 633, 26, 3, +590, 2176, 0, 3716, 3732, 938, 693, 4388, +1639, 4197, 1185, 2118, 21815, 2792, 0, 1033, +154, 239, 0, 25, 0, 0, 522, 3, + +}, +{ +0, +0, 9785, 0, 27, 197, 8202, 0, 12, +24, 5253, 0, 433, 12, 53, 2577, 9712, +25, 122, 3392, 4966, 4, 836, 0, 8956, +4693, 1483, 5, 3, 0, 0, 270, 3, + +}, +{ +0, +1930, 104, 260, 18, 1452, 325, 6, 1192, +51, 6, 0, 1098, 301, 1778, 398, 0, +2263, 7, 254, 2808, 452, 0, 743, 140, +0, 0, 45, 559, 0, 1336, 2289, 0, + +}, +{ +0, +0, 796, 390, 0, 1303, 3459, 1, 11, +0, 632, 0, 37, 0, 0, 620, 0, +15, 0, 1, 0, 0, 25, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +0, 7418, 0, 51, 10, 5465, 0, 1, +51, 2962, 0, 999, 3853, 82, 1048, 7277, +241, 370, 394, 280, 286, 1126, 0, 183, +24, 3182, 197, 286, 0, 28, 0, 4, + +}, +{ +0, +395, 0, 6, 22, 0, 496, 9, 113, +0, 700, 0, 171, 0, 78, 3296, 0, +0, 1501, 0, 1379, 193, 0, 0, 0, +0, 0, 487, 165, 0, 1633, 30, 0, + +}, +{ +0, +0, 0, 36, 0, 272, 2847, 0, 27, +4998, 1, 1192, 33, 224, 2657, 219, 0, +363, 29, 273, 205, 503, 0, 0, 400, +0, 0, 38, 255, 0, 0, 305, 0, + +}, +{ +0, +0, 7005, 32, 32, 869, 400, 0, 37, +0, 999, 0, 46, 204, 739, 1570, 1076, +0, 112, 89, 0, 1, 430, 1, 1191, +3, 368, 0, 0, 0, 0, 2, 77, + +}, +{ +0, +0, 200, 0, 0, 0, 2054, 0, 0, +0, 397, 0, 19, 438, 0, 108, 0, +0, 0, 4, 0, 112, 3, 0, 0, +4, 0, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +0, 0, 0, 0, 0, 0, 29, 0, +0, 0, 0, 311, 16, 19, 11, 0, +2, 0, 10, 3, 1382, 0, 0, 10, +0, 0, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +0, 297, 0, 0, 0, 4290, 0, 0, +0, 3968, 0, 0, 0, 0, 33, 0, +0, 0, 1, 0, 0, 70, 0, 0, +15, 0, 0, 0, 0, 0, 0, 0, + +}, +{ +0, +0, 2304, 0, 0, 0, 4731, 0, 0, +0, 1873, 0, 198, 33, 0, 921, 0, +0, 0, 191, 0, 114, 134, 0, 2, +12, 0, 0, 7, 0, 0, 0, 0, + +}, +{ +0, +0, 0, 0, 0, 0, 599, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, +0, 207, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, + +}, +}; +#endif diff --git 
a/intl/chardet/nsDetectionConfident.h b/intl/chardet/nsDetectionConfident.h new file mode 100644 index 000000000..c1eb6e17c --- /dev/null +++ b/intl/chardet/nsDetectionConfident.h @@ -0,0 +1,43 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsDetetctionConfident_h__ +#define nsDetetctionConfident_h__ + +/* + This type is used to indicate how confident the detection module is about + the returned result. + + eNoAnswerYet is used to indicate that the detector has not found an + answer yet based on the data it has received. + eBestAnswer is used to indicate that the answer the detector returned + is the best one within the knowledge of the detector. + In other words, the tests for all other candidates failed. + + For example, the (Shift_JIS/EUC-JP/ISO-2022-JP) detection + module may return this with the answer "Shift_JIS" if it receives + bytes > 0x80 (which make the ISO-2022-JP test fail) and byte + 0x82 (which makes the EUC-JP test fail) + + eSureAnswer is used to indicate that the detector is 100% sure about the + answer. + Example 1: the Shift_JIS/ISO-2022-JP/EUC-JP detector returns + this with ISO-2022-JP when it hits one of the following ESC sequences: + ESC ( J + ESC $ @ + ESC $ B + Example 2: a detector which can detect UCS2 returns this with UCS2 + when the first 2 bytes are a BOM. 
+ Example 3: the Korean detector returns this with ISO-2022-KR when it + hits ESC $ ) C + + eNoAnswerMatch is not described by the original documentation; + presumably it means that no candidate charset matched (TODO confirm). + + */ +typedef enum { + eNoAnswerYet = 0, + eBestAnswer, + eSureAnswer, + eNoAnswerMatch +} nsDetectionConfident; + +#endif /* nsDetetctionConfident_h__ */ diff --git a/intl/chardet/nsICharsetDetectionObserver.h b/intl/chardet/nsICharsetDetectionObserver.h new file mode 100644 index 000000000..1877e2ba5 --- /dev/null +++ b/intl/chardet/nsICharsetDetectionObserver.h @@ -0,0 +1,28 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsICDETObserver_h__ +#define nsICDETObserver_h__ + +#include "nsISupports.h" +#include "nsDetectionConfident.h" + +// {12BB8F12-2389-11d3-B3BF-00805F8A6670} +#define NS_ICHARSETDETECTIONOBSERVER_IID \ +{ 0x12bb8f12, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + +/* + Used by an nsICharsetDetector to report its answer: Notify receives the + charset name together with the confidence of that answer. + */ +class nsICharsetDetectionObserver : public nsISupports { +public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICHARSETDETECTIONOBSERVER_IID) + NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsICharsetDetectionObserver, + NS_ICHARSETDETECTIONOBSERVER_IID) + +#endif /* nsICDETObserver_h__ */ diff --git a/intl/chardet/nsICharsetDetector.h b/intl/chardet/nsICharsetDetector.h new file mode 100644 index 000000000..2215fa0f0 --- /dev/null +++ b/intl/chardet/nsICharsetDetector.h @@ -0,0 +1,51 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef nsICharsetDetector_h__ +#define nsICharsetDetector_h__ + +#include "nsISupports.h" + +class nsICharsetDetectionObserver; + +// {12BB8F14-2389-11d3-B3BF-00805F8A6670} +#define NS_ICHARSETDETECTOR_IID \ +{ 0x12bb8f14, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + +#define NS_CHARSET_DETECTOR_CONTRACTID_BASE "@mozilla.org/intl/charsetdetect;1?type=" +#define NS_CHARSET_DETECTOR_CATEGORY "charset-detectors" + +class nsICharsetDetector : public nsISupports { +public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICHARSETDETECTOR_IID) + + /* + Set up the observer so the detector knows how to report its answer. + */ + NS_IMETHOD Init(nsICharsetDetectionObserver* observer) = 0; + + /* + Feed a block of bytes to the detector. + It will call the Notify function of the nsICharsetDetectionObserver if it + finds the answer. + aBytesArray - array of bytes + aLen - length of aBytesArray + oDontFeedMe - set to true if the detector does not need the following + blocks, false if it needs more bytes. + This is used to enhance performance. + */ + NS_IMETHOD DoIt(const char* aBytesArray, uint32_t aLen, bool* oDontFeedMe) = 0; + + /* + Tells the detector that this is its last chance to make a decision. + */ + NS_IMETHOD Done() = 0; + +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsICharsetDetector, + NS_ICHARSETDETECTOR_IID) + +#endif /* nsICharsetDetector_h__ */ diff --git a/intl/chardet/nsIStringCharsetDetector.h b/intl/chardet/nsIStringCharsetDetector.h new file mode 100644 index 000000000..9abd85df5 --- /dev/null +++ b/intl/chardet/nsIStringCharsetDetector.h @@ -0,0 +1,44 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef nsIStringCharsetDetector_h__ +#define nsIStringCharsetDetector_h__ + +#include "nsISupports.h" +#include "nsDetectionConfident.h" + +// {12BB8F15-2389-11d3-B3BF-00805F8A6670} +#define NS_ISTRINGCHARSETDETECTOR_IID \ +{ 0x12bb8f15, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + + +#define NS_STRCDETECTOR_CONTRACTID_BASE "@mozilla.org/intl/stringcharsetdetect;1?type=" + +/* + This interface is similar to nsICharsetDetector. + The difference is that it is for line-based detection instead of + block-based detection: the answer is returned directly from DoIt + rather than through an observer. + */ + + +class nsIStringCharsetDetector : public nsISupports { +public: + + NS_DECLARE_STATIC_IID_ACCESSOR(NS_ISTRINGCHARSETDETECTOR_IID) + /* + Perform the charset detection. + + aBytesArray - the bytes + aLen - the length of the bytes + oCharset - the charset answer + oConfident - the confidence of the answer + */ + NS_IMETHOD DoIt(const char* aBytesArray, uint32_t aLen, + const char** oCharset, nsDetectionConfident &oConfident) = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIStringCharsetDetector, + NS_ISTRINGCHARSETDETECTOR_IID) + +#endif /* nsIStringCharsetDetector_h__ */ diff --git a/intl/chardet/tools/GenCyrillicClass.cpp b/intl/chardet/tools/GenCyrillicClass.cpp new file mode 100644 index 000000000..180651a49 --- /dev/null +++ b/intl/chardet/tools/GenCyrillicClass.cpp @@ -0,0 +1,135 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "nsICharsetConverterManager.h" +#include <iostream.h> +#include "nsISupports.h" +#include "nsIComponentManager.h" +#include "nsIServiceManager.h" +#include "nsIUnicodeDecoder.h" +#include "nsIUnicodeEncoder.h" +#include "nsCRT.h" +#include <stdio.h> +#include <stdlib.h> +#if defined(XP_WIN) +#include <io.h> +#endif +#ifdef XP_UNIX +#include <unistd.h> +#endif + +//--------------------------------------------------------------------------- +void header() +{ +char *header= +"#ifndef nsCyrillicClass_h__\n" +"#define nsCyrillicClass_h__\n" +"/* PLEASE DO NOT EDIT THIS FILE DIRECTLY. THIS FILE IS GENERATED BY \n" +" GenCyrllicClass found in mozilla/intl/chardet/tools\n" +" */\n"; + printf(header); +} +//--------------------------------------------------------------------------- +void footer() +{ + printf("#endif\n"); +} +//--------------------------------------------------------------------------- +void npl() +{ +char *npl= +"/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */\n" +"/* This Source Code Form is subject to the terms of the Mozilla Public\n" +" * License, v. 2.0. If a copy of the MPL was not distributed with this\n" +" * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/\n"; + printf(npl); +} +//--------------------------------------------------------------------------- +static nsIUnicodeEncoder* gKOI8REncoder = nullptr; +static nsICharsetConverterManager* gCCM = nullptr; + +//--------------------------------------------------------------------------- +uint8_t CyrillicClass(nsIUnicodeDecoder* decoder, uint8_t byte) +{ + char16_t ubuf[2]; + uint8_t bbuf[2]; + + int32_t blen = 1; + int32_t ulen = 1; + nsresult res = decoder->Convert((char*)&byte, &blen, ubuf, &ulen); + if(NS_SUCCEEDED(res) && (1 == ulen )) + { + ubuf[0] = nsCRT::ToUpper(ubuf[0]); + blen=1; + res = gKOI8REncoder->Convert(ubuf,&ulen,(char*)bbuf,&blen); + if(NS_SUCCEEDED(res) && (1 == blen)) + { + if(0xe0 <= bbuf[0]) + { + return bbuf[0] - (uint8_t)0xdf; + } + } + } + return 0; +} +//--------------------------------------------------------------------------- +void genCyrillicClass(const char* name, const char* charset) +{ + nsIUnicodeDecoder *decoder = nullptr; + nsresult res = NS_OK; + nsAutoString str(charset); + res = gCCM->GetUnicodeDecoder(&str, &decoder); + if(NS_FAILED(res)) + { + printf("cannot locate %s Decoder\n", charset); + return; + } + printf("static const uint8_t %sMap [128] = {\n",name); + uint8_t i,j; + for(i=0x80;i!=0x00;i+=0x10) + { + for(j=0;j<=0x0f;j++) + { + uint8_t cls = CyrillicClass(decoder, i+j); + printf(" %2d, ",cls); + } + printf("\n"); + } + printf("};\n"); + NS_IF_RELEASE(decoder); +} +//--------------------------------------------------------------------------- + + +int main(int argc, char** argv) { + nsresult res = nullptr; + + nsCOMPtr<nsICharsetConverterManager> gCCM = do_GetService(kCharsetConverterManagerCID, &res); + + if(NS_FAILED(res) && (nullptr != gCCM)) + { + printf("cannot locate CharsetConverterManager\n"); + return(-1); + } + nsAutoString koi8r("KOI8-R"); + res = gCCM->GetUnicodeEncoder(&koi8r,&gKOI8REncoder); + if(NS_FAILED(res) && (nullptr != gKOI8REncoder)) + { + printf("cannot locate KOI8-R Encoder\n"); + return(-1); 
+ } + + + npl(); + header(); + + genCyrillicClass("KOI8", "KOI8-R"); + genCyrillicClass("CP1251", "windows-1251"); + genCyrillicClass("IBM866", "IBM866"); + genCyrillicClass("ISO88595", "ISO-8859-5"); + genCyrillicClass("MacCyrillic", "x-mac-cyrillic"); + footer(); + NS_IF_RELEASE(gKOI8REncoder); + return(0); +}; diff --git a/intl/chardet/tools/charfreq.pl b/intl/chardet/tools/charfreq.pl new file mode 100644 index 000000000..4232d4765 --- /dev/null +++ b/intl/chardet/tools/charfreq.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl +#!/usr/bin/perl +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +open (STAT,$ARGV[0]) || die " cannot open data file $ARGV[0]\n"; +@count; +while(<STAT>) +{ + @k = split(/\s+/, $_); + $count{$k[0]} = $k[1]; +} +$count = 0; +while(<STDIN>) +{ + @ck = split /\s*/, $_; + $s = 0; + $fb = 0; + $cl = $#ck; + $j = 0; + while($j < $cl) { + $cc = unpack("C", $ck[$j]); + if(0 eq $s ) { + if($cc > 0x80) { + if($cc > 0xa0) { + $fb = $ck[$j]; + $s = 2; + } else { + $s = 1; + } + } + } elsif (1 eq $s) { + } else { + if($cc > 0xa0) { + $fb .= $ck[$j]; + $count{$fb}++; + print $fb . " " .$count{$fb} . "\n"; + $s = 0; + } else { + $s = 1; + } + } + $j = $j + 1; + } +} +foreach $c (sort(keys( %count ))) +{ + print $c . " ". $count{$c} . "\n"; +} diff --git a/intl/chardet/tools/charfreqtostat.pl b/intl/chardet/tools/charfreqtostat.pl new file mode 100644 index 000000000..04af0c82c --- /dev/null +++ b/intl/chardet/tools/charfreqtostat.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+# Reads "<two-byte character> <count>" lines from STDIN and accumulates the +# counts per lead byte and per trail byte (only when both bytes are >= 0xA1). +# Prints a C initializer with the relative frequency of each of the 94 +# positions 0xA1..0xFE, followed by the standard deviation, mean and weight +# of the lead-byte and of the trail-byte distribution. +sub GenNPL { + my($ret) = << "END_NPL"; +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +END_NPL + + return $ret; +} + +print GenNPL(); +$total=0; +@h; +@l; + +while(<STDIN>) +{ + @k = split(/\s+/, $_); + @i = unpack("CCCC", $k[0]); +# printf("%x %x %s",$i[0] , $i[1] , "[" . $k[0] . "] " . $i . " " . $j . " " . $k[1] ."\n"); + if((0xA1 <= $i[0]) && (0xA1 <= $i[1])){ + $total += $k[1]; + $v = $i[0] - 0x00A1; + $h[$v] += $k[1]; + $u = $i[1] - 0x00A1; + $l[$u] += $k[1]; +# print "hello $v $h[$v] $u $l[$u]\n"; + } +} + +# Every frequency below divides by $total: stop with a clear message rather +# than "Illegal division by zero" when no usable line was read. +die "charfreqtostat.pl: no two-byte (>= 0xA1) character counts on STDIN\n" unless $total > 0; + + +$ffh = 0.0; +$ffl = 0.0; +for($i=0x00A1;$i< 0x00FF ; $i++) +{ + $fh[$i - 0x00a1] = $h[$i- 0x00a1] / $total; + $ffh += $fh[$i - 0x00a1]; + + $fl[$i - 0x00a1] = $l[$i- 0x00a1] / $total; + $ffl += $fl[$i - 0x00a1]; +} +$mh = $ffh / 94.0; +$ml = $ffl / 94.0; + +$sumh=0.0; +$suml=0.0; +for($i=0x00A1;$i< 0x00FF ; $i++) +{ + $sh = $fh[$i - 0x00a1] - $mh; + $sh *= $sh; + $sumh += $sh; + + $sl = $fl[$i - 0x00a1] - $ml; + $sl *= $sl; + $suml += $sl; +} +$sumh /= 94.0; +$suml /= 94.0; +$stdh = sqrt($sumh); +$stdl = sqrt($suml); + +print "{\n"; +print " {\n"; +for($i=0x00A1;$i< 0x00FF ; $i++) +{ + # numeric comparison: the last entry (0xfe) gets no trailing comma + if($i == 0xfe) { + printf(" %.6ff \/\/ FreqH[%2x]\n", $fh[$i - 0x00a1] , $i); + } else { + printf(" %.6ff, \/\/ FreqH[%2x]\n", $fh[$i - 0x00a1] , $i); + } +} +print " },\n"; +printf ("%.6ff, \/\/ Lead Byte StdDev\n", $stdh); +printf ("%.6ff, \/\/ Lead Byte Mean\n", $mh); +printf ("%.6ff, \/\/ Lead Byte Weight\n", $stdh / ($stdh + $stdl)); +print " {\n"; +for($i=0x00A1;$i< 0x00FF ; $i++) +{ + # numeric comparison: the last entry (0xfe) gets no trailing comma + if($i == 0xfe) { + printf(" %.6ff \/\/ FreqL[%2x]\n", $fl[$i - 0x00a1] , $i); + } else { + printf(" %.6ff, \/\/ FreqL[%2x]\n", $fl[$i - 0x00a1] , $i); + } +} +print " },\n"; +printf ("%.6ff, \/\/ Trail Byte StdDev\n", $stdl); +printf ("%.6ff, \/\/ 
Trail Byte Mean\n", $ml); +printf ("%.6ff \/\/ Trial Byte Weight\n", $stdl / ($stdh + $stdl)); +print "};\n"; diff --git a/intl/chardet/tools/gen.cmd b/intl/chardet/tools/gen.cmd new file mode 100755 index 000000000..56ca34bc9 --- /dev/null +++ b/intl/chardet/tools/gen.cmd @@ -0,0 +1,18 @@ +REM This Source Code Form is subject to the terms of the Mozilla Public +REM License, v. 2.0. If a copy of the MPL was not distributed with this +REM file, You can obtain one at http://mozilla.org/MPL/2.0/. + +perl gencp1252.pl > ..\src\nsCP1252Verifier.h +perl geneucjp.pl > ..\src\nsEUCJPVerifier.h +perl geniso2022jp.pl > ..\src\nsISO2022JPVerifier.h +perl gensjis.pl > ..\src\nsSJISVerifier.h +perl genutf8.pl > ..\src\nsUTF8Verifier.h +perl geneuckr.pl > ..\src\nsEUCKRVerifier.h +perl gengb2312.pl > ..\src\nsGB2312Verifier.h +perl genbig5.pl > ..\src\nsBIG5Verifier.h +perl geneuctw.pl > ..\src\nsEUCTWVerifier.h +perl genucs2be.pl > ..\src\nsUCS2BEVerifier.h +perl genucs2le.pl > ..\src\nsUCS2LEVerifier.h +perl genhz.pl > ..\src\nsHZVerifier.h +perl geniso2022kr.pl > ..\src\nsISO2022KRVerifier.h +perl geniso2022cn.pl > ..\src\nsISO2022CNVerifier.h diff --git a/intl/chardet/tools/genbig5.pl b/intl/chardet/tools/genbig5.pl new file mode 100644 index 000000000..8e3a777cb --- /dev/null +++ b/intl/chardet/tools/genbig5.pl @@ -0,0 +1,42 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@big5_cls); +my(@big5_st); +my($big5_ver); + + +@big5_cls = ( + [ 0x00 , 0x00 , 1 ], + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0x01 , 0x3f , 1 ], + [ 0x40 , 0x7e , 2 ], + [ 0x7f , 0x7f , 1 ], + [ 0xff , 0xff , 0 ], + [ 0x80 , 0xa0 , 4 ], + [ 0xa1 , 0xfe , 3 ], +); + +package genverifier; +@big5_st = ( +# 0 1 2 3 4 +# Rows are states and columns are the byte classes from the class table +# above. GetClass (genverifier.pm) returns the class of the first +# [first byte, last byte, class] range containing a byte, so narrow ranges +# must stay ahead of the wider ranges that also cover them. Gen4BitsState +# emits 0, 1 and 2 as eStart, eError and eItsMe. + 1, 0, 0, 3, 1, # state 0 + 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 0, 0, 0, # state 3 +); + + +$big5_ver = genverifier::GenVerifier("BIG5", "Big5", \@big5_cls, 5, \@big5_st); +print $big5_ver; + + + diff --git a/intl/chardet/tools/gencp1252.pl b/intl/chardet/tools/gencp1252.pl new file mode 100644 index 000000000..debc53ca5 --- /dev/null +++ b/intl/chardet/tools/gencp1252.pl @@ -0,0 +1,55 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@cp1252_cls); +my(@cp1252_st); +my($cp1252_ver); + + +@cp1252_cls = ( + [ 0x00 , 0x00 , 1 ], + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0x81 , 0x81 , 0 ], + [ 0x8d , 0x8d , 0 ], + [ 0x8f , 0x8f , 0 ], + [ 0x90 , 0x90 , 0 ], + [ 0x9d , 0x9d , 0 ], + [ 0xc0 , 0xd6 , 1 ], + [ 0xd8 , 0xf6 , 1 ], + [ 0xf8 , 0xff , 1 ], + [ 0x8a , 0x8a , 1 ], + [ 0x8c , 0x8c , 1 ], + [ 0x8e , 0x8e , 1 ], + [ 0x9a , 0x9a , 1 ], + [ 0x9c , 0x9c , 1 ], + [ 0x9e , 0x9e , 1 ], + [ 0x9f , 0x9f , 1 ], + [ 0x00 , 0xff , 2 ], +); + +package genverifier; +@cp1252_st = ( +# 0 1 2 + 1, 3, 0, # Start State - 0 + 1, 1, 1, # Error State - 1 + 2, 2, 2, # ItsMe State - 2 + 1, 4, 0, # State - 3 + 1, 5, 4, # State - 4 + 1, 1, 4, # State - 5 +); + + +$cp1252_ver = genverifier::GenVerifier("CP1252", "windows-1252", + \@cp1252_cls, 3, \@cp1252_st); +print $cp1252_ver; + + + diff --git a/intl/chardet/tools/gencyrillic.pl b/intl/chardet/tools/gencyrillic.pl new file mode 100644 index 000000000..51bd6e456 --- /dev/null +++ b/intl/chardet/tools/gencyrillic.pl @@ -0,0 +1,65 @@ +#!/usr/local/bin/perl + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Writes ../src/nsCyrillicProb.h: a 33x33 table gCyrillicProb whose row 0 +# and column 0 are all zero; rows/columns 1..32 stand for bytes 0xc0..0xdf +# and hold the StatKoi value stored under the two-character key +# chr(row byte) . chr(column byte), or 0 when the key is absent. +# NOTE(review): the emitted include guard is nsCyrillicDetector_h__ although +# the file written is nsCyrillicProb.h - confirm it cannot clash with the +# guard of nsCyrillicDetector.h. +use StatKoi '.' ; + +open(FILE, "> ../src/nsCyrillicProb.h") or die "cannot open ../src/nsCyrillicProb.h: $!"; + +print FILE <<EOF; +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ + +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsCyrillicDetector_h__ +#define nsCyrillicDetector_h__ +/* + DO NOT EDIT THIS FILE !!! 
+ This file is generated by the perl script in + mozilla/intl/chardet/tools/gencyrillic.pl + + To ues that script, you need to grab StatKoi.pm file from + the "Cyrillic Software Suite" written by John Neystdt. + http://www.neystadt.org/cyrillic (You can also find it from CPAN) + */ +EOF +$table = \%Lingua::DetectCharset::StatKoi::StatsTableKoi; +print FILE "const uint16_t gCyrillicProb[33][33] = {"; +# row 0: a leading 0 plus 32 zeros, eight per output line + print FILE "{ \n"; + print FILE "0,\n"; + for($j = 0xc0; $j < 0xe0; $j++) + { + print FILE "0, \t"; + if( 7 == ( $j % 8) ) + { + print FILE "\n"; + } + } + print FILE "\n}, \n"; +# rows 1..32: a leading 0 (column 0) followed by the 32 pair values +for($i = 0xc0; $i < 0xe0; $i++) +{ + print FILE "{ \n"; + print FILE "0,\n"; + for($j = 0xc0; $j < 0xe0; $j++) + { + $key = chr($i) . chr($j); + if(exists($table->{$key})) + { + $v = $table->{$key}; + } else { + $v = 0; + } + print FILE $v . ", \t"; + if( 7 == ( $j % 8) ) + { + print FILE "\n"; + } + } + print FILE "\n}, \n"; +} +print FILE "};\n"; +print FILE "#endif\n"; +# close explicitly so that a failed (buffered) write is reported +close(FILE) or die "cannot write ../src/nsCyrillicProb.h: $!"; diff --git a/intl/chardet/tools/geneucjp.pl b/intl/chardet/tools/geneucjp.pl new file mode 100644 index 000000000..692be15ab --- /dev/null +++ b/intl/chardet/tools/geneucjp.pl @@ -0,0 +1,47 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@eucjp_cls); +my(@eucjp_st); +my($eucjp_ver); + + +@eucjp_cls = ( + [ 0x0e , 0x0f , 5 ], + [ 0xe0 , 0xfe , 0 ], + [ 0x8e , 0x8e , 1 ], + [ 0xa1 , 0xdf , 2 ], + [ 0x8f , 0x8f , 3 ], + [ 0x01 , 0x1a , 4 ], + [ 0x1c , 0x7f , 4 ], + [ 0x00 , 0x00 , 4 ], + [ 0x1b , 0x1b , 5 ], + [ 0x80 , 0x8d , 5 ], + [ 0xa0 , 0xa0 , 5 ], + [ 0x80 , 0xff , 5 ] +); + +package genverifier; +@eucjp_st = ( +# 0 1 2 3 4 5 +# Rows are states and columns are the byte classes from the class table +# above. GetClass (genverifier.pm) returns the class of the first +# [first byte, last byte, class] range containing a byte, so narrow ranges +# must stay ahead of the wider ranges that also cover them (the final +# 0x80-0xff entry only catches bytes no earlier range claimed). +# Gen4BitsState emits 0, 1 and 2 as eStart, eError and eItsMe. + 3, 4, 3, 5, 0, 1, # state 0 + 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 0, 1, 0, 1, 1, 1, # state 3 + 1, 1, 0, 1, 1, 1, # state 4 + 3, 1, 3, 1, 1, 1, # state 5 +); + + +$eucjp_ver = genverifier::GenVerifier("EUCJP", "EUC-JP", \@eucjp_cls, 6, \@eucjp_st); +print $eucjp_ver; + + + diff --git a/intl/chardet/tools/geneuckr.pl b/intl/chardet/tools/geneuckr.pl new file mode 100644 index 000000000..007810a6a --- /dev/null +++ b/intl/chardet/tools/geneuckr.pl @@ -0,0 +1,42 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@euckr_cls); +my(@euckr_st); +my($euckr_ver); + + +@euckr_cls = ( + [ 0x00 , 0x00 , 1 ], + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0x01 , 0x7f , 1 ], + [ 0x80 , 0xa0 , 0 ], + [ 0xff , 0xff , 0 ], + [ 0xad , 0xaf , 3 ], + [ 0xc9 , 0xc9 , 3 ], + [ 0xa1 , 0xfe , 2 ], +); + +package genverifier; +@euckr_st = ( +# 0 1 2 3 +# Rows are states and columns are the byte classes from the class table +# above. GetClass (genverifier.pm) returns the class of the first +# [first byte, last byte, class] range containing a byte, so narrow ranges +# must stay ahead of the wider ranges that also cover them. Gen4BitsState +# emits 0, 1 and 2 as eStart, eError and eItsMe. + 1, 0, 3, 1, # state 0 + 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 0, 0, # state 3 +); + + +$euckr_ver = genverifier::GenVerifier("EUCKR", "EUC-KR", \@euckr_cls, 4, \@euckr_st); +print $euckr_ver; + + + diff --git a/intl/chardet/tools/geneuctw.pl b/intl/chardet/tools/geneuctw.pl new file mode 100644 index 000000000..88453155e --- /dev/null +++ b/intl/chardet/tools/geneuctw.pl @@ -0,0 +1,49 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@euctw_cls); +my(@euctw_st); +my($euctw_ver); + + +@euctw_cls = ( + [ 0x00 , 0x00 , 2 ], + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0x01 , 0x7f , 2 ], + [ 0x8e , 0x8e , 6 ], + [ 0x80 , 0xa0 , 0 ], + [ 0xff , 0xff , 0 ], + [ 0xa1 , 0xa1 , 3 ], + [ 0xa2 , 0xa7 , 4 ], + [ 0xa8 , 0xa9 , 5 ], + [ 0xaa , 0xc1 , 1 ], + [ 0xc2 , 0xc2 , 3 ], + [ 0xc3 , 0xc3 , 1 ], + [ 0xc4 , 0xfe , 3 ], +); + +package genverifier; +@euctw_st = ( +# 0 1 2 3 4 5 6 +# Rows are states and columns are the byte classes from the class table +# above. GetClass (genverifier.pm) returns the class of the first +# [first byte, last byte, class] range containing a byte, so narrow ranges +# (0x8e) must stay ahead of the wider ranges that also cover them +# (0x80-0xa0). Gen4BitsState emits 0, 1 and 2 as eStart, eError and eItsMe. + 1, 1, 0, 3, 3, 3, 4, # state 0 + 1, 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 0, 1, 0, 0, 0, 1, # state 3 + 1, 1, 1, 1, 5, 1, 1, # state 4 + 1, 0, 1, 0, 0, 0, 1, # state 5 +); + + +$euctw_ver = genverifier::GenVerifier("EUCTW", "x-euc-tw", \@euctw_cls, 7, \@euctw_st); +print $euctw_ver; + + + diff --git a/intl/chardet/tools/gengb18030.pl b/intl/chardet/tools/gengb18030.pl new file mode 100644 index 000000000..654710b2c --- /dev/null +++ b/intl/chardet/tools/gengb18030.pl @@ -0,0 +1,44 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@gb18030_cls); +my(@gb18030_st); +my($gb18030_ver); + + +@gb18030_cls = ( + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0x30 , 0x39 , 3 ], + [ 0x00 , 0x3f , 1 ], + [ 0x40 , 0x7e , 2 ], + [ 0x7f , 0x7f , 4 ], + [ 0x80 , 0x80 , 5 ], + [ 0x81 , 0xfe , 6 ], + [ 0xff , 0xff , 0 ], +); + +package genverifier; +@gb18030_st = ( +# 0 1 2 3 4 5 6 +# Rows are states and columns are the byte classes from the class table +# above. GetClass (genverifier.pm) returns the class of the first +# [first byte, last byte, class] range containing a byte, so narrow ranges +# (0x0e-0x0f, 0x1b, 0x30-0x39) must stay ahead of the wider 0x00-0x3f range. +# Gen4BitsState emits 0, 1 and 2 as eStart, eError and eItsMe. + 1, 0, 0, 0, 0, 0, 3, # state 0 + 1, 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 0, 4, 1, 0, 0, # state 3, multibytes, 1st byte identified + 1, 1, 1, 1, 1, 1, 5, # state 4, multibytes, 2nd byte identified + 1, 1, 1, 2, 1, 1, 1, # state 5, multibytes, 3rd byte identified +); + + +$gb18030_ver = genverifier::GenVerifier("gb18030", "gb18030", \@gb18030_cls, 7, \@gb18030_st); +print $gb18030_ver; + + + diff --git a/intl/chardet/tools/gengb2312.pl b/intl/chardet/tools/gengb2312.pl new file mode 100644 index 000000000..57d86926b --- /dev/null +++ b/intl/chardet/tools/gengb2312.pl @@ -0,0 +1,41 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@gb2312_cls); +my(@gb2312_st); +my($gb2312_ver); + + +@gb2312_cls = ( + [ 0x00 , 0x00 , 1 ], + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0x01 , 0x7f , 1 ], + [ 0x80 , 0xa0 , 0 ], + [ 0xff , 0xff , 0 ], + [ 0xaa , 0xaf , 3 ], + [ 0xa1 , 0xfe , 2 ], +); + +package genverifier; +@gb2312_st = ( +# 0 1 2 3 +# Rows are states and columns are the byte classes from the class table +# above. GetClass (genverifier.pm) returns the class of the first +# [first byte, last byte, class] range containing a byte, so narrow ranges +# must stay ahead of the wider ranges that also cover them. Gen4BitsState +# emits 0, 1 and 2 as eStart, eError and eItsMe. + 1, 0, 3, 1, # state 0 + 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 0, 0, # state 3 +); + + +$gb2312_ver = genverifier::GenVerifier("GB2312", "GB2312", \@gb2312_cls, 4, \@gb2312_st); +print $gb2312_ver; + + + diff --git a/intl/chardet/tools/genhz.pl b/intl/chardet/tools/genhz.pl new file mode 100644 index 000000000..c58eb4675 --- /dev/null +++ b/intl/chardet/tools/genhz.pl @@ -0,0 +1,57 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@hz_cls); +my(@hz_st); +my($hz_ver); + + +# +# +# > 0x80 - 1 +# ~ - 2 +# LF - 3 +# { - 4 +# } - 5 +# +# The table below is a list of [first byte, last byte, class] ranges. +# NOTE(review): GetClass (genverifier.pm) returns the first range that +# contains a byte, so 0x0a and 0x0e-0x0f are claimed by [ 0x01 , 0x1a , 0 ] +# and the later [ 0x0a , 0x0a , 3 ] and [ 0x0e , 0x0f , 1 ] entries never +# match - confirm whether that is intended before reordering, since it +# changes the generated table. +# +@hz_cls = ( + [ 0x01 , 0x1a , 0 ], + [ 0x7e , 0x7e , 2 ], + [ 0x0a , 0x0a , 3 ], + [ 0x7b , 0x7b , 4 ], + [ 0x7d , 0x7d , 5 ], + [ 0x1c , 0x7f , 0 ], + [ 0x0e , 0x0f , 1 ], + [ 0x1b , 0x1b , 1 ], + [ 0x00 , 0x00 , 1 ], + [ 0x80 , 0xff , 1 ] +); + + +# +# Rows of the state table are states and columns are the classes above; +# Gen4BitsState emits 0, 1 and 2 as eStart, eError and eItsMe. +# +package genverifier; +@hz_st = ( +# 0 1 2 3 4 5 + 0, 1, 3, 0, 0, 0, # Start State - 0 + 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 0, 0, 4, 1, # state 3 - got ~ + 5, 1, 6, 1, 5, 5, # state 4 - got ~ { + 4, 1, 4, 1, 4, 4, # state 5 - got ~ { X + 4, 1, 4, 1, 4, 2, # state 6 - got ~ { [X X]* ~ +); + +$hz_ver = genverifier::GenVerifier("HZ", "HZ-GB-2312", + \@hz_cls, 6, \@hz_st); +print $hz_ver; + + + diff --git a/intl/chardet/tools/geniso2022cn.pl b/intl/chardet/tools/geniso2022cn.pl new file mode 100644 index 000000000..c4a43caae --- /dev/null +++ b/intl/chardet/tools/geniso2022cn.pl @@ -0,0 +1,58 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@iso2022cn_cls); +my(@iso2022cn_st); +my($iso2022cn_ver); + + +# +# +# ESC - 1 +# > 0x80 - 2 +# $ - 3 +# ) - 4 +# * - 5 +# A G - 6 +# H - 7 +# N O - 8 +# +# NOTE(review): the table below does not follow this legend. It gives +# 0x29 (')') class 3 and 0x43 ('C') class 4, leaves 0x24 ('$') in the +# 0x1c-0x7f range (class 0) and assigns no byte to classes 5-8, although +# GenVerifier is still told there are 9 classes. Confirm against the +# intended escape sequences before regenerating. +# +@iso2022cn_cls = ( + [ 0x01 , 0x1a , 0 ], + [ 0x29 , 0x29 , 3 ], + [ 0x43 , 0x43 , 4 ], + [ 0x1c , 0x7f , 0 ], + [ 0x1b , 0x1b , 1 ], + [ 0x00 , 0x00 , 2 ], + [ 0x80 , 0xff , 2 ] +); + + +# +# ESC$((([)][AG])|([*]H))|[NO]) +# +# Rows of the state table are states and columns are byte classes; +# Gen4BitsState emits 0, 1 and 2 as eStart, eError and eItsMe. +# +package genverifier; +@iso2022cn_st = ( +# 0 1 2 3 4 5 6 7 8 + 0, 3, 1, 0, 0, 0, 0, 0, 0, # Start State - 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 1, 4, 1, 1, 1, 1, 2, # state 3 - got ESC + 1, 1, 1, 1, 5, 6, 1, 1, 1, # state 4 - got ESC $ + 1, 1, 1, 1, 1, 1, 2, 1, 1, # state 5 - got ESC $ ) + 1, 1, 1, 1, 1, 1, 1, 2, 1, # state 6 - got ESC $ * +); + +$iso2022cn_ver = genverifier::GenVerifier("ISO2022CN", "ISO-2022-CN", + \@iso2022cn_cls, 9, \@iso2022cn_st); +print $iso2022cn_ver; + + + diff --git a/intl/chardet/tools/geniso2022jp.pl b/intl/chardet/tools/geniso2022jp.pl new file mode 100644 index 000000000..4408fbeb0 --- /dev/null +++ b/intl/chardet/tools/geniso2022jp.pl @@ -0,0 +1,49 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@iso2022jp_cls); +my(@iso2022jp_st); +my($iso2022jp_ver); + +# 1:ESC 3:'(' 4:'B' 5:'J' 6:'@' 7:'$' 8:'D' 9:'I' +# 0: the remaining bytes of 0x01-0x1a and 0x1c-0x7f +# 2: 0x00, 0x0e-0x0f and 0x80-0xff +# GetClass (genverifier.pm) returns the class of the first range containing +# a byte, so the single-byte entries must stay ahead of the wide ranges. +@iso2022jp_cls = ( + [ 0x0e , 0x0f , 2 ], + [ 0x28 , 0x28 , 3 ], + [ 0x42 , 0x42 , 4 ], + [ 0x4a , 0x4a , 5 ], + [ 0x40 , 0x40 , 6 ], + [ 0x24 , 0x24 , 7 ], + [ 0x44 , 0x44 , 8 ], + [ 0x49 , 0x49 , 9 ], + [ 0x01 , 0x1a , 0 ], + [ 0x1c , 0x7f , 0 ], + [ 0x1b , 0x1b , 1 ], + [ 0x00 , 0x00 , 2 ], + [ 0x80 , 0xff , 2 ] +); + +package genverifier; +# Rows are states and columns are the classes above; Gen4BitsState emits +# 0, 1 and 2 as eStart, eError and eItsMe. +@iso2022jp_st = ( +# 0 1 2 3 4 5 6 7 8 9 + 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, # Start State - 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 1, 5, 1, 1, 1, 4, 1, 1, # got ESC + 1, 1, 1, 6, 2, 1, 2, 1, 1, 1, # got ESC $ + 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, # got ESC ( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, # got ESC $ ( +); + +$iso2022jp_ver = genverifier::GenVerifier("ISO2022JP", "ISO-2022-JP", + \@iso2022jp_cls, 10, \@iso2022jp_st); +print $iso2022jp_ver; + + + diff --git a/intl/chardet/tools/geniso2022kr.pl b/intl/chardet/tools/geniso2022kr.pl new file mode 100644 index 000000000..f56bcf9fb --- /dev/null +++ b/intl/chardet/tools/geniso2022kr.pl @@ -0,0 +1,55 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@iso2022kr_cls); +my(@iso2022kr_st); +my($iso2022kr_ver); + + +# +# +# ESC - 1 +# > 0x80 - 2 +# $ - 3 +# ) - 4 +# C - 5 +# +# In the table below class 2 also covers 0x00, and class 0 is every other +# byte of 0x01-0x1a and 0x1c-0x7f. GetClass (genverifier.pm) returns the +# class of the first range containing a byte, so the single-byte entries +# must stay ahead of [ 0x1c , 0x7f , 0 ]. +# +@iso2022kr_cls = ( + [ 0x01 , 0x1a , 0 ], + [ 0x24 , 0x24 , 3 ], + [ 0x29 , 0x29 , 4 ], + [ 0x43 , 0x43 , 5 ], + [ 0x1c , 0x7f , 0 ], + [ 0x1b , 0x1b , 1 ], + [ 0x00 , 0x00 , 2 ], + [ 0x80 , 0xff , 2 ] +); + + +# +# ESC$)C +# +# Rows of the state table are states and columns are the classes above; +# Gen4BitsState emits 0, 1 and 2 as eStart, eError and eItsMe. +# +package genverifier; +@iso2022kr_st = ( +# 0 1 2 3 4 5 + 0, 3, 1, 0, 0, 0, # Start State - 0 + 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 1, 4, 1, 1, # state 3 - got ESC + 1, 1, 1, 1, 5, 1, # state 4 - got ESC $ + 1, 1, 1, 1, 1, 2, # state 5 - got ESC $ ) +); + +$iso2022kr_ver = genverifier::GenVerifier("ISO2022KR", "ISO-2022-KR", + \@iso2022kr_cls, 6, \@iso2022kr_st); +print $iso2022kr_ver; + + + diff --git a/intl/chardet/tools/gensjis.pl b/intl/chardet/tools/gensjis.pl new file mode 100644 index 000000000..20966d03e --- /dev/null +++ b/intl/chardet/tools/gensjis.pl @@ -0,0 +1,46 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@sjis_cls); +my(@sjis_st); +my($sjis_ver); + +@sjis_cls = ( + [ 0x00 , 0x00 , 0 ], + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0xfd , 0xff , 0 ], + [ 0x85 , 0x86 , 3 ], + [ 0xeb , 0xec , 5 ], + [ 0x01 , 0x1a , 1 ], + [ 0x1c , 0x3f , 1 ], + [ 0x7f , 0x7f , 1 ], + [ 0x40 , 0x7e , 2 ], + [ 0xa1 , 0xdf , 2 ], + [ 0x80 , 0x9f , 3 ], + [ 0xa0 , 0xa0 , 4 ], + [ 0xe0 , 0xea , 3 ], + [ 0xed , 0xfc , 4 ], +); + +package genverifier; +@sjis_st = ( +# 0 1 2 3 4 5 +# Rows are states and columns are the byte classes from the class table +# above. GetClass (genverifier.pm) returns the class of the first +# [first byte, last byte, class] range containing a byte, so narrow ranges +# must stay ahead of the wider ranges that also cover them. Gen4BitsState +# emits 0, 1 and 2 as eStart, eError and eItsMe. + 1, 0, 0, 3, 1, 1, # Start State - 0 + 1, 1, 1, 1, 1, 1, # Error State - 1 + 2, 2, 2, 2, 2, 2, # ItsMe State - 2 + 1, 1, 0, 0, 0, 0, # State - 3 +); + +$sjis_ver = genverifier::GenVerifier("SJIS", "Shift_JIS", \@sjis_cls, 6, \@sjis_st); +print $sjis_ver; + + + diff --git a/intl/chardet/tools/genutf8.pl b/intl/chardet/tools/genutf8.pl new file mode 100644 index 000000000..437dd535b --- /dev/null +++ b/intl/chardet/tools/genutf8.pl @@ -0,0 +1,189 @@ +#!/usr/local/bin/perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +use strict; +require "genverifier.pm"; +use genverifier; + + +my(@utf8_cls); +my(@utf8_st); +my($utf8_ver); + +# +# +# UTF8 encode the UCS4 into 1 to 4 bytes +# +# 1 byte 00 00 00 00 00 00 00 7f +# 2 bytes 00 00 00 80 00 00 07 ff +# 3 bytes 00 00 08 00 00 00 ff ff +# 4 bytes 00 01 00 00 00 10 ff ff +# +# However, since Surrogate area should not be encoded into UTF8 as +# a Surrogate pair, we can remove the surrogate area from UTF8 +# +# 1 byte 00 00 00 00 00 00 00 7f +# 2 bytes 00 00 00 80 00 00 07 ff +# 3 bytes 00 00 08 00 00 00 d7 ff +# 00 00 e0 00 00 00 ff ff +# 4 bytes 00 01 00 00 00 10 ff ff +# +# Now we break them into 6 bits group for 2-4 bytes UTF8 +# +# 1 byte 00 7f +# 2 bytes 02 00 1f 3f +# 3 bytes 00 20 00 0d 1f 3f +# 0e 00 00 0f 3f 3f +# 4 bytes 00 10 00 00 04 0f 3f 3f +# +# Break down more +# +# 1 byte 00 7f +# 2 bytes 02 00 1f 3f +# 3 bytes 00 20 00 00 3f 3f +# 01 00 00 0c 3f 3f +# 0d 00 00 0d 1f 3f +# 0e 00 00 0f 3f 3f +# 4 bytes 00 10 00 00 00 3f 3f 3f +# 01 00 00 00 03 3f 3f 3f +# 04 00 00 00 04 0f 3f 3f +# +# Now, add +# c0 to the lead byte of 2 bytes UTF8 +# e0 to the lead byte of 3 bytes UTF8 +# f0 to the lead byte of 4 bytes UTF8 +# 80 to the trail bytes +# +# 1 byte 00 7f +# 2 bytes c2 80 df bf +# 3 bytes e0 a0 80 e0 bf bf +# e1 80 80 ec bf bf +# ed 80 80 ed 9f bf +# ee 80 80 ef bf bf +# 4 bytes f0 90 80 80 f0 bf bf bf +# f1 80 80 80 f3 bf bf bf +# f4 80 80 80 f4 8f bf bf +# +# +# Now we can construct our state diagram +# +# 0:0x0e,0x0f,0x1b->Error +# 0:[0-0x7f]->0 +# 0:[c2-df]->3 +# 0:e0->4 +# 0:[e1-ec, ee-ef]->5 +# 0:ed->6 +# 0:f0->7 +# 0:[f1-f3]->8 +# 0:f4->9 +# 0:*->Error +# 3:[80-bf]->0 +# 3:*->Error +# 4:[a0-bf]->3 +# 4:*->Error +# 5:[80-bf]->3 +# 5:*->Error +# 6:[80-9f]->3 +# 6:*->Error +# 7:[90-bf]->5 +# 7:*->Error +# 8:[80-bf]->5 +# 8:*->Error +# 9:[80-8f]->5 +# 9:*->Error +# +# Now, we classified chars into class +# (00 is in k1, matching both the state diagram above and the table below) +# +# 0e,0f,1b:k0 +# 00-0d,10-1a,1c-7f:k1 +# 80-8f:k2 +# 90-9f:k3 +# a0-bf:k4 +# c0-c1:k0 +# c2-df:k5 +# e0:k6 +# 
e1-ec:k7 +# ed:k8 +# ee-ef:k7 +# f0:k9 +# f1-f3:k10 +# f4:k11 +# f5-ff:k0 +# +# Now, let's put them into array form +# (GetClass in genverifier.pm returns the class of the first range that +# contains a byte) + +@utf8_cls = ( + [ 0x00 , 0x00 , 1 ], + [ 0x0e , 0x0f , 0 ], + [ 0x1b , 0x1b , 0 ], + [ 0x01 , 0x0d , 1 ], + [ 0x10 , 0x1a , 1 ], + [ 0x1c , 0x7f , 1 ], + [ 0x80 , 0x8f , 2 ], + [ 0x90 , 0x9f , 3 ], + [ 0xa0 , 0xbf , 4 ], + [ 0xc0 , 0xc1 , 0 ], + [ 0xc2 , 0xdf , 5 ], + [ 0xe0 , 0xe0 , 6 ], + [ 0xe1 , 0xec , 7 ], + [ 0xed , 0xed , 8 ], + [ 0xee , 0xef , 7 ], + [ 0xf0 , 0xf0 , 9 ], + [ 0xf1 , 0xf3 , 10 ], + [ 0xf4 , 0xf4 , 11 ], + [ 0xf5 , 0xff , 0 ], +); +# +# Now, we write the state diagram in class +# +# 0:k0->Error +# 0:k1->0 +# 0:k5->3 +# 0:k6->4 +# 0:k7->5 +# 0:k8->6 +# 0:k9->7 +# 0:k10->8 +# 0:k11->9 +# 0:*->Error +# 3:k2,k3,k4->0 +# 3:*->Error +# 4:k4->3 +# 4:*->Error +# 5:k2,k3,k4->3 +# 5:*->Error +# 6:k2,k3->3 +# 6:*->Error +# 7:k3,k4->5 +# 7:*->Error +# 8:k2,k3,k4->5 +# 8:*->Error +# 9:k2->5 +# 9:*->Error +# +# Now, let's put them into array +# (one row per state, one column per class k0..k11) +# +package genverifier; +@utf8_st = ( +# 0 1 2 3 4 5 6 7 8 9 10 11 + 1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe + 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3 + 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4 + 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5 + 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6 + 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7 - after f0 only 90-bf (k3,k4) may follow; 80-8f would be overlong + 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8 + 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9 +); + + + +$utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st); +print $utf8_ver; + + + diff --git a/intl/chardet/tools/genverifier.pm b/intl/chardet/tools/genverifier.pm new file mode 100644 index 000000000..8ccfef4d6 --- /dev/null +++ b/intl/chardet/tools/genverifier.pm @@ -0,0 +1,175 @@ +#!/usr/local/bin/perl + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package genverifier; +use strict; +use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); + +use Exporter; +$VERSION = 1.00; +@ISA = qw(Exporter); + +@EXPORT = qw( + GenVerifier + ); +@EXPORT_OK = qw(); + +# Returns the license banner placed at the top of the generated file. +sub GenNPL { + my($ret) = << "END_MPL"; +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +END_MPL + + return $ret; +} + +##-------------------------------------------------------------- +# GetClass($char, $clstbl): $clstbl is a reference to a list of +# [first byte, last byte, class] rows. Returns the class of the FIRST row +# whose range contains $char, so the row order is significant. +# When no row matches, a warning is written to STDERR (STDOUT carries the +# generated header) and, as there is no explicit return, the caller +# receives the return value of that print. +sub GetClass { + my($char, $clstbl) = @_; + my($l); + # strictly below the row count: indexing one past the end would + # autovivify an empty row inside the caller's table + for($l =0; $l < @$clstbl; $l++) { + if(($clstbl->[$l][0] <= $char) && ($char <= $clstbl->[$l][1])) + { + return $clstbl->[$l][2]; + } + } + print STDERR "WARNING- there are no class for $char\n"; +}; +##-------------------------------------------------------------- +# Initializer fragment referring to the "<name>_cls" table. +sub GenClassPkg { + my($name, $bits) = @_; + return GenPkg($name, $bits, "_cls"); +} +##-------------------------------------------------------------- +# Initializer fragment referring to the "<name>_st" table. +sub GenStatePkg { + my($name, $bits) = @_; + return GenPkg($name, $bits, "_st"); +}; +##-------------------------------------------------------------- +# Builds " {eIdxSft<bits>bits, eSftMsk<bits>bits, eBitSft<bits>bits, +# eUnitMsk<bits>bits, <name><tbl> }". +sub GenPkg { + my($name, $bits, $tbl) = @_; + my($ret); + $ret = " {" . + "eIdxSft" . $bits . "bits, " . + "eSftMsk" . $bits . "bits, " . + "eBitSft" . $bits . "bits, " . + "eUnitMsk" . $bits . "bits, " . + $name . $tbl . "" . + " }"; + return $ret; +}; +##-------------------------------------------------------------- +# Emits "static const uint32_t <name>_cls [ 256 / 8 ]": the class of each +# byte 0x00..0xff (looked up with GetClass), eight per PCK4BITS() entry. +sub Gen4BitsClass { + my($name, $clstbl) = @_; + my($i,$j); + my($cls); + my($ret); + $ret = ""; + $ret .= "static const uint32_t " . $name . 
"_cls [ 256 / 8 ] = {\n"; + for($i = 0; $i < 0x100; $i+= 8) { + $ret .= "PCK4BITS("; + for($j = $i; $j < $i + 8; $j++) { + $cls = &GetClass($j,$clstbl); + $ret .= sprintf("%2d", $cls) ; + if($j != ($i+7)) { + $ret .= ","; + } + } + if( $i+8 >= 0x100) { + $ret .= ") "; + } else { + $ret .= "),"; + } + $ret .= sprintf(" // %02x - %02x\n", $i, ($i+7)); + } + $ret .= "};\n"; + return $ret; +}; +##-------------------------------------------------------------- +# GenVerifier($name, $charset, $cls, $numcls, $st): returns the text of the +# generated header - banner, note, #include, the packed class table, the +# packed state table and the "<name>SMModel" initializer naming $numcls and +# $charset. $cls and $st are array references. +sub GenVerifier { + my($name, $charset, $cls, $numcls, $st) = @_; + my($ret); + $ret = GenNPL(); + $ret .= GenNote(); + $ret .= GenHeader(); + $ret .= Gen4BitsClass($name, $cls); + $ret .= "\n\n"; + $ret .= Gen4BitsState($name, $st); + $ret .= "\n\n"; + $ret .= "const SMModel " . $name . "SMModel = {\n"; + $ret .= GenClassPkg($name, 4); + $ret .= ",\n"; + $ret .= " " . $numcls; + $ret .= ",\n"; + $ret .= GenStatePkg($name, 4); + $ret .= ",\n"; + $ret .= " " . "CHAR_LEN_TABLE(" . $name . "CharLenTable),\n"; + $ret .= ' "' . $charset . '",' . "\n"; + $ret .= "};\n"; + return $ret; + +}; +##-------------------------------------------------------------- +# Emits "static const uint32_t <name>_st": the flat state table packed eight +# entries per PCK4BITS(). 0, 1 and 2 are written as eStart, eError and +# eItsMe. The length is rounded up to a multiple of 8; the missing trailing +# entries are undef, compare equal to 0 and are therefore written as eStart. +sub Gen4BitsState { + my($name, $sttbl) = @_; + my($lenafterpad) = (((@$sttbl-1) >> 3) + 1) << 3; + my($i,$j); + my($ret); + $ret = ""; + $ret .= "static const uint32_t " . $name . "_st [ " . ($lenafterpad >> 3) . 
"] = {\n"; + for($i = 0; $i < $lenafterpad ; $i+= 8) { + $ret .= "PCK4BITS("; + for($j = $i; $j < $i + 8; $j++) { + if(0 == $sttbl->[$j]) { + $ret .= "eStart"; + } else { if(1 == $sttbl->[$j]) { + $ret .= "eError"; + } else { if(2 == $sttbl->[$j]) { + $ret .= "eItsMe"; + } else { + $ret .= sprintf(" %d", $sttbl->[$j]) ; + }}} + if($j != ($i+7)) { + $ret .= ","; + } + } + if( $i+8 >= $lenafterpad ) { + $ret .= ") "; + } else { + $ret .= "),"; + } + $ret .= sprintf(" // %02x - %02x\n", $i, ($i+7)); + } + $ret .= "};\n"; + return $ret; +}; +##-------------------------------------------------------------- + +# Returns the "do not edit" notice placed in the generated file. +sub GenNote { + my($ret) = << "END_NOTE"; +/* + * DO NOT EDIT THIS DOCUMENT MANUALLY !!! + * THIS FILE IS AUTOMATICALLY GENERATED BY THE TOOLS UNDER + * mozilla/intl/chardet/tools/ + * Please contact ftang\@netscape.com or mozilla-i18n\@mozilla.org + * if you have any question. Thanks + */ +END_NOTE + return $ret; +} + +##-------------------------------------------------------------- +# Returns the #include line of the generated file. +sub GenHeader { + my($ret) = << "END_HEADER"; +#include "nsVerifier.h" +END_HEADER + + return $ret; +} +##-------------------------------------------------------------- +1; # this should be the last line |