diff options
Diffstat (limited to 'intl/icu/source/i18n/csrutf8.cpp')
-rw-r--r-- | intl/icu/source/i18n/csrutf8.cpp | 111 |
1 files changed, 111 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/csrutf8.cpp b/intl/icu/source/i18n/csrutf8.cpp new file mode 100644 index 000000000..2f153793f --- /dev/null +++ b/intl/icu/source/i18n/csrutf8.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ********************************************************************** + * Copyright (C) 2005-2014, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrutf8.h" +#include "csmatch.h" + +U_NAMESPACE_BEGIN + +CharsetRecog_UTF8::~CharsetRecog_UTF8() +{ + // nothing to do +} + +const char *CharsetRecog_UTF8::getName() const +{ + return "UTF-8"; +} + +UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { + bool hasBOM = FALSE; + int32_t numValid = 0; + int32_t numInvalid = 0; + const uint8_t *inputBytes = input->fRawInput; + int32_t i; + int32_t trailBytes = 0; + int32_t confidence; + + if (input->fRawLength >= 3 && + inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) { + hasBOM = TRUE; + } + + // Scan for multi-byte sequences + for (i=0; i < input->fRawLength; i += 1) { + int32_t b = inputBytes[i]; + + if ((b & 0x80) == 0) { + continue; // ASCII + } + + // Hi bit on char found. Figure out how long the sequence should be + if ((b & 0x0E0) == 0x0C0) { + trailBytes = 1; + } else if ((b & 0x0F0) == 0x0E0) { + trailBytes = 2; + } else if ((b & 0x0F8) == 0xF0) { + trailBytes = 3; + } else { + numInvalid += 1; + continue; + } + + // Verify that we've got the right number of trail bytes in the sequence + for (;;) { + i += 1; + + if (i >= input->fRawLength) { + break; + } + + b = inputBytes[i]; + + if ((b & 0xC0) != 0x080) { + numInvalid += 1; + break; + } + + if (--trailBytes == 0) { + numValid += 1; + break; + } + } + + } + + // Cook up some sort of confidence score, based on presence of a BOM + // and the existence of valid and/or invalid multi-byte sequences. + confidence = 0; + if (hasBOM && numInvalid == 0) { + confidence = 100; + } else if (hasBOM && numValid > numInvalid*10) { + confidence = 80; + } else if (numValid > 3 && numInvalid == 0) { + confidence = 100; + } else if (numValid > 0 && numInvalid == 0) { + confidence = 80; + } else if (numValid == 0 && numInvalid == 0) { + // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which + // accepts ASCII with confidence = 10. + confidence = 15; + } else if (numValid > numInvalid*10) { + // Probably corruput utf-8 data. Valid sequences aren't likely by chance. + confidence = 25; + } + + results->set(input, this, confidence); + return (confidence > 0); +} + +U_NAMESPACE_END +#endif |