summaryrefslogtreecommitdiffstats
path: root/mailnews/base/util/nsMsgI18N.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'mailnews/base/util/nsMsgI18N.cpp')
-rw-r--r--mailnews/base/util/nsMsgI18N.cpp479
1 files changed, 479 insertions, 0 deletions
diff --git a/mailnews/base/util/nsMsgI18N.cpp b/mailnews/base/util/nsMsgI18N.cpp
new file mode 100644
index 000000000..b79a4c196
--- /dev/null
+++ b/mailnews/base/util/nsMsgI18N.cpp
@@ -0,0 +1,479 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// as does this
+#include "nsICharsetConverterManager.h"
+#include "nsIPlatformCharset.h"
+#include "nsIServiceManager.h"
+
+#include "nsISupports.h"
+#include "nsIPrefBranch.h"
+#include "nsIPrefService.h"
+#include "nsIMimeConverter.h"
+#include "nsMsgUtils.h"
+#include "nsMsgI18N.h"
+#include "nsMsgMimeCID.h"
+#include "nsILineInputStream.h"
+#include "nsMimeTypes.h"
+#include "nsISaveAsCharset.h"
+#include "nsStringGlue.h"
+#include "prmem.h"
+#include "plstr.h"
+#include "nsUTF8Utils.h"
+#include "nsNetUtil.h"
+#include "nsCRTGlue.h"
+#include "nsComponentManagerUtils.h"
+#include "nsUnicharUtils.h"
+#include "nsIFileStreams.h"
+//
+// International functions necessary for composition
+//
+
+/**
+ * Convert a UTF-16 string into a byte string in the given charset.
+ *
+ * @param aCharset             Name of the target charset. Compared
+ *                             case-insensitively against "us-ascii",
+ *                             "ISO-8859-1" and "UTF-8" for the fast paths.
+ * @param inString             UTF-16 text to convert.
+ * @param outString            Receives the converted bytes; it is truncated
+ *                             first, so previous content is discarded.
+ * @param aIsCharsetCanonical  If true the encoder is obtained with
+ *                             GetUnicodeEncoderRaw(), otherwise with
+ *                             GetUnicodeEncoder().
+ * @param aReportUencNoMapping If true the encoder is put into
+ *                             kOnError_Signal mode and the ASCII/Latin-1
+ *                             fast path is skipped; otherwise unmappable
+ *                             characters are replaced with '?'.
+ * @return NS_OK for the fast paths; a failure code if the converter service
+ *         or encoder cannot be obtained or configured; otherwise the result
+ *         of encoder->Finish(), except that NS_ERROR_UENC_NOMAPPING is
+ *         returned when Finish() succeeded and any Convert() call reported
+ *         NS_ERROR_UENC_NOMAPPING.
+ *
+ * NOTE(review): a failing result from Convert() ends the loop but is then
+ * overwritten by the result of Finish(), so it does not reach the caller
+ * unless Finish() fails too -- confirm this is intended.
+ * NOTE(review): aCharset is dereferenced without a null check in the
+ * ASCII/Latin-1 branch -- callers presumably never pass null; verify.
+ */
+nsresult nsMsgI18NConvertFromUnicode(const char* aCharset,
+ const nsString& inString,
+ nsACString& outString,
+ bool aIsCharsetCanonical,
+ bool aReportUencNoMapping)
+{
+ // Nothing to convert: hand back an empty string.
+ if (inString.IsEmpty()) {
+ outString.Truncate();
+ return NS_OK;
+ }
+ // Note: This will hide a possible error if the Unicode contains more than one
+ // charset, e.g. Latin1 + Japanese.
+ else if (!aReportUencNoMapping && (!*aCharset ||
+ !PL_strcasecmp(aCharset, "us-ascii") ||
+ !PL_strcasecmp(aCharset, "ISO-8859-1"))) {
+ LossyCopyUTF16toASCII(inString, outString);
+ return NS_OK;
+ }
+ // UTF-8 is produced directly, without going through an encoder object.
+ else if (!PL_strcasecmp(aCharset, "UTF-8")) {
+ CopyUTF16toUTF8(inString, outString);
+ return NS_OK;
+ }
+
+ nsresult rv;
+ nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
+ NS_ENSURE_SUCCESS(rv, rv);
+ nsCOMPtr <nsIUnicodeEncoder> encoder;
+
+ // get an unicode converter
+ if (aIsCharsetCanonical) // optimize for modified UTF-7 used by IMAP
+ rv = ccm->GetUnicodeEncoderRaw(aCharset, getter_AddRefs(encoder));
+ else
+ rv = ccm->GetUnicodeEncoder(aCharset, getter_AddRefs(encoder));
+ NS_ENSURE_SUCCESS(rv, rv);
+ // Must set behavior to kOnError_Signal if we want to receive the
+ // NS_ERROR_UENC_NOMAPPING signal, should it occur.
+ int32_t behavior = aReportUencNoMapping ? nsIUnicodeEncoder::kOnError_Signal:
+ nsIUnicodeEncoder::kOnError_Replace;
+ rv = encoder->SetOutputErrorBehavior(behavior, nullptr, '?');
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ const char16_t *originalSrcPtr = inString.get();
+ const char16_t *currentSrcPtr = originalSrcPtr;
+ int32_t originalUnicharLength = inString.Length();
+ int32_t srcLength;
+ int32_t dstLength;
+ char localbuf[512+10]; // We have seen cases where the buffer was overrun
+ // by two (!!) bytes (Bug 1255863).
+ // So give it ten bytes more for now to avoid a crash.
+ int32_t consumedLen = 0;
+
+ // Remembers whether any chunk reported an unmappable character.
+ bool mappingFailure = false;
+ outString.Truncate();
+ // convert
+ // The input is encoded chunk by chunk: each pass offers the encoder all
+ // remaining input and 512 bytes of output space, then advances by however
+ // many UTF-16 units the encoder reports it consumed.
+ while (consumedLen < originalUnicharLength) {
+ srcLength = originalUnicharLength - consumedLen;
+ dstLength = 512;
+ rv = encoder->Convert(currentSrcPtr, &srcLength, localbuf, &dstLength);
+#ifdef DEBUG
+ // Debug builds warn when the encoder reports more output than it was
+ // offered (see the note on localbuf above).
+ if (dstLength > 512) {
+ char warning[100];
+ sprintf(warning, "encoder->Convert() returned %d bytes. Limit = 512", dstLength);
+ NS_WARNING(warning);
+ }
+#endif
+ if (rv == NS_ERROR_UENC_NOMAPPING) {
+ mappingFailure = true;
+ }
+ // Stop on a failure code, or when no output was produced for this chunk.
+ if (NS_FAILED(rv) || dstLength == 0)
+ break;
+ outString.Append(localbuf, dstLength);
+
+ currentSrcPtr += srcLength;
+ consumedLen = currentSrcPtr - originalSrcPtr; // src length used so far
+ }
+ dstLength = 512; // Reset available buffer size.
+ // Let the encoder emit any final bytes; this replaces rv from the loop.
+ rv = encoder->Finish(localbuf, &dstLength);
+ if (NS_SUCCEEDED(rv)) {
+ if (dstLength)
+ outString.Append(localbuf, dstLength);
+ return !mappingFailure ? rv: NS_ERROR_UENC_NOMAPPING;
+ }
+ return rv;
+}
+
+/**
+ * Convert a byte string in the given charset into UTF-16.
+ *
+ * @param aCharset            Name of the source charset. A null or empty
+ *                            name, "us-ascii" and "ISO-8859-1" (compared
+ *                            case-insensitively) take the direct
+ *                            CopyASCIItoUTF16() path; "UTF-8" is validated
+ *                            with MsgIsUTF8() and copied directly.
+ * @param inString            Bytes to convert.
+ * @param outString           Receives the UTF-16 text; previous content is
+ *                            discarded.
+ * @param aIsCharsetCanonical If true the decoder is obtained with
+ *                            GetUnicodeDecoderRaw(), otherwise with
+ *                            GetUnicodeDecoderInternal().
+ * @return NS_OK on the direct paths; NS_ERROR_UNEXPECTED if the charset is
+ *         UTF-8 but the input is not valid UTF-8; a failure code if the
+ *         converter service or decoder cannot be obtained; otherwise the
+ *         result of the last decoder->Convert() call.
+ */
+nsresult nsMsgI18NConvertToUnicode(const char* aCharset,
+                                   const nsCString& inString,
+                                   nsAString& outString,
+                                   bool aIsCharsetCanonical)
+{
+  if (inString.IsEmpty()) {
+    outString.Truncate();
+    return NS_OK;
+  }
+  // A null charset is treated like an empty one instead of being
+  // dereferenced, matching the null/empty check done by
+  // nsMsgI18Ncheck_data_in_charset_range().
+  else if (!aCharset || !*aCharset ||
+           !PL_strcasecmp(aCharset, "us-ascii") ||
+           !PL_strcasecmp(aCharset, "ISO-8859-1")) {
+    // Despite its name, it also works for Latin-1.
+    CopyASCIItoUTF16(inString, outString);
+    return NS_OK;
+  }
+  else if (!PL_strcasecmp(aCharset, "UTF-8")) {
+    if (MsgIsUTF8(inString)) {
+      nsAutoString tmp;
+      CopyUTF8toUTF16(inString, tmp);
+      // Drop a leading U+FEFF (byte order mark) from the result.
+      if (!tmp.IsEmpty() && tmp.First() == char16_t(0xFEFF))
+        tmp.Cut(0, 1);
+      outString.Assign(tmp);
+      return NS_OK;
+    }
+    NS_WARNING("Invalid UTF-8 string");
+    return NS_ERROR_UNEXPECTED;
+  }
+
+  nsresult rv;
+  nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  nsCOMPtr <nsIUnicodeDecoder> decoder;
+
+  // get an unicode converter
+  if (aIsCharsetCanonical)  // optimize for modified UTF-7 used by IMAP
+    rv = ccm->GetUnicodeDecoderRaw(aCharset, getter_AddRefs(decoder));
+  else
+    rv = ccm->GetUnicodeDecoderInternal(aCharset, getter_AddRefs(decoder));
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  const char *originalSrcPtr = inString.get();
+  const char *currentSrcPtr = originalSrcPtr;
+  int32_t originalLength = inString.Length();
+  int32_t srcLength;
+  int32_t dstLength;
+  char16_t localbuf[512];
+  int32_t consumedLen = 0;
+
+  outString.Truncate();
+
+  // convert
+  // Each pass offers the decoder all remaining input and room for 512
+  // UTF-16 units, then advances by the number of bytes it reports consumed.
+  while (consumedLen < originalLength) {
+    srcLength = originalLength - consumedLen;
+    dstLength = 512;
+    rv = decoder->Convert(currentSrcPtr, &srcLength, localbuf, &dstLength);
+    // Stop on a failure code, or when no output was produced for this chunk.
+    if (NS_FAILED(rv) || dstLength == 0)
+      break;
+    outString.Append(localbuf, dstLength);
+
+    currentSrcPtr += srcLength;
+    consumedLen = currentSrcPtr - originalSrcPtr; // src length used so far
+  }
+  return rv;
+}
+
+/**
+ * Charset used by the file system.
+ *
+ * The name is requested from nsIPlatformCharset
+ * (kPlatformCharsetSel_FileName) whenever the cached value is empty, and is
+ * kept in a function-local static string. "ISO-8859-1" is stored instead if
+ * the service or the lookup fails.
+ *
+ * @return Pointer to the characters of that static string.
+ */
+const char * nsMsgI18NFileSystemCharset()
+{
+  static nsAutoCString sCachedCharset;
+
+  // A previous call already resolved the name.
+  if (!sCachedCharset.IsEmpty())
+    return sCachedCharset.get();
+
+  nsresult rv;
+  nsCOMPtr<nsIPlatformCharset> platform =
+    do_GetService(NS_PLATFORMCHARSET_CONTRACTID, &rv);
+  if (NS_SUCCEEDED(rv))
+    rv = platform->GetCharset(kPlatformCharsetSel_FileName, sCachedCharset);
+
+  // Either the service or the lookup failed: use the fixed default.
+  if (NS_FAILED(rv))
+    sCachedCharset.Assign("ISO-8859-1");
+
+  return sCachedCharset.get();
+}
+
+/**
+ * Charset used by the text file.
+ *
+ * @param aCharset Receives the name reported by nsIPlatformCharset for
+ *                 kPlatformCharsetSel_PlainTextInFile, or "ISO-8859-1" if
+ *                 the service or the lookup fails.
+ */
+void nsMsgI18NTextFileCharset(nsACString& aCharset)
+{
+  nsresult rv;
+  nsCOMPtr<nsIPlatformCharset> platform =
+    do_GetService(NS_PLATFORMCHARSET_CONTRACTID, &rv);
+  if (NS_SUCCEEDED(rv))
+    rv = platform->GetCharset(kPlatformCharsetSel_PlainTextInFile, aCharset);
+
+  if (NS_SUCCEEDED(rv))
+    return;
+
+  // No usable answer from the platform: use the fixed default.
+  aCharset.Assign("ISO-8859-1");
+}
+
+/**
+ * MIME encoder, output string should be freed by PR_FREE.
+ * XXX : fix callers later to avoid allocation and copy
+ *
+ * @param header       Header value; it is converted with
+ *                     NS_ConvertUTF8toUTF16 on the non-MIME path and passed
+ *                     to EncodeMimePartIIStr_UTF8 on the MIME path.
+ * @param structured   Forwarded to the MIME converter.
+ * @param charset      Target charset for the non-MIME path only; the MIME
+ *                     path always passes "UTF-8" to the converter.
+ * @param fieldnamelen Forwarded to the MIME converter.
+ * @param usemime      If false, no MIME encoding is done and the header is
+ *                     just converted to |charset|.
+ * @return A PL_strdup'ed string. On the non-MIME path this is the converted
+ *         header, or a copy of |header| if conversion fails. On the MIME
+ *         path it is the encoded header, or nullptr if the converter
+ *         service or the encoding fails.
+ */
+char * nsMsgI18NEncodeMimePartIIStr(const char *header, bool structured, const char *charset, int32_t fieldnamelen, bool usemime)
+{
+  // No MIME, convert to the outgoing mail charset.
+  if (!usemime) {
+    nsAutoCString converted;
+    nsresult convRv = ConvertFromUnicode(charset, NS_ConvertUTF8toUTF16(header),
+                                         converted);
+    return PL_strdup(NS_SUCCEEDED(convRv) ? converted.get() : header);
+  }
+
+  nsAutoCString encoded;
+  nsresult rv;
+  nsCOMPtr<nsIMimeConverter> mimeConverter =
+    do_GetService(NS_MIME_CONVERTER_CONTRACTID, &rv);
+  if (NS_SUCCEEDED(rv) && mimeConverter) {
+    rv = mimeConverter->EncodeMimePartIIStr_UTF8(
+      nsDependentCString(header), structured, "UTF-8", fieldnamelen,
+      nsIMimeConverter::MIME_ENCODED_WORD_SIZE, encoded);
+  }
+
+  if (NS_FAILED(rv))
+    return nullptr;
+
+  return PL_strdup(encoded.get());
+}
+
+/**
+ * Return true if a charset is stateful (e.g. JIS).
+ *
+ * Only "ISO-2022-JP" (compared case-insensitively) is recognised.
+ */
+bool nsMsgI18Nstateful_charset(const char *charset)
+{
+  //TODO: use charset manager's service
+  const char* const kStatefulName = "ISO-2022-JP";
+  return !PL_strcasecmp(charset, kStatefulName);
+}
+
+/**
+ * Report whether the charset converter manager marks a charset as multibyte.
+ *
+ * @param charset Charset name to look up.
+ * @return true only if the ".isMultibyte" charset data could be fetched and
+ *         equals "true" (ignoring case); false otherwise, including when the
+ *         converter manager service is unavailable.
+ */
+bool nsMsgI18Nmultibyte_charset(const char *charset)
+{
+  nsresult rv;
+  nsCOMPtr<nsICharsetConverterManager> converterManager =
+    do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
+  if (NS_FAILED(rv))
+    return false;
+
+  nsAutoString multibyteFlag;
+  rv = converterManager->GetCharsetData(charset, u".isMultibyte", multibyteFlag);
+  return NS_SUCCEEDED(rv) && multibyteFlag.LowerCaseEqualsLiteral("true");
+}
+
+/**
+ * Check whether a UTF-16 string can be encoded in the given charset.
+ *
+ * @param charset         Charset to test against. A null or empty name makes
+ *                        the function return true immediately.
+ * @param inString        Null-terminated UTF-16 text to test. A null or empty
+ *                        string makes the function return true immediately.
+ * @param fallbackCharset Optional. If non-null and the text does not fit
+ *                        |charset|, the charset name it points to is tried
+ *                        as well. It is only read here, never written.
+ * @return false if the encoder reported NS_ERROR_UENC_NOMAPPING for
+ *         |charset| and no fallback charset was given or the fallback
+ *         conversion did not succeed cleanly; true otherwise. The result
+ *         stays true when the converter service or the encoder cannot be
+ *         obtained, or when conversion stops for any other reason.
+ */
+bool nsMsgI18Ncheck_data_in_charset_range(const char *charset, const char16_t* inString, char **fallbackCharset)
+{
+ if (!charset || !*charset || !inString || !*inString)
+ return true;
+
+ nsresult res;
+ bool result = true;
+
+ nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &res);
+
+ if (NS_SUCCEEDED(res)) {
+ nsCOMPtr <nsIUnicodeEncoder> encoder;
+
+ // get an unicode converter
+ res = ccm->GetUnicodeEncoderRaw(charset, getter_AddRefs(encoder));
+ if(NS_SUCCEEDED(res)) {
+ const char16_t *originalPtr = inString;
+ int32_t originalLen = NS_strlen(inString);
+ const char16_t *currentSrcPtr = originalPtr;
+ // Scratch output buffer; the encoded bytes are never used, only the
+ // result code of each Convert() call matters.
+ char localBuff[512];
+ int32_t consumedLen = 0;
+ int32_t srcLen;
+ int32_t dstLength;
+
+ // convert from unicode
+ while (consumedLen < originalLen) {
+ srcLen = originalLen - consumedLen;
+ dstLength = 512;
+ res = encoder->Convert(currentSrcPtr, &srcLen, localBuff, &dstLength);
+ // An unmappable character means the text is outside the charset.
+ if (NS_ERROR_UENC_NOMAPPING == res) {
+ result = false;
+ break;
+ }
+ // Any other failure, or no output at all, ends the scan with the
+ // current (still true) result.
+ else if (NS_FAILED(res) || (0 == dstLength))
+ break;
+
+ currentSrcPtr += srcLen;
+ consumedLen = currentSrcPtr - originalPtr; // src length used so far
+ }
+ }
+ }
+
+ // if the conversion was not successful then try fallback to other charsets
+ if (!result && fallbackCharset) {
+ nsCString convertedString;
+ // The last argument (true) asks for unmappable characters to be reported.
+ res = nsMsgI18NConvertFromUnicode(*fallbackCharset,
+ nsDependentString(inString), convertedString, false, true);
+ // NOTE(review): the separate NS_ERROR_UENC_NOMAPPING test suggests that
+ // code can pass NS_SUCCEEDED() -- confirm against its definition.
+ result = (NS_SUCCEEDED(res) && NS_ERROR_UENC_NOMAPPING != res);
+ }
+
+ return result;
+}
+
+/**
+ * Simple parser to parse META charset.
+ * It only supports the case when the description is within one line.
+ *
+ * The file is read line by line until a line containing "/HEAD" is seen or
+ * reading stops. Each line is upper-cased before it is examined, so matching
+ * is case-insensitive and the returned name is upper-case. The first line
+ * containing "META", "HTTP-EQUIV", "CONTENT-TYPE" and "CHARSET" supplies the
+ * value: the token after the '=' that follows "CHARSET", delimited by
+ * spaces or quote characters.
+ *
+ * @param file File to scan. A null pointer or a directory yields an empty
+ *             result.
+ * @return Pointer to a function-local static buffer holding the charset
+ *         name, or an empty string if none was found, the file could not be
+ *         opened, or the name starts with "UTF-16" or "UTF-32". The buffer
+ *         is overwritten by the next call.
+ */
+const char *
+nsMsgI18NParseMetaCharset(nsIFile* file)
+{
+  static char charset[nsIMimeConverter::MAX_CHARSET_NAME_LENGTH+1];
+
+  *charset = '\0';
+
+  // Nothing to read from.
+  if (!file)
+    return charset;
+
+  bool isDirectory = false;
+  file->IsDirectory(&isDirectory);
+  if (isDirectory) {
+    NS_ERROR("file is a directory");
+    return charset;
+  }
+
+  nsresult rv;
+  nsCOMPtr <nsIFileInputStream> fileStream = do_CreateInstance(NS_LOCALFILEINPUTSTREAM_CONTRACTID, &rv);
+  NS_ENSURE_SUCCESS(rv, charset);
+
+  // Check the open result explicitly; it used to be overwritten by the
+  // QueryInterface result below, which hid open failures.
+  rv = fileStream->Init(file, PR_RDONLY, 0664, false);
+  NS_ENSURE_SUCCESS(rv, charset);
+
+  nsCOMPtr <nsILineInputStream> lineStream = do_QueryInterface(fileStream, &rv);
+  NS_ENSURE_SUCCESS(rv, charset);
+
+  nsCString curLine;
+  bool more = true;
+  while (NS_SUCCEEDED(rv) && more) {
+    rv = lineStream->ReadLine(curLine, &more);
+    if (curLine.IsEmpty())
+      continue;
+
+    ToUpperCase(curLine);
+
+    // The META declaration is only looked for up to the end of the head.
+    if (curLine.Find("/HEAD") != -1)
+      break;
+
+    if (curLine.Find("META") != -1 &&
+        curLine.Find("HTTP-EQUIV") != -1 &&
+        curLine.Find("CONTENT-TYPE") != -1 &&
+        curLine.Find("CHARSET") != -1) {
+      // "CHARSET" is known to be present here, so PL_strstr cannot return
+      // null; look for the '=' that follows it.
+      char *cp = (char *) PL_strchr(PL_strstr(curLine.get(), "CHARSET"), '=');
+      char *token = nullptr;
+      if (cp)
+      {
+        char *newStr = cp + 1;
+        token = NS_strtok(" \"\'", &newStr);
+      }
+      if (token) {
+        PL_strncpy(charset, token, sizeof(charset));
+        charset[sizeof(charset)-1] = '\0';
+
+        // this function cannot parse a file if it is really
+        // encoded by one of the following charsets
+        // so we can say that the charset label must be incorrect for
+        // the .html if we actually see those charsets parsed
+        // and we should ignore them
+        if (!PL_strncasecmp("UTF-16", charset, sizeof("UTF-16")-1) ||
+            !PL_strncasecmp("UTF-32", charset, sizeof("UTF-32")-1))
+          charset[0] = '\0';
+
+        break;
+      }
+    }
+  }
+
+  return charset;
+}
+
+/**
+ * Shorten a UTF-8 string without cutting a character in half.
+ *
+ * @param inString   UTF-8 input (asserted with MsgIsUTF8 when it needs
+ *                   shortening).
+ * @param aMaxLength Byte limit. Input shorter than this is copied unchanged.
+ * @param outString  Receives the result. For input of aMaxLength bytes or
+ *                   more, this is the prefix ending where the last character
+ *                   examined begins, so it is always shorter than
+ *                   aMaxLength bytes. It is emptied if no character was
+ *                   examined or a decoding error was reported.
+ * @return Always NS_OK.
+ */
+nsresult nsMsgI18NShrinkUTF8Str(const nsCString &inString,
+                                uint32_t aMaxLength,
+                                nsACString &outString)
+{
+  if (inString.IsEmpty()) {
+    outString.Truncate();
+    return NS_OK;
+  }
+  if (inString.Length() < aMaxLength) {
+    outString.Assign(inString);
+    return NS_OK;
+  }
+  NS_ASSERTION(MsgIsUTF8(inString), "Invalid UTF-8 string is inputted");
+
+  const char* const begin = inString.get();
+  const char* const inputEnd = begin + inString.Length();
+  const char* const limit = begin + aMaxLength;
+
+  // Walk character by character, remembering where the most recently
+  // examined character starts.
+  const char* pos = begin;
+  const char* lastCharStart = nullptr;
+  bool decodeError = false;
+  while (pos < limit) {
+    lastCharStart = pos;
+    const uint32_t ch = UTF8CharEnumerator::NextChar(&pos, inputEnd, &decodeError);
+    if (ch == 0 || decodeError)
+      break;
+  }
+
+  if (lastCharStart && !decodeError) {
+    const uint32_t keptLength = lastCharStart - begin;
+    outString.Assign(Substring(inString, 0, keptLength));
+  } else {
+    outString.Truncate();
+  }
+  return NS_OK;
+}
+
+/**
+ * Turn raw bytes into UTF-16, always producing some output.
+ *
+ * Three attempts are made in order:
+ *  1. if MsgIsUTF8() accepts the input, it is copied as UTF-8;
+ *  2. otherwise it is converted from |charset| with ConvertToUnicode();
+ *  3. if that fails, the output is rebuilt byte by byte: bytes with the
+ *     high bit set become UCS2_REPLACEMENT_CHAR, all others are appended
+ *     unchanged.
+ *
+ * @param inString  Raw input bytes.
+ * @param charset   Charset used for the second attempt.
+ * @param outString Receives the UTF-16 result.
+ */
+void nsMsgI18NConvertRawBytesToUTF16(const nsCString& inString,
+                                     const char* charset,
+                                     nsAString& outString)
+{
+  if (MsgIsUTF8(inString)) {
+    CopyUTF8toUTF16(inString, outString);
+    return;
+  }
+
+  if (NS_SUCCEEDED(ConvertToUnicode(charset, inString, outString)))
+    return;
+
+  // Last resort: keep the 7-bit bytes, replace everything else.
+  const char* p = inString.BeginReading();
+  const char* const stop = inString.EndReading();
+  outString.Truncate();
+  for (; p < stop; ++p) {
+    const char byte = *p;
+    if (byte & char(0x80))
+      outString.Append(UCS2_REPLACEMENT_CHAR);
+    else
+      outString.Append(byte);
+  }
+}
+
+/**
+ * Turn raw bytes into UTF-8, always producing some output.
+ *
+ * Three attempts are made in order:
+ *  1. if MsgIsUTF8() accepts the input, it is assigned unchanged;
+ *  2. otherwise it is converted from |charset| to UTF-16 with
+ *     ConvertToUnicode() and then to UTF-8;
+ *  3. if that fails, the output is rebuilt byte by byte: bytes with the
+ *     high bit set become the UTF-8 encoding of U+FFFD, all others are
+ *     appended unchanged.
+ *
+ * @param inString  Raw input bytes.
+ * @param charset   Charset used for the second attempt.
+ * @param outString Receives the UTF-8 result.
+ */
+void nsMsgI18NConvertRawBytesToUTF8(const nsCString& inString,
+                                    const char* charset,
+                                    nsACString& outString)
+{
+  if (MsgIsUTF8(inString)) {
+    outString.Assign(inString);
+    return;
+  }
+
+  nsAutoString asUtf16;
+  if (NS_SUCCEEDED(ConvertToUnicode(charset, inString, asUtf16))) {
+    CopyUTF16toUTF8(asUtf16, outString);
+    return;
+  }
+
+  // EF BF BD (UTF-8 encoding of U+FFFD)
+  NS_NAMED_LITERAL_CSTRING(utf8ReplacementChar, "\357\277\275");
+  // The input range is taken before the output is cleared, as the two
+  // parameters have compatible types and could refer to the same string.
+  const char* p = inString.BeginReading();
+  const char* const stop = inString.EndReading();
+  outString.Truncate();
+  for (; p < stop; ++p) {
+    const char byte = *p;
+    if (byte & char(0x80))
+      outString.Append(utf8ReplacementChar);
+    else
+      outString.Append(byte);
+  }
+}