summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/makeconv/makeconv.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/tools/makeconv/makeconv.cpp')
-rw-r--r--intl/icu/source/tools/makeconv/makeconv.cpp850
1 files changed, 850 insertions, 0 deletions
diff --git a/intl/icu/source/tools/makeconv/makeconv.cpp b/intl/icu/source/tools/makeconv/makeconv.cpp
new file mode 100644
index 000000000..6ca3e613b
--- /dev/null
+++ b/intl/icu/source/tools/makeconv/makeconv.cpp
@@ -0,0 +1,850 @@
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+ ********************************************************************************
+ *
+ * Copyright (C) 1998-2015, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ ********************************************************************************
+ *
+ *
+ * makeconv.cpp:
+ * tool creating a binary (compressed) representation of the conversion mapping
+ * table (IBM NLTC ucmap format).
+ *
+ * 05/04/2000 helena Added fallback mapping into the picture...
+ * 06/29/2000 helena Major rewrite of the callback APIs.
+ */
+
+#include <stdio.h>
+#include "unicode/putil.h"
+#include "unicode/ucnv_err.h"
+#include "charstr.h"
+#include "ucnv_bld.h"
+#include "ucnv_imp.h"
+#include "ucnv_cnv.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "uinvchar.h"
+#include "filestrm.h"
+#include "toolutil.h"
+#include "uoptions.h"
+#include "unicode/udata.h"
+#include "unewdata.h"
+#include "uparse.h"
+#include "ucm.h"
+#include "makeconv.h"
+#include "genmbcs.h"
+
+#define DEBUG 0
+
+typedef struct ConvData {
+ UCMFile *ucm;
+ NewConverter *cnvData, *extData;
+ UConverterSharedData sharedData;
+ UConverterStaticData staticData;
+} ConvData;
+
+static void
+initConvData(ConvData *data) {
+ uprv_memset(data, 0, sizeof(ConvData));
+ data->sharedData.structSize=sizeof(UConverterSharedData);
+ data->staticData.structSize=sizeof(UConverterStaticData);
+ data->sharedData.staticData=&data->staticData;
+}
+
+static void
+cleanupConvData(ConvData *data) {
+ if(data!=NULL) {
+ if(data->cnvData!=NULL) {
+ data->cnvData->close(data->cnvData);
+ data->cnvData=NULL;
+ }
+ if(data->extData!=NULL) {
+ data->extData->close(data->extData);
+ data->extData=NULL;
+ }
+ ucm_close(data->ucm);
+ data->ucm=NULL;
+ }
+}
+
+/*
+ * from ucnvstat.c - static prototypes of data-based converters
+ */
+U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
+
+/*
+ * Global - verbosity
+ */
+UBool VERBOSE = FALSE;
+UBool QUIET = FALSE;
+UBool SMALL = FALSE;
+UBool IGNORE_SISO_CHECK = FALSE;
+
+static void
+createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
+
+/*
+ * Set up the UNewData and write the converter..
+ */
+static void
+writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
+
+UBool haveCopyright=TRUE;
+
+static UDataInfo dataInfo={
+ sizeof(UDataInfo),
+ 0,
+
+ U_IS_BIG_ENDIAN,
+ U_CHARSET_FAMILY,
+ sizeof(UChar),
+ 0,
+
+ {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */
+ {6, 2, 0, 0}, /* formatVersion */
+ {0, 0, 0, 0} /* dataVersion (calculated at runtime) */
+};
+
+static void
+writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
+{
+ UNewDataMemory *mem = NULL;
+ uint32_t sz2;
+ uint32_t size = 0;
+ int32_t tableType;
+
+ if(U_FAILURE(*status))
+ {
+ return;
+ }
+
+ tableType=TABLE_NONE;
+ if(data->cnvData!=NULL) {
+ tableType|=TABLE_BASE;
+ }
+ if(data->extData!=NULL) {
+ tableType|=TABLE_EXT;
+ }
+
+ mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
+
+ if(U_FAILURE(*status))
+ {
+ fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
+ cnvName,
+ "cnv",
+ u_errorName(*status));
+ return;
+ }
+
+ if(VERBOSE)
+ {
+ printf("- Opened udata %s.%s\n", cnvName, "cnv");
+ }
+
+
+ /* all read only, clean, platform independent data. Mmmm. :) */
+ udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
+ size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */
+ /* Now, write the table */
+ if(tableType&TABLE_BASE) {
+ size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
+ }
+ if(tableType&TABLE_EXT) {
+ size += data->extData->write(data->extData, &data->staticData, mem, tableType);
+ }
+
+ sz2 = udata_finish(mem, status);
+ if(size != sz2)
+ {
+ fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
+ *status=U_INTERNAL_PROGRAM_ERROR;
+ }
+ if(VERBOSE)
+ {
+ printf("- Wrote %u bytes to the udata.\n", (int)sz2);
+ }
+}
+
+enum {
+ OPT_HELP_H,
+ OPT_HELP_QUESTION_MARK,
+ OPT_COPYRIGHT,
+ OPT_VERSION,
+ OPT_DESTDIR,
+ OPT_VERBOSE,
+ OPT_SMALL,
+ OPT_IGNORE_SISO_CHECK,
+ OPT_QUIET,
+
+ OPT_COUNT
+};
+
+static UOption options[]={
+ UOPTION_HELP_H,
+ UOPTION_HELP_QUESTION_MARK,
+ UOPTION_COPYRIGHT,
+ UOPTION_VERSION,
+ UOPTION_DESTDIR,
+ UOPTION_VERBOSE,
+ { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
+ { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
+ UOPTION_QUIET,
+};
+
+int main(int argc, char* argv[])
+{
+ ConvData data;
+ char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
+
+ U_MAIN_INIT_ARGS(argc, argv);
+
+ /* Set up the ICU version number */
+ UVersionInfo icuVersion;
+ u_getVersion(icuVersion);
+ uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
+
+ /* preset then read command line options */
+ options[OPT_DESTDIR].value=u_getDataDirectory();
+ argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
+
+ /* error handling, printing usage message */
+ if(argc<0) {
+ fprintf(stderr,
+ "error in command line argument \"%s\"\n",
+ argv[-argc]);
+ } else if(argc<2) {
+ argc=-1;
+ }
+ if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
+ FILE *stdfile=argc<0 ? stderr : stdout;
+ fprintf(stdfile,
+ "usage: %s [-options] files...\n"
+ "\tread .ucm codepage mapping files and write .cnv files\n"
+ "options:\n"
+ "\t-h or -? or --help this usage text\n"
+ "\t-V or --version show a version message\n"
+ "\t-c or --copyright include a copyright notice\n"
+ "\t-d or --destdir destination directory, followed by the path\n"
+ "\t-v or --verbose Turn on verbose output\n"
+ "\t-q or --quiet do not display warnings and progress\n",
+ argv[0]);
+ fprintf(stdfile,
+ "\t --small Generate smaller .cnv files. They will be\n"
+ "\t significantly smaller but may not be compatible with\n"
+ "\t older versions of ICU and will require heap memory\n"
+ "\t allocation when loaded.\n"
+ "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n");
+ return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
+ }
+
+ if(options[OPT_VERSION].doesOccur) {
+ printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
+ dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
+ printf("%s\n", U_COPYRIGHT_STRING);
+ exit(0);
+ }
+
+ /* get the options values */
+ haveCopyright = options[OPT_COPYRIGHT].doesOccur;
+ const char *destdir = options[OPT_DESTDIR].value;
+ VERBOSE = options[OPT_VERBOSE].doesOccur;
+ QUIET = options[OPT_QUIET].doesOccur;
+ SMALL = options[OPT_SMALL].doesOccur;
+
+ if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
+ IGNORE_SISO_CHECK = TRUE;
+ }
+
+ icu::CharString outFileName;
+ UErrorCode err = U_ZERO_ERROR;
+ if (destdir != NULL && *destdir != 0) {
+ outFileName.append(destdir, err).ensureEndsWithFileSeparator(err);
+ if (U_FAILURE(err)) {
+ return err;
+ }
+ }
+ int32_t outBasenameStart = outFileName.length();
+
+#if DEBUG
+ {
+ int i;
+ printf("makeconv: processing %d files...\n", argc - 1);
+ for(i=1; i<argc; ++i) {
+ printf("%s ", argv[i]);
+ }
+ printf("\n");
+ fflush(stdout);
+ }
+#endif
+
+ UBool printFilename = (UBool) (argc > 2 || VERBOSE);
+ for (++argv; --argc; ++argv)
+ {
+ UErrorCode localError = U_ZERO_ERROR;
+ const char *arg = getLongPathname(*argv);
+
+ /*produces the right destination path for display*/
+ outFileName.truncate(outBasenameStart);
+ if (outBasenameStart != 0)
+ {
+ /* find the last file sepator */
+ const char *basename = findBasename(arg);
+ outFileName.append(basename, localError);
+ }
+ else
+ {
+ outFileName.append(arg, localError);
+ }
+ if (U_FAILURE(localError)) {
+ return localError;
+ }
+
+ /*removes the extension if any is found*/
+ int32_t lastDotIndex = outFileName.lastIndexOf('.');
+ if (lastDotIndex >= outBasenameStart) {
+ outFileName.truncate(lastDotIndex);
+ }
+
+ /* the basename without extension is the converter name */
+ if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) {
+ fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart);
+ return U_BUFFER_OVERFLOW_ERROR;
+ }
+ uprv_strcpy(cnvName, outFileName.data() + outBasenameStart);
+
+ /*Adds the target extension*/
+ outFileName.append(CONVERTER_FILE_EXTENSION, localError);
+ if (U_FAILURE(localError)) {
+ return localError;
+ }
+
+#if DEBUG
+ printf("makeconv: processing %s ...\n", arg);
+ fflush(stdout);
+#endif
+ initConvData(&data);
+ createConverter(&data, arg, &localError);
+
+ if (U_FAILURE(localError))
+ {
+ /* if an error is found, print out an error msg and keep going */
+ fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n",
+ outFileName.data(), arg, u_errorName(localError));
+ if(U_SUCCESS(err)) {
+ err = localError;
+ }
+ }
+ else
+ {
+ /* Insure the static data name matches the file name */
+ /* Changed to ignore directory and only compare base name
+ LDH 1/2/08*/
+ char *p;
+ p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
+
+ if(p == NULL) /* OK, try alternate */
+ {
+ p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
+ if(p == NULL)
+ {
+ p=cnvName; /* If no separators, no problem */
+ }
+ }
+ else
+ {
+ p++; /* If found separator, don't include it in compare */
+ }
+ if(uprv_stricmp(p,data.staticData.name) && !QUIET)
+ {
+ fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
+ cnvName, CONVERTER_FILE_EXTENSION,
+ data.staticData.name);
+ }
+
+ uprv_strcpy((char*)data.staticData.name, cnvName);
+
+ if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
+ fprintf(stderr,
+ "Error: A converter name must contain only invariant characters.\n"
+ "%s is not a valid converter name.\n",
+ data.staticData.name);
+ if(U_SUCCESS(err)) {
+ err = U_INVALID_TABLE_FORMAT;
+ }
+ }
+
+ localError = U_ZERO_ERROR;
+ writeConverterData(&data, cnvName, destdir, &localError);
+
+ if(U_FAILURE(localError))
+ {
+ /* if an error is found, print out an error msg and keep going*/
+ fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg,
+ u_errorName(localError));
+ if(U_SUCCESS(err)) {
+ err = localError;
+ }
+ }
+ else if (printFilename)
+ {
+ puts(outFileName.data() + outBasenameStart);
+ }
+ }
+ fflush(stdout);
+ fflush(stderr);
+
+ cleanupConvData(&data);
+ }
+
+ return err;
+}
+
+static void
+getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
+ if( (name[0]=='i' || name[0]=='I') &&
+ (name[1]=='b' || name[1]=='B') &&
+ (name[2]=='m' || name[2]=='M')
+ ) {
+ name+=3;
+ if(*name=='-') {
+ ++name;
+ }
+ *pPlatform=UCNV_IBM;
+ *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10);
+ } else {
+ *pPlatform=UCNV_UNKNOWN;
+ *pCCSID=0;
+ }
+}
+
+static void
+readHeader(ConvData *data,
+ FileStream* convFile,
+ UErrorCode *pErrorCode) {
+ char line[1024];
+ char *s, *key, *value;
+ const UConverterStaticData *prototype;
+ UConverterStaticData *staticData;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
+ staticData=&data->staticData;
+ staticData->platform=UCNV_IBM;
+ staticData->subCharLen=0;
+
+ while(T_FileStream_readLine(convFile, line, sizeof(line))) {
+ /* basic parsing and handling of state-related items */
+ if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
+ continue;
+ }
+
+ /* stop at the beginning of the mapping section */
+ if(uprv_strcmp(line, "CHARMAP")==0) {
+ break;
+ }
+
+ /* collect the information from the header field, ignore unknown keys */
+ if(uprv_strcmp(key, "code_set_name")==0) {
+ if(*value!=0) {
+ uprv_strcpy((char *)staticData->name, value);
+ getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
+ }
+ } else if(uprv_strcmp(key, "subchar")==0) {
+ uint8_t bytes[UCNV_EXT_MAX_BYTES];
+ int8_t length;
+
+ s=value;
+ length=ucm_parseBytes(bytes, line, (const char **)&s);
+ if(1<=length && length<=4 && *s==0) {
+ staticData->subCharLen=length;
+ uprv_memcpy(staticData->subChar, bytes, length);
+ } else {
+ fprintf(stderr, "error: illegal <subchar> %s\n", value);
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ return;
+ }
+ } else if(uprv_strcmp(key, "subchar1")==0) {
+ uint8_t bytes[UCNV_EXT_MAX_BYTES];
+
+ s=value;
+ if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
+ staticData->subChar1=bytes[0];
+ } else {
+ fprintf(stderr, "error: illegal <subchar1> %s\n", value);
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ return;
+ }
+ }
+ }
+
+ /* copy values from the UCMFile to the static data */
+ staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
+ staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
+ staticData->conversionType=data->ucm->states.conversionType;
+
+ if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
+ fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ return;
+ }
+
+ /*
+ * Now that we know the type, copy any 'default' values from the table.
+ * We need not check the type any further because the parser only
+ * recognizes what we have prototypes for.
+ *
+ * For delta (extension-only) tables, copy values from the base file
+ * instead, see createConverter().
+ */
+ if(data->ucm->baseName[0]==0) {
+ prototype=ucnv_converterStaticData[staticData->conversionType];
+ if(prototype!=NULL) {
+ if(staticData->name[0]==0) {
+ uprv_strcpy((char *)staticData->name, prototype->name);
+ }
+
+ if(staticData->codepage==0) {
+ staticData->codepage=prototype->codepage;
+ }
+
+ if(staticData->platform==0) {
+ staticData->platform=prototype->platform;
+ }
+
+ if(staticData->minBytesPerChar==0) {
+ staticData->minBytesPerChar=prototype->minBytesPerChar;
+ }
+
+ if(staticData->maxBytesPerChar==0) {
+ staticData->maxBytesPerChar=prototype->maxBytesPerChar;
+ }
+
+ if(staticData->subCharLen==0) {
+ staticData->subCharLen=prototype->subCharLen;
+ if(prototype->subCharLen>0) {
+ uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
+ }
+ }
+ }
+ }
+
+ if(data->ucm->states.outputType<0) {
+ data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
+ }
+
+ if( staticData->subChar1!=0 &&
+ (staticData->minBytesPerChar>1 ||
+ (staticData->conversionType!=UCNV_MBCS &&
+ staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
+ ) {
+ fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ }
+}
+
+/* return TRUE if a base table was read, FALSE for an extension table */
+static UBool
+readFile(ConvData *data, const char* converterName,
+ UErrorCode *pErrorCode) {
+ char line[1024];
+ char *end;
+ FileStream *convFile;
+
+ UCMStates *baseStates;
+ UBool dataIsBase;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return FALSE;
+ }
+
+ data->ucm=ucm_open();
+
+ convFile=T_FileStream_open(converterName, "r");
+ if(convFile==NULL) {
+ *pErrorCode=U_FILE_ACCESS_ERROR;
+ return FALSE;
+ }
+
+ readHeader(data, convFile, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return FALSE;
+ }
+
+ if(data->ucm->baseName[0]==0) {
+ dataIsBase=TRUE;
+ baseStates=&data->ucm->states;
+ ucm_processStates(baseStates, IGNORE_SISO_CHECK);
+ } else {
+ dataIsBase=FALSE;
+ baseStates=NULL;
+ }
+
+ /* read the base table */
+ ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return FALSE;
+ }
+
+ /* read an extension table if there is one */
+ while(T_FileStream_readLine(convFile, line, sizeof(line))) {
+ end=uprv_strchr(line, 0);
+ while(line<end &&
+ (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
+ --end;
+ }
+ *end=0;
+
+ if(line[0]=='#' || u_skipWhitespace(line)==end) {
+ continue; /* ignore empty and comment lines */
+ }
+
+ if(0==uprv_strcmp(line, "CHARMAP")) {
+ /* read the extension table */
+ ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
+ } else {
+ fprintf(stderr, "unexpected text after the base mapping table\n");
+ }
+ break;
+ }
+
+ T_FileStream_close(convFile);
+
+ if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
+ fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ }
+
+ return dataIsBase;
+}
+
+static void
+createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
+ ConvData baseData;
+ UBool dataIsBase;
+
+ UConverterStaticData *staticData;
+ UCMStates *states, *baseStates;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
+ initConvData(data);
+
+ dataIsBase=readFile(data, converterName, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
+ staticData=&data->staticData;
+ states=&data->ucm->states;
+
+ if(dataIsBase) {
+ /*
+ * Build a normal .cnv file with a base table
+ * and an optional extension table.
+ */
+ data->cnvData=MBCSOpen(data->ucm);
+ if(data->cnvData==NULL) {
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+
+ } else if(!data->cnvData->isValid(data->cnvData,
+ staticData->subChar, staticData->subCharLen)
+ ) {
+ fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+
+ } else if(staticData->subChar1!=0 &&
+ !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
+ ) {
+ fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+
+ } else if(
+ data->ucm->ext->mappingsLength>0 &&
+ !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
+ ) {
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
+ /* sort the table so that it can be turned into UTF-8-friendly data */
+ ucm_sortTable(data->ucm->base);
+ }
+
+ if(U_SUCCESS(*pErrorCode)) {
+ if(
+ /* add the base table after ucm_checkBaseExt()! */
+ !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
+ ) {
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ } else {
+ /*
+ * addTable() may have requested moving more mappings to the extension table
+ * if they fit into the base toUnicode table but not into the
+ * base fromUnicode table.
+ * (Especially for UTF-8-friendly fromUnicode tables.)
+ * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
+ * to be excluded from the extension toUnicode data.
+ * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
+ * the base fromUnicode table.
+ */
+ ucm_moveMappings(data->ucm->base, data->ucm->ext);
+ ucm_sortTable(data->ucm->ext);
+ if(data->ucm->ext->mappingsLength>0) {
+ /* prepare the extension table, if there is one */
+ data->extData=CnvExtOpen(data->ucm);
+ if(data->extData==NULL) {
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ } else if(
+ !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
+ ) {
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ }
+ }
+ }
+ }
+ } else {
+ /* Build an extension-only .cnv file. */
+ char baseFilename[500];
+ char *basename;
+
+ initConvData(&baseData);
+
+ /* assemble a path/filename for data->ucm->baseName */
+ uprv_strcpy(baseFilename, converterName);
+ basename=(char *)findBasename(baseFilename);
+ uprv_strcpy(basename, data->ucm->baseName);
+ uprv_strcat(basename, ".ucm");
+
+ /* read the base table */
+ dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ } else if(!dataIsBase) {
+ fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ } else {
+ /* prepare the extension table */
+ data->extData=CnvExtOpen(data->ucm);
+ if(data->extData==NULL) {
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ } else {
+ /* fill in gaps in extension file header fields */
+ UCMapping *m, *mLimit;
+ uint8_t fallbackFlags;
+
+ baseStates=&baseData.ucm->states;
+ if(states->conversionType==UCNV_DBCS) {
+ staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
+ } else if(states->minCharLength==0) {
+ staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
+ }
+ if(states->maxCharLength<states->minCharLength) {
+ staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
+ }
+
+ if(staticData->subCharLen==0) {
+ uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
+ staticData->subCharLen=baseData.staticData.subCharLen;
+ }
+ /*
+ * do not copy subChar1 -
+ * only use what is explicitly specified
+ * because it cannot be unset in the extension file header
+ */
+
+ /* get the fallback flags */
+ fallbackFlags=0;
+ for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
+ m<mLimit && fallbackFlags!=3;
+ ++m
+ ) {
+ if(m->f==1) {
+ fallbackFlags|=1;
+ } else if(m->f==3) {
+ fallbackFlags|=2;
+ }
+ }
+
+ if(fallbackFlags&1) {
+ staticData->hasFromUnicodeFallback=TRUE;
+ }
+ if(fallbackFlags&2) {
+ staticData->hasToUnicodeFallback=TRUE;
+ }
+
+ if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
+ fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+
+ } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
+ fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+
+ } else if(
+ !ucm_checkValidity(data->ucm->ext, baseStates) ||
+ !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
+ ) {
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ } else {
+ if(states->maxCharLength>1) {
+ /*
+ * When building a normal .cnv file with a base table
+ * for an MBCS (not SBCS) table with explicit precision flags,
+ * the MBCSAddTable() function marks some mappings for moving
+ * to the extension table.
+ * They fit into the base toUnicode table but not into the
+ * base fromUnicode table.
+ * (Note: We do have explicit precision flags because they are
+ * required for extension table generation, and
+ * ucm_checkBaseExt() verified it.)
+ *
+ * We do not call MBCSAddTable() here (we probably could)
+ * so we need to do the analysis before building the extension table.
+ * We assume that MBCSAddTable() will build a UTF-8-friendly table.
+ * Redundant mappings in the extension table are ok except they cost some size.
+ *
+ * Do this after ucm_checkBaseExt().
+ */
+ const MBCSData *mbcsData=MBCSGetDummy();
+ int32_t needsMove=0;
+ for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
+ m<mLimit;
+ ++m
+ ) {
+ if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
+ m->f|=MBCS_FROM_U_EXT_FLAG;
+ m->moveFlag=UCM_MOVE_TO_EXT;
+ ++needsMove;
+ }
+ }
+
+ if(needsMove!=0) {
+ ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
+ ucm_sortTable(data->ucm->ext);
+ }
+ }
+ if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ }
+ }
+ }
+ }
+
+ cleanupConvData(&baseData);
+ }
+}
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */