From 5f8de423f190bbb79a62f804151bc24824fa32d8 Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Fri, 2 Feb 2018 04:16:08 -0500 Subject: Add m-esr52 at 52.6.0 --- intl/icu/source/tools/makeconv/Makefile.in | 96 ++ intl/icu/source/tools/makeconv/gencnvex.c | 1078 ++++++++++++++ intl/icu/source/tools/makeconv/genmbcs.cpp | 1566 ++++++++++++++++++++ intl/icu/source/tools/makeconv/genmbcs.h | 126 ++ intl/icu/source/tools/makeconv/makeconv.1.in | 114 ++ intl/icu/source/tools/makeconv/makeconv.cpp | 850 +++++++++++ intl/icu/source/tools/makeconv/makeconv.h | 61 + intl/icu/source/tools/makeconv/makeconv.vcxproj | 267 ++++ .../source/tools/makeconv/makeconv.vcxproj.filters | 39 + intl/icu/source/tools/makeconv/ucnvstat.c | 69 + 10 files changed, 4266 insertions(+) create mode 100644 intl/icu/source/tools/makeconv/Makefile.in create mode 100644 intl/icu/source/tools/makeconv/gencnvex.c create mode 100644 intl/icu/source/tools/makeconv/genmbcs.cpp create mode 100644 intl/icu/source/tools/makeconv/genmbcs.h create mode 100644 intl/icu/source/tools/makeconv/makeconv.1.in create mode 100644 intl/icu/source/tools/makeconv/makeconv.cpp create mode 100644 intl/icu/source/tools/makeconv/makeconv.h create mode 100644 intl/icu/source/tools/makeconv/makeconv.vcxproj create mode 100644 intl/icu/source/tools/makeconv/makeconv.vcxproj.filters create mode 100644 intl/icu/source/tools/makeconv/ucnvstat.c (limited to 'intl/icu/source/tools/makeconv') diff --git a/intl/icu/source/tools/makeconv/Makefile.in b/intl/icu/source/tools/makeconv/Makefile.in new file mode 100644 index 000000000..35d92fe6d --- /dev/null +++ b/intl/icu/source/tools/makeconv/Makefile.in @@ -0,0 +1,96 @@ +## Makefile.in for ICU - tools/makeconv +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Stephen F. 
Booth + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/makeconv + +TARGET_STUB_NAME = makeconv + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +OBJECTS = makeconv.o ucnvstat.o genmbcs.o gencnvex.o + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/makeconv/gencnvex.c b/intl/icu/source/tools/makeconv/gencnvex.c new file mode 100644 index 000000000..124d3d91f --- /dev/null +++ b/intl/icu/source/tools/makeconv/gencnvex.c @@ -0,0 +1,1078 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gencnvex.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003oct12 +* created by: Markus W. 
Scherer +*/ + +#include +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "cstring.h" +#include "cmemory.h" +#include "ucnv_cnv.h" +#include "ucnvmbcs.h" +#include "toolutil.h" +#include "unewdata.h" +#include "ucm.h" +#include "makeconv.h" +#include "genmbcs.h" + +static void +CnvExtClose(NewConverter *cnvData); + +static UBool +CnvExtIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length); + +static UBool +CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); + +static uint32_t +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); + +typedef struct CnvExtData { + NewConverter newConverter; + + UCMFile *ucm; + + /* toUnicode (state table in ucm->states) */ + UToolMemory *toUTable, *toUUChars; + + /* fromUnicode */ + UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes; + + uint16_t stage1[MBCS_STAGE_1_SIZE]; + uint16_t stage2[MBCS_STAGE_2_SIZE]; + uint16_t stage3[0x10000< |2 mappings */ + uint16_t stage3Sub1Block; + + /* statistics */ + int32_t + maxInBytes, maxOutBytes, maxBytesPerUChar, + maxInUChars, maxOutUChars, maxUCharsPerByte; +} CnvExtData; + +NewConverter * +CnvExtOpen(UCMFile *ucm) { + CnvExtData *extData; + + extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData)); + if(extData==NULL) { + printf("out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + uprv_memset(extData, 0, sizeof(CnvExtData)); + + extData->ucm=ucm; /* aliased, not owned */ + + extData->newConverter.close=CnvExtClose; + extData->newConverter.isValid=CnvExtIsValid; + extData->newConverter.addTable=CnvExtAddTable; + extData->newConverter.write=CnvExtWrite; + return &extData->newConverter; +} + +static void +CnvExtClose(NewConverter *cnvData) { + CnvExtData *extData=(CnvExtData *)cnvData; + if(extData!=NULL) { + utm_close(extData->toUTable); + utm_close(extData->toUUChars); + utm_close(extData->fromUTableUChars); + 
utm_close(extData->fromUTableValues); + utm_close(extData->fromUBytes); + uprv_free(extData); + } +} + +/* we do not expect this to be called */ +static UBool +CnvExtIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length) { + return FALSE; +} + +static uint32_t +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType) { + CnvExtData *extData=(CnvExtData *)cnvData; + int32_t length, top, headerSize; + + int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 }; + + if(tableType&TABLE_BASE) { + headerSize=0; + } else { + _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 }; + + /* write the header and base table name for an extension-only table */ + length=(int32_t)uprv_strlen(extData->ucm->baseName)+1; + while(length&3) { + /* add padding */ + extData->ucm->baseName[length++]=0; + } + + headerSize=MBCS_HEADER_V4_LENGTH*4+length; + + /* fill the header */ + header.version[0]=4; + header.version[1]=2; + header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY); + + /* write the header and the base table name */ + udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4); + udata_writeBlock(pData, extData->ucm->baseName, length); + } + + /* fill indexes[] - offsets/indexes are in units of the target array */ + top=0; + + indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH; + top+=length*4; + + indexes[UCNV_EXT_TO_U_INDEX]=top; + indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable); + top+=length*4; + + indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top; + indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars); + top+=length*2; + + indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top; + length=utm_countItems(extData->fromUTableUChars); + top+=length*2; + + if(top&3) { + /* add padding */ + *((UChar *)utm_alloc(extData->fromUTableUChars))=0; + *((uint32_t *)utm_alloc(extData->fromUTableValues))=0; + ++length; + top+=2; + } + 
indexes[UCNV_EXT_FROM_U_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top; + top+=length*4; + + indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top; + length=utm_countItems(extData->fromUBytes); + top+=length; + + if(top&1) { + /* add padding */ + *((uint8_t *)utm_alloc(extData->fromUBytes))=0; + ++length; + ++top; + } + indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top; + indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top; + indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top; + top+=length*2; + + indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top; + length=extData->stage3Top; + top+=length*2; + + if(top&3) { + /* add padding */ + extData->stage3[extData->stage3Top++]=0; + ++length; + top+=2; + } + indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top; + indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop; + top+=length*4; + + indexes[UCNV_EXT_SIZE]=top; + + /* statistics */ + indexes[UCNV_EXT_COUNT_BYTES]= + (extData->maxInBytes<<16)| + (extData->maxOutBytes<<8)| + extData->maxBytesPerUChar; + indexes[UCNV_EXT_COUNT_UCHARS]= + (extData->maxInUChars<<16)| + (extData->maxOutUChars<<8)| + extData->maxUCharsPerByte; + + indexes[UCNV_EXT_FLAGS]=extData->ucm->ext->unicodeMask; + + /* write the extension data */ + udata_writeBlock(pData, indexes, sizeof(indexes)); + udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4); + udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2); + + udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2); + udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4); + udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]); + + udata_writeBlock(pData, extData->stage1, extData->stage1Top*2); + 
udata_writeBlock(pData, extData->stage2, extData->stage2Top*2); + udata_writeBlock(pData, extData->stage3, extData->stage3Top*2); + udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4); + +#if 0 + { + int32_t i, j; + + length=extData->stage1Top; + printf("\nstage1[%x]:\n", length); + + for(i=0; istage1[i]!=length) { + printf("stage1[%04x]=%04x\n", i, extData->stage1[i]); + } + } + + j=length; + length=extData->stage2Top; + printf("\nstage2[%x]:\n", length); + + for(i=0; istage2[i]!=0) { + printf("stage12[%04x]=%04x\n", j, extData->stage2[i]); + } + } + + length=extData->stage3Top; + printf("\nstage3[%x]:\n", length); + + for(i=0; istage3[i]!=0) { + printf("stage3[%04x]=%04x\n", i, extData->stage3[i]); + } + } + + length=extData->stage3bTop; + printf("\nstage3b[%x]:\n", length); + + for(i=0; istage3b[i]!=0) { + printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]); + } + } + } +#endif + + if(VERBOSE) { + printf("size of extension data: %ld\n", (long)top); + } + + /* return the number of bytes that should have been written */ + return (uint32_t)(headerSize+top); +} + +/* to Unicode --------------------------------------------------------------- */ + +/* + * Remove fromUnicode fallbacks and SUB mappings which are irrelevant for + * the toUnicode table. + * This includes mappings with MBCS_FROM_U_EXT_FLAG which were suitable + * for the base toUnicode table but not for the base fromUnicode table. + * The table must be sorted. + * Modifies previous data in the reverseMap. 
+ */ +static int32_t +reduceToUMappings(UCMTable *table) { + UCMapping *mappings; + int32_t *map; + int32_t i, j, count; + int8_t flag; + + mappings=table->mappings; + map=table->reverseMap; + count=table->mappingsLength; + + /* leave the map alone for the initial mappings with desired flags */ + for(i=j=0; iuLen==1) { + u16Length=U16_LENGTH(m->u); + value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u); + } else { + /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */ + + /* get the result code point string and its 16-bit string length */ + u32=UCM_GET_CODE_POINTS(table, m); + errorCode=U_ZERO_ERROR; + u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode); + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { + exit(errorCode); + } + + /* allocate it and put its length and index into the value */ + value= + (((uint32_t)u16Length+UCNV_EXT_TO_U_LENGTH_OFFSET)<toUUChars)); + u=utm_allocN(extData->toUUChars, u16Length); + + /* write the result 16-bit string */ + errorCode=U_ZERO_ERROR; + u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode); + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { + exit(errorCode); + } + } + if(m->f==0) { + value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG; + } + + /* update statistics */ + if(m->bLen>extData->maxInBytes) { + extData->maxInBytes=m->bLen; + } + if(u16Length>extData->maxOutUChars) { + extData->maxOutUChars=u16Length; + } + + ratio=(u16Length+(m->bLen-1))/m->bLen; + if(ratio>extData->maxUCharsPerByte) { + extData->maxUCharsPerByte=ratio; + } + + return value; +} + +/* + * Recursive toUTable generator core function. 
+ * Preconditions: + * - start0: if there is one mapping with an input unit sequence of unitIndex+1 + * then defaultValue=compute the mapping result for this whole sequence + * else defaultValue=0 + * + * recurse into the subsection + */ +static UBool +generateToUTable(CnvExtData *extData, UCMTable *table, + int32_t start, int32_t limit, int32_t unitIndex, + uint32_t defaultValue) { + UCMapping *mappings, *m; + int32_t *map; + int32_t i, j, uniqueCount, count, subStart, subLimit; + + uint8_t *bytes; + int32_t low, high, prev; + + uint32_t *section; + + mappings=table->mappings; + map=table->reverseMap; + + /* step 1: examine the input units; set low, high, uniqueCount */ + m=mappings+map[start]; + bytes=UCM_GET_BYTES(table, m); + low=bytes[unitIndex]; + uniqueCount=1; + + prev=high=low; + for(i=start+1; i=(3*count)/4)) { + /* + * for the root table and for fairly full tables: + * allocate for direct, linear array access + * by keeping count, to write an entry for each unit value + * from low to high + * exception: use a compact table if count==0x100 because + * that cannot be encoded in the length byte + */ + } else { + count=uniqueCount; + } + + if(count>=0x100) { + fprintf(stderr, "error: toUnicode extension table section overflow: %ld section entries\n", (long)count); + return FALSE; + } + + /* allocate the section: 1 entry for the header + count for the items */ + section=(uint32_t *)utm_allocN(extData->toUTable, 1+count); + + /* write the section header */ + *section++=((uint32_t)count<uniqueCount) { + /* write empty subsections for unused units in a linear table */ + while(++prevbLen==unitIndex+1) { + /* do not include this in generateToUTable() */ + ++subStart; + + if(subStarttoUTable); + + /* recurse */ + if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { + return FALSE; + } + } + } + return TRUE; +} + +/* + * Generate the toUTable and toUUChars from the input table. 
+ * The input table must be sorted, and all precision flags must be 0..3. + * This function will modify the table's reverseMap. + */ +static UBool +makeToUTable(CnvExtData *extData, UCMTable *table) { + int32_t toUCount; + + toUCount=reduceToUMappings(table); + + extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4); + extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2); + + return generateToUTable(extData, table, 0, toUCount, 0, 0); +} + +/* from Unicode ------------------------------------------------------------- */ + +/* + * preprocessing: + * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode + * change each Unicode string to encode all but the first code point in 16-bit form + * + * generation: + * for each unique code point + * write an entry in the 3-stage trie + * check that there is only one single-code point sequence + * start recursion for following 16-bit input units + */ + +/* + * Remove toUnicode fallbacks and non- SUB mappings + * which are irrelevant for the fromUnicode extension table. + * Remove MBCS_FROM_U_EXT_FLAG bits. + * Overwrite the reverseMap with an index array to the relevant mappings. + * Modify the code point sequences to a generator-friendly format where + * the first code points remains unchanged but the following are recoded + * into 16-bit Unicode string form. + * The table must be sorted. + * Destroys previous data in the reverseMap. 
+ */ +static int32_t +prepareFromUMappings(UCMTable *table) { + UCMapping *mappings, *m; + int32_t *map; + int32_t i, j, count; + int8_t flag; + + mappings=table->mappings; + map=table->reverseMap; + count=table->mappingsLength; + + /* + * we do not go through the map on input because the mappings are + * sorted lexically + */ + m=mappings; + + for(i=j=0; if; + if(flag>=0) { + flag&=MBCS_FROM_U_EXT_MASK; + m->f=flag; + } + if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) { + map[j++]=i; + + if(m->uLen>1) { + /* recode all but the first code point to 16-bit Unicode */ + UChar32 *u32; + UChar *u; + UChar32 c; + int32_t q, r; + + u32=UCM_GET_CODE_POINTS(table, m); + u=(UChar *)u32; /* destructive in-place recoding */ + for(r=2, q=1; quLen; ++q) { + c=u32[q]; + U16_APPEND_UNSAFE(u, r, c); + } + + /* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */ + m->uLen=(int8_t)r; + } + } + } + + return j; +} + +static uint32_t +getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { + uint8_t *bytes, *resultBytes; + uint32_t value; + int32_t u16Length, ratio; + + if(m->f==2) { + /* + * no mapping, preferred + * + * no need to count in statistics because the subchars are already + * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData, + * and this non-mapping does not count for maxInUChars which are always + * trivially at least two if counting unmappable supplementary code points + */ + return UCNV_EXT_FROM_U_SUBCHAR1; + } + + bytes=UCM_GET_BYTES(table, m); + value=0; + switch(m->bLen) { + /* 1..3: store the bytes in the value word */ + case 3: + value=((uint32_t)*bytes++)<<16; + case 2: + value|=((uint32_t)*bytes++)<<8; + case 1: + value|=*bytes; + break; + default: + /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */ + /* store the bytes in fromUBytes[] and the index in the value word */ + value=(uint32_t)utm_countItems(extData->fromUBytes); + resultBytes=utm_allocN(extData->fromUBytes, 
m->bLen); + uprv_memcpy(resultBytes, bytes, m->bLen); + break; + } + value|=(uint32_t)m->bLen<f==0) { + value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG; + } else if(m->f==4) { + value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG; + } + + /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */ + if(m->uLen==1) { + u16Length=U16_LENGTH(m->u); + } else { + u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2); + } + + /* update statistics */ + if(u16Length>extData->maxInUChars) { + extData->maxInUChars=u16Length; + } + if(m->bLen>extData->maxOutBytes) { + extData->maxOutBytes=m->bLen; + } + + ratio=(m->bLen+(u16Length-1))/u16Length; + if(ratio>extData->maxBytesPerUChar) { + extData->maxBytesPerUChar=ratio; + } + + return value; +} + +/* + * works like generateToUTable(), except that the + * output section consists of two arrays, one for input UChars and one + * for result values + * + * also, fromUTable sections are always stored in a compact form for + * access via binary search + */ +static UBool +generateFromUTable(CnvExtData *extData, UCMTable *table, + int32_t start, int32_t limit, int32_t unitIndex, + uint32_t defaultValue) { + UCMapping *mappings, *m; + int32_t *map; + int32_t i, j, uniqueCount, count, subStart, subLimit; + + UChar *uchars; + UChar32 low, high, prev; + + UChar *sectionUChars; + uint32_t *sectionValues; + + mappings=table->mappings; + map=table->reverseMap; + + /* step 1: examine the input units; set low, high, uniqueCount */ + m=mappings+map[start]; + uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); + low=uchars[unitIndex]; + uniqueCount=1; + + prev=high=low; + for(i=start+1; ifromUTableUChars, 1+count); + sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count); + + /* write the section header */ + *sectionUChars++=(UChar)count; + *sectionValues++=defaultValue; + + /* step 3: write temporary section table with subsection starts */ + prev=low-1; /* just before low to prevent empty subsections before low */ + j=0; 
/* section table index */ + for(i=start; iuLen==unitIndex+1) { + /* do not include this in generateToUTable() */ + ++subStart; + + if(subStartfromUTableValues); + + /* recurse */ + if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { + return FALSE; + } + } + } + return TRUE; +} + +/* + * add entries to the fromUnicode trie, + * assume to be called with code points in ascending order + * and use that to build the trie in precompacted form + */ +static void +addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) { + int32_t i1, i2, i3, i3b, nextOffset, min, newBlock; + + if(value==0) { + return; + } + + /* + * compute the index for each stage, + * allocate a stage block if necessary, + * and write the stage value + */ + i1=c>>10; + if(i1>=extData->stage1Top) { + extData->stage1Top=i1+1; + } + + nextOffset=(c>>4)&0x3f; + + if(extData->stage1[i1]==0) { + /* allocate another block in stage 2; overlap with the previous block */ + newBlock=extData->stage2Top; + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(minstage2[newBlock-1]==0) { + --newBlock; + } + + extData->stage1[i1]=(uint16_t)newBlock; + extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE; + if(extData->stage2Top>UPRV_LENGTHOF(extData->stage2)) { + fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", (int)c); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + i2=extData->stage1[i1]+nextOffset; + nextOffset=c&0xf; + + if(extData->stage2[i2]==0) { + /* allocate another block in stage 3; overlap with the previous block */ + newBlock=extData->stage3Top; + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(minstage3[newBlock-1]==0) { + --newBlock; + } + + /* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */ + newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1); + extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT); + + 
extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE; + if(extData->stage3Top>UPRV_LENGTHOF(extData->stage3)) { + fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", (int)c); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + i3=((int32_t)extData->stage2[i2]<stage3[i3]==0 because we get + * code points in strictly ascending order + */ + + if(value==UCNV_EXT_FROM_U_SUBCHAR1) { + /* SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */ + extData->stage3[i3]=1; + + /* + * precompaction is not optimal for |2 mappings because + * stage3 values for them are all the same, unlike for other mappings + * which all have unique values; + * use a simple compaction of reusing a whole block filled with these + * mappings + */ + + /* is the entire block filled with |2 mappings? */ + if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) { + for(min=i3-nextOffset; + minstage3[min]==1; + ++min) {} + + if(min==i3) { + /* the entire block is filled with these mappings */ + if(extData->stage3Sub1Block!=0) { + /* point to the previous such block and remove this block from stage3 */ + extData->stage2[i2]=extData->stage3Sub1Block; + extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE; + uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2); + } else { + /* remember this block's stage2 entry */ + extData->stage3Sub1Block=extData->stage2[i2]; + } + } + } + } else { + if((i3b=extData->stage3bTop++)>=UPRV_LENGTHOF(extData->stage3b)) { + fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", (int)c); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* roundtrip or fallback mapping */ + extData->stage3[i3]=(uint16_t)i3b; + extData->stage3b[i3b]=value; + } +} + +static UBool +generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) { + UCMapping *mappings, *m; + int32_t *map; + uint32_t value; + int32_t subStart, subLimit; + + UChar32 *codePoints; + UChar32 c, next; + + if(mapLength==0) { + return TRUE; + } + + mappings=table->mappings; + 
map=table->reverseMap; + + /* + * iterate over same-initial-code point mappings, + * enter the initial code point into the trie, + * and start a recursion on the corresponding mappings section + * with generateFromUTable() + */ + m=mappings+map[0]; + codePoints=UCM_GET_CODE_POINTS(table, m); + next=codePoints[0]; + subLimit=0; + while(subLimituLen==1) { + /* do not include this in generateFromUTable() */ + ++subStart; + + if(subStartfromUTableValues)); + + /* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */ + if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) { + return FALSE; + } + } + } + return TRUE; +} + +/* + * Generate the fromU data structures from the input table. + * The input table must be sorted, and all precision flags must be 0..3. + * This function will modify the table's reverseMap. + */ +static UBool +makeFromUTable(CnvExtData *extData, UCMTable *table) { + uint16_t *stage1; + int32_t i, stage1Top, fromUCount; + + fromUCount=prepareFromUMappings(table); + + extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2); + extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4); + extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1); + + /* allocate all-unassigned stage blocks */ + extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; + extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED; + + /* + * stage 3b stores only unique values, and in + * index 0: 0 for "no mapping" + * index 1: "no mapping" with preference for rather than + */ + extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1; + extData->stage3bTop=2; + + /* allocate the first entry in the fromUTable because index 0 means "no result" */ + utm_alloc(extData->fromUTableUChars); + utm_alloc(extData->fromUTableValues); + + if(!generateFromUTrie(extData, table, fromUCount)) { + return FALSE; + } + + /* + * offset the 
stage 1 trie entries by stage1Top because they will + * be stored in a single array + */ + stage1=extData->stage1; + stage1Top=extData->stage1Top; + for(i=0; iunicodeMask&UCNV_HAS_SURROGATES) { + fprintf(stderr, "error: contains mappings for surrogate code points\n"); + return FALSE; + } + + staticData->conversionType=UCNV_MBCS; + + extData=(CnvExtData *)cnvData; + + /* + * assume that the table is sorted + * + * call the functions in this order because + * makeToUTable() modifies the original reverseMap, + * makeFromUTable() writes a whole new mapping into reverseMap + */ + return + makeToUTable(extData, table) && + makeFromUTable(extData, table); +} diff --git a/intl/icu/source/tools/makeconv/genmbcs.cpp b/intl/icu/source/tools/makeconv/genmbcs.cpp new file mode 100644 index 000000000..68c9eb3f7 --- /dev/null +++ b/intl/icu/source/tools/makeconv/genmbcs.cpp @@ -0,0 +1,1566 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: genmbcs.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000jul06 +* created by: Markus W. Scherer +*/ + +#include +#include "unicode/utypes.h" +#include "cstring.h" +#include "cmemory.h" +#include "unewdata.h" +#include "ucnv_cnv.h" +#include "ucnvmbcs.h" +#include "ucm.h" +#include "makeconv.h" +#include "genmbcs.h" + +/* + * TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files. + * Reduce tests for maxCharLength. 
+ */ + +struct MBCSData { + NewConverter newConverter; + + UCMFile *ucm; + + /* toUnicode (state table in ucm->states) */ + _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT]; + int32_t countToUFallbacks; + uint16_t *unicodeCodeUnits; + + /* fromUnicode */ + uint16_t stage1[MBCS_STAGE_1_SIZE]; + uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */ + uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */ + uint8_t *fromUBytes; + uint32_t stage2Top, stage3Top; + + /* fromUTF8 */ + uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT]; /* allow for utf8Max=0xffff */ + + /* + * Maximum UTF-8-friendly code point. + * 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100. + * If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff. + */ + uint16_t utf8Max; + + UBool utf8Friendly; + UBool omitFromU; +}; + +/* prototypes */ +static void +MBCSClose(NewConverter *cnvData); + +static UBool +MBCSStartMappings(MBCSData *mbcsData); + +static UBool +MBCSAddToUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag); + +static UBool +MBCSIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length); + +static UBool +MBCSSingleAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag); + +static UBool +MBCSAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag); + +static void +MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData); + +static UBool +MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); + +static uint32_t +MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); + +/* helper ------------------------------------------------------------------- */ + +static inline char +hexDigit(uint8_t digit) { + return digit<=9 ? 
(char)('0'+digit) : (char)('a'-10+digit); +} + +static inline char * +printBytes(char *buffer, const uint8_t *bytes, int32_t length) { + char *s=buffer; + while(length>0) { + *s++=hexDigit((uint8_t)(*bytes>>4)); + *s++=hexDigit((uint8_t)(*bytes&0xf)); + ++bytes; + --length; + } + + *s=0; + return buffer; +} + +/* implementation ----------------------------------------------------------- */ + +static MBCSData gDummy; + +U_CFUNC const MBCSData * +MBCSGetDummy() { + uprv_memset(&gDummy, 0, sizeof(MBCSData)); + + /* + * Set "pessimistic" values which may sometimes move too many + * mappings to the extension table (but never too few). + * These values cause MBCSOkForBaseFromUnicode() to return FALSE for the + * largest set of mappings. + * Assume maxCharLength>1. + */ + gDummy.utf8Friendly=TRUE; + if(SMALL) { + gDummy.utf8Max=0xffff; + gDummy.omitFromU=TRUE; + } else { + gDummy.utf8Max=MBCS_UTF8_MAX; + } + return &gDummy; +} + +static void +MBCSInit(MBCSData *mbcsData, UCMFile *ucm) { + uprv_memset(mbcsData, 0, sizeof(MBCSData)); + + mbcsData->ucm=ucm; /* aliased, not owned */ + + mbcsData->newConverter.close=MBCSClose; + mbcsData->newConverter.isValid=MBCSIsValid; + mbcsData->newConverter.addTable=MBCSAddTable; + mbcsData->newConverter.write=MBCSWrite; +} + +NewConverter * +MBCSOpen(UCMFile *ucm) { + MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData)); + if(mbcsData==NULL) { + printf("out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + MBCSInit(mbcsData, ucm); + return &mbcsData->newConverter; +} + +static void +MBCSDestruct(MBCSData *mbcsData) { + uprv_free(mbcsData->unicodeCodeUnits); + uprv_free(mbcsData->fromUBytes); +} + +static void +MBCSClose(NewConverter *cnvData) { + MBCSData *mbcsData=(MBCSData *)cnvData; + if(mbcsData!=NULL) { + MBCSDestruct(mbcsData); + uprv_free(mbcsData); + } +} + +static UBool +MBCSStartMappings(MBCSData *mbcsData) { + int32_t i, sum, maxCharLength, + stage2NullLength, stage2AllocLength, + stage3NullLength, 
stage3AllocLength; + + /* toUnicode */ + + /* allocate the code unit array and prefill it with "unassigned" values */ + sum=mbcsData->ucm->states.countToUCodeUnits; + if(VERBOSE) { + printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum); + } + + if(sum>0) { + mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); + if(mbcsData->unicodeCodeUnits==NULL) { + fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n", + (long)sum); + return FALSE; + } + for(i=0; iunicodeCodeUnits[i]=0xfffe; + } + } + + /* fromUnicode */ + maxCharLength=mbcsData->ucm->states.maxCharLength; + + /* allocate the codepage mappings and preset the first 16 characters to 0 */ + if(maxCharLength==1) { + /* allocate 64k 16-bit results for single-byte codepages */ + sum=0x20000; + } else { + /* allocate 1M * maxCharLength bytes for at most 1M mappings */ + sum=0x100000*maxCharLength; + } + mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum); + if(mbcsData->fromUBytes==NULL) { + fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum); + return FALSE; + } + uprv_memset(mbcsData->fromUBytes, 0, sum); + + /* + * UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time. + * See ucnvmbcs.h for details. + * + * There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which + * assumes that the initial stage 2/3 blocks are the all-unassigned ones. + * Therefore, we refine the data structure while maintaining this placement + * even though it would be convenient to allocate the ASCII block at the + * beginning of stage 3, for example. + * + * UTF-8-friendly fromUnicode tries work from sorted tables and are built + * pre-compacted, overlapping adjacent stage 2/3 blocks. 
+ * This is necessary because the block allocation and compaction changes + * at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional + * stage table uses direct indexes into stage 3, without a multiplier and + * thus with a smaller reach. + * + * Non-UTF-8-friendly fromUnicode tries work from unsorted tables + * (because implicit precision is used), and are compacted + * in post-processing. + * + * Preallocation for UTF-8-friendly fromUnicode tries: + * + * Stage 3: + * 64-entry all-unassigned first block followed by ASCII (128 entries). + * + * Stage 2: + * 64-entry all-unassigned first block followed by preallocated + * 64-block for ASCII. + */ + + /* Preallocate ASCII as a linear 128-entry stage 3 block. */ + stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE; + stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE; + + stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE; + stage3AllocLength=128; /* ASCII U+0000..U+007f */ + + /* Initialize stage 1 for the preallocated blocks. */ + sum=stage2NullLength; + for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) { + mbcsData->stage1[i]=sum; + sum+=MBCS_STAGE_2_BLOCK_SIZE; + } + mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */ + + /* + * Stage 2 indexes count 16-blocks in stage 3 as follows: + * SBCS: directly, indexes increment by 16 + * MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1 + * MBCS UTF-8: directly, indexes increment by 16 + */ + if(maxCharLength==1) { + sum=stage3NullLength; + for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) { + mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum; + sum+=MBCS_STAGE_3_BLOCK_SIZE; + } + } else { + sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY; + for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) { + mbcsData->stage2[mbcsData->stage1[0]+i]=sum; + sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY; + } + } + + sum=stage3NullLength; + for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) { 
+ mbcsData->stageUTF8[i]=sum; + sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE; + } + + /* + * Allocate a 64-entry all-unassigned first stage 3 block, + * for UTF-8-friendly lookup with a trail byte, + * plus 128 entries for ASCII. + */ + mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */ + + return TRUE; +} + +/* return TRUE for success */ +static UBool +setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) { + int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset); + if(i>=0) { + /* if there is already a fallback for this offset, then overwrite it */ + mbcsData->toUFallbacks[i].codePoint=c; + return TRUE; + } else { + /* if there is no fallback for this offset, then add one */ + i=mbcsData->countToUFallbacks; + if(i>=MBCS_MAX_FALLBACK_COUNT) { + fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c); + return FALSE; + } else { + mbcsData->toUFallbacks[i].offset=offset; + mbcsData->toUFallbacks[i].codePoint=c; + mbcsData->countToUFallbacks=i+1; + return TRUE; + } + } +} + +/* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */ +static int32_t +removeFallback(MBCSData *mbcsData, uint32_t offset) { + int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset); + if(i>=0) { + _MBCSToUFallback *toUFallbacks; + int32_t limit, old; + + toUFallbacks=mbcsData->toUFallbacks; + limit=mbcsData->countToUFallbacks; + old=(int32_t)toUFallbacks[i].codePoint; + + /* copy the last fallback entry here to keep the list contiguous */ + toUFallbacks[i].offset=toUFallbacks[limit-1].offset; + toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint; + mbcsData->countToUFallbacks=limit-1; + return old; + } else { + return -1; + } +} + +/* + * isFallback is almost a boolean: + * 1 (TRUE) this is a fallback mapping + * 0 (FALSE) this is a precise mapping + * -1 the precision of this mapping is 
not specified + */ +static UBool +MBCSAddToUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag) { + char buffer[10]; + uint32_t offset=0; + int32_t i=0, entry, old; + uint8_t state=0; + + if(mbcsData->ucm->states.countStates==0) { + fprintf(stderr, "error: there is no state information!\n"); + return FALSE; + } + + /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ + if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) { + state=1; + } + + /* + * Walk down the state table like in conversion, + * much like getNextUChar(). + * We assume that c<=0x10ffff. + */ + for(i=0;;) { + entry=mbcsData->ucm->states.stateTable[state][bytes[i++]]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + if(i==length) { + fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n", + (short)state, printBytes(buffer, bytes, length), (int)c); + return FALSE; + } + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); + } else { + if(i0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return FALSE; + case MBCS_STATE_CHANGE_ONLY: + fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return FALSE; + case MBCS_STATE_UNASSIGNED: + fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return FALSE; + case MBCS_STATE_FALLBACK_DIRECT_16: + case MBCS_STATE_VALID_DIRECT_16: + case MBCS_STATE_FALLBACK_DIRECT_20: + case MBCS_STATE_VALID_DIRECT_20: + if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { + /* the "direct" action's value is not "valid-direct-16-unassigned" any more */ + if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) { + old=MBCS_ENTRY_FINAL_VALUE(entry); 
+ } else { + old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); + } + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + return FALSE; + } else if(VERBOSE) { + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + } + /* + * Continue after the above warning + * if the precision of the mapping is unspecified. + */ + } + /* reassign the correct action code */ + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0))); + + /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */ + if(c<=0xffff) { + entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c); + } else { + entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000); + } + mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry; + break; + case MBCS_STATE_VALID_16: + /* bits 26..16 are not used, 0 */ + /* bits 15..7 contain the final offset delta to one 16-bit code unit */ + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); + /* check that this byte sequence is still unassigned */ + if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) { + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + return FALSE; + } else if(VERBOSE) { + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + } + } + if(c>=0x10000) { + fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return FALSE; + } + if(flag>0) { + /* assign only if there is no precise mapping */ + if(mbcsData->unicodeCodeUnits[offset]==0xfffe) { + return setFallback(mbcsData, offset, c); + } + } else { 
+ mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } + break; + case MBCS_STATE_VALID_16_PAIR: + /* bits 26..16 are not used, 0 */ + /* bits 15..7 contain the final offset delta to two 16-bit code units */ + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); + /* check that this byte sequence is still unassigned */ + old=mbcsData->unicodeCodeUnits[offset]; + if(old<0xfffe) { + int32_t real; + if(old<0xd800) { + real=old; + } else if(old<=0xdfff) { + real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff); + } else /* old<=0xe001 */ { + real=mbcsData->unicodeCodeUnits[offset+1]; + } + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)real); + return FALSE; + } else if(VERBOSE) { + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)real); + } + } + if(flag>0) { + /* assign only if there is no precise mapping */ + if(old<=0xdbff || old==0xe000) { + /* do nothing */ + } else if(c<=0xffff) { + /* set a BMP fallback code point as a pair with 0xe001 */ + mbcsData->unicodeCodeUnits[offset++]=0xe001; + mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } else { + /* set a fallback surrogate pair with two second surrogates */ + mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10)); + mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff)); + } + } else { + if(c<0xd800) { + /* set a BMP code point */ + mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } else if(c<=0xffff) { + /* set a BMP code point above 0xd800 as a pair with 0xe000 */ + mbcsData->unicodeCodeUnits[offset++]=0xe000; + mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } else { + /* set a surrogate pair */ + mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10)); + mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff)); + } + } + break; + default: + /* reserved, must never occur 
*/ + fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n", + (int)entry, printBytes(buffer, bytes, length), (int)c); + return FALSE; + } + + return TRUE; + } + } +} + +/* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */ +static UBool +MBCSIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length) { + MBCSData *mbcsData=(MBCSData *)cnvData; + + return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length)); +} + +static UBool +MBCSSingleAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t /*length*/, + UChar32 c, + int8_t flag) { + uint16_t *stage3, *p; + uint32_t idx; + uint16_t old; + uint8_t b; + + uint32_t blockSize, newTop, i, nextOffset, newBlock, min; + + /* ignore |2 SUB mappings */ + if(flag==2) { + return TRUE; + } + + /* + * Walk down the triple-stage compact array ("trie") and + * allocate parts as necessary. + * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings. + * We assume that length<=maxCharLength and that c<=0x10ffff. 
+ */ + stage3=(uint16_t *)mbcsData->fromUBytes; + b=*bytes; + + /* inspect stage 1 */ + idx=c>>MBCS_STAGE_1_SHIFT; + if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1); + } else { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK; + } + if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { + /* allocate another block in stage 2 */ + newBlock=mbcsData->stage2Top; + if(mbcsData->utf8Friendly) { + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(minstage2Single[newBlock-1]==0) { + --newBlock; + } + } + newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE; + + if(newTop>MBCS_MAX_STAGE_2_TOP) { + fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b); + return FALSE; + } + + /* + * each stage 2 block contains 64 16-bit words: + * 6 code point bits 9..4 with 1 stage 3 index + */ + mbcsData->stage1[idx]=(uint16_t)newBlock; + mbcsData->stage2Top=newTop; + } + + /* inspect stage 2 */ + idx=mbcsData->stage1[idx]+nextOffset; + if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) { + /* allocate 64-entry blocks for UTF-8-friendly lookup */ + blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE; + nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK; + } else { + blockSize=MBCS_STAGE_3_BLOCK_SIZE; + nextOffset=c&MBCS_STAGE_3_BLOCK_MASK; + } + if(mbcsData->stage2Single[idx]==0) { + /* allocate another block in stage 3 */ + newBlock=mbcsData->stage3Top; + if(mbcsData->utf8Friendly) { + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(minMBCS_STAGE_3_SBCS_SIZE) { + fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b); + return FALSE; + } + /* each block has 16 uint16_t entries */ + i=idx; + while(newBlockstage2Single[i++]=(uint16_t)newBlock; + newBlock+=MBCS_STAGE_3_BLOCK_SIZE; + } + mbcsData->stage3Top=newTop; /* ==newBlock */ + } + + /* write the codepage entry into stage 3 and get the previous entry */ + 
p=stage3+mbcsData->stage2Single[idx]+nextOffset; + old=*p; + if(flag<=0) { + *p=(uint16_t)(0xf00|b); + } else if(IS_PRIVATE_USE(c)) { + *p=(uint16_t)(0xc00|b); + } else { + *p=(uint16_t)(0x800|b); + } + + /* check that this Unicode code point was still unassigned */ + if(old>=0x100) { + if(flag>=0) { + fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", + (int)c, b, old&0xff); + return FALSE; + } else if(VERBOSE) { + fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", + (int)c, b, old&0xff); + } + /* continue after the above warning if the precision of the mapping is unspecified */ + } + + return TRUE; +} + +static UBool +MBCSAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag) { + char buffer[10]; + const uint8_t *pb; + uint8_t *stage3, *p; + uint32_t idx, b, old, stage3Index; + int32_t maxCharLength; + + uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap; + + maxCharLength=mbcsData->ucm->states.maxCharLength; + + if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO && + (!IGNORE_SISO_CHECK && (*bytes==0xe || *bytes==0xf)) + ) { + fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return FALSE; + } + + if(flag==1 && length==1 && *bytes==0) { + fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n", + (int)c, *bytes); + return FALSE; + } + + /* + * Walk down the triple-stage compact array ("trie") and + * allocate parts as necessary. + * Note that the first stage 2 and 3 blocks are reserved for + * all-unassigned mappings. + * We assume that length<=maxCharLength and that c<=0x10ffff. 
+ */ + stage3=mbcsData->fromUBytes; + + /* inspect stage 1 */ + idx=c>>MBCS_STAGE_1_SHIFT; + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1); + } else { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK; + } + if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { + /* allocate another block in stage 2 */ + newBlock=mbcsData->stage2Top; + if(mbcsData->utf8Friendly) { + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(minstage2[newBlock-1]==0) { + --newBlock; + } + } + newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE; + + if(newTop>MBCS_MAX_STAGE_2_TOP) { + fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return FALSE; + } + + /* + * each stage 2 block contains 64 32-bit words: + * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index + */ + i=idx; + while(newBlockstage1[i++]=(uint16_t)newBlock; + newBlock+=MBCS_STAGE_2_BLOCK_SIZE; + } + mbcsData->stage2Top=newTop; /* ==newBlock */ + } + + /* inspect stage 2 */ + idx=mbcsData->stage1[idx]+nextOffset; + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { + /* allocate 64-entry blocks for UTF-8-friendly lookup */ + blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength; + nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK; + } else { + blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength; + nextOffset=c&MBCS_STAGE_3_BLOCK_MASK; + } + if(mbcsData->stage2[idx]==0) { + /* allocate another block in stage 3 */ + newBlock=mbcsData->stage3Top; + if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) { + /* + * Overlap stage 3 blocks only in multiples of 16-entry blocks + * because of the indexing granularity in stage 2. 
+ */ + maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength; + for(overlap=0; + overlapMBCS_STAGE_3_MBCS_SIZE*(uint32_t)maxCharLength) { + fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return FALSE; + } + /* each block has 16*maxCharLength bytes */ + i=idx; + while(newBlockstage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength; + newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength; + } + mbcsData->stage3Top=newTop; /* ==newBlock */ + } + + stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[idx]; + + /* Build an alternate, UTF-8-friendly stage table as well. */ + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { + /* Overflow for uint16_t entries in stageUTF8? */ + if(stage3Index>0xffff) { + /* + * This can occur only if the mapping table is nearly perfectly filled and if + * utf8Max==0xffff. + * (There is no known charset like this. GB 18030 does not map + * surrogate code points and LMBCS does not map 256 PUA code points.) + * + * Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff + * (stage3Index can at most reach exactly MBCS_UTF8_LIMIT) + * because we have a sorted table and there are at most MBCS_UTF8_LIMIT + * mappings with 0<=cutf8Max=0xfeff; + } else { + /* + * The stage 3 block has been assigned for the regular trie. + * Just copy its index into stageUTF8[], without the granularity. 
+ */ + mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index; + } + } + + /* write the codepage bytes into stage 3 and get the previous bytes */ + + /* assemble the bytes into a single integer */ + pb=bytes; + b=0; + switch(length) { + case 4: + b=*pb++; + U_FALLTHROUGH; + case 3: + b=(b<<8)|*pb++; + U_FALLTHROUGH; + case 2: + b=(b<<8)|*pb++; + U_FALLTHROUGH; + case 1: + default: + b=(b<<8)|*pb++; + break; + } + + old=0; + p=stage3+(stage3Index+nextOffset)*maxCharLength; + switch(maxCharLength) { + case 2: + old=*(uint16_t *)p; + *(uint16_t *)p=(uint16_t)b; + break; + case 3: + old=(uint32_t)*p<<16; + *p++=(uint8_t)(b>>16); + old|=(uint32_t)*p<<8; + *p++=(uint8_t)(b>>8); + old|=*p; + *p=(uint8_t)b; + break; + case 4: + old=*(uint32_t *)p; + *(uint32_t *)p=b; + break; + default: + /* will never occur */ + break; + } + + /* check that this Unicode code point was still unassigned */ + if((mbcsData->stage2[idx+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) { + if(flag>=0) { + fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + return FALSE; + } else if(VERBOSE) { + fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + } + /* continue after the above warning if the precision of the mapping is + unspecified */ + } + if(flag<=0) { + /* set the roundtrip flag */ + mbcsData->stage2[idx+(nextOffset>>4)]|=(1UL<<(16+(c&0xf))); + } + + return TRUE; +} + +U_CFUNC UBool +MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, int8_t flag) { + /* + * A 1:1 mapping does not fit into the MBCS base table's fromUnicode table under + * the following conditions: + * + * - a |2 SUB mapping for (no base table data structure for them) + * - a |1 fallback to 0x00 (result value 0, indistinguishable from unmappable entry) + * - a multi-byte 
mapping with leading 0x00 bytes (no explicit length field) + * + * Some of these tests are redundant with ucm_mappingType(). + */ + if( (flag==2 && length==1) || + (flag==1 && bytes[0]==0) || /* testing length==1 would be redundant with the next test */ + (flag<=1 && length>1 && bytes[0]==0) + ) { + return FALSE; + } + + /* + * Additional restrictions for UTF-8-friendly fromUnicode tables, + * for code points up to the maximum optimized one: + * + * - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry) + * - any |1 fallback (no roundtrip flags in the optimized table) + */ + if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) { + return FALSE; + } + + /* + * If we omit the fromUnicode data, we can only store roundtrips there + * because only they are recoverable from the toUnicode data. + * Fallbacks must go into the extension table. + */ + if(mbcsData->omitFromU && flag!=0) { + return FALSE; + } + + /* All other mappings do fit into the base table. */ + return TRUE; +} + +/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */ +static UBool +MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) { + MBCSData *mbcsData; + UCMapping *m; + UChar32 c; + int32_t i, maxCharLength; + int8_t f; + UBool isOK, utf8Friendly; + + staticData->unicodeMask=table->unicodeMask; + if(staticData->unicodeMask==3) { + fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n"); + return FALSE; + } + + staticData->conversionType=UCNV_MBCS; + + mbcsData=(MBCSData *)cnvData; + maxCharLength=mbcsData->ucm->states.maxCharLength; + + /* + * Generation of UTF-8-friendly data requires + * a sorted table, which makeconv generates when explicit precision + * indicators are used. 
+ */ + mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0); + if(utf8Friendly) { + mbcsData->utf8Max=MBCS_UTF8_MAX; + if(SMALL && maxCharLength>1) { + mbcsData->omitFromU=TRUE; + } + } else { + mbcsData->utf8Max=0; + if(SMALL && maxCharLength>1) { + fprintf(stderr, + "makeconv warning: --small not available for .ucm files without |0 etc.\n"); + } + } + + if(!MBCSStartMappings(mbcsData)) { + return FALSE; + } + + staticData->hasFromUnicodeFallback=FALSE; + staticData->hasToUnicodeFallback=FALSE; + + isOK=TRUE; + + m=table->mappings; + for(i=0; imappingsLength; ++m, ++i) { + c=m->u; + f=m->f; + + /* + * Small optimization for --small .cnv files: + * + * If there are fromUnicode mappings above MBCS_UTF8_MAX, + * then the file size will be smaller if we make utf8Max larger + * because the size increase in stageUTF8 will be more than balanced by + * how much less of stage2 needs to be stored. + * + * There is no point in doing this incrementally because stageUTF8 + * uses so much less space per block than stage2, + * so we immediately increase utf8Max to 0xffff. + * + * Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode() + * sets it to that value when stageUTF8 overflows. 
+ */ + if( mbcsData->omitFromU && f<=1 && + mbcsData->utf8Maxutf8Max<0xfeff + ) { + mbcsData->utf8Max=0xffff; + } + + switch(f) { + case -1: + /* there was no precision/fallback indicator */ + /* fall through to set the mappings */ + U_FALLTHROUGH; + case 0: + /* set roundtrip mappings */ + isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + + if(maxCharLength==1) { + isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) { + isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + } + break; + case 1: + /* set only a fallback mapping from Unicode to codepage */ + if(maxCharLength==1) { + staticData->hasFromUnicodeFallback=TRUE; + isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) { + staticData->hasFromUnicodeFallback=TRUE; + isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + } + break; + case 2: + /* ignore |2 SUB mappings, except to move mappings to the extension table */ + if(maxCharLength>1 && m->bLen==1) { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + } + break; + case 3: + /* set only a fallback mapping from codepage to Unicode */ + staticData->hasToUnicodeFallback=TRUE; + isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + break; + case 4: + /* move "good one-way" mappings to the extension table */ + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + break; + default: + /* will not occur because the parser checked it already */ + fprintf(stderr, "error: illegal fallback indicator %d\n", f); + return FALSE; + } + } + + MBCSPostprocess(mbcsData, staticData); + + return isOK; +} + +static UBool +transformEUC(MBCSData *mbcsData) { + uint8_t *p8; + uint32_t i, value, 
oldLength, old3Top; + uint8_t b; + + oldLength=mbcsData->ucm->states.maxCharLength; + if(oldLength<3) { + return FALSE; + } + + old3Top=mbcsData->stage3Top; + + /* careful: 2-byte and 4-byte codes are stored in platform endianness! */ + + /* test if all first bytes are in {0, 0x8e, 0x8f} */ + p8=mbcsData->fromUBytes; + +#if !U_IS_BIG_ENDIAN + if(oldLength==4) { + p8+=3; + } +#endif + + for(i=0; ifromUBytes; + + /* modify outputType and adjust stage3Top */ + mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3); + mbcsData->stage3Top=(old3Top*(oldLength-1))/oldLength; + + /* + * EUC-encode all byte sequences; + * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly, + * p. 161 in chapter 4 "Encoding Methods" + * + * This also must reverse the byte order if the platform is little-endian! + */ + if(oldLength==3) { + uint16_t *q=(uint16_t *)p8; + for(i=0; i>16); + (*q++)=(uint8_t)(value>>8); + (*q++)=(uint8_t)value; + } else if(value<=0x8effffff) { + /* code set 2 */ + (*q++)=(uint8_t)((value>>16)&0x7f); + (*q++)=(uint8_t)(value>>8); + (*q++)=(uint8_t)value; + } else /* first byte is 0x8f */ { + /* code set 3 */ + (*q++)=(uint8_t)(value>>16); + (*q++)=(uint8_t)((value>>8)&0x7f); + (*q++)=(uint8_t)value; + } + } + } + + return TRUE; +} + +/* + * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far + * as possible. Overlapping is done on unassigned head and tail + * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER. + * Stage 1 indexes need to be adjusted accordingly. + * This function is very similar to genprops/store.c/compactStage(). 
+ */ +static void +singleCompactStage2(MBCSData *mbcsData) { + /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */ + uint16_t map[MBCS_STAGE_2_MAX_BLOCKS]; + uint16_t i, start, prevEnd, newStart; + + /* enter the all-unassigned first stage 2 block into the map */ + map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX; + + /* begin with the first block after the all-unassigned one */ + start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED; + while(startstage2Top) { + prevEnd=(uint16_t)(newStart-1); + + /* find the size of the overlap */ + for(i=0; istage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {} + + if(i>0) { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i); + + /* move the non-overlapping indexes to their new positions */ + start+=i; + for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) { + mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++]; + } + } else if(newStart>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart; + for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) { + mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++]; + } + } else /* no overlap && newStart==start */ { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start; + start=newStart+=MBCS_STAGE_2_BLOCK_SIZE; + } + } + + /* adjust stage2Top */ + if(VERBOSE && newStartstage2Top) { + printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n", + (unsigned long)mbcsData->stage2Top, (unsigned long)newStart, + (long)(mbcsData->stage2Top-newStart)*2); + } + mbcsData->stage2Top=newStart; + + /* now adjust stage 1 */ + for(i=0; istage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]; + } +} + +/* Compact stage 3 for SBCS - same algorithm as above. 
*/
/*
 * Works on 16-entry blocks of the uint16_t view of fromUBytes.
 * mbcsData: table under construction; the stage 3 data, stage3Top and
 * stage2Single[] are rewritten in place.
 */
static void
singleCompactStage3(MBCSData *mbcsData) {
    uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes;

    /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
    uint16_t map[0x1000];
    uint16_t i, start, prevEnd, newStart;

    /* enter the all-unassigned first stage 3 block into the map */
    map[0]=0;

    /*
     * begin with the first block after the all-unassigned one;
     * start is the read position (old layout), newStart the write position (compacted layout)
     */
    start=newStart=16;
    while(start<mbcsData->stage3Top) {
        prevEnd=(uint16_t)(newStart-1);

        /*
         * find the size of the overlap:
         * count entries that are 0 both at the head of this block and
         * at the tail of the data written so far
         */
        for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {}

        if(i>0) {
            map[start>>4]=(uint16_t)(newStart-i);

            /* move the non-overlapping indexes to their new positions */
            start+=i;
            for(i=(uint16_t)(16-i); i>0; --i) {
                stage3[newStart++]=stage3[start++];
            }
        } else if(newStart<start) {
            /* no overlap, but earlier blocks were shortened: move the whole block down */
            map[start>>4]=newStart;
            for(i=16; i>0; --i) {
                stage3[newStart++]=stage3[start++];
            }
        } else /* no overlap && newStart==start */ {
            map[start>>4]=start;
            start=newStart+=16;
        }
    }

    /* adjust stage3Top */
    if(VERBOSE && newStart<mbcsData->stage3Top) {
        printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
                (unsigned long)mbcsData->stage3Top, (unsigned long)newStart,
                (long)(mbcsData->stage3Top-newStart)*2);
    }
    mbcsData->stage3Top=newStart;

    /* now adjust stage 2: translate every old block start through the map */
    for(i=0; i<mbcsData->stage2Top; ++i) {
        mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4];
    }
}

/*
 * Compact stage 2 by overlapping adjacent stage 2 blocks as far
 * as possible. Overlapping is done on unassigned head and tail
 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
 * Stage 1 indexes need to be adjusted accordingly.
 * This function is very similar to genprops/store.c/compactStage().
+ */ +static void +compactStage2(MBCSData *mbcsData) { + /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */ + uint16_t map[MBCS_STAGE_2_MAX_BLOCKS]; + uint16_t i, start, prevEnd, newStart; + + /* enter the all-unassigned first stage 2 block into the map */ + map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX; + + /* begin with the first block after the all-unassigned one */ + start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED; + while(startstage2Top) { + prevEnd=(uint16_t)(newStart-1); + + /* find the size of the overlap */ + for(i=0; istage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {} + + if(i>0) { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i); + + /* move the non-overlapping indexes to their new positions */ + start+=i; + for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) { + mbcsData->stage2[newStart++]=mbcsData->stage2[start++]; + } + } else if(newStart>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart; + for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) { + mbcsData->stage2[newStart++]=mbcsData->stage2[start++]; + } + } else /* no overlap && newStart==start */ { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start; + start=newStart+=MBCS_STAGE_2_BLOCK_SIZE; + } + } + + /* adjust stage2Top */ + if(VERBOSE && newStartstage2Top) { + printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n", + (unsigned long)mbcsData->stage2Top, (unsigned long)newStart, + (long)(mbcsData->stage2Top-newStart)*4); + } + mbcsData->stage2Top=newStart; + + /* now adjust stage 1 */ + for(i=0; istage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]; + } +} + +static void +MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData * /*staticData*/) { + UCMStates *states; + int32_t maxCharLength, stage3Width; + + states=&mbcsData->ucm->states; + stage3Width=maxCharLength=states->maxCharLength; + + ucm_optimizeStates(states, + &mbcsData->unicodeCodeUnits, + mbcsData->toUFallbacks, mbcsData->countToUFallbacks, + VERBOSE); + + 
/* try to compact the fromUnicode tables */ + if(transformEUC(mbcsData)) { + --stage3Width; + } + + /* + * UTF-8-friendly tries are built precompacted, to cope with variable + * stage 3 allocation block sizes. + * + * Tables without precision indicators cannot be built that way, + * because if a block was overlapped with a previous one, then a smaller + * code point for the same block would not fit. + * Therefore, such tables are not marked UTF-8-friendly and must be + * compacted after all mappings are entered. + */ + if(!mbcsData->utf8Friendly) { + if(maxCharLength==1) { + singleCompactStage3(mbcsData); + singleCompactStage2(mbcsData); + } else { + compactStage2(mbcsData); + } + } + + if(VERBOSE) { + /*uint32_t c, i1, i2, i2Limit, i3;*/ + + printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n", + maxCharLength==1 ? "16" : "32", + (unsigned long)mbcsData->stage2Top, + (unsigned long)mbcsData->stage2Top); + printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n", + (int)stage3Width, + (unsigned long)mbcsData->stage3Top/stage3Width, + (unsigned long)mbcsData->stage3Top/stage3Width); +#if 0 + c=0; + for(i1=0; i1stage1[i1]; + if(i2==0) { + c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE; + continue; + } + for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2stage2Single[i2]; + } else { + i3=(uint16_t)mbcsData->stage2[i2]; + } + if(i3==0) { + c+=MBCS_STAGE_3_BLOCK_SIZE; + continue; + } + printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n", + (unsigned long)c, + (unsigned long)i1, + (unsigned long)i2, + (unsigned long)i3); + c+=MBCS_STAGE_3_BLOCK_SIZE; + } + } +#endif + } +} + +static uint32_t +MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType) { + MBCSData *mbcsData=(MBCSData *)cnvData; + uint32_t stage2Start, stage2Length; + uint32_t top, stageUTF8Length=0; + int32_t i, stage1Top; + uint32_t headerLength; + + _MBCSHeader header=UCNV_MBCS_HEADER_INITIALIZER; + + 
stage2Length=mbcsData->stage2Top; + if(mbcsData->omitFromU) { + /* find how much of stage2 can be omitted */ + int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1; + uint32_t st2=0; /*initialized it to avoid compiler warnings */ + + i=utf8Limit>>MBCS_STAGE_1_SHIFT; + if((utf8Limit&((1<stage1[i])!=0) { + /* utf8Limit is in the middle of an existing stage 2 block */ + stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK); + } else { + /* find the last stage2 block with mappings before utf8Limit */ + while(i>0 && (st2=mbcsData->stage1[--i])==0) {} + /* stage2 up to the end of this block corresponds to stageUTF8 */ + stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE; + } + header.options|=MBCS_OPT_NO_FROM_U; + header.fullStage2Length=stage2Length; + stage2Length-=stage2Start; + if(VERBOSE) { + printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n", + (unsigned long)stage2Start, + (unsigned long)mbcsData->stage2Top, + (unsigned long)mbcsData->stage3Top); + printf("+ total size savings: %lu bytes\n", (unsigned long)stage2Start*4+mbcsData->stage3Top); + } + } else { + stage2Start=0; + } + + if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { + stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ + } else { + stage1Top=0x40; /* 0x40==64 */ + } + + /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */ + if(mbcsData->ucm->states.maxCharLength==1) { + for(i=0; istage1[i]+=(uint16_t)stage1Top; + } + + /* stage2Top/Length have counted 16-bit results, now we need to count bytes */ + /* also round up to a multiple of 4 bytes */ + stage2Length=(stage2Length*2+1)&~1; + + /* stage3Top has counted 16-bit results, now we need to count bytes */ + mbcsData->stage3Top*=2; + + if(mbcsData->utf8Friendly) { + header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */ + } + } else { + for(i=0; istage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */ + } + + /* stage2Top/Length have 
counted 32-bit results, now we need to count bytes */ + stage2Length*=4; + /* leave stage2Start counting 32-bit units */ + + if(mbcsData->utf8Friendly) { + stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT; + header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */ + } + + /* stage3Top has already counted bytes */ + } + + /* round up stage3Top so that the sizes of all data blocks are multiples of 4 */ + mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3; + + /* fill the header */ + if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) { + header.version[0]=5; + if(header.options&MBCS_OPT_NO_FROM_U) { + headerLength=10; /* include fullStage2Length */ + } else { + headerLength=MBCS_HEADER_V5_MIN_LENGTH; /* 9 */ + } + } else { + header.version[0]=4; + headerLength=MBCS_HEADER_V4_LENGTH; /* 8 */ + } + header.version[1]=4; + /* header.version[2] set above for utf8Friendly data */ + + header.options|=(uint32_t)headerLength; + + header.countStates=mbcsData->ucm->states.countStates; + header.countToUFallbacks=mbcsData->countToUFallbacks; + + header.offsetToUCodeUnits= + headerLength*4+ + mbcsData->ucm->states.countStates*1024+ + mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback); + header.offsetFromUTable= + header.offsetToUCodeUnits+ + mbcsData->ucm->states.countToUCodeUnits*2; + header.offsetFromUBytes= + header.offsetFromUTable+ + stage1Top*2+ + stage2Length; + header.fromUBytesLength=mbcsData->stage3Top; + + top=header.offsetFromUBytes+stageUTF8Length*2; + if(!(header.options&MBCS_OPT_NO_FROM_U)) { + top+=header.fromUBytesLength; + } + + header.flags=(uint8_t)(mbcsData->ucm->states.outputType); + + if(tableType&TABLE_EXT) { + if(top>0xffffff) { + fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top); + return 0; + } + + header.flags|=top<<8; + } + + /* write the MBCS data */ + udata_writeBlock(pData, &header, headerLength*4); + udata_writeBlock(pData, mbcsData->ucm->states.stateTable, 
header.countStates*1024); + udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback)); + udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2); + udata_writeBlock(pData, mbcsData->stage1, stage1Top*2); + if(mbcsData->ucm->states.maxCharLength==1) { + udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length); + } else { + udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length); + } + if(!(header.options&MBCS_OPT_NO_FROM_U)) { + udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top); + } + + if(stageUTF8Length>0) { + udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2); + } + + /* return the number of bytes that should have been written */ + return top; +} diff --git a/intl/icu/source/tools/makeconv/genmbcs.h b/intl/icu/source/tools/makeconv/genmbcs.h new file mode 100644 index 000000000..b1bf8e19f --- /dev/null +++ b/intl/icu/source/tools/makeconv/genmbcs.h @@ -0,0 +1,126 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2008, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: genmbcs.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000jul10 +* created by: Markus W. Scherer +*/ + +#ifndef __GENMBCS_H__ +#define __GENMBCS_H__ + +#include "makeconv.h" + +enum { + /* + * TODO: Consider using ucnvmbcs.h constants. + * However, not all values need to be exactly the same, for example + * the xxx_UTF8_MAX values may be different. 
(Especially SBCS_UTF8_MAX + * may be higher in makeconv than in the runtime code because that + * affects only a small number of .cnv files [if any] but all + * runtime UConverterSharedData objects. + */ + MBCS_STAGE_2_SHIFT=4, + MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */ + MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ + MBCS_STAGE_2_BLOCK_MASK=0x3f, /* for after shifting by MBCS_STAGE_2_SHIFT */ + MBCS_STAGE_1_SHIFT=10, + MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 16 for one entry per 1k code points on the BMP */ + MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */ + MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */ + MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, + MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, + + MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ + MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ + + MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */ + MBCS_STAGE_3_BLOCK_MASK=0xf, + MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */ + + MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */ + MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */ + MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */ + + /* + * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures. + * Possible values are 0x01ff..0xffff, in steps of 0x100. + * + * Unlike for MBCS, this constant only affects the stage 3 block allocation size; + * there is no additional stage 1/2 table stored in the .cnv file. + * The max value should be at least 0x7ff to cover 2-byte UTF-8. 
+ * 0xfff also covers a number other small scripts which have legacy charsets + * (like Thai). + * Higher values up to 0x1fff are harmless and potentially useful because + * that covers small-script blocks which usually have either dense mappings + * or no mappings at all. + * Starting at U+2000, there are mostly symbols and format characters + * with a low density of SBCS mappings, which would result in more wasted + * stage 3 entries with the larger block size. + */ + SBCS_UTF8_MAX=0x1fff, + + /* + * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures. + * Possible values are 0x01ff..0xffff, in steps of 0x100. + * + * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table + * with extreme input data. The function checks for this overflow. + * + * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul. + * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc. + * Larger values cause slightly larger MBCS .cnv files. 
+ */ + MBCS_UTF8_MAX=0xd7ff, + MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */ + + MBCS_UTF8_STAGE_SHIFT=6, + MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail byte */ + MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f, + + /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */ + MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */ + + MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table */ + MBCS_FROM_U_EXT_MASK=0x0f, /* but need to go into the extension fromU table */ + + /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */ + MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE, + + MBCS_MAX_FALLBACK_COUNT=8192 +}; + +U_CFUNC NewConverter * +MBCSOpen(UCMFile *ucm); + +struct MBCSData; +typedef struct MBCSData MBCSData; + +/* + * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() + * for creating an extension-only file. + * Assume maxCharLength>1. + */ +U_CFUNC const MBCSData * +MBCSGetDummy(void); + +/* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */ +U_CFUNC UBool +MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, int8_t flag); + +U_CFUNC NewConverter * +CnvExtOpen(UCMFile *ucm); + +#endif /* __GENMBCS_H__ */ diff --git a/intl/icu/source/tools/makeconv/makeconv.1.in b/intl/icu/source/tools/makeconv/makeconv.1.in new file mode 100644 index 000000000..4406855ee --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.1.in @@ -0,0 +1,114 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" makeconv.1: manual page for the makeconv utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2002 IBM, Inc. and others. +.\" +.\" Manual page by Yves Arrouye . 
+.\" +.TH MAKECONV 1 "16 April 2002" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B makeconv +\- compile a converter table +.SH SYNOPSIS +.B makeconv +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +.IR convertertable " .\|.\|." +.SH DESCRIPTION +.B makeconv +converts the ICU converter table +.I convertertable +into a binary file. The binary file has the same base name as +.I convertertable +but has a +.B .cnv +extension (instead of the typical +.B .ucm +extension of the +.I convertertable +file). +This binary file can then be read directly by ICU, or used by +.BR pkgdata (1) +for incorporation into a larger archive or library. +.PP +The +.I convertertable +must be in the ICU ucm (Unicode Codepage Mapping) format in order to +be understood by +.BR makeconv . +The ICU ucm format is similar to the IBM NLTC upmap/tpmap/rpmap files. +Comments in the +.I convertable +are handled as follows. If a comment (starting with a `#' sign) that +is after some text does contain the fallback indicator `|' then only +the text starting with the `#' sign, and ending before the `|' sign, +is ignored. +Otherwise, or if the comment is the first thing on the line, +the comment runs up to the end of the line. This special +handling of comments is to accomodate the practice of putting fallback +information in comments in the strict IBM NLTC ucmap format. +.PP +Note that new converters will be automatically found by ICU after their +installation in ICU's data directory. They do not need to +be listed in the +.BR convrtrs.txt (5) +converters aliases file in order to be available to applications using ICU. +They do need to be listed there if one wants to give them aliases, or +tags, though. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Include a copyright notice in the binary data. 
+.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.SH CAVEATS +If an existing converter table is changed and recompiled using +.BR makeconv , +the resulting binary file must be packaged in the same way that it was +packaged initially. For example, if converters were grouped together in +an archive or a library with +.BR pkgdata (1), +then the archive or library must be rebuilt with the new binary file. +A standalone binary converter file will not take precedence over a +packaged one. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000 IBM, Inc. and others. +.SH SEE ALSO +.BR convrtrs.txt (5) +.br +.BR pkgdata (1) + diff --git a/intl/icu/source/tools/makeconv/makeconv.cpp b/intl/icu/source/tools/makeconv/makeconv.cpp new file mode 100644 index 000000000..6ca3e613b --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.cpp @@ -0,0 +1,850 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ******************************************************************************** + * + * Copyright (C) 1998-2015, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************** + * + * + * makeconv.cpp: + * tool creating a binary (compressed) representation of the conversion mapping + * table (IBM NLTC ucmap format). 
+ * + * 05/04/2000 helena Added fallback mapping into the picture... + * 06/29/2000 helena Major rewrite of the callback APIs. + */ + +#include +#include "unicode/putil.h" +#include "unicode/ucnv_err.h" +#include "charstr.h" +#include "ucnv_bld.h" +#include "ucnv_imp.h" +#include "ucnv_cnv.h" +#include "cstring.h" +#include "cmemory.h" +#include "uinvchar.h" +#include "filestrm.h" +#include "toolutil.h" +#include "uoptions.h" +#include "unicode/udata.h" +#include "unewdata.h" +#include "uparse.h" +#include "ucm.h" +#include "makeconv.h" +#include "genmbcs.h" + +#define DEBUG 0 + +typedef struct ConvData { + UCMFile *ucm; + NewConverter *cnvData, *extData; + UConverterSharedData sharedData; + UConverterStaticData staticData; +} ConvData; + +static void +initConvData(ConvData *data) { + uprv_memset(data, 0, sizeof(ConvData)); + data->sharedData.structSize=sizeof(UConverterSharedData); + data->staticData.structSize=sizeof(UConverterStaticData); + data->sharedData.staticData=&data->staticData; +} + +static void +cleanupConvData(ConvData *data) { + if(data!=NULL) { + if(data->cnvData!=NULL) { + data->cnvData->close(data->cnvData); + data->cnvData=NULL; + } + if(data->extData!=NULL) { + data->extData->close(data->extData); + data->extData=NULL; + } + ucm_close(data->ucm); + data->ucm=NULL; + } +} + +/* + * from ucnvstat.c - static prototypes of data-based converters + */ +U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]; + +/* + * Global - verbosity + */ +UBool VERBOSE = FALSE; +UBool QUIET = FALSE; +UBool SMALL = FALSE; +UBool IGNORE_SISO_CHECK = FALSE; + +static void +createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); + +/* + * Set up the UNewData and write the converter.. 
+ */ +static void +writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status); + +UBool haveCopyright=TRUE; + +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(UChar), + 0, + + {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */ + {6, 2, 0, 0}, /* formatVersion */ + {0, 0, 0, 0} /* dataVersion (calculated at runtime) */ +}; + +static void +writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status) +{ + UNewDataMemory *mem = NULL; + uint32_t sz2; + uint32_t size = 0; + int32_t tableType; + + if(U_FAILURE(*status)) + { + return; + } + + tableType=TABLE_NONE; + if(data->cnvData!=NULL) { + tableType|=TABLE_BASE; + } + if(data->extData!=NULL) { + tableType|=TABLE_EXT; + } + + mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status); + + if(U_FAILURE(*status)) + { + fprintf(stderr, "Couldn't create the udata %s.%s: %s\n", + cnvName, + "cnv", + u_errorName(*status)); + return; + } + + if(VERBOSE) + { + printf("- Opened udata %s.%s\n", cnvName, "cnv"); + } + + + /* all read only, clean, platform independent data. Mmmm. 
:) */ + udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData)); + size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */ + /* Now, write the table */ + if(tableType&TABLE_BASE) { + size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType); + } + if(tableType&TABLE_EXT) { + size += data->extData->write(data->extData, &data->staticData, mem, tableType); + } + + sz2 = udata_finish(mem, status); + if(size != sz2) + { + fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size); + *status=U_INTERNAL_PROGRAM_ERROR; + } + if(VERBOSE) + { + printf("- Wrote %u bytes to the udata.\n", (int)sz2); + } +} + +enum { + OPT_HELP_H, + OPT_HELP_QUESTION_MARK, + OPT_COPYRIGHT, + OPT_VERSION, + OPT_DESTDIR, + OPT_VERBOSE, + OPT_SMALL, + OPT_IGNORE_SISO_CHECK, + OPT_QUIET, + + OPT_COUNT +}; + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_COPYRIGHT, + UOPTION_VERSION, + UOPTION_DESTDIR, + UOPTION_VERBOSE, + { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }, + { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }, + UOPTION_QUIET, +}; + +int main(int argc, char* argv[]) +{ + ConvData data; + char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; + + U_MAIN_INIT_ARGS(argc, argv); + + /* Set up the ICU version number */ + UVersionInfo icuVersion; + u_getVersion(icuVersion); + uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo)); + + /* preset then read command line options */ + options[OPT_DESTDIR].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } else if(argc<2) { + argc=-1; + } + if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) { + FILE *stdfile=argc<0 ? 
stderr : stdout; + fprintf(stdfile, + "usage: %s [-options] files...\n" + "\tread .ucm codepage mapping files and write .cnv files\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-c or --copyright include a copyright notice\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n", + argv[0]); + fprintf(stdfile, + "\t --small Generate smaller .cnv files. They will be\n" + "\t significantly smaller but may not be compatible with\n" + "\t older versions of ICU and will require heap memory\n" + "\t allocation when loaded.\n" + "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n"); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + if(options[OPT_VERSION].doesOccur) { + printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n", + dataInfo.formatVersion[0], dataInfo.formatVersion[1]); + printf("%s\n", U_COPYRIGHT_STRING); + exit(0); + } + + /* get the options values */ + haveCopyright = options[OPT_COPYRIGHT].doesOccur; + const char *destdir = options[OPT_DESTDIR].value; + VERBOSE = options[OPT_VERBOSE].doesOccur; + QUIET = options[OPT_QUIET].doesOccur; + SMALL = options[OPT_SMALL].doesOccur; + + if (options[OPT_IGNORE_SISO_CHECK].doesOccur) { + IGNORE_SISO_CHECK = TRUE; + } + + icu::CharString outFileName; + UErrorCode err = U_ZERO_ERROR; + if (destdir != NULL && *destdir != 0) { + outFileName.append(destdir, err).ensureEndsWithFileSeparator(err); + if (U_FAILURE(err)) { + return err; + } + } + int32_t outBasenameStart = outFileName.length(); + +#if DEBUG + { + int i; + printf("makeconv: processing %d files...\n", argc - 1); + for(i=1; i 2 || VERBOSE); + for (++argv; --argc; ++argv) + { + UErrorCode localError = U_ZERO_ERROR; + const char *arg = getLongPathname(*argv); + + /*produces the right destination path for display*/ + 
outFileName.truncate(outBasenameStart); + if (outBasenameStart != 0) + { + /* find the last file sepator */ + const char *basename = findBasename(arg); + outFileName.append(basename, localError); + } + else + { + outFileName.append(arg, localError); + } + if (U_FAILURE(localError)) { + return localError; + } + + /*removes the extension if any is found*/ + int32_t lastDotIndex = outFileName.lastIndexOf('.'); + if (lastDotIndex >= outBasenameStart) { + outFileName.truncate(lastDotIndex); + } + + /* the basename without extension is the converter name */ + if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) { + fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart); + return U_BUFFER_OVERFLOW_ERROR; + } + uprv_strcpy(cnvName, outFileName.data() + outBasenameStart); + + /*Adds the target extension*/ + outFileName.append(CONVERTER_FILE_EXTENSION, localError); + if (U_FAILURE(localError)) { + return localError; + } + +#if DEBUG + printf("makeconv: processing %s ...\n", arg); + fflush(stdout); +#endif + initConvData(&data); + createConverter(&data, arg, &localError); + + if (U_FAILURE(localError)) + { + /* if an error is found, print out an error msg and keep going */ + fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", + outFileName.data(), arg, u_errorName(localError)); + if(U_SUCCESS(err)) { + err = localError; + } + } + else + { + /* Insure the static data name matches the file name */ + /* Changed to ignore directory and only compare base name + LDH 1/2/08*/ + char *p; + p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */ + + if(p == NULL) /* OK, try alternate */ + { + p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR); + if(p == NULL) + { + p=cnvName; /* If no separators, no problem */ + } + } + else + { + p++; /* If found separator, don't include it in compare */ + } + if(uprv_stricmp(p,data.staticData.name) && !QUIET) + { + fprintf(stderr, "Warning: %s%s claims to be '%s'\n", 
+ cnvName, CONVERTER_FILE_EXTENSION, + data.staticData.name); + } + + uprv_strcpy((char*)data.staticData.name, cnvName); + + if(!uprv_isInvariantString((char*)data.staticData.name, -1)) { + fprintf(stderr, + "Error: A converter name must contain only invariant characters.\n" + "%s is not a valid converter name.\n", + data.staticData.name); + if(U_SUCCESS(err)) { + err = U_INVALID_TABLE_FORMAT; + } + } + + localError = U_ZERO_ERROR; + writeConverterData(&data, cnvName, destdir, &localError); + + if(U_FAILURE(localError)) + { + /* if an error is found, print out an error msg and keep going*/ + fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg, + u_errorName(localError)); + if(U_SUCCESS(err)) { + err = localError; + } + } + else if (printFilename) + { + puts(outFileName.data() + outBasenameStart); + } + } + fflush(stdout); + fflush(stderr); + + cleanupConvData(&data); + } + + return err; +} + +static void +getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) { + if( (name[0]=='i' || name[0]=='I') && + (name[1]=='b' || name[1]=='B') && + (name[2]=='m' || name[2]=='M') + ) { + name+=3; + if(*name=='-') { + ++name; + } + *pPlatform=UCNV_IBM; + *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10); + } else { + *pPlatform=UCNV_UNKNOWN; + *pCCSID=0; + } +} + +static void +readHeader(ConvData *data, + FileStream* convFile, + UErrorCode *pErrorCode) { + char line[1024]; + char *s, *key, *value; + const UConverterStaticData *prototype; + UConverterStaticData *staticData; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + staticData=&data->staticData; + staticData->platform=UCNV_IBM; + staticData->subCharLen=0; + + while(T_FileStream_readLine(convFile, line, sizeof(line))) { + /* basic parsing and handling of state-related items */ + if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) { + continue; + } + + /* stop at the beginning of the mapping section */ + if(uprv_strcmp(line, "CHARMAP")==0) { + break; + } + + /* 
collect the information from the header field, ignore unknown keys */ + if(uprv_strcmp(key, "code_set_name")==0) { + if(*value!=0) { + uprv_strcpy((char *)staticData->name, value); + getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage); + } + } else if(uprv_strcmp(key, "subchar")==0) { + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + int8_t length; + + s=value; + length=ucm_parseBytes(bytes, line, (const char **)&s); + if(1<=length && length<=4 && *s==0) { + staticData->subCharLen=length; + uprv_memcpy(staticData->subChar, bytes, length); + } else { + fprintf(stderr, "error: illegal %s\n", value); + *pErrorCode=U_INVALID_TABLE_FORMAT; + return; + } + } else if(uprv_strcmp(key, "subchar1")==0) { + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + + s=value; + if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) { + staticData->subChar1=bytes[0]; + } else { + fprintf(stderr, "error: illegal %s\n", value); + *pErrorCode=U_INVALID_TABLE_FORMAT; + return; + } + } + } + + /* copy values from the UCMFile to the static data */ + staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength; + staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength; + staticData->conversionType=data->ucm->states.conversionType; + + if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) { + fprintf(stderr, "ucm error: missing conversion type ()\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + return; + } + + /* + * Now that we know the type, copy any 'default' values from the table. + * We need not check the type any further because the parser only + * recognizes what we have prototypes for. + * + * For delta (extension-only) tables, copy values from the base file + * instead, see createConverter(). 
+ */ + if(data->ucm->baseName[0]==0) { + prototype=ucnv_converterStaticData[staticData->conversionType]; + if(prototype!=NULL) { + if(staticData->name[0]==0) { + uprv_strcpy((char *)staticData->name, prototype->name); + } + + if(staticData->codepage==0) { + staticData->codepage=prototype->codepage; + } + + if(staticData->platform==0) { + staticData->platform=prototype->platform; + } + + if(staticData->minBytesPerChar==0) { + staticData->minBytesPerChar=prototype->minBytesPerChar; + } + + if(staticData->maxBytesPerChar==0) { + staticData->maxBytesPerChar=prototype->maxBytesPerChar; + } + + if(staticData->subCharLen==0) { + staticData->subCharLen=prototype->subCharLen; + if(prototype->subCharLen>0) { + uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen); + } + } + } + } + + if(data->ucm->states.outputType<0) { + data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1; + } + + if( staticData->subChar1!=0 && + (staticData->minBytesPerChar>1 || + (staticData->conversionType!=UCNV_MBCS && + staticData->conversionType!=UCNV_EBCDIC_STATEFUL)) + ) { + fprintf(stderr, "error: defined for a type other than MBCS or EBCDIC_STATEFUL\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + } +} + +/* return TRUE if a base table was read, FALSE for an extension table */ +static UBool +readFile(ConvData *data, const char* converterName, + UErrorCode *pErrorCode) { + char line[1024]; + char *end; + FileStream *convFile; + + UCMStates *baseStates; + UBool dataIsBase; + + if(U_FAILURE(*pErrorCode)) { + return FALSE; + } + + data->ucm=ucm_open(); + + convFile=T_FileStream_open(converterName, "r"); + if(convFile==NULL) { + *pErrorCode=U_FILE_ACCESS_ERROR; + return FALSE; + } + + readHeader(data, convFile, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return FALSE; + } + + if(data->ucm->baseName[0]==0) { + dataIsBase=TRUE; + baseStates=&data->ucm->states; + ucm_processStates(baseStates, IGNORE_SISO_CHECK); + } else { + dataIsBase=FALSE; + baseStates=NULL; + } + 
+ /* read the base table */ + ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return FALSE; + } + + /* read an extension table if there is one */ + while(T_FileStream_readLine(convFile, line, sizeof(line))) { + end=uprv_strchr(line, 0); + while(line<end && + (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) { + --end; + } + *end=0; + + if(line[0]=='#' || u_skipWhitespace(line)==end) { + continue; /* ignore empty and comment lines */ + } + + if(0==uprv_strcmp(line, "CHARMAP")) { + /* read the extension table */ + ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode); + } else { + fprintf(stderr, "unexpected text after the base mapping table\n"); + } + break; + } + + T_FileStream_close(convFile); + + if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) { + fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + + return dataIsBase; +} + +static void +createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) { + ConvData baseData; + UBool dataIsBase; + + UConverterStaticData *staticData; + UCMStates *states, *baseStates; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + initConvData(data); + + dataIsBase=readFile(data, converterName, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } + + staticData=&data->staticData; + states=&data->ucm->states; + + if(dataIsBase) { + /* + * Build a normal .cnv file with a base table + * and an optional extension table. 
+ */ + data->cnvData=MBCSOpen(data->ucm); + if(data->cnvData==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + + } else if(!data->cnvData->isValid(data->cnvData, + staticData->subChar, staticData->subCharLen) + ) { + fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if(staticData->subChar1!=0 && + !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1) + ) { + fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if( + data->ucm->ext->mappingsLength>0 && + !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) { + /* sort the table so that it can be turned into UTF-8-friendly data */ + ucm_sortTable(data->ucm->base); + } + + if(U_SUCCESS(*pErrorCode)) { + if( + /* add the base table after ucm_checkBaseExt()! */ + !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else { + /* + * addTable() may have requested moving more mappings to the extension table + * if they fit into the base toUnicode table but not into the + * base fromUnicode table. + * (Especially for UTF-8-friendly fromUnicode tables.) + * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them + * to be excluded from the extension toUnicode data. + * See MBCSOkForBaseFromUnicode() for which mappings do not fit into + * the base fromUnicode table. 
+ */ + ucm_moveMappings(data->ucm->base, data->ucm->ext); + ucm_sortTable(data->ucm->ext); + if(data->ucm->ext->mappingsLength>0) { + /* prepare the extension table, if there is one */ + data->extData=CnvExtOpen(data->ucm); + if(data->extData==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + } else if( + !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } + } + } + } else { + /* Build an extension-only .cnv file. */ + char baseFilename[500]; + char *basename; + + initConvData(&baseData); + + /* assemble a path/filename for data->ucm->baseName */ + uprv_strcpy(baseFilename, converterName); + basename=(char *)findBasename(baseFilename); + uprv_strcpy(basename, data->ucm->baseName); + uprv_strcat(basename, ".ucm"); + + /* read the base table */ + dataIsBase=readFile(&baseData, baseFilename, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } else if(!dataIsBase) { + fprintf(stderr, "error: the file \"%s\" is not a base table file\n", baseFilename); + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else { + /* prepare the extension table */ + data->extData=CnvExtOpen(data->ucm); + if(data->extData==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + } else { + /* fill in gaps in extension file header fields */ + UCMapping *m, *mLimit; + uint8_t fallbackFlags; + + baseStates=&baseData.ucm->states; + if(states->conversionType==UCNV_DBCS) { + staticData->minBytesPerChar=(int8_t)(states->minCharLength=2); + } else if(states->minCharLength==0) { + staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength); + } + if(states->maxCharLength<states->minCharLength) { + staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength); + } + + if(staticData->subCharLen==0) { + uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4); + staticData->subCharLen=baseData.staticData.subCharLen; + } + /* + * do not copy subChar1 - + * only use what is explicitly 
specified + * because it cannot be unset in the extension file header + */ + + /* get the fallback flags */ + fallbackFlags=0; + for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; + m<mLimit && fallbackFlags!=3; + ++m + ) { + if(m->f==1) { + fallbackFlags|=1; + } else if(m->f==3) { + fallbackFlags|=2; + } + } + + if(fallbackFlags&1) { + staticData->hasFromUnicodeFallback=TRUE; + } + if(fallbackFlags&2) { + staticData->hasToUnicodeFallback=TRUE; + } + + if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) { + fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) { + fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if( + !ucm_checkValidity(data->ucm->ext, baseStates) || + !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else { + if(states->maxCharLength>1) { + /* + * When building a normal .cnv file with a base table + * for an MBCS (not SBCS) table with explicit precision flags, + * the MBCSAddTable() function marks some mappings for moving + * to the extension table. + * They fit into the base toUnicode table but not into the + * base fromUnicode table. + * (Note: We do have explicit precision flags because they are + * required for extension table generation, and + * ucm_checkBaseExt() verified it.) + * + * We do not call MBCSAddTable() here (we probably could) + * so we need to do the analysis before building the extension table. + * We assume that MBCSAddTable() will build a UTF-8-friendly table. + * Redundant mappings in the extension table are ok except they cost some size. + * + * Do this after ucm_checkBaseExt(). 
+ */ + const MBCSData *mbcsData=MBCSGetDummy(); + int32_t needsMove=0; + for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; + m<mLimit; + ++m + ) { + if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + ++needsMove; + } + } + + if(needsMove!=0) { + ucm_moveMappings(baseData.ucm->base, data->ucm->ext); + ucm_sortTable(data->ucm->ext); + } + } + if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } + } + } + + cleanupConvData(&baseData); + } +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/makeconv/makeconv.h b/intl/icu/source/tools/makeconv/makeconv.h new file mode 100644 index 000000000..b7918853f --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.h @@ -0,0 +1,61 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: makeconv.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000nov01 +* created by: Markus W. 
Scherer +*/ + +#ifndef __MAKECONV_H__ +#define __MAKECONV_H__ + +#include "unicode/utypes.h" +#include "ucnv_bld.h" +#include "unewdata.h" +#include "ucm.h" + +/* exports from makeconv.c */ +U_CFUNC UBool VERBOSE; +U_CFUNC UBool SMALL; +U_CFUNC UBool IGNORE_SISO_CHECK; + +/* converter table type for writing */ +enum { + TABLE_NONE, + TABLE_BASE, + TABLE_EXT, + TABLE_BASE_AND_EXT +}; + +/* abstract converter generator struct, C++ - style */ +struct NewConverter; +typedef struct NewConverter NewConverter; + +struct NewConverter { + void + (*close)(NewConverter *cnvData); + + /** is this byte sequence valid? */ + UBool + (*isValid)(NewConverter *cnvData, + const uint8_t *bytes, int32_t length); + + UBool + (*addTable)(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); + + uint32_t + (*write)(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); +}; + +#endif /* __MAKECONV_H__ */ diff --git a/intl/icu/source/tools/makeconv/makeconv.vcxproj b/intl/icu/source/tools/makeconv/makeconv.vcxproj new file mode 100644 index 000000000..3895e898f --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.vcxproj @@ -0,0 +1,267 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {F5AD9738-1A3D-4906-B9C4-A7D9CE33DC2C} + + + + Application + false + MultiByte + v140 + + + Application + false + MultiByte + v140 + + + Application + false + MultiByte + v140 + + + Application + false + MultiByte + v140 + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30319.1 + .\x86\Debug\ + .\x86\Debug\ + true + .\x86\Release\ + .\x86\Release\ + false + .\x64\Debug\ + .\x64\Debug\ + true + .\x64\Release\ + .\x64\Release\ + false + + + + copy "$(TargetPath)" ..\..\..\bin + + ..\..\..\bin\$(TargetFileName);%(Outputs) + + + .\x86\Debug/makeconv.tlb + + + Disabled + ..\..\..\include;..\..\common;..\toolutil;%(AdditionalIncludeDirectories) + 
WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + true + false + true + .\x86\Debug/makeconv.pch + .\x86\Debug/ + .\x86\Debug/ + .\x86\Debug/ + true + Level3 + true + EditAndContinue + Default + + + _DEBUG;%(PreprocessorDefinitions) + 0x0409 + + + .\x86\Debug/makeconv.exe + true + true + .\x86\Debug/makeconv.pdb + Console + false + + + + + + + copy "$(TargetPath)" ..\..\..\bin + + ..\..\..\bin\$(TargetFileName);%(Outputs) + + + .\x86\Release/makeconv.tlb + + + ..\..\..\include;..\..\common;..\toolutil;%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE=1;%(PreprocessorDefinitions) + true + MultiThreadedDLL + false + true + .\x86\Release/makeconv.pch + .\x86\Release/ + .\x86\Release/ + .\x86\Release/ + Level3 + true + Default + + + NDEBUG;%(PreprocessorDefinitions) + 0x0409 + + + .\x86\Release/makeconv.exe + true + .\x86\Release/makeconv.pdb + Console + false + + + + + + + copy "$(TargetPath)" ..\..\..\bin64 + + ..\..\..\bin64\$(TargetFileName);%(Outputs) + + + X64 + .\x64\Debug/makeconv.tlb + + + Disabled + ..\..\..\include;..\..\common;..\toolutil;%(AdditionalIncludeDirectories) + WIN64;WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + true + false + true + .\x64\Debug/makeconv.pch + .\x64\Debug/ + .\x64\Debug/ + .\x64\Debug/ + true + Level3 + true + ProgramDatabase + Default + + + _DEBUG;%(PreprocessorDefinitions) + 0x0409 + + + .\x64\Debug/makeconv.exe + true + true + .\x64\Debug/makeconv.pdb + Console + MachineX64 + + + + + copy "$(TargetPath)" ..\..\..\bin64 + + ..\..\..\bin64\$(TargetFileName);%(Outputs) + + + X64 + .\x64\Release/makeconv.tlb + + + ..\..\..\include;..\..\common;..\toolutil;%(AdditionalIncludeDirectories) + WIN64;WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE=1;%(PreprocessorDefinitions) + true + MultiThreadedDLL + false + true + .\x64\Release/makeconv.pch + .\x64\Release/ + .\x64\Release/ + .\x64\Release/ + Level3 
+ true + Default + + + NDEBUG;%(PreprocessorDefinitions) + 0x0409 + + + .\x64\Release/makeconv.exe + true + .\x64\Release/makeconv.pdb + Console + MachineX64 + + + + + + + + + + + + + + + {73c0a65b-d1f2-4de1-b3a6-15dad2c23f3d} + false + + + {6b231032-3cb5-4eed-9210-810d666a23a0} + false + + + + + + \ No newline at end of file diff --git a/intl/icu/source/tools/makeconv/makeconv.vcxproj.filters b/intl/icu/source/tools/makeconv/makeconv.vcxproj.filters new file mode 100644 index 000000000..b5232c7c7 --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.vcxproj.filters @@ -0,0 +1,39 @@ + + + + + {de2dc8b2-bfcb-4516-bc0b-851f2bddd695} + cpp;c;cxx;rc;def;r;odl;idl;hpj;bat + + + {0638fe1b-842e-4db0-b609-7da558bbad33} + h;hpp;hxx;hm;inl + + + {c192904c-2a84-40cd-8829-c5a00d5a15fb} + ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/intl/icu/source/tools/makeconv/ucnvstat.c b/intl/icu/source/tools/makeconv/ucnvstat.c new file mode 100644 index 000000000..890e73190 --- /dev/null +++ b/intl/icu/source/tools/makeconv/ucnvstat.c @@ -0,0 +1,69 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ****************************************************************************** + * + * Copyright (C) 1998-2006, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + ****************************************************************************** + * + * + * ucnvstat.c: + * UConverterStaticData prototypes for data based converters + */ + +#include "unicode/utypes.h" +#include "unicode/ucnv.h" +#include "ucnv_bld.h" + + +static const UConverterStaticData _SBCSStaticData={ + sizeof(UConverterStaticData), + "SBCS", + 0, UCNV_IBM, UCNV_SBCS, 1, 1, + { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + + +static const UConverterStaticData _DBCSStaticData={ + sizeof(UConverterStaticData), + "DBCS", + 0, UCNV_IBM, UCNV_DBCS, 2, 2, + { 0, 0, 0, 0 },0, FALSE, FALSE, /* subchar */ + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +static const UConverterStaticData _MBCSStaticData={ + sizeof(UConverterStaticData), + "MBCS", + 0, UCNV_IBM, UCNV_MBCS, 1, 1, + { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +static const UConverterStaticData _EBCDICStatefulStaticData={ + sizeof(UConverterStaticData), + "EBCDICStateful", + 0, UCNV_IBM, UCNV_EBCDIC_STATEFUL, 1, 1, + { 0, 0, 0, 0 },0, FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +/* NULLs for algorithmic types, their tables live in ucnv_bld.c */ +const UConverterStaticData *ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={ + &_SBCSStaticData, &_DBCSStaticData, &_MBCSStaticData, NULL/*Lat1*/, + NULL/*UTF8*/, NULL/*UTF16be*/, NULL/*UTF16LE*/, NULL/*UTF32be*/, NULL/*UTF32LE*/, &_EBCDICStatefulStaticData, + NULL/*ISO2022*/, + /* LMBCS */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL +}; + -- cgit v1.2.3