// Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2010-2012, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: stringtriebuilder.cpp * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2010dec24 * created by: Markus W. Scherer */ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #include "unicode/stringtriebuilder.h" #include "uassert.h" #include "uhash.h" U_CDECL_BEGIN static int32_t U_CALLCONV hashStringTrieNode(const UHashTok key) { return icu::StringTrieBuilder::hashNode(key.pointer); } static UBool U_CALLCONV equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); } U_CDECL_END U_NAMESPACE_BEGIN StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} StringTrieBuilder::~StringTrieBuilder() { deleteCompactBuilder(); } void StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, sizeGuess, &errorCode); if(U_SUCCESS(errorCode)) { if(nodes==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; } else { uhash_setKeyDeleter(nodes, uprv_deleteUObject); } } } void StringTrieBuilder::deleteCompactBuilder() { uhash_close(nodes); nodes=NULL; } void StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode) { if(buildOption==USTRINGTRIE_BUILD_FAST) { writeNode(0, elementsLength, 0); } else /* USTRINGTRIE_BUILD_SMALL */ { createCompactBuilder(2*elementsLength, errorCode); Node *root=makeNode(0, elementsLength, 0, errorCode); if(U_SUCCESS(errorCode)) { root->markRightEdgesFirst(-1); root->write(*this); } deleteCompactBuilder(); } } // Requires start<limit, // and all strings of the [start..limit[ elements must be sorted and // have a common prefix of length unitIndex. int32_t StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) { UBool hasValue=FALSE; int32_t value=0; int32_t type; if(unitIndex==getElementStringLength(start)) { // An intermediate or final value. value=getElementValue(start++); if(start==limit) { return writeValueAndFinal(value, TRUE); // final-value node } hasValue=TRUE; } // Now all [start..limit[ strings are longer than unitIndex. int32_t minUnit=getElementUnit(start, unitIndex); int32_t maxUnit=getElementUnit(limit-1, unitIndex); if(minUnit==maxUnit) { // Linear-match node: All strings have the same character at unitIndex. int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); writeNode(start, limit, lastUnitIndex); // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. int32_t length=lastUnitIndex-unitIndex; int32_t maxLinearMatchLength=getMaxLinearMatchLength(); while(length>maxLinearMatchLength) { lastUnitIndex-=maxLinearMatchLength; length-=maxLinearMatchLength; writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); write(getMinLinearMatch()+maxLinearMatchLength-1); } writeElementUnits(start, unitIndex, length); type=getMinLinearMatch()+length-1; } else { // Branch node. int32_t length=countElementUnits(start, limit, unitIndex); // length>=2 because minUnit!=maxUnit. writeBranchSubNode(start, limit, unitIndex, length); if(--length<getMinLinearMatch()) { type=length; } else { write(length); type=0; } } return writeValueAndType(hasValue, value, type); } // start<limit && all strings longer than unitIndex && // length different units at unitIndex int32_t StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) { UChar middleUnits[kMaxSplitBranchLevels]; int32_t lessThan[kMaxSplitBranchLevels]; int32_t ltLength=0; while(length>getMaxBranchLinearSubNodeLength()) { // Branch on the middle unit. // First, find the middle unit. int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); // Encode the less-than branch first. middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); ++ltLength; // Continue for the greater-or-equal branch. start=i; length=length-length/2; } // For each unit, find its elements array start and whether it has a final value. int32_t starts[kMaxBranchLinearSubNodeLength]; UBool isFinal[kMaxBranchLinearSubNodeLength-1]; int32_t unitNumber=0; do { int32_t i=starts[unitNumber]=start; UChar unit=getElementUnit(i++, unitIndex); i=indexOfElementWithNextUnit(i, unitIndex, unit); isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); start=i; } while(++unitNumber<length-1); // unitNumber==length-1, and the maxUnit elements range is [start..limit[ starts[unitNumber]=start; // Write the sub-nodes in reverse order: The jump lengths are deltas from // after their own positions, so if we wrote the minUnit sub-node first, // then its jump delta would be larger. // Instead we write the minUnit sub-node last, for a shorter delta. int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1]; do { --unitNumber; if(!isFinal[unitNumber]) { jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1); } } while(unitNumber>0); // The maxUnit sub-node is written as the very last one because we do // not jump for it at all. unitNumber=length-1; writeNode(start, limit, unitIndex+1); int32_t offset=write(getElementUnit(start, unitIndex)); // Write the rest of this node's unit-value pairs. while(--unitNumber>=0) { start=starts[unitNumber]; int32_t value; if(isFinal[unitNumber]) { // Write the final value for the one string ending with this unit. value=getElementValue(start); } else { // Write the delta to the start position of the sub-node. value=offset-jumpTargets[unitNumber]; } writeValueAndFinal(value, isFinal[unitNumber]); offset=write(getElementUnit(start, unitIndex)); } // Write the split-branch nodes. while(ltLength>0) { --ltLength; writeDeltaTo(lessThan[ltLength]); offset=write(middleUnits[ltLength]); } return offset; } // Requires start<limit, // and all strings of the [start..limit[ elements must be sorted and // have a common prefix of length unitIndex. StringTrieBuilder::Node * StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return NULL; } UBool hasValue=FALSE; int32_t value=0; if(unitIndex==getElementStringLength(start)) { // An intermediate or final value. value=getElementValue(start++); if(start==limit) { return registerFinalValue(value, errorCode); } hasValue=TRUE; } Node *node; // Now all [start..limit[ strings are longer than unitIndex. int32_t minUnit=getElementUnit(start, unitIndex); int32_t maxUnit=getElementUnit(limit-1, unitIndex); if(minUnit==maxUnit) { // Linear-match node: All strings have the same character at unitIndex. int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode); // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. int32_t length=lastUnitIndex-unitIndex; int32_t maxLinearMatchLength=getMaxLinearMatchLength(); while(length>maxLinearMatchLength) { lastUnitIndex-=maxLinearMatchLength; length-=maxLinearMatchLength; node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); nextNode=registerNode(node, errorCode); } node=createLinearMatchNode(start, unitIndex, length, nextNode); } else { // Branch node. int32_t length=countElementUnits(start, limit, unitIndex); // length>=2 because minUnit!=maxUnit. Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); node=new BranchHeadNode(length, subNode); } if(hasValue && node!=NULL) { if(matchNodesCanHaveValues()) { ((ValueNode *)node)->setValue(value); } else { node=new IntermediateValueNode(value, registerNode(node, errorCode)); } } return registerNode(node, errorCode); } // start<limit && all strings longer than unitIndex && // length different units at unitIndex StringTrieBuilder::Node * StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return NULL; } UChar middleUnits[kMaxSplitBranchLevels]; Node *lessThan[kMaxSplitBranchLevels]; int32_t ltLength=0; while(length>getMaxBranchLinearSubNodeLength()) { // Branch on the middle unit. // First, find the middle unit. int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); // Create the less-than branch. middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); ++ltLength; // Continue for the greater-or-equal branch. start=i; length=length-length/2; } if(U_FAILURE(errorCode)) { return NULL; } ListBranchNode *listNode=new ListBranchNode(); if(listNode==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } // For each unit, find its elements array start and whether it has a final value. int32_t unitNumber=0; do { int32_t i=start; UChar unit=getElementUnit(i++, unitIndex); i=indexOfElementWithNextUnit(i, unitIndex, unit); if(start==i-1 && unitIndex+1==getElementStringLength(start)) { listNode->add(unit, getElementValue(start)); } else { listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); } start=i; } while(++unitNumber<length-1); // unitNumber==length-1, and the maxUnit elements range is [start..limit[ UChar unit=getElementUnit(start, unitIndex); if(start==limit-1 && unitIndex+1==getElementStringLength(start)) { listNode->add(unit, getElementValue(start)); } else { listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); } Node *node=registerNode(listNode, errorCode); // Create the split-branch nodes. while(ltLength>0) { --ltLength; node=registerNode( new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); } return node; } StringTrieBuilder::Node * StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { delete newNode; return NULL; } if(newNode==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } const UHashElement *old=uhash_find(nodes, newNode); if(old!=NULL) { delete newNode; return (Node *)old->key.pointer; } // If uhash_puti() returns a non-zero value from an equivalent, previously // registered node, then uhash_find() failed to find that and we will leak newNode. #if U_DEBUG int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. #endif uhash_puti(nodes, newNode, 1, &errorCode); U_ASSERT(oldValue==0); if(U_FAILURE(errorCode)) { delete newNode; return NULL; } return newNode; } StringTrieBuilder::Node * StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return NULL; } FinalValueNode key(value); const UHashElement *old=uhash_find(nodes, &key); if(old!=NULL) { return (Node *)old->key.pointer; } Node *newNode=new FinalValueNode(value); if(newNode==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } // If uhash_puti() returns a non-zero value from an equivalent, previously // registered node, then uhash_find() failed to find that and we will leak newNode. #if U_DEBUG int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. #endif uhash_puti(nodes, newNode, 1, &errorCode); U_ASSERT(oldValue==0); if(U_FAILURE(errorCode)) { delete newNode; return NULL; } return newNode; } UBool StringTrieBuilder::hashNode(const void *node) { return ((const Node *)node)->hashCode(); } UBool StringTrieBuilder::equalNodes(const void *left, const void *right) { return *(const Node *)left==*(const Node *)right; } UBool StringTrieBuilder::Node::operator==(const Node &other) const { return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); } int32_t StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { if(offset==0) { offset=edgeNumber; } return edgeNumber; } UBool StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!Node::operator==(other)) { return FALSE; } const FinalValueNode &o=(const FinalValueNode &)other; return value==o.value; } void StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { offset=builder.writeValueAndFinal(value, TRUE); } UBool StringTrieBuilder::ValueNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!Node::operator==(other)) { return FALSE; } const ValueNode &o=(const ValueNode &)other; return hasValue==o.hasValue && (!hasValue || value==o.value); } UBool StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!ValueNode::operator==(other)) { return FALSE; } const IntermediateValueNode &o=(const IntermediateValueNode &)other; return next==o.next; } int32_t StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { if(offset==0) { offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); } return edgeNumber; } void StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { next->write(builder); offset=builder.writeValueAndFinal(value, FALSE); } UBool StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!ValueNode::operator==(other)) { return FALSE; } const LinearMatchNode &o=(const LinearMatchNode &)other; return length==o.length && next==o.next; } int32_t StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { if(offset==0) { offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); } return edgeNumber; } UBool StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!Node::operator==(other)) { return FALSE; } const ListBranchNode &o=(const ListBranchNode &)other; for(int32_t i=0; i<length; ++i) { if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) { return FALSE; } } return TRUE; } int32_t StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) { if(offset==0) { firstEdgeNumber=edgeNumber; int32_t step=0; int32_t i=length; do { Node *edge=equal[--i]; if(edge!=NULL) { edgeNumber=edge->markRightEdgesFirst(edgeNumber-step); } // For all but the rightmost edge, decrement the edge number. step=1; } while(i>0); offset=edgeNumber; } return edgeNumber; } void StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { // Write the sub-nodes in reverse order: The jump lengths are deltas from // after their own positions, so if we wrote the minUnit sub-node first, // then its jump delta would be larger. // Instead we write the minUnit sub-node last, for a shorter delta. int32_t unitNumber=length-1; Node *rightEdge=equal[unitNumber]; int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); do { --unitNumber; if(equal[unitNumber]!=NULL) { equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); } } while(unitNumber>0); // The maxUnit sub-node is written as the very last one because we do // not jump for it at all. unitNumber=length-1; if(rightEdge==NULL) { builder.writeValueAndFinal(values[unitNumber], TRUE); } else { rightEdge->write(builder); } offset=builder.write(units[unitNumber]); // Write the rest of this node's unit-value pairs. while(--unitNumber>=0) { int32_t value; UBool isFinal; if(equal[unitNumber]==NULL) { // Write the final value for the one string ending with this unit. value=values[unitNumber]; isFinal=TRUE; } else { // Write the delta to the start position of the sub-node. U_ASSERT(equal[unitNumber]->getOffset()>0); value=offset-equal[unitNumber]->getOffset(); isFinal=FALSE; } builder.writeValueAndFinal(value, isFinal); offset=builder.write(units[unitNumber]); } } UBool StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!Node::operator==(other)) { return FALSE; } const SplitBranchNode &o=(const SplitBranchNode &)other; return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; } int32_t StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { if(offset==0) { firstEdgeNumber=edgeNumber; edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); } return edgeNumber; } void StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { // Encode the less-than branch first. lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); // Encode the greater-or-equal branch last because we do not jump for it at all. greaterOrEqual->write(builder); // Write this node. U_ASSERT(lessThan->getOffset()>0); builder.writeDeltaTo(lessThan->getOffset()); // less-than offset=builder.write(unit); } UBool StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!ValueNode::operator==(other)) { return FALSE; } const BranchHeadNode &o=(const BranchHeadNode &)other; return length==o.length && next==o.next; } int32_t StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { if(offset==0) { offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); } return edgeNumber; } void StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { next->write(builder); if(length<=builder.getMinLinearMatch()) { offset=builder.writeValueAndType(hasValue, value, length-1); } else { builder.write(length-1); offset=builder.writeValueAndType(hasValue, value, 0); } } U_NAMESPACE_END