summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/rbbirb.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/rbbirb.cpp')
-rw-r--r--intl/icu/source/common/rbbirb.cpp320
1 files changed, 320 insertions, 0 deletions
diff --git a/intl/icu/source/common/rbbirb.cpp b/intl/icu/source/common/rbbirb.cpp
new file mode 100644
index 000000000..3d9fe6fd3
--- /dev/null
+++ b/intl/icu/source/common/rbbirb.cpp
@@ -0,0 +1,320 @@
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+//
+// file: rbbirb.cpp
+//
+// Copyright (C) 2002-2011, International Business Machines Corporation and others.
+// All Rights Reserved.
+//
+// This file contains the RBBIRuleBuilder class implementation. This is the main class for
+// building (compiling) break rules into the tables required by the runtime
+// RBBI engine.
+//
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "unicode/brkiter.h"
+#include "unicode/rbbi.h"
+#include "unicode/ubrk.h"
+#include "unicode/unistr.h"
+#include "unicode/uniset.h"
+#include "unicode/uchar.h"
+#include "unicode/uchriter.h"
+#include "unicode/parsepos.h"
+#include "unicode/parseerr.h"
+#include "cmemory.h"
+#include "cstring.h"
+
+#include "rbbirb.h"
+#include "rbbinode.h"
+
+#include "rbbiscan.h"
+#include "rbbisetb.h"
+#include "rbbitblb.h"
+#include "rbbidata.h"
+
+
+U_NAMESPACE_BEGIN
+
+
+//----------------------------------------------------------------------------------------
+//
+// Constructor.
+//
+//----------------------------------------------------------------------------------------
+RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
+ UParseError *parseErr,
+ UErrorCode &status)
+ : fRules(rules)
+{
+ fStatus = &status; // status is checked below
+ fParseError = parseErr;
+ fDebugEnv = NULL;
+#ifdef RBBI_DEBUG
+ fDebugEnv = getenv("U_RBBIDEBUG");
+#endif
+
+
+ fForwardTree = NULL;
+ fReverseTree = NULL;
+ fSafeFwdTree = NULL;
+ fSafeRevTree = NULL;
+ fDefaultTree = &fForwardTree;
+ fForwardTables = NULL;
+ fReverseTables = NULL;
+ fSafeFwdTables = NULL;
+ fSafeRevTables = NULL;
+ fRuleStatusVals = NULL;
+ fChainRules = FALSE;
+ fLBCMNoChain = FALSE;
+ fLookAheadHardBreak = FALSE;
+ fUSetNodes = NULL;
+ fRuleStatusVals = NULL;
+ fScanner = NULL;
+ fSetBuilder = NULL;
+ if (parseErr) {
+ uprv_memset(parseErr, 0, sizeof(UParseError));
+ }
+
+ if (U_FAILURE(status)) {
+ return;
+ }
+
+ fUSetNodes = new UVector(status); // bcos status gets overwritten here
+ fRuleStatusVals = new UVector(status);
+ fScanner = new RBBIRuleScanner(this);
+ fSetBuilder = new RBBISetBuilder(this);
+ if (U_FAILURE(status)) {
+ return;
+ }
+ if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ }
+}
+
+
+
+//----------------------------------------------------------------------------------------
+//
+// Destructor
+//
+//----------------------------------------------------------------------------------------
+RBBIRuleBuilder::~RBBIRuleBuilder() {
+
+ int i;
+ for (i=0; ; i++) {
+ RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
+ if (n==NULL) {
+ break;
+ }
+ delete n;
+ }
+
+ delete fUSetNodes;
+ delete fSetBuilder;
+ delete fForwardTables;
+ delete fReverseTables;
+ delete fSafeFwdTables;
+ delete fSafeRevTables;
+
+ delete fForwardTree;
+ delete fReverseTree;
+ delete fSafeFwdTree;
+ delete fSafeRevTree;
+ delete fScanner;
+ delete fRuleStatusVals;
+}
+
+
+
+
+
+//----------------------------------------------------------------------------------------
+//
+// flattenData() - Collect up the compiled RBBI rule data and put it into
+// the format for saving in ICU data files,
+// which is also the format needed by the RBBI runtime engine.
+//
+//----------------------------------------------------------------------------------------
+static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
+
+RBBIDataHeader *RBBIRuleBuilder::flattenData() {
+ int32_t i;
+
+ if (U_FAILURE(*fStatus)) {
+ return NULL;
+ }
+
+ // Remove comments and whitespace from the rules to make it smaller.
+ UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
+
+ // Calculate the size of each section in the data.
+ // Sizes here are padded up to a multiple of 8 for better memory alignment.
+ // Sections sizes actually stored in the header are for the actual data
+ // without the padding.
+ //
+ int32_t headerSize = align8(sizeof(RBBIDataHeader));
+ int32_t forwardTableSize = align8(fForwardTables->getTableSize());
+ int32_t reverseTableSize = align8(fReverseTables->getTableSize());
+ int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
+ int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
+ int32_t trieSize = align8(fSetBuilder->getTrieSize());
+ int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
+ int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
+
+ int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
+ + safeFwdTableSize + safeRevTableSize
+ + statusTableSize + trieSize + rulesSize;
+
+ RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
+ if (data == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
+ uprv_memset(data, 0, totalSize);
+
+
+ data->fMagic = 0xb1a0;
+ data->fFormatVersion[0] = 3;
+ data->fFormatVersion[1] = 1;
+ data->fFormatVersion[2] = 0;
+ data->fFormatVersion[3] = 0;
+ data->fLength = totalSize;
+ data->fCatCount = fSetBuilder->getNumCharCategories();
+
+ data->fFTable = headerSize;
+ data->fFTableLen = forwardTableSize;
+ data->fRTable = data->fFTable + forwardTableSize;
+ data->fRTableLen = reverseTableSize;
+ data->fSFTable = data->fRTable + reverseTableSize;
+ data->fSFTableLen = safeFwdTableSize;
+ data->fSRTable = data->fSFTable + safeFwdTableSize;
+ data->fSRTableLen = safeRevTableSize;
+
+ data->fTrie = data->fSRTable + safeRevTableSize;
+ data->fTrieLen = fSetBuilder->getTrieSize();
+ data->fStatusTable = data->fTrie + trieSize;
+ data->fStatusTableLen= statusTableSize;
+ data->fRuleSource = data->fStatusTable + statusTableSize;
+ data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
+
+ uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
+
+ fForwardTables->exportTable((uint8_t *)data + data->fFTable);
+ fReverseTables->exportTable((uint8_t *)data + data->fRTable);
+ fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
+ fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
+ fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
+
+ int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
+ for (i=0; i<fRuleStatusVals->size(); i++) {
+ ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
+ }
+
+ strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
+
+ return data;
+}
+
+
+
+
+
+
+//----------------------------------------------------------------------------------------
+//
+// createRuleBasedBreakIterator construct from source rules that are passed in
+// in a UnicodeString
+//
+//----------------------------------------------------------------------------------------
+BreakIterator *
+RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
+ UParseError *parseError,
+ UErrorCode &status)
+{
+ // status checked below
+
+ //
+ // Read the input rules, generate a parse tree, symbol table,
+ // and list of all Unicode Sets referenced by the rules.
+ //
+ RBBIRuleBuilder builder(rules, parseError, status);
+ if (U_FAILURE(status)) { // status checked here bcos build below doesn't
+ return NULL;
+ }
+ builder.fScanner->parse();
+
+ //
+ // UnicodeSet processing.
+ // Munge the Unicode Sets to create a set of character categories.
+ // Generate the mapping tables (TRIE) from input 32-bit characters to
+ // the character categories.
+ //
+ builder.fSetBuilder->build();
+
+
+ //
+ // Generate the DFA state transition table.
+ //
+ builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
+ builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
+ builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
+ builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
+ if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
+ builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
+ {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ delete builder.fForwardTables; builder.fForwardTables = NULL;
+ delete builder.fReverseTables; builder.fReverseTables = NULL;
+ delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
+ delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
+ return NULL;
+ }
+
+ builder.fForwardTables->build();
+ builder.fReverseTables->build();
+ builder.fSafeFwdTables->build();
+ builder.fSafeRevTables->build();
+
+#ifdef RBBI_DEBUG
+ if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
+ builder.fForwardTables->printRuleStatusTable();
+ }
+#endif
+
+ //
+ // Package up the compiled data into a memory image
+ // in the run-time format.
+ //
+ RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
+ if (U_FAILURE(*builder.fStatus)) {
+ return NULL;
+ }
+
+
+ //
+ // Clean up the compiler related stuff
+ //
+
+
+ //
+ // Create a break iterator from the compiled rules.
+ // (Identical to creation from stored pre-compiled rules)
+ //
+ // status is checked after init in construction.
+ RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
+ if (U_FAILURE(status)) {
+ delete This;
+ This = NULL;
+ }
+ else if(This == NULL) { // test for NULL
+ status = U_MEMORY_ALLOCATION_ERROR;
+ }
+ return This;
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */