summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/uspoof_conf.h
blob: bc5e4a909fa0c79692c9d692527ffbc56af9eaf9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
*   Copyright (C) 2008-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  uspoof_conf.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009Jan05
*   created by: Andy Heninger
*
*   Internal classes for compiling confusable data into its binary (runtime) form.
*/

#ifndef __USPOOF_BUILDCONF_H__
#define __USPOOF_BUILDCONF_H__

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_REGULAR_EXPRESSIONS 

#include "unicode/uregex.h"
#include "uhash.h"
#include "uspoof_impl.h"

U_NAMESPACE_BEGIN

// SPUString
//              Holds a string that is the result of one of the mappings defined
//              by the confusable mapping data (confusables.txt from Unicode.org)
//              Instances of SPUString exist during the compilation process only.

struct SPUString : public UMemory {
    UnicodeString  *fStr;             // The actual string.
    int32_t         fCharOrStrTableIndex;   // Index into the final runtime data for this
                                      // string (or, for length 1, the single string char
                                      // itself, there being no string table entry for it.)
    SPUString(UnicodeString *s);
    ~SPUString();
};


//  String Pool   A utility class for holding the strings that are the result of
//                the spoof mappings.  These strings will utimately end up in the
//                run-time String Table.
//                This is sort of like a sorted set of strings, except that ICU's anemic
//                built-in collections don't support those, so it is implemented with a
//                combination of a uhash and a UVector.


class SPUStringPool : public UMemory {
  public:
    SPUStringPool(UErrorCode &status);
    ~SPUStringPool();
    
    // Add a string. Return the string from the table.
    // If the input parameter string is already in the table, delete the
    //  input parameter and return the existing string.
    SPUString *addString(UnicodeString *src, UErrorCode &status);


    // Get the n-th string in the collection.
    SPUString *getByIndex(int32_t i);

    // Sort the contents; affects the ordering of getByIndex().
    void sort(UErrorCode &status);

    int32_t size();

  private:
    UVector     *fVec;    // Elements are SPUString *
    UHashtable  *fHash;   // Key: UnicodeString  Value: SPUString
};


// class ConfusabledataBuilder
//     An instance of this class exists while the confusable data is being built from source.
//     It encapsulates the intermediate data structures that are used for building.
//     It exports one static function, to do a confusable data build.

class ConfusabledataBuilder : public UMemory {
  private:
    SpoofImpl  *fSpoofImpl;
    UChar      *fInput;
    UHashtable *fTable;
    UnicodeSet *fKeySet;     // A set of all keys (UChar32s) that go into the four mapping tables.

    // The binary data is first assembled into the following four collections, then
    //   copied to its final raw-memory destination.
    UVector            *fKeyVec;
    UVector            *fValueVec;
    UnicodeString      *fStringTable;
    
    SPUStringPool      *stringPool;
    URegularExpression *fParseLine;
    URegularExpression *fParseHexNum;
    int32_t             fLineNum;

    ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status);
    ~ConfusabledataBuilder();
    void build(const char * confusables, int32_t confusablesLen, UErrorCode &status);

    // Add an entry to the key and value tables being built
    //   input:  data from SLTable, MATable, etc.
    //   outut:  entry added to fKeyVec and fValueVec
    void addKeyEntry(UChar32     keyChar,     // The key character
                     UHashtable *table,       // The table, one of SATable, MATable, etc.
                     int32_t     tableFlag,   // One of USPOOF_SA_TABLE_FLAG, etc.
                     UErrorCode &status);

    // From an index into fKeyVec & fValueVec
    //   get a UnicodeString with the corresponding mapping.
    UnicodeString getMapping(int32_t index);

    // Populate the final binary output data array with the compiled data.
    void outputData(UErrorCode &status);

  public:
    static void buildConfusableData(SpoofImpl *spImpl, const char * confusables,
        int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status);
};
U_NAMESPACE_END

#endif
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS 
#endif  // __USPOOF_BUILDCONF_H__