intl/icu/source/tools/toolutil/uparse.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2000-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uparse.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000apr18
*   created by: Markus W. Scherer
*
*   This file provides a parser for files that are delimited by one single
*   character like ';' or TAB. Example: the Unicode Character Properties files
*   like UnicodeData.txt are semicolon-delimited.
*/

#ifndef __UPARSE_H__
#define __UPARSE_H__

#include "unicode/utypes.h"

/**
 * Is c an invariant-character whitespace?
 * @param c invariant character
 */
#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')

U_CDECL_BEGIN

/**
 * Skip space ' ' and TAB '\t' characters.
 *
 * @param s Pointer to characters.
 * @return Pointer to first character at or after s that is not a space or TAB.
 */
U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char *s);

/**
 * Trim whitespace (including line endings) from the end of the string.
 *
 * @param s Pointer to the string.
 * @return Pointer to the new end of the string.
 */
U_CAPI char * U_EXPORT2
u_rtrim(char *s);

/** Function type for u_parseDelimitedFile(). */
typedef void U_CALLCONV
UParseLineFn(void *context,
              char *fields[][2],
              int32_t fieldCount,
              UErrorCode *pErrorCode);

/**
 * Parser for files that are similar to UnicodeData.txt:
 * This function opens the file and reads it line by line. It skips empty lines
 * and comment lines that start with a '#'.
 * All other lines are separated into fields with one delimiter character
 * (semicolon for Unicode Properties files) between two fields. The last field in
 * a line does not need to be terminated with a delimiter.
 *
 * For each line, after segmenting it, a line function is called.
 * It gets passed the array of field start and limit pointers that is
 * passed into this parser and filled by it for each line.
 * For each field i of the line, the start pointer in fields[i][0]
 * points to the beginning of the field, while the limit pointer in fields[i][1]
 * points behind the field, i.e., to the delimiter or the line end.
 *
 * The context parameter of the line function is
 * the same as the one for the parse function.
 *
 * The line function may modify the contents of the fields including the
 * limit characters.
 *
 * If the file cannot be opened, or there is a parsing error or a field function
 * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
 */
U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char *filename, char delimiter,
                     char *fields[][2], int32_t fieldCount,
                     UParseLineFn *lineFn, void *context,
                     UErrorCode *pErrorCode);

/**
 * Parse a string of code points like 0061 0308 0300.
 * s must end with either ';' or NUL.
 *
 * @return Number of code points.
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char *s,
                  uint32_t *dest, int32_t destCapacity,
                  UErrorCode *pErrorCode);

/**
 * Parse a list of code points like 0061 0308 0300
 * into a UChar * string.
 * s must end with either ';' or NUL.
 *
 * Set the first code point in *pFirst.
 *
 * @param s Input char * string.
 * @param dest Output string buffer.
 * @param destCapacity Capacity of dest in numbers of UChars.
 * @param pFirst If pFirst!=NULL the *pFirst will be set to the first
 *               code point in the string.
 * @param pErrorCode ICU error code.
 * @return The length of the string in numbers of UChars.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode);

/**
 * Parse a code point range like
 * 0085 or
 * 4E00..9FA5.
 *
 * s must contain such a range and end with either ';' or NUL.
 *
 * @return Length of code point range, end-start+1
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode);

/**
 * Same as u_parseCodePointRange() but the range may be terminated by
 * any character. The position of the terminating character is returned via
 * the *terminator output parameter.
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char *s,
                                   uint32_t *pStart, uint32_t *pEnd,
                                   const char **terminator,
                                   UErrorCode *pErrorCode);

U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);

U_CDECL_END

#endif