summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/genrb/read.c
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/tools/genrb/read.c')
-rw-r--r--intl/icu/source/tools/genrb/read.c476
1 files changed, 476 insertions, 0 deletions
diff --git a/intl/icu/source/tools/genrb/read.c b/intl/icu/source/tools/genrb/read.c
new file mode 100644
index 000000000..468c27e07
--- /dev/null
+++ b/intl/icu/source/tools/genrb/read.c
@@ -0,0 +1,476 @@
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1998-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+*
+* File read.c
+*
+* Modification History:
+*
+* Date Name Description
+* 05/26/99 stephen Creation.
+* 5/10/01 Ram removed ustdio dependency
+*******************************************************************************
+*/
+
+#include "read.h"
+#include "errmsg.h"
+#include "unicode/ustring.h"
+#include "unicode/utf16.h"
+
+/* Code points of the characters the tokenizer treats specially. */
+#define OPENBRACE 0x007B /* left curly brace */
+#define CLOSEBRACE 0x007D /* right curly brace */
+#define COMMA 0x002C /* comma */
+#define QUOTE 0x0022 /* double quotation mark */
+#define ESCAPE 0x005C /* backslash */
+#define SLASH 0x002F /* forward slash, starts a comment when doubled or followed by ASTERISK */
+#define ASTERISK 0x002A /* asterisk */
+#define SPACE 0x0020 /* space, also inserted between merged string tokens */
+#define COLON 0x003A /* colon */
+/* getNextToken() returns TOK_ERROR when this is the first significant
+ character; presumably a byte order mark read with the wrong byte order. */
+#define BADBOM 0xFFFE
+#define CR 0x000D /* carriage return */
+#define LF 0x000A /* line feed */
+
+/* Line number the tokenizer is currently on. It is set to 1 by
+ resetLineNumber() and incremented as line breaks are consumed by
+ isWhitespace(), isNewline() and getStringToken(). Being a file-level
+ variable, it is shared by every buffer tokenized through this file. */
+static int32_t lineCount;
+
+/* Protos */
+static enum ETokenType getStringToken(UCHARBUF *buf,
+ UChar32 initialChar,
+ struct UString *token,
+ UErrorCode *status);
+
+static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
+static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
+static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
+/* NOTE: both predicates below also advance lineCount as a side effect. */
+static UBool isWhitespace (UChar32 c);
+static UBool isNewline (UChar32 c);
+
+/* Restart line counting by setting the file-level lineCount back to 1.
+   The explicit (void) makes this definition a real prototype; an empty
+   parameter list in C leaves the parameters unspecified. */
+U_CFUNC void resetLineNumber(void) {
+    lineCount = 1;
+}
+
+/* Read and return the next token from buf.
+
+   Leading whitespace and comments are skipped first; the text of a
+   comment opened with slash-asterisk-asterisk is collected into the
+   comment parameter by getNextChar().
+
+   Return value:
+     TOK_OPEN_BRACE, TOK_CLOSE_BRACE, TOK_COMMA, TOK_COLON
+                 for the corresponding single character;
+     TOK_EOF     when the end of the input has been reached;
+     TOK_ERROR   when status is already failing on entry, when reading
+                 the first character fails, or when that character is
+                 BADBOM (status is not modified in the BADBOM case);
+     otherwise   whatever getStringToken() returns, with the string
+                 stored in token. Adjacent strings are merged there,
+                 so two string tokens are never returned in a row.
+
+   *linenumber receives the value of lineCount: for single-character
+   tokens the line of that character, for strings the line reached once
+   the whole (possibly merged) string has been consumed. It is left
+   untouched when the function fails before a character was read. */
+U_CFUNC enum ETokenType
+getNextToken(UCHARBUF* buf,
+             struct UString *token,
+             uint32_t *linenumber, /* out: linenumber of token */
+             struct UString *comment,
+             UErrorCode *status) {
+    UChar32 first;
+    enum ETokenType kind;
+
+    if (U_FAILURE(*status)) {
+        return TOK_ERROR;
+    }
+
+    /* First significant character: whitespace and comments are skipped. */
+    first = getNextChar(buf, TRUE, comment, status);
+    if (U_FAILURE(*status)) {
+        return TOK_ERROR;
+    }
+
+    *linenumber = lineCount;
+
+    /* Single-character tokens are recognised right away. */
+    if (first == BADBOM) {
+        return TOK_ERROR;
+    }
+    if (first == OPENBRACE) {
+        return TOK_OPEN_BRACE;
+    }
+    if (first == CLOSEBRACE) {
+        return TOK_CLOSE_BRACE;
+    }
+    if (first == COMMA) {
+        return TOK_COMMA;
+    }
+    if (first == U_EOF) {
+        return TOK_EOF;
+    }
+    if (first == COLON) {
+        return TOK_COLON;
+    }
+
+    /* Anything else begins a string token. */
+    kind = getStringToken(buf, first, token, status);
+
+    /* The string may have spanned several lines; report where it ended. */
+    *linenumber = lineCount;
+    return kind;
+}
+
+/* Copy a string token into the given UString. Upon entry, we have
+ already read the first character of the string token (initialChar),
+ which is not a whitespace character (but may be a QUOTE or ESCAPE).
+ The token parameter is first truncated to length 0; this function
+ then reads all subsequent characters that belong to this string and
+ appends them to it.
+
+ The other important, and slightly convoluted, purpose of this
+ function is to merge adjacent strings. It looks forward a bit, and
+ if the next non-comment, non-whitespace item is a string, it reads
+ it in as well. If two adjacent strings are both quoted, they are
+ merged without intervening space. Otherwise a single SPACE
+ character is inserted between them.
+
+ Return value:
+ TOK_STRING - a delimiter (brace, comma, colon) follows the string
+ and has been pushed back, or EOF or a failing status was met
+ while reading an unquoted run or while looking ahead. Note that
+ status may therefore be failing even though TOK_STRING is
+ returned.
+ TOK_EOF - the input ended inside a quoted string.
+ TOK_ERROR - status was failing on entry or after an append,
+ unescape() reported an error, or quoted and unquoted strings
+ were mixed while isStrict() is true.
+
+ Side effect: lineCount is advanced for line breaks consumed inside
+ quoted strings and, through isWhitespace(), for those that end an
+ unquoted run or are skipped during lookahead. */
+static enum ETokenType getStringToken(UCHARBUF* buf,
+ UChar32 initialChar,
+ struct UString *token,
+ UErrorCode *status) {
+ UBool lastStringWasQuoted;
+ UChar32 c;
+ /* Scratch buffer that receives the UTF-16 form of one code point
+ before it is appended to token; len is its used length. */
+ UChar target[3] = { '\0' };
+ UChar *pTarget = target;
+ int len=0;
+ /* TRUE while the next character of a quoted string must be taken
+ literally because unescape() handed back a bare ESCAPE. */
+ UBool isFollowingCharEscaped=FALSE;
+ /* TRUE when the current CR/LF came out of unescape(), i.e. from an
+ escape sequence, and so is not a line break in the input. */
+ UBool isNLUnescaped = FALSE;
+ /* Previous character of the quoted string, used to count a CR LF
+ pair as a single line. */
+ UChar32 prevC=0;
+
+ /* We are guaranteed on entry that initialChar is not a whitespace
+ character. If we are at the EOF, or have some other problem, it
+ doesn't matter; we still want to validly return the initialChar
+ (if nothing else) as a string token. */
+
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+
+ /* setup */
+ lastStringWasQuoted = FALSE;
+ c = initialChar;
+ ustr_setlen(token, 0, status);
+
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+
+ /* Each iteration consumes one string segment (quoted or unquoted)
+ starting at c, then looks ahead to decide whether another
+ segment follows and must be merged. */
+ for (;;) {
+ if (c == QUOTE) {
+ /* Quoted after quoted: no separator. Quoted after unquoted:
+ one SPACE. */
+ if (!lastStringWasQuoted && token->fLength > 0) {
+ ustr_ucat(token, SPACE, status);
+
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+ }
+
+ lastStringWasQuoted = TRUE;
+
+ /* Read up to the closing, unescaped QUOTE. Characters are
+ fetched with ucbuf_getc() directly, so whitespace and
+ comment markers inside the quotes are kept verbatim. */
+ for (;;) {
+ c = ucbuf_getc(buf,status);
+
+ /* EOF reached: the quoted string is unterminated */
+ if (c == U_EOF) {
+ return TOK_EOF;
+ }
+
+ /* Reading the character failed */
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+
+ if (c == QUOTE && !isFollowingCharEscaped) {
+ break;
+ }
+
+ if (c == ESCAPE && !isFollowingCharEscaped) {
+ pTarget = target;
+ c = unescape(buf, status);
+
+ if (c == U_ERR) {
+ return TOK_ERROR;
+ }
+ if(c == CR || c == LF){
+ isNLUnescaped = TRUE;
+ }
+ }
+
+ /* If unescape() handed back ESCAPE itself, append nothing
+ now and take the following character literally (so an
+ escaped QUOTE does not end the string). */
+ if(c==ESCAPE && !isFollowingCharEscaped){
+ isFollowingCharEscaped = TRUE;
+ }else{
+ U_APPEND_CHAR32(c, pTarget,len);
+ pTarget = target;
+ ustr_uscat(token, pTarget,len, status);
+ isFollowingCharEscaped = FALSE;
+ len=0;
+ if(c == CR || c == LF){
+ /* Count only line breaks present in the input, and do
+ not count again for a character that follows a CR. */
+ if(isNLUnescaped == FALSE && prevC!=CR){
+ lineCount++;
+ }
+ isNLUnescaped = FALSE;
+ }
+ }
+
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+ prevC = c;
+ }
+ } else {
+ /* An unquoted segment is always separated from what
+ precedes it by one SPACE. */
+ if (token->fLength > 0) {
+ ustr_ucat(token, SPACE, status);
+
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+ }
+
+ /* if we get inside this check we are mixing
+ * quoted and unquoted strings
+ * warn in normal mode and error in
+ * pedantic mode
+ */
+ if(lastStringWasQuoted){
+ if(getShowWarning()){
+ warning(lineCount, "Mixing quoted and unquoted strings");
+ }
+ if(isStrict()){
+ return TOK_ERROR;
+ }
+
+ }
+
+ lastStringWasQuoted = FALSE;
+
+ if (c == ESCAPE) {
+ pTarget = target;
+ c = unescape(buf, status);
+
+ /* EOF reached */
+ /* NOTE(review): the other two unescape() call sites test
+ for U_ERR rather than U_EOF - confirm which sentinel is
+ intended here. */
+ if (c == U_EOF) {
+ return TOK_ERROR;
+ }
+ }
+
+ /* Append the first character of the unquoted run. */
+ U_APPEND_CHAR32(c, pTarget,len);
+ pTarget = target;
+ ustr_uscat(token, pTarget,len, status);
+ len=0;
+
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+
+ /* Append the rest of the run, up to whitespace or a
+ delimiter. getNextChar() still strips comments here. */
+ for (;;) {
+ /* DON'T skip whitespace */
+ c = getNextChar(buf, FALSE, NULL, status);
+
+ /* EOF reached */
+ if (c == U_EOF) {
+ ucbuf_ungetc(c, buf);
+ return TOK_STRING;
+ }
+
+ if (U_FAILURE(*status)) {
+ return TOK_STRING;
+ }
+
+ /* A delimiter or an opening QUOTE ends the run; push it
+ back so that it is seen again. */
+ if (c == QUOTE
+ || c == OPENBRACE
+ || c == CLOSEBRACE
+ || c == COMMA
+ || c == COLON) {
+ ucbuf_ungetc(c, buf);
+ break;
+ }
+
+ /* Whitespace ends the run and is consumed; isWhitespace()
+ also advances lineCount for a line break. */
+ if (isWhitespace(c)) {
+ break;
+ }
+
+ if (c == ESCAPE) {
+ pTarget = target;
+ c = unescape(buf, status);
+
+ if (c == U_ERR) {
+ return TOK_ERROR;
+ }
+ }
+
+ U_APPEND_CHAR32(c, pTarget,len);
+ pTarget = target;
+ ustr_uscat(token, pTarget,len, status);
+ len=0;
+ if (U_FAILURE(*status)) {
+ return TOK_ERROR;
+ }
+ }
+ }
+
+ /* Look ahead past whitespace and comments for the start of an
+ adjacent string to merge. */
+ /* DO skip whitespace */
+ c = getNextChar(buf, TRUE, NULL, status);
+
+ if (U_FAILURE(*status)) {
+ return TOK_STRING;
+ }
+
+ if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
+ ucbuf_ungetc(c, buf);
+ return TOK_STRING;
+ }
+ /* NOTE(review): c is not tested for U_EOF here, so when the input
+ ends right after a string the loop runs once more with
+ c == U_EOF and takes the unquoted branch, which appends a SPACE
+ and the EOF sentinel before returning. Looks unintended -
+ confirm. */
+ }
+}
+
+/* Retrieve the next character from buf, transparently skipping
+   comments, and also whitespace when skipwhite is true.
+
+   Two comment forms are recognised:
+     - slash slash: skipped up to and including the end of the line;
+     - slash asterisk: skipped up to the closing asterisk slash. When
+       the opening is immediately followed by a second asterisk, the
+       comment text is collected into token (which may be NULL).
+
+   A slash followed by anything else is returned as SLASH, with the
+   following character pushed back into the buffer.
+
+   Returns U_EOF at the end of the input or when status is already
+   failing on entry. Line breaks that are skipped advance lineCount
+   through isWhitespace() / isNewline(); when skipwhite is false,
+   whitespace is returned to the caller without being counted here.
+
+   NOTE(review): because the character after the opening slash-asterisk
+   is consumed when it is an asterisk, a comment made of exactly
+   slash-asterisk-asterisk-slash does not appear to be recognised as
+   closed - confirm. */
+static UChar32 getNextChar(UCHARBUF* buf,
+                           UBool skipwhite,
+                           struct UString *token,
+                           UErrorCode *status) {
+    UChar32 ch;
+    UChar32 afterStar;
+
+    if (U_FAILURE(*status)) {
+        return U_EOF;
+    }
+
+    for (;;) {
+        ch = ucbuf_getc(buf, status);
+        if (ch == U_EOF) {
+            return U_EOF;
+        }
+
+        if (skipwhite && isWhitespace(ch)) {
+            continue;
+        }
+
+        /* Anything but a slash goes straight back to the caller; this
+           is also the path taken when the read itself failed. */
+        if (ch != SLASH) {
+            return ch;
+        }
+
+        /* A slash was seen: the next character decides what it means. */
+        ch = ucbuf_getc(buf, status);
+        if (ch == U_EOF) {
+            return U_EOF;
+        }
+
+        if (ch == SLASH) {
+            /* Line comment. */
+            seekUntilNewline(buf, NULL, status);
+        } else if (ch == ASTERISK) {
+            /* Block comment; a doubled asterisk asks for its text to be kept. */
+            afterStar = ucbuf_getc(buf, status);
+            if (afterStar != ASTERISK) {
+                /* Not part of the opening marker: give it back. */
+                ucbuf_ungetc(afterStar, buf);
+                seekUntilEndOfComment(buf, NULL, status);
+            } else {
+                seekUntilEndOfComment(buf, token, status);
+            }
+        } else {
+            /* A lone slash: put the following character back (a no-op
+               if the read failed) and return the slash itself. */
+            ucbuf_ungetc(ch, buf);
+            return SLASH;
+        }
+    }
+}
+
+/* Consume characters from buf up to and including the next newline
+   (as defined by isNewline()), or up to the end of the input.
+
+   buf    - buffer to read from.
+   token  - when not NULL, every character consumed (including the
+            terminating newline) is appended to it. The U_EOF sentinel
+            is not a character of the input and is never appended.
+   status - nothing is read if it is already failing on entry; the
+            scan stops as soon as it becomes failing.
+
+   Side effect: isNewline() advances lineCount for the line break that
+   ends the scan. */
+static void seekUntilNewline(UCHARBUF* buf,
+                             struct UString *token,
+                             UErrorCode *status) {
+    UChar32 c;
+
+    if (U_FAILURE(*status)) {
+        return;
+    }
+
+    do {
+        c = ucbuf_getc(buf, status);
+        /* add the char to token */
+        if (token != NULL && c != U_EOF) {
+            ustr_u32cat(token, c, status);
+        }
+        /* Stop on failure only, consistent with the entry guard: a
+           warning status must not end the scan after one character. */
+    } while (!isNewline(c) && c != U_EOF && !U_FAILURE(*status));
+}
+
+/* Consume a block comment whose opening marker has already been read,
+   up to and including the closing asterisk-slash pair.
+
+   buf    - buffer to read from.
+   token  - when not NULL, receives the text of the comment. The
+            closing asterisk-slash pair and the U_EOF sentinel are not
+            appended.
+   status - nothing is read if it is already failing on entry; the
+            scan stops as soon as it becomes failing. If the input ends
+            before the comment is closed, it is set to
+            U_INVALID_FORMAT_ERROR and an error is reported against the
+            line on which the scan started.
+
+   Side effect: lineCount is advanced, through isNewline(), for every
+   line break inside the comment. */
+static void seekUntilEndOfComment(UCHARBUF *buf,
+                                  struct UString *token,
+                                  UErrorCode *status) {
+    UChar32 c, d;
+    uint32_t line;
+
+    if (U_FAILURE(*status)) {
+        return;
+    }
+
+    /* Remember where the comment began for the error message below. */
+    line = lineCount;
+
+    do {
+        c = ucbuf_getc(buf, status);
+
+        if (c == ASTERISK) {
+            d = ucbuf_getc(buf, status);
+
+            if (d == SLASH) {
+                /* End of comment; c is ASTERISK here, so the EOF check
+                   after the loop does not fire. */
+                break;
+            }
+            /* A lone asterisk is ordinary comment text: give back the
+               character that followed it. */
+            ucbuf_ungetc(d, buf);
+        }
+        /* add the char to token */
+        if (token != NULL && c != U_EOF) {
+            ustr_u32cat(token, c, status);
+        }
+        /* Called only for its side effect of incrementing lineCount. */
+        isNewline(c);
+
+        /* Stop on failure only, consistent with the entry guard: a
+           warning status must not cut the comment short. */
+    } while (c != U_EOF && !U_FAILURE(*status));
+
+    if (c == U_EOF) {
+        *status = U_INVALID_FORMAT_ERROR;
+        error(line, "unterminated comment detected");
+    }
+}
+
+/* Decode the escape sequence whose leading ESCAPE character has just
+   been consumed from buf, and return the resulting code point.
+
+   Returns U_EOF without touching buf when status is already failing;
+   otherwise returns whatever ucbuf_getcx32() returns. */
+U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
+    if (!U_FAILURE(*status)) {
+        /* The caller has already read the ESCAPE, but ucbuf_getcx32()
+           is handed the sequence starting at the ESCAPE, so put it
+           back before decoding. */
+        ucbuf_ungetc(ESCAPE, buf);
+        return ucbuf_getcx32(buf, status);
+    }
+
+    return U_EOF;
+}
+
+/* Tell whether c is whitespace for the tokenizer: line feed (0x000A),
+   paragraph separator (0x2029), carriage return (0x000D), space
+   (0x0020), tab (0x0009) or 0xFEFF.
+
+   Side effect: a line feed or paragraph separator also increments
+   lineCount. A carriage return does not, so a CR LF pair counts as a
+   single line. */
+static UBool isWhitespace(UChar32 c) {
+    /* The two characters that end a line are counted as they are seen. */
+    if (c == 0x000A || c == 0x2029) {
+        lineCount++;
+        return TRUE;
+    }
+
+    if (c == 0x000D || c == 0x0020 || c == 0x0009 || c == 0xFEFF) {
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+/* Tell whether c is a line-break character: line feed (0x000A),
+   paragraph separator (0x2029) or carriage return (0x000D).
+
+   Side effect: a line feed or paragraph separator also increments
+   lineCount. A carriage return does not, so a CR LF pair counts as a
+   single line. Some callers invoke this purely for that side effect. */
+static UBool isNewline(UChar32 c) {
+    if (c == 0x000A || c == 0x2029) {
+        lineCount++;
+        return TRUE;
+    }
+
+    if (c == 0x000D) {
+        return TRUE;
+    }
+
+    return FALSE;
+}