/*********************************************************************** * $Id$ * Copyright 2009 Aplix Corporation. All rights reserved. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ***********************************************************************/ #include #include #include #include #include "lex.h" #include "misc.h" #include "node.h" #include "process.h" struct file { struct file *next; const char *filename; char *buf; const char *pos, *end; unsigned int linenum; }; const char keywords[] = KEYWORDS; static struct file *file, *firstfile; static struct tok tok; /*********************************************************************** * readinput : read all input files into memory * * Enter: argv = 0-terminated array of filenames */ void readinput(const char *const *argv) { struct file **pfile = &file; for (;;) { struct file *file; const char *filename = *argv++; char *buf = 0; int len = 0, thislen, isstdin; FILE *handle; if (!filename) break; /* Read the file. */ isstdin = !strcmp(filename, "-"); if (isstdin) { handle = stdin; filename = ""; } else { handle = fopen(filename, "rb"); if (!handle) errorexit("%s: %s", filename, strerror(errno)); } for (;;) { thislen = len ? len * 2 : 4096; buf = memrealloc(buf, len + thislen + 1); thislen = fread(buf + len, 1, thislen, handle); if (!thislen) break; len += thislen; } if (ferror(handle)) errorexit("%s: I/O error", filename); if (!isstdin) fclose(handle); buf[len] = 0; buf = memrealloc(buf, len + 1); /* Create the file struct for it. */ file = memalloc(sizeof(struct file)); *pfile = file; pfile = &file->next; file->filename = filename; file->pos = file->buf = buf; file->end = buf + len; file->linenum = 1; } *pfile = 0; firstfile = file; } /*********************************************************************** * lexerrorexit : error and exit with line number */ static void lexerrorexit(const char *format, ...) { va_list ap; va_start(ap, format); vlocerrorexit(file->filename, file->linenum, format, ap); va_end(ap); } /*********************************************************************** * lexblockcomment : lex a block comment * * Enter: start = start of comment * * Return: tok struct, lifetime until next call to lex */ static struct tok * lexblockcomment(const char *start) { const char *p = start + 1; tok.filename = file->filename; tok.linenum = file->linenum; for (;;) { int ch = *++p; if (!ch) lexerrorexit("unterminated block comment"); if (ch != '*') { if (ch == '\n') file->linenum++; continue; } ch = p[1]; if (!ch) lexerrorexit("unterminated block comment"); if (ch == '/') break; } p += 2; file->pos = p; tok.type = TOK_BLOCKCOMMENT; tok.start = start + 2; tok.len = p - start - 4; return &tok; } /*********************************************************************** * lexinlinecomment : lex an inline comment * * Enter: start = start of comment, starts with "//" * * Return: tok struct, lifetime until next call to lex */ static struct tok * lexinlinecomment(const char *start) { const char *p = start + 2; p = start + 1; for (;;) { int ch = *++p; if (!ch || ch == '\n') break; } p++; file->pos = p; tok.type = TOK_INLINECOMMENT; tok.start = start + 2; tok.len = p - start - 2; tok.filename = file->filename; tok.linenum = file->linenum++; return &tok; } /*********************************************************************** * lexnumber : lex a number (or just a '-' symbol) * * Enter: start = start of token * * Return: tok struct, lifetime until next call to lex * * The IDL grammar seems to say that a float can't start with a * decimal point, so that's what we have implemented here. */ static struct tok * lexnumber(const char *start) { for (;;) { const char *p = start; const char *octalend = start; int ch = *p; enum { STATE_START, STATE_INT, STATE_HEX, STATE_OCTAL, STATE_BADOCTAL, STATE_DP, STATE_EXPSTART, STATE_EXPSIGN, STATE_EXP } state = STATE_START; if (ch == '-') { ch = *++p; if (ch == 'I') { // starts of Infinity char * infinity = "-Infinity"; unsigned int len = strlen(infinity); if (!memcmp(start, infinity, len)) { tok.type = TOK_minusinfinity; tok.start = start; tok.len = len; tok.filename = file->filename; tok.linenum = file->linenum; file->pos = start + len; return &tok; } } } if (ch == '0') { state = STATE_OCTAL; ch = *++p; if ((ch & ~0x20) == 'X') { state = STATE_HEX; ch = *++p; } } for (;;) { if ((unsigned)(ch - '0') >= 8) { if ((ch & -2) == '8') { if (state == STATE_OCTAL) { state = STATE_BADOCTAL; octalend = p; } } else if ((unsigned)((ch & ~0x20) - 'A') <= 'F' - 'A') { if (state != STATE_HEX) { if ((ch & ~0x20) != 'E') break; if (state == STATE_HEX || state >= STATE_EXPSTART || state == STATE_START) break; state = STATE_EXPSTART; } } else if (ch == '.') { if (state == STATE_HEX || state >= STATE_DP) break; state = STATE_DP; } else if (ch == '-') { if (state != STATE_EXPSTART) break; state = STATE_EXPSIGN; } else break; } ch = *++p; if (state == STATE_START) state = STATE_INT; else if (state == STATE_EXPSTART || state == STATE_EXPSIGN) state = STATE_EXP; } switch (state) { case STATE_START: /* Must have just been a - character by itself. */ tok.type = '-'; p = start + 1; break; case STATE_BADOCTAL: p = octalend; /* fall through... */ case STATE_INT: case STATE_OCTAL: tok.type = TOK_INTEGER; break; case STATE_HEX: if (p - start == 2 || (p - start == 3 && *start == '-')) p = start + 1; tok.type = TOK_INTEGER; break; case STATE_EXP: case STATE_DP: tok.type = TOK_FLOAT; break; case STATE_EXPSIGN: p--; /* fall through... */ case STATE_EXPSTART: p--; tok.type = TOK_FLOAT; break; } tok.start = start; tok.len = p - start; tok.filename = file->filename; tok.linenum = file->linenum; file->pos = p; return &tok; } } /*********************************************************************** * lexstring : lex a quoted string * * Enter: start = start of token * * Return: tok struct, lifetime until next call to lex */ static struct tok * lexstring(const char *start) { for (;;) { const char *p = start + 1; int ch = *p; for (;;) { if (!ch || ch == '\n') lexerrorexit("unterminated string"); if (ch == '"') { tok.type = TOK_STRING; tok.start = start + 1; tok.len = p - start - 1; tok.filename = file->filename; tok.linenum = file->linenum; file->pos = p + 1; return &tok; } /* Note the IDL spec doesn't seem to allow for escape sequences * in strings. */ ch = *++p; } } } /*********************************************************************** * lexidentifier : lex an identifier * * Enter: start = start of token * * Return: tok struct, lifetime until next call to lex */ static struct tok * lexidentifier(const char *start) { const char *p = start + 1; for (;;) { int ch = *p; if (ch != '_' && (unsigned)(ch - '0') >= 10 && (unsigned)((ch & ~0x20) - 'A') > 'Z' - 'A') { break; } p++; } tok.type = TOK_IDENTIFIER; tok.start = start; tok.len = p - start; tok.filename = file->filename; tok.linenum = file->linenum; file->pos = p; /* See if this is a keyword. (This search is a bit n-squared.) */ { unsigned int type = TOK_DOMString; p = keywords; for (;;) { unsigned int len = strlen(p); if (!len) break; if (len == tok.len && !memcmp(start, p, len)) { tok.type = type; break; } p += len + 1; type++; } } return &tok; } /*********************************************************************** * lex : retrieve next token * * Return: tok struct, lifetime until next call to lex */ struct tok * lex(void) { const char *p; int ch; for (;;) { if (!file) { tok.type = TOK_EOF; tok.start = "end of file"; tok.len = strlen(tok.start); return &tok; } tok.prestart = p = file->pos; /* Flush whitespace. */ for (;;) { ch = *p++; switch (ch) { case ' ': case '\t': case '\r': continue; case '\n': ++file->linenum; tok.prestart = p; continue; } break; } p--; if (ch) break; if (p != file->end) lexerrorexit("\\0 byte not allowed"); file = file->next; } /* See if we have a comment. */ tok.start = p; if (ch == '/') { switch (*++p) { case '*': return lexblockcomment(p - 1); case '/': return lexinlinecomment(p - 1); } tok.type = '/'; } else { /* Handle things that start with '-', which is either '-' as a token, * or a number. Handle numbers. */ if (ch == '-' || (unsigned)(ch - '0') < 10) return lexnumber(p); /* Handle string. */ if (ch == '"') return lexstring(p); /* Handle identifier. */ if (ch == '_' || (unsigned)((ch & ~0x20) - 'A') <= 'Z' - 'A') return lexidentifier(p); /* The only multi-symbol token are ... and [] */ if (ch == '.') { tok.type = '.'; if (*++p == '.' && p[1] == '.') { tok.type = TOK_ELLIPSIS; p += 2; } goto done; } if (ch == '[') { tok.type = '['; if (*++p == ']') { tok.type = TOK_DOUBLEBRACKET; p++; } goto done; } } /* Single symbol token. */ tok.type = ch; p++; done: tok.filename = file->filename; tok.linenum = file->linenum; tok.len = p - tok.start; file->pos = p; return &tok; } /*********************************************************************** * outputwidl : output literal Web IDL input that node was parsed from * * Enter: node = parse node to output literal Web IDL for */ void outputwidl(struct node *node) { const char *start = node->wsstart, *end = node->end; /* Find the file that start is in. */ struct file *file = firstfile; while (start < file->buf || start >= file->end) { file = file->next; assert(file); } /* Find the (current or) next node that has node->start set. Any such * node needs to be put inside a element. */ while (node && !node->start) node = nodewalk(node); /* Output until we get to the end. This has to cope with the text * spanning multiple input files. */ for (;;) { int final = end >= file->buf && end <= file->end; const char *thisend = final ? end : file->end; /* Output the Web IDL, omitting comments. */ while (start != end) { const char *p, *p2, *comment, *endcomment; int ch; if (node && start == node->start) { /* We are on the start of the present node in the tree * walk. Put it in a . */ fputs("", stdout); printtext(node->start, node->end - node->start, 1); fputs("", stdout); start = node->end; /* Skip to the next node with node->start set if any. */ do node = nodewalk(node); while (node && !node->start); continue; } p2 = thisend; if (node && node->start >= file->buf && node->start < p2) p2 = node->start; p = memchr(start, '/', p2 - start); if (!p) { printtext(start, p2 - start, 1); if (p2 != thisend) { start = p2; continue; } break; } /* See if we're at the start of a comment. If so find the end. */ comment = 0; if (p + 1 != thisend) { switch (p[1]) { case '*': /* Block comment. */ comment = p; p++; do p = memchr(p + 1, '*', thisend - p - 1); while (p[1] != '/'); endcomment = p + 2; break; case '/': /* Inline comment. */ comment = p; p = memchr(p, '\n', thisend - p); if (!p) p = thisend; endcomment = p; break; } } if (!comment) { /* Not at start of comment. */ p++; printtext(start, p - start, 1); start = p; assert(start <= end); continue; } /* If the comment has only whitespace before it on the line, * eat that up. */ p = comment; while (p != start && ((ch = p[-1]) == ' ' || ch == '\t')) p--; if (p == start || p[-1] == '\n') { comment = p; /* If the comment has only whitespace after it to the end * of the line, eat that and the newline up. This always * happens for an inline comment on a line by itself. */ p = endcomment; while (p != thisend && ((ch = *p) == ' ' || ch == '\t')) p++; if (p != thisend && *p == '\n') p++; endcomment = p; } printtext(start, comment - start, 1); start = endcomment; if (start > thisend) start = thisend; } if (final) break; file = file->next; assert(file); start = file->buf; } }