diff options
Diffstat (limited to 'ldap/c-sdk/libraries/libldap/regex.c')
-rw-r--r-- | ldap/c-sdk/libraries/libldap/regex.c | 920 |
1 files changed, 0 insertions, 920 deletions
diff --git a/ldap/c-sdk/libraries/libldap/regex.c b/ldap/c-sdk/libraries/libldap/regex.c deleted file mode 100644 index 4ba4ee08a..000000000 --- a/ldap/c-sdk/libraries/libldap/regex.c +++ /dev/null @@ -1,920 +0,0 @@ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Mozilla Communicator client code, released - * March 31, 1998. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998-1999 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either of the GNU General Public License Version 2 or later (the "GPL"), - * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ -#include "ldap-int.h" -#if defined( macintosh ) || defined( DOS ) || defined( _WINDOWS ) || defined( NEED_BSDREGEX ) || defined( XP_OS2) -#include "regex.h" - -/* - * regex - Regular expression pattern matching and replacement - * - * By: Ozan S. Yigit (oz) - * Dept. of Computer Science - * York University - * - * These routines are the PUBLIC DOMAIN equivalents of regex - * routines as found in 4.nBSD UN*X, with minor extensions. - * - * These routines are derived from various implementations found - * in software tools books, and Conroy's grep. They are NOT derived - * from licensed/restricted software. - * For more interesting/academic/complicated implementations, - * see Henry Spencer's regexp routines, or GNU Emacs pattern - * matching module. - * - * Use the actual CCL code in the CLO - * section of pmatch. No need for a recursive - * pmatch call. - * - * Use a bitmap table to set char bits in an - * 8-bit chunk. - * - * Interfaces: - * re_comp: compile a regular expression into a NFA. - * - * char *re_comp(s) - * char *s; - * - * re_exec: execute the NFA to match a pattern. - * - * int re_exec(s) - * char *s; - * - * re_modw change re_exec's understanding of what a "word" - * looks like (for \< and \>) by adding into the - * hidden word-syntax table. - * - * void re_modw(s) - * char *s; - * - * re_subs: substitute the matched portions in a new string. - * - * int re_subs(src, dst) - * char *src; - * char *dst; - * - * re_fail: failure routine for re_exec. - * - * void re_fail(msg, op) - * char *msg; - * char op; - * - * Regular Expressions: - * - * [1] char matches itself, unless it is a special - * character (metachar): . \ [ ] * + ^ $ - * - * [2] . matches any character. - * - * [3] \ matches the character following it, except - * when followed by a left or right round bracket, - * a digit 1 to 9 or a left or right angle bracket. - * (see [7], [8] and [9]) - * It is used as an escape character for all - * other meta-characters, and itself. When used - * in a set ([4]), it is treated as an ordinary - * character. - * - * [4] [set] matches one of the characters in the set. - * If the first character in the set is "^", - * it matches a character NOT in the set, i.e. - * complements the set. A shorthand S-E is - * used to specify a set of characters S upto - * E, inclusive. The special characters "]" and - * "-" have no special meaning if they appear - * as the first chars in the set. - * examples: match: - * - * [a-z] any lowercase alpha - * - * [^]-] any char except ] and - - * - * [^A-Z] any char except uppercase - * alpha - * - * [a-zA-Z] any alpha - * - * [5] * any regular expression form [1] to [4], followed by - * closure char (*) matches zero or more matches of - * that form. - * - * [6] + same as [5], except it matches one or more. - * - * [7] a regular expression in the form [1] to [10], enclosed - * as \(form\) matches what form matches. The enclosure - * creates a set of tags, used for [8] and for - * pattern substution. The tagged forms are numbered - * starting from 1. - * - * [8] a \ followed by a digit 1 to 9 matches whatever a - * previously tagged regular expression ([7]) matched. - * - * [9] \< a regular expression starting with a \< construct - * \> and/or ending with a \> construct, restricts the - * pattern matching to the beginning of a word, and/or - * the end of a word. A word is defined to be a character - * string beginning and/or ending with the characters - * A-Z a-z 0-9 and _. It must also be preceded and/or - * followed by any character outside those mentioned. - * - * [10] a composite regular expression xy where x and y - * are in the form [1] to [10] matches the longest - * match of x followed by a match for y. - * - * [11] ^ a regular expression starting with a ^ character - * $ and/or ending with a $ character, restricts the - * pattern matching to the beginning of the line, - * or the end of line. [anchors] Elsewhere in the - * pattern, ^ and $ are treated as ordinary characters. - * - * - * Acknowledgements: - * - * HCR's Hugh Redelmeier has been most helpful in various - * stages of development. He convinced me to include BOW - * and EOW constructs, originally invented by Rob Pike at - * the University of Toronto. - * - * References: - * Software tools Kernighan & Plauger - * Software tools in Pascal Kernighan & Plauger - * Grep [rsx-11 C dist] David Conroy - * ed - text editor Un*x Programmer's Manual - * Advanced editing on Un*x B. W. Kernighan - * RegExp routines Henry Spencer - * - * Notes: - * - * This implementation uses a bit-set representation for character - * classes for speed and compactness. Each character is represented - * by one bit in a 128-bit block. Thus, CCL always takes a - * constant 16 bytes in the internal nfa, and re_exec does a single - * bit comparison to locate the character in the set. - * - * Examples: - * - * pattern: foo*.* - * compile: CHR f CHR o CLO CHR o END CLO ANY END END - * matches: fo foo fooo foobar fobar foxx ... - * - * pattern: fo[ob]a[rz] - * compile: CHR f CHR o CCL bitset CHR a CCL bitset END - * matches: fobar fooar fobaz fooaz - * - * pattern: foo\\+ - * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END - * matches: foo\ foo\\ foo\\\ ... - * - * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) - * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END - * matches: foo1foo foo2foo foo3foo - * - * pattern: \(fo.*\)-\1 - * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END - * matches: foo-foo fo-fo fob-fob foobar-foobar ... - */ - -#define MAXNFA 1024 -#define MAXTAG 10 - -#define OKP 1 -#define NOP 0 - -#define CHR 1 -#define ANY 2 -#define CCL 3 -#define BOL 4 -#define EOL 5 -#define BOT 6 -#define EOT 7 -#define BOW 8 -#define EOW 9 -#define REF 10 -#define CLO 11 - -#define END 0 - -/* - * The following defines are not meant to be changeable. - * They are for readability only. - */ -#define MAXCHR 128 -#define CHRBIT 8 -#define BITBLK MAXCHR/CHRBIT -#define BLKIND 0170 -#define BITIND 07 - -#define ASCIIB 0177 - -/* Plain char, on the other hand, may be signed or unsigned; it depends on - * the platform and perhaps a compiler option. A hard fact of life, in C. - * - * 6-April-1999 mcs@netscape.com: replaced CHAR with REGEXCHAR to avoid - * conflicts with system types on Win32. Changed typedef - * for REGEXCHAR to always be unsigned, which seems right. - */ -typedef unsigned char REGEXCHAR; - -static int tagstk[MAXTAG]; /* subpat tag stack..*/ -static REGEXCHAR nfa[MAXNFA]; /* automaton.. */ -static int sta = NOP; /* status of lastpat */ - -static REGEXCHAR bittab[BITBLK]; /* bit table for CCL */ - /* pre-set bits... */ -static REGEXCHAR bitarr[] = {1,2,4,8,16,32,64,128}; - -static void -chset(REGEXCHAR c) -{ - bittab[((c) & (unsigned)BLKIND) >> 3] |= bitarr[(c) & BITIND]; -} - -#define badpat(x) (*nfa = END, x) -#define store(x) *mp++ = x - -char * -LDAP_CALL -re_comp( const char *pat ) -{ - register REGEXCHAR *p; /* pattern pointer */ - register REGEXCHAR *mp=nfa; /* nfa pointer */ - register REGEXCHAR *lp; /* saved pointer.. */ - register REGEXCHAR *sp=nfa; /* another one.. */ - - register int tagi = 0; /* tag stack index */ - register int tagc = 1; /* actual tag count */ - - register int n; - register REGEXCHAR mask; /* xor mask -CCL/NCL */ - int c1, c2; - - if (!pat || !*pat) { - if (sta) { - return 0; - } else { - return badpat("No previous regular expression"); - } - } - sta = NOP; - - for (p = (REGEXCHAR*)pat; *p; p++) { - lp = mp; - switch(*p) { - - case '.': /* match any char.. */ - store(ANY); - break; - - case '^': /* match beginning.. */ - if (p == (REGEXCHAR*)pat) - store(BOL); - else { - store(CHR); - store(*p); - } - break; - - case '$': /* match endofline.. */ - if (!*(p+1)) - store(EOL); - else { - store(CHR); - store(*p); - } - break; - - case '[': /* match char class..*/ - store(CCL); - - if (*++p == '^') { - mask = 0377; - p++; - } - else - mask = 0; - - if (*p == '-') /* real dash */ - chset(*p++); - if (*p == ']') /* real brac */ - chset(*p++); - while (*p && *p != ']') { - if (*p == '-' && *(p+1) && *(p+1) != ']') { - p++; - c1 = *(p-2) + 1; - c2 = *p++; - while (c1 <= c2) - chset((REGEXCHAR)c1++); - } -#ifdef EXTEND - else if (*p == '\\' && *(p+1)) { - p++; - chset(*p++); - } -#endif - else - chset(*p++); - } - if (!*p) - return badpat("Missing ]"); - - for (n = 0; n < BITBLK; bittab[n++] = (REGEXCHAR) 0) - store(mask ^ bittab[n]); - - break; - - case '*': /* match 0 or more.. */ - case '+': /* match 1 or more.. */ - if (p == (REGEXCHAR*)pat) - return badpat("Empty closure"); - lp = sp; /* previous opcode */ - if (*lp == CLO) /* equivalence.. */ - break; - switch(*lp) { - - case BOL: - case BOT: - case EOT: - case BOW: - case EOW: - case REF: - return badpat("Illegal closure"); - default: - break; - } - - if (*p == '+') - for (sp = mp; lp < sp; lp++) - store(*lp); - - store(END); - store(END); - sp = mp; - while (--mp > lp) - *mp = mp[-1]; - store(CLO); - mp = sp; - break; - - case '\\': /* tags, backrefs .. */ - switch(*++p) { - - case '(': - if (tagc < MAXTAG) { - tagstk[++tagi] = tagc; - store(BOT); - store(tagc++); - } - else - return badpat("Too many \\(\\) pairs"); - break; - case ')': - if (*sp == BOT) - return badpat("Null pattern inside \\(\\)"); - if (tagi > 0) { - store(EOT); - store(tagstk[tagi--]); - } - else - return badpat("Unmatched \\)"); - break; - case '<': - store(BOW); - break; - case '>': - if (*sp == BOW) - return badpat("Null pattern inside \\<\\>"); - store(EOW); - break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - n = *p-'0'; - if (tagi > 0 && tagstk[tagi] == n) - return badpat("Cyclical reference"); - if (tagc > n) { - store(REF); - store(n); - } - else - return badpat("Undetermined reference"); - break; -#ifdef EXTEND - case 'b': - store(CHR); - store('\b'); - break; - case 'n': - store(CHR); - store('\n'); - break; - case 'f': - store(CHR); - store('\f'); - break; - case 'r': - store(CHR); - store('\r'); - break; - case 't': - store(CHR); - store('\t'); - break; -#endif - default: - store(CHR); - store(*p); - } - break; - - default : /* an ordinary char */ - store(CHR); - store(*p); - break; - } - sp = lp; - } - if (tagi > 0) - return badpat("Unmatched \\("); - store(END); - sta = OKP; - return 0; -} - - -static REGEXCHAR *bol; -static REGEXCHAR *bopat[MAXTAG]; -static REGEXCHAR *eopat[MAXTAG]; -#ifdef NEEDPROTOS -static REGEXCHAR *pmatch( REGEXCHAR *lp, REGEXCHAR *ap ); -#else /* NEEDPROTOS */ -static REGEXCHAR *pmatch(); -#endif /* NEEDPROTOS */ - -/* - * re_exec: - * execute nfa to find a match. - * - * special cases: (nfa[0]) - * BOL - * Match only once, starting from the - * beginning. - * CHR - * First locate the character without - * calling pmatch, and if found, call - * pmatch for the remaining string. - * END - * re_comp failed, poor luser did not - * check for it. Fail fast. - * - * If a match is found, bopat[0] and eopat[0] are set - * to the beginning and the end of the matched fragment, - * respectively. - * - */ - -int -LDAP_CALL -re_exec( const char *lp ) -{ - register REGEXCHAR c; - register REGEXCHAR *ep = 0; - register REGEXCHAR *ap = nfa; - - bol = (REGEXCHAR*)lp; - - bopat[0] = 0; - bopat[1] = 0; - bopat[2] = 0; - bopat[3] = 0; - bopat[4] = 0; - bopat[5] = 0; - bopat[6] = 0; - bopat[7] = 0; - bopat[8] = 0; - bopat[9] = 0; - - switch(*ap) { - - case BOL: /* anchored: match from BOL only */ - ep = pmatch((REGEXCHAR*)lp,ap); - break; - case CHR: /* ordinary char: locate it fast */ - c = *(ap+1); - while (*lp && *(REGEXCHAR*)lp != c) - lp++; - if (!*lp) /* if EOS, fail, else fall thru. */ - return 0; - default: /* regular matching all the way. */ - do { - if ((ep = pmatch((REGEXCHAR*)lp,ap))) - break; - lp++; - } while (*lp); - - break; - case END: /* munged automaton. fail always */ - return 0; - } - if (!ep) - return 0; - - bopat[0] = (REGEXCHAR*)lp; - eopat[0] = ep; - return 1; -} - -/* - * pmatch: internal routine for the hard part - * - * This code is partly snarfed from an early grep written by - * David Conroy. The backref and tag stuff, and various other - * innovations are by oz. - * - * special case optimizations: (nfa[n], nfa[n+1]) - * CLO ANY - * We KNOW .* will match everything upto the - * end of line. Thus, directly go to the end of - * line, without recursive pmatch calls. As in - * the other closure cases, the remaining pattern - * must be matched by moving backwards on the - * string recursively, to find a match for xy - * (x is ".*" and y is the remaining pattern) - * where the match satisfies the LONGEST match for - * x followed by a match for y. - * CLO CHR - * We can again scan the string forward for the - * single char and at the point of failure, we - * execute the remaining nfa recursively, same as - * above. - * - * At the end of a successful match, bopat[n] and eopat[n] - * are set to the beginning and end of subpatterns matched - * by tagged expressions (n = 1 to 9). - * - */ - -#ifndef re_fail -extern void re_fail(); -#endif /* re_fail */ - -/* - * character classification table for word boundary operators BOW - * and EOW. the reason for not using ctype macros is that we can - * let the user add into our own table. see re_modw. This table - * is not in the bitset form, since we may wish to extend it in the - * future for other character classifications. - * - * TRUE for 0-9 A-Z a-z _ - */ -static char chrtyp[MAXCHR] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 0, 0, 0, 0 - }; - -#define HIBIT 0200 -#define inascii(x) (0177&(x)) -#define iswordc(x) chrtyp[inascii(x)] -#define isinset(x,y) (((y)&HIBIT)?0:((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])) - -/* - * skip values for CLO XXX to skip past the closure - */ - -#define ANYSKIP 2 /* [CLO] ANY END ... */ -#define CHRSKIP 3 /* [CLO] CHR chr END ... */ -#define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */ - -static REGEXCHAR * -pmatch( REGEXCHAR *lp, REGEXCHAR *ap) -{ - register int op, c, n; - register REGEXCHAR *e; /* extra pointer for CLO */ - register REGEXCHAR *bp; /* beginning of subpat.. */ - register REGEXCHAR *ep; /* ending of subpat.. */ - REGEXCHAR *are; /* to save the line ptr. */ - - while ((op = *ap++) != END) - switch(op) { - - case CHR: - if (*lp++ != *ap++) - return 0; - break; - case ANY: - if (!*lp++) - return 0; - break; - case CCL: - c = *lp++; - if (!isinset(ap,c)) - return 0; - ap += BITBLK; - break; - case BOL: - if (lp != bol) - return 0; - break; - case EOL: - if (*lp) - return 0; - break; - case BOT: - bopat[*ap++] = lp; - break; - case EOT: - eopat[*ap++] = lp; - break; - case BOW: - if ((lp!=bol && iswordc(lp[-1])) || !iswordc(*lp)) - return 0; - break; - case EOW: - if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp)) - return 0; - break; - case REF: - n = *ap++; - bp = bopat[n]; - ep = eopat[n]; - while (bp < ep) - if (*bp++ != *lp++) - return 0; - break; - case CLO: - are = lp; - switch(*ap) { - - case ANY: - while (*lp) - lp++; - n = ANYSKIP; - break; - case CHR: - c = *(ap+1); - while (*lp && c == *lp) - lp++; - n = CHRSKIP; - break; - case CCL: - while ((c = *lp) && isinset(ap+1,c)) - lp++; - n = CCLSKIP; - break; - default: - re_fail("closure: bad nfa.", *ap); - return 0; - } - - ap += n; - - while (lp >= are) { - if ((e = pmatch(lp, ap))) - return e; - --lp; - } - return 0; - default: - re_fail("re_exec: bad nfa.", op); - return 0; - } - return lp; -} - -/* - * re_modw: - * add new characters into the word table to change re_exec's - * understanding of what a word should look like. Note that we - * only accept additions into the word definition. - * - * If the string parameter is 0 or null string, the table is - * reset back to the default containing A-Z a-z 0-9 _. [We use - * the compact bitset representation for the default table] - */ - -static REGEXCHAR deftab[16] = { - 0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207, - 0376, 0377, 0377, 007 -}; - -void -LDAP_CALL -re_modw( char *s ) -{ - register int i; - - if (!s || !*s) { - for (i = 0; i < MAXCHR; i++) - if (!isinset(deftab,i)) - iswordc(i) = 0; - } - else - while(*s) - iswordc(*s++) = 1; -} - -/* - * re_subs: - * substitute the matched portions of the src in dst. - * - * & substitute the entire matched pattern. - * - * \digit substitute a subpattern, with the given tag number. - * Tags are numbered from 1 to 9. If the particular - * tagged subpattern does not exist, null is substituted. - */ -int -LDAP_CALL -re_subs( char *src, char *dst) -{ - register char c; - register int pin; - register REGEXCHAR *bp; - register REGEXCHAR *ep; - - if (!*src || !bopat[0]) - return 0; - - while ((c = *src++)) { - switch(c) { - - case '&': - pin = 0; - break; - - case '\\': - c = *src++; - if (c >= '0' && c <= '9') { - pin = c - '0'; - break; - } - - default: - *dst++ = c; - continue; - } - - if ((bp = bopat[pin]) && (ep = eopat[pin])) { - while (*bp && bp < ep) - *dst++ = *(char*)bp++; - if (bp < ep) - return 0; - } - } - *dst = (char) 0; - return 1; -} - -#ifdef DEBUG - -/* No printf or exit in 16-bit Windows */ -#if defined( _WINDOWS ) && !defined( _WIN32 ) -static int LDAP_C printf( const char* pszFormat, ...) -{ - char buf[1024]; - va_list arglist; - va_start(arglist, pszFormat); - vsprintf(buf, pszFormat, arglist); - va_end(arglist); - OutputDebugString(buf); - return 0; -} -#define exit(v) return -#endif /* 16-bit Windows */ - - -#ifdef REGEX_DEBUG - -static void nfadump( REGEXCHAR *ap); - -/* - * symbolic - produce a symbolic dump of the nfa - */ -void -symbolic( char *s ) -{ - printf("pattern: %s\n", s); - printf("nfacode:\n"); - nfadump(nfa); -} - -static void -nfadump( REGEXCHAR *ap) -{ - register int n; - - while (*ap != END) - switch(*ap++) { - case CLO: - printf("CLOSURE"); - nfadump(ap); - switch(*ap) { - case CHR: - n = CHRSKIP; - break; - case ANY: - n = ANYSKIP; - break; - case CCL: - n = CCLSKIP; - break; - } - ap += n; - break; - case CHR: - printf("\tCHR %c\n",*ap++); - break; - case ANY: - printf("\tANY .\n"); - break; - case BOL: - printf("\tBOL -\n"); - break; - case EOL: - printf("\tEOL -\n"); - break; - case BOT: - printf("BOT: %d\n",*ap++); - break; - case EOT: - printf("EOT: %d\n",*ap++); - break; - case BOW: - printf("BOW\n"); - break; - case EOW: - printf("EOW\n"); - break; - case REF: - printf("REF: %d\n",*ap++); - break; - case CCL: - printf("\tCCL ["); - for (n = 0; n < MAXCHR; n++) - if (isinset(ap,(REGEXCHAR)n)) { - if (n < ' ') - printf("^%c", n ^ 0x040); - else - printf("%c", n); - } - printf("]\n"); - ap += BITBLK; - break; - default: - printf("bad nfa. opcode %o\n", ap[-1]); - exit(1); - break; - } -} -#endif /* REGEX_DEBUG */ -#endif /* DEBUG */ -#endif /* macintosh or DOS or _WINDOWS or NEED_BSDREGEX */ |