721 lines
17 KiB
C
721 lines
17 KiB
C
/*
|
|
* These routines are from "a pre-release of a bunch of berkelix
|
|
* regex(3)/ed(1) compatible regular-expression routines" written by Ozan
|
|
* S. Yigit, Dept. of Computer Science, York University. Parts of the
|
|
* code that are not needed by Prospero have been removed, but most of
|
|
* the accompanying information has been left intact. This file is to be
|
|
* included on those operating systems that do not support re_comp and
|
|
* re_exec.
|
|
*/
|
|
|
|
/*
|
|
|
|
These routines are completely public domain. You can do whatever you
|
|
like with them, and hopefully you are professional enough not to strip
|
|
out the authorship information, acknowledgements and references.
|
|
|
|
The reason for this being a *pre-release* is that I received a lot
|
|
of useful suggestions about packaging, about additional routines etc.
|
|
from a few people. I do not have too much time to do those changes
|
|
right now, so I am putting this package out for those who needed
|
|
it yesterday. Next release will include other routines, and better
|
|
packaging.
|
|
|
|
These routines are *not* tested under SysV, but they are tested
|
|
under PRO/Venix (2.0) and BSD 4.2.
|
|
|
|
In general, these routines run just as fast, or faster than regex library
|
|
routines under BSD 4.2. In some cases, they are slightly slower. I did not
|
|
try too hard to optimize the re_exec routine.
|
|
|
|
Coding style is a la K&R, with lotsa short identifiers. I like it
|
|
that way. All flames should be fed to yetti!dragon.
|
|
|
|
Acknowledgements: Henry Spencer, Hugh Redelmeier and Drew Sullivan made
|
|
a lot of important suggestions, some of which will be
|
|
incorporated into the next version.
|
|
|
|
*/
|
|
|
|
/*
|
|
* regex - Regular expression pattern matching
|
|
* and replacement
|
|
*
|
|
*
|
|
* By: Ozan S. Yigit (oz)
|
|
* Dept. of Computer Science
|
|
* York University
|
|
* [...!utzoo!yetti!oz || oz@yusol.BITNET || oz@yuyetti.BITNET]
|
|
*
|
|
* These routines are the PUBLIC DOMAIN equivalents
|
|
* of regex routines as found in 4.nBSD UN*X, with minor
|
|
* extensions.
|
|
*
|
|
* These routines are derived from various implementations
|
|
* found in software tools books, and Conroy's grep. They
|
|
* are NOT derived from licensed/restricted software.
|
|
* For more interesting/academic/complicated implementations,
|
|
* see Henry Spencer's regexp routines, or GNU Emacs pattern
|
|
* matching module.
|
|
*
|
|
* Routines:
|
|
* re_comp: compile a regular expression into
|
|
* a DFA.
|
|
*
|
|
* char *re_comp(s)
|
|
* char *s;
|
|
*
|
|
* re_exec: execute the DFA to match a pattern.
|
|
*
|
|
* int re_exec(s)
|
|
* char *s;
|
|
*
|
|
* Regular Expressions:
|
|
*
|
|
* [1] char matches itself, unless it is a special
|
|
* character (metachar): . \ [ ] * + ^ $
|
|
*
|
|
* [2] . matches any character.
|
|
*
|
|
* [3] \ matches the character following it, except
|
|
* when followed by a left or right round bracket,
|
|
* a digit 1 to 9 or a left or right angle bracket.
|
|
* (see [7], [8] and [9])
|
|
* It is used as an escape character for all
|
|
* other meta-characters, and itself. When used
|
|
* in a set ([4]), it is treated as an ordinary
|
|
* character.
|
|
*
|
|
* [4] [set] matches one of the characters in the set.
|
|
* If the first character in the set is "^",
|
|
* it matches a character NOT in the set. A
|
|
* shorthand S-E is used to specify a set of
|
|
* characters S upto E, inclusive. The special
|
|
* characters "]" and "-" have no special
|
|
* meaning if they appear as the first chars
|
|
* in the set.
|
|
* examples: match:
|
|
*
|
|
* [a-z] any lowercase alpha
|
|
*
|
|
* [^]-] any char except ] and -
|
|
*
|
|
* [^A-Z] any char except uppercase
|
|
* alpha
|
|
*
|
|
* [a-zA-Z] any alpha
|
|
*
|
|
* [5] * any regular expression form [1] to [4], followed by
|
|
* closure char (*) matches zero or more matches of
|
|
* that form.
|
|
*
|
|
* [6] + same as [5], except it matches one or more.
|
|
*
|
|
* [7] a regular expression in the form [1] to [10], enclosed
|
|
* as \(form\) matches what form matches. The enclosure
|
|
* creates a set of tags, used for [8] and for
|
|
* pattern substitution. The tagged forms are numbered
|
|
* starting from 1.
|
|
*
|
|
* [8] a \ followed by a digit 1 to 9 matches whatever a
|
|
* previously tagged regular expression ([7]) matched.
|
|
*
|
|
* [9] \< a regular expression starting with a \< construct
|
|
* \> and/or ending with a \> construct, restricts the
|
|
* pattern matching to the beginning of a word, and/or
|
|
* the end of a word. A word is defined to be a character
|
|
* string beginning and/or ending with the characters
|
|
* A-Z a-z 0-9 and _. It must also be preceded and/or
|
|
* followed by any character outside those mentioned.
|
|
*
|
|
* [10] a composite regular expression xy where x and y
|
|
* are in the form [1] to [10] matches the longest
|
|
* match of x followed by a match for y.
|
|
*
|
|
* [11] ^ a regular expression starting with a ^ character
|
|
* $ and/or ending with a $ character, restricts the
|
|
* pattern matching to the beginning of the line,
|
|
* or the end of line. [anchors] Elsewhere in the
|
|
* pattern, ^ and $ are treated as ordinary characters.
|
|
*
|
|
*
|
|
* Acknowledgements:
|
|
*
|
|
* HCR's Hugh Redelmeier has been most helpful in various
|
|
* stages of development. He convinced me to include BOW
|
|
* and EOW constructs, originally invented by Rob Pike at
|
|
* the University of Toronto.
|
|
*
|
|
* References:
|
|
* Software tools Kernighan & Plauger
|
|
* Software tools in Pascal Kernighan & Plauger
|
|
* Grep [rsx-11 C dist] David Conroy
|
|
* ed - text editor Un*x Programmer's Manual
|
|
* Advanced editing on Un*x B. W. Kernighan
|
|
* RegExp routines Henry Spencer
|
|
*
|
|
* Notes:
|
|
*
|
|
* This implementation uses a bit-set representation for character
|
|
* classes for speed and compactness. Each character is represented
|
|
* by one bit in a 128-bit block. Thus, CCL or NCL always takes a
|
|
* constant 16 bytes in the internal dfa, and re_exec does a single
|
|
* bit comparison to locate the character in the set.
|
|
*
|
|
* Examples:
|
|
*
|
|
* pattern: foo*.*
|
|
* compile: CHR f CHR o CLO CHR o END CLO ANY END END
|
|
* matches: fo foo fooo foobar fobar foxx ...
|
|
*
|
|
* pattern: fo[ob]a[rz]
|
|
* compile: CHR f CHR o CCL bitset CHR a CCL bitset END
|
|
* matches: fobar fooar fobaz fooaz
|
|
*
|
|
* pattern: foo\\+
|
|
* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
|
|
* matches: foo\ foo\\ foo\\\ ...
|
|
*
|
|
* pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
|
|
* compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
|
|
* matches: foo1foo foo2foo foo3foo
|
|
*
|
|
* pattern: \(fo.*\)-\1
|
|
* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
|
|
* matches: foo-foo fo-fo fob-fob foobar-foobar ...
|
|
*
|
|
*/
|
|
|
|
/* Size, in CHARs, of the dfa[] buffer that holds the compiled automaton. */
#define MAXDFA 1024
|
|
/* Size of the tag arrays (tagstk, bopat, eopat); re_comp hands out tags 1..MAXTAG-1. */
#define MAXTAG 10
|
|
|
|
/* Values of sta: OKP after a successful re_comp, NOP otherwise. */
#define OKP 1
|
|
#define NOP 0
|
|
|
|
/*
 * Automaton opcodes stored in dfa[] by re_comp and interpreted by pmatch.
 * Operands, where present, follow the opcode directly.
 */
#define CHR 1 /* match the literal char stored in the next CHAR */
|
|
#define ANY 2 /* match any one char (not the end of string) */
|
|
#define CCL 3 /* match a char in the class; followed by a BITBLK-byte bitset */
|
|
#define NCL 4 /* match a char NOT in the class; followed by a BITBLK-byte bitset */
|
|
#define BOL 5 /* match only at the beginning of the line */
|
|
#define EOL 6 /* match only at the end of the string */
|
|
#define BOT 7 /* beginning of tagged subpattern; followed by the tag number */
|
|
#define EOT 8 /* end of tagged subpattern; followed by the tag number */
|
|
#define BOW 9 /* match only at the beginning of a word */
|
|
#define EOW 10 /* match only at the end of a word */
|
|
#define REF 11 /* match the text of an earlier tag; followed by the tag number */
|
|
#define CLO 12 /* closure: followed by one atom (ANY, CHR, CCL or NCL) and an END */
|
|
|
|
/* Terminates the automaton and every closure body; also marks a failed compile in dfa[0]. */
#define END 0
|
|
|
|
/*
|
|
* The following defines are not meant
|
|
* to be changeable. They are for readability
|
|
* only.
|
|
*
|
|
*/
|
|
#define MAXCHR	128		/* number of character codes covered by a class bitset */
#define CHRBIT	8		/* bits held by one bitset byte */
/*
 * Bytes per class bitset.  The expansion is parenthesized so that the
 * macro behaves as a single value inside any expression (for example
 * x / BITBLK or x * BITBLK).
 */
#define BITBLK	(MAXCHR/CHRBIT)
#define BLKIND	0170		/* mask for the bits of a char that select the bitset byte */
#define BITIND	07		/* mask for the bits of a char that select the bit in that byte */

#define ASCIIB	0177		/* 7-bit mask */
|
|
|
|
/* Element type of the compiled automaton and of the class bitsets. */
typedef /*unsigned*/ char CHAR;
|
|
|
|
/*!! XXX This is definitely not thread safe */
/* All compiler and matcher state below is file-scope and shared by every call. */
|
|
|
|
static int tagstk[MAXTAG]; /* subpat tag stack..*/
|
|
static CHAR dfa[MAXDFA]; /* automaton.. */
|
|
static int sta = NOP; /* status of lastpat */
|
|
|
|
/* Scratch bitset filled by chset while re_comp parses a [...] class; re_comp zeroes it again as it copies it into dfa. */
static CHAR bittab[BITBLK]; /* bit table for CCL */
|
|
|
|
/* Cleared by re_comp; incremented by pmatch when it meets an opcode it cannot execute. */
static int internal_error;
|
|
|
|
static void
|
|
chset(c) register CHAR c; { bittab[((c)&BLKIND)>>3] |= 1<<((c)&BITIND); }
|
|
|
|
/*
 * Helpers for use inside re_comp only.
 * badpat(x): mark the automaton as unusable (dfa[0] = END) and return
 *            x from the enclosing function.
 * store(x):  append x to the automaton through the local pointer mp.
 */
#define badpat(x) return(*dfa = END, x)
|
|
#define store(x) *mp++ = x
|
|
|
|
#include <pfs_threads.h>
|
|
|
|
char *
|
|
re_comp(pat)
|
|
char *pat;
|
|
{
|
|
register char *p; /* pattern pointer */
|
|
register CHAR *mp=dfa; /* dfa pointer */
|
|
register CHAR *lp; /* saved pointer.. */
|
|
register CHAR *sp=dfa; /* another one.. */
|
|
|
|
register int tagi = 0; /* tag stack index */
|
|
register int tagc = 1; /* actual tag count */
|
|
|
|
register int n;
|
|
int c1, c2;
|
|
|
|
internal_error = 0;
|
|
|
|
if (!pat || !*pat)
|
|
if (sta)
|
|
return(0);
|
|
else
|
|
badpat("No previous regular expression");
|
|
sta = NOP;
|
|
|
|
for (p = pat; *p; p++) {
|
|
lp = mp;
|
|
switch(*p) {
|
|
|
|
case '.': /* match any char.. */
|
|
store(ANY);
|
|
break;
|
|
|
|
case '^': /* match beginning.. */
|
|
if (p == pat)
|
|
store(BOL);
|
|
else {
|
|
store(CHR);
|
|
store(*p);
|
|
}
|
|
break;
|
|
|
|
case '$': /* match endofline.. */
|
|
if (!*(p+1))
|
|
store(EOL);
|
|
else {
|
|
store(CHR);
|
|
store(*p);
|
|
}
|
|
break;
|
|
|
|
case '[': /* match char class..*/
|
|
|
|
if (*++p == '^') {
|
|
store(NCL);
|
|
p++;
|
|
}
|
|
else
|
|
store(CCL);
|
|
|
|
if (*p == '-') /* real dash */
|
|
chset(*p++);
|
|
if (*p == ']') /* real brac */
|
|
chset(*p++);
|
|
while (*p && *p != ']') {
|
|
if (*p == '-' && *(p+1) && *(p+1) != ']') {
|
|
p++;
|
|
c1 = *(p-2) + 1;
|
|
c2 = *p++;
|
|
while (c1 <= c2)
|
|
chset(c1++);
|
|
}
|
|
#ifdef EXTEND
|
|
else if (*p == '\\' && *(p+1)) {
|
|
p++;
|
|
chset(*p++);
|
|
}
|
|
#endif
|
|
else
|
|
chset(*p++);
|
|
}
|
|
if (!*p)
|
|
badpat("Missing ]");
|
|
|
|
for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
|
|
store(bittab[n]);
|
|
|
|
break;
|
|
|
|
case '*': /* match 0 or more.. */
|
|
case '+': /* match 1 or more.. */
|
|
if (p == pat)
|
|
badpat("Empty closure");
|
|
lp = sp; /* previous opcode */
|
|
if (*lp == CLO) /* equivalence.. */
|
|
break;
|
|
switch(*lp) {
|
|
|
|
case BOL:
|
|
case BOT:
|
|
case EOT:
|
|
case BOW:
|
|
case EOW:
|
|
case REF:
|
|
badpat("Illegal closure");
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (*p == '+')
|
|
for (sp = mp; lp < sp; lp++)
|
|
store(*lp);
|
|
|
|
store(END);
|
|
store(END);
|
|
sp = mp;
|
|
while (--mp > lp)
|
|
*mp = mp[-1];
|
|
store(CLO);
|
|
mp = sp;
|
|
break;
|
|
|
|
case '\\': /* tags, backrefs .. */
|
|
switch(*++p) {
|
|
|
|
case '(':
|
|
if (tagc < MAXTAG) {
|
|
tagstk[++tagi] = tagc;
|
|
store(BOT);
|
|
store(tagc++);
|
|
}
|
|
else
|
|
badpat("Too many \\(\\) pairs");
|
|
break;
|
|
case ')':
|
|
if (*sp == BOT)
|
|
badpat("Null pattern inside \\(\\)");
|
|
if (tagi > 0) {
|
|
store(EOT);
|
|
store(tagstk[tagi--]);
|
|
}
|
|
else
|
|
badpat("Unmatched \\)");
|
|
break;
|
|
case '<':
|
|
store(BOW);
|
|
break;
|
|
case '>':
|
|
if (*sp == BOW)
|
|
badpat("Null pattern inside \\<\\>");
|
|
store(EOW);
|
|
break;
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
case '8':
|
|
case '9':
|
|
n = *p-'0';
|
|
if (tagi > 0 && tagstk[tagi] == n)
|
|
badpat("Cyclical reference");
|
|
if (tagc > n) {
|
|
store(REF);
|
|
store(n);
|
|
}
|
|
else
|
|
badpat("Undetermined reference");
|
|
break;
|
|
#ifdef EXTEND
|
|
case 'b':
|
|
store(CHR);
|
|
store('\b');
|
|
break;
|
|
case 'n':
|
|
store(CHR);
|
|
store('\n');
|
|
break;
|
|
case 'f':
|
|
store(CHR);
|
|
store('\f');
|
|
break;
|
|
case 'r':
|
|
store(CHR);
|
|
store('\r');
|
|
break;
|
|
case 't':
|
|
store(CHR);
|
|
store('\t');
|
|
break;
|
|
#endif
|
|
default:
|
|
store(CHR);
|
|
store(*p);
|
|
}
|
|
break;
|
|
|
|
default : /* an ordinary char */
|
|
store(CHR);
|
|
store(*p);
|
|
break;
|
|
}
|
|
sp = lp;
|
|
}
|
|
if (tagi > 0)
|
|
badpat("Unmatched \\(");
|
|
store(END);
|
|
sta = OKP;
|
|
|
|
return(0);
|
|
}
|
|
|
|
|
|
/* Start of the string passed to the current re_exec call; used by the BOL, BOW and EOW tests. */
static char *bol;
|
|
/* bopat[n] / eopat[n]: start and end of the text matched by tag n; index 0 is the whole match. */
static char *bopat[MAXTAG];
|
|
static char *eopat[MAXTAG];
|
|
|
|
/* Forward declaration: re_exec calls pmatch, which is defined further down. */
static char *pmatch();
|
|
|
|
/*
|
|
* re_exec:
|
|
* execute dfa to find a match.
|
|
*
|
|
* special cases: (dfa[0])
|
|
* BOL
|
|
* Match only once, starting from the
|
|
* beginning.
|
|
* CHR
|
|
* First locate the character without
|
|
* calling pmatch, and if found, call
|
|
* pmatch for the remaining string.
|
|
* END
|
|
* re_comp failed, poor luser did not
|
|
* check for it. Fail fast.
|
|
*
|
|
* If a match is found, bopat[0] and eopat[0] are set
|
|
* to the beginning and the end of the matched fragment,
|
|
* respectively.
|
|
*
|
|
*/
|
|
|
|
char Empty_string[] = "";
|
|
|
|
int
|
|
re_exec(lp)
|
|
register char *lp;
|
|
{
|
|
register char c;
|
|
register char *ep = 0;
|
|
register CHAR *ap = dfa;
|
|
|
|
if (!lp) lp = Empty_string;
|
|
|
|
bol = lp;
|
|
|
|
bopat[0] = 0;
|
|
bopat[1] = 0;
|
|
bopat[2] = 0;
|
|
bopat[3] = 0;
|
|
bopat[4] = 0;
|
|
bopat[5] = 0;
|
|
bopat[6] = 0;
|
|
bopat[7] = 0;
|
|
bopat[8] = 0;
|
|
bopat[9] = 0;
|
|
|
|
switch(*ap) {
|
|
|
|
case BOL: /* anchored: match from BOL only */
|
|
ep = pmatch(lp,ap);
|
|
break;
|
|
case CHR: /* ordinary char: locate it fast */
|
|
c = *(ap+1);
|
|
while (*lp && *lp != c)
|
|
lp++;
|
|
if (!*lp) /* if EOS, fail, else fall thru. */
|
|
return(0);
|
|
default: /* regular matching all the way. */
|
|
while (*lp) {
|
|
if ((ep = pmatch(lp,ap)))
|
|
break;
|
|
lp++;
|
|
}
|
|
break;
|
|
case END: /* munged automaton. fail always */
|
|
return(0);
|
|
}
|
|
if (!ep)
|
|
return(0);
|
|
|
|
if (internal_error)
|
|
return(-1);
|
|
|
|
bopat[0] = lp;
|
|
eopat[0] = ep;
|
|
return(1);
|
|
}
|
|
|
|
/*
|
|
* pmatch:
|
|
* internal routine for the hard part
|
|
*
|
|
* This code is mostly snarfed from an early
|
|
* grep written by David Conroy. The backref and
|
|
* tag stuff, and various other mods are by oZ.
|
|
*
|
|
* special cases: (dfa[n], dfa[n+1])
|
|
* CLO ANY
|
|
* We KNOW ".*" will match ANYTHING
|
|
* upto the end of line. Thus, go to
|
|
* the end of line straight, without
|
|
* calling pmatch recursively. As in
|
|
* the other closure cases, the remaining
|
|
* pattern must be matched by moving
|
|
* backwards on the string recursively,
|
|
* to find a match for xy (x is ".*" and
|
|
* y is the remaining pattern) where
|
|
* the match satisfies the LONGEST match
|
|
* for x followed by a match for y.
|
|
* CLO CHR
|
|
* We can again scan the string forward
|
|
* for the single char without recursion,
|
|
* and at the point of failure, we execute
|
|
* the remaining dfa recursively, as
|
|
* described above.
|
|
*
|
|
* At the end of a successful match, bopat[n] and eopat[n]
|
|
* are set to the beginning and end of subpatterns matched
|
|
* by tagged expressions (n = 1 to 9).
|
|
*
|
|
*/
|
|
|
|
|
|
/*
|
|
* character classification table for word boundary
|
|
* operators BOW and EOW. the reason for not using
|
|
* ctype macros is that we can let the user add into
|
|
* our own table. see re_modw. This table is not in
|
|
* the bitset form, since we may wish to extend it
|
|
* in the future for other character classifications.
|
|
*
|
|
* TRUE for 0-9 A-Z a-z _
|
|
*/
|
|
/* Word-character flags indexed by 7-bit character code, sixteen codes per row. */
static char chrtyp[MAXCHR] = {
	/*   0- 15 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/*  16- 31 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/*  32- 47 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/*  48- 63: digits 0-9, then six non-word chars */
	              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	/*  64- 79: '@', then A-O */
	              0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/*  80- 95: P-Z, four non-word chars, then '_' */
	              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
	/*  96-111: '`', then a-o */
	              0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 112-127: p-z, then five non-word chars */
	              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
};
|
|
|
|
/* inascii(x): x reduced to its low seven bits, so it can index chrtyp. */
#define inascii(x) (0177&(x))
|
|
/* iswordc(x): nonzero when x is flagged as a word character in chrtyp. */
#define iswordc(x) chrtyp[inascii(x)]
|
|
/* isinset(x,y): nonzero when the bit for character y is set in the bitset starting at x. */
#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & (1<<((y)&BITIND)))
|
|
|
|
/*
|
|
* skip values for CLO XXX to skip past the closure
|
|
*
|
|
*/
|
|
|
|
/* Each value counts the atom's opcode, its operand bytes and the closing END, measured from the atom's opcode. */
#define ANYSKIP 2 /* CLO ANY END ... */
|
|
#define CHRSKIP 3 /* CLO CHR chr END ... */
|
|
#define CCLSKIP 18 /* CLO CCL 16bytes END ... */
|
|
|
|
static char *
|
|
pmatch(lp, ap)
|
|
register char *lp;
|
|
register CHAR *ap;
|
|
{
|
|
register char *e; /* extra pointer for CLO */
|
|
register char *bp; /* beginning of subpat.. */
|
|
register char *ep; /* ending of subpat.. */
|
|
register int op, c, n;
|
|
char *are; /* to save the line ptr. */
|
|
|
|
while ((op = *ap++) != END)
|
|
switch(op) {
|
|
|
|
case CHR:
|
|
if (*lp++ != *ap++)
|
|
return(0);
|
|
break;
|
|
case ANY:
|
|
if (!*lp++)
|
|
return(0);
|
|
break;
|
|
case CCL:
|
|
c = *lp++;
|
|
if (!isinset(ap,c))
|
|
return(0);
|
|
ap += BITBLK;
|
|
break;
|
|
case NCL:
|
|
c = *lp++;
|
|
if (isinset(ap,c))
|
|
return(0);
|
|
ap += BITBLK;
|
|
break;
|
|
case BOL:
|
|
if (lp != bol)
|
|
return(0);
|
|
break;
|
|
case EOL:
|
|
if (*lp)
|
|
return(0);
|
|
break;
|
|
case BOT:
|
|
bopat[*ap++] = lp;
|
|
break;
|
|
case EOT:
|
|
eopat[*ap++] = lp;
|
|
break;
|
|
case BOW:
|
|
if (!(lp!=bol && iswordc(lp[-1])) && iswordc(*lp))
|
|
break;
|
|
return(0);
|
|
case EOW:
|
|
if ((lp!=bol && iswordc(lp[-1])) && !iswordc(*lp))
|
|
break;
|
|
return(0);
|
|
case REF:
|
|
n = *ap++;
|
|
bp = bopat[n];
|
|
ep = eopat[n];
|
|
while (bp < ep)
|
|
if (*bp++ != *lp++)
|
|
return(0);
|
|
break;
|
|
case CLO:
|
|
are = lp;
|
|
switch(*ap) {
|
|
|
|
case ANY:
|
|
while (*lp)
|
|
lp++;
|
|
n = ANYSKIP;
|
|
break;
|
|
case CHR:
|
|
c = *(ap+1);
|
|
while (*lp && c == *lp)
|
|
lp++;
|
|
n = CHRSKIP;
|
|
break;
|
|
case CCL:
|
|
case NCL:
|
|
while (*lp && (e = pmatch(lp, ap)))
|
|
lp = e;
|
|
n = CCLSKIP;
|
|
break;
|
|
default:
|
|
internal_error++;
|
|
return(0);
|
|
}
|
|
|
|
ap += n;
|
|
|
|
while (lp >= are) {
|
|
if (e = pmatch(lp, ap))
|
|
return(e);
|
|
--lp;
|
|
}
|
|
return(0);
|
|
default:
|
|
internal_error++;
|
|
return(0);
|
|
}
|
|
return(lp);
|
|
}
|