diff --git a/contrib/netbsd-tests/lib/libc/regex/data/error.in b/contrib/netbsd-tests/lib/libc/regex/data/error.in index 61e0ea4e41c..b5d2cbe5741 100644 --- a/contrib/netbsd-tests/lib/libc/regex/data/error.in +++ b/contrib/netbsd-tests/lib/libc/regex/data/error.in @@ -1,18 +1,11 @@ # certain syntax errors and non-errors -| C EMPTY | b | | * C BADRPT * b * * + C BADRPT ? C BADRPT -"" &C EMPTY -() - abc @abc +"" & abc "" \(\) b abc @abc -a||b C EMPTY -|ab C EMPTY -ab| C EMPTY -(|a)b C EMPTY -(a|)b C EMPTY (*a) C BADRPT (+a) C BADRPT (?a) C BADRPT diff --git a/contrib/netbsd-tests/lib/libc/regex/data/gnuext.in b/contrib/netbsd-tests/lib/libc/regex/data/gnuext.in new file mode 100644 index 00000000000..09d9d21f32f --- /dev/null +++ b/contrib/netbsd-tests/lib/libc/regex/data/gnuext.in @@ -0,0 +1,39 @@ +# BRE Quantifiers +ab\?c b abc abc +ab\+c b abc abc +# BRE Branching +abc\|de b abc abc +a\|b\|c b abc a +\(ab\|bc\) b abcd ab +# ERE Backrefs +(ab)\1 - ab +(ab)\1 - abab abab +\1(ab) C ESUBREG +(a)(b)(c)(d)(e)(f)(g)(h)(i)\9 - abcdefghii abcdefghii +# \w, \W, \s, \S (alnum, ^alnum, space, ^space) +\w+ - -%@a0X- a0X +\w\+ b -%@a0X- a0X +\s+ - aSNTb SNT +\s\+ b aSNTb SNT +# Word boundaries (\b, \B, \<, \>, \`, \') +# (is/not boundary, start/end word, start/end subject string) +\babc\b & abc +\ & abc +\Babc\B & abc +\B[abc]\B & b +\B[abc]+ - bc +\B[abc]\+ b bc +\`abc\' & abc abc +\`.+\' - abNc abNc +\`.\+\' b abNc abNc +(\`a) - Na +(a\') - aN +# Empty subexpressions +abc\| b abc "" +foo\| b abc "" +\|abc b abc "" +\|foo b abc "" +abc| - abc "" +foo| - abc "" +|abc - abc "" +|foo - abc "" diff --git a/contrib/netbsd-tests/lib/libc/regex/data/meta.in b/contrib/netbsd-tests/lib/libc/regex/data/meta.in index 4533d3591bc..d9bfbf9b94f 100644 --- a/contrib/netbsd-tests/lib/libc/regex/data/meta.in +++ b/contrib/netbsd-tests/lib/libc/regex/data/meta.in @@ -4,7 +4,6 @@ a[bc]d & abd abd a\*c & a*c a*c a\\b & a\b a\b a\\\*b & a\*b a\*b -a\bc & abc abc a\ &C EESCAPE a\\bc & a\bc a\bc \{ 
bC BADRPT diff --git a/contrib/netbsd-tests/lib/libc/regex/t_regex.sh b/contrib/netbsd-tests/lib/libc/regex/t_regex.sh index bef3ac92695..954815c8c59 100755 --- a/contrib/netbsd-tests/lib/libc/regex/t_regex.sh +++ b/contrib/netbsd-tests/lib/libc/regex/t_regex.sh @@ -70,4 +70,5 @@ atf_init_test_cases() create_tc zero "Checks NULs" create_tc word_bound "Checks word boundaries" create_tc regress "Checks various past problems and suspected problems" + create_tc gnuext "Checks GNU extension functionality" } diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index 073e0010882..25421042d9e 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -106,7 +106,7 @@ static const char *dissect(struct match *m, const char *start, const char *stop, static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); -static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); +static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); #define MAX_RECURSION 100 #define BOL (OUT-1) #define EOL (BOL-1) @@ -116,6 +116,11 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_ #define EOW (BOL-5) #define BADCHAR (BOL-6) #define NONCHAR(c) ((c) <= OUT) +/* sflags */ +#define SNWBND 01 +#define SBOS 02 +#define SEOS 04 + #ifdef REDEBUG static void print(struct match *m, const char *caption, states st, int ch, FILE *d); #endif @@ -183,6 +188,17 @@ matcher(struct re_guts *g, if (stop < start) return(REG_INVARG); + /* Trivial zero-length match on empty sub */ + if (g->iflags & EMPTBR) { + if (nmatch > 0) { + pmatch[0].rm_so = pmatch[0].rm_eo = 0; + + for (i = 1; i < nmatch; i++) 
+ pmatch[i].rm_so = pmatch[i].rm_eo = -1; + } + return(0); + } + /* prescreening; this does wonders for this rather slow code */ if (g->must != NULL) { if (g->charjump != NULL && g->matchjump != NULL) { @@ -412,10 +428,14 @@ dissect(struct match *m, case OCHAR: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; + case OBOS: + case OEOS: case OBOL: case OEOL: case OBOW: case OEOW: + case OWBND: + case ONWBND: break; case OANY: case OANYOF: @@ -621,22 +641,45 @@ backref(struct match *m, else return(NULL); break; + case OBOS: + break; + if (sp == m->beginp) + { /* yes */ } + else + return(NULL); + break; + case OEOS: + break; + if (sp == m->endp) + { /* yes */ } + else + return(NULL); + break; + case ONWBND: + if (sp > m->beginp && sp < m->endp && + ISWORD(*(sp-1)) == ISWORD(*sp)) + { /* yes */ } + else + return(NULL); + break; + case OWBND: case OBOW: if (sp < m->endp && ISWORD(*sp) && ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) || (sp > m->offp && !ISWORD(*(sp-1))))) { /* yes */ } - else + else if (OP(s) == OBOW) return(NULL); - break; + /* FALLTHROUGH */ case OEOW: - if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) || + if (OP(s) != OBOW && + ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags&REG_NEWLINE)) || (sp < m->endp && !ISWORD(*sp)) ) && (sp > m->beginp && ISWORD(*(sp-1))) ) { /* yes */ } - else + else if (OP(s) != OBOW) return(NULL); break; case O_QUEST: @@ -771,6 +814,7 @@ fast( struct match *m, states st = m->st; states fresh = m->fresh; states tmp = m->tmp; + sopno nxop; const char *p = start; wint_t c; wint_t lastc; /* previous c */ @@ -778,11 +822,12 @@ fast( struct match *m, int i; const char *coldp; /* last p after which no match was underway */ size_t clen; + int sflags = 0; CLEAR(st); SET1(st, startst); SP("fast", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); + st = step(m->g, startst, stopst, st, NOTHING, st, sflags); ASSIGN(fresh, st); SP("start", st, *p); coldp = NULL; @@ -822,7 +867,7 @@ 
fast( struct match *m, } if (i != 0) { for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); + st = step(m->g, startst, stopst, st, flagch, st, sflags); SP("boleol", st, c); } @@ -835,11 +880,24 @@ fast( struct match *m, (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); + if (p == m->beginp) + sflags |= SBOS; + if (p == m->endp) + sflags |= SEOS; + if (flagch != BOW && flagch != EOW && + lastc != OUT && c != OUT && ISWORD(lastc) == ISWORD(c)) + sflags |= SNWBND; + nxop = OP(m->g->strip[startst]); + /* sflags are used for many 0-length matching events, so check those */ + if (flagch == BOW || flagch == EOW || + nxop == ONWBND || nxop == OBOS || nxop == OEOS) { + st = step(m->g, startst, stopst, st, flagch, st, sflags); + SP("boweownwbnd", st, c); } + /* Don't match 0-length ops elsewhere */ + sflags = 0; + /* are we done? */ if (ISSET(st, stopst) || p == stop || clen > stop - p) break; /* NOTE BREAK OUT */ @@ -848,9 +906,9 @@ fast( struct match *m, ASSIGN(tmp, st); ASSIGN(st, fresh); assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); + st = step(m->g, startst, stopst, tmp, c, st, sflags); SP("aft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), st)); p += clen; } @@ -877,6 +935,7 @@ slow( struct match *m, states st = m->st; states empty = m->empty; states tmp = m->tmp; + sopno nxop; const char *p = start; wint_t c; wint_t lastc; /* previous c */ @@ -884,12 +943,13 @@ slow( struct match *m, int i; const char *matchp; /* last p at which a match ended */ size_t clen; + int sflags = 0; AT("slow", start, stop, startst, stopst); CLEAR(st); SET1(st, startst); SP("sstart", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); + st = step(m->g, startst, stopst, st, NOTHING, st, sflags); matchp = NULL; if (start == 
m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL))) c = OUT; @@ -925,7 +985,7 @@ slow( struct match *m, } if (i != 0) { for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); + st = step(m->g, startst, stopst, st, flagch, st, sflags); SP("sboleol", st, c); } @@ -938,11 +998,24 @@ slow( struct match *m, (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboweow", st, c); + if (p == m->beginp) + sflags |= SBOS; + if (p == m->endp) + sflags |= SEOS; + if (flagch != BOW && flagch != EOW && + lastc != OUT && c != OUT && ISWORD(lastc) == ISWORD(c)) + sflags |= SNWBND; + nxop = OP(m->g->strip[startst]); + /* Consume a match for BOW/EOW markers */ + if (flagch == BOW || flagch == EOW || + nxop == ONWBND || nxop == OBOS || nxop == OEOS) { + st = step(m->g, startst, stopst, st, flagch, st, sflags); + SP("sboweownbwnd", st, c); } + /* Don't match 0-length ops elsewhere */ + sflags = 0; + /* are we done? 
*/ if (ISSET(st, stopst)) matchp = p; @@ -953,9 +1026,9 @@ slow( struct match *m, ASSIGN(tmp, st); ASSIGN(st, empty); assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); + st = step(m->g, startst, stopst, tmp, c, st, sflags); SP("saft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), st)); p += clen; } @@ -982,7 +1055,8 @@ step(struct re_guts *g, sopno stop, /* state after stop state within strip */ states bef, /* states reachable before */ wint_t ch, /* character or NONCHAR code */ - states aft) /* states already known reachable after */ + states aft, /* states already known reachable after */ + int sflags) /* 0-length matching states*/ { cset *cs; sop s; @@ -1011,12 +1085,25 @@ step(struct re_guts *g, if (ch == EOL || ch == BOLEOL) FWD(aft, bef, 1); break; + case ONWBND: + if (sflags & SNWBND) + FWD(aft, bef, 1); + break; + case OBOS: + if (sflags & SBOS) + FWD(aft, bef, 1); + break; + case OEOS: + if (sflags & SEOS) + FWD(aft, bef, 1); + break; + case OWBND: case OBOW: if (ch == BOW) FWD(aft, bef, 1); - break; + /* FALLTHROUGH */ case OEOW: - if (ch == EOW) + if (OP(s) != OBOW && ch == EOW) FWD(aft, bef, 1); break; case OANY: diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index 95764717bda..d65988948a8 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -79,21 +79,36 @@ struct parse { sopno pend[NPAREN]; /* -> ) ([0] unused) */ }; +struct branchc { + sopno start; + sopno back; + sopno fwd; + + int nbranch; + int nsimple; + int outer; +}; + /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === regcomp.c === */ -static void p_ere(struct parse *p, int stop); -static void p_ere_exp(struct parse *p); +static int p_ere_exp(struct parse *p, struct branchc *bc); static void p_str(struct parse *p); -static void p_bre(struct parse *p, int end1, int end2); -static int p_simp_re(struct 
parse *p, int starordinary); +static int p_branch_eat_delim(struct parse *p); +static void p_branch_ins_offset(struct parse *p, struct branchc *bc); +static void p_branch_fix_tail(struct parse *p, struct branchc *bc); +static int p_branch_do(struct parse *p, struct branchc *bc); +static void p_re(struct parse *p, int end1, int end2); +static int p_simp_re(struct parse *p, struct branchc *bc); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void p_b_term(struct parse *p, cset *cs); +static int p_b_pseudoclass(struct parse *p, char c); static void p_b_cclass(struct parse *p, cset *cs); +static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); static void p_b_eclass(struct parse *p, cset *cs); static wint_t p_b_symbol(struct parse *p); static wint_t p_b_coll_elem(struct parse *p, wint_t endc); @@ -139,6 +154,7 @@ static char nuls[10]; /* place to point scanner in event of error */ #define MORE2() (p->next+1 < p->end) #define SEE(c) (MORE() && PEEK() == (c)) #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define SEESPEC(a) ((p->g->cflags&REG_EXTENDED) ? SEE(a) : SEETWO('\\', a)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) #define NEXT() (p->next++) @@ -264,12 +280,10 @@ regcomp(regex_t * __restrict preg, /* do it */ EMIT(OEND, 0); g->firststate = THERE(); - if (cflags&REG_EXTENDED) - p_ere(p, OUT); - else if (cflags&REG_NOSPEC) + if (cflags&REG_NOSPEC) p_str(p); else - p_bre(p, OUT, OUT); + p_re(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); @@ -305,62 +319,18 @@ regcomp(regex_t * __restrict preg, } /* - - p_ere - ERE parser top level, concatenation and alternation - == static void p_ere(struct parse *p, int_t stop); - */ -static void -p_ere(struct parse *p, - int stop) /* character this ERE should end at */ -{ - char c; - sopno prevback; - sopno prevfwd; - sopno conc; - int first = 1; /* is this the first alternative? 
*/ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - == static void p_ere_exp(struct parse *p); + == static int p_ere_exp(struct parse *p); */ -static void -p_ere_exp(struct parse *p) +static int +p_ere_exp(struct parse *p, struct branchc *bc) { char c; wint_t wc; sopno pos; int count; int count2; + int i; sopno subno; int wascaret = 0; @@ -377,7 +347,7 @@ p_ere_exp(struct parse *p) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) - p_ere(p, ')'); + p_re(p, ')', IGN); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -408,9 +378,6 @@ p_ere_exp(struct parse *p) p->g->iflags |= USEEOL; p->g->neol++; break; - case '|': - SETERROR(REG_EMPTY); - break; case '*': case '+': case '?': @@ -429,12 +396,53 @@ p_ere_exp(struct parse *p) (void)REQUIRE(MORE(), REG_EESCAPE); wc = WGETNEXT(); switch (wc) { + case '`': + EMIT(OBOS, 0); + break; + case '\'': + EMIT(OEOS, 0); + break; case '<': EMIT(OBOW, 0); break; case '>': EMIT(OEOW, 0); break; + case 'b': + EMIT(OWBND, 0); + break; + case 'B': + EMIT(ONWBND, 0); + break; + case 'W': + case 'w': + case 'S': + case 's': + p_b_pseudoclass(p, wc); + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = wc - '0'; + assert(i < NPAREN); + if 
(p->pend[i] != 0) { + assert(i <= p->g->nsub); + EMIT(OBACK_, i); + assert(p->pbegin[i] != 0); + assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); + assert(OP(p->strip[p->pend[i]]) == ORPAREN); + (void) dupl(p, p->pbegin[i]+1, p->pend[i]); + EMIT(O_BACK, i); + } else + SETERROR(REG_ESUBREG); + p->g->backrefs = 1; + break; default: ordinary(p, wc); break; @@ -451,12 +459,12 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (0); c = PEEK(); /* we call { a repetition if followed by a digit */ if (!( c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) - return; /* no repetition, we're done */ + return (0); /* no repetition, we're done */ NEXT(); (void)REQUIRE(!wascaret, REG_BADRPT); @@ -502,12 +510,13 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (0); c = PEEK(); if (!( c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) - return; + return (0); SETERROR(REG_BADRPT); + return (0); } /* @@ -523,7 +532,90 @@ p_str(struct parse *p) } /* - - p_bre - BRE parser top level, anchoring and concatenation + * Eat consecutive branch delimiters for the kind of expression that we are + * parsing, return the number of delimiters that we ate. + */ +static int +p_branch_eat_delim(struct parse *p) +{ + int nskip = 0; + if (p->g->cflags&REG_EXTENDED) + while (EAT('|')) + ++nskip; + else + while (EATTWO('\\', '|')) + ++nskip; + return nskip; +} + +/* + * Insert necessary branch book-keeping operations. 
This emits a + * bogus 'next' offset, since we still have more to parse + */ +static void +p_branch_ins_offset(struct parse *p, struct branchc *bc) { + if (!bc->nbranch) { + INSERT(OCH_, bc->start); /* offset is wrong */ + bc->fwd = bc->start; + bc->back = bc->start; + } + + ASTERN(OOR1, bc->back); + bc->back = THERE(); + AHEAD(bc->fwd); /* fix previous offset */ + bc->fwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + ++bc->nbranch; +} + +/* + * Fix the offset of the tail branch, if we actually had any branches. + * This is to correct the bogus placeholder offset that we use. + */ +static void +p_branch_fix_tail(struct parse *p, struct branchc *bc) +{ + /* Fix bogus offset at the tail if we actually have branches */ + if (bc->nbranch > 0) { + AHEAD(bc->fwd); + ASTERN(O_CH, bc->back); + } +} + +/* + * Take care of any branching requirements. This includes inserting the + * appropriate branching instructions as well as eating all of the branch + * delimiters until we either run out of pattern or need to parse more pattern. + */ +static int +p_branch_do(struct parse *p, struct branchc *bc) +{ + int ate = 0; + + /* Empty expression; set the flag if necessary*/ + if (HERE() == bc->start) { + if (bc->outer) + p->g->iflags |= EMPTBR; + return (0); + } else { + ate = p_branch_eat_delim(p); + + /* If we hit another branch immediately, skip it and set flag */ + if (ate > 1 || (ate == 1 && !MORE())) { + if (bc->outer) + p->g->iflags |= EMPTBR; + return (0); + } + if (ate == 0 || !MORE()) + return (0); + p_branch_ins_offset(p, bc); + } + + return (1); +} + +/* + - p_re - Top level parser, concatenation and BRE anchoring == static void p_bre(struct parse *p, int end1, \ == int end2); * Giving end1 as OUT essentially eliminates the end1/end2 check. @@ -533,42 +625,71 @@ p_str(struct parse *p) * The amount of lookahead needed to avoid this kludge is excessive. 
*/ static void -p_bre(struct parse *p, +p_re(struct parse *p, int end1, /* first terminating character */ - int end2) /* second terminating character */ + int end2) /* second terminating character; ignored for EREs */ { - sopno start = HERE(); - int first = 1; /* first subexpression? */ int wasdollar = 0; - - if (EAT('^')) { - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) { /* oops, that was a trailing anchor */ - DROP(1); - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; + struct branchc bc; + int (*parse_expr)(struct parse *, struct branchc *); + + bc.nbranch = 0; + bc.outer = 0; + if (end1 == OUT && end2 == OUT) + bc.outer = 1; + if (p->g->cflags&REG_EXTENDED) + parse_expr = p_ere_exp; + else + parse_expr = p_simp_re; +#define SEEEND() ((p->g->cflags&REG_EXTENDED) ? SEE(end1) : SEETWO(end1, end2)) + for (;;) { + bc.start = HERE(); + bc.nsimple = 0; + /* + * Moving BOL/EOL bits into p_simp_re is more complicated than it needs to + * be, because of the complications in looking for the end of the current + * expression. They are better left here than trying to work out the + * equivalent magic in p_simp_bre, mostly for the sake of readability. + */ + if (!(p->g->cflags&REG_EXTENDED) && EAT('^')) { + EMIT(OBOL, 0); + p->g->iflags |= USEBOL; + p->g->nbol++; + } + while (MORE() && !SEESPEC('|') && !SEEEND()) { + wasdollar = parse_expr(p, &bc); + ++bc.nsimple; + } + if (!(p->g->cflags&REG_EXTENDED) && wasdollar) { /* oops, that was a trailing anchor */ + DROP(1); + EMIT(OEOL, 0); + p->g->iflags |= USEEOL; + p->g->neol++; + } + /* + * p_branch_do's return value indicates whether we should continue parsing + * or not. This is both for correctness and optimization, because it will + * signal that we need not continue if it encountered an empty branch or + * the end of the string immediately following a branch delimiter. 
+ */ + if (!p_branch_do(p, &bc)) + break; } - - (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ +#undef SEE_END + p_branch_fix_tail(p, &bc); + assert(!MORE() || SEE(end1)); } /* - p_simp_re - parse a simple RE, an atom possibly followed by a repetition - == static int p_simp_re(struct parse *p, int starordinary); + == static int p_simp_re(struct parse *p, struct branchc *bc); */ static int /* was the simple RE an unbackslashed $? */ p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ + struct branchc *bc) { int c; + int cc; /* convenient/control character */ int count; int count2; sopno pos; @@ -583,7 +704,8 @@ p_simp_re(struct parse *p, c = GETNEXT(); if (c == '\\') { (void)REQUIRE(MORE(), REG_EESCAPE); - c = BACKSL | GETNEXT(); + cc = GETNEXT(); + c = BACKSL | cc; } switch (c) { case '.': @@ -595,12 +717,24 @@ p_simp_re(struct parse *p, case '[': p_bracket(p); break; + case BACKSL|'`': + EMIT(OBOS, 0); + break; + case BACKSL|'\'': + EMIT(OEOS, 0); + break; case BACKSL|'<': EMIT(OBOW, 0); break; case BACKSL|'>': EMIT(OEOW, 0); break; + case BACKSL|'b': + EMIT(OWBND, 0); + break; + case BACKSL|'B': + EMIT(ONWBND, 0); + break; case BACKSL|'{': SETERROR(REG_BADRPT); break; @@ -612,7 +746,7 @@ p_simp_re(struct parse *p, EMIT(OLPAREN, subno); /* the MORE here is an error heuristic */ if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); + p_re(p, '\\', ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -624,6 +758,12 @@ p_simp_re(struct parse *p, case BACKSL|'}': SETERROR(REG_EPAREN); break; + case BACKSL|'W': + case BACKSL|'w': + case BACKSL|'S': + case BACKSL|'s': + p_b_pseudoclass(p, cc); + break; case BACKSL|'1': case BACKSL|'2': case BACKSL|'3': @@ -648,7 +788,7 @@ p_simp_re(struct parse *p, p->g->backrefs = 1; break; case '*': - (void)REQUIRE(starordinary, REG_BADRPT); + (void)REQUIRE(bc->nsimple == 0, REG_BADRPT); /* FALLTHROUGH */ default: p->next--; @@ -663,6 +803,12 
@@ p_simp_re(struct parse *p, ASTERN(O_PLUS, pos); INSERT(OQUEST_, pos); ASTERN(O_QUEST, pos); + } else if (EATTWO('\\', '?')) { + INSERT(OQUEST_, pos); + ASTERN(O_QUEST, pos); + } else if (EATTWO('\\', '+')) { + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); } else if (EATTWO('\\', '{')) { count = p_count(p); if (EAT(',')) { @@ -834,6 +980,41 @@ p_b_term(struct parse *p, cset *cs) } /* + - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) + == static int p_b_pseudoclass(struct parse *p, char c) + */ +static int +p_b_pseudoclass(struct parse *p, char c) { + cset *cs; + + if ((cs = allocset(p)) == NULL) + return(0); + + if (p->g->cflags&REG_ICASE) + cs->icase = 1; + + switch (c) { + case 'W': + cs->invert = 1; + /* PASSTHROUGH */ + case 'w': + p_b_cclass_named(p, cs, "alnum"); + break; + case 'S': + cs->invert = 1; + /* PASSTHROUGH */ + case 's': + p_b_cclass_named(p, cs, "space"); + break; + default: + return(0); + } + + EMIT(OANYOF, (int)(cs - p->g->sets)); + return(1); +} + +/* - p_b_cclass - parse a character-class name and deal with it == static void p_b_cclass(struct parse *p, cset *cs); */ @@ -842,7 +1023,6 @@ p_b_cclass(struct parse *p, cset *cs) { char *sp = p->next; size_t len; - wctype_t wct; char clname[16]; while (MORE() && isalpha((uch)PEEK())) @@ -854,6 +1034,17 @@ p_b_cclass(struct parse *p, cset *cs) } memcpy(clname, sp, len); clname[len] = '\0'; + + p_b_cclass_named(p, cs, clname); +} +/* + - p_b_cclass_named - deal with a named character class + == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); + */ +static void +p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { + wctype_t wct; + if ((wct = wctype(clname)) == 0) { SETERROR(REG_ECTYPE); return; @@ -1483,6 +1674,8 @@ findmust(struct parse *p, struct re_guts *g) /* FALLTHROUGH */ case OBOW: /* things that break a sequence */ case OEOW: + case OWBND: + case ONWBND: case OBOL: case OEOL: case O_QUEST: @@ -1636,6 +1829,8 @@ altoffset(sop *scan, int offset) 
try++; case OBOW: case OEOW: + case OWBND: + case ONWBND: case OLPAREN: case ORPAREN: case OOR2: diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h index bec1b160cae..c1ecbaec888 100644 --- a/lib/libc/regex/regex2.h +++ b/lib/libc/regex/regex2.h @@ -102,7 +102,10 @@ typedef long sopno; #define O_CH (18L<