diff --git a/contrib/netbsd-tests/lib/libc/regex/README b/contrib/netbsd-tests/lib/libc/regex/README index 6d9a28cf60c..0e8a09a764f 100644 --- a/contrib/netbsd-tests/lib/libc/regex/README +++ b/contrib/netbsd-tests/lib/libc/regex/README @@ -28,6 +28,7 @@ The full list of flags: $ REG_NOTEOL # REG_STARTEND (see below) p REG_PEND + P REG_POSIX For REG_STARTEND, the start/end offsets are those of the substring enclosed in (). diff --git a/contrib/netbsd-tests/lib/libc/regex/data/error.in b/contrib/netbsd-tests/lib/libc/regex/data/error.in index 61e0ea4e41c..27f2e1fb5cb 100644 --- a/contrib/netbsd-tests/lib/libc/regex/data/error.in +++ b/contrib/netbsd-tests/lib/libc/regex/data/error.in @@ -1,18 +1,18 @@ # certain syntax errors and non-errors -| C EMPTY +| CP EMPTY | b | | * C BADRPT * b * * + C BADRPT ? C BADRPT -"" &C EMPTY -() - abc @abc +"" &CP EMPTY +() P abc @abc \(\) b abc @abc -a||b C EMPTY -|ab C EMPTY -ab| C EMPTY -(|a)b C EMPTY -(a|)b C EMPTY +a||b CP EMPTY +|ab CP EMPTY +ab| CP EMPTY +(|a)b CP EMPTY +(a|)b CP EMPTY (*a) C BADRPT (+a) C BADRPT (?a) C BADRPT diff --git a/contrib/netbsd-tests/lib/libc/regex/data/gnuext.in b/contrib/netbsd-tests/lib/libc/regex/data/gnuext.in new file mode 100644 index 00000000000..09d9d21f32f --- /dev/null +++ b/contrib/netbsd-tests/lib/libc/regex/data/gnuext.in @@ -0,0 +1,39 @@ +# BRE Quantifiers +ab\?c b abc abc +ab\+c b abc abc +# BRE Branching +abc\|de b abc abc +a\|b\|c b abc a +\(ab\|bc\) b abcd ab +# ERE Backrefs +(ab)\1 - ab +(ab)\1 - abab abab +\1(ab) C ESUBREG +(a)(b)(c)(d)(e)(f)(g)(h)(i)\9 - abcdefghii abcdefghii +# \w, \W, \s, \S (alnum, ^alnum, space, ^space) +\w+ - -%@a0X- a0X +\w\+ b -%@a0X- a0X +\s+ - aSNTb SNT +\s\+ b aSNTb SNT +# Word boundaries (\b, \B, \<, \>, \`, \') +# (is/not boundary, start/end word, start/end subject string) +\babc\b & abc +\ & abc +\Babc\B & abc +\B[abc]\B & b +\B[abc]+ - bc +\B[abc]\+ b bc +\`abc\' & abc abc +\`.+\' - abNc abNc +\`.\+\' b abNc abNc +(\`a) - Na +(a\') - aN +# Empty subexpressions +abc\| b abc "" +foo\| b abc "" +\|abc b abc "" +\|foo b abc "" +abc| - abc "" +foo| - abc "" +|abc - abc "" +|foo - abc "" diff --git a/contrib/netbsd-tests/lib/libc/regex/data/meta.in b/contrib/netbsd-tests/lib/libc/regex/data/meta.in index 4533d3591bc..87adb41b21a 100644 --- a/contrib/netbsd-tests/lib/libc/regex/data/meta.in +++ b/contrib/netbsd-tests/lib/libc/regex/data/meta.in @@ -4,7 +4,7 @@ a[bc]d & abd abd a\*c & a*c a*c a\\b & a\b a\b a\\\*b & a\*b a\*b -a\bc & abc abc +a\bc &P abc abc a\ &C EESCAPE a\\bc & a\bc a\bc \{ bC BADRPT diff --git a/contrib/netbsd-tests/lib/libc/regex/main.c b/contrib/netbsd-tests/lib/libc/regex/main.c index eac4e2d9b51..d4d93635691 100644 --- a/contrib/netbsd-tests/lib/libc/regex/main.c +++ b/contrib/netbsd-tests/lib/libc/regex/main.c @@ -246,6 +246,8 @@ try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts) char f0copy[1000]; char f2copy[1000]; + if (opt('P', f1)) + opts |= REG_POSIX; strcpy(f0copy, f0); re.re_endp = (opts®_PEND) ? f0copy + strlen(f0copy) : NULL; fixstr(f0copy); diff --git a/contrib/netbsd-tests/lib/libc/regex/t_regex.sh b/contrib/netbsd-tests/lib/libc/regex/t_regex.sh index bef3ac92695..954815c8c59 100755 --- a/contrib/netbsd-tests/lib/libc/regex/t_regex.sh +++ b/contrib/netbsd-tests/lib/libc/regex/t_regex.sh @@ -70,4 +70,5 @@ atf_init_test_cases() create_tc zero "Checks NULs" create_tc word_bound "Checks word boundaries" create_tc regress "Checks various past problems and suspected problems" + create_tc gnuext "Checks GNU extension functionality" } diff --git a/include/regex.h b/include/regex.h index e8b185cac9a..689a74618ec 100644 --- a/include/regex.h +++ b/include/regex.h @@ -69,6 +69,7 @@ typedef struct { #define REG_NOSPEC 0020 #define REG_PEND 0040 #define REG_DUMP 0200 +#define REG_POSIX 0400 /* regerror() flags */ #define REG_ENOSYS (-1) diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index 073e0010882..25421042d9e 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -106,7 +106,7 @@ static const char *dissect(struct match *m, const char *start, const char *stop, static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); -static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); +static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); #define MAX_RECURSION 100 #define BOL (OUT-1) #define EOL (BOL-1) @@ -116,6 +116,11 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_ #define EOW (BOL-5) #define BADCHAR (BOL-6) #define NONCHAR(c) ((c) <= OUT) +/* sflags */ +#define SNWBND 01 +#define SBOS 02 +#define SEOS 04 + #ifdef REDEBUG static void print(struct match *m, const char *caption, states st, int ch, FILE *d); #endif @@ -183,6 +188,17 @@ matcher(struct re_guts *g, if (stop < start) return(REG_INVARG); + /* Trivial zero-length match on empty sub */ + if (g->iflags & EMPTBR) { + if (nmatch > 0) { + pmatch[0].rm_so = pmatch[0].rm_eo = 0; + + for (i = 1; i < nmatch; i++) + pmatch[i].rm_so = pmatch[i].rm_eo = -1; + } + return(0); + } + /* prescreening; this does wonders for this rather slow code */ if (g->must != NULL) { if (g->charjump != NULL && g->matchjump != NULL) { @@ -412,10 +428,14 @@ dissect(struct match *m, case OCHAR: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; + case OBOS: + case OEOS: case OBOL: case OEOL: case OBOW: case OEOW: + case OWBND: + case ONWBND: break; case OANY: case OANYOF: @@ -621,22 +641,45 @@ backref(struct match *m, else return(NULL); break; + case OBOS: + break; + if (sp == m->beginp) + { /* yes */ } + else + return(NULL); + break; + case OEOS: + break; + if (sp == m->endp) + { /* yes */ } + else + return(NULL); + break; + case ONWBND: + if (sp > m->beginp && sp < m->endp && + ISWORD(*(sp-1)) == ISWORD(*sp)) + { /* yes */ } + else + return(NULL); + break; + case OWBND: case OBOW: if (sp < m->endp && ISWORD(*sp) && ((sp == m->beginp && !(m->eflags®_NOTBOL)) || (sp > m->offp && !ISWORD(*(sp-1))))) { /* yes */ } - else + else if (OP(s) == OBOW) return(NULL); - break; + /* FALLTHROUGH */ case OEOW: - if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || + if (OP(s) != OBOW && + ( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) || (sp < m->endp && !ISWORD(*sp)) ) && (sp > m->beginp && ISWORD(*(sp-1))) ) { /* yes */ } - else + else if (OP(s) != OBOW) return(NULL); break; case O_QUEST: @@ -771,6 +814,7 @@ fast( struct match *m, states st = m->st; states fresh = m->fresh; states tmp = m->tmp; + sopno nxop; const char *p = start; wint_t c; wint_t lastc; /* previous c */ @@ -778,11 +822,12 @@ fast( struct match *m, int i; const char *coldp; /* last p after which no match was underway */ size_t clen; + int sflags = 0; CLEAR(st); SET1(st, startst); SP("fast", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); + st = step(m->g, startst, stopst, st, NOTHING, st, sflags); ASSIGN(fresh, st); SP("start", st, *p); coldp = NULL; @@ -822,7 +867,7 @@ fast( struct match *m, } if (i != 0) { for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); + st = step(m->g, startst, stopst, st, flagch, st, sflags); SP("boleol", st, c); } @@ -835,11 +880,24 @@ fast( struct match *m, (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); + if (p == m->beginp) + sflags |= SBOS; + if (p == m->endp) + sflags |= SEOS; + if (flagch != BOW && flagch != EOW && + lastc != OUT && c != OUT && ISWORD(lastc) == ISWORD(c)) + sflags |= SNWBND; + nxop = OP(m->g->strip[startst]); + /* sflags are used for many 0-length matching events, so check those */ + if (flagch == BOW || flagch == EOW || + nxop == ONWBND || nxop == OBOS || nxop == OEOS) { + st = step(m->g, startst, stopst, st, flagch, st, sflags); + SP("boweownwbnd", st, c); } + /* Don't match 0-length ops elsewhere */ + sflags = 0; + /* are we done? */ if (ISSET(st, stopst) || p == stop || clen > stop - p) break; /* NOTE BREAK OUT */ @@ -848,9 +906,9 @@ fast( struct match *m, ASSIGN(tmp, st); ASSIGN(st, fresh); assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); + st = step(m->g, startst, stopst, tmp, c, st, sflags); SP("aft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), st)); p += clen; } @@ -877,6 +935,7 @@ slow( struct match *m, states st = m->st; states empty = m->empty; states tmp = m->tmp; + sopno nxop; const char *p = start; wint_t c; wint_t lastc; /* previous c */ @@ -884,12 +943,13 @@ slow( struct match *m, int i; const char *matchp; /* last p at which a match ended */ size_t clen; + int sflags = 0; AT("slow", start, stop, startst, stopst); CLEAR(st); SET1(st, startst); SP("sstart", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); + st = step(m->g, startst, stopst, st, NOTHING, st, sflags); matchp = NULL; if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) c = OUT; @@ -925,7 +985,7 @@ slow( struct match *m, } if (i != 0) { for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); + st = step(m->g, startst, stopst, st, flagch, st, sflags); SP("sboleol", st, c); } @@ -938,11 +998,24 @@ slow( struct match *m, (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboweow", st, c); + if (p == m->beginp) + sflags |= SBOS; + if (p == m->endp) + sflags |= SEOS; + if (flagch != BOW && flagch != EOW && + lastc != OUT && c != OUT && ISWORD(lastc) == ISWORD(c)) + sflags |= SNWBND; + nxop = OP(m->g->strip[startst]); + /* Consume a match for BOW/EOW markers */ + if (flagch == BOW || flagch == EOW || + nxop == ONWBND || nxop == OBOS || nxop == OEOS) { + st = step(m->g, startst, stopst, st, flagch, st, sflags); + SP("sboweownbwnd", st, c); } + /* Don't match 0-length ops elsewhere */ + sflags = 0; + /* are we done? */ if (ISSET(st, stopst)) matchp = p; @@ -953,9 +1026,9 @@ slow( struct match *m, ASSIGN(tmp, st); ASSIGN(st, empty); assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); + st = step(m->g, startst, stopst, tmp, c, st, sflags); SP("saft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), st)); p += clen; } @@ -982,7 +1055,8 @@ step(struct re_guts *g, sopno stop, /* state after stop state within strip */ states bef, /* states reachable before */ wint_t ch, /* character or NONCHAR code */ - states aft) /* states already known reachable after */ + states aft, /* states already known reachable after */ + int sflags) /* 0-length matching states*/ { cset *cs; sop s; @@ -1011,12 +1085,25 @@ step(struct re_guts *g, if (ch == EOL || ch == BOLEOL) FWD(aft, bef, 1); break; + case ONWBND: + if (sflags & SNWBND) + FWD(aft, bef, 1); + break; + case OBOS: + if (sflags & SBOS) + FWD(aft, bef, 1); + break; + case OEOS: + if (sflags & SEOS) + FWD(aft, bef, 1); + break; + case OWBND: case OBOW: if (ch == BOW) FWD(aft, bef, 1); - break; + /* FALLTHROUGH */ case OEOW: - if (ch == EOW) + if (OP(s) != OBOW && ch == EOW) FWD(aft, bef, 1); break; case OANY: diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index 95764717bda..d4c28b2253e 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -79,21 +79,38 @@ struct parse { sopno pend[NPAREN]; /* -> ) ([0] unused) */ }; +struct branchc { + sopno start; + sopno back; + sopno fwd; + + int nbranch; + int nsimple; + int outer; + int use_gnu; + int bre; +}; + /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === regcomp.c === */ -static void p_ere(struct parse *p, int stop); -static void p_ere_exp(struct parse *p); +static int p_ere_exp(struct parse *p, struct branchc *bc); static void p_str(struct parse *p); -static void p_bre(struct parse *p, int end1, int end2); -static int p_simp_re(struct parse *p, int starordinary); +static int p_branch_eat_delim(struct parse *p, struct branchc *bc); +static void p_branch_ins_offset(struct parse *p, struct branchc *bc); +static void p_branch_fix_tail(struct parse *p, struct branchc *bc); +static int p_branch_do(struct parse *p, struct branchc *bc); +static void p_re(struct parse *p, int end1, int end2); +static int p_simp_re(struct parse *p, struct branchc *bc); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void p_b_term(struct parse *p, cset *cs); +static int p_b_pseudoclass(struct parse *p, char c); static void p_b_cclass(struct parse *p, cset *cs); +static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); static void p_b_eclass(struct parse *p, cset *cs); static wint_t p_b_symbol(struct parse *p); static wint_t p_b_coll_elem(struct parse *p, wint_t endc); @@ -139,6 +156,7 @@ static char nuls[10]; /* place to point scanner in event of error */ #define MORE2() (p->next+1 < p->end) #define SEE(c) (MORE() && PEEK() == (c)) #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define SEESPEC(a) ((p->g->cflags®_EXTENDED) ? SEE(a) : SEETWO('\\', a)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) #define NEXT() (p->next++) @@ -264,12 +282,10 @@ regcomp(regex_t * __restrict preg, /* do it */ EMIT(OEND, 0); g->firststate = THERE(); - if (cflags®_EXTENDED) - p_ere(p, OUT); - else if (cflags®_NOSPEC) + if (cflags®_NOSPEC) p_str(p); else - p_bre(p, OUT, OUT); + p_re(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); @@ -305,65 +321,21 @@ regcomp(regex_t * __restrict preg, } /* - - p_ere - ERE parser top level, concatenation and alternation - == static void p_ere(struct parse *p, int_t stop); - */ -static void -p_ere(struct parse *p, - int stop) /* character this ERE should end at */ -{ - char c; - sopno prevback; - sopno prevfwd; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - == static void p_ere_exp(struct parse *p); + == static int p_ere_exp(struct parse *p); */ -static void -p_ere_exp(struct parse *p) +static int +p_ere_exp(struct parse *p, struct branchc *bc) { char c; wint_t wc; sopno pos; int count; int count2; + int i; sopno subno; int wascaret = 0; - + int handled = 0; assert(MORE()); /* caller should have ensured this */ c = GETNEXT(); @@ -377,7 +349,7 @@ p_ere_exp(struct parse *p) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) - p_ere(p, ')'); + p_re(p, ')', IGN); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -408,9 +380,6 @@ p_ere_exp(struct parse *p) p->g->iflags |= USEEOL; p->g->neol++; break; - case '|': - SETERROR(REG_EMPTY); - break; case '*': case '+': case '?': @@ -428,6 +397,57 @@ p_ere_exp(struct parse *p) case '\\': (void)REQUIRE(MORE(), REG_EESCAPE); wc = WGETNEXT(); + if (bc->use_gnu) { + handled = 1; + switch (wc) { + case '`': + EMIT(OBOS, 0); + break; + case '\'': + EMIT(OEOS, 0); + break; + case 'b': + EMIT(OWBND, 0); + break; + case 'B': + EMIT(ONWBND, 0); + break; + case 'W': + case 'w': + case 'S': + case 's': + p_b_pseudoclass(p, wc); + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = wc - '0'; + assert(i < NPAREN); + if (p->pend[i] != 0) { + assert(i <= p->g->nsub); + EMIT(OBACK_, i); + assert(p->pbegin[i] != 0); + assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); + assert(OP(p->strip[p->pend[i]]) == ORPAREN); + (void) dupl(p, p->pbegin[i]+1, p->pend[i]); + EMIT(O_BACK, i); + } else + SETERROR(REG_ESUBREG); + p->g->backrefs = 1; + break; + default: + handled = 0; + } + /* Don't proceed to the POSIX bits if we've already handled it */ + if (handled) + break; + } switch (wc) { case '<': EMIT(OBOW, 0); @@ -451,12 +471,12 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (0); c = PEEK(); /* we call { a repetition if followed by a digit */ if (!( c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) - return; /* no repetition, we're done */ + return (0); /* no repetition, we're done */ NEXT(); (void)REQUIRE(!wascaret, REG_BADRPT); @@ -502,12 +522,13 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (0); c = PEEK(); if (!( c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) - return; + return (0); SETERROR(REG_BADRPT); + return (0); } /* @@ -523,7 +544,91 @@ p_str(struct parse *p) } /* - - p_bre - BRE parser top level, anchoring and concatenation + * Eat consecutive branch delimiters for the kind of expression that we are + * parsing, return the number of delimiters that we ate. + */ +static int +p_branch_eat_delim(struct parse *p, struct branchc *bc) +{ + int nskip = 0; + if (!bc->bre) + while (EAT('|')) + ++nskip; + else + while (EATTWO('\\', '|')) + ++nskip; + return nskip; +} + +/* + * Insert necessary branch book-keeping operations. This emits a + * bogus 'next' offset, since we still have more to parse + */ +static void +p_branch_ins_offset(struct parse *p, struct branchc *bc) { + if (!bc->nbranch) { + INSERT(OCH_, bc->start); /* offset is wrong */ + bc->fwd = bc->start; + bc->back = bc->start; + } + + ASTERN(OOR1, bc->back); + bc->back = THERE(); + AHEAD(bc->fwd); /* fix previous offset */ + bc->fwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + ++bc->nbranch; +} + +/* + * Fix the offset of the tail branch, if we actually had any branches. + * This is to correct the bogus placeholder offset that we use. + */ +static void +p_branch_fix_tail(struct parse *p, struct branchc *bc) +{ + /* Fix bogus offset at the tail if we actually have branches */ + if (bc->nbranch > 0) { + AHEAD(bc->fwd); + ASTERN(O_CH, bc->back); + } +} + +/* + * Take care of any branching requirements. This includes inserting the + * appropriate branching instructions as well as eating all of the branch + * delimiters until we either run out of pattern or need to parse more pattern. + */ +static int +p_branch_do(struct parse *p, struct branchc *bc) +{ + int ate = 0; + + /* Empty expression; set the flag if necessary*/ + if (HERE() == bc->start) { + if (bc->outer) + p->g->iflags |= EMPTBR; + return (0); + } else { + ate = p_branch_eat_delim(p, bc); + (void)REQUIRE(bc->use_gnu || (ate == 0 && !MORE()) || + (ate == 1 && MORE()), REG_EMPTY); + /* If we hit another branch immediately, skip it and set flag */ + if (ate > 1 || (ate == 1 && !MORE())) { + if (bc->outer) + p->g->iflags |= EMPTBR; + return (0); + } + if (ate == 0 || !MORE()) + return (0); + p_branch_ins_offset(p, bc); + } + + return (1); +} + +/* + - p_re - Top level parser, concatenation and BRE anchoring == static void p_bre(struct parse *p, int end1, \ == int end2); * Giving end1 as OUT essentially eliminates the end1/end2 check. @@ -533,47 +638,93 @@ p_str(struct parse *p) * The amount of lookahead needed to avoid this kludge is excessive. */ static void -p_bre(struct parse *p, +p_re(struct parse *p, int end1, /* first terminating character */ - int end2) /* second terminating character */ + int end2) /* second terminating character; ignored for EREs */ { - sopno start = HERE(); - int first = 1; /* first subexpression? */ int wasdollar = 0; - - if (EAT('^')) { - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; + struct branchc bc; + int (*parse_expr)(struct parse *, struct branchc *); + int do_branch = 1; + + bc.use_gnu = 1; + bc.bre = 0; + if (p->g->cflags®_EXTENDED) { + parse_expr = p_ere_exp; + bc.bre = 0; + } else { + parse_expr = p_simp_re; + bc.bre = 1; } - if (wasdollar) { /* oops, that was a trailing anchor */ - DROP(1); - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; + if (p->g->cflags®_POSIX) + bc.use_gnu = 0; + /* Disable branching for BREs when we're in POSIX mode */ + if (!bc.use_gnu && bc.bre) + do_branch = 0; + + bc.nbranch = 0; + bc.outer = 0; + if (end1 == OUT && end2 == OUT) + bc.outer = 1; +#define SEEEND() (!bc.bre ? SEE(end1) : SEETWO(end1, end2)) + for (;;) { + bc.start = HERE(); + bc.nsimple = 0; + /* + * Moving BOL/EOL bits into p_simp_re is more complicated than it needs to + * be, because of the complications in looking for the end of the current + * expression. They are better left here than trying to work out the + * equivalent magic in p_simp_bre, mostly for the sake of readability. + */ + if (bc.bre && EAT('^')) { + EMIT(OBOL, 0); + p->g->iflags |= USEBOL; + p->g->nbol++; + } + while (MORE() && !SEESPEC('|') && !SEEEND()) { + wasdollar = parse_expr(p, &bc); + ++bc.nsimple; + } + (void) REQUIRE(bc.use_gnu || HERE() != bc.start, REG_EMPTY); + if (bc.bre && wasdollar) { /* oops, that was a trailing anchor */ + DROP(1); + EMIT(OEOL, 0); + p->g->iflags |= USEEOL; + p->g->neol++; + } + /* + * p_branch_do's return value indicates whether we should continue parsing + * or not. This is both for correctness and optimization, because it will + * signal that we need not continue if it encountered an empty branch or + * the end of the string immediately following a branch delimiter. + */ + if (do_branch && !p_branch_do(p, &bc)) + break; + if (!MORE()) + break; } - - (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ +#undef SEE_END + if (do_branch) + p_branch_fix_tail(p, &bc); + assert(!MORE() || SEE(end1)); } /* - p_simp_re - parse a simple RE, an atom possibly followed by a repetition - == static int p_simp_re(struct parse *p, int starordinary); + == static int p_simp_re(struct parse *p, struct branchc *bc); */ static int /* was the simple RE an unbackslashed $? */ p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ + struct branchc *bc) { int c; + int cc; /* convenient/control character */ int count; int count2; sopno pos; int i; wint_t wc; + int handled = 0; sopno subno; # define BACKSL (1<use_gnu) { + handled = 1; + switch (c) { + case BACKSL|'`': + EMIT(OBOS, 0); + break; + case BACKSL|'\'': + EMIT(OEOS, 0); + break; + case BACKSL|'b': + EMIT(OWBND, 0); + break; + case BACKSL|'B': + EMIT(ONWBND, 0); + break; + case BACKSL|'W': + case BACKSL|'w': + case BACKSL|'S': + case BACKSL|'s': + p_b_pseudoclass(p, cc); + break; + default: + handled = 0; + } + } } - switch (c) { - case '.': - if (p->g->cflags®_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; - case BACKSL|'<': - EMIT(OBOW, 0); - break; - case BACKSL|'>': - EMIT(OEOW, 0); - break; - case BACKSL|'{': - SETERROR(REG_BADRPT); - break; - case BACKSL|'(': - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - /* the MORE here is an error heuristic */ - if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); - if (subno < NPAREN) { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); + if (!handled) { + switch (c) { + case '.': + if (p->g->cflags®_NEWLINE) + nonnewline(p); + else + EMIT(OANY, 0); + break; + case '[': + p_bracket(p); + break; + case BACKSL|'<': + EMIT(OBOW, 0); + break; + case BACKSL|'>': + EMIT(OEOW, 0); + break; + case BACKSL|'{': + SETERROR(REG_BADRPT); + break; + case BACKSL|'(': + p->g->nsub++; + subno = p->g->nsub; + if (subno < NPAREN) + p->pbegin[subno] = HERE(); + EMIT(OLPAREN, subno); + /* the MORE here is an error heuristic */ + if (MORE() && !SEETWO('\\', ')')) + p_re(p, '\\', ')'); + if (subno < NPAREN) { + p->pend[subno] = HERE(); + assert(p->pend[subno] != 0); + } + EMIT(ORPAREN, subno); + (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); + break; + case BACKSL|')': /* should not get here -- must be user */ + case BACKSL|'}': + SETERROR(REG_EPAREN); + break; + case BACKSL|'1': + case BACKSL|'2': + case BACKSL|'3': + case BACKSL|'4': + case BACKSL|'5': + case BACKSL|'6': + case BACKSL|'7': + case BACKSL|'8': + case BACKSL|'9': + i = (c&~BACKSL) - '0'; + assert(i < NPAREN); + if (p->pend[i] != 0) { + assert(i <= p->g->nsub); + EMIT(OBACK_, i); + assert(p->pbegin[i] != 0); + assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); + assert(OP(p->strip[p->pend[i]]) == ORPAREN); + (void) dupl(p, p->pbegin[i]+1, p->pend[i]); + EMIT(O_BACK, i); + } else + SETERROR(REG_ESUBREG); + p->g->backrefs = 1; + break; + case '*': + (void)REQUIRE(bc->nsimple == 0, REG_BADRPT); + /* FALLTHROUGH */ + default: + p->next--; + wc = WGETNEXT(); + ordinary(p, wc); + break; } - EMIT(ORPAREN, subno); - (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); - break; - case BACKSL|')': /* should not get here -- must be user */ - case BACKSL|'}': - SETERROR(REG_EPAREN); - break; - case BACKSL|'1': - case BACKSL|'2': - case BACKSL|'3': - case BACKSL|'4': - case BACKSL|'5': - case BACKSL|'6': - case BACKSL|'7': - case BACKSL|'8': - case BACKSL|'9': - i = (c&~BACKSL) - '0'; - assert(i < NPAREN); - if (p->pend[i] != 0) { - assert(i <= p->g->nsub); - EMIT(OBACK_, i); - assert(p->pbegin[i] != 0); - assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); - assert(OP(p->strip[p->pend[i]]) == ORPAREN); - (void) dupl(p, p->pbegin[i]+1, p->pend[i]); - EMIT(O_BACK, i); - } else - SETERROR(REG_ESUBREG); - p->g->backrefs = 1; - break; - case '*': - (void)REQUIRE(starordinary, REG_BADRPT); - /* FALLTHROUGH */ - default: - p->next--; - wc = WGETNEXT(); - ordinary(p, wc); - break; } - if (EAT('*')) { /* implemented as +? */ /* this case does not require the (y|) trick, noKLUDGE */ INSERT(OPLUS_, pos); ASTERN(O_PLUS, pos); INSERT(OQUEST_, pos); ASTERN(O_QUEST, pos); + } else if (EATTWO('\\', '?')) { + INSERT(OQUEST_, pos); + ASTERN(O_QUEST, pos); + } else if (EATTWO('\\', '+')) { + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); } else if (EATTWO('\\', '{')) { count = p_count(p); if (EAT(',')) { @@ -834,6 +1019,41 @@ p_b_term(struct parse *p, cset *cs) } /* + - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) + == static int p_b_pseudoclass(struct parse *p, char c) + */ +static int +p_b_pseudoclass(struct parse *p, char c) { + cset *cs; + + if ((cs = allocset(p)) == NULL) + return(0); + + if (p->g->cflags®_ICASE) + cs->icase = 1; + + switch (c) { + case 'W': + cs->invert = 1; + /* PASSTHROUGH */ + case 'w': + p_b_cclass_named(p, cs, "alnum"); + break; + case 'S': + cs->invert = 1; + /* PASSTHROUGH */ + case 's': + p_b_cclass_named(p, cs, "space"); + break; + default: + return(0); + } + + EMIT(OANYOF, (int)(cs - p->g->sets)); + return(1); +} + +/* - p_b_cclass - parse a character-class name and deal with it == static void p_b_cclass(struct parse *p, cset *cs); */ @@ -842,7 +1062,6 @@ p_b_cclass(struct parse *p, cset *cs) { char *sp = p->next; size_t len; - wctype_t wct; char clname[16]; while (MORE() && isalpha((uch)PEEK())) @@ -854,6 +1073,17 @@ p_b_cclass(struct parse *p, cset *cs) } memcpy(clname, sp, len); clname[len] = '\0'; + + p_b_cclass_named(p, cs, clname); +} +/* + - p_b_cclass_named - deal with a named character class + == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); + */ +static void +p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { + wctype_t wct; + if ((wct = wctype(clname)) == 0) { SETERROR(REG_ECTYPE); return; @@ -1483,6 +1713,8 @@ findmust(struct parse *p, struct re_guts *g) /* FALLTHROUGH */ case OBOW: /* things that break a sequence */ case OEOW: + case OWBND: + case ONWBND: case OBOL: case OEOL: case O_QUEST: @@ -1636,6 +1868,8 @@ altoffset(sop *scan, int offset) try++; case OBOW: case OEOW: + case OWBND: + case ONWBND: case OLPAREN: case ORPAREN: case OOR2: diff --git a/lib/libc/regex/regex.3 b/lib/libc/regex/regex.3 index 8959272e989..9af50836053 100644 --- a/lib/libc/regex/regex.3 +++ b/lib/libc/regex/regex.3 @@ -32,7 +32,7 @@ .\" @(#)regex.3 8.4 (Berkeley) 3/20/94 .\" $FreeBSD$ .\" -.Dd May 25, 2016 +.Dd April 15, 2017 .Dt REGEX 3 .Os .Sh NAME @@ -124,6 +124,11 @@ This is a synonym for 0, provided as a counterpart to .Dv REG_EXTENDED to improve readability. +.It Dv REG_POSIX +Compile with more strict +.St -p1003.2 +conformance. The default behavior is to compile with GNU extensions +enabled. .It Dv REG_NOSPEC Compile with recognition of all special characters turned off. All characters are thus considered ordinary, diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h index bec1b160cae..c1ecbaec888 100644 --- a/lib/libc/regex/regex2.h +++ b/lib/libc/regex/regex2.h @@ -102,7 +102,10 @@ typedef long sopno; #define O_CH (18L<