diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index ff98bfd694..1a6c9ce183 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -65,6 +65,8 @@ newnfa(struct vars *v, nfa->v = v; nfa->bos[0] = nfa->bos[1] = COLORLESS; nfa->eos[0] = nfa->eos[1] = COLORLESS; + nfa->flags = 0; + nfa->minmatchall = nfa->maxmatchall = -1; nfa->parent = parent; /* Precedes newfstate so parent is valid. */ nfa->post = newfstate(nfa, '@'); /* number 0 */ nfa->pre = newfstate(nfa, '>'); /* number 1 */ @@ -2875,8 +2877,14 @@ analyze(struct nfa *nfa) if (NISERR()) return 0; + /* Detect whether NFA can't match anything */ if (nfa->pre->outs == NULL) return REG_UIMPOSSIBLE; + + /* Detect whether NFA matches all strings (possibly with length bounds) */ + checkmatchall(nfa); + + /* Detect whether NFA can possibly match a zero-length string */ for (a = nfa->pre->outs; a != NULL; a = a->outchain) for (aa = a->to->outs; aa != NULL; aa = aa->outchain) if (aa->to == nfa->post) @@ -2884,6 +2892,282 @@ analyze(struct nfa *nfa) return 0; } +/* + * checkmatchall - does the NFA represent no more than a string length test? + * + * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1 + * to begin with) and set the MATCHALL bit in nfa->flags. + * + * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those + * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the + * post state via RAINBOW arcs, and if there are any loops in the graph, they + * must be loop-to-self arcs, ensuring that each loop iteration consumes + * exactly one character. (Longer loops are problematic because they create + * non-consecutive possible match lengths; we have no good way to represent + * that situation for lengths beyond the DUPINF limit.) + * + * Pseudocolor arcs complicate things a little. We know that they can only + * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for + * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs, + * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS + * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor + * arcs can occur, meaning that the NFA involves some constraint on the + * adjacent characters, which makes it not a matchall NFA. + */ +static void +checkmatchall(struct nfa *nfa) +{ + bool hasmatch[DUPINF + 1]; + int minmatch, + maxmatch, + morematch; + + /* + * hasmatch[i] will be set true if a match of length i is feasible, for i + * from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match + * length of DUPINF or more is feasible. + */ + memset(hasmatch, 0, sizeof(hasmatch)); + + /* + * Recursively search the graph for all-RAINBOW paths to the "post" state, + * starting at the "pre" state. The -1 initial depth accounts for the + * fact that transitions out of the "pre" state are not part of the + * matched string. We likewise don't count the final transition to the + * "post" state as part of the match length. (But we still insist that + * those transitions have RAINBOW arcs, otherwise there are lookbehind or + * lookahead constraints at the start/end of the pattern.) + */ + if (!checkmatchall_recurse(nfa, nfa->pre, false, -1, hasmatch)) + return; + + /* + * We found some all-RAINBOW paths, and not anything that we couldn't + * handle. Now verify that pseudocolor arcs adjacent to the pre and post + * states match the RAINBOW arcs there. (We could do this while + * recursing, but it's expensive and unlikely to fail, so do it last.) + */ + if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) || + !check_out_colors_match(nfa->pre, nfa->bos[0], RAINBOW) || + !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) || + !check_out_colors_match(nfa->pre, nfa->bos[1], RAINBOW)) + return; + if (!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) || + !check_in_colors_match(nfa->post, nfa->eos[0], RAINBOW) || + !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]) || + !check_in_colors_match(nfa->post, nfa->eos[1], RAINBOW)) + return; + + /* + * hasmatch[] now represents the set of possible match lengths; but we + * want to reduce that to a min and max value, because it doesn't seem + * worth complicating regexec.c to deal with nonconsecutive possible match + * lengths. Find min and max of first run of lengths, then verify there + * are no nonconsecutive lengths. + */ + for (minmatch = 0; minmatch <= DUPINF; minmatch++) + { + if (hasmatch[minmatch]) + break; + } + assert(minmatch <= DUPINF); /* else checkmatchall_recurse lied */ + for (maxmatch = minmatch; maxmatch < DUPINF; maxmatch++) + { + if (!hasmatch[maxmatch + 1]) + break; + } + for (morematch = maxmatch + 1; morematch <= DUPINF; morematch++) + { + if (hasmatch[morematch]) + return; /* fail, there are nonconsecutive lengths */ + } + + /* Success, so record the info */ + nfa->minmatchall = minmatch; + nfa->maxmatchall = maxmatch; + nfa->flags |= MATCHALL; +} + +/* + * checkmatchall_recurse - recursive search for checkmatchall + * + * s is the current state + * foundloop is true if any predecessor state has a loop-to-self + * depth is the current recursion depth (starting at -1) + * hasmatch[] is the output area for recording feasible match lengths + * + * We return true if there is at least one all-RAINBOW path to the "post" + * state and no non-matchall paths; otherwise false. Note we assume that + * any dead-end paths have already been removed, else we might return + * false unnecessarily. + */ +static bool +checkmatchall_recurse(struct nfa *nfa, struct state *s, + bool foundloop, int depth, + bool *hasmatch) +{ + bool result = false; + struct arc *a; + + /* + * Since this is recursive, it could be driven to stack overflow. But we + * need not treat that as a hard failure; just deem the NFA non-matchall. + */ + if (STACK_TOO_DEEP(nfa->v->re)) + return false; + + /* + * Likewise, if we get to a depth too large to represent correctly in + * maxmatchall, fail quietly. + */ + if (depth >= DUPINF) + return false; + + /* + * Scan the outarcs to detect cases we can't handle, and to see if there + * is a loop-to-self here. We need to know about any such loop before we + * recurse, so it's hard to avoid making two passes over the outarcs. In + * any case, checking for showstoppers before we recurse is probably best. + */ + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type != PLAIN) + return false; /* any LACONs make it non-matchall */ + if (a->co != RAINBOW) + { + if (nfa->cm->cd[a->co].flags & PSEUDO) + { + /* + * Pseudocolor arc: verify it's in a valid place (this seems + * quite unlikely to fail, but let's be sure). + */ + if (s == nfa->pre && + (a->co == nfa->bos[0] || a->co == nfa->bos[1])) + /* okay BOS/BOL arc */ ; + else if (a->to == nfa->post && + (a->co == nfa->eos[0] || a->co == nfa->eos[1])) + /* okay EOS/EOL arc */ ; + else + return false; /* unexpected pseudocolor arc */ + /* We'll finish checking these arcs after the recursion */ + continue; + } + return false; /* any other color makes it non-matchall */ + } + if (a->to == s) + { + /* + * We found a cycle of length 1, so remember that to pass down to + * successor states. (It doesn't matter if there was also such a + * loop at a predecessor state.) + */ + foundloop = true; + } + else if (a->to->tmp) + { + /* We found a cycle of length > 1, so fail. */ + return false; + } + } + + /* We need to recurse, so mark state as under consideration */ + assert(s->tmp == NULL); + s->tmp = s; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->co != RAINBOW) + continue; /* ignore pseudocolor arcs */ + if (a->to == nfa->post) + { + /* We found an all-RAINBOW path to the post state */ + result = true; + /* Record potential match lengths */ + assert(depth >= 0); + hasmatch[depth] = true; + if (foundloop) + { + /* A predecessor loop makes all larger lengths match, too */ + int i; + + for (i = depth + 1; i <= DUPINF; i++) + hasmatch[i] = true; + } + } + else if (a->to != s) + { + /* This is a new path forward; recurse to investigate */ + result = checkmatchall_recurse(nfa, a->to, + foundloop, depth + 1, + hasmatch); + /* Fail if any recursive path fails */ + if (!result) + break; + } + } + + s->tmp = NULL; + return result; +} + +/* + * check_out_colors_match - subroutine for checkmatchall + * + * Check if every s outarc of color co1 has a matching outarc of color co2. + * (checkmatchall_recurse already verified that all of the outarcs are PLAIN, + * so we need not examine arc types here. Also, since it verified that there + * are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for + * this brute-force O(N^2) implementation to cause problems.) + */ +static bool +check_out_colors_match(struct state *s, color co1, color co2) +{ + struct arc *a1; + struct arc *a2; + + for (a1 = s->outs; a1 != NULL; a1 = a1->outchain) + { + if (a1->co != co1) + continue; + for (a2 = s->outs; a2 != NULL; a2 = a2->outchain) + { + if (a2->co == co2 && a2->to == a1->to) + break; + } + if (a2 == NULL) + return false; + } + return true; +} + +/* + * check_in_colors_match - subroutine for checkmatchall + * + * Check if every s inarc of color co1 has a matching inarc of color co2. + * (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still + * not expecting very many arcs.) + */ +static bool +check_in_colors_match(struct state *s, color co1, color co2) +{ + struct arc *a1; + struct arc *a2; + + for (a1 = s->ins; a1 != NULL; a1 = a1->inchain) + { + if (a1->type != PLAIN || a1->co != co1) + continue; + for (a2 = s->ins; a2 != NULL; a2 = a2->inchain) + { + if (a2->type == PLAIN && a2->co == co2 && a2->from == a1->from) + break; + } + if (a2 == NULL) + return false; + } + return true; +} + /* * compact - construct the compact representation of an NFA */ @@ -2930,7 +3214,9 @@ compact(struct nfa *nfa, cnfa->eos[0] = nfa->eos[0]; cnfa->eos[1] = nfa->eos[1]; cnfa->ncolors = maxcolor(nfa->cm) + 1; - cnfa->flags = 0; + cnfa->flags = nfa->flags; + cnfa->minmatchall = nfa->minmatchall; + cnfa->maxmatchall = nfa->maxmatchall; ca = cnfa->arcs; for (s = nfa->states; s != NULL; s = s->next) @@ -3034,6 +3320,11 @@ dumpnfa(struct nfa *nfa, fprintf(f, ", eos [%ld]", (long) nfa->eos[0]); if (nfa->eos[1] != COLORLESS) fprintf(f, ", eol [%ld]", (long) nfa->eos[1]); + if (nfa->flags & HASLACONS) + fprintf(f, ", haslacons"); + if (nfa->flags & MATCHALL) + fprintf(f, ", minmatchall %d, maxmatchall %d", + nfa->minmatchall, nfa->maxmatchall); fprintf(f, "\n"); for (s = nfa->states; s != NULL; s = s->next) { @@ -3201,6 +3492,9 @@ dumpcnfa(struct cnfa *cnfa, fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]); if (cnfa->flags & HASLACONS) fprintf(f, ", haslacons"); + if (cnfa->flags & MATCHALL) + fprintf(f, ", minmatchall %d, maxmatchall %d", + cnfa->minmatchall, cnfa->maxmatchall); fprintf(f, "\n"); for (st = 0; st < cnfa->nstates; st++) dumpcstate(st, cnfa, f); diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index ae8dbe5819..39e837adc2 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -175,6 +175,11 @@ static void cleanup(struct nfa *); static void markreachable(struct nfa *, struct state *, struct state *, struct state *); static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); static long analyze(struct nfa *); +static void checkmatchall(struct nfa *); +static bool checkmatchall_recurse(struct nfa *, struct state *, + bool, int, bool *); +static bool check_out_colors_match(struct state *, color, color); +static bool check_in_colors_match(struct state *, color, color); static void compact(struct nfa *, struct cnfa *); static void carcsort(struct carc *, size_t); static int carc_cmp(const void *, const void *); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index 32be2592c5..89d162ed6a 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -58,6 +58,29 @@ longest(struct vars *v, if (hitstopp != NULL) *hitstopp = 0; + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = stop - start; + size_t maxmatchall = d->cnfa->maxmatchall; + + if (nchr < d->cnfa->minmatchall) + return NULL; + if (maxmatchall == DUPINF) + { + if (stop == v->stop && hitstopp != NULL) + *hitstopp = 1; + } + else + { + if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL) + *hitstopp = 1; + if (nchr > maxmatchall) + return start + maxmatchall; + } + return stop; + } + /* initialize */ css = initialize(v, d, start); if (css == NULL) @@ -187,6 +210,24 @@ shortest(struct vars *v, if (hitstopp != NULL) *hitstopp = 0; + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = min - start; + + if (d->cnfa->maxmatchall != DUPINF && + nchr > d->cnfa->maxmatchall) + return NULL; + if ((max - start) < d->cnfa->minmatchall) + return NULL; + if (nchr < d->cnfa->minmatchall) + min = start + d->cnfa->minmatchall; + if (coldp != NULL) + *coldp = start; + /* there is no case where we should set *hitstopp */ + return min; + } + /* initialize */ css = initialize(v, d, start); if (css == NULL) @@ -312,6 +353,22 @@ matchuntil(struct vars *v, struct sset *ss; struct colormap *cm = d->cm; + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = probe - v->start; + + /* + * It might seem that we should check maxmatchall too, but the .* at + * the front of the pattern absorbs any extra characters (and it was + * tacked on *after* computing minmatchall/maxmatchall). Thus, we + * should match if there are at least minmatchall characters. + */ + if (nchr < d->cnfa->minmatchall) + return 0; + return 1; + } + /* initialize and startup, or restart, if necessary */ if (cp == NULL || cp > probe) { diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c index e2fbad7a8a..ec435b6f5f 100644 --- a/src/backend/regex/regprefix.c +++ b/src/backend/regex/regprefix.c @@ -77,6 +77,10 @@ pg_regprefix(regex_t *re, assert(g->tree != NULL); cnfa = &g->tree->cnfa; + /* matchall NFAs never have a fixed prefix */ + if (cnfa->flags & MATCHALL) + return REG_NOMATCH; + /* * Since a correct NFA should never contain any exit-free loops, it should * not be possible for our traversal to return to a previously visited NFA diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 6d39108319..82e761bfe5 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -331,6 +331,9 @@ struct nfa struct colormap *cm; /* the color map */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ color eos[2]; /* colors, if any, assigned to EOS and EOL */ + int flags; /* flags to pass forward to cNFA */ + int minmatchall; /* min number of chrs to match, if matchall */ + int maxmatchall; /* max number of chrs to match, or DUPINF */ struct vars *v; /* simplifies compile error reporting */ struct nfa *parent; /* parent NFA, if any */ }; @@ -353,6 +356,14 @@ struct nfa * * Note that in a plain arc, "co" can be RAINBOW; since that's negative, * it doesn't break the rule about how to recognize LACON arcs. + * + * We have special markings for "trivial" NFAs that can match any string + * (possibly with limits on the number of characters therein). In such a + * case, flags & MATCHALL is set (and HASLACONS can't be set). Then the + * fields minmatchall and maxmatchall give the minimum and maximum numbers + * of characters to match. For example, ".*" produces minmatchall = 0 + * and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and + * maxmatchall = DUPINF. */ struct carc { @@ -366,6 +377,7 @@ struct cnfa int ncolors; /* number of colors (max color in use + 1) */ int flags; #define HASLACONS 01 /* uses lookaround constraints */ +#define MATCHALL 02 /* matches all strings of a range of lengths */ int pre; /* setup state number */ int post; /* teardown state number */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ @@ -375,6 +387,9 @@ struct cnfa struct carc **states; /* vector of pointers to outarc lists */ /* states[n] are pointers into a single malloc'd array of arcs */ struct carc *arcs; /* the area for the lists */ + /* these fields are used only in a MATCHALL NFA (else they're -1): */ + int minmatchall; /* min number of chrs to match */ + int maxmatchall; /* max number of chrs to match, or DUPINF */ }; /*