From f68970e33f4dc48094c24c78c452ad730ae9ae12 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 3 May 2021 11:42:31 -0400 Subject: [PATCH] Fix performance issue in new regex match-all detection code. Commit 824bf7190 introduced a new search of the NFAs generated by regex compilation. I failed to think hard about the performance characteristics of that search, with the predictable outcome that it's bad: weird regexes can trigger exponential search time. Worse, there's no check-for-interrupt in that code, so you can't even cancel the query if this happens. Fix by introducing memo-ization of the search results, so that any one NFA state need be examined in detail just once. This potentially uses a lot of memory, but we can bound the memory usage by putting a limit on the number of states for which we'll try to prove match-all-ness. That is sane because we already have a limit (DUPINF) on the maximum finite string length that a matchall regex can match; and patterns that involve much more than DUPINF states would probably exceed that limit anyway. Also, rearrange the logic so that we check the basic is-the-graph- all-RAINBOW-arcs property before we start the recursive search to determine path lengths. This will ensure that we fall out quickly whenever the NFA couldn't possibly be matchall. Also stick in a check-for-interrupt, just in case these measures don't completely eliminate the risk of slowness. Discussion: https://postgr.es/m/3483895.1619898362@sss.pgh.pa.us --- src/backend/regex/regc_nfa.c | 505 ++++++++++++++++++---------- src/backend/regex/regcomp.c | 3 +- src/test/regress/expected/regex.out | 37 ++ src/test/regress/sql/regex.sql | 8 + 4 files changed, 378 insertions(+), 175 deletions(-) diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 77b860cb0f..6d77c59e12 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -3032,96 +3032,189 @@ analyze(struct nfa *nfa) static void checkmatchall(struct nfa *nfa) { - bool hasmatch[DUPINF + 1]; - int minmatch, - maxmatch, - morematch; + bool **haspaths; + struct state *s; + int i; /* - * hasmatch[i] will be set true if a match of length i is feasible, for i - * from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match - * length of DUPINF or more is feasible. + * If there are too many states, don't bother trying to detect matchall. + * This limit serves to bound the time and memory we could consume below. + * Note that even if the graph is all-RAINBOW, if there are significantly + * more than DUPINF states then it's likely that there are paths of length + * more than DUPINF, which would force us to fail anyhow. In practice, + * plausible ways of writing a matchall regex with maximum finite path + * length K tend not to have very many more than K states. */ - memset(hasmatch, 0, sizeof(hasmatch)); + if (nfa->nstates > DUPINF * 2) + return; + + /* + * First, scan all the states to verify that only RAINBOW arcs appear, + * plus pseudocolor arcs adjacent to the pre and post states. This lets + * us quickly eliminate most cases that aren't matchall NFAs. + */ + for (s = nfa->states; s != NULL; s = s->next) + { + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type != PLAIN) + return; /* any LACONs make it non-matchall */ + if (a->co != RAINBOW) + { + if (nfa->cm->cd[a->co].flags & PSEUDO) + { + /* + * Pseudocolor arc: verify it's in a valid place (this + * seems quite unlikely to fail, but let's be sure). + */ + if (s == nfa->pre && + (a->co == nfa->bos[0] || a->co == nfa->bos[1])) + /* okay BOS/BOL arc */ ; + else if (a->to == nfa->post && + (a->co == nfa->eos[0] || a->co == nfa->eos[1])) + /* okay EOS/EOL arc */ ; + else + return; /* unexpected pseudocolor arc */ + /* We'll check these arcs some more below. */ + } + else + return; /* any other color makes it non-matchall */ + } + } + /* Also, assert that the tmp fields are available for use. */ + assert(s->tmp == NULL); + } + + /* + * The next cheapest check we can make is to verify that the BOS/BOL + * outarcs of the pre state reach the same states as its RAINBOW outarcs. + * If they don't, the NFA expresses some constraints on the character + * before the matched string, making it non-matchall. Likewise, the + * EOS/EOL inarcs of the post state must match its RAINBOW inarcs. + */ + if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) || + !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) || + !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) || + !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1])) + return; + + /* + * Initialize an array of path-length arrays, in which + * checkmatchall_recurse will return per-state results. This lets us + * memo-ize the recursive search and avoid exponential time consumption. + */ + haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *)); + if (haspaths == NULL) + return; /* fail quietly */ + memset(haspaths, 0, nfa->nstates * sizeof(bool *)); /* * Recursively search the graph for all-RAINBOW paths to the "post" state, - * starting at the "pre" state. The -1 initial depth accounts for the - * fact that transitions out of the "pre" state are not part of the - * matched string. We likewise don't count the final transition to the - * "post" state as part of the match length. (But we still insist that - * those transitions have RAINBOW arcs, otherwise there are lookbehind or - * lookahead constraints at the start/end of the pattern.) + * starting at the "pre" state, and computing the lengths of the paths. + * (Given the preceding checks, there should be at least one such path. + * However we could get back a false result anyway, in case there are + * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic + * failures such as ENOMEM.) */ - if (!checkmatchall_recurse(nfa, nfa->pre, false, -1, hasmatch)) - return; + if (checkmatchall_recurse(nfa, nfa->pre, haspaths)) + { + /* The useful result is the path length array for the pre state */ + bool *haspath = haspaths[nfa->pre->no]; + int minmatch, + maxmatch, + morematch; - /* - * We found some all-RAINBOW paths, and not anything that we couldn't - * handle. Now verify that pseudocolor arcs adjacent to the pre and post - * states match the RAINBOW arcs there. (We could do this while - * recursing, but it's expensive and unlikely to fail, so do it last.) - */ - if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) || - !check_out_colors_match(nfa->pre, nfa->bos[0], RAINBOW) || - !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) || - !check_out_colors_match(nfa->pre, nfa->bos[1], RAINBOW)) - return; - if (!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) || - !check_in_colors_match(nfa->post, nfa->eos[0], RAINBOW) || - !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]) || - !check_in_colors_match(nfa->post, nfa->eos[1], RAINBOW)) - return; + assert(haspath != NULL); - /* - * hasmatch[] now represents the set of possible match lengths; but we - * want to reduce that to a min and max value, because it doesn't seem - * worth complicating regexec.c to deal with nonconsecutive possible match - * lengths. Find min and max of first run of lengths, then verify there - * are no nonconsecutive lengths. - */ - for (minmatch = 0; minmatch <= DUPINF; minmatch++) - { - if (hasmatch[minmatch]) - break; - } - assert(minmatch <= DUPINF); /* else checkmatchall_recurse lied */ - for (maxmatch = minmatch; maxmatch < DUPINF; maxmatch++) - { - if (!hasmatch[maxmatch + 1]) - break; - } - for (morematch = maxmatch + 1; morematch <= DUPINF; morematch++) - { - if (hasmatch[morematch]) - return; /* fail, there are nonconsecutive lengths */ + /* + * haspath[] now represents the set of possible path lengths; but we + * want to reduce that to a min and max value, because it doesn't seem + * worth complicating regexec.c to deal with nonconsecutive possible + * match lengths. Find min and max of first run of lengths, then + * verify there are no nonconsecutive lengths. + */ + for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++) + { + if (haspath[minmatch]) + break; + } + assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */ + for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++) + { + if (!haspath[maxmatch + 1]) + break; + } + for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++) + { + if (haspath[morematch]) + { + haspath = NULL; /* fail, there are nonconsecutive lengths */ + break; + } + } + + if (haspath != NULL) + { + /* + * Success, so record the info. Here we have a fine point: the + * path length from the pre state includes the pre-to-initial + * transition, so it's one more than the actually matched string + * length. (We avoided counting the final-to-post transition + * within checkmatchall_recurse, but not this one.) This is why + * checkmatchall_recurse allows one more level of path length than + * might seem necessary. This decrement also takes care of + * converting checkmatchall_recurse's definition of "infinity" as + * "DUPINF+1" to our normal representation as "DUPINF". + */ + assert(minmatch > 0); /* else pre and post states were adjacent */ + nfa->minmatchall = minmatch - 1; + nfa->maxmatchall = maxmatch - 1; + nfa->flags |= MATCHALL; + } } - /* Success, so record the info */ - nfa->minmatchall = minmatch; - nfa->maxmatchall = maxmatch; - nfa->flags |= MATCHALL; + /* Clean up */ + for (i = 0; i < nfa->nstates; i++) + { + if (haspaths[i] != NULL) + FREE(haspaths[i]); + } + FREE(haspaths); } /* * checkmatchall_recurse - recursive search for checkmatchall * - * s is the current state - * foundloop is true if any predecessor state has a loop-to-self - * depth is the current recursion depth (starting at -1) - * hasmatch[] is the output area for recording feasible match lengths + * s is the state to be examined in this recursion level. + * haspaths[] is an array of per-state exit path length arrays. * - * We return true if there is at least one all-RAINBOW path to the "post" - * state and no non-matchall paths; otherwise false. Note we assume that - * any dead-end paths have already been removed, else we might return - * false unnecessarily. + * We return true if the search was performed successfully, false if + * we had to fail because of multi-state loops or other internal reasons. + * (Because "dead" states that can't reach the post state have been + * eliminated, and we already verified that only RAINBOW and matching + * pseudocolor arcs exist, every state should have RAINBOW path(s) to + * the post state. Hence we take a false result from recursive calls + * as meaning that we'd better fail altogether, not just that that + * particular state can't reach the post state.) + * + * On success, we store a malloc'd result array in haspaths[s->no], + * showing the possible path lengths from s to the post state. + * Each state's haspath[] array is of length DUPINF+2. The entries from + * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k + * from this state to the string end. haspath[DUPINF+1] is true if all + * path lengths >= DUPINF+1 are possible. (Situations that cannot be + * represented under these rules cause failure.) + * + * checkmatchall is responsible for eventually freeing the haspath[] arrays. */ static bool -checkmatchall_recurse(struct nfa *nfa, struct state *s, - bool foundloop, int depth, - bool *hasmatch) +checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths) { bool result = false; + bool foundloop = false; + bool *haspath; struct arc *a; /* @@ -3131,61 +3224,20 @@ checkmatchall_recurse(struct nfa *nfa, struct state *s, if (STACK_TOO_DEEP(nfa->v->re)) return false; - /* - * Likewise, if we get to a depth too large to represent correctly in - * maxmatchall, fail quietly. - */ - if (depth >= DUPINF) - return false; - - /* - * Scan the outarcs to detect cases we can't handle, and to see if there - * is a loop-to-self here. We need to know about any such loop before we - * recurse, so it's hard to avoid making two passes over the outarcs. In - * any case, checking for showstoppers before we recurse is probably best. - */ - for (a = s->outs; a != NULL; a = a->outchain) + /* In case the search takes a long time, check for cancel */ + if (CANCEL_REQUESTED(nfa->v->re)) { - if (a->type != PLAIN) - return false; /* any LACONs make it non-matchall */ - if (a->co != RAINBOW) - { - if (nfa->cm->cd[a->co].flags & PSEUDO) - { - /* - * Pseudocolor arc: verify it's in a valid place (this seems - * quite unlikely to fail, but let's be sure). - */ - if (s == nfa->pre && - (a->co == nfa->bos[0] || a->co == nfa->bos[1])) - /* okay BOS/BOL arc */ ; - else if (a->to == nfa->post && - (a->co == nfa->eos[0] || a->co == nfa->eos[1])) - /* okay EOS/EOL arc */ ; - else - return false; /* unexpected pseudocolor arc */ - /* We'll finish checking these arcs after the recursion */ - continue; - } - return false; /* any other color makes it non-matchall */ - } - if (a->to == s) - { - /* - * We found a cycle of length 1, so remember that to pass down to - * successor states. (It doesn't matter if there was also such a - * loop at a predecessor state.) - */ - foundloop = true; - } - else if (a->to->tmp) - { - /* We found a cycle of length > 1, so fail. */ - return false; - } + NERR(REG_CANCEL); + return false; } - /* We need to recurse, so mark state as under consideration */ + /* Create a haspath array for this state */ + haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool)); + if (haspath == NULL) + return false; /* again, treat as non-matchall */ + memset(haspath, 0, (DUPINF + 2) * sizeof(bool)); + + /* Mark this state as being visited */ assert(s->tmp == NULL); s->tmp = s; @@ -3197,95 +3249,202 @@ checkmatchall_recurse(struct nfa *nfa, struct state *s, { /* We found an all-RAINBOW path to the post state */ result = true; - /* ... which should not be adjacent to the pre state */ - if (depth < 0) - { - NERR(REG_ASSERT); - return false; - } - /* Record potential match lengths */ - hasmatch[depth] = true; - if (foundloop) - { - /* A predecessor loop makes all larger lengths match, too */ - int i; - for (i = depth + 1; i <= DUPINF; i++) - hasmatch[i] = true; - } + /* + * Mark this state as being zero steps away from the string end + * (the transition to the post state isn't counted). + */ + haspath[0] = true; } - else if (a->to != s) + else if (a->to == s) { - /* This is a new path forward; recurse to investigate */ - result = checkmatchall_recurse(nfa, a->to, - foundloop, depth + 1, - hasmatch); - /* Fail if any recursive path fails */ - if (!result) + /* We found a cycle of length 1, which we'll deal with below. */ + foundloop = true; + } + else if (a->to->tmp != NULL) + { + /* It's busy, so we found a cycle of length > 1, so fail. */ + result = false; + break; + } + else + { + /* Consider paths forward through this to-state. */ + bool *nexthaspath; + int i; + + /* If to-state was not already visited, recurse */ + if (haspaths[a->to->no] == NULL) + { + result = checkmatchall_recurse(nfa, a->to, haspaths); + /* Fail if any recursive path fails */ + if (!result) + break; + } + else + { + /* The previous visit must have found path(s) to the end */ + result = true; + } + assert(a->to->tmp == NULL); + nexthaspath = haspaths[a->to->no]; + assert(nexthaspath != NULL); + + /* + * Now, for every path of length i from a->to to the string end, + * there is a path of length i + 1 from s to the string end. + */ + if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1]) + { + /* + * a->to has a path of length exactly DUPINF, but not longer; + * or it has paths of all lengths > DUPINF but not one of + * exactly that length. In either case, we cannot represent + * the possible path lengths from s correctly, so fail. + */ + result = false; break; + } + /* Merge knowledge of these path lengths into what we have */ + for (i = 0; i < DUPINF; i++) + haspath[i + 1] |= nexthaspath[i]; + /* Infinity + 1 is still infinity */ + haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1]; } } + if (result && foundloop) + { + /* + * If there is a length-1 loop at this state, then find the shortest + * known path length to the end. The loop means that every larger + * path length is possible, too. (It doesn't matter whether any of + * the longer lengths were already known possible.) + */ + int i; + + for (i = 0; i <= DUPINF; i++) + { + if (haspath[i]) + break; + } + for (i++; i <= DUPINF + 1; i++) + haspath[i] = true; + } + + /* Report out the completed path length map */ + assert(s->no < nfa->nstates); + assert(haspaths[s->no] == NULL); + haspaths[s->no] = haspath; + + /* Mark state no longer busy */ s->tmp = NULL; + return result; } /* * check_out_colors_match - subroutine for checkmatchall * - * Check if every s outarc of color co1 has a matching outarc of color co2. - * (checkmatchall_recurse already verified that all of the outarcs are PLAIN, - * so we need not examine arc types here. Also, since it verified that there - * are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for - * this brute-force O(N^2) implementation to cause problems.) + * Check whether the set of states reachable from s by arcs of color co1 + * is equivalent to the set reachable by arcs of color co2. + * checkmatchall already verified that all of the NFA's arcs are PLAIN, + * so we need not examine arc types here. */ static bool check_out_colors_match(struct state *s, color co1, color co2) { - struct arc *a1; - struct arc *a2; + bool result = true; + struct arc *a; - for (a1 = s->outs; a1 != NULL; a1 = a1->outchain) + /* + * To do this in linear time, we assume that the NFA contains no duplicate + * arcs. Run through the out-arcs, marking states reachable by arcs of + * color co1. Run through again, un-marking states reachable by arcs of + * color co2; if we see a not-marked state, we know this co2 arc is + * unmatched. Then run through again, checking for still-marked states, + * and in any case leaving all the tmp fields reset to NULL. + */ + for (a = s->outs; a != NULL; a = a->outchain) { - if (a1->co != co1) - continue; - for (a2 = s->outs; a2 != NULL; a2 = a2->outchain) + if (a->co == co1) { - if (a2->co == co2 && a2->to == a1->to) - break; + assert(a->to->tmp == NULL); + a->to->tmp = a->to; } - if (a2 == NULL) - return false; } - return true; + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->co == co2) + { + if (a->to->tmp != NULL) + a->to->tmp = NULL; + else + result = false; /* unmatched co2 arc */ + } + } + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->co == co1) + { + if (a->to->tmp != NULL) + { + result = false; /* unmatched co1 arc */ + a->to->tmp = NULL; + } + } + } + return result; } /* * check_in_colors_match - subroutine for checkmatchall * - * Check if every s inarc of color co1 has a matching inarc of color co2. - * (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still - * not expecting very many arcs.) + * Check whether the set of states that can reach s by arcs of color co1 + * is equivalent to the set that can reach s by arcs of color co2. + * checkmatchall already verified that all of the NFA's arcs are PLAIN, + * so we need not examine arc types here. */ static bool check_in_colors_match(struct state *s, color co1, color co2) { - struct arc *a1; - struct arc *a2; + bool result = true; + struct arc *a; - for (a1 = s->ins; a1 != NULL; a1 = a1->inchain) + /* + * Identical algorithm to check_out_colors_match, except examine the + * from-states of s' inarcs. + */ + for (a = s->ins; a != NULL; a = a->inchain) { - if (a1->type != PLAIN || a1->co != co1) - continue; - for (a2 = s->ins; a2 != NULL; a2 = a2->inchain) + if (a->co == co1) { - if (a2->type == PLAIN && a2->co == co2 && a2->from == a1->from) - break; + assert(a->from->tmp == NULL); + a->from->tmp = a->from; } - if (a2 == NULL) - return false; } - return true; + for (a = s->ins; a != NULL; a = a->inchain) + { + if (a->co == co2) + { + if (a->from->tmp != NULL) + a->from->tmp = NULL; + else + result = false; /* unmatched co2 arc */ + } + } + for (a = s->ins; a != NULL; a = a->inchain) + { + if (a->co == co1) + { + if (a->from->tmp != NULL) + { + result = false; /* unmatched co1 arc */ + a->from->tmp = NULL; + } + } + } + return result; } /* diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index ba8dd86464..9f71177d31 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -182,8 +182,7 @@ static void markreachable(struct nfa *, struct state *, struct state *, struct s static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); static long analyze(struct nfa *); static void checkmatchall(struct nfa *); -static bool checkmatchall_recurse(struct nfa *, struct state *, - bool, int, bool *); +static bool checkmatchall_recurse(struct nfa *, struct state *, bool **); static bool check_out_colors_match(struct state *, color, color); static bool check_in_colors_match(struct state *, color, color); static void compact(struct nfa *, struct cnfa *); diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out index 0923ad9b5b..86477cc506 100644 --- a/src/test/regress/expected/regex.out +++ b/src/test/regress/expected/regex.out @@ -567,6 +567,43 @@ select 'a' ~ '()+\1'; t (1 row) +-- Add coverage for some cases in checkmatchall +select regexp_match('xy', '.|...'); + regexp_match +-------------- + {x} +(1 row) + +select regexp_match('xyz', '.|...'); + regexp_match +-------------- + {xyz} +(1 row) + +select regexp_match('xy', '.*'); + regexp_match +-------------- + {xy} +(1 row) + +select regexp_match('fooba', '(?:..)*'); + regexp_match +-------------- + {foob} +(1 row) + +select regexp_match('xyz', repeat('.', 260)); + regexp_match +-------------- + +(1 row) + +select regexp_match('foo', '(?:.|){99}'); + regexp_match +-------------- + {foo} +(1 row) + -- Error conditions select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs ERROR: invalid regular expression: invalid backreference number diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql index a1742240c4..b03a8d9ac2 100644 --- a/src/test/regress/sql/regex.sql +++ b/src/test/regress/sql/regex.sql @@ -135,6 +135,14 @@ select 'a' ~ '.. ()|\1'; select 'a' ~ '()*\1'; select 'a' ~ '()+\1'; +-- Add coverage for some cases in checkmatchall +select regexp_match('xy', '.|...'); +select regexp_match('xyz', '.|...'); +select regexp_match('xy', '.*'); +select regexp_match('fooba', '(?:..)*'); +select regexp_match('xyz', repeat('.', 260)); +select regexp_match('foo', '(?:.|){99}'); + -- Error conditions select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs select 'xyz' ~ 'x(\w)(?=(\1))';