postgres/src/backend/regex/regexec.c

/*
 * re_*exec and friends - match REs
 *
 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
 *
 * Development of this software was funded, in part, by Cray Research Inc.,
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 * Corporation, none of whom are responsible for the results.  The author
 * thanks all of them.
 *
 * Redistribution and use in source and binary forms -- with or without
 * modification -- are permitted for any purpose, provided that
 * redistributions in source form retain this entire copyright notice and
 * indicate the origin and nature of any modifications.
 *
 * I'd appreciate being given credit for this package in the documentation
 * of software which uses it, but that is not a requirement.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * src/backend/regex/regexec.c
 *
 */

#include "regex/regguts.h"


/* lazy-DFA representation */
struct arcp
{								/* "pointer" to an outarc */
	struct sset *ss;
	color		co;
};

struct sset
{								/* state set */
	unsigned   *states;			/* pointer to bitvector */
	unsigned	hash;			/* hash of bitvector */
#define  HASH(bv, nw)	 (((nw) == 1) ? *(bv) : hash(bv, nw))
#define  HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
		memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
	int			flags;
#define  STARTER	 01			/* the initial state set */
#define  POSTSTATE	 02			/* includes the goal state */
#define  LOCKED		 04			/* locked in cache */
#define  NOPROGRESS  010		/* zero-progress state set */
	struct arcp ins;			/* chain of inarcs pointing here */
	chr		   *lastseen;		/* last entered on arrival here */
	struct sset **outs;			/* outarc vector indexed by color */
	struct arcp *inchain;		/* chain-pointer vector for outarcs */
};

struct dfa
{
	int			nssets;			/* size of cache */
	int			nssused;		/* how many entries occupied yet */
	int			nstates;		/* number of states */
	int			ncolors;		/* length of outarc and inchain vectors */
	int			wordsper;		/* length of state-set bitvectors */
	struct sset *ssets;			/* state-set cache */
	unsigned   *statesarea;		/* bitvector storage */
	unsigned   *work;			/* pointer to work area within statesarea */
	struct sset **outsarea;		/* outarc-vector storage */
	struct arcp *incarea;		/* inchain storage */
	struct cnfa *cnfa;
	struct colormap *cm;
	chr		   *lastpost;		/* location of last cache-flushed success */
	chr		   *lastnopr;		/* location of last cache-flushed NOPROGRESS */
	struct sset *search;		/* replacement-search-pointer memory */
	int			backno;			/* if DFA for a backref, subno it refers to */
	short		backmin;		/* min repetitions for backref */
	short		backmax;		/* max repetitions for backref */
	bool		ismalloced;		/* should this struct dfa be freed? */
	bool		arraysmalloced; /* should its subsidiary arrays be freed? */
};

#define WORK	1				/* number of work bitvectors needed */

/* setup for non-malloc allocation for small cases */
#define FEWSTATES	20			/* must be less than UBITS */
#define FEWCOLORS	15
struct smalldfa
{
	struct dfa	dfa;			/* must be first */
	struct sset ssets[FEWSTATES * 2];
	unsigned	statesarea[FEWSTATES * 2 + WORK];
	struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
	struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
};

#define DOMALLOC	((struct smalldfa *)NULL)	/* force malloc */


/* internal variables, bundled for easy passing around */
struct vars
{
	regex_t    *re;
	struct guts *g;
	int			eflags;			/* copies of arguments */
	size_t		nmatch;
	regmatch_t *pmatch;
	rm_detail_t *details;
	chr		   *start;			/* start of string */
	chr		   *search_start;	/* search start of string */
	chr		   *stop;			/* just past end of string */
	int			err;			/* error code if any (0 none) */
	struct dfa **subdfas;		/* per-tree-subre DFAs */
	struct dfa **ladfas;		/* per-lacon-subre DFAs */
	struct sset **lblastcss;	/* per-lacon-subre lookbehind restart data */
	chr		  **lblastcp;		/* per-lacon-subre lookbehind restart data */
	struct smalldfa dfa1;
	struct smalldfa dfa2;
};

#define VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
#define ISERR() VISERR(v)
#define VERR(vv,e)	((vv)->err = ((vv)->err ? (vv)->err : (e)))
#define ERR(e)	VERR(v, e)		/* record an error */
#define NOERR() {if (ISERR()) return v->err;}	/* if error seen, return it */
#define OFF(p)	((p) - v->start)
#define LOFF(p) ((long)OFF(p))


/*
 * forward declarations
 */
/* === regexec.c === */
static struct dfa *getsubdfa(struct vars *v, struct subre *t);
static struct dfa *getladfa(struct vars *v, int n);
static int	find(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
static int	cfind(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
static int	cfindloop(struct vars *v, struct cnfa *cnfa, struct colormap *cm,
					  struct dfa *d, struct dfa *s, chr **coldp);
static void zapallsubs(regmatch_t *p, size_t n);
static void zaptreesubs(struct vars *v, struct subre *t);
static void subset(struct vars *v, struct subre *sub, chr *begin, chr *end);
static int	cdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
static int	ccondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
static int	crevcondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
static int	cbrdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
static int	caltdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
static int	citerdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
static int	creviterdissect(struct vars *v, struct subre *t, chr *begin, chr *end);

/* === rege_dfa.c === */
static chr *longest(struct vars *v, struct dfa *d,
					chr *start, chr *stop, int *hitstopp);
static chr *shortest(struct vars *v, struct dfa *d, chr *start, chr *min,
					 chr *max, chr **coldp, int *hitstopp);
static int	matchuntil(struct vars *v, struct dfa *d, chr *probe,
					   struct sset **lastcss, chr **lastcp);
static chr *dfa_backref(struct vars *v, struct dfa *d, chr *start,
						chr *min, chr *max, bool shortest);
static chr *lastcold(struct vars *v, struct dfa *d);
static struct dfa *newdfa(struct vars *v, struct cnfa *cnfa,
						  struct colormap *cm, struct smalldfa *sml);
static void freedfa(struct dfa *d);
static unsigned hash(unsigned *uv, int n);
static struct sset *initialize(struct vars *v, struct dfa *d, chr *start);
static struct sset *miss(struct vars *v, struct dfa *d, struct sset *css,
						 color co, chr *cp, chr *start);
static int	lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co);
static struct sset *getvacant(struct vars *v, struct dfa *d, chr *cp,
							  chr *start);
static struct sset *pickss(struct vars *v, struct dfa *d, chr *cp,
						   chr *start);


/*
 * pg_regexec - match regular expression
 */
int
pg_regexec(regex_t *re,
		   const chr *string,
		   size_t len,
		   size_t search_start,
		   rm_detail_t *details,
		   size_t nmatch,
		   regmatch_t pmatch[],
		   int flags)
{
	struct vars var;
	struct vars *v = &var;
	int			st;
	size_t		n;
	size_t		i;
	int			backref;

#define  LOCALMAT	 20
	regmatch_t	mat[LOCALMAT];

#define  LOCALDFAS	 40
	struct dfa *subdfas[LOCALDFAS];

	/* sanity checks */
	if (re == NULL || string == NULL || re->re_magic != REMAGIC)
		return REG_INVARG;
	if (re->re_csize != sizeof(chr))
		return REG_MIXED;
	if (search_start > len)
		return REG_NOMATCH;

	/* Initialize locale-dependent support */
	pg_set_regex_collation(re->re_collation);

	/* setup */
	v->re = re;
	v->g = (struct guts *) re->re_guts;
	if ((v->g->cflags & REG_EXPECT) && details == NULL)
		return REG_INVARG;
	if (v->g->info & REG_UIMPOSSIBLE)
		return REG_NOMATCH;
	backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
	v->eflags = flags;
	if (backref && nmatch <= v->g->nsub)
	{
		/* need larger work area */
		v->nmatch = v->g->nsub + 1;
		if (v->nmatch <= LOCALMAT)
			v->pmatch = mat;
		else
			v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
		if (v->pmatch == NULL)
			return REG_ESPACE;
		zapallsubs(v->pmatch, v->nmatch);
	}
	else
	{
		/* we can store results directly in caller's array */
		v->pmatch = pmatch;
		/* ensure any extra entries in caller's array are filled with -1 */
		if (nmatch > 0)
			zapallsubs(pmatch, nmatch);
		/* then forget about extra entries, to avoid useless work in find() */
		if (nmatch > v->g->nsub + 1)
			nmatch = v->g->nsub + 1;
		v->nmatch = nmatch;
	}
	v->details = details;
	v->start = (chr *) string;
	v->search_start = (chr *) string + search_start;
	v->stop = (chr *) string + len;
	v->err = 0;
	v->subdfas = NULL;
	v->ladfas = NULL;
	v->lblastcss = NULL;
	v->lblastcp = NULL;
	/* below this point, "goto cleanup" will behave sanely */

	assert(v->g->ntree >= 0);
	n = (size_t) v->g->ntree;
	if (n <= LOCALDFAS)
		v->subdfas = subdfas;
	else
	{
		v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
		if (v->subdfas == NULL)
		{
			st = REG_ESPACE;
			goto cleanup;
		}
	}
	for (i = 0; i < n; i++)
		v->subdfas[i] = NULL;

	assert(v->g->nlacons >= 0);
	n = (size_t) v->g->nlacons;
	if (n > 0)
	{
		v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
		if (v->ladfas == NULL)
		{
			st = REG_ESPACE;
			goto cleanup;
		}
		for (i = 0; i < n; i++)
			v->ladfas[i] = NULL;
		v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
		v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
		if (v->lblastcss == NULL || v->lblastcp == NULL)
		{
			st = REG_ESPACE;
			goto cleanup;
		}
		for (i = 0; i < n; i++)
		{
			v->lblastcss[i] = NULL;
			v->lblastcp[i] = NULL;
		}
	}

	/* do it */
	assert(v->g->tree != NULL);
	if (backref)
		st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
	else
		st = find(v, &v->g->tree->cnfa, &v->g->cmap);

	/* on success, ensure caller's match vector is filled correctly */
	if (st == REG_OKAY && nmatch > 0)
	{
		if (v->pmatch != pmatch)
		{
			/* copy portion of match vector over from (larger) work area */
			assert(nmatch <= v->nmatch);
			memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
		}
		if (v->g->cflags & REG_NOSUB)
		{
			/* don't expose possibly-partial sub-match results to caller */
			zapallsubs(pmatch, nmatch);
		}
	}

	/* clean up */
cleanup:
	if (v->pmatch != pmatch && v->pmatch != mat)
		FREE(v->pmatch);
	if (v->subdfas != NULL)
	{
		n = (size_t) v->g->ntree;
		for (i = 0; i < n; i++)
		{
			if (v->subdfas[i] != NULL)
				freedfa(v->subdfas[i]);
		}
		if (v->subdfas != subdfas)
			FREE(v->subdfas);
	}
	if (v->ladfas != NULL)
	{
		n = (size_t) v->g->nlacons;
		for (i = 0; i < n; i++)
		{
			if (v->ladfas[i] != NULL)
				freedfa(v->ladfas[i]);
		}
		FREE(v->ladfas);
	}
	if (v->lblastcss != NULL)
		FREE(v->lblastcss);
	if (v->lblastcp != NULL)
		FREE(v->lblastcp);

#ifdef REG_DEBUG
	if (v->eflags & (REG_FTRACE | REG_MTRACE))
		fflush(stdout);
#endif

	return st;
}

/*
 * getsubdfa - create or re-fetch the DFA for a tree subre node
 *
 * We only need to create the DFA once per overall regex execution.
 * The DFA will be freed by the cleanup step in pg_regexec().
 */
static struct dfa *
getsubdfa(struct vars *v,
		  struct subre *t)
{
	struct dfa *d = v->subdfas[t->id];

	if (d == NULL)
	{
		d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
		if (d == NULL)
			return NULL;
		/* set up additional info if this is a backref node */
		if (t->op == 'b')
		{
			d->backno = t->backno;
			d->backmin = t->min;
			d->backmax = t->max;
		}
		v->subdfas[t->id] = d;
	}
	return d;
}

/*
 * getladfa - create or re-fetch the DFA for a LACON subre node
 *
 * Same as above, but for LACONs.
 */
static struct dfa *
getladfa(struct vars *v,
		 int n)
{
	assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);

	if (v->ladfas[n] == NULL)
	{
		struct subre *sub = &v->g->lacons[n];

		v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
		/* a LACON can't contain a backref, so nothing else to do */
	}
	return v->ladfas[n];
}

/*
 * find - find a match for the main NFA (no-complications case)
 */
static int
find(struct vars *v,
	 struct cnfa *cnfa,
	 struct colormap *cm)
{
	struct dfa *s;
	struct dfa *d;
	chr		   *begin;
	chr		   *end = NULL;
	chr		   *cold;
	chr		   *open;			/* open and close of range of possible starts */
	chr		   *close;
	int			hitend;
	int			shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;

	/* first, a shot with the search RE */
	s = newdfa(v, &v->g->search, cm, &v->dfa1);
	if (s == NULL)
		return v->err;
	MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
	cold = NULL;
	close = shortest(v, s, v->search_start, v->search_start, v->stop,
					 &cold, (int *) NULL);
	freedfa(s);
	NOERR();
	if (v->g->cflags & REG_EXPECT)
	{
		assert(v->details != NULL);
		if (cold != NULL)
			v->details->rm_extend.rm_so = OFF(cold);
		else
			v->details->rm_extend.rm_so = OFF(v->stop);
		v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
	}
	if (close == NULL)			/* not found */
		return REG_NOMATCH;
	if (v->nmatch == 0)			/* found, don't need exact location */
		return REG_OKAY;

	/* find starting point and match */
	assert(cold != NULL);
	open = cold;
	cold = NULL;
	MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
	d = newdfa(v, cnfa, cm, &v->dfa1);
	if (d == NULL)
		return v->err;
	for (begin = open; begin <= close; begin++)
	{
		MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
		if (shorter)
			end = shortest(v, d, begin, begin, v->stop,
						   (chr **) NULL, &hitend);
		else
			end = longest(v, d, begin, v->stop, &hitend);
		if (ISERR())
		{
			freedfa(d);
			return v->err;
		}
		if (hitend && cold == NULL)
			cold = begin;
		if (end != NULL)
			break;				/* NOTE BREAK OUT */
	}
	assert(end != NULL);		/* search RE succeeded so loop should */
	freedfa(d);

	/* and pin down details */
	assert(v->nmatch > 0);
	v->pmatch[0].rm_so = OFF(begin);
	v->pmatch[0].rm_eo = OFF(end);
	if (v->g->cflags & REG_EXPECT)
	{
		if (cold != NULL)
			v->details->rm_extend.rm_so = OFF(cold);
		else
			v->details->rm_extend.rm_so = OFF(v->stop);
		v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
	}
	if (v->nmatch == 1)			/* no need for submatches */
		return REG_OKAY;

	/* find submatches */
	return cdissect(v, v->g->tree, begin, end);
}

/*
 * cfind - find a match for the main NFA (with complications)
 */
static int
cfind(struct vars *v,
	  struct cnfa *cnfa,
	  struct colormap *cm)
{
	struct dfa *s;
	struct dfa *d;
	chr		   *cold;
	int			ret;

	s = newdfa(v, &v->g->search, cm, &v->dfa1);
	if (s == NULL)
		return v->err;
	d = newdfa(v, cnfa, cm, &v->dfa2);
	if (d == NULL)
	{
		freedfa(s);
		return v->err;
	}

	ret = cfindloop(v, cnfa, cm, d, s, &cold);

	freedfa(d);
	freedfa(s);
	NOERR();
	if (v->g->cflags & REG_EXPECT)
	{
		assert(v->details != NULL);
		if (cold != NULL)
			v->details->rm_extend.rm_so = OFF(cold);
		else
			v->details->rm_extend.rm_so = OFF(v->stop);
		v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
	}
	return ret;
}

/*
 * cfindloop - the heart of cfind
 */
static int
cfindloop(struct vars *v,
		  struct cnfa *cnfa,
		  struct colormap *cm,
		  struct dfa *d,
		  struct dfa *s,
		  chr **coldp)			/* where to put coldstart pointer */
{
	chr		   *begin;
	chr		   *end;
	chr		   *cold;
	chr		   *open;			/* open and close of range of possible starts */
	chr		   *close;
	chr		   *estart;
	chr		   *estop;
	int			er;
	int			shorter = v->g->tree->flags & SHORTER;
	int			hitend;

	assert(d != NULL && s != NULL);
	cold = NULL;
	close = v->search_start;
	do
	{
		/* Search with the search RE for match range at/beyond "close" */
		MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
		close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
		if (ISERR())
		{
			*coldp = cold;
			return v->err;
		}
		if (close == NULL)
			break;				/* no more possible match anywhere */
		assert(cold != NULL);
		open = cold;
		cold = NULL;
		/* Search for matches starting between "open" and "close" inclusive */
		MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
		for (begin = open; begin <= close; begin++)
		{
			MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
			estart = begin;
			estop = v->stop;
			for (;;)
			{
				/* Here we use the top node's detailed RE */
				if (shorter)
					end = shortest(v, d, begin, estart,
								   estop, (chr **) NULL, &hitend);
				else
					end = longest(v, d, begin, estop,
								  &hitend);
				if (ISERR())
				{
					*coldp = cold;
					return v->err;
				}
				if (hitend && cold == NULL)
					cold = begin;
				if (end == NULL)
					break;		/* no match with this begin point, try next */
				MDEBUG(("tentative end %ld\n", LOFF(end)));
				/* Dissect the potential match to see if it really matches */
				er = cdissect(v, v->g->tree, begin, end);
				if (er == REG_OKAY)
				{
					if (v->nmatch > 0)
					{
						v->pmatch[0].rm_so = OFF(begin);
						v->pmatch[0].rm_eo = OFF(end);
					}
					*coldp = cold;
					return REG_OKAY;
				}
				if (er != REG_NOMATCH)
				{
					ERR(er);
					*coldp = cold;
					return er;
				}
				/* Try next longer/shorter match with same begin point */
				if (shorter)
				{
					if (end == estop)
						break;	/* no more, so try next begin point */
					estart = end + 1;
				}
				else
				{
					if (end == begin)
						break;	/* no more, so try next begin point */
					estop = end - 1;
				}
			}					/* end loop over endpoint positions */
		}						/* end loop over beginning positions */

		/*
		 * If we get here, there is no possible match starting at or before
		 * "close", so consider matches beyond that.  We'll do a fresh search
		 * with the search RE to find a new promising match range.
		 */
		close++;
	} while (close < v->stop);

	*coldp = cold;
	return REG_NOMATCH;
}

/*
 * zapallsubs - initialize all subexpression matches to "no match"
 *
 * Note that p[0], the overall-match location, is not touched.
 */
static void
zapallsubs(regmatch_t *p,
		   size_t n)
{
	size_t		i;

	for (i = n - 1; i > 0; i--)
	{
		p[i].rm_so = -1;
		p[i].rm_eo = -1;
	}
}

/*
 * zaptreesubs - initialize subexpressions within subtree to "no match"
 */
static void
zaptreesubs(struct vars *v,
			struct subre *t)
{
	int			n = t->capno;
	struct subre *t2;

	if (n > 0)
	{
		if ((size_t) n < v->nmatch)
		{
			v->pmatch[n].rm_so = -1;
			v->pmatch[n].rm_eo = -1;
		}
	}

	for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
		zaptreesubs(v, t2);
}

/*
 * subset - set subexpression match data for a successful subre
 */
static void
subset(struct vars *v,
	   struct subre *sub,
	   chr *begin,
	   chr *end)
{
	int			n = sub->capno;

	assert(n > 0);
	if ((size_t) n >= v->nmatch)
		return;

	MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
	v->pmatch[n].rm_so = OFF(begin);
	v->pmatch[n].rm_eo = OFF(end);
}

/*
 * cdissect - check backrefs and determine subexpression matches
 *
 * cdissect recursively processes a subre tree to check matching of backrefs
 * and/or identify submatch boundaries for capture nodes.  The proposed match
 * runs from "begin" to "end" (not including "end"), and we are basically
 * "dissecting" it to see where the submatches are.
 *
 * Before calling any level of cdissect, the caller must have run the node's
 * DFA and found that the proposed substring satisfies the DFA.  (We make
 * the caller do that because in concatenation and iteration nodes, it's
 * much faster to check all the substrings against the child DFAs before we
 * recurse.)
 *
 * A side-effect of a successful match is to save match locations for
 * capturing subexpressions in v->pmatch[].  This is a little bit tricky,
 * so we make the following rules:
 * 1. Before initial entry to cdissect, all match data must have been
 *    cleared (this is seen to by zapallsubs).
 * 2. Before any recursive entry to cdissect, the match data for that
 *    subexpression tree must be guaranteed clear (see zaptreesubs).
 * 3. When returning REG_OKAY, each level of cdissect will have saved
 *    any relevant match locations.
 * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
 *    that its subexpression match locations are again clear.
 * 5. No guarantees are made for error cases (i.e., other result codes).
 * 6. When a level of cdissect abandons a successful sub-match, it will
 *    clear that subtree's match locations with zaptreesubs before trying
 *    any new DFA match or cdissect call for that subtree or any subtree
 *    to its right (that is, any subtree that could have a backref into the
 *    abandoned match).
 * This may seem overly complicated, but it's difficult to simplify it
 * because of the provision that match locations must be reset before
 * any fresh DFA match (a rule that is needed to make dfa_backref safe).
 * That means it won't work to just reset relevant match locations at the
 * start of each cdissect level.
 */
static int						/* regexec return code */
cdissect(struct vars *v,
		 struct subre *t,
		 chr *begin,			/* beginning of relevant substring */
		 chr *end)				/* end of same */
{
	int			er;

	assert(t != NULL);
	MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));

	/* handy place to check for operation cancel */
	if (CANCEL_REQUESTED(v->re))
		return REG_CANCEL;
	/* ... and stack overrun */
	if (STACK_TOO_DEEP(v->re))
		return REG_ETOOBIG;

	switch (t->op)
	{
		case '=':				/* terminal node */
			assert(t->child == NULL);
			er = REG_OKAY;		/* no action, parent did the work */
			break;
		case 'b':				/* back reference */
			assert(t->child == NULL);
			er = cbrdissect(v, t, begin, end);
			break;
		case '.':				/* concatenation */
			assert(t->child != NULL);
			if (t->child->flags & SHORTER)	/* reverse scan */
				er = crevcondissect(v, t, begin, end);
			else
				er = ccondissect(v, t, begin, end);
			break;
		case '|':				/* alternation */
			assert(t->child != NULL);
			er = caltdissect(v, t, begin, end);
			break;
		case '*':				/* iteration */
			assert(t->child != NULL);
			if (t->child->flags & SHORTER)	/* reverse scan */
				er = creviterdissect(v, t, begin, end);
			else
				er = citerdissect(v, t, begin, end);
			break;
		case '(':				/* no-op capture node */
			assert(t->child != NULL);
			er = cdissect(v, t->child, begin, end);
			break;
		default:
			er = REG_ASSERT;
			break;
	}

	/*
	 * We should never have a match failure unless backrefs lurk below;
	 * otherwise, either caller failed to check the DFA, or there's some
	 * inconsistency between the DFA and the node's innards.
	 */
	assert(er != REG_NOMATCH || (t->flags & BACKR));

	/*
	 * If this node is marked as capturing, save successful match's location.
	 */
	if (t->capno > 0 && er == REG_OKAY)
		subset(v, t, begin, end);

	return er;
}

/*
 * ccondissect - dissect match for concatenation node
 */
static int						/* regexec return code */
ccondissect(struct vars *v,
			struct subre *t,
			chr *begin,			/* beginning of relevant substring */
			chr *end)			/* end of same */
{
	struct subre *left = t->child;
	struct subre *right = left->sibling;
	struct dfa *d;
	struct dfa *d2;
	chr		   *mid;
	int			er;

	assert(t->op == '.');
	assert(left != NULL && left->cnfa.nstates > 0);
	assert(right != NULL && right->cnfa.nstates > 0);
	assert(right->sibling == NULL);
	assert(!(left->flags & SHORTER));

	d = getsubdfa(v, left);
	NOERR();
	d2 = getsubdfa(v, right);
	NOERR();
	MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));

	/* pick a tentative midpoint */
	mid = longest(v, d, begin, end, (int *) NULL);
	NOERR();
	if (mid == NULL)
		return REG_NOMATCH;
	MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));

	/* iterate until satisfaction or failure */
	for (;;)
	{
		/* try this midpoint on for size */
		if (longest(v, d2, mid, end, (int *) NULL) == end)
		{
			er = cdissect(v, left, begin, mid);
			if (er == REG_OKAY)
			{
				er = cdissect(v, right, mid, end);
				if (er == REG_OKAY)
				{
					/* satisfaction */
					MDEBUG(("%d: successful\n", t->id));
					return REG_OKAY;
				}
				/* Reset left's matches (right should have done so itself) */
				zaptreesubs(v, left);
			}
			if (er != REG_NOMATCH)
				return er;
		}
		NOERR();

		/* that midpoint didn't work, find a new one */
		if (mid == begin)
		{
			/* all possibilities exhausted */
			MDEBUG(("%d: no midpoint\n", t->id));
			return REG_NOMATCH;
		}
		mid = longest(v, d, begin, mid - 1, (int *) NULL);
		NOERR();
		if (mid == NULL)
		{
			/* failed to find a new one */
			MDEBUG(("%d: failed midpoint\n", t->id));
			return REG_NOMATCH;
		}
		MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
	}

	/* can't get here */
	return REG_ASSERT;
}

/*
 * crevcondissect - dissect match for concatenation node, shortest-first
 */
static int						/* regexec return code */
crevcondissect(struct vars *v,
			   struct subre *t,
			   chr *begin,		/* beginning of relevant substring */
			   chr *end)		/* end of same */
{
	struct subre *left = t->child;
	struct subre *right = left->sibling;
	struct dfa *d;
	struct dfa *d2;
	chr		   *mid;
	int			er;

	assert(t->op == '.');
	assert(left != NULL && left->cnfa.nstates > 0);
	assert(right != NULL && right->cnfa.nstates > 0);
	assert(right->sibling == NULL);
	assert(left->flags & SHORTER);

	d = getsubdfa(v, left);
	NOERR();
	d2 = getsubdfa(v, right);
	NOERR();
	MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));

	/* pick a tentative midpoint */
	mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
	NOERR();
	if (mid == NULL)
		return REG_NOMATCH;
	MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));

	/* iterate until satisfaction or failure */
	for (;;)
	{
		/* try this midpoint on for size */
		if (longest(v, d2, mid, end, (int *) NULL) == end)
		{
			er = cdissect(v, left, begin, mid);
			if (er == REG_OKAY)
			{
				er = cdissect(v, right, mid, end);
				if (er == REG_OKAY)
				{
					/* satisfaction */
					MDEBUG(("%d: successful\n", t->id));
					return REG_OKAY;
				}
				/* Reset left's matches (right should have done so itself) */
				zaptreesubs(v, left);
			}
			if (er != REG_NOMATCH)
				return er;
		}
		NOERR();

		/* that midpoint didn't work, find a new one */
		if (mid == end)
		{
			/* all possibilities exhausted */
			MDEBUG(("%d: no midpoint\n", t->id));
			return REG_NOMATCH;
		}
		mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
		NOERR();
		if (mid == NULL)
		{
			/* failed to find a new one */
			MDEBUG(("%d: failed midpoint\n", t->id));
			return REG_NOMATCH;
		}
		MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
	}

	/* can't get here */
	return REG_ASSERT;
}

/*
 * cbrdissect - dissect match for backref node
 *
 * The backref match might already have been verified by dfa_backref(),
 * but we don't know that for sure so must check it here.
 */
static int						/* regexec return code */
cbrdissect(struct vars *v,
		   struct subre *t,
		   chr *begin,			/* beginning of relevant substring */
		   chr *end)			/* end of same */
{
	int			n = t->backno;
	size_t		numreps;
	size_t		tlen;
	size_t		brlen;
	chr		   *brstring;
	chr		   *p;
	int			min = t->min;
	int			max = t->max;

	assert(t != NULL);
	assert(t->op == 'b');
	assert(n >= 0);
	assert((size_t) n < v->nmatch);

	MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
			LOFF(begin), LOFF(end)));

	/* get the backreferenced string */
	if (v->pmatch[n].rm_so == -1)
		return REG_NOMATCH;
	brstring = v->start + v->pmatch[n].rm_so;
	brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;

	/* special cases for zero-length strings */
	if (brlen == 0)
	{
		/*
		 * matches only if target is zero length, but any number of
		 * repetitions can be considered to be present
		 */
		if (begin == end && min <= max)
		{
			MDEBUG(("%d: backref matched trivially\n", t->id));
			return REG_OKAY;
		}
		return REG_NOMATCH;
	}
	if (begin == end)
	{
		/* matches only if zero repetitions are okay */
		if (min == 0)
		{
			MDEBUG(("%d: backref matched trivially\n", t->id));
			return REG_OKAY;
		}
		return REG_NOMATCH;
	}

	/*
	 * check target length to see if it could possibly be an allowed number of
	 * repetitions of brstring
	 */
	assert(end > begin);
	tlen = end - begin;
	if (tlen % brlen != 0)
		return REG_NOMATCH;
	numreps = tlen / brlen;
	if (numreps < min || (numreps > max && max != DUPINF))
		return REG_NOMATCH;

	/* okay, compare the actual string contents */
	p = begin;
	while (numreps-- > 0)
	{
		if ((*v->g->compare) (brstring, p, brlen) != 0)
			return REG_NOMATCH;
		p += brlen;
	}

	MDEBUG(("%d: backref matched\n", t->id));
	return REG_OKAY;
}

/*
 * caltdissect - dissect match for alternation node
 */
static int						/* regexec return code */
caltdissect(struct vars *v,
			struct subre *t,
			chr *begin,			/* beginning of relevant substring */
			chr *end)			/* end of same */
{
	struct dfa *d;
	int			er;

	assert(t->op == '|');

	t = t->child;
	/* there should be at least 2 alternatives */
	assert(t != NULL && t->sibling != NULL);

	while (t != NULL)
	{
		assert(t->cnfa.nstates > 0);

		MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));

		d = getsubdfa(v, t);
		NOERR();
		if (longest(v, d, begin, end, (int *) NULL) == end)
		{
			MDEBUG(("%d: caltdissect matched\n", t->id));
			er = cdissect(v, t, begin, end);
			if (er != REG_NOMATCH)
				return er;
		}
		NOERR();

		t = t->sibling;
	}

	return REG_NOMATCH;
}

/*
 * citerdissect - dissect match for iteration node
 */
static int						/* regexec return code */
citerdissect(struct vars *v,
			 struct subre *t,
			 chr *begin,		/* beginning of relevant substring */
			 chr *end)			/* end of same */
{
	struct dfa *d;
	chr		  **endpts;
	chr		   *limit;
	int			min_matches;
	size_t		max_matches;
	int			nverified;
	int			k;
	int			i;
	int			er;

	assert(t->op == '*');
	assert(t->child != NULL && t->child->cnfa.nstates > 0);
	assert(!(t->child->flags & SHORTER));
	assert(begin <= end);

	MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));

	/*
	 * For the moment, assume the minimum number of matches is 1.  If zero
	 * matches are allowed, and the target string is empty, we are allowed to
	 * match regardless of the contents of the iter node --- but we would
	 * prefer to match once, so that capturing parens get set.  (An example of
	 * the concern here is a pattern like "()*\1", which historically this
	 * code has allowed to succeed.)  Therefore, we deal with the zero-matches
	 * case at the bottom, after failing to find any other way to match.
	 */
	min_matches = t->min;
	if (min_matches <= 0)
		min_matches = 1;

	/*
	 * We need workspace to track the endpoints of each sub-match.  Normally
	 * we consider only nonzero-length sub-matches, so there can be at most
	 * end-begin of them.  However, if min is larger than that, we will also
	 * consider zero-length sub-matches in order to find enough matches.
	 *
	 * For convenience, endpts[0] contains the "begin" pointer and we store
	 * sub-match endpoints in endpts[1..max_matches].
	 */
	max_matches = end - begin;
	if (max_matches > t->max && t->max != DUPINF)
		max_matches = t->max;
	if (max_matches < min_matches)
		max_matches = min_matches;
	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
	if (endpts == NULL)
		return REG_ESPACE;
	endpts[0] = begin;

	d = getsubdfa(v, t->child);
	if (ISERR())
	{
		FREE(endpts);
		return v->err;
	}

	/*
	 * Our strategy is to first find a set of sub-match endpoints that are
	 * valid according to the child node's DFA, and then recursively dissect
	 * each sub-match to confirm validity.  If any validity check fails,
	 * backtrack that sub-match and try again.  And, when we next try for a
	 * validity check, we need not recheck any successfully verified
	 * sub-matches that we didn't move the endpoints of.  nverified remembers
	 * how many sub-matches are currently known okay.
	 */

	/* initialize to consider first sub-match */
	nverified = 0;
	k = 1;
	limit = end;

	/* iterate until satisfaction or failure */
	while (k > 0)
	{
		/* try to find an endpoint for the k'th sub-match */
		endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
		if (ISERR())
		{
			FREE(endpts);
			return v->err;
		}
		if (endpts[k] == NULL)
		{
			/* no match possible, so see if we can shorten previous one */
			k--;
			goto backtrack;
		}
		MDEBUG(("%d: working endpoint %d: %ld\n",
				t->id, k, LOFF(endpts[k])));

		/* k'th sub-match can no longer be considered verified */
		if (nverified >= k)
			nverified = k - 1;

		if (endpts[k] != end)
		{
			/* haven't reached end yet, try another iteration if allowed */
			if (k >= max_matches)
			{
				/* must try to shorten some previous match */
				k--;
				goto backtrack;
			}

			/* reject zero-length match unless necessary to achieve min */
			if (endpts[k] == endpts[k - 1] &&
				(k >= min_matches || min_matches - k < end - endpts[k]))
				goto backtrack;

			k++;
			limit = end;
			continue;
		}

		/*
		 * We've identified a way to divide the string into k sub-matches that
		 * works so far as the child DFA can tell.  If k is an allowed number
		 * of matches, start the slow part: recurse to verify each sub-match.
		 * We always have k <= max_matches, needn't check that.
		 */
		if (k < min_matches)
			goto backtrack;

		MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));

		for (i = nverified + 1; i <= k; i++)
		{
			/* zap any match data from a non-last iteration */
			zaptreesubs(v, t->child);
			er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
			if (er == REG_OKAY)
			{
				nverified = i;
				continue;
			}
			if (er == REG_NOMATCH)
				break;
			/* oops, something failed */
			FREE(endpts);
			return er;
		}

		if (i > k)
		{
			/* satisfaction */
			MDEBUG(("%d: successful\n", t->id));
			FREE(endpts);
			return REG_OKAY;
		}

		/* i'th match failed to verify, so backtrack it */
		k = i;

backtrack:

		/*
		 * Must consider shorter versions of the k'th sub-match.  However,
		 * we'll only ask for a zero-length match if necessary.
		 */
		while (k > 0)
		{
			chr		   *prev_end = endpts[k - 1];

			if (endpts[k] > prev_end)
			{
				limit = endpts[k] - 1;
				if (limit > prev_end ||
					(k < min_matches && min_matches - k >= end - prev_end))
				{
					/* break out of backtrack loop, continue the outer one */
					break;
				}
			}
			/* can't shorten k'th sub-match any more, consider previous one */
			k--;
		}
	}

	/* all possibilities exhausted */
	FREE(endpts);

	/*
	 * Now consider the possibility that we can match to a zero-length string
	 * by using zero repetitions.
	 */
	if (t->min == 0 && begin == end)
	{
		MDEBUG(("%d: allowing zero matches\n", t->id));
		return REG_OKAY;
	}

	MDEBUG(("%d: failed\n", t->id));
	return REG_NOMATCH;
}

/*
 * creviterdissect - dissect match for iteration node, shortest-first
 */
static int						/* regexec return code */
creviterdissect(struct vars *v,
				struct subre *t,
				chr *begin,		/* beginning of relevant substring */
				chr *end)		/* end of same */
{
	struct dfa *d;
	chr		  **endpts;
	chr		   *limit;
	int			min_matches;
	size_t		max_matches;
	int			nverified;
	int			k;
	int			i;
	int			er;

	assert(t->op == '*');
	assert(t->child != NULL && t->child->cnfa.nstates > 0);
	assert(t->child->flags & SHORTER);
	assert(begin <= end);

	MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));

	/*
	 * If zero matches are allowed, and target string is empty, just declare
	 * victory.  OTOH, if target string isn't empty, zero matches can't work
	 * so we pretend the min is 1.
	 */
	min_matches = t->min;
	if (min_matches <= 0)
	{
		if (begin == end)
		{
			MDEBUG(("%d: allowing zero matches\n", t->id));
			return REG_OKAY;
		}
		min_matches = 1;
	}

	/*
	 * We need workspace to track the endpoints of each sub-match.  Normally
	 * we consider only nonzero-length sub-matches, so there can be at most
	 * end-begin of them.  However, if min is larger than that, we will also
	 * consider zero-length sub-matches in order to find enough matches.
	 *
	 * For convenience, endpts[0] contains the "begin" pointer and we store
	 * sub-match endpoints in endpts[1..max_matches].
	 */
	max_matches = end - begin;
	if (max_matches > t->max && t->max != DUPINF)
		max_matches = t->max;
	if (max_matches < min_matches)
		max_matches = min_matches;
	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
	if (endpts == NULL)
		return REG_ESPACE;
	endpts[0] = begin;

	d = getsubdfa(v, t->child);
	if (ISERR())
	{
		FREE(endpts);
		return v->err;
	}

	/*
	 * Our strategy is to first find a set of sub-match endpoints that are
	 * valid according to the child node's DFA, and then recursively dissect
	 * each sub-match to confirm validity.  If any validity check fails,
	 * backtrack that sub-match and try again.  And, when we next try for a
	 * validity check, we need not recheck any successfully verified
	 * sub-matches that we didn't move the endpoints of.  nverified remembers
	 * how many sub-matches are currently known okay.
	 */

	/* initialize to consider first sub-match */
	nverified = 0;
	k = 1;
	limit = begin;

	/* iterate until satisfaction or failure */
	while (k > 0)
	{
		/* disallow zero-length match unless necessary to achieve min */
		if (limit == endpts[k - 1] &&
			limit != end &&
			(k >= min_matches || min_matches - k < end - limit))
			limit++;

		/* if this is the last allowed sub-match, it must reach to the end */
		if (k >= max_matches)
			limit = end;

		/* try to find an endpoint for the k'th sub-match */
		endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
							 (chr **) NULL, (int *) NULL);
		if (ISERR())
		{
			FREE(endpts);
			return v->err;
		}
		if (endpts[k] == NULL)
		{
			/* no match possible, so see if we can lengthen previous one */
			k--;
			goto backtrack;
		}
		MDEBUG(("%d: working endpoint %d: %ld\n",
				t->id, k, LOFF(endpts[k])));

		/* k'th sub-match can no longer be considered verified */
		if (nverified >= k)
			nverified = k - 1;

		if (endpts[k] != end)
		{
			/* haven't reached end yet, try another iteration if allowed */
			if (k >= max_matches)
			{
				/* must try to lengthen some previous match */
				k--;
				goto backtrack;
			}

			k++;
			limit = endpts[k - 1];
			continue;
		}

		/*
		 * We've identified a way to divide the string into k sub-matches that
		 * works so far as the child DFA can tell.  If k is an allowed number
		 * of matches, start the slow part: recurse to verify each sub-match.
		 * We always have k <= max_matches, needn't check that.
		 */
		if (k < min_matches)
			goto backtrack;

		MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));

		for (i = nverified + 1; i <= k; i++)
		{
			/* zap any match data from a non-last iteration */
			zaptreesubs(v, t->child);
			er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
			if (er == REG_OKAY)
			{
				nverified = i;
				continue;
			}
			if (er == REG_NOMATCH)
				break;
			/* oops, something failed */
			FREE(endpts);
			return er;
		}

		if (i > k)
		{
			/* satisfaction */
			MDEBUG(("%d: successful\n", t->id));
			FREE(endpts);
			return REG_OKAY;
		}

		/* i'th match failed to verify, so backtrack it */
		k = i;

backtrack:

		/*
		 * Must consider longer versions of the k'th sub-match.
		 */
		while (k > 0)
		{
			if (endpts[k] < end)
			{
				limit = endpts[k] + 1;
				/* break out of backtrack loop, continue the outer one */
				break;
			}
			/* can't lengthen k'th sub-match any more, consider previous one */
			k--;
		}
	}

	/* all possibilities exhausted */
	MDEBUG(("%d: failed\n", t->id));
	FREE(endpts);
	return REG_NOMATCH;
}


#include "rege_dfa.c"