Allow complemented character class escapes within regex brackets.
The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors involved.

* Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
This commit is contained in:
parent
6b40d9bdbd
commit
2a0af7fe46
@ -6097,6 +6097,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
|
||||
non-ASCII characters to belong to any of these classes.)
|
||||
In addition to these standard character
|
||||
classes, <productname>PostgreSQL</productname> defines
|
||||
the <literal>word</literal> character class, which is the same as
|
||||
<literal>alnum</literal> plus the underscore (<literal>_</literal>)
|
||||
character, and
|
||||
the <literal>ascii</literal> character class, which contains exactly
|
||||
the 7-bit ASCII set.
|
||||
</para>
|
||||
@ -6108,9 +6111,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
|
||||
matching empty strings at the beginning
|
||||
and end of a word respectively. A word is defined as a sequence
|
||||
of word characters that is neither preceded nor followed by word
|
||||
characters. A word character is an <literal>alnum</literal> character (as
|
||||
defined by the <acronym>POSIX</acronym> character class described above)
|
||||
or an underscore. This is an extension, compatible with but not
|
||||
characters. A word character is any character belonging to the
|
||||
<literal>word</literal> character class, that is, any letter, digit,
|
||||
or underscore. This is an extension, compatible with but not
|
||||
specified by <acronym>POSIX</acronym> 1003.2, and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
The constraint escapes described below are usually preferable; they
|
||||
@ -6330,8 +6333,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
|
||||
|
||||
<row>
|
||||
<entry> <literal>\w</literal> </entry>
|
||||
<entry> <literal>[[:alnum:]_]</literal>
|
||||
(note underscore is included) </entry>
|
||||
<entry> <literal>[[:word:]]</literal> </entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
@ -6346,21 +6348,18 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
|
||||
|
||||
<row>
|
||||
<entry> <literal>\W</literal> </entry>
|
||||
<entry> <literal>[^[:alnum:]_]</literal>
|
||||
(note underscore is included) </entry>
|
||||
<entry> <literal>[^[:word:]]</literal> </entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
<para>
|
||||
Within bracket expressions, <literal>\d</literal>, <literal>\s</literal>,
|
||||
and <literal>\w</literal> lose their outer brackets,
|
||||
and <literal>\D</literal>, <literal>\S</literal>, and <literal>\W</literal> are illegal.
|
||||
(So, for example, <literal>[a-c\d]</literal> is equivalent to
|
||||
The class-shorthand escapes also work within bracket expressions,
|
||||
although the definitions shown above are not quite syntactically
|
||||
valid in that context.
|
||||
For example, <literal>[a-c\d]</literal> is equivalent to
|
||||
<literal>[a-c[:digit:]]</literal>.
|
||||
Also, <literal>[a-c\D]</literal>, which is equivalent to
|
||||
<literal>[a-c^[:digit:]]</literal>, is illegal.)
|
||||
</para>
|
||||
|
||||
<table id="posix-constraint-escapes-table">
|
||||
|
@ -519,15 +519,10 @@ character classes:
|
||||
(note underscore)
|
||||
.RE
|
||||
.PP
|
||||
Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
|
||||
and `\fB\ew\fR'\&
|
||||
lose their outer brackets,
|
||||
and `\fB\eD\fR', `\fB\eS\fR',
|
||||
and `\fB\eW\fR'\&
|
||||
are illegal.
|
||||
.VS 8.2
|
||||
(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
|
||||
Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
|
||||
The class-shorthand escapes also work within bracket expressions,
|
||||
although the definitions shown above are not quite syntactically
|
||||
valid in that context.
|
||||
For example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
|
||||
.VE 8.2
|
||||
.PP
|
||||
A constraint escape (AREs only) is a constraint,
|
||||
|
@ -936,7 +936,16 @@ okcolors(struct nfa *nfa,
|
||||
}
|
||||
else if (cd->nschrs == 0 && cd->nuchrs == 0)
|
||||
{
|
||||
/* parent empty, its arcs change color to subcolor */
|
||||
/*
|
||||
* Parent is now empty, so just change all its arcs to the
|
||||
* subcolor, then free the parent.
|
||||
*
|
||||
* It is not obvious that simply relabeling the arcs like this is
|
||||
* OK; it appears to risk creating duplicate arcs. We are
|
||||
* basically relying on the assumption that processing of a
|
||||
* bracket expression can't create arcs of both a color and its
|
||||
* subcolor between the bracket's endpoints.
|
||||
*/
|
||||
cd->sub = NOSUB;
|
||||
scd = &cm->cd[sco];
|
||||
assert(scd->nschrs > 0 || scd->nuchrs > 0);
|
||||
@ -1062,6 +1071,7 @@ colorcomplement(struct nfa *nfa,
|
||||
struct colordesc *cd;
|
||||
struct colordesc *end = CDEND(cm);
|
||||
color co;
|
||||
struct arc *a;
|
||||
|
||||
assert(of != from);
|
||||
|
||||
@ -1069,10 +1079,26 @@ colorcomplement(struct nfa *nfa,
|
||||
if (findarc(of, PLAIN, RAINBOW) != NULL)
|
||||
return;
|
||||
|
||||
/* Otherwise, transiently mark the colors that appear in of's out-arcs */
|
||||
for (a = of->outs; a != NULL; a = a->outchain)
|
||||
{
|
||||
if (a->type == PLAIN)
|
||||
{
|
||||
assert(a->co >= 0);
|
||||
cd = &cm->cd[a->co];
|
||||
assert(!UNUSEDCOLOR(cd));
|
||||
cd->flags |= COLMARK;
|
||||
}
|
||||
}
|
||||
|
||||
/* Scan colors, clear transient marks, add arcs for unmarked colors */
|
||||
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
|
||||
if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
|
||||
if (findarc(of, PLAIN, co) == NULL)
|
||||
newarc(nfa, type, co, from, to);
|
||||
{
|
||||
if (cd->flags & COLMARK)
|
||||
cd->flags &= ~COLMARK;
|
||||
else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
|
||||
newarc(nfa, type, co, from, to);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -193,83 +193,6 @@ prefixes(struct vars *v)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* lexnest - "call a subroutine", interpolating string at the lexical level
|
||||
*
|
||||
* Note, this is not a very general facility. There are a number of
|
||||
* implicit assumptions about what sorts of strings can be subroutines.
|
||||
*/
|
||||
static void
|
||||
lexnest(struct vars *v,
|
||||
const chr *beginp, /* start of interpolation */
|
||||
const chr *endp) /* one past end of interpolation */
|
||||
{
|
||||
assert(v->savenow == NULL); /* only one level of nesting */
|
||||
v->savenow = v->now;
|
||||
v->savestop = v->stop;
|
||||
v->now = beginp;
|
||||
v->stop = endp;
|
||||
}
|
||||
|
||||
/*
|
||||
* string constants to interpolate as expansions of things like \d
|
||||
*/
|
||||
static const chr backd[] = { /* \d */
|
||||
CHR('['), CHR('['), CHR(':'),
|
||||
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr backD[] = { /* \D */
|
||||
CHR('['), CHR('^'), CHR('['), CHR(':'),
|
||||
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr brbackd[] = { /* \d within brackets */
|
||||
CHR('['), CHR(':'),
|
||||
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
|
||||
CHR(':'), CHR(']')
|
||||
};
|
||||
static const chr backs[] = { /* \s */
|
||||
CHR('['), CHR('['), CHR(':'),
|
||||
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr backS[] = { /* \S */
|
||||
CHR('['), CHR('^'), CHR('['), CHR(':'),
|
||||
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr brbacks[] = { /* \s within brackets */
|
||||
CHR('['), CHR(':'),
|
||||
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
|
||||
CHR(':'), CHR(']')
|
||||
};
|
||||
static const chr backw[] = { /* \w */
|
||||
CHR('['), CHR('['), CHR(':'),
|
||||
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
|
||||
CHR(':'), CHR(']'), CHR('_'), CHR(']')
|
||||
};
|
||||
static const chr backW[] = { /* \W */
|
||||
CHR('['), CHR('^'), CHR('['), CHR(':'),
|
||||
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
|
||||
CHR(':'), CHR(']'), CHR('_'), CHR(']')
|
||||
};
|
||||
static const chr brbackw[] = { /* \w within brackets */
|
||||
CHR('['), CHR(':'),
|
||||
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
|
||||
CHR(':'), CHR(']'), CHR('_')
|
||||
};
|
||||
|
||||
/*
|
||||
* lexword - interpolate a bracket expression for word characters
|
||||
* Possibly ought to inquire whether there is a "word" character class.
|
||||
*/
|
||||
static void
|
||||
lexword(struct vars *v)
|
||||
{
|
||||
lexnest(v, backw, ENDOF(backw));
|
||||
}
|
||||
|
||||
/*
|
||||
* next - get next token
|
||||
*/
|
||||
@ -292,14 +215,6 @@ next(struct vars *v)
|
||||
RETV(SBEGIN, 0); /* same as \A */
|
||||
}
|
||||
|
||||
/* if we're nested and we've hit end, return to outer level */
|
||||
if (v->savenow != NULL && ATEOS())
|
||||
{
|
||||
v->now = v->savenow;
|
||||
v->stop = v->savestop;
|
||||
v->savenow = v->savestop = NULL;
|
||||
}
|
||||
|
||||
/* skip white space etc. if appropriate (not in literal or []) */
|
||||
if (v->cflags & REG_EXPANDED)
|
||||
switch (v->lexcon)
|
||||
@ -420,32 +335,15 @@ next(struct vars *v)
|
||||
NOTE(REG_UNONPOSIX);
|
||||
if (ATEOS())
|
||||
FAILW(REG_EESCAPE);
|
||||
(DISCARD) lexescape(v);
|
||||
if (!lexescape(v))
|
||||
return 0;
|
||||
switch (v->nexttype)
|
||||
{ /* not all escapes okay here */
|
||||
case PLAIN:
|
||||
case CCLASSS:
|
||||
case CCLASSC:
|
||||
return 1;
|
||||
break;
|
||||
case CCLASS:
|
||||
switch (v->nextvalue)
|
||||
{
|
||||
case 'd':
|
||||
lexnest(v, brbackd, ENDOF(brbackd));
|
||||
break;
|
||||
case 's':
|
||||
lexnest(v, brbacks, ENDOF(brbacks));
|
||||
break;
|
||||
case 'w':
|
||||
lexnest(v, brbackw, ENDOF(brbackw));
|
||||
break;
|
||||
default:
|
||||
FAILW(REG_EESCAPE);
|
||||
break;
|
||||
}
|
||||
/* lexnest done, back up and try again */
|
||||
v->nexttype = v->lasttype;
|
||||
return next(v);
|
||||
break;
|
||||
}
|
||||
/* not one of the acceptable escapes */
|
||||
FAILW(REG_EESCAPE);
|
||||
@ -691,49 +589,17 @@ next(struct vars *v)
|
||||
}
|
||||
RETV(PLAIN, *v->now++);
|
||||
}
|
||||
(DISCARD) lexescape(v);
|
||||
if (ISERR())
|
||||
FAILW(REG_EESCAPE);
|
||||
if (v->nexttype == CCLASS)
|
||||
{ /* fudge at lexical level */
|
||||
switch (v->nextvalue)
|
||||
{
|
||||
case 'd':
|
||||
lexnest(v, backd, ENDOF(backd));
|
||||
break;
|
||||
case 'D':
|
||||
lexnest(v, backD, ENDOF(backD));
|
||||
break;
|
||||
case 's':
|
||||
lexnest(v, backs, ENDOF(backs));
|
||||
break;
|
||||
case 'S':
|
||||
lexnest(v, backS, ENDOF(backS));
|
||||
break;
|
||||
case 'w':
|
||||
lexnest(v, backw, ENDOF(backw));
|
||||
break;
|
||||
case 'W':
|
||||
lexnest(v, backW, ENDOF(backW));
|
||||
break;
|
||||
default:
|
||||
assert(NOTREACHED);
|
||||
FAILW(REG_ASSERT);
|
||||
break;
|
||||
}
|
||||
/* lexnest done, back up and try again */
|
||||
v->nexttype = v->lasttype;
|
||||
return next(v);
|
||||
}
|
||||
/* otherwise, lexescape has already done the work */
|
||||
return !ISERR();
|
||||
return lexescape(v);
|
||||
}
|
||||
|
||||
/*
|
||||
* lexescape - parse an ARE backslash escape (backslash already eaten)
|
||||
* Note slightly nonstandard use of the CCLASS type code.
|
||||
*
|
||||
* This is used for ARE backslashes both normally and inside bracket
|
||||
* expressions. In the latter case, not all escape types are allowed,
|
||||
* but the caller must reject unwanted ones after we return.
|
||||
*/
|
||||
static int /* not actually used, but convenient for RETV */
|
||||
static int
|
||||
lexescape(struct vars *v)
|
||||
{
|
||||
chr c;
|
||||
@ -775,11 +641,11 @@ lexescape(struct vars *v)
|
||||
break;
|
||||
case CHR('d'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'd');
|
||||
RETV(CCLASSS, CC_DIGIT);
|
||||
break;
|
||||
case CHR('D'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'D');
|
||||
RETV(CCLASSC, CC_DIGIT);
|
||||
break;
|
||||
case CHR('e'):
|
||||
NOTE(REG_UUNPORT);
|
||||
@ -802,11 +668,11 @@ lexescape(struct vars *v)
|
||||
break;
|
||||
case CHR('s'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 's');
|
||||
RETV(CCLASSS, CC_SPACE);
|
||||
break;
|
||||
case CHR('S'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'S');
|
||||
RETV(CCLASSC, CC_SPACE);
|
||||
break;
|
||||
case CHR('t'):
|
||||
RETV(PLAIN, CHR('\t'));
|
||||
@ -828,11 +694,11 @@ lexescape(struct vars *v)
|
||||
break;
|
||||
case CHR('w'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'w');
|
||||
RETV(CCLASSS, CC_WORD);
|
||||
break;
|
||||
case CHR('W'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'W');
|
||||
RETV(CCLASSC, CC_WORD);
|
||||
break;
|
||||
case CHR('x'):
|
||||
NOTE(REG_UUNPORT);
|
||||
|
@ -350,17 +350,13 @@ static const struct cname
|
||||
};
|
||||
|
||||
/*
|
||||
* The following arrays define the valid character class names.
|
||||
* The following array defines the valid character class names.
|
||||
* The entries must match enum char_classes in regguts.h.
|
||||
*/
|
||||
static const char *const classNames[NUM_CCLASSES + 1] = {
|
||||
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
|
||||
"lower", "print", "punct", "space", "upper", "xdigit", NULL
|
||||
};
|
||||
|
||||
enum classes
|
||||
{
|
||||
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
|
||||
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
|
||||
"lower", "print", "punct", "space", "upper", "xdigit", "word",
|
||||
NULL
|
||||
};
|
||||
|
||||
/*
|
||||
@ -536,7 +532,36 @@ eclass(struct vars *v, /* context */
|
||||
}
|
||||
|
||||
/*
|
||||
* cclass - supply cvec for a character class
|
||||
* lookupcclass - lookup a character class identified by name
|
||||
*
|
||||
* On failure, sets an error code in *v; the result is then garbage.
|
||||
*/
|
||||
static enum char_classes
|
||||
lookupcclass(struct vars *v, /* context (for returning errors) */
|
||||
const chr *startp, /* where the name starts */
|
||||
const chr *endp) /* just past the end of the name */
|
||||
{
|
||||
size_t len;
|
||||
const char *const *namePtr;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Map the name to the corresponding enumerated value.
|
||||
*/
|
||||
len = endp - startp;
|
||||
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
|
||||
{
|
||||
if (strlen(*namePtr) == len &&
|
||||
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
|
||||
return (enum char_classes) i;
|
||||
}
|
||||
|
||||
ERR(REG_ECTYPE);
|
||||
return (enum char_classes) 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* cclasscvec - supply cvec for a character class
|
||||
*
|
||||
* Must include case counterparts if "cases" is true.
|
||||
*
|
||||
@ -545,45 +570,20 @@ eclass(struct vars *v, /* context */
|
||||
* because callers are not supposed to explicitly free the result either way.
|
||||
*/
|
||||
static struct cvec *
|
||||
cclass(struct vars *v, /* context */
|
||||
const chr *startp, /* where the name starts */
|
||||
const chr *endp, /* just past the end of the name */
|
||||
int cases) /* case-independent? */
|
||||
cclasscvec(struct vars *v, /* context */
|
||||
enum char_classes cclasscode, /* class to build a cvec for */
|
||||
int cases) /* case-independent? */
|
||||
{
|
||||
size_t len;
|
||||
struct cvec *cv = NULL;
|
||||
const char *const *namePtr;
|
||||
int i,
|
||||
index;
|
||||
|
||||
/*
|
||||
* Map the name to the corresponding enumerated value.
|
||||
*/
|
||||
len = endp - startp;
|
||||
index = -1;
|
||||
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
|
||||
{
|
||||
if (strlen(*namePtr) == len &&
|
||||
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
|
||||
{
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (index == -1)
|
||||
{
|
||||
ERR(REG_ECTYPE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remap lower and upper to alpha if the match is case insensitive.
|
||||
*/
|
||||
|
||||
if (cases &&
|
||||
((enum classes) index == CC_LOWER ||
|
||||
(enum classes) index == CC_UPPER))
|
||||
index = (int) CC_ALPHA;
|
||||
(cclasscode == CC_LOWER ||
|
||||
cclasscode == CC_UPPER))
|
||||
cclasscode = CC_ALPHA;
|
||||
|
||||
/*
|
||||
* Now compute the character class contents. For classes that are based
|
||||
@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */
|
||||
* NB: keep this code in sync with cclass_column_index(), below.
|
||||
*/
|
||||
|
||||
switch ((enum classes) index)
|
||||
switch (cclasscode)
|
||||
{
|
||||
case CC_PRINT:
|
||||
cv = pg_ctype_get_cache(pg_wc_isprint, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
|
||||
break;
|
||||
case CC_ALNUM:
|
||||
cv = pg_ctype_get_cache(pg_wc_isalnum, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
|
||||
break;
|
||||
case CC_ALPHA:
|
||||
cv = pg_ctype_get_cache(pg_wc_isalpha, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
|
||||
break;
|
||||
case CC_WORD:
|
||||
cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
|
||||
break;
|
||||
case CC_ASCII:
|
||||
/* hard-wired meaning */
|
||||
@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */
|
||||
addrange(cv, 0x7f, 0x9f);
|
||||
break;
|
||||
case CC_DIGIT:
|
||||
cv = pg_ctype_get_cache(pg_wc_isdigit, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
|
||||
break;
|
||||
case CC_PUNCT:
|
||||
cv = pg_ctype_get_cache(pg_wc_ispunct, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
|
||||
break;
|
||||
case CC_XDIGIT:
|
||||
|
||||
@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */
|
||||
}
|
||||
break;
|
||||
case CC_SPACE:
|
||||
cv = pg_ctype_get_cache(pg_wc_isspace, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
|
||||
break;
|
||||
case CC_LOWER:
|
||||
cv = pg_ctype_get_cache(pg_wc_islower, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
|
||||
break;
|
||||
case CC_UPPER:
|
||||
cv = pg_ctype_get_cache(pg_wc_isupper, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
|
||||
break;
|
||||
case CC_GRAPH:
|
||||
cv = pg_ctype_get_cache(pg_wc_isgraph, index);
|
||||
cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c)
|
||||
|
||||
/*
|
||||
* Note: we should not see requests to consider cclasses that are not
|
||||
* treated as locale-specific by cclass(), above.
|
||||
* treated as locale-specific by cclasscvec(), above.
|
||||
*/
|
||||
if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
|
||||
colnum |= cm->classbits[CC_PRINT];
|
||||
@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c)
|
||||
colnum |= cm->classbits[CC_ALNUM];
|
||||
if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
|
||||
colnum |= cm->classbits[CC_ALPHA];
|
||||
if (cm->classbits[CC_WORD] && pg_wc_isword(c))
|
||||
colnum |= cm->classbits[CC_WORD];
|
||||
assert(cm->classbits[CC_ASCII] == 0);
|
||||
assert(cm->classbits[CC_BLANK] == 0);
|
||||
assert(cm->classbits[CC_CNTRL] == 0);
|
||||
|
@ -400,6 +400,15 @@ pg_wc_isalnum(pg_wchar c)
|
||||
return 0; /* can't get here, but keep compiler quiet */
|
||||
}
|
||||
|
||||
static int
|
||||
pg_wc_isword(pg_wchar c)
|
||||
{
|
||||
/* We define word characters as alnum class plus underscore */
|
||||
if (c == CHR('_'))
|
||||
return 1;
|
||||
return pg_wc_isalnum(c);
|
||||
}
|
||||
|
||||
static int
|
||||
pg_wc_isupper(pg_wchar c)
|
||||
{
|
||||
|
@ -46,13 +46,18 @@ static struct subre *parsebranch(struct vars *, int, int, struct state *, struct
|
||||
static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
|
||||
static void nonword(struct vars *, int, struct state *, struct state *);
|
||||
static void word(struct vars *, int, struct state *, struct state *);
|
||||
static void charclass(struct vars *, enum char_classes,
|
||||
struct state *, struct state *);
|
||||
static void charclasscomplement(struct vars *, enum char_classes,
|
||||
struct state *, struct state *);
|
||||
static int scannum(struct vars *);
|
||||
static void repeat(struct vars *, struct state *, struct state *, int, int);
|
||||
static void bracket(struct vars *, struct state *, struct state *);
|
||||
static void cbracket(struct vars *, struct state *, struct state *);
|
||||
static void brackpart(struct vars *, struct state *, struct state *);
|
||||
static void brackpart(struct vars *, struct state *, struct state *, bool *);
|
||||
static const chr *scanplain(struct vars *);
|
||||
static void onechr(struct vars *, chr, struct state *, struct state *);
|
||||
static void optimizebracket(struct vars *, struct state *, struct state *);
|
||||
static void wordchrs(struct vars *);
|
||||
static void processlacon(struct vars *, struct state *, struct state *, int,
|
||||
struct state *, struct state *);
|
||||
@ -81,8 +86,6 @@ static const char *stid(struct subre *, char *, size_t);
|
||||
/* === regc_lex.c === */
|
||||
static void lexstart(struct vars *);
|
||||
static void prefixes(struct vars *);
|
||||
static void lexnest(struct vars *, const chr *, const chr *);
|
||||
static void lexword(struct vars *);
|
||||
static int next(struct vars *);
|
||||
static int lexescape(struct vars *);
|
||||
static chr lexdigits(struct vars *, int, int, int);
|
||||
@ -206,6 +209,7 @@ static void freecvec(struct cvec *);
|
||||
static int pg_wc_isdigit(pg_wchar c);
|
||||
static int pg_wc_isalpha(pg_wchar c);
|
||||
static int pg_wc_isalnum(pg_wchar c);
|
||||
static int pg_wc_isword(pg_wchar c);
|
||||
static int pg_wc_isupper(pg_wchar c);
|
||||
static int pg_wc_islower(pg_wchar c);
|
||||
static int pg_wc_isgraph(pg_wchar c);
|
||||
@ -220,7 +224,8 @@ static chr element(struct vars *, const chr *, const chr *);
|
||||
static struct cvec *range(struct vars *, chr, chr, int);
|
||||
static int before(chr, chr);
|
||||
static struct cvec *eclass(struct vars *, chr, int);
|
||||
static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
|
||||
static enum char_classes lookupcclass(struct vars *, const chr *, const chr *);
|
||||
static struct cvec *cclasscvec(struct vars *, enum char_classes, int);
|
||||
static int cclass_column_index(struct colormap *, chr);
|
||||
static struct cvec *allcases(struct vars *, chr);
|
||||
static int cmp(const chr *, const chr *, size_t);
|
||||
@ -233,14 +238,12 @@ struct vars
|
||||
regex_t *re;
|
||||
const chr *now; /* scan pointer into string */
|
||||
const chr *stop; /* end of string */
|
||||
const chr *savenow; /* saved now and stop for "subroutine call" */
|
||||
const chr *savestop;
|
||||
int err; /* error code (0 if none) */
|
||||
int cflags; /* copy of compile flags */
|
||||
int lasttype; /* type of previous token */
|
||||
int nexttype; /* type of next token */
|
||||
chr nextvalue; /* value (if any) of next token */
|
||||
int lexcon; /* lexical context type (see lex.c) */
|
||||
int lexcon; /* lexical context type (see regc_lex.c) */
|
||||
int nsubexp; /* subexpression count */
|
||||
struct subre **subs; /* subRE pointer vector */
|
||||
size_t nsubs; /* length of vector */
|
||||
@ -287,6 +290,8 @@ struct vars
|
||||
#define ECLASS 'E' /* start of [= */
|
||||
#define CCLASS 'C' /* start of [: */
|
||||
#define END 'X' /* end of [. [= [: */
|
||||
#define CCLASSS 's' /* char class shorthand escape */
|
||||
#define CCLASSC 'c' /* complement char class shorthand escape */
|
||||
#define RANGE 'R' /* - within [] which might be range delim. */
|
||||
#define LACON 'L' /* lookaround constraint subRE */
|
||||
#define AHEAD 'a' /* color-lookahead arc */
|
||||
@ -356,7 +361,6 @@ pg_regcomp(regex_t *re,
|
||||
v->re = re;
|
||||
v->now = string;
|
||||
v->stop = v->now + len;
|
||||
v->savenow = v->savestop = NULL;
|
||||
v->err = 0;
|
||||
v->cflags = flags;
|
||||
v->nsubexp = 0;
|
||||
@ -835,23 +839,25 @@ parseqatom(struct vars *v,
|
||||
return;
|
||||
break;
|
||||
case '<':
|
||||
wordchrs(v); /* does NEXT() */
|
||||
wordchrs(v);
|
||||
s = newstate(v->nfa);
|
||||
NOERR();
|
||||
nonword(v, BEHIND, lp, s);
|
||||
word(v, AHEAD, s, rp);
|
||||
NEXT();
|
||||
return;
|
||||
break;
|
||||
case '>':
|
||||
wordchrs(v); /* does NEXT() */
|
||||
wordchrs(v);
|
||||
s = newstate(v->nfa);
|
||||
NOERR();
|
||||
word(v, BEHIND, lp, s);
|
||||
nonword(v, AHEAD, s, rp);
|
||||
NEXT();
|
||||
return;
|
||||
break;
|
||||
case WBDRY:
|
||||
wordchrs(v); /* does NEXT() */
|
||||
wordchrs(v);
|
||||
s = newstate(v->nfa);
|
||||
NOERR();
|
||||
nonword(v, BEHIND, lp, s);
|
||||
@ -860,10 +866,11 @@ parseqatom(struct vars *v,
|
||||
NOERR();
|
||||
word(v, BEHIND, lp, s);
|
||||
nonword(v, AHEAD, s, rp);
|
||||
NEXT();
|
||||
return;
|
||||
break;
|
||||
case NWBDRY:
|
||||
wordchrs(v); /* does NEXT() */
|
||||
wordchrs(v);
|
||||
s = newstate(v->nfa);
|
||||
NOERR();
|
||||
word(v, BEHIND, lp, s);
|
||||
@ -872,6 +879,7 @@ parseqatom(struct vars *v,
|
||||
NOERR();
|
||||
nonword(v, BEHIND, lp, s);
|
||||
nonword(v, AHEAD, s, rp);
|
||||
NEXT();
|
||||
return;
|
||||
break;
|
||||
case LACON: /* lookaround constraint */
|
||||
@ -925,6 +933,16 @@ parseqatom(struct vars *v,
|
||||
assert(SEE(']') || ISERR());
|
||||
NEXT();
|
||||
break;
|
||||
case CCLASSS:
|
||||
charclass(v, (enum char_classes) v->nextvalue, lp, rp);
|
||||
okcolors(v->nfa, v->cm);
|
||||
NEXT();
|
||||
break;
|
||||
case CCLASSC:
|
||||
charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp);
|
||||
/* charclasscomplement() did okcolors() internally */
|
||||
NEXT();
|
||||
break;
|
||||
case '.':
|
||||
rainbow(v->nfa, v->cm, PLAIN,
|
||||
(v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
|
||||
@ -1338,6 +1356,75 @@ word(struct vars *v,
|
||||
/* (no need for special attention to \n) */
|
||||
}
|
||||
|
||||
/*
|
||||
* charclass - generate arcs for a character class
|
||||
*
|
||||
* This is used for both atoms (\w and sibling escapes) and for elements
|
||||
* of bracket expressions. The caller is responsible for calling okcolors()
|
||||
* at the end of processing the atom or bracket.
|
||||
*/
|
||||
static void
|
||||
charclass(struct vars *v,
|
||||
enum char_classes cls,
|
||||
struct state *lp,
|
||||
struct state *rp)
|
||||
{
|
||||
struct cvec *cv;
|
||||
|
||||
/* obtain possibly-cached cvec for char class */
|
||||
NOTE(REG_ULOCALE);
|
||||
cv = cclasscvec(v, cls, (v->cflags & REG_ICASE));
|
||||
NOERR();
|
||||
|
||||
/* build the arcs; this may cause color splitting */
|
||||
subcolorcvec(v, cv, lp, rp);
|
||||
}
|
||||
|
||||
/*
|
||||
* charclasscomplement - generate arcs for a complemented character class
|
||||
*
|
||||
* This is used for both atoms (\W and sibling escapes) and for elements
|
||||
* of bracket expressions. In bracket expressions, it is the caller's
|
||||
* responsibility that there not be any open subcolors when this is called.
|
||||
*/
|
||||
static void
|
||||
charclasscomplement(struct vars *v,
|
||||
enum char_classes cls,
|
||||
struct state *lp,
|
||||
struct state *rp)
|
||||
{
|
||||
struct state *cstate;
|
||||
struct cvec *cv;
|
||||
|
||||
/* make dummy state to hang temporary arcs on */
|
||||
cstate = newstate(v->nfa);
|
||||
NOERR();
|
||||
|
||||
/* obtain possibly-cached cvec for char class */
|
||||
NOTE(REG_ULOCALE);
|
||||
cv = cclasscvec(v, cls, (v->cflags & REG_ICASE));
|
||||
NOERR();
|
||||
|
||||
/* build arcs for char class; this may cause color splitting */
|
||||
subcolorcvec(v, cv, cstate, cstate);
|
||||
|
||||
/* in NLSTOP mode, ensure newline is not part of the result set */
|
||||
if (v->cflags & REG_NLSTOP)
|
||||
newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate);
|
||||
NOERR();
|
||||
|
||||
/* clean up any subcolors in the arc set */
|
||||
okcolors(v->nfa, v->cm);
|
||||
NOERR();
|
||||
|
||||
/* now build output arcs for the complement of the char class */
|
||||
colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp);
|
||||
NOERR();
|
||||
|
||||
/* clean up dummy state */
|
||||
dropstate(v->nfa, cstate);
|
||||
}
|
||||
|
||||
/*
|
||||
* scannum - scan a number
|
||||
*/
|
||||
@ -1456,6 +1543,7 @@ repeat(struct vars *v,
|
||||
|
||||
/*
|
||||
* bracket - handle non-complemented bracket expression
|
||||
*
|
||||
* Also called from cbracket for complemented bracket expressions.
|
||||
*/
|
||||
static void
|
||||
@ -1463,16 +1551,52 @@ bracket(struct vars *v,
|
||||
struct state *lp,
|
||||
struct state *rp)
|
||||
{
|
||||
/*
|
||||
* We can't process complemented char classes (e.g. \W) immediately while
|
||||
* scanning the bracket expression, else color bookkeeping gets confused.
|
||||
* Instead, remember whether we saw any in have_cclassc[], and process
|
||||
* them at the end.
|
||||
*/
|
||||
bool have_cclassc[NUM_CCLASSES];
|
||||
bool any_cclassc;
|
||||
int i;
|
||||
|
||||
memset(have_cclassc, false, sizeof(have_cclassc));
|
||||
|
||||
assert(SEE('['));
|
||||
NEXT();
|
||||
while (!SEE(']') && !SEE(EOS))
|
||||
brackpart(v, lp, rp);
|
||||
brackpart(v, lp, rp, have_cclassc);
|
||||
assert(SEE(']') || ISERR());
|
||||
|
||||
/* close up open subcolors from the positive bracket elements */
|
||||
okcolors(v->nfa, v->cm);
|
||||
NOERR();
|
||||
|
||||
/* now handle any complemented elements */
|
||||
any_cclassc = false;
|
||||
for (i = 0; i < NUM_CCLASSES; i++)
|
||||
{
|
||||
if (have_cclassc[i])
|
||||
{
|
||||
charclasscomplement(v, (enum char_classes) i, lp, rp);
|
||||
NOERR();
|
||||
any_cclassc = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If we had any complemented elements, see if we can optimize the bracket
|
||||
* into a rainbow. Since a complemented element is the only way a WHITE
|
||||
* arc could get into the result, there's no point in checking otherwise.
|
||||
*/
|
||||
if (any_cclassc)
|
||||
optimizebracket(v, lp, rp);
|
||||
}
|
||||
|
||||
/*
|
||||
* cbracket - handle complemented bracket expression
|
||||
*
|
||||
* We do it by calling bracket() with dummy endpoints, and then complementing
|
||||
* the result. The alternative would be to invoke rainbow(), and then delete
|
||||
* arcs as the b.e. is seen... but that gets messy, and is really quite
|
||||
@ -1496,7 +1620,9 @@ cbracket(struct vars *v,
|
||||
|
||||
/*
|
||||
* Easy part of complementing, and all there is to do since the MCCE code
|
||||
* was removed.
|
||||
* was removed. Note that the result of colorcomplement() cannot be a
|
||||
* rainbow, since we don't allow empty brackets; so there's no point in
|
||||
* calling optimizebracket() again.
|
||||
*/
|
||||
colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
|
||||
NOERR();
|
||||
@ -1511,14 +1637,15 @@ cbracket(struct vars *v,
|
||||
static void
|
||||
brackpart(struct vars *v,
|
||||
struct state *lp,
|
||||
struct state *rp)
|
||||
struct state *rp,
|
||||
bool *have_cclassc)
|
||||
{
|
||||
chr startc;
|
||||
chr endc;
|
||||
struct cvec *cv;
|
||||
enum char_classes cls;
|
||||
const chr *startp;
|
||||
const chr *endp;
|
||||
chr c[1];
|
||||
|
||||
/* parse something, get rid of special cases, take shortcuts */
|
||||
switch (v->nexttype)
|
||||
@ -1528,15 +1655,14 @@ brackpart(struct vars *v,
|
||||
return;
|
||||
break;
|
||||
case PLAIN:
|
||||
c[0] = v->nextvalue;
|
||||
startc = v->nextvalue;
|
||||
NEXT();
|
||||
/* shortcut for ordinary chr (not range) */
|
||||
if (!SEE(RANGE))
|
||||
{
|
||||
onechr(v, c[0], lp, rp);
|
||||
onechr(v, startc, lp, rp);
|
||||
return;
|
||||
}
|
||||
startc = element(v, c, c + 1);
|
||||
NOERR();
|
||||
break;
|
||||
case COLLEL:
|
||||
@ -1564,9 +1690,20 @@ brackpart(struct vars *v,
|
||||
endp = scanplain(v);
|
||||
INSIST(startp < endp, REG_ECTYPE);
|
||||
NOERR();
|
||||
cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
|
||||
cls = lookupcclass(v, startp, endp);
|
||||
NOERR();
|
||||
subcolorcvec(v, cv, lp, rp);
|
||||
charclass(v, cls, lp, rp);
|
||||
return;
|
||||
break;
|
||||
case CCLASSS:
|
||||
charclass(v, (enum char_classes) v->nextvalue, lp, rp);
|
||||
NEXT();
|
||||
return;
|
||||
break;
|
||||
case CCLASSC:
|
||||
/* we cannot call charclasscomplement() immediately */
|
||||
have_cclassc[v->nextvalue] = true;
|
||||
NEXT();
|
||||
return;
|
||||
break;
|
||||
default:
|
||||
@ -1582,9 +1719,8 @@ brackpart(struct vars *v,
|
||||
{
|
||||
case PLAIN:
|
||||
case RANGE:
|
||||
c[0] = v->nextvalue;
|
||||
endc = v->nextvalue;
|
||||
NEXT();
|
||||
endc = element(v, c, c + 1);
|
||||
NOERR();
|
||||
break;
|
||||
case COLLEL:
|
||||
@ -1618,7 +1754,7 @@ brackpart(struct vars *v,
|
||||
/*
|
||||
* scanplain - scan PLAIN contents of [. etc.
|
||||
*
|
||||
* Certain bits of trickery in lex.c know that this code does not try
|
||||
* Certain bits of trickery in regc_lex.c know that this code does not try
|
||||
* to look past the final bracket of the [. etc.
|
||||
*/
|
||||
static const chr * /* just after end of sequence */
|
||||
@ -1664,39 +1800,98 @@ onechr(struct vars *v,
|
||||
subcolorcvec(v, allcases(v, c), lp, rp);
|
||||
}
|
||||
|
||||
/*
|
||||
* optimizebracket - see if bracket expression can be converted to RAINBOW
|
||||
*
|
||||
* Cases such as "[\s\S]" can produce a set of arcs of all colors, which we
|
||||
* can replace by a single RAINBOW arc for efficiency. (This might seem
|
||||
* like a silly way to write ".", but it's seemingly a common locution in
|
||||
* some other flavors of regex, so take the trouble to support it well.)
|
||||
*/
|
||||
static void
|
||||
optimizebracket(struct vars *v,
|
||||
struct state *lp,
|
||||
struct state *rp)
|
||||
{
|
||||
struct colordesc *cd;
|
||||
struct colordesc *end = CDEND(v->cm);
|
||||
struct arc *a;
|
||||
bool israinbow;
|
||||
|
||||
/*
|
||||
* Scan lp's out-arcs and transiently mark the mentioned colors. We
|
||||
* expect that all of lp's out-arcs are plain, non-RAINBOW arcs to rp.
|
||||
* (Note: there shouldn't be any pseudocolors yet, but check anyway.)
|
||||
*/
|
||||
for (a = lp->outs; a != NULL; a = a->outchain)
|
||||
{
|
||||
assert(a->type == PLAIN);
|
||||
assert(a->co >= 0); /* i.e. not RAINBOW */
|
||||
assert(a->to == rp);
|
||||
cd = &v->cm->cd[a->co];
|
||||
assert(!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO));
|
||||
cd->flags |= COLMARK;
|
||||
}
|
||||
|
||||
/* Scan colors, clear transient marks, check for unmarked live colors */
|
||||
israinbow = true;
|
||||
for (cd = v->cm->cd; cd < end; cd++)
|
||||
{
|
||||
if (cd->flags & COLMARK)
|
||||
cd->flags &= ~COLMARK;
|
||||
else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
|
||||
israinbow = false;
|
||||
}
|
||||
|
||||
/* Can't do anything if not all colors have arcs */
|
||||
if (!israinbow)
|
||||
return;
|
||||
|
||||
/* OK, drop existing arcs and replace with a rainbow */
|
||||
while ((a = lp->outs) != NULL)
|
||||
freearc(v->nfa, a);
|
||||
newarc(v->nfa, PLAIN, RAINBOW, lp, rp);
|
||||
}
|
||||
|
||||
/*
|
||||
* wordchrs - set up word-chr list for word-boundary stuff, if needed
|
||||
*
|
||||
* The list is kept as a bunch of arcs between two dummy states; it's
|
||||
* disposed of by the unreachable-states sweep in NFA optimization.
|
||||
* Does NEXT(). Must not be called from any unusual lexical context.
|
||||
* This should be reconciled with the \w etc. handling in lex.c, and
|
||||
* should be cleaned up to reduce dependencies on input scanning.
|
||||
* The list is kept as a bunch of circular arcs on an otherwise-unused state.
|
||||
*
|
||||
* Note that this must not be called while we have any open subcolors,
|
||||
* else construction of the list would confuse color bookkeeping.
|
||||
* Hence, we can't currently apply a similar optimization in
|
||||
* charclass[complement](), as those need to be usable within bracket
|
||||
* expressions.
|
||||
*/
|
||||
static void
|
||||
wordchrs(struct vars *v)
|
||||
{
|
||||
struct state *left;
|
||||
struct state *right;
|
||||
struct state *cstate;
|
||||
struct cvec *cv;
|
||||
|
||||
if (v->wordchrs != NULL)
|
||||
{
|
||||
NEXT(); /* for consistency */
|
||||
return;
|
||||
}
|
||||
return; /* done already */
|
||||
|
||||
left = newstate(v->nfa);
|
||||
right = newstate(v->nfa);
|
||||
/* make dummy state to hang the cache arcs on */
|
||||
cstate = newstate(v->nfa);
|
||||
NOERR();
|
||||
/* fine point: implemented with [::], and lexer will set REG_ULOCALE */
|
||||
lexword(v);
|
||||
NEXT();
|
||||
assert(v->savenow != NULL && SEE('['));
|
||||
bracket(v, left, right);
|
||||
assert((v->savenow != NULL && SEE(']')) || ISERR());
|
||||
NEXT();
|
||||
|
||||
/* obtain possibly-cached cvec for \w characters */
|
||||
NOTE(REG_ULOCALE);
|
||||
cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE));
|
||||
NOERR();
|
||||
v->wordchrs = left;
|
||||
|
||||
/* build the arcs; this may cause color splitting */
|
||||
subcolorcvec(v, cv, cstate, cstate);
|
||||
NOERR();
|
||||
|
||||
/* close new open subcolors to ensure the cache entry is self-contained */
|
||||
okcolors(v->nfa, v->cm);
|
||||
NOERR();
|
||||
|
||||
/* success! save the cache pointer */
|
||||
v->wordchrs = cstate;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -127,6 +127,18 @@
|
||||
#define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
|
||||
|
||||
|
||||
/*
|
||||
* known character classes
|
||||
*/
|
||||
enum char_classes
{
	CC_ALNUM,
	CC_ALPHA,
	CC_ASCII,
	CC_BLANK,
	CC_CNTRL,
	CC_DIGIT,
	CC_GRAPH,
	CC_LOWER,
	CC_PRINT,
	CC_PUNCT,
	CC_SPACE,
	CC_UPPER,
	CC_XDIGIT,
	CC_WORD						/* the "word" class; \w is shorthand for it */
};

/* count of enumerators in enum char_classes; keep the two in sync */
#define NUM_CCLASSES 14
|
||||
|
||||
|
||||
/*
|
||||
* As soon as possible, we map chrs into equivalence classes -- "colors" --
|
||||
* which are of much more manageable number.
|
||||
@ -164,12 +176,14 @@ struct colordesc
|
||||
#define NOSUB COLORLESS /* value of "sub" when no open subcolor */
|
||||
struct arc *arcs; /* chain of all arcs of this color */
|
||||
chr firstchr; /* simple char first assigned to this color */
|
||||
int flags; /* bit values defined next */
|
||||
int flags; /* bitmask of the following flags: */
|
||||
#define FREECOL 01 /* currently free */
|
||||
#define PSEUDO 02 /* pseudocolor, no real chars */
|
||||
#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
|
||||
#define COLMARK 04 /* temporary marker used in some functions */
|
||||
};
|
||||
|
||||
#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
|
||||
|
||||
/*
|
||||
* The color map itself
|
||||
*
|
||||
@ -199,8 +213,6 @@ struct colordesc
|
||||
* appear in increasing chr-value order.
|
||||
*/
|
||||
|
||||
#define NUM_CCLASSES 13 /* must match data in regc_locale.c */
|
||||
|
||||
typedef struct colormaprange
|
||||
{
|
||||
chr cmin; /* range represents cmin..cmax inclusive */
|
||||
|
@ -1970,6 +1970,256 @@ select * from test_regex('a[\w]b', 'axb', 'LPE');
|
||||
{axb}
|
||||
(2 rows)
|
||||
|
||||
-- these should be invalid
|
||||
select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
|
||||
ERROR: invalid regular expression: invalid character range
|
||||
select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
|
||||
ERROR: invalid regular expression: invalid character range
|
||||
select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
|
||||
ERROR: invalid regular expression: invalid character range
|
||||
select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
|
||||
ERROR: invalid regular expression: invalid character range
|
||||
-- test complemented char classes within brackets
|
||||
select * from test_regex('[\D]', '0123456789abc*', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{a}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{0}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{1}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{1}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{2}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{2}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\W', '0123456789abc_*', 'LP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{*}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{*}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
|
||||
test_regex
|
||||
--------------------------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH}
|
||||
{"012 3456789abc_*"}
|
||||
(2 rows)
|
||||
|
||||
-- check char classes' handling of newlines
|
||||
select * from test_regex('\s+', E'abc \n def', 'LP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{" +
|
||||
"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\s+', E'abc \n def', 'nLP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{" +
|
||||
"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\s]+', E'abc \n def', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{" +
|
||||
"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\s]+', E'abc \n def', 'nLPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{" +
|
||||
"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\S+', E'abc\ndef', 'LP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\S+', E'abc\ndef', 'nLP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\d+', E'012\n345', 'LP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\d+', E'012\n345', 'nLP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\d]+', E'012\n345', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\d]+', E'012\n345', 'nLPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\D+', E'abc\ndef345', 'LP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{"abc +
|
||||
def"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\D+', E'abc\ndef345', 'nLP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{"abc +
|
||||
def"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\w+', E'abc_012\ndef', 'LP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc_012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc_012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc_012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{abc_012}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\W+', E'***\n@@@___', 'LP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{"*** +
|
||||
@@@"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('\W+', E'***\n@@@___', 'nLP');
|
||||
test_regex
|
||||
-------------------------------
|
||||
{0,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{***}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{"*** +
|
||||
@@@"}
|
||||
(2 rows)
|
||||
|
||||
select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
|
||||
test_regex
|
||||
----------------------------------------
|
||||
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
|
||||
{***}
|
||||
(2 rows)
|
||||
|
||||
-- doing 13 "escapes"
|
||||
-- expectError 13.1 & "a\\" EESCAPE
|
||||
select * from test_regex('a\', '', '');
|
||||
|
@ -597,6 +597,50 @@ select * from test_regex('a[\s]b', 'a b', 'LPE');
|
||||
-- expectMatch 12.18 LPE {a[\w]b} axb axb
|
||||
select * from test_regex('a[\w]b', 'axb', 'LPE');
|
||||
|
||||
-- these should be invalid
|
||||
select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
|
||||
select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
|
||||
select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
|
||||
select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
|
||||
|
||||
-- test complemented char classes within brackets
|
||||
select * from test_regex('[\D]', '0123456789abc*', 'LPE');
|
||||
select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
|
||||
select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
|
||||
select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
|
||||
select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
|
||||
select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
|
||||
select * from test_regex('\W', '0123456789abc_*', 'LP');
|
||||
select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
|
||||
select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
|
||||
|
||||
-- check char classes' handling of newlines
|
||||
select * from test_regex('\s+', E'abc \n def', 'LP');
|
||||
select * from test_regex('\s+', E'abc \n def', 'nLP');
|
||||
select * from test_regex('[\s]+', E'abc \n def', 'LPE');
|
||||
select * from test_regex('[\s]+', E'abc \n def', 'nLPE');
|
||||
select * from test_regex('\S+', E'abc\ndef', 'LP');
|
||||
select * from test_regex('\S+', E'abc\ndef', 'nLP');
|
||||
select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
|
||||
select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
|
||||
select * from test_regex('\d+', E'012\n345', 'LP');
|
||||
select * from test_regex('\d+', E'012\n345', 'nLP');
|
||||
select * from test_regex('[\d]+', E'012\n345', 'LPE');
|
||||
select * from test_regex('[\d]+', E'012\n345', 'nLPE');
|
||||
select * from test_regex('\D+', E'abc\ndef345', 'LP');
|
||||
select * from test_regex('\D+', E'abc\ndef345', 'nLP');
|
||||
select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
|
||||
select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
|
||||
select * from test_regex('\w+', E'abc_012\ndef', 'LP');
|
||||
select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
|
||||
select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
|
||||
select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
|
||||
select * from test_regex('\W+', E'***\n@@@___', 'LP');
|
||||
select * from test_regex('\W+', E'***\n@@@___', 'nLP');
|
||||
select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
|
||||
select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
|
||||
|
||||
|
||||
-- doing 13 "escapes"
|
||||
|
||||
-- expectError 13.1 & "a\\" EESCAPE
|
||||
|
Loading…
x
Reference in New Issue
Block a user