diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 268cee1cbe..3b7a607437 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -630,11 +630,10 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
 	text *s = PG_GETARG_TEXT_PP(0);
 	text *p = PG_GETARG_TEXT_PP(1);
 	text *r = PG_GETARG_TEXT_PP(2);
-	regex_t *re;
 
-	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
-
-	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0, 1));
+	PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+										 REG_ADVANCED, PG_GET_COLLATION(),
+										 0, 1));
 }
 
 /*
@@ -648,7 +647,6 @@ textregexreplace(PG_FUNCTION_ARGS)
 	text *p = PG_GETARG_TEXT_PP(1);
 	text *r = PG_GETARG_TEXT_PP(2);
 	text *opt = PG_GETARG_TEXT_PP(3);
-	regex_t *re;
 	pg_re_flags flags;
 
 	/*
@@ -672,10 +670,9 @@ textregexreplace(PG_FUNCTION_ARGS)
 
 	parse_re_flags(&flags, opt);
 
-	re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
-
-	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0,
-										 flags.glob ? 0 : 1));
+	PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+										 flags.cflags, PG_GET_COLLATION(),
+										 0, flags.glob ? 0 : 1));
 }
 
 /*
@@ -694,7 +691,6 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
 	int n = 1;
 	text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
 	pg_re_flags re_flags;
-	regex_t *re;
 
 	/* Collect optional parameters */
 	if (PG_NARGS() > 3)
@@ -723,11 +719,10 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
 	if (PG_NARGS() <= 4)
 		n = re_flags.glob ? 0 : 1;
 
-	/* Compile the regular expression */
-	re = RE_compile_and_cache(p, re_flags.cflags, PG_GET_COLLATION());
-
 	/* Do the replacement(s) */
-	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, start - 1, n));
+	PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+										 re_flags.cflags, PG_GET_COLLATION(),
+										 start - 1, n));
 }
 
 /* This is separate to keep the opr_sanity regression test from complaining */
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 348b5566de..acb8741734 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -4359,34 +4359,36 @@ replace_text(PG_FUNCTION_ARGS)
 }
 
 /*
- * check_replace_text_has_escape_char
+ * check_replace_text_has_escape
  *
- * check whether replace_text contains escape char.
+ * Returns 0 if text contains no backslashes that need processing.
+ * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
+ * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
  */
-static bool
-check_replace_text_has_escape_char(const text *replace_text)
+static int
+check_replace_text_has_escape(const text *replace_text)
 {
+	int result = 0;
 	const char *p = VARDATA_ANY(replace_text);
 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
 
-	if (pg_database_encoding_max_length() == 1)
+	while (p < p_end)
 	{
-		for (; p < p_end; p++)
+		/* Find next escape char, if any. */
+		p = memchr(p, '\\', p_end - p);
+		if (p == NULL)
+			break;
+		p++;
+		/* Note: a backslash at the end doesn't require extra processing. */
+		if (p < p_end)
 		{
-			if (*p == '\\')
-				return true;
+			if (*p >= '1' && *p <= '9')
+				return 2; /* Found a submatch specifier, so done */
+			result = 1; /* Found some other sequence, keep looking */
+			p++;
 		}
 	}
-	else
-	{
-		for (; p < p_end; p += pg_mblen(p))
-		{
-			if (*p == '\\')
-				return true;
-		}
-	}
-
-	return false;
+	return result;
 }
 
 /*
@@ -4403,25 +4405,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 {
 	const char *p = VARDATA_ANY(replace_text);
 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
-	int eml = pg_database_encoding_max_length();
 
-	for (;;)
+	while (p < p_end)
 	{
 		const char *chunk_start = p;
 		int so;
 		int eo;
 
-		/* Find next escape char. */
-		if (eml == 1)
-		{
-			for (; p < p_end && *p != '\\'; p++)
-				/* nothing */ ;
-		}
-		else
-		{
-			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
-				/* nothing */ ;
-		}
+		/* Find next escape char, if any. */
+		p = memchr(p, '\\', p_end - p);
+		if (p == NULL)
+			p = p_end;
 
 		/* Copy the text we just scanned over, if any. */
 		if (p > chunk_start)
@@ -4473,7 +4467,7 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 			continue;
 		}
 
-		if (so != -1 && eo != -1)
+		if (so >= 0 && eo >= 0)
 		{
 			/*
 			 * Copy the text that is back reference of regexp. Note so and eo
@@ -4491,36 +4485,37 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 	}
 }
 
-#define REGEXP_REPLACE_BACKREF_CNT 10
-
 /*
  * replace_text_regexp
  *
- * replace substring(s) in src_text that match regexp with replace_text.
+ * replace substring(s) in src_text that match pattern with replace_text.
+ * The replace_text can contain backslash markers to substitute
+ * (parts of) the matched text.
  *
+ * cflags: regexp compile flags.
+ * collation: collation to use.
  * search_start: the character (not byte) offset in src_text at which to
  * begin searching.
  * n: if 0, replace all matches; if > 0, replace only the N'th match.
- *
- * Note: to avoid having to include regex.h in builtins.h, we declare
- * the regexp argument as void *, but really it's regex_t *.
  */
 text *
-replace_text_regexp(text *src_text, void *regexp,
+replace_text_regexp(text *src_text, text *pattern_text,
 					text *replace_text,
+					int cflags, Oid collation,
 					int search_start, int n)
 {
 	text *ret_text;
-	regex_t *re = (regex_t *) regexp;
+	regex_t *re;
 	int src_text_len = VARSIZE_ANY_EXHDR(src_text);
 	int nmatches = 0;
 	StringInfoData buf;
-	regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
+	regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
+	int nmatch = lengthof(pmatch);
 	pg_wchar *data;
 	size_t data_len;
 	int data_pos;
 	char *start_ptr;
-	bool have_escape;
+	int escape_status;
 
 	initStringInfo(&buf);
 
@@ -4528,8 +4523,19 @@ replace_text_regexp(text *src_text, void *regexp,
 	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
 	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
 
-	/* Check whether replace_text has escape char. */
-	have_escape = check_replace_text_has_escape_char(replace_text);
+	/* Check whether replace_text has escapes, especially regexp submatches. */
+	escape_status = check_replace_text_has_escape(replace_text);
+
+	/* If no regexp submatches, we can use REG_NOSUB. */
+	if (escape_status < 2)
+	{
+		cflags |= REG_NOSUB;
+		/* Also tell pg_regexec we only want the whole-match location. */
+		nmatch = 1;
+	}
+
+	/* Prepare the regexp. */
+	re = RE_compile_and_cache(pattern_text, cflags, collation);
 
 	/* start_ptr points to the data_pos'th character of src_text */
 	start_ptr = (char *) VARDATA_ANY(src_text);
@@ -4546,7 +4552,7 @@ replace_text_regexp(text *src_text, void *regexp,
 									data_len,
 									search_start,
 									NULL, /* no details */
-									REGEXP_REPLACE_BACKREF_CNT,
+									nmatch,
 									pmatch,
 									0);
 
@@ -4602,10 +4608,9 @@ replace_text_regexp(text *src_text, void *regexp,
 		}
 
 		/*
-		 * Copy the replace_text. Process back references when the
-		 * replace_text has escape characters.
+		 * Copy the replace_text, processing escapes if any are present.
 		 */
-		if (have_escape)
+		if (escape_status > 0)
 			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
 										 start_ptr, data_pos);
 		else
diff --git a/src/include/utils/varlena.h b/src/include/utils/varlena.h
index 6645e2af13..cd12252ed9 100644
--- a/src/include/utils/varlena.h
+++ b/src/include/utils/varlena.h
@@ -33,8 +33,9 @@ extern bool SplitDirectoriesString(char *rawstring, char separator,
 								   List **namelist);
 extern bool SplitGUCList(char *rawstring, char separator,
 						 List **namelist);
-extern text *replace_text_regexp(text *src_text, void *regexp,
+extern text *replace_text_regexp(text *src_text, text *pattern_text,
 								 text *replace_text,
+								 int cflags, Oid collation,
 								 int search_start, int n);
 
 #endif
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index a9efd74c7b..0f95b9400b 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -571,13 +571,32 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 ERROR: invalid escape string
 HINT: Escape string must be empty or one character.
--- Test back reference in regexp_replace
+-- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
  regexp_replace
 ----------------
  (111) 222-3333
 (1 row)
 
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
+ regexp_replace
+-------------------
+ fXooYbaXrrYbaXzzY
+(1 row)
+
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\\\Y', 'g');
+ regexp_replace
+----------------
+ fX\YbaX\YbaX\Y
+(1 row)
+
+-- not an error, though perhaps it should be:
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\Y\\1Z\\');
+ regexp_replace
+-----------------
+ fX\YoZ\barrbazz
+(1 row)
+
 SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g');
  regexp_replace
 ----------------
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 6a029cc369..8c379182cb 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -187,8 +187,13 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
 SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 
--- Test back reference in regexp_replace
+-- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\\\Y', 'g');
+-- not an error, though perhaps it should be:
+SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\Y\\1Z\\');
+
 SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g');
 SELECT regexp_replace('AAA', '^|$', 'Z', 'g');
 SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');