Remove new coupling between NAMEDATALEN and MAX_LEVENSHTEIN_STRLEN.

Commit e529cd4ffa introduced a compile-time assertion requiring NAMEDATALEN
to be no larger than MAX_LEVENSHTEIN_STRLEN, which has been 255 for a long
time.  Since until then we had always allowed NAMEDATALEN to be substantially
larger than that, this was ill-advised.

It's debatable whether we need MAX_LEVENSHTEIN_STRLEN at all (versus
putting a CHECK_FOR_INTERRUPTS into the loop), or whether it has to be
so tight; but this patch takes the narrower approach of just not applying
the MAX_LEVENSHTEIN_STRLEN limit to calls from the parser.

Trusting the parser for this seems reasonable, first because the strings
are limited to NAMEDATALEN which is unlikely to be hugely more than 256,
and second because the maximum distance is tightly constrained by
MAX_FUZZY_DISTANCE (though we'd forgotten to make use of that limit in one
place).  That means the cost is not really O(mn) but more like O(max(m,n)).

Relaxing the limit for user-supplied calls is left for future research;
given the lack of complaints to date, it doesn't seem very high priority.

In passing, fix confusion between lengths-in-bytes and lengths-in-chars
in comments and error messages.

Per gripe from Kevin Day; solution suggested by Robert Haas.  Back-patch
to 9.5 where the unwanted restriction was introduced.
This commit is contained in:
Tom Lane 2016-01-22 11:53:06 -05:00
parent 647d87c56a
commit a396144ac0
4 changed files with 60 additions and 45 deletions

View File

@ -171,12 +171,12 @@ levenshtein_with_costs(PG_FUNCTION_ARGS)
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes and characters */
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c,
del_c, sub_c));
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
ins_c, del_c, sub_c, false));
}
@ -194,12 +194,12 @@ levenshtein(PG_FUNCTION_ARGS)
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes and characters */
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1,
1));
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
1, 1, 1, false));
}
@ -221,13 +221,14 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes and characters */
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
t_bytes, ins_c, del_c,
sub_c, max_d));
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
t_data, t_bytes,
ins_c, del_c, sub_c,
max_d, false));
}
@ -246,12 +247,14 @@ levenshtein_less_equal(PG_FUNCTION_ARGS)
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes and characters */
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
t_bytes, 1, 1, 1, max_d));
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
t_data, t_bytes,
1, 1, 1,
max_d, false));
}

View File

@ -550,7 +550,8 @@ updateFuzzyAttrMatchState(int fuzzy_rte_penalty,
varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen,
1, 1, 1,
fuzzystate->distance + 1
- fuzzy_rte_penalty);
- fuzzy_rte_penalty,
true);
/*
* If more than half the characters are different, don't treat it as a
@ -843,10 +844,12 @@ searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname,
*/
if (alias != NULL)
fuzzy_rte_penalty =
varstr_levenshtein(alias, strlen(alias),
rte->eref->aliasname,
strlen(rte->eref->aliasname),
1, 1, 1);
varstr_levenshtein_less_equal(alias, strlen(alias),
rte->eref->aliasname,
strlen(rte->eref->aliasname),
1, 1, 1,
MAX_FUZZY_DISTANCE + 1,
true);
/*
* Scan for a matching column; if we find an exact match, we're

View File

@ -26,9 +26,16 @@
#define MAX_LEVENSHTEIN_STRLEN 255
/*
* Calculates Levenshtein distance metric between supplied csrings, which are
* not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffices
* for common cases, but your mileage may vary.
* Calculates Levenshtein distance metric between supplied strings, which are
* not necessarily null-terminated.
*
* source: source string, of length slen bytes.
* target: target string, of length tlen bytes.
* ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
* and substitution respectively; (1, 1, 1) costs suffice for common
* cases, but your mileage may vary.
* max_d: if provided and >= 0, maximum distance we care about; see below.
* trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
*
* One way to compute Levenshtein distance is to incrementally construct
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
@ -43,7 +50,7 @@
* array.
*
* If max_d >= 0, we only need to provide an accurate answer when that answer
* is less than or equal to the bound. From any cell in the matrix, there is
* is less than or equal to max_d. From any cell in the matrix, there is
* a theoretical "minimum residual distance" from that cell to the last column
* of the final row. This minimum residual distance is zero when the
* untransformed portions of the strings are of equal length (because we might
@ -58,12 +65,15 @@
*/
int
#ifdef LEVENSHTEIN_LESS_EQUAL
varstr_levenshtein_less_equal(const char *source, int slen, const char *target,
int tlen, int ins_c, int del_c, int sub_c,
int max_d)
varstr_levenshtein_less_equal(const char *source, int slen,
const char *target, int tlen,
int ins_c, int del_c, int sub_c,
int max_d, bool trusted)
#else
varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
int ins_c, int del_c, int sub_c)
varstr_levenshtein(const char *source, int slen,
const char *target, int tlen,
int ins_c, int del_c, int sub_c,
bool trusted)
#endif
{
int m,
@ -95,15 +105,7 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
#define STOP_COLUMN m
#endif
/*
* A common use for Levenshtein distance is to match attributes when
* building diagnostic, user-visible messages. Restrict the size of
* MAX_LEVENSHTEIN_STRLEN at compile time so that this is guaranteed to
* work.
*/
StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN,
"Levenshtein hinting mechanism restricts NAMEDATALEN");
/* Convert string lengths (in bytes) to lengths in characters */
m = pg_mbstrlen_with_len(source, slen);
n = pg_mbstrlen_with_len(target, tlen);
@ -118,14 +120,18 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
/*
* For security concerns, restrict excessive CPU+RAM usage. (This
* implementation uses O(m) memory and has O(mn) complexity.)
* implementation uses O(m) memory and has O(mn) complexity.) If
* "trusted" is true, caller is responsible for not making excessive
* requests, typically by using a small max_d along with strings that are
* bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
*/
if (m > MAX_LEVENSHTEIN_STRLEN ||
n > MAX_LEVENSHTEIN_STRLEN)
if (!trusted &&
(m > MAX_LEVENSHTEIN_STRLEN ||
n > MAX_LEVENSHTEIN_STRLEN))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("argument exceeds the maximum length of %d bytes",
MAX_LEVENSHTEIN_STRLEN)));
errmsg("levenshtein argument exceeds maximum length of %d characters",
MAX_LEVENSHTEIN_STRLEN)));
#ifdef LEVENSHTEIN_LESS_EQUAL
/* Initialize start and stop columns. */

View File

@ -810,11 +810,14 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS);
extern Datum name_text(PG_FUNCTION_ARGS);
extern Datum text_name(PG_FUNCTION_ARGS);
extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid);
extern int varstr_levenshtein(const char *source, int slen, const char *target,
int tlen, int ins_c, int del_c, int sub_c);
extern int varstr_levenshtein(const char *source, int slen,
const char *target, int tlen,
int ins_c, int del_c, int sub_c,
bool trusted);
extern int varstr_levenshtein_less_equal(const char *source, int slen,
const char *target, int tlen, int ins_c,
int del_c, int sub_c, int max_d);
const char *target, int tlen,
int ins_c, int del_c, int sub_c,
int max_d, bool trusted);
extern List *textToQualifiedNameList(text *textval);
extern bool SplitIdentifierString(char *rawstring, char separator,
List **namelist);