diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c index f9508a574f..c0a2ee848a 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ b/contrib/fuzzystrmatch/fuzzystrmatch.c @@ -171,12 +171,12 @@ levenshtein_with_costs(PG_FUNCTION_ARGS) /* Extract a pointer to the actual character data */ s_data = VARDATA_ANY(src); t_data = VARDATA_ANY(dst); - /* Determine length of each string in bytes and characters */ + /* Determine length of each string in bytes */ s_bytes = VARSIZE_ANY_EXHDR(src); t_bytes = VARSIZE_ANY_EXHDR(dst); - PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c, - del_c, sub_c)); + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, + ins_c, del_c, sub_c, false)); } @@ -194,12 +194,12 @@ levenshtein(PG_FUNCTION_ARGS) /* Extract a pointer to the actual character data */ s_data = VARDATA_ANY(src); t_data = VARDATA_ANY(dst); - /* Determine length of each string in bytes and characters */ + /* Determine length of each string in bytes */ s_bytes = VARSIZE_ANY_EXHDR(src); t_bytes = VARSIZE_ANY_EXHDR(dst); - PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1, - 1)); + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, + 1, 1, 1, false)); } @@ -221,13 +221,14 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS) /* Extract a pointer to the actual character data */ s_data = VARDATA_ANY(src); t_data = VARDATA_ANY(dst); - /* Determine length of each string in bytes and characters */ + /* Determine length of each string in bytes */ s_bytes = VARSIZE_ANY_EXHDR(src); t_bytes = VARSIZE_ANY_EXHDR(dst); - PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, - t_bytes, ins_c, del_c, - sub_c, max_d)); + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, + t_data, t_bytes, + ins_c, del_c, sub_c, + max_d, false)); } @@ -246,12 +247,14 @@ levenshtein_less_equal(PG_FUNCTION_ARGS) /* Extract a pointer to the actual character 
data */ s_data = VARDATA_ANY(src); t_data = VARDATA_ANY(dst); - /* Determine length of each string in bytes and characters */ + /* Determine length of each string in bytes */ s_bytes = VARSIZE_ANY_EXHDR(src); t_bytes = VARSIZE_ANY_EXHDR(dst); - PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, - t_bytes, 1, 1, 1, max_d)); + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, + t_data, t_bytes, + 1, 1, 1, + max_d, false)); } diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 6f569dad54..795eee24de 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -550,7 +550,8 @@ updateFuzzyAttrMatchState(int fuzzy_rte_penalty, varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen, 1, 1, 1, fuzzystate->distance + 1 - - fuzzy_rte_penalty); + - fuzzy_rte_penalty, + true); /* * If more than half the characters are different, don't treat it as a @@ -843,10 +844,12 @@ searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname, */ if (alias != NULL) fuzzy_rte_penalty = - varstr_levenshtein(alias, strlen(alias), - rte->eref->aliasname, - strlen(rte->eref->aliasname), - 1, 1, 1); + varstr_levenshtein_less_equal(alias, strlen(alias), + rte->eref->aliasname, + strlen(rte->eref->aliasname), + 1, 1, 1, + MAX_FUZZY_DISTANCE + 1, + true); /* * Scan for a matching column; if we find an exact match, we're diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c index 2c30b6c8e9..cdb8b012cb 100644 --- a/src/backend/utils/adt/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -26,9 +26,16 @@ #define MAX_LEVENSHTEIN_STRLEN 255 /* - * Calculates Levenshtein distance metric between supplied csrings, which are - * not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffices - * for common cases, but your mileage may vary. 
+ * Calculates Levenshtein distance metric between supplied strings, which are + * not necessarily null-terminated. + * + * source: source string, of length slen bytes. + * target: target string, of length tlen bytes. + * ins_c, del_c, sub_c: costs to charge for character insertion, deletion, + * and substitution respectively; (1, 1, 1) costs suffice for common + * cases, but your mileage may vary. + * max_d: if provided and >= 0, maximum distance we care about; see below. + * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN. * * One way to compute Levenshtein distance is to incrementally construct * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number @@ -43,7 +50,7 @@ * array. * * If max_d >= 0, we only need to provide an accurate answer when that answer - * is less than or equal to the bound. From any cell in the matrix, there is + * is less than or equal to max_d. From any cell in the matrix, there is a * theoretical "minimum residual distance" from that cell to the last column * of the final row. 
This minimum residual distance is zero when the * untransformed portions of the strings are of equal length (because we might @@ -58,12 +65,15 @@ */ int #ifdef LEVENSHTEIN_LESS_EQUAL -varstr_levenshtein_less_equal(const char *source, int slen, const char *target, - int tlen, int ins_c, int del_c, int sub_c, - int max_d) +varstr_levenshtein_less_equal(const char *source, int slen, + const char *target, int tlen, + int ins_c, int del_c, int sub_c, + int max_d, bool trusted) #else -varstr_levenshtein(const char *source, int slen, const char *target, int tlen, - int ins_c, int del_c, int sub_c) +varstr_levenshtein(const char *source, int slen, + const char *target, int tlen, + int ins_c, int del_c, int sub_c, + bool trusted) #endif { int m, @@ -95,15 +105,7 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen, #define STOP_COLUMN m #endif - /* - * A common use for Levenshtein distance is to match attributes when - * building diagnostic, user-visible messages. Restrict the size of - * MAX_LEVENSHTEIN_STRLEN at compile time so that this is guaranteed to - * work. - */ - StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN, - "Levenshtein hinting mechanism restricts NAMEDATALEN"); - + /* Convert string lengths (in bytes) to lengths in characters */ m = pg_mbstrlen_with_len(source, slen); n = pg_mbstrlen_with_len(target, tlen); @@ -118,14 +120,18 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen, /* * For security concerns, restrict excessive CPU+RAM usage. (This - * implementation uses O(m) memory and has O(mn) complexity.) + * implementation uses O(m) memory and has O(mn) complexity.) If + * "trusted" is true, caller is responsible for not making excessive + * requests, typically by using a small max_d along with strings that are + * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly. 
*/ - if (m > MAX_LEVENSHTEIN_STRLEN || - n > MAX_LEVENSHTEIN_STRLEN) + if (!trusted && + (m > MAX_LEVENSHTEIN_STRLEN || + n > MAX_LEVENSHTEIN_STRLEN)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("argument exceeds the maximum length of %d bytes", - MAX_LEVENSHTEIN_STRLEN))); + errmsg("levenshtein argument exceeds maximum length of %d characters", + MAX_LEVENSHTEIN_STRLEN))); #ifdef LEVENSHTEIN_LESS_EQUAL /* Initialize start and stop columns. */ diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 95f2a848d3..15ad1ba3ba 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -808,11 +808,14 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS); extern Datum name_text(PG_FUNCTION_ARGS); extern Datum text_name(PG_FUNCTION_ARGS); extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid); -extern int varstr_levenshtein(const char *source, int slen, const char *target, - int tlen, int ins_c, int del_c, int sub_c); +extern int varstr_levenshtein(const char *source, int slen, + const char *target, int tlen, + int ins_c, int del_c, int sub_c, + bool trusted); extern int varstr_levenshtein_less_equal(const char *source, int slen, - const char *target, int tlen, int ins_c, - int del_c, int sub_c, int max_d); + const char *target, int tlen, + int ins_c, int del_c, int sub_c, + int max_d, bool trusted); extern List *textToQualifiedNameList(text *textval); extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist);