diff --git a/contrib/fuzzystrmatch/Makefile b/contrib/fuzzystrmatch/Makefile index 024265d479..0327d9510a 100644 --- a/contrib/fuzzystrmatch/Makefile +++ b/contrib/fuzzystrmatch/Makefile @@ -17,6 +17,3 @@ top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif - -# levenshtein.c is #included by fuzzystrmatch.c -fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c index 7a53d8a008..f0df08032b 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ b/contrib/fuzzystrmatch/fuzzystrmatch.c @@ -154,23 +154,6 @@ getcode(char c) /* These prevent GH from becoming F */ #define NOGHTOF(c) (getcode(c) & 16) /* BDH */ -/* Faster than memcmp(), for this use case. */ -static inline bool -rest_of_char_same(const char *s1, const char *s2, int len) -{ - while (len > 0) - { - len--; - if (s1[len] != s2[len]) - return false; - } - return true; -} - -#include "levenshtein.c" -#define LEVENSHTEIN_LESS_EQUAL -#include "levenshtein.c" - PG_FUNCTION_INFO_V1(levenshtein_with_costs); Datum levenshtein_with_costs(PG_FUNCTION_ARGS) @@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS) int ins_c = PG_GETARG_INT32(2); int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; - PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c)); + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c, + del_c, sub_c)); } @@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS) { text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); + const char *s_data; + const char *t_data; + int s_bytes, + 
t_bytes; - PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1)); + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1, + 1)); } @@ -206,8 +213,21 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS) int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); int max_d = PG_GETARG_INT32(5); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d)); + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, + t_bytes, ins_c, del_c, + sub_c, max_d)); } @@ -218,8 +238,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS) text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); int max_d = PG_GETARG_INT32(2); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d)); + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, + t_bytes, 1, 1, 1, max_d)); } diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 7b4391bba1..3ea9bf435a 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o 
array_selfuncs.o array_typanalyze.o \ like.o: like.c like_match.c +varlena.o: varlena.c levenshtein.c + include $(top_srcdir)/src/backend/common.mk diff --git a/contrib/fuzzystrmatch/levenshtein.c b/src/backend/utils/adt/levenshtein.c similarity index 85% rename from contrib/fuzzystrmatch/levenshtein.c rename to src/backend/utils/adt/levenshtein.c index 4f37a54b1e..a8670e9a85 100644 --- a/contrib/fuzzystrmatch/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -1,41 +1,34 @@ -/* +/*------------------------------------------------------------------------- + * * levenshtein.c + * Levenshtein distance implementation. * - * Functions for "fuzzy" comparison of strings + * Original author: Joe Conway * - * Joe Conway + * This file is included by varlena.c twice, to provide matching code for (1) + * Levenshtein distance with custom costings, and (2) Levenshtein distance with + * custom costings and a "max" value above which exact distances are not + * interesting. Before the inclusion, we rely on the presence of the inline + * function rest_of_char_same(). + * + * Written based on a description of the algorithm by Michael Gilleland found + * at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the + * PHP 4.0.6 distribution for inspiration. Configurable penalty costs + * extension is introduced by Volkan YAZICI . + * IDENTIFICATION + * src/backend/utils/adt/levenshtein.c + * + *------------------------------------------------------------------------- */ - -/* - * External declarations for exported functions - */ -#ifdef LEVENSHTEIN_LESS_EQUAL -static int levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d); -#else -static int levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c); -#endif - #define MAX_LEVENSHTEIN_STRLEN 255 - /* - * Calculates Levenshtein distance metric between supplied strings. 
Generally - * (1, 1, 1) penalty costs suffices for common cases, but your mileage may - * vary. + * Calculates Levenshtein distance metric between supplied strings, which are + * not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffices + * for common cases, but your mileage may vary. * * One way to compute Levenshtein distance is to incrementally construct * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number @@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t, * identify the portion of the matrix close to the diagonal which can still * affect the final answer. */ -static int +int #ifdef LEVENSHTEIN_LESS_EQUAL -levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d) +varstr_levenshtein_less_equal(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c, + int max_d) #else -levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c) +varstr_levenshtein(const char *source, int slen, const char *target, int tlen, + int ins_c, int del_c, int sub_c) #endif { int m, - n, - s_bytes, - t_bytes; + n; int *prev; int *curr; int *s_char_len = NULL; int i, j; - const char *s_data; - const char *t_data; const char *y; /* - * For levenshtein_less_equal_internal, we have real variables called + * For varstr_levenshtein_less_equal, we have real variables called * start_column and stop_column; otherwise it's just short-hand for 0 and * m. */ @@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t, #define STOP_COLUMN m #endif - /* Extract a pointer to the actual character data. */ - s_data = VARDATA_ANY(s); - t_data = VARDATA_ANY(t); - - /* Determine length of each string in bytes and characters. 
*/ - s_bytes = VARSIZE_ANY_EXHDR(s); - t_bytes = VARSIZE_ANY_EXHDR(t); - m = pg_mbstrlen_with_len(s_data, s_bytes); - n = pg_mbstrlen_with_len(t_data, t_bytes); + m = pg_mbstrlen_with_len(source, slen); + n = pg_mbstrlen_with_len(target, tlen); /* * We can transform an empty s into t with n insertions, or a non-empty t @@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t, * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. */ - if (m != s_bytes || n != t_bytes) + if (m != slen || n != tlen) { int i; - const char *cp = s_data; + const char *cp = source; s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) @@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t, prev[i] = i * del_c; /* Loop through rows of the notional array */ - for (y = t_data, j = 1; j < n; j++) + for (y = target, j = 1; j < n; j++) { int *temp; - const char *x = s_data; - int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1; + const char *x = source; + int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; #ifdef LEVENSHTEIN_LESS_EQUAL @@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t, prev[start_column] = max_d + 1; curr[start_column] = max_d + 1; if (start_column != 0) - s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; + source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; start_column++; } diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index c3171b549a..b3f397e959 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1546,7 +1546,6 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid) return result; } - /* text_cmp() * Internal comparison function for text strings. * Returns -1, 0 or 1 @@ -4747,3 +4746,24 @@ text_format_nv(PG_FUNCTION_ARGS) { return text_format(fcinfo); } + +/* + * Helper function for Levenshtein distance functions. 
Faster than memcmp(), + * for this use case. + */ +static inline bool +rest_of_char_same(const char *s1, const char *s2, int len) +{ + while (len > 0) + { + len--; + if (s1[len] != s2[len]) + return false; + } + return true; +} + +/* Expand each Levenshtein distance variant */ +#include "levenshtein.c" +#define LEVENSHTEIN_LESS_EQUAL +#include "levenshtein.c" diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 3ba34f88ee..417fd1771a 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS); extern Datum name_text(PG_FUNCTION_ARGS); extern Datum text_name(PG_FUNCTION_ARGS); extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid); +extern int varstr_levenshtein(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c); +extern int varstr_levenshtein_less_equal(const char *source, int slen, + const char *target, int tlen, int ins_c, + int del_c, int sub_c, int max_d); extern List *textToQualifiedNameList(text *textval); extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist);