diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index ee1b7f3dc9..badbf111ee 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -224,6 +224,10 @@ typedef struct * of aliases to columns of the right input. Thus, positions in the printable * column alias list are not necessarily one-for-one with varattnos of the * JOIN, so we need a separate new_colnames[] array for printing purposes. + * + * Finally, when dealing with wide tables we risk O(N^2) costs in assigning + * non-duplicate column names. We ameliorate that by using a hash table that + * holds all the strings appearing in colnames, new_colnames, and parentUsing. */ typedef struct { @@ -291,6 +295,15 @@ typedef struct int *leftattnos; /* left-child varattnos of join cols, or 0 */ int *rightattnos; /* right-child varattnos of join cols, or 0 */ List *usingNames; /* names assigned to merged columns */ + + /* + * Hash table holding copies of all the strings appearing in this struct's + * colnames, new_colnames, and parentUsing. We use a hash table only for + * sufficiently wide relations, and only during the colname-assignment + * functions set_relation_column_names and set_join_column_names; + * otherwise, names_hash is NULL. + */ + HTAB *names_hash; /* entries are just strings */ } deparse_columns; /* This macro is analogous to rt_fetch(), but for deparse_columns structs */ @@ -376,6 +389,9 @@ static bool colname_is_unique(const char *colname, deparse_namespace *dpns, static char *make_colname_unique(char *colname, deparse_namespace *dpns, deparse_columns *colinfo); static void expand_colnames_array_to(deparse_columns *colinfo, int n); +static void build_colinfo_names_hash(deparse_columns *colinfo); +static void add_to_names_hash(deparse_columns *colinfo, const char *name); +static void destroy_colinfo_names_hash(deparse_columns *colinfo); static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, deparse_columns *colinfo); static char *get_rtable_name(int rtindex, deparse_context *context); @@ -4133,6 +4149,10 @@ has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode) * * parentUsing is a list of all USING aliases assigned in parent joins of * the current jointree node. (The passed-in list must not be modified.) + * + * Note that we do not use per-deparse_columns hash tables in this function. + * The number of names that need to be assigned should be small enough that + * we don't need to trouble with that. */ static void set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing) @@ -4408,6 +4428,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte, colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *)); colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool)); + /* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */ + build_colinfo_names_hash(colinfo); + /* * Scan the columns, select a unique alias for each one, and store it in * colinfo->colnames and colinfo->new_colnames. The former array has NULL @@ -4443,6 +4466,7 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte, colname = make_colname_unique(colname, dpns, colinfo); colinfo->colnames[i] = colname; + add_to_names_hash(colinfo, colname); } /* Put names of non-dropped columns in new_colnames[] too */ @@ -4456,6 +4480,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte, changed_any = true; } + /* We're now done needing the colinfo's names_hash */ + destroy_colinfo_names_hash(colinfo); + /* * Set correct length for new_colnames[] array. (Note: if columns have * been added, colinfo->num_cols includes them, which is not really quite @@ -4526,6 +4553,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, expand_colnames_array_to(colinfo, noldcolumns); Assert(colinfo->num_cols == noldcolumns); + /* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */ + build_colinfo_names_hash(colinfo); + /* * Scan the join output columns, select an alias for each one, and store * it in colinfo->colnames. If there are USING columns, set_using_names() @@ -4563,6 +4593,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, if (rte->alias == NULL) { colinfo->colnames[i] = real_colname; + add_to_names_hash(colinfo, real_colname); continue; } @@ -4579,6 +4610,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, colname = make_colname_unique(colname, dpns, colinfo); colinfo->colnames[i] = colname; + add_to_names_hash(colinfo, colname); } /* Remember if any assigned aliases differ from "real" name */ @@ -4677,6 +4709,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, } else colinfo->new_colnames[j] = child_colname; + add_to_names_hash(colinfo, colinfo->new_colnames[j]); } colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc]; @@ -4726,6 +4759,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, } else colinfo->new_colnames[j] = child_colname; + add_to_names_hash(colinfo, colinfo->new_colnames[j]); } colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc]; @@ -4740,6 +4774,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, Assert(j == nnewcolumns); #endif + /* We're now done needing the colinfo's names_hash */ + destroy_colinfo_names_hash(colinfo); + /* * For a named join, print column aliases if we changed any from the child * names. Unnamed joins cannot print aliases. @@ -4762,28 +4799,58 @@ colname_is_unique(const char *colname, deparse_namespace *dpns, int i; ListCell *lc; - /* Check against already-assigned column aliases within RTE */ - for (i = 0; i < colinfo->num_cols; i++) + /* + * If we have a hash table, consult that instead of linearly scanning the + * colinfo's strings. + */ + if (colinfo->names_hash) { - char *oldname = colinfo->colnames[i]; - - if (oldname && strcmp(oldname, colname) == 0) + if (hash_search(colinfo->names_hash, + colname, + HASH_FIND, + NULL) != NULL) return false; } + else + { + /* Check against already-assigned column aliases within RTE */ + for (i = 0; i < colinfo->num_cols; i++) + { + char *oldname = colinfo->colnames[i]; + + if (oldname && strcmp(oldname, colname) == 0) + return false; + } + + /* + * If we're building a new_colnames array, check that too (this will + * be partially but not completely redundant with the previous checks) + */ + for (i = 0; i < colinfo->num_new_cols; i++) + { + char *oldname = colinfo->new_colnames[i]; + + if (oldname && strcmp(oldname, colname) == 0) + return false; + } + + /* + * Also check against names already assigned for parent-join USING + * cols + */ + foreach(lc, colinfo->parentUsing) + { + char *oldname = (char *) lfirst(lc); + + if (strcmp(oldname, colname) == 0) + return false; + } + } /* - * If we're building a new_colnames array, check that too (this will be - * partially but not completely redundant with the previous checks) + * Also check against USING-column names that must be globally unique. + * These are not hashed, but there should be few of them. */ - for (i = 0; i < colinfo->num_new_cols; i++) - { - char *oldname = colinfo->new_colnames[i]; - - if (oldname && strcmp(oldname, colname) == 0) - return false; - } - - /* Also check against USING-column names that must be globally unique */ foreach(lc, dpns->using_names) { char *oldname = (char *) lfirst(lc); @@ -4792,15 +4859,6 @@ colname_is_unique(const char *colname, deparse_namespace *dpns, return false; } - /* Also check against names already assigned for parent-join USING cols */ - foreach(lc, colinfo->parentUsing) - { - char *oldname = (char *) lfirst(lc); - - if (strcmp(oldname, colname) == 0) - return false; - } - return true; } @@ -4861,6 +4919,90 @@ expand_colnames_array_to(deparse_columns *colinfo, int n) } } +/* + * build_colinfo_names_hash: optionally construct a hash table for colinfo + */ +static void +build_colinfo_names_hash(deparse_columns *colinfo) +{ + HASHCTL hash_ctl; + int i; + ListCell *lc; + + /* + * Use a hash table only for RTEs with at least 32 columns. (The cutoff + * is somewhat arbitrary, but let's choose it so that this code does get + * exercised in the regression tests.) + */ + if (colinfo->num_cols < 32) + return; + + /* + * Set up the hash table. The entries are just strings with no other + * payload. + */ + hash_ctl.keysize = NAMEDATALEN; + hash_ctl.entrysize = NAMEDATALEN; + hash_ctl.hcxt = CurrentMemoryContext; + colinfo->names_hash = hash_create("deparse_columns names", + colinfo->num_cols + colinfo->num_new_cols, + &hash_ctl, + HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); + + /* + * Preload the hash table with any names already present (these would have + * come from set_using_names). + */ + for (i = 0; i < colinfo->num_cols; i++) + { + char *oldname = colinfo->colnames[i]; + + if (oldname) + add_to_names_hash(colinfo, oldname); + } + + for (i = 0; i < colinfo->num_new_cols; i++) + { + char *oldname = colinfo->new_colnames[i]; + + if (oldname) + add_to_names_hash(colinfo, oldname); + } + + foreach(lc, colinfo->parentUsing) + { + char *oldname = (char *) lfirst(lc); + + add_to_names_hash(colinfo, oldname); + } +} + +/* + * add_to_names_hash: add a string to the names_hash, if we're using one + */ +static void +add_to_names_hash(deparse_columns *colinfo, const char *name) +{ + if (colinfo->names_hash) + (void) hash_search(colinfo->names_hash, + name, + HASH_ENTER, + NULL); +} + +/* + * destroy_colinfo_names_hash: destroy hash table when done with it + */ +static void +destroy_colinfo_names_hash(deparse_columns *colinfo) +{ + if (colinfo->names_hash) + { + hash_destroy(colinfo->names_hash); + colinfo->names_hash = NULL; + } +} + /* * identify_join_columns: figure out where columns of a join come from *