diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index 6c4354e9e5..9756d6c5f0 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.4 2007/09/26 10:09:57 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.5 2007/10/23 00:51:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -140,55 +140,64 @@ uniqueWORD(ParsedWord * a, int4 l) TSVector make_tsvector(ParsedText *prs) { - int4 i, + int i, j, lenstr = 0, totallen; TSVector in; WordEntry *ptr; - char *str, - *cur; + char *str; + int stroff; prs->curwords = uniqueWORD(prs->words, prs->curwords); for (i = 0; i < prs->curwords; i++) { - lenstr += SHORTALIGN(prs->words[i].len); - + lenstr += prs->words[i].len; if (prs->words[i].alen) + { + lenstr = SHORTALIGN(lenstr); lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos); + } } + if (lenstr > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); + totallen = CALCDATASIZE(prs->curwords, lenstr); in = (TSVector) palloc0(totallen); SET_VARSIZE(in, totallen); in->size = prs->curwords; ptr = ARRPTR(in); - cur = str = STRPTR(in); + str = STRPTR(in); + stroff = 0; for (i = 0; i < prs->curwords; i++) { ptr->len = prs->words[i].len; - if (cur - str > MAXSTRPOS) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("string is too long for tsvector"))); - ptr->pos = cur - str; - memcpy((void *) cur, (void *) prs->words[i].word, prs->words[i].len); + ptr->pos = stroff; + memcpy(str + stroff, prs->words[i].word, prs->words[i].len); + stroff += prs->words[i].len; pfree(prs->words[i].word); - cur += SHORTALIGN(prs->words[i].len); if (prs->words[i].alen) { + int k = prs->words[i].pos.apos[0]; WordEntryPos *wptr; + if (k > 0xFFFF) + elog(ERROR, "positions array too long"); + ptr->haspos = 1; - *(uint16 *) cur = prs->words[i].pos.apos[0]; + stroff = SHORTALIGN(stroff); + *(uint16 *) (str + stroff) = (uint16) k; wptr = POSDATAPTR(in, ptr); - for (j = 0; j < *(uint16 *) cur; j++) + for (j = 0; j < k; j++) { WEP_SETWEIGHT(wptr[j], 0); WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]); } - cur += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos); + stroff += sizeof(uint16) + k * sizeof(WordEntryPos); pfree(prs->words[i].pos.apos); } else diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 0d82da1f90..cb90274943 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.5 2007/10/21 22:29:56 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.6 2007/10/23 00:51:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,16 +22,18 @@ typedef struct { - WordEntry entry; /* should be first ! */ + WordEntry entry; /* must be first! */ WordEntryPos *pos; int poslen; /* number of elements in pos */ } WordEntryIN; + +/* Compare two WordEntryPos values for qsort */ static int comparePos(const void *a, const void *b) { - int apos = WEP_GETPOS(*(WordEntryPos *) a); - int bpos = WEP_GETPOS(*(WordEntryPos *) b); + int apos = WEP_GETPOS(*(const WordEntryPos *) a); + int bpos = WEP_GETPOS(*(const WordEntryPos *) b); if (apos == bpos) return 0; @@ -53,9 +55,9 @@ uniquePos(WordEntryPos * a, int l) if (l <= 1) return l; - res = a; qsort((void *) a, l, sizeof(WordEntryPos), comparePos); + res = a; ptr = a + 1; while (ptr - a < l) { @@ -63,7 +65,8 @@ uniquePos(WordEntryPos * a, int l) { res++; *res = *ptr; - if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1) + if (res - a >= MAXNUMPOS - 1 || + WEP_GETPOS(*res) == MAXENTRYPOS - 1) break; } else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) @@ -74,12 +77,13 @@ uniquePos(WordEntryPos * a, int l) return res + 1 - a; } +/* Compare two WordEntryIN values for qsort */ static int compareentry(const void *va, const void *vb, void *arg) { + const WordEntryIN *a = (const WordEntryIN *) va; + const WordEntryIN *b = (const WordEntryIN *) vb; char *BufferStr = (char *) arg; - WordEntryIN *a = (WordEntryIN *) va; - WordEntryIN *b = (WordEntryIN *) vb; if (a->entry.len == b->entry.len) { @@ -91,44 +95,40 @@ compareentry(const void *va, const void *vb, void *arg) return (a->entry.len > b->entry.len) ? 1 : -1; } +/* + * Sort an array of WordEntryIN, remove duplicates. + * *outbuflen receives the amount of space needed for strings and positions. + */ static int uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { + int buflen; WordEntryIN *ptr, *res; Assert(l >= 1); - if (l == 1) - { - if (a->entry.haspos) - { - a->poslen = uniquePos(a->pos, a->poslen); - *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos); - } - else - *outbuflen = a->entry.len; + if (l > 1) + qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, + (void *) buf); - return l; - } + buflen = 0; res = a; - ptr = a + 1; - qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf); - while (ptr - a < l) { if (!(ptr->entry.len == res->entry.len && - strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0)) + strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], + res->entry.len) == 0)) { + /* done accumulating data into *res, count space needed */ + buflen += res->entry.len; if (res->entry.haspos) { - *outbuflen += SHORTALIGN(res->entry.len); res->poslen = uniquePos(res->pos, res->poslen); - *outbuflen += res->poslen * sizeof(WordEntryPos); + buflen = SHORTALIGN(buflen); + buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); } - else - *outbuflen += res->entry.len; res++; memcpy(res, ptr, sizeof(WordEntryIN)); } @@ -136,37 +136,37 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { if (res->entry.haspos) { + /* append ptr's positions to res's positions */ int newlen = ptr->poslen + res->poslen; - /* Append res to pos */ - - res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos)); - memcpy(&res->pos[res->poslen], - ptr->pos, ptr->poslen * sizeof(WordEntryPos)); + res->pos = (WordEntryPos *) + repalloc(res->pos, newlen * sizeof(WordEntryPos)); + memcpy(&res->pos[res->poslen], ptr->pos, + ptr->poslen * sizeof(WordEntryPos)); res->poslen = newlen; pfree(ptr->pos); } else { + /* just give ptr's positions to pos */ res->entry.haspos = 1; res->pos = ptr->pos; + res->poslen = ptr->poslen; } } ptr++; } - /* add last item */ - + /* count space needed for last item */ + buflen += res->entry.len; if (res->entry.haspos) { - *outbuflen += SHORTALIGN(res->entry.len); - res->poslen = uniquePos(res->pos, res->poslen); - *outbuflen += res->poslen * sizeof(WordEntryPos); + buflen = SHORTALIGN(buflen); + buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); } - else - *outbuflen += res->entry.len; + *outbuflen = buflen; return res + 1 - a; } @@ -193,6 +193,8 @@ tsvectorin(PG_FUNCTION_ARGS) int toklen; WordEntryPos *pos; int poslen; + char *strbuf; + int stroff; /* * Tokens are appended to tmpbuf, cur is a pointer @@ -212,19 +214,17 @@ tsvectorin(PG_FUNCTION_ARGS) while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) { - if (toklen >= MAXSTRLEN) ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("word is too long (%ld bytes, max %ld bytes)", (long) toklen, - (long) MAXSTRLEN))); - + (long) (MAXSTRLEN-1)))); if (cur - tmpbuf > MAXSTRPOS) ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("position value is too large"))); + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); /* * Enlarge buffers if needed @@ -232,7 +232,8 @@ tsvectorin(PG_FUNCTION_ARGS) if (len >= arrlen) { arrlen *= 2; - arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); + arr = (WordEntryIN *) + repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); } while ((cur - tmpbuf) + toklen >= buflen) { @@ -254,7 +255,11 @@ tsvectorin(PG_FUNCTION_ARGS) arr[len].poslen = poslen; } else + { arr[len].entry.haspos = 0; + arr[len].pos = NULL; + arr[len].poslen = 0; + } len++; } @@ -264,40 +269,45 @@ tsvectorin(PG_FUNCTION_ARGS) len = uniqueentry(arr, len, tmpbuf, &buflen); else buflen = 0; + + if (buflen > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); + totallen = CALCDATASIZE(len, buflen); in = (TSVector) palloc0(totallen); - SET_VARSIZE(in, totallen); in->size = len; - cur = STRPTR(in); inarr = ARRPTR(in); + strbuf = STRPTR(in); + stroff = 0; for (i = 0; i < len; i++) { - memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len); - arr[i].entry.pos = cur - STRPTR(in); - cur += SHORTALIGN(arr[i].entry.len); + memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); + arr[i].entry.pos = stroff; + stroff += arr[i].entry.len; if (arr[i].entry.haspos) { - uint16 tmplen; - - if(arr[i].poslen > 0xFFFF) + if (arr[i].poslen > 0xFFFF) elog(ERROR, "positions array too long"); - tmplen = (uint16) arr[i].poslen; - - /* Copy length to output struct */ - memcpy(cur, &tmplen, sizeof(uint16)); - cur += sizeof(uint16); + /* Copy number of positions */ + stroff = SHORTALIGN(stroff); + *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; + stroff += sizeof(uint16); /* Copy positions */ - memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos)); - cur += arr[i].poslen * sizeof(WordEntryPos); + memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); + stroff += arr[i].poslen * sizeof(WordEntryPos); pfree(arr[i].pos); } inarr[i] = arr[i].entry; } + Assert((strbuf + stroff - (char *) in) == totallen); + PG_RETURN_TSVECTOR(in); } @@ -495,11 +505,12 @@ tsvectorrecv(PG_FUNCTION_ARGS) datalen += lex_len; - if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0) + if (i > 0 && WordEntryCMP(&vec->entries[i], + &vec->entries[i - 1], + STRPTR(vec)) <= 0) elog(ERROR, "lexemes are misordered"); /* Receive positions */ - if (npos > 0) { uint16 j; diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index e150f9a267..44b69ac76e 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.5 2007/09/11 08:46:29 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.6 2007/10/23 00:51:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -266,8 +266,14 @@ compareEntry(char *ptra, WordEntry * a, char *ptrb, WordEntry * b) return (a->len > b->len) ? 1 : -1; } +/* + * Add positions from src to dest after offsetting them by maxpos. + * Return the number added (might be less than expected due to overflow) + */ static int4 -add_pos(TSVector src, WordEntry * srcptr, TSVector dest, WordEntry * destptr, int4 maxpos) +add_pos(TSVector src, WordEntry * srcptr, + TSVector dest, WordEntry * destptr, + int4 maxpos) { uint16 *clen = &_POSVECPTR(dest, destptr)->npos; int i; @@ -280,7 +286,10 @@ add_pos(TSVector src, WordEntry * srcptr, TSVector dest, WordEntry * destptr, in *clen = 0; startlen = *clen; - for (i = 0; i < slen && *clen < MAXNUMPOS && (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); i++) + for (i = 0; + i < slen && *clen < MAXNUMPOS && + (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); + i++) { WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); @@ -307,8 +316,8 @@ tsvector_concat(PG_FUNCTION_ARGS) i, j, i1, - i2; - char *cur; + i2, + dataoff; char *data, *data1, *data2; @@ -336,11 +345,13 @@ tsvector_concat(PG_FUNCTION_ARGS) data2 = STRPTR(in2); i1 = in1->size; i2 = in2->size; + /* conservative estimate of space needed */ out = (TSVector) palloc0(VARSIZE(in1) + VARSIZE(in2)); SET_VARSIZE(out, VARSIZE(in1) + VARSIZE(in2)); out->size = in1->size + in2->size; - data = cur = STRPTR(out); ptr = ARRPTR(out); + data = STRPTR(out); + dataoff = 0; while (i1 && i2) { int cmp = compareEntry(data1, ptr1, data2, ptr2); @@ -349,16 +360,15 @@ tsvector_concat(PG_FUNCTION_ARGS) { /* in1 first */ ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; - memcpy(cur, data1 + ptr1->pos, ptr1->len); - ptr->pos = cur - data; + memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); + ptr->pos = dataoff; + dataoff += ptr1->len; if (ptr->haspos) { - cur += SHORTALIGN(ptr1->len); - memcpy(cur, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); - cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + dataoff = SHORTALIGN(dataoff); + memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } - else - cur += ptr1->len; ptr++; ptr1++; @@ -368,21 +378,21 @@ tsvector_concat(PG_FUNCTION_ARGS) { /* in2 first */ ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; - memcpy(cur, data2 + ptr2->pos, ptr2->len); - ptr->pos = cur - data; + memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); + ptr->pos = dataoff; + dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); - cur += SHORTALIGN(ptr2->len); - if (addlen == 0) ptr->haspos = 0; else - cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + { + dataoff = SHORTALIGN(dataoff); + dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } } - else - cur += ptr2->len; ptr++; ptr2++; @@ -392,30 +402,32 @@ tsvector_concat(PG_FUNCTION_ARGS) { ptr->haspos = ptr1->haspos | ptr2->haspos; ptr->len = ptr1->len; - memcpy(cur, data1 + ptr1->pos, ptr1->len); - ptr->pos = cur - data; + memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); + ptr->pos = dataoff; + dataoff += ptr1->len; if (ptr->haspos) { - cur += SHORTALIGN(ptr1->len); if (ptr1->haspos) { - memcpy(cur, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); - cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + dataoff = SHORTALIGN(dataoff); + memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); if (ptr2->haspos) - cur += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); + dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); } - else if (ptr2->haspos) + else /* must have ptr2->haspos */ { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else - cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + { + dataoff = SHORTALIGN(dataoff); + dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } } } - else - cur += ptr1->len; ptr++; ptr1++; @@ -429,16 +441,15 @@ tsvector_concat(PG_FUNCTION_ARGS) { ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; - memcpy(cur, data1 + ptr1->pos, ptr1->len); - ptr->pos = cur - data; + memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); + ptr->pos = dataoff; + dataoff += ptr1->len; if (ptr->haspos) { - cur += SHORTALIGN(ptr1->len); - memcpy(cur, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); - cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + dataoff = SHORTALIGN(dataoff); + memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } - else - cur += ptr1->len; ptr++; ptr1++; @@ -449,31 +460,40 @@ tsvector_concat(PG_FUNCTION_ARGS) { ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; - memcpy(cur, data2 + ptr2->pos, ptr2->len); - ptr->pos = cur - data; + memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); + ptr->pos = dataoff; + dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); - cur += SHORTALIGN(ptr2->len); - if (addlen == 0) ptr->haspos = 0; else - cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + { + dataoff = SHORTALIGN(dataoff); + dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } } - else - cur += ptr2->len; ptr++; ptr2++; i2--; } + /* + * Instead of checking each offset individually, we check for overflow + * of pos fields once at the end. + */ + if (dataoff > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); + out->size = ptr - ARRPTR(out); - SET_VARSIZE(out, CALCDATASIZE(out->size, cur - data)); + SET_VARSIZE(out, CALCDATASIZE(out->size, dataoff)); if (data != STRPTR(out)) - memmove(STRPTR(out), data, cur - data); + memmove(STRPTR(out), data, dataoff); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1);