Fix assorted bugs in contrib/unaccent's configuration file parsing.

Make it use t_isspace() to identify whitespace, rather than relying on
sscanf, which is known to get it wrong on some platform/locale combinations.
Get rid of fixed-size buffers.  Make it actually continue to parse the file
after ignoring a line with untranslatable characters, as was obviously
intended.

The first of these issues is per a gripe from J Smith, though the fix is not
exactly either of his proposed patches.
This commit is contained in:
Tom Lane 2011-11-07 11:48:53 -05:00
parent ffc703a891
commit ced3a93ccb
1 changed file with 67 additions and 19 deletions

View File

@@ -91,35 +91,83 @@ initSuffixTree(char *filename)
do
{
char src[4096];
char trg[4096];
int srclen;
int trglen;
char *line = NULL;
/*
* pg_do_encoding_conversion() (called by tsearch_readline()) will
* emit exception if it finds untranslatable characters in current
* locale. We just skip such lines, continuing with the next.
*/
skip = true;
PG_TRY();
{
/*
* pg_do_encoding_conversion() (called by tsearch_readline()) will
* emit exception if it finds untranslatable characters in current
* locale. We just skip such characters.
*/
char *line;
while ((line = tsearch_readline(&trst)) != NULL)
{
if (sscanf(line, "%s\t%s\n", src, trg) != 2)
/*
* The format of each line must be "src trg" where src and trg
* are sequences of one or more non-whitespace characters,
* separated by whitespace. Whitespace at start or end of
* line is ignored.
*/
int state;
char *ptr;
char *src = NULL;
char *trg = NULL;
int ptrlen;
int srclen = 0;
int trglen = 0;
state = 0;
for (ptr = line; *ptr; ptr += ptrlen)
{
ptrlen = pg_mblen(ptr);
/* ignore whitespace, but end src or trg */
if (t_isspace(ptr))
{
if (state == 1)
state = 2;
else if (state == 3)
state = 4;
continue;
}
switch (state)
{
case 0:
/* start of src */
src = ptr;
srclen = ptrlen;
state = 1;
break;
case 1:
/* continue src */
srclen += ptrlen;
break;
case 2:
/* start of trg */
trg = ptr;
trglen = ptrlen;
state = 3;
break;
case 3:
/* continue trg */
trglen += ptrlen;
break;
default:
/* bogus line format */
state = -1;
break;
}
}
srclen = strlen(src);
trglen = strlen(trg);
if (state >= 3)
rootSuffixTree = placeChar(rootSuffixTree,
(unsigned char *) src, srclen,
trg, trglen);
skip = false;
pfree(line);
}
skip = false;
}
PG_CATCH();
{