Fix assorted bugs in contrib/unaccent's configuration file parsing.
Make it use t_isspace() to identify whitespace, rather than relying on sscanf, which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues was reported by J Smith, though the fix applied here is not exactly either of his proposed patches.
This commit is contained in:
parent
ffc703a891
commit
ced3a93ccb
@ -91,35 +91,83 @@ initSuffixTree(char *filename)
|
|||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
char src[4096];
|
/*
|
||||||
char trg[4096];
|
* pg_do_encoding_conversion() (called by tsearch_readline()) will
|
||||||
int srclen;
|
* emit exception if it finds untranslatable characters in current
|
||||||
int trglen;
|
* locale. We just skip such lines, continuing with the next.
|
||||||
char *line = NULL;
|
*/
|
||||||
|
|
||||||
skip = true;
|
skip = true;
|
||||||
|
|
||||||
PG_TRY();
|
PG_TRY();
|
||||||
{
|
{
|
||||||
/*
|
char *line;
|
||||||
* pg_do_encoding_conversion() (called by tsearch_readline()) will
|
|
||||||
* emit exception if it finds untranslatable characters in current
|
|
||||||
* locale. We just skip such characters.
|
|
||||||
*/
|
|
||||||
while ((line = tsearch_readline(&trst)) != NULL)
|
while ((line = tsearch_readline(&trst)) != NULL)
|
||||||
{
|
{
|
||||||
if (sscanf(line, "%s\t%s\n", src, trg) != 2)
|
/*
|
||||||
continue;
|
* The format of each line must be "src trg" where src and trg
|
||||||
|
* are sequences of one or more non-whitespace characters,
|
||||||
|
* separated by whitespace. Whitespace at start or end of
|
||||||
|
* line is ignored.
|
||||||
|
*/
|
||||||
|
int state;
|
||||||
|
char *ptr;
|
||||||
|
char *src = NULL;
|
||||||
|
char *trg = NULL;
|
||||||
|
int ptrlen;
|
||||||
|
int srclen = 0;
|
||||||
|
int trglen = 0;
|
||||||
|
|
||||||
srclen = strlen(src);
|
state = 0;
|
||||||
trglen = strlen(trg);
|
for (ptr = line; *ptr; ptr += ptrlen)
|
||||||
|
{
|
||||||
|
ptrlen = pg_mblen(ptr);
|
||||||
|
/* ignore whitespace, but end src or trg */
|
||||||
|
if (t_isspace(ptr))
|
||||||
|
{
|
||||||
|
if (state == 1)
|
||||||
|
state = 2;
|
||||||
|
else if (state == 3)
|
||||||
|
state = 4;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
switch (state)
|
||||||
|
{
|
||||||
|
case 0:
|
||||||
|
/* start of src */
|
||||||
|
src = ptr;
|
||||||
|
srclen = ptrlen;
|
||||||
|
state = 1;
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
/* continue src */
|
||||||
|
srclen += ptrlen;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
/* start of trg */
|
||||||
|
trg = ptr;
|
||||||
|
trglen = ptrlen;
|
||||||
|
state = 3;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
/* continue trg */
|
||||||
|
trglen += ptrlen;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
/* bogus line format */
|
||||||
|
state = -1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state >= 3)
|
||||||
|
rootSuffixTree = placeChar(rootSuffixTree,
|
||||||
|
(unsigned char *) src, srclen,
|
||||||
|
trg, trglen);
|
||||||
|
|
||||||
rootSuffixTree = placeChar(rootSuffixTree,
|
|
||||||
(unsigned char *) src, srclen,
|
|
||||||
trg, trglen);
|
|
||||||
skip = false;
|
|
||||||
pfree(line);
|
pfree(line);
|
||||||
}
|
}
|
||||||
|
skip = false;
|
||||||
}
|
}
|
||||||
PG_CATCH();
|
PG_CATCH();
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user