The fti.pl supplied with the fulltextindex module generates ALL possible
substrings of two characters or greater, and is case-sensitive. This patch makes it work correctly. It generates only the suffixes of each word and lowercases them, as specified by the README file. This brings it into line with the fti.c function, makes it properly case-insensitive, removes the problem with duplicate rows being returned from an fti search, and greatly reduces the size of the generated index table. It was written by my co-worker, Brett Toolin. Christopher Kings-Lynne
This commit is contained in:
parent
8c6761acc7
commit
938236a297
@ -1,6 +1,6 @@
|
||||
#!/usr/bin/perl
|
||||
#
|
||||
# This script extracts all substrings out of a specific column in a table
|
||||
# This script extracts all suffixes of all words in a specific column in a table
|
||||
# and generates output that can be loaded into a new table with the
|
||||
# psql '\copy' command. The new table should have the following structure:
|
||||
#
|
||||
@ -52,27 +52,28 @@ $PGRES_BAD_RESPONSE = 5 ;
|
||||
$PGRES_NONFATAL_ERROR = 6 ;
|
||||
$PGRES_FATAL_ERROR = 7 ;
|
||||
|
||||
# the minimum length of word to include in the full text index
|
||||
$MIN_WORD_LENGTH = 2;
|
||||
|
||||
# the minimum length of the substrings in the full text index
|
||||
$MIN_SUBSTRING_LENGTH = 2;
|
||||
|
||||
$[ = 0; # make sure string offsets start at 0
|
||||
|
||||
sub break_up {
|
||||
my $string = pop @_;
|
||||
|
||||
# convert strings to lower case
|
||||
$string = lc($string);
|
||||
@strings = split(/\W+/, $string);
|
||||
@subs = ();
|
||||
|
||||
foreach $s (@strings) {
|
||||
$len = length($s);
|
||||
next if ($len < 4);
|
||||
|
||||
$lpos = $len-1;
|
||||
while ($lpos >= 3) {
|
||||
$fpos = $lpos - 3;
|
||||
while ($fpos >= 0) {
|
||||
$sub = substr($s, $fpos, $lpos - $fpos + 1);
|
||||
push(@subs, $sub);
|
||||
$fpos = $fpos - 1;
|
||||
}
|
||||
$lpos = $lpos - 1;
|
||||
next if ($len <= $MIN_WORD_LENGTH);
|
||||
for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) {
|
||||
$tmp = substr($s, $i);
|
||||
push(@subs, $tmp);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user