postgres/contrib/fulltextindex/fti.pl

211 lines
4.5 KiB
Perl

#!/usr/bin/perl
#
# This script substracts all suffixes of all words in a specific column in a table
# and generates output that can be loaded into a new table with the
# psql '\copy' command. The new table should have the following structure:
#
# create table tab (
# string text,
# id oid
# );
#
# Note that you cannot use 'copy' (the SQL-command) directly, because
# there's no '\.' included at the end of the output.
#
# The output can be fed through the UNIX commands 'uniq' and 'sort'
# to generate the smallest and sorted output to populate the fti-table.
#
# Example:
#
# fti.pl -u -d mydb -t mytable -c mycolumn,mycolumn2 -f myfile
# sort -o myoutfile myfile
# uniq myoutfile sorted-file
#
# psql -u mydb
#
# \copy my_fti_table from myfile
#
# create index fti_idx on my_fti_table (string,id);
#
# create function fti() returns trigger as
# '/path/to/fti/file/fti.so'
# language 'C';
#
# create trigger my_fti_trigger after update or insert or delete
# on mytable
# for each row execute procedure fti(my_fti_table, mycolumn);
#
# Make sure you have an index on mytable(oid) to be able to do somewhat
# efficient substring searches.
#use lib '/usr/local/pgsql/lib/perl5/';
use lib '/mnt/web/guide/postgres/lib/perl5/site_perl';
use Pg;
use Getopt::Std;
$PGRES_EMPTY_QUERY = 0 ;
$PGRES_COMMAND_OK = 1 ;
$PGRES_TUPLES_OK = 2 ;
$PGRES_COPY_OUT = 3 ;
$PGRES_COPY_IN = 4 ;
$PGRES_BAD_RESPONSE = 5 ;
$PGRES_NONFATAL_ERROR = 6 ;
$PGRES_FATAL_ERROR = 7 ;
# the minimum length of word to include in the full text index
$MIN_WORD_LENGTH = 2;
# the minimum length of the substrings in the full text index
$MIN_SUBSTRING_LENGTH = 2;
$[ = 0; # make sure string offsets start at 0
sub break_up {
my $string = pop @_;
# convert strings to lower case
$string = lc($string);
@strings = split(/\W+/, $string);
@subs = ();
foreach $s (@strings) {
$len = length($s);
next if ($len <= $MIN_WORD_LENGTH);
for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) {
$tmp = substr($s, $i);
push(@subs, $tmp);
}
}
return @subs;
}
sub connect_db {
my $dbname = shift @_;
my $user = shift @_;
my $passwd = shift @_;
if (!defined($dbname) || $dbname eq "") {
return 1;
}
$connect_string = "dbname=$dbname";
if ($user ne "") {
if ($passwd eq "") {
return 0;
}
$connect_string = "$connect_string user=$user password=$passwd ".
"authtype=password";
}
$PG_CONN = PQconnectdb($connect_string);
if (PQstatus($PG_CONN)) {
print STDERR "Couldn't make connection with database!\n";
print STDERR PQerrorMessage($PG_CONN), "\n";
return 0;
}
return 1;
}
sub quit_prog {
close(OUT);
unlink $opt_f;
if (defined($PG_CONN)) {
PQfinish($PG_CONN);
}
exit 1;
}
sub get_username {
print "Username: ";
chop($n = <STDIN>);
return $n;;
}
sub get_password {
print "Password: ";
system("stty -echo < /dev/tty");
chop($pwd = <STDIN>);
print "\n";
system("stty echo < /dev/tty");
return $pwd;
}
sub main {
getopts('d:t:c:f:u');
if (!$opt_d || !$opt_t || !$opt_c || !$opt_f) {
print STDERR "usage: $0 [-u] -d database -t table -c column[,column...] ".
"-f output-file\n";
return 1;
}
@cols = split(/,/, $opt_c);
if (defined($opt_u)) {
$uname = get_username();
$pwd = get_password();
} else {
$uname = "";
$pwd = "";
}
$SIG{'INT'} = 'quit_prog';
if (!connect_db($opt_d, $uname, $pwd)) {
print STDERR "Connecting to database failed!\n";
return 1;
}
if (!open(OUT, ">$opt_f")) {
print STDERR "Couldnt' open file '$opt_f' for output!\n";
return 1;
}
PQexec($PG_CONN, "SET search_path = public");
PQexec($PG_CONN, "begin");
$query = "declare C cursor for select (\"";
$query .= join("\" || ' ' || \"", @cols);
$query .= "\") as string, oid from $opt_t";
$res = PQexec($PG_CONN, $query);
if (!$res || (PQresultStatus($res) != $PGRES_COMMAND_OK)) {
print STDERR "Error declaring cursor!\n";
print STDERR PQerrorMessage($PG_CONN), "\n";
PQfinish($PG_CONN);
return 1;
}
PQclear($res);
$query = "fetch in C";
while (($res = PQexec($PG_CONN, $query)) &&
(PQresultStatus($res) == $PGRES_TUPLES_OK) &&
(PQntuples($res) == 1)) {
$col = PQgetvalue($res, 0, 0);
$oid = PQgetvalue($res, 0, 1);
@subs = break_up($col);
foreach $i (@subs) {
print OUT "$i\t$oid\n";
}
}
if (!$res || (PQresultStatus($res) != PGRES_TUPLES_OK)) {
print STDERR "Error retrieving data from backend!\n";
print STDERR PQerrorMEssage($PG_CONN), "\n";
PQfinish($PG_CONN);
return 1;
}
PQclear($res);
PQfinish($PG_CONN);
return 0;
}
exit main();