NetBSD/usr.sbin/makemandb/apropos-utils.c

1150 lines
30 KiB
C
Raw Normal View History

/* $NetBSD: apropos-utils.c,v 1.40 2017/11/25 14:29:38 abhinav Exp $ */
/*-
* Copyright (c) 2011 Abhinav Upadhyay <er.abhinav.upadhyay@gmail.com>
* All rights reserved.
*
* This code was developed as part of Google's Summer of Code 2011 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: apropos-utils.c,v 1.40 2017/11/25 14:29:38 abhinav Exp $");
#include <sys/queue.h>
#include <sys/stat.h>
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <util.h>
#include <zlib.h>
#include <term.h>
#include <unistd.h>
#undef tab // XXX: manconf.h
#include "apropos-utils.h"
Add a custom tokenizer which does not stem certain keywords. Which keywords should not be stemmed is specified in the nostem.txt file. (Right now I have taken all the man page names, split them if they had underscores, removed common English words and converted everything to lowercase.) The tokenizer itself is based on the Porter stemming tokenizer shipped with Sqlite. The code in custom_apropos_tokenizer.c is copy of that code with some modifications to prevent stemming keywords specified in nostem.txt. Additionally, it now uses underscore `_' also as a token delimiter. Therefore, now it's possible to do query for `lwp' and all `_lwp_*' man page names will be matched. Or the query can be `unconst' and `__UNCONST' will be matched. This was not possible earlier, because underscore was not a delimiter and therefore the index would have __UNCONST as a key rather than UNCONST. The tokenizer needs fts3_tokenizer.h file, which is not shipped with the amalgamation build of Sqlite, therefore it needs to be added here (unless we decide there is a better place for it). To enforce using the new tokenizer, a schema version bump is needed Since the tokenization is done both at the indexing time (via makemandb) and also while query time (via apropos or whatis), it will be needed to bump the schema version everytime nostem.txt is modified. Otherwise the index will consist of old tokens and desired changes will not be seen with apropos. This should also fix the issue reported in PR bin/46255. Similar suggestion was also made on tech-userlevel@ recently: <http://mail-index.netbsd.org/tech-userlevel/2017/06/08/msg010620.html> Thanks to christos@ for multiple rounds of reviews of the tokenizer code.
2017-06-18 19:24:10 +03:00
#include "custom_apropos_tokenizer.h"
#include "manconf.h"
Add a custom tokenizer which does not stem certain keywords. Which keywords should not be stemmed is specified in the nostem.txt file. (Right now I have taken all the man page names, split them if they had underscores, removed common English words and converted everything to lowercase.) The tokenizer itself is based on the Porter stemming tokenizer shipped with Sqlite. The code in custom_apropos_tokenizer.c is copy of that code with some modifications to prevent stemming keywords specified in nostem.txt. Additionally, it now uses underscore `_' also as a token delimiter. Therefore, now it's possible to do query for `lwp' and all `_lwp_*' man page names will be matched. Or the query can be `unconst' and `__UNCONST' will be matched. This was not possible earlier, because underscore was not a delimiter and therefore the index would have __UNCONST as a key rather than UNCONST. The tokenizer needs fts3_tokenizer.h file, which is not shipped with the amalgamation build of Sqlite, therefore it needs to be added here (unless we decide there is a better place for it). To enforce using the new tokenizer, a schema version bump is needed Since the tokenization is done both at the indexing time (via makemandb) and also while query time (via apropos or whatis), it will be needed to bump the schema version everytime nostem.txt is modified. Otherwise the index will consist of old tokens and desired changes will not be seen with apropos. This should also fix the issue reported in PR bin/46255. Similar suggestion was also made on tech-userlevel@ recently: <http://mail-index.netbsd.org/tech-userlevel/2017/06/08/msg010620.html> Thanks to christos@ for multiple rounds of reviews of the tokenizer code.
2017-06-18 19:24:10 +03:00
#include "fts3_tokenizer.h"
/*
 * Pairs a callback taking query_callback_args with an opaque data pointer.
 */
struct orig_callback_data {
	void *data;				/* opaque pointer kept with the callback */
	int (*callback)(query_callback_args *);	/* function to invoke */
};
typedef struct orig_callback_data orig_callback_data;
/*
 * Accumulator for the inverse document frequency used by rank_func().
 */
struct inverse_document_frequency {
	double value;	/* accumulated idf; only added to while status == 0 */
	int status;	/* set to 1 by rank_func() once value has been computed */
};
typedef struct inverse_document_frequency inverse_document_frequency;
/*
 * Per-column ranking weights. rank_func() looks a weight up as
 * col_weights[icol - 1], where icol starts at 1.
 */
static const double col_weights[] = {
	[0]  = 2.0,	/* NAME */
	[1]  = 2.0,	/* Name-description */
	[2]  = 0.55,	/* DESCRIPTION */
	[3]  = 0.10,	/* LIBRARY */
	[4]  = 0.001,	/* RETURN VALUES */
	[5]  = 0.20,	/* ENVIRONMENT */
	[6]  = 0.01,	/* FILES */
	[7]  = 0.001,	/* EXIT STATUS */
	[8]  = 2.0,	/* DIAGNOSTICS */
	[9]  = 0.05,	/* ERRORS */
	[10] = 0.0,	/* md5_hash */
	[11] = 1.0	/* machine */
};
#ifndef APROPOS_DEBUG
Add a custom tokenizer which does not stem certain keywords. Which keywords should not be stemmed is specified in the nostem.txt file. (Right now I have taken all the man page names, split them if they had underscores, removed common English words and converted everything to lowercase.) The tokenizer itself is based on the Porter stemming tokenizer shipped with Sqlite. The code in custom_apropos_tokenizer.c is copy of that code with some modifications to prevent stemming keywords specified in nostem.txt. Additionally, it now uses underscore `_' also as a token delimiter. Therefore, now it's possible to do query for `lwp' and all `_lwp_*' man page names will be matched. Or the query can be `unconst' and `__UNCONST' will be matched. This was not possible earlier, because underscore was not a delimiter and therefore the index would have __UNCONST as a key rather than UNCONST. The tokenizer needs fts3_tokenizer.h file, which is not shipped with the amalgamation build of Sqlite, therefore it needs to be added here (unless we decide there is a better place for it). To enforce using the new tokenizer, a schema version bump is needed Since the tokenization is done both at the indexing time (via makemandb) and also while query time (via apropos or whatis), it will be needed to bump the schema version everytime nostem.txt is modified. Otherwise the index will consist of old tokens and desired changes will not be seen with apropos. This should also fix the issue reported in PR bin/46255. Similar suggestion was also made on tech-userlevel@ recently: <http://mail-index.netbsd.org/tech-userlevel/2017/06/08/msg010620.html> Thanks to christos@ for multiple rounds of reviews of the tokenizer code.
2017-06-18 19:24:10 +03:00
/*
 * register_tokenizer --
 *  Registers the tokenizer module obtained from
 *  get_custom_apropos_tokenizer() with the connection db under the name
 *  "custom_apropos_tokenizer", by evaluating "SELECT fts3_tokenizer(?, ?)"
 *  with the name and a pointer to the module bound as parameters.
 *
 *  Returns SQLITE_OK on success or an SQLite error code on failure.
 */
static int
register_tokenizer(sqlite3 *db)
{
	static const char name[] = "custom_apropos_tokenizer";
	static const char sql[] = "SELECT fts3_tokenizer(?, ?)";
	const sqlite3_tokenizer_module *p;
	sqlite3_stmt *stmt = NULL;
	int rc;

	get_custom_apropos_tokenizer(&p);

	/*
	 * sqlite3_db_config() is variadic and reads an int followed by an
	 * int * for this option; pass a real null pointer rather than a
	 * bare 0 so the argument has the width the callee expects.
	 */
	sqlite3_db_config(db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1,
	    (int *)NULL);

	rc = sqlite3_prepare_v2(db, sql, -1, &stmt, 0);
	if (rc != SQLITE_OK)
		return rc;

	rc = sqlite3_bind_text(stmt, 1, name, -1, SQLITE_STATIC);
	if (rc == SQLITE_OK)
		rc = sqlite3_bind_blob(stmt, 2, &p, sizeof(p), SQLITE_STATIC);
	if (rc != SQLITE_OK) {
		/* Do not run the statement with unbound parameters. */
		(void)sqlite3_finalize(stmt);
		return rc;
	}

	/* Any error raised by the step is reported by sqlite3_finalize(). */
	(void)sqlite3_step(stmt);
	return sqlite3_finalize(stmt);
}
#endif
Add a custom tokenizer which does not stem certain keywords. Which keywords should not be stemmed is specified in the nostem.txt file. (Right now I have taken all the man page names, split them if they had underscores, removed common English words and converted everything to lowercase.) The tokenizer itself is based on the Porter stemming tokenizer shipped with Sqlite. The code in custom_apropos_tokenizer.c is copy of that code with some modifications to prevent stemming keywords specified in nostem.txt. Additionally, it now uses underscore `_' also as a token delimiter. Therefore, now it's possible to do query for `lwp' and all `_lwp_*' man page names will be matched. Or the query can be `unconst' and `__UNCONST' will be matched. This was not possible earlier, because underscore was not a delimiter and therefore the index would have __UNCONST as a key rather than UNCONST. The tokenizer needs fts3_tokenizer.h file, which is not shipped with the amalgamation build of Sqlite, therefore it needs to be added here (unless we decide there is a better place for it). To enforce using the new tokenizer, a schema version bump is needed Since the tokenization is done both at the indexing time (via makemandb) and also while query time (via apropos or whatis), it will be needed to bump the schema version everytime nostem.txt is modified. Otherwise the index will consist of old tokens and desired changes will not be seen with apropos. This should also fix the issue reported in PR bin/46255. Similar suggestion was also made on tech-userlevel@ recently: <http://mail-index.netbsd.org/tech-userlevel/2017/06/08/msg010620.html> Thanks to christos@ for multiple rounds of reviews of the tokenizer code.
2017-06-18 19:24:10 +03:00
/*
 * lower --
 *  Converts the string str to lower case, in place.
 *  str must not be NULL (checked with assert). Returns str.
 */
char *
lower(char *str)
{
	assert(str);

	/*
	 * tolower() is only defined for values representable as
	 * unsigned char (or EOF), hence the cast.
	 */
	for (char *p = str; *p != '\0'; p++)
		*p = tolower((unsigned char)*p);
	return str;
}
/*
 * concat--
 *  Utility function. Concatenates together: dst, a space character and src.
 *  dst + " " + src
 *
 *  src must be a NUL-terminated string and must not be NULL. The work is
 *  done by concat2() with the full length of src.
 */
void
concat(char **dst, const char *src)
{
	/* Check here: strlen() would dereference src before concat2() asserts. */
	assert(src != NULL);
	concat2(dst, src, strlen(src));
}
/*
 * concat2 --
 *  Appends a space character followed by the first srclen bytes of src to
 *  the string *dst, growing *dst with erealloc() and storing the possibly
 *  moved pointer back into *dst.
 *
 *  If *dst is NULL, it is instead set to a fresh copy of the first srclen
 *  bytes of src (no leading space is added).
 *
 *  src must not be NULL.
 */
void
concat2(char **dst, const char *src, size_t srclen)
{
	size_t totallen, dstlen;
	char *mydst = *dst;

	assert(src != NULL);

	/*
	 * If destination buffer dst is NULL, then simply
	 * strdup the source buffer
	 */
	if (mydst == NULL) {
		*dst = estrndup(src, srclen);
		return;
	}

	dstlen = strlen(mydst);

	/* Room for the old contents, the new bytes, a separator and a NUL */
	totallen = dstlen + srclen + 2;
	mydst = erealloc(mydst, totallen);

	/* Append a space at the end of dst */
	mydst[dstlen++] = ' ';

	/* Now, copy src at the end of dst */
	memcpy(mydst + dstlen, src, srclen);
	mydst[dstlen + srclen] = '\0';
	*dst = mydst;
}
/*
 * close_db --
 *  Closes the database connection db and then shuts the SQLite library
 *  down. The return values of both calls are ignored.
 */
void
close_db(sqlite3 *db)
{
	(void)sqlite3_close(db);
	(void)sqlite3_shutdown();
}
/*
 * create_db --
 *  Creates the database schema: sets the journal mode, records
 *  APROPOS_SCHEMA_VERSION in PRAGMA user_version, and creates the mandb
 *  FTS4 table, the mandb_meta and mandb_links tables and their indexes.
 *
 *  Returns 0 on success. On failure a warning is printed, the connection
 *  db is closed, the SQLite library is shut down and -1 is returned; the
 *  caller must not use db afterwards.
 */
static int
create_db(sqlite3 *db)
{
	const char *sqlstr = NULL;
	char *schemasql;
	char *errmsg = NULL;

/*------------------------ Create the tables------------------------------*/

#if NOTYET
	sqlite3_exec(db, "PRAGMA journal_mode = WAL", NULL, NULL, NULL);
#else
	sqlite3_exec(db, "PRAGMA journal_mode = DELETE", NULL, NULL, NULL);
#endif

	schemasql = sqlite3_mprintf("PRAGMA user_version = %d",
	    APROPOS_SCHEMA_VERSION);
	if (schemasql == NULL) {
		warnx("Unable to allocate memory for the schema version query");
		goto fail;
	}
	sqlite3_exec(db, schemasql, NULL, NULL, &errmsg);
	/* Release the query text on both the success and the error path. */
	sqlite3_free(schemasql);
	if (errmsg != NULL)
		goto out;

	sqlstr =
	    //mandb
	    "CREATE VIRTUAL TABLE mandb USING fts4(section, name, "
	    "name_desc, desc, lib, return_vals, env, files, "
	    "exit_status, diagnostics, errors, md5_hash UNIQUE, machine, "
#ifndef APROPOS_DEBUG
	    "compress=zip, uncompress=unzip, tokenize=custom_apropos_tokenizer, "
#else
	    "tokenize=porter, "
#endif
	    "notindexed=section, notindexed=md5_hash); "
	    //mandb_meta
	    "CREATE TABLE IF NOT EXISTS mandb_meta(device, inode, mtime, "
	    "file UNIQUE, md5_hash UNIQUE, id INTEGER PRIMARY KEY); "
	    //mandb_links
	    "CREATE TABLE IF NOT EXISTS mandb_links(link COLLATE NOCASE, target, section, "
	    "machine, md5_hash); ";

	sqlite3_exec(db, sqlstr, NULL, NULL, &errmsg);
	if (errmsg != NULL)
		goto out;

	sqlstr =
	    "CREATE INDEX IF NOT EXISTS index_mandb_links ON mandb_links "
	    "(link); "
	    "CREATE INDEX IF NOT EXISTS index_mandb_meta_dev ON mandb_meta "
	    "(device, inode); "
	    "CREATE INDEX IF NOT EXISTS index_mandb_links_md5 ON mandb_links "
	    "(md5_hash);";
	sqlite3_exec(db, sqlstr, NULL, NULL, &errmsg);
	if (errmsg != NULL)
		goto out;
	return 0;

out:
	warnx("%s", errmsg);
	/* errmsg comes from sqlite3_exec() and must go back to sqlite3_free() */
	sqlite3_free(errmsg);
fail:
	sqlite3_close(db);
	sqlite3_shutdown();
	return -1;
}
/*
 * zip --
 *  User defined Sqlite function to compress the FTS table.
 *  Takes exactly one argument and returns its zlib-compressed form as a
 *  blob. If compression fails, an error result is set instead.
 */
static void
zip(sqlite3_context *pctx, int nval, sqlite3_value **apval)
{
	int nin;
	unsigned long nout;	/* same type compress() writes through */
	const unsigned char *inbuf;
	unsigned char *outbuf;

	assert(nval == 1);

	/* Fetch the data pointer first, then its length. */
	inbuf = (const unsigned char *) sqlite3_value_blob(apval[0]);
	nin = sqlite3_value_bytes(apval[0]);

	/* Worst-case size of the compressed output for nin input bytes */
	nout = (unsigned long)nin + 13 + ((unsigned long)nin + 999) / 1000;
	outbuf = emalloc(nout);

	if (compress(outbuf, &nout, inbuf, (unsigned long)nin) != Z_OK) {
		/* Never hand an uninitialized buffer back to SQLite. */
		free(outbuf);
		sqlite3_result_error(pctx, "zip: compression failed", -1);
		return;
	}

	/* SQLite takes ownership of outbuf and releases it with free() */
	sqlite3_result_blob(pctx, outbuf, (int)nout, free);
}
/*
 * unzip --
 *  User defined Sqlite function to uncompress the FTS table.
 *  Takes exactly one argument (zlib-compressed data) and returns the
 *  inflated bytes as text. If the data cannot be inflated, no result is
 *  set.
 */
static void
unzip(sqlite3_context *pctx, int nval, sqlite3_value **apval)
{
	int rc;		/* zlib status codes are signed */
	unsigned char *outbuf;
	z_stream stream;

	assert(nval == 1);
	stream.next_in = __UNCONST(sqlite3_value_blob(apval[0]));
	stream.avail_in = sqlite3_value_bytes(apval[0]);
	/* Initial guess for the output size; grown below when it runs out */
	stream.avail_out = stream.avail_in * 2 + 100;
	stream.next_out = outbuf = emalloc(stream.avail_out);
	stream.zalloc = NULL;
	stream.zfree = NULL;
	stream.opaque = NULL;

	if (inflateInit(&stream) != Z_OK) {
		free(outbuf);
		return;
	}

	while ((rc = inflate(&stream, Z_SYNC_FLUSH)) != Z_STREAM_END) {
		/*
		 * Give up on a zlib error, or when the input is exhausted
		 * with output space left over but the stream has not ended.
		 */
		if (rc != Z_OK ||
		    (stream.avail_out != 0 && stream.avail_in == 0)) {
			/* Release the inflate state before bailing out */
			(void)inflateEnd(&stream);
			free(outbuf);
			return;
		}
		/* Double the buffer and continue right after the data so far */
		outbuf = erealloc(outbuf, stream.total_out * 2);
		stream.next_out = outbuf + stream.total_out;
		stream.avail_out = stream.total_out;
	}

	if (inflateEnd(&stream) != Z_OK) {
		free(outbuf);
		return;
	}

	/* Trim to the final size; SQLite releases the buffer with free() */
	outbuf = erealloc(outbuf, stream.total_out);
	sqlite3_result_text(pctx, (const char *)outbuf, stream.total_out, free);
}
/*
 * get_dbpath --
 *  Read the path of the database from man.conf and return.
 *
 *  Loads the configuration file manconf and looks up the "_mandb" tag.
 *  Returns the string of the last entry in that tag's entry list, or NULL
 *  if the tag is missing or has no entries. The returned pointer is the
 *  entry's own string, not a copy.
 */
char *
get_dbpath(const char *manconf)
{
	TAG *tp;

	config(manconf);
	tp = gettag("_mandb", 1);
	if (tp == NULL || TAILQ_EMPTY(&tp->entrylist))
		return NULL;

	return TAILQ_LAST(&tp->entrylist, tqh)->s;
}
/* init_db --
 *   Prepare the database. Register the custom tokenizer (unless built with
 *   APROPOS_DEBUG) and the zip/unzip functions used for FTS compression.
 *
 *   db_flag specifies the mode in which to open the database and is passed
 *   to sqlite3_open_v2() as the open flags:
 *   	1. MANDB_CREATE: It will try to create the db and its schema if it
 *   	   does not exist already. Read and write access is required if the
 *   	   db exists.
 *   	2. MANDB_WRITE: Read and write access is required. An error if the
 *   	   db does not exist.
 *   	3. Any other value: only read access is required. An error if the
 *   	   db does not exist.
 *
 *   manconf is the configuration file from which the database path is read.
 *   The program exits if it contains no _mandb entry.
 *
 *  RETURN VALUES:
 *		The function will return NULL in case the db does not exist
 *		and MANDB_CREATE was not specified. And in case MANDB_CREATE
 *		was specified and yet NULL is returned, then there was some
 *		other error (a warning is printed in every case).
 *  	In normal cases the function should return a handle to the db.
 */
sqlite3 *
init_db(mandb_access_mode db_flag, const char *manconf)
{
	sqlite3 *db = NULL;
	sqlite3_stmt *stmt;
	struct stat sb;
	int rc;
	int create_db_flag = 0;

	char *dbpath = get_dbpath(manconf);
	if (dbpath == NULL)
		errx(EXIT_FAILURE, "_mandb entry not found in man.conf");

	if (!(stat(dbpath, &sb) == 0 && S_ISREG(sb.st_mode))) {
		/* Database does not exist, check if DB_CREATE was specified,
		 * and set flag to create the database schema
		 */
		if (db_flag != (MANDB_CREATE)) {
			warnx("Missing apropos database. "
			    "Please run makemandb to create it.");
			return NULL;
		}
		create_db_flag = 1;
	} else {
		/*
		 * Database exists. Check if we have the permissions
		 * to read/write the files
		 */
		int access_mode = R_OK;
		switch (db_flag) {
		case MANDB_CREATE:
		case MANDB_WRITE:
			access_mode |= W_OK;
			break;
		default:
			break;
		}
		if ((access(dbpath, access_mode)) != 0) {
			warnx("Unable to access the database, please check"
			    " permissions for `%s'", dbpath);
			return NULL;
		}
	}

	sqlite3_initialize();
	rc = sqlite3_open_v2(dbpath, &db, db_flag, NULL);
	if (rc != SQLITE_OK) {
		warnx("%s", sqlite3_errmsg(db));
		goto error;
	}

	sqlite3_extended_result_codes(db, 1);

#ifndef APROPOS_DEBUG
	rc = register_tokenizer(db);
	if (rc != SQLITE_OK) {
		warnx("Unable to register custom tokenizer: %s", sqlite3_errmsg(db));
		goto error;
	}
#endif

	if (create_db_flag && create_db(db) < 0) {
		warnx("%s", "Unable to create database schema");
		/*
		 * create_db() closes db and shuts SQLite down itself on
		 * failure; going through the error label would close the
		 * same handle a second time.
		 */
		return NULL;
	}

	/* Refuse to use a database written with a different schema version */
	rc = sqlite3_prepare_v2(db, "PRAGMA user_version", -1, &stmt, NULL);
	if (rc != SQLITE_OK) {
		warnx("Unable to query schema version: %s",
		    sqlite3_errmsg(db));
		goto error;
	}
	if (sqlite3_step(stmt) != SQLITE_ROW) {
		sqlite3_finalize(stmt);
		warnx("Unable to query schema version: %s",
		    sqlite3_errmsg(db));
		goto error;
	}
	if (sqlite3_column_int(stmt, 0) != APROPOS_SCHEMA_VERSION) {
		sqlite3_finalize(stmt);
		warnx("Incorrect schema version found. "
		    "Please run makemandb -f.");
		goto error;
	}
	sqlite3_finalize(stmt);

	/* Register the zip and unzip functions for FTS compression */
	rc = sqlite3_create_function(db, "zip", 1, SQLITE_ANY, NULL, zip,
	    NULL, NULL);
	if (rc != SQLITE_OK) {
		warnx("Unable to register function: compress: %s",
		    sqlite3_errmsg(db));
		goto error;
	}

	rc = sqlite3_create_function(db, "unzip", 1, SQLITE_ANY, NULL,
	    unzip, NULL, NULL);
	if (rc != SQLITE_OK) {
		warnx("Unable to register function: uncompress: %s",
		    sqlite3_errmsg(db));
		goto error;
	}
	return db;

error:
	close_db(db);
	return NULL;
}
/*
 * rank_func --
 *  Sqlite user defined function for ranking the documents.
 *  For each phrase of the query, it computes the tf and idf and adds them over.
 *  It computes the final rank, by multiplying tf and idf together.
 *
 *  Weight of term t for document d = (term frequency of t in d *
 *                                      inverse document frequency of t)
 *
 *  Term Frequency of term t in document d = Number of times t occurs in d /
 *	                                Number of times t appears in all documents
 *
 *  Inverse document frequency of t = log(Total number of documents /
 *										Number of documents in which t occurs)
 *
 *  The single argument is a matchinfo blob of unsigned ints, read here as:
 *  [0] number of phrases, [1] number of columns, [2 + icol] length of
 *  column icol, then three counters per (phrase, column) pair, then the
 *  total number of documents.
 *  NOTE(review): this looks like the FTS matchinfo "pclxn" layout; confirm
 *  against the query that calls this function.
 *
 *  The user data must point to an inverse_document_frequency. Its value is
 *  accumulated only while status is 0, and status is set to 1 afterwards.
 */
static void
rank_func(sqlite3_context *pctx, int nval, sqlite3_value **apval)
{
	inverse_document_frequency *idf = sqlite3_user_data(pctx);
	double tf = 0.0;
	const unsigned int *matchinfo;
	int ncol;
	int nphrase;
	int iphrase;
	int ndoc;
	int doclen = 0;
	const double k = 3.75;
	/* Number of entries in col_weights; columns beyond it have no weight */
	const int nweights =
	    (int)(sizeof(col_weights) / sizeof(col_weights[0]));

	/*
	 * Check that the number of arguments passed to this
	 * function is correct.
	 */
	assert(nval == 1);

	matchinfo = (const unsigned int *) sqlite3_value_blob(apval[0]);
	nphrase = matchinfo[0];
	ncol = matchinfo[1];
	ndoc = matchinfo[2 + 3 * ncol * nphrase + ncol];
	for (iphrase = 0; iphrase < nphrase; iphrase++) {
		int icol;
		const unsigned int *phraseinfo =
		    &matchinfo[2 + ncol + iphrase * ncol * 3];

		/*
		 * Column 0 is skipped. The second bound keeps
		 * col_weights[icol - 1] inside the weights table.
		 */
		for (icol = 1; icol < ncol && icol <= nweights; icol++) {

			/* nhitcount: number of times the current phrase occurs
			 * 	in the current column in the current document.
			 * nglobalhitcount: number of times current phrase
			 *	occurs in the current column in all documents.
			 * ndocshitcount: number of documents in which the
			 *	current phrase occurs in the current column at
			 *	least once.
			 */
			int nhitcount = phraseinfo[3 * icol];
			int nglobalhitcount = phraseinfo[3 * icol + 1];
			int ndocshitcount = phraseinfo[3 * icol + 2];
			doclen = matchinfo[2 + icol];
			double weight = col_weights[icol - 1];
			if (idf->status == 0 && ndocshitcount)
				idf->value +=
				    log(((double)ndoc / ndocshitcount)) * weight;

			/*
			 * Dividing the tf by document length to normalize
			 * the effect of longer documents.
			 */
			if (nglobalhitcount > 0 && nhitcount)
				tf += (((double)nhitcount * weight)
				    / (nglobalhitcount * doclen));
		}
	}
	idf->status = 1;

	/*
	 * Final score: Dividing by k + tf further normalizes the weight
	 * leading to better results. The value of k is experimental
	 */
	double score = (tf * idf->value) / (k + tf);
	sqlite3_result_double(pctx, score);
	return;
}
/*
 * generate_search_query --
 *  Builds the SQL text that matches the user entered query.
 *
 *  args supplies the search string and the optional machine, section,
 *  limit/offset and legacy-mode settings.  snippet_args holds the three
 *  string arguments handed to sqlite's snippet() function; when it is NULL
 *  the defaults "", "" and "..." are used.
 *
 *  Returns a string obtained from sqlite3_mprintf(), which the caller must
 *  release with sqlite3_free(), or NULL when one of the sqlite3_mprintf()
 *  calls fails.
 */
static char *
generate_search_query(query_args *args, const char *snippet_args[3])
{
	const char *default_snippet_args[3];
	char *section_clause = NULL;
	char *limit_clause = NULL;
	char *machine_clause = NULL;
	char *query = NULL;

	if (args->machine) {
		machine_clause = sqlite3_mprintf("AND mandb.machine=%Q",
		    args->machine);
		if (machine_clause == NULL)
			goto RETURN;
	}

	if (args->nrec >= 0) {
		/* Use the provided number of records and offset */
		limit_clause = sqlite3_mprintf(" LIMIT %d OFFSET %d",
		    args->nrec, args->offset);
		if (limit_clause == NULL)
			goto RETURN;
	}

	/*
	 * We want to build a query of the form: "select x,y,z from mandb where
	 * mandb match :query [AND section IN ('1', '2')]
	 * ORDER BY rank DESC [LIMIT 10 OFFSET 0]"
	 * NOTES:
	 *   1. The portion in first pair of square brackets is optional.
	 *      It will be there only if the user has specified an option
	 *      to search in one or more specific sections.
	 *   2. The LIMIT portion will be there if the user has specified
	 *      a limit using the -n option.
	 */
	if (args->sections && args->sections[0]) {
		concat(&section_clause, " AND mandb.section IN (");
		for (size_t i = 0; args->sections[i]; i++) {
			char *temp;
			char c = args->sections[i + 1] ? ',' : ')';

			temp = sqlite3_mprintf("%Q%c", args->sections[i], c);
			if (temp == NULL)
				goto RETURN;
			concat(&section_clause, temp);
			/* sqlite3_mprintf() memory belongs to sqlite3_free() */
			sqlite3_free(temp);
		}
	}

	if (snippet_args == NULL) {
		default_snippet_args[0] = "";
		default_snippet_args[1] = "";
		default_snippet_args[2] = "...";
		snippet_args = default_snippet_args;
	}

	if (args->legacy) {
		char *wild;

		easprintf(&wild, "%%%s%%", args->search_str);
		/*
		 * The two LIKE tests are parenthesized so that the section
		 * clause (which starts with AND) filters both of them; AND
		 * binds tighter than OR in SQL.
		 * NOTE(review): the machine clause is not applied in legacy
		 * mode, as in the original code -- confirm this is intended.
		 */
		query = sqlite3_mprintf("SELECT section, name, name_desc, machine"
		    " FROM mandb"
		    " WHERE (name LIKE %Q OR name_desc LIKE %Q) "
		    "%s"
		    "%s",
		    wild, wild,
		    section_clause ? section_clause : "",
		    limit_clause ? limit_clause : "");
		free(wild);
	} else if (strchr(args->search_str, ' ') == NULL) {
		/*
		 * If it's a single word query, we want to search in the
		 * links table as well. If the link table contains an entry
		 * for the queried keyword, we want to use that as the name of
		 * the man page.
		 * For example, for `apropos realloc` the output should be
		 * realloc(3) and not malloc(3).
		 */
		query = sqlite3_mprintf(
		    "SELECT section, name, name_desc, machine,"
		    " snippet(mandb, %Q, %Q, %Q, -1, 40 ),"
		    " rank_func(matchinfo(mandb, \"pclxn\")) AS rank"
		    " FROM mandb WHERE name NOT IN ("
		    " SELECT target FROM mandb_links WHERE link=%Q AND"
		    " mandb_links.section=mandb.section) AND mandb MATCH %Q %s %s"
		    " UNION"
		    " SELECT mandb.section, mandb_links.link AS name, mandb.name_desc,"
		    " mandb.machine, '' AS snippet, 100.00 AS rank"
		    " FROM mandb JOIN mandb_links ON mandb.name=mandb_links.target and"
		    " mandb.section=mandb_links.section WHERE mandb_links.link=%Q"
		    " %s %s"
		    " ORDER BY rank DESC %s",
		    snippet_args[0], snippet_args[1], snippet_args[2],
		    args->search_str, args->search_str,
		    section_clause ? section_clause : "",
		    machine_clause ? machine_clause : "", args->search_str,
		    machine_clause ? machine_clause : "",
		    section_clause ? section_clause : "",
		    limit_clause ? limit_clause : "");
	} else {
		query = sqlite3_mprintf("SELECT section, name, name_desc, machine,"
		    " snippet(mandb, %Q, %Q, %Q, -1, 40 ),"
		    " rank_func(matchinfo(mandb, \"pclxn\")) AS rank"
		    " FROM mandb"
		    " WHERE mandb MATCH %Q %s "
		    "%s"
		    " ORDER BY rank DESC"
		    "%s",
		    snippet_args[0], snippet_args[1], snippet_args[2],
		    args->search_str, machine_clause ? machine_clause : "",
		    section_clause ? section_clause : "",
		    limit_clause ? limit_clause : "");
	}

RETURN:
	/*
	 * machine_clause and limit_clause come from sqlite3_mprintf() and
	 * must go back through sqlite3_free(); section_clause is built by
	 * concat() and is released with free() as before.
	 */
	sqlite3_free(machine_clause);
	free(section_clause);
	sqlite3_free(limit_clause);
	return query;
}
/*
 * execute_search_query --
 *  Runs the SQL text in query against db and invokes args->callback once
 *  for every row returned.
 *
 *  Unless args->legacy is set, rank_func is first registered with sqlite
 *  (the process exits if that registration fails).
 *
 *  Returns the number of rows processed, or (unsigned int)-1 when the
 *  statement could not be prepared (a warning is printed in that case).
 */
static unsigned int
execute_search_query(sqlite3 *db, char *query, query_args *args)
{
	sqlite3_stmt *stmt;
	char *name;
	char *slash_ptr;
	const char *full_name;
	const char *name_temp;
	char *m = NULL;
	int rc;
	query_callback_args callback_args;
	inverse_document_frequency idf = {0, 0};

	if (!args->legacy) {
		/* Register the rank function */
		rc = sqlite3_create_function(db, "rank_func", 1, SQLITE_ANY,
		    (void *) &idf, rank_func, NULL, NULL);
		if (rc != SQLITE_OK) {
			warnx("Unable to register the ranking function: %s",
			    sqlite3_errmsg(db));
			sqlite3_close(db);
			sqlite3_shutdown();
			exit(EXIT_FAILURE);
		}
	}

	rc = sqlite3_prepare_v2(db, query, -1, &stmt, NULL);
	if (rc == SQLITE_IOERR) {
		warnx("Corrupt database. Please rerun makemandb");
		return (unsigned int)-1;
	} else if (rc != SQLITE_OK) {
		warnx("%s", sqlite3_errmsg(db));
		return (unsigned int)-1;
	}

	unsigned int nresults = 0;
	while (sqlite3_step(stmt) == SQLITE_ROW) {
		nresults++;
		callback_args.section =
		    (const char *) sqlite3_column_text(stmt, 0);
		full_name = (const char *) sqlite3_column_text(stmt, 1);
		callback_args.name_desc =
		    (const char *) sqlite3_column_text(stmt, 2);
		callback_args.machine =
		    (const char *) sqlite3_column_text(stmt, 3);
		if (!args->legacy)
			callback_args.snippet =
			    (const char *) sqlite3_column_text(stmt, 4);
		else
			callback_args.snippet = "";

		/*
		 * sqlite3_column_text() may hand back NULL; the code below
		 * (and the formatting callbacks) dereference these two.
		 */
		if (callback_args.snippet == NULL)
			callback_args.snippet = "";
		if (full_name == NULL)
			full_name = "";

		/*
		 * The formatting callbacks read snippet_length, so it must
		 * be filled in for every row.
		 */
		callback_args.snippet_length = strlen(callback_args.snippet);

		/* Keep only the last path component for the machine/name form */
		name_temp = full_name;
		if ((slash_ptr = strrchr(name_temp, '/')) != NULL)
			name_temp = slash_ptr + 1;

		if (callback_args.machine && callback_args.machine[0]) {
			m = estrdup(callback_args.machine);
			easprintf(&name, "%s/%s", lower(m), name_temp);
			free(m);
		} else {
			name = estrdup(full_name);
		}

		callback_args.name = name;
		callback_args.other_data = args->callback_data;
		(args->callback)(&callback_args);
		free(name);
	}
	sqlite3_finalize(stmt);
	return nresults;
}
/*
 * run_query_internal --
 *  Performs the searches for the keywords entered by the user.
 *  The 2nd param: snippet_args is an array of strings providing values for the
 *  last three parameters to the snippet function of sqlite. (Look at the docs).
 *  The 3rd param: args contains rest of the search parameters. Look at
 *  apropos-utils.h for the description of individual fields.
 *
 *  Returns 0 on success.  Returns -1 with *args->errmsg set when the query
 *  could not be generated or executed.  *args->errmsg is expected to be
 *  NULL on entry.
 */
static int
run_query_internal(sqlite3 *db, const char *snippet_args[3], query_args *args)
{
	char *query;
	unsigned int nresults;

	query = generate_search_query(args, snippet_args);
	if (query == NULL) {
		*args->errmsg = estrdup("malloc failed");
		return -1;
	}

	nresults = execute_search_query(db, query, args);
	sqlite3_free(query);

	/*
	 * execute_search_query() signals a failed prepare by returning
	 * (unsigned int)-1; do not report that as success.
	 */
	if (nresults == (unsigned int)-1 && *args->errmsg == NULL)
		*args->errmsg = estrdup("failed to execute the search query");

	return *(args->errmsg) == NULL ? 0 : -1;
}
/*
 * get_escaped_html_string --
 *  Returns a newly allocated copy of src in which the characters
 *  {'<', '>', '"', '&'} are replaced by their HTML entities and the bytes
 *  \002 and \003 are replaced by "<b>" and "</b>" respectively.
 *  An '&' that directly follows a backslash (e.g. the mdoc escape \&) is
 *  copied unchanged.
 *
 *  On return *slen holds the length of the escaped string.  The value of
 *  *slen on entry is not used; the length of src is measured here.
 *  The caller owns the returned buffer and releases it with free().
 */
static char *
get_escaped_html_string(const char *src, size_t *slen)
{
	static const char trouble[] = "<>\"&\002\003";
	char *dst, *ddst;
	size_t count;
	const char *ssrc = src;
	const char *scan;

	/*
	 * First pass: count the special characters and find the end of src.
	 * Stop as soon as strcspn() lands on the terminating NUL so that we
	 * never step past it.
	 */
	count = 0;
	for (scan = src; ; scan++) {
		scan += strcspn(scan, trouble);
		if (*scan == '\0')
			break;
		count++;
	}

#define append(a)				\
    do {					\
	memcpy(dst, (a), sizeof(a) - 1);	\
	dst += sizeof(a) - 1; 			\
    } while (/*CONSTCOND*/0)

	/*
	 * The longest replacement ("&quot;") is 6 bytes for 1 input byte,
	 * i.e. at most 5 extra bytes per special character.
	 */
	ddst = dst = emalloc((size_t)(scan - ssrc) + count * 5 + 1);
	for (src = ssrc; *src; src++) {
		switch (*src) {
		case '<':
			append("&lt;");
			break;
		case '>':
			append("&gt;");
			break;
		case '\"':
			append("&quot;");
			break;
		case '&':
			/*
			 * Don't perform the quoting if this & is part of
			 * an mdoc escape sequence, e.g. \&
			 * A leading '&' has no predecessor and is quoted.
			 */
			if (src == ssrc || src[-1] != '\\')
				append("&amp;");
			else
				append("&");
			break;
		case '\002':
			append("<b>");
			break;
		case '\003':
			append("</b>");
			break;
		default:
			*dst++ = *src;
			break;
		}
	}
	*dst = '\0';
	*slen = dst - ddst;
	return ddst;
}
/*
 * callback_html --
 *  Callback function for run_query_html. It replaces the snippet and the
 *  name description in callback_args with HTML-escaped copies, restores the
 *  user's callback data and then calls the user supplied callback function.
 *  The escaped copies are freed once that callback returns.
 *  Always returns 0.
 */
static int
callback_html(query_callback_args *callback_args)
{
	struct orig_callback_data *orig_data = callback_args->other_data;
	int (*callback)(query_callback_args*) = orig_data->callback;
	/*
	 * Measure the snippet directly instead of relying on
	 * callback_args->snippet_length having been filled in.
	 */
	size_t length = strlen(callback_args->snippet);
	size_t name_description_length = strlen(callback_args->name_desc);
	char *qsnippet = get_escaped_html_string(callback_args->snippet,
	    &length);
	char *qname_description = get_escaped_html_string(
	    callback_args->name_desc, &name_description_length);

	callback_args->name_desc = qname_description;
	callback_args->snippet = qsnippet;
	callback_args->snippet_length = length;
	callback_args->other_data = orig_data->data;
	(*callback)(callback_args);

	free(qsnippet);
	free(qname_description);
	return 0;
}
/*
 * run_query_html --
 *  Utility function to output query result in HTML format.
 *  It runs the query through run_query_internal with callback_html installed
 *  as the callback; callback_html escapes the row for HTML and then
 *  delegates to the callback originally supplied in args.
 *  The snippet markers are \002 and \003, which callback_html's escaping
 *  turns into <b> and </b>.
 *
 *  args->callback and args->callback_data are restored before returning so
 *  that args never keeps a pointer to this function's stack frame.
 *  Returns the result of run_query_internal.
 */
static int
run_query_html(sqlite3 *db, query_args *args)
{
	struct orig_callback_data orig_data;
	const char *snippet_args[] = {"\002", "\003", "..."};
	int rc;

	orig_data.callback = args->callback;
	orig_data.data = args->callback_data;

	args->callback = &callback_html;
	args->callback_data = (void *) &orig_data;
	rc = run_query_internal(db, snippet_args, args);

	/* Hand the caller's callback settings back untouched. */
	args->callback = orig_data.callback;
	args->callback_data = orig_data.data;
	return rc;
}
/*
 * ul_pager --
 *  Returns a newly allocated copy of s.  When ul is non-zero every byte c
 *  of s is emitted as the three bytes '_', backspace, c (pager style
 *  underlining); when ul is zero the copy is verbatim.
 */
static char *
ul_pager(int ul, const char *s)
{
	const char *in;
	char *out, *wp;

	if (ul == 0)
		return estrdup(s);

	/* three output bytes per input byte, plus the terminator */
	out = emalloc(strlen(s) * 3 + 1);
	wp = out;
	for (in = s; *in != '\0'; in++) {
		wp[0] = '_';
		wp[1] = '\b';
		wp[2] = *in;
		wp += 3;
	}
	*wp = '\0';
	return out;
}
/*
 * callback_pager --
 *  A callback similar to callback_html. It overstrikes the matching text in
 *  the snippet (the bytes between \002 and \003) so that it appears
 *  emboldened when viewed using a pager like more or less.
 *  If any text was overstruck, the section, name and name description are
 *  additionally underlined through ul_pager.
 *  The rewritten strings are passed to the user supplied callback and freed
 *  once it returns.  callback_args->snippet_length is set to the length of
 *  the rewritten snippet.  Always returns 0.
 */
static int
callback_pager(query_callback_args *callback_args)
{
	struct orig_callback_data *orig_data = callback_args->other_data;
	const char *snippet = callback_args->snippet;
	char *psnippet;
	size_t i = 0;
	size_t sz;
	int did = 0;

	/*
	 * Overstriking a byte 'A' writes 'A\bA', so the output can never be
	 * longer than three times the input.  Sizing the buffer for that
	 * worst case keeps the copy below in bounds even when a \002 has no
	 * matching \003.
	 */
	psnippet = emalloc(strlen(snippet) * 3 + 1);

	/*
	 * Copy the bytes from snippet to psnippet:
	 * 1. Copy the bytes before \002 as it is.
	 * 2. The bytes after \002 need to be overstruck till we
	 *    encounter \003.
	 * 3. To overstrike a byte 'A' we need to write 'A\bA'
	 */
	while (*snippet) {
		sz = strcspn(snippet, "\002");
		memcpy(&psnippet[i], snippet, sz);
		snippet += sz;
		i += sz;

		/*
		 * Only step over the marker after checking that it is
		 * really there; at this point *snippet may be the NUL.
		 */
		if (*snippet == '\002')
			snippet++;

		while (*snippet && *snippet != '\003') {
			did = 1;
			psnippet[i++] = *snippet;
			psnippet[i++] = '\b';
			psnippet[i++] = *snippet++;
		}

		/* Skip the closing \003, if we stopped on one. */
		if (*snippet)
			snippet++;
	}
	psnippet[i] = '\0';

	char *ul_section = ul_pager(did, callback_args->section);
	char *ul_name = ul_pager(did, callback_args->name);
	char *ul_name_desc = ul_pager(did, callback_args->name_desc);

	callback_args->section = ul_section;
	callback_args->name = ul_name;
	callback_args->name_desc = ul_name_desc;
	callback_args->snippet = psnippet;
	callback_args->snippet_length = i;
	callback_args->other_data = orig_data->data;
	(orig_data->callback)(callback_args);

	free(ul_section);
	free(ul_name);
	free(ul_name_desc);
	free(psnippet);
	return 0;
}
struct term_args {
struct orig_callback_data *orig_data;
const char *smul;
const char *rmul;
};
/*
 * ul_term --
 *  Underline a string, terminal style: returns a newly allocated string
 *  made of ta->smul, s and ta->rmul, in that order.
 */
static char *
ul_term(const char *s, const struct term_args *ta)
{
	char *wrapped;

	easprintf(&wrapped, "%s%s%s", ta->smul, s, ta->rmul);

	return wrapped;
}
/*
 * callback_term --
 *  Callback used by run_query_term. It wraps the section, name and name
 *  description in the terminal's underline sequences (see ul_term), puts
 *  the user's callback data back in place and calls the user supplied
 *  callback.  The snippet is passed through unchanged.
 *  The wrapped strings are freed once the callback returns.
 *  Always returns 0.
 */
static int
callback_term(query_callback_args *callback_args)
{
	struct term_args *ta = callback_args->other_data;
	struct orig_callback_data *orig_data = ta->orig_data;
	char *section, *name, *name_desc;

	section = ul_term(callback_args->section, ta);
	name = ul_term(callback_args->name, ta);
	name_desc = ul_term(callback_args->name_desc, ta);

	callback_args->section = section;
	callback_args->name = name;
	callback_args->name_desc = name_desc;
	callback_args->other_data = orig_data->data;

	(orig_data->callback)(callback_args);

	free(section);
	free(name);
	free(name_desc);

	return 0;
}
/*
* run_query_pager --
* Utility function similar to run_query_html. This function tries to
* pre-process the result assuming it will be piped to a pager.
* For this purpose it first calls its own callback function callback_pager
* which then delegates the call to the user supplied callback.
*/
static int
2012-05-10 19:36:09 +04:00
run_query_pager(sqlite3 *db, query_args *args)
{
struct orig_callback_data orig_data;
orig_data.callback = args->callback;
orig_data.data = args->callback_data;
const char *snippet_args[3] = { "\002", "\003", "..." };
args->callback = &callback_pager;
args->callback_data = (void *) &orig_data;
return run_query_internal(db, snippet_args, args);
}
/*
 * Output cursor for term_putc(): a destination buffer and the index at
 * which the next byte will be stored.
 */
struct nv {
/* destination buffer; term_putc() does no bounds checking on it */
char *s;
/* number of bytes stored so far, i.e. the next write index into s */
size_t l;
};
/*
 * term_putc --
 *  Output function handed to ti_puts(): stores the byte c at the current
 *  position of the struct nv that p points to and advances that position.
 *  Always returns 0.
 */
static int
term_putc(int c, void *p)
{
	struct nv *out = p;
	size_t pos = out->l;

	out->s[pos] = c;
	out->l = pos + 1;

	return 0;
}
/*
 * term_fix_seq --
 *  Returns a newly allocated string for the terminal sequence seq.
 *  Without a terminal (ti == NULL) this is a plain copy of seq.  Otherwise
 *  seq is run through ti_puts() and the bytes it emits (collected with
 *  term_putc) overwrite the copy, which is then NUL terminated.
 *  NOTE(review): the copy is only as large as seq, so this relies on
 *  ti_puts() never emitting more bytes than seq contains -- confirm.
 */
static char *
term_fix_seq(TERMINAL *ti, const char *seq)
{
	char *res = estrdup(seq);

	if (ti != NULL) {
		struct nv nv = { .s = res, .l = 0 };

		ti_puts(ti, seq, 1, term_putc, &nv);
		res[nv.l] = '\0';
	}

	return res;
}
/*
 * term_init --
 *  Fills sa[0..4] with newly allocated strings for the terminal on fd:
 *    sa[0], sa[1]  sequences placed before/after highlighted text: the
 *                  "bold"/"sgr0" pair when both exist, else the
 *                  "smso"/"rmso" pair when both exist, else empty strings
 *    sa[2]         the string "..."
 *    sa[3], sa[4]  the "smul"/"rmul" pair when both exist, else empty
 *                  strings
 *  If the terminal description cannot be set up, all sequences are empty.
 */
static void
term_init(int fd, const char *sa[5])
{
	TERMINAL *ti;
	int error;
	const char *bold = NULL, *sgr0 = NULL;
	const char *smso = "", *rmso = "";
	const char *smul = "", *rmul = "";

	if (ti_setupterm(&ti, NULL, fd, &error) == -1) {
		ti = NULL;
	} else {
		const char *on, *off;

		bold = ti_getstr(ti, "bold");
		sgr0 = ti_getstr(ti, "sgr0");
		if (bold == NULL || sgr0 == NULL) {
			/* no usable bold pair: fall back to standout */
			bold = sgr0 = NULL;
			on = ti_getstr(ti, "smso");
			if (on != NULL &&
			    (off = ti_getstr(ti, "rmso")) != NULL) {
				smso = on;
				rmso = off;
			}
		}

		on = ti_getstr(ti, "smul");
		if (on != NULL && (off = ti_getstr(ti, "rmul")) != NULL) {
			smul = on;
			rmul = off;
		}
	}

	sa[0] = term_fix_seq(ti, bold != NULL ? bold : smso);
	sa[1] = term_fix_seq(ti, sgr0 != NULL ? sgr0 : rmso);
	sa[2] = estrdup("...");
	sa[3] = term_fix_seq(ti, smul);
	sa[4] = term_fix_seq(ti, rmul);

	if (ti != NULL)
		del_curterm(ti);
}
/*
 * run_query_term --
 *  Utility function similar to run_query_html. This function tries to
 *  pre-process the result assuming it will be displayed on a terminal.
 *  For this purpose it installs callback_term as the callback, which
 *  then delegates the call to the user supplied callback.
 *  The highlight and underline sequences are obtained from term_init for
 *  standard output.
 *
 *  The strings allocated by term_init are freed, and args->callback and
 *  args->callback_data are restored, before returning.
 *  Returns the result of run_query_internal.
 */
static int
run_query_term(sqlite3 *db, query_args *args)
{
	struct orig_callback_data orig_data;
	struct term_args ta;
	const char *snippet_args[5];
	int rc;

	orig_data.callback = args->callback;
	orig_data.data = args->callback_data;

	term_init(STDOUT_FILENO, snippet_args);
	ta.smul = snippet_args[3];
	ta.rmul = snippet_args[4];
	ta.orig_data = (void *) &orig_data;

	args->callback = &callback_term;
	args->callback_data = &ta;
	rc = run_query_internal(db, snippet_args, args);

	/* Hand the caller's callback settings back untouched. */
	args->callback = orig_data.callback;
	args->callback_data = orig_data.data;

	/*
	 * term_init() allocated all five strings.  The double cast drops the
	 * const qualifier the same way NetBSD's __UNCONST() does.
	 */
	for (size_t i = 0; i < 5; i++)
		free((void *)(unsigned long)(const void *)snippet_args[i]);

	return rc;
}
/*
 * run_query_none --
 *  Runs the query with empty snippet start/end markers, i.e. without any
 *  highlighting of the matched text.  The rows still go through
 *  callback_pager, which then delegates to the user supplied callback.
 *
 *  args->callback and args->callback_data are restored before returning so
 *  that args never keeps a pointer to this function's stack frame.
 *  Returns the result of run_query_internal.
 */
static int
run_query_none(sqlite3 *db, query_args *args)
{
	struct orig_callback_data orig_data;
	const char *snippet_args[3] = { "", "", "..." };
	int rc;

	orig_data.callback = args->callback;
	orig_data.data = args->callback_data;

	args->callback = &callback_pager;
	args->callback_data = (void *) &orig_data;
	rc = run_query_internal(db, snippet_args, args);

	/* Hand the caller's callback settings back untouched. */
	args->callback = orig_data.callback;
	args->callback_data = orig_data.data;
	return rc;
}
/*
 * run_query --
 *  Runs the search described by args against db, formatting the results
 *  according to fmt.  Returns the result of the format specific helper, or
 *  -1 (after printing a warning) when fmt is not a known format.
 */
int
run_query(sqlite3 *db, query_format fmt, query_args *args)
{
	int (*runner)(sqlite3 *, query_args *);

	switch (fmt) {
	case APROPOS_NONE:
		runner = run_query_none;
		break;
	case APROPOS_HTML:
		runner = run_query_html;
		break;
	case APROPOS_TERM:
		runner = run_query_term;
		break;
	case APROPOS_PAGER:
		runner = run_query_pager;
		break;
	default:
		warnx("Unknown query format %d", (int)fmt);
		return -1;
	}

	return (*runner)(db, args);
}