Add a CLucene full-text add-on.

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@39172 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
Clemens Zeidler 2010-10-28 15:22:52 +00:00
parent 4462ce0d83
commit 3a88830daa
9 changed files with 844 additions and 0 deletions

View File

@ -0,0 +1,27 @@
/*
* Copyright 2010, Haiku.
* Distributed under the terms of the MIT License.
*
* Authors:
* Clemens Zeidler <haiku@clemens-zeidler.de>
*/
#ifndef TEXT_DATA_BASE_H
#define TEXT_DATA_BASE_H
#include <Entry.h>
// Abstract interface for the write side of a text database. Documents are
// identified by their entry_ref. All methods are pure virtual; the only
// implementation in this commit is CLuceneWriteDataBase.
class TextWriteDataBase {
public:
virtual ~TextWriteDataBase() {}
// Returns the initialization status of the database.
virtual status_t InitCheck() = 0;
// Requests that the file referenced by ref is added to the database.
virtual status_t AddDocument(const entry_ref& ref) = 0;
// Requests that the file referenced by ref is removed from the database.
virtual status_t RemoveDocument(const entry_ref& ref) = 0;
// Commits pending changes.
// NOTE(review): presumably add/remove requests may be deferred until
// Commit() is called (CLuceneWriteDataBase does so) - confirm this is part
// of the contract.
virtual status_t Commit() = 0;
};
#endif // TEXT_DATA_BASE_H

View File

@ -3,6 +3,7 @@ SubDir HAIKU_TOP src add-ons ;
SubInclude HAIKU_TOP src add-ons accelerants ;
SubInclude HAIKU_TOP src add-ons decorators ;
SubInclude HAIKU_TOP src add-ons disk_systems ;
SubInclude HAIKU_TOP src add-ons index_server ;
SubInclude HAIKU_TOP src add-ons input_server ;
SubInclude HAIKU_TOP src add-ons kernel ;
SubInclude HAIKU_TOP src add-ons locale ;

View File

@ -0,0 +1,291 @@
/*
* Copyright 2010, Haiku.
* Distributed under the terms of the MIT License.
*
* Authors:
* based on previous work of Ankur Sethi
* Clemens Zeidler <haiku@clemens-zeidler.de>
*/
#include "CLuceneDataBase.h"
#include <Directory.h>
#include <File.h>
#include <TranslatorRoster.h>
// Debug tracing. While DEBUG_CLUCENE_DATABASE is defined, STRACE() forwards
// its printf-style arguments to printf(), prefixed with "FT: "; otherwise it
// expands to an empty statement. The macro is defined unconditionally on the
// next line, so tracing is always enabled in this file.
#define DEBUG_CLUCENE_DATABASE
#ifdef DEBUG_CLUCENE_DATABASE
#include <stdio.h>
# define STRACE(x...) printf("FT: " x)
#else
# define STRACE(x...) ;
#endif
using namespace lucene::document;
using namespace lucene::util;
// Upper bound for the retry loops in this file (opening the index writer or
// reader, removing a document, adding a document).
const uint8 kCluceneTries = 10;
/*!	Converts the multibyte string \a str into a newly allocated,
	NUL-terminated wide character string using mbstowcs() (i.e. according to
	the current locale).

	\param str NUL-terminated multibyte string, may be NULL.
	\return An array allocated with new[] that the caller has to release with
		delete[], or NULL if \a str is NULL or contains an invalid multibyte
		sequence.
*/
wchar_t*
to_wchar(const char *str)
{
	if (str == NULL)
		return NULL;

	// A multibyte string never converts to more wide characters than it has
	// bytes, so strlen() + 1 elements always leave room for the terminator
	// (the previous size computation left none for the empty string).
	size_t length = strlen(str) + 1;
	wchar_t* wStr = new wchar_t[length];
	if (mbstowcs(wStr, str, length) == static_cast<size_t>(-1)) {
		// don't leak the buffer on a conversion error
		delete[] wStr;
		return NULL;
	}

	// mbstowcs() only writes the terminator when there is room left; make
	// the guarantee explicit.
	wStr[length - 1] = L'\0';
	return wStr;
}
/*!	Sets up a write database located in the directory \a databasePath.

	The directory is created if necessary (mode 0755). The temporary file
	used to hold the translated text of a document is placed inside that
	directory under the name "temp_file". No index is opened here; that
	happens in Commit().
*/
CLuceneWriteDataBase::CLuceneWriteDataBase(const BPath& databasePath)
	:
	fDataBasePath(databasePath),
	fTempPath(databasePath),
	fIndexWriter(NULL)
{
#ifdef DEBUG_CLUCENE_DATABASE
	// <stdio.h> is only included while debugging is enabled, so the trace
	// output must not be compiled in unconditionally.
	printf("CLuceneWriteDataBase fDataBasePath %s\n", fDataBasePath.Path());
#endif
	create_directory(fDataBasePath.Path(), 0755);
	fTempPath.Append("temp_file");
}
/*!	Removes the temporary translation file, so that the text of the last
	indexed document does not stay behind on disk.
*/
CLuceneWriteDataBase::~CLuceneWriteDataBase()
{
	// Best effort: the file does not exist when nothing has been indexed,
	// in which case Remove() simply fails.
	BEntry tempEntry(fTempPath.Path());
	if (tempEntry.InitCheck() == B_OK)
		tempEntry.Remove();
}
// Always reports B_OK; the constructor does not record any failure (e.g. of
// create_directory()).
status_t
CLuceneWriteDataBase::InitCheck()
{
return B_OK;
}
/*!	Queues \a ref for indexing. The document is not written to the index
	before Commit() is called.

	\return B_OK, the reference has been queued.
*/
status_t
CLuceneWriteDataBase::AddDocument(const entry_ref& ref)
{
	fAddQueue.push_back(ref);
	// Queuing succeeded; returning B_ERROR here reported a failure for
	// every single document.
	return B_OK;
}
/*!	Queues \a ref for removal from the index. The document is not removed
	before Commit() is called.

	\return B_OK, the reference has been queued.
*/
status_t
CLuceneWriteDataBase::RemoveDocument(const entry_ref& ref)
{
	fDeleteQueue.push_back(ref);
	// Queuing succeeded; returning B_ERROR here reported a failure for
	// every single document.
	return B_OK;
}
/*!	Applies all queued removals and additions to the index.

	Documents that are about to be added are removed first, so that
	re-indexing a file does not leave two entries for the same path. Both
	queues are empty afterwards, unless the index writer could not be opened,
	in which case the add queue is kept.

	\return B_OK if everything has been processed, B_ERROR if the index
		writer could not be opened or at least one document could not be
		indexed.
*/
status_t
CLuceneWriteDataBase::Commit()
{
	if (fAddQueue.empty() && fDeleteQueue.empty())
		return B_OK;
	STRACE("Commit\n");

	_RemoveDocuments(fAddQueue);
	_RemoveDocuments(fDeleteQueue);
	fDeleteQueue.clear();

	if (fAddQueue.empty())
		return B_OK;

	fIndexWriter = _OpenIndexWriter();
	if (fIndexWriter == NULL)
		return B_ERROR;

	status_t status = B_OK;
	for (unsigned int i = 0; i < fAddQueue.size(); i++) {
		if (_IndexDocument(fAddQueue.at(i)))
			continue;

		status = B_ERROR;
		// _IndexDocument() leaves fIndexWriter NULL when it failed to reopen
		// the writer; nothing more can be added then. Any other failure
		// (unreadable or untranslatable file) only concerns this document,
		// so the remaining ones are still processed instead of being
		// dropped with the queue.
		if (fIndexWriter == NULL)
			break;
	}
	fAddQueue.clear();

	// The writer may be gone, see above.
	if (fIndexWriter != NULL) {
		fIndexWriter->close();
		delete fIndexWriter;
		fIndexWriter = NULL;
	}
	return status;
}
/*!	Opens an IndexWriter on the database directory, creating a new index if
	none exists there yet. Up to kCluceneTries attempts are made when CLucene
	throws.

	\return The writer, owned by the caller, or NULL if all attempts failed.
*/
IndexWriter*
CLuceneWriteDataBase::_OpenIndexWriter()
{
	for (int attempt = 0; attempt < kCluceneTries; attempt++) {
		IndexWriter* indexWriter = NULL;
		try {
			// only start a fresh index when there is none to append to
			bool createIndex
				= !IndexReader::indexExists(fDataBasePath.Path());
			indexWriter = new IndexWriter(fDataBasePath.Path(),
				&fStandardAnalyzer, createIndex);
			if (indexWriter != NULL)
				return indexWriter;
		} catch (CLuceneError &error) {
			STRACE("CLuceneError: _OpenIndexWriter %s\n", error.what());
			delete indexWriter;
		}
	}
	return NULL;
}
/*!	Opens an IndexReader on the database directory. Up to kCluceneTries
	attempts are made when CLucene throws.

	\return The reader, owned by the caller, or NULL if the directory or the
		index does not exist or all attempts failed.
*/
IndexReader*
CLuceneWriteDataBase::_OpenIndexReader()
{
	BEntry entry(fDataBasePath.Path(), NULL);
	if (!entry.Exists())
		return NULL;

	for (int attempt = 0; attempt < kCluceneTries; attempt++) {
		IndexReader* indexReader = NULL;
		try {
			if (!IndexReader::indexExists(fDataBasePath.Path()))
				return NULL;

			indexReader = IndexReader::open(fDataBasePath.Path());
			if (indexReader != NULL)
				return indexReader;
		} catch (CLuceneError &error) {
			STRACE("CLuceneError: _OpenIndexReader %s\n", error.what());
			delete indexReader;
		}
	}
	return NULL;
}
/*!	Removes all documents in \a docs from the index.

	Each removal is retried up to kCluceneTries times with a freshly opened
	reader. Processing stops at the first document that cannot be removed.
	Entries whose path cannot be resolved or converted are skipped.

	\return \c true if there was nothing to do or the last processed document
		has been removed, \c false if the index could not be opened or a
		removal failed.
*/
bool
CLuceneWriteDataBase::_RemoveDocuments(std::vector<entry_ref>& docs)
{
	// Don't open the index just to close it again.
	if (docs.empty())
		return true;

	IndexReader* reader = _OpenIndexReader();
	if (reader == NULL)
		return false;

	bool status = false;
	for (unsigned int i = 0; i < docs.size(); i++) {
		BPath path(&docs.at(i));
		if (path.Path() == NULL)
			continue;
		wchar_t* wPath = to_wchar(path.Path());
		if (wPath == NULL)
			continue;

		for (int attempt = 0; attempt < kCluceneTries; attempt++) {
			status = _RemoveDocument(wPath, reader);
			if (status)
				break;

			// retry with a new reader
			reader->close();
			delete reader;
			reader = _OpenIndexReader();
			if (reader == NULL)
				break;
		}

		// to_wchar() allocates with new[]
		delete[] wPath;
		if (!status)
			break;
	}

	// The reader is NULL if reopening it failed in the retry loop.
	if (reader != NULL) {
		reader->close();
		delete reader;
	}
	return status;
}
// Deletes all documents whose "path" term equals wPath through the given
// reader. Returns false if CLucene throws a CLuceneError (which is traced),
// true otherwise.
bool
CLuceneWriteDataBase::_RemoveDocument(wchar_t* wPath, IndexReader* reader)
{
try {
Term term(_T("path"), wPath);
reader->deleteDocuments(&term);
} catch (CLuceneError &error) {
STRACE("CLuceneError: deleteDocuments %s\n", error.what());
return false;
}
return true;
}
bool
CLuceneWriteDataBase::_IndexDocument(const entry_ref& ref)
{
BPath path(&ref);
BFile inFile, outFile;
inFile.SetTo(path.Path(), B_READ_ONLY);
if (inFile.InitCheck() != B_OK) {
STRACE("Can't open inFile %s\n", path.Path());
return false;
}
outFile.SetTo(fTempPath.Path(),
B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
if (outFile.InitCheck() != B_OK) {
STRACE("Can't open outFile %s\n", fTempPath.Path());
return false;
}
BTranslatorRoster* translatorRoster = BTranslatorRoster::Default();
if (translatorRoster->Translate(&inFile, NULL, NULL, &outFile, 'TEXT')
!= B_OK)
return false;
inFile.Unset();
outFile.Unset();
FileReader* fileReader = new FileReader(fTempPath.Path(), "UTF-8");
wchar_t* wPath = to_wchar(path.Path());
if (wPath == NULL)
return false;
Document *document = new Document;
Field contentField(_T("contents"), fileReader,
Field::STORE_NO | Field::INDEX_TOKENIZED);
document->add(contentField);
Field pathField(_T("path"), wPath,
Field::STORE_YES | Field::INDEX_UNTOKENIZED);
document->add(pathField);
bool status = true;
for (int i = 0; i < kCluceneTries; i++) {
try {
fIndexWriter->addDocument(document);
STRACE("document added, retries: %i\n", i);
break;
} catch (CLuceneError &error) {
STRACE("CLuceneError addDocument %s\n", error.what());
fIndexWriter->close();
delete fIndexWriter;
fIndexWriter = _OpenIndexWriter();
if (fIndexWriter == NULL) {
status = false;
break;
}
}
}
if (!status)
delete document;
delete wPath;
return status;
}

View File

@ -0,0 +1,58 @@
/*
* Copyright 2010, Haiku.
* Distributed under the terms of the MIT License.
*
* Authors:
* Clemens Zeidler <haiku@clemens-zeidler.de>
*/
#ifndef CLUCENE_DATA_BASE_H
#define CLUCENE_DATA_BASE_H
#include <vector>
#include <Path.h>
#include "TextDataBase.h"
#include <CLucene.h>
using namespace lucene::index;
using namespace lucene::analysis::standard;
// TextWriteDataBase implementation on top of CLucene. AddDocument() and
// RemoveDocument() only queue the entry; the index is modified in Commit().
class CLuceneWriteDataBase : public TextWriteDataBase {
public:
CLuceneWriteDataBase(const BPath& databasePath);
~CLuceneWriteDataBase();
status_t InitCheck();
status_t AddDocument(const entry_ref& ref);
status_t RemoveDocument(const entry_ref& ref);
status_t Commit();
private:
// Both return NULL on failure; the caller owns the returned object.
IndexWriter* _OpenIndexWriter();
IndexReader* _OpenIndexReader();
bool _RemoveDocuments(std::vector<entry_ref>& docs);
bool _RemoveDocument(wchar_t* doc,
IndexReader* reader);
bool _IndexDocument(const entry_ref& ref);
// directory that holds the index
BPath fDataBasePath;
// file inside fDataBasePath that receives the translated document text
BPath fTempPath;
// entries waiting for the next Commit()
std::vector<entry_ref> fAddQueue;
std::vector<entry_ref> fDeleteQueue;
StandardAnalyzer fStandardAnalyzer;
// only set while Commit() is running
IndexWriter* fIndexWriter;
};
#endif

View File

@ -0,0 +1,162 @@
/*
* Copyright 2010, Haiku.
* Distributed under the terms of the MIT License.
*
* Authors:
* Clemens Zeidler <haiku@clemens-zeidler.de>
*/
#include "FullTextAnalyser.h"
#include <new>
#include <Directory.h>
#include <String.h>
#include <TranslatorFormats.h>
#include <TranslatorRoster.h>
#include "CLuceneDataBase.h"
#include "IndexServerPrivate.h"
// Debug tracing. While DEBUG_FULLTEXT_ANALYSER is defined, STRACE() forwards
// its printf-style arguments to printf(), prefixed with "FullTextAnalyser: ";
// otherwise it expands to an empty statement. The macro is defined
// unconditionally on the next line, so tracing is always enabled in this file.
#define DEBUG_FULLTEXT_ANALYSER
#ifdef DEBUG_FULLTEXT_ANALYSER
#include <stdio.h>
# define STRACE(x...) printf("FullTextAnalyser: " x)
#else
# define STRACE(x...) ;
#endif
/*!	Creates the analyser for \a volume.

	The database path is <volume root>/kIndexServerDirectory/
	kFullTextDirectory. The write database is only created if that path could
	be built; use InitCheck() to find out whether the analyser is usable.
*/
FullTextAnalyser::FullTextAnalyser(BString name, const BVolume& volume)
	:
	FileAnalyser(name, volume),
	fWriteDataBase(NULL),
	fNUncommited(0)
{
	BDirectory dir;
	volume.GetRootDirectory(&dir);
	fDataBasePath.SetTo(&dir);
	fDataBasePath.Append(kIndexServerDirectory);
	status_t status = fDataBasePath.Append(kFullTextDirectory);
	if (status != B_OK)
		return;

	// InitCheck() reports B_NO_MEMORY for a NULL database, which requires
	// the non-throwing new here (as used by the add-on's factory functions).
	fWriteDataBase = new(std::nothrow) CLuceneWriteDataBase(fDataBasePath);
}
// Frees the write database created in the constructor (may be NULL).
FullTextAnalyser::~FullTextAnalyser()
{
delete fWriteDataBase;
}
/*!	\return The error of the database path if it is invalid, B_NO_MEMORY if
		there is no write database, otherwise the status of the write
		database.
*/
status_t
FullTextAnalyser::InitCheck()
{
	status_t pathStatus = fDataBasePath.InitCheck();
	if (pathStatus != B_OK)
		return pathStatus;

	if (fWriteDataBase == NULL)
		return B_NO_MEMORY;

	return fWriteDataBase->InitCheck();
}
void
FullTextAnalyser::AnalyseEntry(const entry_ref& ref)
{
if (!_InterestingEntry(ref))
return;
BPath path(&ref);
if (BString(path.Path()).FindFirst(fDataBasePath.Path()) == 0) {
STRACE("In database path %s\n", path.Path());
return;
}
if (BString(path.Path()).FindFirst("/boot/common/cache/tmp") == 0)
return;
//STRACE("FullTextAnalyser AnalyseEntry: %s %s\n", ref.name, path.Path());
fWriteDataBase->AddDocument(ref);
fNUncommited++;
if (fNUncommited > 100)
LastEntry();
}
// Hands ref to the write database for removal. Unlike AnalyseEntry() this
// neither counts towards fNUncommited nor commits by itself.
void
FullTextAnalyser::DeleteEntry(const entry_ref& ref)
{
STRACE("FullTextAnalyser DeleteEntry: %s\n", ref.name);
fWriteDataBase->RemoveDocument(ref);
}
// Handles a move as removal of oldRef followed by analysing newRef.
// NOTE(review): when newRef is not interesting, the method returns before
// oldRef is removed - confirm that no stale entry can remain in the index in
// that case (e.g. a file moved into the index directory).
void
FullTextAnalyser::MoveEntry(const entry_ref& oldRef, const entry_ref& newRef)
{
if (!_InterestingEntry(newRef))
return;
STRACE("FullTextAnalyser MoveEntry: %s to %s\n", oldRef.name, newRef.name);
fWriteDataBase->RemoveDocument(oldRef);
// AnalyseEntry() checks _InterestingEntry(newRef) a second time
AnalyseEntry(newRef);
}
// Commits everything queued in the write database and resets the counter of
// uncommitted documents. Also called by AnalyseEntry() once more than 100
// documents are pending. The result of Commit() is ignored.
void
FullTextAnalyser::LastEntry()
{
fWriteDataBase->Commit();
fNUncommited = 0;
}
bool
FullTextAnalyser::_InterestingEntry(const entry_ref& ref)
{
if (_IsInIndexDirectory(ref))
return false;
BFile file(&ref, B_READ_ONLY);
translator_info translatorInfo;
if (BTranslatorRoster::Default()->Identify(&file, NULL, &translatorInfo, 0,
NULL, B_TRANSLATOR_TEXT) != B_OK)
return false;
return true;
}
/*!	\return \c true if the directory at the database path contains \a ref.
*/
bool
FullTextAnalyser::_IsInIndexDirectory(const entry_ref& ref)
{
	BDirectory dataBaseDir(fDataBasePath.Path());
	BEntry entry(&ref);
	return dataBaseDir.Contains(&entry);
}
// Passes the image id and the add-on name on to IndexServerAddOn.
FullTextAddOn::FullTextAddOn(image_id id, const char* name)
:
IndexServerAddOn(id, name)
{
}
// Creates a FullTextAnalyser named after this add-on for the given volume.
// Returns NULL if the allocation fails (non-throwing new).
FileAnalyser*
FullTextAddOn::CreateFileAnalyser(const BVolume& volume)
{
return new (std::nothrow)FullTextAnalyser(Name(), volume);
}
// Factory function exported with C linkage; creates the FullTextAddOn.
// Returns NULL if the allocation fails (non-throwing new).
// NOTE(review): presumably looked up by name by the index_server when it
// loads the add-on image - confirm against the server side.
extern "C" IndexServerAddOn* (instantiate_index_server_addon)(image_id id,
const char* name)
{
return new (std::nothrow)FullTextAddOn(id, name);
}

View File

@ -0,0 +1,54 @@
/*
* Copyright 2010, Haiku.
* Distributed under the terms of the MIT License.
*
* Authors:
* Clemens Zeidler <haiku@clemens-zeidler.de>
*/
#ifndef FULL_TEXT_ANALYSER_H
#define FULL_TEXT_ANALYSER_H
#include "IndexServerAddOn.h"
#include <Path.h>
#include "TextDataBase.h"
// Name of the sub directory of the index server directory that holds the
// full-text database. The pointer itself is const to give the constant
// internal linkage; a non-const pointer defined in a header is defined once
// per including source file and breaks the link as soon as there are two.
const char* const kFullTextDirectory = "FullTextAnalyser";
// FileAnalyser that feeds files into a TextWriteDataBase located on the
// analysed volume.
class FullTextAnalyser : public FileAnalyser {
public:
FullTextAnalyser(BString name,
const BVolume& volume);
~FullTextAnalyser();
status_t InitCheck();
void AnalyseEntry(const entry_ref& ref);
void DeleteEntry(const entry_ref& ref);
void MoveEntry(const entry_ref& oldRef,
const entry_ref& newRef);
// commits the pending changes
void LastEntry();
private:
// Both are declared inline but defined in FullTextAnalyser.cpp, i.e. they
// can only be called from that source file.
inline bool _InterestingEntry(const entry_ref& ref);
inline bool _IsInIndexDirectory(const entry_ref& ref);
// owned, NULL if the database path could not be built
TextWriteDataBase* fWriteDataBase;
BPath fDataBasePath;
// number of documents added since the last commit
uint32 fNUncommited;
};
// Index server add-on that creates a FullTextAnalyser per volume.
class FullTextAddOn : public IndexServerAddOn {
public:
FullTextAddOn(image_id id, const char* name);
FileAnalyser* CreateFileAnalyser(const BVolume& volume);
};
#endif

View File

@ -0,0 +1,17 @@
# Builds the FullTextAnalyser add-on from the sources in this directory plus
# IndexServerAddOn.cpp, and links it against libbe, libtranslation, the
# CLucene libraries and libstdc++.
SubDir HAIKU_TOP src add-ons index_server FullText ;
UsePrivateHeaders index_server shared ;
SubDirSysHdrs $(HAIKU_CLUCENE_HEADERS) ;
Addon FullTextAnalyser :
CLuceneDataBase.cpp
FullTextAnalyser.cpp
IndexServerAddOn.cpp
:
be translation $(HAIKU_CLUCENE_LIBS) $(TARGET_LIBSTDC++)
;
# IndexServerAddOn.cpp is not in this directory; also search the parent one.
SEARCH on [ FGristFiles IndexServerAddOn.cpp ]
+= [ FDirName $(SUBDIR) $(DOTDOT) ] ;

View File

@ -0,0 +1,230 @@
/*
* Copyright 2010, Haiku.
* Distributed under the terms of the MIT License.
*
* Authors:
* Clemens Zeidler <haiku@clemens-zeidler.de>
*/
#include "IndexServerAddOn.h"
#include <Debug.h>
#include <Directory.h>
#include <File.h>
#include <Path.h>
#include "IndexServerPrivate.h"
// Defaults: catch up enabled, all positions 0.
analyser_settings::analyser_settings()
:
catchUpEnabled(true),
syncPosition(0),
watchingStart(0),
watchingPosition(0)
{
}
// Name of the per-analyser status file and of the attributes on that file in
// which AnalyserSettings stores its values.
const char* kAnalyserStatusFile = "AnalyserStatus";
const char* kCatchUpEnabledAttr = "CatchUpEnabled";
const char* kSyncPositionAttr = "SyncPosition";
const char* kWatchingStartAttr = "WatchingStart";
const char* kWatchingPositionAttr = "WatchingPosition";
// Creates the settings of the analyser called name on the given volume and
// tries to load previously stored values. The result of ReadSettings() is
// ignored, i.e. the defaults remain in place if there is no status file.
AnalyserSettings::AnalyserSettings(const BString& name, const BVolume& volume)
:
fName(name),
fVolume(volume)
{
ReadSettings();
}
bool
AnalyserSettings::ReadSettings()
{
BAutolock _(fSettingsLock);
BDirectory rootDir;
fVolume.GetRootDirectory(&rootDir);
BPath path(&rootDir);
path.Append(kIndexServerDirectory);
path.Append(fName);
path.Append(kAnalyserStatusFile);
BFile file(path.Path(), B_READ_ONLY);
if (file.InitCheck() != B_OK)
return false;
uint32 value;
file.ReadAttr(kCatchUpEnabledAttr, B_UINT32_TYPE, 0, &value,
sizeof(uint32));
fAnalyserSettings.catchUpEnabled = value != 0 ? true : false;
file.ReadAttr(kSyncPositionAttr, B_INT64_TYPE, 0,
&fAnalyserSettings.syncPosition, sizeof(int64));
file.ReadAttr(kWatchingStartAttr, B_INT64_TYPE, 0,
&fAnalyserSettings.watchingStart, sizeof(int64));
file.ReadAttr(kWatchingPositionAttr, B_INT64_TYPE, 0,
&fAnalyserSettings.watchingPosition, sizeof(int64));
return true;
}
bool
AnalyserSettings::WriteSettings()
{
BAutolock _(fSettingsLock);
BDirectory rootDir;
fVolume.GetRootDirectory(&rootDir);
BPath path(&rootDir);
path.Append(kIndexServerDirectory);
path.Append(fName);
if (create_directory(path.Path(), 777) != B_OK)
return false;
path.Append(kAnalyserStatusFile);
BFile file(path.Path(), B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
if (file.InitCheck() != B_OK)
return false;
uint32 value = fAnalyserSettings.catchUpEnabled ? 1 : 0;
file.WriteAttr(kCatchUpEnabledAttr, B_UINT32_TYPE, 0, &value,
sizeof(uint32));
file.WriteAttr(kSyncPositionAttr, B_INT64_TYPE, 0,
&fAnalyserSettings.syncPosition, sizeof(int64));
file.WriteAttr(kWatchingStartAttr, B_INT64_TYPE, 0,
&fAnalyserSettings.watchingStart, sizeof(int64));
file.WriteAttr(kWatchingPositionAttr, B_INT64_TYPE, 0,
&fAnalyserSettings.watchingPosition, sizeof(int64));
return true;
}
// Returns a copy of all settings, taken while holding the settings lock.
analyser_settings
AnalyserSettings::RawSettings()
{
BAutolock _(fSettingsLock);
return fAnalyserSettings;
}
// Sets the catch up flag while holding the settings lock. The change is only
// made in memory; nothing is written to disk here.
void
AnalyserSettings::SetCatchUpEnabled(bool enabled)
{
BAutolock _(fSettingsLock);
fAnalyserSettings.catchUpEnabled = enabled;
}
// Sets the sync position while holding the settings lock. The change is only
// made in memory; nothing is written to disk here.
void
AnalyserSettings::SetSyncPosition(bigtime_t time)
{
BAutolock _(fSettingsLock);
fAnalyserSettings.syncPosition = time;
}
// Sets the watching start time while holding the settings lock. The change is
// only made in memory; nothing is written to disk here.
void
AnalyserSettings::SetWatchingStart(bigtime_t time)
{
BAutolock _(fSettingsLock);
fAnalyserSettings.watchingStart = time;
}
// Sets the watching position while holding the settings lock. The change is
// only made in memory; nothing is written to disk here.
void
AnalyserSettings::SetWatchingPosition(bigtime_t time)
{
BAutolock _(fSettingsLock);
fAnalyserSettings.watchingPosition = time;
}
// Returns the catch up flag, read while holding the settings lock.
bool
AnalyserSettings::CatchUpEnabled()
{
BAutolock _(fSettingsLock);
return fAnalyserSettings.catchUpEnabled;
}
// Returns the sync position, read while holding the settings lock.
bigtime_t
AnalyserSettings::SyncPosition()
{
BAutolock _(fSettingsLock);
return fAnalyserSettings.syncPosition;
}
// Returns the watching start time, read while holding the settings lock.
bigtime_t
AnalyserSettings::WatchingStart()
{
BAutolock _(fSettingsLock);
return fAnalyserSettings.watchingStart;
}
// Returns the watching position, read while holding the settings lock.
bigtime_t
AnalyserSettings::WatchingPosition()
{
BAutolock _(fSettingsLock);
return fAnalyserSettings.watchingPosition;
}
// Stores the name and the volume of the analyser. The settings have to be
// supplied separately through SetSettings().
FileAnalyser::FileAnalyser(const BString& name, const BVolume& volume)
:
fVolume(volume),
fName(name)
{
}
// Attaches settings to this analyser and refreshes the cached copy. The
// settings must belong to an analyser with the same name and volume; this is
// only verified by ASSERT().
// NOTE(review): settings is dereferenced before any NULL check - callers
// presumably never pass NULL, confirm.
void
FileAnalyser::SetSettings(AnalyserSettings* settings)
{
ASSERT(fName == settings->Name() && fVolume == settings->Volume());
fAnalyserSettings = settings;
ASSERT(fAnalyserSettings.Get());
UpdateSettingsCache();
}
// Returns the settings object attached through SetSettings().
AnalyserSettings*
FileAnalyser::Settings() const
{
return fAnalyserSettings;
}
// Returns the copy of the settings taken by the last UpdateSettingsCache().
// No lock is taken here.
const analyser_settings&
FileAnalyser::CachedSettings() const
{
return fCachedSettings;
}
// Replaces the cached settings with a fresh copy of the attached settings.
// Settings must have been attached via SetSettings() before.
void
FileAnalyser::UpdateSettingsCache()
{
fCachedSettings = fAnalyserSettings->RawSettings();
}

View File

@ -0,0 +1,4 @@
# Includes the Jamfiles of the index_server add-ons.
SubDir HAIKU_TOP src add-ons index_server ;
SubInclude HAIKU_TOP src add-ons index_server AudioTags ;
SubInclude HAIKU_TOP src add-ons index_server FullText ;