extracted stem database from rcldb to make it smaller
This commit is contained in:
parent
9086c6e531
commit
7302cb745f
@ -15,7 +15,7 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \
|
||||
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \
|
||||
rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
|
||||
textsplit.o transcode.o \
|
||||
unacpp.o unac.o docseq.o sortseq.o copyfile.o
|
||||
unacpp.o unac.o docseq.o sortseq.o copyfile.o stemdb.o
|
||||
|
||||
SRCS = \
|
||||
$(depth)/utils/conftree.cpp $(depth)/index/csguess.cpp \
|
||||
@ -30,6 +30,7 @@ SRCS = \
|
||||
$(depth)/common/myhtmlparse.cpp $(depth)/common/pathhash.cpp \
|
||||
$(depth)/utils/pathut.cpp $(depth)/common/rclconfig.cpp \
|
||||
$(depth)/common/rcldb.cpp $(depth)/common/rclinit.cpp \
|
||||
$(depth)/common/stemdb.cpp \
|
||||
$(depth)/utils/base64.cpp $(depth)/utils/readfile.cpp \
|
||||
$(depth)/utils/smallut.cpp $(depth)/common/textsplit.cpp \
|
||||
$(depth)/utils/transcode.cpp $(depth)/common/unacpp.cpp \
|
||||
@ -96,6 +97,8 @@ rclinit.o : $(depth)/common/rclinit.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||
rcldb.o : $(depth)/common/rcldb.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||
stemdb.o : $(depth)/common/stemdb.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||
readfile.o : $(depth)/utils/readfile.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||
base64.o : $(depth)/utils/base64.cpp
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.68 2006-04-13 09:50:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -30,8 +30,10 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
#define RCLDB_INTERNAL
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "stemdb.h"
|
||||
#include "textsplit.h"
|
||||
#include "transcode.h"
|
||||
#include "unacpp.h"
|
||||
@ -41,10 +43,8 @@ using namespace std;
|
||||
#include "smallut.h"
|
||||
#include "pathhash.h"
|
||||
#include "utf8iter.h"
|
||||
#include "wipedir.h"
|
||||
|
||||
#include "xapian.h"
|
||||
#include <xapian/stem.h>
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(A,B) (A>B?A:B)
|
||||
@ -664,27 +664,15 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
|
||||
return true;
|
||||
}
|
||||
|
||||
const static string stemdirstem = "stem_";
|
||||
/// Compute name of stem db for given base database and language
|
||||
static string stemdbname(const string& basename, string lang)
|
||||
{
|
||||
string nm = path_cat(basename, stemdirstem + lang);
|
||||
return nm;
|
||||
}
|
||||
|
||||
// Return list of existing stem db languages
|
||||
list<string> Db::getStemLangs()
|
||||
{
|
||||
list<string> dirs;
|
||||
LOGDEB(("Db::getStemLang\n"));
|
||||
if (m_ndb == 0)
|
||||
list<string> dirs;
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
return dirs;
|
||||
string pattern = stemdirstem + "*";
|
||||
dirs = path_dirglob(m_ndb->m_basedir, pattern);
|
||||
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
||||
*it = path_basename(*it);
|
||||
*it = it->substr(stemdirstem.length(), string::npos);
|
||||
}
|
||||
dirs = StemDb::getLangs(m_ndb->m_basedir);
|
||||
return dirs;
|
||||
}
|
||||
|
||||
@ -694,25 +682,9 @@ list<string> Db::getStemLangs()
|
||||
bool Db::deleteStemDb(const string& lang)
|
||||
{
|
||||
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||
if (m_ndb == 0)
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
return false;
|
||||
if (m_ndb->m_isopen == false)
|
||||
return false;
|
||||
|
||||
string dir = stemdbname(m_ndb->m_basedir, lang);
|
||||
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Deciding if we try to stem the term. If it has numerals or capitals
|
||||
// we don't
|
||||
inline static bool
|
||||
p_notlowerascii(unsigned int c)
|
||||
{
|
||||
if (c < 'a' || (c > 'z' && c < 128))
|
||||
return true;
|
||||
return false;
|
||||
return StemDb::deleteDb(m_ndb->m_basedir, lang);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -724,145 +696,13 @@ p_notlowerascii(unsigned int c)
|
||||
bool Db::createStemDb(const string& lang)
|
||||
{
|
||||
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
|
||||
if (m_ndb == 0)
|
||||
return false;
|
||||
if (m_ndb->m_isopen == false)
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
return false;
|
||||
|
||||
// First build the in-memory stem database:
|
||||
// We walk the list of all terms, and stem each.
|
||||
// If the stem is identical to the term, no need to create an entry
|
||||
// Else, we add an entry to the multimap.
|
||||
// At the end, we only save stem-terms associations with several terms, the
|
||||
// others are not useful
|
||||
// Note: a map<string, list<string> > would probably be more efficient
|
||||
multimap<string, string> assocs;
|
||||
// Statistics
|
||||
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
||||
int stemconst=0; // Stem == term
|
||||
int stemdiff=0; // Count of all different stems
|
||||
int stemmultiple = 0; // Count of stems with multiple derivatives
|
||||
try {
|
||||
Xapian::Stem stemmer(lang);
|
||||
Xapian::TermIterator it;
|
||||
for (it = m_ndb->wdb.allterms_begin();
|
||||
it != m_ndb->wdb.allterms_end(); it++) {
|
||||
// If it has any non-lowercase 7bit char, cant be stemmable
|
||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||
++nostem;
|
||||
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
||||
// (*it).c_str(), *sit));
|
||||
continue;
|
||||
}
|
||||
string stem = stemmer.stem_word(*it);
|
||||
//cerr << "word " << *it << " stem " << stem << endl;
|
||||
if (stem == *it) {
|
||||
++stemconst;
|
||||
continue;
|
||||
}
|
||||
assocs.insert(pair<string,string>(stem, *it));
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||
return false;
|
||||
} catch (...) {
|
||||
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
||||
lang.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
class DirWiper {
|
||||
public:
|
||||
string dir;
|
||||
bool do_it;
|
||||
DirWiper(string d) : dir(d), do_it(true) {}
|
||||
~DirWiper() {
|
||||
if (do_it) {
|
||||
wipedir(dir);
|
||||
rmdir(dir.c_str());
|
||||
}
|
||||
}
|
||||
};
|
||||
// Create xapian database for stem relations
|
||||
string stemdbdir = stemdbname(m_ndb->m_basedir, lang);
|
||||
// We want to get rid of the db dir in case of error. This gets disarmed
|
||||
// just before success return.
|
||||
DirWiper wiper(stemdbdir);
|
||||
const char *ermsg = "NOERROR";
|
||||
Xapian::WritableDatabase sdb;
|
||||
try {
|
||||
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
ermsg = s.c_str();
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
ermsg = "Caught unknown exception";
|
||||
}
|
||||
if (ermsg != "NOERROR") {
|
||||
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
||||
stemdbdir.c_str(), ermsg));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Enter pseud-docs in db. Walk the multimap, only enter
|
||||
// associations where there are several parent terms
|
||||
string stem;
|
||||
list<string> derivs;
|
||||
for (multimap<string,string>::const_iterator it = assocs.begin();
|
||||
it != assocs.end(); it++) {
|
||||
if (stem == it->first) {
|
||||
// Staying with same stem
|
||||
derivs.push_back(it->second);
|
||||
// cerr << " " << it->second << endl;
|
||||
} else {
|
||||
// Changing stems
|
||||
++stemdiff;
|
||||
if (derivs.size() == 1) {
|
||||
// Exactly one term stems to this. Check for the case where
|
||||
// the stem itself exists as a term. The code above would not
|
||||
// have inserted anything in this case.
|
||||
if (m_ndb->wdb.term_exists(stem))
|
||||
derivs.push_back(stem);
|
||||
}
|
||||
if (derivs.size() > 1) {
|
||||
// Previous stem has multiple derivatives. Enter in db
|
||||
++stemmultiple;
|
||||
Xapian::Document newdocument;
|
||||
newdocument.add_term(stem);
|
||||
// The doc data is just parents=blank-separated-list
|
||||
string record = "parents=";
|
||||
for (list<string>::const_iterator it = derivs.begin();
|
||||
it != derivs.end(); it++) {
|
||||
record += *it + " ";
|
||||
}
|
||||
record += "\n";
|
||||
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
try {
|
||||
sdb.replace_document(stem, newdocument);
|
||||
} catch (...) {
|
||||
LOGERR(("Db::createstemdb: replace failed\n"));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
derivs.clear();
|
||||
stem = it->first;
|
||||
derivs.push_back(it->second);
|
||||
// cerr << "\n" << stem << " " << it->second;
|
||||
}
|
||||
}
|
||||
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
||||
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
||||
wiper.do_it = false;
|
||||
return true;
|
||||
return StemDb:: createDb(m_ndb->m_iswritable ? m_ndb->wdb : m_ndb->db,
|
||||
m_ndb->m_basedir, lang);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This is called at the end of an indexing session, to delete the
|
||||
* documents for files that are no longer there.
|
||||
@ -907,57 +747,6 @@ bool Db::purge()
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expand term to list of all terms which stem to the same term.
|
||||
*/
|
||||
static list<string> stemexpand(Native *m_ndb, string term, const string& lang)
|
||||
{
|
||||
list<string> explist;
|
||||
try {
|
||||
Xapian::Stem stemmer(lang);
|
||||
string stem = stemmer.stem_word(term);
|
||||
LOGDEB(("stemexpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
|
||||
// Try to fetch the doc from the stem db
|
||||
string stemdbdir = stemdbname(m_ndb->m_basedir, lang);
|
||||
Xapian::Database sdb(stemdbdir);
|
||||
LOGDEB1(("stemexpand: %s lastdocid: %d\n",
|
||||
stemdbdir.c_str(), sdb.get_lastdocid()));
|
||||
if (!sdb.term_exists(stem)) {
|
||||
LOGDEB1(("Db::stemexpand: no term for %s\n", stem.c_str()));
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
||||
if (did == sdb.postlist_end(stem)) {
|
||||
LOGDEB1(("stemexpand: no term(1) for %s\n",stem.c_str()));
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
Xapian::Document doc = sdb.get_document(*did);
|
||||
string data = doc.get_data();
|
||||
// No need for a conftree, but we need to massage the data a little
|
||||
string::size_type pos = data.find_first_of("=");
|
||||
++pos;
|
||||
string::size_type pos1 = data.find_last_of("\n");
|
||||
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
stringToStrings(data.substr(pos, pos1-pos), explist);
|
||||
if (find(explist.begin(), explist.end(), term) == explist.end()) {
|
||||
explist.push_back(term);
|
||||
}
|
||||
LOGDEB(("stemexpand: %s -> %s\n", stem.c_str(),
|
||||
stringlistdisp(explist).c_str()));
|
||||
} catch (...) {
|
||||
LOGERR(("stemexpand: error accessing stem db\n"));
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
return explist;
|
||||
}
|
||||
|
||||
|
||||
// Splitter callback for breaking query into terms
|
||||
class wsQData : public TextSplitCB {
|
||||
public:
|
||||
@ -1042,7 +831,7 @@ static void stringToXapianQueries(const string &iq,
|
||||
dumb_string(term, term1);
|
||||
// Possibly perform stem compression/expansion
|
||||
if (!nostemexp && (opts & Db::QO_STEM)) {
|
||||
exp = stemexpand(m_ndb, term1, stemlang);
|
||||
exp = StemDb::stemExpand(m_ndb->m_basedir, stemlang,term1);
|
||||
} else {
|
||||
exp.push_back(term1);
|
||||
}
|
||||
|
||||
257
src/rcldb/stemdb.cpp
Normal file
257
src/rcldb/stemdb.cpp
Normal file
@ -0,0 +1,257 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
|
||||
#include <xapian.h>
|
||||
#include <xapian/stem.h>
|
||||
|
||||
#include "stemdb.h"
|
||||
#include "wipedir.h"
|
||||
#include "pathut.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Rcl {
|
||||
namespace StemDb {
|
||||
|
||||
|
||||
// Prefix shared by all stem database subdirectory names.
static const string stemdirstem = "stem_";

/// Build the path of the stem database directory for a language.
///
/// @param dbdir directory of the main index
/// @param lang  stemming language name
/// @return dbdir joined (through path_cat) with "stem_" followed by lang
static string stemdbname(const string& dbdir, const string& lang)
{
    string subdir(stemdirstem);
    subdir += lang;
    return path_cat(dbdir, subdir);
}
|
||||
|
||||
/// Return the languages for which a stem database directory exists.
///
/// The entries of dbdir matching "stem_*" are listed with path_dirglob, then
/// each one is reduced to what follows the "stem_" prefix in its base name.
///
/// @param dbdir directory of the main index
/// @return list of language names, one per matching entry
list<string> getLangs(const string& dbdir)
{
    string globpat(stemdirstem);
    globpat += "*";
    list<string> langs = path_dirglob(dbdir, globpat);

    const string::size_type prefixlen = stemdirstem.length();
    for (list<string>::iterator entry = langs.begin();
         entry != langs.end(); ++entry) {
        // Keep the last path element only, then drop the prefix
        *entry = path_basename(*entry);
        *entry = entry->substr(prefixlen, string::npos);
    }
    return langs;
}
|
||||
|
||||
bool deleteDb(const string& dbdir, const string& lang)
|
||||
{
|
||||
string dir = stemdbname(dbdir, lang);
|
||||
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Autoclean/delete directory
|
||||
class DirWiper {
|
||||
public:
|
||||
string dir;
|
||||
bool do_it;
|
||||
DirWiper(string d) : dir(d), do_it(true) {}
|
||||
~DirWiper() {
|
||||
if (do_it) {
|
||||
wipedir(dir);
|
||||
rmdir(dir.c_str());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Predicate used to decide whether stemming a term should be attempted:
// true for any 7-bit value which is not a lowercase ASCII letter (digits,
// capitals, punctuation, control characters). Values of 128 and above yield
// false.
inline static bool
p_notlowerascii(unsigned int c)
{
    const bool sevenbit = c < 128;
    const bool lowerletter = c >= 'a' && c <= 'z';
    return sevenbit && !lowerletter;
}
|
||||
|
||||
/**
|
||||
* Create database of stem to parents associations for a given language.
|
||||
* We walk the list of all terms, stem them, and create another Xapian db
|
||||
* with documents indexed by a single term (the stem), and with the list of
|
||||
* parent terms in the document data.
|
||||
*/
|
||||
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
||||
{
|
||||
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
||||
|
||||
// First build the in-memory stem database:
|
||||
// We walk the list of all terms, and stem each.
|
||||
// If the stem is identical to the term, no need to create an entry
|
||||
// Else, we add an entry to the multimap.
|
||||
// At the end, we only save stem-terms associations with several terms, the
|
||||
// others are not useful
|
||||
// Note: a map<string, list<string> > would probably be more efficient
|
||||
multimap<string, string> assocs;
|
||||
// Statistics
|
||||
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
||||
int stemconst=0; // Stem == term
|
||||
int stemdiff=0; // Count of all different stems
|
||||
int stemmultiple = 0; // Count of stems with multiple derivatives
|
||||
try {
|
||||
Xapian::Stem stemmer(lang);
|
||||
Xapian::TermIterator it;
|
||||
for (it = xdb.allterms_begin();
|
||||
it != xdb.allterms_end(); it++) {
|
||||
// If it has any non-lowercase 7bit char, cant be stemmable
|
||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||
++nostem;
|
||||
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
||||
// (*it).c_str(), *sit));
|
||||
continue;
|
||||
}
|
||||
string stem = stemmer.stem_word(*it);
|
||||
//cerr << "word " << *it << " stem " << stem << endl;
|
||||
if (stem == *it) {
|
||||
++stemconst;
|
||||
continue;
|
||||
}
|
||||
assocs.insert(pair<string,string>(stem, *it));
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||
return false;
|
||||
} catch (...) {
|
||||
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
||||
lang.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create xapian database for stem relations
|
||||
string stemdbdir = stemdbname(dbdir, lang);
|
||||
// We want to get rid of the db dir in case of error. This gets disarmed
|
||||
// just before success return.
|
||||
DirWiper wiper(stemdbdir);
|
||||
const char *ermsg = "NOERROR";
|
||||
Xapian::WritableDatabase sdb;
|
||||
try {
|
||||
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
ermsg = s.c_str();
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
ermsg = "Caught unknown exception";
|
||||
}
|
||||
if (ermsg != "NOERROR") {
|
||||
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
||||
stemdbdir.c_str(), ermsg));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Enter pseud-docs in db. Walk the multimap, only enter
|
||||
// associations where there are several parent terms
|
||||
string stem;
|
||||
list<string> derivs;
|
||||
for (multimap<string,string>::const_iterator it = assocs.begin();
|
||||
it != assocs.end(); it++) {
|
||||
if (stem == it->first) {
|
||||
// Staying with same stem
|
||||
derivs.push_back(it->second);
|
||||
// cerr << " " << it->second << endl;
|
||||
} else {
|
||||
// Changing stems
|
||||
++stemdiff;
|
||||
if (derivs.size() == 1) {
|
||||
// Exactly one term stems to this. Check for the case where
|
||||
// the stem itself exists as a term. The code above would not
|
||||
// have inserted anything in this case.
|
||||
if (xdb.term_exists(stem))
|
||||
derivs.push_back(stem);
|
||||
}
|
||||
if (derivs.size() > 1) {
|
||||
// Previous stem has multiple derivatives. Enter in db
|
||||
++stemmultiple;
|
||||
Xapian::Document newdocument;
|
||||
newdocument.add_term(stem);
|
||||
// The doc data is just parents=blank-separated-list
|
||||
string record = "parents=";
|
||||
for (list<string>::const_iterator it = derivs.begin();
|
||||
it != derivs.end(); it++) {
|
||||
record += *it + " ";
|
||||
}
|
||||
record += "\n";
|
||||
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
try {
|
||||
sdb.replace_document(stem, newdocument);
|
||||
} catch (...) {
|
||||
LOGERR(("Db::createstemdb: replace failed\n"));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
derivs.clear();
|
||||
stem = it->first;
|
||||
derivs.push_back(it->second);
|
||||
// cerr << "\n" << stem << " " << it->second;
|
||||
}
|
||||
}
|
||||
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
||||
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
||||
wiper.do_it = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expand term to list of all terms which stem to the same term.
|
||||
*/
|
||||
list<string> stemExpand(const string& dbdir, const string& lang,
|
||||
const string& term)
|
||||
{
|
||||
list<string> explist;
|
||||
try {
|
||||
Xapian::Stem stemmer(lang);
|
||||
string stem = stemmer.stem_word(term);
|
||||
LOGDEB(("stemExpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
|
||||
// Try to fetch the doc from the stem db
|
||||
string stemdbdir = stemdbname(dbdir, lang);
|
||||
Xapian::Database sdb(stemdbdir);
|
||||
LOGDEB1(("stemExpand: %s lastdocid: %d\n",
|
||||
stemdbdir.c_str(), sdb.get_lastdocid()));
|
||||
if (!sdb.term_exists(stem)) {
|
||||
LOGDEB1(("Db::stemExpand: no term for %s\n", stem.c_str()));
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
||||
if (did == sdb.postlist_end(stem)) {
|
||||
LOGDEB1(("stemExpand: no term(1) for %s\n",stem.c_str()));
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
Xapian::Document doc = sdb.get_document(*did);
|
||||
string data = doc.get_data();
|
||||
// No need for a conftree, but we need to massage the data a little
|
||||
string::size_type pos = data.find_first_of("=");
|
||||
++pos;
|
||||
string::size_type pos1 = data.find_last_of("\n");
|
||||
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
stringToStrings(data.substr(pos, pos1-pos), explist);
|
||||
if (find(explist.begin(), explist.end(), term) == explist.end()) {
|
||||
explist.push_back(term);
|
||||
}
|
||||
LOGDEB(("stemExpand: %s -> %s\n", stem.c_str(),
|
||||
stringlistdisp(explist).c_str()));
|
||||
} catch (...) {
|
||||
LOGERR(("stemExpand: error accessing stem db\n"));
|
||||
explist.push_back(term);
|
||||
return explist;
|
||||
}
|
||||
return explist;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
37
src/rcldb/stemdb.h
Normal file
37
src/rcldb/stemdb.h
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef _STEMDB_H_INCLUDED_
|
||||
#define _STEMDB_H_INCLUDED_
|
||||
/* @(#$Id: stemdb.h,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
#ifdef RCLDB_INTERNAL
|
||||
/// Stem database code
|
||||
///
|
||||
/// Stem databases list stems and the set of index terms they expand to. They
|
||||
/// are computed from index data by stemming each term and regrouping those
|
||||
/// that stem to the same value.
|
||||
/// Stem databases are stored as separate xapian databases (used as an
|
||||
/// Isam method), in subdirectories of the index.
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
|
||||
#include <xapian.h>
|
||||
|
||||
namespace Rcl {
|
||||
namespace StemDb {
|
||||
|
||||
/// Get languages of existing stem databases
|
||||
extern std::list<std::string> getLangs(const std::string& dbdir);
|
||||
/// Delete stem database for given language
|
||||
extern bool deleteDb(const std::string& dbdir, const std::string& lang);
|
||||
/// Create stem database for given language
|
||||
extern bool createDb(Xapian::Database& xdb,
|
||||
const std::string& dbdir, const std::string& lang);
|
||||
/// Expand term to stem siblings
|
||||
extern std::list<std::string> stemExpand(const std::string& dbdir,
|
||||
const std::string& lang,
|
||||
const std::string& term);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // RCLDB_INTERNAL
|
||||
#endif /* _STEMDB_H_INCLUDED_ */
|
||||
Loading…
x
Reference in New Issue
Block a user