extracted stem database from rcldb to make it smaller
This commit is contained in:
parent
9086c6e531
commit
7302cb745f
@ -15,7 +15,7 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \
|
|||||||
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \
|
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \
|
||||||
rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
|
rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
|
||||||
textsplit.o transcode.o \
|
textsplit.o transcode.o \
|
||||||
unacpp.o unac.o docseq.o sortseq.o copyfile.o
|
unacpp.o unac.o docseq.o sortseq.o copyfile.o stemdb.o
|
||||||
|
|
||||||
SRCS = \
|
SRCS = \
|
||||||
$(depth)/utils/conftree.cpp $(depth)/index/csguess.cpp \
|
$(depth)/utils/conftree.cpp $(depth)/index/csguess.cpp \
|
||||||
@ -30,6 +30,7 @@ SRCS = \
|
|||||||
$(depth)/common/myhtmlparse.cpp $(depth)/common/pathhash.cpp \
|
$(depth)/common/myhtmlparse.cpp $(depth)/common/pathhash.cpp \
|
||||||
$(depth)/utils/pathut.cpp $(depth)/common/rclconfig.cpp \
|
$(depth)/utils/pathut.cpp $(depth)/common/rclconfig.cpp \
|
||||||
$(depth)/common/rcldb.cpp $(depth)/common/rclinit.cpp \
|
$(depth)/common/rcldb.cpp $(depth)/common/rclinit.cpp \
|
||||||
|
$(depth)/common/stemdb.cpp \
|
||||||
$(depth)/utils/base64.cpp $(depth)/utils/readfile.cpp \
|
$(depth)/utils/base64.cpp $(depth)/utils/readfile.cpp \
|
||||||
$(depth)/utils/smallut.cpp $(depth)/common/textsplit.cpp \
|
$(depth)/utils/smallut.cpp $(depth)/common/textsplit.cpp \
|
||||||
$(depth)/utils/transcode.cpp $(depth)/common/unacpp.cpp \
|
$(depth)/utils/transcode.cpp $(depth)/common/unacpp.cpp \
|
||||||
@ -96,6 +97,8 @@ rclinit.o : $(depth)/common/rclinit.cpp
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -c $<
|
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||||
rcldb.o : $(depth)/common/rcldb.cpp
|
rcldb.o : $(depth)/common/rcldb.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c $<
|
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||||
|
stemdb.o : $(depth)/common/stemdb.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||||
readfile.o : $(depth)/utils/readfile.cpp
|
readfile.o : $(depth)/utils/readfile.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c $<
|
$(CXX) $(ALL_CXXFLAGS) -c $<
|
||||||
base64.o : $(depth)/utils/base64.cpp
|
base64.o : $(depth)/utils/base64.cpp
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.68 2006-04-13 09:50:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -30,8 +30,10 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $
|
|||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using namespace std;
|
using namespace std;
|
||||||
#endif /* NO_NAMESPACES */
|
#endif /* NO_NAMESPACES */
|
||||||
|
#define RCLDB_INTERNAL
|
||||||
|
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
|
#include "stemdb.h"
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
#include "transcode.h"
|
#include "transcode.h"
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
@ -41,10 +43,8 @@ using namespace std;
|
|||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "pathhash.h"
|
#include "pathhash.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "wipedir.h"
|
|
||||||
|
|
||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
#include <xapian/stem.h>
|
|
||||||
|
|
||||||
#ifndef MAX
|
#ifndef MAX
|
||||||
#define MAX(A,B) (A>B?A:B)
|
#define MAX(A,B) (A>B?A:B)
|
||||||
@ -664,27 +664,15 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const static string stemdirstem = "stem_";
|
|
||||||
/// Compute name of stem db for given base database and language
|
|
||||||
static string stemdbname(const string& basename, string lang)
|
|
||||||
{
|
|
||||||
string nm = path_cat(basename, stemdirstem + lang);
|
|
||||||
return nm;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return list of existing stem db languages
|
// Return list of existing stem db languages
|
||||||
list<string> Db::getStemLangs()
|
list<string> Db::getStemLangs()
|
||||||
{
|
{
|
||||||
list<string> dirs;
|
|
||||||
LOGDEB(("Db::getStemLang\n"));
|
LOGDEB(("Db::getStemLang\n"));
|
||||||
if (m_ndb == 0)
|
list<string> dirs;
|
||||||
|
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||||
return dirs;
|
return dirs;
|
||||||
string pattern = stemdirstem + "*";
|
dirs = StemDb::getLangs(m_ndb->m_basedir);
|
||||||
dirs = path_dirglob(m_ndb->m_basedir, pattern);
|
|
||||||
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
|
||||||
*it = path_basename(*it);
|
|
||||||
*it = it->substr(stemdirstem.length(), string::npos);
|
|
||||||
}
|
|
||||||
return dirs;
|
return dirs;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -694,25 +682,9 @@ list<string> Db::getStemLangs()
|
|||||||
bool Db::deleteStemDb(const string& lang)
|
bool Db::deleteStemDb(const string& lang)
|
||||||
{
|
{
|
||||||
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||||
if (m_ndb == 0)
|
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||||
return false;
|
return false;
|
||||||
if (m_ndb->m_isopen == false)
|
return StemDb::deleteDb(m_ndb->m_basedir, lang);
|
||||||
return false;
|
|
||||||
|
|
||||||
string dir = stemdbname(m_ndb->m_basedir, lang);
|
|
||||||
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Deciding if we try to stem the term. If it has numerals or capitals
|
|
||||||
// we don't
|
|
||||||
inline static bool
|
|
||||||
p_notlowerascii(unsigned int c)
|
|
||||||
{
|
|
||||||
if (c < 'a' || (c > 'z' && c < 128))
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -724,145 +696,13 @@ p_notlowerascii(unsigned int c)
|
|||||||
bool Db::createStemDb(const string& lang)
|
bool Db::createStemDb(const string& lang)
|
||||||
{
|
{
|
||||||
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
|
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
|
||||||
if (m_ndb == 0)
|
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||||
return false;
|
|
||||||
if (m_ndb->m_isopen == false)
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// First build the in-memory stem database:
|
return StemDb:: createDb(m_ndb->m_iswritable ? m_ndb->wdb : m_ndb->db,
|
||||||
// We walk the list of all terms, and stem each.
|
m_ndb->m_basedir, lang);
|
||||||
// If the stem is identical to the term, no need to create an entry
|
|
||||||
// Else, we add an entry to the multimap.
|
|
||||||
// At the end, we only save stem-terms associations with several terms, the
|
|
||||||
// others are not useful
|
|
||||||
// Note: a map<string, list<string> > would probably be more efficient
|
|
||||||
multimap<string, string> assocs;
|
|
||||||
// Statistics
|
|
||||||
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
|
||||||
int stemconst=0; // Stem == term
|
|
||||||
int stemdiff=0; // Count of all different stems
|
|
||||||
int stemmultiple = 0; // Count of stems with multiple derivatives
|
|
||||||
try {
|
|
||||||
Xapian::Stem stemmer(lang);
|
|
||||||
Xapian::TermIterator it;
|
|
||||||
for (it = m_ndb->wdb.allterms_begin();
|
|
||||||
it != m_ndb->wdb.allterms_end(); it++) {
|
|
||||||
// If it has any non-lowercase 7bit char, cant be stemmable
|
|
||||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
|
||||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
|
||||||
++nostem;
|
|
||||||
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
|
||||||
// (*it).c_str(), *sit));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
string stem = stemmer.stem_word(*it);
|
|
||||||
//cerr << "word " << *it << " stem " << stem << endl;
|
|
||||||
if (stem == *it) {
|
|
||||||
++stemconst;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
assocs.insert(pair<string,string>(stem, *it));
|
|
||||||
}
|
|
||||||
} catch (const Xapian::Error &e) {
|
|
||||||
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
|
||||||
return false;
|
|
||||||
} catch (...) {
|
|
||||||
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
|
||||||
lang.c_str()));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
class DirWiper {
|
|
||||||
public:
|
|
||||||
string dir;
|
|
||||||
bool do_it;
|
|
||||||
DirWiper(string d) : dir(d), do_it(true) {}
|
|
||||||
~DirWiper() {
|
|
||||||
if (do_it) {
|
|
||||||
wipedir(dir);
|
|
||||||
rmdir(dir.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// Create xapian database for stem relations
|
|
||||||
string stemdbdir = stemdbname(m_ndb->m_basedir, lang);
|
|
||||||
// We want to get rid of the db dir in case of error. This gets disarmed
|
|
||||||
// just before success return.
|
|
||||||
DirWiper wiper(stemdbdir);
|
|
||||||
const char *ermsg = "NOERROR";
|
|
||||||
Xapian::WritableDatabase sdb;
|
|
||||||
try {
|
|
||||||
sdb = Xapian::WritableDatabase(stemdbdir,
|
|
||||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
|
||||||
} catch (const Xapian::Error &e) {
|
|
||||||
ermsg = e.get_msg().c_str();
|
|
||||||
} catch (const string &s) {
|
|
||||||
ermsg = s.c_str();
|
|
||||||
} catch (const char *s) {
|
|
||||||
ermsg = s;
|
|
||||||
} catch (...) {
|
|
||||||
ermsg = "Caught unknown exception";
|
|
||||||
}
|
|
||||||
if (ermsg != "NOERROR") {
|
|
||||||
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
|
||||||
stemdbdir.c_str(), ermsg));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Enter pseud-docs in db. Walk the multimap, only enter
|
|
||||||
// associations where there are several parent terms
|
|
||||||
string stem;
|
|
||||||
list<string> derivs;
|
|
||||||
for (multimap<string,string>::const_iterator it = assocs.begin();
|
|
||||||
it != assocs.end(); it++) {
|
|
||||||
if (stem == it->first) {
|
|
||||||
// Staying with same stem
|
|
||||||
derivs.push_back(it->second);
|
|
||||||
// cerr << " " << it->second << endl;
|
|
||||||
} else {
|
|
||||||
// Changing stems
|
|
||||||
++stemdiff;
|
|
||||||
if (derivs.size() == 1) {
|
|
||||||
// Exactly one term stems to this. Check for the case where
|
|
||||||
// the stem itself exists as a term. The code above would not
|
|
||||||
// have inserted anything in this case.
|
|
||||||
if (m_ndb->wdb.term_exists(stem))
|
|
||||||
derivs.push_back(stem);
|
|
||||||
}
|
|
||||||
if (derivs.size() > 1) {
|
|
||||||
// Previous stem has multiple derivatives. Enter in db
|
|
||||||
++stemmultiple;
|
|
||||||
Xapian::Document newdocument;
|
|
||||||
newdocument.add_term(stem);
|
|
||||||
// The doc data is just parents=blank-separated-list
|
|
||||||
string record = "parents=";
|
|
||||||
for (list<string>::const_iterator it = derivs.begin();
|
|
||||||
it != derivs.end(); it++) {
|
|
||||||
record += *it + " ";
|
|
||||||
}
|
|
||||||
record += "\n";
|
|
||||||
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
|
|
||||||
newdocument.set_data(record);
|
|
||||||
try {
|
|
||||||
sdb.replace_document(stem, newdocument);
|
|
||||||
} catch (...) {
|
|
||||||
LOGERR(("Db::createstemdb: replace failed\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
derivs.clear();
|
|
||||||
stem = it->first;
|
|
||||||
derivs.push_back(it->second);
|
|
||||||
// cerr << "\n" << stem << " " << it->second;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
|
||||||
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
|
||||||
wiper.do_it = false;
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is called at the end of an indexing session, to delete the
|
* This is called at the end of an indexing session, to delete the
|
||||||
* documents for files that are no longer there.
|
* documents for files that are no longer there.
|
||||||
@ -907,57 +747,6 @@ bool Db::purge()
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Expand term to list of all terms which stem to the same term.
|
|
||||||
*/
|
|
||||||
static list<string> stemexpand(Native *m_ndb, string term, const string& lang)
|
|
||||||
{
|
|
||||||
list<string> explist;
|
|
||||||
try {
|
|
||||||
Xapian::Stem stemmer(lang);
|
|
||||||
string stem = stemmer.stem_word(term);
|
|
||||||
LOGDEB(("stemexpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
|
|
||||||
// Try to fetch the doc from the stem db
|
|
||||||
string stemdbdir = stemdbname(m_ndb->m_basedir, lang);
|
|
||||||
Xapian::Database sdb(stemdbdir);
|
|
||||||
LOGDEB1(("stemexpand: %s lastdocid: %d\n",
|
|
||||||
stemdbdir.c_str(), sdb.get_lastdocid()));
|
|
||||||
if (!sdb.term_exists(stem)) {
|
|
||||||
LOGDEB1(("Db::stemexpand: no term for %s\n", stem.c_str()));
|
|
||||||
explist.push_back(term);
|
|
||||||
return explist;
|
|
||||||
}
|
|
||||||
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
|
||||||
if (did == sdb.postlist_end(stem)) {
|
|
||||||
LOGDEB1(("stemexpand: no term(1) for %s\n",stem.c_str()));
|
|
||||||
explist.push_back(term);
|
|
||||||
return explist;
|
|
||||||
}
|
|
||||||
Xapian::Document doc = sdb.get_document(*did);
|
|
||||||
string data = doc.get_data();
|
|
||||||
// No need for a conftree, but we need to massage the data a little
|
|
||||||
string::size_type pos = data.find_first_of("=");
|
|
||||||
++pos;
|
|
||||||
string::size_type pos1 = data.find_last_of("\n");
|
|
||||||
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
|
|
||||||
explist.push_back(term);
|
|
||||||
return explist;
|
|
||||||
}
|
|
||||||
stringToStrings(data.substr(pos, pos1-pos), explist);
|
|
||||||
if (find(explist.begin(), explist.end(), term) == explist.end()) {
|
|
||||||
explist.push_back(term);
|
|
||||||
}
|
|
||||||
LOGDEB(("stemexpand: %s -> %s\n", stem.c_str(),
|
|
||||||
stringlistdisp(explist).c_str()));
|
|
||||||
} catch (...) {
|
|
||||||
LOGERR(("stemexpand: error accessing stem db\n"));
|
|
||||||
explist.push_back(term);
|
|
||||||
return explist;
|
|
||||||
}
|
|
||||||
return explist;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Splitter callback for breaking query into terms
|
// Splitter callback for breaking query into terms
|
||||||
class wsQData : public TextSplitCB {
|
class wsQData : public TextSplitCB {
|
||||||
public:
|
public:
|
||||||
@ -1042,7 +831,7 @@ static void stringToXapianQueries(const string &iq,
|
|||||||
dumb_string(term, term1);
|
dumb_string(term, term1);
|
||||||
// Possibly perform stem compression/expansion
|
// Possibly perform stem compression/expansion
|
||||||
if (!nostemexp && (opts & Db::QO_STEM)) {
|
if (!nostemexp && (opts & Db::QO_STEM)) {
|
||||||
exp = stemexpand(m_ndb, term1, stemlang);
|
exp = StemDb::stemExpand(m_ndb->m_basedir, stemlang,term1);
|
||||||
} else {
|
} else {
|
||||||
exp.push_back(term1);
|
exp.push_back(term1);
|
||||||
}
|
}
|
||||||
|
|||||||
257
src/rcldb/stemdb.cpp
Normal file
257
src/rcldb/stemdb.cpp
Normal file
@ -0,0 +1,257 @@
|
|||||||
|
#ifndef lint
|
||||||
|
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
#include <xapian.h>
|
||||||
|
#include <xapian/stem.h>
|
||||||
|
|
||||||
|
#include "stemdb.h"
|
||||||
|
#include "wipedir.h"
|
||||||
|
#include "pathut.h"
|
||||||
|
#include "debuglog.h"
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
namespace Rcl {
|
||||||
|
namespace StemDb {
|
||||||
|
|
||||||
|
|
||||||
|
// Prefix of the stem database subdirectory names: the language name is
// appended to it to name the directory, and it is stripped from the
// directory names to recover the languages.
const static string stemdirstem = "stem_";
|
||||||
|
|
||||||
|
/// Compute name of stem db for given base database and language
|
||||||
|
static string stemdbname(const string& dbdir, const string& lang)
|
||||||
|
{
|
||||||
|
return path_cat(dbdir, stemdirstem + lang);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// List the languages for which a stem database directory exists in dbdir.
/// Each matching directory name is reduced to what follows the prefix.
list<string> getLangs(const string& dbdir)
{
    string glob = stemdirstem + "*";
    list<string> langs = path_dirglob(dbdir, glob);
    list<string>::iterator cur = langs.begin();
    while (cur != langs.end()) {
        // Drop the leading path, then the stem directory prefix
        const string name = path_basename(*cur);
        *cur = name.substr(stemdirstem.length(), string::npos);
        cur++;
    }
    return langs;
}
|
||||||
|
|
||||||
|
bool deleteDb(const string& dbdir, const string& lang)
|
||||||
|
{
|
||||||
|
string dir = stemdbname(dbdir, lang);
|
||||||
|
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Autoclean/delete directory
|
||||||
|
class DirWiper {
|
||||||
|
public:
|
||||||
|
string dir;
|
||||||
|
bool do_it;
|
||||||
|
DirWiper(string d) : dir(d), do_it(true) {}
|
||||||
|
~DirWiper() {
|
||||||
|
if (do_it) {
|
||||||
|
wipedir(dir);
|
||||||
|
rmdir(dir.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Predicate used to decide if a term is a candidate for stemming: true for
// any 7-bit value which is not a lowercase ascii letter (numerals, capitals,
// punctuation...). Values of 128 and above are not rejected.
inline static bool
p_notlowerascii(unsigned int c)
{
    const bool beforelower = c < 'a';
    const bool afterlower7bit = c > 'z' && c < 128;
    return beforelower || afterlower7bit;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create database of stem to parents associations for a given language.
|
||||||
|
* We walk the list of all terms, stem them, and create another Xapian db
|
||||||
|
* with documents indexed by a single term (the stem), and with the list of
|
||||||
|
* parent terms in the document data.
|
||||||
|
*/
|
||||||
|
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
||||||
|
{
|
||||||
|
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
||||||
|
|
||||||
|
// First build the in-memory stem database:
|
||||||
|
// We walk the list of all terms, and stem each.
|
||||||
|
// If the stem is identical to the term, no need to create an entry
|
||||||
|
// Else, we add an entry to the multimap.
|
||||||
|
// At the end, we only save stem-terms associations with several terms, the
|
||||||
|
// others are not useful
|
||||||
|
// Note: a map<string, list<string> > would probably be more efficient
|
||||||
|
multimap<string, string> assocs;
|
||||||
|
// Statistics
|
||||||
|
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
||||||
|
int stemconst=0; // Stem == term
|
||||||
|
int stemdiff=0; // Count of all different stems
|
||||||
|
int stemmultiple = 0; // Count of stems with multiple derivatives
|
||||||
|
try {
|
||||||
|
Xapian::Stem stemmer(lang);
|
||||||
|
Xapian::TermIterator it;
|
||||||
|
for (it = xdb.allterms_begin();
|
||||||
|
it != xdb.allterms_end(); it++) {
|
||||||
|
// If it has any non-lowercase 7bit char, cant be stemmable
|
||||||
|
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||||
|
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||||
|
++nostem;
|
||||||
|
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
||||||
|
// (*it).c_str(), *sit));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
string stem = stemmer.stem_word(*it);
|
||||||
|
//cerr << "word " << *it << " stem " << stem << endl;
|
||||||
|
if (stem == *it) {
|
||||||
|
++stemconst;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
assocs.insert(pair<string,string>(stem, *it));
|
||||||
|
}
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||||
|
return false;
|
||||||
|
} catch (...) {
|
||||||
|
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
||||||
|
lang.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create xapian database for stem relations
|
||||||
|
string stemdbdir = stemdbname(dbdir, lang);
|
||||||
|
// We want to get rid of the db dir in case of error. This gets disarmed
|
||||||
|
// just before success return.
|
||||||
|
DirWiper wiper(stemdbdir);
|
||||||
|
const char *ermsg = "NOERROR";
|
||||||
|
Xapian::WritableDatabase sdb;
|
||||||
|
try {
|
||||||
|
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||||
|
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
ermsg = e.get_msg().c_str();
|
||||||
|
} catch (const string &s) {
|
||||||
|
ermsg = s.c_str();
|
||||||
|
} catch (const char *s) {
|
||||||
|
ermsg = s;
|
||||||
|
} catch (...) {
|
||||||
|
ermsg = "Caught unknown exception";
|
||||||
|
}
|
||||||
|
if (ermsg != "NOERROR") {
|
||||||
|
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
||||||
|
stemdbdir.c_str(), ermsg));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enter pseud-docs in db. Walk the multimap, only enter
|
||||||
|
// associations where there are several parent terms
|
||||||
|
string stem;
|
||||||
|
list<string> derivs;
|
||||||
|
for (multimap<string,string>::const_iterator it = assocs.begin();
|
||||||
|
it != assocs.end(); it++) {
|
||||||
|
if (stem == it->first) {
|
||||||
|
// Staying with same stem
|
||||||
|
derivs.push_back(it->second);
|
||||||
|
// cerr << " " << it->second << endl;
|
||||||
|
} else {
|
||||||
|
// Changing stems
|
||||||
|
++stemdiff;
|
||||||
|
if (derivs.size() == 1) {
|
||||||
|
// Exactly one term stems to this. Check for the case where
|
||||||
|
// the stem itself exists as a term. The code above would not
|
||||||
|
// have inserted anything in this case.
|
||||||
|
if (xdb.term_exists(stem))
|
||||||
|
derivs.push_back(stem);
|
||||||
|
}
|
||||||
|
if (derivs.size() > 1) {
|
||||||
|
// Previous stem has multiple derivatives. Enter in db
|
||||||
|
++stemmultiple;
|
||||||
|
Xapian::Document newdocument;
|
||||||
|
newdocument.add_term(stem);
|
||||||
|
// The doc data is just parents=blank-separated-list
|
||||||
|
string record = "parents=";
|
||||||
|
for (list<string>::const_iterator it = derivs.begin();
|
||||||
|
it != derivs.end(); it++) {
|
||||||
|
record += *it + " ";
|
||||||
|
}
|
||||||
|
record += "\n";
|
||||||
|
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
|
||||||
|
newdocument.set_data(record);
|
||||||
|
try {
|
||||||
|
sdb.replace_document(stem, newdocument);
|
||||||
|
} catch (...) {
|
||||||
|
LOGERR(("Db::createstemdb: replace failed\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
derivs.clear();
|
||||||
|
stem = it->first;
|
||||||
|
derivs.push_back(it->second);
|
||||||
|
// cerr << "\n" << stem << " " << it->second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
||||||
|
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
||||||
|
wiper.do_it = false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expand term to list of all terms which stem to the same term.
|
||||||
|
*/
|
||||||
|
list<string> stemExpand(const string& dbdir, const string& lang,
|
||||||
|
const string& term)
|
||||||
|
{
|
||||||
|
list<string> explist;
|
||||||
|
try {
|
||||||
|
Xapian::Stem stemmer(lang);
|
||||||
|
string stem = stemmer.stem_word(term);
|
||||||
|
LOGDEB(("stemExpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
|
||||||
|
// Try to fetch the doc from the stem db
|
||||||
|
string stemdbdir = stemdbname(dbdir, lang);
|
||||||
|
Xapian::Database sdb(stemdbdir);
|
||||||
|
LOGDEB1(("stemExpand: %s lastdocid: %d\n",
|
||||||
|
stemdbdir.c_str(), sdb.get_lastdocid()));
|
||||||
|
if (!sdb.term_exists(stem)) {
|
||||||
|
LOGDEB1(("Db::stemExpand: no term for %s\n", stem.c_str()));
|
||||||
|
explist.push_back(term);
|
||||||
|
return explist;
|
||||||
|
}
|
||||||
|
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
||||||
|
if (did == sdb.postlist_end(stem)) {
|
||||||
|
LOGDEB1(("stemExpand: no term(1) for %s\n",stem.c_str()));
|
||||||
|
explist.push_back(term);
|
||||||
|
return explist;
|
||||||
|
}
|
||||||
|
Xapian::Document doc = sdb.get_document(*did);
|
||||||
|
string data = doc.get_data();
|
||||||
|
// No need for a conftree, but we need to massage the data a little
|
||||||
|
string::size_type pos = data.find_first_of("=");
|
||||||
|
++pos;
|
||||||
|
string::size_type pos1 = data.find_last_of("\n");
|
||||||
|
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
|
||||||
|
explist.push_back(term);
|
||||||
|
return explist;
|
||||||
|
}
|
||||||
|
stringToStrings(data.substr(pos, pos1-pos), explist);
|
||||||
|
if (find(explist.begin(), explist.end(), term) == explist.end()) {
|
||||||
|
explist.push_back(term);
|
||||||
|
}
|
||||||
|
LOGDEB(("stemExpand: %s -> %s\n", stem.c_str(),
|
||||||
|
stringlistdisp(explist).c_str()));
|
||||||
|
} catch (...) {
|
||||||
|
LOGERR(("stemExpand: error accessing stem db\n"));
|
||||||
|
explist.push_back(term);
|
||||||
|
return explist;
|
||||||
|
}
|
||||||
|
return explist;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
37
src/rcldb/stemdb.h
Normal file
37
src/rcldb/stemdb.h
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
#ifndef _STEMDB_H_INCLUDED_
|
||||||
|
#define _STEMDB_H_INCLUDED_
|
||||||
|
/* @(#$Id: stemdb.h,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
#ifdef RCLDB_INTERNAL
|
||||||
|
/// Stem database code
|
||||||
|
///
|
||||||
|
/// Stem databases list stems and the set of index terms they expand to. They
|
||||||
|
/// are computed from index data by stemming each term and regrouping those
|
||||||
|
/// that stem to the same value.
|
||||||
|
/// Stem databases are stored as separate xapian databases (used as an
|
||||||
|
/// Isam method), in subdirectories of the index.
|
||||||
|
|
||||||
|
#include <list>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include <xapian.h>
|
||||||
|
|
||||||
|
namespace Rcl {
|
||||||
|
namespace StemDb {
|
||||||
|
|
||||||
|
/// Get languages of existing stem databases
/// @param dbdir directory of the index, in which the stem databases are
///   stored as subdirectories
/// @return the list of language names
extern std::list<std::string> getLangs(const std::string& dbdir);
/// Delete stem database for given language
/// @param dbdir directory of the index
/// @param lang language of the stem database to delete
/// @return true if the stem database directory could be emptied and removed
extern bool deleteDb(const std::string& dbdir, const std::string& lang);
/// Create stem database for given language
/// @param xdb the index, the terms of which get stemmed and regrouped
/// @param dbdir directory of the index
/// @param lang stemming language
/// @return true on success
extern bool createDb(Xapian::Database& xdb,
const std::string& dbdir, const std::string& lang);
/// Expand term to stem siblings
/// @param dbdir directory of the index
/// @param lang stemming language
/// @param term the term to expand
/// @return the terms stored for the stem of term; the list always
///   includes term itself
extern std::list<std::string> stemExpand(const std::string& dbdir,
const std::string& lang,
const std::string& term);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // RCLDB_INTERNAL
|
||||||
|
#endif /* _STEMDB_H_INCLUDED_ */
|
||||||
Loading…
x
Reference in New Issue
Block a user