extracted stem database from rcldb to make it smaller

This commit is contained in:
dockes 2006-04-13 09:50:03 +00:00
parent 9086c6e531
commit 7302cb745f
4 changed files with 310 additions and 224 deletions

View File

@ -15,7 +15,7 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \
rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
textsplit.o transcode.o \
unacpp.o unac.o docseq.o sortseq.o copyfile.o
unacpp.o unac.o docseq.o sortseq.o copyfile.o stemdb.o
SRCS = \
$(depth)/utils/conftree.cpp $(depth)/index/csguess.cpp \
@ -30,6 +30,7 @@ SRCS = \
$(depth)/common/myhtmlparse.cpp $(depth)/common/pathhash.cpp \
$(depth)/utils/pathut.cpp $(depth)/common/rclconfig.cpp \
$(depth)/common/rcldb.cpp $(depth)/common/rclinit.cpp \
$(depth)/common/stemdb.cpp \
$(depth)/utils/base64.cpp $(depth)/utils/readfile.cpp \
$(depth)/utils/smallut.cpp $(depth)/common/textsplit.cpp \
$(depth)/utils/transcode.cpp $(depth)/common/unacpp.cpp \
@ -96,6 +97,8 @@ rclinit.o : $(depth)/common/rclinit.cpp
$(CXX) $(ALL_CXXFLAGS) -c $<
rcldb.o : $(depth)/common/rcldb.cpp
$(CXX) $(ALL_CXXFLAGS) -c $<
stemdb.o : $(depth)/common/stemdb.cpp
$(CXX) $(ALL_CXXFLAGS) -c $<
readfile.o : $(depth)/utils/readfile.cpp
$(CXX) $(ALL_CXXFLAGS) -c $<
base64.o : $(depth)/utils/base64.cpp

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.68 2006-04-13 09:50:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -30,8 +30,10 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */
#define RCLDB_INTERNAL
#include "rcldb.h"
#include "stemdb.h"
#include "textsplit.h"
#include "transcode.h"
#include "unacpp.h"
@ -41,10 +43,8 @@ using namespace std;
#include "smallut.h"
#include "pathhash.h"
#include "utf8iter.h"
#include "wipedir.h"
#include "xapian.h"
#include <xapian/stem.h>
#ifndef MAX
#define MAX(A,B) (A>B?A:B)
@ -664,27 +664,15 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
return true;
}
const static string stemdirstem = "stem_";
/// Compute name of stem db for given base database and language
static string stemdbname(const string& basename, string lang)
{
string nm = path_cat(basename, stemdirstem + lang);
return nm;
}
// Return list of existing stem db languages
list<string> Db::getStemLangs()
{
list<string> dirs;
LOGDEB(("Db::getStemLang\n"));
if (m_ndb == 0)
list<string> dirs;
if (m_ndb == 0 || m_ndb->m_isopen == false)
return dirs;
string pattern = stemdirstem + "*";
dirs = path_dirglob(m_ndb->m_basedir, pattern);
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
*it = path_basename(*it);
*it = it->substr(stemdirstem.length(), string::npos);
}
dirs = StemDb::getLangs(m_ndb->m_basedir);
return dirs;
}
@ -694,25 +682,9 @@ list<string> Db::getStemLangs()
bool Db::deleteStemDb(const string& lang)
{
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
if (m_ndb == 0)
if (m_ndb == 0 || m_ndb->m_isopen == false)
return false;
if (m_ndb->m_isopen == false)
return false;
string dir = stemdbname(m_ndb->m_basedir, lang);
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
return true;
return false;
}
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerascii(unsigned int c)
{
if (c < 'a' || (c > 'z' && c < 128))
return true;
return false;
return StemDb::deleteDb(m_ndb->m_basedir, lang);
}
/**
@ -724,145 +696,13 @@ p_notlowerascii(unsigned int c)
bool Db::createStemDb(const string& lang)
{
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
if (m_ndb == 0)
return false;
if (m_ndb->m_isopen == false)
if (m_ndb == 0 || m_ndb->m_isopen == false)
return false;
// First build the in-memory stem database:
// We walk the list of all terms, and stem each.
// If the stem is identical to the term, no need to create an entry
// Else, we add an entry to the multimap.
// At the end, we only save stem-terms associations with several terms, the
// others are not useful
// Note: a map<string, list<string> > would probably be more efficient
multimap<string, string> assocs;
// Statistics
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
int stemconst=0; // Stem == term
int stemdiff=0; // Count of all different stems
int stemmultiple = 0; // Count of stems with multiple derivatives
try {
Xapian::Stem stemmer(lang);
Xapian::TermIterator it;
for (it = m_ndb->wdb.allterms_begin();
it != m_ndb->wdb.allterms_end(); it++) {
// If it has any non-lowercase 7bit char, cant be stemmable
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem;
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
// (*it).c_str(), *sit));
continue;
}
string stem = stemmer.stem_word(*it);
//cerr << "word " << *it << " stem " << stem << endl;
if (stem == *it) {
++stemconst;
continue;
}
assocs.insert(pair<string,string>(stem, *it));
}
} catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false;
} catch (...) {
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str()));
return false;
}
class DirWiper {
public:
string dir;
bool do_it;
DirWiper(string d) : dir(d), do_it(true) {}
~DirWiper() {
if (do_it) {
wipedir(dir);
rmdir(dir.c_str());
}
}
};
// Create xapian database for stem relations
string stemdbdir = stemdbname(m_ndb->m_basedir, lang);
// We want to get rid of the db dir in case of error. This gets disarmed
// just before success return.
DirWiper wiper(stemdbdir);
const char *ermsg = "NOERROR";
Xapian::WritableDatabase sdb;
try {
sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
if (ermsg != "NOERROR") {
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
stemdbdir.c_str(), ermsg));
return false;
}
// Enter pseud-docs in db. Walk the multimap, only enter
// associations where there are several parent terms
string stem;
list<string> derivs;
for (multimap<string,string>::const_iterator it = assocs.begin();
it != assocs.end(); it++) {
if (stem == it->first) {
// Staying with same stem
derivs.push_back(it->second);
// cerr << " " << it->second << endl;
} else {
// Changing stems
++stemdiff;
if (derivs.size() == 1) {
// Exactly one term stems to this. Check for the case where
// the stem itself exists as a term. The code above would not
// have inserted anything in this case.
if (m_ndb->wdb.term_exists(stem))
derivs.push_back(stem);
}
if (derivs.size() > 1) {
// Previous stem has multiple derivatives. Enter in db
++stemmultiple;
Xapian::Document newdocument;
newdocument.add_term(stem);
// The doc data is just parents=blank-separated-list
string record = "parents=";
for (list<string>::const_iterator it = derivs.begin();
it != derivs.end(); it++) {
record += *it + " ";
}
record += "\n";
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
try {
sdb.replace_document(stem, newdocument);
} catch (...) {
LOGERR(("Db::createstemdb: replace failed\n"));
return false;
}
}
derivs.clear();
stem = it->first;
derivs.push_back(it->second);
// cerr << "\n" << stem << " " << it->second;
}
}
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false;
return true;
return StemDb:: createDb(m_ndb->m_iswritable ? m_ndb->wdb : m_ndb->db,
m_ndb->m_basedir, lang);
}
/**
* This is called at the end of an indexing session, to delete the
* documents for files that are no longer there.
@ -907,57 +747,6 @@ bool Db::purge()
return true;
}
/**
* Expand term to list of all terms which stem to the same term.
*/
static list<string> stemexpand(Native *m_ndb, string term, const string& lang)
{
list<string> explist;
try {
Xapian::Stem stemmer(lang);
string stem = stemmer.stem_word(term);
LOGDEB(("stemexpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
// Try to fetch the doc from the stem db
string stemdbdir = stemdbname(m_ndb->m_basedir, lang);
Xapian::Database sdb(stemdbdir);
LOGDEB1(("stemexpand: %s lastdocid: %d\n",
stemdbdir.c_str(), sdb.get_lastdocid()));
if (!sdb.term_exists(stem)) {
LOGDEB1(("Db::stemexpand: no term for %s\n", stem.c_str()));
explist.push_back(term);
return explist;
}
Xapian::PostingIterator did = sdb.postlist_begin(stem);
if (did == sdb.postlist_end(stem)) {
LOGDEB1(("stemexpand: no term(1) for %s\n",stem.c_str()));
explist.push_back(term);
return explist;
}
Xapian::Document doc = sdb.get_document(*did);
string data = doc.get_data();
// No need for a conftree, but we need to massage the data a little
string::size_type pos = data.find_first_of("=");
++pos;
string::size_type pos1 = data.find_last_of("\n");
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
explist.push_back(term);
return explist;
}
stringToStrings(data.substr(pos, pos1-pos), explist);
if (find(explist.begin(), explist.end(), term) == explist.end()) {
explist.push_back(term);
}
LOGDEB(("stemexpand: %s -> %s\n", stem.c_str(),
stringlistdisp(explist).c_str()));
} catch (...) {
LOGERR(("stemexpand: error accessing stem db\n"));
explist.push_back(term);
return explist;
}
return explist;
}
// Splitter callback for breaking query into terms
class wsQData : public TextSplitCB {
public:
@ -1042,7 +831,7 @@ static void stringToXapianQueries(const string &iq,
dumb_string(term, term1);
// Possibly perform stem compression/expansion
if (!nostemexp && (opts & Db::QO_STEM)) {
exp = stemexpand(m_ndb, term1, stemlang);
exp = StemDb::stemExpand(m_ndb->m_basedir, stemlang,term1);
} else {
exp.push_back(term1);
}

257
src/rcldb/stemdb.cpp Normal file
View File

@ -0,0 +1,257 @@
#ifndef lint
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <algorithm>
#include <map>
#include <xapian.h>
#include <xapian/stem.h>
#include "stemdb.h"
#include "wipedir.h"
#include "pathut.h"
#include "debuglog.h"
#include "smallut.h"
using namespace std;
namespace Rcl {
namespace StemDb {
const static string stemdirstem = "stem_";
/// Compute name of stem db for given base database and language
static string stemdbname(const string& dbdir, const string& lang)
{
return path_cat(dbdir, stemdirstem + lang);
}
list<string> getLangs(const string& dbdir)
{
string pattern = stemdirstem + "*";
list<string> dirs = path_dirglob(dbdir, pattern);
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
*it = path_basename(*it);
*it = it->substr(stemdirstem.length(), string::npos);
}
return dirs;
}
bool deleteDb(const string& dbdir, const string& lang)
{
string dir = stemdbname(dbdir, lang);
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
return true;
return false;
}
// Autoclean/delete directory
class DirWiper {
public:
string dir;
bool do_it;
DirWiper(string d) : dir(d), do_it(true) {}
~DirWiper() {
if (do_it) {
wipedir(dir);
rmdir(dir.c_str());
}
}
};
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerascii(unsigned int c)
{
if (c < 'a' || (c > 'z' && c < 128))
return true;
return false;
}
/**
* Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db
* with documents indexed by a single term (the stem), and with the list of
* parent terms in the document data.
*/
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
{
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
// First build the in-memory stem database:
// We walk the list of all terms, and stem each.
// If the stem is identical to the term, no need to create an entry
// Else, we add an entry to the multimap.
// At the end, we only save stem-terms associations with several terms, the
// others are not useful
// Note: a map<string, list<string> > would probably be more efficient
multimap<string, string> assocs;
// Statistics
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
int stemconst=0; // Stem == term
int stemdiff=0; // Count of all different stems
int stemmultiple = 0; // Count of stems with multiple derivatives
try {
Xapian::Stem stemmer(lang);
Xapian::TermIterator it;
for (it = xdb.allterms_begin();
it != xdb.allterms_end(); it++) {
// If it has any non-lowercase 7bit char, cant be stemmable
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem;
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
// (*it).c_str(), *sit));
continue;
}
string stem = stemmer.stem_word(*it);
//cerr << "word " << *it << " stem " << stem << endl;
if (stem == *it) {
++stemconst;
continue;
}
assocs.insert(pair<string,string>(stem, *it));
}
} catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false;
} catch (...) {
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str()));
return false;
}
// Create xapian database for stem relations
string stemdbdir = stemdbname(dbdir, lang);
// We want to get rid of the db dir in case of error. This gets disarmed
// just before success return.
DirWiper wiper(stemdbdir);
const char *ermsg = "NOERROR";
Xapian::WritableDatabase sdb;
try {
sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
if (ermsg != "NOERROR") {
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
stemdbdir.c_str(), ermsg));
return false;
}
// Enter pseud-docs in db. Walk the multimap, only enter
// associations where there are several parent terms
string stem;
list<string> derivs;
for (multimap<string,string>::const_iterator it = assocs.begin();
it != assocs.end(); it++) {
if (stem == it->first) {
// Staying with same stem
derivs.push_back(it->second);
// cerr << " " << it->second << endl;
} else {
// Changing stems
++stemdiff;
if (derivs.size() == 1) {
// Exactly one term stems to this. Check for the case where
// the stem itself exists as a term. The code above would not
// have inserted anything in this case.
if (xdb.term_exists(stem))
derivs.push_back(stem);
}
if (derivs.size() > 1) {
// Previous stem has multiple derivatives. Enter in db
++stemmultiple;
Xapian::Document newdocument;
newdocument.add_term(stem);
// The doc data is just parents=blank-separated-list
string record = "parents=";
for (list<string>::const_iterator it = derivs.begin();
it != derivs.end(); it++) {
record += *it + " ";
}
record += "\n";
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
try {
sdb.replace_document(stem, newdocument);
} catch (...) {
LOGERR(("Db::createstemdb: replace failed\n"));
return false;
}
}
derivs.clear();
stem = it->first;
derivs.push_back(it->second);
// cerr << "\n" << stem << " " << it->second;
}
}
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false;
return true;
}
/**
* Expand term to list of all terms which stem to the same term.
*/
list<string> stemExpand(const string& dbdir, const string& lang,
const string& term)
{
list<string> explist;
try {
Xapian::Stem stemmer(lang);
string stem = stemmer.stem_word(term);
LOGDEB(("stemExpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
// Try to fetch the doc from the stem db
string stemdbdir = stemdbname(dbdir, lang);
Xapian::Database sdb(stemdbdir);
LOGDEB1(("stemExpand: %s lastdocid: %d\n",
stemdbdir.c_str(), sdb.get_lastdocid()));
if (!sdb.term_exists(stem)) {
LOGDEB1(("Db::stemExpand: no term for %s\n", stem.c_str()));
explist.push_back(term);
return explist;
}
Xapian::PostingIterator did = sdb.postlist_begin(stem);
if (did == sdb.postlist_end(stem)) {
LOGDEB1(("stemExpand: no term(1) for %s\n",stem.c_str()));
explist.push_back(term);
return explist;
}
Xapian::Document doc = sdb.get_document(*did);
string data = doc.get_data();
// No need for a conftree, but we need to massage the data a little
string::size_type pos = data.find_first_of("=");
++pos;
string::size_type pos1 = data.find_last_of("\n");
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
explist.push_back(term);
return explist;
}
stringToStrings(data.substr(pos, pos1-pos), explist);
if (find(explist.begin(), explist.end(), term) == explist.end()) {
explist.push_back(term);
}
LOGDEB(("stemExpand: %s -> %s\n", stem.c_str(),
stringlistdisp(explist).c_str()));
} catch (...) {
LOGERR(("stemExpand: error accessing stem db\n"));
explist.push_back(term);
return explist;
}
return explist;
}
}
}

37
src/rcldb/stemdb.h Normal file
View File

@ -0,0 +1,37 @@
#ifndef _STEMDB_H_INCLUDED_
#define _STEMDB_H_INCLUDED_
/* @(#$Id: stemdb.h,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2004 J.F.Dockes */
#ifdef RCLDB_INTERNAL
/// Stem database code
///
/// Stem databases list stems and the set of index terms they expand to. They
/// are computed from index data by stemming each term and regrouping those
/// that stem to the same value.
/// Stem databases are stored as separate xapian databases (used as an
/// Isam method), in subdirectories of the index.
#include <list>
#include <string>
#include <xapian.h>
namespace Rcl {
namespace StemDb {
/// Get languages of existing stem databases
extern std::list<std::string> getLangs(const std::string& dbdir);
/// Delete stem database for given language
extern bool deleteDb(const std::string& dbdir, const std::string& lang);
/// Create stem database for given language
extern bool createDb(Xapian::Database& xdb,
const std::string& dbdir, const std::string& lang);
/// Expand term to stem siblings
extern std::list<std::string> stemExpand(const std::string& dbdir,
const std::string& lang,
const std::string& term);
}
}
#endif // RCLDB_INTERNAL
#endif /* _STEMDB_H_INCLUDED_ */