From 7302cb745fbab20528e3a410e7d6d9b5e86bae32 Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 13 Apr 2006 09:50:03 +0000 Subject: [PATCH] extracted stem database from rcldb to make it smaller --- src/lib/Makefile | 5 +- src/rcldb/rcldb.cpp | 235 ++------------------------------------- src/rcldb/stemdb.cpp | 257 +++++++++++++++++++++++++++++++++++++++++++ src/rcldb/stemdb.h | 37 +++++++ 4 files changed, 310 insertions(+), 224 deletions(-) create mode 100644 src/rcldb/stemdb.cpp create mode 100644 src/rcldb/stemdb.h diff --git a/src/lib/Makefile b/src/lib/Makefile index 17797bbf..a2d27952 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -15,7 +15,7 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \ mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \ rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \ textsplit.o transcode.o \ - unacpp.o unac.o docseq.o sortseq.o copyfile.o + unacpp.o unac.o docseq.o sortseq.o copyfile.o stemdb.o SRCS = \ $(depth)/utils/conftree.cpp $(depth)/index/csguess.cpp \ @@ -30,6 +30,7 @@ SRCS = \ $(depth)/common/myhtmlparse.cpp $(depth)/common/pathhash.cpp \ $(depth)/utils/pathut.cpp $(depth)/common/rclconfig.cpp \ $(depth)/common/rcldb.cpp $(depth)/common/rclinit.cpp \ + $(depth)/common/stemdb.cpp \ $(depth)/utils/base64.cpp $(depth)/utils/readfile.cpp \ $(depth)/utils/smallut.cpp $(depth)/common/textsplit.cpp \ $(depth)/utils/transcode.cpp $(depth)/common/unacpp.cpp \ @@ -96,6 +97,8 @@ rclinit.o : $(depth)/common/rclinit.cpp $(CXX) $(ALL_CXXFLAGS) -c $< rcldb.o : $(depth)/common/rcldb.cpp $(CXX) $(ALL_CXXFLAGS) -c $< +stemdb.o : $(depth)/common/stemdb.cpp + $(CXX) $(ALL_CXXFLAGS) -c $< readfile.o : $(depth)/utils/readfile.cpp $(CXX) $(ALL_CXXFLAGS) -c $< base64.o : $(depth)/utils/base64.cpp diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index e1067f68..7cc5e61e 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.68 2006-04-13 09:50:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -30,8 +30,10 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.67 2006-04-12 10:41:39 dockes Exp $ #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ +#define RCLDB_INTERNAL #include "rcldb.h" +#include "stemdb.h" #include "textsplit.h" #include "transcode.h" #include "unacpp.h" @@ -41,10 +43,8 @@ using namespace std; #include "smallut.h" #include "pathhash.h" #include "utf8iter.h" -#include "wipedir.h" #include "xapian.h" -#include #ifndef MAX #define MAX(A,B) (A>B?A:B) @@ -664,27 +664,15 @@ bool Db::needUpdate(const string &filename, const struct stat *stp) return true; } -const static string stemdirstem = "stem_"; -/// Compute name of stem db for given base database and language -static string stemdbname(const string& basename, string lang) -{ - string nm = path_cat(basename, stemdirstem + lang); - return nm; -} // Return list of existing stem db languages list Db::getStemLangs() { - list dirs; LOGDEB(("Db::getStemLang\n")); - if (m_ndb == 0) + list dirs; + if (m_ndb == 0 || m_ndb->m_isopen == false) return dirs; - string pattern = stemdirstem + "*"; - dirs = path_dirglob(m_ndb->m_basedir, pattern); - for (list::iterator it = dirs.begin(); it != dirs.end(); it++) { - *it = path_basename(*it); - *it = it->substr(stemdirstem.length(), string::npos); - } + dirs = StemDb::getLangs(m_ndb->m_basedir); return dirs; } @@ -694,25 +682,9 @@ list Db::getStemLangs() bool Db::deleteStemDb(const string& lang) { LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str())); - if (m_ndb == 0) + if (m_ndb == 0 || m_ndb->m_isopen == false) return false; - if (m_ndb->m_isopen == false) - return false; - - string dir = stemdbname(m_ndb->m_basedir, lang); - if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0) - return true; - return false; -} - -// Deciding if we try to stem the term. If it has numerals or capitals -// we don't -inline static bool -p_notlowerascii(unsigned int c) -{ - if (c < 'a' || (c > 'z' && c < 128)) - return true; - return false; + return StemDb::deleteDb(m_ndb->m_basedir, lang); } /** @@ -724,145 +696,13 @@ p_notlowerascii(unsigned int c) bool Db::createStemDb(const string& lang) { LOGDEB(("Db::createStemDb(%s)\n", lang.c_str())); - if (m_ndb == 0) - return false; - if (m_ndb->m_isopen == false) + if (m_ndb == 0 || m_ndb->m_isopen == false) return false; - // First build the in-memory stem database: - // We walk the list of all terms, and stem each. - // If the stem is identical to the term, no need to create an entry - // Else, we add an entry to the multimap. - // At the end, we only save stem-terms associations with several terms, the - // others are not useful - // Note: a map > would probably be more efficient - multimap assocs; - // Statistics - int nostem=0; // Dont even try: not-alphanum (incomplete for now) - int stemconst=0; // Stem == term - int stemdiff=0; // Count of all different stems - int stemmultiple = 0; // Count of stems with multiple derivatives - try { - Xapian::Stem stemmer(lang); - Xapian::TermIterator it; - for (it = m_ndb->wdb.allterms_begin(); - it != m_ndb->wdb.allterms_end(); it++) { - // If it has any non-lowercase 7bit char, cant be stemmable - string::iterator sit = (*it).begin(), eit = sit + (*it).length(); - if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) { - ++nostem; - // LOGDEB(("stemskipped: [%s], because of 0x%x\n", - // (*it).c_str(), *sit)); - continue; - } - string stem = stemmer.stem_word(*it); - //cerr << "word " << *it << " stem " << stem << endl; - if (stem == *it) { - ++stemconst; - continue; - } - assocs.insert(pair(stem, *it)); - } - } catch (const Xapian::Error &e) { - LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str())); - return false; - } catch (...) { - LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", - lang.c_str())); - return false; - } - - class DirWiper { - public: - string dir; - bool do_it; - DirWiper(string d) : dir(d), do_it(true) {} - ~DirWiper() { - if (do_it) { - wipedir(dir); - rmdir(dir.c_str()); - } - } - }; - // Create xapian database for stem relations - string stemdbdir = stemdbname(m_ndb->m_basedir, lang); - // We want to get rid of the db dir in case of error. This gets disarmed - // just before success return. - DirWiper wiper(stemdbdir); - const char *ermsg = "NOERROR"; - Xapian::WritableDatabase sdb; - try { - sdb = Xapian::WritableDatabase(stemdbdir, - Xapian::DB_CREATE_OR_OVERWRITE); - } catch (const Xapian::Error &e) { - ermsg = e.get_msg().c_str(); - } catch (const string &s) { - ermsg = s.c_str(); - } catch (const char *s) { - ermsg = s; - } catch (...) { - ermsg = "Caught unknown exception"; - } - if (ermsg != "NOERROR") { - LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n", - stemdbdir.c_str(), ermsg)); - return false; - } - - // Enter pseud-docs in db. Walk the multimap, only enter - // associations where there are several parent terms - string stem; - list derivs; - for (multimap::const_iterator it = assocs.begin(); - it != assocs.end(); it++) { - if (stem == it->first) { - // Staying with same stem - derivs.push_back(it->second); - // cerr << " " << it->second << endl; - } else { - // Changing stems - ++stemdiff; - if (derivs.size() == 1) { - // Exactly one term stems to this. Check for the case where - // the stem itself exists as a term. The code above would not - // have inserted anything in this case. - if (m_ndb->wdb.term_exists(stem)) - derivs.push_back(stem); - } - if (derivs.size() > 1) { - // Previous stem has multiple derivatives. Enter in db - ++stemmultiple; - Xapian::Document newdocument; - newdocument.add_term(stem); - // The doc data is just parents=blank-separated-list - string record = "parents="; - for (list::const_iterator it = derivs.begin(); - it != derivs.end(); it++) { - record += *it + " "; - } - record += "\n"; - LOGDEB1(("stemdocument data: %s\n", record.c_str())); - newdocument.set_data(record); - try { - sdb.replace_document(stem, newdocument); - } catch (...) { - LOGERR(("Db::createstemdb: replace failed\n")); - return false; - } - } - derivs.clear(); - stem = it->first; - derivs.push_back(it->second); - // cerr << "\n" << stem << " " << it->second; - } - } - LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", - assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); - wiper.do_it = false; - return true; + return StemDb:: createDb(m_ndb->m_iswritable ? m_ndb->wdb : m_ndb->db, + m_ndb->m_basedir, lang); } - /** * This is called at the end of an indexing session, to delete the * documents for files that are no longer there. @@ -907,57 +747,6 @@ bool Db::purge() return true; } -/** - * Expand term to list of all terms which stem to the same term. - */ -static list stemexpand(Native *m_ndb, string term, const string& lang) -{ - list explist; - try { - Xapian::Stem stemmer(lang); - string stem = stemmer.stem_word(term); - LOGDEB(("stemexpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str())); - // Try to fetch the doc from the stem db - string stemdbdir = stemdbname(m_ndb->m_basedir, lang); - Xapian::Database sdb(stemdbdir); - LOGDEB1(("stemexpand: %s lastdocid: %d\n", - stemdbdir.c_str(), sdb.get_lastdocid())); - if (!sdb.term_exists(stem)) { - LOGDEB1(("Db::stemexpand: no term for %s\n", stem.c_str())); - explist.push_back(term); - return explist; - } - Xapian::PostingIterator did = sdb.postlist_begin(stem); - if (did == sdb.postlist_end(stem)) { - LOGDEB1(("stemexpand: no term(1) for %s\n",stem.c_str())); - explist.push_back(term); - return explist; - } - Xapian::Document doc = sdb.get_document(*did); - string data = doc.get_data(); - // No need for a conftree, but we need to massage the data a little - string::size_type pos = data.find_first_of("="); - ++pos; - string::size_type pos1 = data.find_last_of("\n"); - if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ?? - explist.push_back(term); - return explist; - } - stringToStrings(data.substr(pos, pos1-pos), explist); - if (find(explist.begin(), explist.end(), term) == explist.end()) { - explist.push_back(term); - } - LOGDEB(("stemexpand: %s -> %s\n", stem.c_str(), - stringlistdisp(explist).c_str())); - } catch (...) { - LOGERR(("stemexpand: error accessing stem db\n")); - explist.push_back(term); - return explist; - } - return explist; -} - - // Splitter callback for breaking query into terms class wsQData : public TextSplitCB { public: @@ -1042,7 +831,7 @@ static void stringToXapianQueries(const string &iq, dumb_string(term, term1); // Possibly perform stem compression/expansion if (!nostemexp && (opts & Db::QO_STEM)) { - exp = stemexpand(m_ndb, term1, stemlang); + exp = StemDb::stemExpand(m_ndb->m_basedir, stemlang,term1); } else { exp.push_back(term1); } diff --git a/src/rcldb/stemdb.cpp b/src/rcldb/stemdb.cpp new file mode 100644 index 00000000..c9ec066e --- /dev/null +++ b/src/rcldb/stemdb.cpp @@ -0,0 +1,257 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif + +#include +#include + +#include +#include + +#include "stemdb.h" +#include "wipedir.h" +#include "pathut.h" +#include "debuglog.h" +#include "smallut.h" + +using namespace std; + +namespace Rcl { +namespace StemDb { + + +const static string stemdirstem = "stem_"; + +/// Compute name of stem db for given base database and language +static string stemdbname(const string& dbdir, const string& lang) +{ + return path_cat(dbdir, stemdirstem + lang); +} + +list getLangs(const string& dbdir) +{ + string pattern = stemdirstem + "*"; + list dirs = path_dirglob(dbdir, pattern); + for (list::iterator it = dirs.begin(); it != dirs.end(); it++) { + *it = path_basename(*it); + *it = it->substr(stemdirstem.length(), string::npos); + } + return dirs; +} + +bool deleteDb(const string& dbdir, const string& lang) +{ + string dir = stemdbname(dbdir, lang); + if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0) + return true; + return false; +} + +// Autoclean/delete directory +class DirWiper { + public: + string dir; + bool do_it; + DirWiper(string d) : dir(d), do_it(true) {} + ~DirWiper() { + if (do_it) { + wipedir(dir); + rmdir(dir.c_str()); + } + } +}; + +// Deciding if we try to stem the term. If it has numerals or capitals +// we don't +inline static bool +p_notlowerascii(unsigned int c) +{ + if (c < 'a' || (c > 'z' && c < 128)) + return true; + return false; +} + +/** + * Create database of stem to parents associations for a given language. + * We walk the list of all terms, stem them, and create another Xapian db + * with documents indexed by a single term (the stem), and with the list of + * parent terms in the document data. + */ +bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang) +{ + LOGDEB(("StemDb::createDb(%s)\n", lang.c_str())); + + // First build the in-memory stem database: + // We walk the list of all terms, and stem each. + // If the stem is identical to the term, no need to create an entry + // Else, we add an entry to the multimap. + // At the end, we only save stem-terms associations with several terms, the + // others are not useful + // Note: a map > would probably be more efficient + multimap assocs; + // Statistics + int nostem=0; // Dont even try: not-alphanum (incomplete for now) + int stemconst=0; // Stem == term + int stemdiff=0; // Count of all different stems + int stemmultiple = 0; // Count of stems with multiple derivatives + try { + Xapian::Stem stemmer(lang); + Xapian::TermIterator it; + for (it = xdb.allterms_begin(); + it != xdb.allterms_end(); it++) { + // If it has any non-lowercase 7bit char, cant be stemmable + string::iterator sit = (*it).begin(), eit = sit + (*it).length(); + if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) { + ++nostem; + // LOGDEB(("stemskipped: [%s], because of 0x%x\n", + // (*it).c_str(), *sit)); + continue; + } + string stem = stemmer.stem_word(*it); + //cerr << "word " << *it << " stem " << stem << endl; + if (stem == *it) { + ++stemconst; + continue; + } + assocs.insert(pair(stem, *it)); + } + } catch (const Xapian::Error &e) { + LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str())); + return false; + } catch (...) { + LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", + lang.c_str())); + return false; + } + + // Create xapian database for stem relations + string stemdbdir = stemdbname(dbdir, lang); + // We want to get rid of the db dir in case of error. This gets disarmed + // just before success return. + DirWiper wiper(stemdbdir); + const char *ermsg = "NOERROR"; + Xapian::WritableDatabase sdb; + try { + sdb = Xapian::WritableDatabase(stemdbdir, + Xapian::DB_CREATE_OR_OVERWRITE); + } catch (const Xapian::Error &e) { + ermsg = e.get_msg().c_str(); + } catch (const string &s) { + ermsg = s.c_str(); + } catch (const char *s) { + ermsg = s; + } catch (...) { + ermsg = "Caught unknown exception"; + } + if (ermsg != "NOERROR") { + LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n", + stemdbdir.c_str(), ermsg)); + return false; + } + + // Enter pseud-docs in db. Walk the multimap, only enter + // associations where there are several parent terms + string stem; + list derivs; + for (multimap::const_iterator it = assocs.begin(); + it != assocs.end(); it++) { + if (stem == it->first) { + // Staying with same stem + derivs.push_back(it->second); + // cerr << " " << it->second << endl; + } else { + // Changing stems + ++stemdiff; + if (derivs.size() == 1) { + // Exactly one term stems to this. Check for the case where + // the stem itself exists as a term. The code above would not + // have inserted anything in this case. + if (xdb.term_exists(stem)) + derivs.push_back(stem); + } + if (derivs.size() > 1) { + // Previous stem has multiple derivatives. Enter in db + ++stemmultiple; + Xapian::Document newdocument; + newdocument.add_term(stem); + // The doc data is just parents=blank-separated-list + string record = "parents="; + for (list::const_iterator it = derivs.begin(); + it != derivs.end(); it++) { + record += *it + " "; + } + record += "\n"; + LOGDEB1(("stemdocument data: %s\n", record.c_str())); + newdocument.set_data(record); + try { + sdb.replace_document(stem, newdocument); + } catch (...) { + LOGERR(("Db::createstemdb: replace failed\n")); + return false; + } + } + derivs.clear(); + stem = it->first; + derivs.push_back(it->second); + // cerr << "\n" << stem << " " << it->second; + } + } + LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", + assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); + wiper.do_it = false; + return true; +} + +/** + * Expand term to list of all terms which stem to the same term. + */ +list stemExpand(const string& dbdir, const string& lang, + const string& term) +{ + list explist; + try { + Xapian::Stem stemmer(lang); + string stem = stemmer.stem_word(term); + LOGDEB(("stemExpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str())); + // Try to fetch the doc from the stem db + string stemdbdir = stemdbname(dbdir, lang); + Xapian::Database sdb(stemdbdir); + LOGDEB1(("stemExpand: %s lastdocid: %d\n", + stemdbdir.c_str(), sdb.get_lastdocid())); + if (!sdb.term_exists(stem)) { + LOGDEB1(("Db::stemExpand: no term for %s\n", stem.c_str())); + explist.push_back(term); + return explist; + } + Xapian::PostingIterator did = sdb.postlist_begin(stem); + if (did == sdb.postlist_end(stem)) { + LOGDEB1(("stemExpand: no term(1) for %s\n",stem.c_str())); + explist.push_back(term); + return explist; + } + Xapian::Document doc = sdb.get_document(*did); + string data = doc.get_data(); + // No need for a conftree, but we need to massage the data a little + string::size_type pos = data.find_first_of("="); + ++pos; + string::size_type pos1 = data.find_last_of("\n"); + if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ?? + explist.push_back(term); + return explist; + } + stringToStrings(data.substr(pos, pos1-pos), explist); + if (find(explist.begin(), explist.end(), term) == explist.end()) { + explist.push_back(term); + } + LOGDEB(("stemExpand: %s -> %s\n", stem.c_str(), + stringlistdisp(explist).c_str())); + } catch (...) { + LOGERR(("stemExpand: error accessing stem db\n")); + explist.push_back(term); + return explist; + } + return explist; +} + +} +} diff --git a/src/rcldb/stemdb.h b/src/rcldb/stemdb.h new file mode 100644 index 00000000..41709f6a --- /dev/null +++ b/src/rcldb/stemdb.h @@ -0,0 +1,37 @@ +#ifndef _STEMDB_H_INCLUDED_ +#define _STEMDB_H_INCLUDED_ +/* @(#$Id: stemdb.h,v 1.1 2006-04-13 09:50:03 dockes Exp $ (C) 2004 J.F.Dockes */ +#ifdef RCLDB_INTERNAL +/// Stem database code +/// +/// Stem databases list stems and the set of index terms they expand to. They +/// are computed from index data by stemming each term and regrouping those +/// that stem to the same value. +/// Stem databases are stored as separate xapian databases (used as an +/// Isam method), in subdirectories of the index. + +#include +#include + +#include + +namespace Rcl { +namespace StemDb { + +/// Get languages of existing stem databases +extern std::list getLangs(const std::string& dbdir); +/// Delete stem database for given language +extern bool deleteDb(const std::string& dbdir, const std::string& lang); +/// Create stem database for given language +extern bool createDb(Xapian::Database& xdb, + const std::string& dbdir, const std::string& lang); +/// Expand term to stem siblings +extern std::list stemExpand(const std::string& dbdir, + const std::string& lang, + const std::string& term); + +} +} + +#endif // RCLDB_INTERNAL +#endif /* _STEMDB_H_INCLUDED_ */