From 9b273d94e864e221e86389e9266c630e90af0e66 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 15 Sep 2012 15:16:20 +0200 Subject: [PATCH] ensure that recoll configured with indexStripChars=1 runs as compiled with -DRCL_INDEX_STRIPCHARS --HG-- branch : CASEDIACSENS --- src/aspell/rclaspell.cpp | 59 +++++-- src/aspell/rclaspell.h | 21 +-- src/common/rclconfig.cpp | 23 +-- src/common/rclconfig.h | 10 +- src/qtgui/reslist.cpp | 7 +- src/query/plaintorich.cpp | 24 ++- src/query/reslistpager.cpp | 13 +- src/query/xadump.cpp | 32 +++- src/rcldb/expansiondbs.cpp | 35 ++-- src/rcldb/expansiondbs.h | 11 +- src/rcldb/rcldb.cpp | 58 +++++-- src/rcldb/rcldb.h | 31 ++-- src/rcldb/searchdata.cpp | 316 ++++++++++++++++++++----------------- src/rcldb/stemdb.cpp | 29 ++-- src/utils/smallut.cpp | 7 +- src/utils/smallut.h | 7 + tests/config/recoll.conf | 2 + 17 files changed, 425 insertions(+), 260 deletions(-) diff --git a/src/aspell/rclaspell.cpp b/src/aspell/rclaspell.cpp index 4381bf04..67029bdf 100644 --- a/src/aspell/rclaspell.cpp +++ b/src/aspell/rclaspell.cpp @@ -23,9 +23,9 @@ #include #include -#include #include -#include + +using namespace std; #include ASPELL_INCLUDE @@ -33,7 +33,7 @@ #include "execmd.h" #include "rclaspell.h" #include "debuglog.h" - +#include "unacpp.h" #include "ptmutex.h" // Just a place where we keep the Aspell library entry points together @@ -260,6 +260,14 @@ public: while (m_db.termWalkNext(m_tit, *m_input)) { if (!Rcl::Db::isSpellingCandidate(*m_input)) continue; +#ifndef RCL_INDEX_STRIPCHARS + if (!o_index_stripchars) { + string lower; + if (!unacmaybefold(*m_input, lower, "UTF-8", UNACOP_FOLD)) + continue; + m_input->swap(lower); + } +#endif // Got a non-empty sort-of appropriate term, let's send it to // aspell m_input->append("\n"); @@ -335,17 +343,29 @@ bool Aspell::make_speller(string& reason) return true; } -bool Aspell::check(Rcl::Db &db, const string &term, string& reason) +bool Aspell::check(const string &iterm, string& 
reason) { - LOGDEB2(("Aspell::check [%s]\n", term.c_str())); + LOGDEB2(("Aspell::check [%s]\n", iterm.c_str())); + string mterm(iterm); if (!ok() || !make_speller(reason)) return false; - if (term.empty()) + if (iterm.empty()) return true; //?? +#ifndef RCL_INDEX_STRIPCHARS + if (!o_index_stripchars) { + string lower; + if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) { + LOGERR(("Aspell::check : cant lowercase input\n")); + return false; + } + mterm.swap(lower); + } +#endif + int ret = aapi.aspell_speller_check(m_data->m_speller, - term.c_str(), term.length()); + mterm.c_str(), mterm.length()); reason.clear(); switch (ret) { case 0: return false; @@ -358,19 +378,31 @@ bool Aspell::check(Rcl::Db &db, const string &term, string& reason) } } -bool Aspell::suggest(Rcl::Db &db, const string &term, +bool Aspell::suggest(Rcl::Db &db, const string &_term, list& suggestions, string& reason) { if (!ok() || !make_speller(reason)) return false; - if (term.empty()) + string mterm(_term); + if (mterm.empty()) return true; //?? 
+#ifndef RCL_INDEX_STRIPCHARS + if (!o_index_stripchars) { + string lower; + if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) { + LOGERR(("Aspell::check : cant lowercase input\n")); + return false; + } + mterm.swap(lower); + } +#endif + AspellCanHaveError *ret; const AspellWordList *wl = aapi.aspell_speller_suggest(m_data->m_speller, - term.c_str(), term.length()); + mterm.c_str(), mterm.length()); if (wl == 0) { reason = aapi.aspell_speller_error_message(m_data->m_speller); return false; @@ -385,7 +417,7 @@ bool Aspell::suggest(Rcl::Db &db, const string &term, // ******** This should depend if // stemming is turned on or not for querying ******* string sw(word); - if (db.termExists(sw) && db.stemDiffers("english", sw, term)) + if (db.termExists(sw) && db.stemDiffers("english", sw, mterm)) suggestions.push_back(word); } aapi.delete_aspell_string_enumeration(els); @@ -418,7 +450,6 @@ using namespace std; static char *thisprog; RclConfig *rclconfig; -Rcl::Db rcldb; static char usage [] = " -b : build dictionary\n" @@ -477,7 +508,9 @@ int main(int argc, char **argv) exit(1); } - if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) { + Rcl::Db rcldb(rclconfig); + + if (!rcldb.open(Rcl::Db::DbRO, 0)) { fprintf(stderr, "Could not open database in %s\n", dbdir.c_str()); exit(1); } diff --git a/src/aspell/rclaspell.h b/src/aspell/rclaspell.h index 06032495..b969aa75 100644 --- a/src/aspell/rclaspell.h +++ b/src/aspell/rclaspell.h @@ -37,11 +37,6 @@ #include "rclconfig.h" #include "rcldb.h" -#ifndef NO_NAMESPACES -using std::string; -using std::list; -#endif // NO_NAMESPACES - class AspellData; class Aspell { @@ -53,26 +48,26 @@ class Aspell { bool ok() const; /** Find the aspell command and shared library, init function pointers */ - bool init(string &reason); + bool init(std::string &reason); /** Build dictionary out of index term list. This is done at the end * of an indexing pass. 
*/ - bool buildDict(Rcl::Db &db, string &reason); + bool buildDict(Rcl::Db &db, std::string &reason); /** Check that word is in dictionary. ret==false && !reason.empty() => err*/ - bool check(Rcl::Db &db, const string& term, string& reason); + bool check(const std::string& term, std::string& reason); /** Return a list of possible expansions for a given word */ - bool suggest(Rcl::Db &db, const string& term, list &suggestions, - string &reason); + bool suggest(Rcl::Db &db, const std::string& term, + std::list &suggestions, std::string &reason); private: - string dicPath(); + std::string dicPath(); RclConfig *m_config; - string m_lang; + std::string m_lang; AspellData *m_data; - bool make_speller(string& reason); + bool make_speller(std::string& reason); }; #endif /* RCL_USE_ASPELL */ diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index b303fce9..a3c5245f 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -15,6 +15,8 @@ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef TEST_RCLCONFIG +#include "autoconfig.h" + #include #include #include @@ -34,6 +36,7 @@ #include #include #include +using namespace std; #include "cstr.h" #include "pathut.h" @@ -45,15 +48,8 @@ #include "readfile.h" #include "fstreewalk.h" -#ifndef NO_NAMESPACES -using namespace std; -#endif /* NO_NAMESPACES */ - -#ifndef MIN -#define MIN(A,B) (((A)<(B)) ? (A) : (B)) -#endif -#ifndef MAX -#define MAX(A,B) (((A)>(B)) ? 
(A) : (B)) +#ifndef RCL_INDEX_STRIPCHARS +bool o_index_stripchars; #endif bool ParamStale::needrecompute() @@ -77,6 +73,7 @@ bool ParamStale::needrecompute() } return false; } + void ParamStale::init(RclConfig *rconf, ConfNull *cnf, const string& nm) { parent = rconf; @@ -239,6 +236,14 @@ bool RclConfig::updateMainConfig() FsTreeWalker::setNoFnmPathname(); } +#ifndef RCL_INDEX_STRIPCHARS + static int m_index_stripchars_init = 0; + if (!m_index_stripchars_init) { + getConfParam("indexStripChars", &o_index_stripchars); + m_index_stripchars_init = 1; + } +#endif + return true; } diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 64ce44c6..3fd29a52 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -303,5 +303,13 @@ class RclConfig { bool readFieldsConfig(const string& errloc); }; - +// This global variable defines if we are running with an index +// stripped of accents and case or a raw one. Ideally, it should be +// constant, but it needs to be initialized from the configuration, so +// there is no way to do this. It never changes after initialization +// of course. When set, it is supposed to get all of recoll to behave as if +// it was compiled with RCL_INDEX_STRIPCHARS +#ifndef RCL_INDEX_STRIPCHARS +extern bool o_index_stripchars; +#endif #endif /* _RCLCONFIG_H_INCLUDED_ */ diff --git a/src/qtgui/reslist.cpp b/src/qtgui/reslist.cpp index 491d93cf..9cbf037d 100644 --- a/src/qtgui/reslist.cpp +++ b/src/qtgui/reslist.cpp @@ -197,10 +197,14 @@ void QtGuiResListPager::suggest(const vectoruterms, // If the term is in the index, we don't suggest alternatives. // Actually, we may want to check the frequencies and propose something // anyway if a possible variation is much more common (as google does) - if (aspell->check(*rcldb, *uit, reason)) +#warning need to take case and diacs sensibility into account somehow + // Maybe use the xapian index instead ? How to retrieve the sensitivity flags ? 
+ if (0) { + if (aspell->check(*uit, reason)) continue; else if (!reason.empty()) return; + } if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) { LOGERR(("QtGuiResListPager::suggest: aspell failed: %s\n", reason.c_str())); @@ -336,6 +340,7 @@ ResList::~ResList() QT_TR_NOOP("Open"), QT_TR_NOOP("(show query)"), QT_TR_NOOP("

Alternate spellings (accents suppressed): "), + QT_TR_NOOP("

Alternate spellings: "), }; } diff --git a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp index b7c461d5..3dab6f8d 100644 --- a/src/query/plaintorich.cpp +++ b/src/query/plaintorich.cpp @@ -79,22 +79,30 @@ class TextSplitPTR : public TextSplit { for (vector >::const_iterator vit = hdata.groups.begin(); vit != hdata.groups.end(); vit++) { if (vit->size() == 1) { -#ifdef RCL_INDEX_STRIPCHARS - m_terms[vit->front()] = vit - hdata.groups.begin(); -#else - string dumb = vit->front(); - unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD); - m_terms[dumb] = vit - hdata.groups.begin(); +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) { +#endif + m_terms[vit->front()] = vit - hdata.groups.begin(); +#ifndef RCL_INDEX_STRIPCHARS + } else { + string dumb = vit->front(); + unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD); + m_terms[dumb] = vit - hdata.groups.begin(); + } #endif } else if (vit->size() > 1) { for (vector::const_iterator it = vit->begin(); it != vit->end(); it++) { -#ifdef RCL_INDEX_STRIPCHARS +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) { +#endif m_gterms.insert(*it); -#else +#ifndef RCL_INDEX_STRIPCHARS + } else { string dumb = *it; unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD); m_gterms.insert(dumb); + } #endif } } diff --git a/src/query/reslistpager.cpp b/src/query/reslistpager.cpp index 9718709f..d7c7b1d0 100644 --- a/src/query/reslistpager.cpp +++ b/src/query/reslistpager.cpp @@ -320,9 +320,16 @@ void ResListPager::displayPage(RclConfig *config) map > spellings; suggest(uterms, spellings); if (!spellings.empty()) { - chunk << - trans("

Alternate spellings (accents suppressed): ") - << "

"; + if (o_index_stripchars) { + chunk << + trans("

Alternate spellings (accents suppressed): ") + << "

"; + } else { + chunk << + trans("

Alternate spellings: ") + << "

"; + + } for (map >::const_iterator it0 = spellings.begin(); it0 != spellings.end(); it0++) { diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp index fddc9853..dd64a9ef 100644 --- a/src/query/xadump.cpp +++ b/src/query/xadump.cpp @@ -116,12 +116,20 @@ static void sigcleanup(int sig) exit(1); } +#ifndef RCL_INDEX_STRIPCHARS +bool o_index_stripchars; +#endif + inline bool has_prefix(const string& trm) { -#ifdef RCL_INDEX_STRIPCHARS - return trm.size() && 'A' <= trm[0] && trm[0] <= 'Z'; -#else - return trm.size() > 0 && trm[0] == ':'; +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) { +#endif + return trm.size() && 'A' <= trm[0] && trm[0] <= 'Z'; +#ifndef RCL_INDEX_STRIPCHARS + } else { + return trm.size() > 0 && trm[0] == ':'; + } #endif } @@ -201,10 +209,22 @@ int main(int argc, char **argv) try { db = new Xapian::Database(dbdir); - cout << "DB: ndocs " << db->get_doccount() << " lastdocid " << db->get_lastdocid() << " avglength " << db->get_avlength() << endl; - + +#ifndef RCL_INDEX_STRIPCHARS + // If we have terms with a leading ':' it's a new style, + // unstripped index + { + Xapian::TermIterator term = db->allterms_begin(":"); + if (term == db->allterms_end()) + o_index_stripchars = true; + else + o_index_stripchars = false; + cout<<"DB: terms are "<<(o_index_stripchars?"stripped":"raw")< unacstemdbs; // We can reuse the same stemmer pointers, the objects are stateless. 
- for (unsigned int i = 0; i < langs.size(); i++) { - unacstemdbs.push_back( - XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i], - stemmers.back().getptr())); - unacstemdbs.back().recreate(); + if (!o_index_stripchars) { + for (unsigned int i = 0; i < langs.size(); i++) { + unacstemdbs.push_back( + XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i], + stemmers.back().getptr())); + unacstemdbs.back().recreate(); + } } - SynTermTransUnac transunac(UNACOP_UNACFOLD); XapWritableComputableSynFamMember diacasedb(wdb, synFamDiCa, "all", &transunac); - diacasedb.recreate(); + if (!o_index_stripchars) + diacasedb.recreate(); #endif // Walk the list of all terms, and stem/unac each. @@ -109,8 +111,10 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb, // is the input to the stem db, and add a synonym from the // stripped term to the cased and accented one, for accent // and case expansion at query time - unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD); - diacasedb.addSynonym(*it); + if (!o_index_stripchars) { + unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD); + diacasedb.addSynonym(*it); + } #endif // Create stemming synonym for every language. The input is the @@ -124,12 +128,15 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb, // the unaccented term. 
While this may be incorrect, it is // also necessary for searching in a diacritic-unsensitive // way on a raw index - string unac; - unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC); - if (unac != lower) - for (unsigned int i = 0; i < langs.size(); i++) { - unacstemdbs[i].addSynonym(unac); + if (!o_index_stripchars) { + string unac; + unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC); + if (unac != lower) { + for (unsigned int i = 0; i < langs.size(); i++) { + unacstemdbs[i].addSynonym(unac); + } } + } #endif } } XCATCHERROR(ermsg); diff --git a/src/rcldb/expansiondbs.h b/src/rcldb/expansiondbs.h index 97846870..dd819826 100644 --- a/src/rcldb/expansiondbs.h +++ b/src/rcldb/expansiondbs.h @@ -24,10 +24,13 @@ namespace Rcl { -/* A Capitals/Diacritics removal functor for using with - XapComputableSynFamMember */ +/** A Capitals/Diacritics removal functor for using with + * XapComputableSynFamMember */ class SynTermTransUnac : public SynTermTrans { public: + /** Constructor + * @param op defines if we remove diacritics, case or both + */ SynTermTransUnac(UnacOp op) : m_op(op) { @@ -43,7 +46,9 @@ public: UnacOp m_op; }; -/** Walk the Xapian term list and create all the expansion dbs in one go */ +/** Walk the Xapian term list and create all the expansion dbs in one go. + * + */ extern bool createExpansionDbs(Xapian::WritableDatabase& wdb, const std::vector& langs); } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 671e63c1..581436fe 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -92,10 +92,11 @@ const string start_of_field_term = "XXST"; const string end_of_field_term = "XXND"; static const string page_break_term = "XXPG"; #else -const string start_of_field_term = "XXST/"; -const string end_of_field_term = "XXND/"; -static const string page_break_term = "XXPG/"; +string start_of_field_term; +string end_of_field_term; +const string page_break_term = "XXPG/"; #endif + // Field name for the unsplit file name. 
Has to exist in the field file // because of usage in termmatch() static const string unsplitFilenameFieldName = "rclUnsplitFN"; @@ -683,6 +684,18 @@ Db::Db(RclConfig *cfp) m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1), m_maxFsOccupPc(0), m_mode(Db::DbRO) { +#ifndef RCL_INDEX_STRIPCHARS + if (start_of_field_term.empty()) { + if (o_index_stripchars) { + start_of_field_term = "XXST"; + end_of_field_term = "XXND"; + } else { + start_of_field_term = "XXST/"; + end_of_field_term = "XXND/"; + } + } +#endif + m_ndb = new Native(this); if (m_config) { m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc); @@ -886,12 +899,13 @@ int Db::termDocCnt(const string& _term) return -1; string term = _term; -#ifdef RCL_INDEX_STRIPCHARS - if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { - LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str())); - return 0; - } +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) #endif + if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str())); + return 0; + } if (m_stops.isStop(term)) { LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str())); @@ -1151,13 +1165,17 @@ string Db::getSpellingSuggestion(const string& word) { if (m_ndb == 0) return string(); + string term = word; -#ifdef RCL_INDEX_STRIPCHARS + +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) +#endif if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) { LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str())); return string(); } -#endif + if (!isSpellingCandidate(term)) return string(); return m_ndb->xrdb.get_spelling_suggestion(term); @@ -1266,9 +1284,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, TermProc *nxt = &tpidx; TermProcStop tpstop(nxt, m_stops);nxt = &tpstop; //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon; -#ifdef RCL_INDEX_STRIPCHARS - TermProcPrep tpprep(nxt); nxt = &tpprep; + + 
TermProcPrep tpprep(nxt); +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) #endif + nxt = &tpprep; TextSplitDb splitter(newdocument, nxt); tpidx.setTSD(&splitter); @@ -1951,12 +1972,15 @@ bool Db::termMatch(MatchType typ, const string &lang, // Get rid of capitals and accents string droot = root; -#ifdef RCL_INDEX_STRIPCHARS - if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) { - LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str())); - return false; - } + +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) #endif + if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) { + LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str())); + return false; + } + string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars; string prefix; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 445e63a2..2c81b354 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -129,18 +129,27 @@ extern void *DbUpdWorker(void*); inline bool has_prefix(const string& trm) { -#ifdef RCL_INDEX_STRIPCHARS - return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z'; -#else - return !trm.empty() && trm[0] == ':'; +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) { +#endif + return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z'; +#ifndef RCL_INDEX_STRIPCHARS + } else { + return !trm.empty() && trm[0] == ':'; + } #endif } + inline string wrap_prefix(const string& pfx) { -#ifdef RCL_INDEX_STRIPCHARS - return pfx; -#else - return cstr_colon + pfx + cstr_colon; +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) { +#endif + return pfx; +#ifndef RCL_INDEX_STRIPCHARS + } else { + return cstr_colon + pfx + cstr_colon; + } #endif } @@ -384,9 +393,13 @@ private: string version_string(); extern const string pathelt_prefix; +#ifdef RCL_INDEX_STRIPCHARS extern const string start_of_field_term; extern const string end_of_field_term; - +#else +extern string start_of_field_term; +extern string end_of_field_term; +#endif } #endif /* _DB_H_INCLUDED_ 
*/ diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index bddacf3f..c28ea8cc 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -79,10 +79,22 @@ static const int original_term_wqf_booster = 10; #ifdef RCL_INDEX_STRIPCHARS #define bufprefix(BUF, L) {(BUF)[0] = L;} -#define bpoffs 1 +#define bpoffs() 1 #else -#define bufprefix(BUF, L) {(BUF)[0] = ':'; (BUF)[1] = L; (BUF)[2] = ':';} -#define bpoffs 3 +static inline void bufprefix(char *buf, char c) +{ + if (o_index_stripchars) { + buf[0] = c; + } else { + buf[0] = ':'; + buf[1] = c; + buf[2] = ':'; + } +} +static inline int bpoffs() +{ + return o_index_stripchars ? 1 : 3; +} #endif static Xapian::Query @@ -92,7 +104,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2) // only doing %d's ! char buf[200]; bufprefix(buf, 'D'); - sprintf(buf+bpoffs, "%04d%02d", y1, m1); + sprintf(buf+bpoffs(), "%04d%02d", y1, m1); vector v; int d_last = monthdays(m1, y1); @@ -103,7 +115,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2) // Deal with any initial partial month if (d1 > 1 || d_end < d_last) { for ( ; d1 <= d_end ; d1++) { - sprintf(buf + 6 + bpoffs, "%02d", d1); + sprintf(buf + 6 + bpoffs(), "%02d", d1); v.push_back(Xapian::Query(buf)); } } else { @@ -117,32 +129,32 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2) int m_last = (y1 < y2) ? 
12 : m2 - 1; while (++m1 <= m_last) { - sprintf(buf + 4 + bpoffs, "%02d", m1); + sprintf(buf + 4 + bpoffs(), "%02d", m1); bufprefix(buf, 'M'); v.push_back(Xapian::Query(buf)); } if (y1 < y2) { while (++y1 < y2) { - sprintf(buf + bpoffs, "%04d", y1); + sprintf(buf + bpoffs(), "%04d", y1); bufprefix(buf, 'Y'); v.push_back(Xapian::Query(buf)); } - sprintf(buf + bpoffs, "%04d", y2); + sprintf(buf + bpoffs(), "%04d", y2); bufprefix(buf, 'M'); for (m1 = 1; m1 < m2; m1++) { - sprintf(buf + 4 + bpoffs, "%02d", m1); + sprintf(buf + 4 + bpoffs(), "%02d", m1); v.push_back(Xapian::Query(buf)); } } - sprintf(buf + 2 + bpoffs, "%02d", m2); + sprintf(buf + 2 + bpoffs(), "%02d", m2); // Deal with any final partial month if (d2 < monthdays(m2, y2)) { bufprefix(buf, 'D'); for (d1 = 1 ; d1 <= d2; d1++) { - sprintf(buf + 6 + bpoffs, "%02d", d1); + sprintf(buf + 6 + bpoffs(), "%02d", d1); v.push_back(Xapian::Query(buf)); } } else { @@ -663,13 +675,13 @@ static void listVector(const string& what, const vector&l) */ void StringToXapianQ::expandTerm(int mods, const string& term, - vector& exp, string &sterm, + vector& oexp, string &sterm, const string& prefix) { LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n", mods, m_field.c_str(), term.c_str(), m_stemlang.c_str())); sterm.clear(); - exp.clear(); + oexp.clear(); if (term.empty()) return; @@ -693,145 +705,161 @@ void StringToXapianQ::expandTerm(int mods, bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0; bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0; - // If we are working with a raw index, apply the rules for case and - // diacritics sensitivity. + if (o_index_stripchars) { + diac_sensitive = case_sensitive = false; + } else { + // If we are working with a raw index, apply the rules for case and + // diacritics sensitivity. - // If any character has a diacritic, we become - // diacritic-sensitive. 
Note that the way that the test is - // performed (conversion+comparison) will automatically ignore - // accented characters which are actually a separate letter - if (unachasaccents(term)) - diac_sensitive = true; + // If any character has a diacritic, we become + // diacritic-sensitive. Note that the way that the test is + // performed (conversion+comparison) will automatically ignore + // accented characters which are actually a separate letter + if (unachasaccents(term)) + diac_sensitive = true; - // If any character apart the first is uppercase, we become case-sensitive. - // The first character is reserved for turning off stemming. You need to - // use a query language modifier to search for Floor in a case-sensitive - // way. - Utf8Iter it(term); - it++; - if (unachasuppercase(term.substr(it.getBpos()))) - case_sensitive = true; + // If any character apart the first is uppercase, we become + // case-sensitive. The first character is reserved for + // turning off stemming. You need to use a query language + // modifier to search for Floor in a case-sensitive way. + Utf8Iter it(term); + it++; + if (unachasuppercase(term.substr(it.getBpos()))) + case_sensitive = true; - // If we are sensitive to case or diacritics turn stemming off - if (diac_sensitive || case_sensitive) - nostemexp = true; + // If we are sensitive to case or diacritics turn stemming off + if (diac_sensitive || case_sensitive) + nostemexp = true; - if (!case_sensitive || !diac_sensitive) - noexpansion = false; + if (!case_sensitive || !diac_sensitive) + noexpansion = false; + } #endif if (noexpansion) { sterm = term; - exp.push_back(prefix + term); - } else { - TermMatchResult res; - if (haswild) { - // Note that if there are wildcards, we do a direct from-index - // expansion, which means that we are casediac-sensitive. There - // would be nothing to prevent us to expand from the casediac - // synonyms first. 
To be done later - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, - m_field); - } else { - sterm = term; -#ifdef RCL_INDEX_STRIPCHARS - m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, - m_field); -#else - // No stem expansion when diacritic or case sensitivity is - // set, it makes no sense (it would mess with the - // diacritics anyway if they are not in the stem part). - // In these 3 cases, perform appropriate expansion from - // the charstripping db, and do a bogus wildcard expansion - // (there is no wild card) to generate the result: - if (diac_sensitive && case_sensitive) { - // No expansion whatsoever - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, - m_field); - } else { - // Access case and diacritics expansion: - vector exp; - SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD); - XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, - "all", &unacfoldtrans); + oexp.push_back(prefix + term); + LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); + return; + } - if (diac_sensitive) { - // Expand for accents and case, filtering for same accents, - // then bogus wildcard expansion for generating result - SynTermTransUnac foldtrans(UNACOP_FOLD); - synac.synExpand(term, exp, &foldtrans); - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, - -1, m_field); - } - } else if (case_sensitive) { - // Expand for accents and case, filtering for same case, - // then bogus wildcard expansion for generating result - SynTermTransUnac unactrans(UNACOP_UNAC); - synac.synExpand(term, exp, &unactrans); - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, - -1, m_field); - } - } else { - // Expand for accents and case, then lowercase - // result for input to stemdb. 
- synac.synExpand(term, exp); - for (unsigned int i = 0; i < exp.size(); i++) { - string lower; - unacmaybefold(exp[i], lower, "UTF-8", UNACOP_FOLD); - exp[i] = lower; - } - sort(exp.begin(), exp.end()); - vector::iterator uit = - unique(exp.begin(), exp.end()); - exp.resize(uit - exp.begin()); - LOGDEB(("ExpandTerm: after casediac: %s\n", - stringsToString(exp).c_str())); + SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD); + XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all", + &unacfoldtrans); + vector lexp; - StemDb db(m_db.m_ndb->xrdb); - vector exp1; - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - db.stemExpand(m_stemlang, *it, exp1); - } - LOGDEB(("ExpandTerm: after stem: %s\n", - stringsToString(exp1).c_str())); - - // Expand the resulting list for case (all stemdb content - // is lowercase) - exp.clear(); - for (vector::const_iterator it = exp1.begin(); - it != exp1.end(); it++) { - synac.synExpand(*it, exp); - } - sort(exp.begin(), exp.end()); - uit = unique(exp.begin(), exp.end()); - exp.resize(uit - exp.begin()); - - LOGDEB(("ExpandTerm: after case exp of stem: %s\n", - stringsToString(exp).c_str())); - - // Bogus wildcard expand to generate the result - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, - -1, m_field); - } - - } - } -#endif - } - - for (vector::const_iterator it = res.entries.begin(); - it != res.entries.end(); it++) { - exp.push_back(it->term); - } - LOGDEB(("ExpandTerm: final: %s\n", stringsToString(exp).c_str())); + TermMatchResult res; + if (haswild) { + // Note that if there are wildcards, we do a direct from-index + // expansion, which means that we are casediac-sensitive. There + // would be nothing to prevent us to expand from the casediac + // synonyms first. 
To be done later + m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, + m_field); + goto termmatchtoresult; } + + sterm = term; + +#ifdef RCL_INDEX_STRIPCHARS + + m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field); + +#else + + if (o_index_stripchars) { + // If the index is stripped, we can only come here if nostemexp is unset + // and we just need stem expansion. + m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field); + goto termmatchtoresult; + } + + // No stem expansion when diacritic or case sensitivity is set, it + // makes no sense (it would mess with the diacritics anyway if + // they are not in the stem part). In these 3 cases, perform + // appropriate expansion from the charstripping db, and do a bogus + // wildcard expansion (there is no wild card) to generate the + // result: + + if (diac_sensitive && case_sensitive) { + // No expansion whatsoever + m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field); + goto termmatchtoresult; + } + + if (diac_sensitive) { + // Expand for accents and case, filtering for same accents, + // then bogus wildcard expansion for generating result + SynTermTransUnac foldtrans(UNACOP_FOLD); + synac.synExpand(term, lexp, &foldtrans); + goto exptotermatch; + } + + if (case_sensitive) { + // Expand for accents and case, filtering for same case, then + // bogus wildcard expansion for generating result + SynTermTransUnac unactrans(UNACOP_UNAC); + synac.synExpand(term, lexp, &unactrans); + goto exptotermatch; + } + + // We are neither accent- nor case- sensitive and may need stem + // expansion or not. + + // Expand for accents and case + synac.synExpand(term, lexp); + LOGDEB(("ExpTerm: casediac: %s\n", stringsToString(lexp).c_str())); + if (nostemexp) + goto exptotermatch; + + // Need stem expansion. Lowercase the result of accent and case + // expansion for input to stemdb. 
+ for (unsigned int i = 0; i < lexp.size(); i++) { + string lower; + unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD); + lexp[i] = lower; + } + sort(lexp.begin(), lexp.end()); + { + vector::iterator uit = unique(lexp.begin(), lexp.end()); + lexp.resize(uit - lexp.begin()); + StemDb db(m_db.m_ndb->xrdb); + vector exp1; + for (vector::const_iterator it = lexp.begin(); + it != lexp.end(); it++) { + db.stemExpand(m_stemlang, *it, exp1); + } + LOGDEB(("ExpTerm: stem: %s\n", stringsToString(exp1).c_str())); + + // Expand the resulting list for case (all stemdb content + // is lowercase) + lexp.clear(); + for (vector::const_iterator it = exp1.begin(); + it != exp1.end(); it++) { + synac.synExpand(*it, lexp); + } + sort(lexp.begin(), lexp.end()); + uit = unique(lexp.begin(), lexp.end()); + lexp.resize(uit - lexp.begin()); + } + LOGDEB(("ExpTerm: case exp of stem: %s\n", stringsToString(lexp).c_str())); + + // Bogus wildcard expand to generate the result +exptotermatch: + for (vector::const_iterator it = lexp.begin(); + it != lexp.end(); it++) { + m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, + res, -1, m_field); + } +#endif + + // Term match entries to vector of terms +termmatchtoresult: + for (vector::const_iterator it = res.entries.begin(); + it != res.entries.end(); it++) { + oexp.push_back(it->term); + } + LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); } // Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d @@ -1097,9 +1125,11 @@ bool StringToXapianQ::processUserString(const string &iq, TermProcStop tpstop(nxt, stops); nxt = &tpstop; //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon; //tpcommon.onlygrams(true); -#ifdef RCL_INDEX_STRIPCHARS - TermProcPrep tpprep(nxt); nxt = &tpprep; + TermProcPrep tpprep(nxt); +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) #endif + nxt = &tpprep; TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | TextSplit::TXTS_KEEPWILD), diff --git a/src/rcldb/stemdb.cpp 
b/src/rcldb/stemdb.cpp index ad20553d..c3d435e8 100644 --- a/src/rcldb/stemdb.cpp +++ b/src/rcldb/stemdb.cpp @@ -26,6 +26,8 @@ #include #include +#include +using namespace std; #include @@ -34,18 +36,14 @@ #include "smallut.h" #include "synfamily.h" #include "unacpp.h" - -#include - -using namespace std; +#include "rclconfig.h" namespace Rcl { /** * Expand for one or several languages */ -bool StemDb::stemExpand(const std::string& langs, - const std::string& term, +bool StemDb::stemExpand(const std::string& langs, const std::string& term, vector& result) { vector llangs; @@ -59,14 +57,17 @@ bool StemDb::stemExpand(const std::string& langs, } #ifndef RCL_INDEX_STRIPCHARS - for (vector::const_iterator it = llangs.begin(); - it != llangs.end(); it++) { - SynTermTransStem stemmer(*it); - XapComputableSynFamMember expander(getdb(), synFamStemUnac, - *it, &stemmer); - string unac; - unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC); - (void)expander.synExpand(unac, result); + // Expand the unaccented stem + if (!o_index_stripchars) { + for (vector::const_iterator it = llangs.begin(); + it != llangs.end(); it++) { + SynTermTransStem stemmer(*it); + XapComputableSynFamMember expander(getdb(), synFamStemUnac, + *it, &stemmer); + string unac; + unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC); + (void)expander.synExpand(unac, result); + } } #endif diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index 701d7b2d..a8a784c1 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -33,17 +33,12 @@ #include #include #include +using namespace std; #include "smallut.h" #include "utf8iter.h" #include "hldata.h" -#ifndef NO_NAMESPACES -using namespace std; -#endif /* NO_NAMESPACES */ - -#define MIN(A,B) ((A)<(B)?(A):(B)) - int stringicmp(const string & s1, const string& s2) { string::const_iterator it1 = s1.begin(); diff --git a/src/utils/smallut.h b/src/utils/smallut.h index 424953ae..c0f698d3 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -224,4 
+224,11 @@ public: } }; +#ifndef MIN +#define MIN(A,B) (((A)<(B)) ? (A) : (B)) +#endif +#ifndef MAX +#define MAX(A,B) (((A)>(B)) ? (A) : (B)) +#endif + #endif /* _SMALLUT_H_INCLUDED_ */ diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf index df13a81b..1408117d 100644 --- a/tests/config/recoll.conf +++ b/tests/config/recoll.conf @@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst daemloglevel = 6 daemlogfilename = /tmp/rclmontrace +indexStripChars = 1 + topdirs = /home/dockes/projets/fulltext/testrecoll/ skippedPaths = \