diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index df913e65..0b91ad6c 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -186,7 +186,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, } } if (pos != m_prevpos || l != m_prevlen) { - bool ret = m_cb->takeword(w, pos, btstart, btend); + bool ret = takeword(w, pos, btstart, btend); m_prevpos = pos; m_prevlen = w.length(); return ret; @@ -558,7 +558,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0; unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars; for (unsigned int i = loopbeg; i < loopend; i++) { - if (!m_cb->takeword(it.buffer().substr(boffs[i], + if (!takeword(it.buffer().substr(boffs[i], btend-boffs[i]), m_wordpos - (nchars-i-1), boffs[i], btend)) { return false; @@ -579,7 +579,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) // first if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) { unsigned int btend = it.getBpos(); // Current char is out - if (!m_cb->takeword(it.buffer().substr(boffs[0], + if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]), m_wordpos - nchars, boffs[0], btend)) { @@ -595,12 +595,12 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) return true; } -// Callback class for countWords -class utSplitterCB : public TextSplitCB { +// Specialization for countWords +class TextSplitCW : public TextSplit { public: int wcnt; - utSplitterCB() : wcnt(0) {} - bool takeword(const string &term, int pos, int bs, int be) { + TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {} + bool takeword(const string &, int, int, int) { wcnt++; return true; } @@ -608,10 +608,9 @@ class utSplitterCB : public TextSplitCB { int TextSplit::countWords(const string& s, TextSplit::Flags flgs) { - utSplitterCB cb; - TextSplit splitter(&cb, flgs); + TextSplitCW splitter(flgs); splitter.text_to_words(s); - return cb.wcnt; + return splitter.wcnt; } bool TextSplit::hasVisibleWhite(const string &in) @@ -726,12 +725,13 @@ bool TextSplit::stringToStrings(const string &s, list &tokens) using namespace std; -// A small class to hold state while splitting text -class mySplitterCB : public TextSplitCB { +class myTextSplit : public TextSplit { int first; bool nooutput; public: - mySplitterCB() : first(1), nooutput(false) {} + myTextSplit(Flags flags = Flags(TXTS_NONE)) : + TextSplit(flags),first(1), nooutput(false) + {} void setNoOut(bool val) {nooutput = val;} bool takeword(const string &term, int pos, int bs, int be) { if (nooutput) @@ -821,12 +821,8 @@ int main(int argc, char **argv) DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::setfilename("stderr"); - mySplitterCB cb; TextSplit::Flags flags = TextSplit::TXTS_NONE; - if (op_flags&OPT_S) - cb.setNoOut(true); - if (op_flags&OPT_s) flags = TextSplit::TXTS_ONLYSPANS; else if (op_flags&OPT_w) @@ -867,7 +863,9 @@ int main(int argc, char **argv) int n = TextSplit::countWords(data, flags); cout << n << " words" << endl; } else { - TextSplit splitter(&cb, flags); + myTextSplit splitter(flags); + if (op_flags&OPT_S) + splitter.setNoOut(true); splitter.text_to_words(data); } } diff --git a/src/common/textsplit.h b/src/common/textsplit.h index b8caab09..7f2aa47a 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -25,19 +25,6 @@ using std::string; using std::list; #endif -/** - * Function class whose takeword method is called for every detected word while * splitting text. - */ -class TextSplitCB { -public: - virtual ~TextSplitCB() {} - virtual bool takeword(const string& term, - int pos, // term pos - int bts, // byte offset of first char in term - int bte // byte offset of first char after term - ) = 0; -}; - class Utf8Iter; @@ -67,20 +54,25 @@ public: TXTS_KEEPWILD = 4 // Handle wildcards as letters }; - /** - * Constructor: just store callback object - */ - TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE)) - : m_flags(flags), m_cb(t), m_maxWordLength(40), - m_prevpos(-1) + + TextSplit(Flags flags = Flags(TXTS_NONE)) + : m_flags(flags), m_maxWordLength(40), m_prevpos(-1) { } + virtual ~TextSplit() {} /** Split text, emit words and positions. */ bool text_to_words(const string &in); - //Utility functions : these does not need the user to setup a callback - // etc. + /** Process one output word: to be implemented by the actual user class */ + virtual bool takeword(const string& term, + int pos, // term pos + int bts, // byte offset of first char in term + int bte // byte offset of first char after term + ) = 0; + + + // Static utility functions: /** Count words in string, as the splitter would generate them */ static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS); @@ -102,7 +94,6 @@ public: private: Flags m_flags; - TextSplitCB *m_cb; int m_maxWordLength; // Current span. Might be jf.dockes@wanadoo.f @@ -132,5 +123,4 @@ private: bool doemit(bool spanerase, int bp, bool spanemit=false); }; - #endif /* _TEXTSPLIT_H_INCLUDED_ */ diff --git a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp index 7c718cff..5e59e4a3 100644 --- a/src/query/plaintorich.cpp +++ b/src/query/plaintorich.cpp @@ -58,15 +58,15 @@ static string vecStringToString(const vector& t) // Text splitter callback used to take note of the position of query terms // inside the result text. This is then used to insert highlight tags. -class myTextSplitCB : public TextSplitCB { +class TextSplitPTR : public TextSplit { public: // Out: begin and end byte positions of query terms/groups in text vector > tboffs; - myTextSplitCB(const vector& its, - const vector >&groups, - const vector& slacks) + TextSplitPTR(const vector& its, + const vector >&groups, + const vector& slacks) : m_wcount(0), m_groups(groups), m_slacks(slacks) { for (vector::const_iterator it = its.begin(); @@ -86,7 +86,8 @@ class myTextSplitCB : public TextSplitCB { virtual bool takeword(const std::string& term, int pos, int bts, int bte) { string dumb; if (!unacmaybefold(term, dumb, "UTF-8", true)) { - LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str())); + LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", + term.c_str())); return true; } //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), @@ -186,9 +187,9 @@ static bool do_proximity_test(int window, vector* >& plists, } // Check if there is a NEAR match for the group of terms -bool myTextSplitCB::matchGroup(const vector& terms, int window) +bool TextSplitPTR::matchGroup(const vector& terms, int window) { - LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window, + LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window, vecStringToString(terms).c_str())); // The position lists we are going to work with. We extract them from the @@ -207,7 +208,7 @@ bool myTextSplitCB::matchGroup(const vector& terms, int window) it != terms.end(); it++) { map >::iterator pl = m_plists.find(*it); if (pl == m_plists.end()) { - LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n", + LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n", (*it).c_str())); continue; } @@ -215,10 +216,10 @@ bool myTextSplitCB::matchGroup(const vector& terms, int window) plistToTerm[&(pl->second)] = *it; realgroup.push_back(*it); } - LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n", + LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n", window, vecStringToString(realgroup).c_str())); if (plists.size() < 2) { - LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n")); + LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n")); return false; } // Sort the positions lists so that the shorter is first @@ -243,7 +244,7 @@ bool myTextSplitCB::matchGroup(const vector& terms, int window) int sta = int(10E9), sto = 0; LOGDEB0(("MatchGroup: Testing at pos %d\n", pos)); if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) { - LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n", + LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", sta, sto)); // Maybe extend the window by 1st term position, this was not // done by do_prox.. @@ -253,7 +254,7 @@ bool myTextSplitCB::matchGroup(const vector& terms, int window) map >::iterator i1 = m_gpostobytes.find(sta); map >::iterator i2 = m_gpostobytes.find(sto); if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { - LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n", + LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n", i1->second.first, i2->second.second)); tboffs.push_back(pair(i1->second.first, i2->second.second)); @@ -278,7 +279,7 @@ public: }; // Do the phrase match thing, then merge the highlight lists -bool myTextSplitCB::matchGroups() +bool TextSplitPTR::matchGroups() { vector >::const_iterator vit = m_groups.begin(); vector::const_iterator sit = m_slacks.begin(); @@ -333,15 +334,14 @@ bool PlainToRich::plaintorich(const string& in, // Compute the positions for the query terms. We use the text // splitter to break the text into words, and compare the words to // the search terms, - myTextSplitCB cb(terms, groups, slacks); - TextSplit splitter(&cb); + TextSplitPTR splitter(terms, groups, slacks); // Note: the splitter returns the term locations in byte, not // character, offsets. splitter.text_to_words(in); LOGDEB0(("plaintorich: split done %d mS\n", chron.millis())); // Compute the positions for NEAR and PHRASE groups. - cb.matchGroups(); + splitter.matchGroups(); out.clear(); out.push_back(""); @@ -353,12 +353,12 @@ bool PlainToRich::plaintorich(const string& in, // Iterator for the list of input term positions. We use it to // output highlight tags and to compute term positions in the // output text - vector >::iterator tPosIt = cb.tboffs.begin(); - vector >::iterator tPosEnd = cb.tboffs.end(); + vector >::iterator tPosIt = splitter.tboffs.begin(); + vector >::iterator tPosEnd = splitter.tboffs.end(); #if 0 - for (vector >::const_iterator it = cb.tboffs.begin(); - it != cb.tboffs.end(); it++) { + for (vector >::const_iterator it = splitter.tboffs.begin(); + it != splitter.tboffs.end(); it++) { LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second)); } #endif @@ -412,7 +412,7 @@ bool PlainToRich::plaintorich(const string& in, } // Skip all highlight areas that would overlap this one int crend = tPosIt->second; - while (tPosIt != cb.tboffs.end() && tPosIt->first < crend) + while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend) tPosIt++; inrcltag = 0; } diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index 18604fe0..c6112b3f 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -42,6 +42,7 @@ using namespace std; #include "internfile.h" #include "wipedir.h" #include "transcode.h" +#include "textsplit.h" bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& idoc) { diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp index be5930d3..76e55762 100644 --- a/src/query/wasatorcl.cpp +++ b/src/query/wasatorcl.cpp @@ -31,6 +31,7 @@ using std::list; #include "smallut.h" #include "rclconfig.h" #include "refcntr.h" +#include "textsplit.h" Rcl::SearchData *wasaStringToRcl(const string &qs, string &reason) { diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 82a18bd5..29678f8b 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -781,16 +781,15 @@ bool Db::fieldToPrefix(const string& fld, string &pfx) } -// The text splitter callback class which receives words from the -// splitter and adds postings to the Xapian document. -class mySplitterCB : public TextSplitCB { +// The splitter breaks text into words and adds postings to the Xapian document. +class TextSplitDb : public TextSplit { public: Xapian::Document &doc; // Xapian document Xapian::termpos basepos; // Base for document section Xapian::termpos curpos; // Current position. Used to set basepos for the // following section StopList &stops; - mySplitterCB(Xapian::Document &d, StopList &_stops) + TextSplitDb(Xapian::Document &d, StopList &_stops) : doc(d), basepos(1), curpos(0), stops(_stops) {} bool takeword(const std::string &term, int pos, int, int); @@ -802,15 +801,16 @@ private: string prefix; }; -// Callback for the document to word splitting class during indexation -bool mySplitterCB::takeword(const std::string &_term, int pos, int, int) +// Get one term from the doc, remove accents and lowercase, then add posting +bool TextSplitDb::takeword(const std::string &_term, int pos, int, int) { #if 0 - LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str())); + LOGDEB(("TextSplitDb::takeword: [%s]\n", _term.c_str())); #endif string term; if (!unacmaybefold(_term, term, "UTF-8", true)) { - LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str())); + LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", + _term.c_str())); term.clear(); // We don't generate a fatal error because of a bad term return true; @@ -892,14 +892,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc doc = idoc; Xapian::Document newdocument; - mySplitterCB splitData(newdocument, m_stops); - TextSplit splitter(&splitData); + TextSplitDb splitter(newdocument, m_stops); // Split and index file name as document term(s) LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str())); if (!splitter.text_to_words(doc.utf8fn)) LOGDEB(("Db::addOrUpdate: split failed for file name\n")); - splitData.basepos += splitData.curpos + 100; + splitter.basepos += splitter.curpos + 100; // Index textual metadata. These are all indexed as text with // positions, as we may want to do phrase searches with them (this @@ -919,19 +918,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n", meta_it->first.c_str(), pfx.c_str(), meta_it->second.c_str())); - splitData.setprefix(pfx); // Subject + splitter.setprefix(pfx); // Subject if (!splitter.text_to_words(meta_it->second)) LOGDEB(("Db::addOrUpdate: split failed for %s\n", meta_it->first.c_str())); - splitData.setprefix(string()); - splitData.basepos += splitData.curpos + 100; + splitter.setprefix(string()); + splitter.basepos += splitter.curpos + 100; } } - if (splitData.curpos < baseTextPosition) - splitData.basepos = baseTextPosition; + if (splitter.curpos < baseTextPosition) + splitter.basepos = baseTextPosition; else - splitData.basepos += splitData.curpos + 100; + splitter.basepos += splitter.curpos + 100; // Split and index body text LOGDEB2(("Db::add: split body\n")); diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index bdb94c3c..88d239ba 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -188,25 +188,27 @@ void SearchData::getUTerms(vector& terms) const // phrases. This is for parts of the user entry which would appear as // a single word because there is no white space inside, but are // actually multiple terms to rcldb (ie term1,term2) -class wsQData : public TextSplitCB { +class TextSplitQ : public TextSplit { public: - wsQData(const StopList &_stops) - : stops(_stops), alltermcount(0) + TextSplitQ(Flags flags, const StopList &_stops) + : TextSplit(flags), stops(_stops), alltermcount(0) {} bool takeword(const std::string &interm, int , int, int) { alltermcount++; - LOGDEB1(("wsQData::takeword: %s\n", interm.c_str())); + LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str())); // Check if the first letter is a majuscule in which // case we do not want to do stem expansion. Note that // the test is convoluted and possibly problematic string noacterm, noaclowterm; if (!unacmaybefold(interm, noacterm, "UTF-8", false)) { - LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str())); + LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", + interm.c_str())); return true; } if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { - LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str())); + LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", + noacterm.c_str())); return true; } bool nostemexp = false; @@ -216,7 +218,8 @@ class wsQData : public TextSplitCB { nostemexp = true; if (stops.hasStops() && stops.isStop(noaclowterm)) { - LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str())); + LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n", + noaclowterm.c_str())); return true; } terms.push_back(noaclowterm); @@ -271,7 +274,7 @@ private: // After splitting entry on whitespace: process non-phrase element void processSimpleSpan(const string& span, bool nostemexp, list &pqueries); // Process phrase/near element - void processPhraseOrNear(wsQData *splitData, + void processPhraseOrNear(TextSplitQ *splitData, list &pqueries, bool useNear, int slack); @@ -420,7 +423,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, // NEAR xapian query, the elements of which can themselves be OR // queries if the terms get expanded by stemming or wildcards (we // don't do stemming for PHRASE though) -void StringToXapianQ::processPhraseOrNear(wsQData *splitData, +void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, list &pqueries, bool useNear, int slack) { @@ -527,31 +530,31 @@ bool StringToXapianQ::processUserString(const string &iq, // We now adjust the phrase/near slack by the term count // difference (this is mainly better for cjk where this is a very // common occurrence because of the ngrams thing. - wsQData splitDataS(stops), splitDataW(stops); - TextSplit splitterS(&splitDataS, - TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | - TextSplit::TXTS_KEEPWILD)); + TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | + TextSplit::TXTS_KEEPWILD), + stops); splitterS.text_to_words(*it); - TextSplit splitterW(&splitDataW, - TextSplit::Flags(TextSplit::TXTS_NOSPANS | - TextSplit::TXTS_KEEPWILD)); + TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS | + TextSplit::TXTS_KEEPWILD), + stops); splitterW.text_to_words(*it); - wsQData *splitData = &splitDataS; - if (splitDataS.terms.size() > 1 && - splitDataS.terms.size() != splitDataW.terms.size()) { - slack += splitDataW.terms.size() - splitDataS.terms.size(); + TextSplitQ *splitter = &splitterS; + if (splitterS.terms.size() > 1 && + splitterS.terms.size() != splitterW.terms.size()) { + slack += splitterW.terms.size() - splitterS.terms.size(); // used to: splitData = &splitDataW; } - LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size())); - switch (splitData->terms.size()) { + LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size())); + switch (splitter->terms.size()) { case 0: continue;// ?? case 1: - processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries); + processSimpleSpan(splitter->terms.front(), + splitter->nostemexps.front(), pqueries); break; default: - processPhraseOrNear(splitData, pqueries, useNear, slack); + processPhraseOrNear(splitter, pqueries, useNear, slack); } } } catch (const Xapian::Error &e) { diff --git a/src/rcldb/stoplist.cpp b/src/rcldb/stoplist.cpp index 5a242a43..8df8385c 100644 --- a/src/rcldb/stoplist.cpp +++ b/src/rcldb/stoplist.cpp @@ -5,6 +5,7 @@ static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp #include "debuglog.h" #include "readfile.h" #include "unacpp.h" +#include "textsplit.h" #include "stoplist.h" #ifndef NO_NAMESPACES @@ -12,6 +13,21 @@ namespace Rcl { #endif +class TextSplitSW : public TextSplit { +public: + set& stops; + TextSplitSW(Flags flags, set& stps) + : TextSplit(flags), stops(stps) + {} + virtual bool takeword(const string& term, int, int, int) + { + string dterm; + unacmaybefold(term, dterm, "UTF-8", true); + stops.insert(dterm); + return true; + } +}; + bool StopList::setFile(const string &filename) { m_hasStops = false; @@ -22,18 +38,9 @@ bool StopList::setFile(const string &filename) filename.c_str(), reason.c_str())); return false; } - TextSplit ts(this, TextSplit::TXTS_ONLYSPANS); + TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops); ts.text_to_words(stoptext); - return true; -} - -bool StopList::takeword(const string& term, int, int, int) -{ - string dterm; - unacmaybefold(term, dterm, "UTF-8", true); - LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str())); - m_hasStops = true; - m_stops.insert(dterm); + m_hasStops = !m_stops.empty(); return true; } diff --git a/src/rcldb/stoplist.h b/src/rcldb/stoplist.h index 157786c3..2a47d806 100644 --- a/src/rcldb/stoplist.h +++ b/src/rcldb/stoplist.h @@ -5,8 +5,6 @@ #include #include -#include "textsplit.h" - #ifndef NO_NAMESPACES using std::set; using std::string; @@ -14,7 +12,7 @@ namespace Rcl { #endif -class StopList : public TextSplitCB { +class StopList { public: StopList() : m_hasStops(false) {} StopList(const string &filename) {setFile(filename);} @@ -23,7 +21,6 @@ public: bool setFile(const string &filename); bool isStop(const string &term) const; bool hasStops() const {return m_hasStops;} - virtual bool takeword(const string& term, int pos, int bts, int bte); private: bool m_hasStops;