diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index fe0b2921..4e2b7c40 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -164,7 +164,7 @@ bool TextSplit::o_noNumbers = false;
 // Do some checking (the kind which is simpler to do here than in the
 // main loop), then send term to our client.
 inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
-				int btstart, int btend)
+                                int btstart, int btend)
 {
     LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
@@ -348,12 +348,14 @@ bool TextSplit::text_to_words(const string &in)
                m_inNumber = false;
            }
            break;
+
        case WILD:
            if (m_flags & TXTS_KEEPWILD)
                goto NORMALCHAR;
            else
                goto SPACE;
            break;
+
        case '-':
        case '+':
            curspanglue = cc;
@@ -381,12 +383,16 @@ bool TextSplit::text_to_words(const string &in)
                m_wordStart += it.appendchartostring(m_span);
            }
            break;
+
        case '.':
        case ',':
+        {
+            // Need a little lookahead here. At worst this gets the end null
+            int nextc = it[it.getCpos()+1];
+            int nextwhat = whatcc(nextc);
            if (m_inNumber) {
-                // 132.jpg ?
-                int wn = it[it.getCpos()+1];
-                if (whatcc(wn) != DIGIT && wn != 'e' && wn != 'E')
+                // we're eliminating 132.jpg here. Good idea ?
+                if (nextwhat != DIGIT && nextc != 'e' && nextc != 'E')
                    goto SPACE;
                m_wordLen += it.appendchartostring(m_span);
                curspanglue = cc;
@@ -398,10 +404,15 @@ bool TextSplit::text_to_words(const string &in)
            // Another problem is that something like .x-errs
            // will be split as .x-errs, x, errs but not x-errs
            // A final comma in a word will be removed by doemit
-            if (cc == '.' && it[it.getCpos()+1] != '.') {
+
+            // Only letters and digits make sense after
+            if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
+                nextwhat != DIGIT && nextwhat != LETTER)
+                goto SPACE;
+
+            if (cc == '.') {
                // Check for number like .1
-                if (m_span.length() == 0 &&
-                    whatcc(it[it.getCpos()+1]) == DIGIT) {
+                if (m_span.length() == 0 && nextwhat == DIGIT) {
                    m_inNumber = true;
                    m_wordLen += it.appendchartostring(m_span);
                    curspanglue = cc;
@@ -430,7 +441,9 @@ bool TextSplit::text_to_words(const string &in)
                }
            }
            goto SPACE;
+        }
            break;
+
        case '@':
            if (m_wordLen) {
                if (!doemit(false, it.getBpos()))
@@ -623,8 +636,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
     // first
     if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
        unsigned int btend = it.getBpos(); // Current char is out
-        if (!takeword(it.buffer().substr(boffs[0],
-                                         btend-boffs[0]),
+        if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]),
                      m_wordpos - nchars, boffs[0], btend)) {
            return false;
        }
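
Note on the '.'/',' change above: the splitter now does a one-character lookahead before deciding whether the punctuation glues the current span together. The following standalone sketch is not Recoll code; charclass() and punctuationGlues() are invented names. It only illustrates the decision, including the "at worst this gets the end null" behaviour when the punctuation is the last character:

    // Minimal sketch of the one-character lookahead idea, under the
    // assumption that only a following letter or digit keeps a span alive.
    #include <cctype>
    #include <iostream>
    #include <string>

    enum CharClass { C_LETTER, C_DIGIT, C_OTHER };

    static CharClass charclass(int c)
    {
        if (isdigit(static_cast<unsigned char>(c))) return C_DIGIT;
        if (isalpha(static_cast<unsigned char>(c))) return C_LETTER;
        return C_OTHER;
    }

    // Decide whether the '.' or ',' at position i keeps the token going.
    // When i is the last character, the lookahead yields 0 (the end null),
    // which classifies as C_OTHER and breaks the span.
    static bool punctuationGlues(const std::string &s, size_t i, bool inNumber)
    {
        int next = (i + 1 < s.size()) ? s[i + 1] : 0;
        CharClass what = charclass(next);
        if (inNumber)   // "3.14" continues, "132.jpg" does not
            return what == C_DIGIT || next == 'e' || next == 'E';
        // Outside numbers, only a letter or digit makes sense after
        return what == C_LETTER || what == C_DIGIT;
    }

    int main()
    {
        std::cout << punctuationGlues("3.14", 1, true) << "\n";    // 1
        std::cout << punctuationGlues("132.jpg", 3, true) << "\n"; // 0
        std::cout << punctuationGlues("word.", 4, false) << "\n";  // 0
    }
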
@@ -764,18 +776,19 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
 #include "readfile.h"
 #include "debuglog.h"
 #include "transcode.h"
+#include "unacpp.h"
+#include "termproc.h"
 
 using namespace std;
 
-class myTextSplit : public TextSplit {
+class myTermProc : public Rcl::TermProc {
     int first;
     bool nooutput;
- public:
-    myTextSplit(Flags flags = Flags(TXTS_NONE)) :
-        TextSplit(flags),first(1), nooutput(false)
-    {}
+public:
+    myTermProc() : TermProc(0), first(1), nooutput(false) {}
+
     void setNoOut(bool val) {nooutput = val;}
-    bool takeword(const string &term, int pos, int bs, int be) {
+
+    virtual bool takeword(const string &term, int pos, int bs, int be)
+    {
        if (nooutput)
            return true;
        FILE *fp = stdout;
@@ -812,13 +825,15 @@ static string thisprog;
 
 static string usage =
     " textsplit [opts] [filename]\n"
-    " -S: no output\n"
-    " -s: only spans\n"
-    " -w: only words\n"
-    " -n: no numbers\n"
-    " -k: preserve wildcards (?*)\n"
-    " -c: just count words\n"
+    " -q : no output\n"
+    " -s : only spans\n"
+    " -w : only words\n"
+    " -n : no numbers\n"
+    " -k : preserve wildcards (?*)\n"
+    " -c : just count words\n"
+    " -u : use unac\n"
     " -C [charset] : input charset\n"
+    " -S [stopfile] : stopfile to use for commongrams\n"
     " if filename is 'stdin', will read stdin for data (end with ^D)\n"
     "  \n\n"
     ;
@@ -833,15 +848,18 @@ Usage(void)
 
 static int op_flags;
 #define OPT_s 0x1
 #define OPT_w 0x2
-#define OPT_S 0x4
+#define OPT_q 0x4
 #define OPT_c 0x8
 #define OPT_k 0x10
 #define OPT_C 0x20
 #define OPT_n 0x40
+#define OPT_S 0x80
+#define OPT_u 0x100
 
 int main(int argc, char **argv)
 {
-    string charset;
+    string charset, stopfile;
+
     thisprog = argv[0];
     argc--; argv++;
@@ -858,8 +876,12 @@ int main(int argc, char **argv)
            goto b1;
        case 'k':   op_flags |= OPT_k; break;
        case 'n':   op_flags |= OPT_n; break;
+        case 'q':   op_flags |= OPT_q; break;
        case 's':   op_flags |= OPT_s; break;
-        case 'S':   op_flags |= OPT_S; break;
+        case 'S':   op_flags |= OPT_S; if (argc < 2) Usage();
+            stopfile = *(++argv); argc--;
+            goto b1;
+        case 'u':   op_flags |= OPT_u; break;
        case 'w':   op_flags |= OPT_w; break;
        default: Usage(); break;
        }
@@ -879,6 +901,13 @@ int main(int argc, char **argv)
     if (op_flags & OPT_n)
        TextSplit::noNumbers();
 
+    Rcl::StopList stoplist;
+    if (op_flags & OPT_S) {
+        if (!stoplist.setFile(stopfile)) {
+            cerr << "Can't read stopfile: " << stopfile << endl;
+            exit(1);
+        }
+    }
     string odata, reason;
     if (argc == 1) {
        const char *filename = *argv++; argc--;
@@ -912,10 +941,25 @@ int main(int argc, char **argv)
        int n = TextSplit::countWords(data, flags);
        cout << n << " words" << endl;
     } else {
-        myTextSplit splitter(flags);
-        if (op_flags&OPT_S)
-            splitter.setNoOut(true);
+        myTermProc printproc;
+
+        Rcl::TermProc *nxt = &printproc;
+
+        Rcl::TermProcCommongrams commonproc(nxt, stoplist);
+        if (op_flags & OPT_S)
+            nxt = &commonproc;
+
+        Rcl::TermProcPrep preproc(nxt);
+        if (op_flags & OPT_u)
+            nxt = &preproc;
+
+        Rcl::TextSplitP splitter(nxt, flags);
+
+        if (op_flags & OPT_q)
+            printproc.setNoOut(true);
+
        splitter.text_to_words(data);
+
     }
 }
 #endif // TEST
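
The test driver above chains the optional stages front to back through a single `nxt` pointer and hands the head of the chain to TextSplitP. The sketch below shows the same forwarding pattern with invented class names (Stage, LowerStage, PrintStage); it is not the real Rcl::TermProc interface, just an illustration of the design:

    // Each stage either transforms the term and forwards it, or consumes it.
    #include <cctype>
    #include <iostream>
    #include <string>

    class Stage {
    public:
        explicit Stage(Stage *next) : m_next(next) {}
        virtual ~Stage() {}
        // Default behaviour: hand the term to the next stage, or accept it
        // silently at the end of the chain.
        virtual bool takeword(const std::string &term, int pos)
        {
            return m_next ? m_next->takeword(term, pos) : true;
        }
    private:
        Stage *m_next;
    };

    // Example filter: lowercase the term, then forward it.
    class LowerStage : public Stage {
    public:
        explicit LowerStage(Stage *next) : Stage(next) {}
        virtual bool takeword(const std::string &term, int pos)
        {
            std::string out(term);
            for (size_t i = 0; i < out.size(); i++)
                out[i] = tolower(static_cast<unsigned char>(out[i]));
            return Stage::takeword(out, pos);
        }
    };

    // Example sink: print whatever reaches the end of the chain.
    class PrintStage : public Stage {
    public:
        PrintStage() : Stage(0) {}
        virtual bool takeword(const std::string &term, int pos)
        {
            std::cout << pos << " [" << term << "]\n";
            return true;
        }
    };

    int main()
    {
        // Chain built back to front, like the "nxt" pointer dance above.
        PrintStage sink;
        LowerStage lower(&sink);
        Stage *head = &lower;
        head->takeword("Hello", 1);
        head->takeword("World", 2);
        return 0;
    }
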
        doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
@@ -906,14 +907,12 @@ bool TextSplitDb::text_to_words(const string &in)
     } XCATCHERROR(ermsg);
     if (!ermsg.empty()) {
        LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
-        basepos += curpos + 100;
-        return false;
+        goto out;
     }
 
     if (!TextSplitP::text_to_words(in)) {
        LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
-        basepos += curpos + 100;
-        return false;
+        goto out;
     }
 
     try {
@@ -923,10 +922,12 @@ bool TextSplitDb::text_to_words(const string &in)
     } XCATCHERROR(ermsg);
     if (!ermsg.empty()) {
        LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
-        basepos += curpos + 100;
-        return false;
+        goto out;
     }
 
+    ret = true;
+
+out:
     basepos += curpos + 100;
-    return true;
+    return ret;
 }
@@ -961,6 +962,7 @@ public:
            LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
            return false;
        }
+
 private:
     TextSplitDb *m_ts;
 };
@@ -1028,12 +1030,17 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     Doc doc = idoc;
 
     Xapian::Document newdocument;
+
+    // The term processing pipeline:
     TermProcIdx tpidx;
-//    TermProcStop tpstop(&tpidx, m_stops);
-    TermProcCommongrams tpstop(&tpidx, m_stops);
-    TermProcPrep tpprep(&tpstop);
-    TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
+    TermProc *nxt = &tpidx;
+    TermProcStop tpstop(nxt, m_stops); nxt = &tpstop;
+//  TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
+    TermProcPrep tpprep(nxt); nxt = &tpprep;
+
+    TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
     tpidx.setTSD(&splitter);
+
     // Split and index file name as document term(s)
     LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
     if (!splitter.text_to_words(doc.utf8fn))
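
The goto-out rework in TextSplitDb::text_to_words guarantees that basepos is advanced on every exit path, so a field whose indexing fails can no longer hand out positions that overlap the next field. A reduced sketch of that single-exit shape, with invented names and simulated work:

    // The point is only that basepos moves forward on success and failure alike.
    #include <iostream>

    struct FieldIndexer {
        int basepos;
        int curpos;
        FieldIndexer() : basepos(1), curpos(0) {}

        bool indexField(bool failEarly)
        {
            bool ret = false;
            if (failEarly)
                goto out;      // simulated add_posting failure
            curpos = 42;       // simulated splitting work
            ret = true;
        out:
            // Executed on success and on failure alike.
            basepos += curpos + 100;
            return ret;
        }
    };

    int main()
    {
        FieldIndexer f;
        std::cout << f.indexField(true) << " basepos=" << f.basepos << "\n";
        std::cout << f.indexField(false) << " basepos=" << f.basepos << "\n";
    }
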
        bool noexpand = be ? m_ts->curnostemexp : true;
-        LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
-                term.c_str(), noexpand));
-        m_ts->terms.push_back(term);
-        m_ts->nostemexps.push_back(noexpand);
+        LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
+                term.c_str(), pos, noexpand));
+        if (m_terms[pos].size() < term.size()) {
+            m_terms[pos] = term;
+            m_nste[pos] = noexpand;
+        }
+        return true;
+    }
+    bool flush()
+    {
+        for (map<int, string>::const_iterator it = m_terms.begin();
+             it != m_terms.end(); it++) {
+            m_ts->terms.push_back(it->second);
+            m_ts->nostemexps.push_back(m_nste[it->first]);
+        }
        return true;
     }
 private:
     TextSplitQ *m_ts;
+    map<int, string> m_terms;
+    map<int, bool>   m_nste;
 };
 
 // A class used to translate a user compound string (*not* a query
@@ -783,7 +798,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
     // Generate an appropriate PHRASE/NEAR query with adjusted slack
     // For phrases, give a relevance boost like we do for original terms
-    LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", 
+    LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
             splitData->alltermcount, splitData->lastpos));
     Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
                     splitData->lastpos + 1 + slack);
@@ -839,7 +854,7 @@ bool StringToXapianQ::processUserString(const string &iq,
                                        bool useNear
     )
 {
-    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
+    LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
     ermsg.erase();
     m_uterms.clear();
     m_terms.clear();
@@ -874,45 +889,35 @@ bool StringToXapianQ::processUserString(const string &iq,
            // We used to do word split, searching for
            // "term0 term1 term2" instead, which may have worse
            // performance, but will succeed.
-            // We now adjust the phrase/near slack by the term count
-            // difference (this is mainly better for cjk where this is a very
-            // common occurrence because of the ngrams thing.
+            // We now adjust the phrase/near slack by comparing the term count
+            // and the last position.
 
+            // The term processing pipeline:
            TermProcQ tpq;
-            // TermProcStop tpstop(&tpidx, stops);
-            TermProcCommongrams tpstop(&tpq, stops);
-            tpstop.onlygrams(true);
-            TermProcPrep tpprep(&tpstop);
+            TermProc *nxt = &tpq;
+            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
+            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
+            //tpcommon.onlygrams(true);
+            TermProcPrep tpprep(nxt); nxt = &tpprep;
 
-            TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
+            TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
                                                  TextSplit::TXTS_KEEPWILD),
-                                 stops, &tpprep);
-            tpq.setTSQ(&splitterS);
-            splitterS.text_to_words(*it);
-            LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
-            TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
-                                                  TextSplit::TXTS_KEEPWILD),
-                                 stops, &tpprep);
-            tpq.setTSQ(&splitterW);
-            tpstop.onlygrams(false);
-            splitterW.text_to_words(*it);
+                                 stops, nxt);
+            tpq.setTSQ(&splitter);
+            splitter.text_to_words(*it);
 
-            if (splitterS.terms.size() > 1 &&
-                splitterS.terms.size() != splitterW.terms.size()) {
-                slack += splitterW.terms.size() - splitterS.terms.size();
-            }
+            slack += splitter.lastpos - splitter.terms.size() + 1;
 
-            TextSplitQ *splitter = &splitterS;
-            LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
-            switch (splitter->terms.size() + terminc) {
+            LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
+            switch (splitter.terms.size() + terminc) {
            case 0:
                continue;// ??
            case 1:
-                processSimpleSpan(splitter->terms.front(),
-                                  splitter->nostemexps.front(), pqueries);
+                processSimpleSpan(splitter.terms.front(),
+                                  splitter.nostemexps.front(), pqueries);
                break;
            default:
-                processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
+                processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
            }
        }
     } catch (const Xapian::Error &e) {
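
TermProcQ now keeps at most one term per position (the longest one) in a map, and the slack is grown by lastpos - terms.size() + 1, i.e. by the number of position holes left by discarded terms. A small worked example with made-up terms and 0-based positions, not taken from the code above:

    // Three surviving terms at positions 0, 2, 3: position 1 was occupied
    // by a dropped stop word, so the PHRASE/NEAR window must grow by 1.
    #include <iostream>
    #include <map>
    #include <string>

    int main()
    {
        std::map<int, std::string> terms;
        terms[0] = "quick";
        terms[2] = "fox";
        terms[3] = "jumps";

        int lastpos = terms.rbegin()->first;                     // 3
        int slack = 0;
        slack += lastpos - static_cast<int>(terms.size()) + 1;   // 3 - 3 + 1 = 1

        std::cout << "terms=" << terms.size() << " lastpos=" << lastpos
                  << " slack=" << slack << "\n";
        return 0;
    }
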
diff --git a/src/rcldb/termproc.h b/src/rcldb/termproc.h
index 0d37dfe6..fd036e01 100644
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@@ -66,10 +66,10 @@ private:
 };
 
 /**
- * Intermediary specialized texsplit class: this will probably replace the base
- * textsplit when we've converted all the code. The takeword() routine in this
- * calls a TextProc's instead of being specialized in a derived class by the
- * user module. The text_to_word() method also takes care of flushing.
+ * Specialized TextSplit class: this will probably replace the base
+ * TextSplit when we've converted all the code. The takeword() routine in this
+ * calls a TermProc's takeword() instead of being overridden in a user-derived
+ * class. The text_to_words() method also takes care of flushing.
 */
 class TextSplitP : public TextSplit {
 public:
@@ -99,18 +99,39 @@ private:
 /** Unaccent and lowercase term. This is usually the first in the pipeline */
 class TermProcPrep : public TermProc {
 public:
-    TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
+    TermProcPrep(TermProc *nxt)
+        : TermProc(nxt), m_totalterms(0), m_unacerrors(0) {}
+
     virtual bool takeword(const string& itrm, int pos, int bs, int be)
     {
+        m_totalterms++;
        string otrm;
        if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
-            LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
-            // We don't generate a fatal error because of a bad term
+            LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+            m_unacerrors++;
+            // We don't generate a fatal error because of a bad term,
+            // but one has to put the limit somewhere
+            if (m_unacerrors > 500 &&
+                (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
+                // More than 1 error for every other term
+                LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
+                        m_unacerrors, m_totalterms));
+                return false;
+            }
            return true;
        }
        return TermProc::takeword(otrm, pos, bs, be);
     }
+
+    virtual bool flush()
+    {
+        m_totalterms = m_unacerrors = 0;
+        return TermProc::flush();
+    }
+
+private:
+    int m_totalterms;
+    int m_unacerrors;
 };
 
 /** Compare to stop words list and discard if match found */
@@ -119,19 +140,23 @@ public:
     TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
        : TermProc(nxt), m_stops(stops) {}
 
-    virtual bool takeword(const string& term, int pos, int bts, int bte)
+    virtual bool takeword(const string& term, int pos, int bs, int be)
     {
        if (m_stops.isStop(term)) {
            return true;
        }
-        return TermProc::takeword(term, pos, bts, bte);
+        return TermProc::takeword(term, pos, bs, be);
     }
+
 private:
     const Rcl::StopList& m_stops;
 };
 
 /** Handle common-gram generation: combine frequent terms with neighbours to
  * shorten the positions lists for phrase searches.
+ * NOTE: This does not currently work because of bad interaction with the
+ * spans (ie john@domain.com) generation in textsplit. Not used, kept for
+ * testing only.
  */
 class TermProcCommongrams : public TermProc {
 public:
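
TermProcPrep above now tolerates individual unac/transcoding failures but gives up once more than one term in two has failed after at least 500 errors. A self-contained sketch of that error-budget heuristic; the thresholds mirror the patch, while ErrorBudget and the simulated 60% failure rate are invented for illustration:

    #include <iostream>

    class ErrorBudget {
    public:
        ErrorBudget() : m_total(0), m_errors(0) {}

        // Returns false when the error rate says the input is hopeless.
        bool recordTerm(bool unacFailed)
        {
            m_total++;
            if (!unacFailed)
                return true;
            m_errors++;
            if (m_errors > 500 &&
                (double(m_total) / double(m_errors)) < 2.0) {
                // More than one error for every other term: give up.
                std::cerr << "too many unac errors " << m_errors
                          << "/" << m_total << "\n";
                return false;
            }
            return true;  // isolated bad terms are not fatal
        }
    private:
        int m_total;
        int m_errors;
    };

    int main()
    {
        ErrorBudget budget;
        bool ok = true;
        // Simulate a document where 60% of terms fail to transcode.
        for (int i = 0; i < 2000 && ok; i++)
            ok = budget.recordTerm(i % 5 < 3);
        std::cout << (ok ? "kept going" : "aborted") << "\n";
        return 0;
    }
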
@@ -147,7 +172,7 @@ public:
            if (!m_prevterm.empty() && (m_prevstop || isstop)) {
                // create 2-gram. space unnecessary but improves
-                // lisibility of queries
+                // the readability of queries
                string twogram;
                twogram.swap(m_prevterm);
                twogram.append(1, ' ');
@@ -164,7 +189,7 @@ public:
            }
 #endif
        }
-	
+
        m_prevterm = term;
        m_prevstop = isstop;
        m_prevpos = pos;
@@ -181,7 +206,7 @@ public:
        return true;
     }
 
-    bool flush()
+    virtual bool flush()
     {
        if (!m_prevsent && !m_prevterm.empty())
            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))