Get rid of a few garbage terms during indexing. Set a threshold for conversion errors after which we discard the doc. Stabilize the new termproc pipeline, but no commongrams for now.

Jean-Francois Dockes 2011-10-12 17:55:58 +02:00
parent a2c9d2a82b
commit 0860b559ee
4 changed files with 167 additions and 86 deletions
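
The heart of the commit is the chained term-processing pipeline: each TermProc stage receives a term through takeword(), may rewrite or swallow it, and passes whatever survives to the next stage, so the splitter only ever talks to the head of the chain. A minimal, self-contained sketch of the pattern follows; the class names here (MyTermProc, LowerProc, StopProc, PrintSink) are illustrative stand-ins, not the actual Recoll classes shown in the diffs below.

#include <cctype>
#include <cstdio>
#include <set>
#include <string>

// A stage gets a term, may transform or drop it, and forwards the rest.
class MyTermProc {
public:
    explicit MyTermProc(MyTermProc *nxt) : m_next(nxt) {}
    virtual ~MyTermProc() {}
    virtual bool takeword(const std::string &term, int pos)
    {
        return m_next ? m_next->takeword(term, pos) : true;
    }
    virtual bool flush()
    {
        return m_next ? m_next->flush() : true;
    }
private:
    MyTermProc *m_next;
};

// Lowercasing stage, standing in for TermProcPrep's unaccent/fold step.
class LowerProc : public MyTermProc {
public:
    explicit LowerProc(MyTermProc *nxt) : MyTermProc(nxt) {}
    virtual bool takeword(const std::string &term, int pos)
    {
        std::string out(term);
        for (std::string::size_type i = 0; i < out.size(); i++)
            out[i] = (char)std::tolower((unsigned char)out[i]);
        return MyTermProc::takeword(out, pos);
    }
};

// Stop-word stage, standing in for TermProcStop: swallow the term, go on.
class StopProc : public MyTermProc {
public:
    StopProc(MyTermProc *nxt, const std::set<std::string> &stops)
        : MyTermProc(nxt), m_stops(stops) {}
    virtual bool takeword(const std::string &term, int pos)
    {
        if (m_stops.count(term))
            return true;
        return MyTermProc::takeword(term, pos);
    }
private:
    std::set<std::string> m_stops;
};

// Terminal stage, standing in for TermProcIdx or the test driver's printer.
class PrintSink : public MyTermProc {
public:
    PrintSink() : MyTermProc(0) {}
    virtual bool takeword(const std::string &term, int pos)
    {
        std::printf("[%s] pos %d\n", term.c_str(), pos);
        return true;
    }
};

int main()
{
    // Build the chain back to front, the way Db::addOrUpdate() does below:
    // sink first, then the filters; the splitter would talk to the head.
    PrintSink sink;
    MyTermProc *nxt = &sink;
    std::set<std::string> stops;
    stops.insert("the");
    StopProc stop(nxt, stops); nxt = &stop;
    LowerProc lower(nxt); nxt = &lower;

    const char *words[] = { "The", "Term", "Processing", "Pipeline" };
    for (int i = 0; i < 4; i++)
        nxt->takeword(words[i], i);
    return nxt->flush() ? 0 : 1;
}

The indexing side below builds its real chain the same way: TermProcIdx as the sink, TermProcStop in the middle, TermProcPrep at the head, with the commongrams stage left commented out for now, and TextSplitDb feeding the head.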

View File
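
In this first file, the hunks around '.' and ',' replace ad hoc peeking with a single one-character lookahead computed up front (nextc / nextwhat). Reduced to its core, the decision for a '.' looks roughly like the sketch below; this is a simplification, and toy_whatcc() and its character classes are toy stand-ins for TextSplit's whatcc(), not the real implementation (which also tracks spans, commas and the .1-style number start).

#include <cctype>
#include <cstdio>

// Toy character classes standing in for TextSplit's whatcc() values.
enum What { DIGIT, LETTER, OTHER };
static What toy_whatcc(int c)
{
    if (std::isdigit(c)) return DIGIT;
    if (std::isalpha(c)) return LETTER;
    return OTHER;
}

// Does a '.' stay glued to the current word, given the next character?
static bool dot_stays(bool in_number, int nextc)
{
    What nextwhat = toy_whatcc(nextc);
    if (in_number)
        // 3.14 and 1.5e3 keep the dot; 132.jpg is split into 132 and jpg.
        return nextwhat == DIGIT || nextc == 'e' || nextc == 'E';
    // Otherwise only a letter or digit makes sense after the dot.
    return nextwhat == DIGIT || nextwhat == LETTER;
}

int main()
{
    std::printf("%d %d %d\n",
                dot_stays(true, '1'),   // 1: 3.14, dot kept
                dot_stays(true, 'j'),   // 0: 132.jpg, split
                dot_stays(false, ' ')); // 0: end of sentence, split
    return 0;
}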

@ -164,7 +164,7 @@ bool TextSplit::o_noNumbers = false;
// Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
int btstart, int btend)
int btstart, int btend)
{
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
@ -348,12 +348,14 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = false;
}
break;
case WILD:
if (m_flags & TXTS_KEEPWILD)
goto NORMALCHAR;
else
goto SPACE;
break;
case '-':
case '+':
curspanglue = cc;
@ -381,12 +383,16 @@ bool TextSplit::text_to_words(const string &in)
m_wordStart += it.appendchartostring(m_span);
}
break;
case '.':
case ',':
{
// Need a little lookahead here. At worse this gets the end null
int nextc = it[it.getCpos()+1];
int nextwhat = whatcc(nextc);
if (m_inNumber) {
// 132.jpg ?
int wn = it[it.getCpos()+1];
if (whatcc(wn) != DIGIT && wn != 'e' && wn != 'E')
// we're eliminating 132.jpg here. Good idea ?
if (nextwhat != DIGIT && nextc != 'e' && nextc != 'E')
goto SPACE;
m_wordLen += it.appendchartostring(m_span);
curspanglue = cc;
@ -398,10 +404,15 @@ bool TextSplit::text_to_words(const string &in)
// Another problem is that something like .x-errs
// will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit
if (cc == '.' && it[it.getCpos()+1] != '.') {
// Only letters and digits make sense after
if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
nextwhat != DIGIT && nextwhat != LETTER)
goto SPACE;
if (cc == '.') {
// Check for number like .1
if (m_span.length() == 0 &&
whatcc(it[it.getCpos()+1]) == DIGIT) {
if (m_span.length() == 0 && nextwhat == DIGIT) {
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
curspanglue = cc;
@ -430,7 +441,9 @@ bool TextSplit::text_to_words(const string &in)
}
}
goto SPACE;
}
break;
case '@':
if (m_wordLen) {
if (!doemit(false, it.getBpos()))
@ -623,8 +636,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// first
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
unsigned int btend = it.getBpos(); // Current char is out
if (!takeword(it.buffer().substr(boffs[0],
btend-boffs[0]),
if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]),
m_wordpos - nchars,
boffs[0], btend)) {
return false;
@ -764,18 +776,19 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
#include "readfile.h"
#include "debuglog.h"
#include "transcode.h"
#include "unacpp.h"
#include "termproc.h"
using namespace std;
class myTextSplit : public TextSplit {
class myTermProc : public Rcl::TermProc {
int first;
bool nooutput;
public:
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
TextSplit(flags),first(1), nooutput(false)
{}
public:
myTermProc() : TermProc(0), first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
bool takeword(const string &term, int pos, int bs, int be) {
virtual bool takeword(const string &term, int pos, int bs, int be)
{
if (nooutput)
return true;
FILE *fp = stdout;
@ -812,13 +825,15 @@ static string thisprog;
static string usage =
" textsplit [opts] [filename]\n"
" -S: no output\n"
" -s: only spans\n"
" -w: only words\n"
" -n: no numbers\n"
" -k: preserve wildcards (?*)\n"
" -c: just count words\n"
" -q : no output\n"
" -s : only spans\n"
" -w : only words\n"
" -n : no numbers\n"
" -k : preserve wildcards (?*)\n"
" -c : just count words\n"
" -u : use unac\n"
" -C [charset] : input charset\n"
" -S [stopfile] : stopfile to use for commongrams\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n"
;
@ -833,15 +848,18 @@ Usage(void)
static int op_flags;
#define OPT_s 0x1
#define OPT_w 0x2
#define OPT_S 0x4
#define OPT_q 0x4
#define OPT_c 0x8
#define OPT_k 0x10
#define OPT_C 0x20
#define OPT_n 0x40
#define OPT_S 0x80
#define OPT_u 0x100
int main(int argc, char **argv)
{
string charset;
string charset, stopfile;
thisprog = argv[0];
argc--; argv++;
@ -858,8 +876,12 @@ int main(int argc, char **argv)
goto b1;
case 'k': op_flags |= OPT_k; break;
case 'n': op_flags |= OPT_n; break;
case 'q': op_flags |= OPT_q; break;
case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break;
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
stopfile = *(++argv); argc--;
goto b1;
case 'u': op_flags |= OPT_u; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
@ -879,6 +901,13 @@ int main(int argc, char **argv)
if (op_flags & OPT_n)
TextSplit::noNumbers();
Rcl::StopList stoplist;
if (op_flags & OPT_S) {
if (!stoplist.setFile(stopfile)) {
cerr << "Can't read stopfile: " << stopfile << endl;
exit(1);
}
}
string odata, reason;
if (argc == 1) {
const char *filename = *argv++; argc--;
@ -912,10 +941,25 @@ int main(int argc, char **argv)
int n = TextSplit::countWords(data, flags);
cout << n << " words" << endl;
} else {
myTextSplit splitter(flags);
if (op_flags&OPT_S)
splitter.setNoOut(true);
myTermProc printproc;
Rcl::TermProc *nxt = &printproc;
Rcl::TermProcCommongrams commonproc(nxt, stoplist);
if (op_flags & OPT_S)
nxt = &commonproc;
Rcl::TermProcPrep preproc(nxt);
if (op_flags & OPT_u)
nxt = &preproc;
Rcl::TextSplitP splitter(nxt, flags);
if (op_flags & OPT_q)
printproc.setNoOut(true);
splitter.text_to_words(data);
}
}
#endif // TEST

View File
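
The change to TextSplitDb::text_to_words() in this file is mostly structural: the places that can fail now funnel through a single out: label, so the basepos adjustment is written once instead of being repeated before every early return. A toy sketch of the shape, with invented helper names standing in for the add_posting calls and the split:

#include <cstdio>

static bool add_start_anchor() { return true; }
static bool run_splitter()     { return false; } // pretend the split fails
static bool add_end_anchor()   { return true; }

static int basepos = 0;
static int curpos = 42;

static bool text_to_words_like()
{
    bool ret = false;
    if (!add_start_anchor())
        goto out;
    if (!run_splitter())
        goto out;
    if (!add_end_anchor())
        goto out;
    ret = true;
out:
    // Single exit point: the position base always moves on, success or not.
    basepos += curpos + 100;
    return ret;
}

int main()
{
    bool ok = text_to_words_like();
    std::printf("ok=%d basepos=%d\n", ok, basepos); // ok=0 basepos=142
    return 0;
}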

@ -897,8 +897,9 @@ private:
// Reimplement text_to_words to insert the begin and end anchor terms.
bool TextSplitDb::text_to_words(const string &in)
{
LOGDEB2(("TextSplitDb::text_to_words\n"));
bool ret = false;
string ermsg;
try {
// Index the possibly prefixed start term.
doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
@ -906,14 +907,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false;
goto out;
}
if (!TextSplitP::text_to_words(in)) {
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
basepos += curpos + 100;
return false;
goto out;
}
try {
@ -923,10 +922,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false;
goto out;
}
ret = true;
out:
basepos += curpos + 100;
return true;
}
@ -961,6 +962,7 @@ public:
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false;
}
private:
TextSplitDb *m_ts;
};
@ -1028,12 +1030,17 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
Doc doc = idoc;
Xapian::Document newdocument;
// The term processing pipeline:
TermProcIdx tpidx;
// TermProcStop tpstop(&tpidx, m_stops);
TermProcCommongrams tpstop(&tpidx, m_stops);
TermProcPrep tpprep(&tpstop);
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
TermProc *nxt = &tpidx;
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
TermProcPrep tpprep(nxt); nxt = &tpprep;
TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
tpidx.setTSD(&splitter);
// Split and index file name as document term(s)
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
if (!splitter.text_to_words(doc.utf8fn))

View File
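
The query-side changes in this file drop the old double split (one pass with TXTS_ONLYSPANS, one with TXTS_NOSPANS, comparing term counts) and derive the extra PHRASE/NEAR slack from positions instead: slack += lastpos - terms.size() + 1, with TermProcQ now keeping at most one term per position (the longest) and only pushing them out at flush() time. A small worked example with invented numbers:

#include <cstdio>

int main()
{
    // Suppose the splitter assigned positions 0..4 to a user phrase, but the
    // stop-word stage swallowed positions 1 and 3, so only 3 terms survive.
    int lastpos = 4; // TextSplitQ::lastpos after the split
    int nterms  = 3; // splitter.terms.size() after TermProcQ::flush()

    int slack = 0;
    slack += lastpos - nterms + 1;           // 4 - 3 + 1 = 2
    std::printf("extra slack: %d\n", slack); // window becomes lastpos + 1 + slack
    return 0;
}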

@ -478,7 +478,8 @@ void SearchData::getUTerms(vector<string>& terms) const
class TextSplitQ : public TextSplitP {
public:
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
: TextSplitP(prc, flags),
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
{}
bool takeword(const std::string &term, int pos, int bs, int be)
@ -509,16 +510,30 @@ public:
bool takeword(const std::string &term, int pos, int bs, int be)
{
m_ts->alltermcount++;
m_ts->lastpos = pos;
if (m_ts->lastpos < pos)
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
term.c_str(), noexpand));
m_ts->terms.push_back(term);
m_ts->nostemexps.push_back(noexpand);
LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) {
m_terms[pos] = term;
m_nste[pos] = noexpand;
}
return true;
}
bool flush()
{
for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) {
m_ts->terms.push_back(it->second);
m_ts->nostemexps.push_back(m_nste[it->first]);
}
return true;
}
private:
TextSplitQ *m_ts;
map<int, string> m_terms;
map<int, bool> m_nste;
};
// A class used to translate a user compound string (*not* a query
@ -783,7 +798,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->lastpos + 1 + slack);
@ -839,7 +854,7 @@ bool StringToXapianQ::processUserString(const string &iq,
bool useNear
)
{
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
ermsg.erase();
m_uterms.clear();
m_terms.clear();
@ -874,45 +889,35 @@ bool StringToXapianQ::processUserString(const string &iq,
// We used to do word split, searching for
// "term0 term1 term2" instead, which may have worse
// performance, but will succeed.
// We now adjust the phrase/near slack by the term count
// difference (this is mainly better for cjk where this is a very
// common occurrence because of the ngrams thing.
// We now adjust the phrase/near slack by comparing the term count
// and the last position
// The term processing pipeline:
TermProcQ tpq;
// TermProcStop tpstop(&tpidx, stops);
TermProcCommongrams tpstop(&tpq, stops);
tpstop.onlygrams(true);
TermProcPrep tpprep(&tpstop);
TermProc *nxt = &tpq;
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true);
TermProcPrep tpprep(nxt); nxt = &tpprep;
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterS);
splitterS.text_to_words(*it);
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterW);
tpstop.onlygrams(false);
splitterW.text_to_words(*it);
stops, nxt);
tpq.setTSQ(&splitter);
splitter.text_to_words(*it);
if (splitterS.terms.size() > 1 &&
splitterS.terms.size() != splitterW.terms.size()) {
slack += splitterW.terms.size() - splitterS.terms.size();
}
slack += splitter.lastpos - splitter.terms.size() + 1;
TextSplitQ *splitter = &splitterS;
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
switch (splitter->terms.size() + terminc) {
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
switch (splitter.terms.size() + terminc) {
case 0:
continue;// ??
case 1:
processSimpleSpan(splitter->terms.front(),
splitter->nostemexps.front(), pqueries);
processSimpleSpan(splitter.terms.front(),
splitter.nostemexps.front(), pqueries);
break;
default:
processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
}
}
} catch (const Xapian::Error &e) {

View File
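
The header diff that follows carries the other half of the commit message: TermProcPrep now counts terms and unaccenting failures and gives up on the document once more than 500 failures have accumulated and they account for more than half of everything seen (totalterms / unacerrors < 2.0). A small illustration of that cutoff, with made-up counts; the helper below just mirrors the check added to takeword():

#include <cstdio>

static bool too_many_unac_errors(int totalterms, int unacerrors)
{
    return unacerrors > 500 &&
        (double(totalterms) / double(unacerrors)) < 2.0;
}

int main()
{
    std::printf("%d\n", too_many_unac_errors(500, 400));  // 0: under the 500-error floor
    std::printf("%d\n", too_many_unac_errors(3000, 600)); // 0: five terms per error, acceptable
    std::printf("%d\n", too_many_unac_errors(1200, 700)); // 1: ratio 1.7, discard the document
    return 0;
}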

@ -66,10 +66,10 @@ private:
};
/**
* Intermediary specialized texsplit class: this will probably replace the base
* textsplit when we've converted all the code. The takeword() routine in this
* calls a TextProc's instead of being specialized in a derived class by the
* user module. The text_to_word() method also takes care of flushing.
* Specialized TextSplit class: this will probably replace the base
* TextSplit when we've converted all the code. The takeword() routine in this
* calls a TermProc's instead of being overriden in a user derived class.
* The text_to_word() method also takes care of flushing.
*/
class TextSplitP : public TextSplit {
public:
@ -99,18 +99,39 @@ private:
/** Unaccent and lowercase term. This is usually the first in the pipeline */
class TermProcPrep : public TermProc {
public:
TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) {}
virtual bool takeword(const string& itrm, int pos, int bs, int be)
{
m_totalterms++;
string otrm;
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
// We don't generate a fatal error because of a bad term
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
m_unacerrors++;
// We don't generate a fatal error because of a bad term,
// but one has to put the limit somewhere
if (m_unacerrors > 500 &&
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
// More than 1 error for every other term
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
m_unacerrors, m_totalterms));
return false;
}
return true;
}
return TermProc::takeword(otrm, pos, bs, be);
}
virtual bool flush()
{
m_totalterms = m_unacerrors = 0;
return TermProc::flush();
}
private:
int m_totalterms;
int m_unacerrors;
};
/** Compare to stop words list and discard if match found */
@ -119,19 +140,23 @@ public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops) {}
virtual bool takeword(const string& term, int pos, int bts, int bte)
virtual bool takeword(const string& term, int pos, int bs, int be)
{
if (m_stops.isStop(term)) {
return true;
}
return TermProc::takeword(term, pos, bts, bte);
return TermProc::takeword(term, pos, bs, be);
}
private:
const Rcl::StopList& m_stops;
};
/** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches.
* NOTE: This does not currently work because of bad interaction with the
* spans (ie john@domain.com) generation in textsplit. Not used, kept for
* testing only
*/
class TermProcCommongrams : public TermProc {
public:
@ -147,7 +172,7 @@ public:
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
// create 2-gram. space unnecessary but improves
// lisibility of queries
// the readability of queries
string twogram;
twogram.swap(m_prevterm);
twogram.append(1, ' ');
@ -164,7 +189,7 @@ public:
}
#endif
}
m_prevterm = term;
m_prevstop = isstop;
m_prevpos = pos;
@ -181,7 +206,7 @@ public:
return true;
}
bool flush()
virtual bool flush()
{
if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))