get rid of a few garbage terms during indexing. Set a threshold for conversion errors after which we discard the doc. Stabilize the new termproc pipeline but no commongrams for now

This commit is contained in:
Jean-Francois Dockes 2011-10-12 17:55:58 +02:00
parent a2c9d2a82b
commit 0860b559ee
4 changed files with 167 additions and 86 deletions

View File

@ -164,7 +164,7 @@ bool TextSplit::o_noNumbers = false;
// Do some checking (the kind which is simpler to do here than in the // Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client. // main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
int btstart, int btend) int btstart, int btend)
{ {
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos)); LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
@ -348,12 +348,14 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = false; m_inNumber = false;
} }
break; break;
case WILD: case WILD:
if (m_flags & TXTS_KEEPWILD) if (m_flags & TXTS_KEEPWILD)
goto NORMALCHAR; goto NORMALCHAR;
else else
goto SPACE; goto SPACE;
break; break;
case '-': case '-':
case '+': case '+':
curspanglue = cc; curspanglue = cc;
@ -381,12 +383,16 @@ bool TextSplit::text_to_words(const string &in)
m_wordStart += it.appendchartostring(m_span); m_wordStart += it.appendchartostring(m_span);
} }
break; break;
case '.': case '.':
case ',': case ',':
{
// Need a little lookahead here. At worst this gets the end null
int nextc = it[it.getCpos()+1];
int nextwhat = whatcc(nextc);
if (m_inNumber) { if (m_inNumber) {
// 132.jpg ? // we're eliminating 132.jpg here. Good idea ?
int wn = it[it.getCpos()+1]; if (nextwhat != DIGIT && nextc != 'e' && nextc != 'E')
if (whatcc(wn) != DIGIT && wn != 'e' && wn != 'E')
goto SPACE; goto SPACE;
m_wordLen += it.appendchartostring(m_span); m_wordLen += it.appendchartostring(m_span);
curspanglue = cc; curspanglue = cc;
@ -398,10 +404,15 @@ bool TextSplit::text_to_words(const string &in)
// Another problem is that something like .x-errs // Another problem is that something like .x-errs
// will be split as .x-errs, x, errs but not x-errs // will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit // A final comma in a word will be removed by doemit
if (cc == '.' && it[it.getCpos()+1] != '.') {
// Only letters and digits make sense after
if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
nextwhat != DIGIT && nextwhat != LETTER)
goto SPACE;
if (cc == '.') {
// Check for number like .1 // Check for number like .1
if (m_span.length() == 0 && if (m_span.length() == 0 && nextwhat == DIGIT) {
whatcc(it[it.getCpos()+1]) == DIGIT) {
m_inNumber = true; m_inNumber = true;
m_wordLen += it.appendchartostring(m_span); m_wordLen += it.appendchartostring(m_span);
curspanglue = cc; curspanglue = cc;
@ -430,7 +441,9 @@ bool TextSplit::text_to_words(const string &in)
} }
} }
goto SPACE; goto SPACE;
}
break; break;
case '@': case '@':
if (m_wordLen) { if (m_wordLen) {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
@ -623,8 +636,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// first // first
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) { if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
unsigned int btend = it.getBpos(); // Current char is out unsigned int btend = it.getBpos(); // Current char is out
if (!takeword(it.buffer().substr(boffs[0], if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]),
btend-boffs[0]),
m_wordpos - nchars, m_wordpos - nchars,
boffs[0], btend)) { boffs[0], btend)) {
return false; return false;
@ -764,18 +776,19 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
#include "readfile.h" #include "readfile.h"
#include "debuglog.h" #include "debuglog.h"
#include "transcode.h" #include "transcode.h"
#include "unacpp.h"
#include "termproc.h"
using namespace std; using namespace std;
class myTextSplit : public TextSplit { class myTermProc : public Rcl::TermProc {
int first; int first;
bool nooutput; bool nooutput;
public: public:
myTextSplit(Flags flags = Flags(TXTS_NONE)) : myTermProc() : TermProc(0), first(1), nooutput(false) {}
TextSplit(flags),first(1), nooutput(false)
{}
void setNoOut(bool val) {nooutput = val;} void setNoOut(bool val) {nooutput = val;}
bool takeword(const string &term, int pos, int bs, int be) { virtual bool takeword(const string &term, int pos, int bs, int be)
{
if (nooutput) if (nooutput)
return true; return true;
FILE *fp = stdout; FILE *fp = stdout;
@ -812,13 +825,15 @@ static string thisprog;
static string usage = static string usage =
" textsplit [opts] [filename]\n" " textsplit [opts] [filename]\n"
" -S: no output\n" " -q : no output\n"
" -s: only spans\n" " -s : only spans\n"
" -w: only words\n" " -w : only words\n"
" -n: no numbers\n" " -n : no numbers\n"
" -k: preserve wildcards (?*)\n" " -k : preserve wildcards (?*)\n"
" -c: just count words\n" " -c : just count words\n"
" -u : use unac\n"
" -C [charset] : input charset\n" " -C [charset] : input charset\n"
" -S [stopfile] : stopfile to use for commongrams\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n" " \n\n"
; ;
@ -833,15 +848,18 @@ Usage(void)
static int op_flags; static int op_flags;
#define OPT_s 0x1 #define OPT_s 0x1
#define OPT_w 0x2 #define OPT_w 0x2
#define OPT_S 0x4 #define OPT_q 0x4
#define OPT_c 0x8 #define OPT_c 0x8
#define OPT_k 0x10 #define OPT_k 0x10
#define OPT_C 0x20 #define OPT_C 0x20
#define OPT_n 0x40 #define OPT_n 0x40
#define OPT_S 0x80
#define OPT_u 0x100
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
string charset; string charset, stopfile;
thisprog = argv[0]; thisprog = argv[0];
argc--; argv++; argc--; argv++;
@ -858,8 +876,12 @@ int main(int argc, char **argv)
goto b1; goto b1;
case 'k': op_flags |= OPT_k; break; case 'k': op_flags |= OPT_k; break;
case 'n': op_flags |= OPT_n; break; case 'n': op_flags |= OPT_n; break;
case 'q': op_flags |= OPT_q; break;
case 's': op_flags |= OPT_s; break; case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break; case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
stopfile = *(++argv); argc--;
goto b1;
case 'u': op_flags |= OPT_u; break;
case 'w': op_flags |= OPT_w; break; case 'w': op_flags |= OPT_w; break;
default: Usage(); break; default: Usage(); break;
} }
@ -879,6 +901,13 @@ int main(int argc, char **argv)
if (op_flags & OPT_n) if (op_flags & OPT_n)
TextSplit::noNumbers(); TextSplit::noNumbers();
Rcl::StopList stoplist;
if (op_flags & OPT_S) {
if (!stoplist.setFile(stopfile)) {
cerr << "Can't read stopfile: " << stopfile << endl;
exit(1);
}
}
string odata, reason; string odata, reason;
if (argc == 1) { if (argc == 1) {
const char *filename = *argv++; argc--; const char *filename = *argv++; argc--;
@ -912,10 +941,25 @@ int main(int argc, char **argv)
int n = TextSplit::countWords(data, flags); int n = TextSplit::countWords(data, flags);
cout << n << " words" << endl; cout << n << " words" << endl;
} else { } else {
myTextSplit splitter(flags); myTermProc printproc;
if (op_flags&OPT_S)
splitter.setNoOut(true); Rcl::TermProc *nxt = &printproc;
Rcl::TermProcCommongrams commonproc(nxt, stoplist);
if (op_flags & OPT_S)
nxt = &commonproc;
Rcl::TermProcPrep preproc(nxt);
if (op_flags & OPT_u)
nxt = &preproc;
Rcl::TextSplitP splitter(nxt, flags);
if (op_flags & OPT_q)
printproc.setNoOut(true);
splitter.text_to_words(data); splitter.text_to_words(data);
} }
} }
#endif // TEST #endif // TEST

View File

@ -897,8 +897,9 @@ private:
// Reimplement text_to_words to insert the begin and end anchor terms. // Reimplement text_to_words to insert the begin and end anchor terms.
bool TextSplitDb::text_to_words(const string &in) bool TextSplitDb::text_to_words(const string &in)
{ {
LOGDEB2(("TextSplitDb::text_to_words\n")); bool ret = false;
string ermsg; string ermsg;
try { try {
// Index the possibly prefixed start term. // Index the possibly prefixed start term.
doc.add_posting(prefix + start_of_field_term, basepos, wdfinc); doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
@ -906,14 +907,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100; goto out;
return false;
} }
if (!TextSplitP::text_to_words(in)) { if (!TextSplitP::text_to_words(in)) {
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n")); LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
basepos += curpos + 100; goto out;
return false;
} }
try { try {
@ -923,10 +922,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100; goto out;
return false;
} }
ret = true;
out:
basepos += curpos + 100; basepos += curpos + 100;
return true; return true;
} }
@ -961,6 +962,7 @@ public:
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false; return false;
} }
private: private:
TextSplitDb *m_ts; TextSplitDb *m_ts;
}; };
@ -1028,12 +1030,17 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
Doc doc = idoc; Doc doc = idoc;
Xapian::Document newdocument; Xapian::Document newdocument;
// The term processing pipeline:
TermProcIdx tpidx; TermProcIdx tpidx;
// TermProcStop tpstop(&tpidx, m_stops); TermProc *nxt = &tpidx;
TermProcCommongrams tpstop(&tpidx, m_stops); TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
TermProcPrep tpprep(&tpstop); // TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep); TermProcPrep tpprep(nxt); nxt = &tpprep;
TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
tpidx.setTSD(&splitter); tpidx.setTSD(&splitter);
// Split and index file name as document term(s) // Split and index file name as document term(s)
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str())); LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
if (!splitter.text_to_words(doc.utf8fn)) if (!splitter.text_to_words(doc.utf8fn))

View File

@ -478,7 +478,8 @@ void SearchData::getUTerms(vector<string>& terms) const
class TextSplitQ : public TextSplitP { class TextSplitQ : public TextSplitP {
public: public:
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc) TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0) : TextSplitP(prc, flags),
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
{} {}
bool takeword(const std::string &term, int pos, int bs, int be) bool takeword(const std::string &term, int pos, int bs, int be)
@ -509,16 +510,30 @@ public:
bool takeword(const std::string &term, int pos, int bs, int be) bool takeword(const std::string &term, int pos, int bs, int be)
{ {
m_ts->alltermcount++; m_ts->alltermcount++;
m_ts->lastpos = pos; if (m_ts->lastpos < pos)
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true; bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n", LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), noexpand)); term.c_str(), pos, noexpand));
m_ts->terms.push_back(term); if (m_terms[pos].size() < term.size()) {
m_ts->nostemexps.push_back(noexpand); m_terms[pos] = term;
m_nste[pos] = noexpand;
}
return true;
}
bool flush()
{
for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) {
m_ts->terms.push_back(it->second);
m_ts->nostemexps.push_back(m_nste[it->first]);
}
return true; return true;
} }
private: private:
TextSplitQ *m_ts; TextSplitQ *m_ts;
map<int, string> m_terms;
map<int, bool> m_nste;
}; };
// A class used to translate a user compound string (*not* a query // A class used to translate a user compound string (*not* a query
@ -783,7 +798,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
// Generate an appropriate PHRASE/NEAR query with adjusted slack // Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms // For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos)); splitData->alltermcount, splitData->lastpos));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(), Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->lastpos + 1 + slack); splitData->lastpos + 1 + slack);
@ -839,7 +854,7 @@ bool StringToXapianQ::processUserString(const string &iq,
bool useNear bool useNear
) )
{ {
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str())); LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
ermsg.erase(); ermsg.erase();
m_uterms.clear(); m_uterms.clear();
m_terms.clear(); m_terms.clear();
@ -874,45 +889,35 @@ bool StringToXapianQ::processUserString(const string &iq,
// We used to do word split, searching for // We used to do word split, searching for
// "term0 term1 term2" instead, which may have worse // "term0 term1 term2" instead, which may have worse
// performance, but will succeed. // performance, but will succeed.
// We now adjust the phrase/near slack by the term count // We now adjust the phrase/near slack by comparing the term count
// difference (this is mainly better for cjk where this is a very // and the last position
// common occurrence because of the ngrams thing.
// The term processing pipeline:
TermProcQ tpq; TermProcQ tpq;
// TermProcStop tpstop(&tpidx, stops); TermProc *nxt = &tpq;
TermProcCommongrams tpstop(&tpq, stops); TermProcStop tpstop(nxt, stops); nxt = &tpstop;
tpstop.onlygrams(true); //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
TermProcPrep tpprep(&tpstop); //tpcommon.onlygrams(true);
TermProcPrep tpprep(nxt); nxt = &tpprep;
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD), TextSplit::TXTS_KEEPWILD),
stops, &tpprep); stops, nxt);
tpq.setTSQ(&splitterS); tpq.setTSQ(&splitter);
splitterS.text_to_words(*it); splitter.text_to_words(*it);
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterW);
tpstop.onlygrams(false);
splitterW.text_to_words(*it);
if (splitterS.terms.size() > 1 && slack += splitter.lastpos - splitter.terms.size() + 1;
splitterS.terms.size() != splitterW.terms.size()) {
slack += splitterW.terms.size() - splitterS.terms.size();
}
TextSplitQ *splitter = &splitterS; LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size())); switch (splitter.terms.size() + terminc) {
switch (splitter->terms.size() + terminc) {
case 0: case 0:
continue;// ?? continue;// ??
case 1: case 1:
processSimpleSpan(splitter->terms.front(), processSimpleSpan(splitter.terms.front(),
splitter->nostemexps.front(), pqueries); splitter.nostemexps.front(), pqueries);
break; break;
default: default:
processPhraseOrNear(splitter, pqueries, useNear, slack, mods); processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
} }
} }
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {

View File

@ -66,10 +66,10 @@ private:
}; };
/** /**
* Intermediary specialized texsplit class: this will probably replace the base * Specialized TextSplit class: this will probably replace the base
* textsplit when we've converted all the code. The takeword() routine in this * TextSplit when we've converted all the code. The takeword() routine in this
* calls a TextProc's instead of being specialized in a derived class by the * calls a TermProc's instead of being overridden in a user derived class.
* user module. The text_to_word() method also takes care of flushing. * The text_to_words() method also takes care of flushing.
*/ */
class TextSplitP : public TextSplit { class TextSplitP : public TextSplit {
public: public:
@ -99,18 +99,39 @@ private:
/** Unaccent and lowercase term. This is usually the first in the pipeline */ /** Unaccent and lowercase term. This is usually the first in the pipeline */
class TermProcPrep : public TermProc { class TermProcPrep : public TermProc {
public: public:
TermProcPrep(TermProc *nxt) : TermProc(nxt) {} TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) {}
virtual bool takeword(const string& itrm, int pos, int bs, int be) virtual bool takeword(const string& itrm, int pos, int bs, int be)
{ {
m_totalterms++;
string otrm; string otrm;
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) { if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str())); LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
// We don't generate a fatal error because of a bad term m_unacerrors++;
// We don't generate a fatal error because of a bad term,
// but one has to put the limit somewhere
if (m_unacerrors > 500 &&
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
// More than 1 error for every other term
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
m_unacerrors, m_totalterms));
return false;
}
return true; return true;
} }
return TermProc::takeword(otrm, pos, bs, be); return TermProc::takeword(otrm, pos, bs, be);
} }
virtual bool flush()
{
m_totalterms = m_unacerrors = 0;
return TermProc::flush();
}
private:
int m_totalterms;
int m_unacerrors;
}; };
/** Compare to stop words list and discard if match found */ /** Compare to stop words list and discard if match found */
@ -119,19 +140,23 @@ public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops) TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops) {} : TermProc(nxt), m_stops(stops) {}
virtual bool takeword(const string& term, int pos, int bts, int bte) virtual bool takeword(const string& term, int pos, int bs, int be)
{ {
if (m_stops.isStop(term)) { if (m_stops.isStop(term)) {
return true; return true;
} }
return TermProc::takeword(term, pos, bts, bte); return TermProc::takeword(term, pos, bs, be);
} }
private: private:
const Rcl::StopList& m_stops; const Rcl::StopList& m_stops;
}; };
/** Handle common-gram generation: combine frequent terms with neighbours to /** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches. * shorten the positions lists for phrase searches.
* NOTE: This does not currently work because of bad interaction with the
* spans (e.g. john@domain.com) generation in textsplit. Not used, kept for
* testing only
*/ */
class TermProcCommongrams : public TermProc { class TermProcCommongrams : public TermProc {
public: public:
@ -147,7 +172,7 @@ public:
if (!m_prevterm.empty() && (m_prevstop || isstop)) { if (!m_prevterm.empty() && (m_prevstop || isstop)) {
// create 2-gram. space unnecessary but improves // create 2-gram. space unnecessary but improves
// lisibility of queries // the readability of queries
string twogram; string twogram;
twogram.swap(m_prevterm); twogram.swap(m_prevterm);
twogram.append(1, ' '); twogram.append(1, ' ');
@ -181,7 +206,7 @@ public:
return true; return true;
} }
bool flush() virtual bool flush()
{ {
if (!m_prevsent && !m_prevterm.empty()) if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))