Get rid of a few garbage terms during indexing. Set a threshold for conversion errors after which we discard the doc. Stabilize the new termproc pipeline, but no commongrams for now.

Jean-Francois Dockes 2011-10-12 17:55:58 +02:00
parent a2c9d2a82b
commit 0860b559ee
4 changed files with 167 additions and 86 deletions
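
The heart of the commit is the chained term-processing pipeline: each TermProc stage receives a term through takeword(), may rewrite or swallow it, and passes whatever survives to the next stage, so the splitter only ever talks to the head of the chain. A minimal, self-contained sketch of the pattern follows; the class names here (MyTermProc, LowerProc, StopProc, PrintSink) are illustrative stand-ins, not the actual Recoll classes shown in the diffs below.

#include <cctype>
#include <cstdio>
#include <set>
#include <string>

// A stage gets a term, may transform or drop it, and forwards the rest.
class MyTermProc {
public:
    explicit MyTermProc(MyTermProc *nxt) : m_next(nxt) {}
    virtual ~MyTermProc() {}
    virtual bool takeword(const std::string &term, int pos)
    {
        return m_next ? m_next->takeword(term, pos) : true;
    }
    virtual bool flush()
    {
        return m_next ? m_next->flush() : true;
    }
private:
    MyTermProc *m_next;
};

// Lowercasing stage, standing in for TermProcPrep's unaccent/fold step.
class LowerProc : public MyTermProc {
public:
    explicit LowerProc(MyTermProc *nxt) : MyTermProc(nxt) {}
    virtual bool takeword(const std::string &term, int pos)
    {
        std::string out(term);
        for (std::string::size_type i = 0; i < out.size(); i++)
            out[i] = (char)std::tolower((unsigned char)out[i]);
        return MyTermProc::takeword(out, pos);
    }
};

// Stop-word stage, standing in for TermProcStop: swallow the term, go on.
class StopProc : public MyTermProc {
public:
    StopProc(MyTermProc *nxt, const std::set<std::string> &stops)
        : MyTermProc(nxt), m_stops(stops) {}
    virtual bool takeword(const std::string &term, int pos)
    {
        if (m_stops.count(term))
            return true;
        return MyTermProc::takeword(term, pos);
    }
private:
    std::set<std::string> m_stops;
};

// Terminal stage, standing in for TermProcIdx or the test driver's printer.
class PrintSink : public MyTermProc {
public:
    PrintSink() : MyTermProc(0) {}
    virtual bool takeword(const std::string &term, int pos)
    {
        std::printf("[%s] pos %d\n", term.c_str(), pos);
        return true;
    }
};

int main()
{
    // Build the chain back to front, the way Db::addOrUpdate() does below:
    // sink first, then the filters; the splitter would talk to the head.
    PrintSink sink;
    MyTermProc *nxt = &sink;
    std::set<std::string> stops;
    stops.insert("the");
    StopProc stop(nxt, stops); nxt = &stop;
    LowerProc lower(nxt); nxt = &lower;

    const char *words[] = { "The", "Term", "Processing", "Pipeline" };
    for (int i = 0; i < 4; i++)
        nxt->takeword(words[i], i);
    return nxt->flush() ? 0 : 1;
}

The indexing side below builds its real chain the same way: TermProcIdx as the sink, TermProcStop in the middle, TermProcPrep at the head, with the commongrams stage left commented out for now, and TextSplitDb feeding the head.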

View File
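
In this first file, the hunks around '.' and ',' replace ad hoc peeking with a single one-character lookahead computed up front (nextc / nextwhat). Reduced to its core, the decision for a '.' looks roughly like the sketch below; this is a simplification, and toy_whatcc() and its character classes are toy stand-ins for TextSplit's whatcc(), not the real implementation (which also tracks spans, commas and the .1-style number start).

#include <cctype>
#include <cstdio>

// Toy character classes standing in for TextSplit's whatcc() values.
enum What { DIGIT, LETTER, OTHER };
static What toy_whatcc(int c)
{
    if (std::isdigit(c)) return DIGIT;
    if (std::isalpha(c)) return LETTER;
    return OTHER;
}

// Does a '.' stay glued to the current word, given the next character?
static bool dot_stays(bool in_number, int nextc)
{
    What nextwhat = toy_whatcc(nextc);
    if (in_number)
        // 3.14 and 1.5e3 keep the dot; 132.jpg is split into 132 and jpg.
        return nextwhat == DIGIT || nextc == 'e' || nextc == 'E';
    // Otherwise only a letter or digit makes sense after the dot.
    return nextwhat == DIGIT || nextwhat == LETTER;
}

int main()
{
    std::printf("%d %d %d\n",
                dot_stays(true, '1'),   // 1: 3.14, dot kept
                dot_stays(true, 'j'),   // 0: 132.jpg, split
                dot_stays(false, ' ')); // 0: end of sentence, split
    return 0;
}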

@ -164,7 +164,7 @@ bool TextSplit::o_noNumbers = false;
// Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
int btstart, int btend)
int btstart, int btend)
{
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
@ -348,12 +348,14 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = false;
}
break;
case WILD:
if (m_flags & TXTS_KEEPWILD)
goto NORMALCHAR;
else
goto SPACE;
break;
case '-':
case '+':
curspanglue = cc;
@ -381,12 +383,16 @@ bool TextSplit::text_to_words(const string &in)
m_wordStart += it.appendchartostring(m_span);
}
break;
case '.':
case ',':
{
// Need a little lookahead here. At worse this gets the end null
int nextc = it[it.getCpos()+1];
int nextwhat = whatcc(nextc);
if (m_inNumber) {
// 132.jpg ?
int wn = it[it.getCpos()+1];
if (whatcc(wn) != DIGIT && wn != 'e' && wn != 'E')
// we're eliminating 132.jpg here. Good idea ?
if (nextwhat != DIGIT && nextc != 'e' && nextc != 'E')
goto SPACE;
m_wordLen += it.appendchartostring(m_span);
curspanglue = cc;
@ -398,10 +404,15 @@ bool TextSplit::text_to_words(const string &in)
// Another problem is that something like .x-errs
// will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit
if (cc == '.' && it[it.getCpos()+1] != '.') {
// Only letters and digits make sense after
if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
nextwhat != DIGIT && nextwhat != LETTER)
goto SPACE;
if (cc == '.') {
// Check for number like .1
if (m_span.length() == 0 &&
whatcc(it[it.getCpos()+1]) == DIGIT) {
if (m_span.length() == 0 && nextwhat == DIGIT) {
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
curspanglue = cc;
@ -430,7 +441,9 @@ bool TextSplit::text_to_words(const string &in)
}
}
goto SPACE;
}
break;
case '@':
if (m_wordLen) {
if (!doemit(false, it.getBpos()))
@ -623,8 +636,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// first
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
unsigned int btend = it.getBpos(); // Current char is out
if (!takeword(it.buffer().substr(boffs[0],
btend-boffs[0]),
if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]),
m_wordpos - nchars,
boffs[0], btend)) {
return false;
@ -764,18 +776,19 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
#include "readfile.h"
#include "debuglog.h"
#include "transcode.h"
#include "unacpp.h"
#include "termproc.h"
using namespace std;
class myTextSplit : public TextSplit {
class myTermProc : public Rcl::TermProc {
int first;
bool nooutput;
public:
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
TextSplit(flags),first(1), nooutput(false)
{}
public:
myTermProc() : TermProc(0), first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
bool takeword(const string &term, int pos, int bs, int be) {
virtual bool takeword(const string &term, int pos, int bs, int be)
{
if (nooutput)
return true;
FILE *fp = stdout;
@ -812,13 +825,15 @@ static string thisprog;
static string usage =
" textsplit [opts] [filename]\n"
" -S: no output\n"
" -s: only spans\n"
" -w: only words\n"
" -n: no numbers\n"
" -k: preserve wildcards (?*)\n"
" -c: just count words\n"
" -q : no output\n"
" -s : only spans\n"
" -w : only words\n"
" -n : no numbers\n"
" -k : preserve wildcards (?*)\n"
" -c : just count words\n"
" -u : use unac\n"
" -C [charset] : input charset\n"
" -S [stopfile] : stopfile to use for commongrams\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n"
;
@ -833,15 +848,18 @@ Usage(void)
static int op_flags;
#define OPT_s 0x1
#define OPT_w 0x2
#define OPT_S 0x4
#define OPT_q 0x4
#define OPT_c 0x8
#define OPT_k 0x10
#define OPT_C 0x20
#define OPT_n 0x40
#define OPT_S 0x80
#define OPT_u 0x100
int main(int argc, char **argv)
{
string charset;
string charset, stopfile;
thisprog = argv[0];
argc--; argv++;
@ -858,8 +876,12 @@ int main(int argc, char **argv)
goto b1;
case 'k': op_flags |= OPT_k; break;
case 'n': op_flags |= OPT_n; break;
case 'q': op_flags |= OPT_q; break;
case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break;
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
stopfile = *(++argv); argc--;
goto b1;
case 'u': op_flags |= OPT_u; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
@ -879,6 +901,13 @@ int main(int argc, char **argv)
if (op_flags & OPT_n)
TextSplit::noNumbers();
Rcl::StopList stoplist;
if (op_flags & OPT_S) {
if (!stoplist.setFile(stopfile)) {
cerr << "Can't read stopfile: " << stopfile << endl;
exit(1);
}
}
string odata, reason;
if (argc == 1) {
const char *filename = *argv++; argc--;
@ -912,10 +941,25 @@ int main(int argc, char **argv)
int n = TextSplit::countWords(data, flags);
cout << n << " words" << endl;
} else {
myTextSplit splitter(flags);
if (op_flags&OPT_S)
splitter.setNoOut(true);
myTermProc printproc;
Rcl::TermProc *nxt = &printproc;
Rcl::TermProcCommongrams commonproc(nxt, stoplist);
if (op_flags & OPT_S)
nxt = &commonproc;
Rcl::TermProcPrep preproc(nxt);
if (op_flags & OPT_u)
nxt = &preproc;
Rcl::TextSplitP splitter(nxt, flags);
if (op_flags & OPT_q)
printproc.setNoOut(true);
splitter.text_to_words(data);
}
}
#endif // TEST

View File
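
The change to TextSplitDb::text_to_words() in this file is mostly structural: the places that can fail now funnel through a single out: label, so the basepos adjustment is written once instead of being repeated before every early return. A toy sketch of the shape, with invented helper names standing in for the add_posting calls and the split:

#include <cstdio>

static bool add_start_anchor() { return true; }
static bool run_splitter()     { return false; } // pretend the split fails
static bool add_end_anchor()   { return true; }

static int basepos = 0;
static int curpos = 42;

static bool text_to_words_like()
{
    bool ret = false;
    if (!add_start_anchor())
        goto out;
    if (!run_splitter())
        goto out;
    if (!add_end_anchor())
        goto out;
    ret = true;
out:
    // Single exit point: the position base always moves on, success or not.
    basepos += curpos + 100;
    return ret;
}

int main()
{
    bool ok = text_to_words_like();
    std::printf("ok=%d basepos=%d\n", ok, basepos); // ok=0 basepos=142
    return 0;
}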

@ -897,8 +897,9 @@ private:
// Reimplement text_to_words to insert the begin and end anchor terms.
bool TextSplitDb::text_to_words(const string &in)
{
LOGDEB2(("TextSplitDb::text_to_words\n"));
bool ret = false;
string ermsg;
try {
// Index the possibly prefixed start term.
doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
@ -906,14 +907,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false;
goto out;
}
if (!TextSplitP::text_to_words(in)) {
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
basepos += curpos + 100;
return false;
goto out;
}
try {
@ -923,10 +922,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false;
goto out;
}
ret = true;
out:
basepos += curpos + 100;
return true;
}
@ -961,6 +962,7 @@ public:
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false;
}
private:
TextSplitDb *m_ts;
};
@ -1028,12 +1030,17 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
Doc doc = idoc;
Xapian::Document newdocument;
// The term processing pipeline:
TermProcIdx tpidx;
// TermProcStop tpstop(&tpidx, m_stops);
TermProcCommongrams tpstop(&tpidx, m_stops);
TermProcPrep tpprep(&tpstop);
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
TermProc *nxt = &tpidx;
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
TermProcPrep tpprep(nxt); nxt = &tpprep;
TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
tpidx.setTSD(&splitter);
// Split and index file name as document term(s)
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
if (!splitter.text_to_words(doc.utf8fn))

View File
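
The query-side changes in this file drop the old double split (one pass with TXTS_ONLYSPANS, one with TXTS_NOSPANS, comparing term counts) and derive the extra PHRASE/NEAR slack from positions instead: slack += lastpos - terms.size() + 1, with TermProcQ now keeping at most one term per position (the longest) and only pushing them out at flush() time. A small worked example with invented numbers:

#include <cstdio>

int main()
{
    // Suppose the splitter assigned positions 0..4 to a user phrase, but the
    // stop-word stage swallowed positions 1 and 3, so only 3 terms survive.
    int lastpos = 4; // TextSplitQ::lastpos after the split
    int nterms  = 3; // splitter.terms.size() after TermProcQ::flush()

    int slack = 0;
    slack += lastpos - nterms + 1;           // 4 - 3 + 1 = 2
    std::printf("extra slack: %d\n", slack); // window becomes lastpos + 1 + slack
    return 0;
}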

@ -478,7 +478,8 @@ void SearchData::getUTerms(vector<string>& terms) const
class TextSplitQ : public TextSplitP {
public:
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
: TextSplitP(prc, flags),
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
{}
bool takeword(const std::string &term, int pos, int bs, int be)
@ -509,16 +510,30 @@ public:
bool takeword(const std::string &term, int pos, int bs, int be)
{
m_ts->alltermcount++;
m_ts->lastpos = pos;
if (m_ts->lastpos < pos)
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
term.c_str(), noexpand));
m_ts->terms.push_back(term);
m_ts->nostemexps.push_back(noexpand);
LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) {
m_terms[pos] = term;
m_nste[pos] = noexpand;
}
return true;
}
bool flush()
{
for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) {
m_ts->terms.push_back(it->second);
m_ts->nostemexps.push_back(m_nste[it->first]);
}
return true;
}
private:
TextSplitQ *m_ts;
map<int, string> m_terms;
map<int, bool> m_nste;
};
// A class used to translate a user compound string (*not* a query
@ -783,7 +798,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->lastpos + 1 + slack);
@ -839,7 +854,7 @@ bool StringToXapianQ::processUserString(const string &iq,
bool useNear
)
{
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
ermsg.erase();
m_uterms.clear();
m_terms.clear();
@ -874,45 +889,35 @@ bool StringToXapianQ::processUserString(const string &iq,
// We used to do word split, searching for
// "term0 term1 term2" instead, which may have worse
// performance, but will succeed.
// We now adjust the phrase/near slack by the term count
// difference (this is mainly better for cjk where this is a very
// common occurrence because of the ngrams thing.
// We now adjust the phrase/near slack by comparing the term count
// and the last position
// The term processing pipeline:
TermProcQ tpq;
// TermProcStop tpstop(&tpidx, stops);
TermProcCommongrams tpstop(&tpq, stops);
tpstop.onlygrams(true);
TermProcPrep tpprep(&tpstop);
TermProc *nxt = &tpq;
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true);
TermProcPrep tpprep(nxt); nxt = &tpprep;
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterS);
splitterS.text_to_words(*it);
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterW);
tpstop.onlygrams(false);
splitterW.text_to_words(*it);
stops, nxt);
tpq.setTSQ(&splitter);
splitter.text_to_words(*it);
if (splitterS.terms.size() > 1 &&
splitterS.terms.size() != splitterW.terms.size()) {
slack += splitterW.terms.size() - splitterS.terms.size();
}
slack += splitter.lastpos - splitter.terms.size() + 1;
TextSplitQ *splitter = &splitterS;
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
switch (splitter->terms.size() + terminc) {
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
switch (splitter.terms.size() + terminc) {
case 0:
continue;// ??
case 1:
processSimpleSpan(splitter->terms.front(),
splitter->nostemexps.front(), pqueries);
processSimpleSpan(splitter.terms.front(),
splitter.nostemexps.front(), pqueries);
break;
default:
processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
}
}
} catch (const Xapian::Error &e) {

View File
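
The header diff that follows carries the other half of the commit message: TermProcPrep now counts terms and unaccenting failures and gives up on the document once more than 500 failures have accumulated and they account for more than half of everything seen (totalterms / unacerrors < 2.0). A small illustration of that cutoff, with made-up counts; the helper below just mirrors the check added to takeword():

#include <cstdio>

static bool too_many_unac_errors(int totalterms, int unacerrors)
{
    return unacerrors > 500 &&
        (double(totalterms) / double(unacerrors)) < 2.0;
}

int main()
{
    std::printf("%d\n", too_many_unac_errors(500, 400));  // 0: under the 500-error floor
    std::printf("%d\n", too_many_unac_errors(3000, 600)); // 0: five terms per error, acceptable
    std::printf("%d\n", too_many_unac_errors(1200, 700)); // 1: ratio 1.7, discard the document
    return 0;
}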

@ -66,10 +66,10 @@ private:
};
/**
* Intermediary specialized texsplit class: this will probably replace the base
* textsplit when we've converted all the code. The takeword() routine in this
* calls a TextProc's instead of being specialized in a derived class by the
* user module. The text_to_word() method also takes care of flushing.
* Specialized TextSplit class: this will probably replace the base
* TextSplit when we've converted all the code. The takeword() routine in this
* calls a TermProc's instead of being overriden in a user derived class.
* The text_to_word() method also takes care of flushing.
*/
class TextSplitP : public TextSplit {
public:
@ -99,18 +99,39 @@ private:
/** Unaccent and lowercase term. This is usually the first in the pipeline */
class TermProcPrep : public TermProc {
public:
TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) {}
virtual bool takeword(const string& itrm, int pos, int bs, int be)
{
m_totalterms++;
string otrm;
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
// We don't generate a fatal error because of a bad term
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
m_unacerrors++;
// We don't generate a fatal error because of a bad term,
// but one has to put the limit somewhere
if (m_unacerrors > 500 &&
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
// More than 1 error for every other term
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
m_unacerrors, m_totalterms));
return false;
}
return true;
}
return TermProc::takeword(otrm, pos, bs, be);
}
virtual bool flush()
{
m_totalterms = m_unacerrors = 0;
return TermProc::flush();
}
private:
int m_totalterms;
int m_unacerrors;
};
/** Compare to stop words list and discard if match found */
@ -119,19 +140,23 @@ public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops) {}
virtual bool takeword(const string& term, int pos, int bts, int bte)
virtual bool takeword(const string& term, int pos, int bs, int be)
{
if (m_stops.isStop(term)) {
return true;
}
return TermProc::takeword(term, pos, bts, bte);
return TermProc::takeword(term, pos, bs, be);
}
private:
const Rcl::StopList& m_stops;
};
/** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches.
* NOTE: This does not currently work because of bad interaction with the
* spans (ie john@domain.com) generation in textsplit. Not used, kept for
* testing only
*/
class TermProcCommongrams : public TermProc {
public:
@ -147,7 +172,7 @@ public:
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
// create 2-gram. space unnecessary but improves
// lisibility of queries
// the readability of queries
string twogram;
twogram.swap(m_prevterm);
twogram.append(1, ' ');
@ -164,7 +189,7 @@ public:
}
#endif
}
m_prevterm = term;
m_prevstop = isstop;
m_prevpos = pos;
@ -181,7 +206,7 @@ public:
return true;
}
bool flush()
virtual bool flush()
{
if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))