Get rid of a few garbage terms during indexing. Set a threshold for conversion errors beyond which we discard the document. Stabilize the new termproc pipeline, but no commongrams for now.
parent a2c9d2a82b
commit 0860b559ee
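For orientation, the indexing-side term pipeline set up in Db::addOrUpdate (see the rcldb hunk further down) is now chained through a `nxt` pointer, with the commongrams stage commented out. A condensed sketch of that wiring, posting/prefix details omitted; m_stops, newdocument and m_ndb->xwdb are the members used in that function:

    // Sink: turns terms into Xapian postings
    TermProcIdx tpidx;
    TermProc *nxt = &tpidx;
    // Drop stop words
    TermProcStop tpstop(nxt, m_stops); nxt = &tpstop;
    // Commongrams stage disabled for now (see commit message)
    // TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
    // Unaccent/lowercase; gives up on the document past the error threshold
    TermProcPrep tpprep(nxt); nxt = &tpprep;
    // The splitter feeds the head of the chain
    TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
    tpidx.setTSD(&splitter);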
@@ -164,7 +164,7 @@ bool TextSplit::o_noNumbers = false;
// Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
int btstart, int btend)
int btstart, int btend)
{
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));

@@ -348,12 +348,14 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = false;
}
break;

case WILD:
if (m_flags & TXTS_KEEPWILD)
goto NORMALCHAR;
else
goto SPACE;
break;

case '-':
case '+':
curspanglue = cc;
@@ -381,12 +383,16 @@ bool TextSplit::text_to_words(const string &in)
m_wordStart += it.appendchartostring(m_span);
}
break;

case '.':
case ',':
{
// Need a little lookahead here. At worse this gets the end null
int nextc = it[it.getCpos()+1];
int nextwhat = whatcc(nextc);
if (m_inNumber) {
// 132.jpg ?
int wn = it[it.getCpos()+1];
if (whatcc(wn) != DIGIT && wn != 'e' && wn != 'E')
// we're eliminating 132.jpg here. Good idea ?
if (nextwhat != DIGIT && nextc != 'e' && nextc != 'E')
goto SPACE;
m_wordLen += it.appendchartostring(m_span);
curspanglue = cc;
@@ -398,10 +404,15 @@ bool TextSplit::text_to_words(const string &in)
// Another problem is that something like .x-errs
// will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit
if (cc == '.' && it[it.getCpos()+1] != '.') {

// Only letters and digits make sense after
if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
nextwhat != DIGIT && nextwhat != LETTER)
goto SPACE;

if (cc == '.') {
// Check for number like .1
if (m_span.length() == 0 &&
whatcc(it[it.getCpos()+1]) == DIGIT) {
if (m_span.length() == 0 && nextwhat == DIGIT) {
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
curspanglue = cc;
@@ -430,7 +441,9 @@ bool TextSplit::text_to_words(const string &in)
}
}
goto SPACE;
}
break;

case '@':
if (m_wordLen) {
if (!doemit(false, it.getBpos()))
@@ -623,8 +636,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// first
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
unsigned int btend = it.getBpos(); // Current char is out
if (!takeword(it.buffer().substr(boffs[0],
btend-boffs[0]),
if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]),
m_wordpos - nchars,
boffs[0], btend)) {
return false;
@@ -764,18 +776,19 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
#include "readfile.h"
#include "debuglog.h"
#include "transcode.h"
#include "unacpp.h"
#include "termproc.h"

using namespace std;

class myTextSplit : public TextSplit {
class myTermProc : public Rcl::TermProc {
int first;
bool nooutput;
public:
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
TextSplit(flags),first(1), nooutput(false)
{}
public:
myTermProc() : TermProc(0), first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
bool takeword(const string &term, int pos, int bs, int be) {
virtual bool takeword(const string &term, int pos, int bs, int be)
{
if (nooutput)
return true;
FILE *fp = stdout;
@@ -812,13 +825,15 @@ static string thisprog;

static string usage =
" textsplit [opts] [filename]\n"
" -S: no output\n"
" -s: only spans\n"
" -w: only words\n"
" -n: no numbers\n"
" -k: preserve wildcards (?*)\n"
" -c: just count words\n"
" -q : no output\n"
" -s : only spans\n"
" -w : only words\n"
" -n : no numbers\n"
" -k : preserve wildcards (?*)\n"
" -c : just count words\n"
" -u : use unac\n"
" -C [charset] : input charset\n"
" -S [stopfile] : stopfile to use for commongrams\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n"
;
@@ -833,15 +848,18 @@ Usage(void)
static int op_flags;
#define OPT_s 0x1
#define OPT_w 0x2
#define OPT_S 0x4
#define OPT_q 0x4
#define OPT_c 0x8
#define OPT_k 0x10
#define OPT_C 0x20
#define OPT_n 0x40
#define OPT_S 0x80
#define OPT_u 0x100

int main(int argc, char **argv)
{
string charset;
string charset, stopfile;

thisprog = argv[0];
argc--; argv++;

@@ -858,8 +876,12 @@ int main(int argc, char **argv)
goto b1;
case 'k': op_flags |= OPT_k; break;
case 'n': op_flags |= OPT_n; break;
case 'q': op_flags |= OPT_q; break;
case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break;
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
stopfile = *(++argv); argc--;
goto b1;
case 'u': op_flags |= OPT_u; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
@@ -879,6 +901,13 @@ int main(int argc, char **argv)
if (op_flags & OPT_n)
TextSplit::noNumbers();

Rcl::StopList stoplist;
if (op_flags & OPT_S) {
if (!stoplist.setFile(stopfile)) {
cerr << "Can't read stopfile: " << stopfile << endl;
exit(1);
}
}
string odata, reason;
if (argc == 1) {
const char *filename = *argv++; argc--;
@@ -912,10 +941,25 @@ int main(int argc, char **argv)
int n = TextSplit::countWords(data, flags);
cout << n << " words" << endl;
} else {
myTextSplit splitter(flags);
if (op_flags&OPT_S)
splitter.setNoOut(true);
myTermProc printproc;

Rcl::TermProc *nxt = &printproc;

Rcl::TermProcCommongrams commonproc(nxt, stoplist);
if (op_flags & OPT_S)
nxt = &commonproc;

Rcl::TermProcPrep preproc(nxt);
if (op_flags & OPT_u)
nxt = &preproc;

Rcl::TextSplitP splitter(nxt, flags);

if (op_flags & OPT_q)
printproc.setNoOut(true);

splitter.text_to_words(data);

}
}
#endif // TEST
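As a usage note for the reworked test driver above (binary name as given in its own usage string; file names here are placeholders): something like `textsplit -u -S mystops.txt myfile.txt` runs the split terms through the unac/lowercase stage and the commongrams stage built from the given stopfile, while `textsplit -q myfile.txt` just suppresses term output.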
@@ -897,8 +897,9 @@ private:
// Reimplement text_to_words to insert the begin and end anchor terms.
bool TextSplitDb::text_to_words(const string &in)
{
LOGDEB2(("TextSplitDb::text_to_words\n"));
bool ret = false;
string ermsg;

try {
// Index the possibly prefixed start term.
doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
@@ -906,14 +907,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false;
goto out;
}

if (!TextSplitP::text_to_words(in)) {
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
basepos += curpos + 100;
return false;
goto out;
}

try {
@@ -923,10 +922,12 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false;
goto out;
}

ret = true;

out:
basepos += curpos + 100;
return true;
}
@@ -961,6 +962,7 @@ public:
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false;
}

private:
TextSplitDb *m_ts;
};
@@ -1028,12 +1030,17 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
Doc doc = idoc;

Xapian::Document newdocument;

// The term processing pipeline:
TermProcIdx tpidx;
// TermProcStop tpstop(&tpidx, m_stops);
TermProcCommongrams tpstop(&tpidx, m_stops);
TermProcPrep tpprep(&tpstop);
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
TermProc *nxt = &tpidx;
TermProcStop tpstop(nxt, m_stops); nxt = &tpstop;
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
TermProcPrep tpprep(nxt); nxt = &tpprep;

TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
tpidx.setTSD(&splitter);

// Split and index file name as document term(s)
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
if (!splitter.text_to_words(doc.utf8fn))
@@ -478,7 +478,8 @@ void SearchData::getUTerms(vector<string>& terms) const
class TextSplitQ : public TextSplitP {
public:
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
: TextSplitP(prc, flags),
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
{}

bool takeword(const std::string &term, int pos, int bs, int be)
@@ -509,16 +510,30 @@ public:
bool takeword(const std::string &term, int pos, int bs, int be)
{
m_ts->alltermcount++;
m_ts->lastpos = pos;
if (m_ts->lastpos < pos)
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
term.c_str(), noexpand));
m_ts->terms.push_back(term);
m_ts->nostemexps.push_back(noexpand);
LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) {
m_terms[pos] = term;
m_nste[pos] = noexpand;
}
return true;
}
bool flush()
{
for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) {
m_ts->terms.push_back(it->second);
m_ts->nostemexps.push_back(m_nste[it->first]);
}
return true;
}
private:
TextSplitQ *m_ts;
map<int, string> m_terms;
map<int, bool> m_nste;
};

// A class used to translate a user compound string (*not* a query
@@ -783,7 +798,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,

// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->lastpos + 1 + slack);
@@ -839,7 +854,7 @@ bool StringToXapianQ::processUserString(const string &iq,
bool useNear
)
{
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
ermsg.erase();
m_uterms.clear();
m_terms.clear();
@@ -874,45 +889,35 @@ bool StringToXapianQ::processUserString(const string &iq,
// We used to do word split, searching for
// "term0 term1 term2" instead, which may have worse
// performance, but will succeed.
// We now adjust the phrase/near slack by the term count
// difference (this is mainly better for cjk where this is a very
// common occurrence because of the ngrams thing.
// We now adjust the phrase/near slack by comparing the term count
// and the last position

// The term processing pipeline:
TermProcQ tpq;
// TermProcStop tpstop(&tpidx, stops);
TermProcCommongrams tpstop(&tpq, stops);
tpstop.onlygrams(true);
TermProcPrep tpprep(&tpstop);
TermProc *nxt = &tpq;
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true);
TermProcPrep tpprep(nxt); nxt = &tpprep;

TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterS);
splitterS.text_to_words(*it);
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterW);
tpstop.onlygrams(false);
splitterW.text_to_words(*it);
stops, nxt);
tpq.setTSQ(&splitter);
splitter.text_to_words(*it);

if (splitterS.terms.size() > 1 &&
splitterS.terms.size() != splitterW.terms.size()) {
slack += splitterW.terms.size() - splitterS.terms.size();
}
slack += splitter.lastpos - splitter.terms.size() + 1;

TextSplitQ *splitter = &splitterS;
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
switch (splitter->terms.size() + terminc) {
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
switch (splitter.terms.size() + terminc) {
case 0:
continue;// ??
case 1:
processSimpleSpan(splitter->terms.front(),
splitter->nostemexps.front(), pqueries);
processSimpleSpan(splitter.terms.front(),
splitter.nostemexps.front(), pqueries);
break;
default:
processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
}
}
} catch (const Xapian::Error &e) {
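A short worked example for the slack adjustment in the searchdata hunk above, which replaces the old span-splitter vs word-splitter term-count comparison: TermProcQ now keeps one (longest) term per position and flushes them in order, so any position left empty widens the PHRASE/NEAR window. If, for instance, terms survive only at positions 0, 1 and 4, then splitter.lastpos is 4 and splitter.terms.size() is 3, so slack grows by 4 - 3 + 1 = 2, one unit for each of the two empty positions (2 and 3).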
@@ -66,10 +66,10 @@ private:
};

/**
* Intermediary specialized texsplit class: this will probably replace the base
* textsplit when we've converted all the code. The takeword() routine in this
* calls a TextProc's instead of being specialized in a derived class by the
* user module. The text_to_word() method also takes care of flushing.
* Specialized TextSplit class: this will probably replace the base
* TextSplit when we've converted all the code. The takeword() routine in this
* calls a TermProc's instead of being overriden in a user derived class.
* The text_to_word() method also takes care of flushing.
*/
class TextSplitP : public TextSplit {
public:
@@ -99,18 +99,39 @@ private:
/** Unaccent and lowercase term. This is usually the first in the pipeline */
class TermProcPrep : public TermProc {
public:
TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) {}

virtual bool takeword(const string& itrm, int pos, int bs, int be)
{
m_totalterms++;
string otrm;
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
// We don't generate a fatal error because of a bad term
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
m_unacerrors++;
// We don't generate a fatal error because of a bad term,
// but one has to put the limit somewhere
if (m_unacerrors > 500 &&
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
// More than 1 error for every other term
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
m_unacerrors, m_totalterms));
return false;
}
return true;
}
return TermProc::takeword(otrm, pos, bs, be);
}

virtual bool flush()
{
m_totalterms = m_unacerrors = 0;
return TermProc::flush();
}

private:
int m_totalterms;
int m_unacerrors;
};

/** Compare to stop words list and discard if match found */
@@ -119,19 +140,23 @@ public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops) {}

virtual bool takeword(const string& term, int pos, int bts, int bte)
virtual bool takeword(const string& term, int pos, int bs, int be)
{
if (m_stops.isStop(term)) {
return true;
}
return TermProc::takeword(term, pos, bts, bte);
return TermProc::takeword(term, pos, bs, be);
}

private:
const Rcl::StopList& m_stops;
};

/** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches.
* NOTE: This does not currently work because of bad interaction with the
* spans (ie john@domain.com) generation in textsplit. Not used, kept for
* testing only
*/
class TermProcCommongrams : public TermProc {
public:
@@ -147,7 +172,7 @@ public:

if (!m_prevterm.empty() && (m_prevstop || isstop)) {
// create 2-gram. space unnecessary but improves
// lisibility of queries
// the readability of queries
string twogram;
twogram.swap(m_prevterm);
twogram.append(1, ' ');
@@ -164,7 +189,7 @@ public:
}
#endif
}

m_prevterm = term;
m_prevstop = isstop;
m_prevpos = pos;
@@ -181,7 +206,7 @@ public:
return true;
}

bool flush()
virtual bool flush()
{
if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
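To make the discard condition added to TermProcPrep above easier to read, the same test restated as a standalone predicate (a sketch for illustration only; tooManyUnacErrors is not a function in the source):

    // True when unac failures should make us give up on the document:
    // more than 500 failures, and failures account for more than half
    // of the terms seen so far (totalterms/errors < 2).
    static bool tooManyUnacErrors(int totalterms, int unacerrors)
    {
        return unacerrors > 500 &&
            (double(totalterms) / double(unacerrors)) < 2.0;
    }
    // Example: tooManyUnacErrors(1200, 700) is true (1200/700 ~= 1.7),
    // while tooManyUnacErrors(5000, 700) is false.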