Get rid of a few garbage terms during indexing. Set a threshold for conversion errors after which we discard the doc. Stabilize the new termproc pipeline, but with no commongrams for now.
This commit is contained in:
parent
a2c9d2a82b
commit
0860b559ee
@ -164,7 +164,7 @@ bool TextSplit::o_noNumbers = false;
|
|||||||
// Do some checking (the kind which is simpler to do here than in the
|
// Do some checking (the kind which is simpler to do here than in the
|
||||||
// main loop), then send term to our client.
|
// main loop), then send term to our client.
|
||||||
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||||
int btstart, int btend)
|
int btstart, int btend)
|
||||||
{
|
{
|
||||||
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
||||||
|
|
||||||
@ -348,12 +348,14 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WILD:
|
case WILD:
|
||||||
if (m_flags & TXTS_KEEPWILD)
|
if (m_flags & TXTS_KEEPWILD)
|
||||||
goto NORMALCHAR;
|
goto NORMALCHAR;
|
||||||
else
|
else
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '-':
|
case '-':
|
||||||
case '+':
|
case '+':
|
||||||
curspanglue = cc;
|
curspanglue = cc;
|
||||||
@ -381,12 +383,16 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '.':
|
case '.':
|
||||||
case ',':
|
case ',':
|
||||||
|
{
|
||||||
|
// Need a little lookahead here. At worst this gets the end null
|
||||||
|
int nextc = it[it.getCpos()+1];
|
||||||
|
int nextwhat = whatcc(nextc);
|
||||||
if (m_inNumber) {
|
if (m_inNumber) {
|
||||||
// 132.jpg ?
|
// we're eliminating 132.jpg here. Good idea ?
|
||||||
int wn = it[it.getCpos()+1];
|
if (nextwhat != DIGIT && nextc != 'e' && nextc != 'E')
|
||||||
if (whatcc(wn) != DIGIT && wn != 'e' && wn != 'E')
|
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
curspanglue = cc;
|
curspanglue = cc;
|
||||||
@ -398,10 +404,15 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
// Another problem is that something like .x-errs
|
// Another problem is that something like .x-errs
|
||||||
// will be split as .x-errs, x, errs but not x-errs
|
// will be split as .x-errs, x, errs but not x-errs
|
||||||
// A final comma in a word will be removed by doemit
|
// A final comma in a word will be removed by doemit
|
||||||
if (cc == '.' && it[it.getCpos()+1] != '.') {
|
|
||||||
|
// Only letters and digits make sense after
|
||||||
|
if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
|
||||||
|
nextwhat != DIGIT && nextwhat != LETTER)
|
||||||
|
goto SPACE;
|
||||||
|
|
||||||
|
if (cc == '.') {
|
||||||
// Check for number like .1
|
// Check for number like .1
|
||||||
if (m_span.length() == 0 &&
|
if (m_span.length() == 0 && nextwhat == DIGIT) {
|
||||||
whatcc(it[it.getCpos()+1]) == DIGIT) {
|
|
||||||
m_inNumber = true;
|
m_inNumber = true;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
curspanglue = cc;
|
curspanglue = cc;
|
||||||
@ -430,7 +441,9 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '@':
|
case '@':
|
||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
@ -623,8 +636,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// first
|
// first
|
||||||
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
||||||
unsigned int btend = it.getBpos(); // Current char is out
|
unsigned int btend = it.getBpos(); // Current char is out
|
||||||
if (!takeword(it.buffer().substr(boffs[0],
|
if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]),
|
||||||
btend-boffs[0]),
|
|
||||||
m_wordpos - nchars,
|
m_wordpos - nchars,
|
||||||
boffs[0], btend)) {
|
boffs[0], btend)) {
|
||||||
return false;
|
return false;
|
||||||
@ -764,18 +776,19 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
|
|||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "transcode.h"
|
#include "transcode.h"
|
||||||
|
#include "unacpp.h"
|
||||||
|
#include "termproc.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
class myTextSplit : public TextSplit {
|
class myTermProc : public Rcl::TermProc {
|
||||||
int first;
|
int first;
|
||||||
bool nooutput;
|
bool nooutput;
|
||||||
public:
|
public:
|
||||||
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
|
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
||||||
TextSplit(flags),first(1), nooutput(false)
|
|
||||||
{}
|
|
||||||
void setNoOut(bool val) {nooutput = val;}
|
void setNoOut(bool val) {nooutput = val;}
|
||||||
bool takeword(const string &term, int pos, int bs, int be) {
|
virtual bool takeword(const string &term, int pos, int bs, int be)
|
||||||
|
{
|
||||||
if (nooutput)
|
if (nooutput)
|
||||||
return true;
|
return true;
|
||||||
FILE *fp = stdout;
|
FILE *fp = stdout;
|
||||||
@ -812,13 +825,15 @@ static string thisprog;
|
|||||||
|
|
||||||
static string usage =
|
static string usage =
|
||||||
" textsplit [opts] [filename]\n"
|
" textsplit [opts] [filename]\n"
|
||||||
" -S: no output\n"
|
" -q : no output\n"
|
||||||
" -s: only spans\n"
|
" -s : only spans\n"
|
||||||
" -w: only words\n"
|
" -w : only words\n"
|
||||||
" -n: no numbers\n"
|
" -n : no numbers\n"
|
||||||
" -k: preserve wildcards (?*)\n"
|
" -k : preserve wildcards (?*)\n"
|
||||||
" -c: just count words\n"
|
" -c : just count words\n"
|
||||||
|
" -u : use unac\n"
|
||||||
" -C [charset] : input charset\n"
|
" -C [charset] : input charset\n"
|
||||||
|
" -S [stopfile] : stopfile to use for commongrams\n"
|
||||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
||||||
" \n\n"
|
" \n\n"
|
||||||
;
|
;
|
||||||
@ -833,15 +848,18 @@ Usage(void)
|
|||||||
static int op_flags;
|
static int op_flags;
|
||||||
#define OPT_s 0x1
|
#define OPT_s 0x1
|
||||||
#define OPT_w 0x2
|
#define OPT_w 0x2
|
||||||
#define OPT_S 0x4
|
#define OPT_q 0x4
|
||||||
#define OPT_c 0x8
|
#define OPT_c 0x8
|
||||||
#define OPT_k 0x10
|
#define OPT_k 0x10
|
||||||
#define OPT_C 0x20
|
#define OPT_C 0x20
|
||||||
#define OPT_n 0x40
|
#define OPT_n 0x40
|
||||||
|
#define OPT_S 0x80
|
||||||
|
#define OPT_u 0x100
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
string charset;
|
string charset, stopfile;
|
||||||
|
|
||||||
thisprog = argv[0];
|
thisprog = argv[0];
|
||||||
argc--; argv++;
|
argc--; argv++;
|
||||||
|
|
||||||
@ -858,8 +876,12 @@ int main(int argc, char **argv)
|
|||||||
goto b1;
|
goto b1;
|
||||||
case 'k': op_flags |= OPT_k; break;
|
case 'k': op_flags |= OPT_k; break;
|
||||||
case 'n': op_flags |= OPT_n; break;
|
case 'n': op_flags |= OPT_n; break;
|
||||||
|
case 'q': op_flags |= OPT_q; break;
|
||||||
case 's': op_flags |= OPT_s; break;
|
case 's': op_flags |= OPT_s; break;
|
||||||
case 'S': op_flags |= OPT_S; break;
|
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
|
||||||
|
stopfile = *(++argv); argc--;
|
||||||
|
goto b1;
|
||||||
|
case 'u': op_flags |= OPT_u; break;
|
||||||
case 'w': op_flags |= OPT_w; break;
|
case 'w': op_flags |= OPT_w; break;
|
||||||
default: Usage(); break;
|
default: Usage(); break;
|
||||||
}
|
}
|
||||||
@ -879,6 +901,13 @@ int main(int argc, char **argv)
|
|||||||
if (op_flags & OPT_n)
|
if (op_flags & OPT_n)
|
||||||
TextSplit::noNumbers();
|
TextSplit::noNumbers();
|
||||||
|
|
||||||
|
Rcl::StopList stoplist;
|
||||||
|
if (op_flags & OPT_S) {
|
||||||
|
if (!stoplist.setFile(stopfile)) {
|
||||||
|
cerr << "Can't read stopfile: " << stopfile << endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
string odata, reason;
|
string odata, reason;
|
||||||
if (argc == 1) {
|
if (argc == 1) {
|
||||||
const char *filename = *argv++; argc--;
|
const char *filename = *argv++; argc--;
|
||||||
@ -912,10 +941,25 @@ int main(int argc, char **argv)
|
|||||||
int n = TextSplit::countWords(data, flags);
|
int n = TextSplit::countWords(data, flags);
|
||||||
cout << n << " words" << endl;
|
cout << n << " words" << endl;
|
||||||
} else {
|
} else {
|
||||||
myTextSplit splitter(flags);
|
myTermProc printproc;
|
||||||
if (op_flags&OPT_S)
|
|
||||||
splitter.setNoOut(true);
|
Rcl::TermProc *nxt = &printproc;
|
||||||
|
|
||||||
|
Rcl::TermProcCommongrams commonproc(nxt, stoplist);
|
||||||
|
if (op_flags & OPT_S)
|
||||||
|
nxt = &commonproc;
|
||||||
|
|
||||||
|
Rcl::TermProcPrep preproc(nxt);
|
||||||
|
if (op_flags & OPT_u)
|
||||||
|
nxt = &preproc;
|
||||||
|
|
||||||
|
Rcl::TextSplitP splitter(nxt, flags);
|
||||||
|
|
||||||
|
if (op_flags & OPT_q)
|
||||||
|
printproc.setNoOut(true);
|
||||||
|
|
||||||
splitter.text_to_words(data);
|
splitter.text_to_words(data);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // TEST
|
#endif // TEST
|
||||||
|
|||||||
@ -897,8 +897,9 @@ private:
|
|||||||
// Reimplement text_to_words to insert the begin and end anchor terms.
|
// Reimplement text_to_words to insert the begin and end anchor terms.
|
||||||
bool TextSplitDb::text_to_words(const string &in)
|
bool TextSplitDb::text_to_words(const string &in)
|
||||||
{
|
{
|
||||||
LOGDEB2(("TextSplitDb::text_to_words\n"));
|
bool ret = false;
|
||||||
string ermsg;
|
string ermsg;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Index the possibly prefixed start term.
|
// Index the possibly prefixed start term.
|
||||||
doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
|
doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
|
||||||
@ -906,14 +907,12 @@ bool TextSplitDb::text_to_words(const string &in)
|
|||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
||||||
basepos += curpos + 100;
|
goto out;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!TextSplitP::text_to_words(in)) {
|
if (!TextSplitP::text_to_words(in)) {
|
||||||
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
|
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
|
||||||
basepos += curpos + 100;
|
goto out;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -923,10 +922,12 @@ bool TextSplitDb::text_to_words(const string &in)
|
|||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
||||||
basepos += curpos + 100;
|
goto out;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret = true;
|
||||||
|
|
||||||
|
out:
|
||||||
basepos += curpos + 100;
|
basepos += curpos + 100;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -961,6 +962,7 @@ public:
|
|||||||
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
TextSplitDb *m_ts;
|
TextSplitDb *m_ts;
|
||||||
};
|
};
|
||||||
@ -1028,12 +1030,17 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
Doc doc = idoc;
|
Doc doc = idoc;
|
||||||
|
|
||||||
Xapian::Document newdocument;
|
Xapian::Document newdocument;
|
||||||
|
|
||||||
|
// The term processing pipeline:
|
||||||
TermProcIdx tpidx;
|
TermProcIdx tpidx;
|
||||||
// TermProcStop tpstop(&tpidx, m_stops);
|
TermProc *nxt = &tpidx;
|
||||||
TermProcCommongrams tpstop(&tpidx, m_stops);
|
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
||||||
TermProcPrep tpprep(&tpstop);
|
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||||
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
|
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
||||||
|
|
||||||
|
TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
|
||||||
tpidx.setTSD(&splitter);
|
tpidx.setTSD(&splitter);
|
||||||
|
|
||||||
// Split and index file name as document term(s)
|
// Split and index file name as document term(s)
|
||||||
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
||||||
if (!splitter.text_to_words(doc.utf8fn))
|
if (!splitter.text_to_words(doc.utf8fn))
|
||||||
|
|||||||
@ -478,7 +478,8 @@ void SearchData::getUTerms(vector<string>& terms) const
|
|||||||
class TextSplitQ : public TextSplitP {
|
class TextSplitQ : public TextSplitP {
|
||||||
public:
|
public:
|
||||||
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
|
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
|
||||||
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
|
: TextSplitP(prc, flags),
|
||||||
|
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
bool takeword(const std::string &term, int pos, int bs, int be)
|
bool takeword(const std::string &term, int pos, int bs, int be)
|
||||||
@ -509,16 +510,30 @@ public:
|
|||||||
bool takeword(const std::string &term, int pos, int bs, int be)
|
bool takeword(const std::string &term, int pos, int bs, int be)
|
||||||
{
|
{
|
||||||
m_ts->alltermcount++;
|
m_ts->alltermcount++;
|
||||||
m_ts->lastpos = pos;
|
if (m_ts->lastpos < pos)
|
||||||
|
m_ts->lastpos = pos;
|
||||||
bool noexpand = be ? m_ts->curnostemexp : true;
|
bool noexpand = be ? m_ts->curnostemexp : true;
|
||||||
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
|
LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
|
||||||
term.c_str(), noexpand));
|
term.c_str(), pos, noexpand));
|
||||||
m_ts->terms.push_back(term);
|
if (m_terms[pos].size() < term.size()) {
|
||||||
m_ts->nostemexps.push_back(noexpand);
|
m_terms[pos] = term;
|
||||||
|
m_nste[pos] = noexpand;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool flush()
|
||||||
|
{
|
||||||
|
for (map<int, string>::const_iterator it = m_terms.begin();
|
||||||
|
it != m_terms.end(); it++) {
|
||||||
|
m_ts->terms.push_back(it->second);
|
||||||
|
m_ts->nostemexps.push_back(m_nste[it->first]);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
TextSplitQ *m_ts;
|
TextSplitQ *m_ts;
|
||||||
|
map<int, string> m_terms;
|
||||||
|
map<int, bool> m_nste;
|
||||||
};
|
};
|
||||||
|
|
||||||
// A class used to translate a user compound string (*not* a query
|
// A class used to translate a user compound string (*not* a query
|
||||||
@ -783,7 +798,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||||||
|
|
||||||
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
||||||
// For phrases, give a relevance boost like we do for original terms
|
// For phrases, give a relevance boost like we do for original terms
|
||||||
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
||||||
splitData->alltermcount, splitData->lastpos));
|
splitData->alltermcount, splitData->lastpos));
|
||||||
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
||||||
splitData->lastpos + 1 + slack);
|
splitData->lastpos + 1 + slack);
|
||||||
@ -839,7 +854,7 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
bool useNear
|
bool useNear
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
|
||||||
ermsg.erase();
|
ermsg.erase();
|
||||||
m_uterms.clear();
|
m_uterms.clear();
|
||||||
m_terms.clear();
|
m_terms.clear();
|
||||||
@ -874,45 +889,35 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
// We used to do word split, searching for
|
// We used to do word split, searching for
|
||||||
// "term0 term1 term2" instead, which may have worse
|
// "term0 term1 term2" instead, which may have worse
|
||||||
// performance, but will succeed.
|
// performance, but will succeed.
|
||||||
// We now adjust the phrase/near slack by the term count
|
// We now adjust the phrase/near slack by comparing the term count
|
||||||
// difference (this is mainly better for cjk where this is a very
|
// and the last position
|
||||||
// common occurrence because of the ngrams thing.
|
|
||||||
|
|
||||||
|
// The term processing pipeline:
|
||||||
TermProcQ tpq;
|
TermProcQ tpq;
|
||||||
// TermProcStop tpstop(&tpidx, stops);
|
TermProc *nxt = &tpq;
|
||||||
TermProcCommongrams tpstop(&tpq, stops);
|
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
||||||
tpstop.onlygrams(true);
|
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
||||||
TermProcPrep tpprep(&tpstop);
|
//tpcommon.onlygrams(true);
|
||||||
|
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
||||||
|
|
||||||
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||||
TextSplit::TXTS_KEEPWILD),
|
TextSplit::TXTS_KEEPWILD),
|
||||||
stops, &tpprep);
|
stops, nxt);
|
||||||
tpq.setTSQ(&splitterS);
|
tpq.setTSQ(&splitter);
|
||||||
splitterS.text_to_words(*it);
|
splitter.text_to_words(*it);
|
||||||
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
|
|
||||||
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
|
||||||
TextSplit::TXTS_KEEPWILD),
|
|
||||||
stops, &tpprep);
|
|
||||||
tpq.setTSQ(&splitterW);
|
|
||||||
tpstop.onlygrams(false);
|
|
||||||
splitterW.text_to_words(*it);
|
|
||||||
|
|
||||||
if (splitterS.terms.size() > 1 &&
|
slack += splitter.lastpos - splitter.terms.size() + 1;
|
||||||
splitterS.terms.size() != splitterW.terms.size()) {
|
|
||||||
slack += splitterW.terms.size() - splitterS.terms.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
TextSplitQ *splitter = &splitterS;
|
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
|
||||||
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
|
switch (splitter.terms.size() + terminc) {
|
||||||
switch (splitter->terms.size() + terminc) {
|
|
||||||
case 0:
|
case 0:
|
||||||
continue;// ??
|
continue;// ??
|
||||||
case 1:
|
case 1:
|
||||||
processSimpleSpan(splitter->terms.front(),
|
processSimpleSpan(splitter.terms.front(),
|
||||||
splitter->nostemexps.front(), pqueries);
|
splitter.nostemexps.front(), pqueries);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
|
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
|
|||||||
@ -66,10 +66,10 @@ private:
|
|||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Intermediary specialized texsplit class: this will probably replace the base
|
* Specialized TextSplit class: this will probably replace the base
|
||||||
* textsplit when we've converted all the code. The takeword() routine in this
|
* TextSplit when we've converted all the code. The takeword() routine in this
|
||||||
* calls a TextProc's instead of being specialized in a derived class by the
|
* calls a TermProc's takeword() instead of being overridden in a user-derived class.
|
||||||
* user module. The text_to_word() method also takes care of flushing.
|
* The text_to_words() method also takes care of flushing.
|
||||||
*/
|
*/
|
||||||
class TextSplitP : public TextSplit {
|
class TextSplitP : public TextSplit {
|
||||||
public:
|
public:
|
||||||
@ -99,18 +99,39 @@ private:
|
|||||||
/** Unaccent and lowercase term. This is usually the first in the pipeline */
|
/** Unaccent and lowercase term. This is usually the first in the pipeline */
|
||||||
class TermProcPrep : public TermProc {
|
class TermProcPrep : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
|
TermProcPrep(TermProc *nxt)
|
||||||
|
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) {}
|
||||||
|
|
||||||
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
||||||
{
|
{
|
||||||
|
m_totalterms++;
|
||||||
string otrm;
|
string otrm;
|
||||||
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
|
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
|
||||||
LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
||||||
// We don't generate a fatal error because of a bad term
|
m_unacerrors++;
|
||||||
|
// We don't generate a fatal error because of a bad term,
|
||||||
|
// but one has to put the limit somewhere
|
||||||
|
if (m_unacerrors > 500 &&
|
||||||
|
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
||||||
|
// More than 500 failures, and more than one failure for every two terms
|
||||||
|
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
|
||||||
|
m_unacerrors, m_totalterms));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return TermProc::takeword(otrm, pos, bs, be);
|
return TermProc::takeword(otrm, pos, bs, be);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual bool flush()
|
||||||
|
{
|
||||||
|
m_totalterms = m_unacerrors = 0;
|
||||||
|
return TermProc::flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
int m_totalterms;
|
||||||
|
int m_unacerrors;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Compare to stop words list and discard if match found */
|
/** Compare to stop words list and discard if match found */
|
||||||
@ -119,19 +140,23 @@ public:
|
|||||||
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
||||||
: TermProc(nxt), m_stops(stops) {}
|
: TermProc(nxt), m_stops(stops) {}
|
||||||
|
|
||||||
virtual bool takeword(const string& term, int pos, int bts, int bte)
|
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||||
{
|
{
|
||||||
if (m_stops.isStop(term)) {
|
if (m_stops.isStop(term)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return TermProc::takeword(term, pos, bts, bte);
|
return TermProc::takeword(term, pos, bs, be);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const Rcl::StopList& m_stops;
|
const Rcl::StopList& m_stops;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Handle common-gram generation: combine frequent terms with neighbours to
|
/** Handle common-gram generation: combine frequent terms with neighbours to
|
||||||
* shorten the positions lists for phrase searches.
|
* shorten the positions lists for phrase searches.
|
||||||
|
* NOTE: This does not currently work because of bad interaction with the
|
||||||
|
* spans (ie john@domain.com) generation in textsplit. Not used, kept for
|
||||||
|
* testing only
|
||||||
*/
|
*/
|
||||||
class TermProcCommongrams : public TermProc {
|
class TermProcCommongrams : public TermProc {
|
||||||
public:
|
public:
|
||||||
@ -147,7 +172,7 @@ public:
|
|||||||
|
|
||||||
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
||||||
// create 2-gram. space unnecessary but improves
|
// create 2-gram. space unnecessary but improves
|
||||||
// lisibility of queries
|
// the readability of queries
|
||||||
string twogram;
|
string twogram;
|
||||||
twogram.swap(m_prevterm);
|
twogram.swap(m_prevterm);
|
||||||
twogram.append(1, ' ');
|
twogram.append(1, ' ');
|
||||||
@ -164,7 +189,7 @@ public:
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
m_prevterm = term;
|
m_prevterm = term;
|
||||||
m_prevstop = isstop;
|
m_prevstop = isstop;
|
||||||
m_prevpos = pos;
|
m_prevpos = pos;
|
||||||
@ -181,7 +206,7 @@ public:
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool flush()
|
virtual bool flush()
|
||||||
{
|
{
|
||||||
if (!m_prevsent && !m_prevterm.empty())
|
if (!m_prevsent && !m_prevterm.empty())
|
||||||
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user