cosmetics: use derived class for actual splitter instead of callback
This commit is contained in:
parent
90a8280f21
commit
8b2b00bc72
@ -186,7 +186,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||
}
|
||||
}
|
||||
if (pos != m_prevpos || l != m_prevlen) {
|
||||
bool ret = m_cb->takeword(w, pos, btstart, btend);
|
||||
bool ret = takeword(w, pos, btstart, btend);
|
||||
m_prevpos = pos;
|
||||
m_prevlen = w.length();
|
||||
return ret;
|
||||
@ -558,7 +558,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
||||
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
||||
for (unsigned int i = loopbeg; i < loopend; i++) {
|
||||
if (!m_cb->takeword(it.buffer().substr(boffs[i],
|
||||
if (!takeword(it.buffer().substr(boffs[i],
|
||||
btend-boffs[i]),
|
||||
m_wordpos - (nchars-i-1), boffs[i], btend)) {
|
||||
return false;
|
||||
@ -579,7 +579,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
// first
|
||||
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
||||
unsigned int btend = it.getBpos(); // Current char is out
|
||||
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
||||
if (!takeword(it.buffer().substr(boffs[0],
|
||||
btend-boffs[0]),
|
||||
m_wordpos - nchars,
|
||||
boffs[0], btend)) {
|
||||
@ -595,12 +595,12 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Callback class for countWords
|
||||
class utSplitterCB : public TextSplitCB {
|
||||
// Specialization for countWords
|
||||
class TextSplitCW : public TextSplit {
|
||||
public:
|
||||
int wcnt;
|
||||
utSplitterCB() : wcnt(0) {}
|
||||
bool takeword(const string &term, int pos, int bs, int be) {
|
||||
TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
|
||||
bool takeword(const string &, int, int, int) {
|
||||
wcnt++;
|
||||
return true;
|
||||
}
|
||||
@ -608,10 +608,9 @@ class utSplitterCB : public TextSplitCB {
|
||||
|
||||
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
||||
{
|
||||
utSplitterCB cb;
|
||||
TextSplit splitter(&cb, flgs);
|
||||
TextSplitCW splitter(flgs);
|
||||
splitter.text_to_words(s);
|
||||
return cb.wcnt;
|
||||
return splitter.wcnt;
|
||||
}
|
||||
|
||||
bool TextSplit::hasVisibleWhite(const string &in)
|
||||
@ -726,12 +725,13 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
|
||||
|
||||
using namespace std;
|
||||
|
||||
// A small class to hold state while splitting text
|
||||
class mySplitterCB : public TextSplitCB {
|
||||
class myTextSplit : public TextSplit {
|
||||
int first;
|
||||
bool nooutput;
|
||||
public:
|
||||
mySplitterCB() : first(1), nooutput(false) {}
|
||||
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
|
||||
TextSplit(flags),first(1), nooutput(false)
|
||||
{}
|
||||
void setNoOut(bool val) {nooutput = val;}
|
||||
bool takeword(const string &term, int pos, int bs, int be) {
|
||||
if (nooutput)
|
||||
@ -821,12 +821,8 @@ int main(int argc, char **argv)
|
||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||
DebugLog::setfilename("stderr");
|
||||
|
||||
mySplitterCB cb;
|
||||
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
||||
|
||||
if (op_flags&OPT_S)
|
||||
cb.setNoOut(true);
|
||||
|
||||
if (op_flags&OPT_s)
|
||||
flags = TextSplit::TXTS_ONLYSPANS;
|
||||
else if (op_flags&OPT_w)
|
||||
@ -867,7 +863,9 @@ int main(int argc, char **argv)
|
||||
int n = TextSplit::countWords(data, flags);
|
||||
cout << n << " words" << endl;
|
||||
} else {
|
||||
TextSplit splitter(&cb, flags);
|
||||
myTextSplit splitter(flags);
|
||||
if (op_flags&OPT_S)
|
||||
splitter.setNoOut(true);
|
||||
splitter.text_to_words(data);
|
||||
}
|
||||
}
|
||||
|
||||
@ -25,19 +25,6 @@ using std::string;
|
||||
using std::list;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Function class whose takeword method is called for every detected word while splitting text.
|
||||
*/
|
||||
class TextSplitCB {
|
||||
public:
|
||||
virtual ~TextSplitCB() {}
|
||||
virtual bool takeword(const string& term,
|
||||
int pos, // term pos
|
||||
int bts, // byte offset of first char in term
|
||||
int bte // byte offset of first char after term
|
||||
) = 0;
|
||||
};
|
||||
|
||||
class Utf8Iter;
|
||||
|
||||
|
||||
@ -67,20 +54,25 @@ public:
|
||||
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
||||
};
|
||||
|
||||
/**
|
||||
* Constructor: just store callback object
|
||||
*/
|
||||
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
|
||||
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
||||
m_prevpos(-1)
|
||||
|
||||
TextSplit(Flags flags = Flags(TXTS_NONE))
|
||||
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
||||
{
|
||||
}
|
||||
virtual ~TextSplit() {}
|
||||
|
||||
/** Split text, emit words and positions. */
|
||||
bool text_to_words(const string &in);
|
||||
|
||||
// Utility functions: these do not need the user to set up a callback
|
||||
// etc.
|
||||
/** Process one output word: to be implemented by the actual user class */
|
||||
virtual bool takeword(const string& term,
|
||||
int pos, // term pos
|
||||
int bts, // byte offset of first char in term
|
||||
int bte // byte offset of first char after term
|
||||
) = 0;
|
||||
|
||||
|
||||
// Static utility functions:
|
||||
|
||||
/** Count words in string, as the splitter would generate them */
|
||||
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
||||
@ -102,7 +94,6 @@ public:
|
||||
|
||||
private:
|
||||
Flags m_flags;
|
||||
TextSplitCB *m_cb;
|
||||
int m_maxWordLength;
|
||||
|
||||
// Current span. Might be jf.dockes@wanadoo.f
|
||||
@ -132,5 +123,4 @@ private:
|
||||
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
||||
};
|
||||
|
||||
|
||||
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
||||
|
||||
@ -58,15 +58,15 @@ static string vecStringToString(const vector<string>& t)
|
||||
|
||||
// Text splitter callback used to take note of the position of query terms
|
||||
// inside the result text. This is then used to insert highlight tags.
|
||||
class myTextSplitCB : public TextSplitCB {
|
||||
class TextSplitPTR : public TextSplit {
|
||||
public:
|
||||
|
||||
// Out: begin and end byte positions of query terms/groups in text
|
||||
vector<pair<int, int> > tboffs;
|
||||
|
||||
myTextSplitCB(const vector<string>& its,
|
||||
const vector<vector<string> >&groups,
|
||||
const vector<int>& slacks)
|
||||
TextSplitPTR(const vector<string>& its,
|
||||
const vector<vector<string> >&groups,
|
||||
const vector<int>& slacks)
|
||||
: m_wcount(0), m_groups(groups), m_slacks(slacks)
|
||||
{
|
||||
for (vector<string>::const_iterator it = its.begin();
|
||||
@ -86,7 +86,8 @@ class myTextSplitCB : public TextSplitCB {
|
||||
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
||||
string dumb;
|
||||
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
||||
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
|
||||
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
|
||||
term.c_str()));
|
||||
return true;
|
||||
}
|
||||
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
||||
@ -186,9 +187,9 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
|
||||
}
|
||||
|
||||
// Check if there is a NEAR match for the group of terms
|
||||
bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
|
||||
{
|
||||
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
|
||||
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
||||
vecStringToString(terms).c_str()));
|
||||
|
||||
// The position lists we are going to work with. We extract them from the
|
||||
@ -207,7 +208,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
it != terms.end(); it++) {
|
||||
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
||||
if (pl == m_plists.end()) {
|
||||
LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
|
||||
(*it).c_str()));
|
||||
continue;
|
||||
}
|
||||
@ -215,10 +216,10 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
plistToTerm[&(pl->second)] = *it;
|
||||
realgroup.push_back(*it);
|
||||
}
|
||||
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n",
|
||||
LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n",
|
||||
window, vecStringToString(realgroup).c_str()));
|
||||
if (plists.size() < 2) {
|
||||
LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
|
||||
return false;
|
||||
}
|
||||
// Sort the positions lists so that the shorter is first
|
||||
@ -243,7 +244,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
int sta = int(10E9), sto = 0;
|
||||
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
||||
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
|
||||
LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n",
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
||||
sta, sto));
|
||||
// Maybe extend the window by 1st term position, this was not
|
||||
// done by do_prox..
|
||||
@ -253,7 +254,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
||||
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
||||
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
||||
LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
|
||||
i1->second.first, i2->second.second));
|
||||
tboffs.push_back(pair<int, int>(i1->second.first,
|
||||
i2->second.second));
|
||||
@ -278,7 +279,7 @@ public:
|
||||
};
|
||||
|
||||
// Do the phrase match thing, then merge the highlight lists
|
||||
bool myTextSplitCB::matchGroups()
|
||||
bool TextSplitPTR::matchGroups()
|
||||
{
|
||||
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
||||
vector<int>::const_iterator sit = m_slacks.begin();
|
||||
@ -333,15 +334,14 @@ bool PlainToRich::plaintorich(const string& in,
|
||||
// Compute the positions for the query terms. We use the text
|
||||
// splitter to break the text into words, and compare the words to
|
||||
// the search terms,
|
||||
myTextSplitCB cb(terms, groups, slacks);
|
||||
TextSplit splitter(&cb);
|
||||
TextSplitPTR splitter(terms, groups, slacks);
|
||||
// Note: the splitter returns the term locations in byte, not
|
||||
// character, offsets.
|
||||
splitter.text_to_words(in);
|
||||
LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
|
||||
|
||||
// Compute the positions for NEAR and PHRASE groups.
|
||||
cb.matchGroups();
|
||||
splitter.matchGroups();
|
||||
|
||||
out.clear();
|
||||
out.push_back("");
|
||||
@ -353,12 +353,12 @@ bool PlainToRich::plaintorich(const string& in,
|
||||
// Iterator for the list of input term positions. We use it to
|
||||
// output highlight tags and to compute term positions in the
|
||||
// output text
|
||||
vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
|
||||
vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();
|
||||
vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
|
||||
vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
|
||||
|
||||
#if 0
|
||||
for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
|
||||
it != cb.tboffs.end(); it++) {
|
||||
for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
|
||||
it != splitter.tboffs.end(); it++) {
|
||||
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
|
||||
}
|
||||
#endif
|
||||
@ -412,7 +412,7 @@ bool PlainToRich::plaintorich(const string& in,
|
||||
}
|
||||
// Skip all highlight areas that would overlap this one
|
||||
int crend = tPosIt->second;
|
||||
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
|
||||
while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
|
||||
tPosIt++;
|
||||
inrcltag = 0;
|
||||
}
|
||||
|
||||
@ -42,6 +42,7 @@ using namespace std;
|
||||
#include "internfile.h"
|
||||
#include "wipedir.h"
|
||||
#include "transcode.h"
|
||||
#include "textsplit.h"
|
||||
|
||||
bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& idoc)
|
||||
{
|
||||
|
||||
@ -31,6 +31,7 @@ using std::list;
|
||||
#include "smallut.h"
|
||||
#include "rclconfig.h"
|
||||
#include "refcntr.h"
|
||||
#include "textsplit.h"
|
||||
|
||||
Rcl::SearchData *wasaStringToRcl(const string &qs, string &reason)
|
||||
{
|
||||
|
||||
@ -781,16 +781,15 @@ bool Db::fieldToPrefix(const string& fld, string &pfx)
|
||||
}
|
||||
|
||||
|
||||
// The text splitter callback class which receives words from the
|
||||
// splitter and adds postings to the Xapian document.
|
||||
class mySplitterCB : public TextSplitCB {
|
||||
// The splitter breaks text into words and adds postings to the Xapian document.
|
||||
class TextSplitDb : public TextSplit {
|
||||
public:
|
||||
Xapian::Document &doc; // Xapian document
|
||||
Xapian::termpos basepos; // Base for document section
|
||||
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
||||
// following section
|
||||
StopList &stops;
|
||||
mySplitterCB(Xapian::Document &d, StopList &_stops)
|
||||
TextSplitDb(Xapian::Document &d, StopList &_stops)
|
||||
: doc(d), basepos(1), curpos(0), stops(_stops)
|
||||
{}
|
||||
bool takeword(const std::string &term, int pos, int, int);
|
||||
@ -802,15 +801,16 @@ private:
|
||||
string prefix;
|
||||
};
|
||||
|
||||
// Callback for the document to word splitting class during indexation
|
||||
bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
|
||||
// Get one term from the doc, remove accents and lowercase, then add posting
|
||||
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
|
||||
{
|
||||
#if 0
|
||||
LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
|
||||
LOGDEB(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
|
||||
#endif
|
||||
string term;
|
||||
if (!unacmaybefold(_term, term, "UTF-8", true)) {
|
||||
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
|
||||
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
|
||||
_term.c_str()));
|
||||
term.clear();
|
||||
// We don't generate a fatal error because of a bad term
|
||||
return true;
|
||||
@ -892,14 +892,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
Doc doc = idoc;
|
||||
|
||||
Xapian::Document newdocument;
|
||||
mySplitterCB splitData(newdocument, m_stops);
|
||||
TextSplit splitter(&splitData);
|
||||
TextSplitDb splitter(newdocument, m_stops);
|
||||
|
||||
// Split and index file name as document term(s)
|
||||
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
||||
if (!splitter.text_to_words(doc.utf8fn))
|
||||
LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
splitter.basepos += splitter.curpos + 100;
|
||||
|
||||
// Index textual metadata. These are all indexed as text with
|
||||
// positions, as we may want to do phrase searches with them (this
|
||||
@ -919,19 +918,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
||||
meta_it->first.c_str(), pfx.c_str(),
|
||||
meta_it->second.c_str()));
|
||||
splitData.setprefix(pfx); // Subject
|
||||
splitter.setprefix(pfx); // Subject
|
||||
if (!splitter.text_to_words(meta_it->second))
|
||||
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
||||
meta_it->first.c_str()));
|
||||
splitData.setprefix(string());
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
splitter.setprefix(string());
|
||||
splitter.basepos += splitter.curpos + 100;
|
||||
}
|
||||
}
|
||||
|
||||
if (splitData.curpos < baseTextPosition)
|
||||
splitData.basepos = baseTextPosition;
|
||||
if (splitter.curpos < baseTextPosition)
|
||||
splitter.basepos = baseTextPosition;
|
||||
else
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
splitter.basepos += splitter.curpos + 100;
|
||||
|
||||
// Split and index body text
|
||||
LOGDEB2(("Db::add: split body\n"));
|
||||
|
||||
@ -188,25 +188,27 @@ void SearchData::getUTerms(vector<string>& terms) const
|
||||
// phrases. This is for parts of the user entry which would appear as
|
||||
// a single word because there is no white space inside, but are
|
||||
// actually multiple terms to rcldb (ie term1,term2)
|
||||
class wsQData : public TextSplitCB {
|
||||
class TextSplitQ : public TextSplit {
|
||||
public:
|
||||
wsQData(const StopList &_stops)
|
||||
: stops(_stops), alltermcount(0)
|
||||
TextSplitQ(Flags flags, const StopList &_stops)
|
||||
: TextSplit(flags), stops(_stops), alltermcount(0)
|
||||
{}
|
||||
bool takeword(const std::string &interm, int , int, int) {
|
||||
alltermcount++;
|
||||
LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
|
||||
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
|
||||
|
||||
// Check if the first letter is a majuscule in which
|
||||
// case we do not want to do stem expansion. Note that
|
||||
// the test is convoluted and possibly problematic
|
||||
string noacterm, noaclowterm;
|
||||
if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
|
||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
|
||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
||||
interm.c_str()));
|
||||
return true;
|
||||
}
|
||||
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
|
||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
||||
noacterm.c_str()));
|
||||
return true;
|
||||
}
|
||||
bool nostemexp = false;
|
||||
@ -216,7 +218,8 @@ class wsQData : public TextSplitCB {
|
||||
nostemexp = true;
|
||||
|
||||
if (stops.hasStops() && stops.isStop(noaclowterm)) {
|
||||
LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
|
||||
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
|
||||
noaclowterm.c_str()));
|
||||
return true;
|
||||
}
|
||||
terms.push_back(noaclowterm);
|
||||
@ -271,7 +274,7 @@ private:
|
||||
// After splitting entry on whitespace: process non-phrase element
|
||||
void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
|
||||
// Process phrase/near element
|
||||
void processPhraseOrNear(wsQData *splitData,
|
||||
void processPhraseOrNear(TextSplitQ *splitData,
|
||||
list<Xapian::Query> &pqueries,
|
||||
bool useNear, int slack);
|
||||
|
||||
@ -420,7 +423,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
||||
// NEAR xapian query, the elements of which can themselves be OR
|
||||
// queries if the terms get expanded by stemming or wildcards (we
|
||||
// don't do stemming for PHRASE though)
|
||||
void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
|
||||
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
||||
list<Xapian::Query> &pqueries,
|
||||
bool useNear, int slack)
|
||||
{
|
||||
@ -527,31 +530,31 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||
// We now adjust the phrase/near slack by the term count
|
||||
// difference (this is mainly better for cjk where this is a very
|
||||
// common occurrence because of the ngrams thing).
|
||||
wsQData splitDataS(stops), splitDataW(stops);
|
||||
TextSplit splitterS(&splitDataS,
|
||||
TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||
TextSplit::TXTS_KEEPWILD));
|
||||
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
stops);
|
||||
splitterS.text_to_words(*it);
|
||||
TextSplit splitterW(&splitDataW,
|
||||
TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
||||
TextSplit::TXTS_KEEPWILD));
|
||||
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
stops);
|
||||
splitterW.text_to_words(*it);
|
||||
wsQData *splitData = &splitDataS;
|
||||
if (splitDataS.terms.size() > 1 &&
|
||||
splitDataS.terms.size() != splitDataW.terms.size()) {
|
||||
slack += splitDataW.terms.size() - splitDataS.terms.size();
|
||||
TextSplitQ *splitter = &splitterS;
|
||||
if (splitterS.terms.size() > 1 &&
|
||||
splitterS.terms.size() != splitterW.terms.size()) {
|
||||
slack += splitterW.terms.size() - splitterS.terms.size();
|
||||
// used to: splitData = &splitDataW;
|
||||
}
|
||||
|
||||
LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
|
||||
switch (splitData->terms.size()) {
|
||||
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
|
||||
switch (splitter->terms.size()) {
|
||||
case 0:
|
||||
continue;// ??
|
||||
case 1:
|
||||
processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
|
||||
processSimpleSpan(splitter->terms.front(),
|
||||
splitter->nostemexps.front(), pqueries);
|
||||
break;
|
||||
default:
|
||||
processPhraseOrNear(splitData, pqueries, useNear, slack);
|
||||
processPhraseOrNear(splitter, pqueries, useNear, slack);
|
||||
}
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
|
||||
@ -5,6 +5,7 @@ static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp
|
||||
#include "debuglog.h"
|
||||
#include "readfile.h"
|
||||
#include "unacpp.h"
|
||||
#include "textsplit.h"
|
||||
#include "stoplist.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
@ -12,6 +13,21 @@ namespace Rcl
|
||||
{
|
||||
#endif
|
||||
|
||||
// Text splitter specialization used by StopList::setFile(): every word
// emitted by text_to_words() is run through unacmaybefold() and the result
// is inserted into the set supplied at construction time.
class TextSplitSW : public TextSplit {
|
||||
public:
|
||||
// Destination set. Held by reference, so insertions made by takeword()
// are visible in the set passed to the constructor.
set<string>& stops;
|
||||
// flags: splitter flags, forwarded unchanged to the TextSplit base.
// stps:  set that will receive the processed words.
TextSplitSW(Flags flags, set<string>& stps)
|
||||
: TextSplit(flags), stops(stps)
|
||||
{}
|
||||
// Called for each word found by the splitter. Position and byte offsets
// are unused. Always returns true.
virtual bool takeword(const string& term, int, int, int)
|
||||
{
|
||||
string dterm;
|
||||
// Presumably strips accents and folds case (same call form as the other
// takeword() implementations) -- TODO confirm.
// NOTE(review): the return value is ignored, so whatever is left in dterm
// gets inserted even if the conversion failed -- confirm this is intended.
unacmaybefold(term, dterm, "UTF-8", true);
|
||||
stops.insert(dterm);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
bool StopList::setFile(const string &filename)
|
||||
{
|
||||
m_hasStops = false;
|
||||
@ -22,18 +38,9 @@ bool StopList::setFile(const string &filename)
|
||||
filename.c_str(), reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
TextSplit ts(this, TextSplit::TXTS_ONLYSPANS);
|
||||
TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
|
||||
ts.text_to_words(stoptext);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StopList::takeword(const string& term, int, int, int)
|
||||
{
|
||||
string dterm;
|
||||
unacmaybefold(term, dterm, "UTF-8", true);
|
||||
LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str()));
|
||||
m_hasStops = true;
|
||||
m_stops.insert(dterm);
|
||||
m_hasStops = !m_stops.empty();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -5,8 +5,6 @@
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include "textsplit.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::set;
|
||||
using std::string;
|
||||
@ -14,7 +12,7 @@ namespace Rcl
|
||||
{
|
||||
#endif
|
||||
|
||||
class StopList : public TextSplitCB {
|
||||
class StopList {
|
||||
public:
|
||||
StopList() : m_hasStops(false) {}
|
||||
StopList(const string &filename) {setFile(filename);}
|
||||
@ -23,7 +21,6 @@ public:
|
||||
bool setFile(const string &filename);
|
||||
bool isStop(const string &term) const;
|
||||
bool hasStops() const {return m_hasStops;}
|
||||
virtual bool takeword(const string& term, int pos, int bts, int bte);
|
||||
|
||||
private:
|
||||
bool m_hasStops;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user