cosmetics: use derived class for actual splitter instead of callback

This commit is contained in:
Jean-Francois Dockes 2010-02-02 15:33:52 +01:00
parent 90a8280f21
commit 8b2b00bc72
9 changed files with 114 additions and 118 deletions

View File

@ -186,7 +186,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
}
}
if (pos != m_prevpos || l != m_prevlen) {
bool ret = m_cb->takeword(w, pos, btstart, btend);
bool ret = takeword(w, pos, btstart, btend);
m_prevpos = pos;
m_prevlen = w.length();
return ret;
@ -558,7 +558,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
for (unsigned int i = loopbeg; i < loopend; i++) {
if (!m_cb->takeword(it.buffer().substr(boffs[i],
if (!takeword(it.buffer().substr(boffs[i],
btend-boffs[i]),
m_wordpos - (nchars-i-1), boffs[i], btend)) {
return false;
@ -579,7 +579,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// first
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
unsigned int btend = it.getBpos(); // Current char is out
if (!m_cb->takeword(it.buffer().substr(boffs[0],
if (!takeword(it.buffer().substr(boffs[0],
btend-boffs[0]),
m_wordpos - nchars,
boffs[0], btend)) {
@ -595,12 +595,12 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
return true;
}
// Callback class for countWords
class utSplitterCB : public TextSplitCB {
// Specialization for countWords
class TextSplitCW : public TextSplit {
public:
int wcnt;
utSplitterCB() : wcnt(0) {}
bool takeword(const string &term, int pos, int bs, int be) {
TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
bool takeword(const string &, int, int, int) {
wcnt++;
return true;
}
@ -608,10 +608,9 @@ class utSplitterCB : public TextSplitCB {
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
{
utSplitterCB cb;
TextSplit splitter(&cb, flgs);
TextSplitCW splitter(flgs);
splitter.text_to_words(s);
return cb.wcnt;
return splitter.wcnt;
}
bool TextSplit::hasVisibleWhite(const string &in)
@ -726,12 +725,13 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
using namespace std;
// A small class to hold state while splitting text
class mySplitterCB : public TextSplitCB {
class myTextSplit : public TextSplit {
int first;
bool nooutput;
public:
mySplitterCB() : first(1), nooutput(false) {}
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
TextSplit(flags),first(1), nooutput(false)
{}
void setNoOut(bool val) {nooutput = val;}
bool takeword(const string &term, int pos, int bs, int be) {
if (nooutput)
@ -821,12 +821,8 @@ int main(int argc, char **argv)
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr");
mySplitterCB cb;
TextSplit::Flags flags = TextSplit::TXTS_NONE;
if (op_flags&OPT_S)
cb.setNoOut(true);
if (op_flags&OPT_s)
flags = TextSplit::TXTS_ONLYSPANS;
else if (op_flags&OPT_w)
@ -867,7 +863,9 @@ int main(int argc, char **argv)
int n = TextSplit::countWords(data, flags);
cout << n << " words" << endl;
} else {
TextSplit splitter(&cb, flags);
myTextSplit splitter(flags);
if (op_flags&OPT_S)
splitter.setNoOut(true);
splitter.text_to_words(data);
}
}

View File

@ -25,19 +25,6 @@ using std::string;
using std::list;
#endif
/**
 * Function class whose takeword method is called for every detected word
 * while splitting text.
 *
 * The splitter hands takeword's return value back to its own caller;
 * cjk_to_words, for instance, returns false as soon as takeword does.
 */
class TextSplitCB {
public:
virtual ~TextSplitCB() {}
// Receives one word produced by the splitter.
virtual bool takeword(const string& term,
int pos, // term position
int bts, // byte offset of first char in term
int bte // byte offset of first char after term
) = 0;
};
class Utf8Iter;
@ -67,20 +54,25 @@ public:
TXTS_KEEPWILD = 4 // Handle wildcards as letters
};
/**
* Constructor: just store callback object
*/
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
: m_flags(flags), m_cb(t), m_maxWordLength(40),
m_prevpos(-1)
TextSplit(Flags flags = Flags(TXTS_NONE))
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
{
}
virtual ~TextSplit() {}
/** Split text, emit words and positions. */
bool text_to_words(const string &in);
// Utility functions: these do not need the user to set up a callback,
// etc.
/** Process one output word: to be implemented by the actual user class */
virtual bool takeword(const string& term,
int pos, // term pos
int bts, // byte offset of first char in term
int bte // byte offset of first char after term
) = 0;
// Static utility functions:
/** Count words in string, as the splitter would generate them */
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
@ -102,7 +94,6 @@ public:
private:
Flags m_flags;
TextSplitCB *m_cb;
int m_maxWordLength;
// Current span. Might be jf.dockes@wanadoo.f
@ -132,5 +123,4 @@ private:
bool doemit(bool spanerase, int bp, bool spanemit=false);
};
#endif /* _TEXTSPLIT_H_INCLUDED_ */

View File

@ -58,15 +58,15 @@ static string vecStringToString(const vector<string>& t)
// Text splitter callback used to take note of the position of query terms
// inside the result text. This is then used to insert highlight tags.
class myTextSplitCB : public TextSplitCB {
class TextSplitPTR : public TextSplit {
public:
// Out: begin and end byte positions of query terms/groups in text
vector<pair<int, int> > tboffs;
myTextSplitCB(const vector<string>& its,
const vector<vector<string> >&groups,
const vector<int>& slacks)
TextSplitPTR(const vector<string>& its,
const vector<vector<string> >&groups,
const vector<int>& slacks)
: m_wcount(0), m_groups(groups), m_slacks(slacks)
{
for (vector<string>::const_iterator it = its.begin();
@ -86,7 +86,8 @@ class myTextSplitCB : public TextSplitCB {
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
string dumb;
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
term.c_str()));
return true;
}
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
@ -186,9 +187,9 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
}
// Check if there is a NEAR match for the group of terms
bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
{
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
vecStringToString(terms).c_str()));
// The position lists we are going to work with. We extract them from the
@ -207,7 +208,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
it != terms.end(); it++) {
map<string, vector<int> >::iterator pl = m_plists.find(*it);
if (pl == m_plists.end()) {
LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
(*it).c_str()));
continue;
}
@ -215,10 +216,10 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
plistToTerm[&(pl->second)] = *it;
realgroup.push_back(*it);
}
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n",
LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n",
window, vecStringToString(realgroup).c_str()));
if (plists.size() < 2) {
LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
return false;
}
// Sort the positions lists so that the shorter is first
@ -243,7 +244,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
int sta = int(10E9), sto = 0;
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n",
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
sta, sto));
// Maybe extend the window by 1st term position, this was not
// done by do_prox..
@ -253,7 +254,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
i1->second.first, i2->second.second));
tboffs.push_back(pair<int, int>(i1->second.first,
i2->second.second));
@ -278,7 +279,7 @@ public:
};
// Do the phrase match thing, then merge the highlight lists
bool myTextSplitCB::matchGroups()
bool TextSplitPTR::matchGroups()
{
vector<vector<string> >::const_iterator vit = m_groups.begin();
vector<int>::const_iterator sit = m_slacks.begin();
@ -333,15 +334,14 @@ bool PlainToRich::plaintorich(const string& in,
// Compute the positions for the query terms. We use the text
// splitter to break the text into words, and compare the words to
// the search terms,
myTextSplitCB cb(terms, groups, slacks);
TextSplit splitter(&cb);
TextSplitPTR splitter(terms, groups, slacks);
// Note: the splitter returns the term locations in byte, not
// character, offsets.
splitter.text_to_words(in);
LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
// Compute the positions for NEAR and PHRASE groups.
cb.matchGroups();
splitter.matchGroups();
out.clear();
out.push_back("");
@ -353,12 +353,12 @@ bool PlainToRich::plaintorich(const string& in,
// Iterator for the list of input term positions. We use it to
// output highlight tags and to compute term positions in the
// output text
vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();
vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
#if 0
for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
it != cb.tboffs.end(); it++) {
for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
it != splitter.tboffs.end(); it++) {
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
}
#endif
@ -412,7 +412,7 @@ bool PlainToRich::plaintorich(const string& in,
}
// Skip all highlight areas that would overlap this one
int crend = tPosIt->second;
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
tPosIt++;
inrcltag = 0;
}

View File

@ -42,6 +42,7 @@ using namespace std;
#include "internfile.h"
#include "wipedir.h"
#include "transcode.h"
#include "textsplit.h"
bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& idoc)
{

View File

@ -31,6 +31,7 @@ using std::list;
#include "smallut.h"
#include "rclconfig.h"
#include "refcntr.h"
#include "textsplit.h"
Rcl::SearchData *wasaStringToRcl(const string &qs, string &reason)
{

View File

@ -781,16 +781,15 @@ bool Db::fieldToPrefix(const string& fld, string &pfx)
}
// The text splitter callback class which receives words from the
// splitter and adds postings to the Xapian document.
class mySplitterCB : public TextSplitCB {
// The splitter breaks text into words and adds postings to the Xapian document.
class TextSplitDb : public TextSplit {
public:
Xapian::Document &doc; // Xapian document
Xapian::termpos basepos; // Base for document section
Xapian::termpos curpos; // Current position. Used to set basepos for the
// following section
StopList &stops;
mySplitterCB(Xapian::Document &d, StopList &_stops)
TextSplitDb(Xapian::Document &d, StopList &_stops)
: doc(d), basepos(1), curpos(0), stops(_stops)
{}
bool takeword(const std::string &term, int pos, int, int);
@ -802,15 +801,16 @@ private:
string prefix;
};
// Callback for the document to word splitting class during indexation
bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
// Get one term from the doc, remove accents and lowercase, then add posting
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
{
#if 0
LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
LOGDEB(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
#endif
string term;
if (!unacmaybefold(_term, term, "UTF-8", true)) {
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
_term.c_str()));
term.clear();
// We don't generate a fatal error because of a bad term
return true;
@ -892,14 +892,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
Doc doc = idoc;
Xapian::Document newdocument;
mySplitterCB splitData(newdocument, m_stops);
TextSplit splitter(&splitData);
TextSplitDb splitter(newdocument, m_stops);
// Split and index file name as document term(s)
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
if (!splitter.text_to_words(doc.utf8fn))
LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
splitData.basepos += splitData.curpos + 100;
splitter.basepos += splitter.curpos + 100;
// Index textual metadata. These are all indexed as text with
// positions, as we may want to do phrase searches with them (this
@ -919,19 +918,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
meta_it->first.c_str(), pfx.c_str(),
meta_it->second.c_str()));
splitData.setprefix(pfx); // Subject
splitter.setprefix(pfx); // Subject
if (!splitter.text_to_words(meta_it->second))
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
meta_it->first.c_str()));
splitData.setprefix(string());
splitData.basepos += splitData.curpos + 100;
splitter.setprefix(string());
splitter.basepos += splitter.curpos + 100;
}
}
if (splitData.curpos < baseTextPosition)
splitData.basepos = baseTextPosition;
if (splitter.curpos < baseTextPosition)
splitter.basepos = baseTextPosition;
else
splitData.basepos += splitData.curpos + 100;
splitter.basepos += splitter.curpos + 100;
// Split and index body text
LOGDEB2(("Db::add: split body\n"));

View File

@ -188,25 +188,27 @@ void SearchData::getUTerms(vector<string>& terms) const
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2)
class wsQData : public TextSplitCB {
class TextSplitQ : public TextSplit {
public:
wsQData(const StopList &_stops)
: stops(_stops), alltermcount(0)
TextSplitQ(Flags flags, const StopList &_stops)
: TextSplit(flags), stops(_stops), alltermcount(0)
{}
bool takeword(const std::string &interm, int , int, int) {
alltermcount++;
LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
// Check if the first letter is a majuscule in which
// case we do not want to do stem expansion. Note that
// the test is convoluted and possibly problematic
string noacterm, noaclowterm;
if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
interm.c_str()));
return true;
}
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
noacterm.c_str()));
return true;
}
bool nostemexp = false;
@ -216,7 +218,8 @@ class wsQData : public TextSplitCB {
nostemexp = true;
if (stops.hasStops() && stops.isStop(noaclowterm)) {
LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
noaclowterm.c_str()));
return true;
}
terms.push_back(noaclowterm);
@ -271,7 +274,7 @@ private:
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
// Process phrase/near element
void processPhraseOrNear(wsQData *splitData,
void processPhraseOrNear(TextSplitQ *splitData,
list<Xapian::Query> &pqueries,
bool useNear, int slack);
@ -420,7 +423,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
list<Xapian::Query> &pqueries,
bool useNear, int slack)
{
@ -527,31 +530,31 @@ bool StringToXapianQ::processUserString(const string &iq,
// We now adjust the phrase/near slack by the term count
// difference (this is mainly better for CJK, where this is a very
// common occurrence because of the n-gram splitting).
wsQData splitDataS(stops), splitDataW(stops);
TextSplit splitterS(&splitDataS,
TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD));
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops);
splitterS.text_to_words(*it);
TextSplit splitterW(&splitDataW,
TextSplit::Flags(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD));
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD),
stops);
splitterW.text_to_words(*it);
wsQData *splitData = &splitDataS;
if (splitDataS.terms.size() > 1 &&
splitDataS.terms.size() != splitDataW.terms.size()) {
slack += splitDataW.terms.size() - splitDataS.terms.size();
TextSplitQ *splitter = &splitterS;
if (splitterS.terms.size() > 1 &&
splitterS.terms.size() != splitterW.terms.size()) {
slack += splitterW.terms.size() - splitterS.terms.size();
// used to: splitData = &splitDataW;
}
LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
switch (splitData->terms.size()) {
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
switch (splitter->terms.size()) {
case 0:
continue;// ??
case 1:
processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
processSimpleSpan(splitter->terms.front(),
splitter->nostemexps.front(), pqueries);
break;
default:
processPhraseOrNear(splitData, pqueries, useNear, slack);
processPhraseOrNear(splitter, pqueries, useNear, slack);
}
}
} catch (const Xapian::Error &e) {

View File

@ -5,6 +5,7 @@ static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp
#include "debuglog.h"
#include "readfile.h"
#include "unacpp.h"
#include "textsplit.h"
#include "stoplist.h"
#ifndef NO_NAMESPACES
@ -12,6 +13,21 @@ namespace Rcl
{
#endif
/**
 * Text splitter specialization used to load a stop word list: every word
 * produced by the split has its accents removed and its case folded, and
 * is then added to the target set.
 */
class TextSplitSW : public TextSplit {
public:
    // Destination set for the processed words. Held by reference: it must
    // outlive this splitter.
    set<string>& stops;

    TextSplitSW(Flags flags, set<string>& stps)
        : TextSplit(flags), stops(stps)
    {}

    /**
     * Process one word from the stop list text.
     *
     * @param term the word as found in the input; the position and byte
     *             offset arguments are not used.
     * @return always true: a word which can't be processed is skipped
     *         and does not abort the split.
     */
    virtual bool takeword(const string& term, int, int, int)
    {
        string dterm;
        // Don't store the output of a failed conversion (or an empty
        // result): it would put a bogus entry in the stop list. As in the
        // other takeword implementations, a bad term is not a fatal error.
        if (!unacmaybefold(term, dterm, "UTF-8", true) || dterm.empty())
            return true;
        stops.insert(dterm);
        return true;
    }
};
bool StopList::setFile(const string &filename)
{
m_hasStops = false;
@ -22,18 +38,9 @@ bool StopList::setFile(const string &filename)
filename.c_str(), reason.c_str()));
return false;
}
TextSplit ts(this, TextSplit::TXTS_ONLYSPANS);
TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
ts.text_to_words(stoptext);
return true;
}
bool StopList::takeword(const string& term, int, int, int)
{
string dterm;
unacmaybefold(term, dterm, "UTF-8", true);
LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str()));
m_hasStops = true;
m_stops.insert(dterm);
m_hasStops = !m_stops.empty();
return true;
}

View File

@ -5,8 +5,6 @@
#include <set>
#include <string>
#include "textsplit.h"
#ifndef NO_NAMESPACES
using std::set;
using std::string;
@ -14,7 +12,7 @@ namespace Rcl
{
#endif
class StopList : public TextSplitCB {
class StopList {
public:
StopList() : m_hasStops(false) {}
StopList(const string &filename) {setFile(filename);}
@ -23,7 +21,6 @@ public:
bool setFile(const string &filename);
bool isStop(const string &term) const;
bool hasStops() const {return m_hasStops;}
virtual bool takeword(const string& term, int pos, int bts, int bte);
private:
bool m_hasStops;