cosmetics: use derived class for actual splitter instead of callback

2010-02-02 15:33:52 +01:00 · 2010-02-02 15:33:52 +01:00 · 8b2b00bc72
commit 8b2b00bc72
parent 90a8280f21
9 changed files with 114 additions and 118 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -186,7 +186,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
 	    }
 	}
 	if (pos != m_prevpos || l != m_prevlen) {
-	    bool ret = m_cb->takeword(w, pos, btstart, btend);
+	    bool ret = takeword(w, pos, btstart, btend);
 	    m_prevpos = pos;
 	    m_prevlen = w.length();
 	    return ret;
@ -558,7 +558,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
 	    unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
 	    unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
 	    for (unsigned int i = loopbeg; i < loopend; i++) {
-		if (!m_cb->takeword(it.buffer().substr(boffs[i], 
+		if (!takeword(it.buffer().substr(boffs[i], 
 						       btend-boffs[i]),
 				m_wordpos - (nchars-i-1), boffs[i], btend)) {
 		    return false;
@ -579,7 +579,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
    // first
    if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen)  {
 	unsigned int btend = it.getBpos(); // Current char is out
-	if (!m_cb->takeword(it.buffer().substr(boffs[0], 
+	if (!takeword(it.buffer().substr(boffs[0], 
 					       btend-boffs[0]),
 			    m_wordpos - nchars,
 			    boffs[0], btend)) {
@ -595,12 +595,12 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
    return true;
 }

-// Callback class for countWords 
-class utSplitterCB : public TextSplitCB {
+// TextSplit subclass used by countWords: just counts the emitted words
+class TextSplitCW : public TextSplit {
 public:
    int wcnt;
-    utSplitterCB() : wcnt(0) {}
-    bool takeword(const string &term, int pos, int bs, int be) {
+    TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
+    bool takeword(const string &, int, int, int) {
 	wcnt++;
 	return true;
    }
@ -608,10 +608,9 @@ class utSplitterCB : public TextSplitCB {

 int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
 {
-    utSplitterCB cb;
-    TextSplit splitter(&cb, flgs);
+    TextSplitCW splitter(flgs);
    splitter.text_to_words(s);
-    return cb.wcnt;
+    return splitter.wcnt;
 }

 bool TextSplit::hasVisibleWhite(const string &in)
@ -726,12 +725,13 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)

 using namespace std;

-// A small class to hold state while splitting text
-class mySplitterCB : public TextSplitCB {
+class myTextSplit : public TextSplit {
    int first;
    bool nooutput;
 public:
-    mySplitterCB() : first(1), nooutput(false) {}
+    myTextSplit(Flags flags = Flags(TXTS_NONE)) : 
+        TextSplit(flags),first(1), nooutput(false) 
+    {}
    void setNoOut(bool val) {nooutput = val;}
    bool takeword(const string &term, int pos, int bs, int be) {
 	if (nooutput)
@ -821,12 +821,8 @@ int main(int argc, char **argv)
    DebugLog::getdbl()->setloglevel(DEBDEB1);
    DebugLog::setfilename("stderr");

-    mySplitterCB cb;
    TextSplit::Flags flags = TextSplit::TXTS_NONE;

-    if (op_flags&OPT_S)
-	cb.setNoOut(true);
-
    if (op_flags&OPT_s)
 	flags = TextSplit::TXTS_ONLYSPANS;
    else if (op_flags&OPT_w)
@ -867,7 +863,9 @@ int main(int argc, char **argv)
 	int n = TextSplit::countWords(data, flags);
 	cout << n << " words" << endl;
    } else {
-	TextSplit splitter(&cb,  flags);
+	myTextSplit splitter(flags);
+        if (op_flags&OPT_S)
+            splitter.setNoOut(true);
 	splitter.text_to_words(data);
    }    
 }
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -25,19 +25,6 @@ using std::string;
 using std::list;
 #endif

-/**
- * Function class whose takeword method is called for every detected word while * splitting text.
- */
-class TextSplitCB {
-public:
-    virtual ~TextSplitCB() {}
-    virtual bool takeword(const string& term, 
-			  int pos,  // term pos
-			  int bts,  // byte offset of first char in term
-			  int bte   // byte offset of first char after term
-			  ) = 0; 
-};
-
 class Utf8Iter;


@ -67,20 +54,25 @@ public:
 		TXTS_KEEPWILD = 4 // Handle wildcards as letters
    };

-    /**
-     * Constructor: just store callback object
-     */
-    TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
-	: m_flags(flags), m_cb(t), m_maxWordLength(40), 
-	  m_prevpos(-1)
+    
+    TextSplit(Flags flags = Flags(TXTS_NONE))
+	: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
    {
    }
+    virtual ~TextSplit() {}

    /** Split text, emit words and positions. */
    bool text_to_words(const string &in);

-    //Utility functions : these does not need the user to setup a callback 
-    // etc.
+    /** Process one output word: must be implemented by the derived class */
+    virtual bool takeword(const string& term, 
+			  int pos,  // term pos
+			  int bts,  // byte offset of first char in term
+			  int bte   // byte offset of first char after term
+			  ) = 0; 
+
+
+    // Static utility functions:

    /** Count words in string, as the splitter would generate them */
    static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
@ -102,7 +94,6 @@ public:

 private:
    Flags         m_flags;
-    TextSplitCB  *m_cb;
    int           m_maxWordLength;

    // Current span. Might be jf.dockes@wanadoo.f
@ -132,5 +123,4 @@ private:
    bool doemit(bool spanerase, int bp, bool spanemit=false);
 };

-
 #endif /* _TEXTSPLIT_H_INCLUDED_ */
--- a/src/query/plaintorich.cpp
+++ b/src/query/plaintorich.cpp
@ -58,15 +58,15 @@ static string vecStringToString(const vector<string>& t)

 // Text splitter callback used to take note of the position of query terms 
 // inside the result text. This is then used to insert highlight tags. 
-class myTextSplitCB : public TextSplitCB {
+class TextSplitPTR : public TextSplit {
 public:

    // Out: begin and end byte positions of query terms/groups in text
    vector<pair<int, int> > tboffs;  

-    myTextSplitCB(const vector<string>& its, 
-		  const vector<vector<string> >&groups, 
-		  const vector<int>& slacks) 
+    TextSplitPTR(const vector<string>& its, 
+                 const vector<vector<string> >&groups, 
+                 const vector<int>& slacks) 
 	:  m_wcount(0), m_groups(groups), m_slacks(slacks)
    {
 	for (vector<string>::const_iterator it = its.begin(); 
@ -86,7 +86,8 @@ class myTextSplitCB : public TextSplitCB {
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
 	string dumb;
 	if (!unacmaybefold(term, dumb, "UTF-8", true)) {
-	    LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
+	    LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
+                     term.c_str()));
 	    return true;
 	}
 	//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
@ -186,9 +187,9 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
 }

 // Check if there is a NEAR match for the group of terms
-bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
+bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
 {
-    LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
+    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
 	    vecStringToString(terms).c_str()));

    // The position lists we are going to work with. We extract them from the 
@ -207,7 +208,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
 	 it != terms.end(); it++) {
 	map<string, vector<int> >::iterator pl = m_plists.find(*it);
 	if (pl == m_plists.end()) {
-	    LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
+	    LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
 		    (*it).c_str()));
 	    continue;
 	}
@ -215,10 +216,10 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
 	plistToTerm[&(pl->second)] = *it;
 	realgroup.push_back(*it);
    }
-    LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n", 
+    LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n", 
 	     window, vecStringToString(realgroup).c_str()));
    if (plists.size() < 2) {
-	LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
+	LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
 	return false;
    }
    // Sort the positions lists so that the shorter is first
@ -243,7 +244,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
 	int sta = int(10E9), sto = 0;
 	LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
 	if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
-	    LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n", 
+	    LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
 		     sta, sto)); 
 	    // Maybe extend the window by 1st term position, this was not
 	    // done by do_prox..
@ -253,7 +254,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
 	    map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
 	    map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
 	    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
-		LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
+		LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
 			i1->second.first, i2->second.second));
 		tboffs.push_back(pair<int, int>(i1->second.first, 
 						i2->second.second));
@ -278,7 +279,7 @@ public:
 };

 // Do the phrase match thing, then merge the highlight lists
-bool myTextSplitCB::matchGroups()
+bool TextSplitPTR::matchGroups()
 {
    vector<vector<string> >::const_iterator vit = m_groups.begin();
    vector<int>::const_iterator sit = m_slacks.begin();
@ -333,15 +334,14 @@ bool PlainToRich::plaintorich(const string& in,
    // Compute the positions for the query terms.  We use the text
    // splitter to break the text into words, and compare the words to
    // the search terms,
-    myTextSplitCB cb(terms, groups, slacks);
-    TextSplit splitter(&cb);
+    TextSplitPTR splitter(terms, groups, slacks);
    // Note: the splitter returns the term locations in byte, not
    // character, offsets.
    splitter.text_to_words(in);
    LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));

    // Compute the positions for NEAR and PHRASE groups.
-    cb.matchGroups();
+    splitter.matchGroups();

    out.clear();
    out.push_back("");
@ -353,12 +353,12 @@ bool PlainToRich::plaintorich(const string& in,
    // Iterator for the list of input term positions. We use it to
    // output highlight tags and to compute term positions in the
    // output text
-    vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
-    vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();
+    vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
+    vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();

 #if 0
-    for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
-	 it != cb.tboffs.end(); it++) {
+    for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
+	 it != splitter.tboffs.end(); it++) {
 	LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
    }
 #endif
@ -412,7 +412,7 @@ bool PlainToRich::plaintorich(const string& in,
 		}
 		// Skip all highlight areas that would overlap this one
 		int crend = tPosIt->second;
-		while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
+		while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
 		    tPosIt++;
                inrcltag = 0;
 	    }
--- a/src/query/recollq.cpp
+++ b/src/query/recollq.cpp
@ -42,6 +42,7 @@ using namespace std;
 #include "internfile.h"
 #include "wipedir.h"
 #include "transcode.h"
+#include "textsplit.h"

 bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& idoc)
 {
--- a/src/query/wasatorcl.cpp
+++ b/src/query/wasatorcl.cpp
@ -31,6 +31,7 @@ using std::list;
 #include "smallut.h"
 #include "rclconfig.h"
 #include "refcntr.h"
+#include "textsplit.h"

 Rcl::SearchData *wasaStringToRcl(const string &qs, string &reason)
 {
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -781,16 +781,15 @@ bool Db::fieldToPrefix(const string& fld, string &pfx)
 }


-// The text splitter callback class which receives words from the
-// splitter and adds postings to the Xapian document.
-class mySplitterCB : public TextSplitCB {
+// The splitter breaks text into words and adds postings to the Xapian document.
+class TextSplitDb : public TextSplit {
 public:
    Xapian::Document &doc;   // Xapian document 
    Xapian::termpos basepos; // Base for document section
    Xapian::termpos curpos;  // Current position. Used to set basepos for the
                             // following section
    StopList &stops;
-    mySplitterCB(Xapian::Document &d, StopList &_stops) 
+    TextSplitDb(Xapian::Document &d, StopList &_stops) 
 	: doc(d), basepos(1), curpos(0), stops(_stops)
    {}
    bool takeword(const std::string &term, int pos, int, int);
@ -802,15 +801,16 @@ private:
    string  prefix; 
 };

-// Callback for the document to word splitting class during indexation
-bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
+// Get one term from the doc, remove accents and lowercase, then add posting
+bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
 {
 #if 0
-    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
+    LOGDEB(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
 #endif
    string term;
    if (!unacmaybefold(_term, term, "UTF-8", true)) {
-	LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
+	LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", 
+                 _term.c_str()));
 	term.clear();
 	// We don't generate a fatal error because of a bad term
 	return true;
@ -892,14 +892,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    Doc doc = idoc;

    Xapian::Document newdocument;
-    mySplitterCB splitData(newdocument, m_stops);
-    TextSplit splitter(&splitData);
+    TextSplitDb splitter(newdocument, m_stops);

    // Split and index file name as document term(s)
    LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
    if (!splitter.text_to_words(doc.utf8fn))
        LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
-    splitData.basepos += splitData.curpos + 100;
+    splitter.basepos += splitter.curpos + 100;

    // Index textual metadata.  These are all indexed as text with
    // positions, as we may want to do phrase searches with them (this
@ -919,19 +918,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 	    LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n", 
 		    meta_it->first.c_str(), pfx.c_str(), 
 		    meta_it->second.c_str()));
-	    splitData.setprefix(pfx); // Subject
+	    splitter.setprefix(pfx); // Subject
 	    if (!splitter.text_to_words(meta_it->second))
                LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
                        meta_it->first.c_str()));
-	    splitData.setprefix(string());
-	    splitData.basepos += splitData.curpos + 100;
+	    splitter.setprefix(string());
+	    splitter.basepos += splitter.curpos + 100;
 	}
    }

-    if (splitData.curpos < baseTextPosition)
-	splitData.basepos = baseTextPosition;
+    if (splitter.curpos < baseTextPosition)
+	splitter.basepos = baseTextPosition;
    else
-	splitData.basepos += splitData.curpos + 100;
+	splitter.basepos += splitter.curpos + 100;

    // Split and index body text
    LOGDEB2(("Db::add: split body\n"));
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -188,25 +188,27 @@ void SearchData::getUTerms(vector<string>& terms) const
 // phrases. This is for parts of the user entry which would appear as
 // a single word because there is no white space inside, but are
 // actually multiple terms to rcldb (ie term1,term2)
-class wsQData : public TextSplitCB {
+class TextSplitQ : public TextSplit {
 public:
-    wsQData(const StopList &_stops) 
-	: stops(_stops), alltermcount(0)
+    TextSplitQ(Flags flags, const StopList &_stops) 
+	: TextSplit(flags), stops(_stops), alltermcount(0)
    {}
    bool takeword(const std::string &interm, int , int, int) {
 	alltermcount++;
-	LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
+	LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));

 	// Check if the first letter is a majuscule in which
 	// case we do not want to do stem expansion. Note that
 	// the test is convoluted and possibly problematic
 	string noacterm, noaclowterm;
 	if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
-	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
+	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", 
+                     interm.c_str()));
 	    return true;
 	} 
 	if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
+	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", 
+                     noacterm.c_str()));
 	    return true;
 	}
 	bool nostemexp = false;
@ -216,7 +218,8 @@ class wsQData : public TextSplitCB {
 	    nostemexp = true;

 	if (stops.hasStops() && stops.isStop(noaclowterm)) {
-	    LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
+	    LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n", 
+                     noaclowterm.c_str()));
 	    return true;
 	}
 	terms.push_back(noaclowterm);
@ -271,7 +274,7 @@ private:
    // After splitting entry on whitespace: process non-phrase element
    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
    // Process phrase/near element
-    void processPhraseOrNear(wsQData *splitData, 
+    void processPhraseOrNear(TextSplitQ *splitData, 
 			     list<Xapian::Query> &pqueries,
 			     bool useNear, int slack);

@ -420,7 +423,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
 // NEAR xapian query, the elements of which can themselves be OR
 // queries if the terms get expanded by stemming or wildcards (we
 // don't do stemming for PHRASE though)
-void StringToXapianQ::processPhraseOrNear(wsQData *splitData, 
+void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, 
 					  list<Xapian::Query> &pqueries,
 					  bool useNear, int slack)
 {
@ -527,31 +530,31 @@ bool StringToXapianQ::processUserString(const string &iq,
 	    // We now adjust the phrase/near slack by the term count
 	    // difference (this is mainly better for cjk where this is a very
 	    // common occurrence because of the ngrams thing.
-	    wsQData splitDataS(stops), splitDataW(stops);
-	    TextSplit splitterS(&splitDataS, 
-				TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
-						 TextSplit::TXTS_KEEPWILD));
+	    TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
+                                                  TextSplit::TXTS_KEEPWILD), 
+                                 stops);
 	    splitterS.text_to_words(*it);
-	    TextSplit splitterW(&splitDataW, 
-				TextSplit::Flags(TextSplit::TXTS_NOSPANS | 
-						 TextSplit::TXTS_KEEPWILD));
+	    TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS | 
+                                                  TextSplit::TXTS_KEEPWILD),
+                                 stops);
 	    splitterW.text_to_words(*it);
-	    wsQData *splitData = &splitDataS;
-	    if (splitDataS.terms.size() > 1 && 
-		splitDataS.terms.size() != splitDataW.terms.size()) {
-		slack += splitDataW.terms.size() - splitDataS.terms.size();
+	    TextSplitQ *splitter = &splitterS;
+	    if (splitterS.terms.size() > 1 && 
+		splitterS.terms.size() != splitterW.terms.size()) {
+		slack += splitterW.terms.size() - splitterS.terms.size();
 		// used to: splitData = &splitDataW;
 	    }

-	    LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
-	    switch (splitData->terms.size()) {
+	    LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
+	    switch (splitter->terms.size()) {
 	    case 0: 
 		continue;// ??
 	    case 1: 
-		processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
+		processSimpleSpan(splitter->terms.front(), 
+                                  splitter->nostemexps.front(), pqueries);
 		break;
 	    default:
-		processPhraseOrNear(splitData, pqueries, useNear, slack);
+		processPhraseOrNear(splitter, pqueries, useNear, slack);
 	    }
 	}
    } catch (const Xapian::Error &e) {
--- a/src/rcldb/stoplist.cpp
+++ b/src/rcldb/stoplist.cpp
@ -5,6 +5,7 @@ static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp
 #include "debuglog.h"
 #include "readfile.h"
 #include "unacpp.h"
+#include "textsplit.h"
 #include "stoplist.h"

 #ifndef NO_NAMESPACES
@ -12,6 +13,21 @@ namespace Rcl
 {
 #endif

+class TextSplitSW : public TextSplit {
+public:
+    set<string>& stops;
+    TextSplitSW(Flags flags, set<string>& stps) 
+        : TextSplit(flags), stops(stps) 
+    {}
+    virtual bool takeword(const string& term, int, int, int)
+    {
+        string dterm;
+        unacmaybefold(term, dterm, "UTF-8", true);
+        stops.insert(dterm);
+        return true;
+    }
+};
+
 bool StopList::setFile(const string &filename)
 {
    m_hasStops = false;
@ -22,18 +38,9 @@ bool StopList::setFile(const string &filename)
 		filename.c_str(), reason.c_str()));
 	return false;
    }
-    TextSplit ts(this, TextSplit::TXTS_ONLYSPANS);
+    TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
    ts.text_to_words(stoptext);
-    return true;
-}
-
-bool StopList::takeword(const string& term, int, int, int)
-{
-    string dterm;
-    unacmaybefold(term, dterm, "UTF-8", true);
-    LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str()));
-    m_hasStops = true;
-    m_stops.insert(dterm);
+    m_hasStops = !m_stops.empty();
    return true;
 }

--- a/src/rcldb/stoplist.h
+++ b/src/rcldb/stoplist.h
@ -5,8 +5,6 @@
 #include <set>
 #include <string>

-#include "textsplit.h"
-
 #ifndef NO_NAMESPACES
 using std::set;
 using std::string;
@ -14,7 +12,7 @@ namespace Rcl
 {
 #endif

-class StopList : public TextSplitCB {
+class StopList {
 public:
    StopList() : m_hasStops(false) {}
    StopList(const string &filename) {setFile(filename);}
@ -23,7 +21,6 @@ public:
    bool setFile(const string &filename);
    bool isStop(const string &term) const;
    bool hasStops() const {return m_hasStops;}
-    virtual bool takeword(const string& term, int pos, int bts, int bte); 

 private:
    bool m_hasStops;