cosmetics: use derived class for actual splitter instead of callback
This commit is contained in:
parent
90a8280f21
commit
8b2b00bc72
@ -186,7 +186,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (pos != m_prevpos || l != m_prevlen) {
|
if (pos != m_prevpos || l != m_prevlen) {
|
||||||
bool ret = m_cb->takeword(w, pos, btstart, btend);
|
bool ret = takeword(w, pos, btstart, btend);
|
||||||
m_prevpos = pos;
|
m_prevpos = pos;
|
||||||
m_prevlen = w.length();
|
m_prevlen = w.length();
|
||||||
return ret;
|
return ret;
|
||||||
@ -558,7 +558,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
||||||
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
||||||
for (unsigned int i = loopbeg; i < loopend; i++) {
|
for (unsigned int i = loopbeg; i < loopend; i++) {
|
||||||
if (!m_cb->takeword(it.buffer().substr(boffs[i],
|
if (!takeword(it.buffer().substr(boffs[i],
|
||||||
btend-boffs[i]),
|
btend-boffs[i]),
|
||||||
m_wordpos - (nchars-i-1), boffs[i], btend)) {
|
m_wordpos - (nchars-i-1), boffs[i], btend)) {
|
||||||
return false;
|
return false;
|
||||||
@ -579,7 +579,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// first
|
// first
|
||||||
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
||||||
unsigned int btend = it.getBpos(); // Current char is out
|
unsigned int btend = it.getBpos(); // Current char is out
|
||||||
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
if (!takeword(it.buffer().substr(boffs[0],
|
||||||
btend-boffs[0]),
|
btend-boffs[0]),
|
||||||
m_wordpos - nchars,
|
m_wordpos - nchars,
|
||||||
boffs[0], btend)) {
|
boffs[0], btend)) {
|
||||||
@ -595,12 +595,12 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Callback class for countWords
|
// Specialization for countWords
|
||||||
class utSplitterCB : public TextSplitCB {
|
class TextSplitCW : public TextSplit {
|
||||||
public:
|
public:
|
||||||
int wcnt;
|
int wcnt;
|
||||||
utSplitterCB() : wcnt(0) {}
|
TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
|
||||||
bool takeword(const string &term, int pos, int bs, int be) {
|
bool takeword(const string &, int, int, int) {
|
||||||
wcnt++;
|
wcnt++;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -608,10 +608,9 @@ class utSplitterCB : public TextSplitCB {
|
|||||||
|
|
||||||
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
||||||
{
|
{
|
||||||
utSplitterCB cb;
|
TextSplitCW splitter(flgs);
|
||||||
TextSplit splitter(&cb, flgs);
|
|
||||||
splitter.text_to_words(s);
|
splitter.text_to_words(s);
|
||||||
return cb.wcnt;
|
return splitter.wcnt;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool TextSplit::hasVisibleWhite(const string &in)
|
bool TextSplit::hasVisibleWhite(const string &in)
|
||||||
@ -726,12 +725,13 @@ bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
// A small class to hold state while splitting text
|
class myTextSplit : public TextSplit {
|
||||||
class mySplitterCB : public TextSplitCB {
|
|
||||||
int first;
|
int first;
|
||||||
bool nooutput;
|
bool nooutput;
|
||||||
public:
|
public:
|
||||||
mySplitterCB() : first(1), nooutput(false) {}
|
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
|
||||||
|
TextSplit(flags),first(1), nooutput(false)
|
||||||
|
{}
|
||||||
void setNoOut(bool val) {nooutput = val;}
|
void setNoOut(bool val) {nooutput = val;}
|
||||||
bool takeword(const string &term, int pos, int bs, int be) {
|
bool takeword(const string &term, int pos, int bs, int be) {
|
||||||
if (nooutput)
|
if (nooutput)
|
||||||
@ -821,12 +821,8 @@ int main(int argc, char **argv)
|
|||||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||||
DebugLog::setfilename("stderr");
|
DebugLog::setfilename("stderr");
|
||||||
|
|
||||||
mySplitterCB cb;
|
|
||||||
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
||||||
|
|
||||||
if (op_flags&OPT_S)
|
|
||||||
cb.setNoOut(true);
|
|
||||||
|
|
||||||
if (op_flags&OPT_s)
|
if (op_flags&OPT_s)
|
||||||
flags = TextSplit::TXTS_ONLYSPANS;
|
flags = TextSplit::TXTS_ONLYSPANS;
|
||||||
else if (op_flags&OPT_w)
|
else if (op_flags&OPT_w)
|
||||||
@ -867,7 +863,9 @@ int main(int argc, char **argv)
|
|||||||
int n = TextSplit::countWords(data, flags);
|
int n = TextSplit::countWords(data, flags);
|
||||||
cout << n << " words" << endl;
|
cout << n << " words" << endl;
|
||||||
} else {
|
} else {
|
||||||
TextSplit splitter(&cb, flags);
|
myTextSplit splitter(flags);
|
||||||
|
if (op_flags&OPT_S)
|
||||||
|
splitter.setNoOut(true);
|
||||||
splitter.text_to_words(data);
|
splitter.text_to_words(data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -25,19 +25,6 @@ using std::string;
|
|||||||
using std::list;
|
using std::list;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
|
||||||
* Function class whose takeword method is called for every detected word while splitting text.
|
|
||||||
*/
|
|
||||||
class TextSplitCB {
|
|
||||||
public:
|
|
||||||
virtual ~TextSplitCB() {}
|
|
||||||
virtual bool takeword(const string& term,
|
|
||||||
int pos, // term pos
|
|
||||||
int bts, // byte offset of first char in term
|
|
||||||
int bte // byte offset of first char after term
|
|
||||||
) = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
class Utf8Iter;
|
class Utf8Iter;
|
||||||
|
|
||||||
|
|
||||||
@ -67,20 +54,25 @@ public:
|
|||||||
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor: just store callback object
|
TextSplit(Flags flags = Flags(TXTS_NONE))
|
||||||
*/
|
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
||||||
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
|
|
||||||
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
|
||||||
m_prevpos(-1)
|
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
virtual ~TextSplit() {}
|
||||||
|
|
||||||
/** Split text, emit words and positions. */
|
/** Split text, emit words and positions. */
|
||||||
bool text_to_words(const string &in);
|
bool text_to_words(const string &in);
|
||||||
|
|
||||||
// Utility functions: these do not need the user to set up a callback
|
/** Process one output word: to be implemented by the actual user class */
|
||||||
// etc.
|
virtual bool takeword(const string& term,
|
||||||
|
int pos, // term pos
|
||||||
|
int bts, // byte offset of first char in term
|
||||||
|
int bte // byte offset of first char after term
|
||||||
|
) = 0;
|
||||||
|
|
||||||
|
|
||||||
|
// Static utility functions:
|
||||||
|
|
||||||
/** Count words in string, as the splitter would generate them */
|
/** Count words in string, as the splitter would generate them */
|
||||||
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
||||||
@ -102,7 +94,6 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
Flags m_flags;
|
Flags m_flags;
|
||||||
TextSplitCB *m_cb;
|
|
||||||
int m_maxWordLength;
|
int m_maxWordLength;
|
||||||
|
|
||||||
// Current span. Might be jf.dockes@wanadoo.f
|
// Current span. Might be jf.dockes@wanadoo.f
|
||||||
@ -132,5 +123,4 @@ private:
|
|||||||
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
||||||
|
|||||||
@ -58,15 +58,15 @@ static string vecStringToString(const vector<string>& t)
|
|||||||
|
|
||||||
// Text splitter callback used to take note of the position of query terms
|
// Text splitter callback used to take note of the position of query terms
|
||||||
// inside the result text. This is then used to insert highlight tags.
|
// inside the result text. This is then used to insert highlight tags.
|
||||||
class myTextSplitCB : public TextSplitCB {
|
class TextSplitPTR : public TextSplit {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// Out: begin and end byte positions of query terms/groups in text
|
// Out: begin and end byte positions of query terms/groups in text
|
||||||
vector<pair<int, int> > tboffs;
|
vector<pair<int, int> > tboffs;
|
||||||
|
|
||||||
myTextSplitCB(const vector<string>& its,
|
TextSplitPTR(const vector<string>& its,
|
||||||
const vector<vector<string> >&groups,
|
const vector<vector<string> >&groups,
|
||||||
const vector<int>& slacks)
|
const vector<int>& slacks)
|
||||||
: m_wcount(0), m_groups(groups), m_slacks(slacks)
|
: m_wcount(0), m_groups(groups), m_slacks(slacks)
|
||||||
{
|
{
|
||||||
for (vector<string>::const_iterator it = its.begin();
|
for (vector<string>::const_iterator it = its.begin();
|
||||||
@ -86,7 +86,8 @@ class myTextSplitCB : public TextSplitCB {
|
|||||||
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
||||||
string dumb;
|
string dumb;
|
||||||
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
||||||
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
|
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
|
||||||
|
term.c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
||||||
@ -186,9 +187,9 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check if there is a NEAR match for the group of terms
|
// Check if there is a NEAR match for the group of terms
|
||||||
bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
|
||||||
{
|
{
|
||||||
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
|
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
||||||
vecStringToString(terms).c_str()));
|
vecStringToString(terms).c_str()));
|
||||||
|
|
||||||
// The position lists we are going to work with. We extract them from the
|
// The position lists we are going to work with. We extract them from the
|
||||||
@ -207,7 +208,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|||||||
it != terms.end(); it++) {
|
it != terms.end(); it++) {
|
||||||
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
||||||
if (pl == m_plists.end()) {
|
if (pl == m_plists.end()) {
|
||||||
LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
|
||||||
(*it).c_str()));
|
(*it).c_str()));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -215,10 +216,10 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|||||||
plistToTerm[&(pl->second)] = *it;
|
plistToTerm[&(pl->second)] = *it;
|
||||||
realgroup.push_back(*it);
|
realgroup.push_back(*it);
|
||||||
}
|
}
|
||||||
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n",
|
LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n",
|
||||||
window, vecStringToString(realgroup).c_str()));
|
window, vecStringToString(realgroup).c_str()));
|
||||||
if (plists.size() < 2) {
|
if (plists.size() < 2) {
|
||||||
LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
|
LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Sort the positions lists so that the shorter is first
|
// Sort the positions lists so that the shorter is first
|
||||||
@ -243,7 +244,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|||||||
int sta = int(10E9), sto = 0;
|
int sta = int(10E9), sto = 0;
|
||||||
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
||||||
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
|
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
|
||||||
LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n",
|
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
||||||
sta, sto));
|
sta, sto));
|
||||||
// Maybe extend the window by 1st term position, this was not
|
// Maybe extend the window by 1st term position, this was not
|
||||||
// done by do_prox..
|
// done by do_prox..
|
||||||
@ -253,7 +254,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|||||||
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
||||||
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
||||||
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
||||||
LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
|
LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
|
||||||
i1->second.first, i2->second.second));
|
i1->second.first, i2->second.second));
|
||||||
tboffs.push_back(pair<int, int>(i1->second.first,
|
tboffs.push_back(pair<int, int>(i1->second.first,
|
||||||
i2->second.second));
|
i2->second.second));
|
||||||
@ -278,7 +279,7 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Do the phrase match thing, then merge the highlight lists
|
// Do the phrase match thing, then merge the highlight lists
|
||||||
bool myTextSplitCB::matchGroups()
|
bool TextSplitPTR::matchGroups()
|
||||||
{
|
{
|
||||||
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
||||||
vector<int>::const_iterator sit = m_slacks.begin();
|
vector<int>::const_iterator sit = m_slacks.begin();
|
||||||
@ -333,15 +334,14 @@ bool PlainToRich::plaintorich(const string& in,
|
|||||||
// Compute the positions for the query terms. We use the text
|
// Compute the positions for the query terms. We use the text
|
||||||
// splitter to break the text into words, and compare the words to
|
// splitter to break the text into words, and compare the words to
|
||||||
// the search terms,
|
// the search terms,
|
||||||
myTextSplitCB cb(terms, groups, slacks);
|
TextSplitPTR splitter(terms, groups, slacks);
|
||||||
TextSplit splitter(&cb);
|
|
||||||
// Note: the splitter returns the term locations in byte, not
|
// Note: the splitter returns the term locations in byte, not
|
||||||
// character, offsets.
|
// character, offsets.
|
||||||
splitter.text_to_words(in);
|
splitter.text_to_words(in);
|
||||||
LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
|
LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
|
||||||
|
|
||||||
// Compute the positions for NEAR and PHRASE groups.
|
// Compute the positions for NEAR and PHRASE groups.
|
||||||
cb.matchGroups();
|
splitter.matchGroups();
|
||||||
|
|
||||||
out.clear();
|
out.clear();
|
||||||
out.push_back("");
|
out.push_back("");
|
||||||
@ -353,12 +353,12 @@ bool PlainToRich::plaintorich(const string& in,
|
|||||||
// Iterator for the list of input term positions. We use it to
|
// Iterator for the list of input term positions. We use it to
|
||||||
// output highlight tags and to compute term positions in the
|
// output highlight tags and to compute term positions in the
|
||||||
// output text
|
// output text
|
||||||
vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
|
vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
|
||||||
vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();
|
vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
|
for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
|
||||||
it != cb.tboffs.end(); it++) {
|
it != splitter.tboffs.end(); it++) {
|
||||||
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
|
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -412,7 +412,7 @@ bool PlainToRich::plaintorich(const string& in,
|
|||||||
}
|
}
|
||||||
// Skip all highlight areas that would overlap this one
|
// Skip all highlight areas that would overlap this one
|
||||||
int crend = tPosIt->second;
|
int crend = tPosIt->second;
|
||||||
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
|
while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
|
||||||
tPosIt++;
|
tPosIt++;
|
||||||
inrcltag = 0;
|
inrcltag = 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,6 +42,7 @@ using namespace std;
|
|||||||
#include "internfile.h"
|
#include "internfile.h"
|
||||||
#include "wipedir.h"
|
#include "wipedir.h"
|
||||||
#include "transcode.h"
|
#include "transcode.h"
|
||||||
|
#include "textsplit.h"
|
||||||
|
|
||||||
bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& idoc)
|
bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& idoc)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -31,6 +31,7 @@ using std::list;
|
|||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "rclconfig.h"
|
#include "rclconfig.h"
|
||||||
#include "refcntr.h"
|
#include "refcntr.h"
|
||||||
|
#include "textsplit.h"
|
||||||
|
|
||||||
Rcl::SearchData *wasaStringToRcl(const string &qs, string &reason)
|
Rcl::SearchData *wasaStringToRcl(const string &qs, string &reason)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -781,16 +781,15 @@ bool Db::fieldToPrefix(const string& fld, string &pfx)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// The text splitter callback class which receives words from the
|
// The splitter breaks text into words and adds postings to the Xapian document.
|
||||||
// splitter and adds postings to the Xapian document.
|
class TextSplitDb : public TextSplit {
|
||||||
class mySplitterCB : public TextSplitCB {
|
|
||||||
public:
|
public:
|
||||||
Xapian::Document &doc; // Xapian document
|
Xapian::Document &doc; // Xapian document
|
||||||
Xapian::termpos basepos; // Base for document section
|
Xapian::termpos basepos; // Base for document section
|
||||||
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
||||||
// following section
|
// following section
|
||||||
StopList &stops;
|
StopList &stops;
|
||||||
mySplitterCB(Xapian::Document &d, StopList &_stops)
|
TextSplitDb(Xapian::Document &d, StopList &_stops)
|
||||||
: doc(d), basepos(1), curpos(0), stops(_stops)
|
: doc(d), basepos(1), curpos(0), stops(_stops)
|
||||||
{}
|
{}
|
||||||
bool takeword(const std::string &term, int pos, int, int);
|
bool takeword(const std::string &term, int pos, int, int);
|
||||||
@ -802,15 +801,16 @@ private:
|
|||||||
string prefix;
|
string prefix;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Callback for the document to word splitting class during indexation
|
// Get one term from the doc, remove accents and lowercase, then add posting
|
||||||
bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
|
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
|
||||||
{
|
{
|
||||||
#if 0
|
#if 0
|
||||||
LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
|
LOGDEB(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
|
||||||
#endif
|
#endif
|
||||||
string term;
|
string term;
|
||||||
if (!unacmaybefold(_term, term, "UTF-8", true)) {
|
if (!unacmaybefold(_term, term, "UTF-8", true)) {
|
||||||
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
|
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
|
||||||
|
_term.c_str()));
|
||||||
term.clear();
|
term.clear();
|
||||||
// We don't generate a fatal error because of a bad term
|
// We don't generate a fatal error because of a bad term
|
||||||
return true;
|
return true;
|
||||||
@ -892,14 +892,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
Doc doc = idoc;
|
Doc doc = idoc;
|
||||||
|
|
||||||
Xapian::Document newdocument;
|
Xapian::Document newdocument;
|
||||||
mySplitterCB splitData(newdocument, m_stops);
|
TextSplitDb splitter(newdocument, m_stops);
|
||||||
TextSplit splitter(&splitData);
|
|
||||||
|
|
||||||
// Split and index file name as document term(s)
|
// Split and index file name as document term(s)
|
||||||
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
||||||
if (!splitter.text_to_words(doc.utf8fn))
|
if (!splitter.text_to_words(doc.utf8fn))
|
||||||
LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
|
LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
|
||||||
splitData.basepos += splitData.curpos + 100;
|
splitter.basepos += splitter.curpos + 100;
|
||||||
|
|
||||||
// Index textual metadata. These are all indexed as text with
|
// Index textual metadata. These are all indexed as text with
|
||||||
// positions, as we may want to do phrase searches with them (this
|
// positions, as we may want to do phrase searches with them (this
|
||||||
@ -919,19 +918,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
||||||
meta_it->first.c_str(), pfx.c_str(),
|
meta_it->first.c_str(), pfx.c_str(),
|
||||||
meta_it->second.c_str()));
|
meta_it->second.c_str()));
|
||||||
splitData.setprefix(pfx); // Subject
|
splitter.setprefix(pfx); // Subject
|
||||||
if (!splitter.text_to_words(meta_it->second))
|
if (!splitter.text_to_words(meta_it->second))
|
||||||
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
||||||
meta_it->first.c_str()));
|
meta_it->first.c_str()));
|
||||||
splitData.setprefix(string());
|
splitter.setprefix(string());
|
||||||
splitData.basepos += splitData.curpos + 100;
|
splitter.basepos += splitter.curpos + 100;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (splitData.curpos < baseTextPosition)
|
if (splitter.curpos < baseTextPosition)
|
||||||
splitData.basepos = baseTextPosition;
|
splitter.basepos = baseTextPosition;
|
||||||
else
|
else
|
||||||
splitData.basepos += splitData.curpos + 100;
|
splitter.basepos += splitter.curpos + 100;
|
||||||
|
|
||||||
// Split and index body text
|
// Split and index body text
|
||||||
LOGDEB2(("Db::add: split body\n"));
|
LOGDEB2(("Db::add: split body\n"));
|
||||||
|
|||||||
@ -188,25 +188,27 @@ void SearchData::getUTerms(vector<string>& terms) const
|
|||||||
// phrases. This is for parts of the user entry which would appear as
|
// phrases. This is for parts of the user entry which would appear as
|
||||||
// a single word because there is no white space inside, but are
|
// a single word because there is no white space inside, but are
|
||||||
// actually multiple terms to rcldb (ie term1,term2)
|
// actually multiple terms to rcldb (ie term1,term2)
|
||||||
class wsQData : public TextSplitCB {
|
class TextSplitQ : public TextSplit {
|
||||||
public:
|
public:
|
||||||
wsQData(const StopList &_stops)
|
TextSplitQ(Flags flags, const StopList &_stops)
|
||||||
: stops(_stops), alltermcount(0)
|
: TextSplit(flags), stops(_stops), alltermcount(0)
|
||||||
{}
|
{}
|
||||||
bool takeword(const std::string &interm, int , int, int) {
|
bool takeword(const std::string &interm, int , int, int) {
|
||||||
alltermcount++;
|
alltermcount++;
|
||||||
LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
|
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
|
||||||
|
|
||||||
// Check if the first letter is a majuscule in which
|
// Check if the first letter is a majuscule in which
|
||||||
// case we do not want to do stem expansion. Note that
|
// case we do not want to do stem expansion. Note that
|
||||||
// the test is convoluted and possibly problematic
|
// the test is convoluted and possibly problematic
|
||||||
string noacterm, noaclowterm;
|
string noacterm, noaclowterm;
|
||||||
if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
|
if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
|
||||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
|
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
||||||
|
interm.c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
|
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
||||||
|
noacterm.c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool nostemexp = false;
|
bool nostemexp = false;
|
||||||
@ -216,7 +218,8 @@ class wsQData : public TextSplitCB {
|
|||||||
nostemexp = true;
|
nostemexp = true;
|
||||||
|
|
||||||
if (stops.hasStops() && stops.isStop(noaclowterm)) {
|
if (stops.hasStops() && stops.isStop(noaclowterm)) {
|
||||||
LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
|
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
|
||||||
|
noaclowterm.c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
terms.push_back(noaclowterm);
|
terms.push_back(noaclowterm);
|
||||||
@ -271,7 +274,7 @@ private:
|
|||||||
// After splitting entry on whitespace: process non-phrase element
|
// After splitting entry on whitespace: process non-phrase element
|
||||||
void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
|
void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
|
||||||
// Process phrase/near element
|
// Process phrase/near element
|
||||||
void processPhraseOrNear(wsQData *splitData,
|
void processPhraseOrNear(TextSplitQ *splitData,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
bool useNear, int slack);
|
bool useNear, int slack);
|
||||||
|
|
||||||
@ -420,7 +423,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
|||||||
// NEAR xapian query, the elements of which can themselves be OR
|
// NEAR xapian query, the elements of which can themselves be OR
|
||||||
// queries if the terms get expanded by stemming or wildcards (we
|
// queries if the terms get expanded by stemming or wildcards (we
|
||||||
// don't do stemming for PHRASE though)
|
// don't do stemming for PHRASE though)
|
||||||
void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
|
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
bool useNear, int slack)
|
bool useNear, int slack)
|
||||||
{
|
{
|
||||||
@ -527,31 +530,31 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
// We now adjust the phrase/near slack by the term count
|
// We now adjust the phrase/near slack by the term count
|
||||||
// difference (this is mainly better for cjk where this is a very
|
// difference (this is mainly better for cjk where this is a very
|
||||||
// common occurrence because of the ngrams thing).
|
// common occurrence because of the ngrams thing).
|
||||||
wsQData splitDataS(stops), splitDataW(stops);
|
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||||
TextSplit splitterS(&splitDataS,
|
TextSplit::TXTS_KEEPWILD),
|
||||||
TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
stops);
|
||||||
TextSplit::TXTS_KEEPWILD));
|
|
||||||
splitterS.text_to_words(*it);
|
splitterS.text_to_words(*it);
|
||||||
TextSplit splitterW(&splitDataW,
|
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
||||||
TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
TextSplit::TXTS_KEEPWILD),
|
||||||
TextSplit::TXTS_KEEPWILD));
|
stops);
|
||||||
splitterW.text_to_words(*it);
|
splitterW.text_to_words(*it);
|
||||||
wsQData *splitData = &splitDataS;
|
TextSplitQ *splitter = &splitterS;
|
||||||
if (splitDataS.terms.size() > 1 &&
|
if (splitterS.terms.size() > 1 &&
|
||||||
splitDataS.terms.size() != splitDataW.terms.size()) {
|
splitterS.terms.size() != splitterW.terms.size()) {
|
||||||
slack += splitDataW.terms.size() - splitDataS.terms.size();
|
slack += splitterW.terms.size() - splitterS.terms.size();
|
||||||
// used to: splitData = &splitDataW;
|
// used to: splitData = &splitDataW;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
|
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
|
||||||
switch (splitData->terms.size()) {
|
switch (splitter->terms.size()) {
|
||||||
case 0:
|
case 0:
|
||||||
continue;// ??
|
continue;// ??
|
||||||
case 1:
|
case 1:
|
||||||
processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
|
processSimpleSpan(splitter->terms.front(),
|
||||||
|
splitter->nostemexps.front(), pqueries);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
processPhraseOrNear(splitData, pqueries, useNear, slack);
|
processPhraseOrNear(splitter, pqueries, useNear, slack);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
|
|||||||
@ -5,6 +5,7 @@ static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp
|
|||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
|
#include "textsplit.h"
|
||||||
#include "stoplist.h"
|
#include "stoplist.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -12,6 +13,21 @@ namespace Rcl
|
|||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
class TextSplitSW : public TextSplit {
|
||||||
|
public:
|
||||||
|
set<string>& stops;
|
||||||
|
TextSplitSW(Flags flags, set<string>& stps)
|
||||||
|
: TextSplit(flags), stops(stps)
|
||||||
|
{}
|
||||||
|
virtual bool takeword(const string& term, int, int, int)
|
||||||
|
{
|
||||||
|
string dterm;
|
||||||
|
unacmaybefold(term, dterm, "UTF-8", true);
|
||||||
|
stops.insert(dterm);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
bool StopList::setFile(const string &filename)
|
bool StopList::setFile(const string &filename)
|
||||||
{
|
{
|
||||||
m_hasStops = false;
|
m_hasStops = false;
|
||||||
@ -22,18 +38,9 @@ bool StopList::setFile(const string &filename)
|
|||||||
filename.c_str(), reason.c_str()));
|
filename.c_str(), reason.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
TextSplit ts(this, TextSplit::TXTS_ONLYSPANS);
|
TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
|
||||||
ts.text_to_words(stoptext);
|
ts.text_to_words(stoptext);
|
||||||
return true;
|
m_hasStops = !m_stops.empty();
|
||||||
}
|
|
||||||
|
|
||||||
bool StopList::takeword(const string& term, int, int, int)
|
|
||||||
{
|
|
||||||
string dterm;
|
|
||||||
unacmaybefold(term, dterm, "UTF-8", true);
|
|
||||||
LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str()));
|
|
||||||
m_hasStops = true;
|
|
||||||
m_stops.insert(dterm);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -5,8 +5,6 @@
|
|||||||
#include <set>
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "textsplit.h"
|
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using std::set;
|
using std::set;
|
||||||
using std::string;
|
using std::string;
|
||||||
@ -14,7 +12,7 @@ namespace Rcl
|
|||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
class StopList : public TextSplitCB {
|
class StopList {
|
||||||
public:
|
public:
|
||||||
StopList() : m_hasStops(false) {}
|
StopList() : m_hasStops(false) {}
|
||||||
StopList(const string &filename) {setFile(filename);}
|
StopList(const string &filename) {setFile(filename);}
|
||||||
@ -23,7 +21,6 @@ public:
|
|||||||
bool setFile(const string &filename);
|
bool setFile(const string &filename);
|
||||||
bool isStop(const string &term) const;
|
bool isStop(const string &term) const;
|
||||||
bool hasStops() const {return m_hasStops;}
|
bool hasStops() const {return m_hasStops;}
|
||||||
virtual bool takeword(const string& term, int pos, int bts, int bte);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool m_hasStops;
|
bool m_hasStops;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user