phrases ok except for preview position

This commit is contained in:
dockes 2005-02-08 10:56:13 +00:00
parent 4c54a8478f
commit 4588803281
3 changed files with 84 additions and 48 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.7 2005-02-08 10:56:13 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#ifndef TEST_TEXTSPLIT #ifndef TEST_TEXTSPLIT
@ -65,12 +65,11 @@ static void setcharclasses()
// Do some cleanup (the kind which is simpler to do here than in the main loop, // Do some cleanup (the kind which is simpler to do here than in the main loop,
// then send term to our client. // then send term to our client.
bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase, bool TextSplit::emitterm(bool isspan, string &w, int pos,
int btstart, int btend) int btstart, int btend)
{ {
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
if (fq && !isspan)
return true;
if (!cb) if (!cb)
return false; return false;
@ -113,13 +112,32 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
} }
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
bool ret = cb->takeword(w, pos, btstart, btend); bool ret = cb->takeword(w, pos, btstart, btend);
if (doerase)
w.erase();
return ret; return ret;
} }
return true; return true;
} }
/**
 * Hand the current span (and possibly the current word) to emitterm(), then
 * consume the word.
 *
 * @param word      current word buffer; erased whenever true is returned
 * @param wordpos   word position, passed to emitterm() for the word and
 *                  incremented whenever true is returned
 * @param span      current span buffer; erased only when spanerase is set
 *                  and the span was emitted
 * @param spanpos   position passed to emitterm() for the span
 * @param spanerase true when the span is final and must be reset afterwards
 * @param bp        offset passed to emitterm() as the end of the term; the
 *                  start offset is computed as bp minus the term length
 * @return false as soon as an emitterm() call returns false (nothing is
 *         erased or incremented in that case), true otherwise
 */
bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
                       bool spanerase, int bp)
{
    // When splitting for query, we only emit final spans: a non-final call
    // skips emission entirely and just consumes the word below.
    const bool emitnow = !fq || spanerase;
    if (emitnow) {
        if (!emitterm(true, span, spanpos, bp - span.length(), bp)) {
            return false;
        }
        // The word on its own is only emitted when not splitting for query,
        // and only if its length differs from the span's. The lengths are
        // compared after the span has been through emitterm().
        const bool wordneeded = !fq && word.length() != span.length();
        if (wordneeded &&
            !emitterm(false, word, wordpos, bp - word.length(), bp)) {
            return false;
        }
        if (spanerase) {
            span.erase();
        }
    }
    ++wordpos;
    word.erase();
    return true;
}
/* /*
* We basically emit a word every time we see a separator, but some chars are * We basically emit a word every time we see a separator, but some chars are
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc, * handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
@ -143,11 +161,7 @@ bool TextSplit::text_to_words(const string &in)
case SPACE: case SPACE:
SPACE: SPACE:
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) { if (!doemit(word, wordpos, span, spanpos, true, i))
if (!emitterm(true, span, spanpos, true, i-span.length(), i))
return false;
}
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} }
@ -163,11 +177,7 @@ bool TextSplit::text_to_words(const string &in)
span += c; span += c;
} }
} else { } else {
if (span.length() != word.length()) { if (!doemit(word, wordpos, span, spanpos, false, i))
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
span += c; span += c;
@ -175,11 +185,7 @@ bool TextSplit::text_to_words(const string &in)
break; break;
case '@': case '@':
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) { if (!doemit(word, wordpos, span, spanpos, false, i))
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} else } else
@ -188,11 +194,7 @@ bool TextSplit::text_to_words(const string &in)
break; break;
case '\'': case '\'':
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) { if (!doemit(word, wordpos, span, spanpos, false, i))
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
span += c; span += c;
@ -202,8 +204,9 @@ bool TextSplit::text_to_words(const string &in)
if (number) { if (number) {
word += c; word += c;
} else { } else {
//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
if (word.length()) { if (word.length()) {
if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) if (!doemit(word, wordpos, span, spanpos, false, i))
return false; return false;
number = false; number = false;
} else } else
@ -249,10 +252,8 @@ bool TextSplit::text_to_words(const string &in)
} }
} }
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) if (!doemit(word, wordpos, span, spanpos, true, i))
if (!emitterm(true, span, spanpos, true, i-span.length(), i)) return false;
return false;
return emitterm(false, word, wordpos, true, i-word.length(), i);
} }
return true; return true;
} }
@ -282,10 +283,12 @@ class mySplitterCB : public TextSplitCB {
}; };
static string teststring = static string teststring =
"le ta "
"jfd@okyz.com " "jfd@okyz.com "
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami " "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
"a 134 +134 -14 -1.5 +1.5 1.54e10 a " "a 134 +134 -14 -1.5 +1.5 1.54e10 a "
"@^#$(#$(*) " "@^#$(#$(*) "
"192.168.4.1 "
"one\n\rtwo\nthree-\nfour " "one\n\rtwo\nthree-\nfour "
"[olala][ululu] " "[olala][ululu] "
"'o'brien' " "'o'brien' "
@ -297,14 +300,14 @@ int main(int argc, char **argv)
DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr"); DebugLog::setfilename("stderr");
mySplitterCB cb; mySplitterCB cb;
TextSplit splitter(&cb); TextSplit splitter(&cb, true);
if (argc == 2) { if (argc == 2) {
string data; string data;
if (!file_to_string(argv[1], data)) if (!file_to_string(argv[1], data))
exit(1); exit(1);
splitter.text_to_words(data); splitter.text_to_words(data);
} else { } else {
cout << teststring << endl; cout << endl << teststring << endl << endl;
splitter.text_to_words(teststring); splitter.text_to_words(teststring);
} }

View File

@ -1,8 +1,11 @@
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.6 2005-02-08 10:56:13 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES
using std::string;
#endif
// Function class whose takeword() method is called for every detected word // Function class whose takeword() method is called for every detected word
class TextSplitCB { class TextSplitCB {
@ -25,8 +28,9 @@ class TextSplit {
bool fq; // Are we splitting for query or index ? bool fq; // Are we splitting for query or index ?
TextSplitCB *cb; TextSplitCB *cb;
int maxWordLength; int maxWordLength;
bool emitterm(bool isspan, std::string &term, int pos, bool doerase, bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
int bs, int be); bool doemit(string &word, int &wordpos, string &span, int spanpos,
bool spanerase, int bp);
public: public:
/** /**
* Constructor: just store callback and client data * Constructor: just store callback and client data

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.21 2005-02-08 10:56:12 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -438,33 +438,62 @@ bool Rcl::Db::purge()
class wsQData : public TextSplitCB { class wsQData : public TextSplitCB {
public: public:
vector<string> terms; vector<string> terms;
// Return every collected term wrapped as "[term] " and concatenated in
// collection order; an empty string when no term was collected.
string catterms() {
    string joined;
    for (vector<string>::const_iterator it = terms.begin();
         it != terms.end(); ++it) {
        joined += "[";
        joined += *it;
        joined += "] ";
    }
    return joined;
}
bool takeword(const std::string &term, int , int, int) { bool takeword(const std::string &term, int , int, int) {
LOGDEB(("Takeword: %s\n", term.c_str()));
terms.push_back(term); terms.push_back(term);
return true; return true;
} }
}; };
bool Rcl::Db::setQuery(const std::string &querystring) bool Rcl::Db::setQuery(const std::string &iqstring)
{ {
LOGDEB(("Rcl::Db::setQuery: %s\n", querystring.c_str())); LOGDEB(("Rcl::Db::setQuery: %s\n", iqstring.c_str()));
Native *ndb = (Native *)pdata; Native *ndb = (Native *)pdata;
if (!ndb) if (!ndb)
return false; return false;
wsQData splitData; string qstring;;
TextSplit splitter(&splitData, true); if (!dumb_string(iqstring, qstring)) {
string noacc;
if (!dumb_string(querystring, noacc)) {
return false; return false;
} }
splitter.text_to_words(noacc);
// First extract phrases:
list<string> phrases;
ConfTree::stringToStrings(qstring, phrases);
for (list<string>::const_iterator i=phrases.begin();
i != phrases.end();i++) {
LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str()));
}
list<Xapian::Query> pqueries;
for (list<string>::const_iterator it = phrases.begin();
it != phrases.end(); it++) {
ndb->query = Xapian::Query(Xapian::Query::OP_OR, splitData.terms.begin(), wsQData splitData;
splitData.terms.end()); TextSplit splitter(&splitData, true);
splitter.text_to_words(*it);
LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
switch(splitData.terms.size()) {
case 0: continue;// ??
case 1:
pqueries.push_back(Xapian::Query(splitData.terms.front()));
break;
default:
LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str()));
pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
splitData.terms.begin(),
splitData.terms.end()));
}
}
ndb->query = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
pqueries.end());
delete ndb->enquire; delete ndb->enquire;
ndb->enquire = new Xapian::Enquire(ndb->db); ndb->enquire = new Xapian::Enquire(ndb->db);
ndb->enquire->set_query(ndb->query); ndb->enquire->set_query(ndb->query);