phrases ok except for preview position
This commit is contained in:
parent
4c54a8478f
commit
4588803281
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.7 2005-02-08 10:56:13 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#ifndef TEST_TEXTSPLIT
|
#ifndef TEST_TEXTSPLIT
|
||||||
|
|
||||||
@ -65,12 +65,11 @@ static void setcharclasses()
|
|||||||
|
|
||||||
// Do some cleanup (the kind which is simpler to do here than in the main loop,
|
// Do some cleanup (the kind which is simpler to do here than in the main loop,
|
||||||
// then send term to our client.
|
// then send term to our client.
|
||||||
bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
|
bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||||
int btstart, int btend)
|
int btstart, int btend)
|
||||||
{
|
{
|
||||||
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
||||||
if (fq && !isspan)
|
|
||||||
return true;
|
|
||||||
if (!cb)
|
if (!cb)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -113,13 +112,32 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
|
|||||||
}
|
}
|
||||||
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
|
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
|
||||||
bool ret = cb->takeword(w, pos, btstart, btend);
|
bool ret = cb->takeword(w, pos, btstart, btend);
|
||||||
if (doerase)
|
|
||||||
w.erase();
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
|
||||||
|
bool spanerase, int bp)
|
||||||
|
{
|
||||||
|
// When splitting for query, we only emit final spans
|
||||||
|
if (fq && !spanerase) {
|
||||||
|
wordpos++;
|
||||||
|
word.erase();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
||||||
|
return false;
|
||||||
|
if (word.length() != span.length() && !fq)
|
||||||
|
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
|
||||||
|
return false;
|
||||||
|
wordpos++;
|
||||||
|
if (spanerase)
|
||||||
|
span.erase();
|
||||||
|
word.erase();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We basically emit a word every time we see a separator, but some chars are
|
* We basically emit a word every time we see a separator, but some chars are
|
||||||
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
|
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
|
||||||
@ -143,11 +161,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (span.length() != word.length()) {
|
if (!doemit(word, wordpos, span, spanpos, true, i))
|
||||||
if (!emitterm(true, span, spanpos, true, i-span.length(), i))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
@ -163,11 +177,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
span += c;
|
span += c;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (span.length() != word.length()) {
|
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||||
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += c;
|
span += c;
|
||||||
@ -175,11 +185,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
case '@':
|
case '@':
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (span.length() != word.length()) {
|
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||||
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
} else
|
} else
|
||||||
@ -188,11 +194,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
case '\'':
|
case '\'':
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (span.length() != word.length()) {
|
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||||
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += c;
|
span += c;
|
||||||
@ -202,8 +204,9 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (number) {
|
if (number) {
|
||||||
word += c;
|
word += c;
|
||||||
} else {
|
} else {
|
||||||
|
//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
} else
|
} else
|
||||||
@ -249,10 +252,8 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (span.length() != word.length())
|
if (!doemit(word, wordpos, span, spanpos, true, i))
|
||||||
if (!emitterm(true, span, spanpos, true, i-span.length(), i))
|
return false;
|
||||||
return false;
|
|
||||||
return emitterm(false, word, wordpos, true, i-word.length(), i);
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -282,10 +283,12 @@ class mySplitterCB : public TextSplitCB {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static string teststring =
|
static string teststring =
|
||||||
|
"le ta "
|
||||||
"jfd@okyz.com "
|
"jfd@okyz.com "
|
||||||
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
|
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
|
||||||
"a 134 +134 -14 -1.5 +1.5 1.54e10 a "
|
"a 134 +134 -14 -1.5 +1.5 1.54e10 a "
|
||||||
"@^#$(#$(*) "
|
"@^#$(#$(*) "
|
||||||
|
"192.168.4.1 "
|
||||||
"one\n\rtwo\nthree-\nfour "
|
"one\n\rtwo\nthree-\nfour "
|
||||||
"[olala][ululu] "
|
"[olala][ululu] "
|
||||||
"'o'brien' "
|
"'o'brien' "
|
||||||
@ -297,14 +300,14 @@ int main(int argc, char **argv)
|
|||||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||||
DebugLog::setfilename("stderr");
|
DebugLog::setfilename("stderr");
|
||||||
mySplitterCB cb;
|
mySplitterCB cb;
|
||||||
TextSplit splitter(&cb);
|
TextSplit splitter(&cb, true);
|
||||||
if (argc == 2) {
|
if (argc == 2) {
|
||||||
string data;
|
string data;
|
||||||
if (!file_to_string(argv[1], data))
|
if (!file_to_string(argv[1], data))
|
||||||
exit(1);
|
exit(1);
|
||||||
splitter.text_to_words(data);
|
splitter.text_to_words(data);
|
||||||
} else {
|
} else {
|
||||||
cout << teststring << endl;
|
cout << endl << teststring << endl << endl;
|
||||||
splitter.text_to_words(teststring);
|
splitter.text_to_words(teststring);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,11 @@
|
|||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.6 2005-02-08 10:56:13 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
using std::string;
|
||||||
|
#endif
|
||||||
|
|
||||||
// Function class whose called for every detected word
|
// Function class whose called for every detected word
|
||||||
class TextSplitCB {
|
class TextSplitCB {
|
||||||
@ -25,8 +28,9 @@ class TextSplit {
|
|||||||
bool fq; // Are we splitting for query or index ?
|
bool fq; // Are we splitting for query or index ?
|
||||||
TextSplitCB *cb;
|
TextSplitCB *cb;
|
||||||
int maxWordLength;
|
int maxWordLength;
|
||||||
bool emitterm(bool isspan, std::string &term, int pos, bool doerase,
|
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
|
||||||
int bs, int be);
|
bool doemit(string &word, int &wordpos, string &span, int spanpos,
|
||||||
|
bool spanerase, int bp);
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Constructor: just store callback and client data
|
* Constructor: just store callback and client data
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.21 2005-02-08 10:56:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -438,33 +438,62 @@ bool Rcl::Db::purge()
|
|||||||
class wsQData : public TextSplitCB {
|
class wsQData : public TextSplitCB {
|
||||||
public:
|
public:
|
||||||
vector<string> terms;
|
vector<string> terms;
|
||||||
|
string catterms() {
|
||||||
|
string s;
|
||||||
|
for (unsigned int i=0;i<terms.size();i++) {
|
||||||
|
s += "[" + terms[i] + "] ";
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
bool takeword(const std::string &term, int , int, int) {
|
bool takeword(const std::string &term, int , int, int) {
|
||||||
|
LOGDEB(("Takeword: %s\n", term.c_str()));
|
||||||
terms.push_back(term);
|
terms.push_back(term);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
bool Rcl::Db::setQuery(const std::string &querystring)
|
bool Rcl::Db::setQuery(const std::string &iqstring)
|
||||||
{
|
{
|
||||||
LOGDEB(("Rcl::Db::setQuery: %s\n", querystring.c_str()));
|
LOGDEB(("Rcl::Db::setQuery: %s\n", iqstring.c_str()));
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
if (!ndb)
|
if (!ndb)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
wsQData splitData;
|
string qstring;;
|
||||||
TextSplit splitter(&splitData, true);
|
if (!dumb_string(iqstring, qstring)) {
|
||||||
|
|
||||||
string noacc;
|
|
||||||
if (!dumb_string(querystring, noacc)) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
splitter.text_to_words(noacc);
|
|
||||||
|
|
||||||
|
// First extract phrases:
|
||||||
|
list<string> phrases;
|
||||||
|
ConfTree::stringToStrings(qstring, phrases);
|
||||||
|
for (list<string>::const_iterator i=phrases.begin();
|
||||||
|
i != phrases.end();i++) {
|
||||||
|
LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str()));
|
||||||
|
}
|
||||||
|
list<Xapian::Query> pqueries;
|
||||||
|
for (list<string>::const_iterator it = phrases.begin();
|
||||||
|
it != phrases.end(); it++) {
|
||||||
|
|
||||||
ndb->query = Xapian::Query(Xapian::Query::OP_OR, splitData.terms.begin(),
|
wsQData splitData;
|
||||||
splitData.terms.end());
|
TextSplit splitter(&splitData, true);
|
||||||
|
splitter.text_to_words(*it);
|
||||||
|
LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
|
||||||
|
switch(splitData.terms.size()) {
|
||||||
|
case 0: continue;// ??
|
||||||
|
case 1:
|
||||||
|
pqueries.push_back(Xapian::Query(splitData.terms.front()));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str()));
|
||||||
|
pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
|
||||||
|
splitData.terms.begin(),
|
||||||
|
splitData.terms.end()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ndb->query = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
|
||||||
|
pqueries.end());
|
||||||
delete ndb->enquire;
|
delete ndb->enquire;
|
||||||
ndb->enquire = new Xapian::Enquire(ndb->db);
|
ndb->enquire = new Xapian::Enquire(ndb->db);
|
||||||
ndb->enquire->set_query(ndb->query);
|
ndb->enquire->set_query(ndb->query);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user