fixes in textsplit
This commit is contained in:
parent
2a020407da
commit
4c54a8478f
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#ifndef TEST_TEXTSPLIT
|
||||
|
||||
@ -26,7 +26,7 @@ using namespace std;
|
||||
|
||||
// Character classes: we have three main groups, and then some chars
|
||||
// are their own class because they want special handling.
|
||||
// We have an array with 256 slots where we keep the character states.
|
||||
// We have an array with 256 slots where we keep the character types.
|
||||
// The array could be fully static, but we use a small function to fill it
|
||||
// once.
|
||||
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
||||
@ -37,32 +37,40 @@ static void setcharclasses()
|
||||
if (init)
|
||||
return;
|
||||
unsigned int i;
|
||||
memset(charclasses, LETTER, sizeof(charclasses));
|
||||
for (i = 0 ; i < 256 ; i ++)
|
||||
charclasses[i] = LETTER;
|
||||
|
||||
for (i = 0; i < ' ';i++)
|
||||
charclasses[i] = SPACE;
|
||||
|
||||
char digits[] = "0123456789";
|
||||
for (i = 0; i < sizeof(digits); i++)
|
||||
for (i = 0; i < strlen(digits); i++)
|
||||
charclasses[int(digits[i])] = DIGIT;
|
||||
|
||||
char blankspace[] = "\t\v\f ";
|
||||
for (i = 0; i < sizeof(blankspace); i++)
|
||||
for (i = 0; i < strlen(blankspace); i++)
|
||||
charclasses[int(blankspace[i])] = SPACE;
|
||||
|
||||
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*";
|
||||
for (i = 0; i < sizeof(seps); i++)
|
||||
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?";
|
||||
for (i = 0; i < strlen(seps); i++)
|
||||
charclasses[int(seps[i])] = SPACE;
|
||||
|
||||
char special[] = ".@+-,#'\n\r";
|
||||
for (i = 0; i < sizeof(special); i++)
|
||||
for (i = 0; i < strlen(special); i++)
|
||||
charclasses[int(special[i])] = special[i];
|
||||
|
||||
init = 1;
|
||||
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
|
||||
}
|
||||
|
||||
bool TextSplit::emitterm(string &w, int pos, bool doerase,
|
||||
// Do some cleanup (the kind which is simpler to do here than in the main loop),
|
||||
// then send term to our client.
|
||||
bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
|
||||
int btstart, int btend)
|
||||
{
|
||||
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
||||
|
||||
if (fq && !isspan)
|
||||
return true;
|
||||
if (!cb)
|
||||
return false;
|
||||
|
||||
@ -73,13 +81,36 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase,
|
||||
case '.':
|
||||
case ',':
|
||||
case '@':
|
||||
case '\'':
|
||||
w.erase(w.length()-1);
|
||||
break;
|
||||
default:
|
||||
goto breakloop;
|
||||
goto breakloop1;
|
||||
}
|
||||
}
|
||||
breakloop1:
|
||||
|
||||
// In addition, it currently makes no sense to keep ' or , at the beginning of a term.
|
||||
// Strip leading ',' and '\'' characters from the term. Each pass removes the
// FIRST character (w[0], the one just tested). Erasing from the tail here
// would consume the whole word whenever it starts with one of these
// characters, because w[0] would never change until the string is empty.
while (w.length() > 0) {
    switch (w[0]) {
    case ',':
    case '\'':
        w.erase(0, 1);
        break;
    default:
        // First character is a keeper: leave the stripping loop.
        goto breakloop2;
    }
}
|
||||
breakloop2:
|
||||
|
||||
// 1 char word: we index single letters, but nothing else
|
||||
if (w.length() == 1) {
|
||||
int c = (int)w[0];
|
||||
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
||||
//cerr << "ERASING single letter term " << c << endl;
|
||||
w.erase();
|
||||
}
|
||||
}
|
||||
breakloop:
|
||||
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
|
||||
bool ret = cb->takeword(w, pos, btstart, btend);
|
||||
if (doerase)
|
||||
@ -113,10 +144,10 @@ bool TextSplit::text_to_words(const string &in)
|
||||
SPACE:
|
||||
if (word.length()) {
|
||||
if (span.length() != word.length()) {
|
||||
if (!emitterm(span, spanpos, true, i-span.length(), i))
|
||||
if (!emitterm(true, span, spanpos, true, i-span.length(), i))
|
||||
return false;
|
||||
}
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
}
|
||||
@ -126,42 +157,53 @@ bool TextSplit::text_to_words(const string &in)
|
||||
case '-':
|
||||
case '+':
|
||||
if (word.length() == 0) {
|
||||
if (i < in.length() || charclasses[int(in[i+1])] == DIGIT) {
|
||||
if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
|
||||
number = true;
|
||||
word += c;
|
||||
span += c;
|
||||
}
|
||||
} else {
|
||||
if (span.length() != word.length()) {
|
||||
if (!emitterm(span, spanpos, false, i-span.length(), i))
|
||||
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
|
||||
return false;
|
||||
}
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
span += c;
|
||||
}
|
||||
break;
|
||||
case '\'':
|
||||
case '@':
|
||||
if (word.length()) {
|
||||
if (span.length() != word.length()) {
|
||||
if (!emitterm(span, spanpos, false, i-span.length(), i))
|
||||
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
|
||||
return false;
|
||||
}
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
} else
|
||||
word += c;
|
||||
span += c;
|
||||
break;
|
||||
case '\'':
|
||||
if (word.length()) {
|
||||
if (span.length() != word.length()) {
|
||||
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
|
||||
return false;
|
||||
}
|
||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
span += c;
|
||||
}
|
||||
break;
|
||||
case '.':
|
||||
if (number) {
|
||||
word += c;
|
||||
} else {
|
||||
if (word.length()) {
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
} else
|
||||
@ -208,9 +250,9 @@ bool TextSplit::text_to_words(const string &in)
|
||||
}
|
||||
if (word.length()) {
|
||||
if (span.length() != word.length())
|
||||
if (!emitterm(span, spanpos, true, i-span.length(), i))
|
||||
if (!emitterm(true, span, spanpos, true, i-span.length(), i))
|
||||
return false;
|
||||
return emitterm(word, wordpos, true, i-word.length(), i);
|
||||
return emitterm(false, word, wordpos, true, i-word.length(), i);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -220,11 +262,13 @@ bool TextSplit::text_to_words(const string &in)
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "textsplit.h"
|
||||
#include "readfile.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -232,7 +276,7 @@ using namespace std;
|
||||
class mySplitterCB : public TextSplitCB {
|
||||
public:
|
||||
bool takeword(const std::string &term, int pos, int bs, int be) {
|
||||
cout << pos << " " << term << " bs " << bs << " be " << be << endl;
|
||||
printf("%3d %-20s %d %d\n", pos, term.c_str(), bs, be);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
@ -240,15 +284,18 @@ class mySplitterCB : public TextSplitCB {
|
||||
static string teststring =
|
||||
"jfd@okyz.com "
|
||||
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
|
||||
"a 134 +134 -14 -1.5 +1.5 1.54e10 a"
|
||||
"@^#$(#$(*)"
|
||||
"one\n\rtwo\nthree-\nfour"
|
||||
"[olala][ululu]"
|
||||
|
||||
"a 134 +134 -14 -1.5 +1.5 1.54e10 a "
|
||||
"@^#$(#$(*) "
|
||||
"one\n\rtwo\nthree-\nfour "
|
||||
"[olala][ululu] "
|
||||
"'o'brien' "
|
||||
"\n"
|
||||
;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||
DebugLog::setfilename("stderr");
|
||||
mySplitterCB cb;
|
||||
TextSplit splitter(&cb);
|
||||
if (argc == 2) {
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||
#define _TEXTSPLIT_H_INCLUDED_
|
||||
/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -22,14 +22,17 @@ class TextSplitCB {
|
||||
* but it's much simpler this way...
|
||||
*/
|
||||
class TextSplit {
|
||||
bool fq; // Are we splitting for query or index ?
|
||||
TextSplitCB *cb;
|
||||
int maxWordLength;
|
||||
bool emitterm(std::string &term, int pos, bool doerase, int, int);
|
||||
bool emitterm(bool isspan, std::string &term, int pos, bool doerase,
|
||||
int bs, int be);
|
||||
public:
|
||||
/**
|
||||
* Constructor: just store callback and client data
|
||||
*/
|
||||
TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
|
||||
TextSplit(TextSplitCB *t, bool forquery = false)
|
||||
: fq(forquery), cb(t), maxWordLength(40) {}
|
||||
/**
|
||||
* Split text, emit words and positions.
|
||||
*/
|
||||
|
||||
@ -23,11 +23,13 @@ unix {
|
||||
UI_DIR = .ui
|
||||
MOC_DIR = .moc
|
||||
OBJECTS_DIR = .obj
|
||||
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
|
||||
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \
|
||||
-lfontconfig -lfreetype -lexpat -lz
|
||||
INCLUDEPATH += ../common ../index ../query ../unac ../utils
|
||||
#QMAKE_LFLAGS_SHAPP += -static
|
||||
}
|
||||
|
||||
UNAME = $$system(uname -s)
|
||||
contains( UNAME, [lL]inux ) {
|
||||
LIBS -= -liconv
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,15 +80,14 @@ class myTextSplitCB : public TextSplitCB {
|
||||
static string plaintorich(const string &in, const list<string>& terms,
|
||||
list<pair<int, int> >&termoffsets)
|
||||
{
|
||||
#if 0
|
||||
{string t;
|
||||
for (list<string>::const_iterator it = terms.begin();it != terms.end();it++)
|
||||
t += "'" + *it + "' ";
|
||||
LOGDEB(("plaintorich: term: %s\n", t.c_str()));
|
||||
for (list<string>::const_iterator it = terms.begin();
|
||||
it != terms.end();it++) t += "'" + *it + "' ";
|
||||
LOGDEB(("plaintorich: terms: %s\n", t.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
myTextSplitCB cb(terms);
|
||||
TextSplit splitter(&cb);
|
||||
TextSplit splitter(&cb, true);
|
||||
splitter.text_to_words(in);
|
||||
string out1;
|
||||
if (cb.tboffs.empty()) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.3 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <strings.h>
|
||||
@ -45,11 +45,23 @@ static int op_flags;
|
||||
#define OPT_F 0x80
|
||||
#define OPT_E 0x100
|
||||
|
||||
Xapian::Database db;
|
||||
Xapian::Database *db;
|
||||
|
||||
static void cleanup()
|
||||
{
|
||||
delete db;
|
||||
}
|
||||
|
||||
static void sigcleanup(int sig)
|
||||
{
|
||||
fprintf(stderr, "sigcleanup\n");
|
||||
cleanup();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
string dbdir = "/home/dockes/tmp/xapiandb";
|
||||
string dbdir = "/home/dockes/.recoll/xapiandb";
|
||||
string outencoding = "ISO8859-1";
|
||||
int docid = 1;
|
||||
string aterm;
|
||||
@ -92,46 +104,57 @@ int main(int argc, char **argv)
|
||||
|
||||
if (argc != 0)
|
||||
Usage();
|
||||
atexit(cleanup);
|
||||
if (signal(SIGHUP, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGHUP, sigcleanup);
|
||||
if (signal(SIGINT, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGINT, sigcleanup);
|
||||
if (signal(SIGQUIT, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGQUIT, sigcleanup);
|
||||
if (signal(SIGTERM, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGTERM, sigcleanup);
|
||||
|
||||
try {
|
||||
db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN);
|
||||
db = new Xapian::Database(dbdir);
|
||||
|
||||
cout << "DB: ndocs " << db.get_doccount() << " lastdocid " <<
|
||||
db.get_lastdocid() << " avglength " << db.get_avlength() << endl;
|
||||
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
|
||||
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
|
||||
|
||||
if (op_flags & OPT_T) {
|
||||
Xapian::TermIterator term;
|
||||
string printable;
|
||||
if (op_flags & OPT_i) {
|
||||
for (term = db.termlist_begin(docid);
|
||||
term != db.termlist_end(docid);term++) {
|
||||
for (term = db->termlist_begin(docid);
|
||||
term != db->termlist_end(docid);term++) {
|
||||
transcode(*term, printable, "UTF-8", outencoding);
|
||||
cout << printable << endl;
|
||||
cout << "[" << printable << "]" << endl;
|
||||
}
|
||||
} else {
|
||||
for (term = db.allterms_begin();
|
||||
term != db.allterms_end();term++) {
|
||||
transcode(*term, printable, "UTF-8", outencoding);
|
||||
cout << printable << endl;
|
||||
for (term = db->allterms_begin();
|
||||
term != db->allterms_end();term++) {
|
||||
if (transcode(*term, printable, "UTF-8", outencoding))
|
||||
cout << "[" << printable << "]" << endl;
|
||||
else
|
||||
cout << "utf8[" << *term << "]" << endl;
|
||||
}
|
||||
}
|
||||
} else if (op_flags & OPT_D) {
|
||||
Xapian::Document doc = db.get_document(docid);
|
||||
Xapian::Document doc = db->get_document(docid);
|
||||
string data = doc.get_data();
|
||||
cout << data << endl;
|
||||
} else if (op_flags & OPT_P) {
|
||||
Xapian::PostingIterator doc;
|
||||
for (doc = db.postlist_begin(aterm);
|
||||
doc != db.postlist_end(aterm);doc++) {
|
||||
for (doc = db->postlist_begin(aterm);
|
||||
doc != db->postlist_end(aterm);doc++) {
|
||||
cout << *doc << endl;
|
||||
}
|
||||
|
||||
} else if (op_flags & OPT_F) {
|
||||
cout << "FreqFor " << aterm << " : " <<
|
||||
db.get_termfreq(aterm) << endl;
|
||||
db->get_termfreq(aterm) << endl;
|
||||
} else if (op_flags & OPT_E) {
|
||||
cout << "Exists " << aterm << " : " <<
|
||||
db.term_exists(aterm) << endl;
|
||||
db->term_exists(aterm) << endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
@ -454,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
|
||||
return false;
|
||||
|
||||
wsQData splitData;
|
||||
TextSplit splitter(&splitData);
|
||||
TextSplit splitter(&splitData, true);
|
||||
|
||||
string noacc;
|
||||
if (!dumb_string(querystring, noacc)) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.3 2005-02-01 17:20:06 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#ifndef TEST_EXECMD
|
||||
#include <unistd.h>
|
||||
@ -186,7 +186,6 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
while (argv[i]) cerr << argv[i++] << endl;}
|
||||
#endif
|
||||
|
||||
LOGDEB(("ExecCmd::doexec: execvp(%s)\n", cmd.c_str()));
|
||||
execvp(cmd.c_str(), (char *const*)argv);
|
||||
// Hu ho
|
||||
LOGERR(("ExecCmd::doexec: execvp(%s) failed. errno %d\n", cmd.c_str(),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user