fixes in textsplit

This commit is contained in:
dockes 2005-02-08 09:34:47 +00:00
parent 2a020407da
commit 4c54a8478f
7 changed files with 135 additions and 62 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#ifndef TEST_TEXTSPLIT #ifndef TEST_TEXTSPLIT
@ -26,7 +26,7 @@ using namespace std;
// Character classes: we have three main groups, and then some chars // Character classes: we have three main groups, and then some chars
// are their own class because they want special handling. // are their own class because they want special handling.
// We have an array with 256 slots where we keep the character states. // We have an array with 256 slots where we keep the character types.
// The array could be fully static, but we use a small function to fill it // The array could be fully static, but we use a small function to fill it
// once. // once.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258}; enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
@ -37,32 +37,40 @@ static void setcharclasses()
if (init) if (init)
return; return;
unsigned int i; unsigned int i;
memset(charclasses, LETTER, sizeof(charclasses)); for (i = 0 ; i < 256 ; i ++)
charclasses[i] = LETTER;
for (i = 0; i < ' ';i++)
charclasses[i] = SPACE;
char digits[] = "0123456789"; char digits[] = "0123456789";
for (i = 0; i < sizeof(digits); i++) for (i = 0; i < strlen(digits); i++)
charclasses[int(digits[i])] = DIGIT; charclasses[int(digits[i])] = DIGIT;
char blankspace[] = "\t\v\f "; char blankspace[] = "\t\v\f ";
for (i = 0; i < sizeof(blankspace); i++) for (i = 0; i < strlen(blankspace); i++)
charclasses[int(blankspace[i])] = SPACE; charclasses[int(blankspace[i])] = SPACE;
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*"; char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?";
for (i = 0; i < sizeof(seps); i++) for (i = 0; i < strlen(seps); i++)
charclasses[int(seps[i])] = SPACE; charclasses[int(seps[i])] = SPACE;
char special[] = ".@+-,#'\n\r"; char special[] = ".@+-,#'\n\r";
for (i = 0; i < sizeof(special); i++) for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i]; charclasses[int(special[i])] = special[i];
init = 1; init = 1;
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
} }
bool TextSplit::emitterm(string &w, int pos, bool doerase, // Do some cleanup (the kind which is simpler to do here than in the main loop),
// then send term to our client.
bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
int btstart, int btend) int btstart, int btend)
{ {
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
if (fq && !isspan)
return true;
if (!cb) if (!cb)
return false; return false;
@ -73,13 +81,36 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase,
case '.': case '.':
case ',': case ',':
case '@': case '@':
case '\'':
w.erase(w.length()-1); w.erase(w.length()-1);
break; break;
default: default:
goto breakloop; goto breakloop1;
}
}
breakloop1:
// In addition, it doesn't make sense currently to keep ' at the beginning
while (w.length() > 0) {
switch (w[0]) {
case ',':
case '\'':
w.erase(w.length()-1);
break;
default:
goto breakloop2;
}
}
breakloop2:
// 1 char word: we index single letters and digits, but nothing else
if (w.length() == 1) {
int c = (int)w[0];
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
//cerr << "ERASING single letter term " << c << endl;
w.erase();
} }
} }
breakloop:
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
bool ret = cb->takeword(w, pos, btstart, btend); bool ret = cb->takeword(w, pos, btstart, btend);
if (doerase) if (doerase)
@ -113,10 +144,10 @@ bool TextSplit::text_to_words(const string &in)
SPACE: SPACE:
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) { if (span.length() != word.length()) {
if (!emitterm(span, spanpos, true, i-span.length(), i)) if (!emitterm(true, span, spanpos, true, i-span.length(), i))
return false; return false;
} }
if (!emitterm(word, wordpos++, true, i-word.length(), i)) if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} }
@ -126,42 +157,53 @@ bool TextSplit::text_to_words(const string &in)
case '-': case '-':
case '+': case '+':
if (word.length() == 0) { if (word.length() == 0) {
if (i < in.length() || charclasses[int(in[i+1])] == DIGIT) { if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
number = true; number = true;
word += c; word += c;
span += c; span += c;
} }
} else { } else {
if (span.length() != word.length()) { if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false, i-span.length(), i)) if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false; return false;
} }
if (!emitterm(word, wordpos++, true, i-word.length(), i)) if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
span += c; span += c;
} }
break; break;
case '\'':
case '@': case '@':
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) { if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false, i-span.length(), i)) if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false; return false;
} }
if (!emitterm(word, wordpos++, true, i-word.length(), i)) if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} else } else
word += c; word += c;
span += c; span += c;
break; break;
case '\'':
if (word.length()) {
if (span.length() != word.length()) {
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false;
number = false;
span += c;
}
break;
case '.': case '.':
if (number) { if (number) {
word += c; word += c;
} else { } else {
if (word.length()) { if (word.length()) {
if (!emitterm(word, wordpos++, true, i-word.length(), i)) if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} else } else
@ -208,9 +250,9 @@ bool TextSplit::text_to_words(const string &in)
} }
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) if (span.length() != word.length())
if (!emitterm(span, spanpos, true, i-span.length(), i)) if (!emitterm(true, span, spanpos, true, i-span.length(), i))
return false; return false;
return emitterm(word, wordpos, true, i-word.length(), i); return emitterm(false, word, wordpos, true, i-word.length(), i);
} }
return true; return true;
} }
@ -220,11 +262,13 @@ bool TextSplit::text_to_words(const string &in)
#include <unistd.h> #include <unistd.h>
#include <errno.h> #include <errno.h>
#include <fcntl.h> #include <fcntl.h>
#include <stdio.h>
#include <iostream> #include <iostream>
#include "textsplit.h" #include "textsplit.h"
#include "readfile.h" #include "readfile.h"
#include "debuglog.h"
using namespace std; using namespace std;
@ -232,7 +276,7 @@ using namespace std;
class mySplitterCB : public TextSplitCB { class mySplitterCB : public TextSplitCB {
public: public:
bool takeword(const std::string &term, int pos, int bs, int be) { bool takeword(const std::string &term, int pos, int bs, int be) {
cout << pos << " " << term << " bs " << bs << " be " << be << endl; printf("%3d %-20s %d %d\n", pos, term.c_str(), bs, be);
return true; return true;
} }
}; };
@ -240,15 +284,18 @@ class mySplitterCB : public TextSplitCB {
static string teststring = static string teststring =
"jfd@okyz.com " "jfd@okyz.com "
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami " "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
"a 134 +134 -14 -1.5 +1.5 1.54e10 a" "a 134 +134 -14 -1.5 +1.5 1.54e10 a "
"@^#$(#$(*)" "@^#$(#$(*) "
"one\n\rtwo\nthree-\nfour" "one\n\rtwo\nthree-\nfour "
"[olala][ululu]" "[olala][ululu] "
"'o'brien' "
"\n"
; ;
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr");
mySplitterCB cb; mySplitterCB cb;
TextSplit splitter(&cb); TextSplit splitter(&cb);
if (argc == 2) { if (argc == 2) {

View File

@ -1,6 +1,6 @@
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
@ -22,14 +22,17 @@ class TextSplitCB {
* but it's much simpler this way... * but it's much simpler this way...
*/ */
class TextSplit { class TextSplit {
bool fq; // Are we splitting for query or index ?
TextSplitCB *cb; TextSplitCB *cb;
int maxWordLength; int maxWordLength;
bool emitterm(std::string &term, int pos, bool doerase, int, int); bool emitterm(bool isspan, std::string &term, int pos, bool doerase,
int bs, int be);
public: public:
/** /**
* Constructor: just store callback and client data * Constructor: just store callback and client data
*/ */
TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {} TextSplit(TextSplitCB *t, bool forquery = false)
: fq(forquery), cb(t), maxWordLength(40) {}
/** /**
* Split text, emit words and positions. * Split text, emit words and positions.
*/ */

View File

@ -23,11 +23,13 @@ unix {
UI_DIR = .ui UI_DIR = .ui
MOC_DIR = .moc MOC_DIR = .moc
OBJECTS_DIR = .obj OBJECTS_DIR = .obj
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \
-lfontconfig -lfreetype -lexpat -lz
INCLUDEPATH += ../common ../index ../query ../unac ../utils INCLUDEPATH += ../common ../index ../query ../unac ../utils
#QMAKE_LFLAGS_SHAPP += -static
} }
UNAME = $$system(uname -s) UNAME = $$system(uname -s)
contains( UNAME, [lL]inux ) { contains( UNAME, [lL]inux ) {
LIBS -= -liconv LIBS -= -liconv
} }

View File

@ -80,15 +80,14 @@ class myTextSplitCB : public TextSplitCB {
static string plaintorich(const string &in, const list<string>& terms, static string plaintorich(const string &in, const list<string>& terms,
list<pair<int, int> >&termoffsets) list<pair<int, int> >&termoffsets)
{ {
#if 0
{string t; {string t;
for (list<string>::const_iterator it = terms.begin();it != terms.end();it++) for (list<string>::const_iterator it = terms.begin();
t += "'" + *it + "' "; it != terms.end();it++) t += "'" + *it + "' ";
LOGDEB(("plaintorich: term: %s\n", t.c_str())); LOGDEB(("plaintorich: terms: %s\n", t.c_str()));
} }
#endif
myTextSplitCB cb(terms); myTextSplitCB cb(terms);
TextSplit splitter(&cb); TextSplit splitter(&cb, true);
splitter.text_to_words(in); splitter.text_to_words(in);
string out1; string out1;
if (cb.tboffs.empty()) { if (cb.tboffs.empty()) {

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.3 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: xadump.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <strings.h> #include <strings.h>
@ -45,11 +45,23 @@ static int op_flags;
#define OPT_F 0x80 #define OPT_F 0x80
#define OPT_E 0x100 #define OPT_E 0x100
Xapian::Database db; Xapian::Database *db;
static void cleanup()
{
delete db;
}
static void sigcleanup(int sig)
{
fprintf(stderr, "sigcleanup\n");
cleanup();
exit(1);
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
string dbdir = "/home/dockes/tmp/xapiandb"; string dbdir = "/home/dockes/.recoll/xapiandb";
string outencoding = "ISO8859-1"; string outencoding = "ISO8859-1";
int docid = 1; int docid = 1;
string aterm; string aterm;
@ -92,46 +104,57 @@ int main(int argc, char **argv)
if (argc != 0) if (argc != 0)
Usage(); Usage();
atexit(cleanup);
if (signal(SIGHUP, SIG_IGN) != SIG_IGN)
signal(SIGHUP, sigcleanup);
if (signal(SIGINT, SIG_IGN) != SIG_IGN)
signal(SIGINT, sigcleanup);
if (signal(SIGQUIT, SIG_IGN) != SIG_IGN)
signal(SIGQUIT, sigcleanup);
if (signal(SIGTERM, SIG_IGN) != SIG_IGN)
signal(SIGTERM, sigcleanup);
try { try {
db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN); db = new Xapian::Database(dbdir);
cout << "DB: ndocs " << db.get_doccount() << " lastdocid " << cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
db.get_lastdocid() << " avglength " << db.get_avlength() << endl; db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
if (op_flags & OPT_T) { if (op_flags & OPT_T) {
Xapian::TermIterator term; Xapian::TermIterator term;
string printable; string printable;
if (op_flags & OPT_i) { if (op_flags & OPT_i) {
for (term = db.termlist_begin(docid); for (term = db->termlist_begin(docid);
term != db.termlist_end(docid);term++) { term != db->termlist_end(docid);term++) {
transcode(*term, printable, "UTF-8", outencoding); transcode(*term, printable, "UTF-8", outencoding);
cout << printable << endl; cout << "[" << printable << "]" << endl;
} }
} else { } else {
for (term = db.allterms_begin(); for (term = db->allterms_begin();
term != db.allterms_end();term++) { term != db->allterms_end();term++) {
transcode(*term, printable, "UTF-8", outencoding); if (transcode(*term, printable, "UTF-8", outencoding))
cout << printable << endl; cout << "[" << printable << "]" << endl;
else
cout << "utf8[" << *term << "]" << endl;
} }
} }
} else if (op_flags & OPT_D) { } else if (op_flags & OPT_D) {
Xapian::Document doc = db.get_document(docid); Xapian::Document doc = db->get_document(docid);
string data = doc.get_data(); string data = doc.get_data();
cout << data << endl; cout << data << endl;
} else if (op_flags & OPT_P) { } else if (op_flags & OPT_P) {
Xapian::PostingIterator doc; Xapian::PostingIterator doc;
for (doc = db.postlist_begin(aterm); for (doc = db->postlist_begin(aterm);
doc != db.postlist_end(aterm);doc++) { doc != db->postlist_end(aterm);doc++) {
cout << *doc << endl; cout << *doc << endl;
} }
} else if (op_flags & OPT_F) { } else if (op_flags & OPT_F) {
cout << "FreqFor " << aterm << " : " << cout << "FreqFor " << aterm << " : " <<
db.get_termfreq(aterm) << endl; db->get_termfreq(aterm) << endl;
} else if (op_flags & OPT_E) { } else if (op_flags & OPT_E) {
cout << "Exists " << aterm << " : " << cout << "Exists " << aterm << " : " <<
db.term_exists(aterm) << endl; db->term_exists(aterm) << endl;
} }

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -454,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
return false; return false;
wsQData splitData; wsQData splitData;
TextSplit splitter(&splitData); TextSplit splitter(&splitData, true);
string noacc; string noacc;
if (!dumb_string(querystring, noacc)) { if (!dumb_string(querystring, noacc)) {

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.3 2005-02-01 17:20:06 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: execmd.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#ifndef TEST_EXECMD #ifndef TEST_EXECMD
#include <unistd.h> #include <unistd.h>
@ -186,7 +186,6 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
while (argv[i]) cerr << argv[i++] << endl;} while (argv[i]) cerr << argv[i++] << endl;}
#endif #endif
LOGDEB(("ExecCmd::doexec: execvp(%s)\n", cmd.c_str()));
execvp(cmd.c_str(), (char *const*)argv); execvp(cmd.c_str(), (char *const*)argv);
// Hu ho // Hu ho
LOGERR(("ExecCmd::doexec: execvp(%s) failed. errno %d\n", cmd.c_str(), LOGERR(("ExecCmd::doexec: execvp(%s) failed. errno %d\n", cmd.c_str(),