fixes in textsplit

This commit is contained in:
dockes 2005-02-08 09:34:47 +00:00
parent 2a020407da
commit 4c54a8478f
7 changed files with 135 additions and 62 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_TEXTSPLIT
@ -26,7 +26,7 @@ using namespace std;
// Character classes: we have three main groups, and then some chars
// are their own class because they want special handling.
// We have an array with 256 slots where we keep the character states.
// We have an array with 256 slots where we keep the character types.
// The array could be fully static, but we use a small function to fill it
// once.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
@ -37,32 +37,40 @@ static void setcharclasses()
if (init)
return;
unsigned int i;
memset(charclasses, LETTER, sizeof(charclasses));
for (i = 0 ; i < 256 ; i ++)
charclasses[i] = LETTER;
for (i = 0; i < ' ';i++)
charclasses[i] = SPACE;
char digits[] = "0123456789";
for (i = 0; i < sizeof(digits); i++)
for (i = 0; i < strlen(digits); i++)
charclasses[int(digits[i])] = DIGIT;
char blankspace[] = "\t\v\f ";
for (i = 0; i < sizeof(blankspace); i++)
for (i = 0; i < strlen(blankspace); i++)
charclasses[int(blankspace[i])] = SPACE;
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*";
for (i = 0; i < sizeof(seps); i++)
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?";
for (i = 0; i < strlen(seps); i++)
charclasses[int(seps[i])] = SPACE;
char special[] = ".@+-,#'\n\r";
for (i = 0; i < sizeof(special); i++)
for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i];
init = 1;
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
}
bool TextSplit::emitterm(string &w, int pos, bool doerase,
// Do some cleanup (the kind which is simpler to do here than in the main
// loop), then send term to our client.
bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
int btstart, int btend)
{
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
if (fq && !isspan)
return true;
if (!cb)
return false;
@ -73,13 +81,36 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase,
case '.':
case ',':
case '@':
case '\'':
w.erase(w.length()-1);
break;
default:
goto breakloop;
goto breakloop1;
}
}
breakloop1:
// In addition, it doesn't make sense currently to keep ' at the beginning
while (w.length() > 0) {
switch (w[0]) {
case ',':
case '\'':
w.erase(w.length()-1);
break;
default:
goto breakloop2;
}
}
breakloop2:
// 1 char word: we index single letters and digits, but nothing else
if (w.length() == 1) {
    // Index through unsigned char: plain char may be signed, in which case
    // a byte above 127 would yield a negative index into the 256-slot
    // charclasses table.
    int c = static_cast<unsigned char>(w[0]);
    if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
        //cerr << "ERASING single letter term " << c << endl;
        w.erase();
    }
}
breakloop:
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
bool ret = cb->takeword(w, pos, btstart, btend);
if (doerase)
@ -113,10 +144,10 @@ bool TextSplit::text_to_words(const string &in)
SPACE:
if (word.length()) {
if (span.length() != word.length()) {
if (!emitterm(span, spanpos, true, i-span.length(), i))
if (!emitterm(true, span, spanpos, true, i-span.length(), i))
return false;
}
if (!emitterm(word, wordpos++, true, i-word.length(), i))
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false;
number = false;
}
@ -126,42 +157,53 @@ bool TextSplit::text_to_words(const string &in)
case '-':
case '+':
if (word.length() == 0) {
if (i < in.length() || charclasses[int(in[i+1])] == DIGIT) {
if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
number = true;
word += c;
span += c;
}
} else {
if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false, i-span.length(), i))
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(word, wordpos++, true, i-word.length(), i))
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false;
number = false;
span += c;
}
break;
case '\'':
case '@':
if (word.length()) {
if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false, i-span.length(), i))
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(word, wordpos++, true, i-word.length(), i))
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false;
number = false;
} else
word += c;
span += c;
break;
case '\'':
if (word.length()) {
if (span.length() != word.length()) {
if (!emitterm(true, span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false;
number = false;
span += c;
}
break;
case '.':
if (number) {
word += c;
} else {
if (word.length()) {
if (!emitterm(word, wordpos++, true, i-word.length(), i))
if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
return false;
number = false;
} else
@ -208,9 +250,9 @@ bool TextSplit::text_to_words(const string &in)
}
if (word.length()) {
if (span.length() != word.length())
if (!emitterm(span, spanpos, true, i-span.length(), i))
if (!emitterm(true, span, spanpos, true, i-span.length(), i))
return false;
return emitterm(word, wordpos, true, i-word.length(), i);
return emitterm(false, word, wordpos, true, i-word.length(), i);
}
return true;
}
@ -220,11 +262,13 @@ bool TextSplit::text_to_words(const string &in)
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <iostream>
#include "textsplit.h"
#include "readfile.h"
#include "debuglog.h"
using namespace std;
@ -232,7 +276,7 @@ using namespace std;
// Test driver callback: prints every term it receives and never asks the
// caller to stop.
class mySplitterCB : public TextSplitCB {
public:
// term: the term text; pos: the term position; bs, be: the two offsets
// passed along with the term (presumably byte start / byte end in the
// input -- confirm against TextSplit::emitterm).
// Always returns true.
bool takeword(const std::string &term, int pos, int bs, int be) {
cout << pos << " " << term << " bs " << bs << " be " << be << endl;
printf("%3d %-20s %d %d\n", pos, term.c_str(), bs, be);
return true;
}
};
@ -240,15 +284,18 @@ class mySplitterCB : public TextSplitCB {
static string teststring =
"jfd@okyz.com "
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
"a 134 +134 -14 -1.5 +1.5 1.54e10 a"
"@^#$(#$(*)"
"one\n\rtwo\nthree-\nfour"
"[olala][ululu]"
"a 134 +134 -14 -1.5 +1.5 1.54e10 a "
"@^#$(#$(*) "
"one\n\rtwo\nthree-\nfour "
"[olala][ululu] "
"'o'brien' "
"\n"
;
int main(int argc, char **argv)
{
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr");
mySplitterCB cb;
TextSplit splitter(&cb);
if (argc == 2) {

View File

@ -1,6 +1,6 @@
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
@ -22,14 +22,17 @@ class TextSplitCB {
* but it's much simpler this way...
*/
class TextSplit {
bool fq; // Are we splitting for query or index ?
TextSplitCB *cb;
int maxWordLength;
bool emitterm(std::string &term, int pos, bool doerase, int, int);
bool emitterm(bool isspan, std::string &term, int pos, bool doerase,
int bs, int be);
public:
/**
* Constructor: just store callback and client data
*/
TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
TextSplit(TextSplitCB *t, bool forquery = false)
: fq(forquery), cb(t), maxWordLength(40) {}
/**
* Split text, emit words and positions.
*/

View File

@ -23,11 +23,13 @@ unix {
UI_DIR = .ui
MOC_DIR = .moc
OBJECTS_DIR = .obj
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \
-lfontconfig -lfreetype -lexpat -lz
INCLUDEPATH += ../common ../index ../query ../unac ../utils
#QMAKE_LFLAGS_SHAPP += -static
}
UNAME = $$system(uname -s)
contains( UNAME, [lL]inux ) {
LIBS -= -liconv
}
}

View File

@ -80,15 +80,14 @@ class myTextSplitCB : public TextSplitCB {
static string plaintorich(const string &in, const list<string>& terms,
list<pair<int, int> >&termoffsets)
{
#if 0
{string t;
for (list<string>::const_iterator it = terms.begin();it != terms.end();it++)
t += "'" + *it + "' ";
LOGDEB(("plaintorich: term: %s\n", t.c_str()));
for (list<string>::const_iterator it = terms.begin();
it != terms.end();it++) t += "'" + *it + "' ";
LOGDEB(("plaintorich: terms: %s\n", t.c_str()));
}
#endif
myTextSplitCB cb(terms);
TextSplit splitter(&cb);
TextSplit splitter(&cb, true);
splitter.text_to_words(in);
string out1;
if (cb.tboffs.empty()) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.3 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <strings.h>
@ -45,11 +45,23 @@ static int op_flags;
#define OPT_F 0x80
#define OPT_E 0x100
Xapian::Database db;
Xapian::Database *db;
// Release the database object.
// This function is registered with atexit() in main() and is also called
// directly by sigcleanup() right before exit(), so it can run twice in the
// same process. The pointer is reset after deletion so that a second call
// deletes a null pointer (a no-op) instead of freeing the object twice.
static void cleanup()
{
    delete db;
    db = 0;
}
// Signal handler: main() installs it for SIGHUP, SIGINT, SIGQUIT and SIGTERM
// when those signals are not already being ignored.
// Prints a trace on stderr, releases the database through cleanup(), then
// terminates the process with exit status 1.
// sig: the signal number (not used).
// NOTE(review): fprintf() and exit() are not async-signal-safe per POSIX.
// Also, cleanup() is registered with atexit() in main(), so exit() will
// invoke it again after the explicit call below.
static void sigcleanup(int sig)
{
fprintf(stderr, "sigcleanup\n");
cleanup();
exit(1);
}
int main(int argc, char **argv)
{
string dbdir = "/home/dockes/tmp/xapiandb";
string dbdir = "/home/dockes/.recoll/xapiandb";
string outencoding = "ISO8859-1";
int docid = 1;
string aterm;
@ -92,46 +104,57 @@ int main(int argc, char **argv)
if (argc != 0)
Usage();
atexit(cleanup);
if (signal(SIGHUP, SIG_IGN) != SIG_IGN)
signal(SIGHUP, sigcleanup);
if (signal(SIGINT, SIG_IGN) != SIG_IGN)
signal(SIGINT, sigcleanup);
if (signal(SIGQUIT, SIG_IGN) != SIG_IGN)
signal(SIGQUIT, sigcleanup);
if (signal(SIGTERM, SIG_IGN) != SIG_IGN)
signal(SIGTERM, sigcleanup);
try {
db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN);
db = new Xapian::Database(dbdir);
cout << "DB: ndocs " << db.get_doccount() << " lastdocid " <<
db.get_lastdocid() << " avglength " << db.get_avlength() << endl;
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
if (op_flags & OPT_T) {
Xapian::TermIterator term;
string printable;
if (op_flags & OPT_i) {
for (term = db.termlist_begin(docid);
term != db.termlist_end(docid);term++) {
for (term = db->termlist_begin(docid);
term != db->termlist_end(docid);term++) {
transcode(*term, printable, "UTF-8", outencoding);
cout << printable << endl;
cout << "[" << printable << "]" << endl;
}
} else {
for (term = db.allterms_begin();
term != db.allterms_end();term++) {
transcode(*term, printable, "UTF-8", outencoding);
cout << printable << endl;
for (term = db->allterms_begin();
term != db->allterms_end();term++) {
if (transcode(*term, printable, "UTF-8", outencoding))
cout << "[" << printable << "]" << endl;
else
cout << "utf8[" << *term << "]" << endl;
}
}
} else if (op_flags & OPT_D) {
Xapian::Document doc = db.get_document(docid);
Xapian::Document doc = db->get_document(docid);
string data = doc.get_data();
cout << data << endl;
} else if (op_flags & OPT_P) {
Xapian::PostingIterator doc;
for (doc = db.postlist_begin(aterm);
doc != db.postlist_end(aterm);doc++) {
for (doc = db->postlist_begin(aterm);
doc != db->postlist_end(aterm);doc++) {
cout << *doc << endl;
}
} else if (op_flags & OPT_F) {
cout << "FreqFor " << aterm << " : " <<
db.get_termfreq(aterm) << endl;
db->get_termfreq(aterm) << endl;
} else if (op_flags & OPT_E) {
cout << "Exists " << aterm << " : " <<
db.term_exists(aterm) << endl;
db->term_exists(aterm) << endl;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -454,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
return false;
wsQData splitData;
TextSplit splitter(&splitData);
TextSplit splitter(&splitData, true);
string noacc;
if (!dumb_string(querystring, noacc)) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.3 2005-02-01 17:20:06 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_EXECMD
#include <unistd.h>
@ -186,7 +186,6 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
while (argv[i]) cerr << argv[i++] << endl;}
#endif
LOGDEB(("ExecCmd::doexec: execvp(%s)\n", cmd.c_str()));
execvp(cmd.c_str(), (char *const*)argv);
// Hu ho
LOGERR(("ExecCmd::doexec: execvp(%s) failed. errno %d\n", cmd.c_str(),