From 4c54a8478f6cede5746b9219ca787d018f8dc8e1 Mon Sep 17 00:00:00 2001 From: dockes Date: Tue, 8 Feb 2005 09:34:47 +0000 Subject: [PATCH] fixes in textsplit --- src/common/textsplit.cpp | 105 +++++++++++++++++++++++++++----------- src/common/textsplit.h | 9 ++-- src/qtgui/recoll.pro | 6 ++- src/qtgui/recollmain.ui.h | 11 ++-- src/query/xadump.cpp | 59 ++++++++++++++------- src/rcldb/rcldb.cpp | 4 +- src/utils/execmd.cpp | 3 +- 7 files changed, 135 insertions(+), 62 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index c682017e..3999a9e1 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT @@ -26,7 +26,7 @@ using namespace std; // Character classes: we have three main groups, and then some chars // are their own class because they want special handling. -// We have an array with 256 slots where we keep the character states. +// We have an array with 256 slots where we keep the character types. // The array could be fully static, but we use a small function to fill it // once. enum CharClass {LETTER=256, SPACE=257, DIGIT=258}; @@ -37,32 +37,40 @@ static void setcharclasses() if (init) return; unsigned int i; - memset(charclasses, LETTER, sizeof(charclasses)); + for (i = 0 ; i < 256 ; i ++) + charclasses[i] = LETTER; + + for (i = 0; i < ' ';i++) + charclasses[i] = SPACE; char digits[] = "0123456789"; - for (i = 0; i < sizeof(digits); i++) + for (i = 0; i < strlen(digits); i++) charclasses[int(digits[i])] = DIGIT; char blankspace[] = "\t\v\f "; - for (i = 0; i < sizeof(blankspace); i++) + for (i = 0; i < strlen(blankspace); i++) charclasses[int(blankspace[i])] = SPACE; - char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*"; - for (i = 0; i < sizeof(seps); i++) + char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?"; + for (i = 0; i < strlen(seps); i++) charclasses[int(seps[i])] = SPACE; char special[] = ".@+-,#'\n\r"; - for (i = 0; i < sizeof(special); i++) + for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; init = 1; + //for (i=0;i<256;i++)cerr< "< 0) { + switch (w[0]) { + case ',': + case '\'': + w.erase(w.length()-1); + break; + default: + goto breakloop2; + } + } + breakloop2: + + // 1 char word: we index single letters, but nothing else + if (w.length() == 1) { + int c = (int)w[0]; + if (charclasses[c] != LETTER && charclasses[c] != DIGIT) { + //cerr << "ERASING single letter term " << c << endl; + w.erase(); } } - breakloop: if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { bool ret = cb->takeword(w, pos, btstart, btend); if (doerase) @@ -113,10 +144,10 @@ bool TextSplit::text_to_words(const string &in) SPACE: if (word.length()) { if (span.length() != word.length()) { - if (!emitterm(span, spanpos, true, i-span.length(), i)) + if (!emitterm(true, span, spanpos, true, i-span.length(), i)) return false; } - if (!emitterm(word, wordpos++, true, i-word.length(), i)) + if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) return false; number = false; } @@ -126,42 +157,53 @@ bool TextSplit::text_to_words(const string &in) case '-': case '+': if (word.length() == 0) { - if (i < in.length() || charclasses[int(in[i+1])] == DIGIT) { + if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) { number = true; word += c; span += c; } } else { if (span.length() != word.length()) { - if (!emitterm(span, spanpos, false, i-span.length(), i)) + if (!emitterm(true, span, spanpos, false, i-span.length(), i)) return false; } - if (!emitterm(word, wordpos++, true, i-word.length(), i)) + if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) return false; number = false; span += c; } break; - case '\'': case '@': if (word.length()) { if (span.length() != word.length()) { - if (!emitterm(span, spanpos, false, i-span.length(), i)) + if (!emitterm(true, span, spanpos, false, i-span.length(), i)) return false; } - if (!emitterm(word, wordpos++, true, i-word.length(), i)) + if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) return false; number = false; } else word += c; span += c; break; + case '\'': + if (word.length()) { + if (span.length() != word.length()) { + if (!emitterm(true, span, spanpos, false, i-span.length(), i)) + return false; + } + if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) + return false; + number = false; + span += c; + } + break; case '.': if (number) { word += c; } else { if (word.length()) { - if (!emitterm(word, wordpos++, true, i-word.length(), i)) + if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) return false; number = false; } else @@ -208,9 +250,9 @@ bool TextSplit::text_to_words(const string &in) } if (word.length()) { if (span.length() != word.length()) - if (!emitterm(span, spanpos, true, i-span.length(), i)) + if (!emitterm(true, span, spanpos, true, i-span.length(), i)) return false; - return emitterm(word, wordpos, true, i-word.length(), i); + return emitterm(false, word, wordpos, true, i-word.length(), i); } return true; } @@ -220,11 +262,13 @@ bool TextSplit::text_to_words(const string &in) #include #include #include +#include #include #include "textsplit.h" #include "readfile.h" +#include "debuglog.h" using namespace std; @@ -232,7 +276,7 @@ using namespace std; class mySplitterCB : public TextSplitCB { public: bool takeword(const std::string &term, int pos, int bs, int be) { - cout << pos << " " << term << " bs " << bs << " be " << be << endl; + printf("%3d %-20s %d %d\n", pos, term.c_str(), bs, be); return true; } }; @@ -240,15 +284,18 @@ class mySplitterCB : public TextSplitCB { static string teststring = "jfd@okyz.com " "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami " - "a 134 +134 -14 -1.5 +1.5 1.54e10 a" - "@^#$(#$(*)" - "one\n\rtwo\nthree-\nfour" - "[olala][ululu]" - + "a 134 +134 -14 -1.5 +1.5 1.54e10 a " + "@^#$(#$(*) " + "one\n\rtwo\nthree-\nfour " + "[olala][ululu] " + "'o'brien' " + "\n" ; int main(int argc, char **argv) { + DebugLog::getdbl()->setloglevel(DEBDEB1); + DebugLog::setfilename("stderr"); mySplitterCB cb; TextSplit splitter(&cb); if (argc == 2) { diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 0f1cb1af..adb9d4b0 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -1,6 +1,6 @@ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -22,14 +22,17 @@ class TextSplitCB { * but 'ts much simpler this way... */ class TextSplit { + bool fq; // Are we splitting for query or index ? TextSplitCB *cb; int maxWordLength; - bool emitterm(std::string &term, int pos, bool doerase, int, int); + bool emitterm(bool isspan, std::string &term, int pos, bool doerase, + int bs, int be); public: /** * Constructor: just store callback and client data */ - TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {} + TextSplit(TextSplitCB *t, bool forquery = false) + : fq(forquery), cb(t), maxWordLength(40) {} /** * Split text, emit words and positions. */ diff --git a/src/qtgui/recoll.pro b/src/qtgui/recoll.pro index 3f02d8f8..f641eefa 100644 --- a/src/qtgui/recoll.pro +++ b/src/qtgui/recoll.pro @@ -23,11 +23,13 @@ unix { UI_DIR = .ui MOC_DIR = .moc OBJECTS_DIR = .obj - LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv + LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \ + -lfontconfig -lfreetype -lexpat -lz INCLUDEPATH += ../common ../index ../query ../unac ../utils + #QMAKE_LFLAGS_SHAPP += -static } UNAME = $$system(uname -s) contains( UNAME, [lL]inux ) { LIBS -= -liconv -} \ No newline at end of file +} diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h index 2fb686eb..ffc1ad18 100644 --- a/src/qtgui/recollmain.ui.h +++ b/src/qtgui/recollmain.ui.h @@ -80,15 +80,14 @@ class myTextSplitCB : public TextSplitCB { static string plaintorich(const string &in, const list& terms, list >&termoffsets) { -#if 0 {string t; - for (list::const_iterator it = terms.begin();it != terms.end();it++) - t += "'" + *it + "' "; - LOGDEB(("plaintorich: term: %s\n", t.c_str())); + for (list::const_iterator it = terms.begin(); + it != terms.end();it++) t += "'" + *it + "' "; + LOGDEB(("plaintorich: terms: %s\n", t.c_str())); } -#endif + myTextSplitCB cb(terms); - TextSplit splitter(&cb); + TextSplit splitter(&cb, true); splitter.text_to_words(in); string out1; if (cb.tboffs.empty()) { diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp index 6b12a2f6..ae9f7613 100644 --- a/src/query/xadump.cpp +++ b/src/query/xadump.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: xadump.cpp,v 1.3 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: xadump.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -45,11 +45,23 @@ static int op_flags; #define OPT_F 0x80 #define OPT_E 0x100 -Xapian::Database db; +Xapian::Database *db; + +static void cleanup() +{ + delete db; +} + +static void sigcleanup(int sig) +{ + fprintf(stderr, "sigcleanup\n"); + cleanup(); + exit(1); +} int main(int argc, char **argv) { - string dbdir = "/home/dockes/tmp/xapiandb"; + string dbdir = "/home/dockes/.recoll/xapiandb"; string outencoding = "ISO8859-1"; int docid = 1; string aterm; @@ -92,46 +104,57 @@ int main(int argc, char **argv) if (argc != 0) Usage(); + atexit(cleanup); + if (signal(SIGHUP, SIG_IGN) != SIG_IGN) + signal(SIGHUP, sigcleanup); + if (signal(SIGINT, SIG_IGN) != SIG_IGN) + signal(SIGINT, sigcleanup); + if (signal(SIGQUIT, SIG_IGN) != SIG_IGN) + signal(SIGQUIT, sigcleanup); + if (signal(SIGTERM, SIG_IGN) != SIG_IGN) + signal(SIGTERM, sigcleanup); try { - db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN); + db = new Xapian::Database(dbdir); - cout << "DB: ndocs " << db.get_doccount() << " lastdocid " << - db.get_lastdocid() << " avglength " << db.get_avlength() << endl; + cout << "DB: ndocs " << db->get_doccount() << " lastdocid " << + db->get_lastdocid() << " avglength " << db->get_avlength() << endl; if (op_flags & OPT_T) { Xapian::TermIterator term; string printable; if (op_flags & OPT_i) { - for (term = db.termlist_begin(docid); - term != db.termlist_end(docid);term++) { + for (term = db->termlist_begin(docid); + term != db->termlist_end(docid);term++) { transcode(*term, printable, "UTF-8", outencoding); - cout << printable << endl; + cout << "[" << printable << "]" << endl; } } else { - for (term = db.allterms_begin(); - term != db.allterms_end();term++) { - transcode(*term, printable, "UTF-8", outencoding); - cout << printable << endl; + for (term = db->allterms_begin(); + term != db->allterms_end();term++) { + if (transcode(*term, printable, "UTF-8", outencoding)) + cout << "[" << printable << "]" << endl; + else + cout << "utf8[" << *term << "]" << endl; } } } else if (op_flags & OPT_D) { - Xapian::Document doc = db.get_document(docid); + Xapian::Document doc = db->get_document(docid); string data = doc.get_data(); cout << data << endl; } else if (op_flags & OPT_P) { Xapian::PostingIterator doc; - for (doc = db.postlist_begin(aterm); - doc != db.postlist_end(aterm);doc++) { + for (doc = db->postlist_begin(aterm); + doc != db->postlist_end(aterm);doc++) { cout << *doc << endl; } } else if (op_flags & OPT_F) { cout << "FreqFor " << aterm << " : " << - db.get_termfreq(aterm) << endl; + db->get_termfreq(aterm) << endl; } else if (op_flags & OPT_E) { cout << "Exists " << aterm << " : " << - db.term_exists(aterm) << endl; + db->term_exists(aterm) << endl; } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index d1f1a72e..bbe0583e 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -454,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring) return false; wsQData splitData; - TextSplit splitter(&splitData); + TextSplit splitter(&splitData, true); string noacc; if (!dumb_string(querystring, noacc)) { diff --git a/src/utils/execmd.cpp b/src/utils/execmd.cpp index 56490f0c..dfb4fc5d 100644 --- a/src/utils/execmd.cpp +++ b/src/utils/execmd.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: execmd.cpp,v 1.3 2005-02-01 17:20:06 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: execmd.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_EXECMD #include @@ -186,7 +186,6 @@ ExecCmd::doexec(const string &cmd, const list args, while (argv[i]) cerr << argv[i++] << endl;} #endif - LOGDEB(("ExecCmd::doexec: execvp(%s)\n", cmd.c_str())); execvp(cmd.c_str(), (char *const*)argv); // Hu ho LOGERR(("ExecCmd::doexec: execvp(%s) failed. errno %d\n", cmd.c_str(),