From 68fb37b5f9b4507efd929c36c142c3db618051ce Mon Sep 17 00:00:00 2001 From: dockes Date: Tue, 5 Apr 2005 09:35:35 +0000 Subject: [PATCH] *** empty log message *** --- src/common/rclinit.cpp | 40 ++++++++++++++++++++++ src/common/rclinit.h | 9 +++++ src/index/mimetype.cpp | 17 ++++++---- src/index/recollindex.cpp | 24 +++---------- src/internfile/internfile.cpp | 4 +-- src/internfile/mh_mail.cpp | 15 ++++++--- src/lib/Makefile | 8 +++-- src/qtgui/main.cpp | 17 +++------- src/rcldb/rcldb.cpp | 63 +++++++++++++++++++++++------------ src/sampleconf/mimemap | 14 +++++--- 10 files changed, 137 insertions(+), 74 deletions(-) create mode 100644 src/common/rclinit.cpp create mode 100644 src/common/rclinit.h diff --git a/src/common/rclinit.cpp b/src/common/rclinit.cpp new file mode 100644 index 00000000..2177afb0 --- /dev/null +++ b/src/common/rclinit.cpp @@ -0,0 +1,40 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: rclinit.cpp,v 1.1 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes"; +#endif + +#include +#include + +#include "debuglog.h" +#include "rclconfig.h" + +RclConfig *recollinit(void (*cleanup)(void), void (*sigcleanup)(int)) +{ + atexit(cleanup); + if (signal(SIGHUP, SIG_IGN) != SIG_IGN) + signal(SIGHUP, sigcleanup); + if (signal(SIGINT, SIG_IGN) != SIG_IGN) + signal(SIGINT, sigcleanup); + if (signal(SIGQUIT, SIG_IGN) != SIG_IGN) + signal(SIGQUIT, sigcleanup); + if (signal(SIGTERM, SIG_IGN) != SIG_IGN) + signal(SIGTERM, sigcleanup); + + DebugLog::getdbl()->setloglevel(DEBDEB1); + DebugLog::setfilename("stderr"); + RclConfig *config = new RclConfig; + if (!config || !config->ok()) { + fprintf(stderr, "Config could not be built\n"); + exit(1); + } + + string logfilename, loglevel; + if (config->getConfParam(string("logfilename"), logfilename)) + DebugLog::setfilename(logfilename.c_str()); + if (config->getConfParam(string("loglevel"), loglevel)) { + int lev = atoi(loglevel.c_str()); + DebugLog::getdbl()->setloglevel(lev); + } + + return config; +} diff --git a/src/common/rclinit.h b/src/common/rclinit.h new file mode 100644 index 00000000..cecfab63 --- /dev/null +++ b/src/common/rclinit.h @@ -0,0 +1,9 @@ +#ifndef _RCLINIT_H_INCLUDED_ +#define _RCLINIT_H_INCLUDED_ +/* @(#$Id: rclinit.h,v 1.1 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes */ + +class RclConfig; + +extern RclConfig *recollinit(void (*cleanup)(void), void (*sigcleanup)(int)); + +#endif /* _RCLINIT_H_INCLUDED_ */ diff --git a/src/index/mimetype.cpp b/src/index/mimetype.cpp index 5fbc2905..de502ea2 100644 --- a/src/index/mimetype.cpp +++ b/src/index/mimetype.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.7 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.8 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -13,6 +13,7 @@ using std::list; #include "debuglog.h" #include "execmd.h" #include "conftree.h" +#include "smallut.h" static string mimetypefromdata(const string &fn) { @@ -61,8 +62,9 @@ string mimetype(const string &fn, ConfTree *mtypes) it != stoplist.end(); it++) { if (it->length() > fn.length()) continue; - if (!fn.compare(fn.length() - it->length(), string::npos, *it)) { - LOGDEB1(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(), + if (!stringicmp(fn.substr(fn.length() - it->length(),string::npos), + *it)) { + LOGINFO(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(), it->c_str())); return ""; } @@ -82,13 +84,16 @@ string mimetype(const string &fn, ConfTree *mtypes) return mtype; } - // Look at file data ? Only when no suffix or always + // Look at file data ? Only when no suffix or always ? // Also 'file' is not that great for us. For exemple it will // mistake mail folders for simple text files if there is no 'Received' // header, which would be the case, for exemple in a 'Sent' folder. Also // I'm not sure that file -i exists on all systems - //if (suff.empty()) - return mimetypefromdata(fn); + + //if (suff.empty()) // causes problems with shifted files, like + // messages.1, messages.2 etc... + return mimetypefromdata(fn); + return ""; } diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index b29ac89a..0ebcfe18 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -1,11 +1,12 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.9 2005-02-01 08:42:56 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.10 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include #include "debuglog.h" +#include "rclinit.h" #include "indexer.h" ConfIndexer *indexer; @@ -25,24 +26,9 @@ static void sigcleanup(int sig) int main(int argc, const char **argv) { - atexit(cleanup); - if (signal(SIGHUP, SIG_IGN) != SIG_IGN) - signal(SIGHUP, sigcleanup); - if (signal(SIGINT, SIG_IGN) != SIG_IGN) - signal(SIGINT, sigcleanup); - if (signal(SIGQUIT, SIG_IGN) != SIG_IGN) - signal(SIGQUIT, sigcleanup); - if (signal(SIGTERM, SIG_IGN) != SIG_IGN) - signal(SIGTERM, sigcleanup); + RclConfig *config = recollinit(cleanup, sigcleanup); - DebugLog::getdbl()->setloglevel(DEBDEB1); - DebugLog::setfilename("stderr"); - RclConfig config; - if (!config.ok()) { - fprintf(stderr, "Config could not be built\n"); - exit(1); - } - indexer = new ConfIndexer(&config); - + indexer = new ConfIndexer(config); + exit(!indexer->index()); } diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 055acfb0..9c405281 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.4 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.5 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -88,7 +88,7 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, { mime = mimetype(fn, config->getMimeMap()); if (mime.empty()) { - // No mime type: not listed in our map. + // No mime type: not listed in our map, or present in stop list LOGDEB(("FileInterner::FileInterner: (no mime) [%s]\n", fn.c_str())); return; } diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 002e1103..21412747 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.3 2005-04-04 13:18:46 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.4 2005-04-05 09:35:35 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -202,7 +202,7 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc, doc.isMultipart(), doc.getSubType().c_str())); walkmime(conf, docout.text, doc, 0); - //LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str())); + // LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str())); return MimeHandler::MHDone; } @@ -231,21 +231,27 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, } else if (!stringicmp("alternative", doc.getSubType())) { std::vector::iterator ittxt, ithtml; ittxt = ithtml = doc.members.end(); - for (it = doc.members.begin(); it != doc.members.end();it++) { + int i = 1; + for (it = doc.members.begin(); it != doc.members.end();it++, i++) { // Get and parse content-type header Binc::HeaderItem hi; - if (!doc.h.getFirstHeader("Content-Type", hi)) + if (!it->h.getFirstHeader("Content-Type", hi)) { + LOGDEB(("No content-type header for part %d\n", i)); continue; + } MimeHeaderValue content_type; parseMimeHeaderValue(hi.getValue(), content_type); + LOGDEB2(("walkmime: C-type: %s\n",content_type.value.c_str())); if (!stringlowercmp("text/plain", content_type.value)) ittxt = it; else if (!stringlowercmp("text/html", content_type.value)) ithtml = it; } if (ittxt != doc.members.end()) { + LOGDEB2(("walkmime: alternative: chose text/plain part\n")) walkmime(cnf, out, *ittxt, depth+1); } else if (ithtml != doc.members.end()) { + LOGDEB2(("walkmime: alternative: chose text/html part\n")) walkmime(cnf, out, *ithtml, depth+1); } } @@ -337,5 +343,6 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, } out += string("\r\n") + transcoded; + LOGDEB2(("walkmime: out now: [%s]\n", out.c_str())); } } diff --git a/src/lib/Makefile b/src/lib/Makefile index 6b4cfea2..ffcf7b43 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -10,7 +10,7 @@ OBJS = conftree.o csguess.o debuglog.o \ execmd.o wipedir.o \ fstreewalk.o html.o mail.o htmlparse.o indexer.o internfile.o \ mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \ - rclconfig.o rcldb.o readfile.o smallut.o \ + rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \ textsplit.o transcode.o \ unacpp.o unac.o SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ @@ -20,8 +20,8 @@ SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ ../index/indexer.cpp ../common/internfile.cpp \ ../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \ ../common/myhtmlparse.cpp ../utils/pathut.cpp \ - ../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \ - ../utils/smallut.cpp \ + ../common/rclconfig.cpp ../common/rcldb.cpp ../common/rclinit.cpp \ + ../utils/readfile.cpp ../utils/smallut.cpp \ ../common/textsplit.cpp ../utils/transcode.cpp \ ../common/unacpp.cpp ../unac/unac.c @@ -66,6 +66,8 @@ pathut.o : ../utils/pathut.cpp $(CXX) $(CXXFLAGS) -c $< rclconfig.o : ../common/rclconfig.cpp $(CXX) $(CXXFLAGS) -c $< +rclinit.o : ../common/rclinit.cpp + $(CXX) $(CXXFLAGS) -c $< rcldb.o : ../common/rcldb.cpp $(CXX) $(CXXFLAGS) -c $< readfile.o : ../utils/readfile.cpp diff --git a/src/qtgui/main.cpp b/src/qtgui/main.cpp index db8ca82d..18647cbf 100644 --- a/src/qtgui/main.cpp +++ b/src/qtgui/main.cpp @@ -12,6 +12,7 @@ #include "recoll.h" #include "smallut.h" #include "wipedir.h" +#include "rclinit.h" RclConfig *rclconfig; Rcl::Db *rcldb; @@ -33,10 +34,9 @@ void recollCleanup() } } - static void sigcleanup(int) { - fprintf(stderr, "sigcleanup\n"); + // fprintf(stderr, "sigcleanup\n"); // Cant call exit from here, because the atexit cleanup does some // thread stuff that we can't do from signal context. // Just set a flag and let the watchdog timer do the work @@ -54,23 +54,14 @@ int main( int argc, char ** argv ) w.connect(timer, SIGNAL(timeout()), &w, SLOT(checkExit())); timer->start(100); - atexit(recollCleanup); - if (signal(SIGHUP, SIG_IGN) != SIG_IGN) - signal(SIGHUP, sigcleanup); - if (signal(SIGINT, SIG_IGN) != SIG_IGN) - signal(SIGINT, sigcleanup); - if (signal(SIGQUIT, SIG_IGN) != SIG_IGN) - signal(SIGQUIT, sigcleanup); - if (signal(SIGTERM, SIG_IGN) != SIG_IGN) - signal(SIGTERM, sigcleanup); + rclconfig = recollinit(recollCleanup, sigcleanup); - - rclconfig = new RclConfig; if (!rclconfig || !rclconfig->ok()) { QMessageBox::critical(0, "Recoll", QString("Could not find configuration")); exit(1); } + string dbdir; if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) { // Note: this will have to be replaced by a call to a diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index ff046764..94a8bc71 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.26 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.27 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -317,9 +317,17 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) splitter.text_to_words(noacc); newdocument.add_term("T" + doc.mimetype); - string pathterm = doc.ipath.empty() ? - "P" + fn : "P" + fn + "|" + doc.ipath; + + string pathterm = "P" + fn; newdocument.add_term(pathterm); + + string uniterm; + if (!doc.ipath.empty()) { + uniterm = "Q" + fn + "|" + doc.ipath; + newdocument.add_term(uniterm); + } + + const char *fnc = fn.c_str(); // Document data record. omindex has the following nl separated fields: @@ -342,7 +350,6 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) LOGDEB1(("Newdocument data: %s\n", record.c_str())); newdocument.set_data(record); - time_t mtime = atol(doc.mtime.c_str()); struct tm *tm = localtime(&mtime); char buf[9]; @@ -360,7 +367,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) // entry. try { Xapian::docid did = - ndb->wdb.replace_document(pathterm, newdocument); + ndb->wdb.replace_document(uniterm.empty() ? pathterm : uniterm, + newdocument); if (did < ndb->updated.size()) { ndb->updated[did] = true; LOGDEB(("Rcl::Db::add: docid %d updated [%s , %s]\n", did, fnc, @@ -385,33 +393,44 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) return false; Native *ndb = (Native *)pdata; + // If no document exist with this path, we do need update string pathterm = "P" + filename; if (!ndb->wdb.term_exists(pathterm)) { - pathterm += string("|") + "1"; - if (!ndb->wdb.term_exists(pathterm)) { - return true; - } + return true; } + + // Look for all documents with this path. Check the update time (once). + // If the db is up to date, set the update flags for all documents Xapian::PostingIterator doc; try { - Xapian::PostingIterator did = ndb->wdb.postlist_begin(pathterm); - if (did == ndb->wdb.postlist_end(pathterm)) - return true; - Xapian::Document doc = ndb->wdb.get_document(*did); - string data = doc.get_data(); - const char *cp = strstr(data.c_str(), "mtime="); - cp += 6; - long mtime = atol(cp); - if (mtime >= stp->st_mtime) { + Xapian::PostingIterator did0 = ndb->wdb.postlist_begin(pathterm); + for (Xapian::PostingIterator did = did0; + did != ndb->wdb.postlist_end(pathterm); did++) { + + Xapian::Document doc = ndb->wdb.get_document(*did); + + // Check the date once. no need to look at the others if the + // db needs updating. + if (did == did0) { + string data = doc.get_data(); + const char *cp = strstr(data.c_str(), "mtime="); + cp += 6; + long mtime = atol(cp); + if (mtime < stp->st_mtime) { + // Db is not up to date. Let's index the file + return true; + } + } + + // Db is up to date. Make a note that this document exists. if (*did < ndb->updated.size()) ndb->updated[*did] = true; - return false; - } + } } catch (...) { return true; } - return true; + return false; } /// Compute name of stem db for given base database and language @@ -582,7 +601,7 @@ bool Rcl::Db::purge() ndb->wdb.delete_document(did); LOGDEB(("Rcl::Db::purge: deleted document #%d\n", did)); } catch (const Xapian::DocNotFoundError &) { - LOGDEB(("Rcl::Db::purge: document #%d not found\n", did)); + LOGDEB2(("Rcl::Db::purge: document #%d not found\n", did)); } } } diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap index 23c6a1ab..626170a5 100644 --- a/src/sampleconf/mimemap +++ b/src/sampleconf/mimemap @@ -1,4 +1,4 @@ -# @(#$Id: mimemap,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimemap,v 1.4 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes # Recoll: associations of file name extensions to mime types .txt = text/plain @@ -37,8 +37,12 @@ .rtf = text/rtf -# A list of stuff that we don't want to touch at all -recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz +# A list of stuff that we don't want to touch at all (for now). Having the +# suffixes listed in there speeds up things quite a lot by avoiding +# unneeded decompression or 'file' calls +recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz \ + .c .h .cpp .m4 .tcl .js .sh .pl .awk \ + .o .lib .dll .a \ + .dat .bak .rdf .log .db .ini .gnm .gnumeric \ + .jpg .gif .bmp .xpm -[FILE] -# This section for future non suffix-based extension (ie detect mail folders)