diff --git a/src/VERSION b/src/VERSION index 0495c4a8..3a3cd8cc 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.2.3 +1.3.1 diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index f2af109c..a27f934e 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.24 2006-01-26 07:02:06 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.25 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -236,6 +236,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, // Internal access path for multi-document files doc.ipath = ipath; + // File name transcoded to utf8 for indexation. + // We actually might want a separate param for the filename charset + string charset = config->getDefCharset(); + // If this fails, the path won't be indexed, no big deal + transcode(fn, doc.utf8fn, charset, "UTF-8"); // Do database-specific work to update document data if (!db.add(fn, doc, stp)) return FsTreeWalker::FtwError; diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 38a2bc64..be94e209 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.15 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.16 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -125,41 +125,42 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, // for a compressed file. m_mime = mimetype(m_fn, m_cfg, usfci); - // If identification fails, try to use the input parameter. 
Note that this - // is normally not a compressed type (it's the mime type from the db) + // If identification fails, try to use the input parameter. This + // is then normally not a compressed type (it's the mime type from + // the db), and is only set when previewing, not for indexing if (m_mime.empty() && imime) m_mime = *imime; + + if (!m_mime.empty()) { + // Has mime: check for a compressed file. If so, create a + // temporary uncompressed file, and rerun the mime type + // identification, then do the rest with the temp file. + listucmd; + if (m_cfg->getUncompressor(m_mime, ucmd)) { + if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) { + return; + } + LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n", + m_tdir.c_str(), m_tfile.c_str())); + m_fn = m_tfile; + m_mime = mimetype(m_fn, m_cfg, usfci); + if (m_mime.empty() && imime) + m_mime = *imime; + } + } + if (m_mime.empty()) { - // No mime type: not listed in our map, or present in stop list - LOGDEB(("FileInterner::FileInterner: (no mime) [%s]\n", m_fn.c_str())); - return; + // No mime type. We let it through as config may warrant that + // we index all file names + LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str())); } - // First check for a compressed file. If so, create a temporary - // uncompressed file, and rerun the mime type identification, then do the - // rest with the temp file. - listucmd; - if (m_cfg->getUncompressor(m_mime, ucmd)) { - if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) { - return; - } - LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n", - m_tdir.c_str(), m_tfile.c_str())); - m_fn = m_tfile; - m_mime = mimetype(m_fn, m_cfg, usfci); - if (m_mime.empty() && imime) - m_mime = *imime; - if (m_mime.empty()) { - // No mime type ?? pass on. 
- LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str())); - return; - } - } - - // Look for appropriate handler + // Look for appropriate handler (might still return empty) m_handler = getMimeHandler(m_mime, m_cfg); + if (!m_handler) { - // No handler for this type, for now :( + // No handler for this type, for now :( if indexallfilenames + // is set in the config, this normally won't happen (we get mh_unknown) LOGDEB(("FileInterner::FileInterner: %s: no handler\n", m_mime.c_str())); return; diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 6d2b76a6..e17a1798 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.16 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.17 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -31,6 +31,7 @@ using namespace std; #include "mh_mail.h" #include "mh_text.h" #include "mh_exec.h" +#include "mh_unknown.h" /** Create internal handler object appropriate for given mime type */ static MimeHandler *mhFactory(const string &mime) @@ -52,35 +53,48 @@ static MimeHandler *mhFactory(const string &mime) MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg) { // Get handler definition for mime type - string hs = cfg->getMimeHandlerDef(mtype); - if (hs.empty()) - return 0; + string hs; + if (!mtype.empty()) + hs = cfg->getMimeHandlerDef(mtype); - // Break definition into type and name - list toks; - stringToStrings(hs, toks); - if (toks.empty()) { - LOGERR(("getMimeHandler: bad mimeconf line for %s\n", mtype.c_str())); - return 0; - } - - // Retrieve handler function according to type - if (!stringlowercmp("internal", toks.front())) { - return mhFactory(mtype); - } else if (!stringlowercmp("dll", toks.front())) { - return 0; - } else if 
(!stringlowercmp("exec", toks.front())) { - if (toks.size() < 2) { - LOGERR(("getMimeHandler: bad line for %s: %s\n", mtype.c_str(), - hs.c_str())); + if (!hs.empty()) { + // Break definition into type and name + list toks; + stringToStrings(hs, toks); + if (toks.empty()) { + LOGERR(("getMimeHandler: bad mimeconf line for %s\n", + mtype.c_str())); return 0; } - MimeHandlerExec *h = new MimeHandlerExec; - list::const_iterator it1 = toks.begin(); - it1++; - for (;it1 != toks.end();it1++) - h->params.push_back(*it1); - return h; + + // Retrieve handler function according to type + if (!stringlowercmp("internal", toks.front())) { + return mhFactory(mtype); + } else if (!stringlowercmp("dll", toks.front())) { + } else if (!stringlowercmp("exec", toks.front())) { + if (toks.size() < 2) { + LOGERR(("getMimeHandler: bad line for %s: %s\n", + mtype.c_str(), hs.c_str())); + return 0; + } + MimeHandlerExec *h = new MimeHandlerExec; + list::const_iterator it1 = toks.begin(); + it1++; + for (;it1 != toks.end();it1++) + h->params.push_back(*it1); + return h; + } + } + + // We are supposed to get here if there was no specific error, but + // there is no identified mime type, or no handler + // associated. 
These files are either ignored or their name is + // indexed, depending on configuration + bool indexunknown = false; + cfg->getConfParam("indexallfilenames", &indexunknown); + if (indexunknown) { + return new MimeHandlerUnknown; + } else { + return 0; } - return 0; } diff --git a/src/qtgui/advsearch.ui b/src/qtgui/advsearch.ui index ad5f92cb..66596944 100644 --- a/src/qtgui/advsearch.ui +++ b/src/qtgui/advsearch.ui @@ -24,7 +24,7 @@ - layout13 + layout12 @@ -32,18 +32,12 @@ - layout15 + layout11 unnamed - - 10 - - - 10 - textLabel2 @@ -54,12 +48,54 @@ - layout14 + layout10 unnamed + + + orWordsTL + + + Any of these words + + + + + orWordsLE + + + + + textLabel1_2 + + + File name + + + + + fileNameLE + + + + + noWordsTL + + + None of these words + + + + + noWordsLE + + + + + andWordsTL @@ -74,7 +110,7 @@ All these words - + andWordsLE @@ -93,40 +129,11 @@ This exact phrase - + phraseLE - - - orWordsTL - - - Any of these words - - - - - orWordsLE - - - - - noWordsTL - - - None of these words - - - - - noWordsLE - - - - - @@ -353,20 +360,6 @@ - - - line1 - - - HLine - - - Sunken - - - Horizontal - - layout25 @@ -398,6 +391,20 @@ + + + line1 + + + HLine + + + Sunken + + + Horizontal + + diff --git a/src/qtgui/advsearch.ui.h b/src/qtgui/advsearch.ui.h index 4660a56b..7561ccc8 100644 --- a/src/qtgui/advsearch.ui.h +++ b/src/qtgui/advsearch.ui.h @@ -131,6 +131,7 @@ void advsearch::searchPB_clicked() mydata.phrase = string((const char*)(phraseLE->text().utf8())); mydata.orwords = string((const char*)(orWordsLE->text().utf8())); mydata.nowords = string((const char*)(noWordsLE->text().utf8())); + mydata.filename = string((const char*)(fileNameLE->text().utf8())); if (restrictFtCB->isOn() && noFiltypsLB->count() > 0) { for (unsigned int i = 0; i < yesFiltypsLB->count(); i++) { QCString ctext = yesFiltypsLB->item(i)->text().utf8(); diff --git a/src/qtgui/ssearchb.ui b/src/qtgui/ssearchb.ui index 0d81077f..b7471432 100644 --- a/src/qtgui/ssearchb.ui +++ b/src/qtgui/ssearchb.ui 
@@ -75,6 +75,23 @@ If this is set, each returned document will contain all the terms in the query. Else documents will be ordered by relevance, but may not contain all the terms. + + + isFNameCB + + + &File name + + + Alt+F + + + Search is on file names only, and may use wildcards. + + + If this is set, the search will only be performed on file names. Wildcards ? and * can be used and will be matched as in a shell command line. + + queryText diff --git a/src/qtgui/ssearchb.ui.h b/src/qtgui/ssearchb.ui.h index 1f04313d..d0fb216e 100644 --- a/src/qtgui/ssearchb.ui.h +++ b/src/qtgui/ssearchb.ui.h @@ -44,9 +44,11 @@ void SSearchBase::startSimpleSearch() LOGDEB(("SSearchBase::startSimpleSearch\n")); Rcl::AdvSearchData sdata; - QCString u8 = queryText->text().utf8(); - if (allTermsCB->isChecked()) + + if (isFNameCB->isChecked()) + sdata.filename = u8; + else if (allTermsCB->isChecked()) sdata.allwords = u8; else sdata.orwords = u8; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 815b2082..a7e4c32d 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.57 2006-02-07 10:26:49 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.58 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -20,6 +20,7 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.57 2006-02-07 10:26:49 dockes Exp $ #include #include #include +#include #include #include @@ -287,6 +288,7 @@ bool Rcl::dumb_string(const string &in, string &out) if (!unacmaybefold(s1, out, "UTF-8", true)) { LOGERR(("dumb_string: unac failed for %s\n", in.c_str())); out.erase(); + // See comment at start of func return true; } return true; @@ -387,11 +389,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc, // /////// Split and index terms in document body and auxiliary fields string noacc; - // Split and index file name. 
This supposes that it's either ascii - // or utf-8. If this fails, we just go on. We need a config - // parameter for file name charset. - // Do we really want to fold case here ? - if (dumb_string(fn, noacc)) { + // Split and index file path. Do we really want to do this? Or do + // it with the simple file name only ? + if (dumb_string(doc.utf8fn, noacc)) { splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; } @@ -439,6 +439,14 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc, string pathterm = "P" + hash; newdocument.add_term(pathterm); + // Simple file name. This is used for file name searches only. We index + // it with a term prefix + string sfn = path_getsimple(doc.utf8fn); + if (dumb_string(sfn, noacc) && !noacc.empty()) { + sfn = string("XSFN") + noacc; + newdocument.add_term(sfn); + } + // Internal path: with path, makes unique identifier for documents // inside multidocument files. string uniterm; @@ -992,7 +1000,7 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, Native *ndb = (Native *)pdata; if (!ndb) return false; - asdata.erase(); + m_asdata.erase(); dbindices.clear(); list pqueries; stringToXapianQueries(iqstring, stemlang, ndb, pqueries, opts); @@ -1023,7 +1031,7 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts, if (!sdata.topdir.empty()) LOGDEB((" restricted to: %s\n", sdata.topdir.c_str())); - asdata = sdata; + m_asdata = sdata; dbindices.clear(); Native *ndb = (Native *)pdata; @@ -1031,12 +1039,62 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts, return false; list pqueries; Xapian::Query xq; - + + if (!sdata.filename.empty()) { + LOGDEB((" filename search\n")); + // File name search, with possible wildcards. + // We expand wildcards by scanning the filename terms (prefixed + // with XSFN) from the database. + // We build an OR query with the expanded values if any. 
+ string pattern; + // The pattern is taken from the dedicated filename field (not from + // allwords or orwords) and run through dumb_string() + dumb_string(sdata.filename, pattern); + + // If pattern is not quoted, we add * at each end: match any + // substring + if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') + pattern = pattern.substr(1, pattern.size() -2); + else + pattern = "*" + pattern + "*"; + + LOGDEB((" pattern: [%s]\n", pattern.c_str())); + + // Match pattern against all file names in the db + Xapian::TermIterator it = ndb->db.allterms_begin(); + it.skip_to("XSFN"); + list names; + for (;it != ndb->db.allterms_end(); it++) { + if ((*it).find("XSFN") != 0) + break; + string fn = (*it).substr(4); + LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str())); + if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) { + names.push_back((*it).c_str()); + } + // Limit the match count + if (names.size() > 1000) { + LOGERR(("Rcl::Db::SetQuery: too many matched file names\n")); + break; + } + } + if (names.empty()) { + // Build an impossible query: we know it's impossible because we + // control the prefixes! + names.push_back("XIMPOSSIBLE"); + } + // Build a query out of the matching file name terms. + xq = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); + } + if (!sdata.allwords.empty()) { stringToXapianQueries(sdata.allwords, stemlang, ndb, pqueries, opts); if (!pqueries.empty()) { - xq = Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(), - pqueries.end()); + Xapian::Query nq = + Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(), + pqueries.end()); + xq = xq.empty() ? 
nq : + Xapian::Query(Xapian::Query::OP_AND, xq, nq); pqueries.clear(); } } @@ -1044,8 +1102,8 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts, if (!sdata.orwords.empty()) { stringToXapianQueries(sdata.orwords, stemlang, ndb, pqueries, opts); if (!pqueries.empty()) { - Xapian::Query nq; - nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), + Xapian::Query nq = + Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); xq = xq.empty() ? nq : Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, nq); @@ -1157,7 +1215,7 @@ class Rcl::DbPops { string url; parms.get(string("url"), url); url = url.substr(7); - if (url.find(rdb->asdata.topdir) == 0) + if (url.find(rdb->m_asdata.topdir) == 0) return true; return false; } @@ -1215,8 +1273,8 @@ bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent) } // For now the only post-query filter is on dir subtree - bool postqfilter = !asdata.topdir.empty(); - LOGDEB1(("Topdir %s postqflt %d\n", asdata.topdir.c_str(), postqfilter)); + bool postqfilter = !m_asdata.topdir.empty(); + LOGDEB1(("Topdir %s postqflt %d\n", m_asdata.topdir.c_str(), postqfilter)); int xapi; if (postqfilter) { diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 61e78852..21c6b5d0 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.25 2006-02-07 10:26:49 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.26 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -52,26 +52,33 @@ namespace Rcl { class Doc { public: // These fields potentially go into the document data record - string url; - string ipath; - string mimetype; + // We indicate the routine that sets them up during indexing + string url; // Computed from fn by Db::add + string utf8fn; // Transcoded version of the file path. 
+ // Set by DbIndexer::processone + string ipath; // Set by DbIndexer::processone + string mimetype; // Set by FileInterner::internfile string fmtime; // File modification time as decimal ascii unix time + // Set by DbIndexer::processone string dmtime; // Data reference date (same format). Ie: mail date - string origcharset; - string title; - string keywords; - string abstract; - string fbytes; // File size - string dbytes; // Doc size + // Possibly set by handler + string origcharset; // Charset we transcoded from (in case we want it back) + // Possibly set by handler + string title; // Possibly set by handler + string keywords; // Possibly set by handler + string abstract; // Possibly set by handler + string fbytes; // File size. Set by Db::add + string dbytes; // Doc size. Set by Db::add from text length - // The following fields don't go to the db. text is only used when - // indexing - string text; + // The following fields don't go to the db record + + string text; // text is split and indexed int pc; // used by sortseq, convenience void erase() { url.erase(); + utf8fn.erase(); ipath.erase(); mimetype.erase(); fmtime.erase(); @@ -96,6 +103,7 @@ class AdvSearchData { string phrase; string orwords; string nowords; + string filename; list filetypes; // restrict to types. Empty if inactive string topdir; // restrict to subtree. Empty if inactive string description; // Printable expanded version of the complete query @@ -107,6 +115,7 @@ class AdvSearchData { nowords.erase(); filetypes.clear(); topdir.erase(); + filename.erase(); description.erase(); } }; @@ -167,7 +176,7 @@ class Db { private: - AdvSearchData asdata; + AdvSearchData m_asdata; vector dbindices; // In case there is a postq filter: sequence of // db indices that match void *pdata; // Pointer to private data. We don't want db(ie