added code to specifically index/search file names

This commit is contained in:
dockes 2006-03-20 16:05:41 +00:00
parent f96fcd6dd3
commit d4852f3b0d
10 changed files with 259 additions and 145 deletions

View File

@ -1 +1 @@
1.2.3
1.3.1

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.24 2006-01-26 07:02:06 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.25 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -236,6 +236,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// Internal access path for multi-document files
doc.ipath = ipath;
// File name transcoded to utf8 for indexation.
// We actually might want a separate param for the filename charset
string charset = config->getDefCharset();
// If this fails, the path won't be indexed, no big deal
transcode(fn, doc.utf8fn, charset, "UTF-8");
// Do database-specific work to update document data
if (!db.add(fn, doc, stp))
return FsTreeWalker::FtwError;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.15 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.16 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -125,41 +125,42 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
// for a compressed file.
m_mime = mimetype(m_fn, m_cfg, usfci);
// If identification fails, try to use the input parameter. Note that this
// is normally not a compressed type (it's the mime type from the db)
// If identification fails, try to use the input parameter. This
// is then normally not a compressed type (it's the mime type from
// the db), and is only set when previewing, not for indexing
if (m_mime.empty() && imime)
m_mime = *imime;
if (!m_mime.empty()) {
// Has mime: check for a compressed file. If so, create a
// temporary uncompressed file, and rerun the mime type
// identification, then do the rest with the temp file.
list<string>ucmd;
if (m_cfg->getUncompressor(m_mime, ucmd)) {
if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) {
return;
}
LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n",
m_tdir.c_str(), m_tfile.c_str()));
m_fn = m_tfile;
m_mime = mimetype(m_fn, m_cfg, usfci);
if (m_mime.empty() && imime)
m_mime = *imime;
}
}
if (m_mime.empty()) {
// No mime type: not listed in our map, or present in stop list
LOGDEB(("FileInterner::FileInterner: (no mime) [%s]\n", m_fn.c_str()));
return;
// No mime type. We let it through as config may warrant that
// we index all file names
LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str()));
}
// First check for a compressed file. If so, create a temporary
// uncompressed file, and rerun the mime type identification, then do the
// rest with the temp file.
list<string>ucmd;
if (m_cfg->getUncompressor(m_mime, ucmd)) {
if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) {
return;
}
LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n",
m_tdir.c_str(), m_tfile.c_str()));
m_fn = m_tfile;
m_mime = mimetype(m_fn, m_cfg, usfci);
if (m_mime.empty() && imime)
m_mime = *imime;
if (m_mime.empty()) {
// No mime type ?? pass on.
LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str()));
return;
}
}
// Look for appropriate handler
// Look for appropriate handler (might still return empty)
m_handler = getMimeHandler(m_mime, m_cfg);
if (!m_handler) {
// No handler for this type, for now :(
// No handler for this type, for now :( if indexallfilenames
// is set in the config, this normally won't happen (we get mh_unknown)
LOGDEB(("FileInterner::FileInterner: %s: no handler\n",
m_mime.c_str()));
return;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.16 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.17 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -31,6 +31,7 @@ using namespace std;
#include "mh_mail.h"
#include "mh_text.h"
#include "mh_exec.h"
#include "mh_unknown.h"
/** Create internal handler object appropriate for given mime type */
static MimeHandler *mhFactory(const string &mime)
@ -52,35 +53,48 @@ static MimeHandler *mhFactory(const string &mime)
MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
{
// Get handler definition for mime type
string hs = cfg->getMimeHandlerDef(mtype);
if (hs.empty())
return 0;
string hs;
if (!mtype.empty())
hs = cfg->getMimeHandlerDef(mtype);
// Break definition into type and name
list<string> toks;
stringToStrings(hs, toks);
if (toks.empty()) {
LOGERR(("getMimeHandler: bad mimeconf line for %s\n", mtype.c_str()));
return 0;
}
// Retrieve handler function according to type
if (!stringlowercmp("internal", toks.front())) {
return mhFactory(mtype);
} else if (!stringlowercmp("dll", toks.front())) {
return 0;
} else if (!stringlowercmp("exec", toks.front())) {
if (toks.size() < 2) {
LOGERR(("getMimeHandler: bad line for %s: %s\n", mtype.c_str(),
hs.c_str()));
if (!hs.empty()) {
// Break definition into type and name
list<string> toks;
stringToStrings(hs, toks);
if (toks.empty()) {
LOGERR(("getMimeHandler: bad mimeconf line for %s\n",
mtype.c_str()));
return 0;
}
MimeHandlerExec *h = new MimeHandlerExec;
list<string>::const_iterator it1 = toks.begin();
it1++;
for (;it1 != toks.end();it1++)
h->params.push_back(*it1);
return h;
// Retrieve handler function according to type
if (!stringlowercmp("internal", toks.front())) {
return mhFactory(mtype);
} else if (!stringlowercmp("dll", toks.front())) {
} else if (!stringlowercmp("exec", toks.front())) {
if (toks.size() < 2) {
LOGERR(("getMimeHandler: bad line for %s: %s\n",
mtype.c_str(), hs.c_str()));
return 0;
}
MimeHandlerExec *h = new MimeHandlerExec;
list<string>::const_iterator it1 = toks.begin();
it1++;
for (;it1 != toks.end();it1++)
h->params.push_back(*it1);
return h;
}
}
// We are supposed to get here if there was no specific error, but
// there is no identified mime type, or no handler
// associated. These files are either ignored or their name is
// indexed, depending on configuration
bool indexunknown = false;
cfg->getConfParam("indexallfilenames", &indexunknown);
if (indexunknown) {
return new MimeHandlerUnknown;
} else {
return 0;
}
return 0;
}

View File

@ -24,7 +24,7 @@
</property>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout13</cstring>
<cstring>layout12</cstring>
</property>
<vbox>
<property name="name">
@ -32,18 +32,12 @@
</property>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout15</cstring>
<cstring>layout11</cstring>
</property>
<hbox>
<property name="name">
<cstring>unnamed</cstring>
</property>
<property name="margin">
<number>10</number>
</property>
<property name="spacing">
<number>10</number>
</property>
<widget class="QLabel">
<property name="name">
<cstring>textLabel2</cstring>
@ -54,12 +48,54 @@
</widget>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout14</cstring>
<cstring>layout10</cstring>
</property>
<grid>
<property name="name">
<cstring>unnamed</cstring>
</property>
<widget class="QLabel" row="2" column="0" rowspan="1" colspan="2">
<property name="name">
<cstring>orWordsTL</cstring>
</property>
<property name="text">
<string>Any of these words</string>
</property>
</widget>
<widget class="QLineEdit" row="2" column="2">
<property name="name">
<cstring>orWordsLE</cstring>
</property>
</widget>
<widget class="QLabel" row="4" column="0">
<property name="name">
<cstring>textLabel1_2</cstring>
</property>
<property name="text">
<string>File name</string>
</property>
</widget>
<widget class="QLineEdit" row="4" column="2">
<property name="name">
<cstring>fileNameLE</cstring>
</property>
</widget>
<widget class="QLabel" row="3" column="0" rowspan="1" colspan="2">
<property name="name">
<cstring>noWordsTL</cstring>
</property>
<property name="text">
<string>None of these words</string>
</property>
</widget>
<widget class="QLineEdit" row="3" column="2">
<property name="name">
<cstring>noWordsLE</cstring>
</property>
<property name="text">
<string></string>
</property>
</widget>
<widget class="QLabel" row="0" column="0">
<property name="name">
<cstring>andWordsTL</cstring>
@ -74,7 +110,7 @@
<string>All these words</string>
</property>
</widget>
<widget class="QLineEdit" row="0" column="1" rowspan="1" colspan="3">
<widget class="QLineEdit" row="0" column="1" rowspan="1" colspan="2">
<property name="name">
<cstring>andWordsLE</cstring>
</property>
@ -93,40 +129,11 @@
<string>This exact phrase</string>
</property>
</widget>
<widget class="QLineEdit" row="1" column="2" rowspan="1" colspan="2">
<widget class="QLineEdit" row="1" column="2">
<property name="name">
<cstring>phraseLE</cstring>
</property>
</widget>
<widget class="QLabel" row="2" column="0" rowspan="1" colspan="2">
<property name="name">
<cstring>orWordsTL</cstring>
</property>
<property name="text">
<string>Any of these words</string>
</property>
</widget>
<widget class="QLineEdit" row="2" column="2" rowspan="1" colspan="2">
<property name="name">
<cstring>orWordsLE</cstring>
</property>
</widget>
<widget class="QLabel" row="3" column="0" rowspan="1" colspan="3">
<property name="name">
<cstring>noWordsTL</cstring>
</property>
<property name="text">
<string>None of these words</string>
</property>
</widget>
<widget class="QLineEdit" row="3" column="3">
<property name="name">
<cstring>noWordsLE</cstring>
</property>
<property name="text">
<string></string>
</property>
</widget>
</grid>
</widget>
</hbox>
@ -353,20 +360,6 @@
</widget>
</grid>
</widget>
<widget class="Line">
<property name="name">
<cstring>line1</cstring>
</property>
<property name="frameShape">
<enum>HLine</enum>
</property>
<property name="frameShadow">
<enum>Sunken</enum>
</property>
<property name="orientation">
<enum>Horizontal</enum>
</property>
</widget>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout25</cstring>
@ -398,6 +391,20 @@
</widget>
</vbox>
</widget>
<widget class="Line">
<property name="name">
<cstring>line1</cstring>
</property>
<property name="frameShape">
<enum>HLine</enum>
</property>
<property name="frameShadow">
<enum>Sunken</enum>
</property>
<property name="orientation">
<enum>Horizontal</enum>
</property>
</widget>
</vbox>
</widget>
<connections>

View File

@ -131,6 +131,7 @@ void advsearch::searchPB_clicked()
mydata.phrase = string((const char*)(phraseLE->text().utf8()));
mydata.orwords = string((const char*)(orWordsLE->text().utf8()));
mydata.nowords = string((const char*)(noWordsLE->text().utf8()));
mydata.filename = string((const char*)(fileNameLE->text().utf8()));
if (restrictFtCB->isOn() && noFiltypsLB->count() > 0) {
for (unsigned int i = 0; i < yesFiltypsLB->count(); i++) {
QCString ctext = yesFiltypsLB->item(i)->text().utf8();

View File

@ -75,6 +75,23 @@
<string>If this is set, each returned document will contain all the terms in the query. Else documents will be ordered by relevance, but may not contain all the terms.</string>
</property>
</widget>
<widget class="QCheckBox">
<property name="name">
<cstring>isFNameCB</cstring>
</property>
<property name="text">
<string>&amp;File name</string>
</property>
<property name="accel">
<string>Alt+F</string>
</property>
<property name="toolTip" stdset="0">
<string>Search is on file names only, and may use wildcards.</string>
</property>
<property name="whatsThis" stdset="0">
<string>If this is set, the search will only be performed on file names. Wildcards ? and * can be used and will be matched as in a shell command line.</string>
</property>
</widget>
<widget class="QLineEdit">
<property name="name">
<cstring>queryText</cstring>

View File

@ -44,9 +44,11 @@ void SSearchBase::startSimpleSearch()
LOGDEB(("SSearchBase::startSimpleSearch\n"));
Rcl::AdvSearchData sdata;
QCString u8 = queryText->text().utf8();
if (allTermsCB->isChecked())
if (isFNameCB->isChecked())
sdata.filename = u8;
else if (allTermsCB->isChecked())
sdata.allwords = u8;
else
sdata.orwords = u8;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.57 2006-02-07 10:26:49 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.58 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -20,6 +20,7 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.57 2006-02-07 10:26:49 dockes Exp $
#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fnmatch.h>
#include <iostream>
#include <string>
@ -287,6 +288,7 @@ bool Rcl::dumb_string(const string &in, string &out)
if (!unacmaybefold(s1, out, "UTF-8", true)) {
LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
out.erase();
// See comment at start of func
return true;
}
return true;
@ -387,11 +389,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc,
// /////// Split and index terms in document body and auxiliary fields
string noacc;
// Split and index file name. This supposes that it's either ascii
// or utf-8. If this fails, we just go on. We need a config
// parameter for file name charset.
// Do we really want to fold case here ?
if (dumb_string(fn, noacc)) {
// Split and index file path. Do we really want to do this? Or do
// it with the simple file name only ?
if (dumb_string(doc.utf8fn, noacc)) {
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
}
@ -439,6 +439,14 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc,
string pathterm = "P" + hash;
newdocument.add_term(pathterm);
// Simple file name. This is used for file name searches only. We index
// it with a term prefix
string sfn = path_getsimple(doc.utf8fn);
if (dumb_string(sfn, noacc) && !noacc.empty()) {
sfn = string("XSFN") + noacc;
newdocument.add_term(sfn);
}
// Internal path: with path, makes unique identifier for documents
// inside multidocument files.
string uniterm;
@ -992,7 +1000,7 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
Native *ndb = (Native *)pdata;
if (!ndb)
return false;
asdata.erase();
m_asdata.erase();
dbindices.clear();
list<Xapian::Query> pqueries;
stringToXapianQueries(iqstring, stemlang, ndb, pqueries, opts);
@ -1023,7 +1031,7 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts,
if (!sdata.topdir.empty())
LOGDEB((" restricted to: %s\n", sdata.topdir.c_str()));
asdata = sdata;
m_asdata = sdata;
dbindices.clear();
Native *ndb = (Native *)pdata;
@ -1031,12 +1039,62 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts,
return false;
list<Xapian::Query> pqueries;
Xapian::Query xq;
if (!sdata.filename.empty()) {
LOGDEB((" filename search\n"));
// File name search, with possible wildcards.
// We expand wildcards by scanning the filename terms (prefixed
// with XSFN) from the database.
// We build an OR query with the expanded values if any.
string pattern;
// We take the pattern from the dedicated filename field (not from
// allwords or orwords) to avoid interaction with the allwords checkbox
dumb_string(sdata.filename, pattern);
// If pattern is not quoted, we add * at each end: match any
// substring
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"')
pattern = pattern.substr(1, pattern.size() -2);
else
pattern = "*" + pattern + "*";
LOGDEB((" pattern: [%s]\n", pattern.c_str()));
// Match pattern against all file names in the db
Xapian::TermIterator it = ndb->db.allterms_begin();
it.skip_to("XSFN");
list<string> names;
for (;it != ndb->db.allterms_end(); it++) {
if ((*it).find("XSFN") != 0)
break;
string fn = (*it).substr(4);
LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str()));
if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) {
names.push_back((*it).c_str());
}
// Limit the match count
if (names.size() > 1000) {
LOGERR(("Rcl::Db::SetQuery: too many matched file names\n"));
break;
}
}
if (names.empty()) {
// Build an impossible query: we know it's impossible because we
// control the prefixes!
names.push_back("XIMPOSSIBLE");
}
// Build a query out of the matching file name terms.
xq = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
}
if (!sdata.allwords.empty()) {
stringToXapianQueries(sdata.allwords, stemlang, ndb, pqueries, opts);
if (!pqueries.empty()) {
xq = Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(),
pqueries.end());
Xapian::Query nq =
Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(),
pqueries.end());
xq = xq.empty() ? nq :
Xapian::Query(Xapian::Query::OP_AND, xq, nq);
pqueries.clear();
}
}
@ -1044,8 +1102,8 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts,
if (!sdata.orwords.empty()) {
stringToXapianQueries(sdata.orwords, stemlang, ndb, pqueries, opts);
if (!pqueries.empty()) {
Xapian::Query nq;
nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
Xapian::Query nq =
Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
pqueries.end());
xq = xq.empty() ? nq :
Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, nq);
@ -1157,7 +1215,7 @@ class Rcl::DbPops {
string url;
parms.get(string("url"), url);
url = url.substr(7);
if (url.find(rdb->asdata.topdir) == 0)
if (url.find(rdb->m_asdata.topdir) == 0)
return true;
return false;
}
@ -1215,8 +1273,8 @@ bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
}
// For now the only post-query filter is on dir subtree
bool postqfilter = !asdata.topdir.empty();
LOGDEB1(("Topdir %s postqflt %d\n", asdata.topdir.c_str(), postqfilter));
bool postqfilter = !m_asdata.topdir.empty();
LOGDEB1(("Topdir %s postqflt %d\n", m_asdata.topdir.c_str(), postqfilter));
int xapi;
if (postqfilter) {

View File

@ -16,7 +16,7 @@
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.25 2006-02-07 10:26:49 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.26 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -52,26 +52,33 @@ namespace Rcl {
class Doc {
public:
// These fields potentially go into the document data record
string url;
string ipath;
string mimetype;
// We indicate the routine that sets them up during indexing
string url; // Computed from fn by Db::add
string utf8fn; // Transcoded version of the file path.
// Set by DbIndexer::processone
string ipath; // Set by DbIndexer::processone
string mimetype; // Set by FileInterner::internfile
string fmtime; // File modification time as decimal ascii unix time
// Set by DbIndexer::processone
string dmtime; // Data reference date (same format). Ie: mail date
string origcharset;
string title;
string keywords;
string abstract;
string fbytes; // File size
string dbytes; // Doc size
// Possibly set by handler
string origcharset; // Charset we transcoded from (in case we want back)
// Possibly set by handler
string title; // Possibly set by handler
string keywords; // Possibly set by handler
string abstract; // Possibly set by handler
string fbytes; // File size. Set by Db::Add
string dbytes; // Doc size. Set by Db::Add from text length
// The following fields don't go to the db. text is only used when
// indexing
string text;
// The following fields don't go to the db record
string text; // text is split and indexed
int pc; // used by sortseq, convenience
void erase() {
url.erase();
utf8fn.erase();
ipath.erase();
mimetype.erase();
fmtime.erase();
@ -96,6 +103,7 @@ class AdvSearchData {
string phrase;
string orwords;
string nowords;
string filename;
list<string> filetypes; // restrict to types. Empty if inactive
string topdir; // restrict to subtree. Empty if inactive
string description; // Printable expanded version of the complete query
@ -107,6 +115,7 @@ class AdvSearchData {
nowords.erase();
filetypes.clear();
topdir.erase();
filename.erase();
description.erase();
}
};
@ -167,7 +176,7 @@ class Db {
private:
AdvSearchData asdata;
AdvSearchData m_asdata;
vector<int> dbindices; // In case there is a postq filter: sequence of
// db indices that match
void *pdata; // Pointer to private data. We don't want db(ie