Cleaned up file name handling. Fixes the inconsistency where file names were sometimes indexed split and sometimes unsplit: they are now always indexed both ways, with different prefixes. Forces a reindex.
This commit is contained in:
parent
4eaf12fb9c
commit
8b34610dde
@ -452,8 +452,9 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
doc.fmtime = ascdate;
|
doc.fmtime = ascdate;
|
||||||
if (doc.url.empty())
|
if (doc.url.empty())
|
||||||
doc.url = cstr_fileu + fn;
|
doc.url = cstr_fileu + fn;
|
||||||
if (doc.utf8fn.empty())
|
const string *fnp = 0;
|
||||||
doc.utf8fn = utf8fn;
|
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
|
||||||
|
doc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||||
|
|
||||||
char cbuf[100];
|
char cbuf[100];
|
||||||
sprintf(cbuf, OFFTPC, stp->st_size);
|
sprintf(cbuf, OFFTPC, stp->st_size);
|
||||||
@ -512,7 +513,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
LOGDEB1(("Creating empty doc for file\n"));
|
LOGDEB1(("Creating empty doc for file\n"));
|
||||||
Rcl::Doc fileDoc;
|
Rcl::Doc fileDoc;
|
||||||
fileDoc.fmtime = ascdate;
|
fileDoc.fmtime = ascdate;
|
||||||
fileDoc.utf8fn = utf8fn;
|
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||||
fileDoc.mimetype = interner.getMimetype();
|
fileDoc.mimetype = interner.getMimetype();
|
||||||
fileDoc.url = cstr_fileu + fn;
|
fileDoc.url = cstr_fileu + fn;
|
||||||
|
|
||||||
|
|||||||
@ -679,6 +679,11 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
|||||||
doc.dmtime = it->second;
|
doc.dmtime = it->second;
|
||||||
} else if (it->first == cstr_dj_keyorigcharset) {
|
} else if (it->first == cstr_dj_keyorigcharset) {
|
||||||
doc.origcharset = it->second;
|
doc.origcharset = it->second;
|
||||||
|
} else if (it->first == cstr_dj_keyfn) {
|
||||||
|
// Only if not set during the stack walk
|
||||||
|
const string *fnp = 0;
|
||||||
|
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
|
||||||
|
doc.meta[Rcl::Doc::keyfn] = it->second;
|
||||||
} else if (it->first == cstr_dj_keymt ||
|
} else if (it->first == cstr_dj_keymt ||
|
||||||
it->first == cstr_dj_keycharset) {
|
it->first == cstr_dj_keycharset) {
|
||||||
// don't need/want these.
|
// don't need/want these.
|
||||||
@ -735,7 +740,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
|
|||||||
// We have a non-empty ipath
|
// We have a non-empty ipath
|
||||||
hasipath = true;
|
hasipath = true;
|
||||||
getKeyValue(docdata, cstr_dj_keymt, doc.mimetype);
|
getKeyValue(docdata, cstr_dj_keymt, doc.mimetype);
|
||||||
getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn);
|
getKeyValue(docdata, cstr_dj_keyfn, doc.meta[Rcl::Doc::keyfn]);
|
||||||
} else {
|
} else {
|
||||||
if (doc.fbytes.empty())
|
if (doc.fbytes.empty())
|
||||||
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
|
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
|
||||||
@ -999,15 +1004,16 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, const string& ipath
|
|||||||
return FIError;
|
return FIError;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If indexing compute ipath and significant mimetype. ipath is
|
// Compute ipath and significant mimetype. ipath is returned
|
||||||
// returned through doc.ipath. We also retrieve some metadata
|
// through doc.ipath. We also retrieve some metadata fields from
|
||||||
// fields from the ancesters (like date or author). This is useful
|
// the ancestors (like date or author). This is useful for email
|
||||||
// for email attachments. The values will be replaced by those
|
// attachments. The values will be replaced by those internal to
|
||||||
// internal to the document (by dijontorcl()) if any, so the order
|
// the document (by dijontorcl()) if any, so the order of calls is
|
||||||
// of calls is important.
|
// important. We used to only do this when indexing, but the aux
|
||||||
if (!m_forPreview) {
|
// fields like filename and author may be interesting when
|
||||||
collectIpathAndMT(doc);
|
// previewing too
|
||||||
} else {
|
collectIpathAndMT(doc);
|
||||||
|
if (m_forPreview) {
|
||||||
doc.mimetype = m_reachedMType;
|
doc.mimetype = m_reachedMType;
|
||||||
}
|
}
|
||||||
// Keep this AFTER collectIpathAndMT
|
// Keep this AFTER collectIpathAndMT
|
||||||
|
|||||||
@ -428,8 +428,6 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
|
|||||||
case 'f':
|
case 'f':
|
||||||
if (!key.compare(Rcl::Doc::keyfs)) {
|
if (!key.compare(Rcl::Doc::keyfs)) {
|
||||||
self->doc->fbytes = uvalue;
|
self->doc->fbytes = uvalue;
|
||||||
} else if (!key.compare(Rcl::Doc::keyfn)) {
|
|
||||||
self->doc->utf8fn = uvalue;
|
|
||||||
} else if (!key.compare(Rcl::Doc::keyfmt)) {
|
} else if (!key.compare(Rcl::Doc::keyfmt)) {
|
||||||
self->doc->fmtime = uvalue;
|
self->doc->fmtime = uvalue;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -109,6 +109,22 @@ static const string cstr_syntAbs("?!#@");
|
|||||||
static map<string, FieldTraits> fldToTraits;
|
static map<string, FieldTraits> fldToTraits;
|
||||||
static PTMutexInit o_fldToTraits_mutex;
|
static PTMutexInit o_fldToTraits_mutex;
|
||||||
|
|
||||||
|
// A bogus fldToTraits key (bogus because not a real field) used to
|
||||||
|
// retrieve the prefix used for specific filename searches (unsplit
|
||||||
|
// filename, not "filename as 'filename:' field" searches)
|
||||||
|
static const string keySysFilenamePrefix("rclUnsplitFN");
|
||||||
|
// The prefix for regular "filename:" field searches.
|
||||||
|
static const string cstr_fnAsFieldPrefix("XSFN");
|
||||||
|
// The prefix for unsplit filename terms used with specific -f or
|
||||||
|
// "File Name" GUI entries. There is a compile option to use the same prefix
|
||||||
|
// for both.
|
||||||
|
// #define UNSPLIT_FN_PREFIX_SAME_AS_SPLIT
|
||||||
|
#if defined(UNSPLIT_FN_PREFIX_SAME_AS_SPLIT)
|
||||||
|
static const string cstr_fnUnsplitPrefix(cstr_fnAsFieldPrefix);
|
||||||
|
#else
|
||||||
|
static const string cstr_fnUnsplitPrefix("XSFS");
|
||||||
|
#endif
|
||||||
|
|
||||||
static void initFldToTraits()
|
static void initFldToTraits()
|
||||||
{
|
{
|
||||||
PTMutexLocker locker(o_fldToTraits_mutex);
|
PTMutexLocker locker(o_fldToTraits_mutex);
|
||||||
@ -123,7 +139,9 @@ static void initFldToTraits()
|
|||||||
fldToTraits[Doc::keyabs] = FieldTraits();
|
fldToTraits[Doc::keyabs] = FieldTraits();
|
||||||
|
|
||||||
fldToTraits["ext"] = FieldTraits("XE");
|
fldToTraits["ext"] = FieldTraits("XE");
|
||||||
fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
|
|
||||||
|
fldToTraits[Doc::keyfn] = FieldTraits(cstr_fnAsFieldPrefix);
|
||||||
|
fldToTraits[keySysFilenamePrefix] = FieldTraits(cstr_fnUnsplitPrefix);
|
||||||
|
|
||||||
fldToTraits[cstr_caption] = FieldTraits("S");
|
fldToTraits[cstr_caption] = FieldTraits("S");
|
||||||
fldToTraits[Doc::keytt] = FieldTraits("S");
|
fldToTraits[Doc::keytt] = FieldTraits("S");
|
||||||
@ -220,7 +238,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
|||||||
vector<string> keys = parms.getNames(string());
|
vector<string> keys = parms.getNames(string());
|
||||||
for (vector<string>::const_iterator it = keys.begin();
|
for (vector<string>::const_iterator it = keys.begin();
|
||||||
it != keys.end(); it++) {
|
it != keys.end(); it++) {
|
||||||
if (doc.meta.find(*it) == doc.meta.end())
|
if (doc.meta.find(*it) == doc.meta.end())
|
||||||
parms.get(*it, doc.meta[*it]);
|
parms.get(*it, doc.meta[*it]);
|
||||||
}
|
}
|
||||||
doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
|
doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
|
||||||
@ -1099,11 +1117,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
TextSplitDb splitter(newdocument, nxt);
|
TextSplitDb splitter(newdocument, nxt);
|
||||||
tpidx.setTSD(&splitter);
|
tpidx.setTSD(&splitter);
|
||||||
|
|
||||||
// Split and index file name as document term(s)
|
|
||||||
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
|
||||||
if (!splitter.text_to_words(doc.utf8fn))
|
|
||||||
LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
|
|
||||||
|
|
||||||
// If the ipath is like a path, index the last element. This is
|
// If the ipath is like a path, index the last element. This is
|
||||||
// for compound documents like zip and chm for which the filter
|
// for compound documents like zip and chm for which the filter
|
||||||
// uses the file path as ipath.
|
// uses the file path as ipath.
|
||||||
@ -1180,11 +1193,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
// Mime type
|
// Mime type
|
||||||
newdocument.add_term("T" + doc.mimetype);
|
newdocument.add_term("T" + doc.mimetype);
|
||||||
|
|
||||||
// Simple file name indexed for file name searches with a term prefix
|
// Simple file name indexed unsplit for file name searches with a
|
||||||
// We also add a term for the filename extension if any.
|
// term prefix. We also add a term for the filename extension if
|
||||||
if (!doc.utf8fn.empty()) {
|
// any.
|
||||||
|
string utf8fn;
|
||||||
|
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
||||||
string fn;
|
string fn;
|
||||||
if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) {
|
if (unacmaybefold(utf8fn, fn, "UTF-8", true)) {
|
||||||
// We should truncate after extracting the extension, but this is
|
// We should truncate after extracting the extension, but this is
|
||||||
// a pathological case anyway
|
// a pathological case anyway
|
||||||
if (fn.size() > 230)
|
if (fn.size() > 230)
|
||||||
@ -1193,18 +1208,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
if (pos != string::npos && pos != fn.length() - 1) {
|
if (pos != string::npos && pos != fn.length() - 1) {
|
||||||
newdocument.add_term(string("XE") + fn.substr(pos + 1));
|
newdocument.add_term(string("XE") + fn.substr(pos + 1));
|
||||||
}
|
}
|
||||||
fn = string("XSFN") + fn;
|
fn = cstr_fnUnsplitPrefix + fn;
|
||||||
newdocument.add_term(fn);
|
newdocument.add_term(fn);
|
||||||
}
|
}
|
||||||
// Store utf8fn inside the metadata array as keyfn
|
|
||||||
// (="filename") so that it can be accessed by the "stored"
|
|
||||||
// processing below, without special-casing it. We only do it
|
|
||||||
// if keyfn is currently empty, because there could be a value
|
|
||||||
// already (ie for a mail attachment with a file name
|
|
||||||
// attribute)
|
|
||||||
if (doc.meta[Doc::keyfn].empty()) {
|
|
||||||
doc.meta[Doc::keyfn] = doc.utf8fn;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Udi unique term: this is used for file existence/uptodate
|
// Udi unique term: this is used for file existence/uptodate
|
||||||
@ -1663,7 +1669,8 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
|
|||||||
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
||||||
|
|
||||||
TermMatchResult result;
|
TermMatchResult result;
|
||||||
if (!termMatch(ET_WILD, string(), pattern, result, 1000, Doc::keyfn))
|
if (!termMatch(ET_WILD, string(), pattern, result, 1000,
|
||||||
|
keySysFilenamePrefix))
|
||||||
return false;
|
return false;
|
||||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||||
it != result.entries.end(); it++)
|
it != result.entries.end(); it++)
|
||||||
|
|||||||
@ -212,8 +212,7 @@ class Db {
|
|||||||
/** Return min and max years for doc mod times in db */
|
/** Return min and max years for doc mod times in db */
|
||||||
bool maxYearSpan(int *minyear, int *maxyear);
|
bool maxYearSpan(int *minyear, int *maxyear);
|
||||||
|
|
||||||
/** Special filename wildcard to XSFN terms expansion.
|
/** Wildcard expansion specific to file names. Internal/sdata use only */
|
||||||
internal/searchdata use only */
|
|
||||||
bool filenameWildExp(const string& exp, vector<string>& names);
|
bool filenameWildExp(const string& exp, vector<string>& names);
|
||||||
|
|
||||||
/** Set parameters for synthetic abstract generation */
|
/** Set parameters for synthetic abstract generation */
|
||||||
|
|||||||
@ -47,7 +47,6 @@ namespace Rcl {
|
|||||||
void Doc::dump(bool dotext) const
|
void Doc::dump(bool dotext) const
|
||||||
{
|
{
|
||||||
LOGDEB(("Rcl::Doc::dump: url: [%s]\n", url.c_str()));
|
LOGDEB(("Rcl::Doc::dump: url: [%s]\n", url.c_str()));
|
||||||
LOGDEB(("Rcl::Doc::dump: utf8fn: [%s]\n", utf8fn.c_str()));
|
|
||||||
LOGDEB(("Rcl::Doc::dump: ipath: [%s]\n", ipath.c_str()));
|
LOGDEB(("Rcl::Doc::dump: ipath: [%s]\n", ipath.c_str()));
|
||||||
LOGDEB(("Rcl::Doc::dump: mimetype: [%s]\n", mimetype.c_str()));
|
LOGDEB(("Rcl::Doc::dump: mimetype: [%s]\n", mimetype.c_str()));
|
||||||
LOGDEB(("Rcl::Doc::dump: fmtime: [%s]\n", fmtime.c_str()));
|
LOGDEB(("Rcl::Doc::dump: fmtime: [%s]\n", fmtime.c_str()));
|
||||||
|
|||||||
@ -48,11 +48,6 @@ class Doc {
|
|||||||
// Query: from doc data.
|
// Query: from doc data.
|
||||||
string url;
|
string url;
|
||||||
|
|
||||||
// Transcoded version of the simple file name for SFN-prefixed
|
|
||||||
// specific file name indexing
|
|
||||||
// Index: set by DbIndexer::processone
|
|
||||||
string utf8fn;
|
|
||||||
|
|
||||||
// Internal path for multi-doc files. Ascii
|
// Internal path for multi-doc files. Ascii
|
||||||
// Set by FsIndexer::processone
|
// Set by FsIndexer::processone
|
||||||
string ipath;
|
string ipath;
|
||||||
@ -123,7 +118,6 @@ class Doc {
|
|||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
void erase() {
|
void erase() {
|
||||||
url.erase();
|
url.erase();
|
||||||
utf8fn.erase();
|
|
||||||
ipath.erase();
|
ipath.erase();
|
||||||
mimetype.erase();
|
mimetype.erase();
|
||||||
fmtime.erase();
|
fmtime.erase();
|
||||||
@ -153,6 +147,17 @@ class Doc {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
bool peekmeta(const string& nm, const string **value = 0) const
|
||||||
|
{
|
||||||
|
map<string,string>::const_iterator it = meta.find(nm);
|
||||||
|
if (it != meta.end()) {
|
||||||
|
if (value)
|
||||||
|
*value = &(it->second);
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void dump(bool dotext=false) const;
|
void dump(bool dotext=false) const;
|
||||||
|
|
||||||
|
|||||||
@ -1024,32 +1024,25 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Translate a FILENAME search clause. This mostly (or always) comes
|
// Translate a FILENAME search clause. This always comes
|
||||||
// from a "filename" search from the gui or recollq. A query language
|
// from a "filename" search from the gui or recollq. A query language
|
||||||
// "filename:"-prefixed field will not go through here, but through
|
// "filename:"-prefixed field will not go through here, but through
|
||||||
// the generic field-processing code.
|
// the generic field-processing code.
|
||||||
//
|
//
|
||||||
// In the case of multiple space-separated fragments, we generate an
|
// We do not split the entry any more (used to do some crazy thing
|
||||||
// AND of OR queries. Each OR query comes from the expansion of a
|
// about expanding multiple fragments in the past). We just take the
|
||||||
// fragment. We used to generate a single OR with all expanded terms,
|
// value blanks and all and expand this against the indexed unsplit
|
||||||
// which did not make much sense.
|
// file names
|
||||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||||
const string&)
|
const string&)
|
||||||
{
|
{
|
||||||
Xapian::Query *qp = (Xapian::Query *)p;
|
Xapian::Query *qp = (Xapian::Query *)p;
|
||||||
*qp = Xapian::Query();
|
*qp = Xapian::Query();
|
||||||
|
|
||||||
vector<string> patterns;
|
|
||||||
TextSplit::stringToStrings(m_text, patterns);
|
|
||||||
vector<string> names;
|
vector<string> names;
|
||||||
for (vector<string>::iterator it = patterns.begin();
|
db.filenameWildExp(m_text, names);
|
||||||
it != patterns.end(); it++) {
|
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
||||||
vector<string> more;
|
|
||||||
db.filenameWildExp(*it, more);
|
|
||||||
Xapian::Query tq = Xapian::Query(Xapian::Query::OP_OR, more.begin(),
|
|
||||||
more.end());
|
|
||||||
*qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq);
|
|
||||||
}
|
|
||||||
if (m_weight != 1.0) {
|
if (m_weight != 1.0) {
|
||||||
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -262,15 +262,11 @@ protected:
|
|||||||
int m_slack;
|
int m_slack;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Filename search clause. This is special because term expansion is only
|
/**
|
||||||
* performed against the XSFN terms (it's performed against the main index
|
* Filename search clause. This is special because term expansion is only
|
||||||
* for all other fields). Else we could just use a "filename:" field
|
* performed against the unsplit file name terms.
|
||||||
* This doesn't really make sense (either). I think we could either expand
|
|
||||||
* filenames against all terms and then select the XSFN ones, or always perform
|
|
||||||
* expansion only against the field's terms ? Anyway this doesn't hurt
|
|
||||||
* much either.
|
|
||||||
*
|
*
|
||||||
* There is a big advantage though in expanding only against the
|
* There is a big advantage in expanding only against the
|
||||||
* field, especially for file names, because this makes searches for
|
* field, especially for file names, because this makes searches for
|
||||||
* "*xx" much faster (no need to scan the whole main index).
|
* "*xx" much faster (no need to scan the whole main index).
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -42,12 +42,12 @@ keywords = K
|
|||||||
# add your own to search for fields produced by the filters and not handled
|
# add your own to search for fields produced by the filters and not handled
|
||||||
# by default.
|
# by default.
|
||||||
# Some values are internally reserved by recoll:
|
# Some values are internally reserved by recoll:
|
||||||
# XP (for path elements), XXST, XXND
|
# XE (file ext), XP (for path elements), XSFN, XSFS, XXST, XXND,
|
||||||
# Using XX was not a good idea.
|
# Using XX was not a good idea.
|
||||||
|
#
|
||||||
# I hereby commit to not using XY for Recoll:
|
# I hereby commit to not using XY for Recoll:
|
||||||
# *** USE XY for beginning your local prefixes ***
|
# *** USE XY for beginning your local prefixes *** ie:
|
||||||
ext = XE
|
# myfield = XYMYPREF
|
||||||
filename = XSFN
|
|
||||||
recipient = XTO
|
recipient = XTO
|
||||||
|
|
||||||
############################
|
############################
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user