diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 3c4e7519..14b68598 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -452,8 +452,9 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, doc.fmtime = ascdate; if (doc.url.empty()) doc.url = cstr_fileu + fn; - if (doc.utf8fn.empty()) - doc.utf8fn = utf8fn; + const string *fnp = 0; + if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty()) + doc.meta[Rcl::Doc::keyfn] = utf8fn; char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); @@ -512,7 +513,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, LOGDEB1(("Creating empty doc for file\n")); Rcl::Doc fileDoc; fileDoc.fmtime = ascdate; - fileDoc.utf8fn = utf8fn; + fileDoc.meta[Rcl::Doc::keyfn] = utf8fn; fileDoc.mimetype = interner.getMimetype(); fileDoc.url = cstr_fileu + fn; diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 1e49cedd..5ec50022 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -679,6 +679,11 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) doc.dmtime = it->second; } else if (it->first == cstr_dj_keyorigcharset) { doc.origcharset = it->second; + } else if (it->first == cstr_dj_keyfn) { + // Only if not set during the stack walk + const string *fnp = 0; + if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty()) + doc.meta[Rcl::Doc::keyfn] = it->second; } else if (it->first == cstr_dj_keymt || it->first == cstr_dj_keycharset) { // don't need/want these. 
@@ -735,7 +740,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const // We have a non-empty ipath hasipath = true; getKeyValue(docdata, cstr_dj_keymt, doc.mimetype); - getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn); + getKeyValue(docdata, cstr_dj_keyfn, doc.meta[Rcl::Doc::keyfn]); } else { if (doc.fbytes.empty()) getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes); @@ -999,15 +1004,16 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, const string& ipath return FIError; } - // If indexing compute ipath and significant mimetype. ipath is - // returned through doc.ipath. We also retrieve some metadata - // fields from the ancesters (like date or author). This is useful - // for email attachments. The values will be replaced by those - // internal to the document (by dijontorcl()) if any, so the order - // of calls is important. - if (!m_forPreview) { - collectIpathAndMT(doc); - } else { + // Compute ipath and significant mimetype. ipath is returned + // through doc.ipath. We also retrieve some metadata fields from + // the ancestors (like date or author). This is useful for email + // attachments. The values will be replaced by those internal to + // the document (by dijontorcl()) if any, so the order of calls is + // important. 
We used to only do this when indexing, but the aux + // fields like filename and author may be interesting when + // previewing too + collectIpathAndMT(doc); + if (m_forPreview) { doc.mimetype = m_reachedMType; } // Keep this AFTER collectIpathAndMT diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index d0c2385c..2ee52a25 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -428,8 +428,6 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value) case 'f': if (!key.compare(Rcl::Doc::keyfs)) { self->doc->fbytes = uvalue; - } else if (!key.compare(Rcl::Doc::keyfn)) { - self->doc->utf8fn = uvalue; } else if (!key.compare(Rcl::Doc::keyfmt)) { self->doc->fmtime = uvalue; } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 84b0929d..e4b91511 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -109,6 +109,22 @@ static const string cstr_syntAbs("?!#@"); static map fldToTraits; static PTMutexInit o_fldToTraits_mutex; +// A bogus fldToTraits key (bogus because not a real field) used to +// retrieve the prefix used for specific filename searches (unsplit +// filename, not "filename as 'filename:' field" searches) +static const string keySysFilenamePrefix("rclUnsplitFN"); +// The prefix for regular "filename:" field searches. +static const string cstr_fnAsFieldPrefix("XSFN"); +// The prefix for unsplit filename terms used with specific -f or +// "File Name" GUI entries. There is a compile option to use the same prefix +// for both. 
+// #define UNSPLIT_FN_PREFIX_SAME_AS_SPLIT +#if defined(UNSPLIT_FN_PREFIX_SAME_AS_SPLIT) +static const string cstr_fnUnsplitPrefix(cstr_fnAsFieldPrefix); +#else +static const string cstr_fnUnsplitPrefix("XSFS"); +#endif + static void initFldToTraits() { PTMutexLocker locker(o_fldToTraits_mutex); @@ -123,7 +139,9 @@ static void initFldToTraits() fldToTraits[Doc::keyabs] = FieldTraits(); fldToTraits["ext"] = FieldTraits("XE"); - fldToTraits[Doc::keyfn] = FieldTraits("XSFN"); + + fldToTraits[Doc::keyfn] = FieldTraits(cstr_fnAsFieldPrefix); + fldToTraits[keySysFilenamePrefix] = FieldTraits(cstr_fnUnsplitPrefix); fldToTraits[cstr_caption] = FieldTraits("S"); fldToTraits[Doc::keytt] = FieldTraits("S"); @@ -220,7 +238,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, vector keys = parms.getNames(string()); for (vector::const_iterator it = keys.begin(); it != keys.end(); it++) { - if (doc.meta.find(*it) == doc.meta.end()) + if (doc.meta.find(*it) == doc.meta.end()) parms.get(*it, doc.meta[*it]); } doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime; @@ -1099,11 +1117,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, TextSplitDb splitter(newdocument, nxt); tpidx.setTSD(&splitter); - // Split and index file name as document term(s) - LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str())); - if (!splitter.text_to_words(doc.utf8fn)) - LOGDEB(("Db::addOrUpdate: split failed for file name\n")); - // If the ipath is like a path, index the last element. This is // for compound documents like zip and chm for which the filter // uses the file path as ipath. @@ -1180,11 +1193,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, // Mime type newdocument.add_term("T" + doc.mimetype); - // Simple file name indexed for file name searches with a term prefix - // We also add a term for the filename extension if any. 
- if (!doc.utf8fn.empty()) { + // Simple file name indexed unsplit for file name searches with a + // term prefix We also add a term for the filename extension if + // any. + string utf8fn; + if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) { string fn; - if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) { + if (unacmaybefold(utf8fn, fn, "UTF-8", true)) { // We should truncate after extracting the extension, but this is // a pathological case anyway if (fn.size() > 230) @@ -1193,18 +1208,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, if (pos != string::npos && pos != fn.length() - 1) { newdocument.add_term(string("XE") + fn.substr(pos + 1)); } - fn = string("XSFN") + fn; + fn = cstr_fnUnsplitPrefix + fn; newdocument.add_term(fn); } - // Store utf8fn inside the metadata array as keyfn - // (="filename") so that it can be accessed by the "stored" - // processing below, without special-casing it. We only do it - // if keyfn is currently empty, because there could be a value - // already (ie for a mail attachment with a file name - // attribute) - if (doc.meta[Doc::keyfn].empty()) { - doc.meta[Doc::keyfn] = doc.utf8fn; - } } // Udi unique term: this is used for file existence/uptodate @@ -1663,7 +1669,8 @@ bool Db::filenameWildExp(const string& fnexp, vector& names) LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str())); TermMatchResult result; - if (!termMatch(ET_WILD, string(), pattern, result, 1000, Doc::keyfn)) + if (!termMatch(ET_WILD, string(), pattern, result, 1000, + keySysFilenamePrefix)) return false; for (vector::const_iterator it = result.entries.begin(); it != result.entries.end(); it++) diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index d3f1f60f..4361f518 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -212,8 +212,7 @@ class Db { /** Return min and max years for doc mod times in db */ bool maxYearSpan(int *minyear, int *maxyear); - /** Special filename wildcard to XSFN terms expansion. 
- internal/searchdata use only */ + /** Wildcard expansion specific to file names. Internal/sdata use only */ bool filenameWildExp(const string& exp, vector& names); /** Set parameters for synthetic abstract generation */ diff --git a/src/rcldb/rcldoc.cpp b/src/rcldb/rcldoc.cpp index 8a2e9eef..290d3b21 100644 --- a/src/rcldb/rcldoc.cpp +++ b/src/rcldb/rcldoc.cpp @@ -47,7 +47,6 @@ namespace Rcl { void Doc::dump(bool dotext) const { LOGDEB(("Rcl::Doc::dump: url: [%s]\n", url.c_str())); - LOGDEB(("Rcl::Doc::dump: utf8fn: [%s]\n", utf8fn.c_str())); LOGDEB(("Rcl::Doc::dump: ipath: [%s]\n", ipath.c_str())); LOGDEB(("Rcl::Doc::dump: mimetype: [%s]\n", mimetype.c_str())); LOGDEB(("Rcl::Doc::dump: fmtime: [%s]\n", fmtime.c_str())); diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index a06dada4..44a1b64d 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -48,11 +48,6 @@ class Doc { // Query: from doc data. string url; - // Transcoded version of the simple file name for SFN-prefixed - // specific file name indexing - // Index: set by DbIndexer::processone - string utf8fn; - // Internal path for multi-doc files. Ascii // Set by FsIndexer::processone string ipath; @@ -123,7 +118,6 @@ class Doc { /////////////////////////////////////////////////////////////////// void erase() { url.erase(); - utf8fn.erase(); ipath.erase(); mimetype.erase(); fmtime.erase(); @@ -153,6 +147,17 @@ class Doc { return false; } } + bool peekmeta(const string& nm, const string **value = 0) const + { + map::const_iterator it = meta.find(nm); + if (it != meta.end()) { + if (value) + *value = &(it->second); + return true; + } else { + return false; + } + } void dump(bool dotext=false) const; diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 78b09413..7bd5e2cb 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1024,32 +1024,25 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, return true; } -// Translate a FILENAME search clause. 
This mostly (or always) comes +// Translate a FILENAME search clause. This always comes // from a "filename" search from the gui or recollq. A query language // "filename:"-prefixed field will not go through here, but through // the generic field-processing code. // -// In the case of multiple space-separated fragments, we generate an -// AND of OR queries. Each OR query comes from the expansion of a -// fragment. We used to generate a single OR with all expanded terms, -// which did not make much sense. +// We do not split the entry any more (we used to do some crazy thing +// about expanding multiple fragments in the past). We just take the +// value, blanks and all, and expand this against the indexed unsplit +// file names. bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, const string&) { Xapian::Query *qp = (Xapian::Query *)p; *qp = Xapian::Query(); - vector patterns; - TextSplit::stringToStrings(m_text, patterns); vector names; - for (vector::iterator it = patterns.begin(); - it != patterns.end(); it++) { - vector more; - db.filenameWildExp(*it, more); - Xapian::Query tq = Xapian::Query(Xapian::Query::OP_OR, more.begin(), - more.end()); - *qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq); - } + db.filenameWildExp(m_text, names); + *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); + if (m_weight != 1.0) { *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); } diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index d872778c..564b4125 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -262,15 +262,11 @@ protected: int m_slack; }; -/** Filename search clause. This is special because term expansion is only - * performed against the XSFN terms (it's performed against the main index - * for all other fields). Else we could just use a "filename:" field - * This doesn't really make sense (either). 
I think we could either expand - * filenames against all terms and then select the XSFN ones, or always perform - * expansion only against the field's terms ? Anyway this doesn't hurt - * much either. +/** + * Filename search clause. This is special because term expansion is only + * performed against the unsplit file name terms. * - * There is a big advantage though in expanding only against the + * There is a big advantage in expanding only against the * field, especially for file names, because this makes searches for * "*xx" much faster (no need to scan the whole main index). */ diff --git a/src/sampleconf/fields b/src/sampleconf/fields index e7dc7bf3..1794cb02 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -42,12 +42,12 @@ keywords = K # add your own to search for fields produced by the filters and not handled # by default. # Some values are internally reserved by recoll: -# XP (for path elements), XXST, XXND +# XE (file ext), XP (for path elements), XSFN, XSFS, XXST, XXND, # Using XX was not a good idea. +# # I hereby commit to not using XY for Recoll: -# *** USE XY for beginning your local prefixes *** -ext = XE -filename = XSFN +# *** USE XY for beginning your local prefixes *** ie: +# myfield = XYMYPREF recipient = XTO ############################