Cleaned up file name handling. Fixes an inconsistency where file names were sometimes indexed split and sometimes unsplit. They are now always indexed both ways, with different prefixes. Forces a reindex.

2012-04-13 09:18:08 +02:00 · 2012-04-13 09:18:08 +02:00 · 8b34610dde
commit 8b34610dde
parent 4eaf12fb9c
10 changed files with 77 additions and 73 deletions
--- a/src/index/fsindexer.cpp
+++ b/src/index/fsindexer.cpp
@ -452,8 +452,9 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
 	    doc.fmtime = ascdate;
        if (doc.url.empty())
            doc.url = cstr_fileu + fn;
-	if (doc.utf8fn.empty())
+	const string *fnp = 0;
-	    doc.utf8fn = utf8fn;
+	if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
 	    doc.meta[Rcl::Doc::keyfn] = utf8fn;
 	char cbuf[100]; 
 	sprintf(cbuf, OFFTPC, stp->st_size);
@ -512,7 +513,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
 	LOGDEB1(("Creating empty doc for file\n"));
 	Rcl::Doc fileDoc;
 	fileDoc.fmtime = ascdate;
-	fileDoc.utf8fn = utf8fn;
+	fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
 	fileDoc.mimetype = interner.getMimetype();
 	fileDoc.url = cstr_fileu + fn;
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@ -679,6 +679,11 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
 	    doc.dmtime = it->second;
 	} else if (it->first == cstr_dj_keyorigcharset) {
 	    doc.origcharset = it->second;
 	} else if (it->first == cstr_dj_keyfn) {
 	    // Only if not set during the stack walk
 	    const string *fnp = 0;
 	    if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
 		doc.meta[Rcl::Doc::keyfn] = it->second;
 	} else if (it->first == cstr_dj_keymt || 
 		   it->first == cstr_dj_keycharset) {
 	    // don't need/want these.
@ -735,7 +740,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
 		// We have a non-empty ipath
 		hasipath = true;
 		getKeyValue(docdata, cstr_dj_keymt, doc.mimetype);
-		getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn);
+		getKeyValue(docdata, cstr_dj_keyfn, doc.meta[Rcl::Doc::keyfn]);
 	    } else {
 		if (doc.fbytes.empty())
 		    getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
@ -999,15 +1004,16 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, const string& ipath
 	return FIError;
    }
-    // If indexing compute ipath and significant mimetype.  ipath is
+    // Compute ipath and significant mimetype.  ipath is returned
-    // returned through doc.ipath. We also retrieve some metadata
+    // through doc.ipath. We also retrieve some metadata fields from
-    // fields from the ancesters (like date or author). This is useful
+    // the ancestors (like date or author). This is useful for email
-    // for email attachments. The values will be replaced by those
+    // attachments. The values will be replaced by those internal to
-    // internal to the document (by dijontorcl()) if any, so the order
+    // the document (by dijontorcl()) if any, so the order of calls is
-    // of calls is important.
+    // important. We used to only do this when indexing, but the aux
-    if (!m_forPreview) {
+    // fields like filename and author may be interesting when
-	collectIpathAndMT(doc);
+    // previewing too
-    } else {
+    collectIpathAndMT(doc);
    if (m_forPreview) {
 	doc.mimetype = m_reachedMType;
    }
    // Keep this AFTER collectIpathAndMT
--- a/src/python/recoll/pyrecoll.cpp
+++ b/src/python/recoll/pyrecoll.cpp
@ -428,8 +428,6 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
    case 'f':
 	if (!key.compare(Rcl::Doc::keyfs)) {
 	    self->doc->fbytes = uvalue;
 	} else if (!key.compare(Rcl::Doc::keyfn)) {
 	    self->doc->utf8fn = uvalue;
 	} else if (!key.compare(Rcl::Doc::keyfmt)) {
 	    self->doc->fmtime = uvalue;
 	}
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -109,6 +109,22 @@ static const string cstr_syntAbs("?!#@");
 static map<string, FieldTraits> fldToTraits;
 static PTMutexInit o_fldToTraits_mutex;
 // A bogus fldToTraits key (bogus because not a real field) used to
 // retrieve the prefix used for specific filename searches (unsplit
 // filename, not "filename as 'filename:' field" searches)
 static const string keySysFilenamePrefix("rclUnsplitFN");
 // The prefix for regular "filename:" field searches.
 static const string cstr_fnAsFieldPrefix("XSFN");
 // The prefix for unsplit filename terms used with specific -f or
 // "File Name" GUI entries. There is a compile option to use the same prefix 
 // for both.
 // #define UNSPLIT_FN_PREFIX_SAME_AS_SPLIT
 #if defined(UNSPLIT_FN_PREFIX_SAME_AS_SPLIT)
 static const string cstr_fnUnsplitPrefix(cstr_fnAsFieldPrefix);
 #else
 static const string cstr_fnUnsplitPrefix("XSFS");
 #endif
 static void initFldToTraits() 
 {
    PTMutexLocker locker(o_fldToTraits_mutex);
@ -123,7 +139,9 @@ static void initFldToTraits()
    fldToTraits[Doc::keyabs] = FieldTraits();
    fldToTraits["ext"] = FieldTraits("XE");
-    fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
+
    fldToTraits[Doc::keyfn] = FieldTraits(cstr_fnAsFieldPrefix);
    fldToTraits[keySysFilenamePrefix] = FieldTraits(cstr_fnUnsplitPrefix);
    fldToTraits[cstr_caption] = FieldTraits("S");
    fldToTraits[Doc::keytt] = FieldTraits("S");
@ -220,7 +238,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
    vector<string> keys = parms.getNames(string());
    for (vector<string>::const_iterator it = keys.begin(); 
 	 it != keys.end(); it++) {
-	if (doc.meta.find(*it) == doc.meta.end()) 
+	if (doc.meta.find(*it) == doc.meta.end())
 	    parms.get(*it, doc.meta[*it]);
    }
    doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
@ -1099,11 +1117,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    TextSplitDb splitter(newdocument, nxt);
    tpidx.setTSD(&splitter);
    // Split and index file name as document term(s)
    LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
    if (!splitter.text_to_words(doc.utf8fn))
        LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
    // If the ipath is like a path, index the last element. This is
    // for compound documents like zip and chm for which the filter
    // uses the file path as ipath. 
@ -1180,11 +1193,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    // Mime type
    newdocument.add_term("T" + doc.mimetype);
-    // Simple file name indexed for file name searches with a term prefix
+    // Simple file name indexed unsplit for file name searches with a
-    // We also add a term for the filename extension if any.
+    // term prefix. We also add a term for the filename extension if
-    if (!doc.utf8fn.empty()) {
+    // any.
    string utf8fn;
    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
 	string fn;
-	if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) {
+	if (unacmaybefold(utf8fn, fn, "UTF-8", true)) {
 	    // We should truncate after extracting the extension, but this is
 	    // a pathological case anyway
 	    if (fn.size() > 230)
@ -1193,18 +1208,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 	    if (pos != string::npos && pos != fn.length() - 1) {
 		newdocument.add_term(string("XE") + fn.substr(pos + 1));
 	    }
-	    fn = string("XSFN") + fn;
+	    fn = cstr_fnUnsplitPrefix + fn;
 	    newdocument.add_term(fn);
 	}
        // Store utf8fn inside the metadata array as keyfn
        // (="filename") so that it can be accessed by the "stored"
        // processing below, without special-casing it. We only do it
        // if keyfn is currently empty, because there could be a value
        // already (ie for a mail attachment with a file name
        // attribute)
 	if (doc.meta[Doc::keyfn].empty()) {
            doc.meta[Doc::keyfn] = doc.utf8fn;
 	}
    }
    // Udi unique term: this is used for file existence/uptodate
@ -1663,7 +1669,8 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
    LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
    TermMatchResult result;
-    if (!termMatch(ET_WILD, string(), pattern, result, 1000, Doc::keyfn))
+    if (!termMatch(ET_WILD, string(), pattern, result, 1000, 
 		   keySysFilenamePrefix))
 	return false;
    for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
 	 it != result.entries.end(); it++) 
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -212,8 +212,7 @@ class Db {
    /** Return min and max years for doc mod times in db */
    bool maxYearSpan(int *minyear, int *maxyear);
-    /** Special filename wildcard to XSFN terms expansion.
+    /** Wildcard expansion specific to file names. Internal/sdata use only */
 	internal/searchdata use only */
    bool filenameWildExp(const string& exp, vector<string>& names);
    /** Set parameters for synthetic abstract generation */
--- a/src/rcldb/rcldoc.cpp
+++ b/src/rcldb/rcldoc.cpp
@ -47,7 +47,6 @@ namespace Rcl {
    void Doc::dump(bool dotext) const
    {
        LOGDEB(("Rcl::Doc::dump: url: [%s]\n", url.c_str()));
        LOGDEB(("Rcl::Doc::dump: utf8fn: [%s]\n", utf8fn.c_str()));
        LOGDEB(("Rcl::Doc::dump: ipath: [%s]\n", ipath.c_str()));
        LOGDEB(("Rcl::Doc::dump: mimetype: [%s]\n", mimetype.c_str()));
        LOGDEB(("Rcl::Doc::dump: fmtime: [%s]\n", fmtime.c_str()));
--- a/src/rcldb/rcldoc.h
+++ b/src/rcldb/rcldoc.h
@ -48,11 +48,6 @@ class Doc {
    // Query: from doc data.
    string url;
    // Transcoded version of the simple file name for SFN-prefixed
    // specific file name indexing
    // Index: set by DbIndexer::processone    
    string utf8fn; 
    // Internal path for multi-doc files. Ascii
    // Set by FsIndexer::processone    
    string ipath;
@ -123,7 +118,6 @@ class Doc {
    ///////////////////////////////////////////////////////////////////
    void erase() {
 	url.erase();
 	utf8fn.erase();
 	ipath.erase();
 	mimetype.erase();
 	fmtime.erase();
@ -153,6 +147,17 @@ class Doc {
 	    return false;
 	}
    }
    bool peekmeta(const string& nm, const string **value = 0) const
    {
 	map<string,string>::const_iterator it = meta.find(nm);
 	if (it != meta.end()) {
 	    if (value)
 		*value = &(it->second);
 	    return true;
 	} else {
 	    return false;
 	}
    }
    void dump(bool dotext=false) const;
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -1024,32 +1024,25 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
    return true;
 }
-// Translate a FILENAME search clause. This mostly (or always) comes
+// Translate a FILENAME search clause. This always comes
 // from a "filename" search from the gui or recollq. A query language
 // "filename:"-prefixed field will not go through here, but through
 // the generic field-processing code.
 //
-// In the case of multiple space-separated fragments, we generate an
+// We do not split the entry any more (used to do some crazy thing
-// AND of OR queries. Each OR query comes from the expansion of a
+// about expanding multiple fragments in the past). We just take the
-// fragment. We used to generate a single OR with all expanded terms,
+// value blanks and all and expand this against the indexed unsplit
-// which did not make much sense.
+// file names
 bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, 
 					     const string&)
 {
    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();
    vector<string> patterns;
    TextSplit::stringToStrings(m_text, patterns);
    vector<string> names;
-    for (vector<string>::iterator it = patterns.begin();
+    db.filenameWildExp(m_text, names);
-	 it != patterns.end(); it++) {
+    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
-	vector<string> more;
+
 	db.filenameWildExp(*it, more);
 	Xapian::Query tq = Xapian::Query(Xapian::Query::OP_OR, more.begin(), 
 					 more.end());
 	*qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq);
    }
    if (m_weight != 1.0) {
 	*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
--- a/src/rcldb/searchdata.h
+++ b/src/rcldb/searchdata.h
@ -262,15 +262,11 @@ protected:
    int m_slack;
 };
-/** Filename search clause. This is special because term expansion is only
+/** 
- * performed against the XSFN terms (it's performed against the main index
+ * Filename search clause. This is special because term expansion is only
- * for all other fields). Else we could just use a "filename:" field
+ * performed against the unsplit file name terms. 
 * This doesn't really make sense (either). I think we could either expand
 * filenames against all terms and then select the XSFN ones, or always perform
 * expansion only against the field's terms ? Anyway this doesn't hurt
 * much either. 
 *
- * There is a big advantage though in expanding only against the
+ * There is a big advantage in expanding only against the
 * field, especially for file names, because this makes searches for
 * "*xx" much faster (no need to scan the whole main index).
 */
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@ -42,12 +42,12 @@ keywords = K
 # add your own to search for fields produced by the filters and not handled
 # by default. 
 # Some values are internally reserved by recoll: 
-#   XP (for path elements), XXST, XXND
+#   XE (file ext), XP (for path elements), XSFN, XSFS, XXST, XXND
 # Using XX was not a good idea. 
 #
 # I hereby commit to not using XY for Recoll:
-# *** USE XY for beginning your local prefixes ***
+# *** USE XY for beginning your local prefixes *** ie:
-ext = XE
+# myfield = XYMYPREF
 filename = XSFN
 recipient = XTO
 ############################