filename search fields: generate an AND of OR lists out of wildcard expansion instead of a global OR which did not make much sense

2011-01-13 11:47:35 +01:00 · 2011-01-13 11:47:35 +01:00 · 85b36d3c34
commit 85b36d3c34
parent d80f4478fc
5 changed files with 91 additions and 32 deletions
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@ -25,15 +25,12 @@ static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.9 2007-12-13 06:58:21 dockes Exp $

 #include <string>

-#ifndef NO_NAMESPACES
-using std::string;
-#endif /* NO_NAMESPACES */
-
 #include "unacpp.h"
 #include "unac.h"
+#include "debuglog.h"
+#include "utf8iter.h"

-
-bool unacmaybefold(const std::string &in, std::string &out, 
+bool unacmaybefold(const string &in, string &out, 
 		   const char *encoding, bool dofold)
 {
    char *cout = 0;
@ -56,6 +53,31 @@ bool unacmaybefold(const std::string &in, std::string &out,
    return true;
 }

+bool unaciscapital(const string& in) // Tells whether 'in' begins with a capital: true when its first character, once passed through unac, is changed by case folding
+{
+    if (in.empty()) // An empty string has no first character: report "not capital"
+	return false;
+    Utf8Iter it(in); // NOTE(review): presumably positioned on the first UTF-8 character of 'in' -- confirm in utf8iter.h
+    string shorter;
+    it.appendchartostring(shorter); // Only the character under the iterator is tested, not the whole input
+
+    string noacterm, noaclowterm; // Results of the dofold=false pass and of the dofold=true pass, respectively
+    if (!unacmaybefold(shorter, noacterm, "UTF-8", false)) { // First pass, dofold=false (presumably accent stripping only -- see unac.h)
+	LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str())); // The log shows the full input, not just the tested character
+	return false; // A conversion failure is reported as "not capital"
+    } 
+    if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { // Second pass, dofold=true, applied to the output of the first pass
+	LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str()));
+	return false; // Same convention: failure means "not capital"
+    }
+    Utf8Iter it1(noacterm); // Iterator over the non-folded result
+    Utf8Iter it2(noaclowterm); // Iterator over the folded result
+    if (*it1 != *it2) // Folding changed the first character, so the original is taken to be a capital
+	return true;
+    else
+	return false;
+}
+
 #else // not testing

 #include <unistd.h>
--- a/src/common/unacpp.h
+++ b/src/common/unacpp.h
@ -20,7 +20,15 @@

 #include <string>

+#ifndef NO_NAMESPACES
+using std::string;
+#endif /* NO_NAMESPACES */
+
 // A small stringified wrapper for unac.c
-extern bool unacmaybefold(const std::string &in, std::string &out, 
+extern bool unacmaybefold(const string& in, string& out, 
 			  const char *encoding, bool dofold);
+
+// Utility function to determine if string begins with capital
+extern bool unaciscapital(const string& in);
+
 #endif /* _UNACPP_H_INCLUDED_ */
--- a/src/doc/user/usermanual.sgml
+++ b/src/doc/user/usermanual.sgml
@ -683,11 +683,40 @@ fvwm

      <para><guilabel>File name</guilabel> will specifically look for file
        names. The entry will be split at white space characters,
-        and each pattern will be separately expanded. If you want
-        to search for a pattern including white space, use
-        double quotes. The point of having a separate file name
+        and each fragment will be separately expanded, then the search will
+        be for file names matching all fragments (this is new in 1.15,
+        older releases did an OR of the whole thing which did not make
+        sense). Things to know:
+        <itemizedlist>
+            <listitem><para>The search is case- and accent-insensitive.</para>
+            </listitem>
+            <listitem><para>Fragments without any wild card
+            character and not capitalized will be prepended and appended
+            with '*' (e.g. <replaceable>etc</replaceable> ->
+            <replaceable>*etc*</replaceable>, but
+            <replaceable>Etc</replaceable> ->
+            <replaceable>etc</replaceable>). Of course it does not make
+            sense to have multiple fragments if one of them is capitalized
+            (as this one will require an exact match).</para> 
+            </listitem>
+            <listitem><para>If you want to search for a pattern including
+            white space, use double quotes (e.g. <replaceable>"admin
+            note*"</replaceable>).</para> 
+            </listitem>
+            <listitem><para>If you have a big index (many files),
+            excessively generic fragments may result in inefficient
+            searches.</para>
+            </listitem>
+            <listitem><para>As an example, <replaceable>inst
+            recoll</replaceable> would match
+            <replaceable>recollinstall.in</replaceable> (and quite a few
+            others...).</para> 
+            </listitem>
+          </itemizedlist>
+        The point of having a separate file name
        search is that wild card expansion can be performed more
-        efficiently on a relatively small subset of the index.</para>
+        efficiently on a relatively small subset of the index (allowing
+        wild cards on the left of terms without excessive penalty).</para>

      <para>The fourth entry (<guilabel>Query Language</guilabel>) is
      described in <link linkend="rcl.search.lang">its own
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1383,13 +1383,16 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
    string pattern = fnexp;
    names.clear();

-    // If pattern is not quoted, and has no wildcards, we add * at
+    // If pattern is not capitalized, not quoted (quoted pattern can't
+    // get here currently anyway), and has no wildcards, we add * at
    // each end: match any substring
    if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
 	pattern = pattern.substr(1, pattern.size() -2);
-    } else if (pattern.find_first_of("*?[") == string::npos) {
+    } else if (pattern.find_first_of("*?[") == string::npos && 
+	       !unaciscapital(pattern)) {
 	pattern = "*" + pattern + "*";
    } // else let it be
+
    LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));

    TermMatchResult result;
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -403,24 +403,14 @@ class TextSplitQ : public TextSplit {
 	LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));

 	// Check if the first letter is a majuscule in which
-	// case we do not want to do stem expansion. Note that
-	// the test is convoluted and possibly problematic
-	string noacterm, noaclowterm;
-	if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
+	// case we do not want to do stem expansion. 
+	bool nostemexp = unaciscapital(interm);
+	string noaclowterm;
+	if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
 	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", 
                     interm.c_str()));
 	    return true;
-	} 
-	if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", 
-                     noacterm.c_str()));
-	    return true;
 	}
-	bool nostemexp = false;
-	Utf8Iter it1(noacterm);
-	Utf8Iter it2(noaclowterm);
-	if (*it1 != *it2)
-	    nostemexp = true;

 	if (stops.hasStops() && stops.isStop(noaclowterm)) {
 	    LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n", 
@ -828,8 +818,15 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
    return true;
 }

-// Translate a FILENAME search clause. Actually this is now mostly
-// a "filename" field search.
+// Translate a FILENAME search clause. This mostly (or always) comes
+// from a "filename" search from the gui or recollq. A query language
+// "filename:"-prefixed field will not go through here, but through
+// the generic field-processing code.
+//
+// In the case of multiple space-separated fragments, we generate an
+// AND of OR queries. Each OR query comes from the expansion of a
+// fragment. We used to generate a single OR with all expanded terms,
+// which did not make much sense.
 bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, 
 					     const string&)
 {
@ -843,10 +840,10 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
 	 it != patterns.end(); it++) {
 	list<string> more;
 	db.filenameWildExp(*it, more);
-	names.splice(names.end(), more);
+	Xapian::Query tq = Xapian::Query(Xapian::Query::OP_OR, more.begin(), 
+					 more.end());
+	*qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq);
    }
-    // Build a query out of the matching file name terms.
-    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
    return true;
 }