filename search fields: generate an AND of OR lists out of wildcard expansion instead of a global OR which did not make much sense

This commit is contained in:
Jean-Francois Dockes 2011-01-13 11:47:35 +01:00
parent d80f4478fc
commit 85b36d3c34
5 changed files with 91 additions and 32 deletions

View File

@ -25,15 +25,12 @@ static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.9 2007-12-13 06:58:21 dockes Exp $
#include <string>
#ifndef NO_NAMESPACES
using std::string;
#endif /* NO_NAMESPACES */
#include "unacpp.h"
#include "unac.h"
#include "debuglog.h"
#include "utf8iter.h"
bool unacmaybefold(const std::string &in, std::string &out,
bool unacmaybefold(const string &in, string &out,
const char *encoding, bool dofold)
{
char *cout = 0;
@ -56,6 +53,31 @@ bool unacmaybefold(const std::string &in, std::string &out,
return true;
}
bool unaciscapital(const string& in)
{
if (in.empty())
return false;
Utf8Iter it(in);
string shorter;
it.appendchartostring(shorter);
string noacterm, noaclowterm;
if (!unacmaybefold(shorter, noacterm, "UTF-8", false)) {
LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str()));
return false;
}
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str()));
return false;
}
Utf8Iter it1(noacterm);
Utf8Iter it2(noaclowterm);
if (*it1 != *it2)
return true;
else
return false;
}
#else // not testing
#include <unistd.h>

View File

@ -20,7 +20,15 @@
#include <string>
#ifndef NO_NAMESPACES
using std::string;
#endif /* NO_NAMESPACES */
// A small stringified wrapper for unac.c
extern bool unacmaybefold(const std::string &in, std::string &out,
extern bool unacmaybefold(const string& in, string& out,
const char *encoding, bool dofold);
// Utility function to determine if a string begins with a capital
// letter. Only the first character of the input is examined: it is
// reported as a capital when case-folding its unac-processed form
// changes it. Returns false for an empty string or if the unac
// conversion fails.
extern bool unaciscapital(const string& in);
#endif /* _UNACPP_H_INCLUDED_ */

View File

@ -683,11 +683,40 @@ fvwm
<para><guilabel>File name</guilabel> will specifically look for file
names. The entry will be split at white space characters,
and each pattern will be separately expanded. If you want
to search for a pattern including white space, use
double quotes. The point of having a separate file name
and each fragment will be separately expanded, then the search will
be for file names matching all fragments (this is new in 1.15,
older releases did an OR of the whole thing which did not make
sense). Things to know:
<itemizedlist>
<listitem><para>The search is case- and accent-insensitive.</para>
</listitem>
<listitem><para>Fragments without any wild card
character and not capitalized will be prepended and appended
with '*' (ie: <replaceable>etc</replaceable> ->
<replaceable>*etc*</replaceable>, but
<replaceable>Etc</replaceable> ->
<replaceable>etc</replaceable>). Of course it does not make
sense to have multiple fragments if one of them is capitalized
(as this one will require an exact match).</para>
</listitem>
<listitem><para>If you want to search for a pattern including
white space, use double quotes (ie: <replaceable>"admin
note*"</replaceable>).</para>
</listitem>
<listitem><para>If you have a big index (many files),
excessively generic fragments may result in inefficient
searches.</para>
</listitem>
<listitem><para>As an example, <replaceable>inst
recoll</replaceable> would match
<replaceable>recollinstall.in</replaceable> (and quite a few
others...).</para>
</listitem>
</itemizedlist>
The point of having a separate file name
search is that wild card expansion can be performed more
efficiently on a relatively small subset of the index.</para>
efficiently on a relatively small subset of the index (allowing
wild cards on the left of terms without excessive penalty).</para>
<para>The fourth entry (<guilabel>Query Language</guilabel>) is
described in <link linkend="rcl.search.lang">its own

View File

@ -1383,13 +1383,16 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
string pattern = fnexp;
names.clear();
// If pattern is not quoted, and has no wildcards, we add * at
// If pattern is not capitalized, not quoted (quoted pattern can't
// get here currently anyway), and has no wildcards, we add * at
// each end: match any substring
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
pattern = pattern.substr(1, pattern.size() -2);
} else if (pattern.find_first_of("*?[") == string::npos) {
} else if (pattern.find_first_of("*?[") == string::npos &&
!unaciscapital(pattern)) {
pattern = "*" + pattern + "*";
} // else let it be
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
TermMatchResult result;

View File

@ -403,24 +403,14 @@ class TextSplitQ : public TextSplit {
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
// Check if the first letter is a majuscule in which
// case we do not want to do stem expansion. Note that
// the test is convoluted and possibly problematic
string noacterm, noaclowterm;
if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
// case we do not want to do stem expansion.
bool nostemexp = unaciscapital(interm);
string noaclowterm;
if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
interm.c_str()));
return true;
}
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
noacterm.c_str()));
return true;
}
bool nostemexp = false;
Utf8Iter it1(noacterm);
Utf8Iter it2(noaclowterm);
if (*it1 != *it2)
nostemexp = true;
if (stops.hasStops() && stops.isStop(noaclowterm)) {
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
@ -828,8 +818,15 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
return true;
}
// Translate a FILENAME search clause. Actually this is now mostly
// a "filename" field search.
// Translate a FILENAME search clause. This mostly (or always) comes
// from a "filename" search from the gui or recollq. A query language
// "filename:"-prefixed field will not go through here, but through
// the generic field-processing code.
//
// In the case of multiple space-separated fragments, we generate an
// AND of OR queries. Each OR query comes from the expansion of a
// fragment. We used to generate a single OR with all expanded terms,
// which did not make much sense.
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
const string&)
{
@ -843,10 +840,10 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
it != patterns.end(); it++) {
list<string> more;
db.filenameWildExp(*it, more);
names.splice(names.end(), more);
Xapian::Query tq = Xapian::Query(Xapian::Query::OP_OR, more.begin(),
more.end());
*qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq);
}
// Build a query out of the matching file name terms.
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
return true;
}