filename search fields: generate an AND of OR lists out of wildcard expansion instead of a global OR which did not make much sense
This commit is contained in:
parent
d80f4478fc
commit
85b36d3c34
@ -25,15 +25,12 @@ static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.9 2007-12-13 06:58:21 dockes Exp $
|
||||
|
||||
#include <string>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
#include "unacpp.h"
|
||||
#include "unac.h"
|
||||
#include "debuglog.h"
|
||||
#include "utf8iter.h"
|
||||
|
||||
|
||||
bool unacmaybefold(const std::string &in, std::string &out,
|
||||
bool unacmaybefold(const string &in, string &out,
|
||||
const char *encoding, bool dofold)
|
||||
{
|
||||
char *cout = 0;
|
||||
@ -56,6 +53,31 @@ bool unacmaybefold(const std::string &in, std::string &out,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool unaciscapital(const string& in)
|
||||
{
|
||||
if (in.empty())
|
||||
return false;
|
||||
Utf8Iter it(in);
|
||||
string shorter;
|
||||
it.appendchartostring(shorter);
|
||||
|
||||
string noacterm, noaclowterm;
|
||||
if (!unacmaybefold(shorter, noacterm, "UTF-8", false)) {
|
||||
LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str()));
|
||||
return false;
|
||||
}
|
||||
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||
LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str()));
|
||||
return false;
|
||||
}
|
||||
Utf8Iter it1(noacterm);
|
||||
Utf8Iter it2(noaclowterm);
|
||||
if (*it1 != *it2)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
#else // not testing
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@ -20,7 +20,15 @@
|
||||
|
||||
#include <string>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
// A small stringified wrapper for unac.c
|
||||
extern bool unacmaybefold(const std::string &in, std::string &out,
|
||||
extern bool unacmaybefold(const string& in, string& out,
|
||||
const char *encoding, bool dofold);
|
||||
|
||||
// Utility function to determine if string begins with capital
|
||||
extern bool unaciscapital(const string& in);
|
||||
|
||||
#endif /* _UNACPP_H_INCLUDED_ */
|
||||
|
||||
@ -683,11 +683,40 @@ fvwm
|
||||
|
||||
<para><guilabel>File name</guilabel> will specifically look for file
|
||||
names. The entry will be split at white space characters,
|
||||
and each pattern will be separately expanded. If you want
|
||||
to search for a pattern including white space, use
|
||||
double quotes. The point of having a separate file name
|
||||
and each fragment will be separately expanded, then the search will
|
||||
be for file names matching all fragments (this is new in 1.15,
|
||||
older releases did an OR of the whole thing which did not make
|
||||
sense). Things to know:
|
||||
<itemizedlist>
|
||||
<listitem><para>The search is case- and accent-insensitive.</para>
|
||||
</listitem>
|
||||
<listitem><para>Fragments without any wild card
|
||||
character and not capitalized will be prepended and appended
|
||||
with '*' (ie: <replaceable>etc</replaceable> ->
|
||||
<replaceable>*etc*</replaceable>, but
|
||||
<replaceable>Etc</replaceable> ->
|
||||
<replaceable>etc</replaceable>). Of course it does not make
|
||||
sense to have multiple fragments if one of them is capitalized
|
||||
(as this one will require an exact match).</para>
|
||||
</listitem>
|
||||
<listitem><para>If you want to search for a pattern including
|
||||
white space, use double quotes (ie: <replaceable>"admin
|
||||
note*"</replaceable>).</para>
|
||||
</listitem>
|
||||
<listitem><para>If you have a big index (many files),
|
||||
excessively generic fragments may result in inefficient
|
||||
searches.</para>
|
||||
</listitem>
|
||||
<listitem><para>As an example, <replaceable>inst
|
||||
recoll</replaceable> would match
|
||||
<replaceable>recollinstall.in</replaceable> (and quite a few
|
||||
others...).</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
The point of having a separate file name
|
||||
search is that wild card expansion can be performed more
|
||||
efficiently on a relatively small subset of the index.</para>
|
||||
efficiently on a relatively small subset of the index (allowing
|
||||
wild cards on the left of terms without excessive penalty).</para>
|
||||
|
||||
<para>The fourth entry (<guilabel>Query Language</guilabel>) is
|
||||
described in <link linkend="rcl.search.lang">its own
|
||||
|
||||
@ -1383,13 +1383,16 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
|
||||
string pattern = fnexp;
|
||||
names.clear();
|
||||
|
||||
// If pattern is not quoted, and has no wildcards, we add * at
|
||||
// If pattern is not capitalized, not quoted (quoted pattern can't
|
||||
// get here currently anyway), and has no wildcards, we add * at
|
||||
// each end: match any substring
|
||||
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
||||
pattern = pattern.substr(1, pattern.size() -2);
|
||||
} else if (pattern.find_first_of("*?[") == string::npos) {
|
||||
} else if (pattern.find_first_of("*?[") == string::npos &&
|
||||
!unaciscapital(pattern)) {
|
||||
pattern = "*" + pattern + "*";
|
||||
} // else let it be
|
||||
|
||||
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
TermMatchResult result;
|
||||
|
||||
@ -403,24 +403,14 @@ class TextSplitQ : public TextSplit {
|
||||
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
|
||||
|
||||
// Check if the first letter is a majuscule in which
|
||||
// case we do not want to do stem expansion. Note that
|
||||
// the test is convoluted and possibly problematic
|
||||
string noacterm, noaclowterm;
|
||||
if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
|
||||
// case we do not want to do stem expansion.
|
||||
bool nostemexp = unaciscapital(interm);
|
||||
string noaclowterm;
|
||||
if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
|
||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
||||
interm.c_str()));
|
||||
return true;
|
||||
}
|
||||
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
||||
noacterm.c_str()));
|
||||
return true;
|
||||
}
|
||||
bool nostemexp = false;
|
||||
Utf8Iter it1(noacterm);
|
||||
Utf8Iter it2(noaclowterm);
|
||||
if (*it1 != *it2)
|
||||
nostemexp = true;
|
||||
|
||||
if (stops.hasStops() && stops.isStop(noaclowterm)) {
|
||||
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
|
||||
@ -828,8 +818,15 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Translate a FILENAME search clause. Actually this is now mostly
|
||||
// a "filename" field search.
|
||||
// Translate a FILENAME search clause. This mostly (or always) comes
|
||||
// from a "filename" search from the gui or recollq. A query language
|
||||
// "filename:"-prefixed field will not go through here, but through
|
||||
// the generic field-processing code.
|
||||
//
|
||||
// In the case of multiple space-separated fragments, we generate an
|
||||
// AND of OR queries. Each OR query comes from the expansion of a
|
||||
// fragment. We used to generate a single OR with all expanded terms,
|
||||
// which did not make much sense.
|
||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string&)
|
||||
{
|
||||
@ -843,10 +840,10 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||
it != patterns.end(); it++) {
|
||||
list<string> more;
|
||||
db.filenameWildExp(*it, more);
|
||||
names.splice(names.end(), more);
|
||||
Xapian::Query tq = Xapian::Query(Xapian::Query::OP_OR, more.begin(),
|
||||
more.end());
|
||||
*qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq);
|
||||
}
|
||||
// Build a query out of the matching file name terms.
|
||||
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user