merged the case/diac sensitivity code back into trunk
This commit is contained in:
commit
52bc9f4aa3
@ -23,9 +23,9 @@
|
|||||||
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
#include <iostream>
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <vector>
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include ASPELL_INCLUDE
|
#include ASPELL_INCLUDE
|
||||||
|
|
||||||
@ -33,7 +33,7 @@
|
|||||||
#include "execmd.h"
|
#include "execmd.h"
|
||||||
#include "rclaspell.h"
|
#include "rclaspell.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
#include "unacpp.h"
|
||||||
#include "ptmutex.h"
|
#include "ptmutex.h"
|
||||||
|
|
||||||
// Just a place where we keep the Aspell library entry points together
|
// Just a place where we keep the Aspell library entry points together
|
||||||
@ -260,8 +260,17 @@ public:
|
|||||||
while (m_db.termWalkNext(m_tit, *m_input)) {
|
while (m_db.termWalkNext(m_tit, *m_input)) {
|
||||||
if (!Rcl::Db::isSpellingCandidate(*m_input))
|
if (!Rcl::Db::isSpellingCandidate(*m_input))
|
||||||
continue;
|
continue;
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (!o_index_stripchars) {
|
||||||
|
string lower;
|
||||||
|
if (!unacmaybefold(*m_input, lower, "UTF-8", UNACOP_FOLD))
|
||||||
|
continue;
|
||||||
|
m_input->swap(lower);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// Got a non-empty sort-of appropriate term, let's send it to
|
// Got a non-empty sort-of appropriate term, let's send it to
|
||||||
// aspell
|
// aspell
|
||||||
|
LOGDEB2(("ASpExecPv: [%s]\n", m_input->c_str()));
|
||||||
m_input->append("\n");
|
m_input->append("\n");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -335,17 +344,29 @@ bool Aspell::make_speller(string& reason)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
|
bool Aspell::check(const string &iterm, string& reason)
|
||||||
{
|
{
|
||||||
LOGDEB2(("Aspell::check [%s]\n", term.c_str()));
|
LOGDEB2(("Aspell::check [%s]\n", iterm.c_str()));
|
||||||
|
string mterm(iterm);
|
||||||
|
|
||||||
if (!ok() || !make_speller(reason))
|
if (!ok() || !make_speller(reason))
|
||||||
return false;
|
return false;
|
||||||
if (term.empty())
|
if (iterm.empty())
|
||||||
return true; //??
|
return true; //??
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (!o_index_stripchars) {
|
||||||
|
string lower;
|
||||||
|
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
|
LOGERR(("Aspell::check : cant lowercase input\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
mterm.swap(lower);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
int ret = aapi.aspell_speller_check(m_data->m_speller,
|
int ret = aapi.aspell_speller_check(m_data->m_speller,
|
||||||
term.c_str(), term.length());
|
mterm.c_str(), mterm.length());
|
||||||
reason.clear();
|
reason.clear();
|
||||||
switch (ret) {
|
switch (ret) {
|
||||||
case 0: return false;
|
case 0: return false;
|
||||||
@ -358,19 +379,31 @@ bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Aspell::suggest(Rcl::Db &db, const string &term,
|
bool Aspell::suggest(Rcl::Db &db, const string &_term,
|
||||||
list<string>& suggestions, string& reason)
|
list<string>& suggestions, string& reason)
|
||||||
{
|
{
|
||||||
if (!ok() || !make_speller(reason))
|
if (!ok() || !make_speller(reason))
|
||||||
return false;
|
return false;
|
||||||
if (term.empty())
|
string mterm(_term);
|
||||||
|
if (mterm.empty())
|
||||||
return true; //??
|
return true; //??
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (!o_index_stripchars) {
|
||||||
|
string lower;
|
||||||
|
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
|
LOGERR(("Aspell::check : cant lowercase input\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
mterm.swap(lower);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
AspellCanHaveError *ret;
|
AspellCanHaveError *ret;
|
||||||
|
|
||||||
const AspellWordList *wl =
|
const AspellWordList *wl =
|
||||||
aapi.aspell_speller_suggest(m_data->m_speller,
|
aapi.aspell_speller_suggest(m_data->m_speller,
|
||||||
term.c_str(), term.length());
|
mterm.c_str(), mterm.length());
|
||||||
if (wl == 0) {
|
if (wl == 0) {
|
||||||
reason = aapi.aspell_speller_error_message(m_data->m_speller);
|
reason = aapi.aspell_speller_error_message(m_data->m_speller);
|
||||||
return false;
|
return false;
|
||||||
@ -385,7 +418,7 @@ bool Aspell::suggest(Rcl::Db &db, const string &term,
|
|||||||
// ******** This should depend if
|
// ******** This should depend if
|
||||||
// stemming is turned on or not for querying *******
|
// stemming is turned on or not for querying *******
|
||||||
string sw(word);
|
string sw(word);
|
||||||
if (db.termExists(sw) && db.stemDiffers("english", sw, term))
|
if (db.termExists(sw) && db.stemDiffers("english", sw, mterm))
|
||||||
suggestions.push_back(word);
|
suggestions.push_back(word);
|
||||||
}
|
}
|
||||||
aapi.delete_aspell_string_enumeration(els);
|
aapi.delete_aspell_string_enumeration(els);
|
||||||
@ -418,12 +451,12 @@ using namespace std;
|
|||||||
|
|
||||||
static char *thisprog;
|
static char *thisprog;
|
||||||
RclConfig *rclconfig;
|
RclConfig *rclconfig;
|
||||||
Rcl::Db rcldb;
|
|
||||||
|
|
||||||
static char usage [] =
|
static char usage [] =
|
||||||
" -b : build dictionary\n"
|
" -b : build dictionary\n"
|
||||||
" -s <term>: suggestions for term\n"
|
" -s <term>: suggestions for term\n"
|
||||||
"\n\n"
|
" -c <term>: check term\n"
|
||||||
|
"\n"
|
||||||
;
|
;
|
||||||
static void
|
static void
|
||||||
Usage(void)
|
Usage(void)
|
||||||
@ -436,6 +469,7 @@ static int op_flags;
|
|||||||
#define OPT_MOINS 0x1
|
#define OPT_MOINS 0x1
|
||||||
#define OPT_s 0x2
|
#define OPT_s 0x2
|
||||||
#define OPT_b 0x4
|
#define OPT_b 0x4
|
||||||
|
#define OPT_c 0x8
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@ -452,6 +486,10 @@ int main(int argc, char **argv)
|
|||||||
while (**argv)
|
while (**argv)
|
||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
case 'b': op_flags |= OPT_b; break;
|
case 'b': op_flags |= OPT_b; break;
|
||||||
|
case 'c': op_flags |= OPT_c; if (argc < 2) Usage();
|
||||||
|
word = *(++argv);
|
||||||
|
argc--;
|
||||||
|
goto b1;
|
||||||
case 's': op_flags |= OPT_s; if (argc < 2) Usage();
|
case 's': op_flags |= OPT_s; if (argc < 2) Usage();
|
||||||
word = *(++argv);
|
word = *(++argv);
|
||||||
argc--;
|
argc--;
|
||||||
@ -477,7 +515,9 @@ int main(int argc, char **argv)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
|
Rcl::Db rcldb(rclconfig);
|
||||||
|
|
||||||
|
if (!rcldb.open(Rcl::Db::DbRO, 0)) {
|
||||||
fprintf(stderr, "Could not open database in %s\n", dbdir.c_str());
|
fprintf(stderr, "Could not open database in %s\n", dbdir.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@ -493,6 +533,18 @@ int main(int argc, char **argv)
|
|||||||
cerr << "buildDict failed: " << reason << endl;
|
cerr << "buildDict failed: " << reason << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
} else if (op_flags & OPT_c) {
|
||||||
|
bool ret = aspell.check(word, reason);
|
||||||
|
if (!ret && reason.size()) {
|
||||||
|
cerr << "Aspell error: " << reason << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
cout << word;
|
||||||
|
if (ret) {
|
||||||
|
cout << " is in dictionary" << endl;
|
||||||
|
} else {
|
||||||
|
cout << " not in dictionary" << endl;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
list<string> suggs;
|
list<string> suggs;
|
||||||
if (!aspell.suggest(rcldb, word, suggs, reason)) {
|
if (!aspell.suggest(rcldb, word, suggs, reason)) {
|
||||||
|
|||||||
@ -37,11 +37,6 @@
|
|||||||
#include "rclconfig.h"
|
#include "rclconfig.h"
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
|
||||||
using std::string;
|
|
||||||
using std::list;
|
|
||||||
#endif // NO_NAMESPACES
|
|
||||||
|
|
||||||
class AspellData;
|
class AspellData;
|
||||||
|
|
||||||
class Aspell {
|
class Aspell {
|
||||||
@ -53,26 +48,31 @@ class Aspell {
|
|||||||
bool ok() const;
|
bool ok() const;
|
||||||
|
|
||||||
/** Find the aspell command and shared library, init function pointers */
|
/** Find the aspell command and shared library, init function pointers */
|
||||||
bool init(string &reason);
|
bool init(std::string &reason);
|
||||||
|
|
||||||
/** Build dictionary out of index term list. This is done at the end
|
/** Build dictionary out of index term list. This is done at the end
|
||||||
* of an indexing pass. */
|
* of an indexing pass. */
|
||||||
bool buildDict(Rcl::Db &db, string &reason);
|
bool buildDict(Rcl::Db &db, std::string &reason);
|
||||||
|
|
||||||
/** Check that word is in dictionary. ret==false && !reason.empty() => err*/
|
/** Check that word is in dictionary. Note that this would mean
|
||||||
bool check(Rcl::Db &db, const string& term, string& reason);
|
* that the EXACT word is: aspell just does a lookup, no
|
||||||
|
* grammatical, case or diacritics magic of any kind
|
||||||
|
*
|
||||||
|
* @return true if word in dic, false if not. reason.size() -> error
|
||||||
|
*/
|
||||||
|
bool check(const std::string& term, std::string& reason);
|
||||||
|
|
||||||
/** Return a list of possible expansions for a given word */
|
/** Return a list of possible expansions for a given word */
|
||||||
bool suggest(Rcl::Db &db, const string& term, list<string> &suggestions,
|
bool suggest(Rcl::Db &db, const std::string& term,
|
||||||
string &reason);
|
std::list<std::string> &suggestions, std::string &reason);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
string dicPath();
|
std::string dicPath();
|
||||||
RclConfig *m_config;
|
RclConfig *m_config;
|
||||||
string m_lang;
|
std::string m_lang;
|
||||||
AspellData *m_data;
|
AspellData *m_data;
|
||||||
|
|
||||||
bool make_speller(string& reason);
|
bool make_speller(std::string& reason);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* RCL_USE_ASPELL */
|
#endif /* RCL_USE_ASPELL */
|
||||||
|
|||||||
@ -15,6 +15,8 @@
|
|||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
*/
|
*/
|
||||||
#ifndef TEST_RCLCONFIG
|
#ifndef TEST_RCLCONFIG
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
@ -34,6 +36,7 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include "cstr.h"
|
#include "cstr.h"
|
||||||
#include "pathut.h"
|
#include "pathut.h"
|
||||||
@ -45,15 +48,8 @@
|
|||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "fstreewalk.h"
|
#include "fstreewalk.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
using namespace std;
|
bool o_index_stripchars;
|
||||||
#endif /* NO_NAMESPACES */
|
|
||||||
|
|
||||||
#ifndef MIN
|
|
||||||
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
|
|
||||||
#endif
|
|
||||||
#ifndef MAX
|
|
||||||
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool ParamStale::needrecompute()
|
bool ParamStale::needrecompute()
|
||||||
@ -77,6 +73,7 @@ bool ParamStale::needrecompute()
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ParamStale::init(RclConfig *rconf, ConfNull *cnf, const string& nm)
|
void ParamStale::init(RclConfig *rconf, ConfNull *cnf, const string& nm)
|
||||||
{
|
{
|
||||||
parent = rconf;
|
parent = rconf;
|
||||||
@ -239,6 +236,14 @@ bool RclConfig::updateMainConfig()
|
|||||||
FsTreeWalker::setNoFnmPathname();
|
FsTreeWalker::setNoFnmPathname();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
static int m_index_stripchars_init = 0;
|
||||||
|
if (!m_index_stripchars_init) {
|
||||||
|
getConfParam("indexStripChars", &o_index_stripchars);
|
||||||
|
m_index_stripchars_init = 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -303,5 +303,13 @@ class RclConfig {
|
|||||||
bool readFieldsConfig(const string& errloc);
|
bool readFieldsConfig(const string& errloc);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// This global variable defines if we are running with an index
|
||||||
|
// stripped of accents and case or a raw one. Ideally, it should be
|
||||||
|
// constant, but it needs to be initialized from the configuration, so
|
||||||
|
// there is no way to do this. It never changes after initialization
|
||||||
|
// of course. When set, it is supposed to get all of recoll to behave as if
|
||||||
|
// it was compiled with RCL_INDEX_STRIPCHARS
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
extern bool o_index_stripchars;
|
||||||
|
#endif
|
||||||
#endif /* _RCLCONFIG_H_INCLUDED_ */
|
#endif /* _RCLCONFIG_H_INCLUDED_ */
|
||||||
|
|||||||
@ -63,26 +63,57 @@ bool unacmaybefold(const string &in, string &out,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Functions to determine upper-case or accented status could be implemented
|
||||||
|
// hugely more efficiently inside the unac c code, but they are only used for
|
||||||
|
// testing user-entered terms, so we don't really care.
|
||||||
bool unaciscapital(const string& in)
|
bool unaciscapital(const string& in)
|
||||||
{
|
{
|
||||||
|
LOGDEB2(("unaciscapital: [%s]\n", in.c_str()));
|
||||||
if (in.empty())
|
if (in.empty())
|
||||||
return false;
|
return false;
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
string shorter;
|
string shorter;
|
||||||
it.appendchartostring(shorter);
|
it.appendchartostring(shorter);
|
||||||
|
|
||||||
string noacterm, noaclowterm;
|
string lower;
|
||||||
if (!unacmaybefold(shorter, noacterm, "UTF-8", UNACOP_UNAC)) {
|
if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str()));
|
LOGINFO(("unaciscapital: unac/fold failed for [%s]\n", in.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", UNACOP_UNACFOLD)) {
|
Utf8Iter it1(lower);
|
||||||
LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str()));
|
if (*it != *it1)
|
||||||
|
return true;
|
||||||
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
Utf8Iter it1(noacterm);
|
bool unachasuppercase(const string& in)
|
||||||
Utf8Iter it2(noaclowterm);
|
{
|
||||||
if (*it1 != *it2)
|
LOGDEB2(("unachasuppercase: [%s]\n", in.c_str()));
|
||||||
|
if (in.empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
string lower;
|
||||||
|
if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
|
LOGINFO(("unachasuppercase: unac/fold failed for [%s]\n", in.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (lower != in)
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
bool unachasaccents(const string& in)
|
||||||
|
{
|
||||||
|
LOGDEB2(("unachasaccents: [%s]\n", in.c_str()));
|
||||||
|
if (in.empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
string noac;
|
||||||
|
if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
|
||||||
|
LOGINFO(("unachasaccents: unac/unac failed for [%s]\n", in.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (noac != in)
|
||||||
return true;
|
return true;
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
@ -107,12 +138,15 @@ static char *thisprog;
|
|||||||
|
|
||||||
static char usage [] = "\n"
|
static char usage [] = "\n"
|
||||||
"[-c|-C] <encoding> <infile> <outfile>\n"
|
"[-c|-C] <encoding> <infile> <outfile>\n"
|
||||||
" Default : unaccent\n"
|
" Default : unaccent\n"
|
||||||
" -c : unaccent and casefold\n"
|
" -c : unaccent and casefold\n"
|
||||||
" -C : casefold only\n"
|
" -C : casefold only\n"
|
||||||
|
"-t <string> test string as capitalized, upper-case anywhere, accents\n"
|
||||||
|
" the parameter is supposedly utf-8 so this can only work in an utf-8\n"
|
||||||
|
" locale\n"
|
||||||
"\n";
|
"\n";
|
||||||
|
|
||||||
;
|
;
|
||||||
|
|
||||||
static void
|
static void
|
||||||
Usage(void)
|
Usage(void)
|
||||||
{
|
{
|
||||||
@ -123,6 +157,7 @@ Usage(void)
|
|||||||
static int op_flags;
|
static int op_flags;
|
||||||
#define OPT_c 0x2
|
#define OPT_c 0x2
|
||||||
#define OPT_C 0x4
|
#define OPT_C 0x4
|
||||||
|
#define OPT_t 0x8
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@ -140,58 +175,73 @@ int main(int argc, char **argv)
|
|||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
case 'c': op_flags |= OPT_c; break;
|
case 'c': op_flags |= OPT_c; break;
|
||||||
case 'C': op_flags |= OPT_C; break;
|
case 'C': op_flags |= OPT_C; break;
|
||||||
|
case 't': op_flags |= OPT_t; break;
|
||||||
default: Usage(); break;
|
default: Usage(); break;
|
||||||
}
|
}
|
||||||
argc--; argv++;
|
argc--; argv++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (op_flags & OPT_c) {
|
if (op_flags & OPT_t) {
|
||||||
op = UNACOP_UNACFOLD;
|
if (argc != 1)
|
||||||
} else if (op_flags & OPT_C) {
|
Usage();
|
||||||
op = UNACOP_FOLD;
|
string in = *argv++;argc--;
|
||||||
}
|
bool capital, upper, accent;
|
||||||
|
capital = unaciscapital(in);
|
||||||
if (argc != 3) {
|
upper = unachasuppercase(in);
|
||||||
Usage();
|
accent = unachasaccents(in);
|
||||||
}
|
cout << "[" << in << "] : " <<
|
||||||
|
"capitalized: " << (capital ? "Yes. " : "No. ") <<
|
||||||
const char *encoding = *argv++; argc--;
|
"has uppercase: " << (upper ? "Yes. " : "No. ") <<
|
||||||
string ifn = *argv++; argc--;
|
"has accents: " << (accent ? "Yes. " : "No. ") <<
|
||||||
if (!ifn.compare("stdin"))
|
endl;
|
||||||
ifn.clear();
|
return 0;
|
||||||
const char *ofn = *argv++; argc--;
|
|
||||||
|
|
||||||
string reason;
|
|
||||||
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
|
||||||
|
|
||||||
string odata;
|
|
||||||
if (!file_to_string(ifn, odata)) {
|
|
||||||
cerr << "file_to_string " << ifn << " : " << odata << endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
string ndata;
|
|
||||||
if (!unacmaybefold(odata, ndata, encoding, op)) {
|
|
||||||
cerr << "unac: " << ndata << endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int fd;
|
|
||||||
if (strcmp(ofn, "stdout")) {
|
|
||||||
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
|
||||||
} else {
|
} else {
|
||||||
fd = 1;
|
if (argc != 3)
|
||||||
|
Usage();
|
||||||
|
if (op_flags & OPT_c) {
|
||||||
|
op = UNACOP_UNACFOLD;
|
||||||
|
} else if (op_flags & OPT_C) {
|
||||||
|
op = UNACOP_FOLD;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *encoding = *argv++; argc--;
|
||||||
|
string ifn = *argv++; argc--;
|
||||||
|
if (!ifn.compare("stdin"))
|
||||||
|
ifn.clear();
|
||||||
|
const char *ofn = *argv++; argc--;
|
||||||
|
|
||||||
|
string reason;
|
||||||
|
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
||||||
|
|
||||||
|
string odata;
|
||||||
|
if (!file_to_string(ifn, odata)) {
|
||||||
|
cerr << "file_to_string " << ifn << " : " << odata << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
string ndata;
|
||||||
|
if (!unacmaybefold(odata, ndata, encoding, op)) {
|
||||||
|
cerr << "unac: " << ndata << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int fd;
|
||||||
|
if (strcmp(ofn, "stdout")) {
|
||||||
|
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
||||||
|
} else {
|
||||||
|
fd = 1;
|
||||||
|
}
|
||||||
|
if (fd < 0) {
|
||||||
|
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
||||||
|
<< endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
||||||
|
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
close(fd);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
if (fd < 0) {
|
|
||||||
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
|
||||||
<< endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
|
||||||
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
close(fd);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -24,11 +24,17 @@ using std::string;
|
|||||||
#endif /* NO_NAMESPACES */
|
#endif /* NO_NAMESPACES */
|
||||||
|
|
||||||
// A small stringified wrapper for unac.c
|
// A small stringified wrapper for unac.c
|
||||||
enum UnacOp {UNACOP_UNAC, UNACOP_UNACFOLD, UNACOP_FOLD};
|
enum UnacOp {UNACOP_UNAC = 1, UNACOP_FOLD = 2, UNACOP_UNACFOLD = 3};
|
||||||
extern bool unacmaybefold(const string& in, string& out,
|
extern bool unacmaybefold(const string& in, string& out,
|
||||||
const char *encoding, UnacOp what);
|
const char *encoding, UnacOp what);
|
||||||
|
|
||||||
// Utility function to determine if string begins with capital
|
// Utility function to determine if string begins with capital
|
||||||
extern bool unaciscapital(const string& in);
|
extern bool unaciscapital(const string& in);
|
||||||
|
// Utility function to determine if string has upper-case anywhere
|
||||||
|
extern bool unachasuppercase(const string& in);
|
||||||
|
// Utility function to determine if any character is accented. This
|
||||||
|
// appropriately ignores the characters from unac_except_chars which
|
||||||
|
// are really separate letters
|
||||||
|
extern bool unachasaccents(const string& in);
|
||||||
|
|
||||||
#endif /* _UNACPP_H_INCLUDED_ */
|
#endif /* _UNACPP_H_INCLUDED_ */
|
||||||
|
|||||||
@ -17,6 +17,7 @@
|
|||||||
|
|
||||||
#ifndef TEST_SUBTREELIST
|
#ifndef TEST_SUBTREELIST
|
||||||
|
|
||||||
|
#include "cstr.h"
|
||||||
#include "refcntr.h"
|
#include "refcntr.h"
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
#include "searchdata.h"
|
#include "searchdata.h"
|
||||||
@ -35,7 +36,7 @@ bool subtreelist(RclConfig *config, const string& top,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR);
|
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, cstr_null);
|
||||||
RefCntr<Rcl::SearchData> rq(sd);
|
RefCntr<Rcl::SearchData> rq(sd);
|
||||||
|
|
||||||
rq->addDirSpec(top);
|
rq->addDirSpec(top);
|
||||||
|
|||||||
@ -6,8 +6,8 @@ LIBS = librcl.a
|
|||||||
|
|
||||||
all: $(LIBS)
|
all: $(LIBS)
|
||||||
|
|
||||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o expansiondbs.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp expansiondbs.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||||
|
|
||||||
librcl.a : $(DEPS) $(OBJS)
|
librcl.a : $(DEPS) $(OBJS)
|
||||||
ar ru librcl.a $(OBJS)
|
ar ru librcl.a $(OBJS)
|
||||||
@ -87,6 +87,8 @@ wasastringtoquery.o : ../query/wasastringtoquery.cpp $(depth)/mk/localdefs
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
|
||||||
wasatorcl.o : ../query/wasatorcl.cpp $(depth)/mk/localdefs
|
wasatorcl.o : ../query/wasatorcl.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
|
||||||
|
expansiondbs.o : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/expansiondbs.cpp
|
||||||
rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
|
||||||
rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
|
rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
|
||||||
@ -278,6 +280,9 @@ wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp $(depth)/mk/localde
|
|||||||
wasatorcl.dep.stamp : ../query/wasatorcl.cpp $(depth)/mk/localdefs
|
wasatorcl.dep.stamp : ../query/wasatorcl.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
|
||||||
touch wasatorcl.dep.stamp
|
touch wasatorcl.dep.stamp
|
||||||
|
expansiondbs.dep.stamp : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/expansiondbs.cpp > expansiondbs.dep
|
||||||
|
touch expansiondbs.dep.stamp
|
||||||
rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep
|
||||||
touch rcldb.dep.stamp
|
touch rcldb.dep.stamp
|
||||||
@ -405,6 +410,7 @@ include reslistpager.dep
|
|||||||
include sortseq.dep
|
include sortseq.dep
|
||||||
include wasastringtoquery.dep
|
include wasastringtoquery.dep
|
||||||
include wasatorcl.dep
|
include wasatorcl.dep
|
||||||
|
include expansiondbs.dep
|
||||||
include rcldb.dep
|
include rcldb.dep
|
||||||
include rcldoc.dep
|
include rcldoc.dep
|
||||||
include rclquery.dep
|
include rclquery.dep
|
||||||
|
|||||||
@ -41,6 +41,7 @@ ${depth}/query/reslistpager.cpp \
|
|||||||
${depth}/query/sortseq.cpp \
|
${depth}/query/sortseq.cpp \
|
||||||
${depth}/query/wasastringtoquery.cpp \
|
${depth}/query/wasastringtoquery.cpp \
|
||||||
${depth}/query/wasatorcl.cpp \
|
${depth}/query/wasatorcl.cpp \
|
||||||
|
${depth}/rcldb/expansiondbs.cpp \
|
||||||
${depth}/rcldb/rcldb.cpp \
|
${depth}/rcldb/rcldb.cpp \
|
||||||
${depth}/rcldb/rcldoc.cpp \
|
${depth}/rcldb/rcldoc.cpp \
|
||||||
${depth}/rcldb/rclquery.cpp \
|
${depth}/rcldb/rclquery.cpp \
|
||||||
|
|||||||
@ -93,7 +93,7 @@ SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs)
|
|||||||
if (stp && strcasecmp(stp, "or")) {
|
if (stp && strcasecmp(stp, "or")) {
|
||||||
tp = Rcl::SCLT_OR;
|
tp = Rcl::SCLT_OR;
|
||||||
}
|
}
|
||||||
self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp));
|
self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp, "english"));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -715,18 +715,18 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
|
|||||||
PyErr_SetString(PyExc_AttributeError, "query");
|
PyErr_SetString(PyExc_AttributeError, "query");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SearchData defaults to stemming in english
|
||||||
|
// Use default for now but need to add way to specify language
|
||||||
string reason;
|
string reason;
|
||||||
Rcl::SearchData *sd = wasaStringToRcl(rclconfig, utf8, reason);
|
Rcl::SearchData *sd = wasaStringToRcl(rclconfig, dostem ? "english" : "",
|
||||||
|
utf8, reason);
|
||||||
|
|
||||||
if (!sd) {
|
if (!sd) {
|
||||||
PyErr_SetString(PyExc_ValueError, reason.c_str());
|
PyErr_SetString(PyExc_ValueError, reason.c_str());
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// SearchData defaults to stemming in english
|
|
||||||
// Use default for now but need to add way to specify language
|
|
||||||
if (!dostem)
|
|
||||||
sd->setStemlang("");
|
|
||||||
RefCntr<Rcl::SearchData> rq(sd);
|
RefCntr<Rcl::SearchData> rq(sd);
|
||||||
string sf = self->sortfield ? string(self->sortfield) : string("");
|
string sf = self->sortfield ? string(self->sortfield) : string("");
|
||||||
self->query->setSortBy(sf, self->ascending);
|
self->query->setSortBy(sf, self->ascending);
|
||||||
|
|||||||
@ -356,8 +356,9 @@ size_t AdvSearch::stringToSize(QString qsize)
|
|||||||
using namespace Rcl;
|
using namespace Rcl;
|
||||||
void AdvSearch::runSearch()
|
void AdvSearch::runSearch()
|
||||||
{
|
{
|
||||||
|
string stemLang = prefs.stemlang();
|
||||||
RefCntr<SearchData> sdata(new SearchData(conjunctCMB->currentIndex() == 0 ?
|
RefCntr<SearchData> sdata(new SearchData(conjunctCMB->currentIndex() == 0 ?
|
||||||
SCLT_AND : SCLT_OR));
|
SCLT_AND : SCLT_OR, stemLang));
|
||||||
bool hasclause = false;
|
bool hasclause = false;
|
||||||
|
|
||||||
for (list<SearchClauseW*>::iterator it = m_clauseWins.begin();
|
for (list<SearchClauseW*>::iterator it = m_clauseWins.begin();
|
||||||
|
|||||||
@ -372,6 +372,18 @@ void rwSettings(bool writing)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string PrefsPack::stemlang()
|
||||||
|
{
|
||||||
|
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
||||||
|
if (stemLang == "ALL") {
|
||||||
|
if (theconfig)
|
||||||
|
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
||||||
|
else
|
||||||
|
stemLang = "";
|
||||||
|
}
|
||||||
|
return stemLang;
|
||||||
|
}
|
||||||
|
|
||||||
QString myGetFileName(bool isdir, QString caption, bool filenosave)
|
QString myGetFileName(bool isdir, QString caption, bool filenosave)
|
||||||
{
|
{
|
||||||
LOGDEB1(("myFileDialog: isdir %d\n", isdir));
|
LOGDEB1(("myFileDialog: isdir %d\n", isdir));
|
||||||
|
|||||||
@ -120,6 +120,8 @@ class PrefsPack {
|
|||||||
// Default paragraph format for result list
|
// Default paragraph format for result list
|
||||||
static const char *dfltResListFormat;
|
static const char *dfltResListFormat;
|
||||||
|
|
||||||
|
std::string stemlang();
|
||||||
|
|
||||||
PrefsPack() :
|
PrefsPack() :
|
||||||
respagesize(8),
|
respagesize(8),
|
||||||
reslistfontsize(10),
|
reslistfontsize(10),
|
||||||
|
|||||||
@ -757,12 +757,6 @@ void RclMain::startSearch(RefCntr<Rcl::SearchData> sdata)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
|
||||||
if (stemLang == "ALL") {
|
|
||||||
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
|
||||||
}
|
|
||||||
sdata->setStemlang(stemLang);
|
|
||||||
|
|
||||||
Rcl::Query *query = new Rcl::Query(rcldb);
|
Rcl::Query *query = new Rcl::Query(rcldb);
|
||||||
query->setCollapseDuplicates(prefs.collapseDuplicates);
|
query->setCollapseDuplicates(prefs.collapseDuplicates);
|
||||||
|
|
||||||
@ -1078,9 +1072,7 @@ void RclMain::showActiveTypes()
|
|||||||
// Get list of all mime types in index. For this, we use a
|
// Get list of all mime types in index. For this, we use a
|
||||||
// wildcard field search on mtype
|
// wildcard field search on mtype
|
||||||
Rcl::TermMatchResult matches;
|
Rcl::TermMatchResult matches;
|
||||||
string prefix;
|
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype")) {
|
||||||
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype",
|
|
||||||
&prefix)) {
|
|
||||||
QMessageBox::warning(0, tr("Error"),
|
QMessageBox::warning(0, tr("Error"),
|
||||||
tr("Index query error"),
|
tr("Index query error"),
|
||||||
QMessageBox::Ok,
|
QMessageBox::Ok,
|
||||||
@ -1093,7 +1085,7 @@ void RclMain::showActiveTypes()
|
|||||||
for (vector<Rcl::TermMatchEntry>::const_iterator it =
|
for (vector<Rcl::TermMatchEntry>::const_iterator it =
|
||||||
matches.entries.begin();
|
matches.entries.begin();
|
||||||
it != matches.entries.end(); it++) {
|
it != matches.entries.end(); it++) {
|
||||||
mtypesfromdb.insert(it->term.substr(prefix.size()));
|
mtypesfromdb.insert(it->term.substr(matches.prefix.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// All types listed in mimeconf:
|
// All types listed in mimeconf:
|
||||||
@ -1779,7 +1771,7 @@ void RclMain::showDocHistory()
|
|||||||
}
|
}
|
||||||
// Construct a bogus SearchData structure
|
// Construct a bogus SearchData structure
|
||||||
RefCntr<Rcl::SearchData>searchdata =
|
RefCntr<Rcl::SearchData>searchdata =
|
||||||
RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
|
RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND, cstr_null));
|
||||||
searchdata->setDescription((const char *)tr("History data").toUtf8());
|
searchdata->setDescription((const char *)tr("History data").toUtf8());
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -198,10 +198,19 @@ void QtGuiResListPager::suggest(const vector<string>uterms,
|
|||||||
// If the term is in the index, we don't suggest alternatives.
|
// If the term is in the index, we don't suggest alternatives.
|
||||||
// Actually, we may want to check the frequencies and propose something
|
// Actually, we may want to check the frequencies and propose something
|
||||||
// anyway if a possible variation is much more common (as google does)
|
// anyway if a possible variation is much more common (as google does)
|
||||||
if (aspell->check(*rcldb, *uit, reason))
|
#warning need to take case and diacs sensibility into account somehow
|
||||||
continue;
|
// Maybe use the xapian index instead ? How to retrieve the
|
||||||
else if (!reason.empty())
|
// sensitivity flags ?
|
||||||
return;
|
|
||||||
|
// We used to call aspell->check() here and continue if it
|
||||||
|
// succeeded. but this does not work if we are in
|
||||||
|
// case-sensitive mode and the term was not found because of a
|
||||||
|
// case difference (our aspell is all lowercase).
|
||||||
|
// if (aspell->check(*uit, reason))
|
||||||
|
// continue;
|
||||||
|
// else if (!reason.empty())
|
||||||
|
// return;
|
||||||
|
|
||||||
if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) {
|
if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) {
|
||||||
LOGERR(("QtGuiResListPager::suggest: aspell failed: %s\n",
|
LOGERR(("QtGuiResListPager::suggest: aspell failed: %s\n",
|
||||||
reason.c_str()));
|
reason.c_str()));
|
||||||
@ -337,6 +346,7 @@ ResList::~ResList()
|
|||||||
QT_TR_NOOP("Open"),
|
QT_TR_NOOP("Open"),
|
||||||
QT_TR_NOOP("(show query)"),
|
QT_TR_NOOP("(show query)"),
|
||||||
QT_TR_NOOP("<p><i>Alternate spellings (accents suppressed): </i>"),
|
QT_TR_NOOP("<p><i>Alternate spellings (accents suppressed): </i>"),
|
||||||
|
QT_TR_NOOP("<p><i>Alternate spellings: </i>"),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -126,23 +126,25 @@ void SSearch::startSimpleSearch()
|
|||||||
if (u8.length() == 0)
|
if (u8.length() == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
string stemlang = prefs.stemlang();
|
||||||
|
|
||||||
SSearchType tp = (SSearchType)searchTypCMB->currentIndex();
|
SSearchType tp = (SSearchType)searchTypCMB->currentIndex();
|
||||||
Rcl::SearchData *sdata = 0;
|
Rcl::SearchData *sdata = 0;
|
||||||
|
|
||||||
if (tp == SST_LANG) {
|
if (tp == SST_LANG) {
|
||||||
string reason;
|
string reason;
|
||||||
if (prefs.autoSuffsEnable)
|
if (prefs.autoSuffsEnable)
|
||||||
sdata = wasaStringToRcl(theconfig, u8, reason,
|
sdata = wasaStringToRcl(theconfig, stemlang, u8, reason,
|
||||||
(const char *)prefs.autoSuffs.toUtf8());
|
(const char *)prefs.autoSuffs.toUtf8());
|
||||||
else
|
else
|
||||||
sdata = wasaStringToRcl(theconfig, u8, reason);
|
sdata = wasaStringToRcl(theconfig, stemlang, u8, reason);
|
||||||
if (sdata == 0) {
|
if (sdata == 0) {
|
||||||
QMessageBox::warning(0, "Recoll", tr("Bad query string") + ": " +
|
QMessageBox::warning(0, "Recoll", tr("Bad query string") + ": " +
|
||||||
QString::fromAscii(reason.c_str()));
|
QString::fromAscii(reason.c_str()));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
sdata = new Rcl::SearchData(Rcl::SCLT_OR);
|
sdata = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
|
||||||
if (sdata == 0) {
|
if (sdata == 0) {
|
||||||
QMessageBox::warning(0, "Recoll", tr("Out of memory"));
|
QMessageBox::warning(0, "Recoll", tr("Out of memory"));
|
||||||
return;
|
return;
|
||||||
@ -166,11 +168,6 @@ void SSearch::startSimpleSearch()
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (prefs.ssearchAutoPhrase && rcldb) {
|
if (prefs.ssearchAutoPhrase && rcldb) {
|
||||||
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
|
||||||
if (stemLang == "ALL") {
|
|
||||||
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
|
||||||
}
|
|
||||||
sdata->setStemlang(stemLang);
|
|
||||||
sdata->maybeAddAutoPhrase(*rcldb,
|
sdata->maybeAddAutoPhrase(*rcldb,
|
||||||
prefs.ssearchAutoPhraseThreshPC / 100.0);
|
prefs.ssearchAutoPhraseThreshPC / 100.0);
|
||||||
}
|
}
|
||||||
@ -277,10 +274,9 @@ void SSearch::completion()
|
|||||||
// Query database
|
// Query database
|
||||||
const int max = 100;
|
const int max = 100;
|
||||||
Rcl::TermMatchResult tmres;
|
Rcl::TermMatchResult tmres;
|
||||||
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
|
||||||
if (stemLang == "ALL") {
|
string stemLang = prefs.stemlang();
|
||||||
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
|
||||||
}
|
|
||||||
if (!rcldb->termMatch(Rcl::Db::ET_WILD, stemLang, s, tmres, max) ||
|
if (!rcldb->termMatch(Rcl::Db::ET_WILD, stemLang, s, tmres, max) ||
|
||||||
tmres.entries.size() == 0) {
|
tmres.entries.size() == 0) {
|
||||||
QApplication::beep();
|
QApplication::beep();
|
||||||
|
|||||||
@ -146,7 +146,8 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
|
|||||||
LOGDEB(("DocSequenceDb::setFiltSpec\n"));
|
LOGDEB(("DocSequenceDb::setFiltSpec\n"));
|
||||||
if (fs.isNotNull()) {
|
if (fs.isNotNull()) {
|
||||||
// We build a search spec by adding a filtering layer to the base one.
|
// We build a search spec by adding a filtering layer to the base one.
|
||||||
m_fsdata = RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
|
m_fsdata = RefCntr<Rcl::SearchData>(
|
||||||
|
new Rcl::SearchData(Rcl::SCLT_AND, m_sdata->getStemLang()));
|
||||||
Rcl::SearchDataClauseSub *cl =
|
Rcl::SearchDataClauseSub *cl =
|
||||||
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata);
|
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata);
|
||||||
m_fsdata->addClause(cl);
|
m_fsdata->addClause(cl);
|
||||||
@ -164,6 +165,7 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
|
|||||||
string reason;
|
string reason;
|
||||||
Rcl::SearchData *sd =
|
Rcl::SearchData *sd =
|
||||||
wasaStringToRcl(m_q->whatDb()->getConf(),
|
wasaStringToRcl(m_q->whatDb()->getConf(),
|
||||||
|
m_sdata->getStemLang(),
|
||||||
fs.values[i], reason);
|
fs.values[i], reason);
|
||||||
if (sd) {
|
if (sd) {
|
||||||
Rcl::SearchDataClauseSub *cl1 =
|
Rcl::SearchDataClauseSub *cl1 =
|
||||||
|
|||||||
@ -50,7 +50,10 @@ static string vecStringToString(const vector<string>& t)
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct MatchEntry {
|
struct MatchEntry {
|
||||||
|
// Start/End byte offsets in the document text
|
||||||
pair<int, int> offs;
|
pair<int, int> offs;
|
||||||
|
// Index of the search group this comes from: this is to relate a
|
||||||
|
// match to the original user input.
|
||||||
unsigned int grpidx;
|
unsigned int grpidx;
|
||||||
MatchEntry(int sta, int sto, unsigned int idx)
|
MatchEntry(int sta, int sto, unsigned int idx)
|
||||||
: offs(sta, sto), grpidx(idx)
|
: offs(sta, sto), grpidx(idx)
|
||||||
@ -76,11 +79,31 @@ class TextSplitPTR : public TextSplit {
|
|||||||
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
||||||
vit != hdata.groups.end(); vit++) {
|
vit != hdata.groups.end(); vit++) {
|
||||||
if (vit->size() == 1) {
|
if (vit->size() == 1) {
|
||||||
m_terms[vit->front()] = vit - hdata.groups.begin();
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
#endif
|
||||||
|
m_terms[vit->front()] = vit - hdata.groups.begin();
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
string dumb = vit->front();
|
||||||
|
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||||
|
m_terms[dumb] = vit - hdata.groups.begin();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
} else if (vit->size() > 1) {
|
} else if (vit->size() > 1) {
|
||||||
for (vector<string>::const_iterator it = vit->begin();
|
for (vector<string>::const_iterator it = vit->begin();
|
||||||
it != vit->end(); it++) {
|
it != vit->end(); it++) {
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
#endif
|
||||||
m_gterms.insert(*it);
|
m_gterms.insert(*it);
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
string dumb = *it;
|
||||||
|
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||||
|
m_gterms.insert(dumb);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -286,7 +286,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
|
|||||||
Rcl::SearchData *sd = 0;
|
Rcl::SearchData *sd = 0;
|
||||||
|
|
||||||
if (op_flags & (OPT_a|OPT_o|OPT_f)) {
|
if (op_flags & (OPT_a|OPT_o|OPT_f)) {
|
||||||
sd = new Rcl::SearchData(Rcl::SCLT_OR);
|
sd = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
|
||||||
Rcl::SearchDataClause *clp = 0;
|
Rcl::SearchDataClause *clp = 0;
|
||||||
if (op_flags & OPT_f) {
|
if (op_flags & OPT_f) {
|
||||||
clp = new Rcl::SearchDataClauseFilename(qs);
|
clp = new Rcl::SearchDataClauseFilename(qs);
|
||||||
@ -305,14 +305,13 @@ int recollq(RclConfig **cfp, int argc, char **argv)
|
|||||||
if (sd)
|
if (sd)
|
||||||
sd->addClause(clp);
|
sd->addClause(clp);
|
||||||
} else {
|
} else {
|
||||||
sd = wasaStringToRcl(rclconfig, qs, reason);
|
sd = wasaStringToRcl(rclconfig, stemlang, qs, reason);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!sd) {
|
if (!sd) {
|
||||||
cerr << "Query string interpretation failed: " << reason << endl;
|
cerr << "Query string interpretation failed: " << reason << endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
sd->setStemlang(stemlang);
|
|
||||||
|
|
||||||
RefCntr<Rcl::SearchData> rq(sd);
|
RefCntr<Rcl::SearchData> rq(sd);
|
||||||
Rcl::Query query(&rcldb);
|
Rcl::Query query(&rcldb);
|
||||||
|
|||||||
@ -320,9 +320,16 @@ void ResListPager::displayPage(RclConfig *config)
|
|||||||
map<string, vector<string> > spellings;
|
map<string, vector<string> > spellings;
|
||||||
suggest(uterms, spellings);
|
suggest(uterms, spellings);
|
||||||
if (!spellings.empty()) {
|
if (!spellings.empty()) {
|
||||||
chunk <<
|
if (o_index_stripchars) {
|
||||||
trans("<p><i>Alternate spellings (accents suppressed): </i>")
|
chunk <<
|
||||||
<< "<br /><blockquote>";
|
trans("<p><i>Alternate spellings (accents suppressed): </i>")
|
||||||
|
<< "<br /><blockquote>";
|
||||||
|
} else {
|
||||||
|
chunk <<
|
||||||
|
trans("<p><i>Alternate spellings: </i>")
|
||||||
|
<< "<br /><blockquote>";
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
for (map<string, vector<string> >::const_iterator it0 =
|
for (map<string, vector<string> >::const_iterator it0 =
|
||||||
spellings.begin(); it0 != spellings.end(); it0++) {
|
spellings.begin(); it0 != spellings.end(); it0++) {
|
||||||
|
|||||||
@ -32,7 +32,9 @@ using std::list;
|
|||||||
#include "refcntr.h"
|
#include "refcntr.h"
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
|
|
||||||
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config,
|
||||||
|
const string& stemlang,
|
||||||
|
WasaQuery *wasa,
|
||||||
const string& autosuffs, string& reason)
|
const string& autosuffs, string& reason)
|
||||||
{
|
{
|
||||||
if (wasa == 0) {
|
if (wasa == 0) {
|
||||||
@ -47,7 +49,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||||||
|
|
||||||
Rcl::SearchData *sdata = new
|
Rcl::SearchData *sdata = new
|
||||||
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
|
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
|
||||||
Rcl::SCLT_OR);
|
Rcl::SCLT_OR, stemlang);
|
||||||
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
|
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
|
||||||
"AND" : "OR"));
|
"AND" : "OR"));
|
||||||
|
|
||||||
@ -167,6 +169,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||||||
|
|
||||||
// "Regular" processing follows:
|
// "Regular" processing follows:
|
||||||
unsigned int mods = (unsigned int)(*it)->m_modifiers;
|
unsigned int mods = (unsigned int)(*it)->m_modifiers;
|
||||||
|
LOGDEB0(("wasaQueryToRcl: clause modifiers 0x%x\n", mods));
|
||||||
nclause = 0;
|
nclause = 0;
|
||||||
|
|
||||||
switch ((*it)->m_op) {
|
switch ((*it)->m_op) {
|
||||||
@ -178,7 +181,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
case WasaQuery::OP_LEAF: {
|
case WasaQuery::OP_LEAF: {
|
||||||
LOGDEB(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n",
|
LOGDEB0(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n",
|
||||||
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
|
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
|
||||||
(*it)->m_slack));
|
(*it)->m_slack));
|
||||||
|
|
||||||
@ -250,7 +253,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||||||
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
|
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
|
||||||
// Create a subquery.
|
// Create a subquery.
|
||||||
Rcl::SearchData *sub =
|
Rcl::SearchData *sub =
|
||||||
wasaQueryToRcl(config, *it, autosuffs, reason);
|
wasaQueryToRcl(config, stemlang, *it, autosuffs, reason);
|
||||||
if (sub == 0) {
|
if (sub == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -278,7 +281,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||||||
return sdata;
|
return sdata;
|
||||||
}
|
}
|
||||||
|
|
||||||
Rcl::SearchData *wasaStringToRcl(RclConfig *config,
|
Rcl::SearchData *wasaStringToRcl(RclConfig *config, const string& stemlang,
|
||||||
const string &qs, string &reason,
|
const string &qs, string &reason,
|
||||||
const string& autosuffs)
|
const string& autosuffs)
|
||||||
{
|
{
|
||||||
@ -286,5 +289,5 @@ Rcl::SearchData *wasaStringToRcl(RclConfig *config,
|
|||||||
WasaQuery *wq = parser.stringToQuery(qs, reason);
|
WasaQuery *wq = parser.stringToQuery(qs, reason);
|
||||||
if (wq == 0)
|
if (wq == 0)
|
||||||
return 0;
|
return 0;
|
||||||
return wasaQueryToRcl(config, wq, autosuffs, reason);
|
return wasaQueryToRcl(config, stemlang, wq, autosuffs, reason);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -25,7 +25,7 @@ using std::string;
|
|||||||
|
|
||||||
class RclConfig;
|
class RclConfig;
|
||||||
|
|
||||||
extern Rcl::SearchData *wasaStringToRcl(RclConfig *,
|
extern Rcl::SearchData *wasaStringToRcl(RclConfig *, const string& stemlang,
|
||||||
const string& query, string &reason,
|
const string& query, string &reason,
|
||||||
const string& autosuffs = string());
|
const string& autosuffs = string());
|
||||||
#endif /* _WASATORCL_H_INCLUDED_ */
|
#endif /* _WASATORCL_H_INCLUDED_ */
|
||||||
|
|||||||
@ -14,6 +14,9 @@
|
|||||||
* Free Software Foundation, Inc.,
|
* Free Software Foundation, Inc.,
|
||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
@ -36,7 +39,8 @@ using namespace std;
|
|||||||
static string thisprog;
|
static string thisprog;
|
||||||
|
|
||||||
static string usage =
|
static string usage =
|
||||||
" -d <dbdir> -e <output encoding>\n"
|
" -d <dbdir> \n"
|
||||||
|
"-e <output encoding>\n"
|
||||||
" -i docid -D : get document data for docid\n"
|
" -i docid -D : get document data for docid\n"
|
||||||
" -i docid -X : delete document docid\n"
|
" -i docid -X : delete document docid\n"
|
||||||
" -i docid -b : 'rebuild' document from term positions\n"
|
" -i docid -b : 'rebuild' document from term positions\n"
|
||||||
@ -112,6 +116,23 @@ static void sigcleanup(int sig)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
bool o_index_stripchars;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
inline bool has_prefix(const string& trm)
|
||||||
|
{
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
#endif
|
||||||
|
return trm.size() && 'A' <= trm[0] && trm[0] <= 'Z';
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
return trm.size() > 0 && trm[0] == ':';
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
string dbdir = path_cat(path_home(), ".recoll/xapiandb");
|
string dbdir = path_cat(path_home(), ".recoll/xapiandb");
|
||||||
@ -188,10 +209,22 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
db = new Xapian::Database(dbdir);
|
db = new Xapian::Database(dbdir);
|
||||||
|
|
||||||
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
|
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
|
||||||
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
|
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
// If we have terms with a leading ':' it's a new style,
|
||||||
|
// unstripped index
|
||||||
|
{
|
||||||
|
Xapian::TermIterator term = db->allterms_begin(":");
|
||||||
|
if (term == db->allterms_end())
|
||||||
|
o_index_stripchars = true;
|
||||||
|
else
|
||||||
|
o_index_stripchars = false;
|
||||||
|
cout<<"DB: terms are "<<(o_index_stripchars?"stripped":"raw")<<endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (op_flags & OPT_T) {
|
if (op_flags & OPT_T) {
|
||||||
Xapian::TermIterator term;
|
Xapian::TermIterator term;
|
||||||
string printable;
|
string printable;
|
||||||
@ -201,8 +234,7 @@ int main(int argc, char **argv)
|
|||||||
for (term = db->termlist_begin(docid);
|
for (term = db->termlist_begin(docid);
|
||||||
term != db->termlist_end(docid);term++) {
|
term != db->termlist_end(docid);term++) {
|
||||||
const string& s = *term;
|
const string& s = *term;
|
||||||
if ((op_flags&OPT_l) &&
|
if ((op_flags&OPT_l) && has_prefix(s))
|
||||||
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
|
|
||||||
continue;
|
continue;
|
||||||
cout << op << detailstring(s) << cl << endl;
|
cout << op << detailstring(s) << cl << endl;
|
||||||
}
|
}
|
||||||
@ -210,8 +242,7 @@ int main(int argc, char **argv)
|
|||||||
for (term = db->allterms_begin();
|
for (term = db->allterms_begin();
|
||||||
term != db->allterms_end();term++) {
|
term != db->allterms_end();term++) {
|
||||||
const string& s = *term;
|
const string& s = *term;
|
||||||
if ((op_flags&OPT_l) &&
|
if ((op_flags&OPT_l) && has_prefix(s))
|
||||||
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
|
|
||||||
continue;
|
continue;
|
||||||
if (op_flags & OPT_f)
|
if (op_flags & OPT_f)
|
||||||
cout << db->get_collection_freq(*term) << " "
|
cout << db->get_collection_freq(*term) << " "
|
||||||
|
|||||||
@ -63,17 +63,19 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||||||
// Unaccented stem dbs
|
// Unaccented stem dbs
|
||||||
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
||||||
// We can reuse the same stemmer pointers, the objects are stateless.
|
// We can reuse the same stemmer pointers, the objects are stateless.
|
||||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
if (!o_index_stripchars) {
|
||||||
unacstemdbs.push_back(
|
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||||
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
|
unacstemdbs.push_back(
|
||||||
stemmers.back().getptr()));
|
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
|
||||||
unacstemdbs.back().recreate();
|
stemmers.back().getptr()));
|
||||||
|
unacstemdbs.back().recreate();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
||||||
XapWritableComputableSynFamMember
|
XapWritableComputableSynFamMember
|
||||||
diacasedb(wdb, synFamDiac, "all", &transunac);
|
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
||||||
diacasedb.recreate();
|
if (!o_index_stripchars)
|
||||||
|
diacasedb.recreate();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Walk the list of all terms, and stem/unac each.
|
// Walk the list of all terms, and stem/unac each.
|
||||||
@ -109,8 +111,10 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||||||
// is the input to the stem db, and add a synonym from the
|
// is the input to the stem db, and add a synonym from the
|
||||||
// stripped term to the cased and accented one, for accent
|
// stripped term to the cased and accented one, for accent
|
||||||
// and case expansion at query time
|
// and case expansion at query time
|
||||||
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
if (!o_index_stripchars) {
|
||||||
diacasedb.addSynonym(*it);
|
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
||||||
|
diacasedb.addSynonym(*it);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Create stemming synonym for every language. The input is the
|
// Create stemming synonym for every language. The input is the
|
||||||
@ -124,12 +128,15 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||||||
// the unaccented term. While this may be incorrect, it is
|
// the unaccented term. While this may be incorrect, it is
|
||||||
// also necessary for searching in a diacritic-unsensitive
|
// also necessary for searching in a diacritic-unsensitive
|
||||||
// way on a raw index
|
// way on a raw index
|
||||||
string unac;
|
if (!o_index_stripchars) {
|
||||||
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
|
string unac;
|
||||||
if (unac != lower)
|
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
|
||||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
if (unac != lower) {
|
||||||
unacstemdbs[i].addSynonym(unac);
|
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||||
|
unacstemdbs[i].addSynonym(unac);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
|
|||||||
@ -24,10 +24,13 @@
|
|||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
/* A Capitals/Diacritics removal functor for using with
|
/** A Capitals/Diacritics removal functor for using with
|
||||||
XapComputableSynFamMember */
|
* XapComputableSynFamMember */
|
||||||
class SynTermTransUnac : public SynTermTrans {
|
class SynTermTransUnac : public SynTermTrans {
|
||||||
public:
|
public:
|
||||||
|
/** Constructor
|
||||||
|
* @param op defines if we remove diacritics, case or both
|
||||||
|
*/
|
||||||
SynTermTransUnac(UnacOp op)
|
SynTermTransUnac(UnacOp op)
|
||||||
: m_op(op)
|
: m_op(op)
|
||||||
{
|
{
|
||||||
@ -43,7 +46,9 @@ public:
|
|||||||
UnacOp m_op;
|
UnacOp m_op;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Walk the Xapian term list and create all the expansion dbs in one go */
|
/** Walk the Xapian term list and create all the expansion dbs in one go.
|
||||||
|
*
|
||||||
|
*/
|
||||||
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
const std::vector<std::string>& langs);
|
const std::vector<std::string>& langs);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,6 +14,8 @@
|
|||||||
* Free Software Foundation, Inc.,
|
* Free Software Foundation, Inc.,
|
||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
*/
|
*/
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
@ -53,6 +55,7 @@ using namespace std;
|
|||||||
#include "cancelcheck.h"
|
#include "cancelcheck.h"
|
||||||
#include "ptmutex.h"
|
#include "ptmutex.h"
|
||||||
#include "termproc.h"
|
#include "termproc.h"
|
||||||
|
#include "expansiondbs.h"
|
||||||
|
|
||||||
#ifndef MAX
|
#ifndef MAX
|
||||||
#define MAX(A,B) (A>B?A:B)
|
#define MAX(A,B) (A>B?A:B)
|
||||||
@ -84,9 +87,16 @@ static const string xapday_prefix = "D";
|
|||||||
static const string xapmonth_prefix = "M";
|
static const string xapmonth_prefix = "M";
|
||||||
static const string xapyear_prefix = "Y";
|
static const string xapyear_prefix = "Y";
|
||||||
const string pathelt_prefix = "XP";
|
const string pathelt_prefix = "XP";
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
const string start_of_field_term = "XXST";
|
const string start_of_field_term = "XXST";
|
||||||
const string end_of_field_term = "XXND";
|
const string end_of_field_term = "XXND";
|
||||||
static const string page_break_term = "XXPG";
|
static const string page_break_term = "XXPG";
|
||||||
|
#else
|
||||||
|
string start_of_field_term;
|
||||||
|
string end_of_field_term;
|
||||||
|
const string page_break_term = "XXPG/";
|
||||||
|
#endif
|
||||||
|
|
||||||
// Field name for the unsplit file name. Has to exist in the field file
|
// Field name for the unsplit file name. Has to exist in the field file
|
||||||
// because of usage in termmatch()
|
// because of usage in termmatch()
|
||||||
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||||
@ -197,7 +207,7 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
|
|||||||
{
|
{
|
||||||
for (vector<string>::const_iterator qit = in.begin();
|
for (vector<string>::const_iterator qit = in.begin();
|
||||||
qit != in.end(); qit++) {
|
qit != in.end(); qit++) {
|
||||||
if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
|
if (!has_prefix(*qit))
|
||||||
out.push_back(*qit);
|
out.push_back(*qit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -591,7 +601,7 @@ abstract_result Db::Native::makeAbstract(Xapian::docid docid, Query *query,
|
|||||||
for (term = xrdb.termlist_begin(docid);
|
for (term = xrdb.termlist_begin(docid);
|
||||||
term != xrdb.termlist_end(docid); term++) {
|
term != xrdb.termlist_end(docid); term++) {
|
||||||
// Ignore prefixed terms
|
// Ignore prefixed terms
|
||||||
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
|
if (has_prefix(*term))
|
||||||
continue;
|
continue;
|
||||||
if (cutoff-- < 0) {
|
if (cutoff-- < 0) {
|
||||||
ret = ABSRES_TRUNC;
|
ret = ABSRES_TRUNC;
|
||||||
@ -672,7 +682,9 @@ abstract_result Db::Native::makeAbstract(Xapian::docid docid, Query *query,
|
|||||||
vabs.push_back(pair<int,string>(page, chunk));
|
vabs.push_back(pair<int,string>(page, chunk));
|
||||||
chunk.clear();
|
chunk.clear();
|
||||||
} else {
|
} else {
|
||||||
chunk += it->second;
|
if (it->second.compare(end_of_field_term) &&
|
||||||
|
it->second.compare(start_of_field_term))
|
||||||
|
chunk += it->second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!chunk.empty())
|
if (!chunk.empty())
|
||||||
@ -692,6 +704,18 @@ Db::Db(RclConfig *cfp)
|
|||||||
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
||||||
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
||||||
{
|
{
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (start_of_field_term.empty()) {
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
start_of_field_term = "XXST";
|
||||||
|
end_of_field_term = "XXND";
|
||||||
|
} else {
|
||||||
|
start_of_field_term = "XXST/";
|
||||||
|
end_of_field_term = "XXND/";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
m_ndb = new Native(this);
|
m_ndb = new Native(this);
|
||||||
if (m_config) {
|
if (m_config) {
|
||||||
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
||||||
@ -894,11 +918,14 @@ int Db::termDocCnt(const string& _term)
|
|||||||
if (!m_ndb || !m_ndb->m_isopen)
|
if (!m_ndb || !m_ndb->m_isopen)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
string term;
|
string term = _term;
|
||||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
if (o_index_stripchars)
|
||||||
return 0;
|
#endif
|
||||||
}
|
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
|
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (m_stops.isStop(term)) {
|
if (m_stops.isStop(term)) {
|
||||||
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
||||||
@ -1014,8 +1041,19 @@ class TextSplitDb : public TextSplitP {
|
|||||||
{}
|
{}
|
||||||
// Reimplement text_to_words to add start and end special terms
|
// Reimplement text_to_words to add start and end special terms
|
||||||
virtual bool text_to_words(const string &in);
|
virtual bool text_to_words(const string &in);
|
||||||
void setprefix(const string& pref) {prefix = pref;}
|
|
||||||
void setwdfinc(int i) {wdfinc = i;}
|
void setprefix(const string& pref)
|
||||||
|
{
|
||||||
|
if (pref.empty())
|
||||||
|
prefix.clear();
|
||||||
|
else
|
||||||
|
prefix = wrap_prefix(pref);
|
||||||
|
}
|
||||||
|
|
||||||
|
void setwdfinc(int i)
|
||||||
|
{
|
||||||
|
wdfinc = i;
|
||||||
|
}
|
||||||
|
|
||||||
friend class TermProcIdx;
|
friend class TermProcIdx;
|
||||||
|
|
||||||
@ -1147,11 +1185,17 @@ string Db::getSpellingSuggestion(const string& word)
|
|||||||
{
|
{
|
||||||
if (m_ndb == 0)
|
if (m_ndb == 0)
|
||||||
return string();
|
return string();
|
||||||
string term;
|
|
||||||
|
string term = word;
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars)
|
||||||
|
#endif
|
||||||
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
||||||
return string();
|
return string();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isSpellingCandidate(term))
|
if (!isSpellingCandidate(term))
|
||||||
return string();
|
return string();
|
||||||
return m_ndb->xrdb.get_spelling_suggestion(term);
|
return m_ndb->xrdb.get_spelling_suggestion(term);
|
||||||
@ -1259,8 +1303,13 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
TermProcIdx tpidx;
|
TermProcIdx tpidx;
|
||||||
TermProc *nxt = &tpidx;
|
TermProc *nxt = &tpidx;
|
||||||
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
||||||
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
|
||||||
|
TermProcPrep tpprep(nxt);
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars)
|
||||||
|
#endif
|
||||||
|
nxt = &tpprep;
|
||||||
|
|
||||||
TextSplitDb splitter(newdocument, nxt);
|
TextSplitDb splitter(newdocument, nxt);
|
||||||
tpidx.setTSD(&splitter);
|
tpidx.setTSD(&splitter);
|
||||||
@ -1286,7 +1335,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
vector<string> vpath;
|
vector<string> vpath;
|
||||||
stringToTokens(path, vpath, "/");
|
stringToTokens(path, vpath, "/");
|
||||||
splitter.curpos = 0;
|
splitter.curpos = 0;
|
||||||
newdocument.add_posting(pathelt_prefix,
|
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
||||||
splitter.basepos + splitter.curpos++);
|
splitter.basepos + splitter.curpos++);
|
||||||
for (vector<string>::iterator it = vpath.begin();
|
for (vector<string>::iterator it = vpath.begin();
|
||||||
it != vpath.end(); it++){
|
it != vpath.end(); it++){
|
||||||
@ -1294,7 +1343,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
// Just truncate it. May still be useful because of wildcards
|
// Just truncate it. May still be useful because of wildcards
|
||||||
*it = it->substr(0, 230);
|
*it = it->substr(0, 230);
|
||||||
}
|
}
|
||||||
newdocument.add_posting(pathelt_prefix + *it,
|
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
||||||
splitter.basepos + splitter.curpos++);
|
splitter.basepos + splitter.curpos++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1339,7 +1388,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
|
|
||||||
////// Special terms for other metadata. No positions for these.
|
////// Special terms for other metadata. No positions for these.
|
||||||
// Mime type
|
// Mime type
|
||||||
newdocument.add_term(mimetype_prefix + doc.mimetype);
|
newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
||||||
|
|
||||||
// Simple file name indexed unsplit for specific "file name"
|
// Simple file name indexed unsplit for specific "file name"
|
||||||
// searches. This is not the same as a filename: clause inside the
|
// searches. This is not the same as a filename: clause inside the
|
||||||
@ -1355,9 +1404,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
utf8truncate(fn, 230);
|
utf8truncate(fn, 230);
|
||||||
string::size_type pos = fn.rfind('.');
|
string::size_type pos = fn.rfind('.');
|
||||||
if (pos != string::npos && pos != fn.length() - 1) {
|
if (pos != string::npos && pos != fn.length() - 1) {
|
||||||
newdocument.add_term(fileext_prefix + fn.substr(pos + 1));
|
newdocument.add_term(wrap_prefix(fileext_prefix) +
|
||||||
|
fn.substr(pos + 1));
|
||||||
}
|
}
|
||||||
newdocument.add_term(unsplitfilename_prefix + fn);
|
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1376,12 +1426,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
struct tm *tm = localtime(&mtime);
|
struct tm *tm = localtime(&mtime);
|
||||||
char buf[9];
|
char buf[9];
|
||||||
snprintf(buf, 9, "%04d%02d%02d",
|
snprintf(buf, 9, "%04d%02d%02d",
|
||||||
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
||||||
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
|
// Date (YYYYMMDD)
|
||||||
|
newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf));
|
||||||
|
// Month (YYYYMM)
|
||||||
buf[6] = '\0';
|
buf[6] = '\0';
|
||||||
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
|
newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
|
||||||
|
// Year (YYYY)
|
||||||
buf[4] = '\0';
|
buf[4] = '\0';
|
||||||
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
|
newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf));
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
@ -1856,7 +1909,7 @@ bool Db::maxYearSpan(int *minyear, int *maxyear)
|
|||||||
*minyear = 1000000;
|
*minyear = 1000000;
|
||||||
*maxyear = -1000000;
|
*maxyear = -1000000;
|
||||||
TermMatchResult result;
|
TermMatchResult result;
|
||||||
if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear"))
|
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
|
||||||
return false;
|
return false;
|
||||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||||
it != result.entries.end(); it++) {
|
it != result.entries.end(); it++) {
|
||||||
@ -1921,30 +1974,35 @@ const string cstr_wildSpecChars = "*?[";
|
|||||||
const string cstr_regSpecChars = "(.[{";
|
const string cstr_regSpecChars = "(.[{";
|
||||||
|
|
||||||
// Find all index terms that match a wildcard or regular expression
|
// Find all index terms that match a wildcard or regular expression
|
||||||
|
// If field is set, we return a list of appropriately prefixed terms (which
|
||||||
|
// are going to be used to build a Xapian query).
|
||||||
bool Db::termMatch(MatchType typ, const string &lang,
|
bool Db::termMatch(MatchType typ, const string &lang,
|
||||||
const string &root,
|
const string &root,
|
||||||
TermMatchResult& res,
|
TermMatchResult& res,
|
||||||
int max,
|
int max,
|
||||||
const string& field,
|
const string& field)
|
||||||
string *prefixp
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
if (!m_ndb || !m_ndb->m_isopen)
|
if (!m_ndb || !m_ndb->m_isopen)
|
||||||
return false;
|
return false;
|
||||||
Xapian::Database xdb = m_ndb->xdb();
|
Xapian::Database xdb = m_ndb->xdb();
|
||||||
|
|
||||||
res.clear();
|
|
||||||
XAPTRY(res.dbdoccount = xdb.get_doccount();
|
XAPTRY(res.dbdoccount = xdb.get_doccount();
|
||||||
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
|
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
|
||||||
if (!m_reason.empty())
|
if (!m_reason.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Get rid of capitals and accents
|
// Get rid of capitals and accents
|
||||||
string droot;
|
|
||||||
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
string droot = root;
|
||||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
|
||||||
return false;
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
}
|
if (o_index_stripchars)
|
||||||
|
#endif
|
||||||
|
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
|
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
||||||
|
|
||||||
string prefix;
|
string prefix;
|
||||||
@ -1954,17 +2012,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||||||
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
||||||
field.c_str()));
|
field.c_str()));
|
||||||
} else {
|
} else {
|
||||||
prefix = ftp->pfx;
|
prefix = wrap_prefix(ftp->pfx);
|
||||||
}
|
}
|
||||||
if (prefixp)
|
|
||||||
*prefixp = prefix;
|
|
||||||
}
|
}
|
||||||
|
res.prefix = prefix;
|
||||||
|
|
||||||
if (typ == ET_STEM) {
|
if (typ == ET_STEM) {
|
||||||
if (!stemExpand(lang, root, res, max))
|
if (!stemExpand(lang, root, res, max))
|
||||||
return false;
|
return false;
|
||||||
sort(res.entries.begin(), res.entries.end());
|
|
||||||
unique(res.entries.begin(), res.entries.end());
|
|
||||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||||
it != res.entries.end(); it++) {
|
it != res.entries.end(); it++) {
|
||||||
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
||||||
@ -2054,7 +2109,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||||||
TermMatchCmpByTerm tcmp;
|
TermMatchCmpByTerm tcmp;
|
||||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||||
TermMatchTermEqual teq;
|
TermMatchTermEqual teq;
|
||||||
unique(res.entries.begin(), res.entries.end(), teq);
|
vector<TermMatchEntry>::iterator uit =
|
||||||
|
unique(res.entries.begin(), res.entries.end(), teq);
|
||||||
|
res.entries.resize(uit - res.entries.begin());
|
||||||
TermMatchCmpByWcf wcmp;
|
TermMatchCmpByWcf wcmp;
|
||||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||||
if (max > 0) {
|
if (max > 0) {
|
||||||
|
|||||||
@ -17,6 +17,8 @@
|
|||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
|
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -78,21 +80,50 @@ class Query;
|
|||||||
/** Used for returning result lists for index terms matching some criteria */
|
/** Used for returning result lists for index terms matching some criteria */
|
||||||
class TermMatchEntry {
|
class TermMatchEntry {
|
||||||
public:
|
public:
|
||||||
TermMatchEntry() : wcf(0) {}
|
TermMatchEntry()
|
||||||
TermMatchEntry(const string&t, int f, int d) : term(t), wcf(f), docs(d) {}
|
: wcf(0)
|
||||||
TermMatchEntry(const string&t) : term(t), wcf(0) {}
|
{
|
||||||
bool operator==(const TermMatchEntry &o) const { return term == o.term;}
|
}
|
||||||
bool operator<(const TermMatchEntry &o) const { return term < o.term;}
|
TermMatchEntry(const string& t, int f, int d)
|
||||||
|
: term(t), wcf(f), docs(d)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
TermMatchEntry(const string& t)
|
||||||
|
: term(t), wcf(0)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
bool operator==(const TermMatchEntry &o) const
|
||||||
|
{
|
||||||
|
return term == o.term;
|
||||||
|
}
|
||||||
|
bool operator<(const TermMatchEntry &o) const
|
||||||
|
{
|
||||||
|
return term < o.term;
|
||||||
|
}
|
||||||
|
|
||||||
string term;
|
string term;
|
||||||
int wcf; // Total count of occurrences within collection.
|
int wcf; // Total count of occurrences within collection.
|
||||||
int docs; // Number of documents countaining term.
|
int docs; // Number of documents countaining term.
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Term match result list header: statistics and global info */
|
||||||
class TermMatchResult {
|
class TermMatchResult {
|
||||||
public:
|
public:
|
||||||
TermMatchResult() {clear();}
|
TermMatchResult()
|
||||||
void clear() {entries.clear(); dbdoccount = 0; dbavgdoclen = 0;}
|
{
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
void clear()
|
||||||
|
{
|
||||||
|
entries.clear();
|
||||||
|
dbdoccount = 0;
|
||||||
|
dbavgdoclen = 0;
|
||||||
|
}
|
||||||
|
// Term expansion
|
||||||
vector<TermMatchEntry> entries;
|
vector<TermMatchEntry> entries;
|
||||||
|
// If a field was specified, this is the corresponding index prefix
|
||||||
|
string prefix;
|
||||||
|
// Index-wide stats
|
||||||
unsigned int dbdoccount;
|
unsigned int dbdoccount;
|
||||||
double dbavgdoclen;
|
double dbavgdoclen;
|
||||||
};
|
};
|
||||||
@ -100,6 +131,33 @@ public:
|
|||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
extern void *DbUpdWorker(void*);
|
extern void *DbUpdWorker(void*);
|
||||||
#endif // IDX_THREADS
|
#endif // IDX_THREADS
|
||||||
|
|
||||||
|
inline bool has_prefix(const string& trm)
|
||||||
|
{
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
#endif
|
||||||
|
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
return !trm.empty() && trm[0] == ':';
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
inline string wrap_prefix(const string& pfx)
|
||||||
|
{
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
#endif
|
||||||
|
return pfx;
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
return cstr_colon + pfx + cstr_colon;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper class for the native database.
|
* Wrapper class for the native database.
|
||||||
*/
|
*/
|
||||||
@ -137,6 +195,8 @@ class Db {
|
|||||||
{
|
{
|
||||||
if (term.empty() || term.length() > 50)
|
if (term.empty() || term.length() > 50)
|
||||||
return false;
|
return false;
|
||||||
|
if (has_prefix(term))
|
||||||
|
return false;
|
||||||
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
|
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
|
||||||
!= string::npos)
|
!= string::npos)
|
||||||
return false;
|
return false;
|
||||||
@ -210,12 +270,23 @@ class Db {
|
|||||||
|
|
||||||
/** Return the index terms that match the input string
|
/** Return the index terms that match the input string
|
||||||
* Expansion is performed either with either wildcard or regexp processing
|
* Expansion is performed either with either wildcard or regexp processing
|
||||||
* Stem expansion is performed if lang is not empty */
|
* Stem expansion is performed if lang is not empty
|
||||||
|
*
|
||||||
|
* @param typ defines the kind of expansion: wildcard, regexp or stemming
|
||||||
|
* @param lang sets the stemming language(s). Can be a space-separated list
|
||||||
|
* @param term is the term to expand
|
||||||
|
* @param result is the main output
|
||||||
|
* @param max defines the maximum result count
|
||||||
|
* @param field if set, defines the field within with the expansion should
|
||||||
|
* be performed. Only used for wildcards and regexps, stemming is
|
||||||
|
* always global. If this is set, the resulting output terms
|
||||||
|
* will be appropriately prefix and the prefix value will be set
|
||||||
|
* in the TermMatchResult header
|
||||||
|
*/
|
||||||
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
|
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
|
||||||
bool termMatch(MatchType typ, const string &lang, const string &s,
|
bool termMatch(MatchType typ, const string &lang, const string &term,
|
||||||
TermMatchResult& result, int max = -1,
|
TermMatchResult& result, int max = -1,
|
||||||
const string& field = cstr_null,
|
const string& field = cstr_null
|
||||||
string *prefix = 0
|
|
||||||
);
|
);
|
||||||
/** Return min and max years for doc mod times in db */
|
/** Return min and max years for doc mod times in db */
|
||||||
bool maxYearSpan(int *minyear, int *maxyear);
|
bool maxYearSpan(int *minyear, int *maxyear);
|
||||||
@ -337,9 +408,13 @@ private:
|
|||||||
string version_string();
|
string version_string();
|
||||||
|
|
||||||
extern const string pathelt_prefix;
|
extern const string pathelt_prefix;
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
extern const string start_of_field_term;
|
extern const string start_of_field_term;
|
||||||
extern const string end_of_field_term;
|
extern const string end_of_field_term;
|
||||||
|
#else
|
||||||
|
extern string start_of_field_term;
|
||||||
|
extern string end_of_field_term;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* _DB_H_INCLUDED_ */
|
#endif /* _DB_H_INCLUDED_ */
|
||||||
|
|||||||
@ -18,12 +18,17 @@
|
|||||||
#ifndef _rcldb_p_h_included_
|
#ifndef _rcldb_p_h_included_
|
||||||
#define _rcldb_p_h_included_
|
#define _rcldb_p_h_included_
|
||||||
|
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
|
|
||||||
|
#include <xapian.h>
|
||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
#include "workqueue.h"
|
#include "workqueue.h"
|
||||||
|
#include "debuglog.h"
|
||||||
#endif // IDX_THREADS
|
#endif // IDX_THREADS
|
||||||
#include "xapian.h"
|
|
||||||
#include "xmacros.h"
|
#include "xmacros.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|||||||
@ -446,7 +446,7 @@ vector<string> Query::expand(const Doc &doc)
|
|||||||
for (Xapian::ESetIterator it = eset.begin();
|
for (Xapian::ESetIterator it = eset.begin();
|
||||||
it != eset.end(); it++) {
|
it != eset.end(); it++) {
|
||||||
LOGDEB((" [%s]\n", (*it).c_str()));
|
LOGDEB((" [%s]\n", (*it).c_str()));
|
||||||
if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z'))
|
if ((*it).empty() || has_prefix(*it))
|
||||||
continue;
|
continue;
|
||||||
res.push_back(*it);
|
res.push_back(*it);
|
||||||
if (res.size() >= 10)
|
if (res.size() >= 10)
|
||||||
|
|||||||
@ -16,17 +16,22 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Handle translation from rcl's SearchData structures to Xapian Queries
|
// Handle translation from rcl's SearchData structures to Xapian Queries
|
||||||
|
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <fnmatch.h>
|
#include <fnmatch.h>
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
|
|
||||||
#include "cstr.h"
|
#include "cstr.h"
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
|
#include "rcldb_p.h"
|
||||||
#include "searchdata.h"
|
#include "searchdata.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
@ -36,11 +41,11 @@
|
|||||||
#include "stoplist.h"
|
#include "stoplist.h"
|
||||||
#include "rclconfig.h"
|
#include "rclconfig.h"
|
||||||
#include "termproc.h"
|
#include "termproc.h"
|
||||||
|
#include "synfamily.h"
|
||||||
|
#include "stemdb.h"
|
||||||
|
#include "expansiondbs.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
|
||||||
using namespace std;
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
||||||
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
||||||
@ -71,13 +76,35 @@ static const int original_term_wqf_booster = 10;
|
|||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||||
* USA
|
* USA
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
|
#define bufprefix(BUF, L) {(BUF)[0] = L;}
|
||||||
|
#define bpoffs() 1
|
||||||
|
#else
|
||||||
|
static inline void bufprefix(char *buf, char c)
|
||||||
|
{
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
buf[0] = c;
|
||||||
|
} else {
|
||||||
|
buf[0] = ':';
|
||||||
|
buf[1] = c;
|
||||||
|
buf[2] = ':';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static inline int bpoffs()
|
||||||
|
{
|
||||||
|
return o_index_stripchars ? 1 : 3;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static Xapian::Query
|
static Xapian::Query
|
||||||
date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
||||||
{
|
{
|
||||||
// Xapian uses a smallbuf and snprintf. Can't be bothered, we're
|
// Xapian uses a smallbuf and snprintf. Can't be bothered, we're
|
||||||
// only doing %d's !
|
// only doing %d's !
|
||||||
char buf[200];
|
char buf[200];
|
||||||
sprintf(buf, "D%04d%02d", y1, m1);
|
bufprefix(buf, 'D');
|
||||||
|
sprintf(buf+bpoffs(), "%04d%02d", y1, m1);
|
||||||
vector<Xapian::Query> v;
|
vector<Xapian::Query> v;
|
||||||
|
|
||||||
int d_last = monthdays(m1, y1);
|
int d_last = monthdays(m1, y1);
|
||||||
@ -88,11 +115,11 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
|||||||
// Deal with any initial partial month
|
// Deal with any initial partial month
|
||||||
if (d1 > 1 || d_end < d_last) {
|
if (d1 > 1 || d_end < d_last) {
|
||||||
for ( ; d1 <= d_end ; d1++) {
|
for ( ; d1 <= d_end ; d1++) {
|
||||||
sprintf(buf + 7, "%02d", d1);
|
sprintf(buf + 6 + bpoffs(), "%02d", d1);
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
buf[0] = 'M';
|
bufprefix(buf, 'M');
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,36 +129,36 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
|||||||
|
|
||||||
int m_last = (y1 < y2) ? 12 : m2 - 1;
|
int m_last = (y1 < y2) ? 12 : m2 - 1;
|
||||||
while (++m1 <= m_last) {
|
while (++m1 <= m_last) {
|
||||||
sprintf(buf + 5, "%02d", m1);
|
sprintf(buf + 4 + bpoffs(), "%02d", m1);
|
||||||
buf[0] = 'M';
|
bufprefix(buf, 'M');
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (y1 < y2) {
|
if (y1 < y2) {
|
||||||
while (++y1 < y2) {
|
while (++y1 < y2) {
|
||||||
sprintf(buf + 1, "%04d", y1);
|
sprintf(buf + bpoffs(), "%04d", y1);
|
||||||
buf[0] = 'Y';
|
bufprefix(buf, 'Y');
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
sprintf(buf + 1, "%04d", y2);
|
sprintf(buf + bpoffs(), "%04d", y2);
|
||||||
buf[0] = 'M';
|
bufprefix(buf, 'M');
|
||||||
for (m1 = 1; m1 < m2; m1++) {
|
for (m1 = 1; m1 < m2; m1++) {
|
||||||
sprintf(buf + 5, "%02d", m1);
|
sprintf(buf + 4 + bpoffs(), "%02d", m1);
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sprintf(buf + 5, "%02d", m2);
|
sprintf(buf + 2 + bpoffs(), "%02d", m2);
|
||||||
|
|
||||||
// Deal with any final partial month
|
// Deal with any final partial month
|
||||||
if (d2 < monthdays(m2, y2)) {
|
if (d2 < monthdays(m2, y2)) {
|
||||||
buf[0] = 'D';
|
bufprefix(buf, 'D');
|
||||||
for (d1 = 1 ; d1 <= d2; d1++) {
|
for (d1 = 1 ; d1 <= d2; d1++) {
|
||||||
sprintf(buf + 7, "%02d", d1);
|
sprintf(buf + 6 + bpoffs(), "%02d", d1);
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
buf[0] = 'M';
|
bufprefix(buf, 'M');
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -172,31 +199,27 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
||||||
|
vector<SearchDataClause*>& query,
|
||||||
|
string& reason, void *d)
|
||||||
{
|
{
|
||||||
LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
|
|
||||||
m_stemlang.c_str()));
|
|
||||||
Xapian::Query xq;
|
Xapian::Query xq;
|
||||||
m_reason.erase();
|
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
|
||||||
|
|
||||||
// Walk the clause list translating each in turn and building the
|
|
||||||
// Xapian query tree
|
|
||||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
|
|
||||||
Xapian::Query nq;
|
Xapian::Query nq;
|
||||||
if (!(*it)->toNativeQuery(db, &nq, m_stemlang)) {
|
if (!(*it)->toNativeQuery(db, &nq)) {
|
||||||
LOGERR(("SearchData::toNativeQuery: failed\n"));
|
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n"));
|
||||||
m_reason = (*it)->getReason();
|
reason = (*it)->getReason();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (nq.empty()) {
|
if (nq.empty()) {
|
||||||
LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n"));
|
LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// If this structure is an AND list, must use AND_NOT for excl clauses.
|
// If this structure is an AND list, must use AND_NOT for excl clauses.
|
||||||
// Else this is an OR list, and there can't be excl clauses (checked by
|
// Else this is an OR list, and there can't be excl clauses (checked by
|
||||||
// addClause())
|
// addClause())
|
||||||
Xapian::Query::op op;
|
Xapian::Query::op op;
|
||||||
if (m_tp == SCLT_AND) {
|
if (tp == SCLT_AND) {
|
||||||
if ((*it)->m_tp == SCLT_EXCL) {
|
if ((*it)->m_tp == SCLT_EXCL) {
|
||||||
op = Xapian::Query::OP_AND_NOT;
|
op = Xapian::Query::OP_AND_NOT;
|
||||||
} else {
|
} else {
|
||||||
@ -217,6 +240,23 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
|||||||
if (xq.empty())
|
if (xq.empty())
|
||||||
xq = Xapian::Query::MatchAll;
|
xq = Xapian::Query::MatchAll;
|
||||||
|
|
||||||
|
*((Xapian::Query *)d) = xq;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||||
|
{
|
||||||
|
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
|
||||||
|
m_reason.erase();
|
||||||
|
|
||||||
|
// Walk the clause list translating each in turn and building the
|
||||||
|
// Xapian query tree
|
||||||
|
Xapian::Query xq;
|
||||||
|
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
|
||||||
|
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (m_haveDates) {
|
if (m_haveDates) {
|
||||||
// If one of the extremities is unset, compute db extremas
|
// If one of the extremities is unset, compute db extremas
|
||||||
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
|
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
|
||||||
@ -326,10 +366,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
|||||||
stringToTokens(dit->dir, vpath, "/");
|
stringToTokens(dit->dir, vpath, "/");
|
||||||
vector<string> pvpath;
|
vector<string> pvpath;
|
||||||
if (dit->dir[0] == '/')
|
if (dit->dir[0] == '/')
|
||||||
pvpath.push_back(pathelt_prefix);
|
pvpath.push_back(wrap_prefix(pathelt_prefix));
|
||||||
for (vector<string>::const_iterator pit = vpath.begin();
|
for (vector<string>::const_iterator pit = vpath.begin();
|
||||||
pit != vpath.end(); pit++){
|
pit != vpath.end(); pit++){
|
||||||
pvpath.push_back(pathelt_prefix + *pit);
|
pvpath.push_back(wrap_prefix(pathelt_prefix) + *pit);
|
||||||
}
|
}
|
||||||
Xapian::Query::op tdop;
|
Xapian::Query::op tdop;
|
||||||
if (dit->weight == 1.0) {
|
if (dit->weight == 1.0) {
|
||||||
@ -446,7 +486,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
|||||||
// My type is AND. Change it to OR and insert two queries, one
|
// My type is AND. Change it to OR and insert two queries, one
|
||||||
// being the original query as a subquery, the other the
|
// being the original query as a subquery, the other the
|
||||||
// phrase.
|
// phrase.
|
||||||
SearchData *sd = new SearchData(m_tp);
|
SearchData *sd = new SearchData(m_tp, m_stemlang);
|
||||||
sd->m_query = m_query;
|
sd->m_query = m_query;
|
||||||
sd->m_stemlang = m_stemlang;
|
sd->m_stemlang = m_stemlang;
|
||||||
m_tp = SCLT_OR;
|
m_tp = SCLT_OR;
|
||||||
@ -586,25 +626,28 @@ public:
|
|||||||
{ }
|
{ }
|
||||||
|
|
||||||
bool processUserString(const string &iq,
|
bool processUserString(const string &iq,
|
||||||
|
int mods,
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
vector<Xapian::Query> &pqueries,
|
vector<Xapian::Query> &pqueries,
|
||||||
const StopList &stops,
|
|
||||||
int slack = 0, bool useNear = false);
|
int slack = 0, bool useNear = false);
|
||||||
private:
|
private:
|
||||||
void expandTerm(bool dont, const string& term, vector<string>& exp,
|
void expandTerm(int mods,
|
||||||
|
const string& term, vector<string>& exp,
|
||||||
string& sterm, const string& prefix);
|
string& sterm, const string& prefix);
|
||||||
// After splitting entry on whitespace: process non-phrase element
|
// After splitting entry on whitespace: process non-phrase element
|
||||||
void processSimpleSpan(const string& span, bool nostemexp,
|
void processSimpleSpan(const string& span,
|
||||||
|
int mods,
|
||||||
vector<Xapian::Query> &pqueries);
|
vector<Xapian::Query> &pqueries);
|
||||||
// Process phrase/near element
|
// Process phrase/near element
|
||||||
void processPhraseOrNear(TextSplitQ *splitData,
|
void processPhraseOrNear(TextSplitQ *splitData,
|
||||||
|
int mods,
|
||||||
vector<Xapian::Query> &pqueries,
|
vector<Xapian::Query> &pqueries,
|
||||||
bool useNear, int slack, int mods);
|
bool useNear, int slack);
|
||||||
|
|
||||||
Db& m_db;
|
Db& m_db;
|
||||||
const string& m_field;
|
const string& m_field;
|
||||||
const string& m_stemlang;
|
const string& m_stemlang;
|
||||||
bool m_doBoostUserTerms;
|
const bool m_doBoostUserTerms;
|
||||||
HighlightData& m_hld;
|
HighlightData& m_hld;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -619,61 +662,204 @@ static void listVector(const string& what, const vector<string>&l)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/** Take simple term and expand stem and wildcards
|
/** Expand term into term list, using appropriate mode: stem, wildcards,
|
||||||
|
* diacritics...
|
||||||
*
|
*
|
||||||
* @param nostemexp don't perform stem expansion. This is mainly used to
|
* @param mods stem expansion, case and diacritics sensitivity control.
|
||||||
* prevent stem expansion inside phrases (because the user probably
|
|
||||||
* does not expect it). This does NOT prevent wild card expansion.
|
|
||||||
* Other factors than nostemexp can prevent stem expansion:
|
|
||||||
* a null stemlang, resulting from a global user preference, a
|
|
||||||
* capitalized term, or wildcard(s)
|
|
||||||
* @param term input single word
|
* @param term input single word
|
||||||
* @param exp output expansion list
|
* @param exp output expansion list
|
||||||
* @param sterm output original input term if there were no wildcards
|
* @param sterm output original input term if there were no wildcards
|
||||||
|
* @param prefix field prefix in index. We could recompute it, but the caller
|
||||||
|
* has it already. Used in the simple case where there is nothing to expand,
|
||||||
|
* and we just return the prefixed term (else Db::termMatch deals with it).
|
||||||
*/
|
*/
|
||||||
void StringToXapianQ::expandTerm(bool nostemexp,
|
void StringToXapianQ::expandTerm(int mods,
|
||||||
const string& term,
|
const string& term,
|
||||||
vector<string>& exp,
|
vector<string>& oexp, string &sterm,
|
||||||
string &sterm, const string& prefix)
|
const string& prefix)
|
||||||
{
|
{
|
||||||
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
|
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
||||||
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
|
mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));
|
||||||
sterm.erase();
|
sterm.clear();
|
||||||
exp.clear();
|
oexp.clear();
|
||||||
if (term.empty()) {
|
if (term.empty())
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
||||||
|
|
||||||
// No stemming if there are wildcards or prevented globally.
|
// If there are no wildcards, add term to the list of user-entered terms
|
||||||
|
if (!haswild)
|
||||||
|
m_hld.uterms.insert(term);
|
||||||
|
|
||||||
|
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
||||||
|
|
||||||
|
// No stem expansion if there are wildcards or if prevented by caller
|
||||||
if (haswild || m_stemlang.empty()) {
|
if (haswild || m_stemlang.empty()) {
|
||||||
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
||||||
nostemexp = true;
|
nostemexp = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!haswild)
|
bool noexpansion = nostemexp && !haswild;
|
||||||
m_hld.uterms.insert(term);
|
|
||||||
|
|
||||||
if (nostemexp && !haswild) {
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
sterm = term;
|
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
||||||
exp.resize(1);
|
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
||||||
exp[0] = prefix + term;
|
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
diac_sensitive = case_sensitive = false;
|
||||||
} else {
|
} else {
|
||||||
TermMatchResult res;
|
// If we are working with a raw index, apply the rules for case and
|
||||||
if (haswild) {
|
// diacritics sensitivity.
|
||||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
|
||||||
m_field);
|
// If any character has a diacritic, we become
|
||||||
} else {
|
// diacritic-sensitive. Note that the way that the test is
|
||||||
sterm = term;
|
// performed (conversion+comparison) will automatically ignore
|
||||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
|
// accented characters which are actually a separate letter
|
||||||
m_field);
|
if (unachasaccents(term))
|
||||||
}
|
diac_sensitive = true;
|
||||||
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
|
||||||
it != res.entries.end(); it++) {
|
// If any character apart from the first is uppercase, we become
|
||||||
exp.push_back(it->term);
|
// case-sensitive. The first character is reserved for
|
||||||
}
|
// turning off stemming. You need to use a query language
|
||||||
|
// modifier to search for Floor in a case-sensitive way.
|
||||||
|
Utf8Iter it(term);
|
||||||
|
it++;
|
||||||
|
if (unachasuppercase(term.substr(it.getBpos())))
|
||||||
|
case_sensitive = true;
|
||||||
|
|
||||||
|
// If we are sensitive to case or diacritics turn stemming off
|
||||||
|
if (diac_sensitive || case_sensitive)
|
||||||
|
nostemexp = true;
|
||||||
|
|
||||||
|
if (!case_sensitive || !diac_sensitive)
|
||||||
|
noexpansion = false;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (noexpansion) {
|
||||||
|
sterm = term;
|
||||||
|
oexp.push_back(prefix + term);
|
||||||
|
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||||
|
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all",
|
||||||
|
&unacfoldtrans);
|
||||||
|
vector<string> lexp;
|
||||||
|
|
||||||
|
TermMatchResult res;
|
||||||
|
if (haswild) {
|
||||||
|
// Note that if there are wildcards, we do a direct from-index
|
||||||
|
// expansion, which means that we are casediac-sensitive. There
|
||||||
|
// would be nothing to prevent us from expanding via the casediac
|
||||||
|
// synonyms first. To be done later
|
||||||
|
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
||||||
|
m_field);
|
||||||
|
goto termmatchtoresult;
|
||||||
|
}
|
||||||
|
|
||||||
|
sterm = term;
|
||||||
|
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
|
|
||||||
|
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
// If the index is stripped, we can only come here if nostemexp is unset
|
||||||
|
// and we just need stem expansion.
|
||||||
|
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||||
|
goto termmatchtoresult;
|
||||||
|
}
|
||||||
|
|
||||||
|
// No stem expansion when diacritic or case sensitivity is set, it
|
||||||
|
// makes no sense (it would mess with the diacritics anyway if
|
||||||
|
// they are not in the stem part). In these 3 cases, perform
|
||||||
|
// appropriate expansion from the charstripping db, and do a bogus
|
||||||
|
// wildcard expansion (there is no wild card) to generate the
|
||||||
|
// result:
|
||||||
|
|
||||||
|
if (diac_sensitive && case_sensitive) {
|
||||||
|
// No expansion whatsoever
|
||||||
|
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
|
||||||
|
goto termmatchtoresult;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (diac_sensitive) {
|
||||||
|
// Expand for accents and case, filtering for same accents,
|
||||||
|
// then bogus wildcard expansion for generating result
|
||||||
|
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||||
|
synac.synExpand(term, lexp, &foldtrans);
|
||||||
|
goto exptotermatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (case_sensitive) {
|
||||||
|
// Expand for accents and case, filtering for same case, then
|
||||||
|
// bogus wildcard expansion for generating result
|
||||||
|
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||||
|
synac.synExpand(term, lexp, &unactrans);
|
||||||
|
goto exptotermatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We are neither accent- nor case- sensitive and may need stem
|
||||||
|
// expansion or not.
|
||||||
|
|
||||||
|
// Expand for accents and case
|
||||||
|
synac.synExpand(term, lexp);
|
||||||
|
LOGDEB(("ExpTerm: casediac: %s\n", stringsToString(lexp).c_str()));
|
||||||
|
if (nostemexp)
|
||||||
|
goto exptotermatch;
|
||||||
|
|
||||||
|
// Need stem expansion. Lowercase the result of accent and case
|
||||||
|
// expansion for input to stemdb.
|
||||||
|
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||||
|
string lower;
|
||||||
|
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||||
|
lexp[i] = lower;
|
||||||
|
}
|
||||||
|
sort(lexp.begin(), lexp.end());
|
||||||
|
{
|
||||||
|
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
|
||||||
|
lexp.resize(uit - lexp.begin());
|
||||||
|
StemDb db(m_db.m_ndb->xrdb);
|
||||||
|
vector<string> exp1;
|
||||||
|
for (vector<string>::const_iterator it = lexp.begin();
|
||||||
|
it != lexp.end(); it++) {
|
||||||
|
db.stemExpand(m_stemlang, *it, exp1);
|
||||||
|
}
|
||||||
|
LOGDEB(("ExpTerm: stem: %s\n", stringsToString(exp1).c_str()));
|
||||||
|
|
||||||
|
// Expand the resulting list for case (all stemdb content
|
||||||
|
// is lowercase)
|
||||||
|
lexp.clear();
|
||||||
|
for (vector<string>::const_iterator it = exp1.begin();
|
||||||
|
it != exp1.end(); it++) {
|
||||||
|
synac.synExpand(*it, lexp);
|
||||||
|
}
|
||||||
|
sort(lexp.begin(), lexp.end());
|
||||||
|
uit = unique(lexp.begin(), lexp.end());
|
||||||
|
lexp.resize(uit - lexp.begin());
|
||||||
|
}
|
||||||
|
LOGDEB(("ExpTerm: case exp of stem: %s\n", stringsToString(lexp).c_str()));
|
||||||
|
|
||||||
|
// Bogus wildcard expand to generate the result
|
||||||
|
exptotermatch:
|
||||||
|
for (vector<string>::const_iterator it = lexp.begin();
|
||||||
|
it != lexp.end(); it++) {
|
||||||
|
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,
|
||||||
|
res, -1, m_field);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Term match entries to vector of terms
|
||||||
|
termmatchtoresult:
|
||||||
|
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
||||||
|
it != res.entries.end(); it++) {
|
||||||
|
oexp.push_back(it->term);
|
||||||
|
}
|
||||||
|
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
|
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
|
||||||
@ -710,21 +896,22 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
void StringToXapianQ::processSimpleSpan(const string& span,
|
||||||
|
int mods,
|
||||||
vector<Xapian::Query> &pqueries)
|
vector<Xapian::Query> &pqueries)
|
||||||
{
|
{
|
||||||
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
|
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
|
||||||
span.c_str(), int(nostemexp)));
|
span.c_str(), (unsigned int)mods));
|
||||||
vector<string> exp;
|
vector<string> exp;
|
||||||
string sterm; // dumb version of user term
|
string sterm; // dumb version of user term
|
||||||
|
|
||||||
string prefix;
|
string prefix;
|
||||||
const FieldTraits *ftp;
|
const FieldTraits *ftp;
|
||||||
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
||||||
prefix = ftp->pfx;
|
prefix = wrap_prefix(ftp->pfx);
|
||||||
}
|
}
|
||||||
|
|
||||||
expandTerm(nostemexp, span, exp, sterm, prefix);
|
expandTerm(mods, span, exp, sterm, prefix);
|
||||||
|
|
||||||
// Set up the highlight data. No prefix should go in there
|
// Set up the highlight data. No prefix should go in there
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
for (vector<string>::const_iterator it = exp.begin();
|
||||||
@ -755,8 +942,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
|||||||
// queries if the terms get expanded by stemming or wildcards (we
|
// queries if the terms get expanded by stemming or wildcards (we
|
||||||
// don't do stemming for PHRASE though)
|
// don't do stemming for PHRASE though)
|
||||||
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
||||||
|
int mods,
|
||||||
vector<Xapian::Query> &pqueries,
|
vector<Xapian::Query> &pqueries,
|
||||||
bool useNear, int slack, int mods)
|
bool useNear, int slack)
|
||||||
{
|
{
|
||||||
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
||||||
Xapian::Query::OP_PHRASE;
|
Xapian::Query::OP_PHRASE;
|
||||||
@ -769,7 +957,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||||||
string prefix;
|
string prefix;
|
||||||
const FieldTraits *ftp;
|
const FieldTraits *ftp;
|
||||||
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
||||||
prefix = ftp->pfx;
|
prefix = wrap_prefix(ftp->pfx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
|
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
|
||||||
@ -790,10 +978,12 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||||||
|| hadmultiple
|
|| hadmultiple
|
||||||
#endif // single OR inside NEAR
|
#endif // single OR inside NEAR
|
||||||
;
|
;
|
||||||
|
int lmods = mods;
|
||||||
|
if (nostemexp)
|
||||||
|
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||||
string sterm;
|
string sterm;
|
||||||
vector<string> exp;
|
vector<string> exp;
|
||||||
expandTerm(nostemexp, *it, exp, sterm, prefix);
|
expandTerm(lmods, *it, exp, sterm, prefix);
|
||||||
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
||||||
listVector("", exp);
|
listVector("", exp);
|
||||||
// groups is used for highlighting, we don't want prefixes in there.
|
// groups is used for highlighting, we don't want prefixes in there.
|
||||||
@ -882,16 +1072,19 @@ static int stringToMods(string& s)
|
|||||||
* count)
|
* count)
|
||||||
*/
|
*/
|
||||||
bool StringToXapianQ::processUserString(const string &iq,
|
bool StringToXapianQ::processUserString(const string &iq,
|
||||||
|
int mods,
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
vector<Xapian::Query> &pqueries,
|
vector<Xapian::Query> &pqueries,
|
||||||
const StopList& stops,
|
|
||||||
int slack,
|
int slack,
|
||||||
bool useNear
|
bool useNear
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
|
LOGDEB(("StringToXapianQ:: qstr [%s] mods 0x%x slack %d near %d\n",
|
||||||
|
iq.c_str(), mods, slack, useNear));
|
||||||
ermsg.erase();
|
ermsg.erase();
|
||||||
|
|
||||||
|
const StopList stops = m_db.getStopList();
|
||||||
|
|
||||||
// Simple whitespace-split input into user-level words and
|
// Simple whitespace-split input into user-level words and
|
||||||
// double-quoted phrases: word1 word2 "this is a phrase".
|
// double-quoted phrases: word1 word2 "this is a phrase".
|
||||||
//
|
//
|
||||||
@ -908,8 +1101,10 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
for (vector<string>::iterator it = phrases.begin();
|
for (vector<string>::iterator it = phrases.begin();
|
||||||
it != phrases.end(); it++) {
|
it != phrases.end(); it++) {
|
||||||
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
|
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
|
||||||
int mods = stringToMods(*it);
|
// Anchoring modifiers
|
||||||
int terminc = mods != 0 ? 1 : 0;
|
int amods = stringToMods(*it);
|
||||||
|
int terminc = amods != 0 ? 1 : 0;
|
||||||
|
mods |= amods;
|
||||||
// If there are multiple spans in this element, including
|
// If there are multiple spans in this element, including
|
||||||
// at least one composite, we have to increase the slack
|
// at least one composite, we have to increase the slack
|
||||||
// else a phrase query including a span would fail.
|
// else a phrase query including a span would fail.
|
||||||
@ -930,11 +1125,15 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
||||||
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
||||||
//tpcommon.onlygrams(true);
|
//tpcommon.onlygrams(true);
|
||||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
TermProcPrep tpprep(nxt);
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars)
|
||||||
|
#endif
|
||||||
|
nxt = &tpprep;
|
||||||
|
|
||||||
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||||
TextSplit::TXTS_KEEPWILD),
|
TextSplit::TXTS_KEEPWILD),
|
||||||
stops, nxt);
|
stops, nxt);
|
||||||
tpq.setTSQ(&splitter);
|
tpq.setTSQ(&splitter);
|
||||||
splitter.text_to_words(*it);
|
splitter.text_to_words(*it);
|
||||||
|
|
||||||
@ -944,14 +1143,17 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
switch (splitter.terms.size() + terminc) {
|
switch (splitter.terms.size() + terminc) {
|
||||||
case 0:
|
case 0:
|
||||||
continue;// ??
|
continue;// ??
|
||||||
case 1:
|
case 1: {
|
||||||
|
int lmods = mods;
|
||||||
|
if (splitter.nostemexps.front())
|
||||||
|
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||||
m_hld.ugroups.push_back(vector<string>(1, *it));
|
m_hld.ugroups.push_back(vector<string>(1, *it));
|
||||||
processSimpleSpan(splitter.terms.front(),
|
processSimpleSpan(splitter.terms.front(), lmods, pqueries);
|
||||||
splitter.nostemexps.front(), pqueries);
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
m_hld.ugroups.push_back(vector<string>(1, *it));
|
m_hld.ugroups.push_back(vector<string>(1, *it));
|
||||||
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
|
processPhraseOrNear(&splitter, mods, pqueries, useNear, slack);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
@ -971,13 +1173,10 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Translate a simple OR, AND, or EXCL search clause.
|
// Translate a simple OR, AND, or EXCL search clause.
|
||||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
||||||
const string& stemlang)
|
|
||||||
{
|
{
|
||||||
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
|
|
||||||
stemlang;
|
|
||||||
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
|
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
|
||||||
stemlang.c_str()));
|
getStemLang().c_str()));
|
||||||
|
|
||||||
Xapian::Query *qp = (Xapian::Query *)p;
|
Xapian::Query *qp = (Xapian::Query *)p;
|
||||||
*qp = Xapian::Query();
|
*qp = Xapian::Query();
|
||||||
@ -1000,8 +1199,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
||||||
(m_parentSearch == 0 && !m_haveWildCards);
|
(m_parentSearch == 0 && !m_haveWildCards);
|
||||||
|
|
||||||
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
|
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
|
||||||
if (!tr.processUserString(m_text, m_reason, pqueries, db.getStopList()))
|
if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||||
@ -1024,8 +1223,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
// about expanding multiple fragments in the past. We just take the
|
// about expanding multiple fragments in the past. We just take the
|
||||||
// value blanks and all and expand this against the indexed unsplit
|
// value blanks and all and expand this against the indexed unsplit
|
||||||
// file names
|
// file names
|
||||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
|
||||||
const string&)
|
|
||||||
{
|
{
|
||||||
Xapian::Query *qp = (Xapian::Query *)p;
|
Xapian::Query *qp = (Xapian::Query *)p;
|
||||||
*qp = Xapian::Query();
|
*qp = Xapian::Query();
|
||||||
@ -1041,11 +1239,8 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Translate NEAR or PHRASE clause.
|
// Translate NEAR or PHRASE clause.
|
||||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
||||||
const string& stemlang)
|
|
||||||
{
|
{
|
||||||
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
|
|
||||||
stemlang;
|
|
||||||
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
||||||
|
|
||||||
Xapian::Query *qp = (Xapian::Query *)p;
|
Xapian::Query *qp = (Xapian::Query *)p;
|
||||||
@ -1069,8 +1264,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
}
|
}
|
||||||
string s = cstr_dquote + m_text + cstr_dquote;
|
string s = cstr_dquote + m_text + cstr_dquote;
|
||||||
bool useNear = (m_tp == SCLT_NEAR);
|
bool useNear = (m_tp == SCLT_NEAR);
|
||||||
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
|
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
|
||||||
if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
|
if (!tr.processUserString(s, getModifiers(), m_reason, pqueries,
|
||||||
m_slack, useNear))
|
m_slack, useNear))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
|
|||||||
@ -70,9 +70,9 @@ class SearchDataClause;
|
|||||||
*/
|
*/
|
||||||
class SearchData {
|
class SearchData {
|
||||||
public:
|
public:
|
||||||
SearchData(SClType tp)
|
SearchData(SClType tp, const string& stemlang)
|
||||||
: m_tp(tp), m_haveDates(false), m_maxSize(size_t(-1)),
|
: m_tp(tp), m_haveDates(false), m_maxSize(size_t(-1)),
|
||||||
m_minSize(size_t(-1)), m_haveWildCards(false)
|
m_minSize(size_t(-1)), m_haveWildCards(false), m_stemlang(stemlang)
|
||||||
{
|
{
|
||||||
if (m_tp != SCLT_OR && m_tp != SCLT_AND)
|
if (m_tp != SCLT_OR && m_tp != SCLT_AND)
|
||||||
m_tp = SCLT_OR;
|
m_tp = SCLT_OR;
|
||||||
@ -91,6 +91,7 @@ public:
|
|||||||
/** Translate to Xapian query. rcldb knows about the void* */
|
/** Translate to Xapian query. rcldb knows about the void* */
|
||||||
bool toNativeQuery(Rcl::Db &db, void *);
|
bool toNativeQuery(Rcl::Db &db, void *);
|
||||||
|
|
||||||
|
|
||||||
/** We become the owner of cl and will delete it */
|
/** We become the owner of cl and will delete it */
|
||||||
bool addClause(SearchDataClause *cl);
|
bool addClause(SearchDataClause *cl);
|
||||||
|
|
||||||
@ -109,6 +110,8 @@ public:
|
|||||||
m_dirspecs.push_back(DirSpec(t, excl, w));
|
m_dirspecs.push_back(DirSpec(t, excl, w));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const std::string& getStemLang() {return m_stemlang;}
|
||||||
|
|
||||||
void setMinSize(size_t size) {m_minSize = size;}
|
void setMinSize(size_t size) {m_minSize = size;}
|
||||||
void setMaxSize(size_t size) {m_maxSize = size;}
|
void setMaxSize(size_t size) {m_maxSize = size;}
|
||||||
|
|
||||||
@ -120,8 +123,6 @@ public:
|
|||||||
/** Add file type to not wanted list */
|
/** Add file type to not wanted list */
|
||||||
void remFiletype(const std::string& ft) {m_nfiletypes.push_back(ft);}
|
void remFiletype(const std::string& ft) {m_nfiletypes.push_back(ft);}
|
||||||
|
|
||||||
void setStemlang(const std::string& lang = "english") {m_stemlang = lang;}
|
|
||||||
|
|
||||||
/** Retrieve error description */
|
/** Retrieve error description */
|
||||||
std::string getReason() {return m_reason;}
|
std::string getReason() {return m_reason;}
|
||||||
|
|
||||||
@ -170,7 +171,12 @@ private:
|
|||||||
std::string m_reason;
|
std::string m_reason;
|
||||||
bool m_haveWildCards;
|
bool m_haveWildCards;
|
||||||
std::string m_stemlang;
|
std::string m_stemlang;
|
||||||
|
|
||||||
bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps);
|
bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps);
|
||||||
|
bool clausesToQuery(Rcl::Db &db, SClType tp,
|
||||||
|
std::vector<SearchDataClause*>& query,
|
||||||
|
string& reason, void *d);
|
||||||
|
|
||||||
/* Copyconst and assignment private and forbidden */
|
/* Copyconst and assignment private and forbidden */
|
||||||
SearchData(const SearchData &) {}
|
SearchData(const SearchData &) {}
|
||||||
SearchData& operator=(const SearchData&) {return *this;};
|
SearchData& operator=(const SearchData&) {return *this;};
|
||||||
@ -186,7 +192,7 @@ public:
|
|||||||
m_modifiers(SDCM_NONE), m_weight(1.0)
|
m_modifiers(SDCM_NONE), m_weight(1.0)
|
||||||
{}
|
{}
|
||||||
virtual ~SearchDataClause() {}
|
virtual ~SearchDataClause() {}
|
||||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const std::string&) = 0;
|
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
|
||||||
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
|
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
|
||||||
virtual std::string getReason() const {return m_reason;}
|
virtual std::string getReason() const {return m_reason;}
|
||||||
virtual void getTerms(HighlightData & hldata) const = 0;
|
virtual void getTerms(HighlightData & hldata) const = 0;
|
||||||
@ -199,6 +205,11 @@ public:
|
|||||||
{
|
{
|
||||||
m_parentSearch = p;
|
m_parentSearch = p;
|
||||||
}
|
}
|
||||||
|
string getStemLang()
|
||||||
|
{
|
||||||
|
return (m_modifiers & SDCM_NOSTEMMING) || m_parentSearch == 0 ?
|
||||||
|
cstr_null : m_parentSearch->getStemLang();
|
||||||
|
}
|
||||||
virtual void setModifiers(Modifier mod)
|
virtual void setModifiers(Modifier mod)
|
||||||
{
|
{
|
||||||
m_modifiers = mod;
|
m_modifiers = mod;
|
||||||
@ -255,7 +266,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Translate to Xapian query */
|
/** Translate to Xapian query */
|
||||||
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
|
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||||
|
|
||||||
virtual void getTerms(HighlightData& hldata) const
|
virtual void getTerms(HighlightData& hldata) const
|
||||||
{
|
{
|
||||||
@ -296,7 +307,7 @@ public:
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
|
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -315,7 +326,7 @@ public:
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
|
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||||
private:
|
private:
|
||||||
int m_slack;
|
int m_slack;
|
||||||
};
|
};
|
||||||
@ -323,17 +334,11 @@ private:
|
|||||||
/** Subquery */
|
/** Subquery */
|
||||||
class SearchDataClauseSub : public SearchDataClause {
|
class SearchDataClauseSub : public SearchDataClause {
|
||||||
public:
|
public:
|
||||||
// We take charge of the SearchData * and will delete it.
|
|
||||||
SearchDataClauseSub(SClType tp, RefCntr<SearchData> sub)
|
SearchDataClauseSub(SClType tp, RefCntr<SearchData> sub)
|
||||||
: SearchDataClause(tp), m_sub(sub)
|
: SearchDataClause(tp), m_sub(sub)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
virtual bool toNativeQuery(Rcl::Db &db, void *p)
|
||||||
virtual ~SearchDataClauseSub()
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool toNativeQuery(Rcl::Db &db, void *p, const std::string&)
|
|
||||||
{
|
{
|
||||||
return m_sub->toNativeQuery(db, p);
|
return m_sub->toNativeQuery(db, p);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -19,181 +19,66 @@
|
|||||||
* Management of the auxiliary databases listing stems and their expansion
|
* Management of the auxiliary databases listing stems and their expansion
|
||||||
* terms
|
* terms
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <iostream>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include <xapian.h>
|
#include <xapian.h>
|
||||||
|
|
||||||
#include "stemdb.h"
|
#include "stemdb.h"
|
||||||
#include "pathut.h"
|
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "utf8iter.h"
|
|
||||||
#include "textsplit.h"
|
|
||||||
#include "rcldb.h"
|
|
||||||
#include "rcldb_p.h"
|
|
||||||
#include "synfamily.h"
|
#include "synfamily.h"
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
|
#include "rclconfig.h"
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
// Fast raw detection of non-natural-language words: look for ascii
|
|
||||||
// chars which are not lowercase letters. Not too sure what islower()
|
|
||||||
// would do with 8 bit values, so not using it here. If we want to be
|
|
||||||
// more complete we'd need to go full utf-8
|
|
||||||
inline static bool p_notlowerascii(unsigned int c)
|
|
||||||
{
|
|
||||||
if (c < 'a' || (c > 'z' && c < 128))
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create database of stem to parents associations for a given language.
|
|
||||||
*/
|
|
||||||
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|
||||||
const vector<string>& langs)
|
|
||||||
{
|
|
||||||
LOGDEB(("StemDb::createExpansionDbs\n"));
|
|
||||||
Chrono cron;
|
|
||||||
|
|
||||||
vector<XapWritableSynFamily> stemdbs;
|
|
||||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
||||||
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
|
|
||||||
stemdbs[i].deleteMember(langs[i]);
|
|
||||||
stemdbs[i].createMember(langs[i]);
|
|
||||||
stemdbs[i].setCurrentMemberName(langs[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// We walk the list of all terms, and stem each. We skip terms which
|
|
||||||
// don't look like natural language.
|
|
||||||
// If the stem is not identical to the term, we add a synonym entry.
|
|
||||||
// Statistics
|
|
||||||
int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
|
|
||||||
int stemconst = 0; // Stem == term
|
|
||||||
int allsyns = 0; // Total number of entries created
|
|
||||||
|
|
||||||
string ermsg;
|
|
||||||
try {
|
|
||||||
vector<Xapian::Stem> stemmers;
|
|
||||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
||||||
stemmers.push_back(Xapian::Stem(langs[i]));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (Xapian::TermIterator it = wdb.allterms_begin();
|
|
||||||
it != wdb.allterms_end(); it++) {
|
|
||||||
// If the term has any non-lowercase 7bit char (that is,
|
|
||||||
// numbers, capitals and punctuation) dont stem.
|
|
||||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
|
||||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
|
||||||
++nostem;
|
|
||||||
LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
|
|
||||||
(*it).c_str(), *sit));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Detect and skip CJK terms.
|
|
||||||
// We're still sending all other multibyte utf-8 chars to
|
|
||||||
// the stemmer, which is not too well defined for
|
|
||||||
// xapian<1.0 (very obsolete now), but seems to work
|
|
||||||
// anyway. There shouldn't be too many in any case because
|
|
||||||
// accents are stripped at this point.
|
|
||||||
// The effect of stripping accents on stemming is not good,
|
|
||||||
// (e.g: in french partimes -> partim, parti^mes -> part)
|
|
||||||
// but fixing the issue would be complicated.
|
|
||||||
Utf8Iter utfit(*it);
|
|
||||||
if (TextSplit::isCJK(*utfit)) {
|
|
||||||
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create stemming synonym for every lang
|
|
||||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
||||||
string stem = stemmers[i](*it);
|
|
||||||
if (stem == *it) {
|
|
||||||
++stemconst;
|
|
||||||
} else {
|
|
||||||
stemdbs[i].addSynonym(stem, *it);
|
|
||||||
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
|
|
||||||
(*it).c_str(), langs[i].c_str(), stem.c_str()));
|
|
||||||
++allsyns;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
} XCATCHERROR(ermsg);
|
|
||||||
if (!ermsg.empty()) {
|
|
||||||
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
|
|
||||||
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
|
||||||
nostem, stemconst, allsyns));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Expand term to list of all terms which stem to the same term, for one
|
|
||||||
* expansion language
|
|
||||||
*/
|
|
||||||
bool StemDb::expandOne(const std::string& lang,
|
|
||||||
const std::string& term,
|
|
||||||
vector<string>& result)
|
|
||||||
{
|
|
||||||
try {
|
|
||||||
Xapian::Stem stemmer(lang);
|
|
||||||
string stem = stemmer(term);
|
|
||||||
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
|
||||||
lang.c_str(), term.c_str(), stem.c_str()));
|
|
||||||
|
|
||||||
if (!synExpand(lang, stem, result)) {
|
|
||||||
// ?
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the user term or stem are not in the list, add them
|
|
||||||
if (find(result.begin(), result.end(), term) == result.end()) {
|
|
||||||
result.push_back(term);
|
|
||||||
}
|
|
||||||
if (find(result.begin(), result.end(), stem) == result.end()) {
|
|
||||||
result.push_back(stem);
|
|
||||||
}
|
|
||||||
LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
|
|
||||||
stringsToString(result).c_str()));
|
|
||||||
|
|
||||||
} catch (...) {
|
|
||||||
LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
|
|
||||||
lang.c_str()));
|
|
||||||
result.push_back(term);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Expand for one or several languages
|
* Expand for one or several languages
|
||||||
*/
|
*/
|
||||||
bool StemDb::stemExpand(const std::string& langs,
|
bool StemDb::stemExpand(const std::string& langs, const std::string& term,
|
||||||
const std::string& term,
|
|
||||||
vector<string>& result)
|
vector<string>& result)
|
||||||
{
|
{
|
||||||
vector<string> llangs;
|
vector<string> llangs;
|
||||||
stringToStrings(langs, llangs);
|
stringToStrings(langs, llangs);
|
||||||
|
|
||||||
for (vector<string>::const_iterator it = llangs.begin();
|
for (vector<string>::const_iterator it = llangs.begin();
|
||||||
it != llangs.end(); it++) {
|
it != llangs.end(); it++) {
|
||||||
vector<string> oneexp;
|
SynTermTransStem stemmer(*it);
|
||||||
expandOne(*it, term, oneexp);
|
XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
|
||||||
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
(void)expander.synExpand(term, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
// Expand the unaccented stem
|
||||||
|
if (!o_index_stripchars) {
|
||||||
|
for (vector<string>::const_iterator it = llangs.begin();
|
||||||
|
it != llangs.end(); it++) {
|
||||||
|
SynTermTransStem stemmer(*it);
|
||||||
|
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
|
||||||
|
*it, &stemmer);
|
||||||
|
string unac;
|
||||||
|
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
|
||||||
|
(void)expander.synExpand(unac, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (result.empty())
|
||||||
|
result.push_back(term);
|
||||||
|
|
||||||
sort(result.begin(), result.end());
|
sort(result.begin(), result.end());
|
||||||
unique(result.begin(), result.end());
|
vector<string>::iterator uit = unique(result.begin(), result.end());
|
||||||
|
result.resize(uit - result.begin());
|
||||||
|
LOGDEB0(("stemExpand:%s: %s -> %s\n", langs.c_str(), term.c_str(),
|
||||||
|
stringsToString(result).c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -55,9 +55,30 @@
|
|||||||
#include <xapian.h>
|
#include <xapian.h>
|
||||||
|
|
||||||
#include "synfamily.h"
|
#include "synfamily.h"
|
||||||
|
#include "unacpp.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
|
/* A stemming functor for using with XapComputableSynFamMember */
|
||||||
|
class SynTermTransStem : public SynTermTrans {
|
||||||
|
public:
|
||||||
|
SynTermTransStem(const std::string& lang)
|
||||||
|
: m_stemmer(lang), m_lang(lang)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
virtual std::string operator()(const std::string& in)
|
||||||
|
{
|
||||||
|
string out = m_stemmer(in);
|
||||||
|
LOGDEB2(("SynTermTransStem(%s): in [%s] out [%s]\n", m_lang.c_str(),
|
||||||
|
in.c_str(), out.c_str()));
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
Xapian::Stem m_stemmer;
|
||||||
|
std::string m_lang;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Stemdb is a bit special as a SynFamily as we may want to expand for one
|
||||||
|
* or several members (languages) */
|
||||||
class StemDb : public XapSynFamily {
|
class StemDb : public XapSynFamily {
|
||||||
public:
|
public:
|
||||||
StemDb(Xapian::Database& xdb)
|
StemDb(Xapian::Database& xdb)
|
||||||
@ -67,18 +88,10 @@ public:
|
|||||||
|
|
||||||
/** Expand for a number of languages */
|
/** Expand for a number of languages */
|
||||||
bool stemExpand(const std::string& langs,
|
bool stemExpand(const std::string& langs,
|
||||||
const std::string& term,
|
const std::string& term,
|
||||||
std::vector<std::string>& result);
|
std::vector<std::string>& result);
|
||||||
private:
|
|
||||||
/** Compute stem and call synExpand() */
|
|
||||||
bool expandOne(const std::string& lang,
|
|
||||||
const std::string& term,
|
|
||||||
std::vector<std::string>& result);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|
||||||
const std::vector<std::string>& langs);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* _STEMDB_H_INCLUDED_ */
|
#endif /* _STEMDB_H_INCLUDED_ */
|
||||||
|
|||||||
@ -28,31 +28,6 @@ using namespace std;
|
|||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
bool XapSynFamily::synExpand(const string& member, const string& term,
|
|
||||||
vector<string>& result)
|
|
||||||
{
|
|
||||||
string key = entryprefix(member) + term;
|
|
||||||
string ermsg;
|
|
||||||
try {
|
|
||||||
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
|
|
||||||
xit != m_rdb.synonyms_end(key); xit++) {
|
|
||||||
result.push_back(*xit);
|
|
||||||
}
|
|
||||||
} XCATCHERROR(ermsg);
|
|
||||||
if (!ermsg.empty()) {
|
|
||||||
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
|
|
||||||
member.c_str(), term.c_str()));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#if 0
|
|
||||||
string out;
|
|
||||||
stringsToString(result, out);
|
|
||||||
LOGDEB0(("XapSynFamily::synExpand:%s: [%s] -> %s\n", member.c_str(),
|
|
||||||
term.c_str(), out.c_str()));
|
|
||||||
#endif
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool XapSynFamily::getMembers(vector<string>& members)
|
bool XapSynFamily::getMembers(vector<string>& members)
|
||||||
{
|
{
|
||||||
string key = memberskey();
|
string key = memberskey();
|
||||||
@ -100,6 +75,35 @@ bool XapSynFamily::listMap(const string& membername)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool XapSynFamily::synExpand(const string& member, const string& term,
|
||||||
|
vector<string>& result)
|
||||||
|
{
|
||||||
|
LOGDEB(("XapSynFamily::synExpand:(%s) %s for %s\n",
|
||||||
|
m_prefix1.c_str(), term.c_str(), member.c_str()));
|
||||||
|
|
||||||
|
string key = entryprefix(member) + term;
|
||||||
|
string ermsg;
|
||||||
|
try {
|
||||||
|
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
|
||||||
|
xit != m_rdb.synonyms_end(key); xit++) {
|
||||||
|
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
|
||||||
|
result.push_back(*xit);
|
||||||
|
}
|
||||||
|
} XCATCHERROR(ermsg);
|
||||||
|
if (!ermsg.empty()) {
|
||||||
|
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
|
||||||
|
member.c_str(), term.c_str()));
|
||||||
|
result.push_back(term);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// If the input term is not in the list, add it
|
||||||
|
if (find(result.begin(), result.end(), term) == result.end()) {
|
||||||
|
result.push_back(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool XapWritableSynFamily::deleteMember(const string& membername)
|
bool XapWritableSynFamily::deleteMember(const string& membername)
|
||||||
{
|
{
|
||||||
string key = entryprefix(membername);
|
string key = entryprefix(membername);
|
||||||
@ -119,32 +123,61 @@ bool XapWritableSynFamily::createMember(const string& membername)
|
|||||||
m_wdb.add_synonym(memberskey(), membername);
|
m_wdb.add_synonym(memberskey(), membername);
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR(("XapSynFamily::createMember: xapian error %s\n", ermsg.c_str()));
|
LOGERR(("XapSynFamily::createMember: error: %s\n", ermsg.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool XapWritableSynFamily::addSynonyms(const string& membername,
|
bool XapComputableSynFamMember::synExpand(const string& term,
|
||||||
const string& term,
|
vector<string>& result,
|
||||||
const vector<string>& trans)
|
SynTermTrans *filtertrans)
|
||||||
{
|
{
|
||||||
string key = entryprefix(membername) + term;
|
string root = (*m_trans)(term);
|
||||||
|
string filter_root;
|
||||||
|
if (filtertrans)
|
||||||
|
filter_root = (*filtertrans)(term);
|
||||||
|
|
||||||
|
/* We could call XapSynFamily::synExpand() here instead of doing it
|
||||||
|
ourselves... */
|
||||||
|
string key = m_prefix + root;
|
||||||
|
|
||||||
|
LOGDEB(("XapCompSynFamMbr::synExpand([%s]): term [%s] root [%s] \n",
|
||||||
|
m_prefix.c_str(), term.c_str(), root.c_str()));
|
||||||
|
|
||||||
string ermsg;
|
string ermsg;
|
||||||
try {
|
try {
|
||||||
for (vector<string>::const_iterator it = trans.begin();
|
for (Xapian::TermIterator xit = m_family.getdb().synonyms_begin(key);
|
||||||
it != trans.end(); it++) {
|
xit != m_family.getdb().synonyms_end(key); xit++) {
|
||||||
m_wdb.add_synonym(key, *it);
|
if (!filtertrans || (*filtertrans)(*xit) == filter_root) {
|
||||||
|
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
|
||||||
|
result.push_back(*xit);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR(("XapSynFamily::addSynonyms: xapian error %s\n", ermsg.c_str()));
|
LOGERR(("XapSynDb::synExpand: error for term [%s] (key %s)\n",
|
||||||
|
term.c_str(), key.c_str()));
|
||||||
|
result.push_back(term);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the input term and root are not in the list, add them
|
||||||
|
if (find(result.begin(), result.end(), term) == result.end()) {
|
||||||
|
LOGDEB2((" Pushing %s\n", term.c_str()));
|
||||||
|
result.push_back(term);
|
||||||
|
}
|
||||||
|
if (root != term &&
|
||||||
|
find(result.begin(), result.end(), root) == result.end()) {
|
||||||
|
if (!filtertrans || (*filtertrans)(root) == filter_root) {
|
||||||
|
LOGDEB2((" Pushing %s\n", root.c_str()));
|
||||||
|
result.push_back(root);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // TEST_SYNFAMILY
|
#else // TEST_SYNFAMILY
|
||||||
@ -169,16 +202,16 @@ using namespace std;
|
|||||||
|
|
||||||
static string thisprog;
|
static string thisprog;
|
||||||
static int op_flags;
|
static int op_flags;
|
||||||
#define OPT_a 0x4
|
|
||||||
#define OPT_c 0x8
|
|
||||||
#define OPT_D 0x1
|
#define OPT_D 0x1
|
||||||
#define OPT_d 0x10
|
|
||||||
#define OPT_L 0x2
|
#define OPT_L 0x2
|
||||||
|
#define OPT_a 0x4
|
||||||
|
#define OPT_u 0x8
|
||||||
|
#define OPT_d 0x10
|
||||||
#define OPT_l 0x20
|
#define OPT_l 0x20
|
||||||
#define OPT_s 0x40
|
#define OPT_s 0x40
|
||||||
#define OPT_e 0x80
|
#define OPT_e 0x80
|
||||||
static string usage =
|
static string usage =
|
||||||
" -d <dbdir> {-s|-a|-c} database dir and synfamily: stem accents case\n"
|
" -d <dbdir> {-s|-a|-u} database dir and synfamily: stem accents/case ustem\n"
|
||||||
" -l : list members\n"
|
" -l : list members\n"
|
||||||
" -L <member>: list entries for given member\n"
|
" -L <member>: list entries for given member\n"
|
||||||
" -e <member> <key> : list expansion for given member and key\n"
|
" -e <member> <key> : list expansion for given member and key\n"
|
||||||
@ -209,7 +242,6 @@ int main(int argc, char **argv)
|
|||||||
while (**argv)
|
while (**argv)
|
||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
case 'a': op_flags |= OPT_a; break;
|
case 'a': op_flags |= OPT_a; break;
|
||||||
case 'c': op_flags |= OPT_c; break;
|
|
||||||
case 'D': op_flags |= OPT_D; break;
|
case 'D': op_flags |= OPT_D; break;
|
||||||
case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
|
case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
|
||||||
dbdir = *(++argv); argc--;
|
dbdir = *(++argv); argc--;
|
||||||
@ -223,6 +255,7 @@ int main(int argc, char **argv)
|
|||||||
member = *(++argv); argc--;
|
member = *(++argv); argc--;
|
||||||
goto b1;
|
goto b1;
|
||||||
case 's': op_flags |= OPT_s; break;
|
case 's': op_flags |= OPT_s; break;
|
||||||
|
case 'u': op_flags |= OPT_u; break;
|
||||||
default: Usage(); break;
|
default: Usage(); break;
|
||||||
}
|
}
|
||||||
b1: argc--; argv++;
|
b1: argc--; argv++;
|
||||||
@ -231,12 +264,11 @@ int main(int argc, char **argv)
|
|||||||
if (argc != 0)
|
if (argc != 0)
|
||||||
Usage();
|
Usage();
|
||||||
|
|
||||||
// We do stem only for now
|
|
||||||
string familyname;
|
string familyname;
|
||||||
if (op_flags & OPT_a) {
|
if (op_flags & OPT_a) {
|
||||||
familyname = Rcl::synFamDiac;
|
familyname = Rcl::synFamDiCa;
|
||||||
} else if (op_flags &OPT_c) {
|
} else if (op_flags & OPT_u) {
|
||||||
familyname = Rcl::synFamCase;
|
familyname = Rcl::synFamStemUnac;
|
||||||
} else {
|
} else {
|
||||||
familyname = Rcl::synFamStem;
|
familyname = Rcl::synFamStem;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,7 +42,7 @@
|
|||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
class XapSynFamily {
|
class XapSynFamily {
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Construct from readable xapian database and family name (ie: Stm)
|
* Construct from readable xapian database and family name (ie: Stm)
|
||||||
@ -53,38 +53,50 @@ public:
|
|||||||
m_prefix1 = std::string(":") + familyname;
|
m_prefix1 = std::string(":") + familyname;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Expand one term (e.g.: familier) inside one family number (e.g: french)
|
|
||||||
*/
|
|
||||||
virtual bool synExpand(const std::string& fammember,
|
|
||||||
const std::string& key,
|
|
||||||
std::vector<std::string>& result);
|
|
||||||
|
|
||||||
/** Retrieve all members of this family (e.g: french english german...) */
|
/** Retrieve all members of this family (e.g: french english german...) */
|
||||||
virtual bool getMembers(std::vector<std::string>&);
|
virtual bool getMembers(std::vector<std::string>&);
|
||||||
|
|
||||||
/** debug: list map for one member to stdout */
|
/** debug: list map for one member to stdout */
|
||||||
virtual bool listMap(const std::string& fam);
|
virtual bool listMap(const std::string& fam);
|
||||||
|
|
||||||
protected:
|
/** Expand term to list of synonyms for given member */
|
||||||
Xapian::Database m_rdb;
|
bool synExpand(const std::string& membername,
|
||||||
std::string m_prefix1;
|
const std::string& term, std::vector<std::string>& result);
|
||||||
|
|
||||||
|
// The prefix shared by all synonym entries inside a family member
|
||||||
virtual std::string entryprefix(const std::string& member)
|
virtual std::string entryprefix(const std::string& member)
|
||||||
{
|
{
|
||||||
return m_prefix1 + ":" + member + ":";
|
return m_prefix1 + ":" + member + ":";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The key for the "list of members" entry
|
||||||
virtual std::string memberskey()
|
virtual std::string memberskey()
|
||||||
{
|
{
|
||||||
return m_prefix1 + ";" + "members";
|
return m_prefix1 + ";" + "members";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Xapian::Database& getdb()
|
||||||
|
{
|
||||||
|
return m_rdb;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
Xapian::Database m_rdb;
|
||||||
|
std::string m_prefix1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Modify ops for a synonyms family
|
||||||
|
*
|
||||||
|
* A method to add a synonym entry inside a given member would make sense,
|
||||||
|
* but would not be used presently as all these ops go through
|
||||||
|
* ComputableSynFamMember objects
|
||||||
|
*/
|
||||||
class XapWritableSynFamily : public XapSynFamily {
|
class XapWritableSynFamily : public XapSynFamily {
|
||||||
public:
|
public:
|
||||||
/** Construct with Xapian db open for r/w */
|
/** Construct with Xapian db open for r/w */
|
||||||
XapWritableSynFamily(Xapian::WritableDatabase db, const std::string& pfx)
|
XapWritableSynFamily(Xapian::WritableDatabase db,
|
||||||
: XapSynFamily(db, pfx), m_wdb(db)
|
const std::string& familyname)
|
||||||
|
: XapSynFamily(db, familyname), m_wdb(db)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -95,36 +107,92 @@ public:
|
|||||||
/** Add to list of members. Idempotent, does not affect actual expansions */
|
/** Add to list of members. Idempotent, does not affect actual expansions */
|
||||||
virtual bool createMember(const std::string& membername);
|
virtual bool createMember(const std::string& membername);
|
||||||
|
|
||||||
/** Add expansion list for term inside family member (e.g., inside
|
Xapian::WritableDatabase getdb() {return m_wdb;}
|
||||||
* the english member, add expansion for floor -> floors, flooring.. */
|
|
||||||
virtual bool addSynonyms(const std::string& membername,
|
|
||||||
const std::string& term,
|
|
||||||
const std::vector<std::string>& trans);
|
|
||||||
|
|
||||||
// Need to call setCurrentMemberName before addSynonym !
|
protected:
|
||||||
// We don't check it, for speed
|
Xapian::WritableDatabase m_wdb;
|
||||||
virtual void setCurrentMemberName(const std::string& nm)
|
};
|
||||||
|
|
||||||
|
/** A functor which transforms a string */
|
||||||
|
class SynTermTrans {
|
||||||
|
public:
|
||||||
|
virtual std::string operator()(const std::string&) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** A member (set of root-synonyms associations) of a SynFamily for
|
||||||
|
* which the root is computable from the input term.
|
||||||
|
* The objects use a functor member to compute the term root on input
|
||||||
|
* (e.g. compute the term sterm or casefold it
|
||||||
|
*/
|
||||||
|
class XapComputableSynFamMember {
|
||||||
|
public:
|
||||||
|
XapComputableSynFamMember(Xapian::Database xdb, std::string familyname,
|
||||||
|
std::string membername, SynTermTrans* trans)
|
||||||
|
: m_family(xdb, familyname), m_membername(membername),
|
||||||
|
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
|
||||||
{
|
{
|
||||||
m_currentPrefix = entryprefix(nm);
|
|
||||||
}
|
}
|
||||||
virtual bool addSynonym(const std::string& term, const std::string& trans)
|
|
||||||
|
/** Expand a term to its list of synonyms. If filtertrans is set we
|
||||||
|
* keep only the results which transform to the same value as the input */
|
||||||
|
bool synExpand(const std::string& term, std::vector<std::string>& result,
|
||||||
|
SynTermTrans *filtertrans = 0);
|
||||||
|
|
||||||
|
private:
|
||||||
|
XapSynFamily m_family;
|
||||||
|
std::string m_membername;
|
||||||
|
SynTermTrans *m_trans;
|
||||||
|
std::string m_prefix;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Computable term root SynFamily member, modify ops */
|
||||||
|
class XapWritableComputableSynFamMember {
|
||||||
|
public:
|
||||||
|
XapWritableComputableSynFamMember(
|
||||||
|
Xapian::WritableDatabase xdb, std::string familyname,
|
||||||
|
std::string membername, SynTermTrans* trans)
|
||||||
|
: m_family(xdb, familyname), m_membername(membername),
|
||||||
|
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
|
||||||
{
|
{
|
||||||
std::string key = m_currentPrefix + term;
|
}
|
||||||
|
|
||||||
|
virtual bool addSynonym(const std::string& term)
|
||||||
|
{
|
||||||
|
LOGDEB2(("addSynonym:me %p term [%s] m_trans %p\n", this,
|
||||||
|
term.c_str(), m_trans));
|
||||||
|
std::string transformed = (*m_trans)(term);
|
||||||
|
LOGDEB2(("addSynonym: transformed [%s]\n", transformed.c_str()));
|
||||||
|
if (transformed == term)
|
||||||
|
return true;
|
||||||
|
|
||||||
std::string ermsg;
|
std::string ermsg;
|
||||||
try {
|
try {
|
||||||
m_wdb.add_synonym(key, trans);
|
m_family.getdb().add_synonym(m_prefix + transformed, term);
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR(("XapSynFamily::addSynonym: xapian error %s\n",
|
LOGERR(("XapWritableComputableSynFamMember::addSynonym: "
|
||||||
ermsg.c_str()));
|
"xapian error %s\n", ermsg.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
void clear()
|
||||||
Xapian::WritableDatabase m_wdb;
|
{
|
||||||
std::string m_currentPrefix;
|
m_family.deleteMember(m_membername);
|
||||||
|
}
|
||||||
|
|
||||||
|
void recreate()
|
||||||
|
{
|
||||||
|
clear();
|
||||||
|
m_family.createMember(m_membername);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
XapWritableSynFamily m_family;
|
||||||
|
std::string m_membername;
|
||||||
|
SynTermTrans *m_trans;
|
||||||
|
std::string m_prefix;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -133,11 +201,13 @@ protected:
|
|||||||
//
|
//
|
||||||
// Stem expansion family prefix. The family member name is the
|
// Stem expansion family prefix. The family member name is the
|
||||||
// language ("all" for Dia and Cse)
|
// language ("all" for Dia and Cse)
|
||||||
|
|
||||||
|
// Lowercase accented stem to expansion
|
||||||
static const std::string synFamStem("Stm");
|
static const std::string synFamStem("Stm");
|
||||||
static const std::string synFamDiac("Dia");
|
// Lowercase unaccented stem to expansion
|
||||||
static const std::string synFamCase("Cse");
|
static const std::string synFamStemUnac("StU");
|
||||||
|
// Lowercase unaccented term to case and accent variations
|
||||||
|
static const std::string synFamDiCa("DCa");
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* _SYNFAMILY_H_INCLUDED_ */
|
#endif /* _SYNFAMILY_H_INCLUDED_ */
|
||||||
|
|||||||
@ -35,7 +35,7 @@
|
|||||||
# Also reserved: F(parentid), Q(uniqueid)
|
# Also reserved: F(parentid), Q(uniqueid)
|
||||||
title = S ; wdfinc = 10
|
title = S ; wdfinc = 10
|
||||||
author = A
|
author = A
|
||||||
abstract =
|
abstract = XS
|
||||||
caption = S
|
caption = S
|
||||||
title = S
|
title = S
|
||||||
subject = S
|
subject = S
|
||||||
|
|||||||
@ -33,17 +33,12 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "hldata.h"
|
#include "hldata.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
|
||||||
using namespace std;
|
|
||||||
#endif /* NO_NAMESPACES */
|
|
||||||
|
|
||||||
#define MIN(A,B) ((A)<(B)?(A):(B))
|
|
||||||
|
|
||||||
int stringicmp(const string & s1, const string& s2)
|
int stringicmp(const string & s1, const string& s2)
|
||||||
{
|
{
|
||||||
string::const_iterator it1 = s1.begin();
|
string::const_iterator it1 = s1.begin();
|
||||||
|
|||||||
@ -224,4 +224,11 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifndef MIN
|
||||||
|
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
|
||||||
|
#endif
|
||||||
|
#ifndef MAX
|
||||||
|
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* _SMALLUT_H_INCLUDED_ */
|
#endif /* _SMALLUT_H_INCLUDED_ */
|
||||||
|
|||||||
@ -103,7 +103,7 @@ public:
|
|||||||
|
|
||||||
/** Append current utf-8 possibly multi-byte character to string param.
|
/** Append current utf-8 possibly multi-byte character to string param.
|
||||||
This needs to be fast. No error checking. */
|
This needs to be fast. No error checking. */
|
||||||
unsigned int appendchartostring(std::string &out) {
|
unsigned int appendchartostring(std::string &out) const {
|
||||||
#ifdef UTF8ITER_CHECK
|
#ifdef UTF8ITER_CHECK
|
||||||
assert(m_cl != 0);
|
assert(m_cl != 0);
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst
|
|||||||
daemloglevel = 6
|
daemloglevel = 6
|
||||||
daemlogfilename = /tmp/rclmontrace
|
daemlogfilename = /tmp/rclmontrace
|
||||||
|
|
||||||
|
indexStripChars = 1
|
||||||
|
|
||||||
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
||||||
|
|
||||||
skippedPaths = \
|
skippedPaths = \
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user