Apply stemming to terms containing a single dash (e.g. thread-safe, thread-safeness)

This commit is contained in:
Jean-Francois Dockes 2022-09-23 18:40:02 +02:00
parent 5087447ef6
commit b1b0a41973
2 changed files with 22 additions and 14 deletions

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004-2018 J.F.Dockes /* Copyright (C) 2004-2022 J.F.Dockes
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
@ -869,6 +869,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
/* Rcl::Db methods ///////////////////////////////// */ /* Rcl::Db methods ///////////////////////////////// */
bool Db::o_inPlaceReset; bool Db::o_inPlaceReset;
bool Db::o_nospell_chars[256];
Db::Db(const RclConfig *cfp) Db::Db(const RclConfig *cfp)
{ {
@ -885,6 +886,10 @@ Db::Db(const RclConfig *cfp)
start_of_field_term = "XXST/"; start_of_field_term = "XXST/";
end_of_field_term = "XXND/"; end_of_field_term = "XXND/";
} }
memset(o_nospell_chars, 0, sizeof(o_nospell_chars));
for (unsigned char c : " !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") {
o_nospell_chars[(unsigned int)c] = 1;
}
} }
m_ndb = new Native(this); m_ndb = new Native(this);
} }

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004 J.F.Dockes /* Copyright (C) 2004-2022 J.F.Dockes
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
@ -259,22 +259,26 @@ public:
return false; return false;
#endif #endif
} }
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
!= string::npos) // Most punctuation chars inhibit stemming. We accept one dash. See o_nospell_chars init in
return false; // the rcldb constructor.
int ccnt = 0;
for (unsigned char c : term) {
if (o_nospell_chars[(unsigned int)c] && (c != '-' || ++ccnt > 1))
return false;
}
return true; return true;
} }
/** Return spelling suggestion */ /** Return spelling suggestion */
bool getSpellingSuggestions(const string& word, bool getSpellingSuggestions(const string& word, std::vector<std::string>& suggs);
std::vector<std::string>& suggs);
/* The next two, only for searchdata, should be somehow hidden */ /* The next two, only for searchdata, should be somehow hidden */
/* Return configured stop words */ /* Return configured stop words */
const StopList& getStopList() const {return m_stops;} const StopList& getStopList() const {return m_stops;}
/* Field name to prefix translation (ie: author -> 'A') */ /* Field name to prefix translation (ie: author -> 'A') */
bool fieldToTraits(const string& fldname, const FieldTraits **ftpp, bool fieldToTraits(const string& fldname, const FieldTraits **ftpp, bool isquery = false);
bool isquery = false);
/* Update-related methods ******************************************/ /* Update-related methods ******************************************/
@ -422,9 +426,8 @@ public:
int matchTypeTp(int tp) { int matchTypeTp(int tp) {
return tp & 7; return tp & 7;
} }
bool termMatch(int typ_sens, const string &lang, const string &term, bool termMatch(int typ_sens, const string &lang, const string &term, TermMatchResult& result,
TermMatchResult& result, int max = -1, int max = -1, const string& field = "", vector<string> *multiwords = 0);
const string& field = "", vector<string> *multiwords = 0);
bool dbStats(DbStats& stats, bool listFailed); bool dbStats(DbStats& stats, bool listFailed);
/** Return min and max years for doc mod times in db */ /** Return min and max years for doc mod times in db */
bool maxYearSpan(int *minyear, int *maxyear); bool maxYearSpan(int *minyear, int *maxyear);
@ -521,8 +524,7 @@ public:
/** Test term existence */ /** Test term existence */
bool termExists(const string& term); bool termExists(const string& term);
/** Test if terms stem to different roots. */ /** Test if terms stem to different roots. */
bool stemDiffers(const string& lang, const string& term, bool stemDiffers(const string& lang, const string& term, const string& base);
const string& base);
const RclConfig *getConf() {return m_config;} const RclConfig *getConf() {return m_config;}
@ -620,6 +622,7 @@ private:
// beginning, with the advantage that, for small index formats updates, // beginning, with the advantage that, for small index formats updates,
// between releases the index remains available while being recreated. // between releases the index remains available while being recreated.
static bool o_inPlaceReset; static bool o_inPlaceReset;
static bool o_nospell_chars[256];
/******* End logical constness */ /******* End logical constness */
#ifdef IDX_THREADS #ifdef IDX_THREADS