Experiment with Xapian spelling support (not ready yet); also work around some static initialization issues showing up on the Mac
This commit is contained in:
parent
cb884a9cc4
commit
08a65f5cfc
@ -254,14 +254,12 @@ public:
|
|||||||
{}
|
{}
|
||||||
void newData() {
|
void newData() {
|
||||||
while (m_db.termWalkNext(m_tit, *m_input)) {
|
while (m_db.termWalkNext(m_tit, *m_input)) {
|
||||||
// Filter out terms beginning with upper case (special stuff) and
|
// Prefixed terms are also present elsewhere without the prefix,
|
||||||
// containing numbers, or too long. Note that the 50 limit is a
|
// skip them
|
||||||
// byte count, so not so high if there are multibyte chars.
|
if (m_input->empty() ||
|
||||||
if (m_input->empty() || m_input->length() > 50)
|
('A' <= m_input->at(0) && m_input->at(0) <= 'Z'))
|
||||||
continue;
|
continue;
|
||||||
if ('A' <= m_input->at(0) && m_input->at(0) <= 'Z')
|
if (!Rcl::Db::isSpellingCandidate(*m_input))
|
||||||
continue;
|
|
||||||
if (m_input->find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") != string::npos)
|
|
||||||
continue;
|
continue;
|
||||||
// Got a non-empty sort-of appropriate term, let's send it to
|
// Got a non-empty sort-of appropriate term, let's send it to
|
||||||
// aspell
|
// aspell
|
||||||
|
|||||||
@ -163,6 +163,13 @@ void SpellW::doExpand()
|
|||||||
for (list<string>::const_iterator it = suggs.begin();
|
for (list<string>::const_iterator it = suggs.begin();
|
||||||
it != suggs.end(); it++)
|
it != suggs.end(); it++)
|
||||||
res.entries.push_back(Rcl::TermMatchEntry(*it));
|
res.entries.push_back(Rcl::TermMatchEntry(*it));
|
||||||
|
#ifdef TESTING_XAPIAN_SPELL
|
||||||
|
string rclsugg = rcldb->getSpellingSuggestion(expr);
|
||||||
|
if (!rclsugg.empty()) {
|
||||||
|
res.entries.push_back(Rcl::TermMatchEntry("Xapian spelling:"));
|
||||||
|
res.entries.push_back(Rcl::TermMatchEntry(rclsugg));
|
||||||
|
}
|
||||||
|
#endif // TESTING_XAPIAN_SPELL
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
@ -83,6 +83,40 @@ string version_string(){
|
|||||||
// found in document)
|
// found in document)
|
||||||
static const string rclSyntAbs("?!#@");
|
static const string rclSyntAbs("?!#@");
|
||||||
|
|
||||||
|
// Only ONE field name inside the index data record differs from the
|
||||||
|
// Rcl::Doc ones: caption<->title, for a remnant of compatibility with
|
||||||
|
// omega
|
||||||
|
static const string keycap("caption");
|
||||||
|
|
||||||
|
// Default table for field->prefix translation. We prefer the data
|
||||||
|
// from rclconfig if available. Note that this is logically const
|
||||||
|
// after initialization. Can't use a static object to init this as
|
||||||
|
// the static std::string objects may not be ready
|
||||||
|
static map<string, string> fldToPrefs;
|
||||||
|
static void initFldToPrefs()
|
||||||
|
{
|
||||||
|
fldToPrefs[Doc::keyabs] = string();
|
||||||
|
fldToPrefs["ext"] = "XE";
|
||||||
|
fldToPrefs[Doc::keyfn] = "XSFN";
|
||||||
|
|
||||||
|
fldToPrefs[keycap] = "S";
|
||||||
|
fldToPrefs[Doc::keytt] = "S";
|
||||||
|
fldToPrefs["subject"] = "S";
|
||||||
|
|
||||||
|
fldToPrefs[Doc::keyau] = "A";
|
||||||
|
fldToPrefs["creator"] = "A";
|
||||||
|
fldToPrefs["from"] = "A";
|
||||||
|
|
||||||
|
fldToPrefs[Doc::keykw] = "K";
|
||||||
|
fldToPrefs["keyword"] = "K";
|
||||||
|
fldToPrefs["tag"] = "K";
|
||||||
|
fldToPrefs["tags"] = "K";
|
||||||
|
|
||||||
|
fldToPrefs["xapyear"] = "Y";
|
||||||
|
fldToPrefs["xapyearmon"] = "M";
|
||||||
|
fldToPrefs["xapdate"] = "D";
|
||||||
|
}
|
||||||
|
|
||||||
// Compute the unique term used to link documents to their origin.
|
// Compute the unique term used to link documents to their origin.
|
||||||
// "Q" + external udi
|
// "Q" + external udi
|
||||||
static inline string make_uniterm(const string& udi)
|
static inline string make_uniterm(const string& udi)
|
||||||
@ -131,11 +165,6 @@ bool Db::Native::subDocs(const string &udi, vector<Xapian::docid>& docids)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only ONE field name inside the index data record differs from the
|
|
||||||
// Rcl::Doc ones: caption<->title, for a remnant of compatibility with
|
|
||||||
// omega
|
|
||||||
static const string keycap("caption");
|
|
||||||
|
|
||||||
// Turn data record from db into document fields
|
// Turn data record from db into document fields
|
||||||
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
||||||
Doc &doc)
|
Doc &doc)
|
||||||
@ -510,6 +539,9 @@ Db::Db(RclConfig *cfp)
|
|||||||
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
||||||
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
||||||
{
|
{
|
||||||
|
if (!fldToPrefs.size())
|
||||||
|
initFldToPrefs();
|
||||||
|
|
||||||
m_ndb = new Native(this);
|
m_ndb = new Native(this);
|
||||||
if (m_config) {
|
if (m_config) {
|
||||||
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
||||||
@ -759,39 +791,6 @@ bool Db::isopen()
|
|||||||
// reason (old config not updated ?). We use it only if the config
|
// reason (old config not updated ?). We use it only if the config
|
||||||
// translation fails. Also we add in there fields which should be
|
// translation fails. Also we add in there fields which should be
|
||||||
// indexed with no prefix (ie: abstract)
|
// indexed with no prefix (ie: abstract)
|
||||||
|
|
||||||
// Default table. We prefer the data from rclconfig if available. Note
|
|
||||||
// that it is logically const after initialization. This would be
|
|
||||||
// simpler with c0xx initializer lists.
|
|
||||||
static map<string, string> fldToPrefs;
|
|
||||||
class InitFldToPrefs {
|
|
||||||
public:
|
|
||||||
InitFldToPrefs()
|
|
||||||
{
|
|
||||||
fldToPrefs[Doc::keyabs] = string();
|
|
||||||
fldToPrefs["ext"] = "XE";
|
|
||||||
fldToPrefs[Doc::keyfn] = "XSFN";
|
|
||||||
|
|
||||||
fldToPrefs[keycap] = "S";
|
|
||||||
fldToPrefs[Doc::keytt] = "S";
|
|
||||||
fldToPrefs["subject"] = "S";
|
|
||||||
|
|
||||||
fldToPrefs[Doc::keyau] = "A";
|
|
||||||
fldToPrefs["creator"] = "A";
|
|
||||||
fldToPrefs["from"] = "A";
|
|
||||||
|
|
||||||
fldToPrefs[Doc::keykw] = "K";
|
|
||||||
fldToPrefs["keyword"] = "K";
|
|
||||||
fldToPrefs["tag"] = "K";
|
|
||||||
fldToPrefs["tags"] = "K";
|
|
||||||
|
|
||||||
fldToPrefs["xapyear"] = "Y";
|
|
||||||
fldToPrefs["xapyearmon"] = "M";
|
|
||||||
fldToPrefs["xapdate"] = "D";
|
|
||||||
}
|
|
||||||
};
|
|
||||||
static InitFldToPrefs IFTP;
|
|
||||||
|
|
||||||
bool Db::fieldToPrefix(const string& fld, string &pfx)
|
bool Db::fieldToPrefix(const string& fld, string &pfx)
|
||||||
{
|
{
|
||||||
if (m_config && m_config->getFieldPrefix(fld, pfx))
|
if (m_config && m_config->getFieldPrefix(fld, pfx))
|
||||||
@ -810,13 +809,15 @@ bool Db::fieldToPrefix(const string& fld, string &pfx)
|
|||||||
// The splitter breaks text into words and adds postings to the Xapian document.
|
// The splitter breaks text into words and adds postings to the Xapian document.
|
||||||
class TextSplitDb : public TextSplit {
|
class TextSplitDb : public TextSplit {
|
||||||
public:
|
public:
|
||||||
|
Xapian::WritableDatabase db;
|
||||||
Xapian::Document &doc; // Xapian document
|
Xapian::Document &doc; // Xapian document
|
||||||
Xapian::termpos basepos; // Base for document section
|
Xapian::termpos basepos; // Base for document section
|
||||||
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
||||||
// following section
|
// following section
|
||||||
StopList &stops;
|
StopList &stops;
|
||||||
TextSplitDb(Xapian::Document &d, StopList &_stops)
|
TextSplitDb(Xapian::WritableDatabase idb,
|
||||||
: doc(d), basepos(1), curpos(0), stops(_stops)
|
Xapian::Document &d, StopList &_stops)
|
||||||
|
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops)
|
||||||
{}
|
{}
|
||||||
bool takeword(const std::string &term, int pos, int, int);
|
bool takeword(const std::string &term, int pos, int, int);
|
||||||
void setprefix(const string& pref) {prefix = pref;}
|
void setprefix(const string& pref) {prefix = pref;}
|
||||||
@ -856,6 +857,11 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
|
|||||||
// be possible to assign different weights to doc parts (ie title)
|
// be possible to assign different weights to doc parts (ie title)
|
||||||
// by using a higher value
|
// by using a higher value
|
||||||
doc.add_posting(term, pos, 1);
|
doc.add_posting(term, pos, 1);
|
||||||
|
#ifdef TESTING_XAPIAN_SPELL
|
||||||
|
if (Db::isSpellingCandidate(term)) {
|
||||||
|
db.add_spelling(term);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
if (!prefix.empty()) {
|
if (!prefix.empty()) {
|
||||||
doc.add_posting(prefix + term, pos, 1);
|
doc.add_posting(prefix + term, pos, 1);
|
||||||
}
|
}
|
||||||
@ -865,6 +871,22 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef TESTING_XAPIAN_SPELL
|
||||||
|
// Ask the Xapian index for a spelling suggestion for a word.
//
// The word is first transformed with unacmaybefold() (presumably accent
// removal and case folding -- confirm against its declaration), then
// checked with isSpellingCandidate() before Xapian is queried.
//
// @param word the word to get a suggestion for
// @return the suggestion returned by Xapian, or an empty string if there
//   is no native db object, the transformation fails, the resulting term
//   is not a spelling candidate, or Xapian reports an error.
string Db::getSpellingSuggestion(const string& word)
{
    if (m_ndb == 0)
        return string();

    string term;
    if (!unacmaybefold(word, term, "UTF-8", true)) {
        LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
        return string();
    }
    if (!isSpellingCandidate(term))
        return string();

    // Xapian signals failures by throwing. Callers only expect a
    // (possibly empty) string, so turn an error into "no suggestion"
    // instead of letting the exception escape.
    try {
        return m_ndb->xrdb.get_spelling_suggestion(term);
    } catch (const Xapian::Error& e) {
        LOGINFO(("Db::getSpelling: xapian error for [%s]: %s\n",
                 term.c_str(), e.get_msg().c_str()));
    }
    return string();
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Let our user set the parameters for abstract processing
|
// Let our user set the parameters for abstract processing
|
||||||
void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
|
void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
|
||||||
{
|
{
|
||||||
@ -911,7 +933,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
Doc doc = idoc;
|
Doc doc = idoc;
|
||||||
|
|
||||||
Xapian::Document newdocument;
|
Xapian::Document newdocument;
|
||||||
TextSplitDb splitter(newdocument, m_stops);
|
TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
|
||||||
|
|
||||||
// Split and index file name as document term(s)
|
// Split and index file name as document term(s)
|
||||||
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
||||||
|
|||||||
@ -106,9 +106,26 @@ class Db {
|
|||||||
/** List possible stemmer names */
|
/** List possible stemmer names */
|
||||||
static list<string> getStemmerNames();
|
static list<string> getStemmerNames();
|
||||||
|
|
||||||
|
/** Decide whether a term is worth considering for spelling correction.
 *
 * A term is rejected if it is empty, if its length exceeds 50 (this is a
 * byte count, so fewer characters when multibyte ones are present), or
 * if it contains a space, a digit or one of the ASCII punctuation
 * characters listed below.
 *
 * @param term the term to test
 * @return true if the term is an acceptable candidate, else false
 */
static bool isSpellingCandidate(const std::string& term)
{
    // Any of these characters disqualifies the term
    const char *const rejected =
        " !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~";

    const std::string::size_type len = term.length();
    if (len == 0 || len > 50)
        return false;
    return term.find_first_of(rejected) == std::string::npos;
}
|
||||||
|
|
||||||
/** List existing stemming databases */
|
/** List existing stemming databases */
|
||||||
std::list<std::string> getStemLangs();
|
std::list<std::string> getStemLangs();
|
||||||
|
|
||||||
|
#ifdef TESTING_XAPIAN_SPELL
|
||||||
|
/** Return spelling suggestion */
|
||||||
|
string getSpellingSuggestion(const string& word);
|
||||||
|
#endif
|
||||||
|
|
||||||
/* The next two, only for searchdata, should be somehow hidden */
|
/* The next two, only for searchdata, should be somehow hidden */
|
||||||
/* Return list of configured stop words */
|
/* Return list of configured stop words */
|
||||||
const StopList& getStopList() const {return m_stops;}
|
const StopList& getStopList() const {return m_stops;}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user