diff --git a/src/aspell/rclaspell.cpp b/src/aspell/rclaspell.cpp index 794a741f..0874118d 100644 --- a/src/aspell/rclaspell.cpp +++ b/src/aspell/rclaspell.cpp @@ -254,14 +254,12 @@ public: {} void newData() { while (m_db.termWalkNext(m_tit, *m_input)) { - // Filter out terms beginning with upper case (special stuff) and - // containing numbers, or too long. Note that the 50 limit is a - // byte count, so not so high if there are multibyte chars. - if (m_input->empty() || m_input->length() > 50) + // Prefixed terms are also somewhere else without the suffix, + // skip them + if (m_input->empty() || + ('A' <= m_input->at(0) && m_input->at(0) <= 'Z')) continue; - if ('A' <= m_input->at(0) && m_input->at(0) <= 'Z') - continue; - if (m_input->find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") != string::npos) + if (!Rcl::Db::isSpellingCandidate(*m_input)) continue; // Got a non-empty sort-of appropriate term, let's send it to // aspell diff --git a/src/qtgui/spell_w.cpp b/src/qtgui/spell_w.cpp index b8aeb85e..91d955cb 100644 --- a/src/qtgui/spell_w.cpp +++ b/src/qtgui/spell_w.cpp @@ -163,6 +163,13 @@ void SpellW::doExpand() for (list::const_iterator it = suggs.begin(); it != suggs.end(); it++) res.entries.push_back(Rcl::TermMatchEntry(*it)); +#ifdef TESTING_XAPIAN_SPELL + string rclsugg = rcldb->getSpellingSuggestion(expr); + if (!rclsugg.empty()) { + res.entries.push_back(Rcl::TermMatchEntry("Xapian spelling:")); + res.entries.push_back(Rcl::TermMatchEntry(rclsugg)); + } +#endif // TESTING_XAPIAN_SPELL } #endif } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 03da775d..3077d1d8 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -83,6 +83,40 @@ string version_string(){ // found in document) static const string rclSyntAbs("?!#@"); +// Only ONE field name inside the index data record differs from the +// Rcl::Doc ones: caption<->title, for a remnant of compatibility with +// omega +static const string keycap("caption"); + +// Default table for field->prefix translation. We prefer the data +// from rclconfig if available. Note that this is logically const +// after initialization. Can't use a static object to init this as +// the static std::string objects may not be ready +static map fldToPrefs; +static void initFldToPrefs() +{ + fldToPrefs[Doc::keyabs] = string(); + fldToPrefs["ext"] = "XE"; + fldToPrefs[Doc::keyfn] = "XSFN"; + + fldToPrefs[keycap] = "S"; + fldToPrefs[Doc::keytt] = "S"; + fldToPrefs["subject"] = "S"; + + fldToPrefs[Doc::keyau] = "A"; + fldToPrefs["creator"] = "A"; + fldToPrefs["from"] = "A"; + + fldToPrefs[Doc::keykw] = "K"; + fldToPrefs["keyword"] = "K"; + fldToPrefs["tag"] = "K"; + fldToPrefs["tags"] = "K"; + + fldToPrefs["xapyear"] = "Y"; + fldToPrefs["xapyearmon"] = "M"; + fldToPrefs["xapdate"] = "D"; +} + // Compute the unique term used to link documents to their origin. // "Q" + external udi static inline string make_uniterm(const string& udi) @@ -131,11 +165,6 @@ bool Db::Native::subDocs(const string &udi, vector& docids) } } -// Only ONE field name inside the index data record differs from the -// Rcl::Doc ones: caption<->title, for a remnant of compatibility with -// omega -static const string keycap("caption"); - // Turn data record from db into document fields bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc) @@ -510,6 +539,9 @@ Db::Db(RclConfig *cfp) m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1), m_maxFsOccupPc(0), m_mode(Db::DbRO) { + if (!fldToPrefs.size()) + initFldToPrefs(); + m_ndb = new Native(this); if (m_config) { m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc); @@ -759,39 +791,6 @@ bool Db::isopen() // reason (old config not updated ?). We use it only if the config // translation fails. Also we add in there fields which should be // indexed with no prefix (ie: abstract) - -// Default table. We prefer the data from rclconfig if available. Note -// that it is logically const after initialization. This would be -// simpler with c0xx initializer lists. -static map fldToPrefs; -class InitFldToPrefs { -public: - InitFldToPrefs() - { - fldToPrefs[Doc::keyabs] = string(); - fldToPrefs["ext"] = "XE"; - fldToPrefs[Doc::keyfn] = "XSFN"; - - fldToPrefs[keycap] = "S"; - fldToPrefs[Doc::keytt] = "S"; - fldToPrefs["subject"] = "S"; - - fldToPrefs[Doc::keyau] = "A"; - fldToPrefs["creator"] = "A"; - fldToPrefs["from"] = "A"; - - fldToPrefs[Doc::keykw] = "K"; - fldToPrefs["keyword"] = "K"; - fldToPrefs["tag"] = "K"; - fldToPrefs["tags"] = "K"; - - fldToPrefs["xapyear"] = "Y"; - fldToPrefs["xapyearmon"] = "M"; - fldToPrefs["xapdate"] = "D"; - } -}; -static InitFldToPrefs IFTP; - bool Db::fieldToPrefix(const string& fld, string &pfx) { if (m_config && m_config->getFieldPrefix(fld, pfx)) @@ -810,13 +809,15 @@ bool Db::fieldToPrefix(const string& fld, string &pfx) // The splitter breaks text into words and adds postings to the Xapian document. class TextSplitDb : public TextSplit { public: + Xapian::WritableDatabase db; Xapian::Document &doc; // Xapian document Xapian::termpos basepos; // Base for document section Xapian::termpos curpos; // Current position. Used to set basepos for the // following section StopList &stops; - TextSplitDb(Xapian::Document &d, StopList &_stops) - : doc(d), basepos(1), curpos(0), stops(_stops) + TextSplitDb(Xapian::WritableDatabase idb, + Xapian::Document &d, StopList &_stops) + : db(idb), doc(d), basepos(1), curpos(0), stops(_stops) {} bool takeword(const std::string &term, int pos, int, int); void setprefix(const string& pref) {prefix = pref;} @@ -856,6 +857,11 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int) // be possible to assign different weigths to doc parts (ie title) // by using a higher value doc.add_posting(term, pos, 1); +#ifdef TESTING_XAPIAN_SPELL + if (Db::isSpellingCandidate(term)) { + db.add_spelling(term); + } +#endif if (!prefix.empty()) { doc.add_posting(prefix + term, pos, 1); } @@ -865,6 +871,22 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int) return false; } +#ifdef TESTING_XAPIAN_SPELL +string Db::getSpellingSuggestion(const string& word) +{ + if (m_ndb == 0) + return string(); + string term; + if (!unacmaybefold(word, term, "UTF-8", true)) { + LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str())); + return string(); + } + if (!isSpellingCandidate(term)) + return string(); + return m_ndb->xrdb.get_spelling_suggestion(term); +} +#endif + // Let our user set the parameters for abstract processing void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen) { @@ -911,7 +933,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc doc = idoc; Xapian::Document newdocument; - TextSplitDb splitter(newdocument, m_stops); + TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops); // Split and index file name as document term(s) LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str())); diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index f09ae8cc..eff7b8ec 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -106,9 +106,26 @@ class Db { /** List possible stemmer names */ static list getStemmerNames(); + /** Test word for spelling correction candidate: not too long, no + special chars... */ + static bool isSpellingCandidate(const string& term) + { + if (term.empty() || term.length() > 50) + return false; + if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") + != string::npos) + return false; + return true; + } + /** List existing stemming databases */ std::list getStemLangs(); +#ifdef TESTING_XAPIAN_SPELL + /** Return spelling suggestion */ + string getSpellingSuggestion(const string& word); +#endif + /* The next two, only for searchdata, should be somehow hidden */ /* Return list of configured stop words */ const StopList& getStopList() const {return m_stops;}