From 9661a4431e15e335cff0b2092cf2d2dead76b786 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 18 Apr 2017 14:39:12 +0200 Subject: [PATCH] wen --- src/Makefile.am | 1 + src/common/textsplit.cpp | 13 +++- src/common/textsplit.h | 9 +-- src/qtgui/main.cpp | 19 ------ src/qtgui/recoll.h | 5 -- src/qtgui/recoll.pro.in | 2 +- src/qtgui/reslist.cpp | 40 +++--------- src/qtgui/spell_w.cpp | 49 +++------------ src/qtgui/spell_w.h | 8 +-- src/rcldb/rcldb.cpp | 101 ++++++++++++++++++++++++------- src/rcldb/rcldb.h | 22 ++++--- website/pages/recoll-windows.txt | 19 +++--- 12 files changed, 147 insertions(+), 141 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index f24534fe..711775c4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -32,6 +32,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \ $(X_CFLAGS) \ -DRECOLL_DATADIR=\"${pkgdatadir}\" \ -D_GNU_SOURCE \ + -DTESTING_XAPIAN_SPELL \ $(DEFS) ACLOCAL_AMFLAGS = -I m4 diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 7b797421..58c73589 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -197,7 +197,9 @@ static inline int whatcc(unsigned int c) #define UNICODE_IS_CJK(p) \ ((p) > 127 && \ (((p) >= 0x2E80 && (p) <= 0x2EFF) || \ - ((p) >= 0x3000 && (p) <= 0x9FFF) || \ + ((p) >= 0x3000 && (p) <= 0x309F) || \ + ((p) >= 0x3100 && (p) <= 0x31EF) || \ + ((p) >= 0x3200 && (p) <= 0x9FFF) || \ ((p) >= 0xA700 && (p) <= 0xA71F) || \ ((p) >= 0xAC00 && (p) <= 0xD7AF) || \ ((p) >= 0xF900 && (p) <= 0xFAFF) || \ @@ -206,10 +208,19 @@ static inline int whatcc(unsigned int c) ((p) >= 0x20000 && (p) <= 0x2A6DF) || \ ((p) >= 0x2F800 && (p) <= 0x2FA1F))) +#define UNICODE_IS_KATAKANA(p) \ + ((p) > 127 && \ + (((p) >= 0x30A0 && (p) <= 0x30FF) || \ + ((p) >= 0x31F0 && (p) <= 0x31FF))) + bool TextSplit::isCJK(int c) { return UNICODE_IS_CJK(c); } +bool TextSplit::isKATAKANA(int c) +{ + return UNICODE_IS_KATAKANA(c); +} bool TextSplit::o_processCJK = true; unsigned int TextSplit::o_CJKNgramLen = 2; diff --git a/src/common/textsplit.h b/src/common/textsplit.h index d408bb2e..b68d9430 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -92,8 +92,7 @@ public: /** Called when we encounter formfeed \f 0x0c. Override to use the event. * Mostly or exclusively used with pdftoxx output. Other filters mostly * just don't know about pages. */ - virtual void newpage(int /*pos*/) - { + virtual void newpage(int /*pos*/) { } // Static utility functions: @@ -111,10 +110,12 @@ public: * non-utf-8 input (iso-8859 config files work ok). This hopefully * handles all Unicode whitespace, but needs correct utf-8 input */ - static bool stringToStrings(const std::string &s, std::vector &tokens); + static bool stringToStrings(const std::string &s, + std::vector &tokens); - /** Is char CJK ? */ + /** Is char CJK ? (excluding Katakana) */ static bool isCJK(int c); + static bool isKATAKANA(int c); /** Statistics about word length (average and dispersion) can * detect bad data like undecoded base64 or other mis-identified diff --git a/src/qtgui/main.cpp b/src/qtgui/main.cpp index e3ba48f1..1a39b72a 100644 --- a/src/qtgui/main.cpp +++ b/src/qtgui/main.cpp @@ -40,9 +40,6 @@ #include "rclmain_w.h" #include "ssearch_w.h" #include "guiutils.h" -#ifdef RCL_USE_ASPELL -#include "rclaspell.h" -#endif #include "smallut.h" #include "readfile.h" @@ -83,9 +80,6 @@ void deleteAllTempFiles() Rcl::Db *rcldb; -#ifdef RCL_USE_ASPELL -Aspell *aspell; -#endif int recollNeedsExit; RclMain *mainWindow; @@ -158,10 +152,6 @@ static void recollCleanup() deleteAllTempFiles(); -#ifdef RCL_USE_ASPELL - deleteZ(aspell); -#endif - LOGDEB2("recollCleanup: done\n" ); } @@ -322,15 +312,6 @@ int main(int argc, char **argv) // fprintf(stderr, "Translations installed\n"); -#ifdef RCL_USE_ASPELL - aspell = new Aspell(theconfig); - aspell->init(reason); - if (!aspell || !aspell->ok()) { - LOGDEB("Aspell speller creation failed " << (reason) << "\n" ); - aspell = 0; - } -#endif - string historyfile = path_cat(theconfig->getConfDir(), "history"); g_dynconf = new RclDynConf(historyfile); if (!g_dynconf || !g_dynconf->ok()) { diff --git a/src/qtgui/recoll.h b/src/qtgui/recoll.h index 7ee0dfe9..c3fb1bd3 100644 --- a/src/qtgui/recoll.h +++ b/src/qtgui/recoll.h @@ -46,11 +46,6 @@ extern void startManual(const string& helpindex); extern void applyStyleSheet(const QString&); -#ifdef RCL_USE_ASPELL -class Aspell; -extern Aspell *aspell; -#endif - inline std::string qs2utf8s(const QString& qs) { return std::string((const char *)qs.toUtf8()); diff --git a/src/qtgui/recoll.pro.in b/src/qtgui/recoll.pro.in index c58105f0..e8dc3345 100644 --- a/src/qtgui/recoll.pro.in +++ b/src/qtgui/recoll.pro.in @@ -2,7 +2,7 @@ TEMPLATE = app LANGUAGE = C++ VPATH = @srcdir@ -DEFINES += BUILDING_RECOLL +DEFINES += BUILDING_RECOLL TESTING_XAPIAN_SPELL @QMAKE_ENABLE_WEBKIT@ QT += webkit @QMAKE_DISABLE_WEBKIT@ QMAKE_CXXFLAGS += -DRESLIST_TEXTBROWSER -DSNIPPETS_TEXTBROWSER diff --git a/src/qtgui/reslist.cpp b/src/qtgui/reslist.cpp index a7c1448f..3368c143 100644 --- a/src/qtgui/reslist.cpp +++ b/src/qtgui/reslist.cpp @@ -55,9 +55,6 @@ #include "reslist.h" #include "moc_reslist.cpp" #include "rclhelp.h" -#ifdef RCL_USE_ASPELL -#include "rclaspell.h" -#endif #include "appformime.h" #include "respopup.h" @@ -201,53 +198,36 @@ void QtGuiResListPager::suggest(const vectoruterms, map >& sugg) { sugg.clear(); -#ifdef RCL_USE_ASPELL - bool noaspell = false; - theconfig->getConfParam("noaspell", &noaspell); - if (noaspell) - return; - if (!aspell) { - LOGERR("QtGuiResListPager:: aspell not initialized\n" ); - return; - } - bool issimple = m_reslist && m_reslist->m_rclmain && m_reslist->m_rclmain->lastSearchSimple(); - for (vector::const_iterator uit = uterms.begin(); - uit != uterms.end(); uit++) { - list asuggs; - string reason; + for (const auto& uit : uterms) { + vector tsuggs; // If the term is in the dictionary, Aspell::suggest won't // list alternatives. In fact we may want to check the // frequencies and propose something anyway if a possible // variation is much more common (as google does) ? - if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) { - LOGERR("QtGuiResListPager::suggest: aspell failed: " << (reason) << "\n" ); + if (!rcldb->getSpellingSuggestions(uit, tsuggs)) { continue; } - // We should check that the term stems differently from the // base word (else it's not useful to expand the search). Or // is it ? This should depend if stemming is turned on or not - if (!asuggs.empty()) { - sugg[*uit] = vector(asuggs.begin(), asuggs.end()); - if (sugg[*uit].size() > 5) - sugg[*uit].resize(5); + if (!tsuggs.empty()) { + sugg[uit] = vector(tsuggs.begin(), tsuggs.end()); + if (sugg[uit].size() > 5) + sugg[uit].resize(5); // Set up the links as a . - for (vector::iterator it = sugg[*uit].begin(); - it != sugg[*uit].end(); it++) { + for (auto& it : sugg[uit]) { if (issimple) { - *it = string("" + - *it + ""; + it = string("" + + it + ""; } } } } -#endif - } string QtGuiResListPager::iconUrl(RclConfig *config, Rcl::Doc& doc) diff --git a/src/qtgui/spell_w.cpp b/src/qtgui/spell_w.cpp index 4d743f6e..7cbe5b7d 100644 --- a/src/qtgui/spell_w.cpp +++ b/src/qtgui/spell_w.cpp @@ -47,10 +47,6 @@ #include "execmd.h" #include "indexer.h" -#ifdef RCL_USE_ASPELL -#include "rclaspell.h" -#endif - using std::list; using std::multimap; using std::string; @@ -64,14 +60,8 @@ void SpellW::init() m_c2t.push_back(TYPECMB_REG); expTypeCMB->addItem(tr("Stem expansion")); m_c2t.push_back(TYPECMB_STEM); -#ifdef RCL_USE_ASPELL - bool noaspell = false; - theconfig->getConfParam("noaspell", &noaspell); - if (!noaspell) { - expTypeCMB->addItem(tr("Spelling/Phonetic")); - m_c2t.push_back(TYPECMB_ASPELL); - } -#endif + expTypeCMB->addItem(tr("Spelling/Phonetic")); + m_c2t.push_back(TYPECMB_SPELL); expTypeCMB->addItem(tr("Show index statistics")); m_c2t.push_back(TYPECMB_STATS); @@ -189,37 +179,19 @@ void SpellW::doExpand() break; -#ifdef RCL_USE_ASPELL - case TYPECMB_ASPELL: + case TYPECMB_SPELL: { - LOGDEB("SpellW::doExpand: aspelling\n" ); - if (!aspell) { - QMessageBox::warning(0, "Recoll", - tr("Aspell init failed. " - "Aspell not installed?")); - LOGDEB("SpellW::doExpand: aspell init error\n" ); - return; + LOGDEB("SpellW::doExpand: spelling [" << expr << "]\n" ); + vector suggs; + if (!rcldb->getSpellingSuggestions(expr, suggs)) { + QMessageBox::warning(0, "Recoll", tr("Spell expansion error. ")); } - list suggs; - if (!aspell->suggest(*rcldb, expr, suggs, reason)) { - QMessageBox::warning(0, "Recoll", - tr("Aspell expansion error. ")); - LOGERR("SpellW::doExpand:suggest failed: " << (reason) << "\n" ); - } - for (list::const_iterator it = suggs.begin(); - it != suggs.end(); it++) - res.entries.push_back(Rcl::TermMatchEntry(*it)); -#ifdef TESTING_XAPIAN_SPELL - string rclsugg = rcldb->getSpellingSuggestion(expr); - if (!rclsugg.empty()) { - res.entries.push_back(Rcl::TermMatchEntry("Xapian spelling:")); - res.entries.push_back(Rcl::TermMatchEntry(rclsugg)); - } -#endif // TESTING_XAPIAN_SPELL + for (const auto& it : suggs) { + res.entries.push_back(Rcl::TermMatchEntry(it)); + } statsLBL->setText(tr("%1 results").arg(res.entries.size())); } break; -#endif // RCL_USE_ASPELL case TYPECMB_STATS: { @@ -229,7 +201,6 @@ void SpellW::doExpand() break; } - if (res.entries.empty()) { resTW->setItem(0, 0, new QTableWidgetItem(tr("No expansion found"))); } else { diff --git a/src/qtgui/spell_w.h b/src/qtgui/spell_w.h index d1cfd6a3..ac0a5dd4 100644 --- a/src/qtgui/spell_w.h +++ b/src/qtgui/spell_w.h @@ -14,8 +14,8 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#ifndef _ASPELL_W_H_INCLUDED_ -#define _ASPELL_W_H_INCLUDED_ +#ifndef _SPELL_W_H_INCLUDED_ +#define _SPELL_W_H_INCLUDED_ #include @@ -36,7 +36,7 @@ public: virtual bool eventFilter(QObject *target, QEvent *event ); enum comboboxchoice {TYPECMB_NONE, TYPECMB_WILD, TYPECMB_REG, TYPECMB_STEM, - TYPECMB_ASPELL, TYPECMB_STATS}; + TYPECMB_SPELL, TYPECMB_STATS}; public slots: virtual void doExpand(); virtual void wordChanged(const QString&); @@ -62,4 +62,4 @@ private: void setModeCommon(comboboxchoice mode); }; -#endif /* _ASPELL_W_H_INCLUDED_ */ +#endif /* _SPELL_W_H_INCLUDED_ */ diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 0f4253ae..c79983b5 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -57,6 +57,9 @@ using namespace std; #include "rclinit.h" #include "internfile.h" #include "utf8fn.h" +#ifdef RCL_USE_ASPELL +#include "rclaspell.h" +#endif // Recoll index format version is stored in user metadata. When this change, // we can't open the db and will have to reindex. @@ -731,11 +734,13 @@ Db::Db(const RclConfig *cfp) Db::~Db() { - LOGDEB2("Db::~Db\n" ); + LOGDEB2("Db::~Db\n"); if (m_ndb == 0) return; - LOGDEB("Db::~Db: isopen " << (m_ndb->m_isopen) << " m_iswritable " << (m_ndb->m_iswritable) << "\n" ); + LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " << + m_ndb->m_iswritable << "\n"); i_close(true); + delete m_aspell; delete m_config; } @@ -1055,9 +1060,11 @@ class TextSplitDb : public TextSplitP { // gets added to basepos in addition to the inter-section increment // to compute the first position of the next section. Xapian::termpos curpos; + Xapian::WritableDatabase& wdb; - TextSplitDb(Xapian::Document &d, TermProc *prc) - : TextSplitP(prc), doc(d), basepos(1), curpos(0) + TextSplitDb(Xapian::WritableDatabase& _wdb, Xapian::Document &d, + TermProc *prc) + : TextSplitP(prc), doc(d), basepos(1), curpos(0), wdb(_wdb) {} // Reimplement text_to_words to insert the begin and end anchor terms. @@ -1132,8 +1139,8 @@ public: m_ts->doc.add_posting(term, pos, m_ts->ft.wdfinc); #ifdef TESTING_XAPIAN_SPELL - if (Db::isSpellingCandidate(term)) { - m_ts->db.add_spelling(term); + if (Db::isSpellingCandidate(term, false)) { + m_ts->wdb.add_spelling(term); } #endif // Index the prefixed term. @@ -1192,30 +1199,80 @@ public: }; -#ifdef TESTING_XAPIAN_SPELL -string Db::getSpellingSuggestion(const string& word) +// At the moment, we normally use the Xapian speller for Katakana and +// aspell for everything else +bool Db::getSpellingSuggestions(const string& word, vector& suggs) { - if (m_ndb == 0) - return string(); + LOGDEB("Db::getSpellingSuggestions:[" << word << "]\n" ); + suggs.clear(); + if (nullptr == m_ndb) { + return false; + } string term = word; - if (o_index_stripchars) - if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) { - LOGINFO("Db::getSpelling: unac failed for [" << (word) << "]\n" ); - return string(); - } + if (isSpellingCandidate(term, true)) { + // Term is candidate for aspell processing +#ifdef RCL_USE_ASPELL + bool noaspell = false; + m_config->getConfParam("noaspell", &noaspell); + if (noaspell) { + return false; + } + if (nullptr == m_aspell) { + m_aspell = new Aspell(m_config); + if (m_aspell) { + string reason; + m_aspell->init(reason); + if (!m_aspell->ok()) { + LOGDEB(("Aspell speller init failed %s\n", reason.c_str())); + delete m_aspell; + m_aspell = 0; + } + } + } - if (!isSpellingCandidate(term)) - return string(); - return m_ndb->xrdb.get_spelling_suggestion(term); -} + if (nullptr == m_aspell) { + LOGERR("Db::getSpellingSuggestions: aspell not initialized\n"); + return false; + } + + list asuggs; + string reason; + if (!m_aspell->suggest(*this, term, asuggs, reason)) { + LOGERR("Db::getSpellingSuggestions: aspell failed: " << reason << + "\n"); + return false; + } + suggs = vector(asuggs.begin(), asuggs.end()); #endif + } else { +#ifdef TESTING_XAPIAN_SPELL + // Was not aspell candidate (e.g.: katakana). Maybe use Xapian + // speller? + if (isSpellingCandidate(term, false)) { + if (!o_index_stripchars) { + if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO("Db::getSpelling: unac failed for [" << word << + "]\n"); + return false; + } + } + string sugg = m_ndb->xrdb.get_spelling_suggestion(term); + if (!sugg.empty()) { + suggs.push_back(sugg); + } + } +#endif + } + return true; +} // Let our user set the parameters for abstract processing void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen) { - LOGDEB1("Db::setAbstractParams: trunc " << (idxtrunc) << " syntlen " << (syntlen) << " ctxlen " << (syntctxlen) << "\n" ); + LOGDEB1("Db::setAbstractParams: trunc " << idxtrunc << " syntlen " << + syntlen << " ctxlen " << syntctxlen << "\n"); if (idxtrunc >= 0) m_idxAbsTruncLen = idxtrunc; if (syntlen > 0) @@ -1238,7 +1295,7 @@ static const string cstr_nc("\n\r\x0c\\"); // metadata), and update database bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) { - LOGDEB("Db::add: udi [" << (udi) << "] parent [" << (parent_udi) << "]\n" ); + LOGDEB("Db::add: udi [" << udi << "] parent [" << parent_udi << "]\n"); if (m_ndb == 0) return false; @@ -1259,7 +1316,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) if (o_index_stripchars) nxt = &tpprep; - TextSplitDb splitter(newdocument, nxt); + TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt); tpidx.setTSD(&splitter); // Udi unique term: this is used for file existence/uptodate diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index e0699a79..89a5c360 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -54,6 +54,7 @@ using std::vector; // reasonable) class RclConfig; +class Aspell; namespace Rcl { @@ -200,26 +201,30 @@ class Db { /** Test word for spelling correction candidate: not too long, no special chars... */ - static bool isSpellingCandidate(const string& term) + static bool isSpellingCandidate(const string& term, bool aspell=true) { if (term.empty() || term.length() > 50) return false; if (has_prefix(term)) return false; Utf8Iter u8i(term); - if (TextSplit::isCJK(*u8i)) - return false; + if (aspell) { + if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i)) + return false; + } else { + if (!TextSplit::isKATAKANA(*u8i)) { + return false; + } + } if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") != string::npos) return false; return true; } - -#ifdef TESTING_XAPIAN_SPELL /** Return spelling suggestion */ - string getSpellingSuggestion(const string& word); -#endif + bool getSpellingSuggestions(const string& word, + std::vector& suggs); /* The next two, only for searchdata, should be somehow hidden */ /* Return configured stop words */ @@ -490,6 +495,9 @@ private: // place for this. SynGroups m_syngroups; + // Aspell object if needed + Aspell *m_aspell = nullptr; + /*************** * Parameters cached out of the configuration files. Logically const * after init */ diff --git a/website/pages/recoll-windows.txt b/website/pages/recoll-windows.txt index 079dc966..0737ea6d 100644 --- a/website/pages/recoll-windows.txt +++ b/website/pages/recoll-windows.txt @@ -3,6 +3,7 @@ Jean-Francois Dockes :date: :recollversion: 1.23.0-2017-01-07-78b8ad +:windir: downwin-0e7f2 image:recoll-windows10-thumb.png[link="recoll-windows10.png"] @@ -35,7 +36,7 @@ files which would take space for nothing otherwise. == Installation - Download the - http://www.recoll.org/windows/recoll-setup-{recollversion}.exe[Recoll + http://www.recoll.org/{windir}/recoll-setup-{recollversion}.exe[Recoll setup file]. - Execute the setup file. This is a vanilla installer generated by Inno @@ -50,14 +51,14 @@ files which would take space for nothing otherwise. http://www.7-zip.org/. This is only useful if you need to index files compressed with Unix methods (not needed for zip files). -NOTE: The installer needs administrator rights in order to install to -`C:\Program Files`. If you want to install on a machine where you have no -administrator rights, you can use the -http://www.recoll.org/windows/recoll-{recollversion}.7z[installation -directory archive] instead and extract it anywhere, this works just the -same (you will need the free http://www.7-zip.org/[7z] to extract it). If -you are in this case, you can ignore the setup-related steps of the -procedure of course. +//NOTE: The installer needs administrator rights in order to install to +//`C:\Program Files`. If you want to install on a machine where you have no +//administrator rights, you can use the +//http://www.recoll.org/{windir}/recoll-{recollversion}.7z[installation +//directory archive] instead and extract it anywhere, this works just the +//same (you will need the free http://www.7-zip.org/[7z] to extract it). If +//you are in this case, you can ignore the setup-related steps of the +//procedure of course. == Configuration