wen

2017-04-18 14:39:12 +02:00 · 2017-04-18 14:39:12 +02:00 · 9661a4431e
commit 9661a4431e
parent 9d8ce3df62
12 changed files with 147 additions and 141 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -32,6 +32,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
    $(X_CFLAGS) \
    -DRECOLL_DATADIR=\"${pkgdatadir}\" \
    -D_GNU_SOURCE \
    -DTESTING_XAPIAN_SPELL \
    $(DEFS)
 ACLOCAL_AMFLAGS = -I m4
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -197,7 +197,9 @@ static inline int whatcc(unsigned int c)
 #define UNICODE_IS_CJK(p)						\
    ((p) > 127 &&							\
     (((p) >= 0x2E80 && (p) <= 0x2EFF) ||				\
-      ((p) >= 0x3000 && (p) <= 0x9FFF) ||				\
+      ((p) >= 0x3000 && (p) <= 0x309F) ||				\
      ((p) >= 0x3100 && (p) <= 0x31EF) ||				\
      ((p) >= 0x3200 && (p) <= 0x9FFF) ||				\
      ((p) >= 0xA700 && (p) <= 0xA71F) ||				\
      ((p) >= 0xAC00 && (p) <= 0xD7AF) ||				\
      ((p) >= 0xF900 && (p) <= 0xFAFF) ||				\
@ -206,10 +208,19 @@ static inline int whatcc(unsigned int c)
      ((p) >= 0x20000 && (p) <= 0x2A6DF) ||				\
      ((p) >= 0x2F800 && (p) <= 0x2FA1F)))
 // True when code point p falls in one of the two ranges tested below:
 // 0x30A0-0x30FF (the Unicode Katakana block) or 0x31F0-0x31FF
 // (Katakana Phonetic Extensions). The leading (p) > 127 test fails
 // early for values up to 127. The argument is expanded several times,
 // so it must not have side effects.
 #define UNICODE_IS_KATAKANA(p)                                          \
    ((p) > 127 &&							\
     (((p) >= 0x30A0 && (p) <= 0x30FF) ||				\
      ((p) >= 0x31F0 && (p) <= 0x31FF)))
 // Function wrapper around the UNICODE_IS_CJK macro: returns the result
 // of that range test for code point c. In the visible part of the macro
 // the ranges skip 0x30A0-0x30FF and 0x31F0-0x31FF, which are the ranges
 // tested by UNICODE_IS_KATAKANA.
 bool TextSplit::isCJK(int c)
 {
    return UNICODE_IS_CJK(c);
 }
 // Function wrapper around the UNICODE_IS_KATAKANA macro: returns true
 // when code point c is in 0x30A0-0x30FF or 0x31F0-0x31FF.
 bool TextSplit::isKATAKANA(int c)
 {
    return UNICODE_IS_KATAKANA(c);
 }
 // Definitions of two TextSplit static members with their initial values.
 // NOTE(review): presumably o_processCJK enables the CJK-specific
 // splitting path and o_CJKNgramLen is the n-gram length used there --
 // confirm against the code that reads them.
 bool          TextSplit::o_processCJK = true;
 unsigned int  TextSplit::o_CJKNgramLen = 2;
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -92,8 +92,7 @@ public:
    /** Called when we encounter formfeed \f 0x0c. Override to use the event.
     * Mostly or exclusively used with pdftoxx output. Other filters mostly 
     * just don't know about pages. */
-    virtual void newpage(int /*pos*/)
+    virtual void newpage(int /*pos*/) {
    {
    }
    // Static utility functions:
@ -111,10 +110,12 @@ public:
     * non-utf-8 input (iso-8859 config files work ok). This hopefully
     * handles all Unicode whitespace, but needs correct utf-8 input
     */
-    static bool stringToStrings(const std::string &s, std::vector<std::string> &tokens);
+    static bool stringToStrings(const std::string &s,
                                std::vector<std::string> &tokens);
-    /** Is char CJK ? */
+    /** Is char CJK ? (excluding Katakana) */
    static bool isCJK(int c);
    /** Is char Katakana ? */
    static bool isKATAKANA(int c);
    /** Statistics about word length (average and dispersion) can
     * detect bad data like undecoded base64 or other mis-identified
--- a/src/qtgui/main.cpp
+++ b/src/qtgui/main.cpp
@ -40,9 +40,6 @@
 #include "rclmain_w.h"
 #include "ssearch_w.h"
 #include "guiutils.h"
 #ifdef RCL_USE_ASPELL
 #include "rclaspell.h"
 #endif
 #include "smallut.h"
 #include "readfile.h"
@ -83,9 +80,6 @@ void deleteAllTempFiles()
 Rcl::Db *rcldb;
 #ifdef RCL_USE_ASPELL
 Aspell *aspell;
 #endif
 int recollNeedsExit;
 RclMain *mainWindow;
@ -158,10 +152,6 @@ static void recollCleanup()
    deleteAllTempFiles();
 #ifdef RCL_USE_ASPELL
    deleteZ(aspell);
 #endif
    LOGDEB2("recollCleanup: done\n" );
 }
@ -322,15 +312,6 @@ int main(int argc, char **argv)
    //    fprintf(stderr, "Translations installed\n");
 #ifdef RCL_USE_ASPELL
    aspell = new Aspell(theconfig);
    aspell->init(reason);
    if (!aspell || !aspell->ok()) {
 	LOGDEB("Aspell speller creation failed "  << (reason) << "\n" );
 	aspell = 0;
    }
 #endif
    string historyfile = path_cat(theconfig->getConfDir(), "history");
    g_dynconf = new RclDynConf(historyfile);
    if (!g_dynconf || !g_dynconf->ok()) {
--- a/src/qtgui/recoll.h
+++ b/src/qtgui/recoll.h
@ -46,11 +46,6 @@ extern void startManual(const string& helpindex);
 extern void applyStyleSheet(const QString&);
 #ifdef RCL_USE_ASPELL
 class Aspell;
 extern Aspell *aspell;
 #endif
 inline std::string qs2utf8s(const QString& qs)
 {
    return std::string((const char *)qs.toUtf8());
--- a/src/qtgui/recoll.pro.in
+++ b/src/qtgui/recoll.pro.in
@ -2,7 +2,7 @@ TEMPLATE        = app
 LANGUAGE        = C++
 VPATH = @srcdir@
-DEFINES += BUILDING_RECOLL
+DEFINES += BUILDING_RECOLL  TESTING_XAPIAN_SPELL
@QMAKE_ENABLE_WEBKIT@ QT += webkit
@QMAKE_DISABLE_WEBKIT@ QMAKE_CXXFLAGS += -DRESLIST_TEXTBROWSER -DSNIPPETS_TEXTBROWSER
--- a/src/qtgui/reslist.cpp
+++ b/src/qtgui/reslist.cpp
@ -55,9 +55,6 @@
 #include "reslist.h"
 #include "moc_reslist.cpp"
 #include "rclhelp.h"
 #ifdef RCL_USE_ASPELL
 #include "rclaspell.h"
 #endif
 #include "appformime.h"
 #include "respopup.h"
@ -201,53 +198,36 @@ void QtGuiResListPager::suggest(const vector<string>uterms,
 				map<string, vector<string> >& sugg)
 {
    sugg.clear();
 #ifdef RCL_USE_ASPELL
    bool noaspell = false;
    theconfig->getConfParam("noaspell", &noaspell);
    if (noaspell)
        return;
    if (!aspell) {
        LOGERR("QtGuiResListPager:: aspell not initialized\n" );
        return;
    }
    bool issimple = m_reslist && m_reslist->m_rclmain && 
 	m_reslist->m_rclmain->lastSearchSimple();
-    for (vector<string>::const_iterator uit = uterms.begin();
+    for (const auto& uit : uterms) {
-         uit != uterms.end(); uit++) {
+        vector<string> tsuggs;
        list<string> asuggs;
        string reason;
 	// If the term is in the dictionary, Aspell::suggest won't
 	// list alternatives. In fact we may want to check the
 	// frequencies and propose something anyway if a possible
 	// variation is much more common (as google does) ?
-        if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) {
+        if (!rcldb->getSpellingSuggestions(uit, tsuggs)) {
            LOGERR("QtGuiResListPager::suggest: aspell failed: "  << (reason) << "\n" );
            continue;
        }
 	// We should check that the term stems differently from the
 	// base word (else it's not useful to expand the search). Or
 	// is it ? This should depend if stemming is turned on or not
-        if (!asuggs.empty()) {
+        if (!tsuggs.empty()) {
-            sugg[*uit] = vector<string>(asuggs.begin(), asuggs.end());
+            sugg[uit] = vector<string>(tsuggs.begin(), tsuggs.end());
-	    if (sugg[*uit].size() > 5)
+	    if (sugg[uit].size() > 5)
-		sugg[*uit].resize(5);
+		sugg[uit].resize(5);
 	    // Set up the links as a <href="Sold|new">. 
-	    for (vector<string>::iterator it = sugg[*uit].begin();
+	    for (auto& it : sugg[uit]) {
 		 it != sugg[*uit].end(); it++) {
 		if (issimple) {
-		    *it = string("<a href=\"S") + *uit + "|" + *it + "\">" +
+		    it = string("<a href=\"S") + uit + "|" + it + "\">" +
-			*it + "</a>";
+			it + "</a>";
 		}
 	    }
        }
    }
 #endif
 }
 string QtGuiResListPager::iconUrl(RclConfig *config, Rcl::Doc& doc)
--- a/src/qtgui/spell_w.cpp
+++ b/src/qtgui/spell_w.cpp
@ -47,10 +47,6 @@
 #include "execmd.h"
 #include "indexer.h"
 #ifdef RCL_USE_ASPELL
 #include "rclaspell.h"
 #endif
 using std::list;
 using std::multimap;
 using std::string;
@ -64,14 +60,8 @@ void SpellW::init()
    m_c2t.push_back(TYPECMB_REG);
    expTypeCMB->addItem(tr("Stem expansion"));
    m_c2t.push_back(TYPECMB_STEM);
-#ifdef RCL_USE_ASPELL
+    expTypeCMB->addItem(tr("Spelling/Phonetic"));
-    bool noaspell = false;
+    m_c2t.push_back(TYPECMB_SPELL);
    theconfig->getConfParam("noaspell", &noaspell);
    if (!noaspell) {
 	expTypeCMB->addItem(tr("Spelling/Phonetic"));
 	m_c2t.push_back(TYPECMB_ASPELL);
    }
 #endif
    expTypeCMB->addItem(tr("Show index statistics"));
    m_c2t.push_back(TYPECMB_STATS);
@ -189,37 +179,19 @@ void SpellW::doExpand()
    break;
-#ifdef RCL_USE_ASPELL
+    case TYPECMB_SPELL: 
    case TYPECMB_ASPELL: 
    {
-	LOGDEB("SpellW::doExpand: aspelling\n" );
+	LOGDEB("SpellW::doExpand: spelling [" << expr << "]\n" );
-	if (!aspell) {
+	vector<string> suggs;
-	    QMessageBox::warning(0, "Recoll",
+	if (!rcldb->getSpellingSuggestions(expr, suggs)) {
-				 tr("Aspell init failed. "
+	    QMessageBox::warning(0, "Recoll", tr("Spell expansion error. "));
 				    "Aspell not installed?"));
 	    LOGDEB("SpellW::doExpand: aspell init error\n" );
 	    return;
 	}
-	list<string> suggs;
+	for (const auto& it : suggs) {
-	if (!aspell->suggest(*rcldb, expr, suggs, reason)) {
+	    res.entries.push_back(Rcl::TermMatchEntry(it));
-	    QMessageBox::warning(0, "Recoll",
+        }
 				 tr("Aspell expansion error. "));
 	    LOGERR("SpellW::doExpand:suggest failed: "  << (reason) << "\n" );
 	}
 	for (list<string>::const_iterator it = suggs.begin(); 
 	     it != suggs.end(); it++) 
 	    res.entries.push_back(Rcl::TermMatchEntry(*it));
 #ifdef TESTING_XAPIAN_SPELL
 	string rclsugg = rcldb->getSpellingSuggestion(expr);
 	if (!rclsugg.empty()) {
 	    res.entries.push_back(Rcl::TermMatchEntry("Xapian spelling:"));
 	    res.entries.push_back(Rcl::TermMatchEntry(rclsugg));
 	}
 #endif // TESTING_XAPIAN_SPELL
        statsLBL->setText(tr("%1 results").arg(res.entries.size()));
    }
    break;
 #endif // RCL_USE_ASPELL
    case TYPECMB_STATS: 
    {
@ -229,7 +201,6 @@ void SpellW::doExpand()
    break;
    }
    if (res.entries.empty()) {
        resTW->setItem(0, 0, new QTableWidgetItem(tr("No expansion found")));
    } else {
--- a/src/qtgui/spell_w.h
+++ b/src/qtgui/spell_w.h
@ -14,8 +14,8 @@
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
-#ifndef _ASPELL_W_H_INCLUDED_
+#ifndef _SPELL_W_H_INCLUDED_
-#define _ASPELL_W_H_INCLUDED_
+#define _SPELL_W_H_INCLUDED_
 #include <vector>
@ -36,7 +36,7 @@ public:
    virtual bool eventFilter(QObject *target, QEvent *event );
    enum comboboxchoice {TYPECMB_NONE, TYPECMB_WILD, TYPECMB_REG, TYPECMB_STEM, 
-			 TYPECMB_ASPELL, TYPECMB_STATS};
+			 TYPECMB_SPELL, TYPECMB_STATS};
 public slots:
    virtual void doExpand();
    virtual void wordChanged(const QString&);
@ -62,4 +62,4 @@ private:
    void setModeCommon(comboboxchoice mode);
 };
-#endif /* _ASPELL_W_H_INCLUDED_ */
+#endif /* _SPELL_W_H_INCLUDED_ */
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -57,6 +57,9 @@ using namespace std;
 #include "rclinit.h"
 #include "internfile.h"
 #include "utf8fn.h"
 #ifdef RCL_USE_ASPELL
 #include "rclaspell.h"
 #endif
 // Recoll index format version is stored in user metadata. When this change,
 // we can't open the db and will have to reindex.
@ -731,11 +734,13 @@ Db::Db(const RclConfig *cfp)
 Db::~Db()
 {
-    LOGDEB2("Db::~Db\n" );
+    LOGDEB2("Db::~Db\n");
    if (m_ndb == 0)
 	return;
-    LOGDEB("Db::~Db: isopen "  << (m_ndb->m_isopen) << " m_iswritable "  << (m_ndb->m_iswritable) << "\n" );
+    LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
           m_ndb->m_iswritable << "\n");
    i_close(true);
    delete m_aspell;
    delete m_config;
 }
@ -1055,9 +1060,11 @@ class TextSplitDb : public TextSplitP {
    // gets added to basepos in addition to the inter-section increment
    // to compute the first position of the next section.
    Xapian::termpos curpos;
    Xapian::WritableDatabase& wdb;
-    TextSplitDb(Xapian::Document &d, TermProc *prc)
+    TextSplitDb(Xapian::WritableDatabase& _wdb, Xapian::Document &d,
-	: TextSplitP(prc), doc(d), basepos(1), curpos(0)
+                TermProc *prc)
 	: TextSplitP(prc), doc(d), basepos(1), curpos(0), wdb(_wdb)
    {}
    // Reimplement text_to_words to insert the begin and end anchor terms.
@ -1132,8 +1139,8 @@ public:
                m_ts->doc.add_posting(term, pos, m_ts->ft.wdfinc);
 #ifdef TESTING_XAPIAN_SPELL
-	    if (Db::isSpellingCandidate(term)) {
+	    if (Db::isSpellingCandidate(term, false)) {
-		m_ts->db.add_spelling(term);
+		m_ts->wdb.add_spelling(term);
 	    }
 #endif
 	    // Index the prefixed term.
@ -1192,30 +1199,80 @@ public:
 };
-#ifdef TESTING_XAPIAN_SPELL
+// At the moment, we normally use the Xapian speller for Katakana and
-string Db::getSpellingSuggestion(const string& word)
+// aspell for everything else
 bool Db::getSpellingSuggestions(const string& word, vector<string>& suggs)
 {
-    if (m_ndb == 0)
+    LOGDEB("Db::getSpellingSuggestions:[" << word << "]\n" );
-	return string();
+    suggs.clear();
    if (nullptr == m_ndb) {
 	return false;
    }
    string term = word;
-    if (o_index_stripchars)
+    if (isSpellingCandidate(term, true)) {
-	if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
+        // Term is candidate for aspell processing
-	    LOGINFO("Db::getSpelling: unac failed for ["  << (word) << "]\n" );
+#ifdef RCL_USE_ASPELL
-	    return string();
+        bool noaspell = false;
-	}
+        m_config->getConfParam("noaspell", &noaspell);
        if (noaspell) {
            return false;
        }
        if (nullptr == m_aspell) {
            m_aspell = new Aspell(m_config);
            if (m_aspell) {
                string reason;
                m_aspell->init(reason);
                if (!m_aspell->ok()) {
                    LOGDEB(("Aspell speller init failed %s\n", reason.c_str()));
                    delete m_aspell;
                    m_aspell = 0;
                }
            }
        }
-    if (!isSpellingCandidate(term))
+        if (nullptr == m_aspell) {
-	return string();
+            LOGERR("Db::getSpellingSuggestions: aspell not initialized\n");
-    return m_ndb->xrdb.get_spelling_suggestion(term);
+            return false;
-}
+        }
        list<string> asuggs;
        string reason;
        if (!m_aspell->suggest(*this, term, asuggs, reason)) {
            LOGERR("Db::getSpellingSuggestions: aspell failed: " << reason <<
                   "\n");
            return false;
        }
        suggs = vector<string>(asuggs.begin(), asuggs.end());
 #endif
    } else {
 #ifdef TESTING_XAPIAN_SPELL
        // Was not aspell candidate (e.g.: katakana). Maybe use Xapian
        // speller?
        if (isSpellingCandidate(term, false)) {
            if (!o_index_stripchars) {
                if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
                    LOGINFO("Db::getSpelling: unac failed for [" << word <<
                            "]\n");
                    return false;
                }
            }
            string sugg = m_ndb->xrdb.get_spelling_suggestion(term);
            if (!sugg.empty()) {
                suggs.push_back(sugg);
            }
        }
 #endif
    }
    return true;
 }
 // Let our user set the parameters for abstract processing
 void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
 {
-    LOGDEB1("Db::setAbstractParams: trunc "  << (idxtrunc) << " syntlen "  << (syntlen) << " ctxlen "  << (syntctxlen) << "\n" );
+    LOGDEB1("Db::setAbstractParams: trunc " << idxtrunc << " syntlen " <<
            syntlen << " ctxlen " << syntctxlen << "\n");
    if (idxtrunc >= 0)
 	m_idxAbsTruncLen = idxtrunc;
    if (syntlen > 0)
@ -1238,7 +1295,7 @@ static const string cstr_nc("\n\r\x0c\\");
 // metadata), and update database
 bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 {
-    LOGDEB("Db::add: udi ["  << (udi) << "] parent ["  << (parent_udi) << "]\n" );
+    LOGDEB("Db::add: udi [" << udi << "] parent [" << parent_udi << "]\n");
    if (m_ndb == 0)
 	return false;
@ -1259,7 +1316,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
    if (o_index_stripchars)
 	nxt = &tpprep;
-    TextSplitDb splitter(newdocument, nxt);
+    TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
    tpidx.setTSD(&splitter);
    // Udi unique term: this is used for file existence/uptodate
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -54,6 +54,7 @@ using std::vector;
 // reasonable)
 class RclConfig;
 class Aspell;
 namespace Rcl {
@ -200,26 +201,30 @@ class Db {
    /** Test word for spelling correction candidate: not too long, no 
 	special chars... */
-    static bool isSpellingCandidate(const string& term)
+    static bool isSpellingCandidate(const string& term, bool aspell=true)
    {
 	if (term.empty() || term.length() > 50)
 	    return false;
 	if (has_prefix(term))
 	    return false;
 	Utf8Iter u8i(term);
-	if (TextSplit::isCJK(*u8i)) 
+        if (aspell) {
-	    return false;
+            if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
                return false;
        } else {
            if (!TextSplit::isKATAKANA(*u8i)) {
                return false;
            }
        }
 	if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") 
 	    != string::npos)
 	    return false;
 	return true;
    }
 #ifdef TESTING_XAPIAN_SPELL
    /** Return spelling suggestion */
-    string getSpellingSuggestion(const string& word);
+    bool getSpellingSuggestions(const string& word,
-#endif
+                                std::vector<std::string>& suggs);
    /* The next two, only for searchdata, should be somehow hidden */
    /* Return configured stop words */
@ -490,6 +495,9 @@ private:
    // place for this.
    SynGroups m_syngroups;
    // Aspell object if needed
    Aspell *m_aspell = nullptr;
    /***************
     * Parameters cached out of the configuration files. Logically const 
     * after init */
--- a/website/pages/recoll-windows.txt
+++ b/website/pages/recoll-windows.txt
@ -3,6 +3,7 @@ Jean-Francois Dockes <jf at dockes.org>
 :date:
 :recollversion: 1.23.0-2017-01-07-78b8ad
 :windir: downwin-0e7f2
 image:recoll-windows10-thumb.png[link="recoll-windows10.png"]
@ -35,7 +36,7 @@ files which would take space for nothing otherwise.
 == Installation
 - Download the 
-  http://www.recoll.org/windows/recoll-setup-{recollversion}.exe[Recoll
+  http://www.recoll.org/{windir}/recoll-setup-{recollversion}.exe[Recoll
  setup file].
 - Execute the setup file. This is a vanilla installer generated by Inno
@ -50,14 +51,14 @@ files which would take space for nothing otherwise.
  http://www.7-zip.org/. This is only useful if you need to index files
  compressed with Unix methods (not needed for zip files).
-NOTE: The installer needs administrator rights in order to install to
+//NOTE: The installer needs administrator rights in order to install to
-`C:\Program Files`. If you want to install on a machine where you have no
+//`C:\Program Files`. If you want to install on a machine where you have no
-administrator rights, you can use the
+//administrator rights, you can use the
-http://www.recoll.org/windows/recoll-{recollversion}.7z[installation
+//http://www.recoll.org/{windir}/recoll-{recollversion}.7z[installation
-directory archive] instead and extract it anywhere, this works just the
+//directory archive] instead and extract it anywhere, this works just the
-same (you will need the free http://www.7-zip.org/[7z] to extract it). If
+//same (you will need the free http://www.7-zip.org/[7z] to extract it). If
-you are in this case, you can ignore the setup-related steps of the
+//you are in this case, you can ignore the setup-related steps of the
-procedure of course.
+//procedure of course.
 == Configuration