recoll/src/aspell/rclaspell.cpp

#ifndef TEST_RCLASPELL
/* Copyright (C) 2006 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#ifdef HAVE_CONFIG_H
#include "autoconfig.h"
#endif

#ifdef RCL_USE_ASPELL

#include <mutex>

#include <unistd.h>
#include <dlfcn.h>
#include <stdlib.h>

#include ASPELL_INCLUDE

#include "pathut.h"
#include "execmd.h"
#include "rclaspell.h"
#include "log.h"
#include "unacpp.h"

using namespace std;

// Table of the libaspell entry points that we use. Each member carries the
// name of the library symbol it points to; the pointers are filled in at
// run time (see the NMTOPTR macro).
struct AspellApi {
    // Configuration objects
    struct AspellConfig *(*new_aspell_config)();
    int (*aspell_config_replace)(
        struct AspellConfig *, const char *key, const char *value);
    void (*delete_aspell_config)(struct AspellConfig *);

    // Speller creation / destruction
    struct AspellCanHaveError *(*new_aspell_speller)(struct AspellConfig *);
    void (*delete_aspell_can_have_error)(struct AspellCanHaveError *);
    struct AspellSpeller *(*to_aspell_speller)(struct AspellCanHaveError *);
    struct AspellConfig *(*aspell_speller_config)(struct AspellSpeller *);
    void (*delete_aspell_speller)(struct AspellSpeller *);

    // Checking words and asking for suggestions
    const struct AspellWordList *(*aspell_speller_suggest)(
        struct AspellSpeller *, const char *, int);
    int (*aspell_speller_check)(struct AspellSpeller *, const char *, int);

    // Walking a word list
    struct AspellStringEnumeration *(*aspell_word_list_elements)(
        const struct AspellWordList *ths);
    const char *(*aspell_string_enumeration_next)(
        struct AspellStringEnumeration *ths);
    void (*delete_aspell_string_enumeration)(
        struct AspellStringEnumeration *);

    // Error reporting
    const struct AspellError *(*aspell_error)(
        const struct AspellCanHaveError *);
    const char *(*aspell_error_message)(const struct AspellCanHaveError *);
    const char *(*aspell_speller_error_message)(
        const struct AspellSpeller *);
};

// The single, process-wide entry point table, and the mutex which
// Aspell::init() holds while it fills the table.
static AspellApi aapi;
static std::mutex o_aapi_mutex;

// Look up the library symbol named NM with dlsym() and store it, converted
// with the parenthesised cast TP, into the aapi member of the same name.
// When the symbol is missing, its name is appended to the string 'badnames'.
// Both 'm_data' and 'badnames' must be visible where the macro is used.
// The do/while(0) wrapper makes an invocation followed by ';' behave as a
// single statement, so that it cannot capture a following 'else'.
#define NMTOPTR(NM, TP)							\
    do {								\
	if ((aapi.NM = TP dlsym(m_data->m_handle, #NM)) == 0) {	\
	    badnames += #NM + string(" ");				\
	}								\
    } while (0)

// File name suffixes tried, in this order, when looking for the aspell
// shared library.
static const char *aspell_lib_suffixes[] = {".so", ".so.15", ".so.16"};
// Number of entries in aspell_lib_suffixes
static const unsigned int nlibsuffs =
    sizeof(aspell_lib_suffixes) / sizeof(aspell_lib_suffixes[0]);

// Stuff that we don't wish to see in the .h (possible sysdeps, etc.):
// the dlopen() handle for the aspell library, the path of the aspell
// command, and the speller object once it has been created.
class AspellData {
public:
    AspellData()
        : m_handle(nullptr), m_speller(nullptr)
    {}

    // The object owns the library handle: a copy would end up calling
    // dlclose() twice on it, so copying is disabled.
    AspellData(const AspellData&) = delete;
    AspellData& operator=(const AspellData&) = delete;

    ~AspellData() {
        LOGDEB2("~AspellData\n" );
        if (m_handle) {
            dlclose(m_handle);
            m_handle = nullptr;
        }
        if (m_speller) {
            // The speller is deliberately not destroyed: the original
            // author observed a crash when doing so.
            // NOTE(review): the library has already been dlclose()d at
            // this point, which may be the cause - confirm before
            // re-enabling the call below.
            //aapi.delete_aspell_speller(m_speller);
            m_speller = nullptr;
            LOGDEB2("~AspellData: speller done\n" );
        }
    }

    void  *m_handle;          // dlopen() handle, null while not loaded
    string m_exec;            // path of the aspell executable
    AspellSpeller *m_speller; // created on demand by Aspell::make_speller()
};

// Remember the configuration. No private data is allocated here: this is
// done by init().
Aspell::Aspell(const RclConfig *cnf)
    : m_config(cnf),
      m_data(nullptr)
{}

// Dispose of the private data (if any) through the deleteZ() helper.
Aspell::~Aspell()
{
    deleteZ(m_data);
}

// Prepare the object for use:
//  - choose the dictionary language (m_lang),
//  - find the aspell executable (used for creating the dictionary),
//  - load the aspell shared library and resolve the entry points we need.
// Any previously allocated private data is discarded first. The whole
// operation runs with o_aapi_mutex held, because it fills the shared aapi
// table.
//
// @param reason receives an explanation when the call fails.
// @return true on success. On failure, m_data is left null, so that ok()
//   returns false.
bool Aspell::init(string &reason)
{
    std::unique_lock<std::mutex> locker(o_aapi_mutex);
    deleteZ(m_data);

    // Language: we get this from the configuration, else from the NLS
    // environment. The aspell language names used for selecting language
    // definition files (used to create dictionaries) are like en, fr
    if (!m_config->getConfParam("aspellLanguage", m_lang) || m_lang.empty()) {
        string lang;
        const char *cp;
        // A variable which is set but empty is handled like an unset one.
        if ((cp = getenv("LC_ALL")) && *cp)
            lang = cp;
        else if ((cp = getenv("LANG")) && *cp)
            lang = cp;
        // Locale names look like language[_territory][.codeset][@modifier]:
        // only keep the language part. Cutting at '.' and '@' too takes
        // care of values like "C.UTF-8".
        lang = lang.substr(0, lang.find_first_of("_.@"));
        // Nothing usable, or the C/POSIX locale: default to English.
        if (lang.empty() || lang == "C" || lang == "POSIX")
            lang = "en";
        m_lang = lang;
    }

    m_data = new AspellData;

    // Aspell command: environment first, then the compile-time value if
    // there is one, then search the PATH.
    const char *aspell_prog_from_env = getenv("ASPELL_PROG");
    if (aspell_prog_from_env && access(aspell_prog_from_env, X_OK) == 0) {
        m_data->m_exec = aspell_prog_from_env;
#ifdef ASPELL_PROG
    } else if (access(ASPELL_PROG, X_OK) == 0) {
        m_data->m_exec = ASPELL_PROG;
#endif // ASPELL_PROG
    } else {
        ExecCmd::which("aspell", m_data->m_exec);
    }

    if (m_data->m_exec.empty()) {
        reason = "aspell program not found or not executable";
        deleteZ(m_data);
        return false;
    }

    // Try the candidate library names in turn. The names tried accumulate
    // in 'reason', which is only used if none can be opened.
    reason = "Could not open shared library ";
    const string libbase("libaspell");
    for (unsigned int i = 0; i < nlibsuffs; i++) {
        const string lib = libbase + aspell_lib_suffixes[i];
        reason += string("[") + lib + "] ";
        if ((m_data->m_handle = dlopen(lib.c_str(), RTLD_LAZY)) != nullptr) {
            reason.erase();
            break;
        }
    }

    if (m_data->m_handle == nullptr) {
        // dlerror() is allowed to return a null pointer, which must not be
        // used to build a string.
        const char *err = dlerror();
        reason += string(" : ") + (err ? err : "unknown error");
        deleteZ(m_data);
        return false;
    }

    // Resolve all the entry points. The names of the missing ones are
    // collected in badnames.
    string badnames;
    NMTOPTR(new_aspell_config, (struct AspellConfig *(*)()));
    NMTOPTR(aspell_config_replace, (int (*)(struct AspellConfig *,
                                            const char *, const char *)));
    NMTOPTR(new_aspell_speller,
            (struct AspellCanHaveError *(*)(struct AspellConfig *)));
    NMTOPTR(delete_aspell_config,
            (void (*)(struct AspellConfig *)));
    NMTOPTR(delete_aspell_can_have_error,
            (void (*)(struct AspellCanHaveError *)));
    NMTOPTR(to_aspell_speller,
            (struct AspellSpeller *(*)(struct AspellCanHaveError *)));
    NMTOPTR(aspell_speller_config,
            (struct AspellConfig *(*)(struct AspellSpeller *)));
    NMTOPTR(aspell_speller_suggest,
            (const struct AspellWordList *(*)(struct AspellSpeller *,
                                               const char *, int)));
    NMTOPTR(aspell_speller_check,
            (int (*)(struct AspellSpeller *, const char *, int)));
    NMTOPTR(aspell_word_list_elements,
            (struct AspellStringEnumeration *(*)
             (const struct AspellWordList *)));
    NMTOPTR(aspell_string_enumeration_next,
            (const char * (*)(struct AspellStringEnumeration *)));
    NMTOPTR(delete_aspell_string_enumeration,
            (void (*)(struct AspellStringEnumeration *)));
    NMTOPTR(aspell_error,
            (const struct AspellError*(*)(const struct AspellCanHaveError *)));
    NMTOPTR(aspell_error_message,
            (const char *(*)(const struct AspellCanHaveError *)));
    NMTOPTR(aspell_speller_error_message,
            (const char *(*)(const struct AspellSpeller *)));
    NMTOPTR(delete_aspell_speller, (void (*)(struct AspellSpeller *)));

    if (!badnames.empty()) {
        reason = string("Aspell::init: symbols not found:") + badnames;
        deleteZ(m_data);
        return false;
    }

    return true;
}

// Tell if the object is usable: the private data must exist and hold an
// open library handle.
bool Aspell::ok() const
{
    if (m_data == nullptr)
        return false;
    return m_data->m_handle != nullptr;
}

// Compute the path of the dictionary file for the current language:
// "aspdict.<lang>.rws" inside the configured aspell cache directory.
string Aspell::dicPath()
{
    string cachedir = m_config->getAspellcacheDir();
    string fname("aspdict.");
    fname += m_lang;
    fname += string(".rws");
    return path_cat(cachedir, fname);
}


// The data source for the create dictionary aspell command. We walk
// the term list, filtering out things that are probably not words
// (the decision is made by Rcl::Db::isSpellingCandidate()).
// Note that the manual for the current version (0.60) of aspell
// states that utf-8 is not well supported, so that we should maybe
// also filter all 8bit chars. Info is contradictory, so we only
// filter out CJK which is definitely not supported (katakana would
// make sense though, but currently no support).
class AspExecPv : public ExecCmdProvide {
public:
    string *m_input; // pointer to string used as input buffer to command
    Rcl::TermIter *m_tit; // term walk iterator, not owned by this object
    Rcl::Db &m_db;
    AspExecPv(string *i, Rcl::TermIter *tit, Rcl::Db &db)
        : m_input(i), m_tit(tit), m_db(db)
    {}

    // Store the next acceptable term, followed by a newline, into the
    // input buffer. The buffer is left empty when the term walk is over.
    void newData() {
        while (m_db.termWalkNext(m_tit, *m_input)) {
            // Log the term itself, not the address of the buffer.
            LOGDEB2("Aspell::buildDict: term: ["  << *m_input << "]\n" );
            if (!Rcl::Db::isSpellingCandidate(*m_input)) {
                LOGDEB2("Aspell::buildDict: SKIP\n" );
                continue;
            }
            if (!o_index_stripchars) {
                // Terms are case-folded before being sent when the index
                // is not a stripped one. Terms which can't be folded are
                // skipped.
                string lower;
                if (!unacmaybefold(*m_input, lower, "UTF-8", UNACOP_FOLD))
                    continue;
                m_input->swap(lower);
            }
            // Got a non-empty sort-of appropriate term, let's send it to
            // aspell
            LOGDEB2("Aspell::buildDict: SEND\n" );
            m_input->append("\n");
            return;
        }
        // End of data. Tell so. Exec will close cmd.
        m_input->erase();
    }
};


// Create the aspell dictionary from the index terms, by executing:
//    aspell --lang=[lang] --encoding=utf-8 [extra] create master [dicpath]
// and feeding it the filtered term list on its input (see AspExecPv).
//
// @param db the index whose terms are walked.
// @param reason receives an explanation if the term walk can't be opened
//   or if the command fails. It is left untouched when the object is not
//   initialized.
// @return true if the aspell command succeeded.
bool Aspell::buildDict(Rcl::Db &db, string &reason)
{
    if (!ok())
        return false;

    string addCreateParam;
    m_config->getConfParam("aspellAddCreateParam", addCreateParam);

    // Command arguments
    vector<string> args;
    args.push_back(string("--lang=") + m_lang);
    args.push_back("--encoding=utf-8");
    if (!addCreateParam.empty()) {
        args.push_back(addCreateParam);
    }
    args.push_back("create");
    args.push_back("master");
    args.push_back(dicPath());

    // Printable version of the command, only used in error messages
    string cmdstring(m_data->m_exec);
    for (const auto& arg : args) {
        cmdstring += string(" ") + arg;
    }

    // Have to disable stderr, as numerous messages about bad strings are
    // printed. We'd like to keep errors about missing databases though, so
    // make it configurable for diags
    ExecCmd aspell;
    bool keepStderr = false;
    m_config->getConfParam("aspellKeepStderr", &keepStderr);
    if (!keepStderr)
        aspell.setStderr("/dev/null");

    Rcl::TermIter *tit = db.termWalkOpen();
    if (tit == 0) {
        reason = "termWalkOpen failed\n";
        return false;
    }
    string termbuf;
    AspExecPv pv(&termbuf, tit, db);
    aspell.setProvide(&pv);

    // A non-zero value from doexec() is handled as a failure. The term
    // walk is closed as soon as the command is done, whatever the result,
    // so that the iterator is not leaked on the error path.
    const bool failed = aspell.doexec(m_data->m_exec, args, &termbuf) != 0;
    db.termWalkClose(tit);
    if (!failed)
        return true;

    // Try to produce a helpful message: ask aspell for its list of
    // dictionaries and see if our language is in there.
    ExecCmd cmd;
    vector<string> dictsargs;
    dictsargs.push_back("dicts");
    string dicts;
    bool hasdict = false;
    if (cmd.doexec(m_data->m_exec, dictsargs, 0, &dicts)) {
        vector<string> vdicts;
        stringToTokens(dicts, vdicts, "\n\r\t ");
        if (find(vdicts.begin(), vdicts.end(), m_lang) != vdicts.end()) {
            hasdict = true;
        }
    }
    if (hasdict)
        reason = string(
            "\naspell dictionary creation command [") +
            cmdstring + string("] failed. Reason unknown.\n"
            "Try to set aspellKeepStderr = 1 in recoll.conf, and execute \n"
            "the indexing command in a terminal to see the aspell "
            "diagnostic output.\n");
    else
        reason = string("aspell dictionary creation command failed:\n") +
            cmdstring + "\n"
            "One possible reason might be missing language "
            "data files for lang = " + m_lang +
            ". Maybe try to execute the command by hand for a better diag.";
    return false;
}


// Create the speller object if this was not done yet. The speller is
// configured for our language and utf-8, uses our dictionary file as
// master, and the "fast" suggestion mode.
//
// @param reason receives the aspell error message if the creation fails.
// @return true if a speller is available when we return.
bool Aspell::make_speller(string& reason)
{
    if (!ok())
        return false;
    if (m_data->m_speller)
        return true;

    AspellConfig *cfg = aapi.new_aspell_config();
    aapi.aspell_config_replace(cfg, "lang", m_lang.c_str());
    aapi.aspell_config_replace(cfg, "encoding", "utf-8");
    aapi.aspell_config_replace(cfg, "master", dicPath().c_str());
    aapi.aspell_config_replace(cfg, "sug-mode", "fast");
    //    aapi.aspell_config_replace(cfg, "sug-edit-dist", "2");
    AspellCanHaveError *result = aapi.new_aspell_speller(cfg);
    // The configuration object is released as soon as the speller creation
    // was attempted.
    aapi.delete_aspell_config(cfg);

    if (aapi.aspell_error(result) == 0) {
        m_data->m_speller = aapi.to_aspell_speller(result);
        return true;
    }

    reason = aapi.aspell_error_message(result);
    aapi.delete_aspell_can_have_error(result);
    return false;
}

bool Aspell::check(const string &iterm, string& reason)
{
    LOGDEB("Aspell::check [" << iterm << "]\n");
    string mterm(iterm);

    if (!Rcl::Db::isSpellingCandidate(mterm)) {
        LOGDEB0("Aspell::check: [" << mterm <<
                " not spelling candidate, return true\n");
        return true;
    }
    if (!ok() || !make_speller(reason))
	return false;
    if (iterm.empty())
        return true; //??

    if (!o_index_stripchars) {
	string lower;
	if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
	    LOGERR("Aspell::check: cant lowercase input\n");
	    return false;
	}
	mterm.swap(lower);
    }

    int ret = aapi.aspell_speller_check(m_data->m_speller,
                                        mterm.c_str(), mterm.length());
    reason.clear();
    switch (ret) {
    case 0: return false;
    case 1: return true;
    default:
    case -1:
        reason.append("Aspell error: ");
        reason.append(aapi.aspell_speller_error_message(m_data->m_speller));
        return false;
    }
}

// Compute spelling suggestions for a term.
//
// @param db the index: only suggestions which exist as index terms are
//   kept.
// @param _term the input term.
// @param suggestions the retained suggestions are appended to this list
//   (it is not cleared first).
// @param reason receives the aspell error message if the suggest call
//   fails.
// @return false if no speller is available, if the term can't be
//   case-folded, or if the aspell call fails. true otherwise, including
//   when the term is empty or not a spelling candidate (nothing is added
//   to the list in these cases).
bool Aspell::suggest(Rcl::Db &db, const string &_term,
                     list<string>& suggestions, string& reason)
{
    LOGDEB("Aspell::suggest: term [" << _term << "]\n");
    if (!ok() || !make_speller(reason))
        return false;
    string mterm(_term);
    if (mterm.empty())
        return true; //??

    if (!Rcl::Db::isSpellingCandidate(mterm)) {
        LOGDEB0("Aspell::suggest: [" << mterm <<
                " not spelling candidate, return empty/true\n");
        return true;
    }

    // Case-fold the term if the index is not a stripped one.
    if (!o_index_stripchars) {
        string lower;
        if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
            LOGERR("Aspell::suggest: cant lowercase input\n");
            return false;
        }
        mterm.swap(lower);
    }

    const AspellWordList *wl =
        aapi.aspell_speller_suggest(m_data->m_speller,
                                    mterm.c_str(), mterm.length());
    if (wl == 0) {
        reason = aapi.aspell_speller_error_message(m_data->m_speller);
        return false;
    }
    AspellStringEnumeration *els = aapi.aspell_word_list_elements(wl);
    const char *word;
    while ((word = aapi.aspell_string_enumeration_next(els)) != 0) {
        LOGDEB0("Aspell::suggest: got [" << word << "]\n");
        // Check that the word exists in the index (we don't want
        // aspell computed stuff, only exact terms from the
        // dictionary).  We used to also check that it stems
        // differently from the base word but this is complicated
        // (stemming on/off + language), so we now leave this to the
        // caller.
        if (db.termExists(word))
            suggestions.push_back(word);
    }
    aapi.delete_aspell_string_enumeration(els);
    return true;
}

#endif // RCL_USE_ASPELL

#else // TEST_RCLASPELL test driver ->

#ifdef HAVE_CONFIG_H
#include "autoconfig.h"
#endif

#ifdef RCL_USE_ASPELL

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>

#include <iostream>
using namespace std;

#include "rclinit.h"
#include "rclconfig.h"
#include "rcldb.h"
#include "rclaspell.h"

// Program name as invoked (set from argv[0] by main()), for messages
static char *thisprog;
// Configuration object, created in main()
RclConfig *rclconfig;

// Description of the command line options
static char usage [] =
    " -b : build dictionary\n"
    " -s <term>: suggestions for term\n"
    " -c <term>: check term\n"
    "\n";

// Print the usage message on stderr, then exit with status 1.
static void Usage()
{
    fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
    exit(1);
}

// Bit mask of the options found on the command line (OPT_xx values)
static int     op_flags;
// NOTE(review): OPT_MOINS is not used by the code visible in this file.
#define OPT_MOINS 0x1
// -s <term>: print suggestions for term
#define OPT_s	  0x2
// -b: build the dictionary
#define OPT_b	  0x4
// -c <term>: check term
#define OPT_c     0x8

// Test driver: parse the options, read the configuration, open the index
// read-only, initialize the aspell interface, then perform the requested
// operation: build the dictionary (-b), check a term (-c term) or print
// suggestions for a term (-s term). When several operations are requested,
// only the first one in the order -b, -c, -s is performed.
int main(int argc, char **argv)
{
    string word;

    thisprog = argv[0];
    argc--; argv++;

    // Process the arguments which begin with '-'. Option letters may be
    // grouped in one argument. -c and -s use the next argument as their
    // value and end the processing of the current one.
    while (argc > 0 && **argv == '-') {
        (*argv)++;
        if (!(**argv))
            /* A lone "-" argument (the "adb - core" case) */
            Usage();
        while (**argv)
            switch (*(*argv)++) {
            case 'b':   op_flags |= OPT_b; break;
            case 'c':   op_flags |= OPT_c; if (argc < 2)  Usage();
                word = *(++argv);
                argc--;
                goto b1;
            case 's':   op_flags |= OPT_s; if (argc < 2)  Usage();
                word = *(++argv);
                argc--;
                goto b1;
            default: Usage();   break;
            }
    b1: argc--; argv++;
    }

    // No positional arguments are accepted and at least one option is
    // needed.
    if (argc != 0 || op_flags == 0)
        Usage();

    string reason;
    rclconfig = recollinit(0, 0, reason);
    if (!rclconfig || !rclconfig->ok()) {
        fprintf(stderr, "Configuration problem: %s\n", reason.c_str());
        exit(1);
    }

    string dbdir = rclconfig->getDbDir();
    if (dbdir.empty()) {
        // The message is newline-terminated like the other ones.
        fprintf(stderr, "No db directory in configuration\n");
        exit(1);
    }

    Rcl::Db rcldb(rclconfig);

    if (!rcldb.open(Rcl::Db::DbRO, 0)) {
        fprintf(stderr, "Could not open database in %s\n", dbdir.c_str());
        exit(1);
    }

    Aspell aspell(rclconfig);

    if (!aspell.init(reason)) {
        cerr << "Init failed: " << reason << endl;
        exit(1);
    }
    if (op_flags & OPT_b) {
        if (!aspell.buildDict(rcldb, reason)) {
            cerr << "buildDict failed: " << reason << endl;
            exit(1);
        }
    } else if (op_flags & OPT_c) {
        bool ret = aspell.check(word, reason);
        // A false return with a non-empty reason is an error, as opposed
        // to the word just not being in the dictionary.
        if (!ret && reason.size()) {
            cerr << "Aspell error: " << reason << endl;
            return 1;
        }
        cout << word;
        if (ret) {
            cout << " is in dictionary" << endl;
        } else {
            cout << " not in dictionary" << endl;
        }
    } else {
        list<string> suggs;
        if (!aspell.suggest(rcldb, word, suggs, reason)) {
            cerr << "suggest failed: " << reason << endl;
            exit(1);
        }
        cout << "Suggestions for " << word << ":" << endl;
        for (list<string>::iterator it = suggs.begin();
             it != suggs.end(); it++) {
            cout << *it << endl;
        }
    }
    exit(0);
}
#else
// Built without aspell support: there is nothing to test, just report
// failure.
int main(int, char **)
{
    return 1;
}
#endif // RCL_USE_ASPELL

#endif // TEST_RCLASPELL test driver