diff --git a/.gitignore b/.gitignore
index 920281f1..62a4b003 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@
 \#*
 libtool
 ptrans
+src/python/pychm/setup.py
 src/Makefile
 src/Makefile.in
 src/aclocal.m4
@@ -68,6 +69,8 @@ src/recollindex
 src/recollq
 src/sampleconf/rclmon.sh
 src/sampleconf/recoll.conf
+src/testmains/Makefile
+src/testmains/Makefile.in
 src/xadump
 stamp-h1
 tests/casediac/aspdict.en.rws
diff --git a/src/Makefile.am b/src/Makefile.am
index cf84533b..a0f13ee2 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,4 +1,12 @@
 
+# Conditionally enable building the small test drivers, but don't
+# distribute them, they are not generally useful
+if COND_TESTMAINS
+  MAYBE_TESTMAINS = testmains
+endif
+SUBDIRS = . $(MAYBE_TESTMAINS)
+DIST_SUBDIRS = .
+
 CXXFLAGS ?= @CXXFLAGS@
 LIBXAPIAN=@LIBXAPIAN@
 XAPIANCXXFLAGS=@XAPIANCXXFLAGS@
diff --git a/src/configure.ac b/src/configure.ac
index dc12c5e1..bee7ac84 100644
--- a/src/configure.ac
+++ b/src/configure.ac
@@ -220,6 +220,12 @@ if test X$idxthreadsEnabled = Xyes ; then
   AC_DEFINE(IDX_THREADS, 1, [Use multiple threads for indexing])
 fi
 
+AC_ARG_ENABLE(testmains,
+    AC_HELP_STRING([--enable-testmains],
+   [Enable building small test drivers. These are not unit tests.]),
+        buildtestmains=$enableval, buildtestmains=no)
+AM_CONDITIONAL([COND_TESTMAINS], [test "$buildtestmains" = yes])
+
 # Enable CamelCase word splitting. This is optional because it causes
 # problems with phrases: with camelcase enabled, "MySQL manual"
 # will be matched by "MySQL manual" and "my sql manual" but not
@@ -545,10 +551,8 @@ AC_SUBST(RCLLIBVERSION)
 AC_SUBST(XSLT_CFLAGS)
 AC_SUBST(XSLT_LINKADD)
 
-# All object files depend on localdefs which has the cc flags. Avoid
-# changing it unless necessary
-AC_CONFIG_FILES(Makefile)
-AC_CONFIG_FILES(python/recoll/setup.py)
-AC_CONFIG_FILES(python/pychm/setup.py)
+AC_CONFIG_FILES([Makefile testmains/Makefile
+                python/recoll/setup.py
+                python/pychm/setup.py])
 
 AC_OUTPUT
diff --git a/src/testmains/Makefile.am b/src/testmains/Makefile.am
new file mode 100644
index 00000000..4a1f8802
--- /dev/null
+++ b/src/testmains/Makefile.am
@@ -0,0 +1,45 @@
+# Small test drivers. This directory is only built when configure is run
+# with --enable-testmains, and it is not part of the distribution.
+CXXFLAGS ?= @CXXFLAGS@
+LIBXAPIAN=@LIBXAPIAN@
+XAPIANCXXFLAGS=@XAPIANCXXFLAGS@
+XSLT_CFLAGS=@XSLT_CFLAGS@
+XSLT_LINKADD=@XSLT_LINKADD@
+LIBICONV=@LIBICONV@
+INCICONV=@INCICONV@
+LIBFAM = @LIBFAM@
+RCLLIBVERSION=@RCLLIBVERSION@
+X_CFLAGS=@X_CFLAGS@
+X_PRE_LIBS=@X_PRE_LIBS@
+X_LIBS=@X_LIBS@
+X_EXTRA_LIBS=@X_EXTRA_LIBS@
+X_LIBX11=@X_LIBX11@
+DEFS=@DEFS@
+
+COMMONCPPFLAGS = -I. \
+    -I$(top_srcdir)/aspell \
+    -I$(top_srcdir)/bincimapmime \
+    -I$(top_srcdir)/common \
+    -I$(top_srcdir)/index \
+    -I$(top_srcdir)/internfile \
+    -I$(top_srcdir)/rcldb \
+    -I$(top_srcdir)/unac \
+    -I$(top_srcdir)/utils \
+    -I$(top_srcdir)/xaposix \
+    -DBUILDING_RECOLL
+
+AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
+    $(COMMONCPPFLAGS) \
+    $(INCICONV) \
+    $(XAPIANCXXFLAGS) \
+    $(XSLT_CFLAGS) \
+    $(X_CFLAGS) \
+    -DRECOLL_DATADIR=\"${pkgdatadir}\" \
+    -DREADFILE_ENABLE_ZLIB -DREADFILE_ENABLE_MINIZ -DREADFILE_ENABLE_MD5 \
+    -D_GNU_SOURCE \
+    $(DEFS)
+
+noinst_PROGRAMS = textsplit
+
+textsplit_SOURCES = trtextsplit.cpp
+textsplit_LDADD = ../librecoll.la
diff --git a/src/testmains/trtextsplit.cpp b/src/testmains/trtextsplit.cpp
new file mode 100644
index 00000000..243cebeb
--- /dev/null
+++ b/src/testmains/trtextsplit.cpp
@@ -0,0 +1,281 @@
+/* Small test driver for the text splitter. It prints the terms produced
+ * for a file, for standard input, or for a set of built-in sample strings.
+ * Only built when configure is run with --enable-testmains. */
+#include "autoconfig.h"
+
+#include "textsplit.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "readfile.h"
+#include "log.h"
+#include "transcode.h"
+#include "unacpp.h"
+#include "termproc.h"
+#include "rclutil.h"
+#include "rclconfig.h"
+
+using namespace std;
+
+// Last element of the term processing chain: prints every term it
+// receives, with its pos, bs and be values, as one table row on stdout.
+class myTermProc : public Rcl::TermProc {
+    int first;
+    bool nooutput;
+public:
+    myTermProc() : TermProc(0), first(1), nooutput(false) {}
+    // When set, takeword() accepts the terms but prints nothing.
+    void setNoOut(bool val) {nooutput = val;}
+    // Print one term. The column header is printed before the first one.
+    virtual bool takeword(const string &term, int pos, int bs, int be)
+    {
+        if (nooutput)
+            return true;
+        FILE *fp = stdout;
+        if (first) {
+            fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
+            first = 0;
+        }
+        fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
+        return true;
+    }
+};
+
+// Command line option bits, one per option letter.
+#define OPT_s 0x1
+#define OPT_w 0x2
+#define OPT_q 0x4
+#define OPT_c 0x8
+#define OPT_k 0x10
+#define OPT_C 0x20
+#define OPT_n 0x40
+#define OPT_S 0x80
+#define OPT_u 0x100
+#define OPT_p 0x200
+
+// Split data into terms and print them, unless OPT_q is set in op_flags.
+// With OPT_u the terms go through a TermProcPrep before being printed.
+// Always returns true.
+bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
+{
+    myTermProc printproc;
+
+    Rcl::TermProc *nxt = &printproc;
+
+//    Rcl::TermProcCommongrams commonproc(nxt, stoplist);
+//    if (op_flags & OPT_S)
+//        nxt = &commonproc;
+
+    Rcl::TermProcPrep preproc(nxt);
+    if (op_flags & OPT_u)
+        nxt = &preproc;
+
+    Rcl::TextSplitP splitter(nxt, flags);
+
+    if (op_flags & OPT_q)
+        printproc.setNoOut(true);
+
+    splitter.text_to_words(data);
+
+#ifdef TEXTSPLIT_STATS
+    TextSplit::Stats::Values v = splitter.getStats();
+    cout << "Average length: "
+         << v.avglen
+         << " Standard deviation: "
+         << v.sigma
+         << " Coef of variation "
+         << v.sigma / v.avglen
+         << endl;
+#endif
+    return true;
+}
+
+// Sample inputs split when no file name is given on the command line.
+static const char *teststrings[] = {
+    "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n",
+    "\"Jean-Francois Dockes\" \n",
+    "n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'",
+    "_network_ some_span",
+    "data123\n",
+    "134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n",
+    "@^#$(#$(*)\n",
+    "192.168.4.1 one\n\rtwo\r",
+    "[olala][ululu] (valeur) (23)\n",
+    "utf-8 ucs-4© \\nodef\n",
+    "A b C 2 . +",
+    "','this\n",
+    " ,able,test-domain",
+    " -wl,--export-dynamic",
+    " ~/.xsession-errors",
+    "this_very_long_span_this_very_long_span_this_very_long_span",
+    "soft\xc2\xadhyphen",
+    "soft\xc2\xad\nhyphen",
+    "soft\xc2\xad\n\rhyphen",
+    "real\xe2\x80\x90hyphen",
+    "real\xe2\x80\x90\nhyphen",
+    "hyphen-\nminus",
+};
+const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
+
+static string teststring1 = " nouvel-an ";
+
+static string thisprog;
+
+static string usage =
+    " textsplit [opts] [filename]\n"
+    " -q : no output\n"
+    " -s : only spans\n"
+    " -w : only words\n"
+    " -n : no numbers\n"
+    " -k : preserve wildcards (?*)\n"
+    " -c : just count words\n"
+    " -u : use unac\n"
+    " -C [charset] : input charset\n"
+    " -S [stopfile] : stopfile to use for commongrams\n"
+    " if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
+    " textsplit -p somephrase : display results from stringToStrings()\n"
+    " \n"
+    ;
+
+// Print the usage message on stderr and exit with status 1.
+static void
+Usage(void)
+{
+    cerr << thisprog  << ": usage:\n" << usage;
+    exit(1);
+}
+
+static int        op_flags;
+
+// Parse the options, build a temporary configuration for the splitter,
+// then split the named file, stdin, or the built-in test strings.
+int main(int argc, char **argv)
+{
+    string charset, stopfile;
+
+    thisprog = argv[0];
+    argc--; argv++;
+
+    while (argc > 0 && **argv == '-') {
+        (*argv)++;
+        if (!(**argv))
+            /* A lone "-" argument, as in "adb - core" */
+            Usage();
+        while (**argv)
+            switch (*(*argv)++) {
+            case 'c':   op_flags |= OPT_c; break;
+            case 'C':   op_flags |= OPT_C; if (argc < 2)  Usage();
+                charset = *(++argv); argc--;
+                goto b1;
+            case 'k':   op_flags |= OPT_k; break;
+            case 'n':   op_flags |= OPT_n; break;
+            case 'p':   op_flags |= OPT_p; break;
+            case 'q':   op_flags |= OPT_q; break;
+            case 's':   op_flags |= OPT_s; break;
+            case 'S':   op_flags |= OPT_S; if (argc < 2)  Usage();
+                stopfile = *(++argv); argc--;
+                goto b1;
+            case 'u':   op_flags |= OPT_u; break;
+            case 'w':   op_flags |= OPT_w; break;
+            default: Usage();   break;
+            }
+    b1: argc--; argv++;
+    }
+
+    TextSplit::Flags flags = TextSplit::TXTS_NONE;
+
+    if (op_flags&OPT_s)
+        flags = TextSplit::TXTS_ONLYSPANS;
+    else if (op_flags&OPT_w)
+        flags = TextSplit::TXTS_NOSPANS;
+    if (op_flags & OPT_k)
+        flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
+
+    // We need a configuration file, which we build in a temp file
+    TempFile tmpconf("conf");
+    string cffn(tmpconf.filename());
+    FILE *fp = fopen(tmpconf.filename(), "w");
+    if (fp == nullptr) {
+        // Without this check the fprintf()/fclose() below would be
+        // called with a null stream.
+        cerr << "Can't create configuration file: " << cffn << endl;
+        exit(1);
+    }
+    if (op_flags & OPT_n) {
+        fprintf(fp, "nonumbers = 1");
+    }
+    fclose(fp);
+
+    // Note: config is never deleted; the program exits shortly after
+    // the splitting is done.
+    RclConfig *config = new RclConfig(&cffn);
+    TextSplit::staticConfInit(config);
+
+    // The stop list is loaded but not used further: the commongrams
+    // processing which would need it is commented out in dosplit().
+    Rcl::StopList stoplist;
+    if (op_flags & OPT_S) {
+        if (!stoplist.setFile(stopfile)) {
+            cerr << "Can't read stopfile: " << stopfile << endl;
+            exit(1);
+        }
+    }
+    string odata, reason;
+    if (argc == 1) {
+        const char *filename = *argv++;     argc--;
+        if (op_flags& OPT_p) {
+            // -p: the argument is the phrase itself, not a file name
+            vector<string> tokens;
+            TextSplit::stringToStrings(filename, tokens);
+            for (const auto& token : tokens) {
+                cout << "[" << token << "] ";
+            }
+            cout << endl;
+            exit(0);
+        }
+        if (!strcmp(filename, "stdin")) {
+            char buf[1024];
+            ssize_t nread;
+            while ((nread = read(0, buf, 1024)) > 0) {
+                odata.append(buf, nread);
+            }
+        } else if (!file_to_string(filename, odata, &reason)) {
+            cerr << "Failed: file_to_string(" << filename << ") failed: "
+                 << reason << endl;
+            exit(1);
+        }
+    } else {
+        if (op_flags & OPT_p)
+            Usage();
+        for (int i = 0; i < teststrings_cnt; i++) {
+            cout << endl << teststrings[i] << endl;
+            dosplit(teststrings[i], flags, op_flags);
+        }
+        exit(0);
+    }
+
+    // Convert the input to UTF-8 if an input charset was given with -C
+    string ndata;
+    if ((op_flags & OPT_C)) {
+        if (!transcode(odata, ndata, charset, "UTF-8")) {
+            cerr << "Failed: transcode error" << endl;
+            exit(1);
+        }
+    }
+    const string& data = (op_flags & OPT_C) ? ndata : odata;
+
+    if (op_flags & OPT_c) {
+        int n = TextSplit::countWords(data, flags);
+        cout << n << " words" << endl;
+    } else {
+        dosplit(data, flags, op_flags);
+    }
+    return 0;
+}