setup directory for small test and trials programs
This commit is contained in:
parent
bbeaebf632
commit
2c337caf94
3
.gitignore
vendored
3
.gitignore
vendored
@ -20,6 +20,7 @@
|
||||
\#*
|
||||
libtool
|
||||
ptrans
|
||||
src/python/pychm/setup.py
|
||||
src/Makefile
|
||||
src/Makefile.in
|
||||
src/aclocal.m4
|
||||
@ -68,6 +69,8 @@ src/recollindex
|
||||
src/recollq
|
||||
src/sampleconf/rclmon.sh
|
||||
src/sampleconf/recoll.conf
|
||||
src/testmains/Makefile
|
||||
src/testmains/Makefile.in
|
||||
src/xadump
|
||||
stamp-h1
|
||||
tests/casediac/aspdict.en.rws
|
||||
|
||||
@ -1,4 +1,12 @@
|
||||
|
||||
# Conditionally enable building the small test drivers, but don't
|
||||
# distribute them, they are not generally useful
|
||||
if COND_TESTMAINS
|
||||
MAYBE_TESTMAINS = testmains
|
||||
endif
|
||||
SUBDIRS = . $(MAYBE_TESTMAINS)
|
||||
DIST_SUBDIRS = .
|
||||
|
||||
CXXFLAGS ?= @CXXFLAGS@
|
||||
LIBXAPIAN=@LIBXAPIAN@
|
||||
XAPIANCXXFLAGS=@XAPIANCXXFLAGS@
|
||||
|
||||
@ -220,6 +220,12 @@ if test X$idxthreadsEnabled = Xyes ; then
|
||||
AC_DEFINE(IDX_THREADS, 1, [Use multiple threads for indexing])
|
||||
fi
|
||||
|
||||
AC_ARG_ENABLE(testmains,
|
||||
AC_HELP_STRING([--enable-testmains],
|
||||
[Enable building small test drivers. These are not unit tests.]),
|
||||
buildtestmains=$enableval, buildtestmains=no)
|
||||
AM_CONDITIONAL([COND_TESTMAINS], [test "$buildtestmains" = yes])
|
||||
|
||||
# Enable CamelCase word splitting. This is optional because it causes
|
||||
# problems with phrases: with camelcase enabled, "MySQL manual"
|
||||
# will be matched by "MySQL manual" and "my sql manual" but not
|
||||
@ -545,10 +551,8 @@ AC_SUBST(RCLLIBVERSION)
|
||||
AC_SUBST(XSLT_CFLAGS)
|
||||
AC_SUBST(XSLT_LINKADD)
|
||||
|
||||
# All object files depend on localdefs which has the cc flags. Avoid
|
||||
# changing it unless necessary
|
||||
AC_CONFIG_FILES(Makefile)
|
||||
AC_CONFIG_FILES(python/recoll/setup.py)
|
||||
AC_CONFIG_FILES(python/pychm/setup.py)
|
||||
AC_CONFIG_FILES([Makefile testmains/Makefile
|
||||
python/recoll/setup.py
|
||||
python/pychm/setup.py])
|
||||
|
||||
AC_OUTPUT
|
||||
|
||||
43
src/testmains/Makefile.am
Normal file
43
src/testmains/Makefile.am
Normal file
@ -0,0 +1,43 @@
|
||||
CXXFLAGS ?= @CXXFLAGS@
|
||||
LIBXAPIAN=@LIBXAPIAN@
|
||||
XAPIANCXXFLAGS=@XAPIANCXXFLAGS@
|
||||
XSLT_CFLAGS=@XSLT_CFLAGS@
|
||||
XSLT_LINKADD=@XSLT_LINKADD@
|
||||
LIBICONV=@LIBICONV@
|
||||
INCICONV=@INCICONV@
|
||||
LIBFAM = @LIBFAM@
|
||||
RCLLIBVERSION=@RCLLIBVERSION@
|
||||
X_CFLAGS=@X_CFLAGS@
|
||||
X_PRE_LIBS=@X_PRE_LIBS@
|
||||
X_LIBS=@X_LIBS@
|
||||
X_EXTRA_LIBS=@X_EXTRA_LIBS@
|
||||
X_LIBX11=@X_LIBX11@
|
||||
DEFS=@DEFS@
|
||||
|
||||
COMMONCPPFLAGS = -I. \
|
||||
-I$(top_srcdir)/aspell \
|
||||
-I$(top_srcdir)/bincimapmime \
|
||||
-I$(top_srcdir)/common \
|
||||
-I$(top_srcdir)/index \
|
||||
-I$(top_srcdir)/internfile \
|
||||
-I$(top_srcdir)/rcldb \
|
||||
-I$(top_srcdir)/unac \
|
||||
-I$(top_srcdir)/utils \
|
||||
-I$(top_srcdir)/xaposix \
|
||||
-DBUILDING_RECOLL
|
||||
|
||||
AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
|
||||
$(COMMONCPPFLAGS) \
|
||||
$(INCICONV) \
|
||||
$(XAPIANCXXFLAGS) \
|
||||
$(XSLT_CFLAGS) \
|
||||
$(X_CFLAGS) \
|
||||
-DRECOLL_DATADIR=\"${pkgdatadir}\" \
|
||||
-DREADFILE_ENABLE_ZLIB -DREADFILE_ENABLE_MINIZ -DREADFILE_ENABLE_MD5 \
|
||||
-D_GNU_SOURCE \
|
||||
$(DEFS)
|
||||
|
||||
noinst_PROGRAMS = textsplit
|
||||
|
||||
textsplit_SOURCES = trtextsplit.cpp
|
||||
textsplit_LDADD = ../librecoll.la
|
||||
259
src/testmains/trtextsplit.cpp
Normal file
259
src/testmains/trtextsplit.cpp
Normal file
@ -0,0 +1,259 @@
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "textsplit.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "readfile.h"
|
||||
#include "log.h"
|
||||
#include "transcode.h"
|
||||
#include "unacpp.h"
|
||||
#include "termproc.h"
|
||||
#include "rclutil.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
class myTermProc : public Rcl::TermProc {
|
||||
int first;
|
||||
bool nooutput;
|
||||
public:
|
||||
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
||||
void setNoOut(bool val) {nooutput = val;}
|
||||
virtual bool takeword(const string &term, int pos, int bs, int be)
|
||||
{
|
||||
if (nooutput)
|
||||
return true;
|
||||
FILE *fp = stdout;
|
||||
if (first) {
|
||||
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
||||
first = 0;
|
||||
}
|
||||
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
#define OPT_s 0x1
|
||||
#define OPT_w 0x2
|
||||
#define OPT_q 0x4
|
||||
#define OPT_c 0x8
|
||||
#define OPT_k 0x10
|
||||
#define OPT_C 0x20
|
||||
#define OPT_n 0x40
|
||||
#define OPT_S 0x80
|
||||
#define OPT_u 0x100
|
||||
#define OPT_p 0x200
|
||||
|
||||
bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
||||
{
|
||||
myTermProc printproc;
|
||||
|
||||
Rcl::TermProc *nxt = &printproc;
|
||||
|
||||
// Rcl::TermProcCommongrams commonproc(nxt, stoplist);
|
||||
// if (op_flags & OPT_S)
|
||||
// nxt = &commonproc;
|
||||
|
||||
Rcl::TermProcPrep preproc(nxt);
|
||||
if (op_flags & OPT_u)
|
||||
nxt = &preproc;
|
||||
|
||||
Rcl::TextSplitP splitter(nxt, flags);
|
||||
|
||||
if (op_flags & OPT_q)
|
||||
printproc.setNoOut(true);
|
||||
|
||||
splitter.text_to_words(data);
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
TextSplit::Stats::Values v = splitter.getStats();
|
||||
cout << "Average length: "
|
||||
<< v.avglen
|
||||
<< " Standard deviation: "
|
||||
<< v.sigma
|
||||
<< " Coef of variation "
|
||||
<< v.sigma / v.avglen
|
||||
<< endl;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char *teststrings[] = {
|
||||
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n",
|
||||
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n",
|
||||
"n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'",
|
||||
"_network_ some_span",
|
||||
"data123\n",
|
||||
"134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n",
|
||||
"@^#$(#$(*)\n",
|
||||
"192.168.4.1 one\n\rtwo\r",
|
||||
"[olala][ululu] (valeur) (23)\n",
|
||||
"utf-8 ucs-4© \\nodef\n",
|
||||
"A b C 2 . +",
|
||||
"','this\n",
|
||||
" ,able,test-domain",
|
||||
" -wl,--export-dynamic",
|
||||
" ~/.xsession-errors",
|
||||
"this_very_long_span_this_very_long_span_this_very_long_span",
|
||||
"soft\xc2\xadhyphen",
|
||||
"soft\xc2\xad\nhyphen",
|
||||
"soft\xc2\xad\n\rhyphen",
|
||||
"real\xe2\x80\x90hyphen",
|
||||
"real\xe2\x80\x90\nhyphen",
|
||||
"hyphen-\nminus",
|
||||
};
|
||||
const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
|
||||
|
||||
static string teststring1 = " nouvel-an ";
|
||||
|
||||
static string thisprog;
|
||||
|
||||
static string usage =
|
||||
" textsplit [opts] [filename]\n"
|
||||
" -q : no output\n"
|
||||
" -s : only spans\n"
|
||||
" -w : only words\n"
|
||||
" -n : no numbers\n"
|
||||
" -k : preserve wildcards (?*)\n"
|
||||
" -c : just count words\n"
|
||||
" -u : use unac\n"
|
||||
" -C [charset] : input charset\n"
|
||||
" -S [stopfile] : stopfile to use for commongrams\n"
|
||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
|
||||
" textplit -p somephrase : display results from stringToStrings()\n"
|
||||
" \n"
|
||||
;
|
||||
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
cerr << thisprog << ": usage:\n" << usage;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static int op_flags;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
string charset, stopfile;
|
||||
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
while (argc > 0 && **argv == '-') {
|
||||
(*argv)++;
|
||||
if (!(**argv))
|
||||
/* Cas du "adb - core" */
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'c': op_flags |= OPT_c; break;
|
||||
case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
|
||||
charset = *(++argv); argc--;
|
||||
goto b1;
|
||||
case 'k': op_flags |= OPT_k; break;
|
||||
case 'n': op_flags |= OPT_n; break;
|
||||
case 'p': op_flags |= OPT_p; break;
|
||||
case 'q': op_flags |= OPT_q; break;
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
|
||||
stopfile = *(++argv); argc--;
|
||||
goto b1;
|
||||
case 'u': op_flags |= OPT_u; break;
|
||||
case 'w': op_flags |= OPT_w; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
b1: argc--; argv++;
|
||||
}
|
||||
|
||||
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
||||
|
||||
if (op_flags&OPT_s)
|
||||
flags = TextSplit::TXTS_ONLYSPANS;
|
||||
else if (op_flags&OPT_w)
|
||||
flags = TextSplit::TXTS_NOSPANS;
|
||||
if (op_flags & OPT_k)
|
||||
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
||||
|
||||
|
||||
// We need a configuration file, which we build in a temp file
|
||||
TempFile tmpconf("conf");
|
||||
string cffn(tmpconf.filename());
|
||||
FILE *fp = fopen(tmpconf.filename(), "w");
|
||||
if (op_flags & OPT_n) {
|
||||
fprintf(fp, "nonumbers = 1");
|
||||
}
|
||||
fclose(fp);
|
||||
|
||||
RclConfig *config = new RclConfig(&cffn);
|
||||
TextSplit::staticConfInit(config);
|
||||
|
||||
|
||||
Rcl::StopList stoplist;
|
||||
if (op_flags & OPT_S) {
|
||||
if (!stoplist.setFile(stopfile)) {
|
||||
cerr << "Can't read stopfile: " << stopfile << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
string odata, reason;
|
||||
if (argc == 1) {
|
||||
const char *filename = *argv++; argc--;
|
||||
if (op_flags& OPT_p) {
|
||||
vector<string> tokens;
|
||||
TextSplit::stringToStrings(filename, tokens);
|
||||
for (vector<string>::const_iterator it = tokens.begin();
|
||||
it != tokens.end(); it++) {
|
||||
cout << "[" << *it << "] ";
|
||||
}
|
||||
cout << endl;
|
||||
exit(0);
|
||||
}
|
||||
if (!strcmp(filename, "stdin")) {
|
||||
char buf[1024];
|
||||
int nread;
|
||||
while ((nread = read(0, buf, 1024)) > 0) {
|
||||
odata.append(buf, nread);
|
||||
}
|
||||
} else if (!file_to_string(filename, odata, &reason)) {
|
||||
cerr << "Failed: file_to_string(" << filename << ") failed: "
|
||||
<< reason << endl;
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
if (op_flags & OPT_p)
|
||||
Usage();
|
||||
for (int i = 0; i < teststrings_cnt; i++) {
|
||||
cout << endl << teststrings[i] << endl;
|
||||
dosplit(teststrings[i], flags, op_flags);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
|
||||
string& data = odata;
|
||||
string ndata;
|
||||
if ((op_flags & OPT_C)) {
|
||||
if (!transcode(odata, ndata, charset, "UTF-8")) {
|
||||
cerr << "Failed: transcode error" << endl;
|
||||
exit(1);
|
||||
} else {
|
||||
data = ndata;
|
||||
}
|
||||
}
|
||||
|
||||
if (op_flags & OPT_c) {
|
||||
int n = TextSplit::countWords(data, flags);
|
||||
cout << n << " words" << endl;
|
||||
} else {
|
||||
dosplit(data, flags, op_flags);
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user