From 60e9949663e427520e470f9cd0e97bc18440f4fb Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 6 May 2020 15:27:27 +0200 Subject: [PATCH] texsplit test driver: add options for korean tagger --- src/testmains/trtextsplit.cpp | 75 ++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/src/testmains/trtextsplit.cpp b/src/testmains/trtextsplit.cpp index e5cf027a..730d2011 100644 --- a/src/testmains/trtextsplit.cpp +++ b/src/testmains/trtextsplit.cpp @@ -41,19 +41,22 @@ using namespace std; -#define OPT_s 0x1 -#define OPT_w 0x2 -#define OPT_q 0x4 -#define OPT_c 0x8 -#define OPT_k 0x10 -#define OPT_C 0x20 -#define OPT_n 0x40 -#define OPT_S 0x80 -#define OPT_u 0x100 -#define OPT_p 0x200 -#define OPT_I 0x400 -#define OPT_d 0x800 -#define OPT_l 0x1000 +#define OPT_C 0x1 +#define OPT_c 0x2 +#define OPT_d 0x4 +#define OPT_I 0x8 +#define OPT_k 0x10 +#define OPT_l 0x20 +#define OPT_L 0x40 +#define OPT_n 0x80 +#define OPT_p 0x100 +#define OPT_q 0x200 +#define OPT_S 0x400 +#define OPT_s 0x800 +#define OPT_t 0x1000 +#define OPT_u 0x2000 +#define OPT_w 0x4000 + static string thisprog; @@ -68,12 +71,14 @@ static string usage = " -k : preserve wildcards (?*)\n" " -c : just count words\n" " -u : use unac\n" + " -t [tagger] : korean tagger name (Mecab/Okt/Komoran)\n" " -C [charset] : input charset\n" " -S [stopfile] : stopfile to use for commongrams\n" " -l : set max term length (bytes)\n" - " if filename is 'stdin', will read stdin for data (end with ^D)\n\n" + " -L : set max term length (bytes)\n" " -p somephrase : display results from stringToStrings()\n" - " \n" + "\n" + " if filename is 'stdin', will read stdin for data (end with ^D)\n" ; static void @@ -155,14 +160,14 @@ bool dosplit(const string& data, TextSplit::Flags flags, int op_flags) } #ifdef TEXTSPLIT_STATS - TextSplit::Stats::Values v = splitter.getStats(); - cout << "Average length: " - << v.avglen - << " Standard deviation: " - << v.sigma - << " Coef of 
variation " - << v.sigma / v.avglen - << endl; + TextSplit::Stats::Values v = splitter.getStats(); + cout << "Average length: " + << v.avglen + << " Standard deviation: " + << v.sigma + << " Coef of variation " + << v.sigma / v.avglen + << endl; #endif return true; } @@ -197,7 +202,8 @@ static string teststring1 = " nouvel-an "; int main(int argc, char **argv) { - string charset, stopfile; + string charset, stopfile, kotagger; + int loglevel = 4; int maxtermlen{-1}; thisprog = argv[0]; @@ -212,21 +218,22 @@ int main(int argc, char **argv) switch (*(*argv)++) { case 'c': op_flags |= OPT_c; break; case 'C': op_flags |= OPT_C; if (argc < 2) Usage(); - charset = *(++argv); argc--; - goto b1; + charset = *(++argv); argc--; goto b1; case 'd': op_flags |= OPT_d|OPT_q; break; case 'I': op_flags |= OPT_I; break; case 'k': op_flags |= OPT_k; break; case 'l': op_flags |= OPT_l; if (argc < 2) Usage(); - maxtermlen = atoi(*(++argv)); argc--; - goto b1; + maxtermlen = atoi(*(++argv)); argc--; goto b1; + case 'L': op_flags |= OPT_L; if (argc < 2) Usage(); + loglevel = atoi(*(++argv)); argc--; goto b1; case 'n': op_flags |= OPT_n; break; case 'p': op_flags |= OPT_p; break; case 'q': op_flags |= OPT_q; break; case 's': op_flags |= OPT_s; break; case 'S': op_flags |= OPT_S; if (argc < 2) Usage(); - stopfile = *(++argv); argc--; - goto b1; + stopfile = *(++argv); argc--; goto b1; + case 't': op_flags |= OPT_t; if (argc < 2) Usage(); + kotagger = *(++argv); argc--; goto b1; case 'u': op_flags |= OPT_u; break; case 'w': op_flags |= OPT_w; break; default: Usage(); break; @@ -248,20 +255,26 @@ int main(int argc, char **argv) TempDir tmpconf; string cffn(path_cat(tmpconf.dirname(), "recoll.conf")); FILE *fp = fopen(cffn.c_str(), "w"); + fprintf(fp, "loglevel = %d\n", loglevel); if (op_flags & OPT_n) { fprintf(fp, "nonumbers = 1\n"); } if (op_flags & OPT_l) { fprintf(fp, "maxtermlength = %d\n", maxtermlen); } + if (!kotagger.empty()) { + fprintf(fp, "hangultagger = %s\n", 
kotagger.c_str()); + } fclose(fp); + Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel)); + string dn(tmpconf.dirname()); RclConfig *config = new RclConfig(&dn); if (!config->ok()) { cerr << "Could not build configuration: " << config->getReason() <<endl; exit(1); } - Logger::getTheLog("")->setLogLevel(Logger::LLDEB0); + TextSplit::staticConfInit(config); LOGDEB("Trtextsplit starting up\n");