textsplit test driver: add options for Korean tagger

This commit is contained in:
Jean-Francois Dockes 2020-05-06 15:27:27 +02:00
parent 026874a625
commit 60e9949663

View File

@ -41,19 +41,22 @@
using namespace std; using namespace std;
#define OPT_s 0x1 #define OPT_C 0x1
#define OPT_w 0x2 #define OPT_c 0x2
#define OPT_q 0x4 #define OPT_d 0x4
#define OPT_c 0x8 #define OPT_I 0x8
#define OPT_k 0x10 #define OPT_k 0x10
#define OPT_C 0x20 #define OPT_l 0x20
#define OPT_n 0x40 #define OPT_L 0x40
#define OPT_S 0x80 #define OPT_n 0x80
#define OPT_u 0x100 #define OPT_p 0x100
#define OPT_p 0x200 #define OPT_q 0x200
#define OPT_I 0x400 #define OPT_S 0x400
#define OPT_d 0x800 #define OPT_s 0x800
#define OPT_l 0x1000 #define OPT_t 0x1000
#define OPT_u 0x2000
#define OPT_w 0x4000
static string thisprog; static string thisprog;
@ -68,12 +71,14 @@ static string usage =
" -k : preserve wildcards (?*)\n" " -k : preserve wildcards (?*)\n"
" -c : just count words\n" " -c : just count words\n"
" -u : use unac\n" " -u : use unac\n"
" -t [tagger] : korean tagger name (Mecab/Okt/Komoran)\n"
" -C [charset] : input charset\n" " -C [charset] : input charset\n"
" -S [stopfile] : stopfile to use for commongrams\n" " -S [stopfile] : stopfile to use for commongrams\n"
" -l <maxtermlen> : set max term length (bytes)\n" " -l <maxtermlen> : set max term length (bytes)\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n\n" " -L <loglevel> : set max term length (bytes)\n"
" -p somephrase : display results from stringToStrings()\n" " -p somephrase : display results from stringToStrings()\n"
" \n" "\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
; ;
static void static void
@ -155,14 +160,14 @@ bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
} }
#ifdef TEXTSPLIT_STATS #ifdef TEXTSPLIT_STATS
TextSplit::Stats::Values v = splitter.getStats(); TextSplit::Stats::Values v = splitter.getStats();
cout << "Average length: " cout << "Average length: "
<< v.avglen << v.avglen
<< " Standard deviation: " << " Standard deviation: "
<< v.sigma << v.sigma
<< " Coef of variation " << " Coef of variation "
<< v.sigma / v.avglen << v.sigma / v.avglen
<< endl; << endl;
#endif #endif
return true; return true;
} }
@ -197,7 +202,8 @@ static string teststring1 = " nouvel-an ";
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
string charset, stopfile; string charset, stopfile, kotagger;
int loglevel = 4;
int maxtermlen{-1}; int maxtermlen{-1};
thisprog = argv[0]; thisprog = argv[0];
@ -212,21 +218,22 @@ int main(int argc, char **argv)
switch (*(*argv)++) { switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break; case 'c': op_flags |= OPT_c; break;
case 'C': op_flags |= OPT_C; if (argc < 2) Usage(); case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
charset = *(++argv); argc--; charset = *(++argv); argc--; goto b1;
goto b1;
case 'd': op_flags |= OPT_d|OPT_q; break; case 'd': op_flags |= OPT_d|OPT_q; break;
case 'I': op_flags |= OPT_I; break; case 'I': op_flags |= OPT_I; break;
case 'k': op_flags |= OPT_k; break; case 'k': op_flags |= OPT_k; break;
case 'l': op_flags |= OPT_l; if (argc < 2) Usage(); case 'l': op_flags |= OPT_l; if (argc < 2) Usage();
maxtermlen = atoi(*(++argv)); argc--; maxtermlen = atoi(*(++argv)); argc--; goto b1;
goto b1; case 'L': op_flags |= OPT_L; if (argc < 2) Usage();
loglevel = atoi(*(++argv)); argc--; goto b1;
case 'n': op_flags |= OPT_n; break; case 'n': op_flags |= OPT_n; break;
case 'p': op_flags |= OPT_p; break; case 'p': op_flags |= OPT_p; break;
case 'q': op_flags |= OPT_q; break; case 'q': op_flags |= OPT_q; break;
case 's': op_flags |= OPT_s; break; case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; if (argc < 2) Usage(); case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
stopfile = *(++argv); argc--; stopfile = *(++argv); argc--; goto b1;
goto b1; case 't': op_flags |= OPT_t; if (argc < 2) Usage();
kotagger = *(++argv); argc--; goto b1;
case 'u': op_flags |= OPT_u; break; case 'u': op_flags |= OPT_u; break;
case 'w': op_flags |= OPT_w; break; case 'w': op_flags |= OPT_w; break;
default: Usage(); break; default: Usage(); break;
@ -248,20 +255,26 @@ int main(int argc, char **argv)
TempDir tmpconf; TempDir tmpconf;
string cffn(path_cat(tmpconf.dirname(), "recoll.conf")); string cffn(path_cat(tmpconf.dirname(), "recoll.conf"));
FILE *fp = fopen(cffn.c_str(), "w"); FILE *fp = fopen(cffn.c_str(), "w");
fprintf(fp, "loglevel = %d\n", loglevel);
if (op_flags & OPT_n) { if (op_flags & OPT_n) {
fprintf(fp, "nonumbers = 1\n"); fprintf(fp, "nonumbers = 1\n");
} }
if (op_flags & OPT_l) { if (op_flags & OPT_l) {
fprintf(fp, "maxtermlength = %d\n", maxtermlen); fprintf(fp, "maxtermlength = %d\n", maxtermlen);
} }
if (!kotagger.empty()) {
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
}
fclose(fp); fclose(fp);
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));
string dn(tmpconf.dirname()); string dn(tmpconf.dirname());
RclConfig *config = new RclConfig(&dn); RclConfig *config = new RclConfig(&dn);
if (!config->ok()) { if (!config->ok()) {
cerr << "Could not build configuration: " << config->getReason() <<endl; cerr << "Could not build configuration: " << config->getReason() <<endl;
} }
Logger::getTheLog("stderr")->setLogLevel(Logger::LLDEB0);
TextSplit::staticConfInit(config); TextSplit::staticConfInit(config);
LOGDEB("Trtextsplit starting up\n"); LOGDEB("Trtextsplit starting up\n");