trtextsplit: add option for max term length

This commit is contained in:
Jean-Francois Dockes 2019-09-13 13:01:35 +02:00
parent 179be1a07c
commit 8d92b9debd

View File

@@ -53,6 +53,7 @@ using namespace std;
#define OPT_p 0x200 #define OPT_p 0x200
#define OPT_I 0x400 #define OPT_I 0x400
#define OPT_d 0x800 #define OPT_d 0x800
#define OPT_l 0x1000
static string thisprog; static string thisprog;
@@ -69,6 +70,7 @@ static string usage =
" -u : use unac\n" " -u : use unac\n"
" -C [charset] : input charset\n" " -C [charset] : input charset\n"
" -S [stopfile] : stopfile to use for commongrams\n" " -S [stopfile] : stopfile to use for commongrams\n"
" -l <maxtermlen> : set max term length (bytes)\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
" -p somephrase : display results from stringToStrings()\n" " -p somephrase : display results from stringToStrings()\n"
" \n" " \n"
@@ -196,6 +198,7 @@ static string teststring1 = " nouvel-an ";
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
string charset, stopfile; string charset, stopfile;
int maxtermlen{-1};
thisprog = argv[0]; thisprog = argv[0];
argc--; argv++; argc--; argv++;
@@ -214,6 +217,9 @@ int main(int argc, char **argv)
case 'd': op_flags |= OPT_d|OPT_q; break; case 'd': op_flags |= OPT_d|OPT_q; break;
case 'I': op_flags |= OPT_I; break; case 'I': op_flags |= OPT_I; break;
case 'k': op_flags |= OPT_k; break; case 'k': op_flags |= OPT_k; break;
case 'l': op_flags |= OPT_l; if (argc < 2) Usage();
maxtermlen = atoi(*(++argv)); argc--;
goto b1;
case 'n': op_flags |= OPT_n; break; case 'n': op_flags |= OPT_n; break;
case 'p': op_flags |= OPT_p; break; case 'p': op_flags |= OPT_p; break;
case 'q': op_flags |= OPT_q; break; case 'q': op_flags |= OPT_q; break;
@@ -239,18 +245,26 @@ int main(int argc, char **argv)
// We need a configuration file, which we build in a temp file // We need a configuration file, which we build in a temp file
TempFile tmpconf("conf"); TempDir tmpconf;
string cffn(tmpconf.filename()); string cffn(path_cat(tmpconf.dirname(), "recoll.conf"));
FILE *fp = fopen(tmpconf.filename(), "w"); FILE *fp = fopen(cffn.c_str(), "w");
if (op_flags & OPT_n) { if (op_flags & OPT_n) {
fprintf(fp, "nonumbers = 1"); fprintf(fp, "nonumbers = 1\n");
}
if (op_flags & OPT_l) {
fprintf(fp, "maxtermlength = %d\n", maxtermlen);
} }
fclose(fp); fclose(fp);
RclConfig *config = new RclConfig(&cffn); string dn(tmpconf.dirname());
TextSplit::staticConfInit(config); RclConfig *config = new RclConfig(&dn);
if (!config->ok()) {
cerr << "Could not build configuration: " << config->getReason() <<endl;
}
Logger::getTheLog("stderr")->setLogLevel(Logger::LLDEB0); Logger::getTheLog("stderr")->setLogLevel(Logger::LLDEB0);
TextSplit::staticConfInit(config);
LOGDEB("Trtextsplit starting up\n");
Rcl::StopList stoplist; Rcl::StopList stoplist;
if (op_flags & OPT_S) { if (op_flags & OPT_S) {
if (!stoplist.setFile(stopfile)) { if (!stoplist.setFile(stopfile)) {