/* Copyright (C) 2017-2019 J.F.Dockes * * License: LGPL 2.1 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "autoconfig.h" #include "textsplit.h" #include #include #include #include #include #include #include #include #include "readfile.h" #include "log.h" #include "transcode.h" #include "unacpp.h" #include "termproc.h" #include "rclutil.h" #include "rclconfig.h" using namespace std; #define OPT_C 0x1 #define OPT_c 0x2 #define OPT_d 0x4 #define OPT_I 0x8 #define OPT_k 0x10 #define OPT_l 0x20 #define OPT_L 0x40 #define OPT_n 0x80 #define OPT_p 0x100 #define OPT_q 0x200 #define OPT_S 0x400 #define OPT_s 0x800 #define OPT_t 0x1000 #define OPT_u 0x2000 #define OPT_w 0x4000 static string thisprog; static string usage = " textsplit [opts] [filename]\n" " -I : use internal data. Else read filename or stdin if no param.\n" " -q : no output\n" " -d : print position and byte lists for input to hldata\n" " -s : only spans\n" " -w : only words\n" " -n : no numbers\n" " -k : preserve wildcards (?*)\n" " -c : just count words\n" " -u : use unac\n" " -t [tagger] : korean tagger name (Mecab/Okt/Komoran)\n" " -C [charset] : input charset\n" " -S [stopfile] : stopfile to use for commongrams\n" " -l : set max term length (bytes)\n" " -L : set max term length (bytes)\n" " -p somephrase : display results from stringToStrings()\n" "\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" ; static void Usage(void) { cerr << thisprog << ": usage:\n" << usage; exit(1); } static int op_flags; class myTermProc : public Rcl::TermProc { int first; bool nooutput; public: myTermProc() : TermProc(0), first(1), nooutput(false) {} void setNoOut(bool val) {nooutput = val;} virtual bool takeword(const string &term, int pos, int bs, int be) { m_plists[term].push_back(pos); m_gpostobytes[pos] = pair(bs, be); if (nooutput) return true; FILE *fp = stdout; if (first) { fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); first = 0; } fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); return true; } void printpos() { cout << "{"; for (const auto& lst : m_plists) { cout << "{\"" << lst.first << "\", {"; for (int pos : lst.second) { cout << pos << ","; } cout << "}}, "; } cout << "};\n"; cout << "{"; for (const auto& ent : m_gpostobytes) { cout << "{" << ent.first << ", {"; cout << ent.second.first << ", " << ent.second.second << "}}, "; } cout << "};\n"; } private: // group/near terms word positions. map > m_plists; map > m_gpostobytes; }; bool dosplit(const string& data, TextSplit::Flags flags, int op_flags) { myTermProc printproc; Rcl::TermProc *nxt = &printproc; // Rcl::TermProcCommongrams commonproc(nxt, stoplist); // if (op_flags & OPT_S) // nxt = &commonproc; Rcl::TermProcPrep preproc(nxt); if (op_flags & OPT_u) nxt = &preproc; Rcl::TextSplitP splitter(nxt, flags); if (op_flags & OPT_q) printproc.setNoOut(true); splitter.text_to_words(data); if (op_flags & OPT_d) { printproc.printpos(); } #ifdef TEXTSPLIT_STATS TextSplit::Stats::Values v = splitter.getStats(); cout << "Average length: " << v.avglen << " Standard deviation: " << v.sigma << " Coef of variation " << v.sigma / v.avglen << endl; #endif return true; } static const char *teststrings[] = { "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n", "\"Jean-Francois Dockes\" \n", "n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'", "_network_ some_span", "data123\n", "134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n", "@^#$(#$(*)\n", "192.168.4.1 one\n\rtwo\r", "[olala][ululu] (valeur) (23)\n", "utf-8 ucs-4© \\nodef\n", "A b C 2 . +", "','this\n", " ,able,test-domain", " -wl,--export-dynamic", " ~/.xsession-errors", "this_very_long_span_this_very_long_span_this_very_long_span", "soft\xc2\xadhyphen", "soft\xc2\xad\nhyphen", "soft\xc2\xad\n\rhyphen", "real\xe2\x80\x90hyphen", "real\xe2\x80\x90\nhyphen", "hyphen-\nminus", }; const int teststrings_cnt = sizeof(teststrings)/sizeof(char *); static string teststring1 = " nouvel-an "; int main(int argc, char **argv) { string charset, stopfile, kotagger; int loglevel = 4; int maxtermlen{-1}; thisprog = argv[0]; argc--; argv++; while (argc > 0 && **argv == '-') { (*argv)++; if (!(**argv)) /* Cas du "adb - core" */ Usage(); while (**argv) switch (*(*argv)++) { case 'c': op_flags |= OPT_c; break; case 'C': op_flags |= OPT_C; if (argc < 2) Usage(); charset = *(++argv); argc--; goto b1; case 'd': op_flags |= OPT_d|OPT_q; break; case 'I': op_flags |= OPT_I; break; case 'k': op_flags |= OPT_k; break; case 'l': op_flags |= OPT_l; if (argc < 2) Usage(); maxtermlen = atoi(*(++argv)); argc--; goto b1; case 'L': op_flags |= OPT_L; if (argc < 2) Usage(); loglevel = atoi(*(++argv)); argc--; goto b1; case 'n': op_flags |= OPT_n; break; case 'p': op_flags |= OPT_p; break; case 'q': op_flags |= OPT_q; break; case 's': op_flags |= OPT_s; break; case 'S': op_flags |= OPT_S; if (argc < 2) Usage(); stopfile = *(++argv); argc--; goto b1; case 't': op_flags |= OPT_t; if (argc < 2) Usage(); kotagger = *(++argv); argc--; goto b1; case 'u': op_flags |= OPT_u; break; case 'w': op_flags |= OPT_w; break; default: Usage(); break; } b1: argc--; argv++; } TextSplit::Flags flags = TextSplit::TXTS_NONE; if (op_flags&OPT_s) flags = TextSplit::TXTS_ONLYSPANS; else if (op_flags&OPT_w) flags = TextSplit::TXTS_NOSPANS; if (op_flags & OPT_k) flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); // We need a configuration file, which we build in a temp file TempDir tmpconf; string cffn(path_cat(tmpconf.dirname(), "recoll.conf")); FILE *fp = fopen(cffn.c_str(), "w"); fprintf(fp, "loglevel = %d\n", loglevel); if (op_flags & OPT_n) { fprintf(fp, "nonumbers = 1\n"); } if (op_flags & OPT_l) { fprintf(fp, "maxtermlength = %d\n", maxtermlen); } if (!kotagger.empty()) { fprintf(fp, "hangultagger = %s\n", kotagger.c_str()); } fprintf(fp, "underscoreasletter = 0\n"); fclose(fp); Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel)); string dn(tmpconf.dirname()); RclConfig *config = new RclConfig(&dn); if (!config->ok()) { cerr << "Could not build configuration: " << config->getReason() < tokens; TextSplit::stringToStrings(argv[0], tokens); for (vector::const_iterator it = tokens.begin(); it != tokens.end(); it++) { cout << "[" << *it << "] "; } cout << endl; exit(0); } string odata, reason; if (argc == 1) { const char *filename = *argv++; argc--; if (!file_to_string(filename, odata, &reason)) { cerr << "Failed: file_to_string(" << filename << ") failed: " << reason << endl; exit(1); } } else { char buf[1024]; int nread; while ((nread = read(0, buf, 1024)) > 0) { odata.append(buf, nread); } } string& data = odata; string ndata; if ((op_flags & OPT_C)) { if (!transcode(odata, ndata, charset, "UTF-8")) { cerr << "Failed: transcode error" << endl; exit(1); } else { data = ndata; } } if (op_flags & OPT_c) { int n = TextSplit::countWords(data, flags); cout << n << " words" << endl; } else { dosplit(data, flags, op_flags); } }