diff --git a/src/testmains/Makefile.am b/src/testmains/Makefile.am index 5efc8f1b..1c91a23c 100644 --- a/src/testmains/Makefile.am +++ b/src/testmains/Makefile.am @@ -37,7 +37,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \ -D_GNU_SOURCE \ $(DEFS) -noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig +noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata textsplit_SOURCES = trtextsplit.cpp textsplit_LDADD = ../librecoll.la @@ -51,3 +51,6 @@ fstreewalk_LDADD = ../librecoll.la rclconfig_SOURCES = trrclconfig.cpp rclconfig_LDADD = ../librecoll.la +hldata_SOURCES = trhldata.cpp +hldata_LDADD = ../librecoll.la + diff --git a/src/testmains/trhldata.cpp b/src/testmains/trhldata.cpp new file mode 100644 index 00000000..bd37cca4 --- /dev/null +++ b/src/testmains/trhldata.cpp @@ -0,0 +1,145 @@ +/* Copyright (C) 2019 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include +#include + +#include +#include +#include + +#include "log.h" +#include "hldata.h" +#include "smallut.h" + +using namespace std; + + +const char *thisprog; +static char usage [] = +"hldata\n" +" test the near/phrase matching code used for highlighting and snippets\n" +; + +void Usage() { + fprintf(stderr, "%s:%s\n", thisprog, usage); + exit(1); +} + +static int op_flags; +#define OPT_v 0x2 + +vector kindflags { + CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM), + CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR), + CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE), + }; + +// Provides a constructor for HighlightData, for easy static init. +class HLDataInitializer { +public: + HLDataInitializer(vector > groups, int slack, + HighlightData::TermGroup::TGK kind, bool res) { + hldata.index_term_groups.clear(); + hldata.index_term_groups.push_back(HighlightData::TermGroup()); + hldata.index_term_groups[0].orgroups = groups; + hldata.index_term_groups[0].slack = slack; + hldata.index_term_groups[0].kind = kind; + expected = res; + } + HighlightData hldata; + bool expected; + void print() { + const auto& tgp{hldata.index_term_groups[0]}; + cout << "{"; + for (const auto& group:tgp.orgroups) { + cout << "{"; + for (const auto& term: group) { + cout << term << ", "; + } + cout << "}, "; + } + cout << "} slack: " << tgp.slack << " kind " << + valToString(kindflags, tgp.kind) << endl; + } +}; + + +// Data: source text (for display), +string text1{"0 1 2 3 4"}; +// Positions produced by textsplit -d from the above +map > plists1 +{{"0", {0,}}, {"1", {1,}}, {"2", {2,}}, {"3", {3,}}, {"4", {4,}}, }; +map> gpostobytes1 +{{0, {0, 1}}, {1, {2, 3}}, {2, {4, 5}}, {3, {6, 7}}, {4, {8, 9}}, }; + + +vector hldvec { + {{{"0"}, {"1"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true}, + {{{"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true}, + {{{"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true}, + {{{"0"}, {"1"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true}, + {{{"1"}, {"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true}, + {{{"0"}, {"1"}, {"2"}, {"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true}, + {{{"0"}, {"2"}}, 1, HighlightData::TermGroup::TGK_PHRASE, true}, // slack 1 + {{{"0"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false}, + {{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false}, + {{{"3"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false}, + {{{"4"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false}, + + {{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, true}, + {{{"2"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, false}, + {{{"2"}, {"0"}}, 1, HighlightData::TermGroup::TGK_NEAR, true}, + {{{"4"}, {"0"}}, 2, HighlightData::TermGroup::TGK_NEAR, false}, + {{{"4"}, {"0"}}, 3, HighlightData::TermGroup::TGK_NEAR, true}, +}; + +int main(int argc, char **argv) +{ + thisprog = argv[0]; + argc--; argv++; + + while (argc > 0 && **argv == '-') { + (*argv)++; + if (!(**argv)) + Usage(); + while (**argv) + switch (*(*argv)++) { + case 'v': op_flags |= OPT_v; break; + default: Usage(); break; + } + argc--;argv++; + } + + cout << "text, bpos:\n"; + cout << "0123456789\n"; + cout << "0 1 2 3 4\n"; + for (auto& hld : hldvec) { + vector tboffs; + bool ret = matchGroup(hld.hldata, 0, plists1, gpostobytes1, tboffs); + if (ret && !hld.expected) { + cout << "matchGroup: ok, expected false: "; + hld.print(); + for (const auto& ent: tboffs) { + cout << "{" << ent.offs.first << ", " << ent.offs.second << "} "; + } + cout << "\n"; + } else if (!ret && hld.expected) { + cout << "matchGroup: failed, expected true:\n"; + hld.print(); + } + } +} diff --git a/src/testmains/trtextsplit.cpp b/src/testmains/trtextsplit.cpp index a7cc3589..46aef1a9 100644 --- a/src/testmains/trtextsplit.cpp +++ b/src/testmains/trtextsplit.cpp @@ -1,3 +1,22 @@ +/* Copyright (C) 2017-2019 J.F.Dockes + * + * License: LGPL 2.1 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ #include "autoconfig.h" #include "textsplit.h" @@ -22,26 +41,6 @@ using namespace std; -class myTermProc : public Rcl::TermProc { - int first; - bool nooutput; -public: - myTermProc() : TermProc(0), first(1), nooutput(false) {} - void setNoOut(bool val) {nooutput = val;} - virtual bool takeword(const string &term, int pos, int bs, int be) - { - if (nooutput) - return true; - FILE *fp = stdout; - if (first) { - fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); - first = 0; - } - fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); - return true; - } -}; - #define OPT_s 0x1 #define OPT_w 0x2 #define OPT_q 0x4 @@ -52,6 +51,82 @@ public: #define OPT_S 0x80 #define OPT_u 0x100 #define OPT_p 0x200 +#define OPT_I 0x400 +#define OPT_d 0x800 + +static string thisprog; + +static string usage = + " textsplit [opts] [filename]\n" + " -I : use internal data. Else read filename or stdin if no param.\n" + " -q : no output\n" + " -d : print position and byte lists for input to hldata\n" + " -s : only spans\n" + " -w : only words\n" + " -n : no numbers\n" + " -k : preserve wildcards (?*)\n" + " -c : just count words\n" + " -u : use unac\n" + " -C [charset] : input charset\n" + " -S [stopfile] : stopfile to use for commongrams\n" + " if filename is 'stdin', will read stdin for data (end with ^D)\n\n" + " -p somephrase : display results from stringToStrings()\n" + " \n" + ; + +static void +Usage(void) +{ + cerr << thisprog << ": usage:\n" << usage; + exit(1); +} + +static int op_flags; + + +class myTermProc : public Rcl::TermProc { + int first; + bool nooutput; +public: + myTermProc() : TermProc(0), first(1), nooutput(false) {} + void setNoOut(bool val) {nooutput = val;} + virtual bool takeword(const string &term, int pos, int bs, int be) { + m_plists[term].push_back(pos); + m_gpostobytes[pos] = pair(bs, be); + if (nooutput) + return true; + FILE *fp = stdout; + if (first) { + fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); + first = 0; + } + fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); + return true; + } + + void printpos() { + cout << "{"; + for (const auto& lst : m_plists) { + cout << "{\"" << lst.first << "\", {"; + for (int pos : lst.second) { + cout << pos << ","; + } + cout << "}}, "; + } + cout << "};\n"; + cout << "{"; + for (const auto& ent : m_gpostobytes) { + cout << "{" << ent.first << ", {"; + cout << ent.second.first << ", " << ent.second.second << "}}, "; + } + cout << "};\n"; + } +private: + // group/near terms word positions. + map > m_plists; + map > m_gpostobytes; +}; + bool dosplit(const string& data, TextSplit::Flags flags, int op_flags) { @@ -73,6 +148,9 @@ bool dosplit(const string& data, TextSplit::Flags flags, int op_flags) printproc.setNoOut(true); splitter.text_to_words(data); + if (op_flags & OPT_d) { + printproc.printpos(); + } #ifdef TEXTSPLIT_STATS TextSplit::Stats::Values v = splitter.getStats(); @@ -115,33 +193,6 @@ const int teststrings_cnt = sizeof(teststrings)/sizeof(char *); static string teststring1 = " nouvel-an "; -static string thisprog; - -static string usage = - " textsplit [opts] [filename]\n" - " -q : no output\n" - " -s : only spans\n" - " -w : only words\n" - " -n : no numbers\n" - " -k : preserve wildcards (?*)\n" - " -c : just count words\n" - " -u : use unac\n" - " -C [charset] : input charset\n" - " -S [stopfile] : stopfile to use for commongrams\n" - " if filename is 'stdin', will read stdin for data (end with ^D)\n\n" - " textplit -p somephrase : display results from stringToStrings()\n" - " \n" - ; - -static void -Usage(void) -{ - cerr << thisprog << ": usage:\n" << usage; - exit(1); -} - -static int op_flags; - int main(int argc, char **argv) { string charset, stopfile; @@ -160,6 +211,8 @@ int main(int argc, char **argv) case 'C': op_flags |= OPT_C; if (argc < 2) Usage(); charset = *(++argv); argc--; goto b1; + case 'd': op_flags |= OPT_d|OPT_q; break; + case 'I': op_flags |= OPT_I; break; case 'k': op_flags |= OPT_k; break; case 'n': op_flags |= OPT_n; break; case 'p': op_flags |= OPT_p; break; @@ -205,31 +258,10 @@ int main(int argc, char **argv) exit(1); } } - string odata, reason; - if (argc == 1) { - const char *filename = *argv++; argc--; - if (op_flags& OPT_p) { - vector tokens; - TextSplit::stringToStrings(filename, tokens); - for (vector::const_iterator it = tokens.begin(); - it != tokens.end(); it++) { - cout << "[" << *it << "] "; - } - cout << endl; - exit(0); - } - if (!strcmp(filename, "stdin")) { - char buf[1024]; - int nread; - while ((nread = read(0, buf, 1024)) > 0) { - odata.append(buf, nread); - } - } else if (!file_to_string(filename, odata, &reason)) { - cerr << "Failed: file_to_string(" << filename << ") failed: " - << reason << endl; - exit(1); - } - } else { + + if (op_flags & OPT_I) { + if (argc) + Usage(); if (op_flags & OPT_p) Usage(); for (int i = 0; i < teststrings_cnt; i++) { @@ -237,6 +269,34 @@ int main(int argc, char **argv) dosplit(teststrings[i], flags, op_flags); } exit(0); + } else if (op_flags& OPT_p) { + if (!argc) + Usage(); + vector tokens; + TextSplit::stringToStrings(argv[0], tokens); + for (vector::const_iterator it = tokens.begin(); + it != tokens.end(); it++) { + cout << "[" << *it << "] "; + } + cout << endl; + exit(0); + } + + + string odata, reason; + if (argc == 1) { + const char *filename = *argv++; argc--; + if (!file_to_string(filename, odata, &reason)) { + cerr << "Failed: file_to_string(" << filename << ") failed: " + << reason << endl; + exit(1); + } + } else { + char buf[1024]; + int nread; + while ((nread = read(0, buf, 1024)) > 0) { + odata.append(buf, nread); + } } string& data = odata;