add test driver for hldata:matchGroup + some help from textsplit

2019-07-06 11:39:09 +02:00 · 2019-07-06 11:39:09 +02:00 · 5b6436ca08
commit 5b6436ca08
parent c588fddb83
3 changed files with 281 additions and 73 deletions
--- a/src/testmains/Makefile.am
+++ b/src/testmains/Makefile.am
@ -37,7 +37,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
    -D_GNU_SOURCE \
    $(DEFS)

-noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig
+noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata

 textsplit_SOURCES = trtextsplit.cpp
 textsplit_LDADD = ../librecoll.la
@ -51,3 +51,6 @@ fstreewalk_LDADD = ../librecoll.la
 rclconfig_SOURCES = trrclconfig.cpp
 rclconfig_LDADD = ../librecoll.la

+hldata_SOURCES = trhldata.cpp
+hldata_LDADD = ../librecoll.la
+
--- a/src/testmains/trhldata.cpp
+++ b/src/testmains/trhldata.cpp
@ -0,0 +1,145 @@
+/* Copyright (C) 2019 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+#include <iostream>
+#include <vector>
+
+#include "log.h"
+#include "hldata.h"
+#include "smallut.h"
+
+using namespace std;
+
+
+const char *thisprog; // Program name; main() sets it from argv[0].
+static char usage [] = // Help text printed by Usage().
+"hldata\n"
+" test the near/phrase matching code used for highlighting and snippets\n"
+;
+
+void Usage() { // Print "<thisprog>:<usage>" on stderr, then exit(1); does not return.
+    fprintf(stderr, "%s:%s\n", thisprog, usage);
+    exit(1);
+}
+
+static int     op_flags; // Bitmask of OPT_* values, filled in by the option loop in main().
+#define OPT_v     0x2 // Set by -v. NOTE(review): nothing in the visible code reads it yet.
+
+vector<CharFlags> kindflags { // Table handed to valToString() by HLDataInitializer::print() to name a group's kind.
+    CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM),
+        CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR),
+        CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE),
+        };
+
+// Builds a one-group HighlightData from literals, so that test cases can be brace-initialized.
+class HLDataInitializer {
+public:
+    HLDataInitializer(vector<vector<string> > groups, int slack,
+                      HighlightData::TermGroup::TGK kind, bool res) { // res: value stored in 'expected'.
+        hldata.index_term_groups.clear();
+        hldata.index_term_groups.push_back(HighlightData::TermGroup()); // The single group lives at index 0.
+        hldata.index_term_groups[0].orgroups = groups;
+        hldata.index_term_groups[0].slack = slack;
+        hldata.index_term_groups[0].kind = kind;
+        expected = res;
+    }
+    HighlightData hldata; // Holds exactly one term group after construction.
+    bool expected; // Result that main() compares against the matchGroup() return value.
+    void print() { // Write the group's terms, slack and kind name to cout, ending the line.
+        const auto& tgp{hldata.index_term_groups[0]};
+        cout << "{";
+        for (const auto& group:tgp.orgroups) {
+            cout << "{";
+            for (const auto& term: group) {
+                cout << term << ", ";
+            }
+            cout << "}, ";
+        }
+        cout << "} slack: " << tgp.slack << " kind " <<
+            valToString(kindflags, tgp.kind) << endl; // kindflags turns the kind value into readable text.
+    }
+};
+
+
+// Data: source text (for display). main() prints an equivalent literal rather than this variable.
+string text1{"0 1 2 3 4"};
+// Term -> word positions, as produced by "textsplit -d" from the text above.
+map<string, vector<int> > plists1
+{{"0", {0,}}, {"1", {1,}}, {"2", {2,}}, {"3", {3,}}, {"4", {4,}}, };
+map<int, pair<int,int>> gpostobytes1 // Word position -> byte offsets pair (presumably start, end), also from "textsplit -d".
+{{0, {0, 1}}, {1, {2, 3}}, {2, {4, 5}}, {3, {6, 7}}, {4, {8, 9}}, };
+
+
+vector<HLDataInitializer> hldvec { // Test cases: {or-groups, slack, kind, expected matchGroup() result}.
+    {{{"0"}, {"1"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
+    {{{"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
+    {{{"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
+    {{{"0"}, {"1"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
+    {{{"1"}, {"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
+    {{{"0"}, {"1"}, {"2"}, {"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
+    {{{"0"}, {"2"}}, 1, HighlightData::TermGroup::TGK_PHRASE, true}, // slack 1: same terms as the next case, which expects false with slack 0
+    {{{"0"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
+    {{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false}, // reversed order: phrase cases expect false
+    {{{"3"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
+    {{{"4"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
+
+    {{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, true}, // reversed order: near case expects true
+    {{{"2"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, false},
+    {{{"2"}, {"0"}}, 1, HighlightData::TermGroup::TGK_NEAR, true},
+    {{{"4"}, {"0"}}, 2, HighlightData::TermGroup::TGK_NEAR, false},
+    {{{"4"}, {"0"}}, 3, HighlightData::TermGroup::TGK_NEAR, true},
+};
+
+int main(int argc, char **argv) // Run every hldvec case through matchGroup() and print only the mismatches.
+{
+    thisprog = argv[0];
+    argc--; argv++; // Skip the program name.
+
+    while (argc > 0 && **argv == '-') { // Consume leading option words such as "-v".
+        (*argv)++; // Step over the '-'.
+        if (!(**argv)) // A lone "-" is rejected.
+            Usage();
+        while (**argv) // Each remaining character is one option letter.
+            switch (*(*argv)++) {
+            case 'v':   op_flags |= OPT_v; break;
+            default: Usage();   break;
+            }
+        argc--;argv++;
+    }
+
+    cout << "text, bpos:\n";
+    cout << "0123456789\n"; // Ruler: column index of each character of the text line below.
+    cout << "0 1 2 3 4\n";
+    for (auto& hld : hldvec) {
+        vector<GroupMatchEntry> tboffs; // Passed to matchGroup(); its entries' offs pairs are printed below.
+        bool ret = matchGroup(hld.hldata, 0, plists1, gpostobytes1, tboffs); // 0: presumably the term group index -- confirm against hldata.h.
+        if (ret && !hld.expected) { // Unexpected match: show the case and the reported offsets.
+            cout << "matchGroup: ok, expected false: ";
+            hld.print();
+            for (const auto& ent: tboffs) {
+                cout << "{" << ent.offs.first << ", " << ent.offs.second << "} ";
+            }
+            cout << "\n";
+        } else if (!ret && hld.expected) { // Unexpected miss: show the case only.
+            cout << "matchGroup: failed, expected true:\n";
+            hld.print();
+        }
+    }
+} // NOTE(review): mismatches are only printed; nothing here turns them into a non-zero exit status.
--- a/src/testmains/trtextsplit.cpp
+++ b/src/testmains/trtextsplit.cpp
@ -1,3 +1,22 @@
+/* Copyright (C) 2017-2019 J.F.Dockes
+ *
+ * License: LGPL 2.1
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
 #include "autoconfig.h"

 #include "textsplit.h"
@ -22,26 +41,6 @@

 using namespace std;

-class myTermProc : public Rcl::TermProc {
-    int first;
-    bool nooutput;
-public:
-    myTermProc() : TermProc(0), first(1), nooutput(false) {}
-    void setNoOut(bool val) {nooutput = val;}
-    virtual bool takeword(const string &term, int pos, int bs, int be)
-    {
-        if (nooutput)
-            return true;
-        FILE *fp = stdout;
-        if (first) {
-            fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
-            first = 0;
-        }
-        fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
-        return true;
-    }
-};
-
 #define OPT_s     0x1 
 #define OPT_w     0x2
 #define OPT_q     0x4
@ -52,6 +51,82 @@ public:
 #define OPT_S     0x80
 #define OPT_u     0x100
 #define OPT_p     0x200
+#define OPT_I     0x400 // -I: use the internal test strings instead of a file or stdin.
+#define OPT_d     0x800 // -d: print the position and byte lists (main() also sets OPT_q with it).
+
+static string thisprog; // Program name printed by Usage().
+
+static string usage = // Option summary printed by Usage().
+    " textsplit [opts] [filename]\n"
+    "   -I : use internal data. Else read filename or stdin if no param.\n"
+    "   -q : no output\n"
+    "   -d : print position and byte lists for input to hldata\n"
+    "   -s :  only spans\n"
+    "   -w :  only words\n"
+    "   -n :  no numbers\n"
+    "   -k :  preserve wildcards (?*)\n"
+    "   -c : just count words\n"
+    "   -u : use unac\n"
+    "   -C [charset] : input charset\n"
+    "   -S [stopfile] : stopfile to use for commongrams\n"
+    "    if filename is 'stdin', will read stdin for data (end with ^D)\n\n" // NOTE(review): this diff removes the strcmp(filename, "stdin") branch from main(); confirm this line is still accurate.
+    "   -p somephrase : display results from stringToStrings()\n"
+    "  \n"
+    ;
+
+static void
+Usage(void) // Write "<thisprog>: usage:" and the usage text to cerr, then exit(1).
+{
+    cerr << thisprog  << ": usage:\n" << usage;
+    exit(1);
+}
+
+static int        op_flags; // Bitmask of the OPT_* values, set while main() parses options.
+
+
+class myTermProc : public Rcl::TermProc { // Records every term passed to takeword() and optionally prints it.
+    int first; // Nonzero until the column header line has been printed.
+    bool nooutput; // When true, takeword() records the term but prints nothing.
+public:
+    myTermProc() : TermProc(0), first(1), nooutput(false) {}
+    void setNoOut(bool val) {nooutput = val;}
+    virtual bool takeword(const string &term, int pos, int bs, int be) { // Always returns true.
+        m_plists[term].push_back(pos); // Recording happens even when output is disabled.
+        m_gpostobytes[pos] = pair<int,int>(bs, be); // A later term at the same pos replaces the entry.
+        if (nooutput)
+            return true;
+        FILE *fp = stdout;
+        if (first) {
+            fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
+            first = 0;
+        }
+        fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
+        return true;
+    }
+
+    void printpos() { // Dump both maps to cout as brace-initializer text, in the shape of plists1/gpostobytes1 in trhldata.cpp.
+        cout << "{";
+        for (const auto& lst : m_plists) {
+            cout << "{\"" << lst.first << "\", {";
+            for (int pos : lst.second) {
+                cout << pos << ",";
+            }
+            cout << "}}, ";
+        }
+        cout << "};\n";
+        cout << "{";
+        for (const auto& ent : m_gpostobytes) {
+            cout << "{" << ent.first << ", {";
+            cout << ent.second.first << ", " << ent.second.second << "}}, ";
+        }
+        cout << "};\n";
+    }
+private:
+    // Term -> word positions, in the order takeword() received them.
+    map<string, vector<int> > m_plists;
+    map<int, pair<int, int> > m_gpostobytes; // Word position -> (bs, be) as passed to takeword().
+};
+

 bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
 {
@ -73,6 +148,9 @@ bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
        printproc.setNoOut(true);

    splitter.text_to_words(data);
+    if (op_flags & OPT_d) {
+        printproc.printpos();
+    }

 #ifdef TEXTSPLIT_STATS
        TextSplit::Stats::Values v = splitter.getStats();
@ -115,33 +193,6 @@ const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);

 static string teststring1 = " nouvel-an ";

-static string thisprog;
-
-static string usage =
-    " textsplit [opts] [filename]\n"
-    "   -q : no output\n"
-    "   -s :  only spans\n"
-    "   -w :  only words\n"
-    "   -n :  no numbers\n"
-    "   -k :  preserve wildcards (?*)\n"
-    "   -c : just count words\n"
-    "   -u : use unac\n"
-    "   -C [charset] : input charset\n"
-    "   -S [stopfile] : stopfile to use for commongrams\n"
-    " if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
-    " textplit -p somephrase : display results from stringToStrings()\n"
-    "  \n"
-    ;
-
-static void
-Usage(void)
-{
-    cerr << thisprog  << ": usage:\n" << usage;
-    exit(1);
-}
-
-static int        op_flags;
-
 int main(int argc, char **argv)
 {
    string charset, stopfile;
@ -160,6 +211,8 @@ int main(int argc, char **argv)
            case 'C':   op_flags |= OPT_C; if (argc < 2)  Usage();
                charset = *(++argv); argc--; 
                goto b1;
+            case 'd':   op_flags |= OPT_d|OPT_q; break;
+            case 'I':   op_flags |= OPT_I; break;
            case 'k':   op_flags |= OPT_k; break;
            case 'n':   op_flags |= OPT_n; break;
            case 'p':   op_flags |= OPT_p; break;
@ -205,31 +258,10 @@ int main(int argc, char **argv)
            exit(1);
        }
    }
-    string odata, reason;
-    if (argc == 1) {
-        const char *filename = *argv++; argc--;
-        if (op_flags& OPT_p) {
-            vector<string> tokens;
-            TextSplit::stringToStrings(filename, tokens);
-            for (vector<string>::const_iterator it = tokens.begin();
-                 it != tokens.end(); it++) {
-                cout << "[" << *it << "] ";
-            }
-            cout << endl;
-            exit(0);
-        }
-        if (!strcmp(filename, "stdin")) {
-            char buf[1024];
-            int nread;
-            while ((nread = read(0, buf, 1024)) > 0) {
-                odata.append(buf, nread);
-            }
-        } else if (!file_to_string(filename, odata, &reason)) {
-            cerr << "Failed: file_to_string(" << filename << ") failed: " 
-                 << reason << endl;
-            exit(1);
-        }
-    } else {
+
+    if (op_flags & OPT_I) {
+        if (argc)
+            Usage();
        if (op_flags & OPT_p)
            Usage();
        for (int i = 0; i < teststrings_cnt; i++) {
@ -237,6 +269,34 @@ int main(int argc, char **argv)
            dosplit(teststrings[i], flags, op_flags);
        }
        exit(0);
+    } else if (op_flags& OPT_p) {
+        if (!argc)
+            Usage();
+        vector<string> tokens;
+        TextSplit::stringToStrings(argv[0], tokens);
+        for (vector<string>::const_iterator it = tokens.begin();
+             it != tokens.end(); it++) {
+            cout << "[" << *it << "] ";
+        }
+        cout << endl;
+        exit(0);
+    }
+
+
+    string odata, reason;
+    if (argc == 1) {
+        const char *filename = *argv++; argc--;
+        if (!file_to_string(filename, odata, &reason)) {
+            cerr << "Failed: file_to_string(" << filename << ") failed: " 
+                 << reason << endl;
+            exit(1);
+        }
+    } else {
+        char buf[1024];
+        int nread;
+        while ((nread = read(0, buf, 1024)) > 0) {
+            odata.append(buf, nread);
+        }
    }

    string& data = odata;