add test driver for hldata:matchGroup + some help from textsplit

This commit is contained in:
Jean-Francois Dockes 2019-07-06 11:39:09 +02:00
parent c588fddb83
commit 5b6436ca08
3 changed files with 281 additions and 73 deletions

View File

@ -37,7 +37,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
-D_GNU_SOURCE \
$(DEFS)
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata
textsplit_SOURCES = trtextsplit.cpp
textsplit_LDADD = ../librecoll.la
@ -51,3 +51,6 @@ fstreewalk_LDADD = ../librecoll.la
rclconfig_SOURCES = trrclconfig.cpp
rclconfig_LDADD = ../librecoll.la
hldata_SOURCES = trhldata.cpp
hldata_LDADD = ../librecoll.la

145
src/testmains/trhldata.cpp Normal file
View File

@ -0,0 +1,145 @@
/* Copyright (C) 2019 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <vector>
#include "log.h"
#include "hldata.h"
#include "smallut.h"
using namespace std;
// Program name as invoked; assigned from argv[0] at the start of main().
const char *thisprog;
// Help text printed by Usage().
static char usage [] =
"hldata\n"
" test the near/phrase matching code used for highlighting and snippets\n"
;
// Print "<program name>:<usage text>" on stderr, then terminate the process
// with exit status 1. Does not return to the caller.
void Usage() {
fprintf(stderr, "%s:%s\n", thisprog, usage);
exit(1);
}
// Bitmask of the command-line option flags collected by main().
static int op_flags;
// Bit set in op_flags by the -v option.
#define OPT_v 0x2
// Table with one CHARFLAGENTRY per HighlightData::TermGroup kind value used
// by the tests. It is passed to valToString() in HLDataInitializer::print()
// to display the kind of a term group.
// NOTE(review): presumably CHARFLAGENTRY pairs each value with its name as
// text -- confirm against the macro definition, which is not in this file.
vector<CharFlags> kindflags {
CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM),
CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR),
CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE),
};
// Provides a constructor for HighlightData, for easy static init.
class HLDataInitializer {
public:
HLDataInitializer(vector<vector<string> > groups, int slack,
HighlightData::TermGroup::TGK kind, bool res) {
hldata.index_term_groups.clear();
hldata.index_term_groups.push_back(HighlightData::TermGroup());
hldata.index_term_groups[0].orgroups = groups;
hldata.index_term_groups[0].slack = slack;
hldata.index_term_groups[0].kind = kind;
expected = res;
}
HighlightData hldata;
bool expected;
void print() {
const auto& tgp{hldata.index_term_groups[0]};
cout << "{";
for (const auto& group:tgp.orgroups) {
cout << "{";
for (const auto& term: group) {
cout << term << ", ";
}
cout << "}, ";
}
cout << "} slack: " << tgp.slack << " kind " <<
valToString(kindflags, tgp.kind) << endl;
}
};
// Data: source text (for display).
// NOTE(review): text1 is not referenced by the code visible in this file;
// main() prints the same text as a literal.
string text1{"0 1 2 3 4"};
// Positions produced by textsplit -d from the above: for each term, the
// list of word positions at which it occurs.
map<string, vector<int> > plists1
{{"0", {0,}}, {"1", {1,}}, {"2", {2,}}, {"3", {3,}}, {"4", {4,}}, };
// Second table produced by textsplit -d: for each word position, a pair of
// values. Presumably these are the start and end byte offsets of the word in
// the text (they line up with the "0123456789" ruler printed by main()) --
// confirm against the matchGroup() documentation.
map<int, pair<int,int>> gpostobytes1
{{0, {0, 1}}, {1, {2, 3}}, {2, {4, 5}}, {3, {6, 7}}, {4, {8, 9}}, };
// Test cases, run in order by main(). Each entry supplies the
// HLDataInitializer constructor arguments: the term or-groups, the slack,
// the group kind, and the result that matchGroup() is expected to return.
vector<HLDataInitializer> hldvec {
// TGK_PHRASE cases expected to match.
{{{"0"}, {"1"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
{{{"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
{{{"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
{{{"0"}, {"1"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
{{{"1"}, {"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
{{{"0"}, {"1"}, {"2"}, {"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
{{{"0"}, {"2"}}, 1, HighlightData::TermGroup::TGK_PHRASE, true}, // slack 1
// TGK_PHRASE cases expected not to match.
{{{"0"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
{{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
{{{"3"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
{{{"4"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
// TGK_NEAR cases; the expected result changes with the slack value.
{{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, true},
{{{"2"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, false},
{{{"2"}, {"0"}}, 1, HighlightData::TermGroup::TGK_NEAR, true},
{{{"4"}, {"0"}}, 2, HighlightData::TermGroup::TGK_NEAR, false},
{{{"4"}, {"0"}}, 3, HighlightData::TermGroup::TGK_NEAR, true},
};
// Test driver entry point.
//
// Parses the command-line options, prints the reference text under a byte
// position ruler, then runs matchGroup() for every case in hldvec against the
// static position tables (plists1, gpostobytes1). Each case whose result
// differs from its expected value is reported on stdout.
//
// Options:
//   -v : sets OPT_v in op_flags (the flag is not consulted in this function).
//
// Returns 0 when every case produced its expected result and 1 when at least
// one did not, so that scripts or a test harness can detect failures from the
// exit status instead of having to parse the output.
int main(int argc, char **argv)
{
    thisprog = argv[0];
    argc--; argv++;

    // Each argument starting with '-' may bundle several one-letter flags.
    while (argc > 0 && **argv == '-') {
        (*argv)++;
        // A lone "-" is not a valid option.
        if (!(**argv))
            Usage();
        while (**argv)
            switch (*(*argv)++) {
            case 'v': op_flags |= OPT_v; break;
            default: Usage(); break;
            }
        argc--; argv++;
    }

    cout << "text, bpos:\n";
    cout << "0123456789\n";
    cout << "0 1 2 3 4\n";

    int mismatches = 0;
    for (auto& hld : hldvec) {
        vector<GroupMatchEntry> tboffs;
        const bool ret =
            matchGroup(hld.hldata, 0, plists1, gpostobytes1, tboffs);
        if (ret == hld.expected)
            continue;

        ++mismatches;
        if (ret) {
            // Unexpected match: also show the offsets which were returned.
            cout << "matchGroup: ok, expected false: ";
            hld.print();
            for (const auto& ent: tboffs) {
                cout << "{" << ent.offs.first << ", " << ent.offs.second << "} ";
            }
            cout << "\n";
        } else {
            cout << "matchGroup: failed, expected true:\n";
            hld.print();
        }
    }
    return mismatches == 0 ? 0 : 1;
}

View File

@ -1,3 +1,22 @@
/* Copyright (C) 2017-2019 J.F.Dockes
*
* License: LGPL 2.1
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "autoconfig.h"
#include "textsplit.h"
@ -22,26 +41,6 @@
using namespace std;
class myTermProc : public Rcl::TermProc {
int first;
bool nooutput;
public:
myTermProc() : TermProc(0), first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
virtual bool takeword(const string &term, int pos, int bs, int be)
{
if (nooutput)
return true;
FILE *fp = stdout;
if (first) {
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
first = 0;
}
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
return true;
}
};
#define OPT_s 0x1
#define OPT_w 0x2
#define OPT_q 0x4
@ -52,6 +51,82 @@ public:
#define OPT_S 0x80
#define OPT_u 0x100
#define OPT_p 0x200
#define OPT_I 0x400
#define OPT_d 0x800
// Program name, printed as a prefix of the usage message.
static string thisprog;

// Help text printed by Usage(). The input selection described here matches
// what main() does: -I uses the internal test strings, a filename parameter
// is read as a file, and with no parameter the data is read from stdin.
static string usage =
    " textsplit [opts] [filename]\n"
    " -I : use internal data. Else read filename or stdin if no param.\n"
    " -q : no output\n"
    " -d : print position and byte lists for input to hldata\n"
    " -s : only spans\n"
    " -w : only words\n"
    " -n : no numbers\n"
    " -k : preserve wildcards (?*)\n"
    " -c : just count words\n"
    " -u : use unac\n"
    " -C [charset] : input charset\n"
    " -S [stopfile] : stopfile to use for commongrams\n"
    " if no filename is given, will read stdin for data (end with ^D)\n\n"
    " -p somephrase : display results from stringToStrings()\n"
    " \n"
    ;

// Print the usage message on stderr and terminate the process with exit
// status 1. Does not return.
static void
Usage(void)
{
    cerr << thisprog << ": usage:\n" << usage;
    exit(1);
}
static int op_flags;
class myTermProc : public Rcl::TermProc {
int first;
bool nooutput;
public:
myTermProc() : TermProc(0), first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
virtual bool takeword(const string &term, int pos, int bs, int be) {
m_plists[term].push_back(pos);
m_gpostobytes[pos] = pair<int,int>(bs, be);
if (nooutput)
return true;
FILE *fp = stdout;
if (first) {
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
first = 0;
}
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
return true;
}
void printpos() {
cout << "{";
for (const auto& lst : m_plists) {
cout << "{\"" << lst.first << "\", {";
for (int pos : lst.second) {
cout << pos << ",";
}
cout << "}}, ";
}
cout << "};\n";
cout << "{";
for (const auto& ent : m_gpostobytes) {
cout << "{" << ent.first << ", {";
cout << ent.second.first << ", " << ent.second.second << "}}, ";
}
cout << "};\n";
}
private:
// group/near terms word positions.
map<string, vector<int> > m_plists;
map<int, pair<int, int> > m_gpostobytes;
};
bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
{
@ -73,6 +148,9 @@ bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
printproc.setNoOut(true);
splitter.text_to_words(data);
if (op_flags & OPT_d) {
printproc.printpos();
}
#ifdef TEXTSPLIT_STATS
TextSplit::Stats::Values v = splitter.getStats();
@ -115,33 +193,6 @@ const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
static string teststring1 = " nouvel-an ";
// Program name, printed as a prefix of the usage message.
static string thisprog;

// Help text printed by Usage().
static string usage =
    " textsplit [opts] [filename]\n"
    " -q : no output\n"
    " -s : only spans\n"
    " -w : only words\n"
    " -n : no numbers\n"
    " -k : preserve wildcards (?*)\n"
    " -c : just count words\n"
    " -u : use unac\n"
    " -C [charset] : input charset\n"
    " -S [stopfile] : stopfile to use for commongrams\n"
    " if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
    " textsplit -p somephrase : display results from stringToStrings()\n"
    " \n"
    ;

// Print the usage message on stderr and terminate the process with exit
// status 1. Does not return.
static void
Usage(void)
{
    cerr << thisprog << ": usage:\n" << usage;
    exit(1);
}
static int op_flags;
int main(int argc, char **argv)
{
string charset, stopfile;
@ -160,6 +211,8 @@ int main(int argc, char **argv)
case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
charset = *(++argv); argc--;
goto b1;
case 'd': op_flags |= OPT_d|OPT_q; break;
case 'I': op_flags |= OPT_I; break;
case 'k': op_flags |= OPT_k; break;
case 'n': op_flags |= OPT_n; break;
case 'p': op_flags |= OPT_p; break;
@ -205,31 +258,10 @@ int main(int argc, char **argv)
exit(1);
}
}
string odata, reason;
if (argc == 1) {
const char *filename = *argv++; argc--;
if (op_flags& OPT_p) {
vector<string> tokens;
TextSplit::stringToStrings(filename, tokens);
for (vector<string>::const_iterator it = tokens.begin();
it != tokens.end(); it++) {
cout << "[" << *it << "] ";
}
cout << endl;
exit(0);
}
if (!strcmp(filename, "stdin")) {
char buf[1024];
int nread;
while ((nread = read(0, buf, 1024)) > 0) {
odata.append(buf, nread);
}
} else if (!file_to_string(filename, odata, &reason)) {
cerr << "Failed: file_to_string(" << filename << ") failed: "
<< reason << endl;
exit(1);
}
} else {
if (op_flags & OPT_I) {
if (argc)
Usage();
if (op_flags & OPT_p)
Usage();
for (int i = 0; i < teststrings_cnt; i++) {
@ -237,6 +269,34 @@ int main(int argc, char **argv)
dosplit(teststrings[i], flags, op_flags);
}
exit(0);
} else if (op_flags& OPT_p) {
if (!argc)
Usage();
vector<string> tokens;
TextSplit::stringToStrings(argv[0], tokens);
for (vector<string>::const_iterator it = tokens.begin();
it != tokens.end(); it++) {
cout << "[" << *it << "] ";
}
cout << endl;
exit(0);
}
string odata, reason;
if (argc == 1) {
const char *filename = *argv++; argc--;
if (!file_to_string(filename, odata, &reason)) {
cerr << "Failed: file_to_string(" << filename << ") failed: "
<< reason << endl;
exit(1);
}
} else {
char buf[1024];
int nread;
while ((nread = read(0, buf, 1024)) > 0) {
odata.append(buf, nread);
}
}
string& data = odata;