add test driver for hldata:matchGroup + some help from textsplit
This commit is contained in:
parent
c588fddb83
commit
5b6436ca08
@ -37,7 +37,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
|
|||||||
-D_GNU_SOURCE \
|
-D_GNU_SOURCE \
|
||||||
$(DEFS)
|
$(DEFS)
|
||||||
|
|
||||||
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig
|
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata
|
||||||
|
|
||||||
textsplit_SOURCES = trtextsplit.cpp
|
textsplit_SOURCES = trtextsplit.cpp
|
||||||
textsplit_LDADD = ../librecoll.la
|
textsplit_LDADD = ../librecoll.la
|
||||||
@ -51,3 +51,6 @@ fstreewalk_LDADD = ../librecoll.la
|
|||||||
rclconfig_SOURCES = trrclconfig.cpp
|
rclconfig_SOURCES = trrclconfig.cpp
|
||||||
rclconfig_LDADD = ../librecoll.la
|
rclconfig_LDADD = ../librecoll.la
|
||||||
|
|
||||||
|
hldata_SOURCES = trhldata.cpp
|
||||||
|
hldata_LDADD = ../librecoll.la
|
||||||
|
|
||||||
|
|||||||
145
src/testmains/trhldata.cpp
Normal file
145
src/testmains/trhldata.cpp
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
/* Copyright (C) 2019 J.F.Dockes
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the
|
||||||
|
* Free Software Foundation, Inc.,
|
||||||
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*/
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "log.h"
|
||||||
|
#include "hldata.h"
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
|
||||||
|
// Name under which the program was started; main() sets it from argv[0].
const char *thisprog;

// Help text shown by Usage().
static char usage [] =
    "hldata\n"
    " test the near/phrase matching code used for highlighting and snippets\n"
    ;

// Write "<program name>:<help text>" to stderr and terminate with status 1.
void Usage()
{
    fprintf(stderr, "%s:%s\n", thisprog, usage);
    exit(1);
}
|
||||||
|
|
||||||
|
static int op_flags;
|
||||||
|
#define OPT_v 0x2
|
||||||
|
|
||||||
|
vector<CharFlags> kindflags {
|
||||||
|
CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM),
|
||||||
|
CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR),
|
||||||
|
CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Provides a constructor for HighlightData, for easy static init.
|
||||||
|
class HLDataInitializer {
|
||||||
|
public:
|
||||||
|
HLDataInitializer(vector<vector<string> > groups, int slack,
|
||||||
|
HighlightData::TermGroup::TGK kind, bool res) {
|
||||||
|
hldata.index_term_groups.clear();
|
||||||
|
hldata.index_term_groups.push_back(HighlightData::TermGroup());
|
||||||
|
hldata.index_term_groups[0].orgroups = groups;
|
||||||
|
hldata.index_term_groups[0].slack = slack;
|
||||||
|
hldata.index_term_groups[0].kind = kind;
|
||||||
|
expected = res;
|
||||||
|
}
|
||||||
|
HighlightData hldata;
|
||||||
|
bool expected;
|
||||||
|
void print() {
|
||||||
|
const auto& tgp{hldata.index_term_groups[0]};
|
||||||
|
cout << "{";
|
||||||
|
for (const auto& group:tgp.orgroups) {
|
||||||
|
cout << "{";
|
||||||
|
for (const auto& term: group) {
|
||||||
|
cout << term << ", ";
|
||||||
|
}
|
||||||
|
cout << "}, ";
|
||||||
|
}
|
||||||
|
cout << "} slack: " << tgp.slack << " kind " <<
|
||||||
|
valToString(kindflags, tgp.kind) << endl;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Test data. text1 is the source text the two tables below were derived from
// (kept for display purposes).
std::string text1 = "0 1 2 3 4";

// Term -> list of word positions, as printed by "textsplit -d" for text1.
std::map<std::string, std::vector<int> > plists1 = {
    {"0", {0}},
    {"1", {1}},
    {"2", {2}},
    {"3", {3}},
    {"4", {4}},
};

// Word position -> pair of byte offsets for the term at that position, also
// from the "textsplit -d" output.
std::map<int, std::pair<int, int> > gpostobytes1 = {
    {0, {0, 1}},
    {1, {2, 3}},
    {2, {4, 5}},
    {3, {6, 7}},
    {4, {8, 9}},
};
|
||||||
|
|
||||||
|
|
||||||
|
vector<HLDataInitializer> hldvec {
|
||||||
|
{{{"0"}, {"1"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||||
|
{{{"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||||
|
{{{"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||||
|
{{{"0"}, {"1"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||||
|
{{{"1"}, {"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||||
|
{{{"0"}, {"1"}, {"2"}, {"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||||
|
{{{"0"}, {"2"}}, 1, HighlightData::TermGroup::TGK_PHRASE, true}, // slack 1
|
||||||
|
{{{"0"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||||
|
{{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||||
|
{{{"3"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||||
|
{{{"4"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||||
|
|
||||||
|
{{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, true},
|
||||||
|
{{{"2"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, false},
|
||||||
|
{{{"2"}, {"0"}}, 1, HighlightData::TermGroup::TGK_NEAR, true},
|
||||||
|
{{{"4"}, {"0"}}, 2, HighlightData::TermGroup::TGK_NEAR, false},
|
||||||
|
{{{"4"}, {"0"}}, 3, HighlightData::TermGroup::TGK_NEAR, true},
|
||||||
|
};
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
thisprog = argv[0];
|
||||||
|
argc--; argv++;
|
||||||
|
|
||||||
|
while (argc > 0 && **argv == '-') {
|
||||||
|
(*argv)++;
|
||||||
|
if (!(**argv))
|
||||||
|
Usage();
|
||||||
|
while (**argv)
|
||||||
|
switch (*(*argv)++) {
|
||||||
|
case 'v': op_flags |= OPT_v; break;
|
||||||
|
default: Usage(); break;
|
||||||
|
}
|
||||||
|
argc--;argv++;
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "text, bpos:\n";
|
||||||
|
cout << "0123456789\n";
|
||||||
|
cout << "0 1 2 3 4\n";
|
||||||
|
for (auto& hld : hldvec) {
|
||||||
|
vector<GroupMatchEntry> tboffs;
|
||||||
|
bool ret = matchGroup(hld.hldata, 0, plists1, gpostobytes1, tboffs);
|
||||||
|
if (ret && !hld.expected) {
|
||||||
|
cout << "matchGroup: ok, expected false: ";
|
||||||
|
hld.print();
|
||||||
|
for (const auto& ent: tboffs) {
|
||||||
|
cout << "{" << ent.offs.first << ", " << ent.offs.second << "} ";
|
||||||
|
}
|
||||||
|
cout << "\n";
|
||||||
|
} else if (!ret && hld.expected) {
|
||||||
|
cout << "matchGroup: failed, expected true:\n";
|
||||||
|
hld.print();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,3 +1,22 @@
|
|||||||
|
/* Copyright (C) 2017-2019 J.F.Dockes
|
||||||
|
*
|
||||||
|
* License: LGPL 2.1
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2.1 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with this program; if not, write to the
|
||||||
|
* Free Software Foundation, Inc.,
|
||||||
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*/
|
||||||
#include "autoconfig.h"
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
@ -22,26 +41,6 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
class myTermProc : public Rcl::TermProc {
|
|
||||||
int first;
|
|
||||||
bool nooutput;
|
|
||||||
public:
|
|
||||||
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
|
||||||
void setNoOut(bool val) {nooutput = val;}
|
|
||||||
virtual bool takeword(const string &term, int pos, int bs, int be)
|
|
||||||
{
|
|
||||||
if (nooutput)
|
|
||||||
return true;
|
|
||||||
FILE *fp = stdout;
|
|
||||||
if (first) {
|
|
||||||
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
|
||||||
first = 0;
|
|
||||||
}
|
|
||||||
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#define OPT_s 0x1
|
#define OPT_s 0x1
|
||||||
#define OPT_w 0x2
|
#define OPT_w 0x2
|
||||||
#define OPT_q 0x4
|
#define OPT_q 0x4
|
||||||
@ -52,6 +51,82 @@ public:
|
|||||||
#define OPT_S 0x80
|
#define OPT_S 0x80
|
||||||
#define OPT_u 0x100
|
#define OPT_u 0x100
|
||||||
#define OPT_p 0x200
|
#define OPT_p 0x200
|
||||||
|
#define OPT_I 0x400
|
||||||
|
#define OPT_d 0x800
|
||||||
|
|
||||||
|
// Program name, used as a prefix for the usage message.
static std::string thisprog;

// Help text printed by Usage(). The input selection rules are: -I uses the
// built-in test strings, a filename argument is read as a file, and stdin is
// read when no filename is given.
static std::string usage =
    " textsplit [opts] [filename]\n"
    " -I : use internal data. Else read filename or stdin if no param.\n"
    " -q : no output\n"
    " -d : print position and byte lists for input to hldata\n"
    " -s : only spans\n"
    " -w : only words\n"
    " -n : no numbers\n"
    " -k : preserve wildcards (?*)\n"
    " -c : just count words\n"
    " -u : use unac\n"
    " -C [charset] : input charset\n"
    " -S [stopfile] : stopfile to use for commongrams\n"
    " if no filename is given, will read stdin for data (end with ^D)\n\n"
    " -p somephrase : display results from stringToStrings()\n"
    " \n"
    ;

// Print the usage message on stderr and exit with status 1.
static void
Usage(void)
{
    std::cerr << thisprog << ": usage:\n" << usage;
    exit(1);
}

// Bitmask of the OPT_x command line options which were set.
static int op_flags;
|
||||||
|
|
||||||
|
|
||||||
|
class myTermProc : public Rcl::TermProc {
|
||||||
|
int first;
|
||||||
|
bool nooutput;
|
||||||
|
public:
|
||||||
|
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
||||||
|
void setNoOut(bool val) {nooutput = val;}
|
||||||
|
virtual bool takeword(const string &term, int pos, int bs, int be) {
|
||||||
|
m_plists[term].push_back(pos);
|
||||||
|
m_gpostobytes[pos] = pair<int,int>(bs, be);
|
||||||
|
if (nooutput)
|
||||||
|
return true;
|
||||||
|
FILE *fp = stdout;
|
||||||
|
if (first) {
|
||||||
|
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
||||||
|
first = 0;
|
||||||
|
}
|
||||||
|
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void printpos() {
|
||||||
|
cout << "{";
|
||||||
|
for (const auto& lst : m_plists) {
|
||||||
|
cout << "{\"" << lst.first << "\", {";
|
||||||
|
for (int pos : lst.second) {
|
||||||
|
cout << pos << ",";
|
||||||
|
}
|
||||||
|
cout << "}}, ";
|
||||||
|
}
|
||||||
|
cout << "};\n";
|
||||||
|
cout << "{";
|
||||||
|
for (const auto& ent : m_gpostobytes) {
|
||||||
|
cout << "{" << ent.first << ", {";
|
||||||
|
cout << ent.second.first << ", " << ent.second.second << "}}, ";
|
||||||
|
}
|
||||||
|
cout << "};\n";
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
// group/near terms word positions.
|
||||||
|
map<string, vector<int> > m_plists;
|
||||||
|
map<int, pair<int, int> > m_gpostobytes;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
||||||
{
|
{
|
||||||
@ -73,6 +148,9 @@ bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
|||||||
printproc.setNoOut(true);
|
printproc.setNoOut(true);
|
||||||
|
|
||||||
splitter.text_to_words(data);
|
splitter.text_to_words(data);
|
||||||
|
if (op_flags & OPT_d) {
|
||||||
|
printproc.printpos();
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef TEXTSPLIT_STATS
|
#ifdef TEXTSPLIT_STATS
|
||||||
TextSplit::Stats::Values v = splitter.getStats();
|
TextSplit::Stats::Values v = splitter.getStats();
|
||||||
@ -115,33 +193,6 @@ const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
|
|||||||
|
|
||||||
static string teststring1 = " nouvel-an ";
|
static string teststring1 = " nouvel-an ";
|
||||||
|
|
||||||
static string thisprog;
|
|
||||||
|
|
||||||
static string usage =
|
|
||||||
" textsplit [opts] [filename]\n"
|
|
||||||
" -q : no output\n"
|
|
||||||
" -s : only spans\n"
|
|
||||||
" -w : only words\n"
|
|
||||||
" -n : no numbers\n"
|
|
||||||
" -k : preserve wildcards (?*)\n"
|
|
||||||
" -c : just count words\n"
|
|
||||||
" -u : use unac\n"
|
|
||||||
" -C [charset] : input charset\n"
|
|
||||||
" -S [stopfile] : stopfile to use for commongrams\n"
|
|
||||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
|
|
||||||
" textplit -p somephrase : display results from stringToStrings()\n"
|
|
||||||
" \n"
|
|
||||||
;
|
|
||||||
|
|
||||||
static void
|
|
||||||
Usage(void)
|
|
||||||
{
|
|
||||||
cerr << thisprog << ": usage:\n" << usage;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int op_flags;
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
string charset, stopfile;
|
string charset, stopfile;
|
||||||
@ -160,6 +211,8 @@ int main(int argc, char **argv)
|
|||||||
case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
|
case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
|
||||||
charset = *(++argv); argc--;
|
charset = *(++argv); argc--;
|
||||||
goto b1;
|
goto b1;
|
||||||
|
case 'd': op_flags |= OPT_d|OPT_q; break;
|
||||||
|
case 'I': op_flags |= OPT_I; break;
|
||||||
case 'k': op_flags |= OPT_k; break;
|
case 'k': op_flags |= OPT_k; break;
|
||||||
case 'n': op_flags |= OPT_n; break;
|
case 'n': op_flags |= OPT_n; break;
|
||||||
case 'p': op_flags |= OPT_p; break;
|
case 'p': op_flags |= OPT_p; break;
|
||||||
@ -205,31 +258,10 @@ int main(int argc, char **argv)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
string odata, reason;
|
|
||||||
if (argc == 1) {
|
if (op_flags & OPT_I) {
|
||||||
const char *filename = *argv++; argc--;
|
if (argc)
|
||||||
if (op_flags& OPT_p) {
|
Usage();
|
||||||
vector<string> tokens;
|
|
||||||
TextSplit::stringToStrings(filename, tokens);
|
|
||||||
for (vector<string>::const_iterator it = tokens.begin();
|
|
||||||
it != tokens.end(); it++) {
|
|
||||||
cout << "[" << *it << "] ";
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
if (!strcmp(filename, "stdin")) {
|
|
||||||
char buf[1024];
|
|
||||||
int nread;
|
|
||||||
while ((nread = read(0, buf, 1024)) > 0) {
|
|
||||||
odata.append(buf, nread);
|
|
||||||
}
|
|
||||||
} else if (!file_to_string(filename, odata, &reason)) {
|
|
||||||
cerr << "Failed: file_to_string(" << filename << ") failed: "
|
|
||||||
<< reason << endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (op_flags & OPT_p)
|
if (op_flags & OPT_p)
|
||||||
Usage();
|
Usage();
|
||||||
for (int i = 0; i < teststrings_cnt; i++) {
|
for (int i = 0; i < teststrings_cnt; i++) {
|
||||||
@ -237,6 +269,34 @@ int main(int argc, char **argv)
|
|||||||
dosplit(teststrings[i], flags, op_flags);
|
dosplit(teststrings[i], flags, op_flags);
|
||||||
}
|
}
|
||||||
exit(0);
|
exit(0);
|
||||||
|
} else if (op_flags& OPT_p) {
|
||||||
|
if (!argc)
|
||||||
|
Usage();
|
||||||
|
vector<string> tokens;
|
||||||
|
TextSplit::stringToStrings(argv[0], tokens);
|
||||||
|
for (vector<string>::const_iterator it = tokens.begin();
|
||||||
|
it != tokens.end(); it++) {
|
||||||
|
cout << "[" << *it << "] ";
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
string odata, reason;
|
||||||
|
if (argc == 1) {
|
||||||
|
const char *filename = *argv++; argc--;
|
||||||
|
if (!file_to_string(filename, odata, &reason)) {
|
||||||
|
cerr << "Failed: file_to_string(" << filename << ") failed: "
|
||||||
|
<< reason << endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
char buf[1024];
|
||||||
|
int nread;
|
||||||
|
while ((nread = read(0, buf, 1024)) > 0) {
|
||||||
|
odata.append(buf, nread);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
string& data = odata;
|
string& data = odata;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user