add test driver for hldata:matchGroup + some help from textsplit
This commit is contained in:
parent
4a56748fe5
commit
41c9ea92c7
@ -37,7 +37,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
|
||||
-D_GNU_SOURCE \
|
||||
$(DEFS)
|
||||
|
||||
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig
|
||||
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata
|
||||
|
||||
textsplit_SOURCES = trtextsplit.cpp
|
||||
textsplit_LDADD = ../librecoll.la
|
||||
@ -51,3 +51,6 @@ fstreewalk_LDADD = ../librecoll.la
|
||||
rclconfig_SOURCES = trrclconfig.cpp
|
||||
rclconfig_LDADD = ../librecoll.la
|
||||
|
||||
hldata_SOURCES = trhldata.cpp
|
||||
hldata_LDADD = ../librecoll.la
|
||||
|
||||
|
||||
145
src/testmains/trhldata.cpp
Normal file
145
src/testmains/trhldata.cpp
Normal file
@ -0,0 +1,145 @@
|
||||
/* Copyright (C) 2019 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "log.h"
|
||||
#include "hldata.h"
|
||||
#include "smallut.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
const char *thisprog;
|
||||
static char usage [] =
|
||||
"hldata\n"
|
||||
" test the near/phrase matching code used for highlighting and snippets\n"
|
||||
;
|
||||
|
||||
// Write "<program>:<help text>" followed by a newline to stderr, then
// terminate the process with exit status 1.
void Usage()
{
    fprintf(stderr, "%s:", thisprog);
    fprintf(stderr, "%s\n", usage);
    exit(1);
}
|
||||
|
||||
static int op_flags;
|
||||
#define OPT_v 0x2
|
||||
|
||||
vector<CharFlags> kindflags {
|
||||
CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM),
|
||||
CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR),
|
||||
CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE),
|
||||
};
|
||||
|
||||
// Provides a constructor for HighlightData, for easy static init.
|
||||
class HLDataInitializer {
|
||||
public:
|
||||
HLDataInitializer(vector<vector<string> > groups, int slack,
|
||||
HighlightData::TermGroup::TGK kind, bool res) {
|
||||
hldata.index_term_groups.clear();
|
||||
hldata.index_term_groups.push_back(HighlightData::TermGroup());
|
||||
hldata.index_term_groups[0].orgroups = groups;
|
||||
hldata.index_term_groups[0].slack = slack;
|
||||
hldata.index_term_groups[0].kind = kind;
|
||||
expected = res;
|
||||
}
|
||||
HighlightData hldata;
|
||||
bool expected;
|
||||
void print() {
|
||||
const auto& tgp{hldata.index_term_groups[0]};
|
||||
cout << "{";
|
||||
for (const auto& group:tgp.orgroups) {
|
||||
cout << "{";
|
||||
for (const auto& term: group) {
|
||||
cout << term << ", ";
|
||||
}
|
||||
cout << "}, ";
|
||||
}
|
||||
cout << "} slack: " << tgp.slack << " kind " <<
|
||||
valToString(kindflags, tgp.kind) << endl;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Data: source text (for display).
|
||||
string text1{"0 1 2 3 4"};
|
||||
// Positions produced by textsplit -d from the above
|
||||
map<string, vector<int> > plists1
|
||||
{{"0", {0,}}, {"1", {1,}}, {"2", {2,}}, {"3", {3,}}, {"4", {4,}}, };
|
||||
map<int, pair<int,int>> gpostobytes1
|
||||
{{0, {0, 1}}, {1, {2, 3}}, {2, {4, 5}}, {3, {6, 7}}, {4, {8, 9}}, };
|
||||
|
||||
|
||||
vector<HLDataInitializer> hldvec {
|
||||
{{{"0"}, {"1"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||
{{{"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||
{{{"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||
{{{"0"}, {"1"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||
{{{"1"}, {"2"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||
{{{"0"}, {"1"}, {"2"}, {"3"}, {"4"}}, 0, HighlightData::TermGroup::TGK_PHRASE, true},
|
||||
{{{"0"}, {"2"}}, 1, HighlightData::TermGroup::TGK_PHRASE, true}, // slack 1
|
||||
{{{"0"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||
{{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||
{{{"3"}, {"2"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||
{{{"4"}, {"3"}}, 0, HighlightData::TermGroup::TGK_PHRASE, false},
|
||||
|
||||
{{{"1"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, true},
|
||||
{{{"2"}, {"0"}}, 0, HighlightData::TermGroup::TGK_NEAR, false},
|
||||
{{{"2"}, {"0"}}, 1, HighlightData::TermGroup::TGK_NEAR, true},
|
||||
{{{"4"}, {"0"}}, 2, HighlightData::TermGroup::TGK_NEAR, false},
|
||||
{{{"4"}, {"0"}}, 3, HighlightData::TermGroup::TGK_NEAR, true},
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
while (argc > 0 && **argv == '-') {
|
||||
(*argv)++;
|
||||
if (!(**argv))
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'v': op_flags |= OPT_v; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
argc--;argv++;
|
||||
}
|
||||
|
||||
cout << "text, bpos:\n";
|
||||
cout << "0123456789\n";
|
||||
cout << "0 1 2 3 4\n";
|
||||
for (auto& hld : hldvec) {
|
||||
vector<GroupMatchEntry> tboffs;
|
||||
bool ret = matchGroup(hld.hldata, 0, plists1, gpostobytes1, tboffs);
|
||||
if (ret && !hld.expected) {
|
||||
cout << "matchGroup: ok, expected false: ";
|
||||
hld.print();
|
||||
for (const auto& ent: tboffs) {
|
||||
cout << "{" << ent.offs.first << ", " << ent.offs.second << "} ";
|
||||
}
|
||||
cout << "\n";
|
||||
} else if (!ret && hld.expected) {
|
||||
cout << "matchGroup: failed, expected true:\n";
|
||||
hld.print();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,3 +1,22 @@
|
||||
/* Copyright (C) 2017-2019 J.F.Dockes
|
||||
*
|
||||
* License: LGPL 2.1
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation; either version 2.1 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "textsplit.h"
|
||||
@ -22,26 +41,6 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
class myTermProc : public Rcl::TermProc {
|
||||
int first;
|
||||
bool nooutput;
|
||||
public:
|
||||
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
||||
void setNoOut(bool val) {nooutput = val;}
|
||||
virtual bool takeword(const string &term, int pos, int bs, int be)
|
||||
{
|
||||
if (nooutput)
|
||||
return true;
|
||||
FILE *fp = stdout;
|
||||
if (first) {
|
||||
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
||||
first = 0;
|
||||
}
|
||||
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
#define OPT_s 0x1
|
||||
#define OPT_w 0x2
|
||||
#define OPT_q 0x4
|
||||
@ -52,6 +51,82 @@ public:
|
||||
#define OPT_S 0x80
|
||||
#define OPT_u 0x100
|
||||
#define OPT_p 0x200
|
||||
#define OPT_I 0x400
|
||||
#define OPT_d 0x800
|
||||
|
||||
static string thisprog;
|
||||
|
||||
// Help text printed by Usage(). The input-selection line matches what
// main() does: data comes from the named file, or from stdin when no file
// name is given (there is no special 'stdin' file name any more).
static string usage =
    " textsplit [opts] [filename]\n"
    " -I : use internal data. Else read filename or stdin if no param.\n"
    " -q : no output\n"
    " -d : print position and byte lists for input to hldata\n"
    " -s : only spans\n"
    " -w : only words\n"
    " -n : no numbers\n"
    " -k : preserve wildcards (?*)\n"
    " -c : just count words\n"
    " -u : use unac\n"
    " -C [charset] : input charset\n"
    " -S [stopfile] : stopfile to use for commongrams\n"
    " if no filename is given, will read stdin for data (end with ^D)\n\n"
    " -p somephrase : display results from stringToStrings()\n"
    " \n"
    ;
|
||||
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
cerr << thisprog << ": usage:\n" << usage;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static int op_flags;
|
||||
|
||||
|
||||
class myTermProc : public Rcl::TermProc {
|
||||
int first;
|
||||
bool nooutput;
|
||||
public:
|
||||
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
||||
void setNoOut(bool val) {nooutput = val;}
|
||||
virtual bool takeword(const string &term, int pos, int bs, int be) {
|
||||
m_plists[term].push_back(pos);
|
||||
m_gpostobytes[pos] = pair<int,int>(bs, be);
|
||||
if (nooutput)
|
||||
return true;
|
||||
FILE *fp = stdout;
|
||||
if (first) {
|
||||
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
||||
first = 0;
|
||||
}
|
||||
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
||||
return true;
|
||||
}
|
||||
|
||||
void printpos() {
|
||||
cout << "{";
|
||||
for (const auto& lst : m_plists) {
|
||||
cout << "{\"" << lst.first << "\", {";
|
||||
for (int pos : lst.second) {
|
||||
cout << pos << ",";
|
||||
}
|
||||
cout << "}}, ";
|
||||
}
|
||||
cout << "};\n";
|
||||
cout << "{";
|
||||
for (const auto& ent : m_gpostobytes) {
|
||||
cout << "{" << ent.first << ", {";
|
||||
cout << ent.second.first << ", " << ent.second.second << "}}, ";
|
||||
}
|
||||
cout << "};\n";
|
||||
}
|
||||
private:
|
||||
// group/near terms word positions.
|
||||
map<string, vector<int> > m_plists;
|
||||
map<int, pair<int, int> > m_gpostobytes;
|
||||
};
|
||||
|
||||
|
||||
bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
||||
{
|
||||
@ -73,6 +148,9 @@ bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
||||
printproc.setNoOut(true);
|
||||
|
||||
splitter.text_to_words(data);
|
||||
if (op_flags & OPT_d) {
|
||||
printproc.printpos();
|
||||
}
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
TextSplit::Stats::Values v = splitter.getStats();
|
||||
@ -115,33 +193,6 @@ const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
|
||||
|
||||
static string teststring1 = " nouvel-an ";
|
||||
|
||||
static string thisprog;
|
||||
|
||||
static string usage =
|
||||
" textsplit [opts] [filename]\n"
|
||||
" -q : no output\n"
|
||||
" -s : only spans\n"
|
||||
" -w : only words\n"
|
||||
" -n : no numbers\n"
|
||||
" -k : preserve wildcards (?*)\n"
|
||||
" -c : just count words\n"
|
||||
" -u : use unac\n"
|
||||
" -C [charset] : input charset\n"
|
||||
" -S [stopfile] : stopfile to use for commongrams\n"
|
||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
|
||||
" textplit -p somephrase : display results from stringToStrings()\n"
|
||||
" \n"
|
||||
;
|
||||
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
cerr << thisprog << ": usage:\n" << usage;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static int op_flags;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
string charset, stopfile;
|
||||
@ -160,6 +211,8 @@ int main(int argc, char **argv)
|
||||
case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
|
||||
charset = *(++argv); argc--;
|
||||
goto b1;
|
||||
case 'd': op_flags |= OPT_d|OPT_q; break;
|
||||
case 'I': op_flags |= OPT_I; break;
|
||||
case 'k': op_flags |= OPT_k; break;
|
||||
case 'n': op_flags |= OPT_n; break;
|
||||
case 'p': op_flags |= OPT_p; break;
|
||||
@ -205,31 +258,10 @@ int main(int argc, char **argv)
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
string odata, reason;
|
||||
if (argc == 1) {
|
||||
const char *filename = *argv++; argc--;
|
||||
if (op_flags& OPT_p) {
|
||||
vector<string> tokens;
|
||||
TextSplit::stringToStrings(filename, tokens);
|
||||
for (vector<string>::const_iterator it = tokens.begin();
|
||||
it != tokens.end(); it++) {
|
||||
cout << "[" << *it << "] ";
|
||||
}
|
||||
cout << endl;
|
||||
exit(0);
|
||||
}
|
||||
if (!strcmp(filename, "stdin")) {
|
||||
char buf[1024];
|
||||
int nread;
|
||||
while ((nread = read(0, buf, 1024)) > 0) {
|
||||
odata.append(buf, nread);
|
||||
}
|
||||
} else if (!file_to_string(filename, odata, &reason)) {
|
||||
cerr << "Failed: file_to_string(" << filename << ") failed: "
|
||||
<< reason << endl;
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
|
||||
if (op_flags & OPT_I) {
|
||||
if (argc)
|
||||
Usage();
|
||||
if (op_flags & OPT_p)
|
||||
Usage();
|
||||
for (int i = 0; i < teststrings_cnt; i++) {
|
||||
@ -237,6 +269,34 @@ int main(int argc, char **argv)
|
||||
dosplit(teststrings[i], flags, op_flags);
|
||||
}
|
||||
exit(0);
|
||||
} else if (op_flags& OPT_p) {
|
||||
if (!argc)
|
||||
Usage();
|
||||
vector<string> tokens;
|
||||
TextSplit::stringToStrings(argv[0], tokens);
|
||||
for (vector<string>::const_iterator it = tokens.begin();
|
||||
it != tokens.end(); it++) {
|
||||
cout << "[" << *it << "] ";
|
||||
}
|
||||
cout << endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
string odata, reason;
|
||||
if (argc == 1) {
|
||||
const char *filename = *argv++; argc--;
|
||||
if (!file_to_string(filename, odata, &reason)) {
|
||||
cerr << "Failed: file_to_string(" << filename << ") failed: "
|
||||
<< reason << endl;
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
char buf[1024];
|
||||
int nread;
|
||||
while ((nread = read(0, buf, 1024)) > 0) {
|
||||
odata.append(buf, nread);
|
||||
}
|
||||
}
|
||||
|
||||
string& data = odata;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user