hldata: cleanup + support phrases
This commit is contained in:
parent
3f33d1d0ea
commit
4ad8a08030
@ -29,42 +29,6 @@ using std::map;
|
|||||||
using std::vector;
|
using std::vector;
|
||||||
using std::pair;
|
using std::pair;
|
||||||
|
|
||||||
bool do_proximity_test(int window, vector<const vector<int>*>& plists,
|
|
||||||
unsigned int i, int min, int max,
|
|
||||||
int *sp, int *ep, int minpos)
|
|
||||||
{
|
|
||||||
LOGDEB1("do_prox_test: win " << window << " i " << i << " min " <<
|
|
||||||
min << " max " << max << " minpos " << minpos << "\n");
|
|
||||||
int tmp = max + 1 - window;
|
|
||||||
if (tmp < minpos)
|
|
||||||
tmp = minpos;
|
|
||||||
|
|
||||||
// Find 1st position bigger than window start
|
|
||||||
auto it = plists[i]->begin();
|
|
||||||
while (it != plists[i]->end() && *it < tmp)
|
|
||||||
it++;
|
|
||||||
|
|
||||||
// Look for position inside window. If not found, no match. If
|
|
||||||
// found: if this is the last list we're done, else recurse on
|
|
||||||
// next list after adjusting the window
|
|
||||||
while (it != plists[i]->end()) {
|
|
||||||
int pos = *it;
|
|
||||||
if (pos > min + window - 1)
|
|
||||||
return false;
|
|
||||||
if (i + 1 == plists.size()) {
|
|
||||||
setWinMinMax(pos, *sp, *ep);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
setWinMinMax(pos, min, max);
|
|
||||||
if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
|
|
||||||
setWinMinMax(pos, *sp, *ep);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
it++;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef DEBUGGROUPS
|
#undef DEBUGGROUPS
|
||||||
#ifdef DEBUGGROUPS
|
#ifdef DEBUGGROUPS
|
||||||
#define LOGRP LOGDEB
|
#define LOGRP LOGDEB
|
||||||
@ -72,19 +36,91 @@ bool do_proximity_test(int window, vector<const vector<int>*>& plists,
|
|||||||
#define LOGRP LOGDEB1
|
#define LOGRP LOGDEB1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Find NEAR matches for one group of terms
|
/*
 * Widen the [sta, sto] position interval so that it includes pos.
 *
 * @param pos the term position to take into account.
 * @param sta interval start, lowered to pos if pos is smaller.
 * @param sto interval end, raised to pos if pos is bigger.
 */
static inline void setWinMinMax(int pos, int& sta, int& sto)
{
    sta = (pos < sta) ? pos : sta;
    sto = (pos > sto) ? pos : sto;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* @param window the total width for the "near" area, in positions.
|
||||||
|
|
||||||
|
* @param plists the position vectors for the terms. The array is
|
||||||
|
* sorted shorted first for optimization. The function does a
|
||||||
|
* recursive call on the next array if the match is still possible
|
||||||
|
* after dealing with the current one
|
||||||
|
|
||||||
|
* @param plist_idx the index for the position list we will work with.
|
||||||
|
* @param min, max the current minimum and maximum term positions.
|
||||||
|
* @param[output] sp, ep, the start and end positions of the found match.
|
||||||
|
* @param minpos Highest end of a found match. While looking for
|
||||||
|
* further matches, we don't want the search to extend before
|
||||||
|
* this, because it does not make sense for highlight regions to
|
||||||
|
* overlap.
|
||||||
|
* @param isphrase if true, the position lists are in term order, and
|
||||||
|
* we only look for the next match beyond the current window top.
|
||||||
|
*/
|
||||||
|
static bool do_proximity_test(
|
||||||
|
const int window, vector<const vector<int>*>& plists,
|
||||||
|
unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
|
||||||
|
bool isphrase)
|
||||||
|
{
|
||||||
|
LOGINF("do_prox_test: win " << window << " plist_idx " << plist_idx <<
|
||||||
|
" min " << min << " max " << max << " minpos " << minpos <<
|
||||||
|
" isphrase " << isphrase << "\n");
|
||||||
|
|
||||||
|
// Overlap interdiction: possibly adjust window start by input minpos
|
||||||
|
int actualminpos = isphrase ? max + 1 : max + 1 - window;
|
||||||
|
if (actualminpos < minpos)
|
||||||
|
actualminpos = minpos;
|
||||||
|
|
||||||
|
// Find 1st position bigger than window start
|
||||||
|
auto it = plists[plist_idx]->begin();
|
||||||
|
while (it != plists[plist_idx]->end() && *it < actualminpos)
|
||||||
|
it++;
|
||||||
|
|
||||||
|
// Look for position inside window. If not found, no match. If
|
||||||
|
// found: if this is the last list we're done, else recurse on
|
||||||
|
// next list after adjusting the window
|
||||||
|
while (it != plists[plist_idx]->end()) {
|
||||||
|
int pos = *it;
|
||||||
|
if (pos > min + window - 1)
|
||||||
|
return false;
|
||||||
|
if (plist_idx + 1 == plists.size()) {
|
||||||
|
// Done: set return values
|
||||||
|
setWinMinMax(pos, *sp, *ep);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
setWinMinMax(pos, min, max);
|
||||||
|
if (do_proximity_test(window,plists, plist_idx + 1,
|
||||||
|
min, max, sp, ep, minpos)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Find matches for one group of terms
|
||||||
bool matchGroup(const HighlightData& hldata,
|
bool matchGroup(const HighlightData& hldata,
|
||||||
unsigned int grpidx,
|
unsigned int grpidx,
|
||||||
const map<string, vector<int>>& inplists,
|
const map<string, vector<int>>& inplists,
|
||||||
const map<int, pair<int,int>>& gpostobytes,
|
const map<int, pair<int,int>>& gpostobytes,
|
||||||
vector<GroupMatchEntry>& tboffs
|
vector<GroupMatchEntry>& tboffs,
|
||||||
|
bool isphrase
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const vector<string>& terms = hldata.groups[grpidx];
|
isphrase=true;
|
||||||
int window = int(hldata.groups[grpidx].size() + hldata.slacks[grpidx]);
|
const vector<string>& terms = hldata.index_term_groups[grpidx];
|
||||||
|
int window = int(terms.size() + hldata.slacks[grpidx]);
|
||||||
|
|
||||||
LOGRP("TextSplitPTR::matchGroup:d " << window << ": " <<
|
LOGRP("TextSplitPTR::matchGroup:d " << window << ": " <<
|
||||||
stringsToString(terms) << "\n");
|
stringsToString(terms) << "\n");
|
||||||
|
|
||||||
// The position lists we are going to work with. We extract them from the
|
// The position lists we are going to work with. We extract them from the
|
||||||
// (string->plist) map
|
// (string->plist) map
|
||||||
@ -100,7 +136,7 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
map<string, vector<int> >::const_iterator pl = inplists.find(term);
|
map<string, vector<int> >::const_iterator pl = inplists.find(term);
|
||||||
if (pl == inplists.end()) {
|
if (pl == inplists.end()) {
|
||||||
LOGRP("TextSplitPTR::matchGroup: [" << term <<
|
LOGRP("TextSplitPTR::matchGroup: [" << term <<
|
||||||
"] not found in plists\n");
|
"] not found in plists\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
plists.push_back(&(pl->second));
|
plists.push_back(&(pl->second));
|
||||||
@ -112,13 +148,16 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
|
LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Sort the positions lists so that the shorter is first
|
|
||||||
std::sort(plists.begin(), plists.end(),
|
|
||||||
[](const vector<int> *a, const vector<int> *b) -> bool {
|
|
||||||
return a->size() < b->size();
|
|
||||||
}
|
|
||||||
);
|
|
||||||
|
|
||||||
|
if (!isphrase) {
|
||||||
|
// Sort the positions lists so that the shorter is first
|
||||||
|
std::sort(plists.begin(), plists.end(),
|
||||||
|
[](const vector<int> *a, const vector<int> *b) -> bool {
|
||||||
|
return a->size() < b->size();
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
if (0) { // Debug
|
if (0) { // Debug
|
||||||
auto it = plistToTerm.find(plists[0]);
|
auto it = plistToTerm.find(plists[0]);
|
||||||
if (it == plistToTerm.end()) {
|
if (it == plistToTerm.end()) {
|
||||||
@ -127,7 +166,7 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
LOGRP("matchGroup: walking the shortest plist. Term [" <<
|
LOGRP("matchGroup: walking the shortest plist. Term [" <<
|
||||||
it->second << "], len " << plists[0]->size() << "\n");
|
it->second << "], len " << plists[0]->size() << "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Minpos is the highest end of a found match. While looking for
|
// Minpos is the highest end of a found match. While looking for
|
||||||
@ -139,12 +178,11 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
for (int pos : *(plists[0])) {
|
for (int pos : *(plists[0])) {
|
||||||
int sta = INT_MAX, sto = 0;
|
int sta = INT_MAX, sto = 0;
|
||||||
LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
|
LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
|
||||||
if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
|
if (do_proximity_test(
|
||||||
LOGRP("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
|
window, plists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
|
||||||
"," << sto << "]\n");
|
|
||||||
// Maybe extend the window by 1st term position, this was not
|
|
||||||
// done by do_prox..
|
|
||||||
setWinMinMax(pos, sta, sto);
|
setWinMinMax(pos, sta, sto);
|
||||||
|
LOGINF("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
|
||||||
|
"," << sto << "]\n");
|
||||||
minpos = sto + 1;
|
minpos = sto + 1;
|
||||||
// Translate the position window into a byte offset window
|
// Translate the position window into a byte offset window
|
||||||
auto i1 = gpostobytes.find(sta);
|
auto i1 = gpostobytes.find(sta);
|
||||||
@ -153,7 +191,7 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
|
LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
|
||||||
i1->second.first << " " << i2->second.second << "\n");
|
i1->second.first << " " << i2->second.second << "\n");
|
||||||
tboffs.push_back(GroupMatchEntry(i1->second.first,
|
tboffs.push_back(GroupMatchEntry(i1->second.first,
|
||||||
i2->second.second, grpidx));
|
i2->second.second, grpidx));
|
||||||
} else {
|
} else {
|
||||||
LOGDEB0("matchGroup: no bpos found for " << sta << " or "
|
LOGDEB0("matchGroup: no bpos found for " << sta << " or "
|
||||||
<< sto << "\n");
|
<< sto << "\n");
|
||||||
@ -169,24 +207,23 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
void HighlightData::toString(string& out) const
|
void HighlightData::toString(string& out) const
|
||||||
{
|
{
|
||||||
out.append("\nUser terms (orthograph): ");
|
out.append("\nUser terms (orthograph): ");
|
||||||
for (std::set<string>::const_iterator it = uterms.begin();
|
for (const auto& term : uterms) {
|
||||||
it != uterms.end(); it++) {
|
out.append(" [").append(term).append("]");
|
||||||
out.append(" [").append(*it).append("]");
|
|
||||||
}
|
}
|
||||||
out.append("\nUser terms to Query terms:");
|
out.append("\nUser terms to Query terms:");
|
||||||
for (map<string, string>::const_iterator it = terms.begin();
|
for (const auto& entry: terms) {
|
||||||
it != terms.end(); it++) {
|
out.append("[").append(entry.first).append("]->[");
|
||||||
out.append("[").append(it->first).append("]->[");
|
out.append(entry.second).append("] ");
|
||||||
out.append(it->second).append("] ");
|
|
||||||
}
|
}
|
||||||
out.append("\nGroups: ");
|
out.append("\nGroups: ");
|
||||||
char cbuf[200];
|
char cbuf[200];
|
||||||
sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
|
sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
|
||||||
int(groups.size()), int(grpsugidx.size()), int(ugroups.size()));
|
int(index_term_groups.size()), int(grpsugidx.size()),
|
||||||
|
int(ugroups.size()));
|
||||||
out.append(cbuf);
|
out.append(cbuf);
|
||||||
|
|
||||||
size_t ugidx = (size_t) - 1;
|
size_t ugidx = (size_t) - 1;
|
||||||
for (unsigned int i = 0; i < groups.size(); i++) {
|
for (unsigned int i = 0; i < index_term_groups.size(); i++) {
|
||||||
if (ugidx != grpsugidx[i]) {
|
if (ugidx != grpsugidx[i]) {
|
||||||
ugidx = grpsugidx[i];
|
ugidx = grpsugidx[i];
|
||||||
out.append("\n(");
|
out.append("\n(");
|
||||||
@ -196,8 +233,8 @@ void HighlightData::toString(string& out) const
|
|||||||
out.append(") ->");
|
out.append(") ->");
|
||||||
}
|
}
|
||||||
out.append(" {");
|
out.append(" {");
|
||||||
for (unsigned int j = 0; j < groups[i].size(); j++) {
|
for (unsigned int j = 0; j < index_term_groups[i].size(); j++) {
|
||||||
out.append("[").append(groups[i][j]).append("]");
|
out.append("[").append(index_term_groups[i][j]).append("]");
|
||||||
}
|
}
|
||||||
sprintf(cbuf, "%d", slacks[i]);
|
sprintf(cbuf, "%d", slacks[i]);
|
||||||
out.append("}").append(cbuf);
|
out.append("}").append(cbuf);
|
||||||
@ -212,10 +249,12 @@ void HighlightData::append(const HighlightData& hl)
|
|||||||
size_t ugsz0 = ugroups.size();
|
size_t ugsz0 = ugroups.size();
|
||||||
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
|
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
|
||||||
|
|
||||||
groups.insert(groups.end(), hl.groups.begin(), hl.groups.end());
|
index_term_groups.insert(index_term_groups.end(),
|
||||||
|
hl.index_term_groups.begin(),
|
||||||
|
hl.index_term_groups.end());
|
||||||
slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end());
|
slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end());
|
||||||
for (std::vector<size_t>::const_iterator it = hl.grpsugidx.begin();
|
for (std::vector<size_t>::const_iterator it = hl.grpsugidx.begin();
|
||||||
it != hl.grpsugidx.end(); it++) {
|
it != hl.grpsugidx.end(); it++) {
|
||||||
grpsugidx.push_back(*it + ugsz0);
|
grpsugidx.push_back(*it + ugsz0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -40,8 +40,11 @@ struct HighlightData {
|
|||||||
* (unaccented and lowercased as needed depending on
|
* (unaccented and lowercased as needed depending on
|
||||||
* configuration), and the list may include values
|
* configuration), and the list may include values
|
||||||
* expanded from the original terms by stem or wildcard expansion.
|
* expanded from the original terms by stem or wildcard expansion.
|
||||||
|
* NEAR clauses are expanded to all possible combinations of the
|
||||||
|
* stem-expanded member terms. Ex:
|
||||||
|
* "clean floor"p -> (clean floor) (clean floors) (cleaning floor)...
|
||||||
*/
|
*/
|
||||||
std::vector<std::vector<std::string> > groups;
|
std::vector<std::vector<std::string> > index_term_groups;
|
||||||
/** Group slacks. Parallel to groups */
|
/** Group slacks. Parallel to index_term_groups */
|
||||||
std::vector<int> slacks;
|
std::vector<int> slacks;
|
||||||
|
|
||||||
@ -53,11 +56,10 @@ struct HighlightData {
|
|||||||
*/
|
*/
|
||||||
std::vector<size_t> grpsugidx;
|
std::vector<size_t> grpsugidx;
|
||||||
|
|
||||||
void clear()
|
void clear() {
|
||||||
{
|
|
||||||
uterms.clear();
|
uterms.clear();
|
||||||
ugroups.clear();
|
ugroups.clear();
|
||||||
groups.clear();
|
index_term_groups.clear();
|
||||||
slacks.clear();
|
slacks.clear();
|
||||||
grpsugidx.clear();
|
grpsugidx.clear();
|
||||||
}
|
}
|
||||||
@ -67,35 +69,7 @@ struct HighlightData {
|
|||||||
void toString(std::string& out) const;
|
void toString(std::string& out) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
inline void setWinMinMax(int pos, int& sta, int& sto)
|
/* The following is used by plaintorich.cpp for finding zones to
|
||||||
{
|
|
||||||
if (pos < sta) {
|
|
||||||
sta = pos;
|
|
||||||
}
|
|
||||||
if (pos > sto) {
|
|
||||||
sto = pos;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check that at least an entry from the first position list is inside
|
|
||||||
// the window and recurse on next list. The window is readjusted as
|
|
||||||
// the successive terms are found. Mostly copied from Xapian code.
|
|
||||||
//
|
|
||||||
// @param window the search window width
|
|
||||||
// @param plists the position list vector
|
|
||||||
// @param i the position list to process (we then recurse with the next list)
|
|
||||||
// @param min the current minimum pos for a found term
|
|
||||||
// @param max the current maximum pos for a found term
|
|
||||||
// @param sp, ep output: the found area
|
|
||||||
// @param minpos bottom of search: this is the highest point of
|
|
||||||
// any previous match. We don't look below this as overlapping matches
|
|
||||||
// make no sense for highlighting.
|
|
||||||
extern bool do_proximity_test(
|
|
||||||
int window, std::vector<const std::vector<int>*>& plists,
|
|
||||||
unsigned int i, int min, int max, int *sp, int *ep, int minpos);
|
|
||||||
|
|
||||||
|
|
||||||
/**** The following is used by plaintorich.cpp for finding zones to
|
|
||||||
highlight and by rclabsfromtext.cpp to choose fragments for the
|
highlight and by rclabsfromtext.cpp to choose fragments for the
|
||||||
abstract */
|
abstract */
|
||||||
|
|
||||||
@ -112,17 +86,31 @@ struct GroupMatchEntry {
|
|||||||
|
|
||||||
// Find NEAR matches for one group of terms.
|
// Find NEAR matches for one group of terms.
|
||||||
//
|
//
|
||||||
// @param hldata Data about the user query
|
// @param hldata User query expansion descriptor (see above).
|
||||||
// @param grpidx Index in hldata.groups for the group we process
|
//
|
||||||
// @param inplists Position lists for the the group terms
|
// @param grpidx Index in hldata.index_term_groups for the group we
|
||||||
// @param gpostobytes Translation of term position to start/end byte offsets
|
// process. This is used by us to get the terms and slacks, and
|
||||||
// @param[out] tboffs Found matches
|
// set in the output GroupMatchEntry structures to allow the
|
||||||
|
// caller to link a match with a specific user input (e.g. for
|
||||||
|
// walking the match in the GUI preview)
|
||||||
|
//
|
||||||
|
// @param inplists Position lists for the group terms. This is the
|
||||||
|
// data used to look for matches.
|
||||||
|
//
|
||||||
|
// @param gpostobytes Translation of term position to start/end byte
|
||||||
|
// offsets. This is used to translate term positions to byte
|
||||||
|
// positions in the output, for ease of use by caller.
|
||||||
|
//
|
||||||
|
// @param[out] tboffs Found matches. Each match has a begin and end
|
||||||
|
// byte offset and an index linking to the origin data in the
|
||||||
|
// HighlightData structure.
|
||||||
extern bool matchGroup(
|
extern bool matchGroup(
|
||||||
const HighlightData& hldata,
|
const HighlightData& hldata,
|
||||||
unsigned int grpidx,
|
unsigned int grpidx,
|
||||||
const std::map<std::string, std::vector<int>>& inplists,
|
const std::map<std::string, std::vector<int>>& inplists,
|
||||||
const std::map<int, std::pair<int,int>>& gpostobytes,
|
const std::map<int, std::pair<int,int>>& gpostobytes,
|
||||||
std::vector<GroupMatchEntry>& tboffs
|
std::vector<GroupMatchEntry>& tboffs,
|
||||||
|
bool isphrase = false
|
||||||
);
|
);
|
||||||
|
|
||||||
#endif /* _hldata_h_included_ */
|
#endif /* _hldata_h_included_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user