/* Copyright (C) 2017-2019 J.F.Dockes
 *
 * License: GPL 2.1
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "autoconfig.h"
|
|
|
|
#include "hldata.h"
|
|
|
|
#include <algorithm>
|
|
#include <limits.h>
|
|
|
|
#include "log.h"
|
|
#include "smallut.h"
|
|
|
|
using std::string;
|
|
using std::unordered_map;
|
|
using std::vector;
|
|
using std::pair;
|
|
|
|
#undef DEBUGGROUPS
|
|
#ifdef DEBUGGROUPS
|
|
#define LOGRP LOGINF
|
|
#else
|
|
#define LOGRP LOGDEB1
|
|
#endif
|
|
|
|
// Combined position list for or'd terms
struct OrPList {
    void addplist(const string& term, const vector<int>* pl) {
        terms.push_back(term);
        plists.push_back(pl);
        indexes.push_back(0);
        totalsize += pl->size();
    }

    // Returns -1 for eof, else the next smallest value in the
    // combined lists, according to the current indexes.
    int value() {
        int minval = INT_MAX;
        int minidx = -1;
        for (unsigned ii = 0; ii < indexes.size(); ii++) {
            const vector<int>& pl(*plists[ii]);
            if (indexes[ii] >= pl.size())
                continue; // this list done
            if (pl[indexes[ii]] < minval) {
                minval = pl[indexes[ii]];
                minidx = ii;
            }
        }
        if (minidx != -1) {
            LOGRP("OrPList::value() -> " << minval << " for " <<
                  terms[minidx] << "\n");
            currentidx = minidx;
            return minval;
        } else {
            LOGRP("OrPList::value(): EOL for " << stringsToString(terms) << "\n");
            return -1;
        }
    }

    int next() {
        if (currentidx != -1) {
            indexes[currentidx]++;
        }
        return value();
    }

    int size() const {
        return totalsize;
    }
    void rewind() {
        for (auto& idx : indexes) {
            idx = 0;
        }
        currentidx = -1;
    }

    vector<const vector<int>*> plists;
    vector<unsigned int> indexes;
    vector<string> terms;
    int currentidx{-1};
    int totalsize{0};
};
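
// Illustrative trace (hypothetical terms and positions, for documentation only):
// with a group where the user term "cat" expanded to "cat" -> positions {3, 9}
// and "cats" -> positions {5}, successive next() calls yield 3, 5, 9, then -1 at
// end of lists: a plain k-way merge over the OR'd position lists, with currentidx
// remembering which list supplied the last value so that next() can advance it.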

static inline void setWinMinMax(int pos, int& sta, int& sto)
{
    if (pos < sta) {
        sta = pos;
    }
    if (pos > sto) {
        sto = pos;
    }
}

/*
 * @param window the total width for the "near" area, in positions.
 *
 * @param plists the position vectors for the terms. The array is
 *   sorted shortest first for optimization. The function does a
 *   recursive call on the next list if the match is still possible
 *   after dealing with the current one.
 *
 * @param plist_idx the index for the position list we will work with.
 * @param min, max the current minimum and maximum term positions.
 * @param[out] sp, ep the start and end positions of the found match.
 * @param minpos Highest end of a found match. While looking for
 *   further matches, we don't want the search to extend before
 *   this, because it does not make sense for highlight regions to
 *   overlap.
 * @param isphrase if true, the position lists are in term order, and
 *   we only look for the next match beyond the current window top.
 */
static bool do_proximity_test(
    const int window, vector<OrPList>& plists,
    unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
    bool isphrase)
{
    // Overlap interdiction: possibly adjust window start by input minpos
    int actualminpos = isphrase ? max + 1 : max + 1 - window;
    if (actualminpos < minpos)
        actualminpos = minpos;
    LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx <<
          " min " << min << " max " << max << " minpos " << minpos <<
          " isphrase " << isphrase << " actualminpos " << actualminpos << "\n");

    // Find the 1st position bigger than the window start. A previous call may
    // have advanced the index, so we begin by retrieving the current value.
    int nextpos = plists[plist_idx].value();
    while (nextpos != -1 && nextpos < actualminpos)
        nextpos = plists[plist_idx].next();

    // Look for a position inside the window. If not found, no match. If
    // found: if this is the last list we're done, else recurse on the
    // next list after adjusting the window.
    while (nextpos != -1) {
        if (nextpos > min + window - 1) {
            return false;
        }
        if (plist_idx + 1 == plists.size()) {
            // The position is inside the window and this is the last
            // list: done, set the return values.
            setWinMinMax(nextpos, *sp, *ep);
            return true;
        }
        setWinMinMax(nextpos, min, max);
        if (do_proximity_test(window, plists, plist_idx + 1,
                              min, max, sp, ep, minpos, isphrase)) {
            return true;
        }
        nextpos = plists[plist_idx].next();
    }
    return false;
}
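
// Worked example (hypothetical numbers, for illustration only): with window == 3
// and two OR lists holding positions {10, 40} and {12, 55}, matchGroup() below
// calls do_proximity_test(3, plists, 1, 10, 10, &sta, &sto, 0, false).
// actualminpos is 10 + 1 - 3 = 8, the first position of list 1 at or after 8 is
// 12, and 12 <= 10 + 3 - 1, so the terms fit in the window: sp/ep are set to
// [12,12] here and widened to [10,12] by the caller.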
// Find matches for one group of terms
bool matchGroup(const HighlightData& hldata,
                unsigned int grpidx,
                const unordered_map<string, vector<int>>& inplists,
                const unordered_map<int, pair<int,int>>& gpostobytes,
                vector<GroupMatchEntry>& tboffs)
{
    const auto& tg(hldata.index_term_groups[grpidx]);
    bool isphrase = tg.kind == HighlightData::TermGroup::TGK_PHRASE;
    string allplterms;
    for (const auto& entry : inplists) {
        allplterms += entry.first + " ";
    }
    LOGRP("matchGroup: isphrase " << isphrase << ". Have plists for [" << allplterms << "]\n");
    //LOGRP("matchGroup: hldata: " << hldata.toString() << std::endl);

    int window = int(tg.orgroups.size() + tg.slack);
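    // For example (illustrative values): a 2-term NEAR group with slack 1 yields
    // window == 3, so the terms must all occur within 3 consecutive term positions.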

    // The position lists we are going to work with. We extract them from the
    // (string->plist) map.
    vector<OrPList> orplists;

    // Find the position list for each term in the group and build the combined lists for the
    // term OR groups (each group is the result of the expansion of one user term). It is
    // possible that this particular group was not actually matched by the search, so that some
    // terms are not found, in which case we bail out.
    for (const auto& group : tg.orgroups) {
        orplists.push_back(OrPList());
        for (const auto& term : group) {
            const auto pl = inplists.find(term);
            if (pl == inplists.end()) {
                LOGRP("TextSplitPTR::matchGroup: term [" << term << "] not found in plists\n");
                continue;
            }
            orplists.back().addplist(pl->first, &(pl->second));
        }
        if (orplists.back().plists.empty()) {
            LOGRP("No positions list found for OR group [" << stringsToString(group) <<
                  "] : input has no group match, returning false\n");
            return false;
        } else {
            LOGRP("Created OrPList has " << orplists.back().plists.size() << " members\n");
        }
    }

    // I think this can't actually happen, was useful when we used to
    // prune the groups, but doesn't hurt.
    if (orplists.size() < 2) {
        LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
        return false;
    }

    if (!isphrase) {
        // Sort the position lists so that the shortest comes first
        std::sort(orplists.begin(), orplists.end(),
                  [](const OrPList& a, const OrPList& b) -> bool {
                      return a.size() < b.size();
                  });
    }
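    // (Walking the shortest list in the outer loop below minimizes the number of
    // window tests: e.g. with list sizes {40, 3}, the outer loop runs 3 times
    // instead of 40. Phrase lists must stay in term order, hence the test above.)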

    // Minpos is the highest end of a found match. While looking for
    // further matches, we don't want the search to extend before
    // this, because it does not make sense for highlight regions to
    // overlap.
    int minpos = 0;
    // Walk the shortest plist and look for matches
    int pos;
    while ((pos = orplists[0].next()) != -1) {
        int sta = INT_MAX, sto = 0;
        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
        if (do_proximity_test(
                window, orplists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
            setWinMinMax(pos, sta, sto);
            LOGRP("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
                  "," << sto << "]\n");
            minpos = sto + 1;
            // Translate the position window into a byte offset window
            auto i1 = gpostobytes.find(sta);
            auto i2 = gpostobytes.find(sto);
            if (i1 != gpostobytes.end() && i2 != gpostobytes.end()) {
                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
                        i1->second.first << " " << i2->second.second << "\n");
                tboffs.push_back(GroupMatchEntry(i1->second.first,
                                                 i2->second.second, grpidx));
            } else {
                LOGDEB0("matchGroup: no bpos found for " << sta << " or " << sto << "\n");
            }
        } else {
            LOGRP("matchGroup: no group match found at this position\n");
        }
    }

    return !tboffs.empty();
}
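
// Minimal usage sketch (hypothetical caller, for illustration only): a text
// splitter would record, while walking the document, a position list per query
// term and a term-position -> byte-offset map, then call matchGroup() once per
// NEAR/PHRASE group:
//
//     std::unordered_map<std::string, std::vector<int>> plists;  // term -> positions
//     std::unordered_map<int, std::pair<int,int>> postobytes;    // pos -> [bstart, bend]
//     std::vector<GroupMatchEntry> matches;
//     for (unsigned i = 0; i < hldata.index_term_groups.size(); i++) {
//         if (hldata.index_term_groups[i].kind != HighlightData::TermGroup::TGK_TERM)
//             matchGroup(hldata, i, plists, postobytes, matches);
//     }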

vector<CharFlags> kindflags {
    CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM),
    CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR),
    CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE),
};

string HighlightData::toString() const
{
    string out;
    out.append("\nUser terms (orthograph): ");
    for (const auto& term : uterms) {
        out.append(" [").append(term).append("]");
    }
    out.append("\nUser terms to Query terms:");
    for (const auto& entry : terms) {
        out.append("[").append(entry.first).append("]->[");
        out.append(entry.second).append("] ");
    }
    out.append("\nGroups: ");
    char cbuf[200];
    sprintf(cbuf, "index_term_groups size %d ugroups size %d",
            int(index_term_groups.size()), int(ugroups.size()));
    out.append(cbuf);

    size_t ugidx = (size_t)-1;
    for (const auto& tg : index_term_groups) {
        if (ugidx != tg.grpsugidx) {
            ugidx = tg.grpsugidx;
            out.append("\n(");
            for (unsigned int j = 0; j < ugroups[ugidx].size(); j++) {
                out.append("[").append(ugroups[ugidx][j]).append("] ");
            }
            out.append(") ->");
        }
        if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
            out.append(" <").append(tg.term).append(">");
        } else {
            out.append(" {");
            for (unsigned int j = 0; j < tg.orgroups.size(); j++) {
                out.append(" {");
                for (unsigned int k = 0; k < tg.orgroups[j].size(); k++) {
                    out.append("[").append(tg.orgroups[j][k]).append("]");
                }
                out.append("}");
            }
            out.append("} ");
            out.append(valToString(kindflags, tg.kind)).append("-").append(lltodecstr(tg.slack));
        }
    }
    out.append("\n");
    return out;
}

void HighlightData::append(const HighlightData& hl)
{
    uterms.insert(hl.uterms.begin(), hl.uterms.end());
    terms.insert(hl.terms.begin(), hl.terms.end());
    size_t ugsz0 = ugroups.size();
    ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());

    size_t itgsize = index_term_groups.size();
    index_term_groups.insert(index_term_groups.end(),
                             hl.index_term_groups.begin(),
                             hl.index_term_groups.end());
    // Adjust the grpsugidx values for the newly inserted entries
    for (size_t idx = itgsize; idx < index_term_groups.size(); idx++) {
        index_term_groups[idx].grpsugidx += ugsz0;
    }
}
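
// Illustration (hypothetical values): if this object already holds 2 ugroups and
// hl's first term group referenced hl.ugroups[0], then after append() the copied
// group must reference ugroups[2] in the combined object, which is why grpsugidx
// is shifted by the pre-insertion size ugsz0 above.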