recoll/src/utils/hldata.cpp

/* Copyright (C) 2017-2019 J.F.Dockes
 *
 * License: GPL 2.1
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include "autoconfig.h"

#include "hldata.h"

#include <algorithm>
#include <limits.h>

#include "log.h"
#include "smallut.h"

using std::string;
using std::unordered_map;
using std::vector;
using std::pair;

#undef DEBUGGROUPS
#ifdef DEBUGGROUPS
#define LOGRP LOGINF
#else
#define LOGRP LOGDEB1
#endif

// Combined position list for or'd terms
struct OrPList {
    void addplist(const string& term, const vector<int>* pl) {
        terms.push_back(term);
        plists.push_back(pl);
        indexes.push_back(0);
        totalsize += pl->size();
    }

    // Returns -1 for eof, else the next smallest value in the
    // combined lists, according to the current indexes.
    int value() {
        int minval = INT_MAX;
        int minidx = -1;
        for (unsigned ii = 0; ii < indexes.size(); ii++) {
            const vector<int>& pl(*plists[ii]);
            if (indexes[ii] >= pl.size())
                continue; // this list done
            if (pl[indexes[ii]] < minval) {
                minval = pl[indexes[ii]];
                minidx = ii;
            }
        }
        if (minidx != -1) {
            LOGRP("OrPList::value() -> " << minval << " for " <<
                  terms[minidx] << "\n");
            currentidx = minidx;
            return minval;
        } else {
            LOGRP("OrPList::value(): EOL for " << stringsToString(terms)<<"\n");
            return -1;
        }
    }

    int next() {
        if (currentidx != -1) {
            indexes[currentidx]++;
        }
        return value();
    }

    int size() const {
        return totalsize;
    }
    void rewind() {
        for (auto& idx : indexes) {
            idx = 0;
        }
        currentidx = -1;
    }

    vector<const vector<int>*> plists;
    vector<unsigned int> indexes;
    vector<string> terms;
    int currentidx{-1};
    int totalsize{0};
};

static inline void setWinMinMax(int pos, int& sta, int& sto)
{
    if (pos < sta) {
        sta = pos;
    }
    if (pos > sto) {
        sto = pos;
    }
}

/*
 * @param window the total width for the "near" area, in positions.

 * @param plists the position vectors for the terms. The array is
 *    sorted shorted first for optimization. The function does a
 *    recursive call on the next array if the match is still possible
 *    after dealing with the current one

 * @param plist_idx the index for the position list we will work with.
 * @param min, max the current minimum and maximum term positions.
 * @param[output] sp, ep, the start and end positions of the found match.
 * @param minpos  Highest end of a found match. While looking for
 *   further matches, we don't want the search to extend before
 *   this, because it does not make sense for highlight regions to
 *   overlap.
 * @param isphrase if true, the position lists are in term order, and
 *     we only look for the next match beyond the current window top.
 */
static bool do_proximity_test(
    const int window, vector<OrPList>& plists,
    unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
    bool isphrase)
{
    // Overlap interdiction: possibly adjust window start by input minpos
    int actualminpos = isphrase ? max + 1 : max + 1 - window;
    if (actualminpos < minpos)
        actualminpos = minpos;
    LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx <<
          " min " <<  min << " max " << max << " minpos " << minpos <<
          " isphrase " << isphrase << " actualminpos " << actualminpos << "\n");

    // Find 1st position bigger than window start. A previous call may
    // have advanced the index, so we begin by retrieving the current
    // value.
    int nextpos = plists[plist_idx].value();
    while (nextpos != -1 && nextpos < actualminpos)
        nextpos = plists[plist_idx].next();

    // Look for position inside window. If not found, no match. If
    // found: if this is the last list we're done, else recurse on
    // next list after adjusting the window
    while (nextpos != -1) {
        if (nextpos > min + window - 1) {
            return false;
        }
        if (plist_idx + 1 == plists.size()) {
            // We already checked pos > min, now we also have pos <
            // max, and we are the last list: done: set return values.
            setWinMinMax(nextpos, *sp, *ep);
            return true;
        }
        setWinMinMax(nextpos, min, max);
        if (do_proximity_test(window, plists, plist_idx + 1,
                              min, max, sp, ep, minpos, isphrase)) {
            return true;
        }
        nextpos = plists[plist_idx].next();
    }
    return false;
}


// Find matches for one group of terms
bool matchGroup(const HighlightData& hldata,
                unsigned int grpidx,
                const unordered_map<string, vector<int>>& inplists,
                const unordered_map<int, pair<int,int>>& gpostobytes,
                vector<GroupMatchEntry>& tboffs)
{

    const auto& tg(hldata.index_term_groups[grpidx]);
    bool isphrase =  tg.kind == HighlightData::TermGroup::TGK_PHRASE;
    string allplterms;
    for (const auto& entry:inplists) {
        allplterms += entry.first + " ";
    }
    LOGRP("matchGroup: isphrase " << isphrase << ". Have plists for [" << allplterms << "]\n");
    //LOGRP("matchGroup: hldata: " << hldata.toString() << std::endl);

    int window = int(tg.orgroups.size() + tg.slack);
    // The position lists we are going to work with. We extract them from the
    // (string->plist) map
    vector<OrPList> orplists;

    // Find the position list for each term in the group and build the combined lists for the term
    // or groups (each group is the result of the exansion of one user term). It is possible that
    // this particular group was not actually matched by the search, so that some terms are not
    // found, in which case we bail out.
    for (const auto& group : tg.orgroups) {
        orplists.push_back(OrPList());
        for (const auto& term : group) {
            const auto pl = inplists.find(term);
            if (pl == inplists.end()) {
                LOGRP("TextSplitPTR::matchGroup: term [" << term << "] not found in plists\n");
                continue;
            }
            orplists.back().addplist(pl->first, &(pl->second));
        }
        if (orplists.back().plists.empty()) {
            LOGRP("No positions list found for OR group [" << stringsToString(group) <<
                  "] : input has no group match, returning false\n");
            return false;
        } else {
            LOGRP("Created OrPList has " << orplists.back().plists.size() << " members\n");
        }
    }

    // I think this can't actually happen, was useful when we used to
    // prune the groups, but doesn't hurt.
    if (orplists.size() < 2) {
        LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
        return false;
    }

    if (!isphrase) {
        // Sort the positions lists so that the shorter is first
        std::sort(orplists.begin(), orplists.end(),
                  [](const OrPList& a, const OrPList& b) -> bool {
                      return a.size() < b.size();
                  }
            );
    }

    // Minpos is the highest end of a found match. While looking for
    // further matches, we don't want the search to extend before
    // this, because it does not make sense for highlight regions to
    // overlap
    int minpos = 0;
    // Walk the shortest plist and look for matches
    int pos;
    while ((pos = orplists[0].next()) != -1) {
        int sta = INT_MAX, sto = 0;
        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
        if (do_proximity_test(
                window, orplists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
            setWinMinMax(pos, sta, sto);
            LOGRP("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
                    "," << sto << "]\n");
            minpos = sto + 1;
            // Translate the position window into a byte offset window
            auto i1 =  gpostobytes.find(sta);
            auto i2 =  gpostobytes.find(sto);
            if (i1 != gpostobytes.end() && i2 != gpostobytes.end()) {
                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
                        i1->second.first << " " << i2->second.second << "\n");
                tboffs.push_back(GroupMatchEntry(i1->second.first,
                                                 i2->second.second, grpidx));
            } else {
                LOGDEB0("matchGroup: no bpos found for " << sta << " or " << sto << "\n");
            }
        } else {
            LOGRP("matchGroup: no group match found at this position\n");
        }
    }

    return !tboffs.empty();
}

vector<CharFlags> kindflags {
    CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM),
        CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR),
        CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE),
        };

string HighlightData::toString() const
{
    string out;
    out.append("\nUser terms (orthograph): ");
    for (const auto& term : uterms) {
        out.append(" [").append(term).append("]");
    }
    out.append("\nUser terms to Query terms:");
    for (const auto& entry: terms) {
        out.append("[").append(entry.first).append("]->[");
        out.append(entry.second).append("] ");
    }
    out.append("\nGroups: ");
    char cbuf[200];
    sprintf(cbuf, "index_term_groups size %d ugroups size %d",
            int(index_term_groups.size()), int(ugroups.size()));
    out.append(cbuf);

    size_t ugidx = (size_t) - 1;
    for (HighlightData::TermGroup tg : index_term_groups) {
        if (ugidx != tg.grpsugidx) {
            ugidx = tg.grpsugidx;
            out.append("\n(");
            for (unsigned int j = 0; j < ugroups[ugidx].size(); j++) {
                out.append("[").append(ugroups[ugidx][j]).append("] ");
            }
            out.append(") ->");
        }
        if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
            out.append(" <").append(tg.term).append(">");
        } else {
            out.append(" {");
            for (unsigned int j = 0; j < tg.orgroups.size(); j++) {
                out.append(" {");
                for (unsigned int k = 0; k < tg.orgroups[j].size(); k++) {
                    out.append("[").append(tg.orgroups[j][k]).append("]");
                }
                out.append("}");
            }
            out.append("} ");
            out.append(valToString(kindflags, tg.kind)).append("-").append(lltodecstr(tg.slack));
        }
    }
    out.append("\n");
    return out;
}

void HighlightData::append(const HighlightData& hl)
{
    uterms.insert(hl.uterms.begin(), hl.uterms.end());
    terms.insert(hl.terms.begin(), hl.terms.end());
    size_t ugsz0 = ugroups.size();
    ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());

    size_t itgsize = index_term_groups.size();
    index_term_groups.insert(index_term_groups.end(),
                             hl.index_term_groups.begin(),
                             hl.index_term_groups.end());
    // Adjust the grpsugidx values for the newly inserted entries
    for (unsigned int idx = itgsize; idx < index_term_groups.size(); idx++) {
        index_term_groups[idx].grpsugidx += ugsz0;
    }
}