// recoll/src/utils/hldata.cpp

/* Copyright (C) 2016 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#include "autoconfig.h"

#include "hldata.h"

#include <limits.h>

#include <algorithm>
#include <string>

#include "log.h"
#include "smallut.h"

using std::string;
using std::map;
using std::vector;
using std::pair;

bool do_proximity_test(int window, vector<const vector<int>*>& plists,
                       unsigned int i, int min, int max,
                       int *sp, int *ep, int minpos)
{
    LOGDEB1("do_prox_test: win " << window << " i " << i << " min " <<
            min << " max " << max << " minpos " << minpos << "\n");
    int tmp = max + 1 - window;
    if (tmp < minpos)
        tmp = minpos;

    // Find 1st position bigger than window start
    auto it = plists[i]->begin();
    while (it != plists[i]->end() && *it < tmp)
        it++;

    // Look for position inside window. If not found, no match. If
    // found: if this is the last list we're done, else recurse on
    // next list after adjusting the window
    while (it != plists[i]->end()) {
        int pos = *it;
        if (pos > min + window - 1)
            return false;
        if (i + 1 == plists.size()) {
            setWinMinMax(pos, *sp, *ep);
            return true;
        }
        setWinMinMax(pos, min, max);
        if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
            setWinMinMax(pos, *sp, *ep);
            return true;
        }
        it++;
    }
    return false;
}

// Log level selection for the group matching code: LOGRP expands to
// LOGDEB when DEBUGGROUPS is defined, and to LOGDEB1 otherwise.
// NOTE(review): DEBUGGROUPS is unconditionally defined just below, so
// the LOGDEB1 branch is never selected -- confirm that this is meant to
// stay enabled outside of debugging sessions.
#define DEBUGGROUPS
#ifdef DEBUGGROUPS
#define LOGRP LOGDEB
#else
#define LOGRP LOGDEB1
#endif

// Find NEAR matches for one group of terms
bool matchGroup(const HighlightData& hldata,
                unsigned int grpidx,
                const map<string, vector<int>>& inplists,
                const map<int, pair<int,int>>& gpostobytes,
                vector<GroupMatchEntry>& tboffs
    )
{
    const vector<string>& terms = hldata.groups[grpidx];
    int window = int(hldata.groups[grpidx].size() + hldata.slacks[grpidx]);

    LOGRP("TextSplitPTR::matchGroup:d " << window << ": " <<
            stringsToString(terms) << "\n");

    // The position lists we are going to work with. We extract them from the
    // (string->plist) map
    vector<const vector<int>*> plists;
    // A revert plist->term map. This is so that we can find who is who after
    // sorting the plists by length.
    map<const vector<int>*, string> plistToTerm;

    // Find the position list for each term in the group. It is
    // possible that this particular group was not actually matched by
    // the search, so that some terms are not found.
    for (const auto& term : terms) {
        map<string, vector<int> >::const_iterator pl = inplists.find(term);
        if (pl == inplists.end()) {
            LOGRP("TextSplitPTR::matchGroup: [" << term <<
                    "] not found in plists\n");
            return false;
        }
        plists.push_back(&(pl->second));
        plistToTerm[&(pl->second)] = term;
    }
    // I think this can't actually happen, was useful when we used to
    // prune the groups, but doesn't hurt.
    if (plists.size() < 2) {
        LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
        return false;
    }
    // Sort the positions lists so that the shorter is first
    std::sort(plists.begin(), plists.end(),
              [](const vector<int> *a, const vector<int> *b) -> bool {
                  return a->size() < b->size();
              }
        );

    if (0) { // Debug
        auto it = plistToTerm.find(plists[0]);
        if (it == plistToTerm.end()) {
            // SuperWeird
            LOGERR("matchGroup: term for first list not found !?!\n");
            return false;
        }
        LOGRP("matchGroup: walking the shortest plist. Term [" <<
                it->second << "], len " << plists[0]->size() << "\n");
    }

    // Minpos is the highest end of a found match. While looking for
    // further matches, we don't want the search to extend before
    // this, because it does not make sense for highlight regions to
    // overlap
    int minpos = 0;
    // Walk the shortest plist and look for matches
    for (int pos : *(plists[0])) {
        int sta = INT_MAX, sto = 0;
        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
        if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
            LOGRP("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
                    "," << sto << "]\n");
            // Maybe extend the window by 1st term position, this was not
            // done by do_prox..
            setWinMinMax(pos, sta, sto);
            minpos = sto + 1;
            // Translate the position window into a byte offset window
            auto i1 =  gpostobytes.find(sta);
            auto i2 =  gpostobytes.find(sto);
            if (i1 != gpostobytes.end() && i2 != gpostobytes.end()) {
                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
                        i1->second.first << " " << i2->second.second << "\n");
                tboffs.push_back(GroupMatchEntry(i1->second.first,
                                            i2->second.second, grpidx));
            } else {
                LOGDEB0("matchGroup: no bpos found for " << sta << " or "
                        << sto << "\n");
            }
        } else {
            LOGRP("matchGroup: no group match found at this position\n");
        }
    }

    return true;
}

// Append a text dump of the highlight data to the output string: the
// user terms, the user term to query term map, the sizes of the group
// related vectors, then each group with its slack. The user group
// terms are printed in front of the first group of each run of groups
// sharing the same user group index.
//
// All accesses to grpsugidx, ugroups and slacks are bounds-checked:
// the parts which can't be retrieved are just omitted, so that the
// dump stays usable when the vector sizes are inconsistent.
//
// @param out the string to append to. Existing content is preserved.
void HighlightData::toString(string& out)
{
    out.append("\nUser terms (orthograph): ");
    for (const auto& uterm : uterms) {
        out.append(" [").append(uterm).append("]");
    }
    out.append("\nUser terms to Query terms:");
    for (const auto& entry : terms) {
        out.append("[").append(entry.first).append("]->[");
        out.append(entry.second).append("] ");
    }
    out.append("\nGroups: ");
    out.append("Groups size ").append(std::to_string(groups.size()));
    out.append(" grpsugidx size ").append(std::to_string(grpsugidx.size()));
    out.append(" ugroups size ").append(std::to_string(ugroups.size()));

    // Index of the user group last printed. Initialized to a value
    // which is not expected as an actual index.
    size_t ugidx = static_cast<size_t>(-1);
    for (size_t i = 0; i < groups.size(); i++) {
        if (i < grpsugidx.size() && ugidx != grpsugidx[i]) {
            // Entering a new user group: print its terms once
            ugidx = grpsugidx[i];
            out.append("\n(");
            if (ugidx < ugroups.size()) {
                for (const auto& uterm : ugroups[ugidx]) {
                    out.append("[").append(uterm).append("] ");
                }
            }
            out.append(") ->");
        }
        out.append(" {");
        for (const auto& term : groups[i]) {
            out.append("[").append(term).append("]");
        }
        out.append("}");
        if (i < slacks.size()) {
            out.append(std::to_string(slacks[i]));
        }
    }
    out.append("\n");
}

// Merge the content of another HighlightData object into this one.
// The user terms and the term map entries are inserted into our own
// containers, the user groups, groups and slacks are added at the end
// of ours. The user group indices coming from the other object are
// shifted by the count of user groups which we held before the merge.
//
// @param hl the data to merge in
void HighlightData::append(const HighlightData& hl)
{
    // Shift to apply to the incoming user group indices. This must be
    // recorded before our user groups vector grows.
    const size_t ugshift = ugroups.size();

    uterms.insert(hl.uterms.begin(), hl.uterms.end());
    terms.insert(hl.terms.begin(), hl.terms.end());

    ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
    groups.insert(groups.end(), hl.groups.begin(), hl.groups.end());
    slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end());

    for (const size_t srcidx : hl.grpsugidx) {
        grpsugidx.push_back(srcidx + ugshift);
    }
}