From 3f8f31732da29f4f68c0a34d658e661feff01775 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 26 Dec 2017 12:42:53 +0100 Subject: [PATCH] plaintorich: indent and log lines --- src/query/plaintorich.cpp | 388 +++++++++++++++++++------------------- src/query/plaintorich.h | 40 ++-- 2 files changed, 213 insertions(+), 215 deletions(-) diff --git a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp index a1752e46..6e7813f0 100644 --- a/src/query/plaintorich.cpp +++ b/src/query/plaintorich.cpp @@ -47,71 +47,72 @@ struct MatchEntry { // match to the original user input. size_t grpidx; MatchEntry(int sta, int sto, size_t idx) - : offs(sta, sto), grpidx(idx) - { + : offs(sta, sto), grpidx(idx) { } }; // Text splitter used to take note of the position of query terms // inside the result text. This is then used to insert highlight tags. class TextSplitPTR : public TextSplit { - public: +public: // Out: begin and end byte positions of query terms/groups in text vector tboffs; TextSplitPTR(const HighlightData& hdata) - : m_wcount(0), m_hdata(hdata) - { - // We separate single terms and groups and extract the group - // terms for computing positions list before looking for group - // matches - for (vector >::const_iterator vit = hdata.groups.begin(); - vit != hdata.groups.end(); vit++) { - if (vit->size() == 1) { - m_terms[vit->front()] = vit - hdata.groups.begin(); - } else if (vit->size() > 1) { - for (vector::const_iterator it = vit->begin(); - it != vit->end(); it++) { - m_gterms.insert(*it); - } - } - } + : m_wcount(0), m_hdata(hdata) { + // We separate single terms and groups and extract the group + // terms for computing positions list before looking for group + // matches + for (vector >::const_iterator vit = hdata.groups.begin(); + vit != hdata.groups.end(); vit++) { + if (vit->size() == 1) { + m_terms[vit->front()] = vit - hdata.groups.begin(); + } else if (vit->size() > 1) { + for (vector::const_iterator it = vit->begin(); + it != vit->end(); it++) { + m_gterms.insert(*it); + } + } + } } // Accept word and its position. If word is search term, add // highlight zone definition. If word is part of search group // (phrase or near), update positions list. virtual bool takeword(const std::string& term, int pos, int bts, int bte) { - string dumb = term; - if (o_index_stripchars) { - if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) { - LOGINFO("PlainToRich::takeword: unac failed for [" << (term) << "]\n" ); - return true; - } - } + string dumb = term; + if (o_index_stripchars) { + if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO("PlainToRich::takeword: unac failed for [" << term << + "]\n"); + return true; + } + } - //LOGDEB2("Input dumbbed term: '" << (dumb) << "' " << (// pos) << " " << (bts) << " " << (bte) << "\n" ); + LOGDEB2("Input dumbbed term: '" << dumb << "' " << pos << " " << bts + << " " << bte << "\n"); - // If this word is a search term, remember its byte-offset span. - map::const_iterator it = m_terms.find(dumb); - if (it != m_terms.end()) { - tboffs.push_back(MatchEntry(bts, bte, (*it).second)); - } - - // If word is part of a search group, update its positions list - if (m_gterms.find(dumb) != m_gterms.end()) { - // Term group (phrase/near) handling - m_plists[dumb].push_back(pos); - m_gpostobytes[pos] = pair(bts, bte); - //LOGDEB2("Recorded bpos for " << (pos) << ": " << (bts) << " " << (bte) << "\n" ); - } + // If this word is a search term, remember its byte-offset span. + map::const_iterator it = m_terms.find(dumb); + if (it != m_terms.end()) { + tboffs.push_back(MatchEntry(bts, bte, it->second)); + } + + // If word is part of a search group, update its positions list + if (m_gterms.find(dumb) != m_gterms.end()) { + // Term group (phrase/near) handling + m_plists[dumb].push_back(pos); + m_gpostobytes[pos] = pair(bts, bte); + LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " << + bte << "\n"); + } - // Check for cancellation request - if ((m_wcount++ & 0xfff) == 0) - CancelCheck::instance().checkCancel(); + // Check for cancellation request + if ((m_wcount++ & 0xfff) == 0) + CancelCheck::instance().checkCancel(); - return true; + return true; } // Must be called after the split to find the phrase/near match positions @@ -139,16 +140,15 @@ private: /** Sort by shorter comparison class */ class VecIntCmpShorter { - public: - /** Return true if and only if a is strictly shorter than b. - */ - bool operator()(const vector *a, const vector *b) { - return a->size() < b->size(); - } +public: + /** Return true if and only if a is strictly shorter than b. */ + bool operator()(const vector *a, const vector *b) { + return a->size() < b->size(); + } }; -#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \ - if ((POS) > (STO)) (STO) = (POS);} +#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \ + if ((POS) > (STO)) (STO) = (POS);} // Check that at least an entry from the first position list is inside // the window and recurse on next list. The window is readjusted as @@ -164,36 +164,37 @@ class VecIntCmpShorter { // any previous match. We don't look below this as overlapping matches // make no sense for highlighting. static bool do_proximity_test(int window, vector* >& plists, - unsigned int i, int min, int max, - int *sp, int *ep, int minpos) + unsigned int i, int min, int max, + int *sp, int *ep, int minpos) { - LOGDEB1("do_prox_test: win " << (window) << " i " << (i) << " min " << (min) << " max " << (max) << " minpos " << (minpos) << "\n" ); + LOGDEB1("do_prox_test: win " << window << " i " << i << " min " << + min << " max " << max << " minpos " << minpos << "\n"); int tmp = max + 1 - window; if (tmp < minpos) - tmp = minpos; + tmp = minpos; // Find 1st position bigger than window start vector::iterator it = plists[i]->begin(); while (it != plists[i]->end() && *it < tmp) - it++; + it++; // Look for position inside window. If not found, no match. If // found: if this is the last list we're done, else recurse on // next list after adjusting the window while (it != plists[i]->end()) { - int pos = *it; - if (pos > min + window - 1) - return false; - if (i + 1 == plists.size()) { - SETMINMAX(pos, *sp, *ep); - return true; - } - SETMINMAX(pos, min, max); - if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) { - SETMINMAX(pos, *sp, *ep); - return true; - } - it++; + int pos = *it; + if (pos > min + window - 1) + return false; + if (i + 1 == plists.size()) { + SETMINMAX(pos, *sp, *ep); + return true; + } + SETMINMAX(pos, min, max); + if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) { + SETMINMAX(pos, *sp, *ep); + return true; + } + it++; } return false; } @@ -204,7 +205,8 @@ bool TextSplitPTR::matchGroup(unsigned int grpidx) const vector& terms = m_hdata.groups[grpidx]; int window = int(m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx]); - LOGDEB1("TextSplitPTR::matchGroup:d " << (window) << ": " << (vecStringToString(terms)) << "\n" ); + LOGDEB1("TextSplitPTR::matchGroup:d " << window << ": " << + stringsToString(terms) << "\n"); // The position lists we are going to work with. We extract them from the // (string->plist) map @@ -217,33 +219,35 @@ bool TextSplitPTR::matchGroup(unsigned int grpidx) // possible that this particular group was not actually matched by // the search, so that some terms are not found. for (vector::const_iterator it = terms.begin(); - it != terms.end(); it++) { - map >::iterator pl = m_plists.find(*it); - if (pl == m_plists.end()) { - LOGDEB1("TextSplitPTR::matchGroup: [" << ((*it)) << "] not found in m_plists\n" ); - return false; - } - plists.push_back(&(pl->second)); - plistToTerm[&(pl->second)] = *it; + it != terms.end(); it++) { + map >::iterator pl = m_plists.find(*it); + if (pl == m_plists.end()) { + LOGDEB1("TextSplitPTR::matchGroup: [" << *it << + "] not found in m_plists\n"); + return false; + } + plists.push_back(&(pl->second)); + plistToTerm[&(pl->second)] = *it; } // I think this can't actually happen, was useful when we used to // prune the groups, but doesn't hurt. if (plists.size() < 2) { - LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n" ); - return false; + LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n"); + return false; } // Sort the positions lists so that the shorter is first std::sort(plists.begin(), plists.end(), VecIntCmpShorter()); { // Debug - map*, string>::iterator it; - it = plistToTerm.find(plists[0]); - if (it == plistToTerm.end()) { - // SuperWeird - LOGERR("matchGroup: term for first list not found !?!\n" ); - return false; - } - LOGDEB1("matchGroup: walking the shortest plist. Term [" << (it->second) << "], len " << (plists[0]->size()) << "\n" ); + map*, string>::iterator it; + it = plistToTerm.find(plists[0]); + if (it == plistToTerm.end()) { + // SuperWeird + LOGERR("matchGroup: term for first list not found !?!\n"); + return false; + } + LOGDEB1("matchGroup: walking the shortest plist. Term [" << + it->second << "], len " << plists[0]->size() << "\n"); } // Minpos is the highest end of a found match. While looking for @@ -253,29 +257,32 @@ bool TextSplitPTR::matchGroup(unsigned int grpidx) int minpos = 0; // Walk the shortest plist and look for matches for (vector::iterator it = plists[0]->begin(); - it != plists[0]->end(); it++) { - int pos = *it; - int sta = INT_MAX, sto = 0; - LOGDEB2("MatchGroup: Testing at pos " << (pos) << "\n" ); - if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) { - LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << (sta) << "," << (sto) << "]\n" ); - // Maybe extend the window by 1st term position, this was not - // done by do_prox.. - SETMINMAX(pos, sta, sto); - minpos = sto+1; - // Translate the position window into a byte offset window - map >::iterator i1 = m_gpostobytes.find(sta); - map >::iterator i2 = m_gpostobytes.find(sto); - if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { - LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " << (i1->second.first) << " " << (i2->second.second) << "\n" ); - tboffs.push_back(MatchEntry(i1->second.first, - i2->second.second, grpidx)); - } else { - LOGDEB0("matchGroup: no bpos found for " << (sta) << " or " << (sto) << "\n" ); - } - } else { - LOGDEB1("matchGroup: no group match found at this position\n" ); - } + it != plists[0]->end(); it++) { + int pos = *it; + int sta = INT_MAX, sto = 0; + LOGDEB2("MatchGroup: Testing at pos " << pos << "\n"); + if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) { + LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << sta << + "," << sto << "]\n"); + // Maybe extend the window by 1st term position, this was not + // done by do_prox.. + SETMINMAX(pos, sta, sto); + minpos = sto+1; + // Translate the position window into a byte offset window + map >::iterator i1 = m_gpostobytes.find(sta); + map >::iterator i2 = m_gpostobytes.find(sto); + if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { + LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " << + i1->second.first << " " << i2->second.second << "\n"); + tboffs.push_back(MatchEntry(i1->second.first, + i2->second.second, grpidx)); + } else { + LOGDEB0("matchGroup: no bpos found for " << sta << " or " + << sto << "\n"); + } + } else { + LOGDEB1("matchGroup: no group match found at this position\n"); + } } return true; @@ -285,9 +292,9 @@ bool TextSplitPTR::matchGroup(unsigned int grpidx) class PairIntCmpFirst { public: bool operator()(const MatchEntry& a, const MatchEntry& b) { - if (a.offs.first != b.offs.first) - return a.offs.first < b.offs.first; - return a.offs.second > b.offs.second; + if (a.offs.first != b.offs.first) + return a.offs.first < b.offs.first; + return a.offs.second > b.offs.second; } }; @@ -298,9 +305,9 @@ public: bool TextSplitPTR::matchGroups() { for (unsigned int i = 0; i < m_hdata.groups.size(); i++) { - if (m_hdata.groups[i].size() <= 1) - continue; - matchGroup(i); + if (m_hdata.groups[i].size() <= 1) + continue; + matchGroup(i); } // Sort regions by increasing start and decreasing width. @@ -319,13 +326,13 @@ bool TextSplitPTR::matchGroups() // to cut in the middle of a tag, which would confuse qtextedit. If // the input is html, the body is always a single output chunk. bool PlainToRich::plaintorich(const string& in, - list& out, // Output chunk list - const HighlightData& hdata, - int chunksize) + list& out, // Output chunk list + const HighlightData& hdata, + int chunksize) { Chrono chron; bool ret = true; - LOGDEB1("plaintorichich: in: [" << (in) << "]\n" ); + LOGDEB1("plaintorichich: in: [" << in << "]\n"); m_hdata = &hdata; // Compute the positions for the query terms. We use the text @@ -335,10 +342,10 @@ bool PlainToRich::plaintorich(const string& in, // Note: the splitter returns the term locations in byte, not // character, offsets. splitter.text_to_words(in); - LOGDEB2("plaintorich: split done " << (chron.millis()) << " mS\n" ); + LOGDEB2("plaintorich: split done " << chron.millis() << " mS\n"); // Compute the positions for NEAR and PHRASE groups. splitter.matchGroups(); - LOGDEB2("plaintorich: group match done " << (chron.millis()) << " mS\n" ); + LOGDEB2("plaintorich: group match done " << chron.millis() << " mS\n"); out.clear(); out.push_back(""); @@ -351,8 +358,8 @@ bool PlainToRich::plaintorich(const string& in, // a term match when we are actually looking for a group match // (the snippet generator does this...). if (splitter.tboffs.empty()) { - LOGDEB1("plaintorich: no term matches\n" ); - ret = false; + LOGDEB1("plaintorich: no term matches\n"); + ret = false; } // Iterator for the list of input term positions. We use it to @@ -363,8 +370,8 @@ bool PlainToRich::plaintorich(const string& in, #if 0 for (vector >::const_iterator it = splitter.tboffs.begin(); - it != splitter.tboffs.end(); it++) { - LOGDEB2("plaintorich: region: " << (it->first) << " " << (it->second) << "\n" ); + it != splitter.tboffs.end(); it++) { + LOGDEB2("plaintorich: region: " << it->first << " "<second<< "\n"); } #endif @@ -384,41 +391,41 @@ bool PlainToRich::plaintorich(const string& in, string::size_type headend = 0; if (m_inputhtml) { - headend = in.find(""); - if (headend == string::npos) - headend = in.find(""); - if (headend != string::npos) - headend += 7; + headend = in.find(""); + if (headend == string::npos) + headend = in.find(""); + if (headend != string::npos) + headend += 7; } for (string::size_type pos = 0; pos != string::npos; pos = chariter++) { - // Check from time to time if we need to stop - if ((pos & 0xfff) == 0) { - CancelCheck::instance().checkCancel(); - } + // Check from time to time if we need to stop + if ((pos & 0xfff) == 0) { + CancelCheck::instance().checkCancel(); + } - // If we still have terms positions, check (byte) position. If - // we are at or after a term match, mark. - if (tPosIt != tPosEnd) { - int ibyteidx = int(chariter.getBpos()); - if (ibyteidx == tPosIt->offs.first) { - if (!intag && ibyteidx >= (int)headend) { - *olit += startMatch((unsigned int)(tPosIt->grpidx)); - } + // If we still have terms positions, check (byte) position. If + // we are at or after a term match, mark. + if (tPosIt != tPosEnd) { + int ibyteidx = int(chariter.getBpos()); + if (ibyteidx == tPosIt->offs.first) { + if (!intag && ibyteidx >= (int)headend) { + *olit += startMatch((unsigned int)(tPosIt->grpidx)); + } inrcltag = 1; - } else if (ibyteidx == tPosIt->offs.second) { - // Output end of match region tags - if (!intag && ibyteidx > (int)headend) { - *olit += endMatch(); - } - // Skip all highlight areas that would overlap this one - int crend = tPosIt->offs.second; - while (tPosIt != splitter.tboffs.end() && - tPosIt->offs.first < crend) - tPosIt++; + } else if (ibyteidx == tPosIt->offs.second) { + // Output end of match region tags + if (!intag && ibyteidx > (int)headend) { + *olit += endMatch(); + } + // Skip all highlight areas that would overlap this one + int crend = tPosIt->offs.second; + while (tPosIt != splitter.tboffs.end() && + tPosIt->offs.first < crend) + tPosIt++; inrcltag = 0; - } - } + } + } unsigned int car = *chariter; @@ -433,13 +440,13 @@ bool PlainToRich::plaintorich(const string& in, continue; } else if (eol) { // Got non eol char in line break state. Do line break; - inindent = 1; + inindent = 1; hadcr = 0; if (eol > 2) eol = 2; while (eol) { - if (!m_inputhtml && m_eolbr) - *olit += "
"; + if (!m_inputhtml && m_eolbr) + *olit += "
"; *olit += "\n"; eol--; } @@ -455,7 +462,7 @@ bool PlainToRich::plaintorich(const string& in, switch (car) { case '<': - inindent = 0; + inindent = 0; if (m_inputhtml) { if (!inparamvalue) intag = true; @@ -465,7 +472,7 @@ bool PlainToRich::plaintorich(const string& in, } break; case '>': - inindent = 0; + inindent = 0; if (m_inputhtml) { if (!inparamvalue) intag = false; @@ -473,7 +480,7 @@ bool PlainToRich::plaintorich(const string& in, chariter.appendchartostring(*olit); break; case '&': - inindent = 0; + inindent = 0; if (m_inputhtml) { chariter.appendchartostring(*olit); } else { @@ -481,30 +488,30 @@ bool PlainToRich::plaintorich(const string& in, } break; case '"': - inindent = 0; + inindent = 0; if (m_inputhtml && intag) { inparamvalue = !inparamvalue; } chariter.appendchartostring(*olit); break; - case ' ': - if (m_eolbr && inindent) { - *olit += " "; - } else { - chariter.appendchartostring(*olit); - } - break; - case '\t': - if (m_eolbr && inindent) { - *olit += "    "; - } else { - chariter.appendchartostring(*olit); - } - break; + case ' ': + if (m_eolbr && inindent) { + *olit += " "; + } else { + chariter.appendchartostring(*olit); + } + break; + case '\t': + if (m_eolbr && inindent) { + *olit += "    "; + } else { + chariter.appendchartostring(*olit); + } + break; default: - inindent = 0; + inindent = 0; chariter.appendchartostring(*olit); } @@ -512,19 +519,18 @@ bool PlainToRich::plaintorich(const string& in, #if 0 { - FILE *fp = fopen("/tmp/debugplaintorich", "a"); - fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n"); - for (list::iterator it = out.begin(); - it != out.end(); it++) { - fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n"); - fprintf(fp, "%s", it->c_str()); - fprintf(fp, "ENDOFPLAINTORICHCHUNK\n"); - } - fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n"); - fclose(fp); + FILE *fp = fopen("/tmp/debugplaintorich", "a"); + fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n"); + for (list::iterator it = out.begin(); + it != out.end(); it++) { + fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n"); + fprintf(fp, "%s", it->c_str()); + fprintf(fp, "ENDOFPLAINTORICHCHUNK\n"); + } + fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n"); + fclose(fp); } #endif - LOGDEB2("plaintorich: done " << (chron.millis()) << " mS\n" ); + LOGDEB2("plaintorich: done " << chron.millis() << " mS\n"); return ret; } - diff --git a/src/query/plaintorich.h b/src/query/plaintorich.h index cbb13d9e..67a0c42a 100644 --- a/src/query/plaintorich.h +++ b/src/query/plaintorich.h @@ -32,17 +32,13 @@ class PlainToRich { public: PlainToRich() - : m_inputhtml(false), m_eolbr(false), m_hdata(0) - { + : m_inputhtml(false), m_eolbr(false), m_hdata(0) { } - virtual ~PlainToRich() - { - } + virtual ~PlainToRich() {} - void set_inputhtml(bool v) - { - m_inputhtml = v; + void set_inputhtml(bool v) { + m_inputhtml = v; } /** @@ -67,33 +63,29 @@ public: * @param chunksize max size of chunks in output list */ virtual bool plaintorich(const std::string &in, std::list &out, - const HighlightData& hdata, - int chunksize = 50000 - ); + const HighlightData& hdata, + int chunksize = 50000 + ); /* Overridable output methods for headers, highlighting and marking tags */ - virtual std::string header() - { - return cstr_null; + virtual std::string header() { + return cstr_null; } /** Return match prefix (e.g.:
). - @param groupidx the index into hdata.groups */ - virtual std::string startMatch(unsigned int) - { - return cstr_null; + @param groupidx the index into hdata.groups */ + virtual std::string startMatch(unsigned int) { + return cstr_null; } /** Return data for end of match area (e.g.:
). */ - virtual std::string endMatch() - { - return cstr_null; + virtual std::string endMatch() { + return cstr_null; } - virtual std::string startChunk() - { - return cstr_null; + virtual std::string startChunk() { + return cstr_null; } protected: