From a963035b939a8ba5d6df04e7d9e4003cfaf6f71a Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 17 Nov 2006 10:09:07 +0000 Subject: [PATCH] Remember searchData and use it in plaintorich for phrase/group highlighting --- src/qtgui/plaintorich.cpp | 236 +++++++++++++++++++++++++++++++++++--- src/qtgui/plaintorich.h | 6 +- src/qtgui/preview_w.cpp | 22 ++-- src/qtgui/preview_w.h | 18 ++- src/qtgui/rclmain_w.cpp | 17 +-- src/qtgui/reslist.cpp | 14 +-- src/qtgui/reslist.h | 5 +- 7 files changed, 264 insertions(+), 54 deletions(-) diff --git a/src/qtgui/plaintorich.cpp b/src/qtgui/plaintorich.cpp index 1aefeaa8..4f47fec8 100644 --- a/src/qtgui/plaintorich.cpp +++ b/src/qtgui/plaintorich.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.13 2006-11-13 08:15:57 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.14 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -24,6 +24,9 @@ static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.13 2006-11-13 08:15:57 dockes #include #include #include +#include +#include + #ifndef NO_NAMESPACES using std::vector; using std::list; @@ -41,42 +44,218 @@ using std::set; #include "plaintorich.h" #include "cancelcheck.h" + +static string vecStringToString(const vector& t) +{ + string sterms; + for (vector::const_iterator it = t.begin(); it != t.end(); it++) { + sterms += "[" + *it + "] "; + } + return sterms; +} + // Text splitter callback used to take note of the position of query terms -// inside the result text. This is then used to post highlight tags. +// inside the result text. This is then used to insert highlight tags. class myTextSplitCB : public TextSplitCB { public: - // in: user query terms + // In: user query terms set terms; - // Out: begin and end byte positions of query terms in text - vector > tboffs; + // + const vector >& m_groups; + const vector& m_slacks; + set gterms; + // Out: first term found in text string firstTerm; + int firstTermPos; - myTextSplitCB(const list& its) { - for (list::const_iterator it = its.begin(); it != its.end(); - it++) { - string s; - Rcl::dumb_string(*it, s); - terms.insert(s); + // Out: begin and end byte positions of query terms/groups in text + vector > tboffs; + + // group/near terms word positions. + map > m_plists; + map > m_gpostobytes; + + myTextSplitCB(const vector& its, vector >&groups, + vector& slacks) : m_groups(groups), m_slacks(slacks) + { + for (vector::const_iterator it = its.begin(); + it != its.end(); it++) { + terms.insert(*it); + } + for (vector >::const_iterator vit = m_groups.begin(); + vit != m_groups.end(); vit++) { + for (vector::const_iterator it = (*vit).begin(); + it != (*vit).end(); it++) { + gterms.insert(*it); + } } } // Callback called by the text-to-words breaker for each word - virtual bool takeword(const std::string& term, int, int bts, int bte) { + virtual bool takeword(const std::string& term, int pos, int bts, int bte) { string dumb; Rcl::dumb_string(term, dumb); //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), // pos, bts, bte)); + + // Single search term highlighting: if this word is a search term, + // Note its byte-offset span. if (terms.find(dumb) != terms.end()) { tboffs.push_back(pair(bts, bte)); - if (firstTerm.empty()) + if (firstTerm.empty()) { firstTerm = term; + firstTermPos = pos; + } } + + if (gterms.find(dumb) != gterms.end()) { + // Term group (phrase/near) handling + m_plists[dumb].push_back(pos); + m_gpostobytes[pos] = pair(bts, bte); + LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte)); + } + CancelCheck::instance().checkCancel(); return true; } + virtual bool matchGroup(const vector& terms, int dist); + virtual bool matchGroups(); }; +// Code for checking for a NEAR match comes out of xapian phrasepostlist.cc +/** Sort by shorter comparison class */ +class VecIntCmpShorter { + public: + /** Return true if and only if a is strictly shorter than b. + */ + bool operator()(const vector *a, const vector *b) { + return a->size() < b->size(); + } +}; + +bool do_test(int window, vector* >& plists, + unsigned int i, int min, int max, int *sp, int *ep) +{ + int tmp = max + 1; + // take care to avoid underflow + if (window <= tmp) + tmp -= window; + else + tmp = 0; + vector::iterator it = plists[i]->begin(); + + // Find 1st position bigger than window start + while (it != plists[i]->end() && *it < tmp) + it++; + + // Try each position inside window in turn for match with other lists + while (it != plists[i]->end()) { + int pos = *it; + if (pos > min + window - 1) + return false; + if (i + 1 == plists.size()) { + *sp = min; + *ep = max; + return true; + } + if (pos < min) + min = pos; + else if (pos > max) + max = pos; + if (do_test(window, plists, i + 1, min, max, sp, ep)) + return true; + it++; + } + return false; +} + +// Check if there is a NEAR match for the group of terms +bool myTextSplitCB::matchGroup(const vector& terms, int window) +{ + LOGDEB(("myTextSplitCB::matchGroup:d %d: %s\n", window, + vecStringToString(terms).c_str())); + vector* > plists; + // Check that each of the group terms has a position list + for (vector::const_iterator it = terms.begin(); it != terms.end(); + it++) { + map >::iterator pl; + if ((pl = m_plists.find(*it)) == m_plists.end()) { + LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n", + (*it).c_str())); + return false; + } + plists.push_back(&(pl->second)); + } + + // Sort the positions lists so that the shorter is first + std::sort(plists.begin(), plists.end(), VecIntCmpShorter()); + + // Walk the shortest plist and look for matches + int sta, sto; + int pos; + vector::iterator it = plists[0]->begin(); + do { + if (it == plists[0]->end()) + return false; + pos = *it++; + } while (!do_test(window, plists, 1, pos, pos, &sta, &sto)); + + LOGDEB(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto)); + + if (firstTerm.empty() || firstTermPos > sta) { + // firsTerm is used to try an position the preview window over + // the match. As it's difficult to divine byte/word positions, + // we use a string search. Try to use the shortest plist for + // this, which hopefully gives a better chance for the group + // to be found (it's hopeless to try and match the whole + // group) + unsigned int minl = (unsigned int)10E9; + for (vector::const_iterator it = terms.begin(); + it != terms.end(); it++) { + map >::iterator pl = m_plists.find(*it); + if (pl != m_plists.end() && pl->second.size() < minl) { + firstTerm = *it; + LOGDEB(("Firstterm->%s\n", firstTerm.c_str())); + minl = pl->second.size(); + } + } + } + + map >::iterator i1 = m_gpostobytes.find(sta); + map >::iterator i2 = m_gpostobytes.find(sto); + if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { + LOGDEB(("myTextSplitCB::matchGroup: pushing %d %d\n", + i1->second.first, i2->second.second)); + tboffs.push_back(pair(i1->second.first, i2->second.second)); + } else { + LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n", + sta, sto)); + } + return true; +} + +class PairIntCmpFirst { +public: + /** Return true if and only if a is strictly shorter than b. + */ + bool operator()(pair a, pairb) { + return a.first < b.first; + } +}; + +bool myTextSplitCB::matchGroups() +{ + vector >::const_iterator vit = m_groups.begin(); + vector::const_iterator sit = m_slacks.begin(); + for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) { + matchGroup(*vit, *sit + (*vit).size()); + } + + std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst()); + return true; +} + // Fix result text for display inside the gui text window. // // To compute the term character positions in the output text, we used @@ -86,22 +265,41 @@ class myTextSplitCB : public TextSplitCB { // don't know the term par/car positions in the editor text. Instead, // we return the first term encountered, and the caller will use the // editor's find() function to position on it -bool plaintorich(const string& in, string& out, const list& terms, +bool plaintorich(const string& in, string& out, + RefCntr sdata, string *firstTerm, bool noHeader) { Chrono chron; - LOGDEB(("plaintorich: terms: %s\n", - stringlistdisp(terms).c_str())); out.erase(); + vector terms; + vector > groups; + vector slacks; + + sdata->getTerms(terms, groups, slacks); + + { + LOGDEB(("plaintorich: terms: \n")); + string sterms = vecStringToString(terms); + LOGDEB((" %s\n", sterms.c_str())); + sterms = "\n"; + LOGDEB(("plaintorich: groups: \n")); + for (vector >::iterator vit = groups.begin(); + vit != groups.end(); vit++) { + sterms += vecStringToString(*vit); + sterms += "\n"; + } + LOGDEB((" %s", sterms.c_str())); + } // We first use the text splitter to break the text into words, // and compare the words to the search terms, which yields the // query terms positions inside the text - myTextSplitCB cb(terms); + myTextSplitCB cb(terms, groups, slacks); TextSplit splitter(&cb, TextSplit::TXTS_ONLYSPANS); // Note that splitter returns the term locations in byte, not // character offset splitter.text_to_words(in); + cb.matchGroups(); if (firstTerm) *firstTerm = cb.firstTerm; @@ -118,6 +316,10 @@ bool plaintorich(const string& in, string& out, const list& terms, // output text vector >::iterator tPosIt = cb.tboffs.begin(); + for (vector >::const_iterator it = cb.tboffs.begin(); + it != cb.tboffs.end(); it++) { + LOGDEB(("plaintorich: region: %d %d\n", it->first, it->second)); + } // Input character iterator Utf8Iter chariter(in); // State variable used to limitate the number of consecutive empty lines diff --git a/src/qtgui/plaintorich.h b/src/qtgui/plaintorich.h index a3bd1307..82b88aaf 100644 --- a/src/qtgui/plaintorich.h +++ b/src/qtgui/plaintorich.h @@ -16,10 +16,12 @@ */ #ifndef _PLAINTORICH_H_INCLUDED_ #define _PLAINTORICH_H_INCLUDED_ -/* @(#$Id: plaintorich.h,v 1.7 2006-09-13 14:57:56 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: plaintorich.h,v 1.8 2006-11-17 10:09:07 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#include "searchdata.h" + /** * Transform plain text into qt rich text for the preview window. * @@ -33,7 +35,7 @@ * @param noHeader if true don't output header (...) */ extern bool plaintorich(const string &in, string &out, - const list<string>& terms, + RefCntr<Rcl::SearchData> sdata, string* firstTerm, bool noHeader = false); #endif /* _PLAINTORICH_H_INCLUDED_ */ diff --git a/src/qtgui/preview_w.cpp b/src/qtgui/preview_w.cpp index 2e2b2d27..16ddca42 100644 --- a/src/qtgui/preview_w.cpp +++ b/src/qtgui/preview_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.5 2006-11-09 19:04:28 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -171,7 +171,8 @@ QTextEdit *Preview::getCurrentEditor() // current search, trying to advance and possibly wrapping around. If next is // false, the search string has been modified, we search for the new string, // starting from the current position -void Preview::doSearch(const QString &text, bool next, bool reverse) +void Preview::doSearch(const QString &text, bool next, bool reverse, + bool wo) { LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse))); QTextEdit *edit = getCurrentEditor(); @@ -203,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse) } } - bool found = edit->find(text, matchCase, false, + bool found = edit->find(text, matchCase, wo, !reverse, &mspara, &msindex); LOGDEB(("Found at para: %d index %d\n", mspara, msindex)); @@ -448,14 +449,14 @@ class LoadThread : public QThread { /* A thread to convert to rich text (mark search terms) */ class ToRichThread : public QThread { string ∈ - list<string> &terms; + RefCntr<Rcl::SearchData> m_searchData; string& firstTerm; QString &out; int loglevel; public: - ToRichThread(string &i, list<string> &trms, + ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData, string& ft, QString &o) - : in(i), terms(trms), firstTerm(ft), out(o) + : in(i), m_searchData(searchData), firstTerm(ft), out(o) { loglevel = DebugLog::getdbl()->getlevel(); } @@ -464,7 +465,7 @@ class ToRichThread : public QThread { DebugLog::getdbl()->setloglevel(loglevel); string rich; try { - plaintorich(in, rich, terms, &firstTerm); + plaintorich(in, rich, m_searchData, &firstTerm); } catch (CancelExcept) { } out = QString::fromUtf8(rich.c_str(), rich.length()); @@ -546,11 +547,9 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc, QString richTxt; bool highlightTerms = fdoc.text.length() < 1000 *1024; string firstTerm; - list<string> terms; - rcldb->getMatchTerms(idoc, terms); if (highlightTerms) { progress.setLabelText(tr("Creating preview text")); - ToRichThread rthr(fdoc.text, terms, firstTerm, richTxt); + ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt); rthr.start(); for (;;prog++) { @@ -630,11 +629,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc, if (!firstTerm.empty()) { bool wasC = matchCheck->isChecked(); matchCheck->setChecked(false); - doSearch(QString::fromUtf8(terms.begin()->c_str()), true, false); + doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true); matchCheck->setChecked(wasC); } } emit(previewExposed(m_searchId, docnum)); return true; } - diff --git a/src/qtgui/preview_w.h b/src/qtgui/preview_w.h index 4cdc9901..d938982d 100644 --- a/src/qtgui/preview_w.h +++ b/src/qtgui/preview_w.h @@ -1,6 +1,6 @@ #ifndef _PREVIEW_W_H_INCLUDED_ #define _PREVIEW_W_H_INCLUDED_ -/* @(#$Id: preview_w.h,v 1.3 2006-09-21 12:56:57 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: preview_w.h,v 1.4 2006-11-17 10:09:07 dockes Exp $ (C) 2006 J.F.Dockes */ /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,6 +22,8 @@ #include <qwidget.h> #include "rcldb.h" #include "preview.h" +#include "refcntr.h" +#include "searchdata.h" // We keep a list of data associated to each tab class TabData { @@ -45,7 +47,11 @@ public: ~Preview(){} - virtual void setSId(int sid) {m_searchId = sid;} + virtual void setSId(int sid, RefCntr<Rcl::SearchData> sdata) + { + m_searchId = sid; + m_searchData = sdata; + } virtual void closeEvent( QCloseEvent *e ); virtual bool eventFilter( QObject *target, QEvent *event ); virtual bool makeDocCurrent( const string & fn, const Rcl::Doc & doc ); @@ -56,7 +62,8 @@ public: public slots: virtual void searchTextLine_textChanged( const QString & text ); - virtual void doSearch( const QString &str, bool next, bool reverse ); + virtual void doSearch(const QString &str, bool next, bool reverse, + bool wo = false); virtual void nextPressed(); virtual void prevPressed(); virtual void currentChanged( QWidget * tw ); @@ -72,7 +79,7 @@ signals: void showPrev(int sid, int docnum); void previewExposed(int sid, int docnum); -protected: +private: int m_searchId; // Identifier of search in main window. This is so that // we make sense when requesting the next document when // browsing successive search results in a tab. @@ -82,8 +89,7 @@ protected: bool canBeep; list<TabData> tabData; QWidget *currentW; - -private: + RefCntr<Rcl::SearchData> m_searchData; void init(); virtual void destroy(); TabData *tabDataForCurrent(); // Return auxiliary data pointer for cur tab diff --git a/src/qtgui/rclmain_w.cpp b/src/qtgui/rclmain_w.cpp index 36ae7e96..fe85e7c6 100644 --- a/src/qtgui/rclmain_w.cpp +++ b/src/qtgui/rclmain_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.7 2006-11-14 13:55:43 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.8 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -472,7 +472,7 @@ void RclMain::startPreview(int docnum) QMessageBox::NoButton); return; } - curPreview->setSId(m_searchId); + curPreview->setSId(m_searchId, resList->getSearchData()); curPreview->setCaption(resList->getDescription()); connect(curPreview, SIGNAL(previewClosed(QWidget *)), this, SLOT(previewClosed(QWidget *))); @@ -712,14 +712,17 @@ void RclMain::showDocHistory() if (sortspecs.sortwidth > 0) { DocSequenceHistory myseq(rcldb, g_dynconf, string(tr("Document history").utf8())); - docsource = new DocSeqSorted(myseq, sortspecs, - string(tr("Document history (sorted)").utf8())); + docsource = new + DocSeqSorted(myseq, sortspecs, + string(tr("Document history (sorted)").utf8())); } else { - docsource = new DocSequenceHistory(rcldb, g_dynconf, - string(tr("Document history").utf8())); + docsource = new + DocSequenceHistory(rcldb, g_dynconf, + string(tr("Document history").utf8())); } + // Construct a bogus SearchData RefCntr<Rcl::SearchData> sdata(new Rcl::SearchData(Rcl::SCLT_AND)); - sdata->m_description = tr("History data").utf8(); + sdata->setDescription((const char *)tr("History data").utf8()); m_searchId++; resList->setDocSource(docsource, sdata); } diff --git a/src/qtgui/reslist.cpp b/src/qtgui/reslist.cpp index 020813d0..104e8b03 100644 --- a/src/qtgui/reslist.cpp +++ b/src/qtgui/reslist.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: reslist.cpp,v 1.9 2006-11-13 08:58:47 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: reslist.cpp,v 1.10 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include <time.h> @@ -76,7 +76,7 @@ void ResList::setDocSource(DocSequence *docsource, delete m_docsource; m_winfirst = -1; m_docsource = docsource; - m_queryData = sdt; + m_searchData = sdt; m_curPvDoc = -1; resultPageNext(); @@ -264,9 +264,7 @@ void ResList::resultPageNext() QStyleSheetItem *item = new QStyleSheetItem(styleSheet(), "termtag" ); item->setColor("blue"); - // item->setFontWeight(QFont::Bold); - list<string> qTerms; - m_docsource->getTerms(qTerms); + // item->setFontWeight(QFont::Bold); // Result paragraph format string sformat = string(prefs.reslistformat.utf8()); @@ -383,7 +381,7 @@ void ResList::resultPageNext() // Abstract string abst; - plaintorich(doc.abstract, abst, qTerms, 0, true); + plaintorich(doc.abstract, abst, m_searchData, 0, true); // Links; string linksbuf; @@ -609,7 +607,7 @@ void ResList::menuExpand() QString ResList::getDescription() { - return QString::fromUtf8(m_queryData->m_description.c_str()); + return QString::fromUtf8(m_searchData->getDescription().c_str()); } /** Show detailed expansion of a query */ @@ -619,7 +617,7 @@ void ResList::showQueryDetails() // Also limit the total number of lines. const unsigned int ll = 100; const unsigned int maxlines = 50; - string query = m_queryData->m_description; + string query = m_searchData->getDescription(); string oq; unsigned int nlines = 0; while (query.length() > 0) { diff --git a/src/qtgui/reslist.h b/src/qtgui/reslist.h index b74778a9..47efba15 100644 --- a/src/qtgui/reslist.h +++ b/src/qtgui/reslist.h @@ -1,6 +1,6 @@ #ifndef _RESLIST_H_INCLUDED_ #define _RESLIST_H_INCLUDED_ -/* @(#$Id: reslist.h,v 1.2 2006-11-13 08:58:47 dockes Exp $ (C) 2005 J.F.Dockes */ +/* @(#$Id: reslist.h,v 1.3 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes */ #include <list> @@ -35,6 +35,7 @@ class ResList : public QTextBrowser virtual QPopupMenu *createPopupMenu(const QPoint& pos); virtual QString getDescription(); // Printable actual query performed on db virtual int getResCnt(); // Return total result list size + virtual RefCntr<Rcl::SearchData> getSearchData() {return m_searchData;} public slots: virtual void resetSearch() {m_winfirst = -1;clear();} @@ -71,7 +72,7 @@ class ResList : public QTextBrowser private: std::map<int,int> m_pageParaToReldocnums; - RefCntr<Rcl::SearchData> m_queryData; + RefCntr<Rcl::SearchData> m_searchData; DocSequence *m_docsource; std::vector<Rcl::Doc> m_curDocs; int m_winfirst;