#ifndef lint static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.10 2006-02-07 09:44:33 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #ifndef NO_NAMESPACES using std::list; using std::pair; using std::set; #endif /* NO_NAMESPACES */ #include "rcldb.h" #include "rclconfig.h" #include "debuglog.h" #include "textsplit.h" #include "utf8iter.h" #include "transcode.h" #include "smallut.h" #include "plaintorich.h" #include "cancelcheck.h" // Text splitter callback used to take note of the position of query terms // inside the result text. This is then used to post highlight tags. class myTextSplitCB : public TextSplitCB { public: string firstTerm; set terms; // in: user query terms list > tboffs; // out: begin and end positions of // query terms in text myTextSplitCB(const list& its) { for (list::const_iterator it = its.begin(); it != its.end(); it++) { string s; Rcl::dumb_string(*it, s); terms.insert(s); } } // Callback called by the text-to-words breaker for each word virtual bool takeword(const std::string& term, int, int bts, int bte) { string dumb; Rcl::dumb_string(term, dumb); //LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), // pos, bts, bte)); if (terms.find(dumb) != terms.end()) { tboffs.push_back(pair(bts, bte)); if (firstTerm.empty()) firstTerm = term; } CancelCheck::instance().checkCancel(); return true; } }; // Fix result text for display inside the gui text window. // // To compute the term character positions in the output text, we used // to emulate how qt's textedit counts chars (ignoring tags and // duplicate whitespace etc...). This was tricky business, dependant // on qtextedit internals, and we don't do it any more, so we finally // don't know the term par/car positions in the editor text. Instead, // we return the first term encountered, and the caller will use the // editor's find() function to position on it bool plaintorich(const string& in, string& out, const list& terms, string *firstTerm) { Chrono chron; LOGDEB(("plaintorich: terms: %s\n", stringlistdisp(terms).c_str())); out.erase(); // We first use the text splitter to break the text into words, // and compare the words to the search terms, which yields the // query terms positions inside the text myTextSplitCB cb(terms); TextSplit splitter(&cb, true); // Note that splitter returns the term locations in byte, not // character offset splitter.text_to_words(in); if (firstTerm) *firstTerm = cb.firstTerm; LOGDEB(("plaintorich: split done %d mS\n", chron.millis())); // Rich text output out = "

"; // Iterator for the list of input term positions. We use it to // output highlight tags and to compute term positions in the // output text list >::iterator tPosIt = cb.tboffs.begin(); // Input character iterator Utf8Iter chariter(in); // State variable used to limitate the number of consecutive empty lines int ateol = 0; // State variable to update the char pos only for the first of // consecutive blank chars int atblank = 0; for (string::size_type pos = 0; pos != string::npos; pos = chariter++) { if (pos && (pos % 1000) == 0) { CancelCheck::instance().checkCancel(); } // If we still have terms positions, check (byte) position if (tPosIt != cb.tboffs.end()) { int ibyteidx = chariter.getBpos(); if (ibyteidx == tPosIt->first) { out += ""; } else if (ibyteidx == tPosIt->second) { if (tPosIt != cb.tboffs.end()) tPosIt++; out += ""; } } switch(*chariter) { case '\n': if (ateol < 2) { out += "
\n"; ateol++; } break; case '\r': break; case '<': ateol = 0; out += "<"; break; case '&': ateol = 0; out += "&"; break; default: // We don't change the eol status for whitespace, want a real line if (*chariter == ' ' || *chariter == '\t') { atblank = 1; } else { ateol = 0; atblank = 0; } chariter.appendchartostring(out); } } #if 0 { FILE *fp = fopen("/tmp/debugplaintorich", "w"); fprintf(fp, "%s\n", out.c_str()); fclose(fp); } #endif LOGDEB(("plaintorich: done %d mS\n", chron.millis())); return true; }