diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 405f5e51..f2a850d4 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.34 2006-04-30 07:39:09 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.35 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -81,6 +81,9 @@ bool DbIndexer::indexDb(bool resetbefore, list *topdirs) // Set the current directory in config so that subsequent // getConfParams() will get local values m_config->setKeyDir(*it); + int abslen; + if (m_config->getConfParam("idxabsmlen", &abslen)) + m_db.setAbstractParams(abslen, -1, -1); // Set up skipped patterns for this subtree. This probably should be // done in the directory change code in processone() instead. @@ -179,6 +182,9 @@ bool DbIndexer::indexFiles(const list &filenames) list::const_iterator it; for (it = filenames.begin(); it != filenames.end();it++) { m_config->setKeyDir(path_getfather(*it)); + int abslen; + if (m_config->getConfParam("idxabsmlen", &abslen)) + m_db.setAbstractParams(abslen, -1, -1); struct stat stb; if (stat(it->c_str(), &stb) != 0) { LOGERR(("DbIndexer::indexFiles: stat(%s): %s", it->c_str(), @@ -228,6 +234,9 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) { m_config->setKeyDir(fn); + int abslen; + if (m_config->getConfParam("idxabsmlen", &abslen)) + m_db.setAbstractParams(abslen, -1, -1); return FsTreeWalker::FtwOk; } diff --git a/src/qtgui/guiutils.cpp b/src/qtgui/guiutils.cpp index f39a8052..5e806c40 100644 --- a/src/qtgui/guiutils.cpp +++ b/src/qtgui/guiutils.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.16 2006-09-13 08:13:36 dockes Exp $ (C) 2005 Jean-Francois Dockes"; +static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.17 2006-09-13 13:53:35 dockes Exp $ (C) 2005 Jean-Francois Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -151,6 +151,10 @@ void rwSettings(bool writing) "/Recoll/prefs/query/buildAbstract", Bool, true); SETTING_RW(prefs.queryReplaceAbstract, "/Recoll/prefs/query/replaceAbstract", Bool, false); + SETTING_RW(prefs.syntAbsLen, "/Recoll/prefs/query/syntAbsLen", + Num, 250); + SETTING_RW(prefs.syntAbsCtx, "/Recoll/prefs/query/syntAbsCtx", + Num, 4); // Ssearch combobox history list if (writing) { diff --git a/src/qtgui/guiutils.h b/src/qtgui/guiutils.h index 8aa0a68d..ad940071 100644 --- a/src/qtgui/guiutils.h +++ b/src/qtgui/guiutils.h @@ -17,7 +17,7 @@ #ifndef _GUIUTILS_H_INCLUDED_ #define _GUIUTILS_H_INCLUDED_ /* - * @(#$Id: guiutils.h,v 1.8 2006-09-13 08:13:36 dockes Exp $ (C) 2005 Jean-Francois Dockes + * @(#$Id: guiutils.h,v 1.9 2006-09-13 13:53:35 dockes Exp $ (C) 2005 Jean-Francois Dockes * jean-francois.dockes@wanadoo.fr * * This program is free software; you can redistribute it and/or modify @@ -78,6 +78,9 @@ class PrefsPack { // Ignored file types in adv search (startup default) QStringList asearchIgnFilTyps; + int syntAbsLen; + int syntAbsCtx; + PrefsPack() : showicons(true), respagesize(8), diff --git a/src/qtgui/main.cpp b/src/qtgui/main.cpp index b7db3767..b3705909 100644 --- a/src/qtgui/main.cpp +++ b/src/qtgui/main.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: main.cpp,v 1.48 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: main.cpp,v 1.49 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -103,6 +103,7 @@ bool maybeOpenDb(string &reason, bool force) dbdir + " wait for indexing to complete?"; return false; } + rcldb->setAbstractParams(-1, prefs.syntAbsLen, prefs.syntAbsCtx); return true; } diff --git a/src/qtgui/rclmain.cpp b/src/qtgui/rclmain.cpp index 75d1595e..09218b9f 100644 --- a/src/qtgui/rclmain.cpp +++ b/src/qtgui/rclmain.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclmain.cpp,v 1.31 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclmain.cpp,v 1.32 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -131,12 +131,6 @@ void RclMain::init() nextPageAction->setIconSet(createIconSet("nextpage.png")); prevPageAction->setIconSet(createIconSet("prevpage.png")); - - - if (prefs.startWithAdvSearchOpen) - showAdvSearchDialog(); - if (prefs.startWithSortToolOpen) - showSortDialog(); } // We also want to get rid of the advanced search form and previews @@ -667,7 +661,6 @@ void RclMain::docExpand(int docnum) // We need to insert item here, its not auto-done like when the user types // CR sSearch->queryText->setEditText(text); - sSearch->queryText->insertItem(text, 0); sSearch->setAnyTermMode(); sSearch->startSimpleSearch(); } diff --git a/src/qtgui/ssearch_w.cpp b/src/qtgui/ssearch_w.cpp index 4ce28a14..4f685188 100644 --- a/src/qtgui/ssearch_w.cpp +++ b/src/qtgui/ssearch_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.4 2006-09-12 10:11:36 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.5 2006-09-13 13:53:35 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -89,12 +89,20 @@ void SSearch::startSimpleSearch() // the listbox list, The qt listbox doesn't do lru correctly (if // already in the list the new entry would remain at it's place, // not jump at the top as it should + LOGDEB3(("Querytext list count %d\n", queryText->count())); + // Have to save current text, this will change while we clean up the list + QString txt = queryText->currentText(); bool changed; do { changed = false; for (int index = 0; index < queryText->count(); index++) { + LOGDEB3(("Querytext[%d] = [%s]\n", index, + (const char *)(queryText->text(index).utf8()))); if (queryText->text(index).length() == 0 || - queryText->text(index) == queryText->currentText()) { + QString::compare(queryText->text(index), txt) == 0) { + LOGDEB3(("Querytext removing at %d [%s] [%s]\n", index, + (const char *)(queryText->text(index).utf8()), + (const char *)(txt.utf8()))); queryText->removeItem(index); changed = true; break; @@ -102,13 +110,14 @@ void SSearch::startSimpleSearch() } } while (changed); // The combobox is set for no insertion, insert here: - queryText->insertItem(queryText->currentText(), 0); + queryText->insertItem(txt, 0); + queryText->setCurrentItem(0); // Save the current state of the listbox list to file prefs.ssearchHistory.clear(); - for (int index = 0; index < queryText->count(); index++) + for (int index = 0; index < queryText->count(); index++) { prefs.ssearchHistory.push_back(queryText->text(index).utf8()); - + } emit startSearch(sdata); } diff --git a/src/qtgui/uiprefs.ui b/src/qtgui/uiprefs.ui index a650dc19..73f7fb0c 100644 --- a/src/qtgui/uiprefs.ui +++ b/src/qtgui/uiprefs.ui @@ -47,7 +47,7 @@ - layout5 + layout1 @@ -92,7 +92,7 @@ - textLabel3 + textLabel4 Result list font @@ -181,7 +181,6 @@ false - initStartAdvCB @@ -204,9 +203,6 @@ false - - - @@ -276,6 +272,97 @@ May be slow for big documents. Do we synthetize an abstract even if the document seemed to have one? + + + layout16 + + + + unnamed + + + + textLabel2 + + + + 5 + 5 + 2 + 0 + + + + Synthetic abstract size (characters) + + + + + syntlenSB + + + + 7 + 0 + 1 + 0 + + + + 10 + + + 80 + + + 999 + + + 250 + + + + + + + layout17 + + + + unnamed + + + + textLabel3 + + + + 5 + 5 + 1 + 0 + + + + Synthetic abstract context words + + + + + syntctxSB + + + 20 + + + 2 + + + 4 + + + + spacer2 @@ -310,7 +397,7 @@ May be slow for big documents. - layout12 + layout15 @@ -479,7 +566,7 @@ May be slow for big documents. - textLabel3 + textLabel5 Active databases diff --git a/src/qtgui/uiprefs_w.cpp b/src/qtgui/uiprefs_w.cpp index 1f46d735..f1c1984f 100644 --- a/src/qtgui/uiprefs_w.cpp +++ b/src/qtgui/uiprefs_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.3 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.4 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -54,6 +54,8 @@ void UIPrefsDialog::init() // Show icons checkbox useIconsCB->setChecked(prefs.showicons); autoSearchCB->setChecked(prefs.autoSearchOnWS); + syntlenSB->setValue(prefs.syntAbsLen); + syntctxSB->setValue(prefs.syntAbsCtx); initStartAdvCB->setChecked(prefs.startWithAdvSearchOpen); initStartSortCB->setChecked(prefs.startWithSortToolOpen); @@ -156,6 +158,9 @@ void UIPrefsDialog::accept() prefs.startWithAdvSearchOpen = initStartAdvCB->isChecked(); prefs.startWithSortToolOpen = initStartSortCB->isChecked(); + prefs.syntAbsLen = syntlenSB->value(); + prefs.syntAbsCtx = syntctxSB->value(); + prefs.activeExtraDbs.clear(); for (unsigned int i = 0; i < actDbsLB->count(); i++) { QListBoxItem *item = actDbsLB->item(i); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 2f4f83d8..9ab838c0 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.75 2006-05-09 10:15:14 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.76 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -56,17 +56,6 @@ using namespace std; #ifndef NO_NAMESPACES namespace Rcl { #endif -// This is how long an abstract we keep or build from beginning of text when -// indexing. It only has an influence on the size of the db as we are free -// to shorten it again when displaying -#define INDEX_ABSTRACT_SIZE 250 - -// This is the size of the abstract that we synthetize out of query -// term contexts at query time -#define MA_ABSTRACT_SIZE 250 -// This is how many words (context size) we keep around query terms -// when building the abstract -#define MA_EXTRACT_WIDTH 4 // Truncate longer path and uniquize with hash . The goal for this is // to avoid xapian max term length limitations, not to gain space (we @@ -81,6 +70,7 @@ const static string rclSyntAbs = "?!#@"; // ones for indexing or query as there is not much in common. class Native { public: + Db *m_db; bool m_isopen; bool m_iswritable; Db::OpenMode m_mode; @@ -106,8 +96,9 @@ class Native { Xapian::docid docid, const list& terms); - Native() - : m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) + Native(Db *db) + : m_db(db), + m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) { } ~Native() { delete enquire; @@ -149,9 +140,10 @@ class Native { }; Db::Db() - : m_qOpts(QO_NONE) + : m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250), + m_synthAbsWordCtxLen(4) { - m_ndb = new Native; + m_ndb = new Native(this); } Db::~Db() @@ -282,7 +274,7 @@ bool Db::close() LOGDEB(("Rcl:Db: Called xapian flush\n")); } delete m_ndb; - m_ndb = new Native; + m_ndb = new Native(this); if (m_ndb) return true; } catch (const Xapian::Error &e) { @@ -442,6 +434,19 @@ bool dumb_string(const string &in, string &out) return true; } +// Let our user set the parameters for abstract processing +void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen) +{ + LOGDEB(("Db::setAbstractParams: trunc %d syntlen %d ctxlen %d\n", + idxtrunc, syntlen, syntctxlen)); + if (idxtrunc > 0 && idxtrunc < 2000) + m_idxAbsTruncLen = idxtrunc; + if (syntlen > 0 && syntlen < 2000) + m_synthAbsLen = syntlen; + if (syntctxlen > 0 && syntctxlen < 20) + m_synthAbsWordCtxLen = syntctxlen; +} + // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, // date, mime type ... , create the document data record (more @@ -457,14 +462,16 @@ bool Db::add(const string &fn, const Doc &idoc, // Truncate abstract, title and keywords to reasonable lengths. If // abstract is currently empty, we make up one with the beginning - // of the document. + // of the document. This is then not indexed, but part of the doc + // data so that we can return it to a query without having to + // decode the original file. bool syntabs = false; if (doc.abstract.empty()) { syntabs = true; doc.abstract = rclSyntAbs + - truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE); + truncate_to_word(doc.text, m_idxAbsTruncLen); } else { - doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE); + doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen); } doc.abstract = neutchars(doc.abstract, "\n\r"); doc.title = truncate_to_word(doc.title, 100); @@ -513,14 +520,20 @@ bool Db::add(const string &fn, const Doc &idoc, splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; - // Split and index abstract + // Split and index abstract. We don't do this if it is synthetic + // any more (this used to give a relevance boost to the beginning + // of text, why ?) LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str())); - if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : - doc.abstract, noacc)) { - LOGERR(("Db::add: dumb_string failed\n")); - return false; + if (!syntabs) { + // syntabs indicator test kept here in case we want to go back + // to indexing synthetic abstracts one day + if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : + doc.abstract, noacc)) { + LOGERR(("Db::add: dumb_string failed\n")); + return false; + } + splitter.text_to_words(noacc); } - splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; ////// Special terms for metadata @@ -1182,17 +1195,21 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc, parms.get(string("caption"), doc.title); parms.get(string("keywords"), doc.keywords); parms.get(string("abstract"), doc.abstract); + // Possibly remove synthetic abstract indicator (if it's there, we + // used to index the beginning of the text as abstract). bool syntabs = false; if (doc.abstract.find(rclSyntAbs) == 0) { doc.abstract = doc.abstract.substr(rclSyntAbs.length()); syntabs = true; } + // If the option is set and the abstract is synthetic or empty , build + // abstract from position data. if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) { - LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n")); + LOGDEB(("dbDataToRclDoc:: building abstract from position data\n")); if (doc.abstract.empty() || syntabs || (qopts & Db::QO_REPLACE_ABSTRACT)) doc.abstract = makeAbstract(docid, terms); - } + } parms.get(string("ipath"), doc.ipath); parms.get(string("fbytes"), doc.fbytes); parms.get(string("dbytes"), doc.dbytes); @@ -1397,6 +1414,7 @@ string Native::makeAbstract(Xapian::docid docid, const list& terms) // remember the position and its neigbours vector qtermposs; // The term positions set chunkposs; // All the positions we shall populate + int totaloccs = 0; for (list::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { Xapian::PositionIterator pos; @@ -1409,15 +1427,15 @@ string Native::makeAbstract(Xapian::docid docid, const list& terms) unsigned int ipos = *pos; LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos)); // Possibly extend the array. Do it in big chunks - if (ipos + MA_EXTRACT_WIDTH >= buf.size()) { - buf.resize(ipos + MA_EXTRACT_WIDTH + 1000); + if (ipos + m_db->m_synthAbsWordCtxLen >= buf.size()) { + buf.resize(ipos + m_db->m_synthAbsWordCtxLen + 1000); } buf[ipos] = *qit; // Remember the term position qtermposs.push_back(ipos); // Add adjacent slots to the set to populate at next step - for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH); - ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) { + for (unsigned int ii = MAX(0, ipos-m_db->m_synthAbsWordCtxLen); + ii <= MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1); ii++) { chunkposs.insert(ii); } // Limit the number of occurences we keep for each @@ -1427,6 +1445,9 @@ string Native::makeAbstract(Xapian::docid docid, const list& terms) } } catch (...) { } + // Limit total size + if (totaloccs++ > 100) + break; } LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", @@ -1470,21 +1491,21 @@ string Native::makeAbstract(Xapian::docid docid, const list& terms) for (vector::const_iterator it = qtermposs.begin(); it != qtermposs.end(); it++) { unsigned int ipos = *it; - unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH); - unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); + unsigned int start = MAX(0, ipos-m_db->m_synthAbsWordCtxLen); + unsigned int end = MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1); string chunk; for (unsigned int ii = start; ii <= end; ii++) { if (!buf[ii].empty()) { chunk += buf[ii] + " "; abslen += buf[ii].length(); } - if (abslen > MA_ABSTRACT_SIZE) + if (int(abslen) > m_db->m_synthAbsLen) break; } if (end != buf.size()-1) chunk += "... "; mabs[ipos] = chunk; - if (abslen > MA_ABSTRACT_SIZE) + if (int(abslen) > m_db->m_synthAbsLen) break; } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index ad033d69..b10b8198 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.35 2006-04-27 06:12:10 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.36 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -171,6 +171,8 @@ class Db { std::list getStemLangs(); string getDbDir(); + void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); + private: string m_filterTopDir; // Current query filter on subtree top directory @@ -183,6 +185,17 @@ private: // xapian)-specific defs to show in here unsigned int m_qOpts; + + // This is how long an abstract we keep or build from beginning of + // text when indexing. It only has an influence on the size of the + // db as we are free to shorten it again when displaying + int m_idxAbsTruncLen; + // This is the size of the abstract that we synthetize out of query + // term contexts at *query time* + int m_synthAbsLen; + // This is how many words (context size) we keep around query terms + // when building the abstract + int m_synthAbsWordCtxLen; bool reOpen(); // Close/open, same mode/opts /* Copyconst and assignemt private and forbidden */ diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index 694b2ae1..ab0d1d7b 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -1,4 +1,4 @@ -# @(#$Id: recoll.conf.in,v 1.10 2006-09-08 08:51:47 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: recoll.conf.in,v 1.11 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes # # Recoll default configuration file. This should be copied to # ~/.recoll/recoll.conf @@ -56,6 +56,9 @@ usesystemfilecommand = 1 # know? (we can otherwise just ignore them) indexallfilenames = 1 +# Length of abstracts we store while indexing. Longer will make for a +# bigger db +# idxabsmlen = 250 # You could specify different parameters for a subdirectory like this: #[~/hungariandocs/plain]