make constant lengths for abstracts config params

This commit is contained in:
dockes 2006-09-13 13:53:35 +00:00
parent d76382ce2e
commit b536c9c46c
11 changed files with 212 additions and 64 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.34 2006-04-30 07:39:09 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: indexer.cpp,v 1.35 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -81,6 +81,9 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
// Set the current directory in config so that subsequent // Set the current directory in config so that subsequent
// getConfParams() will get local values // getConfParams() will get local values
m_config->setKeyDir(*it); m_config->setKeyDir(*it);
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db.setAbstractParams(abslen, -1, -1);
// Set up skipped patterns for this subtree. This probably should be // Set up skipped patterns for this subtree. This probably should be
// done in the directory change code in processone() instead. // done in the directory change code in processone() instead.
@ -179,6 +182,9 @@ bool DbIndexer::indexFiles(const list<string> &filenames)
list<string>::const_iterator it; list<string>::const_iterator it;
for (it = filenames.begin(); it != filenames.end();it++) { for (it = filenames.begin(); it != filenames.end();it++) {
m_config->setKeyDir(path_getfather(*it)); m_config->setKeyDir(path_getfather(*it));
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db.setAbstractParams(abslen, -1, -1);
struct stat stb; struct stat stb;
if (stat(it->c_str(), &stb) != 0) { if (stat(it->c_str(), &stb) != 0) {
LOGERR(("DbIndexer::indexFiles: stat(%s): %s", it->c_str(), LOGERR(("DbIndexer::indexFiles: stat(%s): %s", it->c_str(),
@ -228,6 +234,9 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
if (flg == FsTreeWalker::FtwDirEnter || if (flg == FsTreeWalker::FtwDirEnter ||
flg == FsTreeWalker::FtwDirReturn) { flg == FsTreeWalker::FtwDirReturn) {
m_config->setKeyDir(fn); m_config->setKeyDir(fn);
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db.setAbstractParams(abslen, -1, -1);
return FsTreeWalker::FtwOk; return FsTreeWalker::FtwOk;
} }

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.16 2006-09-13 08:13:36 dockes Exp $ (C) 2005 Jean-Francois Dockes"; static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.17 2006-09-13 13:53:35 dockes Exp $ (C) 2005 Jean-Francois Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -151,6 +151,10 @@ void rwSettings(bool writing)
"/Recoll/prefs/query/buildAbstract", Bool, true); "/Recoll/prefs/query/buildAbstract", Bool, true);
SETTING_RW(prefs.queryReplaceAbstract, SETTING_RW(prefs.queryReplaceAbstract,
"/Recoll/prefs/query/replaceAbstract", Bool, false); "/Recoll/prefs/query/replaceAbstract", Bool, false);
SETTING_RW(prefs.syntAbsLen, "/Recoll/prefs/query/syntAbsLen",
Num, 250);
SETTING_RW(prefs.syntAbsCtx, "/Recoll/prefs/query/syntAbsCtx",
Num, 4);
// Ssearch combobox history list // Ssearch combobox history list
if (writing) { if (writing) {

View File

@ -17,7 +17,7 @@
#ifndef _GUIUTILS_H_INCLUDED_ #ifndef _GUIUTILS_H_INCLUDED_
#define _GUIUTILS_H_INCLUDED_ #define _GUIUTILS_H_INCLUDED_
/* /*
* @(#$Id: guiutils.h,v 1.8 2006-09-13 08:13:36 dockes Exp $ (C) 2005 Jean-Francois Dockes * @(#$Id: guiutils.h,v 1.9 2006-09-13 13:53:35 dockes Exp $ (C) 2005 Jean-Francois Dockes
* jean-francois.dockes@wanadoo.fr * jean-francois.dockes@wanadoo.fr
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -78,6 +78,9 @@ class PrefsPack {
// Ignored file types in adv search (startup default) // Ignored file types in adv search (startup default)
QStringList asearchIgnFilTyps; QStringList asearchIgnFilTyps;
int syntAbsLen;
int syntAbsCtx;
PrefsPack() : PrefsPack() :
showicons(true), showicons(true),
respagesize(8), respagesize(8),

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: main.cpp,v 1.48 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: main.cpp,v 1.49 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -103,6 +103,7 @@ bool maybeOpenDb(string &reason, bool force)
dbdir + " wait for indexing to complete?"; dbdir + " wait for indexing to complete?";
return false; return false;
} }
rcldb->setAbstractParams(-1, prefs.syntAbsLen, prefs.syntAbsCtx);
return true; return true;
} }

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rclmain.cpp,v 1.31 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: rclmain.cpp,v 1.32 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -131,12 +131,6 @@ void RclMain::init()
nextPageAction->setIconSet(createIconSet("nextpage.png")); nextPageAction->setIconSet(createIconSet("nextpage.png"));
prevPageAction->setIconSet(createIconSet("prevpage.png")); prevPageAction->setIconSet(createIconSet("prevpage.png"));
if (prefs.startWithAdvSearchOpen)
showAdvSearchDialog();
if (prefs.startWithSortToolOpen)
showSortDialog();
} }
// We also want to get rid of the advanced search form and previews // We also want to get rid of the advanced search form and previews
@ -667,7 +661,6 @@ void RclMain::docExpand(int docnum)
// We need to insert item here, its not auto-done like when the user types // We need to insert item here, its not auto-done like when the user types
// CR // CR
sSearch->queryText->setEditText(text); sSearch->queryText->setEditText(text);
sSearch->queryText->insertItem(text, 0);
sSearch->setAnyTermMode(); sSearch->setAnyTermMode();
sSearch->startSimpleSearch(); sSearch->startSimpleSearch();
} }

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.4 2006-09-12 10:11:36 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.5 2006-09-13 13:53:35 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -89,12 +89,20 @@ void SSearch::startSimpleSearch()
// the listbox list, The qt listbox doesn't do lru correctly (if // the listbox list, The qt listbox doesn't do lru correctly (if
// already in the list the new entry would remain at it's place, // already in the list the new entry would remain at it's place,
// not jump at the top as it should // not jump at the top as it should
LOGDEB3(("Querytext list count %d\n", queryText->count()));
// Have to save current text, this will change while we clean up the list
QString txt = queryText->currentText();
bool changed; bool changed;
do { do {
changed = false; changed = false;
for (int index = 0; index < queryText->count(); index++) { for (int index = 0; index < queryText->count(); index++) {
LOGDEB3(("Querytext[%d] = [%s]\n", index,
(const char *)(queryText->text(index).utf8())));
if (queryText->text(index).length() == 0 || if (queryText->text(index).length() == 0 ||
queryText->text(index) == queryText->currentText()) { QString::compare(queryText->text(index), txt) == 0) {
LOGDEB3(("Querytext removing at %d [%s] [%s]\n", index,
(const char *)(queryText->text(index).utf8()),
(const char *)(txt.utf8())));
queryText->removeItem(index); queryText->removeItem(index);
changed = true; changed = true;
break; break;
@ -102,13 +110,14 @@ void SSearch::startSimpleSearch()
} }
} while (changed); } while (changed);
// The combobox is set for no insertion, insert here: // The combobox is set for no insertion, insert here:
queryText->insertItem(queryText->currentText(), 0); queryText->insertItem(txt, 0);
queryText->setCurrentItem(0);
// Save the current state of the listbox list to file // Save the current state of the listbox list to file
prefs.ssearchHistory.clear(); prefs.ssearchHistory.clear();
for (int index = 0; index < queryText->count(); index++) for (int index = 0; index < queryText->count(); index++) {
prefs.ssearchHistory.push_back(queryText->text(index).utf8()); prefs.ssearchHistory.push_back(queryText->text(index).utf8());
}
emit startSearch(sdata); emit startSearch(sdata);
} }

View File

@ -47,7 +47,7 @@
</property> </property>
<widget class="QLayoutWidget"> <widget class="QLayoutWidget">
<property name="name"> <property name="name">
<cstring>layout5</cstring> <cstring>layout1</cstring>
</property> </property>
<hbox> <hbox>
<property name="name"> <property name="name">
@ -92,7 +92,7 @@
</property> </property>
<widget class="QLabel"> <widget class="QLabel">
<property name="name"> <property name="name">
<cstring>textLabel3</cstring> <cstring>textLabel4</cstring>
</property> </property>
<property name="text"> <property name="text">
<string>Result list font</string> <string>Result list font</string>
@ -181,7 +181,6 @@
<bool>false</bool> <bool>false</bool>
</property> </property>
</widget> </widget>
<widget class="QCheckBox"> <widget class="QCheckBox">
<property name="name"> <property name="name">
<cstring>initStartAdvCB</cstring> <cstring>initStartAdvCB</cstring>
@ -204,9 +203,6 @@
<bool>false</bool> <bool>false</bool>
</property> </property>
</widget> </widget>
</vbox> </vbox>
</widget> </widget>
</vbox> </vbox>
@ -276,6 +272,97 @@ May be slow for big documents.</string>
<string>Do we synthetize an abstract even if the document seemed to have one?</string> <string>Do we synthetize an abstract even if the document seemed to have one?</string>
</property> </property>
</widget> </widget>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout16</cstring>
</property>
<hbox>
<property name="name">
<cstring>unnamed</cstring>
</property>
<widget class="QLabel">
<property name="name">
<cstring>textLabel2</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>5</hsizetype>
<vsizetype>5</vsizetype>
<horstretch>2</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Synthetic abstract size (characters)</string>
</property>
</widget>
<widget class="QSpinBox">
<property name="name">
<cstring>syntlenSB</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>0</vsizetype>
<horstretch>1</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="lineStep">
<number>10</number>
</property>
<property name="minValue">
<number>80</number>
</property>
<property name="maxValue">
<number>999</number>
</property>
<property name="value">
<number>250</number>
</property>
</widget>
</hbox>
</widget>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout17</cstring>
</property>
<hbox>
<property name="name">
<cstring>unnamed</cstring>
</property>
<widget class="QLabel">
<property name="name">
<cstring>textLabel3</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>5</hsizetype>
<vsizetype>5</vsizetype>
<horstretch>1</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Synthetic abstract context words</string>
</property>
</widget>
<widget class="QSpinBox">
<property name="name">
<cstring>syntctxSB</cstring>
</property>
<property name="maxValue">
<number>20</number>
</property>
<property name="minValue">
<number>2</number>
</property>
<property name="value">
<number>4</number>
</property>
</widget>
</hbox>
</widget>
<spacer> <spacer>
<property name="name"> <property name="name">
<cstring>spacer2</cstring> <cstring>spacer2</cstring>
@ -310,7 +397,7 @@ May be slow for big documents.</string>
</property> </property>
<widget class="QLayoutWidget"> <widget class="QLayoutWidget">
<property name="name"> <property name="name">
<cstring>layout12</cstring> <cstring>layout15</cstring>
</property> </property>
<hbox> <hbox>
<property name="name"> <property name="name">
@ -479,7 +566,7 @@ May be slow for big documents.</string>
</property> </property>
<widget class="QLabel"> <widget class="QLabel">
<property name="name"> <property name="name">
<cstring>textLabel3</cstring> <cstring>textLabel5</cstring>
</property> </property>
<property name="text"> <property name="text">
<string>Active databases</string> <string>Active databases</string>

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.3 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.4 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -54,6 +54,8 @@ void UIPrefsDialog::init()
// Show icons checkbox // Show icons checkbox
useIconsCB->setChecked(prefs.showicons); useIconsCB->setChecked(prefs.showicons);
autoSearchCB->setChecked(prefs.autoSearchOnWS); autoSearchCB->setChecked(prefs.autoSearchOnWS);
syntlenSB->setValue(prefs.syntAbsLen);
syntctxSB->setValue(prefs.syntAbsCtx);
initStartAdvCB->setChecked(prefs.startWithAdvSearchOpen); initStartAdvCB->setChecked(prefs.startWithAdvSearchOpen);
initStartSortCB->setChecked(prefs.startWithSortToolOpen); initStartSortCB->setChecked(prefs.startWithSortToolOpen);
@ -156,6 +158,9 @@ void UIPrefsDialog::accept()
prefs.startWithAdvSearchOpen = initStartAdvCB->isChecked(); prefs.startWithAdvSearchOpen = initStartAdvCB->isChecked();
prefs.startWithSortToolOpen = initStartSortCB->isChecked(); prefs.startWithSortToolOpen = initStartSortCB->isChecked();
prefs.syntAbsLen = syntlenSB->value();
prefs.syntAbsCtx = syntctxSB->value();
prefs.activeExtraDbs.clear(); prefs.activeExtraDbs.clear();
for (unsigned int i = 0; i < actDbsLB->count(); i++) { for (unsigned int i = 0; i < actDbsLB->count(); i++) {
QListBoxItem *item = actDbsLB->item(i); QListBoxItem *item = actDbsLB->item(i);

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.75 2006-05-09 10:15:14 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.76 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -56,17 +56,6 @@ using namespace std;
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
namespace Rcl { namespace Rcl {
#endif #endif
// This is how long an abstract we keep or build from beginning of text when
// indexing. It only has an influence on the size of the db as we are free
// to shorten it again when displaying
#define INDEX_ABSTRACT_SIZE 250
// This is the size of the abstract that we synthetize out of query
// term contexts at query time
#define MA_ABSTRACT_SIZE 250
// This is how many words (context size) we keep around query terms
// when building the abstract
#define MA_EXTRACT_WIDTH 4
// Truncate longer path and uniquize with hash . The goal for this is // Truncate longer path and uniquize with hash . The goal for this is
// to avoid xapian max term length limitations, not to gain space (we // to avoid xapian max term length limitations, not to gain space (we
@ -81,6 +70,7 @@ const static string rclSyntAbs = "?!#@";
// ones for indexing or query as there is not much in common. // ones for indexing or query as there is not much in common.
class Native { class Native {
public: public:
Db *m_db;
bool m_isopen; bool m_isopen;
bool m_iswritable; bool m_iswritable;
Db::OpenMode m_mode; Db::OpenMode m_mode;
@ -106,8 +96,9 @@ class Native {
Xapian::docid docid, Xapian::docid docid,
const list<string>& terms); const list<string>& terms);
Native() Native(Db *db)
: m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) : m_db(db),
m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0)
{ } { }
~Native() { ~Native() {
delete enquire; delete enquire;
@ -149,9 +140,10 @@ class Native {
}; };
Db::Db() Db::Db()
: m_qOpts(QO_NONE) : m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
m_synthAbsWordCtxLen(4)
{ {
m_ndb = new Native; m_ndb = new Native(this);
} }
Db::~Db() Db::~Db()
@ -282,7 +274,7 @@ bool Db::close()
LOGDEB(("Rcl:Db: Called xapian flush\n")); LOGDEB(("Rcl:Db: Called xapian flush\n"));
} }
delete m_ndb; delete m_ndb;
m_ndb = new Native; m_ndb = new Native(this);
if (m_ndb) if (m_ndb)
return true; return true;
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
@ -442,6 +434,19 @@ bool dumb_string(const string &in, string &out)
return true; return true;
} }
// Let our user set the parameters for abstract processing.
//
// Each parameter is checked and applied independently. A value outside
// its accepted range (for instance -1) is ignored and the corresponding
// setting keeps its current value, so a caller can update only some of
// the parameters.
//   idxtrunc:   stored in m_idxAbsTruncLen, accepted range 1..1999
//   syntlen:    stored in m_synthAbsLen, accepted range 1..1999
//   syntctxlen: stored in m_synthAbsWordCtxLen, accepted range 1..20
void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
{
    LOGDEB(("Db::setAbstractParams: trunc %d syntlen %d ctxlen %d\n",
            idxtrunc, syntlen, syntctxlen));
    if (idxtrunc > 0 && idxtrunc < 2000)
        m_idxAbsTruncLen = idxtrunc;
    if (syntlen > 0 && syntlen < 2000)
        m_synthAbsLen = syntlen;
    // The upper bound is inclusive: the preferences dialog spin box lets
    // the user choose up to 20 context words, and that maximum must not
    // be silently dropped.
    if (syntctxlen > 0 && syntctxlen <= 20)
        m_synthAbsWordCtxLen = syntctxlen;
}
// Add document in internal form to the database: index the terms in // Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name, // the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more // date, mime type ... , create the document data record (more
@ -457,14 +462,16 @@ bool Db::add(const string &fn, const Doc &idoc,
// Truncate abstract, title and keywords to reasonable lengths. If // Truncate abstract, title and keywords to reasonable lengths. If
// abstract is currently empty, we make up one with the beginning // abstract is currently empty, we make up one with the beginning
// of the document. // of the document. This is then not indexed, but part of the doc
// data so that we can return it to a query without having to
// decode the original file.
bool syntabs = false; bool syntabs = false;
if (doc.abstract.empty()) { if (doc.abstract.empty()) {
syntabs = true; syntabs = true;
doc.abstract = rclSyntAbs + doc.abstract = rclSyntAbs +
truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE); truncate_to_word(doc.text, m_idxAbsTruncLen);
} else { } else {
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE); doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
} }
doc.abstract = neutchars(doc.abstract, "\n\r"); doc.abstract = neutchars(doc.abstract, "\n\r");
doc.title = truncate_to_word(doc.title, 100); doc.title = truncate_to_word(doc.title, 100);
@ -513,14 +520,20 @@ bool Db::add(const string &fn, const Doc &idoc,
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split and index abstract // Split and index abstract. We don't do this if it is synthetic
// any more (this used to give a relevance boost to the beginning
// of text, why ?)
LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str())); LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : if (!syntabs) {
doc.abstract, noacc)) { // syntabs indicator test kept here in case we want to go back
LOGERR(("Db::add: dumb_string failed\n")); // to indexing synthetic abstracts one day
return false; if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) :
doc.abstract, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitter.text_to_words(noacc);
} }
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
////// Special terms for metadata ////// Special terms for metadata
@ -1182,17 +1195,21 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
parms.get(string("caption"), doc.title); parms.get(string("caption"), doc.title);
parms.get(string("keywords"), doc.keywords); parms.get(string("keywords"), doc.keywords);
parms.get(string("abstract"), doc.abstract); parms.get(string("abstract"), doc.abstract);
// Possibly remove synthetic abstract indicator (if it's there, we
// used to index the beginning of the text as abstract).
bool syntabs = false; bool syntabs = false;
if (doc.abstract.find(rclSyntAbs) == 0) { if (doc.abstract.find(rclSyntAbs) == 0) {
doc.abstract = doc.abstract.substr(rclSyntAbs.length()); doc.abstract = doc.abstract.substr(rclSyntAbs.length());
syntabs = true; syntabs = true;
} }
// If the option is set and the abstract is synthetic or empty (or
// abstract replacement was requested), build the abstract from position data.
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) { if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n")); LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
if (doc.abstract.empty() || syntabs || if (doc.abstract.empty() || syntabs ||
(qopts & Db::QO_REPLACE_ABSTRACT)) (qopts & Db::QO_REPLACE_ABSTRACT))
doc.abstract = makeAbstract(docid, terms); doc.abstract = makeAbstract(docid, terms);
} }
parms.get(string("ipath"), doc.ipath); parms.get(string("ipath"), doc.ipath);
parms.get(string("fbytes"), doc.fbytes); parms.get(string("fbytes"), doc.fbytes);
parms.get(string("dbytes"), doc.dbytes); parms.get(string("dbytes"), doc.dbytes);
@ -1397,6 +1414,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
// remember the position and its neigbours // remember the position and its neigbours
vector<unsigned int> qtermposs; // The term positions vector<unsigned int> qtermposs; // The term positions
set<unsigned int> chunkposs; // All the positions we shall populate set<unsigned int> chunkposs; // All the positions we shall populate
int totaloccs = 0;
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end(); for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
qit++) { qit++) {
Xapian::PositionIterator pos; Xapian::PositionIterator pos;
@ -1409,15 +1427,15 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
unsigned int ipos = *pos; unsigned int ipos = *pos;
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos)); LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
// Possibly extend the array. Do it in big chunks // Possibly extend the array. Do it in big chunks
if (ipos + MA_EXTRACT_WIDTH >= buf.size()) { if (ipos + m_db->m_synthAbsWordCtxLen >= buf.size()) {
buf.resize(ipos + MA_EXTRACT_WIDTH + 1000); buf.resize(ipos + m_db->m_synthAbsWordCtxLen + 1000);
} }
buf[ipos] = *qit; buf[ipos] = *qit;
// Remember the term position // Remember the term position
qtermposs.push_back(ipos); qtermposs.push_back(ipos);
// Add adjacent slots to the set to populate at next step // Add adjacent slots to the set to populate at next step
for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH); for (unsigned int ii = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) { ii <= MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1); ii++) {
chunkposs.insert(ii); chunkposs.insert(ii);
} }
// Limit the number of occurences we keep for each // Limit the number of occurences we keep for each
@ -1427,6 +1445,9 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
} }
} catch (...) { } catch (...) {
} }
// Limit total size
if (totaloccs++ > 100)
break;
} }
LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n",
@ -1470,21 +1491,21 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
for (vector<unsigned int>::const_iterator it = qtermposs.begin(); for (vector<unsigned int>::const_iterator it = qtermposs.begin();
it != qtermposs.end(); it++) { it != qtermposs.end(); it++) {
unsigned int ipos = *it; unsigned int ipos = *it;
unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH); unsigned int start = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); unsigned int end = MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1);
string chunk; string chunk;
for (unsigned int ii = start; ii <= end; ii++) { for (unsigned int ii = start; ii <= end; ii++) {
if (!buf[ii].empty()) { if (!buf[ii].empty()) {
chunk += buf[ii] + " "; chunk += buf[ii] + " ";
abslen += buf[ii].length(); abslen += buf[ii].length();
} }
if (abslen > MA_ABSTRACT_SIZE) if (int(abslen) > m_db->m_synthAbsLen)
break; break;
} }
if (end != buf.size()-1) if (end != buf.size()-1)
chunk += "... "; chunk += "... ";
mabs[ipos] = chunk; mabs[ipos] = chunk;
if (abslen > MA_ABSTRACT_SIZE) if (int(abslen) > m_db->m_synthAbsLen)
break; break;
} }

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.35 2006-04-27 06:12:10 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rcldb.h,v 1.36 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -171,6 +171,8 @@ class Db {
std::list<std::string> getStemLangs(); std::list<std::string> getStemLangs();
string getDbDir(); string getDbDir();
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
private: private:
string m_filterTopDir; // Current query filter on subtree top directory string m_filterTopDir; // Current query filter on subtree top directory
@ -183,6 +185,17 @@ private:
// xapian)-specific defs to show in here // xapian)-specific defs to show in here
unsigned int m_qOpts; unsigned int m_qOpts;
// This is how long an abstract we keep or build from beginning of
// text when indexing. It only has an influence on the size of the
// db as we are free to shorten it again when displaying
int m_idxAbsTruncLen;
// This is the size of the abstract that we synthesize out of query
// term contexts at *query time*
int m_synthAbsLen;
// This is how many words (context size) we keep around query terms
// when building the abstract
int m_synthAbsWordCtxLen;
bool reOpen(); // Close/open, same mode/opts bool reOpen(); // Close/open, same mode/opts
/* Copyconst and assignemt private and forbidden */ /* Copyconst and assignemt private and forbidden */

View File

@ -1,4 +1,4 @@
# @(#$Id: recoll.conf.in,v 1.10 2006-09-08 08:51:47 dockes Exp $ (C) 2004 J.F.Dockes # @(#$Id: recoll.conf.in,v 1.11 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes
# #
# Recoll default configuration file. This should be copied to # Recoll default configuration file. This should be copied to
# ~/.recoll/recoll.conf # ~/.recoll/recoll.conf
@ -56,6 +56,9 @@ usesystemfilecommand = 1
# know? (we can otherwise just ignore them) # know? (we can otherwise just ignore them)
indexallfilenames = 1 indexallfilenames = 1
# Length of the abstracts we store while indexing. Longer abstracts make
# for a bigger db
# idxabsmlen = 250
# You could specify different parameters for a subdirectory like this: # You could specify different parameters for a subdirectory like this:
#[~/hungariandocs/plain] #[~/hungariandocs/plain]