Make the formerly constant abstract lengths configurable parameters

This commit is contained in:
dockes 2006-09-13 13:53:35 +00:00
parent d76382ce2e
commit b536c9c46c
11 changed files with 212 additions and 64 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.34 2006-04-30 07:39:09 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.35 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -81,6 +81,9 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
// Set the current directory in config so that subsequent
// getConfParams() will get local values
m_config->setKeyDir(*it);
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db.setAbstractParams(abslen, -1, -1);
// Set up skipped patterns for this subtree. This probably should be
// done in the directory change code in processone() instead.
@ -179,6 +182,9 @@ bool DbIndexer::indexFiles(const list<string> &filenames)
list<string>::const_iterator it;
for (it = filenames.begin(); it != filenames.end();it++) {
m_config->setKeyDir(path_getfather(*it));
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db.setAbstractParams(abslen, -1, -1);
struct stat stb;
if (stat(it->c_str(), &stb) != 0) {
LOGERR(("DbIndexer::indexFiles: stat(%s): %s", it->c_str(),
@ -228,6 +234,9 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
if (flg == FsTreeWalker::FtwDirEnter ||
flg == FsTreeWalker::FtwDirReturn) {
m_config->setKeyDir(fn);
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db.setAbstractParams(abslen, -1, -1);
return FsTreeWalker::FtwOk;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.16 2006-09-13 08:13:36 dockes Exp $ (C) 2005 Jean-Francois Dockes";
static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.17 2006-09-13 13:53:35 dockes Exp $ (C) 2005 Jean-Francois Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -151,6 +151,10 @@ void rwSettings(bool writing)
"/Recoll/prefs/query/buildAbstract", Bool, true);
SETTING_RW(prefs.queryReplaceAbstract,
"/Recoll/prefs/query/replaceAbstract", Bool, false);
SETTING_RW(prefs.syntAbsLen, "/Recoll/prefs/query/syntAbsLen",
Num, 250);
SETTING_RW(prefs.syntAbsCtx, "/Recoll/prefs/query/syntAbsCtx",
Num, 4);
// Ssearch combobox history list
if (writing) {

View File

@ -17,7 +17,7 @@
#ifndef _GUIUTILS_H_INCLUDED_
#define _GUIUTILS_H_INCLUDED_
/*
* @(#$Id: guiutils.h,v 1.8 2006-09-13 08:13:36 dockes Exp $ (C) 2005 Jean-Francois Dockes
* @(#$Id: guiutils.h,v 1.9 2006-09-13 13:53:35 dockes Exp $ (C) 2005 Jean-Francois Dockes
* jean-francois.dockes@wanadoo.fr
*
* This program is free software; you can redistribute it and/or modify
@ -78,6 +78,9 @@ class PrefsPack {
// Ignored file types in adv search (startup default)
QStringList asearchIgnFilTyps;
int syntAbsLen;
int syntAbsCtx;
PrefsPack() :
showicons(true),
respagesize(8),

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: main.cpp,v 1.48 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: main.cpp,v 1.49 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -103,6 +103,7 @@ bool maybeOpenDb(string &reason, bool force)
dbdir + " wait for indexing to complete?";
return false;
}
rcldb->setAbstractParams(-1, prefs.syntAbsLen, prefs.syntAbsCtx);
return true;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclmain.cpp,v 1.31 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclmain.cpp,v 1.32 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -131,12 +131,6 @@ void RclMain::init()
nextPageAction->setIconSet(createIconSet("nextpage.png"));
prevPageAction->setIconSet(createIconSet("prevpage.png"));
if (prefs.startWithAdvSearchOpen)
showAdvSearchDialog();
if (prefs.startWithSortToolOpen)
showSortDialog();
}
// We also want to get rid of the advanced search form and previews
@ -667,7 +661,6 @@ void RclMain::docExpand(int docnum)
// We need to insert item here, its not auto-done like when the user types
// CR
sSearch->queryText->setEditText(text);
sSearch->queryText->insertItem(text, 0);
sSearch->setAnyTermMode();
sSearch->startSimpleSearch();
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.4 2006-09-12 10:11:36 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.5 2006-09-13 13:53:35 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -89,12 +89,20 @@ void SSearch::startSimpleSearch()
// the listbox list, The qt listbox doesn't do lru correctly (if
// already in the list the new entry would remain at its place,
// not jump to the top as it should)
LOGDEB3(("Querytext list count %d\n", queryText->count()));
// Have to save current text, this will change while we clean up the list
QString txt = queryText->currentText();
bool changed;
do {
changed = false;
for (int index = 0; index < queryText->count(); index++) {
LOGDEB3(("Querytext[%d] = [%s]\n", index,
(const char *)(queryText->text(index).utf8())));
if (queryText->text(index).length() == 0 ||
queryText->text(index) == queryText->currentText()) {
QString::compare(queryText->text(index), txt) == 0) {
LOGDEB3(("Querytext removing at %d [%s] [%s]\n", index,
(const char *)(queryText->text(index).utf8()),
(const char *)(txt.utf8())));
queryText->removeItem(index);
changed = true;
break;
@ -102,13 +110,14 @@ void SSearch::startSimpleSearch()
}
} while (changed);
// The combobox is set for no insertion, insert here:
queryText->insertItem(queryText->currentText(), 0);
queryText->insertItem(txt, 0);
queryText->setCurrentItem(0);
// Save the current state of the listbox list to file
prefs.ssearchHistory.clear();
for (int index = 0; index < queryText->count(); index++)
for (int index = 0; index < queryText->count(); index++) {
prefs.ssearchHistory.push_back(queryText->text(index).utf8());
}
emit startSearch(sdata);
}

View File

@ -47,7 +47,7 @@
</property>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout5</cstring>
<cstring>layout1</cstring>
</property>
<hbox>
<property name="name">
@ -92,7 +92,7 @@
</property>
<widget class="QLabel">
<property name="name">
<cstring>textLabel3</cstring>
<cstring>textLabel4</cstring>
</property>
<property name="text">
<string>Result list font</string>
@ -181,7 +181,6 @@
<bool>false</bool>
</property>
</widget>
<widget class="QCheckBox">
<property name="name">
<cstring>initStartAdvCB</cstring>
@ -204,9 +203,6 @@
<bool>false</bool>
</property>
</widget>
</vbox>
</widget>
</vbox>
@ -276,6 +272,97 @@ May be slow for big documents.</string>
<string>Do we synthetize an abstract even if the document seemed to have one?</string>
</property>
</widget>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout16</cstring>
</property>
<hbox>
<property name="name">
<cstring>unnamed</cstring>
</property>
<widget class="QLabel">
<property name="name">
<cstring>textLabel2</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>5</hsizetype>
<vsizetype>5</vsizetype>
<horstretch>2</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Synthetic abstract size (characters)</string>
</property>
</widget>
<widget class="QSpinBox">
<property name="name">
<cstring>syntlenSB</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>0</vsizetype>
<horstretch>1</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="lineStep">
<number>10</number>
</property>
<property name="minValue">
<number>80</number>
</property>
<property name="maxValue">
<number>999</number>
</property>
<property name="value">
<number>250</number>
</property>
</widget>
</hbox>
</widget>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout17</cstring>
</property>
<hbox>
<property name="name">
<cstring>unnamed</cstring>
</property>
<widget class="QLabel">
<property name="name">
<cstring>textLabel3</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>5</hsizetype>
<vsizetype>5</vsizetype>
<horstretch>1</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Synthetic abstract context words</string>
</property>
</widget>
<widget class="QSpinBox">
<property name="name">
<cstring>syntctxSB</cstring>
</property>
<property name="maxValue">
<number>20</number>
</property>
<property name="minValue">
<number>2</number>
</property>
<property name="value">
<number>4</number>
</property>
</widget>
</hbox>
</widget>
<spacer>
<property name="name">
<cstring>spacer2</cstring>
@ -310,7 +397,7 @@ May be slow for big documents.</string>
</property>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout12</cstring>
<cstring>layout15</cstring>
</property>
<hbox>
<property name="name">
@ -479,7 +566,7 @@ May be slow for big documents.</string>
</property>
<widget class="QLabel">
<property name="name">
<cstring>textLabel3</cstring>
<cstring>textLabel5</cstring>
</property>
<property name="text">
<string>Active databases</string>

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.3 2006-09-13 08:13:36 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.4 2006-09-13 13:53:35 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -54,6 +54,8 @@ void UIPrefsDialog::init()
// Show icons checkbox
useIconsCB->setChecked(prefs.showicons);
autoSearchCB->setChecked(prefs.autoSearchOnWS);
syntlenSB->setValue(prefs.syntAbsLen);
syntctxSB->setValue(prefs.syntAbsCtx);
initStartAdvCB->setChecked(prefs.startWithAdvSearchOpen);
initStartSortCB->setChecked(prefs.startWithSortToolOpen);
@ -156,6 +158,9 @@ void UIPrefsDialog::accept()
prefs.startWithAdvSearchOpen = initStartAdvCB->isChecked();
prefs.startWithSortToolOpen = initStartSortCB->isChecked();
prefs.syntAbsLen = syntlenSB->value();
prefs.syntAbsCtx = syntctxSB->value();
prefs.activeExtraDbs.clear();
for (unsigned int i = 0; i < actDbsLB->count(); i++) {
QListBoxItem *item = actDbsLB->item(i);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.75 2006-05-09 10:15:14 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.76 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -56,17 +56,6 @@ using namespace std;
#ifndef NO_NAMESPACES
namespace Rcl {
#endif
// This is how long an abstract we keep or build from beginning of text when
// indexing. It only has an influence on the size of the db as we are free
// to shorten it again when displaying
#define INDEX_ABSTRACT_SIZE 250
// This is the size of the abstract that we synthetize out of query
// term contexts at query time
#define MA_ABSTRACT_SIZE 250
// This is how many words (context size) we keep around query terms
// when building the abstract
#define MA_EXTRACT_WIDTH 4
// Truncate longer path and uniquize with hash . The goal for this is
// to avoid xapian max term length limitations, not to gain space (we
@ -81,6 +70,7 @@ const static string rclSyntAbs = "?!#@";
// ones for indexing or query as there is not much in common.
class Native {
public:
Db *m_db;
bool m_isopen;
bool m_iswritable;
Db::OpenMode m_mode;
@ -106,8 +96,9 @@ class Native {
Xapian::docid docid,
const list<string>& terms);
Native()
: m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0)
Native(Db *db)
: m_db(db),
m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0)
{ }
~Native() {
delete enquire;
@ -149,9 +140,10 @@ class Native {
};
Db::Db()
: m_qOpts(QO_NONE)
: m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
m_synthAbsWordCtxLen(4)
{
m_ndb = new Native;
m_ndb = new Native(this);
}
Db::~Db()
@ -282,7 +274,7 @@ bool Db::close()
LOGDEB(("Rcl:Db: Called xapian flush\n"));
}
delete m_ndb;
m_ndb = new Native;
m_ndb = new Native(this);
if (m_ndb)
return true;
} catch (const Xapian::Error &e) {
@ -442,6 +434,19 @@ bool dumb_string(const string &in, string &out)
return true;
}
// Let our user set the parameters for abstract processing.
//
// Each argument is checked and applied independently. A value outside
// its accepted range leaves the corresponding setting unchanged, which is
// how callers pass -1 to mean "do not touch this one".
//   idxtrunc:   length to which abstracts are truncated when indexing,
//               accepted in 1..1999
//   syntlen:    size of the abstract synthesized at query time,
//               accepted in 1..1999
//   syntctxlen: number of context words kept around each query term,
//               accepted in 1..20. The upper bound is inclusive so that
//               the largest value offered by the preferences spin box
//               (20) is honoured instead of being silently ignored.
void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
{
    LOGDEB(("Db::setAbstractParams: trunc %d syntlen %d ctxlen %d\n",
            idxtrunc, syntlen, syntctxlen));
    if (idxtrunc > 0 && idxtrunc < 2000)
        m_idxAbsTruncLen = idxtrunc;
    if (syntlen > 0 && syntlen < 2000)
        m_synthAbsLen = syntlen;
    if (syntctxlen > 0 && syntctxlen <= 20)
        m_synthAbsWordCtxLen = syntctxlen;
}
// Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
@ -457,14 +462,16 @@ bool Db::add(const string &fn, const Doc &idoc,
// Truncate abstract, title and keywords to reasonable lengths. If
// abstract is currently empty, we make up one with the beginning
// of the document.
// of the document. This is then not indexed, but part of the doc
// data so that we can return it to a query without having to
// decode the original file.
bool syntabs = false;
if (doc.abstract.empty()) {
syntabs = true;
doc.abstract = rclSyntAbs +
truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
truncate_to_word(doc.text, m_idxAbsTruncLen);
} else {
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
}
doc.abstract = neutchars(doc.abstract, "\n\r");
doc.title = truncate_to_word(doc.title, 100);
@ -513,14 +520,20 @@ bool Db::add(const string &fn, const Doc &idoc,
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split and index abstract
// Split and index abstract. We don't do this if it is synthetic
// any more (this used to give a relevance boost to the beginning
// of text, why ?)
LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) :
doc.abstract, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
if (!syntabs) {
// syntabs indicator test kept here in case we want to go back
// to indexing synthetic abstracts one day
if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) :
doc.abstract, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitter.text_to_words(noacc);
}
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
////// Special terms for metadata
@ -1182,17 +1195,21 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
parms.get(string("caption"), doc.title);
parms.get(string("keywords"), doc.keywords);
parms.get(string("abstract"), doc.abstract);
// Possibly remove synthetic abstract indicator (if it's there, we
// used to index the beginning of the text as abstract).
bool syntabs = false;
if (doc.abstract.find(rclSyntAbs) == 0) {
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
syntabs = true;
}
// If the option is set and the abstract is synthetic or empty , build
// abstract from position data.
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n"));
LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
if (doc.abstract.empty() || syntabs ||
(qopts & Db::QO_REPLACE_ABSTRACT))
doc.abstract = makeAbstract(docid, terms);
}
}
parms.get(string("ipath"), doc.ipath);
parms.get(string("fbytes"), doc.fbytes);
parms.get(string("dbytes"), doc.dbytes);
@ -1397,6 +1414,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
// remember the position and its neighbours
vector<unsigned int> qtermposs; // The term positions
set<unsigned int> chunkposs; // All the positions we shall populate
int totaloccs = 0;
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
qit++) {
Xapian::PositionIterator pos;
@ -1409,15 +1427,15 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
unsigned int ipos = *pos;
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
// Possibly extend the array. Do it in big chunks
if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
if (ipos + m_db->m_synthAbsWordCtxLen >= buf.size()) {
buf.resize(ipos + m_db->m_synthAbsWordCtxLen + 1000);
}
buf[ipos] = *qit;
// Remember the term position
qtermposs.push_back(ipos);
// Add adjacent slots to the set to populate at next step
for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH);
ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
for (unsigned int ii = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
ii <= MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1); ii++) {
chunkposs.insert(ii);
}
// Limit the number of occurrences we keep for each
@ -1427,6 +1445,9 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
}
} catch (...) {
}
// Limit total size
if (totaloccs++ > 100)
break;
}
LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n",
@ -1470,21 +1491,21 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
it != qtermposs.end(); it++) {
unsigned int ipos = *it;
unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
unsigned int start = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
unsigned int end = MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1);
string chunk;
for (unsigned int ii = start; ii <= end; ii++) {
if (!buf[ii].empty()) {
chunk += buf[ii] + " ";
abslen += buf[ii].length();
}
if (abslen > MA_ABSTRACT_SIZE)
if (int(abslen) > m_db->m_synthAbsLen)
break;
}
if (end != buf.size()-1)
chunk += "... ";
mabs[ipos] = chunk;
if (abslen > MA_ABSTRACT_SIZE)
if (int(abslen) > m_db->m_synthAbsLen)
break;
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.35 2006-04-27 06:12:10 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.36 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -171,6 +171,8 @@ class Db {
std::list<std::string> getStemLangs();
string getDbDir();
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
private:
string m_filterTopDir; // Current query filter on subtree top directory
@ -183,6 +185,17 @@ private:
// xapian)-specific defs to show in here
unsigned int m_qOpts;
// This is how long an abstract we keep or build from beginning of
// text when indexing. It only has an influence on the size of the
// db as we are free to shorten it again when displaying
int m_idxAbsTruncLen;
// This is the size of the abstract that we synthetize out of query
// term contexts at *query time*
int m_synthAbsLen;
// This is how many words (context size) we keep around query terms
// when building the abstract
int m_synthAbsWordCtxLen;
bool reOpen(); // Close/open, same mode/opts
/* Copy constructor and assignment are private and forbidden */

View File

@ -1,4 +1,4 @@
# @(#$Id: recoll.conf.in,v 1.10 2006-09-08 08:51:47 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: recoll.conf.in,v 1.11 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes
#
# Recoll default configuration file. This should be copied to
# ~/.recoll/recoll.conf
@ -56,6 +56,9 @@ usesystemfilecommand = 1
# know? (we can otherwise just ignore them)
indexallfilenames = 1
# Length of abstracts we store while indexing. Longer will make for a
# bigger db
# idxabsmlen = 250
# You could specify different parameters for a subdirectory like this:
#[~/hungariandocs/plain]