indexStoreDocText config variable, fix terms and conditions when we store the doc text or not

This commit is contained in:
Jean-Francois Dockes 2018-01-19 12:02:12 +01:00
parent 491dfb5f3b
commit 60cb39c935
4 changed files with 133 additions and 66 deletions

View File

@ -649,7 +649,7 @@ int Query::Native::makeAbstract(Xapian::docid docid,
LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
maxtotaloccs << " ctxwords " << ctxwords << "\n");
if (o_index_storedoctext) {
if (ndb->m_storetext) {
return abstractFromText(ndb, docid, matchedTerms, byQ,
totalweight, ctxwords, maxtotaloccs, vabs,
chron);

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004 J.F.Dockes
/* Copyright (C) 2004-2018 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -63,10 +63,21 @@ using namespace std;
#endif
#include "zlibut.h"
#ifndef XAPIAN_AT_LEAST
// XAPIAN_AT_LEAST(A,B,C) was added in Xapian 1.4.2. Define it here for
// older versions. It evaluates to true when the version described by
// XAPIAN_MAJOR_VERSION.XAPIAN_MINOR_VERSION.XAPIAN_REVISION is A.B.C or
// later, which makes it usable in #if tests.
#define XAPIAN_AT_LEAST(A,B,C) \
(XAPIAN_MAJOR_VERSION > (A) || \
(XAPIAN_MAJOR_VERSION == (A) && \
(XAPIAN_MINOR_VERSION > (B) || \
(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
#endif
// Recoll index format version is stored in user metadata. When this changes,
// we can't open the db and will have to reindex.
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
static const string cstr_RCL_IDX_VERSION("1");
static const string cstr_RCL_IDX_DESCRIPTOR_KEY("RCL_IDX_DESCRIPTOR_KEY");
static const string cstr_mbreaks("rclmbreaks");
@ -242,9 +253,85 @@ void Db::Native::maybeStartThreads()
#endif // IDX_THREADS
void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
{
int action = (mode == Db::DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
Xapian::DB_CREATE_OR_OVERWRITE;
if (::access(dir.c_str(), 0) == 0) {
// Existing index
xwdb = Xapian::WritableDatabase(dir, action);
} else {
// New index. If possible, and depending on config, use a stub
// to force using Chert. No sense in doing this if we are
// storing the text anyway.
#if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND
// New Xapian with Chert support. Use Chert and the old
// abstract generation method, except if told otherwise by the
// configuration.
if (o_index_storedoctext) {
xwdb = Xapian::WritableDatabase(dir, action);
m_storetext = true;
} else {
// Force Chert format, don't store the text.
string stub = path_cat(m_rcldb->m_config->getConfDir(),
"xapian.stub");
FILE *fp = fopen(stub.c_str(), "w");
if (nullptr == fp) {
throw(string("Can't create ") + stub);
}
fprintf(fp, "chert %s\n", dir.c_str());
fclose(fp);
xwdb = Xapian::WritableDatabase(stub, action);
m_storetext = false;
}
#elif ! XAPIAN_AT_LEAST(1,3,0)
// Old Xapian. Use the default index format and let the user
// decide of the abstract generation method.
xwdb = Xapian::WritableDatabase(dir, action);
m_storetext = o_index_storedoctext;
#else
// Newer Xapian with no Chert support. Store the text.
xwdb = Xapian::WritableDatabase(dir, action);
m_storetext = true;
#endif
// Set the storetext value inside the index descriptor (new
// with recoll 1.24, maybe we'll have other stuff to store in
// there in the future).
string desc = string("storetext=") + (m_storetext ? "1" : "0") + "\n";
xwdb.set_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY, desc);
}
// If the index is empty, write the data format version at once
// to avoid stupid error messages:
if (xwdb.get_doccount() == 0) {
xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
}
m_iswritable = true;
#ifdef IDX_THREADS
maybeStartThreads();
#endif
}
void Db::Native::openRead(const string& dir)
{
m_iswritable = false;
xrdb = Xapian::Database(dir);
string desc = xrdb.get_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY);
ConfSimple cf(desc, 1);
string val;
m_storetext = false;
if (cf.get("storetext", val) && stringToBool(val)) {
m_storetext = true;
}
LOGDEB("Db::openRead: index " << (m_storetext?"stores":"does not store") <<
" document text\n");
}
/* See comment in class declaration: return all subdocuments of a
* document given by its unique id.
*/
* document given by its unique id. */
bool Db::Native::subDocs(const string &udi, int idxi,
vector<Xapian::docid>& docids)
{
@ -782,6 +869,7 @@ vector<string> Db::getStemmerNames()
return res;
}
bool Db::open(OpenMode mode, OpenError *error)
{
if (error)
@ -808,63 +896,28 @@ bool Db::open(OpenMode mode, OpenError *error)
switch (mode) {
case DbUpd:
case DbTrunc:
{
int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
Xapian::DB_CREATE_OR_OVERWRITE;
if (!o_index_storedoctext && ::access(dir.c_str(), 0) != 0) {
// New index. use a stub to force using Chert. No
// sense in doing this if we are storing the text
// anyway.
string stub = path_cat(m_config->getConfDir(),
"xapian.stub");
FILE *fp = fopen(stub.c_str(), "w");
if (nullptr == fp) {
throw(string("Can't create ") + stub);
}
fprintf(fp, "chert %s\n", dir.c_str());
fclose(fp);
m_ndb->xwdb = Xapian::WritableDatabase(stub, action);
} else {
m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
}
// If db is empty, write the data format version at once
// to avoid stupid error messages:
if (m_ndb->xwdb.get_doccount() == 0)
m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY,
cstr_RCL_IDX_VERSION);
m_ndb->m_iswritable = true;
#ifdef IDX_THREADS
m_ndb->maybeStartThreads();
#endif
// We used to open a readonly object in addition to
// the r/w one because some operations were faster
// when performed through a Database: no forced
// flushes on allterms_begin(), used in
// subDocs(). This issue has been gone for a long time
// (now: Xapian 1.2) and the separate objects seem to
// trigger other Xapian issues, so the query db is now
// a clone of the update one.
m_ndb->xrdb = m_ndb->xwdb;
LOGDEB("Db::open: lastdocid: " << m_ndb->xwdb.get_lastdocid() <<
"\n");
LOGDEB2("Db::open: resetting updated\n");
updated.resize(m_ndb->xwdb.get_lastdocid() + 1);
for (unsigned int i = 0; i < updated.size(); i++)
updated[i] = false;
}
m_ndb->openWrite(dir, mode);
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
// We used to open a readonly object in addition to the
// r/w one because some operations were faster when
// performed through a Database: no forced flushes on
// allterms_begin(), used in subDocs(). This issue has
// been gone for a long time (now: Xapian 1.2) and the
// separate objects seem to trigger other Xapian issues,
// so the query db is now a clone of the update one.
m_ndb->xrdb = m_ndb->xwdb;
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
break;
case DbRO:
default:
m_ndb->m_iswritable = false;
m_ndb->xrdb = Xapian::Database(dir);
for (vector<string>::iterator it = m_extraDbs.begin();
it != m_extraDbs.end(); it++) {
m_ndb->openRead(dir);
for (auto& db : m_extraDbs) {
if (error)
*error = DbOpenExtraDb;
LOGDEB("Db::Open: adding query db [" << &(*it) << "]\n");
LOGDEB("Db::Open: adding query db [" << &db << "]\n");
// An error here used to be non-fatal (1.13 and older)
// but I can't see why
m_ndb->xrdb.add_database(Xapian::Database(*it));
m_ndb->xrdb.add_database(Xapian::Database(db));
}
break;
}
@ -1489,7 +1542,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
LOGDEB("Db::addOrUpdate: split failed for main text\n");
} else {
#if defined(RAWTEXT_IN_METADATA)
if (o_index_storedoctext) {
if (m_ndb->m_storetext) {
ZLibUtBuf buf;
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
rawztext.assign(buf.getBuf(), buf.getCnt());
@ -1707,7 +1760,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
}
#ifdef RAWTEXT_IN_DATA
if (o_index_storedoctext) {
if (m_ndb->m_storetext) {
RECORD_APPEND(record, string("RAWTEXT"),
neutchars(doc.text, cstr_nc));
}

View File

@ -92,6 +92,7 @@ class Db::Native {
bool m_isopen;
bool m_iswritable;
bool m_noversionwrite; //Set if open failed because of version mismatch!
bool m_storetext{false};
#ifdef IDX_THREADS
WorkQueue<DbUpdTask*> m_wqueue;
std::mutex m_mutex;
@ -112,6 +113,9 @@ class Db::Native {
friend void *DbUpdWorker(void*);
#endif // IDX_THREADS
void openWrite(const std::string& dir, Db::OpenMode mode);
void openRead(const string& dir);
// Final steps of doc update, part which need to be single-threaded
bool addOrUpdateWrite(const string& udi, const string& uniterm,
Xapian::Document *doc, size_t txtlen

View File

@ -233,16 +233,26 @@ indexStripChars = 1
# <var name="indexStoreDocText" type="bool"><brief>Decide if we store the
# documents' text content in the index.</brief><descr>Storing the text
# allows extracting snippets from it at query time,
# instead of building them from index position data. This Has become
# necessary for versions of Xapian 1.6, which have dropped support
# for the chert index format, and adopted a setup which renders our
# use of positions list unacceptably slow in cases. 'raw' text here
# means that the text is not stripped of upper-case, diacritics, or
# punctuation signs. It is still translated from its original format
# to UTF-8 plain text. This increases the index size by 10-20% typically,
# but also allows for nicer snippets, so it may be worth enabling it even
# if not strictly needed for performance if you can afford the space.
# allows extracting snippets from it at query time, instead of building
# them from index position data.
#
# Newer Xapian index formats have rendered our use of position lists
# unacceptably slow in some cases. The last Xapian index format with good
# performance for the old method is Chert, which is the default for 1.2, is
# still supported but not the default in 1.4, and will be dropped in 1.6.
#
# The document text is translated from its original format to UTF-8 plain
# text, but not stripped of upper-case, diacritics, or punctuation
# signs. Storing it increases the index size by 10-20% typically, but also
# allows for nicer snippets, so it may be worth enabling it even if not
# strictly needed for performance if you can afford the space.
#
# The variable only has an effect when creating an index, tested as
# xapiandb directory not existing. Its exact effect depends on the Xapian
# version. For Xapian 1.2, you can force the new method by setting the
# variable to 1. For Xapian 1.4, the Chert format will be used, and the text
# will not be stored if the variable is not set or set to 0. For later
# Xapian versions, the variable does nothing, the text is always stored.
# </descr></var>
indexStoreDocText = 0