indexStoreDocText config variable, fix terms and conditions when we store the doc text or not
This commit is contained in:
parent
491dfb5f3b
commit
60cb39c935
@ -649,7 +649,7 @@ int Query::Native::makeAbstract(Xapian::docid docid,
|
|||||||
LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
|
LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
|
||||||
maxtotaloccs << " ctxwords " << ctxwords << "\n");
|
maxtotaloccs << " ctxwords " << ctxwords << "\n");
|
||||||
|
|
||||||
if (o_index_storedoctext) {
|
if (ndb->m_storetext) {
|
||||||
return abstractFromText(ndb, docid, matchedTerms, byQ,
|
return abstractFromText(ndb, docid, matchedTerms, byQ,
|
||||||
totalweight, ctxwords, maxtotaloccs, vabs,
|
totalweight, ctxwords, maxtotaloccs, vabs,
|
||||||
chron);
|
chron);
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 2004 J.F.Dockes
|
/* Copyright (C) 2004-2018 J.F.Dockes
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -63,10 +63,21 @@ using namespace std;
|
|||||||
#endif
|
#endif
|
||||||
#include "zlibut.h"
|
#include "zlibut.h"
|
||||||
|
|
||||||
|
#ifndef XAPIAN_AT_LEAST
|
||||||
|
// Added in Xapian 1.4.2. Define it here for older versions.
// Preprocessor test which is true when the Xapian version we are building
// against (XAPIAN_MAJOR_VERSION.XAPIAN_MINOR_VERSION.XAPIAN_REVISION) is
// A.B.C or later. The components are compared in order: major, then
// minor, then revision.
|
||||||
|
#define XAPIAN_AT_LEAST(A,B,C) \
|
||||||
|
(XAPIAN_MAJOR_VERSION > (A) || \
|
||||||
|
(XAPIAN_MAJOR_VERSION == (A) && \
|
||||||
|
(XAPIAN_MINOR_VERSION > (B) || \
|
||||||
|
(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// Recoll index format version is stored in user metadata. When this changes,
|
// Recoll index format version is stored in user metadata. When this changes,
|
||||||
// we can't open the db and will have to reindex.
|
// we can't open the db and will have to reindex.
|
||||||
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
||||||
static const string cstr_RCL_IDX_VERSION("1");
|
static const string cstr_RCL_IDX_VERSION("1");
|
||||||
|
static const string cstr_RCL_IDX_DESCRIPTOR_KEY("RCL_IDX_DESCRIPTOR_KEY");
|
||||||
|
|
||||||
static const string cstr_mbreaks("rclmbreaks");
|
static const string cstr_mbreaks("rclmbreaks");
|
||||||
|
|
||||||
@ -242,9 +253,85 @@ void Db::Native::maybeStartThreads()
|
|||||||
|
|
||||||
#endif // IDX_THREADS
|
#endif // IDX_THREADS
|
||||||
|
|
||||||
|
void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
|
||||||
|
{
|
||||||
|
int action = (mode == Db::DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
|
||||||
|
Xapian::DB_CREATE_OR_OVERWRITE;
|
||||||
|
|
||||||
|
if (::access(dir.c_str(), 0) == 0) {
|
||||||
|
// Existing index
|
||||||
|
xwdb = Xapian::WritableDatabase(dir, action);
|
||||||
|
} else {
|
||||||
|
// New index. If possible, and depending on config, use a stub
|
||||||
|
// to force using Chert. No sense in doing this if we are
|
||||||
|
// storing the text anyway.
|
||||||
|
#if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND
|
||||||
|
// New Xapian with Chert support. Use Chert and the old
|
||||||
|
// abstract generation method, except if told otherwise by the
|
||||||
|
// configuration.
|
||||||
|
if (o_index_storedoctext) {
|
||||||
|
xwdb = Xapian::WritableDatabase(dir, action);
|
||||||
|
m_storetext = true;
|
||||||
|
} else {
|
||||||
|
// Force Chert format, don't store the text.
|
||||||
|
string stub = path_cat(m_rcldb->m_config->getConfDir(),
|
||||||
|
"xapian.stub");
|
||||||
|
FILE *fp = fopen(stub.c_str(), "w");
|
||||||
|
if (nullptr == fp) {
|
||||||
|
throw(string("Can't create ") + stub);
|
||||||
|
}
|
||||||
|
fprintf(fp, "chert %s\n", dir.c_str());
|
||||||
|
fclose(fp);
|
||||||
|
xwdb = Xapian::WritableDatabase(stub, action);
|
||||||
|
m_storetext = false;
|
||||||
|
}
|
||||||
|
#elif ! XAPIAN_AT_LEAST(1,3,0)
|
||||||
|
// Old Xapian. Use the default index format and let the user
|
||||||
|
// decide of the abstract generation method.
|
||||||
|
xwdb = Xapian::WritableDatabase(dir, action);
|
||||||
|
m_storetext = o_index_storedoctext;
|
||||||
|
#else
|
||||||
|
// Newer Xapian with no Chert support. Store the text.
|
||||||
|
xwdb = Xapian::WritableDatabase(dir, action);
|
||||||
|
m_storetext = true;
|
||||||
|
#endif
|
||||||
|
// Set the storetext value inside the index descriptor (new
|
||||||
|
// with recoll 1.24, maybe we'll have other stuff to store in
|
||||||
|
// there in the future).
|
||||||
|
string desc = string("storetext=") + (m_storetext ? "1" : "0") + "\n";
|
||||||
|
xwdb.set_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY, desc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the index is empty, write the data format version at once
|
||||||
|
// to avoid stupid error messages:
|
||||||
|
if (xwdb.get_doccount() == 0) {
|
||||||
|
xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
|
||||||
|
}
|
||||||
|
|
||||||
|
m_iswritable = true;
|
||||||
|
|
||||||
|
#ifdef IDX_THREADS
|
||||||
|
maybeStartThreads();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void Db::Native::openRead(const string& dir)
|
||||||
|
{
|
||||||
|
m_iswritable = false;
|
||||||
|
xrdb = Xapian::Database(dir);
|
||||||
|
string desc = xrdb.get_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY);
|
||||||
|
ConfSimple cf(desc, 1);
|
||||||
|
string val;
|
||||||
|
m_storetext = false;
|
||||||
|
if (cf.get("storetext", val) && stringToBool(val)) {
|
||||||
|
m_storetext = true;
|
||||||
|
}
|
||||||
|
LOGDEB("Db::openRead: index " << (m_storetext?"stores":"does not store") <<
|
||||||
|
" document text\n");
|
||||||
|
}
|
||||||
|
|
||||||
/* See comment in class declaration: return all subdocuments of a
|
/* See comment in class declaration: return all subdocuments of a
|
||||||
* document given by its unique id.
|
* document given by its unique id. */
|
||||||
*/
|
|
||||||
bool Db::Native::subDocs(const string &udi, int idxi,
|
bool Db::Native::subDocs(const string &udi, int idxi,
|
||||||
vector<Xapian::docid>& docids)
|
vector<Xapian::docid>& docids)
|
||||||
{
|
{
|
||||||
@ -782,6 +869,7 @@ vector<string> Db::getStemmerNames()
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool Db::open(OpenMode mode, OpenError *error)
|
bool Db::open(OpenMode mode, OpenError *error)
|
||||||
{
|
{
|
||||||
if (error)
|
if (error)
|
||||||
@ -808,63 +896,28 @@ bool Db::open(OpenMode mode, OpenError *error)
|
|||||||
switch (mode) {
|
switch (mode) {
|
||||||
case DbUpd:
|
case DbUpd:
|
||||||
case DbTrunc:
|
case DbTrunc:
|
||||||
{
|
m_ndb->openWrite(dir, mode);
|
||||||
int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
|
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
|
||||||
Xapian::DB_CREATE_OR_OVERWRITE;
|
// We used to open a readonly object in addition to the
|
||||||
if (!o_index_storedoctext && ::access(dir.c_str(), 0) != 0) {
|
// r/w one because some operations were faster when
|
||||||
// New index. use a stub to force using Chert. No
|
// performed through a Database: no forced flushes on
|
||||||
// sense in doing this if we are storing the text
|
// allterms_begin(), used in subDocs(). This issue has
|
||||||
// anyway.
|
// been gone for a long time (now: Xapian 1.2) and the
|
||||||
string stub = path_cat(m_config->getConfDir(),
|
// separate objects seem to trigger other Xapian issues,
|
||||||
"xapian.stub");
|
// so the query db is now a clone of the update one.
|
||||||
FILE *fp = fopen(stub.c_str(), "w");
|
m_ndb->xrdb = m_ndb->xwdb;
|
||||||
if (nullptr == fp) {
|
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
|
||||||
throw(string("Can't create ") + stub);
|
|
||||||
}
|
|
||||||
fprintf(fp, "chert %s\n", dir.c_str());
|
|
||||||
fclose(fp);
|
|
||||||
m_ndb->xwdb = Xapian::WritableDatabase(stub, action);
|
|
||||||
} else {
|
|
||||||
m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
|
|
||||||
}
|
|
||||||
// If db is empty, write the data format version at once
|
|
||||||
// to avoid stupid error messages:
|
|
||||||
if (m_ndb->xwdb.get_doccount() == 0)
|
|
||||||
m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY,
|
|
||||||
cstr_RCL_IDX_VERSION);
|
|
||||||
m_ndb->m_iswritable = true;
|
|
||||||
#ifdef IDX_THREADS
|
|
||||||
m_ndb->maybeStartThreads();
|
|
||||||
#endif
|
|
||||||
// We used to open a readonly object in addition to
|
|
||||||
// the r/w one because some operations were faster
|
|
||||||
// when performed through a Database: no forced
|
|
||||||
// flushes on allterms_begin(), used in
|
|
||||||
// subDocs(). This issue has been gone for a long time
|
|
||||||
// (now: Xapian 1.2) and the separate objects seem to
|
|
||||||
// trigger other Xapian issues, so the query db is now
|
|
||||||
// a clone of the update one.
|
|
||||||
m_ndb->xrdb = m_ndb->xwdb;
|
|
||||||
LOGDEB("Db::open: lastdocid: " << m_ndb->xwdb.get_lastdocid() <<
|
|
||||||
"\n");
|
|
||||||
LOGDEB2("Db::open: resetting updated\n");
|
|
||||||
updated.resize(m_ndb->xwdb.get_lastdocid() + 1);
|
|
||||||
for (unsigned int i = 0; i < updated.size(); i++)
|
|
||||||
updated[i] = false;
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case DbRO:
|
case DbRO:
|
||||||
default:
|
default:
|
||||||
m_ndb->m_iswritable = false;
|
m_ndb->openRead(dir);
|
||||||
m_ndb->xrdb = Xapian::Database(dir);
|
for (auto& db : m_extraDbs) {
|
||||||
for (vector<string>::iterator it = m_extraDbs.begin();
|
|
||||||
it != m_extraDbs.end(); it++) {
|
|
||||||
if (error)
|
if (error)
|
||||||
*error = DbOpenExtraDb;
|
*error = DbOpenExtraDb;
|
||||||
LOGDEB("Db::Open: adding query db [" << &(*it) << "]\n");
|
LOGDEB("Db::Open: adding query db [" << &db << "]\n");
|
||||||
// An error here used to be non-fatal (1.13 and older)
|
// An error here used to be non-fatal (1.13 and older)
|
||||||
// but I can't see why
|
// but I can't see why
|
||||||
m_ndb->xrdb.add_database(Xapian::Database(*it));
|
m_ndb->xrdb.add_database(Xapian::Database(db));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1489,7 +1542,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
||||||
} else {
|
} else {
|
||||||
#if defined(RAWTEXT_IN_METADATA)
|
#if defined(RAWTEXT_IN_METADATA)
|
||||||
if (o_index_storedoctext) {
|
if (m_ndb->m_storetext) {
|
||||||
ZLibUtBuf buf;
|
ZLibUtBuf buf;
|
||||||
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
|
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
|
||||||
rawztext.assign(buf.getBuf(), buf.getCnt());
|
rawztext.assign(buf.getBuf(), buf.getCnt());
|
||||||
@ -1707,7 +1760,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef RAWTEXT_IN_DATA
|
#ifdef RAWTEXT_IN_DATA
|
||||||
if (o_index_storedoctext) {
|
if (m_ndb->m_storetext) {
|
||||||
RECORD_APPEND(record, string("RAWTEXT"),
|
RECORD_APPEND(record, string("RAWTEXT"),
|
||||||
neutchars(doc.text, cstr_nc));
|
neutchars(doc.text, cstr_nc));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -92,6 +92,7 @@ class Db::Native {
|
|||||||
bool m_isopen;
|
bool m_isopen;
|
||||||
bool m_iswritable;
|
bool m_iswritable;
|
||||||
bool m_noversionwrite; //Set if open failed because of version mismatch!
|
bool m_noversionwrite; //Set if open failed because of version mismatch!
|
||||||
|
bool m_storetext{false};
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
WorkQueue<DbUpdTask*> m_wqueue;
|
WorkQueue<DbUpdTask*> m_wqueue;
|
||||||
std::mutex m_mutex;
|
std::mutex m_mutex;
|
||||||
@ -112,6 +113,9 @@ class Db::Native {
|
|||||||
friend void *DbUpdWorker(void*);
|
friend void *DbUpdWorker(void*);
|
||||||
#endif // IDX_THREADS
|
#endif // IDX_THREADS
|
||||||
|
|
||||||
|
void openWrite(const std::string& dir, Db::OpenMode mode);
|
||||||
|
void openRead(const string& dir);
|
||||||
|
|
||||||
// Final steps of doc update, part which need to be single-threaded
|
// Final steps of doc update, part which need to be single-threaded
|
||||||
bool addOrUpdateWrite(const string& udi, const string& uniterm,
|
bool addOrUpdateWrite(const string& udi, const string& uniterm,
|
||||||
Xapian::Document *doc, size_t txtlen
|
Xapian::Document *doc, size_t txtlen
|
||||||
|
|||||||
@ -233,16 +233,26 @@ indexStripChars = 1
|
|||||||
|
|
||||||
# <var name="indexStoreDocText" type="bool"><brief>Decide if we store the
|
# <var name="indexStoreDocText" type="bool"><brief>Decide if we store the
|
||||||
# documents' text content in the index.</brief><descr>Storing the text
|
# documents' text content in the index.</brief><descr>Storing the text
|
||||||
# allows extracting snippets from it at query time,
|
# allows extracting snippets from it at query time, instead of building
|
||||||
# instead of building them from index position data. This Has become
|
# them from index position data.
|
||||||
# necessary for versions of Xapian 1.6, which have dropped support
|
#
|
||||||
# for the chert index format, and adopted a setup which renders our
|
# Newer Xapian index formats have rendered our use of positions list
|
||||||
# use of positions list unacceptably slow in cases. 'raw' text here
|
# unacceptably slow in some cases. The last Xapian index format with good
|
||||||
# means that the text is not stripped of upper-case, diacritics, or
|
# performance for the old method is Chert, which is default for 1.2, still
|
||||||
# punctuation signs. It is still translated from its original format
|
# supported but not default in 1.4 and will be dropped in 1.6.
|
||||||
# to UTF-8 plain text. This increases the index size by 10-20% typically,
|
#
|
||||||
# but also allows for nicer snippets, so it may be worth enabling it even
|
# The document text is translated from its original format to UTF-8 plain
|
||||||
# if not strictly needed for performance if you can afford the space.
|
# text, but not stripped of upper-case, diacritics, or punctuation
|
||||||
|
# signs. Storing it increases the index size by 10-20% typically, but also
|
||||||
|
# allows for nicer snippets, so it may be worth enabling it even if not
|
||||||
|
# strictly needed for performance if you can afford the space.
|
||||||
|
#
|
||||||
|
# The variable only has an effect when creating an index, tested as
|
||||||
|
# xapiandb directory not existing. Its exact effect depends on the Xapian
|
||||||
|
# version. For Xapian 1.2, you can force the new method by setting the
|
||||||
|
# variable to 1. For Xapian 1.4, the Chert format will be used, and the text
|
||||||
|
# will not be stored if the variable is not set or set to 0. For later
|
||||||
|
# Xapian versions, the variable does nothing, the text is always stored.
|
||||||
# </descr></var>
|
# </descr></var>
|
||||||
indexStoreDocText = 0
|
indexStoreDocText = 0
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user