make sure that python rclextract.idoctofile always retrieves an uncompressed file of the correct MIME type. + misc comments

This commit is contained in:
Jean-Francois Dockes 2017-07-20 12:52:24 +02:00
parent 32e79d301b
commit 29c6f75423
6 changed files with 114 additions and 83 deletions

View File

@ -118,7 +118,6 @@ bool FileInterner::ipathContains(const string& parent, const string& child)
// Split into "constructor calls init()" to allow use from other constructor
FileInterner::FileInterner(const string &fn, const struct stat *stp,
RclConfig *cnf, int flags, const string *imime)
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
{
LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
if (fn.empty()) {
@ -219,8 +218,18 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
}
// Look for appropriate handler (might still return empty)
// Get fields computed from extended attributes. We use the
// original file, not the m_fn which may be the uncompressed temp
// file
if (!m_noxattrs)
reapXAttrs(m_cfg, f, m_XAttrsFields);
// Gather metadata from external commands as configured.
reapMetaCmds(m_cfg, f, m_cmdFields);
m_mimetype = l_mime;
// Look for appropriate handler (might still return empty)
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
if (!df || df->is_unknown()) {
@ -234,15 +243,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
m_forPreview ? "view" : "index");
df->set_property(Dijon::Filter::DJF_UDI, udi);
// Get fields computed from extended attributes. We use the
// original file, not the m_fn which may be the uncompressed temp
// file
if (!m_noxattrs)
reapXAttrs(m_cfg, f, m_XAttrsFields);
// Gather metadata from external commands as configured.
reapMetaCmds(m_cfg, f, m_cmdFields);
df->set_docsize(docsize);
if (!df->set_document_file(l_mime, m_fn)) {
delete df;
@ -258,7 +258,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
// Setup from memory data (ie: out of the web cache). imime needs to be set.
FileInterner::FileInterner(const string &data, RclConfig *cnf,
int flags, const string& imime)
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
{
LOGDEB0("FileInterner::FileInterner(data)\n");
initcommon(cnf, flags);
@ -313,7 +312,7 @@ void FileInterner::init(const string &data, RclConfig *cnf,
void FileInterner::initcommon(RclConfig *cnf, int flags)
{
m_cfg = cnf;
m_forPreview = ((flags & FIF_forPreview) != 0);
m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0);
// Initialize handler stack.
m_handlers.reserve(MAXHANDLERS);
for (unsigned int i = 0; i < MAXHANDLERS; i++)
@ -324,7 +323,6 @@ void FileInterner::initcommon(RclConfig *cnf, int flags)
}
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
: m_ok(false), m_missingdatap(0), m_uncomp(((flags & FIF_forPreview) != 0))
{
LOGDEB0("FileInterner::FileInterner(idoc)\n");
initcommon(cnf, flags);
@ -347,6 +345,9 @@ FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
init(rawdoc.data, cnf, flags, idoc.mimetype);
break;
case DocFetcher::RawDoc::RDK_DATADIRECT:
// Note: only used for demo with the sample python external
// mbox indexer at this point. The external program is
// responsible for all the extraction process.
init(rawdoc.data, cnf, flags, idoc.mimetype);
m_direct = true;
break;
@ -735,8 +736,8 @@ int FileInterner::addHandler()
}
}
if (!setres) {
LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
" for mtype " << mimetype << "\n");
LOGINFO("FileInterner::addHandler: set_doc failed inside [" << m_fn <<
"] for mtype " << mimetype << "\n");
delete newflt;
if (m_forPreview)
return ADD_ERROR;
@ -918,36 +919,24 @@ bool FileInterner::tempFileForMT(TempFile& otemp, RclConfig* cnf,
TempFile temp(new TempFileInternal(
cnf->getSuffixFromMimeType(mimetype)));
if (!temp->ok()) {
LOGERR("FileInterner::interntofile: can't create temp file\n");
LOGERR("FileInterner::tempFileForMT: can't create temp file\n");
return false;
}
otemp = temp;
return true;
}
// Extract document (typically subdoc of multidoc) into temporary file.
// We do the usual internfile stuff: create a temporary directory,
// then create an interner and call internfile. The target mtype is set to
// the input mtype, so that no data conversion is performed.
// We then write the data out of the resulting document into the output file.
// There are two temporary objects:
// - The internfile temporary directory gets destroyed by its destructor
// - The output temporary file which is held in a reference-counted
// object and will be deleted when done with.
//
// If the ipath is null, maybe we're called because the file is not
// stored in the regular file system. We use the docfetcher to get a
// copy (in topdocToFile())
//
// We currently don't handle the case of an internal doc of a non-fs document.
bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
RclConfig *cnf, const Rcl::Doc& idoc)
// Static method, creates a FileInterner object to do the job.
bool FileInterner::idocToFile(
TempFile& otemp, const string& tofile, RclConfig *cnf,
const Rcl::Doc& idoc, bool uncompress)
{
LOGDEB("FileInterner::idocToFile\n");
if (idoc.ipath.empty()) {
return topdocToFile(otemp, tofile, cnf, idoc);
// Because of the mandatory first conversion in the
// FileInterner constructor, need to use a specific method.
return topdocToFile(otemp, tofile, cnf, idoc, uncompress);
}
// We set FIF_forPreview for consistency with the previous version
@ -958,17 +947,21 @@ bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
}
bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
RclConfig *cnf, const Rcl::Doc& idoc)
// This is only needed because the FileInterner constructor always performs
// the first conversion, so that we need another approach for accessing the
// original document (targetmtype won't do).
bool FileInterner::topdocToFile(
TempFile& otemp, const string& tofile,
RclConfig *cnf, const Rcl::Doc& idoc, bool uncompress)
{
DocFetcher *fetcher = docFetcherMake(cnf, idoc);
if (fetcher == 0) {
LOGERR("FileInterner::idocToFile no backend\n");
LOGERR("FileInterner::topdocToFile no backend\n");
return false;
}
DocFetcher::RawDoc rawdoc;
if (!fetcher->fetch(cnf, idoc, rawdoc)) {
LOGERR("FileInterner::idocToFile fetcher failed\n");
LOGERR("FileInterner::topdocToFile fetcher failed\n");
return false;
}
const char *filename = "";
@ -983,13 +976,24 @@ bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
}
string reason;
switch (rawdoc.kind) {
case DocFetcher::RawDoc::RDK_FILENAME:
if (!copyfile(rawdoc.data.c_str(), filename, reason)) {
case DocFetcher::RawDoc::RDK_FILENAME: {
string fn(rawdoc.data);
TempFile temp;
if (uncompress && isCompressed(fn, cnf)) {
if (!maybeUncompressToTemp(temp, fn, cnf, idoc)) {
LOGERR("FileInterner::idocToFile: uncompress failed\n");
return false;
}
}
fn = temp ? temp->filename() : rawdoc.data;
if (!copyfile(fn.c_str(), filename, reason)) {
LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
return false;
}
}
break;
case DocFetcher::RawDoc::RDK_DATA:
case DocFetcher::RawDoc::RDK_DATADIRECT:
if (!stringtofile(rawdoc.data, filename, reason)) {
LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
return false;
@ -1019,11 +1023,12 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
}
// Specialcase text/html. This is to work around a bug that will
// get fixed some day: internfile initialisation does not check
// targetmtype, so that at least one conversion is always
// performed. A common case would be an "Open" on an html file
// (we'd end up with text/plain content). As the html version is
// saved in this case, use it.
// get fixed some day: the internfile constructor always loads the
// first handler so that at least one conversion is always
// performed (and the access to the original data may be lost). A
// common case is an "Open" on an HTML file (we end up
// with text/plain content). As the HTML version is saved in this
// case, use it.
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
doc.text = get_html();
doc.mimetype = cstr_texthtml;

View File

@ -118,21 +118,20 @@ class FileInterner {
/**
* Alternate constructor for the case where the data is in memory.
* This is mainly for data extracted from the web cache. The mime type
* must be set, input must be already uncompressed.
* This is mainly for data extracted from the web cache.
* The MIME type must be set, and the data must be uncompressed.
*/
FileInterner(const string &data, RclConfig *cnf,
int flags, const string& mtype);
/**
* Alternate constructor used at query time. We don't know where
* the data was stored, this is determined from the Rcl::Doc data
* the data was stored, and use the fetcher interface to reach it.
*
* @param idoc Rcl::Doc object built from index data. The back-end
* storage identifier (rclbes field) is used to build the
* appropriate fetcher which uses the rest of the Doc fields (url,
* ipath...) to retrieve the file or a file reference, which we
* then process normally.
* storage identifier (rclbes field) is used by the fetcher factory
* to build the appropriate object to return a file name or data which
* is then used with the appropriate init method.
*/
FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags);
@ -214,10 +213,15 @@ class FileInterner {
* to query the right backend. Used to check up-to-dateness at query time */
static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig);
/** Extract internal document into temporary file.
* This is used mainly for starting an external viewer for a
* subdocument (ie: mail attachment). This really would not need to be
* a member. It creates a FileInterner object to do the actual work
/** Extract internal document into temporary file, without converting the
* data.
*
* This is used mainly for starting an external viewer for a
* subdocument (ie: mail attachment), but, for consistency, it also
* works with a top level (null ipath) document.
* This would not actually need to be a member method, it creates a
* FileInterner object to do the actual work.
*
* @return true for success.
* @param temp output reference-counted temp file object (goes
* away magically). Only used if tofile.empty()
@ -225,13 +229,14 @@ class FileInterner {
* @param cnf The recoll config
* @param doc Doc data taken from the index. We use it to construct a
* FileInterner object.
* @param uncompress if true, uncompress compressed original doc. Only does
* anything for a top level document.
*/
static bool idocToFile(TempFile& temp, const string& tofile,
RclConfig *cnf, const Rcl::Doc& doc);
RclConfig *cnf, const Rcl::Doc& doc,
bool uncompress = true);
/**
* Does file appear to be the compressed version of a document?
*/
/** Does file appear to be the compressed version of a document? */
static bool isCompressed(const string& fn, RclConfig *cnf);
/**
@ -253,7 +258,7 @@ class FileInterner {
string m_targetMType;
string m_reachedMType; // target or text/plain
string m_tfile;
bool m_ok; // Set after construction if ok
bool m_ok{false}; // Set after construction if ok
// Fields found in file extended attributes. This is kept here,
// not in the file-level handler because we are only interested in
// the top-level file, not any temp file necessitated by
@ -270,7 +275,7 @@ class FileInterner {
vector<TempFile> m_tempfiles;
// Error data if any
string m_reason;
FIMissingStore *m_missingdatap;
FIMissingStore *m_missingdatap{nullptr};
Uncomp m_uncomp;
@ -294,7 +299,8 @@ class FileInterner {
static bool tempFileForMT(TempFile& otemp, RclConfig *cnf,
const std::string& mimetype);
static bool topdocToFile(TempFile& otemp, const std::string& tofile,
RclConfig *cnf, const Rcl::Doc& idoc);
RclConfig *cnf, const Rcl::Doc& idoc,
bool uncompress);
};

View File

@ -44,12 +44,16 @@ typedef struct {
/* Type-specific fields go here. */
FileInterner *xtr;
RclConfig *rclconfig;
recoll_DocObject *docobject;
} rclx_ExtractorObject;
/**
 * Deallocator for Extractor objects.
 *
 * Releases the resources acquired in Extractor_init(): the FileInterner
 * and the reference held on the Doc object the extractor was built from,
 * then frees the object storage through the type's tp_free slot.
 *
 * @param self the extractor being destroyed.
 */
static void
Extractor_dealloc(rclx_ExtractorObject *self)
{
    LOGDEB("Extractor_dealloc\n" );
    // Destroy the interner first: it was constructed from the Doc object's
    // data and config pointer, so it should not outlive our reference to
    // the Doc.
    delete self->xtr;
    self->xtr = 0;
    // Drop the reference taken in Extractor_init(). The macro must receive
    // the object pointer itself: passing &self->docobject (the address of
    // the member) would decrement memory inside *self instead of the Doc's
    // reference count. Py_XDECREF tolerates a null pointer (object never
    // initialized).
    Py_XDECREF(self->docobject);
    self->docobject = 0;
    Py_TYPE(self)->tp_free((PyObject*)self);
}
@ -64,6 +68,7 @@ Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
return 0;
self->xtr = 0;
self->rclconfig = 0;
self->docobject = 0;
return (PyObject *)self;
}
@ -82,6 +87,9 @@ Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_AttributeError, "Null Doc ?");
return -1;
}
self->docobject = dobj;
Py_INCREF(dobj);
self->rclconfig = dobj->rclconfig;
self->xtr = new FileInterner(*dobj->doc, self->rclconfig,
FileInterner::FIF_forPreview);
@ -177,17 +185,29 @@ Extractor_idoctofile(rclx_ExtractorObject* self, PyObject *args,
PyErr_SetString(PyExc_AttributeError, "idoctofile: null object");
return 0;
}
if (ipath.empty()) {
PyErr_SetString(PyExc_ValueError, "idoctofile: null ipath");
return 0;
}
self->xtr->setTargetMType(mimetype);
// If ipath is empty and we want the original mimetype, we can't
// use FileInterner::interntofile() because the first conversion
// was performed by the FileInterner constructor, so that we can't
// reach the original object this way. Instead, if the data comes
// from a file (m_fn set), we just copy it, else, we call
// idocToFile(), which will call topdocToFile() (and re-fetch the
// data, yes, wasteful)
TempFile temp;
bool status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
bool status = false;
LOGDEB("Extractor_idoctofile: ipath [" << ipath << "] mimetype [" <<
mimetype << "] doc mimetype [" << self->docobject->doc->mimetype <<
"\n");
if (ipath.empty() && !mimetype.compare(self->docobject->doc->mimetype)) {
status = FileInterner::idocToFile(temp, outfile, self->rclconfig,
*self->docobject->doc);
} else {
self->xtr->setTargetMType(mimetype);
status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
}
if (!status) {
PyErr_SetString(PyExc_AttributeError, "interntofile failure");
return 0;
return 0;
}
if (outfile.empty())
temp->setnoremove(1);

View File

@ -55,8 +55,8 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs)
LOGDEB2("multiSave: got dir " << (dir) << "\n" );
/* Save doc to files in target directory. Issues:
- It is quite common to have docs in the array with the save
file names, e.g. all messages in a folder have the save file
- It is quite common to have docs in the array with the same
file names, e.g. all messages in a folder have the same file
name (the folder's).
- There is no guarantee that the ipath is going to be acceptable
as a file name or interesting at all. We don't use it.
@ -131,7 +131,7 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs)
}
// There is still a race condition here, should we care ?
TempFile temp;// not used
if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i])) {
if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i], false)) {
QMessageBox::warning(0, "Recoll",
QWidget::tr("Cannot extract document: ") +
QString::fromLocal8Bit(docs[i].url.c_str()) +

View File

@ -819,14 +819,14 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
// If this is an image, display it instead of the text.
if (!idoc.mimetype.compare(0, 6, "image/")) {
string fn = fileurltolocalpath(idoc.url);
// If the command wants a file but this is not a file url, or
// there is an ipath that it won't understand, we need a temp file:
theconfig->setKeyDir(fn.empty() ? "" : path_getfather(fn));
// We want a real file, so if this comes from data or we have
// an ipath, create it.
if (fn.empty() || !idoc.ipath.empty()) {
TempFile temp = lthr.tmpimg;
if (temp) {
LOGDEB1("Preview: load: got temp file from internfile\n" );
LOGDEB1("Preview: load: got temp file from internfile\n");
} else if (!FileInterner::idocToFile(temp, string(),
theconfig, idoc)) {
temp.reset(); // just in case.

View File

@ -300,7 +300,8 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
bool enterHistory = false;
bool istempfile = false;
LOGDEB("RclMain::startNV: groksipath " << (groksipath) << " wantsf " << (wantsfile) << " wantsparentf " << (wantsparentfile) << "\n" );
LOGDEB("RclMain::startNV: groksipath " << groksipath << " wantsf " <<
wantsfile << " wantsparentf " << wantsparentfile << "\n");
// If the command wants a file but this is not a file url, or
// there is an ipath that it won't understand, we need a temp file:
@ -328,8 +329,7 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
if (!fn.empty() && theconfig->mimeViewerNeedsUncomp(doc.mimetype)) {
if (access(fn.c_str(), R_OK) != 0) {
QMessageBox::warning(0, "Recoll",
tr("Can't access file: ") +
QString::fromLocal8Bit(fn.c_str()));
tr("Can't access file: ") + u8s2qs(fn));
return;
}
TempFile temp;