Make sure that the Python rclextract idoctofile() method always retrieves an uncompressed file of the correct MIME type. Also includes miscellaneous comment updates.

This commit is contained in:
Jean-Francois Dockes 2017-07-20 12:52:24 +02:00
parent 32e79d301b
commit 29c6f75423
6 changed files with 114 additions and 83 deletions

View File

@ -118,7 +118,6 @@ bool FileInterner::ipathContains(const string& parent, const string& child)
// Split into "constructor calls init()" to allow use from other constructor // Split into "constructor calls init()" to allow use from other constructor
FileInterner::FileInterner(const string &fn, const struct stat *stp, FileInterner::FileInterner(const string &fn, const struct stat *stp,
RclConfig *cnf, int flags, const string *imime) RclConfig *cnf, int flags, const string *imime)
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
{ {
LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n"); LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
if (fn.empty()) { if (fn.empty()) {
@ -219,8 +218,18 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n"); LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
} }
// Look for appropriate handler (might still return empty) // Get fields computed from extended attributes. We use the
// original file, not the m_fn which may be the uncompressed temp
// file
if (!m_noxattrs)
reapXAttrs(m_cfg, f, m_XAttrsFields);
// Gather metadata from external commands as configured.
reapMetaCmds(m_cfg, f, m_cmdFields);
m_mimetype = l_mime; m_mimetype = l_mime;
// Look for appropriate handler (might still return empty)
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview); RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
if (!df || df->is_unknown()) { if (!df || df->is_unknown()) {
@ -234,15 +243,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
m_forPreview ? "view" : "index"); m_forPreview ? "view" : "index");
df->set_property(Dijon::Filter::DJF_UDI, udi); df->set_property(Dijon::Filter::DJF_UDI, udi);
// Get fields computed from extended attributes. We use the
// original file, not the m_fn which may be the uncompressed temp
// file
if (!m_noxattrs)
reapXAttrs(m_cfg, f, m_XAttrsFields);
// Gather metadata from external commands as configured.
reapMetaCmds(m_cfg, f, m_cmdFields);
df->set_docsize(docsize); df->set_docsize(docsize);
if (!df->set_document_file(l_mime, m_fn)) { if (!df->set_document_file(l_mime, m_fn)) {
delete df; delete df;
@ -258,7 +258,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
// Setup from memory data (ie: out of the web cache). imime needs to be set. // Setup from memory data (ie: out of the web cache). imime needs to be set.
FileInterner::FileInterner(const string &data, RclConfig *cnf, FileInterner::FileInterner(const string &data, RclConfig *cnf,
int flags, const string& imime) int flags, const string& imime)
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
{ {
LOGDEB0("FileInterner::FileInterner(data)\n"); LOGDEB0("FileInterner::FileInterner(data)\n");
initcommon(cnf, flags); initcommon(cnf, flags);
@ -313,7 +312,7 @@ void FileInterner::init(const string &data, RclConfig *cnf,
void FileInterner::initcommon(RclConfig *cnf, int flags) void FileInterner::initcommon(RclConfig *cnf, int flags)
{ {
m_cfg = cnf; m_cfg = cnf;
m_forPreview = ((flags & FIF_forPreview) != 0); m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0);
// Initialize handler stack. // Initialize handler stack.
m_handlers.reserve(MAXHANDLERS); m_handlers.reserve(MAXHANDLERS);
for (unsigned int i = 0; i < MAXHANDLERS; i++) for (unsigned int i = 0; i < MAXHANDLERS; i++)
@ -324,7 +323,6 @@ void FileInterner::initcommon(RclConfig *cnf, int flags)
} }
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags) FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
: m_ok(false), m_missingdatap(0), m_uncomp(((flags & FIF_forPreview) != 0))
{ {
LOGDEB0("FileInterner::FileInterner(idoc)\n"); LOGDEB0("FileInterner::FileInterner(idoc)\n");
initcommon(cnf, flags); initcommon(cnf, flags);
@ -347,6 +345,9 @@ FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
init(rawdoc.data, cnf, flags, idoc.mimetype); init(rawdoc.data, cnf, flags, idoc.mimetype);
break; break;
case DocFetcher::RawDoc::RDK_DATADIRECT: case DocFetcher::RawDoc::RDK_DATADIRECT:
// Note: only used for demo with the sample python external
// mbox indexer at this point. The external program is
// responsible for all the extraction process.
init(rawdoc.data, cnf, flags, idoc.mimetype); init(rawdoc.data, cnf, flags, idoc.mimetype);
m_direct = true; m_direct = true;
break; break;
@ -735,8 +736,8 @@ int FileInterner::addHandler()
} }
} }
if (!setres) { if (!setres) {
LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn << LOGINFO("FileInterner::addHandler: set_doc failed inside [" << m_fn <<
" for mtype " << mimetype << "\n"); "] for mtype " << mimetype << "\n");
delete newflt; delete newflt;
if (m_forPreview) if (m_forPreview)
return ADD_ERROR; return ADD_ERROR;
@ -918,36 +919,24 @@ bool FileInterner::tempFileForMT(TempFile& otemp, RclConfig* cnf,
TempFile temp(new TempFileInternal( TempFile temp(new TempFileInternal(
cnf->getSuffixFromMimeType(mimetype))); cnf->getSuffixFromMimeType(mimetype)));
if (!temp->ok()) { if (!temp->ok()) {
LOGERR("FileInterner::interntofile: can't create temp file\n"); LOGERR("FileInterner::tempFileForMT: can't create temp file\n");
return false; return false;
} }
otemp = temp; otemp = temp;
return true; return true;
} }
// Extract document (typically subdoc of multidoc) into temporary file. // Static method, creates a FileInterner object to do the job.
// We do the usual internfile stuff: create a temporary directory, bool FileInterner::idocToFile(
// then create an interner and call internfile. The target mtype is set to TempFile& otemp, const string& tofile, RclConfig *cnf,
// the input mtype, so that no data conversion is performed. const Rcl::Doc& idoc, bool uncompress)
// We then write the data out of the resulting document into the output file.
// There are two temporary objects:
// - The internfile temporary directory gets destroyed by its destructor
// - The output temporary file which is held in a reference-counted
// object and will be deleted when done with.
//
// If the ipath is null, maybe we're called because the file is not
// stored in the regular file system. We use the docfetcher to get a
// copy (in topdocToFile())
//
// We currently don't handle the case of an internal doc of a non-fs document.
bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
RclConfig *cnf, const Rcl::Doc& idoc)
{ {
LOGDEB("FileInterner::idocToFile\n"); LOGDEB("FileInterner::idocToFile\n");
if (idoc.ipath.empty()) { if (idoc.ipath.empty()) {
return topdocToFile(otemp, tofile, cnf, idoc); // Because of the mandatory first conversion in the
// FileInterner constructor, need to use a specific method.
return topdocToFile(otemp, tofile, cnf, idoc, uncompress);
} }
// We set FIF_forPreview for consistency with the previous version // We set FIF_forPreview for consistency with the previous version
@ -958,17 +947,21 @@ bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype); return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
} }
bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile, // This is only needed because the FileInterner constructor always performs
RclConfig *cnf, const Rcl::Doc& idoc) // the first conversion, so that we need another approach for accessing the
// original document (targetmtype won't do).
bool FileInterner::topdocToFile(
TempFile& otemp, const string& tofile,
RclConfig *cnf, const Rcl::Doc& idoc, bool uncompress)
{ {
DocFetcher *fetcher = docFetcherMake(cnf, idoc); DocFetcher *fetcher = docFetcherMake(cnf, idoc);
if (fetcher == 0) { if (fetcher == 0) {
LOGERR("FileInterner::idocToFile no backend\n"); LOGERR("FileInterner::topdocToFile no backend\n");
return false; return false;
} }
DocFetcher::RawDoc rawdoc; DocFetcher::RawDoc rawdoc;
if (!fetcher->fetch(cnf, idoc, rawdoc)) { if (!fetcher->fetch(cnf, idoc, rawdoc)) {
LOGERR("FileInterner::idocToFile fetcher failed\n"); LOGERR("FileInterner::topdocToFile fetcher failed\n");
return false; return false;
} }
const char *filename = ""; const char *filename = "";
@ -983,13 +976,24 @@ bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
} }
string reason; string reason;
switch (rawdoc.kind) { switch (rawdoc.kind) {
case DocFetcher::RawDoc::RDK_FILENAME: case DocFetcher::RawDoc::RDK_FILENAME: {
if (!copyfile(rawdoc.data.c_str(), filename, reason)) { string fn(rawdoc.data);
TempFile temp;
if (uncompress && isCompressed(fn, cnf)) {
if (!maybeUncompressToTemp(temp, fn, cnf, idoc)) {
LOGERR("FileInterner::idocToFile: uncompress failed\n");
return false;
}
}
fn = temp ? temp->filename() : rawdoc.data;
if (!copyfile(fn.c_str(), filename, reason)) {
LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n"); LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
return false; return false;
} }
}
break; break;
case DocFetcher::RawDoc::RDK_DATA: case DocFetcher::RawDoc::RDK_DATA:
case DocFetcher::RawDoc::RDK_DATADIRECT:
if (!stringtofile(rawdoc.data, filename, reason)) { if (!stringtofile(rawdoc.data, filename, reason)) {
LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n"); LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
return false; return false;
@ -1019,11 +1023,12 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
} }
// Specialcase text/html. This is to work around a bug that will // Specialcase text/html. This is to work around a bug that will
// get fixed some day: internfile initialisation does not check // get fixed some day: the internfile constructor always loads the
// targetmtype, so that at least one conversion is always // first handler so that at least one conversion is always
// performed. A common case would be an "Open" on an html file // performed (and the access to the original data may be lost). A
// (we'd end up with text/plain content). As the html version is // common case is an "Open" on an HTML file (we end up
// saved in this case, use it. // with text/plain content). As the HTML version is saved in this
// case, use it.
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) { if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
doc.text = get_html(); doc.text = get_html();
doc.mimetype = cstr_texthtml; doc.mimetype = cstr_texthtml;

View File

@ -118,21 +118,20 @@ class FileInterner {
/** /**
* Alternate constructor for the case where the data is in memory. * Alternate constructor for the case where the data is in memory.
* This is mainly for data extracted from the web cache. The mime type * This is mainly for data extracted from the web cache.
* must be set, input must be already uncompressed. * The MIME type must be set, and the data must be uncompressed.
*/ */
FileInterner(const string &data, RclConfig *cnf, FileInterner(const string &data, RclConfig *cnf,
int flags, const string& mtype); int flags, const string& mtype);
/** /**
* Alternate constructor used at query time. We don't know where * Alternate constructor used at query time. We don't know where
* the data was stored, this is determined from the Rcl::Doc data * the data was stored, and use the fetcher interface to reach it.
* *
* @param idoc Rcl::Doc object built from index data. The back-end * @param idoc Rcl::Doc object built from index data. The back-end
* storage identifier (rclbes field) is used to build the * storage identifier (rclbes field) is used by the fetcher factory
* appropriate fetcher which uses the rest of the Doc fields (url, * to build the appropriate object to return a file name or data which
* ipath...) to retrieve the file or a file reference, which we * is then used with the appropriate init method.
* then process normally.
*/ */
FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags); FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags);
@ -214,10 +213,15 @@ class FileInterner {
* to query the right backend. Used to check up-to-dateness at query time */ * to query the right backend. Used to check up-to-dateness at query time */
static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig); static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig);
/** Extract internal document into temporary file. /** Extract internal document into temporary file, without converting the
* data.
*
* This is used mainly for starting an external viewer for a * This is used mainly for starting an external viewer for a
* subdocument (ie: mail attachment). This really would not need to be * subdocument (ie: mail attachment), but, for consistency, it also
* a member. It creates a FileInterner object to do the actual work * works with a top level (null ipath) document.
* This would not actually need to be a member method, it creates a
* FileInterner object to do the actual work.
*
* @return true for success. * @return true for success.
* @param temp output reference-counted temp file object (goes * @param temp output reference-counted temp file object (goes
* away magically). Only used if tofile.empty() * away magically). Only used if tofile.empty()
@ -225,13 +229,14 @@ class FileInterner {
* @param cnf The recoll config * @param cnf The recoll config
* @param doc Doc data taken from the index. We use it to construct a * @param doc Doc data taken from the index. We use it to construct a
* FileInterner object. * FileInterner object.
* @param uncompress if true, uncompress compressed original doc. Only does
* anything for a top level document.
*/ */
static bool idocToFile(TempFile& temp, const string& tofile, static bool idocToFile(TempFile& temp, const string& tofile,
RclConfig *cnf, const Rcl::Doc& doc); RclConfig *cnf, const Rcl::Doc& doc,
bool uncompress = true);
/** /** Does file appear to be the compressed version of a document? */
* Does file appear to be the compressed version of a document?
*/
static bool isCompressed(const string& fn, RclConfig *cnf); static bool isCompressed(const string& fn, RclConfig *cnf);
/** /**
@ -253,7 +258,7 @@ class FileInterner {
string m_targetMType; string m_targetMType;
string m_reachedMType; // target or text/plain string m_reachedMType; // target or text/plain
string m_tfile; string m_tfile;
bool m_ok; // Set after construction if ok bool m_ok{false}; // Set after construction if ok
// Fields found in file extended attributes. This is kept here, // Fields found in file extended attributes. This is kept here,
// not in the file-level handler because we are only interested in // not in the file-level handler because we are only interested in
// the top-level file, not any temp file necessitated by // the top-level file, not any temp file necessitated by
@ -270,7 +275,7 @@ class FileInterner {
vector<TempFile> m_tempfiles; vector<TempFile> m_tempfiles;
// Error data if any // Error data if any
string m_reason; string m_reason;
FIMissingStore *m_missingdatap; FIMissingStore *m_missingdatap{nullptr};
Uncomp m_uncomp; Uncomp m_uncomp;
@ -294,7 +299,8 @@ class FileInterner {
static bool tempFileForMT(TempFile& otemp, RclConfig *cnf, static bool tempFileForMT(TempFile& otemp, RclConfig *cnf,
const std::string& mimetype); const std::string& mimetype);
static bool topdocToFile(TempFile& otemp, const std::string& tofile, static bool topdocToFile(TempFile& otemp, const std::string& tofile,
RclConfig *cnf, const Rcl::Doc& idoc); RclConfig *cnf, const Rcl::Doc& idoc,
bool uncompress);
}; };

View File

@ -44,12 +44,16 @@ typedef struct {
/* Type-specific fields go here. */ /* Type-specific fields go here. */
FileInterner *xtr; FileInterner *xtr;
RclConfig *rclconfig; RclConfig *rclconfig;
recoll_DocObject *docobject;
} rclx_ExtractorObject; } rclx_ExtractorObject;
static void static void
Extractor_dealloc(rclx_ExtractorObject *self) Extractor_dealloc(rclx_ExtractorObject *self)
{ {
LOGDEB("Extractor_dealloc\n" ); LOGDEB("Extractor_dealloc\n" );
if (self->docobject) {
Py_DECREF(&self->docobject);
}
delete self->xtr; delete self->xtr;
Py_TYPE(self)->tp_free((PyObject*)self); Py_TYPE(self)->tp_free((PyObject*)self);
} }
@ -64,6 +68,7 @@ Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
return 0; return 0;
self->xtr = 0; self->xtr = 0;
self->rclconfig = 0; self->rclconfig = 0;
self->docobject = 0;
return (PyObject *)self; return (PyObject *)self;
} }
@ -82,6 +87,9 @@ Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_AttributeError, "Null Doc ?"); PyErr_SetString(PyExc_AttributeError, "Null Doc ?");
return -1; return -1;
} }
self->docobject = dobj;
Py_INCREF(dobj);
self->rclconfig = dobj->rclconfig; self->rclconfig = dobj->rclconfig;
self->xtr = new FileInterner(*dobj->doc, self->rclconfig, self->xtr = new FileInterner(*dobj->doc, self->rclconfig,
FileInterner::FIF_forPreview); FileInterner::FIF_forPreview);
@ -177,14 +185,26 @@ Extractor_idoctofile(rclx_ExtractorObject* self, PyObject *args,
PyErr_SetString(PyExc_AttributeError, "idoctofile: null object"); PyErr_SetString(PyExc_AttributeError, "idoctofile: null object");
return 0; return 0;
} }
if (ipath.empty()) {
PyErr_SetString(PyExc_ValueError, "idoctofile: null ipath");
return 0;
}
self->xtr->setTargetMType(mimetype); // If ipath is empty and we want the original mimetype, we can't
// use FileInterner::internToFile() because the first conversion
// was performed by the FileInterner constructor, so that we can't
// reach the original object this way. Instead, if the data comes
// from a file (m_fn set), we just copy it, else, we call
// idoctofile, which will call topdoctofile (and re-fetch the
// data, yes, wasteful)
TempFile temp; TempFile temp;
bool status = self->xtr->interntofile(temp, outfile, ipath, mimetype); bool status = false;
LOGDEB("Extractor_idoctofile: ipath [" << ipath << "] mimetype [" <<
mimetype << "] doc mimetype [" << self->docobject->doc->mimetype <<
"\n");
if (ipath.empty() && !mimetype.compare(self->docobject->doc->mimetype)) {
status = FileInterner::idocToFile(temp, outfile, self->rclconfig,
*self->docobject->doc);
} else {
self->xtr->setTargetMType(mimetype);
status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
}
if (!status) { if (!status) {
PyErr_SetString(PyExc_AttributeError, "interntofile failure"); PyErr_SetString(PyExc_AttributeError, "interntofile failure");
return 0; return 0;

View File

@ -55,8 +55,8 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs)
LOGDEB2("multiSave: got dir " << (dir) << "\n" ); LOGDEB2("multiSave: got dir " << (dir) << "\n" );
/* Save doc to files in target directory. Issues: /* Save doc to files in target directory. Issues:
- It is quite common to have docs in the array with the save - It is quite common to have docs in the array with the same
file names, e.g. all messages in a folder have the save file file names, e.g. all messages in a folder have the same file
name (the folder's). name (the folder's).
- There is no warranty that the ipath is going to be acceptable - There is no warranty that the ipath is going to be acceptable
as a file name or interesting at all. We don't use it. as a file name or interesting at all. We don't use it.
@ -131,7 +131,7 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs)
} }
// There is still a race condition here, should we care ? // There is still a race condition here, should we care ?
TempFile temp;// not used TempFile temp;// not used
if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i])) { if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i], false)) {
QMessageBox::warning(0, "Recoll", QMessageBox::warning(0, "Recoll",
QWidget::tr("Cannot extract document: ") + QWidget::tr("Cannot extract document: ") +
QString::fromLocal8Bit(docs[i].url.c_str()) + QString::fromLocal8Bit(docs[i].url.c_str()) +

View File

@ -819,10 +819,10 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
// If this is an image, display it instead of the text. // If this is an image, display it instead of the text.
if (!idoc.mimetype.compare(0, 6, "image/")) { if (!idoc.mimetype.compare(0, 6, "image/")) {
string fn = fileurltolocalpath(idoc.url); string fn = fileurltolocalpath(idoc.url);
// If the command wants a file but this is not a file url, or
// there is an ipath that it won't understand, we need a temp file:
theconfig->setKeyDir(fn.empty() ? "" : path_getfather(fn)); theconfig->setKeyDir(fn.empty() ? "" : path_getfather(fn));
// We want a real file, so if this comes from data or we have
// an ipath, create it.
if (fn.empty() || !idoc.ipath.empty()) { if (fn.empty() || !idoc.ipath.empty()) {
TempFile temp = lthr.tmpimg; TempFile temp = lthr.tmpimg;
if (temp) { if (temp) {

View File

@ -300,7 +300,8 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
bool enterHistory = false; bool enterHistory = false;
bool istempfile = false; bool istempfile = false;
LOGDEB("RclMain::startNV: groksipath " << (groksipath) << " wantsf " << (wantsfile) << " wantsparentf " << (wantsparentfile) << "\n" ); LOGDEB("RclMain::startNV: groksipath " << groksipath << " wantsf " <<
wantsfile << " wantsparentf " << wantsparentfile << "\n");
// If the command wants a file but this is not a file url, or // If the command wants a file but this is not a file url, or
// there is an ipath that it won't understand, we need a temp file: // there is an ipath that it won't understand, we need a temp file:
@ -328,8 +329,7 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
if (!fn.empty() && theconfig->mimeViewerNeedsUncomp(doc.mimetype)) { if (!fn.empty() && theconfig->mimeViewerNeedsUncomp(doc.mimetype)) {
if (access(fn.c_str(), R_OK) != 0) { if (access(fn.c_str(), R_OK) != 0) {
QMessageBox::warning(0, "Recoll", QMessageBox::warning(0, "Recoll",
tr("Can't access file: ") + tr("Can't access file: ") + u8s2qs(fn));
QString::fromLocal8Bit(fn.c_str()));
return; return;
} }
TempFile temp; TempFile temp;