diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 136e288b..1c4a07c4 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -118,7 +118,6 @@ bool FileInterner::ipathContains(const string& parent, const string& child) // Split into "constructor calls init()" to allow use from other constructor FileInterner::FileInterner(const string &fn, const struct stat *stp, RclConfig *cnf, int flags, const string *imime) - : m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0) { LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n"); if (fn.empty()) { @@ -219,8 +218,18 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n"); } - // Look for appropriate handler (might still return empty) + // Get fields computed from extended attributes. We use the + // original file, not the m_fn which may be the uncompressed temp + // file + if (!m_noxattrs) + reapXAttrs(m_cfg, f, m_XAttrsFields); + + // Gather metadata from external commands as configured. + reapMetaCmds(m_cfg, f, m_cmdFields); + m_mimetype = l_mime; + + // Look for appropriate handler (might still return empty) RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview); if (!df || df->is_unknown()) { @@ -234,15 +243,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, m_forPreview ? "view" : "index"); df->set_property(Dijon::Filter::DJF_UDI, udi); - // Get fields computed from extended attributes. We use the - // original file, not the m_fn which may be the uncompressed temp - // file - if (!m_noxattrs) - reapXAttrs(m_cfg, f, m_XAttrsFields); - - // Gather metadata from external commands as configured. 
- reapMetaCmds(m_cfg, f, m_cmdFields); - df->set_docsize(docsize); if (!df->set_document_file(l_mime, m_fn)) { delete df; @@ -258,7 +258,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, // Setup from memory data (ie: out of the web cache). imime needs to be set. FileInterner::FileInterner(const string &data, RclConfig *cnf, int flags, const string& imime) - : m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0) { LOGDEB0("FileInterner::FileInterner(data)\n"); initcommon(cnf, flags); @@ -313,7 +312,7 @@ void FileInterner::init(const string &data, RclConfig *cnf, void FileInterner::initcommon(RclConfig *cnf, int flags) { m_cfg = cnf; - m_forPreview = ((flags & FIF_forPreview) != 0); + m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0); // Initialize handler stack. m_handlers.reserve(MAXHANDLERS); for (unsigned int i = 0; i < MAXHANDLERS; i++) @@ -324,7 +323,6 @@ void FileInterner::initcommon(RclConfig *cnf, int flags) } FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags) - : m_ok(false), m_missingdatap(0), m_uncomp(((flags & FIF_forPreview) != 0)) { LOGDEB0("FileInterner::FileInterner(idoc)\n"); initcommon(cnf, flags); @@ -347,6 +345,9 @@ FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags) init(rawdoc.data, cnf, flags, idoc.mimetype); break; case DocFetcher::RawDoc::RDK_DATADIRECT: + // Note: only used for demo with the sample python external + // mbox indexer at this point. The external program is + // responsible for all the extraction process. 
init(rawdoc.data, cnf, flags, idoc.mimetype); m_direct = true; break; @@ -735,8 +736,8 @@ int FileInterner::addHandler() } } if (!setres) { - LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn << - " for mtype " << mimetype << "\n"); + LOGINFO("FileInterner::addHandler: set_doc failed inside [" << m_fn << + "] for mtype " << mimetype << "\n"); delete newflt; if (m_forPreview) return ADD_ERROR; @@ -918,36 +919,24 @@ bool FileInterner::tempFileForMT(TempFile& otemp, RclConfig* cnf, TempFile temp(new TempFileInternal( cnf->getSuffixFromMimeType(mimetype))); if (!temp->ok()) { - LOGERR("FileInterner::interntofile: can't create temp file\n"); + LOGERR("FileInterner::tempFileForMT: can't create temp file\n"); return false; } otemp = temp; return true; } -// Extract document (typically subdoc of multidoc) into temporary file. -// We do the usual internfile stuff: create a temporary directory, -// then create an interner and call internfile. The target mtype is set to -// the input mtype, so that no data conversion is performed. -// We then write the data out of the resulting document into the output file. -// There are two temporary objects: -// - The internfile temporary directory gets destroyed by its destructor -// - The output temporary file which is held in a reference-counted -// object and will be deleted when done with. -// -// If the ipath is null, maybe we're called because the file is not -// stored in the regular file system. We use the docfetcher to get a -// copy (in topdocToFile()) -// -// We currently don't handle the case of an internal doc of a non-fs document. - -bool FileInterner::idocToFile(TempFile& otemp, const string& tofile, - RclConfig *cnf, const Rcl::Doc& idoc) +// Static method, creates a FileInterner object to do the job. 
+bool FileInterner::idocToFile( + TempFile& otemp, const string& tofile, RclConfig *cnf, + const Rcl::Doc& idoc, bool uncompress) { LOGDEB("FileInterner::idocToFile\n"); if (idoc.ipath.empty()) { - return topdocToFile(otemp, tofile, cnf, idoc); + // Because of the mandatory first conversion in the + // FileInterner constructor, need to use a specific method. + return topdocToFile(otemp, tofile, cnf, idoc, uncompress); } // We set FIF_forPreview for consistency with the previous version @@ -958,17 +947,21 @@ bool FileInterner::idocToFile(TempFile& otemp, const string& tofile, return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype); } -bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile, - RclConfig *cnf, const Rcl::Doc& idoc) +// This is only needed because the FileInterner constructor always performs +// the first conversion, so that we need another approach for accessing the +// original document (targetmtype won't do). +bool FileInterner::topdocToFile( + TempFile& otemp, const string& tofile, + RclConfig *cnf, const Rcl::Doc& idoc, bool uncompress) { DocFetcher *fetcher = docFetcherMake(cnf, idoc); if (fetcher == 0) { - LOGERR("FileInterner::idocToFile no backend\n"); + LOGERR("FileInterner::topdocToFile no backend\n"); return false; } DocFetcher::RawDoc rawdoc; if (!fetcher->fetch(cnf, idoc, rawdoc)) { - LOGERR("FileInterner::idocToFile fetcher failed\n"); + LOGERR("FileInterner::topdocToFile fetcher failed\n"); return false; } const char *filename = ""; @@ -983,13 +976,24 @@ bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile, } string reason; switch (rawdoc.kind) { - case DocFetcher::RawDoc::RDK_FILENAME: - if (!copyfile(rawdoc.data.c_str(), filename, reason)) { + case DocFetcher::RawDoc::RDK_FILENAME: { + string fn(rawdoc.data); + TempFile temp; + if (uncompress && isCompressed(fn, cnf)) { + if (!maybeUncompressToTemp(temp, fn, cnf, idoc)) { + LOGERR("FileInterner::topdocToFile: uncompress failed\n"); + return 
false; + } + } + fn = temp ? temp->filename() : rawdoc.data; + if (!copyfile(fn.c_str(), filename, reason)) { LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n"); return false; } + } break; case DocFetcher::RawDoc::RDK_DATA: + case DocFetcher::RawDoc::RDK_DATADIRECT: if (!stringtofile(rawdoc.data, filename, reason)) { LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n"); return false; @@ -1019,11 +1023,12 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile, } // Specialcase text/html. This is to work around a bug that will - // get fixed some day: internfile initialisation does not check - // targetmtype, so that at least one conversion is always - // performed. A common case would be an "Open" on an html file - // (we'd end up with text/plain content). As the html version is - // saved in this case, use it. + // get fixed some day: the internfile constructor always loads the + // first handler so that at least one conversion is always + // performed (and the access to the original data may be lost). A + // common case is an "Open" on an HTML file (we end up + // with text/plain content). As the HTML version is saved in this + // case, use it. if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) { doc.text = get_html(); doc.mimetype = cstr_texthtml; diff --git a/src/internfile/internfile.h b/src/internfile/internfile.h index 5e7c6c4b..3d5345de 100644 --- a/src/internfile/internfile.h +++ b/src/internfile/internfile.h @@ -118,21 +118,20 @@ class FileInterner { /** * Alternate constructor for the case where the data is in memory. - * This is mainly for data extracted from the web cache. The mime type - * must be set, input must be already uncompressed. + * This is mainly for data extracted from the web cache. + * The MIME type must be set, and the data must be uncompressed. 
*/ FileInterner(const string &data, RclConfig *cnf, int flags, const string& mtype); /** * Alternate constructor used at query time. We don't know where - * the data was stored, this is determined from the Rcl::Doc data + * the data was stored, and use the fetcher interface to reach it. * * @param idoc Rcl::Doc object built from index data. The back-end - * storage identifier (rclbes field) is used to build the - * appropriate fetcher which uses the rest of the Doc fields (url, - * ipath...) to retrieve the file or a file reference, which we - * then process normally. + * storage identifier (rclbes field) is used by the fetcher factory + * to build the appropriate object to return a file name or data which + * is then used with the appropriate init method. */ FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags); @@ -214,10 +213,15 @@ class FileInterner { * to query the right backend. Used to check up-to-dateness at query time */ static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig); - /** Extract internal document into temporary file. - * This is used mainly for starting an external viewer for a - * subdocument (ie: mail attachment). This really would not need to be - * a member. It creates a FileInterner object to do the actual work + /** Extract internal document into temporary file, without converting the + * data. + * + * This is used mainly for starting an external viewer for a + * subdocument (ie: mail attachment), but, for consistency, it also + * works with a top level (null ipath) document. + * This would not actually need to be a member method, it creates a + * FileInterner object to do the actual work. + * * @return true for success. * @param temp output reference-counted temp file object (goes * away magically). Only used if tofile.empty() @@ -225,13 +229,14 @@ class FileInterner { * @param cnf The recoll config * @param doc Doc data taken from the index. We use it to construct a * FileInterner object. 
+ * @param uncompress if true, uncompress compressed original doc. Only does + * anything for a top level document. */ static bool idocToFile(TempFile& temp, const string& tofile, - RclConfig *cnf, const Rcl::Doc& doc); + RclConfig *cnf, const Rcl::Doc& doc, + bool uncompress = true); - /** - * Does file appear to be the compressed version of a document? - */ + /** Does file appear to be the compressed version of a document? */ static bool isCompressed(const string& fn, RclConfig *cnf); /** @@ -253,7 +258,7 @@ class FileInterner { string m_targetMType; string m_reachedMType; // target or text/plain string m_tfile; - bool m_ok; // Set after construction if ok + bool m_ok{false}; // Set after construction if ok // Fields found in file extended attributes. This is kept here, // not in the file-level handler because we are only interested in // the top-level file, not any temp file necessitated by @@ -270,7 +275,7 @@ class FileInterner { vector m_tempfiles; // Error data if any string m_reason; - FIMissingStore *m_missingdatap; + FIMissingStore *m_missingdatap{nullptr}; Uncomp m_uncomp; @@ -294,7 +299,8 @@ class FileInterner { static bool tempFileForMT(TempFile& otemp, RclConfig *cnf, const std::string& mimetype); static bool topdocToFile(TempFile& otemp, const std::string& tofile, - RclConfig *cnf, const Rcl::Doc& idoc); + RclConfig *cnf, const Rcl::Doc& idoc, + bool uncompress); }; diff --git a/src/python/recoll/pyrclextract.cpp b/src/python/recoll/pyrclextract.cpp index 8f632e81..5d8db839 100644 --- a/src/python/recoll/pyrclextract.cpp +++ b/src/python/recoll/pyrclextract.cpp @@ -44,12 +44,16 @@ typedef struct { /* Type-specific fields go here. 
*/ FileInterner *xtr; RclConfig *rclconfig; + recoll_DocObject *docobject; } rclx_ExtractorObject; static void Extractor_dealloc(rclx_ExtractorObject *self) { LOGDEB("Extractor_dealloc\n" ); + if (self->docobject) { + Py_DECREF(self->docobject); /* drop the reference taken in Extractor_init */ + } delete self->xtr; Py_TYPE(self)->tp_free((PyObject*)self); } @@ -64,6 +68,7 @@ Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return 0; self->xtr = 0; self->rclconfig = 0; + self->docobject = 0; return (PyObject *)self; } @@ -82,6 +87,9 @@ Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs) PyErr_SetString(PyExc_AttributeError, "Null Doc ?"); return -1; } + self->docobject = dobj; + Py_INCREF(dobj); + self->rclconfig = dobj->rclconfig; self->xtr = new FileInterner(*dobj->doc, self->rclconfig, FileInterner::FIF_forPreview); @@ -177,17 +185,29 @@ Extractor_idoctofile(rclx_ExtractorObject* self, PyObject *args, PyErr_SetString(PyExc_AttributeError, "idoctofile: null object"); return 0; } - if (ipath.empty()) { - PyErr_SetString(PyExc_ValueError, "idoctofile: null ipath"); - return 0; - } - - self->xtr->setTargetMType(mimetype); + + // If ipath is empty and we want the original mimetype, we can't + // use FileInterner::internToFile() because the first conversion + // was performed by the FileInterner constructor, so that we can't + // reach the original object this way. 
Instead, if the data comes + // from a file (m_fn set), we just copy it, else, we call + // idoctofile, which will call topdoctofile (and re-fetch the + // data, yes, wasteful) TempFile temp; - bool status = self->xtr->interntofile(temp, outfile, ipath, mimetype); + bool status = false; + LOGDEB("Extractor_idoctofile: ipath [" << ipath << "] mimetype [" << + mimetype << "] doc mimetype [" << self->docobject->doc->mimetype << + "]\n"); + if (ipath.empty() && !mimetype.compare(self->docobject->doc->mimetype)) { + status = FileInterner::idocToFile(temp, outfile, self->rclconfig, + *self->docobject->doc); + } else { + self->xtr->setTargetMType(mimetype); + status = self->xtr->interntofile(temp, outfile, ipath, mimetype); + } if (!status) { PyErr_SetString(PyExc_AttributeError, "interntofile failure"); - return 0; + return 0; } if (outfile.empty()) temp->setnoremove(1); diff --git a/src/qtgui/multisave.cpp b/src/qtgui/multisave.cpp index 00849d66..4ba8e5ab 100644 --- a/src/qtgui/multisave.cpp +++ b/src/qtgui/multisave.cpp @@ -55,8 +55,8 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs) LOGDEB2("multiSave: got dir " << (dir) << "\n" ); /* Save doc to files in target directory. Issues: - - It is quite common to have docs in the array with the save - file names, e.g. all messages in a folder have the save file + - It is quite common to have docs in the array with the same + file names, e.g. all messages in a folder have the same file name (the folder's). - There is no warranty that the ipath is going to be acceptable as a file name or interesting at all. We don't use it. @@ -131,7 +131,7 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs) } // There is still a race condition here, should we care ? 
TempFile temp;// not used - if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i])) { + if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i], false)) { QMessageBox::warning(0, "Recoll", QWidget::tr("Cannot extract document: ") + QString::fromLocal8Bit(docs[i].url.c_str()) + diff --git a/src/qtgui/preview_w.cpp b/src/qtgui/preview_w.cpp index 52d1e65a..5f786770 100644 --- a/src/qtgui/preview_w.cpp +++ b/src/qtgui/preview_w.cpp @@ -819,14 +819,14 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum) // If this is an image, display it instead of the text. if (!idoc.mimetype.compare(0, 6, "image/")) { string fn = fileurltolocalpath(idoc.url); - - // If the command wants a file but this is not a file url, or - // there is an ipath that it won't understand, we need a temp file: theconfig->setKeyDir(fn.empty() ? "" : path_getfather(fn)); + + // We want a real file, so if this comes from data or we have + // an ipath, create it. if (fn.empty() || !idoc.ipath.empty()) { TempFile temp = lthr.tmpimg; if (temp) { - LOGDEB1("Preview: load: got temp file from internfile\n" ); + LOGDEB1("Preview: load: got temp file from internfile\n"); } else if (!FileInterner::idocToFile(temp, string(), theconfig, idoc)) { temp.reset(); // just in case. 
diff --git a/src/qtgui/rclm_view.cpp b/src/qtgui/rclm_view.cpp index cc2bab52..0001e027 100644 --- a/src/qtgui/rclm_view.cpp +++ b/src/qtgui/rclm_view.cpp @@ -300,7 +300,8 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term) bool enterHistory = false; bool istempfile = false; - LOGDEB("RclMain::startNV: groksipath " << (groksipath) << " wantsf " << (wantsfile) << " wantsparentf " << (wantsparentfile) << "\n" ); + LOGDEB("RclMain::startNV: groksipath " << groksipath << " wantsf " << + wantsfile << " wantsparentf " << wantsparentfile << "\n"); // If the command wants a file but this is not a file url, or // there is an ipath that it won't understand, we need a temp file: @@ -328,8 +329,7 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term) if (!fn.empty() && theconfig->mimeViewerNeedsUncomp(doc.mimetype)) { if (access(fn.c_str(), R_OK) != 0) { QMessageBox::warning(0, "Recoll", - tr("Can't access file: ") + - QString::fromLocal8Bit(fn.c_str())); + tr("Can't access file: ") + u8s2qs(fn)); return; } TempFile temp;