make sure that python rclextract.idoctofile always retrieves an uncompressed file of the correct MIME type. + misc comments
This commit is contained in:
parent
32e79d301b
commit
29c6f75423
@ -118,7 +118,6 @@ bool FileInterner::ipathContains(const string& parent, const string& child)
|
||||
// Split into "constructor calls init()" to allow use from other constructor
|
||||
FileInterner::FileInterner(const string &fn, const struct stat *stp,
|
||||
RclConfig *cnf, int flags, const string *imime)
|
||||
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
|
||||
{
|
||||
LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
|
||||
if (fn.empty()) {
|
||||
@ -219,8 +218,18 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
|
||||
}
|
||||
|
||||
// Look for appropriate handler (might still return empty)
|
||||
// Get fields computed from extended attributes. We use the
|
||||
// original file, not the m_fn which may be the uncompressed temp
|
||||
// file
|
||||
if (!m_noxattrs)
|
||||
reapXAttrs(m_cfg, f, m_XAttrsFields);
|
||||
|
||||
// Gather metadata from external commands as configured.
|
||||
reapMetaCmds(m_cfg, f, m_cmdFields);
|
||||
|
||||
m_mimetype = l_mime;
|
||||
|
||||
// Look for appropriate handler (might still return empty)
|
||||
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
|
||||
|
||||
if (!df || df->is_unknown()) {
|
||||
@ -234,15 +243,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
m_forPreview ? "view" : "index");
|
||||
df->set_property(Dijon::Filter::DJF_UDI, udi);
|
||||
|
||||
// Get fields computed from extended attributes. We use the
|
||||
// original file, not the m_fn which may be the uncompressed temp
|
||||
// file
|
||||
if (!m_noxattrs)
|
||||
reapXAttrs(m_cfg, f, m_XAttrsFields);
|
||||
|
||||
// Gather metadata from external commands as configured.
|
||||
reapMetaCmds(m_cfg, f, m_cmdFields);
|
||||
|
||||
df->set_docsize(docsize);
|
||||
if (!df->set_document_file(l_mime, m_fn)) {
|
||||
delete df;
|
||||
@ -258,7 +258,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
// Setup from memory data (ie: out of the web cache). imime needs to be set.
|
||||
FileInterner::FileInterner(const string &data, RclConfig *cnf,
|
||||
int flags, const string& imime)
|
||||
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
|
||||
{
|
||||
LOGDEB0("FileInterner::FileInterner(data)\n");
|
||||
initcommon(cnf, flags);
|
||||
@ -313,7 +312,7 @@ void FileInterner::init(const string &data, RclConfig *cnf,
|
||||
void FileInterner::initcommon(RclConfig *cnf, int flags)
|
||||
{
|
||||
m_cfg = cnf;
|
||||
m_forPreview = ((flags & FIF_forPreview) != 0);
|
||||
m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0);
|
||||
// Initialize handler stack.
|
||||
m_handlers.reserve(MAXHANDLERS);
|
||||
for (unsigned int i = 0; i < MAXHANDLERS; i++)
|
||||
@ -324,7 +323,6 @@ void FileInterner::initcommon(RclConfig *cnf, int flags)
|
||||
}
|
||||
|
||||
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
|
||||
: m_ok(false), m_missingdatap(0), m_uncomp(((flags & FIF_forPreview) != 0))
|
||||
{
|
||||
LOGDEB0("FileInterner::FileInterner(idoc)\n");
|
||||
initcommon(cnf, flags);
|
||||
@ -347,6 +345,9 @@ FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
|
||||
init(rawdoc.data, cnf, flags, idoc.mimetype);
|
||||
break;
|
||||
case DocFetcher::RawDoc::RDK_DATADIRECT:
|
||||
// Note: only used for demo with the sample python external
|
||||
// mbox indexer at this point. The external program is
|
||||
// responsible for all the extraction process.
|
||||
init(rawdoc.data, cnf, flags, idoc.mimetype);
|
||||
m_direct = true;
|
||||
break;
|
||||
@ -735,8 +736,8 @@ int FileInterner::addHandler()
|
||||
}
|
||||
}
|
||||
if (!setres) {
|
||||
LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
|
||||
" for mtype " << mimetype << "\n");
|
||||
LOGINFO("FileInterner::addHandler: set_doc failed inside [" << m_fn <<
|
||||
"] for mtype " << mimetype << "\n");
|
||||
delete newflt;
|
||||
if (m_forPreview)
|
||||
return ADD_ERROR;
|
||||
@ -918,36 +919,24 @@ bool FileInterner::tempFileForMT(TempFile& otemp, RclConfig* cnf,
|
||||
TempFile temp(new TempFileInternal(
|
||||
cnf->getSuffixFromMimeType(mimetype)));
|
||||
if (!temp->ok()) {
|
||||
LOGERR("FileInterner::interntofile: can't create temp file\n");
|
||||
LOGERR("FileInterner::tempFileForMT: can't create temp file\n");
|
||||
return false;
|
||||
}
|
||||
otemp = temp;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Extract document (typically subdoc of multidoc) into temporary file.
|
||||
// We do the usual internfile stuff: create a temporary directory,
|
||||
// then create an interner and call internfile. The target mtype is set to
|
||||
// the input mtype, so that no data conversion is performed.
|
||||
// We then write the data out of the resulting document into the output file.
|
||||
// There are two temporary objects:
|
||||
// - The internfile temporary directory gets destroyed by its destructor
|
||||
// - The output temporary file which is held in a reference-counted
|
||||
// object and will be deleted when done with.
|
||||
//
|
||||
// If the ipath is null, maybe we're called because the file is not
|
||||
// stored in the regular file system. We use the docfetcher to get a
|
||||
// copy (in topdocToFile())
|
||||
//
|
||||
// We currently don't handle the case of an internal doc of a non-fs document.
|
||||
|
||||
bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
|
||||
RclConfig *cnf, const Rcl::Doc& idoc)
|
||||
// Static method, creates a FileInterner object to do the job.
|
||||
bool FileInterner::idocToFile(
|
||||
TempFile& otemp, const string& tofile, RclConfig *cnf,
|
||||
const Rcl::Doc& idoc, bool uncompress)
|
||||
{
|
||||
LOGDEB("FileInterner::idocToFile\n");
|
||||
|
||||
if (idoc.ipath.empty()) {
|
||||
return topdocToFile(otemp, tofile, cnf, idoc);
|
||||
// Because of the mandatory first conversion in the
|
||||
// FileInterner constructor, need to use a specific method.
|
||||
return topdocToFile(otemp, tofile, cnf, idoc, uncompress);
|
||||
}
|
||||
|
||||
// We set FIF_forPreview for consistency with the previous version
|
||||
@ -958,17 +947,21 @@ bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
|
||||
return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
|
||||
}
|
||||
|
||||
bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
|
||||
RclConfig *cnf, const Rcl::Doc& idoc)
|
||||
// This is only needed because the FileInterner constructor always performs
|
||||
// the first conversion, so that we need another approach for accessing the
|
||||
// original document (targetmtype won't do).
|
||||
bool FileInterner::topdocToFile(
|
||||
TempFile& otemp, const string& tofile,
|
||||
RclConfig *cnf, const Rcl::Doc& idoc, bool uncompress)
|
||||
{
|
||||
DocFetcher *fetcher = docFetcherMake(cnf, idoc);
|
||||
if (fetcher == 0) {
|
||||
LOGERR("FileInterner::idocToFile no backend\n");
|
||||
LOGERR("FileInterner::topdocToFile no backend\n");
|
||||
return false;
|
||||
}
|
||||
DocFetcher::RawDoc rawdoc;
|
||||
if (!fetcher->fetch(cnf, idoc, rawdoc)) {
|
||||
LOGERR("FileInterner::idocToFile fetcher failed\n");
|
||||
LOGERR("FileInterner::topdocToFile fetcher failed\n");
|
||||
return false;
|
||||
}
|
||||
const char *filename = "";
|
||||
@ -983,13 +976,24 @@ bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
|
||||
}
|
||||
string reason;
|
||||
switch (rawdoc.kind) {
|
||||
case DocFetcher::RawDoc::RDK_FILENAME:
|
||||
if (!copyfile(rawdoc.data.c_str(), filename, reason)) {
|
||||
case DocFetcher::RawDoc::RDK_FILENAME: {
|
||||
string fn(rawdoc.data);
|
||||
TempFile temp;
|
||||
if (uncompress && isCompressed(fn, cnf)) {
|
||||
if (!maybeUncompressToTemp(temp, fn, cnf, idoc)) {
|
||||
LOGERR("FileInterner::idocToFile: uncompress failed\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
fn = temp ? temp->filename() : rawdoc.data;
|
||||
if (!copyfile(fn.c_str(), filename, reason)) {
|
||||
LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DocFetcher::RawDoc::RDK_DATA:
|
||||
case DocFetcher::RawDoc::RDK_DATADIRECT:
|
||||
if (!stringtofile(rawdoc.data, filename, reason)) {
|
||||
LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
|
||||
return false;
|
||||
@ -1019,11 +1023,12 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
|
||||
}
|
||||
|
||||
// Specialcase text/html. This is to work around a bug that will
|
||||
// get fixed some day: internfile initialisation does not check
|
||||
// targetmtype, so that at least one conversion is always
|
||||
// performed. A common case would be an "Open" on an html file
|
||||
// (we'd end up with text/plain content). As the html version is
|
||||
// saved in this case, use it.
|
||||
// get fixed some day: the internfile constructor always loads the
|
||||
// first handler so that at least one conversion is always
|
||||
// performed (and the access to the original data may be lost). A
|
||||
// common case is an "Open" on an HTML file (we end up
|
||||
// with text/plain content). As the HTML version is saved in this
|
||||
// case, use it.
|
||||
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
|
||||
doc.text = get_html();
|
||||
doc.mimetype = cstr_texthtml;
|
||||
|
||||
@ -118,21 +118,20 @@ class FileInterner {
|
||||
|
||||
/**
|
||||
* Alternate constructor for the case where the data is in memory.
|
||||
* This is mainly for data extracted from the web cache. The mime type
|
||||
* must be set, input must be already uncompressed.
|
||||
* This is mainly for data extracted from the web cache.
|
||||
* The MIME type must be set, and the data must be uncompressed.
|
||||
*/
|
||||
FileInterner(const string &data, RclConfig *cnf,
|
||||
int flags, const string& mtype);
|
||||
|
||||
/**
|
||||
* Alternate constructor used at query time. We don't know where
|
||||
* the data was stored, this is determined from the Rcl::Doc data
|
||||
* the data was stored, so we use the fetcher interface to reach it.
|
||||
*
|
||||
* @param idoc Rcl::Doc object built from index data. The back-end
|
||||
* storage identifier (rclbes field) is used to build the
|
||||
* appropriate fetcher which uses the rest of the Doc fields (url,
|
||||
* ipath...) to retrieve the file or a file reference, which we
|
||||
* then process normally.
|
||||
* storage identifier (rclbes field) is used by the fetcher factory
|
||||
* to build the appropriate object to return a file name or data which
|
||||
* is then used with the appropriate init method.
|
||||
*/
|
||||
FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags);
|
||||
|
||||
@ -214,10 +213,15 @@ class FileInterner {
|
||||
* to query the right backend. Used to check up-to-dateness at query time */
|
||||
static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig);
|
||||
|
||||
/** Extract internal document into temporary file.
|
||||
* This is used mainly for starting an external viewer for a
|
||||
* subdocument (ie: mail attachment). This really would not need to be
|
||||
* a member. It creates a FileInterner object to do the actual work
|
||||
/** Extract internal document into temporary file, without converting the
|
||||
* data.
|
||||
*
|
||||
* This is used mainly for starting an external viewer for a
|
||||
* subdocument (ie: mail attachment), but, for consistency, it also
|
||||
* works with a top level (null ipath) document.
|
||||
* This would not actually need to be a member method, it creates a
|
||||
* FileInterner object to do the actual work.
|
||||
*
|
||||
* @return true for success.
|
||||
* @param temp output reference-counted temp file object (goes
|
||||
* away magically). Only used if tofile.empty()
|
||||
@ -225,13 +229,14 @@ class FileInterner {
|
||||
* @param cnf The recoll config
|
||||
* @param doc Doc data taken from the index. We use it to construct a
|
||||
* FileInterner object.
|
||||
* @param uncompress if true, uncompress compressed original doc. Only does
|
||||
* anything for a top level document.
|
||||
*/
|
||||
static bool idocToFile(TempFile& temp, const string& tofile,
|
||||
RclConfig *cnf, const Rcl::Doc& doc);
|
||||
RclConfig *cnf, const Rcl::Doc& doc,
|
||||
bool uncompress = true);
|
||||
|
||||
/**
|
||||
* Does file appear to be the compressed version of a document?
|
||||
*/
|
||||
/** Does file appear to be the compressed version of a document? */
|
||||
static bool isCompressed(const string& fn, RclConfig *cnf);
|
||||
|
||||
/**
|
||||
@ -253,7 +258,7 @@ class FileInterner {
|
||||
string m_targetMType;
|
||||
string m_reachedMType; // target or text/plain
|
||||
string m_tfile;
|
||||
bool m_ok; // Set after construction if ok
|
||||
bool m_ok{false}; // Set after construction if ok
|
||||
// Fields found in file extended attributes. This is kept here,
|
||||
// not in the file-level handler because we are only interested in
|
||||
// the top-level file, not any temp file necessitated by
|
||||
@ -270,7 +275,7 @@ class FileInterner {
|
||||
vector<TempFile> m_tempfiles;
|
||||
// Error data if any
|
||||
string m_reason;
|
||||
FIMissingStore *m_missingdatap;
|
||||
FIMissingStore *m_missingdatap{nullptr};
|
||||
|
||||
Uncomp m_uncomp;
|
||||
|
||||
@ -294,7 +299,8 @@ class FileInterner {
|
||||
static bool tempFileForMT(TempFile& otemp, RclConfig *cnf,
|
||||
const std::string& mimetype);
|
||||
static bool topdocToFile(TempFile& otemp, const std::string& tofile,
|
||||
RclConfig *cnf, const Rcl::Doc& idoc);
|
||||
RclConfig *cnf, const Rcl::Doc& idoc,
|
||||
bool uncompress);
|
||||
};
|
||||
|
||||
|
||||
|
||||
@ -44,12 +44,16 @@ typedef struct {
|
||||
/* Type-specific fields go here. */
|
||||
FileInterner *xtr;
|
||||
RclConfig *rclconfig;
|
||||
recoll_DocObject *docobject;
|
||||
} rclx_ExtractorObject;
|
||||
|
||||
static void
|
||||
Extractor_dealloc(rclx_ExtractorObject *self)
|
||||
{
|
||||
LOGDEB("Extractor_dealloc\n" );
|
||||
if (self->docobject) {
|
||||
Py_DECREF(&self->docobject);
|
||||
}
|
||||
delete self->xtr;
|
||||
Py_TYPE(self)->tp_free((PyObject*)self);
|
||||
}
|
||||
@ -64,6 +68,7 @@ Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
||||
return 0;
|
||||
self->xtr = 0;
|
||||
self->rclconfig = 0;
|
||||
self->docobject = 0;
|
||||
return (PyObject *)self;
|
||||
}
|
||||
|
||||
@ -82,6 +87,9 @@ Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs)
|
||||
PyErr_SetString(PyExc_AttributeError, "Null Doc ?");
|
||||
return -1;
|
||||
}
|
||||
self->docobject = dobj;
|
||||
Py_INCREF(dobj);
|
||||
|
||||
self->rclconfig = dobj->rclconfig;
|
||||
self->xtr = new FileInterner(*dobj->doc, self->rclconfig,
|
||||
FileInterner::FIF_forPreview);
|
||||
@ -177,17 +185,29 @@ Extractor_idoctofile(rclx_ExtractorObject* self, PyObject *args,
|
||||
PyErr_SetString(PyExc_AttributeError, "idoctofile: null object");
|
||||
return 0;
|
||||
}
|
||||
if (ipath.empty()) {
|
||||
PyErr_SetString(PyExc_ValueError, "idoctofile: null ipath");
|
||||
return 0;
|
||||
}
|
||||
|
||||
self->xtr->setTargetMType(mimetype);
|
||||
|
||||
// If ipath is empty and we want the original mimetype, we can't
|
||||
// use FileInterner::internToFile() because the first conversion
|
||||
// was performed by the FileInterner constructor, so that we can't
|
||||
// reach the original object this way. Instead, if the data comes
|
||||
// from a file (m_fn set), we just copy it, else, we call
|
||||
// idoctofile, which will call topdoctofile (and re-fetch the
|
||||
// data, yes, wasteful)
|
||||
TempFile temp;
|
||||
bool status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
|
||||
bool status = false;
|
||||
LOGDEB("Extractor_idoctofile: ipath [" << ipath << "] mimetype [" <<
|
||||
mimetype << "] doc mimetype [" << self->docobject->doc->mimetype <<
|
||||
"\n");
|
||||
if (ipath.empty() && !mimetype.compare(self->docobject->doc->mimetype)) {
|
||||
status = FileInterner::idocToFile(temp, outfile, self->rclconfig,
|
||||
*self->docobject->doc);
|
||||
} else {
|
||||
self->xtr->setTargetMType(mimetype);
|
||||
status = self->xtr->interntofile(temp, outfile, ipath, mimetype);
|
||||
}
|
||||
if (!status) {
|
||||
PyErr_SetString(PyExc_AttributeError, "interntofile failure");
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
if (outfile.empty())
|
||||
temp->setnoremove(1);
|
||||
|
||||
@ -55,8 +55,8 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs)
|
||||
LOGDEB2("multiSave: got dir " << (dir) << "\n" );
|
||||
|
||||
/* Save doc to files in target directory. Issues:
|
||||
- It is quite common to have docs in the array with the save
|
||||
file names, e.g. all messages in a folder have the save file
|
||||
- It is quite common to have docs in the array with the same
|
||||
file names, e.g. all messages in a folder have the same file
|
||||
name (the folder's).
|
||||
- There is no guarantee that the ipath is going to be acceptable
|
||||
as a file name or interesting at all. We don't use it.
|
||||
@ -131,7 +131,7 @@ void multiSave(QWidget *p, vector<Rcl::Doc>& docs)
|
||||
}
|
||||
// There is still a race condition here, should we care ?
|
||||
TempFile temp;// not used
|
||||
if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i])) {
|
||||
if (!FileInterner::idocToFile(temp, fn, theconfig, docs[i], false)) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
QWidget::tr("Cannot extract document: ") +
|
||||
QString::fromLocal8Bit(docs[i].url.c_str()) +
|
||||
|
||||
@ -819,14 +819,14 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
|
||||
// If this is an image, display it instead of the text.
|
||||
if (!idoc.mimetype.compare(0, 6, "image/")) {
|
||||
string fn = fileurltolocalpath(idoc.url);
|
||||
|
||||
// If the command wants a file but this is not a file url, or
|
||||
// there is an ipath that it won't understand, we need a temp file:
|
||||
theconfig->setKeyDir(fn.empty() ? "" : path_getfather(fn));
|
||||
|
||||
// We want a real file, so if this comes from data or we have
|
||||
// an ipath, create it.
|
||||
if (fn.empty() || !idoc.ipath.empty()) {
|
||||
TempFile temp = lthr.tmpimg;
|
||||
if (temp) {
|
||||
LOGDEB1("Preview: load: got temp file from internfile\n" );
|
||||
LOGDEB1("Preview: load: got temp file from internfile\n");
|
||||
} else if (!FileInterner::idocToFile(temp, string(),
|
||||
theconfig, idoc)) {
|
||||
temp.reset(); // just in case.
|
||||
|
||||
@ -300,7 +300,8 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
|
||||
bool enterHistory = false;
|
||||
bool istempfile = false;
|
||||
|
||||
LOGDEB("RclMain::startNV: groksipath " << (groksipath) << " wantsf " << (wantsfile) << " wantsparentf " << (wantsparentfile) << "\n" );
|
||||
LOGDEB("RclMain::startNV: groksipath " << groksipath << " wantsf " <<
|
||||
wantsfile << " wantsparentf " << wantsparentfile << "\n");
|
||||
|
||||
// If the command wants a file but this is not a file url, or
|
||||
// there is an ipath that it won't understand, we need a temp file:
|
||||
@ -328,8 +329,7 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
|
||||
if (!fn.empty() && theconfig->mimeViewerNeedsUncomp(doc.mimetype)) {
|
||||
if (access(fn.c_str(), R_OK) != 0) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
tr("Can't access file: ") +
|
||||
QString::fromLocal8Bit(fn.c_str()));
|
||||
tr("Can't access file: ") + u8s2qs(fn));
|
||||
return;
|
||||
}
|
||||
TempFile temp;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user