343 lines
11 KiB
C++
343 lines
11 KiB
C++
/* Copyright (C) 2005 J.F.Dockes
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
#include <stdio.h>
|
|
|
|
#include <iostream>
|
|
#include <sstream>
|
|
using namespace std;
|
|
|
|
#include "cstr.h"
|
|
#include "mh_execm.h"
|
|
#include "mh_html.h"
|
|
#include "log.h"
|
|
#include "cancelcheck.h"
|
|
#include "smallut.h"
|
|
#include "md5ut.h"
|
|
#include "rclconfig.h"
|
|
#include "mimetype.h"
|
|
#include "idfile.h"
|
|
|
|
#include <sys/types.h>
|
|
#include "safesyswait.h"
|
|
|
|
bool MimeHandlerExecMultiple::startCmd()
|
|
{
|
|
LOGDEB("MimeHandlerExecMultiple::startCmd\n");
|
|
if (params.empty()) {
|
|
// Hu ho
|
|
LOGERR("MHExecMultiple::startCmd: empty params\n");
|
|
m_reason = "RECFILTERROR BADCONFIG";
|
|
return false;
|
|
}
|
|
|
|
// Command name
|
|
string cmd = params.front();
|
|
|
|
m_maxmemberkb = 50000;
|
|
m_config->getConfParam("membermaxkbs", &m_maxmemberkb);
|
|
ostringstream oss;
|
|
oss << "RECOLL_FILTER_MAXMEMBERKB=" << m_maxmemberkb;
|
|
m_cmd.putenv(oss.str());
|
|
|
|
m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir());
|
|
m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
|
|
"RECOLL_FILTER_FORPREVIEW=no");
|
|
|
|
m_cmd.setrlimit_as(m_filtermaxmbytes);
|
|
m_adv.setmaxsecs(m_filtermaxseconds);
|
|
m_cmd.setAdvise(&m_adv);
|
|
|
|
// Build parameter list: delete cmd name
|
|
vector<string>myparams(params.begin() + 1, params.end());
|
|
|
|
if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) {
|
|
m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
|
|
missingHelper = true;
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Note: data is not used if this is the "document:" field: it goes
|
|
// directly to m_metaData[cstr_dj_keycontent] to avoid an extra copy
|
|
//
|
|
// Messages are made of data elements. Each element is like:
|
|
// name: len\ndata
|
|
// An empty line signals the end of the message, so the whole thing
|
|
// would look like:
|
|
// Name1: Len1\nData1Name2: Len2\nData2\n
|
|
bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
|
|
{
|
|
string ibuf;
|
|
|
|
// Read name and length
|
|
if (m_cmd.getline(ibuf) <= 0) {
|
|
LOGERR("MHExecMultiple: getline error\n");
|
|
return false;
|
|
}
|
|
|
|
LOGDEB1("MHEM:rde: line [" << ibuf << "]\n");
|
|
|
|
// Empty line (end of message) ?
|
|
if (!ibuf.compare("\n")) {
|
|
LOGDEB("MHExecMultiple: Got empty line\n");
|
|
name.clear();
|
|
return true;
|
|
}
|
|
|
|
// Filters will sometimes abort before entering the real protocol, ie if
|
|
// a module can't be loaded. Check the special filter error first word:
|
|
if (ibuf.find("RECFILTERROR ") == 0) {
|
|
m_reason = ibuf;
|
|
if (ibuf.find("HELPERNOTFOUND") != string::npos)
|
|
missingHelper = true;
|
|
return false;
|
|
}
|
|
|
|
// We're expecting something like Name: len\n
|
|
vector<string> tokens;
|
|
stringToTokens(ibuf, tokens);
|
|
if (tokens.size() != 2) {
|
|
LOGERR("MHExecMultiple: bad line in filter output: [" << ibuf << "]\n");
|
|
return false;
|
|
}
|
|
vector<string>::iterator it = tokens.begin();
|
|
name = *it++;
|
|
string& slen = *it;
|
|
int len;
|
|
if (sscanf(slen.c_str(), "%d", &len) != 1) {
|
|
LOGERR("MHExecMultiple: bad line in filter output: [" << ibuf << "]\n");
|
|
return false;
|
|
}
|
|
|
|
if (len / 1024 > m_maxmemberkb) {
|
|
LOGERR("MHExecMultiple: data len > maxmemberkb\n");
|
|
return false;
|
|
}
|
|
|
|
// Hack: check for 'Document:' and read directly the document data
|
|
// to m_metaData[cstr_dj_keycontent] to avoid an extra copy of the bulky
|
|
// piece
|
|
string *datap = &data;
|
|
if (!stringlowercmp("document:", name)) {
|
|
datap = &m_metaData[cstr_dj_keycontent];
|
|
} else {
|
|
datap = &data;
|
|
}
|
|
|
|
// Read element data
|
|
datap->erase();
|
|
if (len > 0 && m_cmd.receive(*datap, len) != len) {
|
|
LOGERR("MHExecMultiple: expected " << len << " bytes of data, got " <<
|
|
datap->length() << "\n");
|
|
return false;
|
|
}
|
|
LOGDEB1("MHExecMe:rdDtElt got: name [" << name << "] len " << len <<
|
|
"value [" << (datap->size() > 100 ?
|
|
(datap->substr(0, 100) + " ...") : datap) << endl);
|
|
return true;
|
|
}
|
|
|
|
bool MimeHandlerExecMultiple::next_document()
|
|
{
|
|
LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n");
|
|
if (m_havedoc == false)
|
|
return false;
|
|
|
|
if (missingHelper) {
|
|
LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
|
|
return false;
|
|
}
|
|
|
|
if (m_cmd.getChildPid() <= 0 && !startCmd()) {
|
|
return false;
|
|
}
|
|
|
|
m_metaData.clear();
|
|
|
|
// Send request to child process. This maybe the first/only
|
|
// request for a given file, or a continuation request. We send an
|
|
// empty file name in the latter case.
|
|
// We also compute the file md5 before starting the extraction:
|
|
// under Windows, we may not be able to do it while the file
|
|
// is opened by the filter.
|
|
ostringstream obuf;
|
|
string file_md5;
|
|
if (m_filefirst) {
|
|
if (!m_forPreview && !m_nomd5) {
|
|
string md5, xmd5, reason;
|
|
if (MD5File(m_fn, md5, &reason)) {
|
|
file_md5 = MD5HexPrint(md5, xmd5);
|
|
} else {
|
|
LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
|
|
"]: " << reason << "\n");
|
|
}
|
|
}
|
|
obuf << "FileName: " << m_fn.length() << "\n" << m_fn;
|
|
// m_filefirst is set to true by set_document_file()
|
|
m_filefirst = false;
|
|
} else {
|
|
obuf << "Filename: " << 0 << "\n";
|
|
}
|
|
if (!m_ipath.empty()) {
|
|
LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
|
|
m_ipath << "]\n");
|
|
obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath;
|
|
}
|
|
if (!m_dfltInputCharset.empty()) {
|
|
obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n"
|
|
<< m_dfltInputCharset;
|
|
}
|
|
obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
|
|
obuf << "\n";
|
|
if (m_cmd.send(obuf.str()) < 0) {
|
|
m_cmd.zapChild();
|
|
LOGERR("MHExecMultiple: send error\n");
|
|
return false;
|
|
}
|
|
|
|
m_adv.reset();
|
|
|
|
// Read answer (multiple elements)
|
|
LOGDEB1("MHExecMultiple: reading answer\n");
|
|
bool eofnext_received = false;
|
|
bool eofnow_received = false;
|
|
bool fileerror_received = false;
|
|
bool subdocerror_received = false;
|
|
string ipath;
|
|
string mtype;
|
|
string charset;
|
|
for (int loop=0;;loop++) {
|
|
string name, data;
|
|
try {
|
|
if (!readDataElement(name, data)) {
|
|
m_cmd.zapChild();
|
|
return false;
|
|
}
|
|
} catch (HandlerTimeout) {
|
|
LOGINFO("MHExecMultiple: timeout\n");
|
|
m_cmd.zapChild();
|
|
return false;
|
|
} catch (CancelExcept) {
|
|
LOGINFO("MHExecMultiple: interrupt\n");
|
|
m_cmd.zapChild();
|
|
return false;
|
|
}
|
|
if (name.empty())
|
|
break;
|
|
if (!stringlowercmp("eofnext:", name)) {
|
|
LOGDEB("MHExecMultiple: got EOFNEXT\n");
|
|
eofnext_received = true;
|
|
} else if (!stringlowercmp("eofnow:", name)) {
|
|
LOGDEB("MHExecMultiple: got EOFNOW\n");
|
|
eofnow_received = true;
|
|
} else if (!stringlowercmp("fileerror:", name)) {
|
|
LOGDEB("MHExecMultiple: got FILEERROR\n");
|
|
fileerror_received = true;
|
|
} else if (!stringlowercmp("subdocerror:", name)) {
|
|
LOGDEB("MHExecMultiple: got SUBDOCERROR\n");
|
|
subdocerror_received = true;
|
|
} else if (!stringlowercmp("ipath:", name)) {
|
|
ipath = data;
|
|
LOGDEB("MHExecMultiple: got ipath [" << data << "]\n");
|
|
} else if (!stringlowercmp("charset:", name)) {
|
|
charset = data;
|
|
LOGDEB("MHExecMultiple: got charset [" << data << "]\n");
|
|
} else if (!stringlowercmp("mimetype:", name)) {
|
|
mtype = data;
|
|
LOGDEB("MHExecMultiple: got mimetype [" << data << "]\n");
|
|
} else {
|
|
string nm = stringtolower((const string&)name);
|
|
trimstring(nm, ":");
|
|
LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n");
|
|
m_metaData[nm] += data;
|
|
}
|
|
if (loop == 200) {
|
|
// ??
|
|
LOGERR("MHExecMultiple: handler sent more than 200 attributes\n");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (eofnow_received || fileerror_received) {
|
|
// No more docs
|
|
m_havedoc = false;
|
|
return false;
|
|
}
|
|
if (subdocerror_received) {
|
|
return false;
|
|
}
|
|
|
|
// It used to be that eof could be signalled just by an empty document, but
|
|
// this was wrong. Empty documents can be found ie in zip files and should
|
|
// not be interpreted as eof.
|
|
if (m_metaData[cstr_dj_keycontent].empty()) {
|
|
LOGDEB0("MHExecMultiple: got empty document inside [" << m_fn <<
|
|
"]: [" << ipath << "]\n");
|
|
}
|
|
|
|
if (!ipath.empty()) {
|
|
// If this has an ipath, it is an internal doc from a
|
|
// multi-document file. In this case, either the filter
|
|
// supplies the mimetype, or the ipath MUST be a filename-like
|
|
// string which we can use to compute a mime type
|
|
m_metaData[cstr_dj_keyipath] = ipath;
|
|
if (mtype.empty()) {
|
|
LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
|
|
"for a guess\n");
|
|
mtype = mimetype(ipath, 0, m_config, false);
|
|
if (mtype.empty()) {
|
|
// mimetype() won't call idFile when there is no file. Do it
|
|
mtype = idFileMem(m_metaData[cstr_dj_keycontent]);
|
|
if (mtype.empty()) {
|
|
// Note this happens for example for directory zip members
|
|
// We could recognize them by the end /, but wouldn't know
|
|
// what to do with them anyway.
|
|
LOGINFO("MHExecMultiple: cant guess mime type\n");
|
|
mtype = "application/octet-stream";
|
|
}
|
|
}
|
|
}
|
|
m_metaData[cstr_dj_keymt] = mtype;
|
|
if (!m_forPreview) {
|
|
string md5, xmd5;
|
|
MD5String(m_metaData[cstr_dj_keycontent], md5);
|
|
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
|
}
|
|
} else {
|
|
// "Self" document.
|
|
m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
|
|
m_metaData.erase(cstr_dj_keyipath);
|
|
if (!m_forPreview) {
|
|
m_metaData[cstr_dj_keymd5] = file_md5;
|
|
}
|
|
}
|
|
|
|
handle_cs(m_metaData[cstr_dj_keymt], charset);
|
|
|
|
if (eofnext_received)
|
|
m_havedoc = false;
|
|
|
|
LOGDEB0("MHExecMultiple: returning " <<
|
|
m_metaData[cstr_dj_keycontent].size() <<
|
|
" bytes of content, mtype [" << m_metaData[cstr_dj_keymt] <<
|
|
"] charset [" << m_metaData[cstr_dj_keycharset] << "]\n");
|
|
LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString());
|
|
return true;
|
|
}
|
|
|