diff --git a/src/filters/rclimg b/src/filters/rclimg index 60b8697e..11005e2c 100755 --- a/src/filters/rclimg +++ b/src/filters/rclimg @@ -119,10 +119,12 @@ sub readparam { } my $paramname = lc $l[0]; my $paramsize = $l[1]; - my $n = read STDIN, $paramdata, $paramsize; - if ($n != $paramsize) { - print STDERR "RCLIMG: [$paramname] expected $paramsize, got $n\n"; - exit 1; + if ($paramsize > 0) { + my $n = read STDIN, $paramdata, $paramsize; + if ($n != $paramsize) { + print STDERR "RCLIMG: [$paramname] expected $paramsize, got $n\n"; + exit 1; + } } # print STDERR "RCLIMG: [$paramname] $paramsize bytes: [$paramdata]\n"; return ($paramname, $paramdata); @@ -156,14 +158,19 @@ while (1) { } unless (defined $params{"filename:"}) { print STDERR "RCLIMG: no filename ??\n"; - exit 1; + # Recoll is requesting next subdocument, but we have none, just say + # so: + print "Document: 0\n\n"; + next; } my $data = imgTagsToHtml($params{"filename:"}); my $l = length($data); - print "Data: $l\n"; + print "Document: $l\n"; # print STDERR "RCLIMG: writing $l bytes of data\n"; print $data; + # Say we have no further documents for this file + print "Eof: 0\n"; # End of output parameters: print empty line print "\n"; # print STDERR "RCLIMG: done writing data\n"; diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index 76ab2594..3807b431 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -40,6 +40,7 @@ using namespace std; #include "rclmon.h" #include "x11mon.h" #include "rclversion.h" +#include "cancelcheck.h" // Globals for exit cleanup ConfIndexer *confindexer; @@ -63,6 +64,7 @@ static void sigcleanup(int sig) { fprintf(stderr, "sigcleanup\n"); LOGDEB(("sigcleanup\n")); + CancelCheck::instance().setCancel(); stopindexing = 1; } diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 6e886392..194b60e7 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -28,6 +28,9 @@ static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.14 2008-10-09 09:19:37 dockes Exp #include "smallut.h" #include "transcode.h" #include "md5.h" +#include "rclconfig.h" +#include "mimetype.h" +#include "idfile.h" #include #include @@ -39,6 +42,13 @@ using namespace std; bool MimeHandlerExecMultiple::startCmd() { LOGDEB(("MimeHandlerExecMultiple::startCmd\n")); + if (params.empty()) { + // Hu ho + LOGERR(("MHExecMultiple::mkDoc: empty params\n")); + m_reason = "RECFILTERROR BADCONFIG"; + return false; + } + // Command name string cmd = params.front(); @@ -56,20 +66,31 @@ bool MimeHandlerExecMultiple::startCmd() return true; } -bool MimeHandlerExecMultiple::readDataElement(string& name) +// Note: data is not used if this is the "document:" field: it goes +// directly to m_metaData["content"] to avoid an extra copy +// +// Messages are made of data elements. Each element is like: +// name: len\ndata +// An empty line signals the end of the message, so the whole thing +// would look like: +// Name1: Len1\nData1Name2: Len2\nData2\n +bool MimeHandlerExecMultiple::readDataElement(string& name, string &data) { string ibuf; + + // Read name and length if (m_cmd.getline(ibuf) <= 0) { LOGERR(("MHExecMultiple: getline error\n")); return false; } + // Empty line (end of message) ? if (!ibuf.compare("\n")) { LOGDEB(("MHExecMultiple: Got empty line\n")); name = ""; return true; } - // We're expecting something like paramname: len\n + // We're expecting something like Name: len\n list tokens; stringToTokens(ibuf, tokens); if (tokens.size() != 2) { @@ -86,19 +107,21 @@ bool MimeHandlerExecMultiple::readDataElement(string& name) ibuf.c_str())); return false; } - LOGDEB(("MHExecMultiple: got paramname [%s] len: %d\n", - name.c_str(), len)); - // We only care about the "data:" field for now - string discard; - string *datap; - if (!stringlowercmp("data:", name)) { + LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len)); + + // Hack: check for 'Document:' and read directly the document data + // to m_metaData["content"] to avoid an extra copy of the bulky + // piece + string *datap = &data; + if (!stringlowercmp("document:", name)) { datap = &m_metaData["content"]; } else { - datap = &discard; + datap = &data; } - // Then the data. + + // Read element data datap->erase(); - if (m_cmd.receive(*datap, len) != len) { + if (len > 0 && m_cmd.receive(*datap, len) != len) { LOGERR(("MHExecMultiple: expected %d bytes of data, got %d\n", len, datap->length())); return false; @@ -106,52 +129,116 @@ bool MimeHandlerExecMultiple::readDataElement(string& name) return true; } -// Execute an external program to translate a file from its native -// format to text or html. bool MimeHandlerExecMultiple::next_document() { + LOGDEB(("MimeHandlerExecMultiple::next_document(): [%s]\n", m_fn.c_str())); if (m_havedoc == false) return false; + if (missingHelper) { LOGDEB(("MHExecMultiple::next_document(): helper known missing\n")); return false; } - if (params.empty()) { - // Hu ho - LOGERR(("MHExecMultiple::mkDoc: empty params\n")); - m_reason = "RECFILTERROR BADCONFIG"; - return false; - } if (m_cmd.getChildPid() < 0 && !startCmd()) { return false; } - // Send request to child process + // Send request to child process. This maybe the first/only + // request for a given file, or a continuation request. We send an + // empty file name in the latter case. ostringstream obuf; - obuf << "FileName: " << m_fn.length() << endl << m_fn << endl; + if (m_filefirst) { + obuf << "FileName: " << m_fn.length() << "\n" << m_fn; + // m_filefirst is set to true by set_document_file() + m_filefirst = false; + } else { + obuf << "Filename: " << 0 << "\n"; + } + if (m_ipath.length()) { + obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath; + } + obuf << "\n"; if (m_cmd.send(obuf.str()) < 0) { LOGERR(("MHExecMultiple: send error\n")); return false; } - // Read answer - LOGDEB(("MHExecMultiple: reading answer\n")); + // Read answer (multiple elements) + LOGDEB1(("MHExecMultiple: reading answer\n")); + bool eof_received = false; + string ipath; + string mtype; for (int loop=0;;loop++) { - string name; - if (!readDataElement(name)) { + string name, data; + if (!readDataElement(name, data)) { return false; } if (name.empty()) break; + if (!stringlowercmp("eof:", name)) { + LOGDEB(("MHExecMultiple: got EOF\n")); + eof_received = true; + } + if (!stringlowercmp("ipath:", name)) { + ipath = data; + LOGDEB(("MHExecMultiple: got ipath [%s]\n", data.c_str())); + } + if (!stringlowercmp("mimetype:", name)) { + mtype = data; + LOGDEB(("MHExecMultiple: got mimetype [%s]\n", data.c_str())); + } if (loop == 10) { // ?? LOGERR(("MHExecMultiple: filter sent too many parameters\n")); return false; } } - - finaldetails(); - m_havedoc = false; + // The end of data can be signaled from the filter in two ways: + // either by returning an empty document (if the filter just hits + // eof while trying to read the doc), or with an "eof:" field + // accompanying a normal document (if the filter hit eof at the + // end of the current doc, which is the preferred way). + if (m_metaData["content"].length() == 0) { + LOGDEB(("MHExecMultiple: got empty document\n")); + m_havedoc = false; + return false; + } + + // If this has an ipath, it is an internal doc from a + // multi-document file. In this case, either the filter supplies the + // mimetype, or the ipath MUST be a filename-like string which we can use + // to compute a mime type + if (!ipath.empty()) { + m_metaData["ipath"] = ipath; + if (mtype.empty()) { + mtype = mimetype(ipath, 0, RclConfig::getMainConfig(), false); + if (mtype.empty()) { + // mimetype() won't call idFile when there is no file. Do it + mtype = idFileMem(m_metaData["content"]); + if (mtype.empty()) { + LOGERR(("MHExecMultiple: cant guess mime type\n")); + mtype = "application/octet-stream"; + } + } + } + m_metaData["mimetype"] = mtype; + string md5, xmd5; + MD5String(m_metaData["content"], md5); + m_metaData["md5"] = MD5HexPrint(md5, xmd5); + } else { + m_metaData.erase("ipath"); + string md5, xmd5, reason; + if (MD5File(m_fn, md5, &reason)) { + m_metaData["md5"] = MD5HexPrint(md5, xmd5); + } else { + LOGERR(("MimeHandlerExecM: cant compute md5 for [%s]: %s\n", + m_fn.c_str(), reason.c_str())); + } + } + + if (eof_received) + m_havedoc = false; + return true; } diff --git a/src/internfile/mh_execm.h b/src/internfile/mh_execm.h index 7cead3f7..dcf3b62a 100644 --- a/src/internfile/mh_execm.h +++ b/src/internfile/mh_execm.h @@ -28,7 +28,8 @@ * which is built in mimehandler.cpp out of data from the mimeconf file. * * This version uses persistent filters which can handle multiple requests - * without exiting, with a simple question/response protocol. + * without exiting (both multiple files and multiple documents per file), + * with a simple question/response protocol. * * The data is exchanged in TLV fashion, in a way that should be * usable in most script languages. The basic unit has one line with a @@ -49,11 +50,11 @@ text/plainData: 10 0123456789 & args, NetconCli *iclicon = new NetconCli(); iclicon->setconn(m_pipein[1]); m_tocmd = NetconP(iclicon); - m_pipein[1] = 0; } if (has_output) { close(m_pipeout[1]); @@ -234,7 +233,6 @@ int ExecCmd::startExec(const string &cmd, const list& args, NetconCli *oclicon = new NetconCli(); oclicon->setconn(m_pipeout[0]); m_fromcmd = NetconP(oclicon); - m_pipeout[0] = -1; } /* Don't want to undo what we just did ! */ @@ -361,6 +359,18 @@ int ExecCmd::doexec(const string &cmd, const list& args, } } LOGDEB0(("ExecCmd::doexec: selectloop returned %d\n", ret)); + + // The netcons don't take ownership of the fds: we have to close them + // (have to do it before wait, this may be the signal the child is + // waiting for exiting). + if (input) { + close(m_pipein[1]); + m_pipein[1] = -1; + } + if (output) { + close(m_pipeout[0]); + m_pipeout[0] = -1; + } } // Normal return: deactivate cleaner, wait() will do the cleanup