diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 237de5d2..60d7f696 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.20 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.21 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -21,6 +21,7 @@ static char rcsid[] = "@(#$Id: internfile.cpp,v 1.20 2006-12-15 16:33:15 dockes #ifndef TEST_INTERNFILE #include +#include #include #include #include @@ -41,6 +42,10 @@ using namespace std; #include "wipedir.h" #include "rclconfig.h" +// The internal path element separator. This can't be the same as the rcldb +// file to ipath separator : "|" +static const string isep(":"); + // Execute the command to uncompress a file into a temporary one. static bool uncompressfile(RclConfig *conf, const string& ifn, const list& cmdv, const string& tdir, @@ -133,7 +138,7 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) { return; } - LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n", + LOGDEB1(("internfile: after ucomp: m_tdir %s, tfile %s\n", m_tdir.c_str(), m_tfile.c_str())); m_fn = m_tfile; l_mime = mimetype(m_fn, m_cfg, usfci); @@ -167,77 +172,151 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, LOGERR(("FileInterner:: error parsing %s\n", m_fn.c_str())); return; } - m_handlers.reserve(20); + m_handlers.reserve(MAXHANDLERS); + for (unsigned int i = 0; i < MAXHANDLERS; i++) + m_tmpflgs[i] = false; m_handlers.push_back(df); LOGDEB(("FileInterner::FileInterner: %s [%s]\n", l_mime.c_str(), - m_fn.c_str())); + m_fn.c_str())); + m_targetMType = "text/plain"; } FileInterner::~FileInterner() { - while (!m_handlers.empty()) { - delete m_handlers.back(); - m_handlers.pop_back(); - } tmpcleanup(); + for (vector::iterator it = m_handlers.begin(); + it != m_handlers.end(); it++) + delete *it; + // m_tempfiles will take care of itself } -static const string string_empty; -static const string get_mimetype(Dijon::Filter* df) +bool FileInterner::dataToTempFile(const string& dt, const string& mt, + string& fn) { - const std::map *docdata = &df->get_meta_data(); - map::const_iterator it; - it = docdata->find("mimetype"); - if (it != docdata->end()) { - return it->second; + // Find appropriate suffix for mime type + TempFile temp(new TempFileInternal(m_cfg->getSuffixFromMimeType(mt))); + if (temp->ok()) { + m_tmpflgs[m_handlers.size()-1] = true; + m_tempfiles.push_back(temp); } else { - return string_empty; + LOGERR(("FileInterner::dataToTempFile: cant create tempfile\n")); + return false; } + + int fd = open(temp->filename(), O_WRONLY); + if (fd < 0) { + LOGERR(("FileInterner::dataToTempFile: open(%s) failed errno %d\n", + temp->filename(), errno)); + return false; + } + if (write(fd, dt.c_str(), dt.length()) != (int)dt.length()) { + close(fd); + LOGERR(("FileInterner::dataToTempFile: write to %s failed errno %d\n", + temp->filename(), errno)); + return false; + } + close(fd); + fn = temp->filename(); + return true; } +static inline bool getKeyValue(const map& docdata, + const string& key, string& value) +{ + map::const_iterator it; + it = docdata.find(key); + if (it != docdata.end()) { + value = it->second; + return true; + } + return false; +} + +static const string keyab("abstract"); +static const string keycs("charset"); +static const string keyct("content"); +static const string keyfn("filename"); +static const string keykw("keywords"); +static const string keymd("modificationdate"); +static const string keymt("mimetype"); +static const string keyoc("origcharset"); +static const string keysm("sample"); +static const string keytt("title"); + bool FileInterner::dijontorcl(Rcl::Doc& doc) { Dijon::Filter *df = m_handlers.back(); - const std::map *docdata = &df->get_meta_data(); - map::const_iterator it; + const std::map& docdata = df->get_meta_data(); - it = docdata->find("origcharset"); - if (it != docdata->end()) - doc.origcharset = it->second; - - it = docdata->find("content"); - if (it != docdata->end()) - doc.text = it->second; - - it = docdata->find("title"); - if (it != docdata->end()) - doc.title = it->second; - - it = docdata->find("keywords"); - if (it != docdata->end()) - doc.keywords = it->second; - - it = docdata->find("modificationdate"); - if (it != docdata->end()) - doc.dmtime = it->second; - - it = docdata->find("abstract"); - if (it != docdata->end()) { - doc.abstract = it->second; - } else { - it = docdata->find("sample"); - if (it != docdata->end()) - doc.abstract = it->second; - } + getKeyValue(docdata, keyoc, doc.origcharset); + getKeyValue(docdata, keyct, doc.text); + getKeyValue(docdata, keytt, doc.title); + getKeyValue(docdata, keykw, doc.keywords); + getKeyValue(docdata, keymd, doc.dmtime); + if (!getKeyValue(docdata, keyab, doc.abstract)) + getKeyValue(docdata, keysm, doc.abstract); + LOGDEB1(("FILENAME: %s\n", doc.utf8fn.c_str())); return true; } +// Collect the ipath stack. +// While we're at it, we also set the mimetype and filename, which are special +// properties: we want to get them from the topmost doc +// with an ipath, not the last one which is usually text/plain +void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) +{ + bool hasipath = false; -static const unsigned int MAXHANDLERS = 20; + // If there is no ipath stack, the mimetype is the one from the file + doc.mimetype = m_mimetype; + LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str())); + + string ipathel; + for (vector::const_iterator hit = m_handlers.begin(); + hit != m_handlers.end(); hit++) { + const map& docdata = (*hit)->get_meta_data(); + if (getKeyValue(docdata, "ipath", ipathel)) { + if (!ipathel.empty()) { + // We have a non-empty ipath + hasipath = true; + getKeyValue(docdata, keymt, doc.mimetype); + getKeyValue(docdata, keyfn, doc.utf8fn); + } + ipath += ipathel + isep; + } else { + ipath += isep; + } + } + + // Trim empty tail elements in ipath. + if (hasipath) { + LOGDEB2(("IPATH [%s]\n", ipath.c_str())); + string::size_type sit = ipath.find_last_not_of(isep); + if (sit == string::npos) + ipath.erase(); + else if (sit < ipath.length() -1) + ipath.erase(sit+1); + } else { + ipath.erase(); + } +} + +// Remove handler from stack. Clean up temp file if needed. +void FileInterner::popHandler() +{ + int i = m_handlers.size()-1; + if (m_tmpflgs[i]) { + m_tempfiles.pop_back(); + m_tmpflgs[i] = false; + } + delete m_handlers.back(); + m_handlers.pop_back(); +} FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) { - if (m_handlers.size() != 1) { + LOGDEB(("FileInterner::internfile. ipath [%s]\n", ipath.c_str())); + if (m_handlers.size() < 1) { LOGERR(("FileInterner::internfile: bad stack size %d !!\n", m_handlers.size())); return FIError; @@ -252,7 +331,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) int vipathidx = 0; if (!ipath.empty()) { list lipath; - stringToTokens(ipath, lipath, "|", true); + stringToTokens(ipath, lipath, isep, true); vipath.insert(vipath.begin(), lipath.begin(), lipath.end()); if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){ LOGERR(("FileInterner::internfile: can't skip\n")); @@ -261,12 +340,17 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) } /* Try to get doc from the topmost filter */ + // Security counter: we try not to loop but ... + int loop = 0; while (!m_handlers.empty()) { + if (loop++ > 30) { + LOGERR(("FileInterner:: looping!\n")); + return FIError; + } if (!m_handlers.back()->has_documents()) { // No docs at the current top level. Pop and see if there // is something at the previous one - delete m_handlers.back(); - m_handlers.pop_back(); + popHandler(); continue; } @@ -276,21 +360,16 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) } // Look at what we've got - const std::map *docdata = - &m_handlers.back()->get_meta_data(); - map::const_iterator it; - string charset; - it = docdata->find("charset"); - if (it != docdata->end()) - charset = it->second; - string mimetype; - it = docdata->find("mimetype"); - if (it != docdata->end()) - mimetype = it->second; + const std::map& docdata = + m_handlers.back()->get_meta_data(); + string charset, mimetype; + getKeyValue(docdata, keycs, charset); + getKeyValue(docdata, keymt, mimetype); - LOGDEB(("FileInterner::internfile:next_doc is %s\n",mimetype.c_str())); + LOGDEB(("FileInterner::internfile: next_doc is %s\n", + mimetype.c_str())); // If we find a text/plain doc, we're done - if (!strcmp(mimetype.c_str(), "text/plain")) + if (!stringicmp(mimetype, m_targetMType)) break; // Got a non text/plain doc. We need to stack another @@ -298,7 +377,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) if (m_handlers.size() > MAXHANDLERS) { // Stack too big. Skip this and go on to check if there is // something else in the current back() - LOGDEB(("FileInterner::internfile: stack too high\n")); + LOGINFO(("FileInterner::internfile: stack too high\n")); continue; } @@ -306,7 +385,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) if (!again) { // If we can't find a filter, this doc can't be handled // but there can be other ones so we go on - LOGERR(("FileInterner::internfile: no filter for [%s]\n", + LOGINFO(("FileInterner::internfile: no filter for [%s]\n", mimetype.c_str())); continue; } @@ -316,18 +395,37 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) charset); string ns; const string *txt = &ns; - it = docdata->find("content"); - if (it != docdata->end()) + map::const_iterator it; + it = docdata.find("content"); + if (it != docdata.end()) txt = &it->second; - if (!again->set_document_string(*txt)) { - LOGERR(("FileInterner::internfile: error reparsing for %s\n", + + bool setres = false; + if (again->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { + setres = again->set_document_string(*txt); + } else if (again->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) { + setres = again->set_document_data(txt->c_str(), txt->length()); + }else if(again->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) { + string filename; + if (dataToTempFile(*txt, mimetype, filename)) { + if (!(setres = again->set_document_file(filename))) { + m_tmpflgs[m_handlers.size()-1] = false; + m_tempfiles.pop_back(); + } + } + } + if (!setres) { + LOGINFO(("FileInterner::internfile: set_doc failed inside %s\n", m_fn.c_str())); delete again; + if (m_forPreview) + return FIError; continue; } - // add filter and go on + // add filter and go on, maybe this one will give us text... m_handlers.push_back(again); - if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){ + if (!ipath.empty() && + !m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){ LOGERR(("FileInterner::internfile: can't skip\n")); return FIError; } @@ -338,64 +436,79 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) return FIError; } - // If indexing, we have to collect the ipath stack. - - // While we're at it, we also set the mimetype, which is a special - // property:we want to get it from the topmost doc - // with an ipath, not the last one which is always text/html + // If indexing compute ipath and significant mimetype // Note that ipath is returned through the parameter not doc.ipath - if (!m_forPreview) { - bool hasipath = false; - doc.mimetype = m_mimetype; - LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str())); - map::const_iterator titi; - - for (vector::const_iterator hit = m_handlers.begin(); - hit != m_handlers.end(); hit++) { - - const map& docdata = (*hit)->get_meta_data(); - map::const_iterator iti = docdata.find("ipath"); - - if (iti != docdata.end()) { - if (!iti->second.empty()) { - // We have a non-empty ipath - hasipath = true; - titi = docdata.find("mimetype"); - if (titi != docdata.end()) - doc.mimetype = titi->second; - } - ipath += iti->second + "|"; - } else { - ipath += "|"; - } - } - - // Walk done, transform the list into a string - if (hasipath) { - LOGDEB2(("IPATH [%s]\n", ipath.c_str())); - string::size_type sit = ipath.find_last_not_of("|"); - if (sit == string::npos) - ipath.erase(); - else if (sit < ipath.length() -1) - ipath.erase(sit+1); - } else { - ipath.erase(); - } - } + if (!m_forPreview) + collectIpathAndMT(doc, ipath); dijontorcl(doc); // Destack what can be while (!m_handlers.empty() && !m_handlers.back()->has_documents()) { - delete m_handlers.back(); - m_handlers.pop_back(); + popHandler(); } - if (m_handlers.empty() || !m_handlers.back()->has_documents()) + if (m_handlers.empty()) return FIDone; else return FIAgain; } + +class DirWiper { + public: + string dir; + bool do_it; + DirWiper(string d) : dir(d), do_it(true) {} + ~DirWiper() { + if (do_it) { + wipedir(dir); + rmdir(dir.c_str()); + } + } +}; + +bool FileInterner::idocTempFile(TempFile& otemp, RclConfig *cnf, + const string& fn, const string& ipath, + const string& mtype) +{ + string tmpdir, reason; + if (!maketmpdir(tmpdir, reason)) + return false; + DirWiper wiper(tmpdir); + + FileInterner interner(fn, cnf, tmpdir, &mtype); + interner.setTargetMType(mtype); + Rcl::Doc doc; + string mipath = ipath; + Status ret = interner.internfile(doc, mipath); + if (ret == FileInterner::FIError) { + LOGERR(("FileInterner::idocTempFile: internfile() failed\n")); + return false; + } + TempFile temp(new TempFileInternal(cnf->getSuffixFromMimeType(mtype))); + if (!temp->ok()) { + LOGERR(("FileInterner::idocTempFile: cannot create temporary file")); + return false; + } + int fd = open(temp->filename(), O_WRONLY); + if (fd < 0) { + LOGERR(("FileInterner::idocTempFile: open(%s) failed errno %d\n", + temp->filename(), errno)); + return false; + } + const string& dt = doc.text; + if (write(fd, dt.c_str(), dt.length()) != (int)dt.length()) { + close(fd); + LOGERR(("FileInterner::idocTempFile: write to %s failed errno %d\n", + temp->filename(), errno)); + return false; + } + close(fd); + otemp = temp; + return true; +} + + #else #include diff --git a/src/internfile/internfile.h b/src/internfile/internfile.h index 8515e40c..bb48aaa2 100644 --- a/src/internfile/internfile.h +++ b/src/internfile/internfile.h @@ -16,13 +16,14 @@ */ #ifndef _INTERNFILE_H_INCLUDED_ #define _INTERNFILE_H_INCLUDED_ -/* @(#$Id: internfile.h,v 1.8 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: internfile.h,v 1.9 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include using std::string; using std::vector; +#include "pathut.h" #include "Filter.h" class RclConfig; @@ -30,10 +31,13 @@ namespace Rcl { class Doc; } -/// Turn external file into internal representation, according to mime -/// type etc +/** + * A class to convert a file into possibly multiple documents in internal + * representation. + */ class FileInterner { public: + /** * Identify and possibly decompress file, create adequate * handler. The mtype parameter is only set when the object is @@ -70,20 +74,39 @@ class FileInterner { * should be called again to get the following one(s). */ Status internfile(Rcl::Doc& doc, string &ipath); + + /** Return the file's mimetype (useful for container files) */ const string& get_mimetype() {return m_mimetype;} + /** We normally always return text/plain data. A caller can request + * that we stop conversion at the native document type (ie: text/html) + */ + void setTargetMType(const string& tp) {m_targetMType = tp;} + + /** Utility function: extract internal document and make temporary file */ + static bool idocTempFile(TempFile& temp, RclConfig *cnf, const string& fn, + const string& ipath, const string& mtype); + private: + static const unsigned int MAXHANDLERS = 20; RclConfig *m_cfg; string m_fn; string m_mimetype; // Mime type for [uncompressed] file bool m_forPreview; + string m_targetMType; // m_tdir and m_tfile are used only for decompressing input file if needed const string& m_tdir; string m_tfile; vector m_handlers; + bool m_tmpflgs[MAXHANDLERS]; + vector m_tempfiles; void tmpcleanup(); bool dijontorcl(Rcl::Doc&); + void collectIpathAndMT(Rcl::Doc&, string& ipath); + bool dataToTempFile(const string& data, const string& mt, string& fn); + void popHandler(); }; + #endif /* _INTERNFILE_H_INCLUDED_ */ diff --git a/src/internfile/mh_html.h b/src/internfile/mh_html.h index db1cf8da..fea22b2a 100644 --- a/src/internfile/mh_html.h +++ b/src/internfile/mh_html.h @@ -16,7 +16,7 @@ */ #ifndef _HTML_H_INCLUDED_ #define _HTML_H_INCLUDED_ -/* @(#$Id: mh_html.h,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_html.h,v 1.9 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -31,6 +31,11 @@ class MimeHandlerHtml : public RecollFilter { virtual ~MimeHandlerHtml() {} virtual bool set_document_file(const string &file_path); virtual bool set_document_string(const string &data); + virtual bool is_data_input_ok(DataInput input) const { + if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING) + return true; + return false; + } virtual bool next_document(); private: string m_html; diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index f11b23ba..13c8b2ed 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.25 2006-12-15 16:33:15 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.26 2006-12-16 15:39:54 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -49,9 +49,15 @@ MimeHandlerMail::~MimeHandlerMail() if (m_fd >= 0) close(m_fd); delete m_stream; + for (vector::iterator it = m_attachments.begin(); + it != m_attachments.end(); it++) { + delete *it; + } } + bool MimeHandlerMail::set_document_file(const string &fn) { + LOGDEB(("MimeHandlerMail::set_document_file(%s)\n", fn.c_str())); if (m_fd >= 0) { close(m_fd); m_fd = -1; @@ -76,6 +82,7 @@ bool MimeHandlerMail::set_document_file(const string &fn) bool MimeHandlerMail::set_document_string(const string &msgtxt) { + LOGDEB1(("MimeHandlerMail::set_document_string\n")); LOGDEB2(("Message text: [%s]\n", msgtxt.c_str())); delete m_stream; m_stream = new stringstream(msgtxt); @@ -90,15 +97,36 @@ bool MimeHandlerMail::set_document_string(const string &msgtxt) return true; } +bool MimeHandlerMail::skip_to_document(const string& ipath) +{ + LOGDEB(("MimeHandlerMail::skip_to_document(%s)\n", ipath.c_str())); + if (m_idx == -1) { + // No decoding done yet. If ipath is null need do nothing + if (ipath == "" || ipath == "-1") + return true; + // ipath points to attachment: need to decode message + if (!next_document()) { + LOGERR(("MimeHandlerMail::skip_to_doc: next_document failed\n")); + return false; + } + } + m_idx = atoi(ipath.c_str()); + return true; +} + bool MimeHandlerMail::next_document() { + LOGDEB(("MimeHandlerMail::next_document m_idx %d m_havedoc %d\n", + m_idx, m_havedoc)); if (!m_havedoc) return false; bool res = false; if (m_idx == -1) { m_metaData["mimetype"] = "text/plain"; - res =processMsg(m_bincdoc, 0); + res = processMsg(m_bincdoc, 0); + LOGDEB1(("MimeHandlerMail::next_document: mimetype %s\n", + m_metaData["mimetype"].c_str())); } else { res = processAttach(); } @@ -107,9 +135,61 @@ bool MimeHandlerMail::next_document() return res; } +// Decode according to content transfer encoding +static bool decodeBody(const string& cte, const string& body, string& decoded, + const string** respp) +{ + // By default, there is no encoding (7bit,8bit,raw). Also in case of + // decoding error + *respp = &body; + + if (!stringlowercmp("quoted-printable", cte)) { + if (!qp_decode(body, decoded)) { + LOGERR(("decodeBody: quoted-printable decoding failed !\n")); + return false; + } + *respp = &decoded; + } else if (!stringlowercmp("base64", cte)) { + if (!base64_decode(body, decoded)) { + LOGERR(("decodeBody: base64 decoding failed !. body [%s]\n", + body.c_str())); + return false; + } + *respp = &decoded; + } + return true; +} + bool MimeHandlerMail::processAttach() { - return false; + LOGDEB(("MimeHandlerMail::processAttach() m_idx %d\n", m_idx)); + if (!m_havedoc) + return false; + if (m_idx >= (int)m_attachments.size()) { + m_havedoc = false; + return false; + } + MHMailAttach *att = m_attachments[m_idx]; + + LOGDEB1(("processAttach:content-type: %s\n", att->m_contentType.c_str())); + m_metaData["mimetype"] = att->m_contentType; + m_metaData["charset"] = att->m_charset; + m_metaData["filename"] = att->m_filename; + + m_metaData["content"] = ""; + string& body = m_metaData["content"]; + att->m_part->getBody(body, 0, att->m_part->bodylength); + string decoded; + const string *bdp; + if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) { + return false; + } + if (bdp != &body) + body = decoded; + char nbuf[10]; + sprintf(nbuf, "%d", m_idx); + m_metaData["ipath"] = nbuf; + return true; } // Transform a single message into a document. The subject becomes the @@ -124,7 +204,7 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth)); if (depth++ >= maxdepth) { // Have to stop somewhere - LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n", + LOGINFO(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n", maxdepth)); // Return true anyway, better to index partially than not at all return true; @@ -218,7 +298,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) // Get and parse content-type header Binc::HeaderItem hi; if (!it->h.getFirstHeader("Content-Type", hi)) { - LOGDEB(("No content-type header for part %d\n", i)); + LOGDEB(("walkmime:no ctent-type header for part %d\n", i)); continue; } MimeHeaderValue content_type; @@ -297,30 +377,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) // "Simple" part. LOGDEB2(("walkmime: simple part\n")); - - // If the Content-Disposition is not inline, we treat it as - // attachment, as per rfc2183. We don't process attachments - // for now, except for indexing/displaying the file name - // If it is inline but not text or html, same thing. - if (stringlowercmp("inline", content_disposition.value) || - (stringlowercmp("text/plain", content_type.value) && - stringlowercmp("text/html", content_type.value)) ) { - if (!filename.empty()) { - out += "\n"; - if (m_forPreview) - out += "[" + dispindic + " " + content_type.value + ": "; - out += filename; - if (m_forPreview) - out += "]"; - out += "\n\n"; - } - // m_attachments.push_back(&doc); - // We're done with this part - return; - } - - // We are dealing with an inline part of text/plain or text/html type - // Normally the default charset is us-ascii. But it happens that // 8 bit chars exist in a message that is stated as us-ascii. Ie the // mailer used by yahoo support ('KANA') does this. We could convert @@ -345,34 +401,52 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) cte = hi.getValue(); } + // If the Content-Disposition is not inline, we treat it as + // attachment, as per rfc2183. We don't process attachments + // for now, except for indexing/displaying the file name + // If it is inline but not text or html, same thing. + if (stringlowercmp("inline", content_disposition.value) || + (stringlowercmp("text/plain", content_type.value) && + stringlowercmp("text/html", content_type.value)) ) { + if (!filename.empty()) { + out += "\n"; + if (m_forPreview) + out += "[" + dispindic + " " + content_type.value + ": "; + out += filename; + if (m_forPreview) + out += "]"; + out += "\n\n"; + } + LOGDEB(("walkmime: pushing attchmnt fn [%s]\n", filename.c_str())); + MHMailAttach *att = new MHMailAttach; + if (att == 0) { + LOGERR(("Out of memory\n")); + return; + } + att->m_contentType = content_type.value; + att->m_filename = filename; + att->m_charset = charset; + att->m_contentTransferEncoding = cte; + att->m_part = doc; + m_attachments.push_back(att); + return; + } + + // We are dealing with an inline part of text/plain or text/html type + + LOGDEB2(("walkmime: final: body start offset %d, length %d\n", doc->getBodyStartOffset(), doc->getBodyLength())); string body; doc->getBody(body, 0, doc->bodylength); - // Decode according to content transfer encoding - if (!stringlowercmp("quoted-printable", cte)) { - string decoded; - if (!qp_decode(body, decoded)) { - LOGERR(("walkmime: quoted-printable decoding failed !\n")); - return; - } - body = decoded; - } else if (!stringlowercmp("base64", cte)) { - string decoded; - if (!base64_decode(body, decoded)) { - LOGERR(("walkmime: base64 decoding failed !\n")); -#if 0 - FILE *fp = fopen("/tmp/recoll_decodefail", "w"); - if (fp) { - fprintf(fp, "%s", body.c_str()); - fclose(fp); - } -#endif - return; - } - body = decoded; + string decoded; + const string *bdp; + if (!decodeBody(cte, body, decoded, &bdp)) { + LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n")); } + if (bdp != &body) + body = decoded; // Handle html stripping and transcoding to utf8 string utf8; @@ -390,6 +464,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) out += it->second; } else { // Transcode to utf-8 + LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str())); if (!transcode(body, utf8, charset, "UTF-8")) { LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", charset.c_str())); diff --git a/src/internfile/mh_mail.h b/src/internfile/mh_mail.h index a694a3e4..2e45166b 100644 --- a/src/internfile/mh_mail.h +++ b/src/internfile/mh_mail.h @@ -16,7 +16,7 @@ */ #ifndef _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_ -/* @(#$Id: mh_mail.h,v 1.10 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_mail.h,v 1.11 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -29,6 +29,8 @@ namespace Binc { class MimePart; } +class MHMailAttach; + /** * Translate a mail folder file into internal documents (also works * for maildir files). This has to keep state while parsing a mail folder @@ -40,9 +42,15 @@ class MimeHandlerMail : public RecollFilter { : RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1) {} virtual ~MimeHandlerMail(); - virtual bool set_document_file(const string &file_path); - virtual bool set_document_string(const string &data); + virtual bool set_document_file(const string& file_path); + virtual bool set_document_string(const string& data); + virtual bool is_data_input_ok(DataInput input) const { + if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING) + return true; + return false; + } virtual bool next_document(); + virtual bool skip_to_document(const string& ipath); private: bool processMsg(Binc::MimePart *doc, int depth); @@ -53,7 +61,16 @@ class MimeHandlerMail : public RecollFilter { std::stringstream *m_stream; int m_idx; // starts at -1 for self, then index into // attachments; - vector m_attachments; + vector m_attachments; +}; + +class MHMailAttach { +public: + string m_contentType; + string m_filename; + string m_charset; + string m_contentTransferEncoding; + Binc::MimePart *m_part; }; #endif /* _MAIL_H_INCLUDED_ */ diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h index baa78c12..7c7578a8 100644 --- a/src/internfile/mh_text.h +++ b/src/internfile/mh_text.h @@ -16,7 +16,7 @@ */ #ifndef _MH_TEXT_H_INCLUDED_ #define _MH_TEXT_H_INCLUDED_ -/* @(#$Id: mh_text.h,v 1.3 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_text.h,v 1.4 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */ #include using std::string; @@ -34,6 +34,11 @@ class MimeHandlerText : public RecollFilter { virtual ~MimeHandlerText() {} virtual bool set_document_file(const string &file_path); virtual bool set_document_string(const string&); + virtual bool is_data_input_ok(DataInput input) const { + if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING) + return true; + return false; + } virtual bool next_document(); private: string m_text; diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 340d5a21..95fd4594 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -16,7 +16,7 @@ */ #ifndef _MIMEHANDLER_H_INCLUDED_ #define _MIMEHANDLER_H_INCLUDED_ -/* @(#$Id: mimehandler.h,v 1.13 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mimehandler.h,v 1.14 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -66,8 +66,12 @@ public: return false; } - virtual DataInput get_required_data_input() const - {return DOCUMENT_FILE_NAME;} + virtual bool is_data_input_ok(DataInput input) const { + if (input == DOCUMENT_FILE_NAME) + return true; + return false; + } + virtual string get_error() const { return m_reason; } diff --git a/src/qtgui/main.cpp b/src/qtgui/main.cpp index cdf97813..be584674 100644 --- a/src/qtgui/main.cpp +++ b/src/qtgui/main.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: main.cpp,v 1.56 2006-12-05 15:23:50 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: main.cpp,v 1.57 2006-12-16 15:39:54 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -281,6 +281,7 @@ int main(int argc, char **argv) // Connect exit handlers etc.. app.connect(&app, SIGNAL(lastWindowClosed()), &app, SLOT(quit())); + app.connect(&app, SIGNAL(aboutToQuit()), mainWindow, SLOT(close())); QTimer *timer = new QTimer(&app); mainWindow->connect(timer, SIGNAL(timeout()), mainWindow, SLOT(periodic100())); diff --git a/src/qtgui/rclmain_w.cpp b/src/qtgui/rclmain_w.cpp index 25c4d8a7..6dc5c057 100644 --- a/src/qtgui/rclmain_w.cpp +++ b/src/qtgui/rclmain_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.14 2006-12-14 13:53:43 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.15 2006-12-16 15:39:54 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -66,6 +66,7 @@ using std::pair; #include "refcntr.h" #include "ssearch_w.h" #include "execmd.h" +#include "internfile.h" #include "rclmain_w.h" #include "moc_rclmain_w.cpp" @@ -161,15 +162,6 @@ void RclMain::init() #endif } -// We also want to get rid of the advanced search form and previews -// when we exit (not our children so that it's not systematically -// created over the main form). -bool RclMain::close(bool) -{ - LOGDEB(("RclMain::close\n")); - fileExit(); - return false; -} //#define SHOWEVENTS #if defined(SHOWEVENTS) @@ -257,9 +249,20 @@ static const char *eventTypeToStr(int tp) } #endif +// We also want to get rid of the advanced search form and previews +// when we exit (not our children so that it's not systematically +// created over the main form). +bool RclMain::close() +{ + LOGDEB(("RclMain::close\n")); + fileExit(); + return false; +} + void RclMain::fileExit() { - LOGDEB1(("RclMain: fileExit\n")); + LOGDEB(("RclMain: fileExit\n")); + m_tempfiles.clear(); prefs.mainwidth = width(); prefs.mainheight = height(); prefs.ssearchTyp = sSearch->searchTypCMB->currentItem(); @@ -686,15 +689,38 @@ void RclMain::startNativeViewer(int docnum) } } - string fn = urltolocalpath(doc.url); - string url = url_encode(doc.url, 7); - string ipath = doc.ipath; - // Substitute %u (url) and %f (file name) inside prototype command + // For files with an ipath, we do things differently depending if the + // configured command seems to be able to grok it or not. + bool wantsipath = cmd.find("%i") != string::npos; + bool istempfile = false; + string fn, url; + if (doc.ipath.empty() || wantsipath) { + fn = urltolocalpath(doc.url); + url = url_encode(doc.url, 7); + } else { + // There is an ipath and the command does not know about + // them. We need a temp file. + TempFile temp; + if (!FileInterner::idocTempFile(temp, rclconfig, + urltolocalpath(doc.url), + doc.ipath, doc.mimetype)) { + QMessageBox::warning(0, "Recoll", + tr("Cannot extract document or create " + "temporary file")); + return; + } + istempfile = true; + m_tempfiles.push_back(temp); + fn = temp->filename(); + url = string("file://") + fn; + } + + // Substitute %xx inside prototype command string ncmd; map subs; subs['u'] = escapeShell(url); subs['f'] = escapeShell(fn); - subs['i'] = escapeShell(ipath); + subs['i'] = escapeShell(doc.ipath); pcSubst(cmd, ncmd, subs); ncmd += " &"; @@ -707,7 +733,10 @@ void RclMain::startNativeViewer(int docnum) QString::fromUtf8(prcmd.c_str()) + "]"; stb->message(msg, 5000); } - g_dynconf->enterDoc(fn, doc.ipath); + if (!istempfile) + g_dynconf->enterDoc(fn, doc.ipath); + // We should actually monitor these processes so that we can + // delete the temp files when they exit system(ncmd.c_str()); } diff --git a/src/qtgui/rclmain_w.h b/src/qtgui/rclmain_w.h index a574eb21..ee03281c 100644 --- a/src/qtgui/rclmain_w.h +++ b/src/qtgui/rclmain_w.h @@ -29,6 +29,7 @@ #include "searchdata.h" #include "spell_w.h" #include "refcntr.h" +#include "pathut.h" #if QT_VERSION < 0x040000 #include "rclmain.h" @@ -64,9 +65,9 @@ public: } ~RclMain() {} - virtual bool close( bool ); public slots: + virtual bool close(); virtual void fileExit(); virtual void periodic100(); virtual void startIndexing(); @@ -103,7 +104,8 @@ private: RefCntr m_searchData; DocSeqSortSpec m_sortspecs; RefCntr m_docSource; - + + vector m_tempfiles; // Serial number of current search for this process. // Used to match to preview windows int m_searchId;