diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 92038618..35f760b3 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.46 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.47 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -426,7 +426,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, Rcl::Doc fileDoc; fileDoc.fmtime = doc.fmtime; fileDoc.utf8fn = doc.utf8fn; - fileDoc.mimetype = doc.mimetype; + fileDoc.mimetype = interner.get_mimetype(); if (!m_db.add(fn, fileDoc, stp)) return FsTreeWalker::FtwError; } diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h index 4bc30602..04c9c910 100644 --- a/src/internfile/Filter.h +++ b/src/internfile/Filter.h @@ -51,7 +51,7 @@ namespace Dijon { public: /// Builds an empty filter. - Filter(const std::string &mime_type) {} + Filter(const std::string & /*mime_type */) {} /// Destroys the filter. 
virtual ~Filter() {} diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 82381c78..237de5d2 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.20 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -149,6 +149,7 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, } // Look for appropriate handler (might still return empty) + m_mimetype = l_mime; Dijon::Filter *df = getMimeHandler(l_mime, m_cfg); if (!df) { @@ -172,6 +173,66 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, m_fn.c_str())); } +FileInterner::~FileInterner() +{ + while (!m_handlers.empty()) { + delete m_handlers.back(); + m_handlers.pop_back(); + } + tmpcleanup(); +} + +static const string string_empty; +static const string get_mimetype(Dijon::Filter* df) +{ + const std::map *docdata = &df->get_meta_data(); + map::const_iterator it; + it = docdata->find("mimetype"); + if (it != docdata->end()) { + return it->second; + } else { + return string_empty; + } +} + +bool FileInterner::dijontorcl(Rcl::Doc& doc) +{ + Dijon::Filter *df = m_handlers.back(); + const std::map *docdata = &df->get_meta_data(); + map::const_iterator it; + + it = docdata->find("origcharset"); + if (it != docdata->end()) + doc.origcharset = it->second; + + it = docdata->find("content"); + if (it != docdata->end()) + doc.text = it->second; + + it = docdata->find("title"); + if (it != docdata->end()) + doc.title = it->second; + + it = docdata->find("keywords"); + if (it != docdata->end()) + doc.keywords = it->second; + + it = docdata->find("modificationdate"); + if (it != docdata->end()) + doc.dmtime = it->second; + + it = docdata->find("abstract"); + if (it != docdata->end()) { 
+ doc.abstract = it->second; + } else { + it = docdata->find("sample"); + if (it != docdata->end()) + doc.abstract = it->second; + } + return true; +} + + static const unsigned int MAXHANDLERS = 20; FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) @@ -182,8 +243,11 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) return FIError; } + // Ipath vector. // Note that the vector is big enough for the maximum stack. All values // over the last significant one are "" + // We set the ipath for the first handler here, others are set + // when they're pushed on the stack vector vipath(MAXHANDLERS); int vipathidx = 0; if (!ipath.empty()) { @@ -196,12 +260,8 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) } } - /* Try to get doc from the topmost filter */ while (!m_handlers.empty()) { - if (!vipath.empty()) { - - } if (!m_handlers.back()->has_documents()) { // No docs at the current top level. Pop and see if there // is something at the previous one @@ -277,23 +337,42 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) LOGERR(("FileInterner::internfile: stack empty\n")); return FIError; } + + // If indexing, we have to collect the ipath stack. 
+ + // While we're at it, we also set the mimetype, which is a special + // property: we want to get it from the topmost doc + // with an ipath, not the last one which is always text/html + // Note that ipath is returned through the parameter not doc.ipath if (!m_forPreview) { - string &ipath = doc.ipath; bool hasipath = false; - for (vector::const_iterator it = m_handlers.begin(); - it != m_handlers.end(); it++) { - map::const_iterator iti = - (*it)->get_meta_data().find("ipath"); - if (iti != (*it)->get_meta_data().end()) { - if (!iti->second.empty()) + doc.mimetype = m_mimetype; + LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str())); + map::const_iterator titi; + + for (vector::const_iterator hit = m_handlers.begin(); + hit != m_handlers.end(); hit++) { + + const map& docdata = (*hit)->get_meta_data(); + map::const_iterator iti = docdata.find("ipath"); + + if (iti != docdata.end()) { + if (!iti->second.empty()) { + // We have a non-empty ipath hasipath = true; + titi = docdata.find("mimetype"); + if (titi != docdata.end()) + doc.mimetype = titi->second; + } ipath += iti->second + "|"; } else { ipath += "|"; } } + + // Walk done, transform the list into a string if (hasipath) { - LOGDEB(("IPATH [%s]\n", ipath.c_str())); + LOGDEB2(("IPATH [%s]\n", ipath.c_str())); string::size_type sit = ipath.find_last_not_of("|"); if (sit == string::npos) ipath.erase(); @@ -304,7 +383,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) } } - dijontorcl(m_handlers.back(), doc); + dijontorcl(doc); // Destack what can be while (!m_handlers.empty() && !m_handlers.back()->has_documents()) { @@ -317,56 +396,6 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) return FIAgain; } - -bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc) -{ - const std::map *docdata = &df->get_meta_data(); - map::const_iterator it; - - it = docdata->find("mimetype"); - if (it != docdata->end()) - doc.mimetype = it->second; - - it = 
docdata->find("origcharset"); - if (it != docdata->end()) - doc.origcharset = it->second; - - it = docdata->find("content"); - if (it != docdata->end()) - doc.text = it->second; - - it = docdata->find("title"); - if (it != docdata->end()) - doc.title = it->second; - - it = docdata->find("keywords"); - if (it != docdata->end()) - doc.keywords = it->second; - - it = docdata->find("modificationdate"); - if (it != docdata->end()) - doc.dmtime = it->second; - - it = docdata->find("abstract"); - if (it != docdata->end()) { - doc.abstract = it->second; - } else { - it = docdata->find("sample"); - if (it != docdata->end()) - doc.abstract = it->second; - } - return true; -} - -FileInterner::~FileInterner() -{ - while (!m_handlers.empty()) { - delete m_handlers.back(); - m_handlers.pop_back(); - } - tmpcleanup(); -} - #else #include diff --git a/src/internfile/internfile.h b/src/internfile/internfile.h index 0eac6360..8515e40c 100644 --- a/src/internfile/internfile.h +++ b/src/internfile/internfile.h @@ -16,7 +16,7 @@ */ #ifndef _INTERNFILE_H_INCLUDED_ #define _INTERNFILE_H_INCLUDED_ -/* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: internfile.h,v 1.8 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -70,10 +70,12 @@ class FileInterner { * should be called again to get the following one(s). 
*/ Status internfile(Rcl::Doc& doc, string &ipath); + const string& get_mimetype() {return m_mimetype;} private: RclConfig *m_cfg; string m_fn; + string m_mimetype; // Mime type for [uncompressed] file bool m_forPreview; // m_tdir and m_tfile are used only for decompressing input file if needed const string& m_tdir; @@ -81,7 +83,7 @@ class FileInterner { vector m_handlers; void tmpcleanup(); - static bool dijontorcl(Dijon::Filter *, Rcl::Doc&); + bool dijontorcl(Rcl::Doc&); }; #endif /* _INTERNFILE_H_INCLUDED_ */ diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 8479ceca..bba522f6 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -64,8 +64,8 @@ bool MimeHandlerHtml::next_document() if (m_havedoc == false) return false; m_havedoc = false; - LOGDEB(("textHtmlToDoc: next_document\n")); string charset = m_defcharset; + LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",charset.c_str())); // - We first try to convert from the default configured charset // (which may depend of the current directory) to utf-8. If this @@ -76,10 +76,11 @@ bool MimeHandlerHtml::next_document() LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str())); - MyHtmlParser p(m_metaData["content"]); + MyHtmlParser result; for (int pass = 0; pass < 2; pass++) { string transcoded; LOGDEB(("Html::mkDoc: pass %d\n", pass)); + MyHtmlParser p; // Try transcoding. If it fails, use original text. if (!transcode(m_html, transcoded, charset, "UTF-8")) { LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n", @@ -97,16 +98,18 @@ bool MimeHandlerHtml::next_document() try { p.parse_html(transcoded); // No exception: ok? 
+ result = p; break; } catch (bool diag) { + result = p; if (diag == true) break; LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", - charset.c_str(), p.doccharset.c_str())); - if (!p.doccharset.empty() && - !samecharset(p.doccharset, p.ocharset)) { + charset.c_str(),result.doccharset.c_str())); + if (!result.doccharset.empty() && + !samecharset(result.doccharset, result.ocharset)) { LOGDEB(("textHtmlToDoc: reparse for charsets\n")); - charset = p.doccharset; + charset = result.doccharset; } else { LOGERR(("textHtmlToDoc:: error: non charset exception\n")); return false; @@ -115,11 +118,12 @@ bool MimeHandlerHtml::next_document() } m_metaData["origcharset"] = m_defcharset; + m_metaData["content"] = result.dump; m_metaData["charset"] = "utf-8"; - m_metaData["title"] = p.title; - m_metaData["keywords"] = p.keywords; - m_metaData["modificationdate"] = p.dmtime; - m_metaData["sample"] = p.sample; + m_metaData["title"] = result.title; + m_metaData["keywords"] = result.keywords; + m_metaData["modificationdate"] = result.dmtime; + m_metaData["sample"] = result.sample; m_metaData["mimetype"] = "text/plain"; return true; } diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 41611740..f11b23ba 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.25 2006-12-15 16:33:15 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -94,9 +94,22 @@ bool MimeHandlerMail::next_document() { if (!m_havedoc) return false; - m_havedoc = false; - m_metaData["mimetype"] = "text/plain"; - return processMsg(m_bincdoc, 0); + bool res = false; + + if (m_idx == -1) { + m_metaData["mimetype"] = "text/plain"; + res =processMsg(m_bincdoc, 0); + } else { + res = processAttach(); + } + m_idx++; + m_havedoc = 
m_idx < (int)m_attachments.size(); + return res; +} + +bool MimeHandlerMail::processAttach() +{ + return false; } // Transform a single message into a document. The subject becomes the @@ -301,6 +314,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) out += "]"; out += "\n\n"; } + // m_attachments.push_back(&doc); // We're done with this part return; } @@ -373,19 +387,18 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) map::const_iterator it = mh.get_meta_data().find("content"); if (it != mh.get_meta_data().end()) - putf8 = &it->second; + out += it->second; } else { // Transcode to utf-8 if (!transcode(body, utf8, charset, "UTF-8")) { LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", charset.c_str())); - putf8 = &body; + out += body; } else { - putf8 = &utf8; + out += utf8; } } - if (putf8) - out += *putf8; + if (out.length() && out[out.length()-1] != '\n') out += '\n'; diff --git a/src/internfile/mh_mail.h b/src/internfile/mh_mail.h index 94abb303..a694a3e4 100644 --- a/src/internfile/mh_mail.h +++ b/src/internfile/mh_mail.h @@ -16,9 +16,12 @@ */ #ifndef _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_ -/* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_mail.h,v 1.10 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#include +using std::vector; + #include "mimehandler.h" namespace Binc { @@ -34,18 +37,23 @@ namespace Binc { class MimeHandlerMail : public RecollFilter { public: MimeHandlerMail(const string &mt) - : RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0) + : RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1) {} virtual ~MimeHandlerMail(); virtual bool set_document_file(const string &file_path); virtual bool set_document_string(const string &data); virtual bool next_document(); + private: - Binc::MimeDocument *m_bincdoc; bool processMsg(Binc::MimePart *doc, int depth); void walkmime(Binc::MimePart* doc, int depth); - int m_fd; - 
std::stringstream *m_stream; + bool processAttach(); + Binc::MimeDocument *m_bincdoc; + int m_fd; + std::stringstream *m_stream; + int m_idx; // starts at -1 for self, then index into + // attachments; + vector m_attachments; }; #endif /* _MAIL_H_INCLUDED_ */ diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index ec4b8c94..3c855d68 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -37,13 +37,11 @@ class MyHtmlParser : public HtmlParser { bool in_body_tag; bool in_pre_tag; bool pending_space; - bool indexing_allowed; - string title, sample, keywords, dmtime; - string localdump; - string &dump; + string title, sample, keywords, dump, dmtime; string ocharset; // This is the charset our user thinks the doc was string charset; // This is the charset it was supposedly converted to string doccharset; // Set this to value of charset parameter in header + bool indexing_allowed; void process_text(const string &text); void opening_tag(const string &tag, const map &p); void closing_tag(const string &tag); @@ -54,16 +52,5 @@ class MyHtmlParser : public HtmlParser { in_body_tag(false), in_pre_tag(false), pending_space(false), - indexing_allowed(true), - dump(localdump) - { } - MyHtmlParser(string& buf) : - in_script_tag(false), - in_style_tag(false), - in_body_tag(false), - in_pre_tag(false), - pending_space(false), - indexing_allowed(true), - dump(buf) - { } + indexing_allowed(true) { } };