Dijon filters, 1st step: mostly working; needs checking and optimization

This commit is contained in:
dockes 2006-12-15 12:40:24 +00:00
parent 1973c06346
commit 33c95ef1ba
22 changed files with 979 additions and 480 deletions

164
src/internfile/Filter.h Normal file
View File

@ -0,0 +1,164 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _DIJON_FILTER_H
#define _DIJON_FILTER_H
#include <string>
#include <set>
#include <map>
namespace Dijon
{
    class Filter;

    /** Provides the list of MIME types supported by the filter(s).
     * The types are handed back through the std::set<std::string>
     * reference parameter.
     * NOTE(review): the bool return value presumably signals success;
     * confirm against the implementations.
     * This function is exported by dynamically loaded filter libraries.
     */
    typedef bool (get_filter_types_func)(std::set<std::string> &);

    /** Returns what data should be passed to the filter(s).
     * Output is cast from Filter::DataInput to int for convenience.
     * This function is exported by dynamically loaded filter libraries.
     * The aim is to let the client application know beforehand whether
     * it should load documents or not.
     */
    typedef int (get_filter_data_input_func)(void);

    /** Returns a Filter that handles the given MIME type.
     * The Filter object is allocated with new.
     * This function is exported by dynamically loaded filter libraries
     * and serves as a factory for Filter objects, so that the client
     * application doesn't have to know which Filter sub-types handle
     * which MIME types.
     */
    typedef Filter *(get_filter_func)(const std::string &);

    /// Filter interface.
    class Filter
    {
    public:
        /** Builds an empty filter.
         * The base class does not store mime_type; the parameter is
         * there for the benefit of derived classes.
         */
        Filter(const std::string &mime_type) {}
        /// Destroys the filter.
        virtual ~Filter() {}

        // Enumerations.

        /** What data a filter supports as input.
         * It can be either the whole document data, its file name, or its URI.
         */
        typedef enum { DOCUMENT_DATA=0, DOCUMENT_FILE_NAME, DOCUMENT_URI } DataInput;

        /** Input properties supported by the filter.
         * - DEFAULT_CHARSET is the charset preferred by the client application.
         *   The filter will convert document's content to this charset if possible.
         * - OPERATING_MODE can be set to either view or index.
         */
        typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE } Properties;

        // Information.

        /// Returns what data the filter requires as input.
        virtual DataInput get_required_data_input(void) const = 0;

        // Initialization.

        /** Sets a property, prior to calling set_document_XXX().
         * Returns false if the property is not supported.
         */
        virtual bool set_property(Properties prop_name, const std::string &prop_value) = 0;

        /** (Re)initializes the filter with the given data.
         * Caller should ensure the given pointer is valid until the
         * Filter object is destroyed, as some filters may not need to
         * do a deep copy of the data.
         * Returns false if this input is not supported or an error occurred.
         */
        virtual bool set_document_data(const char *data_ptr, unsigned int data_length) = 0;

        /** (Re)initializes the filter with document data held in a string.
         * Counterpart of set_document_data() for callers that already have
         * the content as a std::string.
         * NOTE(review): presumably follows the same return convention as
         * set_document_data(); confirm against the implementations.
         */
        virtual bool set_document_string(const std::string &) = 0;

        /** (Re)initializes the filter with the given file.
         * Returns false if this input is not supported or an error occurred.
         */
        virtual bool set_document_file(const std::string &file_path) = 0;

        /** (Re)initializes the filter with the given URI.
         * Returns false if this input is not supported or an error occurred.
         */
        virtual bool set_document_uri(const std::string &uri) = 0;

        // Going from one nested document to the next.

        /** Returns true if there are nested documents left to extract.
         * Returns false if the end of the parent document was reached
         * or an error occurred.
         */
        virtual bool has_documents(void) const = 0;

        /** Moves to the next nested document.
         * Returns false if there are none left.
         */
        virtual bool next_document(void) = 0;

        /** Skips to the nested document with the given ipath.
         * Returns false if no such document exists.
         */
        virtual bool skip_to_document(const std::string &ipath) = 0;

        // Accessing documents' contents.

        /// Returns the message for the most recent error that has occurred.
        virtual std::string get_error(void) const = 0;

        /** Returns a dictionary of metadata extracted from the current document.
         * Metadata fields may include one or more of the following :
         * content, title, ipath, mimetype, language, charset, author, creator,
         * publisher, modificationdate, creationdate, size
         * Special considerations apply :
         * - content may contain binary data, watch out !
         * - ipath is an internal path to the nested document that can be
         * later passed to skip_to_document(). It may be empty if the parent
         * document's type doesn't allow embedding, in which case the filter
         * should only return one document.
         * - mimetype should be text/plain if the document could be handled
         * internally, empty if unknown. If any other value, it is expected
         * that the client application can pass the nested document's content
         * to another filter that supports this particular type.
         */
        const std::map<std::string, std::string> &get_meta_data(void) const
        {
            return m_metaData;
        }

    protected:
        /// Metadata dictionary.
        std::map<std::string, std::string> m_metaData;

    private:
        /// Filter objects cannot be copied (declared, intentionally not defined).
        Filter(const Filter &other);
        /// Filter objects cannot be copied (declared, intentionally not defined).
        Filter& operator=(const Filter& other);
    };
}
#endif // _DIJON_FILTER_H

View File

@ -1,9 +1,9 @@
# @(#$Id: Makefile,v 1.1 2006-11-15 07:27:42 dockes Exp $ (C) 2005 J.F.Dockes # @(#$Id: Makefile,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes
depth = .. depth = ..
include $(depth)/mk/sysconf include $(depth)/mk/sysconf
# Only test executables get build in here # Only test executables get build in here
PROGS = internfile unacpp textsplit rclconfig PROGS = internfile
all: $(BIGLIB) $(PROGS) all: $(BIGLIB) $(PROGS)

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.18 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -32,12 +32,14 @@ using namespace std;
#endif /* NO_NAMESPACES */ #endif /* NO_NAMESPACES */
#include "internfile.h" #include "internfile.h"
#include "rcldoc.h"
#include "mimetype.h" #include "mimetype.h"
#include "debuglog.h" #include "debuglog.h"
#include "mimehandler.h" #include "mimehandler.h"
#include "execmd.h" #include "execmd.h"
#include "pathut.h" #include "pathut.h"
#include "wipedir.h" #include "wipedir.h"
#include "rclconfig.h"
// Execute the command to uncompress a file into a temporary one. // Execute the command to uncompress a file into a temporary one.
static bool uncompressfile(RclConfig *conf, const string& ifn, static bool uncompressfile(RclConfig *conf, const string& ifn,
@ -106,98 +108,262 @@ void FileInterner::tmpcleanup()
// internfile // internfile
FileInterner::FileInterner(const std::string &f, RclConfig *cnf, FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
const string& td, const string *imime) const string& td, const string *imime)
: m_fn(f), m_cfg(cnf), m_tdir(td), m_handler(0) : m_cfg(cnf), m_fn(f), m_forPreview(imime?true:false), m_tdir(td)
{ {
// We are actually going to access the file, so it's ok bool usfci = false;
// performancewise to check this config variable at every call cnf->getConfParam("usesystemfilecommand", &usfci);
// even if it can only change when we change directories
string usfc;
int usfci;
if (!cnf->getConfParam("usesystemfilecommand", usfc))
usfci = 0;
else
usfci = atoi(usfc.c_str()) ? 1 : 0;
LOGDEB1(("FileInterner::FileInterner: usfci now %d\n", usfci)); LOGDEB1(("FileInterner::FileInterner: usfci now %d\n", usfci));
bool forPreview = imime ? true : false;
// We need to run mime type identification in any case to check // We need to run mime type identification in any case to check
// for a compressed file. // for a compressed file.
m_mime = mimetype(m_fn, m_cfg, usfci); string l_mime = mimetype(m_fn, m_cfg, usfci);
// If identification fails, try to use the input parameter. This // If identification fails, try to use the input parameter. This
// is then normally not a compressed type (it's the mime type from // is then normally not a compressed type (it's the mime type from
// the db), and is only set when previewing, not for indexing // the db), and is only set when previewing, not for indexing
if (m_mime.empty() && imime) if (l_mime.empty() && imime)
m_mime = *imime; l_mime = *imime;
if (!m_mime.empty()) { if (!l_mime.empty()) {
// Has mime: check for a compressed file. If so, create a // Has mime: check for a compressed file. If so, create a
// temporary uncompressed file, and rerun the mime type // temporary uncompressed file, and rerun the mime type
// identification, then do the rest with the temp file. // identification, then do the rest with the temp file.
list<string>ucmd; list<string>ucmd;
if (m_cfg->getUncompressor(m_mime, ucmd)) { if (m_cfg->getUncompressor(l_mime, ucmd)) {
if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) { if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) {
return; return;
} }
LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n", LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n",
m_tdir.c_str(), m_tfile.c_str())); m_tdir.c_str(), m_tfile.c_str()));
m_fn = m_tfile; m_fn = m_tfile;
m_mime = mimetype(m_fn, m_cfg, usfci); l_mime = mimetype(m_fn, m_cfg, usfci);
if (m_mime.empty() && imime) if (l_mime.empty() && imime)
m_mime = *imime; l_mime = *imime;
} }
} }
if (m_mime.empty()) { if (l_mime.empty()) {
// No mime type. We let it through as config may warrant that // No mime type. We let it through as config may warrant that
// we index all file names // we index all file names
LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str())); LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str()));
} }
// Look for appropriate handler (might still return empty) // Look for appropriate handler (might still return empty)
m_handler = getMimeHandler(m_mime, m_cfg); Dijon::Filter *df = getMimeHandler(l_mime, m_cfg);
if (!m_handler) { if (!df) {
// No handler for this type, for now :( if indexallfilenames // No handler for this type, for now :( if indexallfilenames
// is set in the config, this normally wont happen (we get mh_unknown) // is set in the config, this normally wont happen (we get mh_unknown)
LOGDEB(("FileInterner::FileInterner: %s: no handler\n", LOGDEB(("FileInterner:: no handler for %s\n", l_mime.c_str()));
m_mime.c_str()));
return; return;
} }
m_handler->setForPreview(forPreview); df->set_property(Dijon::Filter::OPERATING_MODE,
LOGDEB(("FileInterner::FileInterner: %s [%s]\n", m_mime.c_str(), m_forPreview ? "view" : "index");
string charset = m_cfg->getDefCharset();
df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
if (!df->set_document_file(m_fn)) {
LOGERR(("FileInterner:: error parsing %s\n", m_fn.c_str()));
return;
}
m_handlers.reserve(20);
m_handlers.push_back(df);
LOGDEB(("FileInterner::FileInterner: %s [%s]\n", l_mime.c_str(),
m_fn.c_str())); m_fn.c_str()));
} }
static const unsigned int MAXHANDLERS = 20;
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
{ {
if (!m_handler) { if (m_handlers.size() != 1) {
LOGERR(("FileInterner::internfile: no handler !!\n")); LOGERR(("FileInterner::internfile: bad stack size %d !!\n",
m_handlers.size()));
return FIError; return FIError;
} }
// Turn file into a document. The document has fields for title, body // Note that the vector is big enough for the maximum stack. All values
// etc., all text converted to utf8 // over the last significant one are ""
MimeHandler::Status mhs = vector<string> vipath(MAXHANDLERS);
m_handler->mkDoc(m_cfg, m_fn, m_mime, doc, ipath); int vipathidx = 0;
FileInterner::Status ret = FIError; if (!ipath.empty()) {
switch (mhs) { list<string> lipath;
case MimeHandler::MHError: stringToTokens(ipath, lipath, "|", true);
LOGERR(("FileInterner::internfile: error parsing %s\n", m_fn.c_str())); vipath.insert(vipath.begin(), lipath.begin(), lipath.end());
break; if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
case MimeHandler::MHDone: ret = FIDone;break; LOGERR(("FileInterner::internfile: can't skip\n"));
case MimeHandler::MHAgain: ret = FIAgain;break; return FIError;
}
} }
doc.mimetype = m_mime;
return ret; /* Try to get doc from the topmost filter */
while (!m_handlers.empty()) {
if (!vipath.empty()) {
}
if (!m_handlers.back()->has_documents()) {
// No docs at the current top level. Pop and see if there
// is something at the previous one
delete m_handlers.back();
m_handlers.pop_back();
continue;
}
if (!m_handlers.back()->next_document()) {
LOGERR(("FileInterner::internfile: next_document failed\n"));
return FIError;
}
// Look at what we've got
const std::map<std::string, std::string> *docdata =
&m_handlers.back()->get_meta_data();
map<string,string>::const_iterator it;
string charset;
it = docdata->find("charset");
if (it != docdata->end())
charset = it->second;
string mimetype;
it = docdata->find("mimetype");
if (it != docdata->end())
mimetype = it->second;
LOGDEB(("FileInterner::internfile:next_doc is %s\n",mimetype.c_str()));
// If we find a text/plain doc, we're done
if (!strcmp(mimetype.c_str(), "text/plain"))
break;
// Got a non text/plain doc. We need to stack another
// filter. Check current size
if (m_handlers.size() > MAXHANDLERS) {
// Stack too big. Skip this and go on to check if there is
// something else in the current back()
LOGDEB(("FileInterner::internfile: stack too high\n"));
continue;
}
Dijon::Filter *again = getMimeHandler(mimetype, m_cfg);
if (!again) {
// If we can't find a filter, this doc can't be handled
// but there can be other ones so we go on
LOGERR(("FileInterner::internfile: no filter for [%s]\n",
mimetype.c_str()));
continue;
}
again->set_property(Dijon::Filter::OPERATING_MODE,
m_forPreview ? "view" : "index");
again->set_property(Dijon::Filter::DEFAULT_CHARSET,
charset);
string ns;
const string *txt = &ns;
it = docdata->find("content");
if (it != docdata->end())
txt = &it->second;
if (!again->set_document_string(*txt)) {
LOGERR(("FileInterner::internfile: error reparsing for %s\n",
m_fn.c_str()));
delete again;
continue;
}
// add filter and go on
m_handlers.push_back(again);
if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
LOGERR(("FileInterner::internfile: can't skip\n"));
return FIError;
}
}
if (m_handlers.empty()) {
LOGERR(("FileInterner::internfile: stack empty\n"));
return FIError;
}
if (!m_forPreview) {
string &ipath = doc.ipath;
bool hasipath = false;
for (vector<Dijon::Filter*>::const_iterator it = m_handlers.begin();
it != m_handlers.end(); it++) {
map<string,string>::const_iterator iti =
(*it)->get_meta_data().find("ipath");
if (iti != (*it)->get_meta_data().end()) {
if (!iti->second.empty())
hasipath = true;
ipath += iti->second + "|";
} else {
ipath += "|";
}
}
if (hasipath) {
LOGDEB(("IPATH [%s]\n", ipath.c_str()));
string::size_type sit = ipath.find_last_not_of("|");
if (sit == string::npos)
ipath.erase();
else if (sit < ipath.length() -1)
ipath.erase(sit+1);
} else {
ipath.erase();
}
}
dijontorcl(m_handlers.back(), doc);
// Destack what can be
while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
delete m_handlers.back();
m_handlers.pop_back();
}
if (m_handlers.empty() || !m_handlers.back()->has_documents())
return FIDone;
else
return FIAgain;
}
bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc)
{
const std::map<std::string, std::string> *docdata = &df->get_meta_data();
map<string,string>::const_iterator it;
it = docdata->find("mimetype");
if (it != docdata->end())
doc.mimetype = it->second;
it = docdata->find("origcharset");
if (it != docdata->end())
doc.origcharset = it->second;
it = docdata->find("content");
if (it != docdata->end())
doc.text = it->second;
it = docdata->find("title");
if (it != docdata->end())
doc.title = it->second;
it = docdata->find("keywords");
if (it != docdata->end())
doc.keywords = it->second;
it = docdata->find("modificationdate");
if (it != docdata->end())
doc.dmtime = it->second;
it = docdata->find("abstract");
if (it != docdata->end()) {
doc.abstract = it->second;
} else {
it = docdata->find("sample");
if (it != docdata->end())
doc.abstract = it->second;
}
return true;
} }
FileInterner::~FileInterner() FileInterner::~FileInterner()
{ {
delete m_handler; while (!m_handlers.empty()) {
m_handler = 0; delete m_handlers.back();
m_handlers.pop_back();
}
tmpcleanup(); tmpcleanup();
} }
@ -212,6 +378,8 @@ using namespace std;
#include "debuglog.h" #include "debuglog.h"
#include "rclinit.h" #include "rclinit.h"
#include "internfile.h" #include "internfile.h"
#include "rclconfig.h"
#include "rcldoc.h"
static string thisprog; static string thisprog;

View File

@ -16,14 +16,19 @@
*/ */
#ifndef _INTERNFILE_H_INCLUDED_ #ifndef _INTERNFILE_H_INCLUDED_
#define _INTERNFILE_H_INCLUDED_ #define _INTERNFILE_H_INCLUDED_
/* @(#$Id: internfile.h,v 1.6 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <vector>
using std::string;
using std::vector;
#include "rclconfig.h" #include "Filter.h"
#include "rcldb.h"
class MimeHandler; class RclConfig;
namespace Rcl {
class Doc;
}
/// Turn external file into internal representation, according to mime /// Turn external file into internal representation, according to mime
/// type etc /// type etc
@ -43,8 +48,8 @@ class FileInterner {
* mime type for the uncompressed version. This currently doubles up * mime type for the uncompressed version. This currently doubles up
* to indicate that this object is for previewing (not indexing). * to indicate that this object is for previewing (not indexing).
*/ */
FileInterner(const std::string &fn, RclConfig *cnf, const string& td, FileInterner(const string &fn, RclConfig *cnf, const string& td,
const std::string *mtype = 0); const string *mtype = 0);
~FileInterner(); ~FileInterner();
@ -67,15 +72,16 @@ class FileInterner {
Status internfile(Rcl::Doc& doc, string &ipath); Status internfile(Rcl::Doc& doc, string &ipath);
private: private:
string m_fn; RclConfig *m_cfg;
RclConfig *m_cfg; string m_fn;
const string &m_tdir; bool m_forPreview;
MimeHandler *m_handler; // m_tdir and m_tfile are used only for decompressing input file if needed
const string& m_tdir;
string m_tfile; string m_tfile;
string m_mime; vector<Dijon::Filter*> m_handlers;
void tmpcleanup(); void tmpcleanup();
static bool dijontorcl(Dijon::Filter *, Rcl::Doc&);
}; };
#endif /* _INTERNFILE_H_INCLUDED_ */ #endif /* _INTERNFILE_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.7 2006-12-13 09:13:18 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -37,15 +37,15 @@ public:
// Execute an external program to translate a file from its native format // Execute an external program to translate a file from its native format
// to html. Then call the html parser to do the actual indexing // to html. Then call the html parser to do the actual indexing
MimeHandler::Status bool MimeHandlerExec::next_document()
MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&)
{ {
if (m_havedoc == false)
return false;
m_havedoc = false;
if (params.empty()) { if (params.empty()) {
// Hu ho // Hu ho
LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n", LOGERR(("MimeHandlerExec::mkDoc: empty params\n"));
mtype.c_str())); return false;
return MimeHandler::MHError;
} }
// Command name // Command name
@ -54,10 +54,10 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
// Build parameter list: delete cmd name and add the file name // Build parameter list: delete cmd name and add the file name
list<string>::iterator it = params.begin(); list<string>::iterator it = params.begin();
list<string>myparams(++it, params.end()); list<string>myparams(++it, params.end());
myparams.push_back(fn); myparams.push_back(m_fn);
// Execute command and store the result text, which is supposedly html // Execute command and store the result text, which is supposedly html
string html; string& html = m_metaData["content"];
ExecCmd mexec; ExecCmd mexec;
MEAdv adv; MEAdv adv;
mexec.setAdvise(&adv); mexec.setAdvise(&adv);
@ -67,10 +67,12 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
if (status) { if (status) {
LOGERR(("MimeHandlerExec: command status 0x%x: %s\n", LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
status, cmd.c_str())); status, cmd.c_str()));
return MimeHandler::MHError; return false;
} }
// Process/index the html m_metaData["origcharset"] = m_defcharset;
MimeHandlerHtml hh; // All recoll filters output utf-8
return hh.mkDoc(conf, fn, html, mtype, docout); m_metaData["charset"] = "utf-8";
m_metaData["mimetype"] = "text/html";
return true;
} }

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _MH_EXEC_H_INCLUDED_ #ifndef _MH_EXEC_H_INCLUDED_
#define _MH_EXEC_H_INCLUDED_ #define _MH_EXEC_H_INCLUDED_
/* @(#$Id: mh_exec.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_exec.h,v 1.3 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -29,14 +29,19 @@
Turn external document into internal one by executing an external filter. Turn external document into internal one by executing an external filter.
The command to execute, and its parameters, come from the mimeconf file The command to execute, and its parameters, come from the mimeconf file
*/ */
class MimeHandlerExec : public MimeHandler { class MimeHandlerExec : public RecollFilter {
public: public:
std::list<std::string> params; std::list<std::string> params;
MimeHandlerExec(const string& mt) : RecollFilter(mt) {}
virtual ~MimeHandlerExec() {} virtual ~MimeHandlerExec() {}
virtual MimeHandler::Status virtual bool set_document_file(const string &file_path) {
mkDoc(RclConfig *conf, const std::string &fn, m_fn = file_path;
const std::string &mtype, Rcl::Doc &docout, std::string&); m_havedoc = true;
return true;
}
virtual bool next_document();
private:
string m_fn;
}; };
#endif /* _MH_EXEC_H_INCLUDED_ */ #endif /* _MH_EXEC_H_INCLUDED_ */

View File

@ -41,36 +41,31 @@ using namespace std;
#endif /* NO_NAMESPACES */ #endif /* NO_NAMESPACES */
MimeHandler::Status bool MimeHandlerHtml::set_document_file(const string &fn)
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&)
{ {
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str())); LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
string otext; string otext;
if (!file_to_string(fn, otext)) { if (!file_to_string(fn, otext)) {
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str())); LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
return MimeHandler::MHError; return false;
} }
return mkDoc(conf, fn, otext, mtype, docout); return set_document_string(otext);
} }
MimeHandler::Status bool MimeHandlerHtml::set_document_string(const string& htext)
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
const string& htext,
const string &mtype, Rcl::Doc &docout)
{ {
//LOGDEB(("textHtmlToDoc: htext: %s\n", htext.c_str())); m_html = htext;
// Character set handling: the initial guessed charset depends on m_havedoc = true;
// external factors: possible hint (ie mime charset in a mail return true;
// message), charset guessing, or default configured charset. }
string charset;
if (!charsethint.empty()) {
charset = charsethint;
} else if (conf->getGuessCharset()) {
charset = csguess(htext, conf->getDefCharset());
} else
charset = conf->getDefCharset();
bool MimeHandlerHtml::next_document()
{
if (m_havedoc == false)
return false;
m_havedoc = false;
LOGDEB(("textHtmlToDoc: next_document\n"));
string charset = m_defcharset;
// - We first try to convert from the default configured charset // - We first try to convert from the default configured charset
// (which may depend of the current directory) to utf-8. If this // (which may depend of the current directory) to utf-8. If this
@ -80,16 +75,16 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
// instead of the configuration one. // instead of the configuration one.
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str())); LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
MyHtmlParser result;
MyHtmlParser p(m_metaData["content"]);
for (int pass = 0; pass < 2; pass++) { for (int pass = 0; pass < 2; pass++) {
string transcoded; string transcoded;
LOGDEB(("Html::mkDoc: pass %d\n", pass)); LOGDEB(("Html::mkDoc: pass %d\n", pass));
MyHtmlParser p;
// Try transcoding. If it fails, use original text. // Try transcoding. If it fails, use original text.
if (!transcode(htext, transcoded, charset, "UTF-8")) { if (!transcode(m_html, transcoded, charset, "UTF-8")) {
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n", LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
charset.c_str())); charset.c_str()));
transcoded = htext; transcoded = m_html;
// We don't know the charset, at all // We don't know the charset, at all
p.ocharset = p.charset = charset = ""; p.ocharset = p.charset = charset = "";
} else { } else {
@ -102,31 +97,29 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
try { try {
p.parse_html(transcoded); p.parse_html(transcoded);
// No exception: ok? // No exception: ok?
result = p;
break; break;
} catch (bool diag) { } catch (bool diag) {
result = p;
if (diag == true) if (diag == true)
break; break;
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
charset.c_str(),result.doccharset.c_str())); charset.c_str(), p.doccharset.c_str()));
if (!result.doccharset.empty() && if (!p.doccharset.empty() &&
!samecharset(result.doccharset, result.ocharset)) { !samecharset(p.doccharset, p.ocharset)) {
LOGDEB(("textHtmlToDoc: reparse for charsets\n")); LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
charset = result.doccharset; charset = p.doccharset;
} else { } else {
LOGERR(("textHtmlToDoc:: error: non charset exception\n")); LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
return MimeHandler::MHError; return false;
} }
} }
} }
docout.origcharset = charset; m_metaData["origcharset"] = m_defcharset;
docout.text = result.dump; m_metaData["charset"] = "utf-8";
//LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str())); m_metaData["title"] = p.title;
docout.title = result.title; m_metaData["keywords"] = p.keywords;
docout.keywords = result.keywords; m_metaData["modificationdate"] = p.dmtime;
docout.abstract = result.sample; m_metaData["sample"] = p.sample;
docout.dmtime = result.dmtime; m_metaData["mimetype"] = "text/plain";
return MimeHandler::MHDone; return true;
} }

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _HTML_H_INCLUDED_ #ifndef _HTML_H_INCLUDED_
#define _HTML_H_INCLUDED_ #define _HTML_H_INCLUDED_
/* @(#$Id: mh_html.h,v 1.7 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_html.h,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
@ -24,26 +24,16 @@
/** /**
Translate html document to internal one. Translate html document to internal one.
There are 2 interfaces, depending if we're working on a file, or
on a string. The string form is applied to the output of external
handlers for foreign formats: they return a result in html, which
has the advantage to be text (easy to use in shell-scripts), and
semi-structured (can carry titles, abstracts, whatever)
*/ */
class MimeHandlerHtml : public MimeHandler { class MimeHandlerHtml : public RecollFilter {
public: public:
std::string charsethint; MimeHandlerHtml(const string& mt) : RecollFilter(mt) {}
virtual ~MimeHandlerHtml() {}
/** Create internal document from html file (standard interface) */ virtual bool set_document_file(const string &file_path);
virtual MimeHandler::Status virtual bool set_document_string(const string &data);
mkDoc(RclConfig *conf, const std::string &fn, virtual bool next_document();
const std::string &mtype, Rcl::Doc &docout, std::string&); private:
string m_html;
/** Create internal doc from html string (postfilter for external ones) */
virtual MimeHandler::Status
mkDoc(RclConfig *conf, const std::string &fn, const std::string& htext,
const std::string &mtype, Rcl::Doc &docout);
}; };
#endif /* _HTML_H_INCLUDED_ */ #endif /* _HTML_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -23,192 +23,81 @@ static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp
#include <errno.h> #include <errno.h>
#include <unistd.h> #include <unistd.h>
#include <time.h> #include <time.h>
#include <regex.h>
#include <map> #include <map>
#include <sstream> #include <sstream>
#include "mimehandler.h" #include "mimehandler.h"
#include "debuglog.h"
#include "csguess.h"
#include "readfile.h" #include "readfile.h"
#include "transcode.h" #include "transcode.h"
#include "mimeparse.h" #include "mimeparse.h"
#include "indextext.h"
#include "mh_mail.h" #include "mh_mail.h"
#include "debuglog.h" #include "debuglog.h"
#include "smallut.h" #include "smallut.h"
#include "mimeparse.h"
#include "mh_html.h" #include "mh_html.h"
// binc imap mime definitions // binc imap mime definitions
#include "mime.h" #include "mime.h"
#ifndef NO_NAMESPACES
using namespace std; using namespace std;
#endif /* NO_NAMESPACES */
static const int maxdepth = 20; static const int maxdepth = 20;
MimeHandlerMail::~MimeHandlerMail() MimeHandlerMail::~MimeHandlerMail()
{ {
if (m_vfp) { delete m_bincdoc;
fclose((FILE *)m_vfp); if (m_fd >= 0)
m_vfp = 0; close(m_fd);
} delete m_stream;
} }
bool MimeHandlerMail::set_document_file(const string &fn)
// We are called for two different file types: mbox-type folders
// holding multiple messages, and maildir-type files with one message
// ipath is non empty only when we are called for retrieving a single message
// for preview. It is always empty during indexing, and we fill it up with
// the message number for the returned doc
MimeHandler::Status
MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn,
const string &mtype, Rcl::Doc &docout, string& ipath)
{ {
LOGDEB2(("MimeHandlerMail::mkDoc: %s [%s]\n", mtype.c_str(), fn.c_str())); if (m_fd >= 0) {
m_conf = cnf; close(m_fd);
m_fd = -1;
if (!stringlowercmp("message/rfc822", mtype)) {
ipath = "";
int fd;
if ((fd = open(fn.c_str(), 0)) < 0) {
LOGERR(("MimeHandlerMail::mkDoc: open(%s) errno %d\n",
fn.c_str(), errno));
return MimeHandler::MHError;
}
Binc::MimeDocument doc;
doc.parseFull(fd);
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
fn.c_str()));
return MimeHandler::MHError;
}
MimeHandler::Status ret = processMsg(docout, doc, 0);
close(fd);
return ret;
} else if (!stringlowercmp("text/x-mail", mtype)) {
return processmbox(fn, docout, ipath);
} else // hu ho
return MimeHandler::MHError;
}
static const char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$";
static regex_t fromregex;
static bool regcompiled;
MimeHandler::Status
MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
{
int mtarg = 0;
if (ipath != "") {
sscanf(ipath.c_str(), "%d", &mtarg);
} }
LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(), m_fd = open(fn.c_str(), 0);
mtarg)); if (m_fd < 0) {
LOGERR(("MimeHandlerMail::set_document_file: open(%s) errno %d\n",
FILE *fp; fn.c_str(), errno));
// Open the file on first call, then save/reuse the file pointer return false;
if (!m_vfp) {
fp = fopen(fn.c_str(), "r");
if (fp == 0) {
LOGERR(("MimeHandlerMail::processmbox: error opening %s\n",
fn.c_str()));
return MimeHandler::MHError;
}
m_vfp = fp;
} else {
fp = (FILE *)m_vfp;
} }
if (!regcompiled) { delete m_bincdoc;
regcomp(&fromregex, frompat, REG_NOSUB); m_bincdoc = new Binc::MimeDocument;
regcompiled = true; m_bincdoc->parseFull(m_fd);
} if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) {
LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
// If we are called to retrieve a specific message, seek to bof
// (then scan up to the message). This is for the case where the
// same object is reused to fetch several messages (else the fp is
// just opened no need for a seek). We could also check if the
// current message number is lower than the requested one and
// avoid rereading the whole thing in this case. But I'm not sure
// we're ever used in this way (multiple retrieves on same
// object). So:
if (mtarg > 0) {
fseek(fp, 0, SEEK_SET);
m_msgnum = 0;
}
off_t start, end;
bool iseof = false;
bool hademptyline = true;
string msgtxt;
do {
// Look for next 'From ' Line, start of message. Set start to
// line after this
char line[501];
for (;;) {
if (!fgets(line, 500, fp)) {
// Eof hit while looking for 'From ' -> file done. We'd need
// another return code here
return MimeHandler::MHError;
}
if (line[0] == '\n' || line[0] == '\r') {
hademptyline = true;
continue;
}
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
start = ftello(fp);
m_msgnum++;
break;
}
hademptyline = false;
}
// Look for next 'From ' line or eof, end of message.
for (;;) {
end = ftello(fp);
if (!fgets(line, 500, fp)) {
if (ferror(fp) || feof(fp))
iseof = true;
break;
}
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
break;
}
if (mtarg <= 0 || m_msgnum == mtarg) {
msgtxt += line;
}
if (line[0] == '\n' || line[0] == '\r') {
hademptyline = true;
} else {
hademptyline = false;
}
}
fseek(fp, end, SEEK_SET);
} while (mtarg > 0 && m_msgnum < mtarg);
stringstream s(msgtxt);
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
Binc::MimeDocument doc;
doc.parseFull(s);
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n",
fn.c_str())); fn.c_str()));
return MimeHandler::MHError; return false;
} }
m_havedoc = true;
MimeHandler::Status ret = processMsg(docout, doc, 0); return true;
if (ret == MimeHandler::MHError)
return ret;
char buf[20];
sprintf(buf, "%d", m_msgnum);
ipath = buf;
return iseof ? MimeHandler::MHDone :
(mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain;
} }
// Take a complete message held in memory and parse it with the Binc MIME
// parser.
//
// The text is copied into a stringstream owned by this object (m_stream),
// which is kept as a member rather than a local. Any stream or parsed
// document left over from a previous call is released first.
//
// @param msgtxt full text of the message (headers and body)
// @return true if the parser got at least the headers, in which case
//         next_document() may be called; false on parse error.
bool MimeHandlerMail::set_document_string(const string &msgtxt)
{
    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));

    // Replace the previous input stream, if any
    delete m_stream;
    m_stream = new stringstream(msgtxt);

    // Replace the previous parsed document, if any, and parse the new text
    delete m_bincdoc;
    m_bincdoc = new Binc::MimeDocument;
    m_bincdoc->parseFull(*m_stream);

    // Parsed headers alone are enough for the message to be usable
    const bool usable =
        m_bincdoc->isHeaderParsed() || m_bincdoc->isAllParsed();
    if (usable) {
        m_havedoc = true;
        return true;
    }

    LOGERR(("MimeHandlerMail::set_document_string: mime parse error\n"));
    return false;
}
// Produce the (single) document for the message set by a previous
// set_document_file() / set_document_string() call.
//
// The m_havedoc flag is cleared before processing, so a second call without
// a new set_document_*() returns false.
//
// @return false if no message is pending, else the result of processMsg()
//         on the parsed message at depth 0.
bool MimeHandlerMail::next_document()
{
    if (m_havedoc) {
        // Consume the pending message
        m_havedoc = false;
        m_metaData["mimetype"] = "text/plain";
        return processMsg(m_bincdoc, 0);
    }
    return false;
}
// Transform a single message into a document. The subject becomes the // Transform a single message into a document. The subject becomes the
// title, and any simple body part with a content-type of text or html // title, and any simple body part with a content-type of text or html
@ -217,58 +106,59 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
// If depth is not zero, we're called recursively for an // If depth is not zero, we're called recursively for an
// message/rfc822 part and we must not touch the doc fields except the // message/rfc822 part and we must not touch the doc fields except the
// text // text
MimeHandler::Status bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
int depth)
{ {
LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth)); LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth));
if (depth++ >= maxdepth) { if (depth++ >= maxdepth) {
// Have to stop somewhere // Have to stop somewhere
LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n", LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n",
maxdepth)); maxdepth));
return MimeHandler::MHDone; // Return true anyway, better to index partially than not at all
return true;
} }
// Handle some headers. // Handle some headers.
string& text = m_metaData["content"];
Binc::HeaderItem hi; Binc::HeaderItem hi;
string transcoded; string transcoded;
if (doc.h.getFirstHeader("From", hi)) { if (doc->h.getFirstHeader("From", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), transcoded);
docout.text += string("From: ") + transcoded + string("\n"); text += string("From: ") + transcoded + string("\n");
} }
if (doc.h.getFirstHeader("To", hi)) { if (doc->h.getFirstHeader("To", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), transcoded);
docout.text += string("To: ") + transcoded + string("\n"); text += string("To: ") + transcoded + string("\n");
} }
if (doc.h.getFirstHeader("Date", hi)) { if (doc->h.getFirstHeader("Date", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), transcoded);
if (depth == 1) { if (depth == 1) {
time_t t = rfc2822DateToUxTime(transcoded); time_t t = rfc2822DateToUxTime(transcoded);
if (t != (time_t)-1) { if (t != (time_t)-1) {
char ascuxtime[100]; char ascuxtime[100];
sprintf(ascuxtime, "%ld", (long)t); sprintf(ascuxtime, "%ld", (long)t);
docout.dmtime = ascuxtime; m_metaData["modificationdate"] = ascuxtime;
} else { } else {
// Leave mtime field alone, ftime will be used instead. // Leave mtime field alone, ftime will be used instead.
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str())); LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
} }
} }
docout.text += string("Date: ") + transcoded + string("\n"); text += string("Date: ") + transcoded + string("\n");
} }
if (doc.h.getFirstHeader("Subject", hi)) { if (doc->h.getFirstHeader("Subject", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), transcoded);
if (depth == 1) if (depth == 1)
docout.title = transcoded; m_metaData["title"] = transcoded;
docout.text += string("Subject: ") + transcoded + string("\n"); text += string("Subject: ") + transcoded + string("\n");
} }
docout.text += '\n'; text += '\n';
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n", LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n",
doc.isMultipart(), doc.getSubType().c_str())); doc->isMultipart(), doc->getSubType().c_str()));
walkmime(docout, doc, depth); walkmime(doc, depth);
LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str())); LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n",
return MimeHandler::MHDone; m_metaData["content"].c_str()));
return true;
} }
// Recursively walk the message mime parts and concatenate all the // Recursively walk the message mime parts and concatenate all the
@ -281,8 +171,7 @@ MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
// //
// multipart can be mixed, alternative, parallel, digest. // multipart can be mixed, alternative, parallel, digest.
// message/rfc822 may also be of interest. // message/rfc822 may also be of interest.
void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
{ {
LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth)); LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth));
if (depth++ >= maxdepth) { if (depth++ >= maxdepth) {
@ -290,28 +179,29 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
return; return;
} }
string &out = docout.text; string& out = m_metaData["content"];
if (doc.isMultipart()) { if (doc->isMultipart()) {
LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
doc.isMultipart(), doc.getSubType().c_str())); doc->isMultipart(), doc->getSubType().c_str()));
// We only handle alternative, related and mixed (no digests). // We only handle alternative, related and mixed (no digests).
std::vector<Binc::MimePart>::iterator it; std::vector<Binc::MimePart>::iterator it;
if (!stringicmp("mixed", doc.getSubType()) || if (!stringicmp("mixed", doc->getSubType()) ||
!stringicmp("related", doc.getSubType())) { !stringicmp("related", doc->getSubType())) {
// Multipart mixed and related: process each part. // Multipart mixed and related: process each part.
for (it = doc.members.begin(); it != doc.members.end();it++) { for (it = doc->members.begin(); it != doc->members.end();it++) {
walkmime(docout, *it, depth); walkmime(&(*it), depth);
} }
} else if (!stringicmp("alternative", doc.getSubType())) { } else if (!stringicmp("alternative", doc->getSubType())) {
// Multipart/alternative: look for a text/plain part, then html. // Multipart/alternative: look for a text/plain part, then html.
// Process if found // Process if found
std::vector<Binc::MimePart>::iterator ittxt, ithtml; std::vector<Binc::MimePart>::iterator ittxt, ithtml;
ittxt = ithtml = doc.members.end(); ittxt = ithtml = doc->members.end();
int i = 1; int i = 1;
for (it = doc.members.begin(); it != doc.members.end();it++, i++) { for (it = doc->members.begin();
it != doc->members.end(); it++, i++) {
// Get and parse content-type header // Get and parse content-type header
Binc::HeaderItem hi; Binc::HeaderItem hi;
if (!it->h.getFirstHeader("Content-Type", hi)) { if (!it->h.getFirstHeader("Content-Type", hi)) {
@ -326,12 +216,12 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
else if (!stringlowercmp("text/html", content_type.value)) else if (!stringlowercmp("text/html", content_type.value))
ithtml = it; ithtml = it;
} }
if (ittxt != doc.members.end()) { if (ittxt != doc->members.end()) {
LOGDEB2(("walkmime: alternative: chose text/plain part\n")) LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
walkmime(docout, *ittxt, depth); walkmime(&(*ittxt), depth);
} else if (ithtml != doc.members.end()) { } else if (ithtml != doc->members.end()) {
LOGDEB2(("walkmime: alternative: chose text/html part\n")) LOGDEB2(("walkmime: alternative: chose text/html part\n"))
walkmime(docout, *ithtml, depth); walkmime(&(*ithtml), depth);
} }
} }
return; return;
@ -343,7 +233,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
// Get and parse content-type header. // Get and parse content-type header.
Binc::HeaderItem hi; Binc::HeaderItem hi;
string ctt = "text/plain"; string ctt = "text/plain";
if (doc.h.getFirstHeader("Content-Type", hi)) { if (doc->h.getFirstHeader("Content-Type", hi)) {
ctt = hi.getValue(); ctt = hi.getValue();
} }
LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str())); LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
@ -352,7 +242,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
// Get and parse Content-Disposition header // Get and parse Content-Disposition header
string ctd = "inline"; string ctd = "inline";
if (doc.h.getFirstHeader("Content-Disposition", hi)) { if (doc->h.getFirstHeader("Content-Disposition", hi)) {
ctd = hi.getValue(); ctd = hi.getValue();
} }
MimeHeaderValue content_disposition; MimeHeaderValue content_disposition;
@ -371,13 +261,13 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
if (it != content_disposition.params.end()) if (it != content_disposition.params.end())
filename = it->second; filename = it->second;
if (doc.isMessageRFC822()) { if (doc->isMessageRFC822()) {
LOGDEB2(("walkmime: message/RFC822 part\n")); LOGDEB2(("walkmime: message/RFC822 part\n"));
// The first part is the already parsed message. Call // The first part is the already parsed message. Call
// processMsg instead of walkmime so that mail headers get // processMsg instead of walkmime so that mail headers get
// printed. The depth will tell it what to do // printed. The depth will tell it what to do
if (doc.members.empty()) { if (doc->members.empty()) {
//?? //??
return; return;
} }
@ -388,7 +278,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
if (m_forPreview) if (m_forPreview)
out += "]"; out += "]";
out += "\n\n"; out += "\n\n";
processMsg(docout, doc.members[0], depth); processMsg(&doc->members[0], depth);
return; return;
} }
@ -437,14 +327,14 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
// Content transfer encoding // Content transfer encoding
string cte = "7bit"; string cte = "7bit";
if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) { if (doc->h.getFirstHeader("Content-Transfer-Encoding", hi)) {
cte = hi.getValue(); cte = hi.getValue();
} }
LOGDEB2(("walkmime: final: body start offset %d, length %d\n", LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
doc.getBodyStartOffset(), doc.getBodyLength())); doc->getBodyStartOffset(), doc->getBodyLength()));
string body; string body;
doc.getBody(body, 0, doc.bodylength); doc->getBody(body, 0, doc->bodylength);
// Decode according to content transfer encoding // Decode according to content transfer encoding
if (!stringlowercmp("quoted-printable", cte)) { if (!stringlowercmp("quoted-printable", cte)) {
@ -472,22 +362,30 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
// Handle html stripping and transcoding to utf8 // Handle html stripping and transcoding to utf8
string utf8; string utf8;
const string *putf8 = 0;
if (!stringlowercmp("text/html", content_type.value)) { if (!stringlowercmp("text/html", content_type.value)) {
MimeHandlerHtml mh; MimeHandlerHtml mh("text/html");
Rcl::Doc hdoc; mh.set_property(Dijon::Filter::OPERATING_MODE,
mh.charsethint = charset; m_forPreview ? "view" : "index");
mh.mkDoc(m_conf, "", body, content_type.value, hdoc); mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
utf8 = hdoc.text; mh.set_document_string(body);
mh.next_document();
map<string, string>::const_iterator it =
mh.get_meta_data().find("content");
if (it != mh.get_meta_data().end())
putf8 = &it->second;
} else { } else {
// Transcode to utf-8 // Transcode to utf-8
if (!transcode(body, utf8, charset, "UTF-8")) { if (!transcode(body, utf8, charset, "UTF-8")) {
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
charset.c_str())); charset.c_str()));
utf8 = body; putf8 = &body;
} else {
putf8 = &utf8;
} }
} }
if (putf8)
out += utf8; out += *putf8;
if (out.length() && out[out.length()-1] != '\n') if (out.length() && out[out.length()-1] != '\n')
out += '\n'; out += '\n';

View File

@ -16,8 +16,9 @@
*/ */
#ifndef _MAIL_H_INCLUDED_ #ifndef _MAIL_H_INCLUDED_
#define _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_
/* @(#$Id: mh_mail.h,v 1.8 2006-09-19 14:30:39 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <sstream>
#include "mimehandler.h" #include "mimehandler.h"
namespace Binc { namespace Binc {
@ -30,26 +31,21 @@ namespace Binc {
* for maildir files). This has to keep state while parsing a mail folder * for maildir files). This has to keep state while parsing a mail folder
* file. * file.
*/ */
class MimeHandlerMail : public MimeHandler { class MimeHandlerMail : public RecollFilter {
public: public:
MimeHandlerMail() : m_vfp(0), m_msgnum(0), m_conf(0) {} MimeHandlerMail(const string &mt)
: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0)
virtual MimeHandler::Status {}
mkDoc(RclConfig *conf, const std::string &fn,
const std::string &mtype, Rcl::Doc &docout, std::string& ipath);
virtual ~MimeHandlerMail(); virtual ~MimeHandlerMail();
virtual bool set_document_file(const string &file_path);
virtual bool set_document_string(const string &data);
virtual bool next_document();
private: private:
void *m_vfp; // File pointer for folder Binc::MimeDocument *m_bincdoc;
int m_msgnum; // Current message number in folder. Starts at 1 bool processMsg(Binc::MimePart *doc, int depth);
RclConfig *m_conf; // Keep pointer to rclconfig around void walkmime(Binc::MimePart* doc, int depth);
int m_fd;
MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout, std::stringstream *m_stream;
string &ipath);
MimeHandler::Status processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
int depth);
void walkmime(Rcl::Doc &docout, Binc::MimePart& doc, int depth);
}; };
#endif /* _MAIL_H_INCLUDED_ */ #endif /* _MAIL_H_INCLUDED_ */

166
src/internfile/mh_mbox.cpp Normal file
View File

@ -0,0 +1,166 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <time.h>
#include <regex.h>
#include <map>
#include <sstream>
#include "mimehandler.h"
#include "debuglog.h"
#include "readfile.h"
#include "mh_mbox.h"
#include "smallut.h"
using namespace std;
// Close the folder file if it is still open.
MimeHandlerMbox::~MimeHandlerMbox()
{
    // m_vfp is stored as a void* in the class; it holds the FILE* from fopen()
    FILE *fp = (FILE *)m_vfp;
    if (fp == 0)
        return;
    fclose(fp);
    m_vfp = 0;
}
bool MimeHandlerMbox::set_document_file(const string &fn)
{
LOGDEB(("MimeHandlerMbox::set_document_file(%s)\n", fn.c_str()));
m_fn = fn;
if (m_vfp) {
fclose((FILE *)m_vfp);
m_vfp = 0;
}
m_vfp = fopen(fn.c_str(), "r");
if (m_vfp == 0) {
LOGERR(("MimeHandlerMail::set_document_file: error opening %s\n",
fn.c_str()));
return false;
}
m_havedoc = true;
return true;
}
// Pattern for an mbox message separator line: starts with "From ", ends with
// a space, a 4-digit number beginning with 1 or 2 (the year), optional CRs
// and the newline kept by fgets().
static const char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$";
// Compiled form of frompat, built on first use by next_document()
static regex_t fromregex;
static bool regcompiled;

// Extract one message from the folder file opened by set_document_file().
//
// If an ipath was set through skip_to_document(), it is read as a message
// number: the file is rescanned from the beginning and only that message is
// returned. Otherwise each call returns the message following the one
// returned by the previous call.
//
// On success the raw message text is stored in m_metaData["content"], its
// number in m_metaData["ipath"], and m_metaData["mimetype"] is set to
// "message/rfc822".
//
// @return true if a non-empty message was extracted; false if the file is
//         not open, no more messages are available, a preview was requested
//         without an ipath, or the separator pattern could not be compiled.
bool MimeHandlerMbox::next_document()
{
    if (m_vfp == 0) {
        LOGERR(("MimeHandlerMbox::next_document: not open\n"));
        return false;
    }
    if (!m_havedoc) {
        return false;
    }
    FILE *fp = (FILE *)m_vfp;

    // Target message number: 0 means "next message in sequence"
    int mtarg = 0;
    if (m_ipath != "") {
        sscanf(m_ipath.c_str(), "%d", &mtarg);
    } else if (m_forPreview) {
        // Can't preview an mbox
        return false;
    }
    LOGDEB(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n",
            m_fn.c_str(), m_msgnum, mtarg));

    if (!regcompiled) {
        // Only mark the pattern as usable if compilation actually worked:
        // regexec() on a regex_t that failed to compile is undefined.
        if (regcomp(&fromregex, frompat, REG_NOSUB) != 0) {
            LOGERR(("MimeHandlerMbox::next_document: regcomp failed\n"));
            return false;
        }
        regcompiled = true;
    }

    // If we are called to retrieve a specific message, seek to bof
    // (then scan up to the message). This is for the case where the
    // same object is reused to fetch several messages (else the fp is
    // just opened no need for a seek). We could also check if the
    // current message number is lower than the requested one and
    // avoid rereading the whole thing in this case. But I'm not sure
    // we're ever used in this way (multiple retrieves on same
    // object). So:
    if (mtarg > 0) {
        fseeko(fp, 0, SEEK_SET);
        m_msgnum = 0;
    }

    off_t end;
    bool iseof = false;
    // The beginning of the file counts as following an empty line
    bool hademptyline = true;
    string& msgtxt = m_metaData["content"];
    msgtxt.erase();
    do {
        // Look for next 'From ' Line, start of message.
        char line[501];
        for (;;) {
            if (!fgets(line, 500, fp)) {
                // Eof hit while looking for 'From ' -> file done. We'd need
                // another return code here
                return false;
            }
            if (line[0] == '\n' || line[0] == '\r') {
                hademptyline = true;
                continue;
            }
            if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
                m_msgnum++;
                break;
            }
            hademptyline = false;
        }

        // Look for next 'From ' line or eof, end of message.
        for (;;) {
            // Remember where this line starts, so that we can come back to
            // it if it turns out to be the next message's separator
            end = ftello(fp);
            if (!fgets(line, 500, fp)) {
                if (ferror(fp) || feof(fp))
                    iseof = true;
                break;
            }
            if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
                break;
            }
            // When looking for a specific message, only keep its text
            if (mtarg <= 0 || m_msgnum == mtarg) {
                msgtxt += line;
            }
            if (line[0] == '\n' || line[0] == '\r') {
                hademptyline = true;
            } else {
                hademptyline = false;
            }
        }
        // Reposition on the separator line. fseeko() takes the off_t that
        // ftello() returned; fseek() would convert it to long.
        fseeko(fp, end, SEEK_SET);
    } while (mtarg > 0 && m_msgnum < mtarg);

    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
    char buf[20];
    sprintf(buf, "%d", m_msgnum);
    m_metaData["ipath"] = buf;
    m_metaData["mimetype"] = "message/rfc822";
    if (iseof)
        m_havedoc = false;
    return !msgtxt.empty();
}

51
src/internfile/mh_mbox.h Normal file
View File

@ -0,0 +1,51 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _MBOX_H_INCLUDED_
#define _MBOX_H_INCLUDED_
/* @(#$Id: mh_mbox.h,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
using std::string;
#include "mimehandler.h"
/**
* Split a mail folder (mbox) file into individual messages.
*
* Each successful next_document() call stores the raw text of one message
* in the "content" metadata entry, with "mimetype" set to message/rfc822
* and "ipath" set to the message number. The object keeps the open file
* and the current message number between calls.
*/
class MimeHandlerMbox : public RecollFilter {
public:
// The folder file is not opened here: see set_document_file()
MimeHandlerMbox(const string& mime)
: RecollFilter(mime), m_vfp(0), m_msgnum(0)
{}
// Closes the folder file if it is open
virtual ~MimeHandlerMbox();
// Open the folder file. Returns false if it can't be opened
virtual bool set_document_file(const string &file_path);
// Extract the next message, or the one selected by skip_to_document()
virtual bool next_document();
// Remember the ipath of the message to retrieve. Nothing is read at this
// point: next_document() parses the value as a message number.
virtual bool skip_to_document(const string& ipath) {
m_ipath = ipath;
return true;
}
private:
string m_fn; // File name
void *m_vfp; // File pointer for folder
int m_msgnum; // Current message number in folder. Starts at 1
// Message number requested through skip_to_document(), empty if none
string m_ipath;
};
#endif /* _MBOX_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.5 2006-03-20 15:14:08 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.6 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -31,34 +31,44 @@ using namespace std;
#include "transcode.h" #include "transcode.h"
// Process a plain text file // Process a plain text file
MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn, bool MimeHandlerText::set_document_file(const string &fn)
const string &mtype, Rcl::Doc &docout, string&)
{ {
string otext; string otext;
if (!file_to_string(fn, otext)) if (!file_to_string(fn, otext))
return MimeHandler::MHError; return false;
return set_document_string(otext);
// Try to guess charset, then convert to utf-8, and fill document }
// fields The charset guesser really doesnt work well in general
// and should be avoided (especially for short documents) bool MimeHandlerText::set_document_string(const string& otext)
string charset; {
if (conf->getGuessCharset()) { m_text = otext;
charset = csguess(otext, conf->getDefCharset()); m_havedoc = true;
} else return true;
charset = conf->getDefCharset(); }
bool MimeHandlerText::next_document()
{
if (m_havedoc == false)
return false;
m_havedoc = false;
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n", LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
charset.c_str())); m_defcharset.c_str()));
string utf8; // Avoid unneeded copy. This gets a reference to an empty string which is
if (!transcode(otext, utf8, charset, "UTF-8")) { // the entry for "content"
string& utf8 = m_metaData["content"];
// Note that we transcode always even if defcharset is already utf-8:
// this validates the encoding.
if (!transcode(m_text, utf8, m_defcharset, "UTF-8")) {
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed " LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
"for charset [%s]\n", charset.c_str())); "for charset [%s]\n", m_defcharset.c_str()));
otext.erase(); utf8.erase();
return MimeHandler::MHError; return false;
} }
docout.origcharset = charset; m_metaData["origcharset"] = m_defcharset;
docout.text = utf8; m_metaData["charset"] = "utf-8";
return MimeHandler::MHDone; m_metaData["mimetype"] = "text/plain";
return true;
} }

View File

@ -16,12 +16,11 @@
*/ */
#ifndef _MH_TEXT_H_INCLUDED_ #ifndef _MH_TEXT_H_INCLUDED_
#define _MH_TEXT_H_INCLUDED_ #define _MH_TEXT_H_INCLUDED_
/* @(#$Id: mh_text.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_text.h,v 1.3 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
using std::string;
#include "rclconfig.h"
#include "rcldb.h"
#include "mimehandler.h" #include "mimehandler.h"
/** /**
@ -29,12 +28,15 @@
* *
* Maybe try to guess charset, or use default, then transcode to utf8 * Maybe try to guess charset, or use default, then transcode to utf8
*/ */
class MimeHandlerText : public MimeHandler { class MimeHandlerText : public RecollFilter {
public: public:
MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn, MimeHandlerText(const string& mt) : RecollFilter(mt) {}
const std::string &mtype, Rcl::Doc &docout, virtual ~MimeHandlerText() {}
std::string&); virtual bool set_document_file(const string &file_path);
virtual bool set_document_string(const string&);
virtual bool next_document();
private:
string m_text;
}; };
#endif /* _MH_TEXT_H_INCLUDED_ */ #endif /* _MH_TEXT_H_INCLUDED_ */

View File

@ -16,24 +16,33 @@
*/ */
#ifndef _MH_UNKNOWN_H_INCLUDED_ #ifndef _MH_UNKNOWN_H_INCLUDED_
#define _MH_UNKNOWN_H_INCLUDED_ #define _MH_UNKNOWN_H_INCLUDED_
/* @(#$Id: mh_unknown.h,v 1.1 2006-03-28 09:36:53 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_unknown.h,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include "rclconfig.h"
#include "rcldb.h"
#include "mimehandler.h" #include "mimehandler.h"
/** /**
* Handler for files with no content handler: does nothing. * Handler for files with no content handler: does nothing.
* *
*/ */
class MimeHandlerUnknown : public MimeHandler { class MimeHandlerUnknown : public RecollFilter {
public: public:
MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn, MimeHandlerUnknown(const string& mt) : RecollFilter(mt) {}
const std::string &mtype, Rcl::Doc &docout, virtual ~MimeHandlerUnknown() {}
std::string&) { virtual bool set_document_string(const string&) {
return MimeHandler::MHDone; return m_havedoc = true;
}
virtual bool set_document_file(const string&) {
return m_havedoc = true;
}
virtual bool next_document() {
if (m_havedoc == false)
return false;
m_havedoc = false;
m_metaData["content"] = "";
m_metaData["mimetype"] = "text/plain";
return true;
} }
}; };

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.20 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -20,37 +20,40 @@ static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes
#include <iostream> #include <iostream>
#include <string> #include <string>
#ifndef NO_NAMESPACES
using namespace std; using namespace std;
#endif /* NO_NAMESPACES */
#include "mimehandler.h" #include "mimehandler.h"
#include "debuglog.h" #include "debuglog.h"
#include "rclconfig.h"
#include "smallut.h" #include "smallut.h"
#include "mh_exec.h"
#include "mh_html.h" #include "mh_html.h"
#include "mh_mail.h" #include "mh_mail.h"
#include "mh_mbox.h"
#include "mh_text.h" #include "mh_text.h"
#include "mh_exec.h"
#include "mh_unknown.h" #include "mh_unknown.h"
/** Create internal handler object appropriate for given mime type */ /** Create internal handler object appropriate for given mime type */
static MimeHandler *mhFactory(const string &mime) static Dijon::Filter *mhFactory(const string &mime)
{ {
if (!stringlowercmp("text/plain", mime)) if (!stringlowercmp("text/plain", mime))
return new MimeHandlerText; return new MimeHandlerText("text/plain");
else if (!stringlowercmp("text/html", mime)) else if (!stringlowercmp("text/html", mime))
return new MimeHandlerHtml; return new MimeHandlerHtml("text/html");
else if (!stringlowercmp("text/x-mail", mime)) else if (!stringlowercmp("text/x-mail", mime))
return new MimeHandlerMail; return new MimeHandlerMbox("text/x-mail");
else if (!stringlowercmp("message/rfc822", mime)) else if (!stringlowercmp("message/rfc822", mime))
return new MimeHandlerMail; return new MimeHandlerMail("message/rfc822");
return 0; else
return new MimeHandlerUnknown("application/octet-stream");
} }
/** /**
* Return handler object for given mime type: * Return handler object for given mime type:
*/ */
MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg) Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg)
{ {
// Get handler definition for mime type // Get handler definition for mime type
string hs; string hs;
@ -78,7 +81,7 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
mtype.c_str(), hs.c_str())); mtype.c_str(), hs.c_str()));
return 0; return 0;
} }
MimeHandlerExec *h = new MimeHandlerExec; MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str());
it++; it++;
h->params.push_back(cfg->findFilter(*it++)); h->params.push_back(cfg->findFilter(*it++));
h->params.insert(h->params.end(), it, toks.end()); h->params.insert(h->params.end(), it, toks.end());
@ -93,7 +96,8 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
bool indexunknown = false; bool indexunknown = false;
cfg->getConfParam("indexallfilenames", &indexunknown); cfg->getConfParam("indexallfilenames", &indexunknown);
if (indexunknown) { if (indexunknown) {
return new MimeHandlerUnknown; LOGDEB(("getMimeHandler: returning MimeHandlerUnknown\n"));
return new MimeHandlerUnknown("application/octet-stream");
} else { } else {
return 0; return 0;
} }

View File

@ -16,60 +16,74 @@
*/ */
#ifndef _MIMEHANDLER_H_INCLUDED_ #ifndef _MIMEHANDLER_H_INCLUDED_
#define _MIMEHANDLER_H_INCLUDED_ #define _MIMEHANDLER_H_INCLUDED_
/* @(#$Id: mimehandler.h,v 1.12 2006-03-29 13:08:08 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mimehandler.h,v 1.13 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
using std::string;
using std::list;
#include "rclconfig.h" #include <Filter.h>
#include "rcldb.h"
class RclConfig;
/** class RecollFilter : public Dijon::Filter {
* Document interner class. public:
*/ RecollFilter(const string& mtype)
class MimeHandler { : Dijon::Filter(mtype), m_forPreview(false), m_havedoc(false)
public: {}
MimeHandler() : m_forPreview(false) {} virtual ~RecollFilter() {}
virtual ~MimeHandler() {} virtual bool set_property(Properties p, const string &v) {
switch (p) {
case DEFAULT_CHARSET:
m_defcharset = v;
break;
case OPERATING_MODE:
if (!v.empty() && v[0] == 'v')
m_forPreview = true;
else
m_forPreview = false;
break;
}
return true;
}
/// Status from mkDoc method. // We don't use this for now
enum Status {MHError, MHDone, MHAgain}; virtual bool set_document_uri(const std::string &) {return false;}
/**
* Transform external data into internal utf8 document
*
* @param conf the global configuration
* @param filename File from which the data comes from
* @param mimetype its mime type (from the mimemap configuration file)
* @param outdoc The output document
* @param ipath the access path for the document inside the file.
* For mono-document file types, this will always be empty.
* It is used, for example for mbox files which may contain
* multiple emails. If this is not empty in input, then the
* caller is requesting a single document (ie: for display).
* If this is empty (during indexation), it will be filled-up
* by the function, and all the file's documents will be
* returned by successive calls.
* @return The return value indicates if there are more documents to be
* fetched from the same file.
*/
virtual MimeHandler::Status mkDoc(RclConfig * conf,
const std::string &filename,
const std::string &mimetype,
Rcl::Doc& outdoc,
string& ipath) = 0;
virtual void setForPreview(bool onoff) {m_forPreview = onoff;}; // Default implementations
virtual bool set_document_string(const std::string &) {return false;}
virtual bool set_document_data(const char *cp, unsigned int sz) {
return set_document_string(string(cp, sz));
}
protected: virtual bool has_documents() const {return m_havedoc;}
bool m_forPreview;
// Most doc types are single-doc
virtual bool skip_to_document(const string& s) {
if (s.empty())
return true;
return false;
}
virtual DataInput get_required_data_input() const
{return DOCUMENT_FILE_NAME;}
virtual string get_error() const {
return m_reason;
}
protected:
bool m_forPreview;
string m_defcharset;
string m_reason;
bool m_havedoc;
}; };
/** /**
* Return indexing handler object for the given mime type * Return indexing handler object for the given mime type
* returned pointer should be deleted by caller * returned pointer should be deleted by caller
*/ */
extern MimeHandler *getMimeHandler(const std::string &mtyp, RclConfig *cfg); extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg);
/// Can this mime type be interned ? /// Can this mime type be interned ?
extern bool canIntern(const std::string mimetype, RclConfig *cfg); extern bool canIntern(const std::string mimetype, RclConfig *cfg);

View File

@ -37,11 +37,13 @@ class MyHtmlParser : public HtmlParser {
bool in_body_tag; bool in_body_tag;
bool in_pre_tag; bool in_pre_tag;
bool pending_space; bool pending_space;
string title, sample, keywords, dump, dmtime; bool indexing_allowed;
string title, sample, keywords, dmtime;
string localdump;
string &dump;
string ocharset; // This is the charset our user thinks the doc was string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed;
void process_text(const string &text); void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p); void opening_tag(const string &tag, const map<string,string> &p);
void closing_tag(const string &tag); void closing_tag(const string &tag);
@ -52,5 +54,16 @@ class MyHtmlParser : public HtmlParser {
in_body_tag(false), in_body_tag(false),
in_pre_tag(false), in_pre_tag(false),
pending_space(false), pending_space(false),
indexing_allowed(true) { } indexing_allowed(true),
dump(localdump)
{ }
MyHtmlParser(string& buf) :
in_script_tag(false),
in_style_tag(false),
in_body_tag(false),
in_pre_tag(false),
pending_space(false),
indexing_allowed(true),
dump(buf)
{ }
}; };

View File

@ -8,8 +8,8 @@ LIBS = librcl.a
all: $(LIBS) all: $(LIBS)
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_text.o mimehandler.o myhtmlparse.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp
librcl.a : $(DEPS) $(OBJS) unac.o librcl.a : $(DEPS) $(OBJS) unac.o
ar ru librcl.a $(OBJS) unac.o ar ru librcl.a $(OBJS) unac.o
@ -35,6 +35,10 @@ mimetype.o : ../index/mimetype.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../index/mimetype.cpp $(CXX) $(ALL_CXXFLAGS) -c ../index/mimetype.cpp
htmlparse.o : ../internfile/htmlparse.cpp htmlparse.o : ../internfile/htmlparse.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/htmlparse.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/htmlparse.cpp
myhtmlparse.o : ../internfile/myhtmlparse.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp
mimehandler.o : ../internfile/mimehandler.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp
internfile.o : ../internfile/internfile.cpp internfile.o : ../internfile/internfile.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/internfile.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/internfile.cpp
mh_exec.o : ../internfile/mh_exec.cpp mh_exec.o : ../internfile/mh_exec.cpp
@ -43,12 +47,10 @@ mh_html.o : ../internfile/mh_html.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_html.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_html.cpp
mh_mail.o : ../internfile/mh_mail.cpp mh_mail.o : ../internfile/mh_mail.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mail.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mail.cpp
mh_mbox.o : ../internfile/mh_mbox.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mbox.cpp
mh_text.o : ../internfile/mh_text.cpp mh_text.o : ../internfile/mh_text.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp
mimehandler.o : ../internfile/mimehandler.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp
myhtmlparse.o : ../internfile/myhtmlparse.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp
docseq.o : ../query/docseq.cpp docseq.o : ../query/docseq.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp $(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp
history.o : ../query/history.cpp history.o : ../query/history.cpp
@ -124,6 +126,12 @@ mimetype.dep.stamp : ../index/mimetype.cpp
htmlparse.dep.stamp : ../internfile/htmlparse.cpp htmlparse.dep.stamp : ../internfile/htmlparse.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/htmlparse.cpp > htmlparse.dep $(CXX) -M $(ALL_CXXFLAGS) ../internfile/htmlparse.cpp > htmlparse.dep
touch htmlparse.dep.stamp touch htmlparse.dep.stamp
myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep
touch myhtmlparse.dep.stamp
mimehandler.dep.stamp : ../internfile/mimehandler.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep
touch mimehandler.dep.stamp
internfile.dep.stamp : ../internfile/internfile.cpp internfile.dep.stamp : ../internfile/internfile.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/internfile.cpp > internfile.dep $(CXX) -M $(ALL_CXXFLAGS) ../internfile/internfile.cpp > internfile.dep
touch internfile.dep.stamp touch internfile.dep.stamp
@ -136,15 +144,12 @@ mh_html.dep.stamp : ../internfile/mh_html.cpp
mh_mail.dep.stamp : ../internfile/mh_mail.cpp mh_mail.dep.stamp : ../internfile/mh_mail.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mail.cpp > mh_mail.dep $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mail.cpp > mh_mail.dep
touch mh_mail.dep.stamp touch mh_mail.dep.stamp
mh_mbox.dep.stamp : ../internfile/mh_mbox.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mbox.cpp > mh_mbox.dep
touch mh_mbox.dep.stamp
mh_text.dep.stamp : ../internfile/mh_text.cpp mh_text.dep.stamp : ../internfile/mh_text.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep
touch mh_text.dep.stamp touch mh_text.dep.stamp
mimehandler.dep.stamp : ../internfile/mimehandler.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep
touch mimehandler.dep.stamp
myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep
touch myhtmlparse.dep.stamp
docseq.dep.stamp : ../query/docseq.cpp docseq.dep.stamp : ../query/docseq.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep $(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep
touch docseq.dep.stamp touch docseq.dep.stamp
@ -217,13 +222,14 @@ include csguess.dep
include indexer.dep include indexer.dep
include mimetype.dep include mimetype.dep
include htmlparse.dep include htmlparse.dep
include myhtmlparse.dep
include mimehandler.dep
include internfile.dep include internfile.dep
include mh_exec.dep include mh_exec.dep
include mh_html.dep include mh_html.dep
include mh_mail.dep include mh_mail.dep
include mh_mbox.dep
include mh_text.dep include mh_text.dep
include mimehandler.dep
include myhtmlparse.dep
include docseq.dep include docseq.dep
include history.dep include history.dep
include sortseq.dep include sortseq.dep

View File

@ -13,13 +13,14 @@ ${depth}/index/csguess.cpp \
${depth}/index/indexer.cpp \ ${depth}/index/indexer.cpp \
${depth}/index/mimetype.cpp \ ${depth}/index/mimetype.cpp \
${depth}/internfile/htmlparse.cpp \ ${depth}/internfile/htmlparse.cpp \
${depth}/internfile/myhtmlparse.cpp \
${depth}/internfile/mimehandler.cpp \
${depth}/internfile/internfile.cpp \ ${depth}/internfile/internfile.cpp \
${depth}/internfile/mh_exec.cpp \ ${depth}/internfile/mh_exec.cpp \
${depth}/internfile/mh_html.cpp \ ${depth}/internfile/mh_html.cpp \
${depth}/internfile/mh_mail.cpp \ ${depth}/internfile/mh_mail.cpp \
${depth}/internfile/mh_mbox.cpp \
${depth}/internfile/mh_text.cpp \ ${depth}/internfile/mh_text.cpp \
${depth}/internfile/mimehandler.cpp \
${depth}/internfile/myhtmlparse.cpp \
${depth}/query/docseq.cpp \ ${depth}/query/docseq.cpp \
${depth}/query/history.cpp \ ${depth}/query/history.cpp \
${depth}/query/sortseq.cpp \ ${depth}/query/sortseq.cpp \

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.22 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: smallut.cpp,v 1.23 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -260,13 +260,14 @@ bool stringToStrings(const string &s, std::list<string> &tokens)
} }
void stringToTokens(const string& str, list<string>& tokens, void stringToTokens(const string& str, list<string>& tokens,
const string& delims) const string& delims, bool skipinit)
{ {
string::size_type startPos, pos; string::size_type startPos = 0, pos;
for (pos = 0;;) { for (pos = 0;;) {
// Skip initial delims, break if this eats all. // Skip initial delims, break if this eats all.
if ((startPos = str.find_first_not_of(delims, pos)) == string::npos) if (skipinit &&
(startPos = str.find_first_not_of(delims, pos)) == string::npos)
break; break;
// Find next delimiter or end of string (end of token) // Find next delimiter or end of string (end of token)
pos = str.find_first_of(delims, startPos); pos = str.find_first_of(delims, startPos);

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _SMALLUT_H_INCLUDED_ #ifndef _SMALLUT_H_INCLUDED_
#define _SMALLUT_H_INCLUDED_ #define _SMALLUT_H_INCLUDED_
/* @(#$Id: smallut.h,v 1.22 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: smallut.h,v 1.23 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
#include <map> #include <map>
@ -51,7 +51,7 @@ extern bool stringToStrings(const string &s, list<string> &tokens);
* Split input string. No handling of quoting * Split input string. No handling of quoting
*/ */
extern void stringToTokens(const string &s, list<string> &tokens, extern void stringToTokens(const string &s, list<string> &tokens,
const string &delims = " \t"); const string &delims = " \t", bool skipinit=true);
/** Convert string to boolean */ /** Convert string to boolean */
extern bool stringToBool(const string &s); extern bool stringToBool(const string &s);