Dijon filters 1st step: mostly working needs check and optim
This commit is contained in:
parent
1973c06346
commit
33c95ef1ba
164
src/internfile/Filter.h
Normal file
164
src/internfile/Filter.h
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
/*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Library General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _DIJON_FILTER_H
|
||||||
|
#define _DIJON_FILTER_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <set>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
namespace Dijon
{
    class Filter;

    /** Provides the MIME types supported by the filter(s).
      * The types are handed back through the caller-supplied set.
      * NOTE(review): the meaning of the bool result is not documented here;
      * presumably true on success.
      * This function is exported by dynamically loaded filter libraries.
      */
    typedef bool (get_filter_types_func)(std::set<std::string> &);
    /** Returns what data should be passed to the filter(s).
      * Output is cast from Filter::DataInput to int for convenience.
      * This function is exported by dynamically loaded filter libraries.
      * The aim is to let the client application know before-hand whether
      * it should load documents or not.
      */
    typedef int (get_filter_data_input_func)(void);
    /** Returns a Filter that handles the given MIME type.
      * The Filter object is allocated with new.
      * This function is exported by dynamically loaded filter libraries
      * and serves as a factory for Filter objects, so that the client
      * application doesn't have to know which Filter sub-types handle
      * which MIME types.
      */
    typedef Filter *(get_filter_func)(const std::string &);

    /// Filter interface.
    class Filter
    {
    public:
        /// Builds an empty filter. The base class does not store mime_type.
        Filter(const std::string &mime_type) {}
        /// Destroys the filter.
        virtual ~Filter() {}


        // Enumerations.

        /** What data a filter supports as input.
          * It can be either the whole document data, its file name, or its URI.
          */
        typedef enum { DOCUMENT_DATA=0, DOCUMENT_FILE_NAME, DOCUMENT_URI } DataInput;

        /** Input properties supported by the filter.
          * - DEFAULT_CHARSET is a charset supplied by the client application.
          *   NOTE(review): earlier text called this PREFERRED_CHARSET and
          *   described it as the charset the content is converted to; confirm
          *   whether it is an input default or an output preference.
          * - OPERATING_MODE can be set to either view or index.
          */
        typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE } Properties;


        // Information.

        /// Returns what data the filter requires as input.
        virtual DataInput get_required_data_input(void) const = 0;


        // Initialization.

        /** Sets a property, prior to calling set_document_XXX().
          * Returns false if the property is not supported.
          */
        virtual bool set_property(Properties prop_name, const std::string &prop_value) = 0;

        /** (Re)initializes the filter with the given data.
          * Caller should ensure the given pointer is valid until the
          * Filter object is destroyed, as some filters may not need to
          * do a deep copy of the data.
          * Returns false if this input is not supported or an error occurred.
          */
        virtual bool set_document_data(const char *data_ptr, unsigned int data_length) = 0;
        /** (Re)initializes the filter with the document data held in a string.
          * The type is spelled std::string so that this header does not
          * depend on a using-declaration made by whoever includes it.
          * NOTE(review): presumably follows the same return convention as
          * set_document_data(); confirm against the implementations.
          */
        virtual bool set_document_string(const std::string &) = 0;

        /** (Re)initializes the filter with the given file.
          * Returns false if this input is not supported or an error occurred.
          */
        virtual bool set_document_file(const std::string &file_path) = 0;

        /** (Re)initializes the filter with the given URI.
          * Returns false if this input is not supported or an error occurred.
          */
        virtual bool set_document_uri(const std::string &uri) = 0;


        // Going from one nested document to the next.

        /** Returns true if there are nested documents left to extract.
          * Returns false if the end of the parent document was reached
          * or an error occurred.
          */
        virtual bool has_documents(void) const = 0;

        /** Moves to the next nested document.
          * Returns false if there are none left.
          */
        virtual bool next_document(void) = 0;

        /** Skips to the nested document with the given ipath.
          * Returns false if no such document exists.
          */
        virtual bool skip_to_document(const std::string &ipath) = 0;


        // Accessing documents' contents.

        /// Returns the message for the most recent error that has occurred.
        virtual std::string get_error(void) const = 0;

        /** Returns a dictionary of metadata extracted from the current document.
          * Metadata fields may include one or more of the following :
          * content, title, ipath, mimetype, language, charset, author, creator,
          * publisher, modificationdate, creationdate, size
          * Special considerations apply :
          * - content may contain binary data, watch out !
          * - ipath is an internal path to the nested document that can be
          * later passed to skip_to_document(). It may be empty if the parent
          * document's type doesn't allow embedding, in which case the filter
          * should only return one document.
          * - mimetype should be text/plain if the document could be handled
          * internally, empty if unknown. If any other value, it is expected
          * that the client application can pass the nested document's content
          * to another filter that supports this particular type.
          */
        const std::map<std::string, std::string> &get_meta_data(void) const
        {
            return m_metaData;
        }

    protected:
        /// Metadata dictionary.
        std::map<std::string, std::string> m_metaData;

    private:
        /// Filter objects cannot be copied (declared, never defined).
        Filter(const Filter &other);
        /// Filter objects cannot be assigned (declared, never defined).
        Filter& operator=(const Filter& other);

    };
}
|
||||||
|
|
||||||
|
#endif // _DIJON_FILTER_H
|
||||||
@ -1,9 +1,9 @@
|
|||||||
# @(#$Id: Makefile,v 1.1 2006-11-15 07:27:42 dockes Exp $ (C) 2005 J.F.Dockes
|
# @(#$Id: Makefile,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes
|
||||||
depth = ..
|
depth = ..
|
||||||
include $(depth)/mk/sysconf
|
include $(depth)/mk/sysconf
|
||||||
|
|
||||||
# Only test executables get build in here
|
# Only test executables get build in here
|
||||||
PROGS = internfile unacpp textsplit rclconfig
|
PROGS = internfile
|
||||||
|
|
||||||
all: $(BIGLIB) $(PROGS)
|
all: $(BIGLIB) $(PROGS)
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.18 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -32,12 +32,14 @@ using namespace std;
|
|||||||
#endif /* NO_NAMESPACES */
|
#endif /* NO_NAMESPACES */
|
||||||
|
|
||||||
#include "internfile.h"
|
#include "internfile.h"
|
||||||
|
#include "rcldoc.h"
|
||||||
#include "mimetype.h"
|
#include "mimetype.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
#include "execmd.h"
|
#include "execmd.h"
|
||||||
#include "pathut.h"
|
#include "pathut.h"
|
||||||
#include "wipedir.h"
|
#include "wipedir.h"
|
||||||
|
#include "rclconfig.h"
|
||||||
|
|
||||||
// Execute the command to uncompress a file into a temporary one.
|
// Execute the command to uncompress a file into a temporary one.
|
||||||
static bool uncompressfile(RclConfig *conf, const string& ifn,
|
static bool uncompressfile(RclConfig *conf, const string& ifn,
|
||||||
@ -106,98 +108,262 @@ void FileInterner::tmpcleanup()
|
|||||||
// internfile
|
// internfile
|
||||||
// Set up the interner for one file: identify its mime type, uncompress it
// to a temporary file if needed, and install the first filter on the
// handler stack.
//
// @param f     path of the file to process
// @param cnf   configuration, used for mime identification, uncompressor
//              lookup and the default charset
// @param td    temporary directory handed to uncompressfile()
// @param imime optional mime type used when identification fails; a
//              non-null pointer also selects preview ("view") mode
//
// On any failure the constructor simply returns with an empty handler
// stack; internfile() reports the problem when it is called.
FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
                           const string& td, const string *imime)
    : m_cfg(cnf), m_fn(f), m_forPreview(imime?true:false), m_tdir(td)
{
    bool usfci = false;
    cnf->getConfParam("usesystemfilecommand", &usfci);
    LOGDEB1(("FileInterner::FileInterner: usfci now %d\n", usfci));

    // We need to run mime type identification in any case to check
    // for a compressed file.
    string l_mime = mimetype(m_fn, m_cfg, usfci);

    // If identification fails, try to use the input parameter. This
    // is then normally not a compressed type (it's the mime type from
    // the db), and is only set when previewing, not for indexing
    if (l_mime.empty() && imime)
        l_mime = *imime;

    if (!l_mime.empty()) {
        // Has mime: check for a compressed file. If so, create a
        // temporary uncompressed file, and rerun the mime type
        // identification, then do the rest with the temp file.
        list<string>ucmd;
        if (m_cfg->getUncompressor(l_mime, ucmd)) {
            if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) {
                return;
            }
            LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n",
                    m_tdir.c_str(), m_tfile.c_str()));
            m_fn = m_tfile;
            l_mime = mimetype(m_fn, m_cfg, usfci);
            if (l_mime.empty() && imime)
                l_mime = *imime;
        }
    }

    if (l_mime.empty()) {
        // No mime type. We let it through as config may warrant that
        // we index all file names
        LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str()));
    }

    // Look for appropriate handler (might still return empty)
    Dijon::Filter *df = getMimeHandler(l_mime, m_cfg);

    if (!df) {
        // No handler for this type, for now :( if indexallfilenames
        // is set in the config, this normally wont happen (we get mh_unknown)
        LOGDEB(("FileInterner:: no handler for %s\n", l_mime.c_str()));
        return;
    }
    df->set_property(Dijon::Filter::OPERATING_MODE,
                     m_forPreview ? "view" : "index");

    string charset = m_cfg->getDefCharset();
    df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
    if (!df->set_document_file(m_fn)) {
        LOGERR(("FileInterner:: error parsing %s\n", m_fn.c_str()));
        // The filter is owned by us until it is pushed on the stack:
        // release it here or it would leak.
        delete df;
        return;
    }
    // Room for the whole filter stack up front (same value as the
    // MAXHANDLERS limit enforced in internfile()).
    m_handlers.reserve(20);
    m_handlers.push_back(df);
    LOGDEB(("FileInterner::FileInterner: %s [%s]\n", l_mime.c_str(),
            m_fn.c_str()));
}
|
||||||
|
|
||||||
|
static const unsigned int MAXHANDLERS = 20;
|
||||||
|
|
||||||
// Extract the next (possibly nested) document from the filter stack.
//
// Filters are stacked: whenever the topmost filter yields a document
// whose mime type is not text/plain, a new filter for that type is fed
// the document's "content" and pushed on the stack. The walk stops at the
// first text/plain document, which is then copied into doc.
//
// @param doc   receives the document fields (see dijontorcl()); when
//              indexing (not previewing), doc.ipath also gets the
//              '|'-joined "ipath" values of all stacked filters.
// @param ipath if not empty, a '|'-separated internal path; each element
//              is passed to skip_to_document() of the filter at the
//              matching stack depth.
// @return FIError on failure, FIAgain if a filter on the stack still has
//         documents to deliver, FIDone otherwise.
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
{
    // An empty stack means the constructor could not set up a filter.
    // A stack deeper than one is legitimate here: it is exactly the state
    // left behind by a previous call which returned FIAgain.
    if (m_handlers.empty()) {
        LOGERR(("FileInterner::internfile: bad stack size %d !!\n",
                static_cast<int>(m_handlers.size())));
        return FIError;
    }

    // Note that the vector is big enough for the maximum stack. All values
    // over the last significant one are ""
    vector<string> vipath(MAXHANDLERS);
    if (!ipath.empty()) {
        list<string> lipath;
        stringToTokens(ipath, lipath, "|", true);
        vipath.insert(vipath.begin(), lipath.begin(), lipath.end());
        if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
            LOGERR(("FileInterner::internfile: can't skip\n"));
            return FIError;
        }
    }

    /* Try to get doc from the topmost filter */
    while (!m_handlers.empty()) {
        if (!m_handlers.back()->has_documents()) {
            // No docs at the current top level. Pop and see if there
            // is something at the previous one
            delete m_handlers.back();
            m_handlers.pop_back();
            continue;
        }

        if (!m_handlers.back()->next_document()) {
            LOGERR(("FileInterner::internfile: next_document failed\n"));
            return FIError;
        }

        // Look at what we've got
        const std::map<std::string, std::string> *docdata =
            &m_handlers.back()->get_meta_data();
        map<string,string>::const_iterator it;
        string charset;
        it = docdata->find("charset");
        if (it != docdata->end())
            charset = it->second;
        string mimetype;
        it = docdata->find("mimetype");
        if (it != docdata->end())
            mimetype = it->second;

        LOGDEB(("FileInterner::internfile:next_doc is %s\n",mimetype.c_str()));
        // If we find a text/plain doc, we're done
        if (mimetype == "text/plain")
            break;

        // Got a non text/plain doc. We need to stack another
        // filter. Check current size. The test must be >= : vipath is only
        // guaranteed MAXHANDLERS elements and is indexed with size()-1
        // right after the push below.
        if (m_handlers.size() >= MAXHANDLERS) {
            // Stack too big. Skip this and go on to check if there is
            // something else in the current back()
            LOGDEB(("FileInterner::internfile: stack too high\n"));
            continue;
        }

        Dijon::Filter *again = getMimeHandler(mimetype, m_cfg);
        if (!again) {
            // If we can't find a filter, this doc can't be handled
            // but there can be other ones so we go on
            LOGERR(("FileInterner::internfile: no filter for [%s]\n",
                    mimetype.c_str()));
            continue;
        }
        again->set_property(Dijon::Filter::OPERATING_MODE,
                            m_forPreview ? "view" : "index");
        again->set_property(Dijon::Filter::DEFAULT_CHARSET,
                            charset);
        string ns;
        const string *txt = &ns;
        it = docdata->find("content");
        if (it != docdata->end())
            txt = &it->second;
        if (!again->set_document_string(*txt)) {
            LOGERR(("FileInterner::internfile: error reparsing for %s\n",
                    m_fn.c_str()));
            delete again;
            continue;
        }
        // add filter and go on
        m_handlers.push_back(again);
        if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
            LOGERR(("FileInterner::internfile: can't skip\n"));
            return FIError;
        }
    }

    if (m_handlers.empty()) {
        LOGERR(("FileInterner::internfile: stack empty\n"));
        return FIError;
    }
    if (!m_forPreview) {
        // Build the document's internal path from the "ipath" of every
        // stacked filter. Named docipath so that it does not hide the
        // ipath parameter.
        string &docipath = doc.ipath;
        bool hasipath = false;
        for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
             hit != m_handlers.end(); ++hit) {
            const std::map<std::string, std::string>& meta =
                (*hit)->get_meta_data();
            map<string,string>::const_iterator iti = meta.find("ipath");
            if (iti != meta.end()) {
                if (!iti->second.empty())
                    hasipath = true;
                docipath += iti->second + "|";
            } else {
                docipath += "|";
            }
        }
        if (hasipath) {
            LOGDEB(("IPATH [%s]\n", docipath.c_str()));
            // Strip the trailing separators
            string::size_type sit = docipath.find_last_not_of("|");
            if (sit == string::npos)
                docipath.erase();
            else if (sit < docipath.length() -1)
                docipath.erase(sit+1);
        } else {
            docipath.erase();
        }
    }

    dijontorcl(m_handlers.back(), doc);

    // Destack what can be
    while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
        delete m_handlers.back();
        m_handlers.pop_back();
    }
    // The loop above only stops on an empty stack or on a filter which
    // still has documents.
    return m_handlers.empty() ? FIDone : FIAgain;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Copy the value stored under key into dest, leaving dest untouched when
// the key is absent. Returns true if the key was found.
static bool copyMetaField(const std::map<std::string, std::string>& meta,
                          const char *key, std::string& dest)
{
    std::map<std::string, std::string>::const_iterator pos = meta.find(key);
    if (pos == meta.end())
        return false;
    dest = pos->second;
    return true;
}

// Transfer the metadata of the filter's current document into the
// corresponding Rcl::Doc fields. Fields with no metadata entry are left
// as they were. Always returns true.
bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc)
{
    const std::map<std::string, std::string>& meta = df->get_meta_data();

    copyMetaField(meta, "mimetype", doc.mimetype);
    copyMetaField(meta, "origcharset", doc.origcharset);
    copyMetaField(meta, "content", doc.text);
    copyMetaField(meta, "title", doc.title);
    copyMetaField(meta, "keywords", doc.keywords);
    copyMetaField(meta, "modificationdate", doc.dmtime);

    // "sample" is only a fallback for a missing "abstract"
    if (!copyMetaField(meta, "abstract", doc.abstract))
        copyMetaField(meta, "sample", doc.abstract);

    return true;
}
|
||||||
|
|
||||||
// Release every filter still on the stack, topmost first, then remove
// the temporary files through tmpcleanup().
FileInterner::~FileInterner()
{
    for (vector<Dijon::Filter*>::reverse_iterator rit = m_handlers.rbegin();
         rit != m_handlers.rend(); ++rit) {
        delete *rit;
    }
    m_handlers.clear();
    tmpcleanup();
}
|
||||||
|
|
||||||
@ -212,6 +378,8 @@ using namespace std;
|
|||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "rclinit.h"
|
#include "rclinit.h"
|
||||||
#include "internfile.h"
|
#include "internfile.h"
|
||||||
|
#include "rclconfig.h"
|
||||||
|
#include "rcldoc.h"
|
||||||
|
|
||||||
static string thisprog;
|
static string thisprog;
|
||||||
|
|
||||||
|
|||||||
@ -16,14 +16,19 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _INTERNFILE_H_INCLUDED_
|
#ifndef _INTERNFILE_H_INCLUDED_
|
||||||
#define _INTERNFILE_H_INCLUDED_
|
#define _INTERNFILE_H_INCLUDED_
|
||||||
/* @(#$Id: internfile.h,v 1.6 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
#include "rclconfig.h"
|
#include "Filter.h"
|
||||||
#include "rcldb.h"
|
|
||||||
|
|
||||||
class MimeHandler;
|
class RclConfig;
|
||||||
|
namespace Rcl {
|
||||||
|
class Doc;
|
||||||
|
}
|
||||||
|
|
||||||
/// Turn external file into internal representation, according to mime
|
/// Turn external file into internal representation, according to mime
|
||||||
/// type etc
|
/// type etc
|
||||||
@ -43,8 +48,8 @@ class FileInterner {
|
|||||||
* mime type for the uncompressed version. This currently doubles up
|
* mime type for the uncompressed version. This currently doubles up
|
||||||
* to indicate that this object is for previewing (not indexing).
|
* to indicate that this object is for previewing (not indexing).
|
||||||
*/
|
*/
|
||||||
FileInterner(const std::string &fn, RclConfig *cnf, const string& td,
|
FileInterner(const string &fn, RclConfig *cnf, const string& td,
|
||||||
const std::string *mtype = 0);
|
const string *mtype = 0);
|
||||||
|
|
||||||
~FileInterner();
|
~FileInterner();
|
||||||
|
|
||||||
@ -67,15 +72,16 @@ class FileInterner {
|
|||||||
Status internfile(Rcl::Doc& doc, string &ipath);
|
Status internfile(Rcl::Doc& doc, string &ipath);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
string m_fn;
|
RclConfig *m_cfg;
|
||||||
RclConfig *m_cfg;
|
string m_fn;
|
||||||
const string &m_tdir;
|
bool m_forPreview;
|
||||||
MimeHandler *m_handler;
|
// m_tdir and m_tfile are used only for decompressing input file if needed
|
||||||
|
const string& m_tdir;
|
||||||
string m_tfile;
|
string m_tfile;
|
||||||
string m_mime;
|
vector<Dijon::Filter*> m_handlers;
|
||||||
|
|
||||||
void tmpcleanup();
|
void tmpcleanup();
|
||||||
|
static bool dijontorcl(Dijon::Filter *, Rcl::Doc&);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _INTERNFILE_H_INCLUDED_ */
|
#endif /* _INTERNFILE_H_INCLUDED_ */
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.7 2006-12-13 09:13:18 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -37,15 +37,15 @@ public:
|
|||||||
|
|
||||||
// Execute an external program to translate a file from its native format
|
// Execute an external program to translate a file from its native format
|
||||||
// to html. Then call the html parser to do the actual indexing
|
// to html. Then call the html parser to do the actual indexing
|
||||||
MimeHandler::Status
|
bool MimeHandlerExec::next_document()
|
||||||
MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
|
|
||||||
const string &mtype, Rcl::Doc &docout, string&)
|
|
||||||
{
|
{
|
||||||
|
if (m_havedoc == false)
|
||||||
|
return false;
|
||||||
|
m_havedoc = false;
|
||||||
if (params.empty()) {
|
if (params.empty()) {
|
||||||
// Hu ho
|
// Hu ho
|
||||||
LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n",
|
LOGERR(("MimeHandlerExec::mkDoc: empty params\n"));
|
||||||
mtype.c_str()));
|
return false;
|
||||||
return MimeHandler::MHError;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Command name
|
// Command name
|
||||||
@ -54,10 +54,10 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
|
|||||||
// Build parameter list: delete cmd name and add the file name
|
// Build parameter list: delete cmd name and add the file name
|
||||||
list<string>::iterator it = params.begin();
|
list<string>::iterator it = params.begin();
|
||||||
list<string>myparams(++it, params.end());
|
list<string>myparams(++it, params.end());
|
||||||
myparams.push_back(fn);
|
myparams.push_back(m_fn);
|
||||||
|
|
||||||
// Execute command and store the result text, which is supposedly html
|
// Execute command and store the result text, which is supposedly html
|
||||||
string html;
|
string& html = m_metaData["content"];
|
||||||
ExecCmd mexec;
|
ExecCmd mexec;
|
||||||
MEAdv adv;
|
MEAdv adv;
|
||||||
mexec.setAdvise(&adv);
|
mexec.setAdvise(&adv);
|
||||||
@ -67,10 +67,12 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
|
|||||||
if (status) {
|
if (status) {
|
||||||
LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
|
LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
|
||||||
status, cmd.c_str()));
|
status, cmd.c_str()));
|
||||||
return MimeHandler::MHError;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process/index the html
|
m_metaData["origcharset"] = m_defcharset;
|
||||||
MimeHandlerHtml hh;
|
// All recoll filters output utf-8
|
||||||
return hh.mkDoc(conf, fn, html, mtype, docout);
|
m_metaData["charset"] = "utf-8";
|
||||||
|
m_metaData["mimetype"] = "text/html";
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _MH_EXEC_H_INCLUDED_
|
#ifndef _MH_EXEC_H_INCLUDED_
|
||||||
#define _MH_EXEC_H_INCLUDED_
|
#define _MH_EXEC_H_INCLUDED_
|
||||||
/* @(#$Id: mh_exec.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: mh_exec.h,v 1.3 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -29,14 +29,19 @@
|
|||||||
Turn external document into internal one by executing an external filter.
|
Turn external document into internal one by executing an external filter.
|
||||||
The command to execute, and its parameters, come from the mimeconf file
|
The command to execute, and its parameters, come from the mimeconf file
|
||||||
*/
|
*/
|
||||||
class MimeHandlerExec : public MimeHandler {
|
class MimeHandlerExec : public RecollFilter {
|
||||||
public:
|
public:
|
||||||
std::list<std::string> params;
|
std::list<std::string> params;
|
||||||
|
MimeHandlerExec(const string& mt) : RecollFilter(mt) {}
|
||||||
virtual ~MimeHandlerExec() {}
|
virtual ~MimeHandlerExec() {}
|
||||||
virtual MimeHandler::Status
|
virtual bool set_document_file(const string &file_path) {
|
||||||
mkDoc(RclConfig *conf, const std::string &fn,
|
m_fn = file_path;
|
||||||
const std::string &mtype, Rcl::Doc &docout, std::string&);
|
m_havedoc = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
virtual bool next_document();
|
||||||
|
private:
|
||||||
|
string m_fn;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _MH_EXEC_H_INCLUDED_ */
|
#endif /* _MH_EXEC_H_INCLUDED_ */
|
||||||
|
|||||||
@ -41,36 +41,31 @@ using namespace std;
|
|||||||
#endif /* NO_NAMESPACES */
|
#endif /* NO_NAMESPACES */
|
||||||
|
|
||||||
|
|
||||||
MimeHandler::Status
|
bool MimeHandlerHtml::set_document_file(const string &fn)
|
||||||
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &fn,
|
|
||||||
const string &mtype, Rcl::Doc &docout, string&)
|
|
||||||
{
|
{
|
||||||
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
||||||
string otext;
|
string otext;
|
||||||
if (!file_to_string(fn, otext)) {
|
if (!file_to_string(fn, otext)) {
|
||||||
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
||||||
return MimeHandler::MHError;
|
return false;
|
||||||
}
|
}
|
||||||
return mkDoc(conf, fn, otext, mtype, docout);
|
return set_document_string(otext);
|
||||||
}
|
}
|
||||||
|
|
||||||
MimeHandler::Status
|
bool MimeHandlerHtml::set_document_string(const string& htext)
|
||||||
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
|
|
||||||
const string& htext,
|
|
||||||
const string &mtype, Rcl::Doc &docout)
|
|
||||||
{
|
{
|
||||||
//LOGDEB(("textHtmlToDoc: htext: %s\n", htext.c_str()));
|
m_html = htext;
|
||||||
// Character set handling: the initial guessed charset depends on
|
m_havedoc = true;
|
||||||
// external factors: possible hint (ie mime charset in a mail
|
return true;
|
||||||
// message), charset guessing, or default configured charset.
|
}
|
||||||
string charset;
|
|
||||||
if (!charsethint.empty()) {
|
|
||||||
charset = charsethint;
|
|
||||||
} else if (conf->getGuessCharset()) {
|
|
||||||
charset = csguess(htext, conf->getDefCharset());
|
|
||||||
} else
|
|
||||||
charset = conf->getDefCharset();
|
|
||||||
|
|
||||||
|
bool MimeHandlerHtml::next_document()
|
||||||
|
{
|
||||||
|
if (m_havedoc == false)
|
||||||
|
return false;
|
||||||
|
m_havedoc = false;
|
||||||
|
LOGDEB(("textHtmlToDoc: next_document\n"));
|
||||||
|
string charset = m_defcharset;
|
||||||
|
|
||||||
// - We first try to convert from the default configured charset
|
// - We first try to convert from the default configured charset
|
||||||
// (which may depend of the current directory) to utf-8. If this
|
// (which may depend of the current directory) to utf-8. If this
|
||||||
@ -80,16 +75,16 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
|
|||||||
// instead of the configuration one.
|
// instead of the configuration one.
|
||||||
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
||||||
|
|
||||||
MyHtmlParser result;
|
|
||||||
|
MyHtmlParser p(m_metaData["content"]);
|
||||||
for (int pass = 0; pass < 2; pass++) {
|
for (int pass = 0; pass < 2; pass++) {
|
||||||
string transcoded;
|
string transcoded;
|
||||||
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
||||||
MyHtmlParser p;
|
|
||||||
// Try transcoding. If it fails, use original text.
|
// Try transcoding. If it fails, use original text.
|
||||||
if (!transcode(htext, transcoded, charset, "UTF-8")) {
|
if (!transcode(m_html, transcoded, charset, "UTF-8")) {
|
||||||
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
||||||
charset.c_str()));
|
charset.c_str()));
|
||||||
transcoded = htext;
|
transcoded = m_html;
|
||||||
// We don't know the charset, at all
|
// We don't know the charset, at all
|
||||||
p.ocharset = p.charset = charset = "";
|
p.ocharset = p.charset = charset = "";
|
||||||
} else {
|
} else {
|
||||||
@ -102,31 +97,29 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
|
|||||||
try {
|
try {
|
||||||
p.parse_html(transcoded);
|
p.parse_html(transcoded);
|
||||||
// No exception: ok?
|
// No exception: ok?
|
||||||
result = p;
|
|
||||||
break;
|
break;
|
||||||
} catch (bool diag) {
|
} catch (bool diag) {
|
||||||
result = p;
|
|
||||||
if (diag == true)
|
if (diag == true)
|
||||||
break;
|
break;
|
||||||
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
||||||
charset.c_str(),result.doccharset.c_str()));
|
charset.c_str(), p.doccharset.c_str()));
|
||||||
if (!result.doccharset.empty() &&
|
if (!p.doccharset.empty() &&
|
||||||
!samecharset(result.doccharset, result.ocharset)) {
|
!samecharset(p.doccharset, p.ocharset)) {
|
||||||
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
||||||
charset = result.doccharset;
|
charset = p.doccharset;
|
||||||
} else {
|
} else {
|
||||||
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
||||||
return MimeHandler::MHError;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
docout.origcharset = charset;
|
m_metaData["origcharset"] = m_defcharset;
|
||||||
docout.text = result.dump;
|
m_metaData["charset"] = "utf-8";
|
||||||
//LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str()));
|
m_metaData["title"] = p.title;
|
||||||
docout.title = result.title;
|
m_metaData["keywords"] = p.keywords;
|
||||||
docout.keywords = result.keywords;
|
m_metaData["modificationdate"] = p.dmtime;
|
||||||
docout.abstract = result.sample;
|
m_metaData["sample"] = p.sample;
|
||||||
docout.dmtime = result.dmtime;
|
m_metaData["mimetype"] = "text/plain";
|
||||||
return MimeHandler::MHDone;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _HTML_H_INCLUDED_
|
#ifndef _HTML_H_INCLUDED_
|
||||||
#define _HTML_H_INCLUDED_
|
#define _HTML_H_INCLUDED_
|
||||||
/* @(#$Id: mh_html.h,v 1.7 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: mh_html.h,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -24,26 +24,16 @@
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
Translate html document to internal one.
|
Translate html document to internal one.
|
||||||
|
|
||||||
There are 2 interfaces, depending if we're working on a file, or
|
|
||||||
on a string. The string form is applied to the output of external
|
|
||||||
handlers for foreign formats: they return a result in html, which
|
|
||||||
has the advantage to be text (easy to use in shell-scripts), and
|
|
||||||
semi-structured (can carry titles, abstracts, whatever)
|
|
||||||
*/
|
*/
|
||||||
class MimeHandlerHtml : public MimeHandler {
|
class MimeHandlerHtml : public RecollFilter {
|
||||||
public:
|
public:
|
||||||
std::string charsethint;
|
MimeHandlerHtml(const string& mt) : RecollFilter(mt) {}
|
||||||
|
virtual ~MimeHandlerHtml() {}
|
||||||
/** Create internal document from html file (standard interface) */
|
virtual bool set_document_file(const string &file_path);
|
||||||
virtual MimeHandler::Status
|
virtual bool set_document_string(const string &data);
|
||||||
mkDoc(RclConfig *conf, const std::string &fn,
|
virtual bool next_document();
|
||||||
const std::string &mtype, Rcl::Doc &docout, std::string&);
|
private:
|
||||||
|
string m_html;
|
||||||
/** Create internal doc from html string (postfilter for external ones) */
|
|
||||||
virtual MimeHandler::Status
|
|
||||||
mkDoc(RclConfig *conf, const std::string &fn, const std::string& htext,
|
|
||||||
const std::string &mtype, Rcl::Doc &docout);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _HTML_H_INCLUDED_ */
|
#endif /* _HTML_H_INCLUDED_ */
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -23,192 +23,81 @@ static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <regex.h>
|
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
#include "debuglog.h"
|
|
||||||
#include "csguess.h"
|
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "transcode.h"
|
#include "transcode.h"
|
||||||
#include "mimeparse.h"
|
#include "mimeparse.h"
|
||||||
#include "indextext.h"
|
|
||||||
#include "mh_mail.h"
|
#include "mh_mail.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "mimeparse.h"
|
|
||||||
#include "mh_html.h"
|
#include "mh_html.h"
|
||||||
|
|
||||||
// binc imap mime definitions
|
// binc imap mime definitions
|
||||||
#include "mime.h"
|
#include "mime.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
#endif /* NO_NAMESPACES */
|
|
||||||
|
|
||||||
static const int maxdepth = 20;
|
static const int maxdepth = 20;
|
||||||
|
|
||||||
MimeHandlerMail::~MimeHandlerMail()
|
MimeHandlerMail::~MimeHandlerMail()
|
||||||
{
|
{
|
||||||
if (m_vfp) {
|
delete m_bincdoc;
|
||||||
fclose((FILE *)m_vfp);
|
if (m_fd >= 0)
|
||||||
m_vfp = 0;
|
close(m_fd);
|
||||||
}
|
delete m_stream;
|
||||||
}
|
}
|
||||||
|
bool MimeHandlerMail::set_document_file(const string &fn)
|
||||||
// We are called for two different file types: mbox-type folders
|
|
||||||
// holding multiple messages, and maildir-type files with one message
|
|
||||||
// ipath is non empty only when we are called for retrieving a single message
|
|
||||||
// for preview. It is always empty during indexing, and we fill it up with
|
|
||||||
// the message number for the returned doc
|
|
||||||
MimeHandler::Status
|
|
||||||
MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn,
|
|
||||||
const string &mtype, Rcl::Doc &docout, string& ipath)
|
|
||||||
{
|
{
|
||||||
LOGDEB2(("MimeHandlerMail::mkDoc: %s [%s]\n", mtype.c_str(), fn.c_str()));
|
if (m_fd >= 0) {
|
||||||
m_conf = cnf;
|
close(m_fd);
|
||||||
|
m_fd = -1;
|
||||||
if (!stringlowercmp("message/rfc822", mtype)) {
|
|
||||||
ipath = "";
|
|
||||||
int fd;
|
|
||||||
if ((fd = open(fn.c_str(), 0)) < 0) {
|
|
||||||
LOGERR(("MimeHandlerMail::mkDoc: open(%s) errno %d\n",
|
|
||||||
fn.c_str(), errno));
|
|
||||||
return MimeHandler::MHError;
|
|
||||||
}
|
|
||||||
Binc::MimeDocument doc;
|
|
||||||
doc.parseFull(fd);
|
|
||||||
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
|
|
||||||
LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
|
|
||||||
fn.c_str()));
|
|
||||||
return MimeHandler::MHError;
|
|
||||||
}
|
|
||||||
MimeHandler::Status ret = processMsg(docout, doc, 0);
|
|
||||||
close(fd);
|
|
||||||
return ret;
|
|
||||||
} else if (!stringlowercmp("text/x-mail", mtype)) {
|
|
||||||
return processmbox(fn, docout, ipath);
|
|
||||||
} else // hu ho
|
|
||||||
return MimeHandler::MHError;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$";
|
|
||||||
static regex_t fromregex;
|
|
||||||
static bool regcompiled;
|
|
||||||
|
|
||||||
MimeHandler::Status
|
|
||||||
MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
|
|
||||||
{
|
|
||||||
int mtarg = 0;
|
|
||||||
if (ipath != "") {
|
|
||||||
sscanf(ipath.c_str(), "%d", &mtarg);
|
|
||||||
}
|
}
|
||||||
LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
|
m_fd = open(fn.c_str(), 0);
|
||||||
mtarg));
|
if (m_fd < 0) {
|
||||||
|
LOGERR(("MimeHandlerMail::set_document_file: open(%s) errno %d\n",
|
||||||
FILE *fp;
|
fn.c_str(), errno));
|
||||||
// Open the file on first call, then save/reuse the file pointer
|
return false;
|
||||||
if (!m_vfp) {
|
|
||||||
fp = fopen(fn.c_str(), "r");
|
|
||||||
if (fp == 0) {
|
|
||||||
LOGERR(("MimeHandlerMail::processmbox: error opening %s\n",
|
|
||||||
fn.c_str()));
|
|
||||||
return MimeHandler::MHError;
|
|
||||||
}
|
|
||||||
m_vfp = fp;
|
|
||||||
} else {
|
|
||||||
fp = (FILE *)m_vfp;
|
|
||||||
}
|
}
|
||||||
if (!regcompiled) {
|
delete m_bincdoc;
|
||||||
regcomp(&fromregex, frompat, REG_NOSUB);
|
m_bincdoc = new Binc::MimeDocument;
|
||||||
regcompiled = true;
|
m_bincdoc->parseFull(m_fd);
|
||||||
}
|
if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) {
|
||||||
|
LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
|
||||||
// If we are called to retrieve a specific message, seek to bof
|
|
||||||
// (then scan up to the message). This is for the case where the
|
|
||||||
// same object is reused to fetch several messages (else the fp is
|
|
||||||
// just opened no need for a seek). We could also check if the
|
|
||||||
// current message number is lower than the requested one and
|
|
||||||
// avoid rereading the whole thing in this case. But I'm not sure
|
|
||||||
// we're ever used in this way (multiple retrieves on same
|
|
||||||
// object). So:
|
|
||||||
if (mtarg > 0) {
|
|
||||||
fseek(fp, 0, SEEK_SET);
|
|
||||||
m_msgnum = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
off_t start, end;
|
|
||||||
bool iseof = false;
|
|
||||||
bool hademptyline = true;
|
|
||||||
string msgtxt;
|
|
||||||
do {
|
|
||||||
// Look for next 'From ' Line, start of message. Set start to
|
|
||||||
// line after this
|
|
||||||
char line[501];
|
|
||||||
for (;;) {
|
|
||||||
if (!fgets(line, 500, fp)) {
|
|
||||||
// Eof hit while looking for 'From ' -> file done. We'd need
|
|
||||||
// another return code here
|
|
||||||
return MimeHandler::MHError;
|
|
||||||
}
|
|
||||||
if (line[0] == '\n' || line[0] == '\r') {
|
|
||||||
hademptyline = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
|
|
||||||
start = ftello(fp);
|
|
||||||
m_msgnum++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
hademptyline = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look for next 'From ' line or eof, end of message.
|
|
||||||
for (;;) {
|
|
||||||
end = ftello(fp);
|
|
||||||
if (!fgets(line, 500, fp)) {
|
|
||||||
if (ferror(fp) || feof(fp))
|
|
||||||
iseof = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (mtarg <= 0 || m_msgnum == mtarg) {
|
|
||||||
msgtxt += line;
|
|
||||||
}
|
|
||||||
if (line[0] == '\n' || line[0] == '\r') {
|
|
||||||
hademptyline = true;
|
|
||||||
} else {
|
|
||||||
hademptyline = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fseek(fp, end, SEEK_SET);
|
|
||||||
} while (mtarg > 0 && m_msgnum < mtarg);
|
|
||||||
|
|
||||||
stringstream s(msgtxt);
|
|
||||||
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
|
||||||
Binc::MimeDocument doc;
|
|
||||||
doc.parseFull(s);
|
|
||||||
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
|
|
||||||
LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n",
|
|
||||||
fn.c_str()));
|
fn.c_str()));
|
||||||
return MimeHandler::MHError;
|
return false;
|
||||||
}
|
}
|
||||||
|
m_havedoc = true;
|
||||||
MimeHandler::Status ret = processMsg(docout, doc, 0);
|
return true;
|
||||||
|
|
||||||
if (ret == MimeHandler::MHError)
|
|
||||||
return ret;
|
|
||||||
char buf[20];
|
|
||||||
sprintf(buf, "%d", m_msgnum);
|
|
||||||
ipath = buf;
|
|
||||||
return iseof ? MimeHandler::MHDone :
|
|
||||||
(mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool MimeHandlerMail::set_document_string(const string &msgtxt)
|
||||||
|
{
|
||||||
|
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
||||||
|
delete m_stream;
|
||||||
|
m_stream = new stringstream(msgtxt);
|
||||||
|
delete m_bincdoc;
|
||||||
|
m_bincdoc = new Binc::MimeDocument;
|
||||||
|
m_bincdoc->parseFull(*m_stream);
|
||||||
|
if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) {
|
||||||
|
LOGERR(("MimeHandlerMail::set_document_string: mime parse error\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
m_havedoc = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool MimeHandlerMail::next_document()
|
||||||
|
{
|
||||||
|
if (!m_havedoc)
|
||||||
|
return false;
|
||||||
|
m_havedoc = false;
|
||||||
|
m_metaData["mimetype"] = "text/plain";
|
||||||
|
return processMsg(m_bincdoc, 0);
|
||||||
|
}
|
||||||
|
|
||||||
// Transform a single message into a document. The subject becomes the
|
// Transform a single message into a document. The subject becomes the
|
||||||
// title, and any simple body part with a content-type of text or html
|
// title, and any simple body part with a content-type of text or html
|
||||||
@ -217,58 +106,59 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
|
|||||||
// If depth is not zero, we're called recursively for an
|
// If depth is not zero, we're called recursively for an
|
||||||
// message/rfc822 part and we must not touch the doc fields except the
|
// message/rfc822 part and we must not touch the doc fields except the
|
||||||
// text
|
// text
|
||||||
MimeHandler::Status
|
bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
||||||
MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
|
|
||||||
int depth)
|
|
||||||
{
|
{
|
||||||
LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth));
|
LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth));
|
||||||
if (depth++ >= maxdepth) {
|
if (depth++ >= maxdepth) {
|
||||||
// Have to stop somewhere
|
// Have to stop somewhere
|
||||||
LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n",
|
LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n",
|
||||||
maxdepth));
|
maxdepth));
|
||||||
return MimeHandler::MHDone;
|
// Return true anyway, better to index partially than not at all
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle some headers.
|
// Handle some headers.
|
||||||
|
string& text = m_metaData["content"];
|
||||||
Binc::HeaderItem hi;
|
Binc::HeaderItem hi;
|
||||||
string transcoded;
|
string transcoded;
|
||||||
if (doc.h.getFirstHeader("From", hi)) {
|
if (doc->h.getFirstHeader("From", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), transcoded);
|
||||||
docout.text += string("From: ") + transcoded + string("\n");
|
text += string("From: ") + transcoded + string("\n");
|
||||||
}
|
}
|
||||||
if (doc.h.getFirstHeader("To", hi)) {
|
if (doc->h.getFirstHeader("To", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), transcoded);
|
||||||
docout.text += string("To: ") + transcoded + string("\n");
|
text += string("To: ") + transcoded + string("\n");
|
||||||
}
|
}
|
||||||
if (doc.h.getFirstHeader("Date", hi)) {
|
if (doc->h.getFirstHeader("Date", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), transcoded);
|
||||||
if (depth == 1) {
|
if (depth == 1) {
|
||||||
time_t t = rfc2822DateToUxTime(transcoded);
|
time_t t = rfc2822DateToUxTime(transcoded);
|
||||||
if (t != (time_t)-1) {
|
if (t != (time_t)-1) {
|
||||||
char ascuxtime[100];
|
char ascuxtime[100];
|
||||||
sprintf(ascuxtime, "%ld", (long)t);
|
sprintf(ascuxtime, "%ld", (long)t);
|
||||||
docout.dmtime = ascuxtime;
|
m_metaData["modificationdate"] = ascuxtime;
|
||||||
} else {
|
} else {
|
||||||
// Leave mtime field alone, ftime will be used instead.
|
// Leave mtime field alone, ftime will be used instead.
|
||||||
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
|
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
docout.text += string("Date: ") + transcoded + string("\n");
|
text += string("Date: ") + transcoded + string("\n");
|
||||||
}
|
}
|
||||||
if (doc.h.getFirstHeader("Subject", hi)) {
|
if (doc->h.getFirstHeader("Subject", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), transcoded);
|
||||||
if (depth == 1)
|
if (depth == 1)
|
||||||
docout.title = transcoded;
|
m_metaData["title"] = transcoded;
|
||||||
docout.text += string("Subject: ") + transcoded + string("\n");
|
text += string("Subject: ") + transcoded + string("\n");
|
||||||
}
|
}
|
||||||
docout.text += '\n';
|
text += '\n';
|
||||||
|
|
||||||
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
|
LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n",
|
||||||
doc.isMultipart(), doc.getSubType().c_str()));
|
doc->isMultipart(), doc->getSubType().c_str()));
|
||||||
walkmime(docout, doc, depth);
|
walkmime(doc, depth);
|
||||||
|
|
||||||
LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str()));
|
LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n",
|
||||||
return MimeHandler::MHDone;
|
m_metaData["content"].c_str()));
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Recursively walk the message mime parts and concatenate all the
|
// Recursively walk the message mime parts and concatenate all the
|
||||||
@ -281,8 +171,7 @@ MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
|
|||||||
//
|
//
|
||||||
// multipart can be mixed, alternative, parallel, digest.
|
// multipart can be mixed, alternative, parallel, digest.
|
||||||
// message/rfc822 may also be of interest.
|
// message/rfc822 may also be of interest.
|
||||||
|
void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||||
void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|
||||||
{
|
{
|
||||||
LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth));
|
LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth));
|
||||||
if (depth++ >= maxdepth) {
|
if (depth++ >= maxdepth) {
|
||||||
@ -290,28 +179,29 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
string &out = docout.text;
|
string& out = m_metaData["content"];
|
||||||
|
|
||||||
if (doc.isMultipart()) {
|
if (doc->isMultipart()) {
|
||||||
LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
|
LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
|
||||||
doc.isMultipart(), doc.getSubType().c_str()));
|
doc->isMultipart(), doc->getSubType().c_str()));
|
||||||
// We only handle alternative, related and mixed (no digests).
|
// We only handle alternative, related and mixed (no digests).
|
||||||
std::vector<Binc::MimePart>::iterator it;
|
std::vector<Binc::MimePart>::iterator it;
|
||||||
|
|
||||||
if (!stringicmp("mixed", doc.getSubType()) ||
|
if (!stringicmp("mixed", doc->getSubType()) ||
|
||||||
!stringicmp("related", doc.getSubType())) {
|
!stringicmp("related", doc->getSubType())) {
|
||||||
// Multipart mixed and related: process each part.
|
// Multipart mixed and related: process each part.
|
||||||
for (it = doc.members.begin(); it != doc.members.end();it++) {
|
for (it = doc->members.begin(); it != doc->members.end();it++) {
|
||||||
walkmime(docout, *it, depth);
|
walkmime(&(*it), depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (!stringicmp("alternative", doc.getSubType())) {
|
} else if (!stringicmp("alternative", doc->getSubType())) {
|
||||||
// Multipart/alternative: look for a text/plain part, then html.
|
// Multipart/alternative: look for a text/plain part, then html.
|
||||||
// Process if found
|
// Process if found
|
||||||
std::vector<Binc::MimePart>::iterator ittxt, ithtml;
|
std::vector<Binc::MimePart>::iterator ittxt, ithtml;
|
||||||
ittxt = ithtml = doc.members.end();
|
ittxt = ithtml = doc->members.end();
|
||||||
int i = 1;
|
int i = 1;
|
||||||
for (it = doc.members.begin(); it != doc.members.end();it++, i++) {
|
for (it = doc->members.begin();
|
||||||
|
it != doc->members.end(); it++, i++) {
|
||||||
// Get and parse content-type header
|
// Get and parse content-type header
|
||||||
Binc::HeaderItem hi;
|
Binc::HeaderItem hi;
|
||||||
if (!it->h.getFirstHeader("Content-Type", hi)) {
|
if (!it->h.getFirstHeader("Content-Type", hi)) {
|
||||||
@ -326,12 +216,12 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
else if (!stringlowercmp("text/html", content_type.value))
|
else if (!stringlowercmp("text/html", content_type.value))
|
||||||
ithtml = it;
|
ithtml = it;
|
||||||
}
|
}
|
||||||
if (ittxt != doc.members.end()) {
|
if (ittxt != doc->members.end()) {
|
||||||
LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
|
LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
|
||||||
walkmime(docout, *ittxt, depth);
|
walkmime(&(*ittxt), depth);
|
||||||
} else if (ithtml != doc.members.end()) {
|
} else if (ithtml != doc->members.end()) {
|
||||||
LOGDEB2(("walkmime: alternative: chose text/html part\n"))
|
LOGDEB2(("walkmime: alternative: chose text/html part\n"))
|
||||||
walkmime(docout, *ithtml, depth);
|
walkmime(&(*ithtml), depth);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
@ -343,7 +233,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
// Get and parse content-type header.
|
// Get and parse content-type header.
|
||||||
Binc::HeaderItem hi;
|
Binc::HeaderItem hi;
|
||||||
string ctt = "text/plain";
|
string ctt = "text/plain";
|
||||||
if (doc.h.getFirstHeader("Content-Type", hi)) {
|
if (doc->h.getFirstHeader("Content-Type", hi)) {
|
||||||
ctt = hi.getValue();
|
ctt = hi.getValue();
|
||||||
}
|
}
|
||||||
LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
|
LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
|
||||||
@ -352,7 +242,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
|
|
||||||
// Get and parse Content-Disposition header
|
// Get and parse Content-Disposition header
|
||||||
string ctd = "inline";
|
string ctd = "inline";
|
||||||
if (doc.h.getFirstHeader("Content-Disposition", hi)) {
|
if (doc->h.getFirstHeader("Content-Disposition", hi)) {
|
||||||
ctd = hi.getValue();
|
ctd = hi.getValue();
|
||||||
}
|
}
|
||||||
MimeHeaderValue content_disposition;
|
MimeHeaderValue content_disposition;
|
||||||
@ -371,13 +261,13 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
if (it != content_disposition.params.end())
|
if (it != content_disposition.params.end())
|
||||||
filename = it->second;
|
filename = it->second;
|
||||||
|
|
||||||
if (doc.isMessageRFC822()) {
|
if (doc->isMessageRFC822()) {
|
||||||
LOGDEB2(("walkmime: message/RFC822 part\n"));
|
LOGDEB2(("walkmime: message/RFC822 part\n"));
|
||||||
|
|
||||||
// The first part is the already parsed message. Call
|
// The first part is the already parsed message. Call
|
||||||
// processMsg instead of walkmime so that mail headers get
|
// processMsg instead of walkmime so that mail headers get
|
||||||
// printed. The depth will tell it what to do
|
// printed. The depth will tell it what to do
|
||||||
if (doc.members.empty()) {
|
if (doc->members.empty()) {
|
||||||
//??
|
//??
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -388,7 +278,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
if (m_forPreview)
|
if (m_forPreview)
|
||||||
out += "]";
|
out += "]";
|
||||||
out += "\n\n";
|
out += "\n\n";
|
||||||
processMsg(docout, doc.members[0], depth);
|
processMsg(&doc->members[0], depth);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -437,14 +327,14 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
|
|
||||||
// Content transfer encoding
|
// Content transfer encoding
|
||||||
string cte = "7bit";
|
string cte = "7bit";
|
||||||
if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
|
if (doc->h.getFirstHeader("Content-Transfer-Encoding", hi)) {
|
||||||
cte = hi.getValue();
|
cte = hi.getValue();
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
|
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
|
||||||
doc.getBodyStartOffset(), doc.getBodyLength()));
|
doc->getBodyStartOffset(), doc->getBodyLength()));
|
||||||
string body;
|
string body;
|
||||||
doc.getBody(body, 0, doc.bodylength);
|
doc->getBody(body, 0, doc->bodylength);
|
||||||
|
|
||||||
// Decode according to content transfer encoding
|
// Decode according to content transfer encoding
|
||||||
if (!stringlowercmp("quoted-printable", cte)) {
|
if (!stringlowercmp("quoted-printable", cte)) {
|
||||||
@ -472,22 +362,30 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|||||||
|
|
||||||
// Handle html stripping and transcoding to utf8
|
// Handle html stripping and transcoding to utf8
|
||||||
string utf8;
|
string utf8;
|
||||||
|
const string *putf8 = 0;
|
||||||
if (!stringlowercmp("text/html", content_type.value)) {
|
if (!stringlowercmp("text/html", content_type.value)) {
|
||||||
MimeHandlerHtml mh;
|
MimeHandlerHtml mh("text/html");
|
||||||
Rcl::Doc hdoc;
|
mh.set_property(Dijon::Filter::OPERATING_MODE,
|
||||||
mh.charsethint = charset;
|
m_forPreview ? "view" : "index");
|
||||||
mh.mkDoc(m_conf, "", body, content_type.value, hdoc);
|
mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||||
utf8 = hdoc.text;
|
mh.set_document_string(body);
|
||||||
|
mh.next_document();
|
||||||
|
map<string, string>::const_iterator it =
|
||||||
|
mh.get_meta_data().find("content");
|
||||||
|
if (it != mh.get_meta_data().end())
|
||||||
|
putf8 = &it->second;
|
||||||
} else {
|
} else {
|
||||||
// Transcode to utf-8
|
// Transcode to utf-8
|
||||||
if (!transcode(body, utf8, charset, "UTF-8")) {
|
if (!transcode(body, utf8, charset, "UTF-8")) {
|
||||||
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
|
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
|
||||||
charset.c_str()));
|
charset.c_str()));
|
||||||
utf8 = body;
|
putf8 = &body;
|
||||||
|
} else {
|
||||||
|
putf8 = &utf8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (putf8)
|
||||||
out += utf8;
|
out += *putf8;
|
||||||
if (out.length() && out[out.length()-1] != '\n')
|
if (out.length() && out[out.length()-1] != '\n')
|
||||||
out += '\n';
|
out += '\n';
|
||||||
|
|
||||||
|
|||||||
@ -16,8 +16,9 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _MAIL_H_INCLUDED_
|
#ifndef _MAIL_H_INCLUDED_
|
||||||
#define _MAIL_H_INCLUDED_
|
#define _MAIL_H_INCLUDED_
|
||||||
/* @(#$Id: mh_mail.h,v 1.8 2006-09-19 14:30:39 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
|
|
||||||
namespace Binc {
|
namespace Binc {
|
||||||
@ -30,26 +31,21 @@ namespace Binc {
|
|||||||
* for maildir files). This has to keep state while parsing a mail folder
|
* for maildir files). This has to keep state while parsing a mail folder
|
||||||
* file.
|
* file.
|
||||||
*/
|
*/
|
||||||
class MimeHandlerMail : public MimeHandler {
|
class MimeHandlerMail : public RecollFilter {
|
||||||
public:
|
public:
|
||||||
MimeHandlerMail() : m_vfp(0), m_msgnum(0), m_conf(0) {}
|
MimeHandlerMail(const string &mt)
|
||||||
|
: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0)
|
||||||
virtual MimeHandler::Status
|
{}
|
||||||
mkDoc(RclConfig *conf, const std::string &fn,
|
|
||||||
const std::string &mtype, Rcl::Doc &docout, std::string& ipath);
|
|
||||||
|
|
||||||
virtual ~MimeHandlerMail();
|
virtual ~MimeHandlerMail();
|
||||||
|
virtual bool set_document_file(const string &file_path);
|
||||||
|
virtual bool set_document_string(const string &data);
|
||||||
|
virtual bool next_document();
|
||||||
private:
|
private:
|
||||||
void *m_vfp; // File pointer for folder
|
Binc::MimeDocument *m_bincdoc;
|
||||||
int m_msgnum; // Current message number in folder. Starts at 1
|
bool processMsg(Binc::MimePart *doc, int depth);
|
||||||
RclConfig *m_conf; // Keep pointer to rclconfig around
|
void walkmime(Binc::MimePart* doc, int depth);
|
||||||
|
int m_fd;
|
||||||
MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout,
|
std::stringstream *m_stream;
|
||||||
string &ipath);
|
|
||||||
MimeHandler::Status processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
|
|
||||||
int depth);
|
|
||||||
void walkmime(Rcl::Doc &docout, Binc::MimePart& doc, int depth);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _MAIL_H_INCLUDED_ */
|
#endif /* _MAIL_H_INCLUDED_ */
|
||||||
|
|||||||
166
src/internfile/mh_mbox.cpp
Normal file
166
src/internfile/mh_mbox.cpp
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
#ifndef lint
|
||||||
|
static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
|
#endif
|
||||||
|
/*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the
|
||||||
|
* Free Software Foundation, Inc.,
|
||||||
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <regex.h>
|
||||||
|
|
||||||
|
#include <map>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
#include "mimehandler.h"
|
||||||
|
#include "debuglog.h"
|
||||||
|
#include "readfile.h"
|
||||||
|
#include "mh_mbox.h"
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Destructor: close the folder's stdio stream if one is still attached.
MimeHandlerMbox::~MimeHandlerMbox()
{
    // Nothing to release when no file was ever opened (or it was
    // already closed).
    if (m_vfp == 0)
        return;

    // m_vfp is stored as void* in the header; it always holds a FILE*.
    FILE *folder = static_cast<FILE *>(m_vfp);
    fclose(folder);
    m_vfp = 0;
}
|
||||||
|
|
||||||
|
bool MimeHandlerMbox::set_document_file(const string &fn)
|
||||||
|
{
|
||||||
|
LOGDEB(("MimeHandlerMbox::set_document_file(%s)\n", fn.c_str()));
|
||||||
|
m_fn = fn;
|
||||||
|
if (m_vfp) {
|
||||||
|
fclose((FILE *)m_vfp);
|
||||||
|
m_vfp = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
m_vfp = fopen(fn.c_str(), "r");
|
||||||
|
if (m_vfp == 0) {
|
||||||
|
LOGERR(("MimeHandlerMail::set_document_file: error opening %s\n",
|
||||||
|
fn.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
m_havedoc = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$";
|
||||||
|
static regex_t fromregex;
|
||||||
|
static bool regcompiled;
|
||||||
|
|
||||||
|
bool MimeHandlerMbox::next_document()
|
||||||
|
{
|
||||||
|
if (m_vfp == 0) {
|
||||||
|
LOGERR(("MimeHandlerMbox::next_document: not open\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!m_havedoc) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
FILE *fp = (FILE *)m_vfp;
|
||||||
|
int mtarg = 0;
|
||||||
|
if (m_ipath != "") {
|
||||||
|
sscanf(m_ipath.c_str(), "%d", &mtarg);
|
||||||
|
} else if (m_forPreview) {
|
||||||
|
// Can't preview an mbox
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
LOGDEB(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n",
|
||||||
|
m_fn.c_str(), m_msgnum, mtarg));
|
||||||
|
|
||||||
|
if (!regcompiled) {
|
||||||
|
regcomp(&fromregex, frompat, REG_NOSUB);
|
||||||
|
regcompiled = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we are called to retrieve a specific message, seek to bof
|
||||||
|
// (then scan up to the message). This is for the case where the
|
||||||
|
// same object is reused to fetch several messages (else the fp is
|
||||||
|
// just opened no need for a seek). We could also check if the
|
||||||
|
// current message number is lower than the requested one and
|
||||||
|
// avoid rereading the whole thing in this case. But I'm not sure
|
||||||
|
// we're ever used in this way (multiple retrieves on same
|
||||||
|
// object). So:
|
||||||
|
if (mtarg > 0) {
|
||||||
|
fseek(fp, 0, SEEK_SET);
|
||||||
|
m_msgnum = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
off_t start, end;
|
||||||
|
bool iseof = false;
|
||||||
|
bool hademptyline = true;
|
||||||
|
string& msgtxt = m_metaData["content"];
|
||||||
|
msgtxt.erase();
|
||||||
|
do {
|
||||||
|
// Look for next 'From ' Line, start of message. Set start to
|
||||||
|
// line after this
|
||||||
|
char line[501];
|
||||||
|
for (;;) {
|
||||||
|
if (!fgets(line, 500, fp)) {
|
||||||
|
// Eof hit while looking for 'From ' -> file done. We'd need
|
||||||
|
// another return code here
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (line[0] == '\n' || line[0] == '\r') {
|
||||||
|
hademptyline = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
|
||||||
|
start = ftello(fp);
|
||||||
|
m_msgnum++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
hademptyline = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for next 'From ' line or eof, end of message.
|
||||||
|
for (;;) {
|
||||||
|
end = ftello(fp);
|
||||||
|
if (!fgets(line, 500, fp)) {
|
||||||
|
if (ferror(fp) || feof(fp))
|
||||||
|
iseof = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (mtarg <= 0 || m_msgnum == mtarg) {
|
||||||
|
msgtxt += line;
|
||||||
|
}
|
||||||
|
if (line[0] == '\n' || line[0] == '\r') {
|
||||||
|
hademptyline = true;
|
||||||
|
} else {
|
||||||
|
hademptyline = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fseek(fp, end, SEEK_SET);
|
||||||
|
} while (mtarg > 0 && m_msgnum < mtarg);
|
||||||
|
|
||||||
|
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
||||||
|
char buf[20];
|
||||||
|
sprintf(buf, "%d", m_msgnum);
|
||||||
|
m_metaData["ipath"] = buf;
|
||||||
|
m_metaData["mimetype"] = "message/rfc822";
|
||||||
|
if (iseof)
|
||||||
|
m_havedoc = false;
|
||||||
|
return msgtxt.empty() ? false : true;
|
||||||
|
}
|
||||||
51
src/internfile/mh_mbox.h
Normal file
51
src/internfile/mh_mbox.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the
|
||||||
|
* Free Software Foundation, Inc.,
|
||||||
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
*/
|
||||||
|
#ifndef _MBOX_H_INCLUDED_
|
||||||
|
#define _MBOX_H_INCLUDED_
|
||||||
|
/* @(#$Id: mh_mbox.h,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
#include "mimehandler.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Translate a mail folder file into internal documents (also works
|
||||||
|
* for maildir files). This has to keep state while parsing a mail folder
|
||||||
|
* file.
|
||||||
|
*/
|
||||||
|
class MimeHandlerMbox : public RecollFilter {
|
||||||
|
public:
|
||||||
|
MimeHandlerMbox(const string& mime)
|
||||||
|
: RecollFilter(mime), m_vfp(0), m_msgnum(0)
|
||||||
|
{}
|
||||||
|
virtual ~MimeHandlerMbox();
|
||||||
|
virtual bool set_document_file(const string &file_path);
|
||||||
|
virtual bool next_document();
|
||||||
|
virtual bool skip_to_document(const string& ipath) {
|
||||||
|
m_ipath = ipath;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
string m_fn; // File name
|
||||||
|
void *m_vfp; // File pointer for folder
|
||||||
|
int m_msgnum; // Current message number in folder. Starts at 1
|
||||||
|
string m_ipath;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* _MBOX_H_INCLUDED_ */
|
||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.5 2006-03-20 15:14:08 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.6 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -31,34 +31,44 @@ using namespace std;
|
|||||||
#include "transcode.h"
|
#include "transcode.h"
|
||||||
|
|
||||||
// Process a plain text file
|
// Process a plain text file
|
||||||
MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn,
|
bool MimeHandlerText::set_document_file(const string &fn)
|
||||||
const string &mtype, Rcl::Doc &docout, string&)
|
|
||||||
{
|
{
|
||||||
string otext;
|
string otext;
|
||||||
if (!file_to_string(fn, otext))
|
if (!file_to_string(fn, otext))
|
||||||
return MimeHandler::MHError;
|
return false;
|
||||||
|
return set_document_string(otext);
|
||||||
// Try to guess charset, then convert to utf-8, and fill document
|
}
|
||||||
// fields The charset guesser really doesnt work well in general
|
|
||||||
// and should be avoided (especially for short documents)
|
bool MimeHandlerText::set_document_string(const string& otext)
|
||||||
string charset;
|
{
|
||||||
if (conf->getGuessCharset()) {
|
m_text = otext;
|
||||||
charset = csguess(otext, conf->getDefCharset());
|
m_havedoc = true;
|
||||||
} else
|
return true;
|
||||||
charset = conf->getDefCharset();
|
}
|
||||||
|
|
||||||
|
bool MimeHandlerText::next_document()
|
||||||
|
{
|
||||||
|
if (m_havedoc == false)
|
||||||
|
return false;
|
||||||
|
m_havedoc = false;
|
||||||
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
|
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
|
||||||
charset.c_str()));
|
m_defcharset.c_str()));
|
||||||
|
|
||||||
string utf8;
|
// Avoid unneeded copy. This gets a reference to an empty string which is
|
||||||
if (!transcode(otext, utf8, charset, "UTF-8")) {
|
// the entry for "content"
|
||||||
|
string& utf8 = m_metaData["content"];
|
||||||
|
|
||||||
|
// Note that we transcode always even if defcharset is already utf-8:
|
||||||
|
// this validates the encoding.
|
||||||
|
if (!transcode(m_text, utf8, m_defcharset, "UTF-8")) {
|
||||||
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
|
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
|
||||||
"for charset [%s]\n", charset.c_str()));
|
"for charset [%s]\n", m_defcharset.c_str()));
|
||||||
otext.erase();
|
utf8.erase();
|
||||||
return MimeHandler::MHError;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
docout.origcharset = charset;
|
m_metaData["origcharset"] = m_defcharset;
|
||||||
docout.text = utf8;
|
m_metaData["charset"] = "utf-8";
|
||||||
return MimeHandler::MHDone;
|
m_metaData["mimetype"] = "text/plain";
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,12 +16,11 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _MH_TEXT_H_INCLUDED_
|
#ifndef _MH_TEXT_H_INCLUDED_
|
||||||
#define _MH_TEXT_H_INCLUDED_
|
#define _MH_TEXT_H_INCLUDED_
|
||||||
/* @(#$Id: mh_text.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: mh_text.h,v 1.3 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
using std::string;
|
||||||
|
|
||||||
#include "rclconfig.h"
|
|
||||||
#include "rcldb.h"
|
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -29,12 +28,15 @@
|
|||||||
*
|
*
|
||||||
* Maybe try to guess charset, or use default, then transcode to utf8
|
* Maybe try to guess charset, or use default, then transcode to utf8
|
||||||
*/
|
*/
|
||||||
class MimeHandlerText : public MimeHandler {
|
class MimeHandlerText : public RecollFilter {
|
||||||
public:
|
public:
|
||||||
MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn,
|
MimeHandlerText(const string& mt) : RecollFilter(mt) {}
|
||||||
const std::string &mtype, Rcl::Doc &docout,
|
virtual ~MimeHandlerText() {}
|
||||||
std::string&);
|
virtual bool set_document_file(const string &file_path);
|
||||||
|
virtual bool set_document_string(const string&);
|
||||||
|
virtual bool next_document();
|
||||||
|
private:
|
||||||
|
string m_text;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _MH_TEXT_H_INCLUDED_ */
|
#endif /* _MH_TEXT_H_INCLUDED_ */
|
||||||
|
|||||||
@ -16,24 +16,33 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _MH_UNKNOWN_H_INCLUDED_
|
#ifndef _MH_UNKNOWN_H_INCLUDED_
|
||||||
#define _MH_UNKNOWN_H_INCLUDED_
|
#define _MH_UNKNOWN_H_INCLUDED_
|
||||||
/* @(#$Id: mh_unknown.h,v 1.1 2006-03-28 09:36:53 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: mh_unknown.h,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "rclconfig.h"
|
|
||||||
#include "rcldb.h"
|
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handler for files with no content handler: does nothing.
|
* Handler for files with no content handler: does nothing.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
class MimeHandlerUnknown : public MimeHandler {
|
class MimeHandlerUnknown : public RecollFilter {
|
||||||
public:
|
public:
|
||||||
MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn,
|
MimeHandlerUnknown(const string& mt) : RecollFilter(mt) {}
|
||||||
const std::string &mtype, Rcl::Doc &docout,
|
virtual ~MimeHandlerUnknown() {}
|
||||||
std::string&) {
|
virtual bool set_document_string(const string&) {
|
||||||
return MimeHandler::MHDone;
|
return m_havedoc = true;
|
||||||
|
}
|
||||||
|
virtual bool set_document_file(const string&) {
|
||||||
|
return m_havedoc = true;
|
||||||
|
}
|
||||||
|
virtual bool next_document() {
|
||||||
|
if (m_havedoc == false)
|
||||||
|
return false;
|
||||||
|
m_havedoc = false;
|
||||||
|
m_metaData["content"] = "";
|
||||||
|
m_metaData["mimetype"] = "text/plain";
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.20 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -20,37 +20,40 @@ static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes
|
|||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
#endif /* NO_NAMESPACES */
|
|
||||||
|
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
#include "rclconfig.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
|
|
||||||
|
#include "mh_exec.h"
|
||||||
#include "mh_html.h"
|
#include "mh_html.h"
|
||||||
#include "mh_mail.h"
|
#include "mh_mail.h"
|
||||||
|
#include "mh_mbox.h"
|
||||||
#include "mh_text.h"
|
#include "mh_text.h"
|
||||||
#include "mh_exec.h"
|
|
||||||
#include "mh_unknown.h"
|
#include "mh_unknown.h"
|
||||||
|
|
||||||
/** Create internal handler object appropriate for given mime type */
|
/** Create internal handler object appropriate for given mime type */
|
||||||
static MimeHandler *mhFactory(const string &mime)
|
static Dijon::Filter *mhFactory(const string &mime)
|
||||||
{
|
{
|
||||||
if (!stringlowercmp("text/plain", mime))
|
if (!stringlowercmp("text/plain", mime))
|
||||||
return new MimeHandlerText;
|
return new MimeHandlerText("text/plain");
|
||||||
else if (!stringlowercmp("text/html", mime))
|
else if (!stringlowercmp("text/html", mime))
|
||||||
return new MimeHandlerHtml;
|
return new MimeHandlerHtml("text/html");
|
||||||
else if (!stringlowercmp("text/x-mail", mime))
|
else if (!stringlowercmp("text/x-mail", mime))
|
||||||
return new MimeHandlerMail;
|
return new MimeHandlerMbox("text/x-mail");
|
||||||
else if (!stringlowercmp("message/rfc822", mime))
|
else if (!stringlowercmp("message/rfc822", mime))
|
||||||
return new MimeHandlerMail;
|
return new MimeHandlerMail("message/rfc822");
|
||||||
return 0;
|
else
|
||||||
|
return new MimeHandlerUnknown("application/octet-stream");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return handler object for given mime type:
|
* Return handler object for given mime type:
|
||||||
*/
|
*/
|
||||||
MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
|
Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg)
|
||||||
{
|
{
|
||||||
// Get handler definition for mime type
|
// Get handler definition for mime type
|
||||||
string hs;
|
string hs;
|
||||||
@ -78,7 +81,7 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
|
|||||||
mtype.c_str(), hs.c_str()));
|
mtype.c_str(), hs.c_str()));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
MimeHandlerExec *h = new MimeHandlerExec;
|
MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str());
|
||||||
it++;
|
it++;
|
||||||
h->params.push_back(cfg->findFilter(*it++));
|
h->params.push_back(cfg->findFilter(*it++));
|
||||||
h->params.insert(h->params.end(), it, toks.end());
|
h->params.insert(h->params.end(), it, toks.end());
|
||||||
@ -93,7 +96,8 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
|
|||||||
bool indexunknown = false;
|
bool indexunknown = false;
|
||||||
cfg->getConfParam("indexallfilenames", &indexunknown);
|
cfg->getConfParam("indexallfilenames", &indexunknown);
|
||||||
if (indexunknown) {
|
if (indexunknown) {
|
||||||
return new MimeHandlerUnknown;
|
LOGDEB(("getMimeHandler: returning MimeHandlerUnknown\n"));
|
||||||
|
return new MimeHandlerUnknown("application/octet-stream");
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,60 +16,74 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _MIMEHANDLER_H_INCLUDED_
|
#ifndef _MIMEHANDLER_H_INCLUDED_
|
||||||
#define _MIMEHANDLER_H_INCLUDED_
|
#define _MIMEHANDLER_H_INCLUDED_
|
||||||
/* @(#$Id: mimehandler.h,v 1.12 2006-03-29 13:08:08 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: mimehandler.h,v 1.13 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
using std::string;
|
||||||
|
using std::list;
|
||||||
|
|
||||||
#include "rclconfig.h"
|
#include <Filter.h>
|
||||||
#include "rcldb.h"
|
|
||||||
|
|
||||||
|
class RclConfig;
|
||||||
|
|
||||||
/**
|
class RecollFilter : public Dijon::Filter {
|
||||||
* Document interner class.
|
public:
|
||||||
*/
|
RecollFilter(const string& mtype)
|
||||||
class MimeHandler {
|
: Dijon::Filter(mtype), m_forPreview(false), m_havedoc(false)
|
||||||
public:
|
{}
|
||||||
MimeHandler() : m_forPreview(false) {}
|
virtual ~RecollFilter() {}
|
||||||
virtual ~MimeHandler() {}
|
virtual bool set_property(Properties p, const string &v) {
|
||||||
|
switch (p) {
|
||||||
|
case DEFAULT_CHARSET:
|
||||||
|
m_defcharset = v;
|
||||||
|
break;
|
||||||
|
case OPERATING_MODE:
|
||||||
|
if (!v.empty() && v[0] == 'v')
|
||||||
|
m_forPreview = true;
|
||||||
|
else
|
||||||
|
m_forPreview = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/// Status from mkDoc method.
|
// We don't use this for now
|
||||||
enum Status {MHError, MHDone, MHAgain};
|
virtual bool set_document_uri(const std::string &) {return false;}
|
||||||
/**
|
|
||||||
* Transform external data into internal utf8 document
|
|
||||||
*
|
|
||||||
* @param conf the global configuration
|
|
||||||
* @param filename File from which the data comes from
|
|
||||||
* @param mimetype its mime type (from the mimemap configuration file)
|
|
||||||
* @param outdoc The output document
|
|
||||||
* @param ipath the access path for the document inside the file.
|
|
||||||
* For mono-document file types, this will always be empty.
|
|
||||||
* It is used, for example for mbox files which may contain
|
|
||||||
* multiple emails. If this is not empty in input, then the
|
|
||||||
* caller is requesting a single document (ie: for display).
|
|
||||||
* If this is empty (during indexation), it will be filled-up
|
|
||||||
* by the function, and all the file's documents will be
|
|
||||||
* returned by successive calls.
|
|
||||||
* @return The return value indicates if there are more documents to be
|
|
||||||
* fetched from the same file.
|
|
||||||
*/
|
|
||||||
virtual MimeHandler::Status mkDoc(RclConfig * conf,
|
|
||||||
const std::string &filename,
|
|
||||||
const std::string &mimetype,
|
|
||||||
Rcl::Doc& outdoc,
|
|
||||||
string& ipath) = 0;
|
|
||||||
|
|
||||||
virtual void setForPreview(bool onoff) {m_forPreview = onoff;};
|
// Default implementations
|
||||||
|
virtual bool set_document_string(const std::string &) {return false;}
|
||||||
|
virtual bool set_document_data(const char *cp, unsigned int sz) {
|
||||||
|
return set_document_string(string(cp, sz));
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
virtual bool has_documents() const {return m_havedoc;}
|
||||||
bool m_forPreview;
|
|
||||||
|
// Most doc types are single-doc
|
||||||
|
virtual bool skip_to_document(const string& s) {
|
||||||
|
if (s.empty())
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual DataInput get_required_data_input() const
|
||||||
|
{return DOCUMENT_FILE_NAME;}
|
||||||
|
virtual string get_error() const {
|
||||||
|
return m_reason;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
bool m_forPreview;
|
||||||
|
string m_defcharset;
|
||||||
|
string m_reason;
|
||||||
|
bool m_havedoc;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return indexing handler object for the given mime type
|
* Return indexing handler object for the given mime type
|
||||||
* returned pointer should be deleted by caller
|
* returned pointer should be deleted by caller
|
||||||
*/
|
*/
|
||||||
extern MimeHandler *getMimeHandler(const std::string &mtyp, RclConfig *cfg);
|
extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg);
|
||||||
|
|
||||||
/// Can this mime type be interned ?
|
/// Can this mime type be interned ?
|
||||||
extern bool canIntern(const std::string mimetype, RclConfig *cfg);
|
extern bool canIntern(const std::string mimetype, RclConfig *cfg);
|
||||||
|
|||||||
@ -37,11 +37,13 @@ class MyHtmlParser : public HtmlParser {
|
|||||||
bool in_body_tag;
|
bool in_body_tag;
|
||||||
bool in_pre_tag;
|
bool in_pre_tag;
|
||||||
bool pending_space;
|
bool pending_space;
|
||||||
string title, sample, keywords, dump, dmtime;
|
bool indexing_allowed;
|
||||||
|
string title, sample, keywords, dmtime;
|
||||||
|
string localdump;
|
||||||
|
string &dump;
|
||||||
string ocharset; // This is the charset our user thinks the doc was
|
string ocharset; // This is the charset our user thinks the doc was
|
||||||
string charset; // This is the charset it was supposedly converted to
|
string charset; // This is the charset it was supposedly converted to
|
||||||
string doccharset; // Set this to value of charset parameter in header
|
string doccharset; // Set this to value of charset parameter in header
|
||||||
bool indexing_allowed;
|
|
||||||
void process_text(const string &text);
|
void process_text(const string &text);
|
||||||
void opening_tag(const string &tag, const map<string,string> &p);
|
void opening_tag(const string &tag, const map<string,string> &p);
|
||||||
void closing_tag(const string &tag);
|
void closing_tag(const string &tag);
|
||||||
@ -52,5 +54,16 @@ class MyHtmlParser : public HtmlParser {
|
|||||||
in_body_tag(false),
|
in_body_tag(false),
|
||||||
in_pre_tag(false),
|
in_pre_tag(false),
|
||||||
pending_space(false),
|
pending_space(false),
|
||||||
indexing_allowed(true) { }
|
indexing_allowed(true),
|
||||||
|
dump(localdump)
|
||||||
|
{ }
|
||||||
|
MyHtmlParser(string& buf) :
|
||||||
|
in_script_tag(false),
|
||||||
|
in_style_tag(false),
|
||||||
|
in_body_tag(false),
|
||||||
|
in_pre_tag(false),
|
||||||
|
pending_space(false),
|
||||||
|
indexing_allowed(true),
|
||||||
|
dump(buf)
|
||||||
|
{ }
|
||||||
};
|
};
|
||||||
|
|||||||
@ -8,8 +8,8 @@ LIBS = librcl.a
|
|||||||
|
|
||||||
all: $(LIBS)
|
all: $(LIBS)
|
||||||
|
|
||||||
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_text.o mimehandler.o myhtmlparse.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o
|
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o
|
||||||
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp
|
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp
|
||||||
|
|
||||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||||
ar ru librcl.a $(OBJS) unac.o
|
ar ru librcl.a $(OBJS) unac.o
|
||||||
@ -35,6 +35,10 @@ mimetype.o : ../index/mimetype.cpp
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -c ../index/mimetype.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../index/mimetype.cpp
|
||||||
htmlparse.o : ../internfile/htmlparse.cpp
|
htmlparse.o : ../internfile/htmlparse.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/htmlparse.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/htmlparse.cpp
|
||||||
|
myhtmlparse.o : ../internfile/myhtmlparse.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp
|
||||||
|
mimehandler.o : ../internfile/mimehandler.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp
|
||||||
internfile.o : ../internfile/internfile.cpp
|
internfile.o : ../internfile/internfile.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/internfile.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/internfile.cpp
|
||||||
mh_exec.o : ../internfile/mh_exec.cpp
|
mh_exec.o : ../internfile/mh_exec.cpp
|
||||||
@ -43,12 +47,10 @@ mh_html.o : ../internfile/mh_html.cpp
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_html.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_html.cpp
|
||||||
mh_mail.o : ../internfile/mh_mail.cpp
|
mh_mail.o : ../internfile/mh_mail.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mail.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mail.cpp
|
||||||
|
mh_mbox.o : ../internfile/mh_mbox.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mbox.cpp
|
||||||
mh_text.o : ../internfile/mh_text.cpp
|
mh_text.o : ../internfile/mh_text.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp
|
||||||
mimehandler.o : ../internfile/mimehandler.cpp
|
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp
|
|
||||||
myhtmlparse.o : ../internfile/myhtmlparse.cpp
|
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp
|
|
||||||
docseq.o : ../query/docseq.cpp
|
docseq.o : ../query/docseq.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp
|
||||||
history.o : ../query/history.cpp
|
history.o : ../query/history.cpp
|
||||||
@ -124,6 +126,12 @@ mimetype.dep.stamp : ../index/mimetype.cpp
|
|||||||
htmlparse.dep.stamp : ../internfile/htmlparse.cpp
|
htmlparse.dep.stamp : ../internfile/htmlparse.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/htmlparse.cpp > htmlparse.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/htmlparse.cpp > htmlparse.dep
|
||||||
touch htmlparse.dep.stamp
|
touch htmlparse.dep.stamp
|
||||||
|
myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep
|
||||||
|
touch myhtmlparse.dep.stamp
|
||||||
|
mimehandler.dep.stamp : ../internfile/mimehandler.cpp
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep
|
||||||
|
touch mimehandler.dep.stamp
|
||||||
internfile.dep.stamp : ../internfile/internfile.cpp
|
internfile.dep.stamp : ../internfile/internfile.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/internfile.cpp > internfile.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/internfile.cpp > internfile.dep
|
||||||
touch internfile.dep.stamp
|
touch internfile.dep.stamp
|
||||||
@ -136,15 +144,12 @@ mh_html.dep.stamp : ../internfile/mh_html.cpp
|
|||||||
mh_mail.dep.stamp : ../internfile/mh_mail.cpp
|
mh_mail.dep.stamp : ../internfile/mh_mail.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mail.cpp > mh_mail.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mail.cpp > mh_mail.dep
|
||||||
touch mh_mail.dep.stamp
|
touch mh_mail.dep.stamp
|
||||||
|
mh_mbox.dep.stamp : ../internfile/mh_mbox.cpp
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mbox.cpp > mh_mbox.dep
|
||||||
|
touch mh_mbox.dep.stamp
|
||||||
mh_text.dep.stamp : ../internfile/mh_text.cpp
|
mh_text.dep.stamp : ../internfile/mh_text.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep
|
||||||
touch mh_text.dep.stamp
|
touch mh_text.dep.stamp
|
||||||
mimehandler.dep.stamp : ../internfile/mimehandler.cpp
|
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep
|
|
||||||
touch mimehandler.dep.stamp
|
|
||||||
myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp
|
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep
|
|
||||||
touch myhtmlparse.dep.stamp
|
|
||||||
docseq.dep.stamp : ../query/docseq.cpp
|
docseq.dep.stamp : ../query/docseq.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep
|
||||||
touch docseq.dep.stamp
|
touch docseq.dep.stamp
|
||||||
@ -217,13 +222,14 @@ include csguess.dep
|
|||||||
include indexer.dep
|
include indexer.dep
|
||||||
include mimetype.dep
|
include mimetype.dep
|
||||||
include htmlparse.dep
|
include htmlparse.dep
|
||||||
|
include myhtmlparse.dep
|
||||||
|
include mimehandler.dep
|
||||||
include internfile.dep
|
include internfile.dep
|
||||||
include mh_exec.dep
|
include mh_exec.dep
|
||||||
include mh_html.dep
|
include mh_html.dep
|
||||||
include mh_mail.dep
|
include mh_mail.dep
|
||||||
|
include mh_mbox.dep
|
||||||
include mh_text.dep
|
include mh_text.dep
|
||||||
include mimehandler.dep
|
|
||||||
include myhtmlparse.dep
|
|
||||||
include docseq.dep
|
include docseq.dep
|
||||||
include history.dep
|
include history.dep
|
||||||
include sortseq.dep
|
include sortseq.dep
|
||||||
|
|||||||
@ -13,13 +13,14 @@ ${depth}/index/csguess.cpp \
|
|||||||
${depth}/index/indexer.cpp \
|
${depth}/index/indexer.cpp \
|
||||||
${depth}/index/mimetype.cpp \
|
${depth}/index/mimetype.cpp \
|
||||||
${depth}/internfile/htmlparse.cpp \
|
${depth}/internfile/htmlparse.cpp \
|
||||||
|
${depth}/internfile/myhtmlparse.cpp \
|
||||||
|
${depth}/internfile/mimehandler.cpp \
|
||||||
${depth}/internfile/internfile.cpp \
|
${depth}/internfile/internfile.cpp \
|
||||||
${depth}/internfile/mh_exec.cpp \
|
${depth}/internfile/mh_exec.cpp \
|
||||||
${depth}/internfile/mh_html.cpp \
|
${depth}/internfile/mh_html.cpp \
|
||||||
${depth}/internfile/mh_mail.cpp \
|
${depth}/internfile/mh_mail.cpp \
|
||||||
|
${depth}/internfile/mh_mbox.cpp \
|
||||||
${depth}/internfile/mh_text.cpp \
|
${depth}/internfile/mh_text.cpp \
|
||||||
${depth}/internfile/mimehandler.cpp \
|
|
||||||
${depth}/internfile/myhtmlparse.cpp \
|
|
||||||
${depth}/query/docseq.cpp \
|
${depth}/query/docseq.cpp \
|
||||||
${depth}/query/history.cpp \
|
${depth}/query/history.cpp \
|
||||||
${depth}/query/sortseq.cpp \
|
${depth}/query/sortseq.cpp \
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.22 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.23 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -260,13 +260,14 @@ bool stringToStrings(const string &s, std::list<string> &tokens)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void stringToTokens(const string& str, list<string>& tokens,
|
void stringToTokens(const string& str, list<string>& tokens,
|
||||||
const string& delims)
|
const string& delims, bool skipinit)
|
||||||
{
|
{
|
||||||
string::size_type startPos, pos;
|
string::size_type startPos = 0, pos;
|
||||||
|
|
||||||
for (pos = 0;;) {
|
for (pos = 0;;) {
|
||||||
// Skip initial delims, break if this eats all.
|
// Skip initial delims, break if this eats all.
|
||||||
if ((startPos = str.find_first_not_of(delims, pos)) == string::npos)
|
if (skipinit &&
|
||||||
|
(startPos = str.find_first_not_of(delims, pos)) == string::npos)
|
||||||
break;
|
break;
|
||||||
// Find next delimiter or end of string (end of token)
|
// Find next delimiter or end of string (end of token)
|
||||||
pos = str.find_first_of(delims, startPos);
|
pos = str.find_first_of(delims, startPos);
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _SMALLUT_H_INCLUDED_
|
#ifndef _SMALLUT_H_INCLUDED_
|
||||||
#define _SMALLUT_H_INCLUDED_
|
#define _SMALLUT_H_INCLUDED_
|
||||||
/* @(#$Id: smallut.h,v 1.22 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: smallut.h,v 1.23 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <map>
|
#include <map>
|
||||||
@ -51,7 +51,7 @@ extern bool stringToStrings(const string &s, list<string> &tokens);
|
|||||||
* Split input string. No handling of quoting
|
* Split input string. No handling of quoting
|
||||||
*/
|
*/
|
||||||
extern void stringToTokens(const string &s, list<string> &tokens,
|
extern void stringToTokens(const string &s, list<string> &tokens,
|
||||||
const string &delims = " \t");
|
const string &delims = " \t", bool skipinit=true);
|
||||||
|
|
||||||
/** Convert string to boolean */
|
/** Convert string to boolean */
|
||||||
extern bool stringToBool(const string &s);
|
extern bool stringToBool(const string &s);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user