test data indexing result same terms as 1.6.3
This commit is contained in:
parent
33c95ef1ba
commit
229eb0de78
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.46 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.47 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -426,7 +426,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
Rcl::Doc fileDoc;
|
||||
fileDoc.fmtime = doc.fmtime;
|
||||
fileDoc.utf8fn = doc.utf8fn;
|
||||
fileDoc.mimetype = doc.mimetype;
|
||||
fileDoc.mimetype = interner.get_mimetype();
|
||||
if (!m_db.add(fn, fileDoc, stp))
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
|
||||
@ -51,7 +51,7 @@ namespace Dijon
|
||||
{
|
||||
public:
|
||||
/// Builds an empty filter.
|
||||
Filter(const std::string &mime_type) {}
|
||||
Filter(const std::string & /*mime_type */) {}
|
||||
/// Destroys the filter.
|
||||
virtual ~Filter() {}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.20 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -149,6 +149,7 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
|
||||
}
|
||||
|
||||
// Look for appropriate handler (might still return empty)
|
||||
m_mimetype = l_mime;
|
||||
Dijon::Filter *df = getMimeHandler(l_mime, m_cfg);
|
||||
|
||||
if (!df) {
|
||||
@ -172,6 +173,66 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
|
||||
m_fn.c_str()));
|
||||
}
|
||||
|
||||
// Destroy the interner: delete every filter still held on the handler
// stack, starting with the topmost one, then call tmpcleanup().
FileInterner::~FileInterner()
{
    for (; !m_handlers.empty(); m_handlers.pop_back())
        delete m_handlers.back();
    tmpcleanup();
}
|
||||
|
||||
static const string string_empty;
|
||||
// Return the value stored under the "mimetype" key of the filter's
// metadata map, or a copy of string_empty when the key is absent.
static const string get_mimetype(Dijon::Filter* df)
{
    const map<string, string>& meta = df->get_meta_data();
    map<string, string>::const_iterator pos = meta.find("mimetype");
    return pos == meta.end() ? string_empty : pos->second;
}
|
||||
|
||||
// Copy the metadata of the topmost handler into the Rcl::Doc fields.
// Only keys present in the handler's metadata map are copied; other doc
// fields are left untouched. The "mimetype" key is deliberately not read
// here. Always returns true.
bool FileInterner::dijontorcl(Rcl::Doc& doc)
{
    const map<string, string>& meta = m_handlers.back()->get_meta_data();
    const map<string, string>::const_iterator notfound = meta.end();
    map<string, string>::const_iterator pos;

    if ((pos = meta.find("origcharset")) != notfound)
        doc.origcharset = pos->second;

    if ((pos = meta.find("content")) != notfound)
        doc.text = pos->second;

    if ((pos = meta.find("title")) != notfound)
        doc.title = pos->second;

    if ((pos = meta.find("keywords")) != notfound)
        doc.keywords = pos->second;

    if ((pos = meta.find("modificationdate")) != notfound)
        doc.dmtime = pos->second;

    // The abstract comes from "abstract" when set, else from "sample".
    pos = meta.find("abstract");
    if (pos == notfound)
        pos = meta.find("sample");
    if (pos != notfound)
        doc.abstract = pos->second;

    return true;
}
|
||||
|
||||
|
||||
static const unsigned int MAXHANDLERS = 20;
|
||||
|
||||
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
|
||||
@ -182,8 +243,11 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
|
||||
return FIError;
|
||||
}
|
||||
|
||||
// Ipath vector.
|
||||
// Note that the vector is big enough for the maximum stack. All values
|
||||
// over the last significant one are ""
|
||||
// We set the ipath for the first handler here, others are set
|
||||
// when they're pushed on the stack
|
||||
vector<string> vipath(MAXHANDLERS);
|
||||
int vipathidx = 0;
|
||||
if (!ipath.empty()) {
|
||||
@ -196,12 +260,8 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Try to get doc from the topmost filter */
|
||||
while (!m_handlers.empty()) {
|
||||
if (!vipath.empty()) {
|
||||
|
||||
}
|
||||
if (!m_handlers.back()->has_documents()) {
|
||||
// No docs at the current top level. Pop and see if there
|
||||
// is something at the previous one
|
||||
@ -277,23 +337,42 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
|
||||
LOGERR(("FileInterner::internfile: stack empty\n"));
|
||||
return FIError;
|
||||
}
|
||||
|
||||
// If indexing, we have to collect the ipath stack.
|
||||
|
||||
// While we're at it, we also set the mimetype, which is a special
|
||||
// property: we want to get it from the topmost doc
|
||||
// with an ipath, not the last one which is always text/html
|
||||
// Note that ipath is returned through the parameter not doc.ipath
|
||||
if (!m_forPreview) {
|
||||
string &ipath = doc.ipath;
|
||||
bool hasipath = false;
|
||||
for (vector<Dijon::Filter*>::const_iterator it = m_handlers.begin();
|
||||
it != m_handlers.end(); it++) {
|
||||
map<string,string>::const_iterator iti =
|
||||
(*it)->get_meta_data().find("ipath");
|
||||
if (iti != (*it)->get_meta_data().end()) {
|
||||
if (!iti->second.empty())
|
||||
doc.mimetype = m_mimetype;
|
||||
LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str()));
|
||||
map<string,string>::const_iterator titi;
|
||||
|
||||
for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
|
||||
hit != m_handlers.end(); hit++) {
|
||||
|
||||
const map<string, string>& docdata = (*hit)->get_meta_data();
|
||||
map<string, string>::const_iterator iti = docdata.find("ipath");
|
||||
|
||||
if (iti != docdata.end()) {
|
||||
if (!iti->second.empty()) {
|
||||
// We have a non-empty ipath
|
||||
hasipath = true;
|
||||
titi = docdata.find("mimetype");
|
||||
if (titi != docdata.end())
|
||||
doc.mimetype = titi->second;
|
||||
}
|
||||
ipath += iti->second + "|";
|
||||
} else {
|
||||
ipath += "|";
|
||||
}
|
||||
}
|
||||
|
||||
// Walk done, transform the list into a string
|
||||
if (hasipath) {
|
||||
LOGDEB(("IPATH [%s]\n", ipath.c_str()));
|
||||
LOGDEB2(("IPATH [%s]\n", ipath.c_str()));
|
||||
string::size_type sit = ipath.find_last_not_of("|");
|
||||
if (sit == string::npos)
|
||||
ipath.erase();
|
||||
@ -304,7 +383,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
|
||||
}
|
||||
}
|
||||
|
||||
dijontorcl(m_handlers.back(), doc);
|
||||
dijontorcl(doc);
|
||||
|
||||
// Destack what can be
|
||||
while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
|
||||
@ -317,56 +396,6 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
|
||||
return FIAgain;
|
||||
}
|
||||
|
||||
|
||||
// Copy the metadata of the given filter into the Rcl::Doc fields,
// including the "mimetype" entry. Only keys present in the filter's
// metadata map are copied; other doc fields are left untouched.
// Always returns true.
bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc)
{
    const map<string, string>& meta = df->get_meta_data();
    const map<string, string>::const_iterator notfound = meta.end();
    map<string, string>::const_iterator pos;

    if ((pos = meta.find("mimetype")) != notfound)
        doc.mimetype = pos->second;

    if ((pos = meta.find("origcharset")) != notfound)
        doc.origcharset = pos->second;

    if ((pos = meta.find("content")) != notfound)
        doc.text = pos->second;

    if ((pos = meta.find("title")) != notfound)
        doc.title = pos->second;

    if ((pos = meta.find("keywords")) != notfound)
        doc.keywords = pos->second;

    if ((pos = meta.find("modificationdate")) != notfound)
        doc.dmtime = pos->second;

    // The abstract comes from "abstract" when set, else from "sample".
    pos = meta.find("abstract");
    if (pos == notfound)
        pos = meta.find("sample");
    if (pos != notfound)
        doc.abstract = pos->second;

    return true;
}
|
||||
|
||||
// Destroy the interner: delete every filter still held on the handler
// stack, starting with the topmost one, then call tmpcleanup().
FileInterner::~FileInterner()
{
    for (; !m_handlers.empty(); m_handlers.pop_back())
        delete m_handlers.back();
    tmpcleanup();
}
|
||||
|
||||
#else
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _INTERNFILE_H_INCLUDED_
|
||||
#define _INTERNFILE_H_INCLUDED_
|
||||
/* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: internfile.h,v 1.8 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@ -70,10 +70,12 @@ class FileInterner {
|
||||
* should be called again to get the following one(s).
|
||||
*/
|
||||
Status internfile(Rcl::Doc& doc, string &ipath);
|
||||
const string& get_mimetype() {return m_mimetype;}
|
||||
|
||||
private:
|
||||
RclConfig *m_cfg;
|
||||
string m_fn;
|
||||
string m_mimetype; // Mime type for [uncompressed] file
|
||||
bool m_forPreview;
|
||||
// m_tdir and m_tfile are used only for decompressing input file if needed
|
||||
const string& m_tdir;
|
||||
@ -81,7 +83,7 @@ class FileInterner {
|
||||
vector<Dijon::Filter*> m_handlers;
|
||||
|
||||
void tmpcleanup();
|
||||
static bool dijontorcl(Dijon::Filter *, Rcl::Doc&);
|
||||
bool dijontorcl(Rcl::Doc&);
|
||||
};
|
||||
|
||||
#endif /* _INTERNFILE_H_INCLUDED_ */
|
||||
|
||||
@ -64,8 +64,8 @@ bool MimeHandlerHtml::next_document()
|
||||
if (m_havedoc == false)
|
||||
return false;
|
||||
m_havedoc = false;
|
||||
LOGDEB(("textHtmlToDoc: next_document\n"));
|
||||
string charset = m_defcharset;
|
||||
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",charset.c_str()));
|
||||
|
||||
// - We first try to convert from the default configured charset
|
||||
// (which may depend on the current directory) to utf-8. If this
|
||||
@ -76,10 +76,11 @@ bool MimeHandlerHtml::next_document()
|
||||
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
||||
|
||||
|
||||
MyHtmlParser p(m_metaData["content"]);
|
||||
MyHtmlParser result;
|
||||
for (int pass = 0; pass < 2; pass++) {
|
||||
string transcoded;
|
||||
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
||||
MyHtmlParser p;
|
||||
// Try transcoding. If it fails, use original text.
|
||||
if (!transcode(m_html, transcoded, charset, "UTF-8")) {
|
||||
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
||||
@ -97,16 +98,18 @@ bool MimeHandlerHtml::next_document()
|
||||
try {
|
||||
p.parse_html(transcoded);
|
||||
// No exception: ok?
|
||||
result = p;
|
||||
break;
|
||||
} catch (bool diag) {
|
||||
result = p;
|
||||
if (diag == true)
|
||||
break;
|
||||
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
||||
charset.c_str(), p.doccharset.c_str()));
|
||||
if (!p.doccharset.empty() &&
|
||||
!samecharset(p.doccharset, p.ocharset)) {
|
||||
charset.c_str(),result.doccharset.c_str()));
|
||||
if (!result.doccharset.empty() &&
|
||||
!samecharset(result.doccharset, result.ocharset)) {
|
||||
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
||||
charset = p.doccharset;
|
||||
charset = result.doccharset;
|
||||
} else {
|
||||
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
||||
return false;
|
||||
@ -115,11 +118,12 @@ bool MimeHandlerHtml::next_document()
|
||||
}
|
||||
|
||||
m_metaData["origcharset"] = m_defcharset;
|
||||
m_metaData["content"] = result.dump;
|
||||
m_metaData["charset"] = "utf-8";
|
||||
m_metaData["title"] = p.title;
|
||||
m_metaData["keywords"] = p.keywords;
|
||||
m_metaData["modificationdate"] = p.dmtime;
|
||||
m_metaData["sample"] = p.sample;
|
||||
m_metaData["title"] = result.title;
|
||||
m_metaData["keywords"] = result.keywords;
|
||||
m_metaData["modificationdate"] = result.dmtime;
|
||||
m_metaData["sample"] = result.sample;
|
||||
m_metaData["mimetype"] = "text/plain";
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.25 2006-12-15 16:33:15 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -94,9 +94,22 @@ bool MimeHandlerMail::next_document()
|
||||
{
|
||||
if (!m_havedoc)
|
||||
return false;
|
||||
m_havedoc = false;
|
||||
m_metaData["mimetype"] = "text/plain";
|
||||
return processMsg(m_bincdoc, 0);
|
||||
bool res = false;
|
||||
|
||||
if (m_idx == -1) {
|
||||
m_metaData["mimetype"] = "text/plain";
|
||||
res =processMsg(m_bincdoc, 0);
|
||||
} else {
|
||||
res = processAttach();
|
||||
}
|
||||
m_idx++;
|
||||
m_havedoc = m_idx < (int)m_attachments.size();
|
||||
return res;
|
||||
}
|
||||
|
||||
// Attachment processing stub: performs no work and always returns false.
bool MimeHandlerMail::processAttach()
{
    return false;
}
|
||||
|
||||
// Transform a single message into a document. The subject becomes the
|
||||
@ -301,6 +314,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||
out += "]";
|
||||
out += "\n\n";
|
||||
}
|
||||
// m_attachments.push_back(&doc);
|
||||
// We're done with this part
|
||||
return;
|
||||
}
|
||||
@ -373,19 +387,18 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||
map<string, string>::const_iterator it =
|
||||
mh.get_meta_data().find("content");
|
||||
if (it != mh.get_meta_data().end())
|
||||
putf8 = &it->second;
|
||||
out += it->second;
|
||||
} else {
|
||||
// Transcode to utf-8
|
||||
if (!transcode(body, utf8, charset, "UTF-8")) {
|
||||
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
|
||||
charset.c_str()));
|
||||
putf8 = &body;
|
||||
out += body;
|
||||
} else {
|
||||
putf8 = &utf8;
|
||||
out += utf8;
|
||||
}
|
||||
}
|
||||
if (putf8)
|
||||
out += *putf8;
|
||||
|
||||
if (out.length() && out[out.length()-1] != '\n')
|
||||
out += '\n';
|
||||
|
||||
|
||||
@ -16,9 +16,12 @@
|
||||
*/
|
||||
#ifndef _MAIL_H_INCLUDED_
|
||||
#define _MAIL_H_INCLUDED_
|
||||
/* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: mh_mail.h,v 1.10 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
using std::vector;
|
||||
|
||||
#include "mimehandler.h"
|
||||
|
||||
namespace Binc {
|
||||
@ -34,18 +37,23 @@ namespace Binc {
|
||||
class MimeHandlerMail : public RecollFilter {
|
||||
public:
|
||||
MimeHandlerMail(const string &mt)
|
||||
: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0)
|
||||
: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
|
||||
{}
|
||||
virtual ~MimeHandlerMail();
|
||||
virtual bool set_document_file(const string &file_path);
|
||||
virtual bool set_document_string(const string &data);
|
||||
virtual bool next_document();
|
||||
|
||||
private:
|
||||
Binc::MimeDocument *m_bincdoc;
|
||||
bool processMsg(Binc::MimePart *doc, int depth);
|
||||
void walkmime(Binc::MimePart* doc, int depth);
|
||||
int m_fd;
|
||||
std::stringstream *m_stream;
|
||||
bool processAttach();
|
||||
Binc::MimeDocument *m_bincdoc;
|
||||
int m_fd;
|
||||
std::stringstream *m_stream;
|
||||
int m_idx; // starts at -1 for self, then index into
|
||||
// attachments;
|
||||
vector<Binc::MimePart *> m_attachments;
|
||||
};
|
||||
|
||||
#endif /* _MAIL_H_INCLUDED_ */
|
||||
|
||||
@ -37,13 +37,11 @@ class MyHtmlParser : public HtmlParser {
|
||||
bool in_body_tag;
|
||||
bool in_pre_tag;
|
||||
bool pending_space;
|
||||
bool indexing_allowed;
|
||||
string title, sample, keywords, dmtime;
|
||||
string localdump;
|
||||
string &dump;
|
||||
string title, sample, keywords, dump, dmtime;
|
||||
string ocharset; // This is the charset our user thinks the doc was
|
||||
string charset; // This is the charset it was supposedly converted to
|
||||
string doccharset; // Set this to value of charset parameter in header
|
||||
bool indexing_allowed;
|
||||
void process_text(const string &text);
|
||||
void opening_tag(const string &tag, const map<string,string> &p);
|
||||
void closing_tag(const string &tag);
|
||||
@ -54,16 +52,5 @@ class MyHtmlParser : public HtmlParser {
|
||||
in_body_tag(false),
|
||||
in_pre_tag(false),
|
||||
pending_space(false),
|
||||
indexing_allowed(true),
|
||||
dump(localdump)
|
||||
{ }
|
||||
MyHtmlParser(string& buf) :
|
||||
in_script_tag(false),
|
||||
in_style_tag(false),
|
||||
in_body_tag(false),
|
||||
in_pre_tag(false),
|
||||
pending_space(false),
|
||||
indexing_allowed(true),
|
||||
dump(buf)
|
||||
{ }
|
||||
indexing_allowed(true) { }
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user