test data indexing result same terms as 1.6.3

This commit is contained in:
dockes 2006-12-15 16:33:15 +00:00
parent 33c95ef1ba
commit 229eb0de78
8 changed files with 152 additions and 109 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.46 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: indexer.cpp,v 1.47 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -426,7 +426,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
Rcl::Doc fileDoc; Rcl::Doc fileDoc;
fileDoc.fmtime = doc.fmtime; fileDoc.fmtime = doc.fmtime;
fileDoc.utf8fn = doc.utf8fn; fileDoc.utf8fn = doc.utf8fn;
fileDoc.mimetype = doc.mimetype; fileDoc.mimetype = interner.get_mimetype();
if (!m_db.add(fn, fileDoc, stp)) if (!m_db.add(fn, fileDoc, stp))
return FsTreeWalker::FtwError; return FsTreeWalker::FtwError;
} }

View File

@ -51,7 +51,7 @@ namespace Dijon
{ {
public: public:
/// Builds an empty filter. /// Builds an empty filter.
Filter(const std::string &mime_type) {} Filter(const std::string & /*mime_type */) {}
/// Destroys the filter. /// Destroys the filter.
virtual ~Filter() {} virtual ~Filter() {}

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: internfile.cpp,v 1.20 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -149,6 +149,7 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
} }
// Look for appropriate handler (might still return empty) // Look for appropriate handler (might still return empty)
m_mimetype = l_mime;
Dijon::Filter *df = getMimeHandler(l_mime, m_cfg); Dijon::Filter *df = getMimeHandler(l_mime, m_cfg);
if (!df) { if (!df) {
@ -172,6 +173,66 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
m_fn.c_str())); m_fn.c_str()));
} }
// Destructor: deletes every filter still held in m_handlers, starting with
// the topmost (last) one, then calls tmpcleanup().
FileInterner::~FileInterner()
{
    // Each pass deletes the current top entry, then drops it from the stack.
    for (; !m_handlers.empty(); m_handlers.pop_back()) {
        delete m_handlers.back();
    }
    tmpcleanup();
}
// Shared empty string, returned when a filter has no "mimetype" entry.
static const string string_empty;

// Look up the "mimetype" entry in the metadata of the given filter.
// Returns a copy of the stored value, or an empty string when the
// filter's metadata has no such entry.
static const string get_mimetype(Dijon::Filter* df)
{
    const std::map<std::string, std::string>& meta = df->get_meta_data();
    const map<string,string>::const_iterator pos = meta.find("mimetype");
    return pos == meta.end() ? string_empty : pos->second;
}
// Copy the metadata produced by the topmost handler (m_handlers.back())
// into the corresponding fields of doc.
//
// Entries copied when present: "origcharset", "content" (to doc.text),
// "title", "keywords", "modificationdate" (to doc.dmtime). doc.abstract is
// taken from "abstract" when present, else from "sample" when present.
// Fields whose entry is missing are left untouched. The "mimetype" entry is
// not copied here.
//
// Returns false without touching doc when the handler stack is empty,
// true otherwise.
bool FileInterner::dijontorcl(Rcl::Doc& doc)
{
    // Calling back() on an empty vector is undefined behaviour: refuse
    // instead of dereferencing a non-existent handler.
    if (m_handlers.empty())
        return false;

    Dijon::Filter *df = m_handlers.back();
    const std::map<std::string, std::string>& docdata = df->get_meta_data();
    const map<string,string>::const_iterator none = docdata.end();
    map<string,string>::const_iterator it;

    it = docdata.find("origcharset");
    if (it != none)
        doc.origcharset = it->second;

    it = docdata.find("content");
    if (it != none)
        doc.text = it->second;

    it = docdata.find("title");
    if (it != none)
        doc.title = it->second;

    it = docdata.find("keywords");
    if (it != none)
        doc.keywords = it->second;

    it = docdata.find("modificationdate");
    if (it != none)
        doc.dmtime = it->second;

    // Prefer an explicit abstract; fall back to the sample text.
    it = docdata.find("abstract");
    if (it == none)
        it = docdata.find("sample");
    if (it != none)
        doc.abstract = it->second;

    return true;
}
static const unsigned int MAXHANDLERS = 20; static const unsigned int MAXHANDLERS = 20;
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
@ -182,8 +243,11 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
return FIError; return FIError;
} }
// Ipath vector.
// Note that the vector is big enough for the maximum stack. All values // Note that the vector is big enough for the maximum stack. All values
// over the last significant one are "" // over the last significant one are ""
// We set the ipath for the first handler here, others are set
// when they're pushed on the stack
vector<string> vipath(MAXHANDLERS); vector<string> vipath(MAXHANDLERS);
int vipathidx = 0; int vipathidx = 0;
if (!ipath.empty()) { if (!ipath.empty()) {
@ -196,12 +260,8 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
} }
} }
/* Try to get doc from the topmost filter */ /* Try to get doc from the topmost filter */
while (!m_handlers.empty()) { while (!m_handlers.empty()) {
if (!vipath.empty()) {
}
if (!m_handlers.back()->has_documents()) { if (!m_handlers.back()->has_documents()) {
// No docs at the current top level. Pop and see if there // No docs at the current top level. Pop and see if there
// is something at the previous one // is something at the previous one
@ -277,23 +337,42 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
LOGERR(("FileInterner::internfile: stack empty\n")); LOGERR(("FileInterner::internfile: stack empty\n"));
return FIError; return FIError;
} }
// If indexing, we have to collect the ipath stack.
// While we're at it, we also set the mimetype, which is a special
// property: we want to get it from the topmost doc
// with an ipath, not the last one which is always text/html
// Note that ipath is returned through the parameter not doc.ipath
if (!m_forPreview) { if (!m_forPreview) {
string &ipath = doc.ipath;
bool hasipath = false; bool hasipath = false;
for (vector<Dijon::Filter*>::const_iterator it = m_handlers.begin(); doc.mimetype = m_mimetype;
it != m_handlers.end(); it++) { LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str()));
map<string,string>::const_iterator iti = map<string,string>::const_iterator titi;
(*it)->get_meta_data().find("ipath");
if (iti != (*it)->get_meta_data().end()) { for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
if (!iti->second.empty()) hit != m_handlers.end(); hit++) {
const map<string, string>& docdata = (*hit)->get_meta_data();
map<string, string>::const_iterator iti = docdata.find("ipath");
if (iti != docdata.end()) {
if (!iti->second.empty()) {
// We have a non-empty ipath
hasipath = true; hasipath = true;
titi = docdata.find("mimetype");
if (titi != docdata.end())
doc.mimetype = titi->second;
}
ipath += iti->second + "|"; ipath += iti->second + "|";
} else { } else {
ipath += "|"; ipath += "|";
} }
} }
// Walk done, transform the list into a string
if (hasipath) { if (hasipath) {
LOGDEB(("IPATH [%s]\n", ipath.c_str())); LOGDEB2(("IPATH [%s]\n", ipath.c_str()));
string::size_type sit = ipath.find_last_not_of("|"); string::size_type sit = ipath.find_last_not_of("|");
if (sit == string::npos) if (sit == string::npos)
ipath.erase(); ipath.erase();
@ -304,7 +383,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
} }
} }
dijontorcl(m_handlers.back(), doc); dijontorcl(doc);
// Destack what can be // Destack what can be
while (!m_handlers.empty() && !m_handlers.back()->has_documents()) { while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
@ -317,56 +396,6 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
return FIAgain; return FIAgain;
} }
// Copy the metadata exposed by filter df into the fields of doc.
//
// Entries copied when present: "mimetype", "origcharset", "content" (to
// doc.text), "title", "keywords", "modificationdate" (to doc.dmtime).
// doc.abstract is taken from "abstract" when present, else from "sample"
// when present. Fields whose entry is missing are left untouched.
// Always returns true.
bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc)
{
    const std::map<std::string, std::string>& meta = df->get_meta_data();
    const map<string,string>::const_iterator none = meta.end();
    map<string,string>::const_iterator pos;

    if ((pos = meta.find("mimetype")) != none)
        doc.mimetype = pos->second;

    if ((pos = meta.find("origcharset")) != none)
        doc.origcharset = pos->second;

    if ((pos = meta.find("content")) != none)
        doc.text = pos->second;

    if ((pos = meta.find("title")) != none)
        doc.title = pos->second;

    if ((pos = meta.find("keywords")) != none)
        doc.keywords = pos->second;

    if ((pos = meta.find("modificationdate")) != none)
        doc.dmtime = pos->second;

    // Prefer an explicit abstract; fall back to the sample text.
    pos = meta.find("abstract");
    if (pos == none)
        pos = meta.find("sample");
    if (pos != none)
        doc.abstract = pos->second;

    return true;
}
// Destructor: deletes every filter still held in m_handlers, starting with
// the topmost (last) one, then calls tmpcleanup().
FileInterner::~FileInterner()
{
    // Each pass deletes the current top entry, then drops it from the stack.
    for (; !m_handlers.empty(); m_handlers.pop_back()) {
        delete m_handlers.back();
    }
    tmpcleanup();
}
#else #else
#include <stdio.h> #include <stdio.h>

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _INTERNFILE_H_INCLUDED_ #ifndef _INTERNFILE_H_INCLUDED_
#define _INTERNFILE_H_INCLUDED_ #define _INTERNFILE_H_INCLUDED_
/* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: internfile.h,v 1.8 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <vector> #include <vector>
@ -70,10 +70,12 @@ class FileInterner {
* should be called again to get the following one(s). * should be called again to get the following one(s).
*/ */
Status internfile(Rcl::Doc& doc, string &ipath); Status internfile(Rcl::Doc& doc, string &ipath);
const string& get_mimetype() {return m_mimetype;}
private: private:
RclConfig *m_cfg; RclConfig *m_cfg;
string m_fn; string m_fn;
string m_mimetype; // Mime type for [uncompressed] file
bool m_forPreview; bool m_forPreview;
// m_tdir and m_tfile are used only for decompressing input file if needed // m_tdir and m_tfile are used only for decompressing input file if needed
const string& m_tdir; const string& m_tdir;
@ -81,7 +83,7 @@ class FileInterner {
vector<Dijon::Filter*> m_handlers; vector<Dijon::Filter*> m_handlers;
void tmpcleanup(); void tmpcleanup();
static bool dijontorcl(Dijon::Filter *, Rcl::Doc&); bool dijontorcl(Rcl::Doc&);
}; };
#endif /* _INTERNFILE_H_INCLUDED_ */ #endif /* _INTERNFILE_H_INCLUDED_ */

View File

@ -64,8 +64,8 @@ bool MimeHandlerHtml::next_document()
if (m_havedoc == false) if (m_havedoc == false)
return false; return false;
m_havedoc = false; m_havedoc = false;
LOGDEB(("textHtmlToDoc: next_document\n"));
string charset = m_defcharset; string charset = m_defcharset;
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",charset.c_str()));
// - We first try to convert from the default configured charset // - We first try to convert from the default configured charset
// (which may depend of the current directory) to utf-8. If this // (which may depend of the current directory) to utf-8. If this
@ -76,10 +76,11 @@ bool MimeHandlerHtml::next_document()
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str())); LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
MyHtmlParser p(m_metaData["content"]); MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) { for (int pass = 0; pass < 2; pass++) {
string transcoded; string transcoded;
LOGDEB(("Html::mkDoc: pass %d\n", pass)); LOGDEB(("Html::mkDoc: pass %d\n", pass));
MyHtmlParser p;
// Try transcoding. If it fails, use original text. // Try transcoding. If it fails, use original text.
if (!transcode(m_html, transcoded, charset, "UTF-8")) { if (!transcode(m_html, transcoded, charset, "UTF-8")) {
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n", LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
@ -97,16 +98,18 @@ bool MimeHandlerHtml::next_document()
try { try {
p.parse_html(transcoded); p.parse_html(transcoded);
// No exception: ok? // No exception: ok?
result = p;
break; break;
} catch (bool diag) { } catch (bool diag) {
result = p;
if (diag == true) if (diag == true)
break; break;
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
charset.c_str(), p.doccharset.c_str())); charset.c_str(),result.doccharset.c_str()));
if (!p.doccharset.empty() && if (!result.doccharset.empty() &&
!samecharset(p.doccharset, p.ocharset)) { !samecharset(result.doccharset, result.ocharset)) {
LOGDEB(("textHtmlToDoc: reparse for charsets\n")); LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
charset = p.doccharset; charset = result.doccharset;
} else { } else {
LOGERR(("textHtmlToDoc:: error: non charset exception\n")); LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
return false; return false;
@ -115,11 +118,12 @@ bool MimeHandlerHtml::next_document()
} }
m_metaData["origcharset"] = m_defcharset; m_metaData["origcharset"] = m_defcharset;
m_metaData["content"] = result.dump;
m_metaData["charset"] = "utf-8"; m_metaData["charset"] = "utf-8";
m_metaData["title"] = p.title; m_metaData["title"] = result.title;
m_metaData["keywords"] = p.keywords; m_metaData["keywords"] = result.keywords;
m_metaData["modificationdate"] = p.dmtime; m_metaData["modificationdate"] = result.dmtime;
m_metaData["sample"] = p.sample; m_metaData["sample"] = result.sample;
m_metaData["mimetype"] = "text/plain"; m_metaData["mimetype"] = "text/plain";
return true; return true;
} }

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.25 2006-12-15 16:33:15 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -94,9 +94,22 @@ bool MimeHandlerMail::next_document()
{ {
if (!m_havedoc) if (!m_havedoc)
return false; return false;
m_havedoc = false; bool res = false;
m_metaData["mimetype"] = "text/plain";
return processMsg(m_bincdoc, 0); if (m_idx == -1) {
m_metaData["mimetype"] = "text/plain";
res =processMsg(m_bincdoc, 0);
} else {
res = processAttach();
}
m_idx++;
m_havedoc = m_idx < (int)m_attachments.size();
return res;
}
// Placeholder for attachment processing: the body does no work yet and
// always returns false.
bool MimeHandlerMail::processAttach()
{
return false;
}
// Transform a single message into a document. The subject becomes the // Transform a single message into a document. The subject becomes the
@ -301,6 +314,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
out += "]"; out += "]";
out += "\n\n"; out += "\n\n";
} }
// m_attachments.push_back(&doc);
// We're done with this part // We're done with this part
return; return;
} }
@ -373,19 +387,18 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
map<string, string>::const_iterator it = map<string, string>::const_iterator it =
mh.get_meta_data().find("content"); mh.get_meta_data().find("content");
if (it != mh.get_meta_data().end()) if (it != mh.get_meta_data().end())
putf8 = &it->second; out += it->second;
} else { } else {
// Transcode to utf-8 // Transcode to utf-8
if (!transcode(body, utf8, charset, "UTF-8")) { if (!transcode(body, utf8, charset, "UTF-8")) {
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
charset.c_str())); charset.c_str()));
putf8 = &body; out += body;
} else { } else {
putf8 = &utf8; out += utf8;
} }
} }
if (putf8)
out += *putf8;
if (out.length() && out[out.length()-1] != '\n') if (out.length() && out[out.length()-1] != '\n')
out += '\n'; out += '\n';

View File

@ -16,9 +16,12 @@
*/ */
#ifndef _MAIL_H_INCLUDED_ #ifndef _MAIL_H_INCLUDED_
#define _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_
/* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_mail.h,v 1.10 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */
#include <sstream> #include <sstream>
#include <vector>
using std::vector;
#include "mimehandler.h" #include "mimehandler.h"
namespace Binc { namespace Binc {
@ -34,18 +37,23 @@ namespace Binc {
class MimeHandlerMail : public RecollFilter { class MimeHandlerMail : public RecollFilter {
public: public:
MimeHandlerMail(const string &mt) MimeHandlerMail(const string &mt)
: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0) : RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
{} {}
virtual ~MimeHandlerMail(); virtual ~MimeHandlerMail();
virtual bool set_document_file(const string &file_path); virtual bool set_document_file(const string &file_path);
virtual bool set_document_string(const string &data); virtual bool set_document_string(const string &data);
virtual bool next_document(); virtual bool next_document();
private: private:
Binc::MimeDocument *m_bincdoc;
bool processMsg(Binc::MimePart *doc, int depth); bool processMsg(Binc::MimePart *doc, int depth);
void walkmime(Binc::MimePart* doc, int depth); void walkmime(Binc::MimePart* doc, int depth);
int m_fd; bool processAttach();
std::stringstream *m_stream; Binc::MimeDocument *m_bincdoc;
int m_fd;
std::stringstream *m_stream;
int m_idx; // starts at -1 for self, then index into
// attachments;
vector<Binc::MimePart *> m_attachments;
}; };
#endif /* _MAIL_H_INCLUDED_ */ #endif /* _MAIL_H_INCLUDED_ */

View File

@ -37,13 +37,11 @@ class MyHtmlParser : public HtmlParser {
bool in_body_tag; bool in_body_tag;
bool in_pre_tag; bool in_pre_tag;
bool pending_space; bool pending_space;
bool indexing_allowed; string title, sample, keywords, dump, dmtime;
string title, sample, keywords, dmtime;
string localdump;
string &dump;
string ocharset; // This is the charset our user thinks the doc was string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed;
void process_text(const string &text); void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p); void opening_tag(const string &tag, const map<string,string> &p);
void closing_tag(const string &tag); void closing_tag(const string &tag);
@ -54,16 +52,5 @@ class MyHtmlParser : public HtmlParser {
in_body_tag(false), in_body_tag(false),
in_pre_tag(false), in_pre_tag(false),
pending_space(false), pending_space(false),
indexing_allowed(true), indexing_allowed(true) { }
dump(localdump)
{ }
MyHtmlParser(string& buf) :
in_script_tag(false),
in_style_tag(false),
in_body_tag(false),
in_pre_tag(false),
pending_space(false),
indexing_allowed(true),
dump(buf)
{ }
}; };