Test data indexing results in the same terms as 1.6.3

This commit is contained in:
dockes 2006-12-15 16:33:15 +00:00
parent 33c95ef1ba
commit 229eb0de78
8 changed files with 152 additions and 109 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.46 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.47 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -426,7 +426,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
Rcl::Doc fileDoc;
fileDoc.fmtime = doc.fmtime;
fileDoc.utf8fn = doc.utf8fn;
fileDoc.mimetype = doc.mimetype;
fileDoc.mimetype = interner.get_mimetype();
if (!m_db.add(fn, fileDoc, stp))
return FsTreeWalker::FtwError;
}

View File

@ -51,7 +51,7 @@ namespace Dijon
{
public:
/// Builds an empty filter.
Filter(const std::string &mime_type) {}
Filter(const std::string & /*mime_type */) {}
/// Destroys the filter.
virtual ~Filter() {}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.20 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -149,6 +149,7 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
}
// Look for appropriate handler (might still return empty)
m_mimetype = l_mime;
Dijon::Filter *df = getMimeHandler(l_mime, m_cfg);
if (!df) {
@ -172,6 +173,66 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
m_fn.c_str()));
}
// Destructor: delete every filter held in m_handlers, starting with the
// last element and working back to the first, then call tmpcleanup().
FileInterner::~FileInterner()
{
    for (; !m_handlers.empty(); m_handlers.pop_back()) {
        Dijon::Filter *top = m_handlers.back();
        delete top;
    }
    tmpcleanup();
}
static const string string_empty;
// Return the value stored under the "mimetype" key of the filter's
// metadata map, or string_empty when the map has no such key.
static const string get_mimetype(Dijon::Filter* df)
{
    const std::map<std::string, std::string>& meta = df->get_meta_data();
    const map<string,string>::const_iterator pos = meta.find("mimetype");
    return pos == meta.end() ? string_empty : pos->second;
}
// Copy selected entries from the metadata map of the last handler in
// m_handlers into the corresponding fields of doc. Entries missing from
// the map leave the matching doc field untouched. Always returns true.
bool FileInterner::dijontorcl(Rcl::Doc& doc)
{
    const std::map<std::string, std::string>& meta =
        m_handlers.back()->get_meta_data();
    const map<string,string>::const_iterator none = meta.end();
    map<string,string>::const_iterator pos;

    pos = meta.find("origcharset");
    if (pos != none)
        doc.origcharset = pos->second;

    pos = meta.find("content");
    if (pos != none)
        doc.text = pos->second;

    pos = meta.find("title");
    if (pos != none)
        doc.title = pos->second;

    pos = meta.find("keywords");
    if (pos != none)
        doc.keywords = pos->second;

    pos = meta.find("modificationdate");
    if (pos != none)
        doc.dmtime = pos->second;

    // The abstract comes from "abstract" when present, else from "sample"
    pos = meta.find("abstract");
    if (pos == none)
        pos = meta.find("sample");
    if (pos != none)
        doc.abstract = pos->second;

    return true;
}
static const unsigned int MAXHANDLERS = 20;
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
@ -182,8 +243,11 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
return FIError;
}
// Ipath vector.
// Note that the vector is big enough for the maximum stack. All values
// over the last significant one are ""
// We set the ipath for the first handler here, others are set
// when they're pushed on the stack
vector<string> vipath(MAXHANDLERS);
int vipathidx = 0;
if (!ipath.empty()) {
@ -196,12 +260,8 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
}
}
/* Try to get doc from the topmost filter */
while (!m_handlers.empty()) {
if (!vipath.empty()) {
}
if (!m_handlers.back()->has_documents()) {
// No docs at the current top level. Pop and see if there
// is something at the previous one
@ -277,23 +337,42 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
LOGERR(("FileInterner::internfile: stack empty\n"));
return FIError;
}
// If indexing, we have to collect the ipath stack.
// While we're at it, we also set the mimetype, which is a special
// property:we want to get it from the topmost doc
// with an ipath, not the last one which is always text/html
// Note that ipath is returned through the parameter not doc.ipath
if (!m_forPreview) {
string &ipath = doc.ipath;
bool hasipath = false;
for (vector<Dijon::Filter*>::const_iterator it = m_handlers.begin();
it != m_handlers.end(); it++) {
map<string,string>::const_iterator iti =
(*it)->get_meta_data().find("ipath");
if (iti != (*it)->get_meta_data().end()) {
if (!iti->second.empty())
doc.mimetype = m_mimetype;
LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str()));
map<string,string>::const_iterator titi;
for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
hit != m_handlers.end(); hit++) {
const map<string, string>& docdata = (*hit)->get_meta_data();
map<string, string>::const_iterator iti = docdata.find("ipath");
if (iti != docdata.end()) {
if (!iti->second.empty()) {
// We have a non-empty ipath
hasipath = true;
titi = docdata.find("mimetype");
if (titi != docdata.end())
doc.mimetype = titi->second;
}
ipath += iti->second + "|";
} else {
ipath += "|";
}
}
// Walk done, transform the list into a string
if (hasipath) {
LOGDEB(("IPATH [%s]\n", ipath.c_str()));
LOGDEB2(("IPATH [%s]\n", ipath.c_str()));
string::size_type sit = ipath.find_last_not_of("|");
if (sit == string::npos)
ipath.erase();
@ -304,7 +383,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
}
}
dijontorcl(m_handlers.back(), doc);
dijontorcl(doc);
// Destack what can be
while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
@ -317,56 +396,6 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
return FIAgain;
}
// Copy selected entries from the metadata map of filter df into the
// corresponding fields of doc. Entries missing from the map leave the
// matching doc field untouched. Always returns true.
bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc)
{
    const std::map<std::string, std::string>& meta = df->get_meta_data();
    const map<string,string>::const_iterator none = meta.end();
    map<string,string>::const_iterator pos;

    pos = meta.find("mimetype");
    if (pos != none)
        doc.mimetype = pos->second;

    pos = meta.find("origcharset");
    if (pos != none)
        doc.origcharset = pos->second;

    pos = meta.find("content");
    if (pos != none)
        doc.text = pos->second;

    pos = meta.find("title");
    if (pos != none)
        doc.title = pos->second;

    pos = meta.find("keywords");
    if (pos != none)
        doc.keywords = pos->second;

    pos = meta.find("modificationdate");
    if (pos != none)
        doc.dmtime = pos->second;

    // The abstract comes from "abstract" when present, else from "sample"
    pos = meta.find("abstract");
    if (pos == none)
        pos = meta.find("sample");
    if (pos != none)
        doc.abstract = pos->second;

    return true;
}
// Destructor: delete every filter held in m_handlers, starting with the
// last element and working back to the first, then call tmpcleanup().
FileInterner::~FileInterner()
{
    for (; !m_handlers.empty(); m_handlers.pop_back()) {
        Dijon::Filter *top = m_handlers.back();
        delete top;
    }
    tmpcleanup();
}
#else
#include <stdio.h>

View File

@ -16,7 +16,7 @@
*/
#ifndef _INTERNFILE_H_INCLUDED_
#define _INTERNFILE_H_INCLUDED_
/* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: internfile.h,v 1.8 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <vector>
@ -70,10 +70,12 @@ class FileInterner {
* should be called again to get the following one(s).
*/
Status internfile(Rcl::Doc& doc, string &ipath);
const string& get_mimetype() {return m_mimetype;}
private:
RclConfig *m_cfg;
string m_fn;
string m_mimetype; // Mime type for [uncompressed] file
bool m_forPreview;
// m_tdir and m_tfile are used only for decompressing input file if needed
const string& m_tdir;
@ -81,7 +83,7 @@ class FileInterner {
vector<Dijon::Filter*> m_handlers;
void tmpcleanup();
static bool dijontorcl(Dijon::Filter *, Rcl::Doc&);
bool dijontorcl(Rcl::Doc&);
};
#endif /* _INTERNFILE_H_INCLUDED_ */

View File

@ -64,8 +64,8 @@ bool MimeHandlerHtml::next_document()
if (m_havedoc == false)
return false;
m_havedoc = false;
LOGDEB(("textHtmlToDoc: next_document\n"));
string charset = m_defcharset;
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",charset.c_str()));
// - We first try to convert from the default configured charset
// (which may depend of the current directory) to utf-8. If this
@ -76,10 +76,11 @@ bool MimeHandlerHtml::next_document()
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
MyHtmlParser p(m_metaData["content"]);
MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) {
string transcoded;
LOGDEB(("Html::mkDoc: pass %d\n", pass));
MyHtmlParser p;
// Try transcoding. If it fails, use original text.
if (!transcode(m_html, transcoded, charset, "UTF-8")) {
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
@ -97,16 +98,18 @@ bool MimeHandlerHtml::next_document()
try {
p.parse_html(transcoded);
// No exception: ok?
result = p;
break;
} catch (bool diag) {
result = p;
if (diag == true)
break;
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
charset.c_str(), p.doccharset.c_str()));
if (!p.doccharset.empty() &&
!samecharset(p.doccharset, p.ocharset)) {
charset.c_str(),result.doccharset.c_str()));
if (!result.doccharset.empty() &&
!samecharset(result.doccharset, result.ocharset)) {
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
charset = p.doccharset;
charset = result.doccharset;
} else {
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
return false;
@ -115,11 +118,12 @@ bool MimeHandlerHtml::next_document()
}
m_metaData["origcharset"] = m_defcharset;
m_metaData["content"] = result.dump;
m_metaData["charset"] = "utf-8";
m_metaData["title"] = p.title;
m_metaData["keywords"] = p.keywords;
m_metaData["modificationdate"] = p.dmtime;
m_metaData["sample"] = p.sample;
m_metaData["title"] = result.title;
m_metaData["keywords"] = result.keywords;
m_metaData["modificationdate"] = result.dmtime;
m_metaData["sample"] = result.sample;
m_metaData["mimetype"] = "text/plain";
return true;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.25 2006-12-15 16:33:15 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -94,9 +94,22 @@ bool MimeHandlerMail::next_document()
{
if (!m_havedoc)
return false;
m_havedoc = false;
m_metaData["mimetype"] = "text/plain";
return processMsg(m_bincdoc, 0);
bool res = false;
if (m_idx == -1) {
m_metaData["mimetype"] = "text/plain";
res =processMsg(m_bincdoc, 0);
} else {
res = processAttach();
}
m_idx++;
m_havedoc = m_idx < (int)m_attachments.size();
return res;
}
// Attachment processing entry point. The body currently does no work and
// unconditionally returns false.
bool MimeHandlerMail::processAttach()
{
return false;
}
// Transform a single message into a document. The subject becomes the
@ -301,6 +314,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
out += "]";
out += "\n\n";
}
// m_attachments.push_back(&doc);
// We're done with this part
return;
}
@ -373,19 +387,18 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
map<string, string>::const_iterator it =
mh.get_meta_data().find("content");
if (it != mh.get_meta_data().end())
putf8 = &it->second;
out += it->second;
} else {
// Transcode to utf-8
if (!transcode(body, utf8, charset, "UTF-8")) {
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
charset.c_str()));
putf8 = &body;
out += body;
} else {
putf8 = &utf8;
out += utf8;
}
}
if (putf8)
out += *putf8;
if (out.length() && out[out.length()-1] != '\n')
out += '\n';

View File

@ -16,9 +16,12 @@
*/
#ifndef _MAIL_H_INCLUDED_
#define _MAIL_H_INCLUDED_
/* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_mail.h,v 1.10 2006-12-15 16:33:15 dockes Exp $ (C) 2004 J.F.Dockes */
#include <sstream>
#include <vector>
using std::vector;
#include "mimehandler.h"
namespace Binc {
@ -34,18 +37,23 @@ namespace Binc {
class MimeHandlerMail : public RecollFilter {
public:
MimeHandlerMail(const string &mt)
: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0)
: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
{}
virtual ~MimeHandlerMail();
virtual bool set_document_file(const string &file_path);
virtual bool set_document_string(const string &data);
virtual bool next_document();
private:
Binc::MimeDocument *m_bincdoc;
bool processMsg(Binc::MimePart *doc, int depth);
void walkmime(Binc::MimePart* doc, int depth);
int m_fd;
std::stringstream *m_stream;
bool processAttach();
Binc::MimeDocument *m_bincdoc;
int m_fd;
std::stringstream *m_stream;
int m_idx; // starts at -1 for self, then index into
// attachments;
vector<Binc::MimePart *> m_attachments;
};
#endif /* _MAIL_H_INCLUDED_ */

View File

@ -37,13 +37,11 @@ class MyHtmlParser : public HtmlParser {
bool in_body_tag;
bool in_pre_tag;
bool pending_space;
bool indexing_allowed;
string title, sample, keywords, dmtime;
string localdump;
string &dump;
string title, sample, keywords, dump, dmtime;
string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed;
void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p);
void closing_tag(const string &tag);
@ -54,16 +52,5 @@ class MyHtmlParser : public HtmlParser {
in_body_tag(false),
in_pre_tag(false),
pending_space(false),
indexing_allowed(true),
dump(localdump)
{ }
MyHtmlParser(string& buf) :
in_script_tag(false),
in_style_tag(false),
in_body_tag(false),
in_pre_tag(false),
pending_space(false),
indexing_allowed(true),
dump(buf)
{ }
indexing_allowed(true) { }
};