added open-ended field name handling

This commit is contained in:
dockes 2007-06-19 08:36:24 +00:00
parent c4b099e8d3
commit 0c74bd6e36
15 changed files with 176 additions and 176 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.46 2007-06-18 13:04:14 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.47 2007-06-19 08:36:23 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -373,13 +373,13 @@ string RclConfig::getMimeHandlerDef(const std::string &mtype)
return hs; return hs;
} }
string RclConfig::getFieldPrefix(const string& fld) bool RclConfig::getFieldPrefix(const string& fld, string &pfx)
{ {
string hs; if (!mimeconf->get(fld, pfx, "prefixes")) {
if (!mimeconf->get(fld, hs, "prefixes")) {
LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str())); LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str()));
return false;
} }
return hs; return true;
} }
string RclConfig::getMimeViewerDef(const string &mtype) string RclConfig::getMimeViewerDef(const string &mtype)

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _RCLCONFIG_H_INCLUDED_ #ifndef _RCLCONFIG_H_INCLUDED_
#define _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_
/* @(#$Id: rclconfig.h,v 1.34 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rclconfig.h,v 1.35 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
#include <list> #include <list>
#include <string> #include <string>
@ -138,7 +138,7 @@ class RclConfig {
bool getMimeCatTypes(const string& cat, list<string>&); bool getMimeCatTypes(const string& cat, list<string>&);
/** mimeconf: get field prefix from field name */ /** mimeconf: get field prefix from field name */
string getFieldPrefix(const string& fldname); bool getFieldPrefix(const string& fldname, string &pfx);
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */ /** mimeview: get/set external viewer exec string(s) for mimetype(s) */
string getMimeViewerDef(const string &mimetype); string getMimeViewerDef(const string &mimetype);

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.30 2007-05-23 08:29:04 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: internfile.cpp,v 1.31 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -270,12 +270,12 @@ static const string keyab("abstract");
static const string keyau("author"); static const string keyau("author");
static const string keycs("charset"); static const string keycs("charset");
static const string keyct("content"); static const string keyct("content");
static const string keyds("description");
static const string keyfn("filename"); static const string keyfn("filename");
static const string keykw("keywords"); static const string keykw("keywords");
static const string keymd("modificationdate"); static const string keymd("modificationdate");
static const string keymt("mimetype"); static const string keymt("mimetype");
static const string keyoc("origcharset"); static const string keyoc("origcharset");
static const string keysm("sample");
static const string keytt("title"); static const string keytt("title");
bool FileInterner::dijontorcl(Rcl::Doc& doc) bool FileInterner::dijontorcl(Rcl::Doc& doc)
@ -283,15 +283,24 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
Dijon::Filter *df = m_handlers.back(); Dijon::Filter *df = m_handlers.back();
const std::map<std::string, std::string>& docdata = df->get_meta_data(); const std::map<std::string, std::string>& docdata = df->get_meta_data();
getKeyValue(docdata, keyau, doc.author); for (map<string,string>::const_iterator it = docdata.begin();
getKeyValue(docdata, keyoc, doc.origcharset); it != docdata.end(); it++) {
getKeyValue(docdata, keyct, doc.text); if (it->first == keyct) {
getKeyValue(docdata, keytt, doc.title); doc.text = it->second;
getKeyValue(docdata, keykw, doc.keywords); } else if (it->first == keymd) {
getKeyValue(docdata, keymd, doc.dmtime); doc.dmtime = it->second;
if (!getKeyValue(docdata, keyab, doc.abstract)) } else if (it->first == keyoc) {
getKeyValue(docdata, keysm, doc.abstract); doc.origcharset = it->second;
LOGDEB1(("FILENAME: %s\n", doc.utf8fn.c_str())); } else if (it->first == keymt || it->first == keycs) {
// don't need these.
} else {
doc.meta[it->first] = it->second;
}
}
if (doc.meta[keyab].empty() && !doc.meta[keyds].empty()) {
doc.meta[keyab] = doc.meta[keyds];
doc.meta.erase(keyds);
}
return true; return true;
} }
@ -324,7 +333,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const
} else { } else {
ipath += isep; ipath += isep;
} }
getKeyValue(docdata, keyau, doc.author); getKeyValue(docdata, keyau, doc.meta["author"]);
getKeyValue(docdata, keymd, doc.dmtime); getKeyValue(docdata, keymd, doc.dmtime);
} }
@ -672,7 +681,7 @@ int main(int argc, char **argv)
"]]]]\n-----------------------------------------------------\n" << "]]]]\n-----------------------------------------------------\n" <<
"doc.keywords [[[[" << doc.keywords << "doc.keywords [[[[" << doc.keywords <<
"]]]]\n-----------------------------------------------------\n" << "]]]]\n-----------------------------------------------------\n" <<
"doc.abstract [[[[" << doc.abstract << "doc.meta["abstract"] [[[[" << doc.meta["abstract"] <<
"]]]]\n-----------------------------------------------------\n" << "]]]]\n-----------------------------------------------------\n" <<
"doc.text [[[[" << doc.text << "]]]]\n"; "doc.text [[[[" << doc.text << "]]]]\n";
} }

View File

@ -136,15 +136,16 @@ bool MimeHandlerHtml::next_document()
m_metaData["origcharset"] = m_defcharset; m_metaData["origcharset"] = m_defcharset;
m_metaData["content"] = result.dump; m_metaData["content"] = result.dump;
m_metaData["charset"] = "utf-8"; m_metaData["charset"] = "utf-8";
m_metaData["title"] = result.title;
m_metaData["keywords"] = result.keywords;
// Avoid setting empty values which would crush ones possibly inherited // Avoid setting empty values which would crush ones possibly inherited
// from parent (if we're an attachment) // from parent (if we're an attachment)
if (!result.author.empty())
m_metaData["author"] = result.author;
if (!result.dmtime.empty()) if (!result.dmtime.empty())
m_metaData["modificationdate"] = result.dmtime; m_metaData["modificationdate"] = result.dmtime;
m_metaData["sample"] = result.sample;
m_metaData["mimetype"] = "text/plain"; m_metaData["mimetype"] = "text/plain";
for (map<string,string>::const_iterator it = result.meta.begin();
it != result.meta.end(); it++) {
if (!it->second.empty())
m_metaData[it->first] = it->second;
}
return true; return true;
} }

View File

@ -144,22 +144,7 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
if ((j = p.find("name")) != p.end()) { if ((j = p.find("name")) != p.end()) {
string name = j->second; string name = j->second;
lowercase_term(name); lowercase_term(name);
if (name == "description") { if (name == "date") {
if (sample.empty()) {
sample = i->second;
decode_entities(sample);
}
} else if (name == "keywords") {
if (!keywords.empty()) keywords += ' ';
string tmp = i->second;
decode_entities(tmp);
keywords += tmp;
} else if (name == "author") {
if (!author.empty()) author += ' ';
string tmp = i->second;
decode_entities(tmp);
author += tmp;
} else if (name == "date") {
// Yes this doesnt exist. It's output by filters // Yes this doesnt exist. It's output by filters
// And the format isn't even standard http/html // And the format isn't even standard http/html
// FIXME // FIXME
@ -172,6 +157,13 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
sprintf(ascuxtime, "%ld", (long)mktime(&tm)); sprintf(ascuxtime, "%ld", (long)mktime(&tm));
dmtime = ascuxtime; dmtime = ascuxtime;
} }
} else if (name == "robots") {
} else {
if (!meta[name].empty())
meta[name] += ' ';
string tmp = i->second;
decode_entities(tmp);
meta[name] += tmp;
} }
} else if ((j = p.find("http-equiv")) != p.end()) { } else if ((j = p.find("http-equiv")) != p.end()) {
string hequiv = j->second; string hequiv = j->second;
@ -309,8 +301,8 @@ MyHtmlParser::closing_tag(const string &tag)
break; break;
case 't': case 't':
if (tag == "title") { if (tag == "title") {
if (title.empty()) { if (meta["title"].empty()) {
title = dump; meta["title"] = dump;
dump = ""; dump = "";
} }
break; break;

View File

@ -22,6 +22,8 @@
* USA * USA
* -----END-LICENCE----- * -----END-LICENCE-----
*/ */
#include <map>
using std::map;
#include "htmlparse.h" #include "htmlparse.h"
@ -37,7 +39,8 @@ class MyHtmlParser : public HtmlParser {
bool in_body_tag; bool in_body_tag;
bool in_pre_tag; bool in_pre_tag;
bool pending_space; bool pending_space;
string title, sample, keywords, dump, dmtime, author; map<string,string> meta;
string dump, dmtime;
string ocharset; // This is the charset our user thinks the doc was string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header string doccharset; // Set this to value of charset parameter in header

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.20 2007-06-12 13:31:38 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.21 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -400,8 +400,12 @@ QTextEdit *Preview::addEditorTab()
void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc, void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc,
int docnum) int docnum)
{ {
QString title = QString::fromUtf8(doc.title.c_str(), QString title;
doc.title.length()); map<string,string>::const_iterator meta_it;
if ((meta_it = doc.meta.find("title")) != doc.meta.end()) {
title = QString::fromUtf8(meta_it->second.c_str(),
meta_it->second.length());
}
if (title.length() > 20) { if (title.length() > 20) {
title = title.left(10) + "..." + title.right(10); title = title.left(10) + "..." + title.right(10);
} }
@ -421,8 +425,8 @@ void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc,
printableUrl(doc.url, url); printableUrl(doc.url, url);
string tiptxt = url + string("\n"); string tiptxt = url + string("\n");
tiptxt += doc.mimetype + " " + string(datebuf) + "\n"; tiptxt += doc.mimetype + " " + string(datebuf) + "\n";
if (!doc.title.empty()) if (meta_it != doc.meta.end() && !meta_it->second.empty())
tiptxt += doc.title + "\n"; tiptxt += meta_it->second + "\n";
pvTab->setTabToolTip(w,QString::fromUtf8(tiptxt.c_str(), tiptxt.length())); pvTab->setTabToolTip(w,QString::fromUtf8(tiptxt.c_str(), tiptxt.length()));
for (list<TabData>::iterator it = tabData.begin(); for (list<TabData>::iterator it = tabData.begin();
@ -607,8 +611,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
Rcl::Doc doc = idoc; Rcl::Doc doc = idoc;
bool cancel = false; bool cancel = false;
if (doc.title.empty()) if (doc.meta["title"].empty())
doc.title = path_getsimple(doc.url); doc.meta["title"] = path_getsimple(doc.url);
setCurTabProps(fn, doc, docnum); setCurTabProps(fn, doc, docnum);

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.26 2007-06-13 17:03:23 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: reslist.cpp,v 1.27 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
#include <time.h> #include <time.h>
@ -399,7 +399,7 @@ void ResList::resultPageNext()
if (percent == -1) { if (percent == -1) {
percent = 0; percent = 0;
// Document not available, maybe other further, will go on. // Document not available, maybe other further, will go on.
doc.abstract = string(tr("Unavailable document").utf8()); doc.meta["abstract"] = string(tr("Unavailable document").utf8());
} }
// Determine icon to display if any // Determine icon to display if any
@ -426,8 +426,8 @@ void ResList::resultPageNext()
printableUrl(doc.url, url); printableUrl(doc.url, url);
// Make title out of file name if none yet // Make title out of file name if none yet
if (doc.title.empty()) { if (doc.meta["title"].empty()) {
doc.title = path_getsimple(url); doc.meta["title"] = path_getsimple(url);
} }
// Result number // Result number
@ -469,7 +469,7 @@ void ResList::resultPageNext()
(doc.syntabs || prefs.queryReplaceAbstract)) { (doc.syntabs || prefs.queryReplaceAbstract)) {
abstract = m_docSource->getAbstract(doc); abstract = m_docSource->getAbstract(doc);
} else { } else {
abstract = doc.abstract; abstract = doc.meta["abstract"];
} }
// No need to call escapeHtml(), plaintorich handles it // No need to call escapeHtml(), plaintorich handles it
string richabst; string richabst;
@ -505,14 +505,14 @@ void ResList::resultPageNext()
map<char,string> subs; map<char,string> subs;
subs['A'] = !richabst.empty() ? richabst + "<br>" : ""; subs['A'] = !richabst.empty() ? richabst + "<br>" : "";
subs['D'] = datebuf; subs['D'] = datebuf;
subs['K'] = !doc.keywords.empty() ? escapeHtml(doc.keywords) + "<br>" subs['K'] = !doc.meta["keywords"].empty() ?
: ""; escapeHtml(doc.meta["keywords"]) + "<br>" : "";
subs['L'] = linksbuf; subs['L'] = linksbuf;
subs['N'] = numbuf; subs['N'] = numbuf;
subs['M'] = doc.mimetype; subs['M'] = doc.mimetype;
subs['R'] = perbuf; subs['R'] = perbuf;
subs['S'] = sizebuf; subs['S'] = sizebuf;
subs['T'] = escapeHtml(doc.title); subs['T'] = escapeHtml(doc.meta["title"]);
subs['U'] = url; subs['U'] = url;
string formatted; string formatted;

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _DOCSEQ_H_INCLUDED_ #ifndef _DOCSEQ_H_INCLUDED_
#define _DOCSEQ_H_INCLUDED_ #define _DOCSEQ_H_INCLUDED_
/* @(#$Id: docseq.h,v 1.11 2007-01-19 15:22:50 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: docseq.h,v 1.12 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
#include <vector> #include <vector>
@ -70,7 +70,7 @@ class DocSequence {
* The default is to return the input doc's abstract fields, but some * The default is to return the input doc's abstract fields, but some
* sequences can compute a better value (ie: docseqdb) */ * sequences can compute a better value (ie: docseqdb) */
virtual string getAbstract(Rcl::Doc& doc) { virtual string getAbstract(Rcl::Doc& doc) {
return doc.abstract; return doc.meta["abstract"];
} }
/** Get estimated total count in results */ /** Get estimated total count in results */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.2 2007-01-19 15:22:50 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -42,9 +42,9 @@ int DocSequenceDb::getResCnt()
string DocSequenceDb::getAbstract(Rcl::Doc &doc) string DocSequenceDb::getAbstract(Rcl::Doc &doc)
{ {
if (!m_db) if (!m_db)
return doc.abstract; return doc.meta["abstract"];
string abstract; string abstract;
m_db->makeDocAbstract(doc, abstract); m_db->makeDocAbstract(doc, abstract);
return abstract.empty() ? doc.abstract : abstract; return abstract.empty() ? doc.meta["abstract"] : abstract;
} }

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.114 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.115 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -200,14 +200,14 @@ bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
parms.get(string("fmtime"), doc.fmtime); parms.get(string("fmtime"), doc.fmtime);
parms.get(string("dmtime"), doc.dmtime); parms.get(string("dmtime"), doc.dmtime);
parms.get(string("origcharset"), doc.origcharset); parms.get(string("origcharset"), doc.origcharset);
parms.get(string("caption"), doc.title); parms.get(string("caption"), doc.meta["title"]);
parms.get(string("keywords"), doc.keywords); parms.get(string("keywords"), doc.meta["keywords"]);
parms.get(string("abstract"), doc.abstract); parms.get(string("abstract"), doc.meta["abstract"]);
// Possibly remove synthetic abstract indicator (if it's there, we // Possibly remove synthetic abstract indicator (if it's there, we
// used to index the beginning of the text as abstract). // used to index the beginning of the text as abstract).
doc.syntabs = false; doc.syntabs = false;
if (doc.abstract.find(rclSyntAbs) == 0) { if (doc.meta["abstract"].find(rclSyntAbs) == 0) {
doc.abstract = doc.abstract.substr(rclSyntAbs.length()); doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length());
doc.syntabs = true; doc.syntabs = true;
} }
parms.get(string("ipath"), doc.ipath); parms.get(string("ipath"), doc.ipath);
@ -743,12 +743,15 @@ bool Db::isopen()
// Try to translate field specification into field prefix. We have a // Try to translate field specification into field prefix. We have a
// default table used if translations are not in the config for some // default table used if translations are not in the config for some
// reason (old config not updated ?). We use it only if the config // reason (old config not updated ?). We use it only if the config
// translation fails // translation fails. Also we add in there fields which should be
string Db::fieldToPrefix(const string& fldname) // indexed with no prefix (ie: abstract)
bool Db::fieldToPrefix(const string& fldname, string &pfx)
{ {
// This is the default table // This is the default table
static map<string, string> fldToPrefs; static map<string, string> fldToPrefs;
if (fldToPrefs.empty()) { if (fldToPrefs.empty()) {
fldToPrefs["abstract"] = "";
fldToPrefs["title"] = "S"; fldToPrefs["title"] = "S";
fldToPrefs["caption"] = "S"; fldToPrefs["caption"] = "S";
fldToPrefs["subject"] = "S"; fldToPrefs["subject"] = "S";
@ -763,17 +766,19 @@ string Db::fieldToPrefix(const string& fldname)
fldToPrefs["tags"] = "K"; fldToPrefs["tags"] = "K";
} }
string fld(fldname), pfx; string fld(fldname);
stringtolower(fld); stringtolower(fld);
RclConfig *config = RclConfig::getMainConfig(); RclConfig *config = RclConfig::getMainConfig();
if (config) if (config && config->getFieldPrefix(fld, pfx))
pfx = config->getFieldPrefix(fld); return true;
if (pfx.empty()) {
map<string, string>::const_iterator it = fldToPrefs.find(fld); map<string, string>::const_iterator it = fldToPrefs.find(fld);
if (it != fldToPrefs.end()) if (it != fldToPrefs.end()) {
fld = it->second; pfx = it->second;
return true;
} }
return pfx; return false;
} }
@ -880,11 +885,12 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
LOGDEB1(("Db::add: fn %s\n", fn.c_str())); LOGDEB1(("Db::add: fn %s\n", fn.c_str()));
if (m_ndb == 0) if (m_ndb == 0)
return false; return false;
static int first = 1;
// Check file system full every mbyte of indexed text. // Check file system full every mbyte of indexed text.
if (m_maxFsOccupPc > 0 && (m_curtxtsz - m_occtxtsz) / MB >= 1) { if (m_maxFsOccupPc > 0 && (first || (m_curtxtsz - m_occtxtsz) / MB >= 1)) {
LOGDEB(("Db::add: checking file system usage\n")); LOGDEB(("Db::add: checking file system usage\n"));
int pc; int pc;
first = 0;
if (fsocc(m_basedir, &pc) && pc >= m_maxFsOccupPc) { if (fsocc(m_basedir, &pc) && pc >= m_maxFsOccupPc) {
LOGERR(("Db::add: stop indexing: file system " LOGERR(("Db::add: stop indexing: file system "
"%d%% full > max %d%%\n", pc, m_maxFsOccupPc)); "%d%% full > max %d%%\n", pc, m_maxFsOccupPc));
@ -895,37 +901,38 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
Doc doc = idoc; Doc doc = idoc;
// The title, author, abstract and keywords fields are special, they
// get stored in the document data record.
// Truncate abstract, title and keywords to reasonable lengths. If // Truncate abstract, title and keywords to reasonable lengths. If
// abstract is currently empty, we make up one with the beginning // abstract is currently empty, we make up one with the beginning
// of the document. This is then not indexed, but part of the doc // of the document. This is then not indexed, but part of the doc
// data so that we can return it to a query without having to // data so that we can return it to a query without having to
// decode the original file. // decode the original file.
bool syntabs = false; bool syntabs = false;
if (doc.abstract.empty()) { // Note that the map accesses by operator[] create empty entries if they
// don't exist yet.
if (doc.meta["abstract"].empty()) {
syntabs = true; syntabs = true;
doc.abstract = rclSyntAbs + doc.meta["abstract"] = rclSyntAbs +
truncate_to_word(doc.text, m_idxAbsTruncLen); neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r");
} else { } else {
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen); doc.meta["abstract"] =
neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen),
"\n\r");
} }
doc.abstract = neutchars(doc.abstract, "\n\r"); if (doc.meta["title"].empty())
doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r"); doc.meta["title"] = doc.utf8fn, "\n\r";
doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r"); doc.meta["title"] =
doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r"); neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r");
doc.meta["author"] =
neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r");
doc.meta["keywords"] =
neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r");
Xapian::Document newdocument; Xapian::Document newdocument;
mySplitterCB splitData(newdocument, m_stops); mySplitterCB splitData(newdocument, m_stops);
TextSplit splitter(&splitData); TextSplit splitter(&splitData);
// Index the title, document text, keywords and other textual
// metadata. These are all indexed as text with positions, as we
// may want to do phrase searches with them (this makes no sense
// for keywords by the way, but wtf).
/
// The order has no importance, and we set a position gap of 100
// between fields to avoid false proximity matches.
string noacc; string noacc;
// Split and index file name as document term(s) // Split and index file name as document term(s)
@ -935,35 +942,39 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
} }
// Split and index title. If title is empty here, use file name // Index textual metadata. These are all indexed as text with
if (doc.title.empty()) // positions, as we may want to do phrase searches with them (this
doc.title = doc.utf8fn; // makes no sense for keywords by the way).
if (!doc.title.empty()) { //
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str())); // The order has no importance, and we set a position gap of 100
if (!dumb_string(doc.title, noacc)) { // between fields to avoid false proximity matches.
LOGERR(("Db::add: dumb_string failed\n")); map<string,string>::iterator meta_it;
return false; string pfx;
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
if (!meta_it->second.empty()) {
if (meta_it->first == "abstract" && syntabs)
continue;
if (!fieldToPrefix(meta_it->first, pfx)) {
LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
meta_it->first.c_str()));
continue;
}
LOGDEB(("Db::add: field [%s] pfx [%s]: [%s]\n",
meta_it->first.c_str(), pfx.c_str(),
meta_it->second.c_str()));
if (!dumb_string(meta_it->second, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitData.setprefix(pfx); // Subject
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.basepos += splitData.curpos + 100;
} }
splitData.setprefix("S"); // Subject
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.basepos += splitData.curpos + 100;
} }
// Split and index author
if (!doc.author.empty()) {
LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str()));
if (!dumb_string(doc.author, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitData.setprefix("A");
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.basepos += splitData.curpos + 100;
}
// Split and index body // Split and index body text
LOGDEB2(("Db::add: split body\n")); LOGDEB2(("Db::add: split body\n"));
if (!dumb_string(doc.text, noacc)) { if (!dumb_string(doc.text, noacc)) {
LOGERR(("Db::add: dumb_string failed\n")); LOGERR(("Db::add: dumb_string failed\n"));
@ -972,36 +983,8 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split and index keywords
if (!doc.keywords.empty()) {
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitData.setprefix("K");
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.basepos += splitData.curpos + 100;
}
// Split and index abstract. We don't do this if it is synthetic ////// Special terms for other metadata. No positions for these.
// any more (this used to give a relevance boost to the beginning
// of text, why ?)
LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
if (!syntabs) {
// syntabs indicator test kept here in case we want to go back
// to indexing synthetic abstracts one day
if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) :
doc.abstract, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitter.text_to_words(noacc);
}
splitData.basepos += splitData.curpos + 100;
////// Special terms for metadata
// Mime type // Mime type
newdocument.add_term("T" + doc.mimetype); newdocument.add_term("T" + doc.mimetype);
@ -1075,11 +1058,14 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
if (!doc.ipath.empty()) { if (!doc.ipath.empty()) {
record += "\nipath=" + doc.ipath; record += "\nipath=" + doc.ipath;
} }
record += "\ncaption=" + doc.title; if (!doc.meta["title"].empty())
record += "\nkeywords=" + doc.keywords; record += "\ncaption=" + doc.meta["title"];
record += "\nabstract=" + doc.abstract; if (!doc.meta["keywords"].empty())
if (!doc.author.empty()) { record += "\nkeywords=" + doc.meta["keywords"];
record += "\nauthor=" + doc.author; if (!doc.meta["abstract"].empty())
record += "\nabstract=" + doc.meta["abstract"];
if (!doc.meta["author"].empty()) {
record += "\nauthor=" + doc.meta["author"];
} }
record += "\n"; record += "\n";
LOGDEB1(("Newdocument data: %s\n", record.c_str())); LOGDEB1(("Newdocument data: %s\n", record.c_str()));

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.51 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rcldb.h,v 1.52 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -95,7 +95,7 @@ class Db {
const StopList& getStopList() const {return m_stops;} const StopList& getStopList() const {return m_stops;}
/** Field name to prefix translation (ie: author -> 'A') */ /** Field name to prefix translation (ie: author -> 'A') */
string fieldToPrefix(const string& fldname); bool fieldToPrefix(const string& fldname, string &pfx);
/* Update-related methods ******************************************/ /* Update-related methods ******************************************/

View File

@ -16,12 +16,14 @@
*/ */
#ifndef _RCLDOC_H_INCLUDED_ #ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_ #define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */ /* @(#$Id: rcldoc.h,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes */
#include <string> #include <string>
#include <map>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
using std::string; using std::string;
using std::map;
namespace Rcl { namespace Rcl {
#endif #endif
@ -47,12 +49,16 @@ class Doc {
// Possibly set by handler // Possibly set by handler
string origcharset; // Charset we transcoded from (in case we want back) string origcharset; // Charset we transcoded from (in case we want back)
// Possibly set by handler // Possibly set by handler
string title; // Possibly set by handler
string author; // Possibly set by handler // A map for textual metadata like, author, keywords, abstract, title
string keywords; // Possibly set by handler // Entries possibly set by handler. If a field-name to prefix translation
string abstract; // Possibly set by handler // exists, the terms will be indexed with a prefix.
bool syntabs; // true if abstract is just the top of doc, not an map<string, string> meta;
// explicit document attribute
// Attribute for the "abstract" entry. true if it is just the top
// of doc, not a native document attribute
bool syntabs;
string fbytes; // File size. Set by Db::Add string fbytes; // File size. Set by Db::Add
string dbytes; // Doc size. Set by Db::Add from text length string dbytes; // Doc size. Set by Db::Add from text length
@ -72,9 +78,7 @@ class Doc {
fmtime.erase(); fmtime.erase();
dmtime.erase(); dmtime.erase();
origcharset.erase(); origcharset.erase();
title.erase(); meta.clear();
keywords.erase();
abstract.erase();
syntabs = false; syntabs = false;
fbytes.erase(); fbytes.erase();
dbytes.erase(); dbytes.erase();

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.15 2007-06-18 13:04:15 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.16 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -487,7 +487,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
} }
string prefix; string prefix;
if (!m_field.empty()) if (!m_field.empty())
prefix = db.fieldToPrefix(m_field); db.fieldToPrefix(m_field, prefix);
list<Xapian::Query> pqueries; list<Xapian::Query> pqueries;
// We normally boost the original term in the stem expansion list. Don't // We normally boost the original term in the stem expansion list. Don't
@ -541,7 +541,7 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
string prefix; string prefix;
if (!m_field.empty()) if (!m_field.empty())
prefix = db.fieldToPrefix(m_field); db.fieldToPrefix(m_field, prefix);
// We normally boost the original term in the stem expansion list. Don't // We normally boost the original term in the stem expansion list. Don't
// do it if there are wildcards anywhere, this would skew the results. // do it if there are wildcards anywhere, this would skew the results.

View File

@ -1,4 +1,4 @@
# @(#$Id: mimeconf,v 1.29 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes # @(#$Id: mimeconf,v 1.30 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll : associations of mime types to processing filters. # Recoll : associations of mime types to processing filters.
# There are different sections for decompression, 'interning' for indexing # There are different sections for decompression, 'interning' for indexing
@ -144,3 +144,4 @@ keyword = K
tag = K tag = K
keywords = K keywords = K
tags = K tags = K