added open-ended field name handling
This commit is contained in:
parent
c4b099e8d3
commit
0c74bd6e36
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.46 2007-06-18 13:04:14 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.47 2007-06-19 08:36:23 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -373,13 +373,13 @@ string RclConfig::getMimeHandlerDef(const std::string &mtype)
|
||||
return hs;
|
||||
}
|
||||
|
||||
string RclConfig::getFieldPrefix(const string& fld)
|
||||
bool RclConfig::getFieldPrefix(const string& fld, string &pfx)
|
||||
{
|
||||
string hs;
|
||||
if (!mimeconf->get(fld, hs, "prefixes")) {
|
||||
if (!mimeconf->get(fld, pfx, "prefixes")) {
|
||||
LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str()));
|
||||
return false;
|
||||
}
|
||||
return hs;
|
||||
return true;
|
||||
}
|
||||
|
||||
string RclConfig::getMimeViewerDef(const string &mtype)
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _RCLCONFIG_H_INCLUDED_
|
||||
#define _RCLCONFIG_H_INCLUDED_
|
||||
/* @(#$Id: rclconfig.h,v 1.34 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rclconfig.h,v 1.35 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
@ -138,7 +138,7 @@ class RclConfig {
|
||||
bool getMimeCatTypes(const string& cat, list<string>&);
|
||||
|
||||
/** mimeconf: get field prefix from field name */
|
||||
string getFieldPrefix(const string& fldname);
|
||||
bool getFieldPrefix(const string& fldname, string &pfx);
|
||||
|
||||
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */
|
||||
string getMimeViewerDef(const string &mimetype);
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.30 2007-05-23 08:29:04 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.31 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -270,12 +270,12 @@ static const string keyab("abstract");
|
||||
static const string keyau("author");
|
||||
static const string keycs("charset");
|
||||
static const string keyct("content");
|
||||
static const string keyds("description");
|
||||
static const string keyfn("filename");
|
||||
static const string keykw("keywords");
|
||||
static const string keymd("modificationdate");
|
||||
static const string keymt("mimetype");
|
||||
static const string keyoc("origcharset");
|
||||
static const string keysm("sample");
|
||||
static const string keytt("title");
|
||||
|
||||
bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
||||
@ -283,15 +283,24 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
||||
Dijon::Filter *df = m_handlers.back();
|
||||
const std::map<std::string, std::string>& docdata = df->get_meta_data();
|
||||
|
||||
getKeyValue(docdata, keyau, doc.author);
|
||||
getKeyValue(docdata, keyoc, doc.origcharset);
|
||||
getKeyValue(docdata, keyct, doc.text);
|
||||
getKeyValue(docdata, keytt, doc.title);
|
||||
getKeyValue(docdata, keykw, doc.keywords);
|
||||
getKeyValue(docdata, keymd, doc.dmtime);
|
||||
if (!getKeyValue(docdata, keyab, doc.abstract))
|
||||
getKeyValue(docdata, keysm, doc.abstract);
|
||||
LOGDEB1(("FILENAME: %s\n", doc.utf8fn.c_str()));
|
||||
for (map<string,string>::const_iterator it = docdata.begin();
|
||||
it != docdata.end(); it++) {
|
||||
if (it->first == keyct) {
|
||||
doc.text = it->second;
|
||||
} else if (it->first == keymd) {
|
||||
doc.dmtime = it->second;
|
||||
} else if (it->first == keyoc) {
|
||||
doc.origcharset = it->second;
|
||||
} else if (it->first == keymt || it->first == keycs) {
|
||||
// don't need these.
|
||||
} else {
|
||||
doc.meta[it->first] = it->second;
|
||||
}
|
||||
}
|
||||
if (doc.meta[keyab].empty() && !doc.meta[keyds].empty()) {
|
||||
doc.meta[keyab] = doc.meta[keyds];
|
||||
doc.meta.erase(keyds);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -324,7 +333,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const
|
||||
} else {
|
||||
ipath += isep;
|
||||
}
|
||||
getKeyValue(docdata, keyau, doc.author);
|
||||
getKeyValue(docdata, keyau, doc.meta["author"]);
|
||||
getKeyValue(docdata, keymd, doc.dmtime);
|
||||
}
|
||||
|
||||
@ -672,7 +681,7 @@ int main(int argc, char **argv)
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.keywords [[[[" << doc.keywords <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.abstract [[[[" << doc.abstract <<
|
||||
"doc.meta[\"abstract\"] [[[[" << doc.meta["abstract"] <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.text [[[[" << doc.text << "]]]]\n";
|
||||
}
|
||||
|
||||
@ -136,15 +136,16 @@ bool MimeHandlerHtml::next_document()
|
||||
m_metaData["origcharset"] = m_defcharset;
|
||||
m_metaData["content"] = result.dump;
|
||||
m_metaData["charset"] = "utf-8";
|
||||
m_metaData["title"] = result.title;
|
||||
m_metaData["keywords"] = result.keywords;
|
||||
// Avoid setting empty values which would crush ones possibly inherited
|
||||
// from parent (if we're an attachment)
|
||||
if (!result.author.empty())
|
||||
m_metaData["author"] = result.author;
|
||||
if (!result.dmtime.empty())
|
||||
m_metaData["modificationdate"] = result.dmtime;
|
||||
m_metaData["sample"] = result.sample;
|
||||
m_metaData["mimetype"] = "text/plain";
|
||||
|
||||
for (map<string,string>::const_iterator it = result.meta.begin();
|
||||
it != result.meta.end(); it++) {
|
||||
if (!it->second.empty())
|
||||
m_metaData[it->first] = it->second;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -144,22 +144,7 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
if ((j = p.find("name")) != p.end()) {
|
||||
string name = j->second;
|
||||
lowercase_term(name);
|
||||
if (name == "description") {
|
||||
if (sample.empty()) {
|
||||
sample = i->second;
|
||||
decode_entities(sample);
|
||||
}
|
||||
} else if (name == "keywords") {
|
||||
if (!keywords.empty()) keywords += ' ';
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
keywords += tmp;
|
||||
} else if (name == "author") {
|
||||
if (!author.empty()) author += ' ';
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
author += tmp;
|
||||
} else if (name == "date") {
|
||||
if (name == "date") {
|
||||
// Yes this doesn't exist. It's output by filters
|
||||
// And the format isn't even standard http/html
|
||||
// FIXME
|
||||
@ -172,7 +157,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
||||
dmtime = ascuxtime;
|
||||
}
|
||||
}
|
||||
} else if (name == "robots") {
|
||||
} else {
|
||||
if (!meta[name].empty())
|
||||
meta[name] += ' ';
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
meta[name] += tmp;
|
||||
}
|
||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||
string hequiv = j->second;
|
||||
lowercase_term(hequiv);
|
||||
@ -309,8 +301,8 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||
break;
|
||||
case 't':
|
||||
if (tag == "title") {
|
||||
if (title.empty()) {
|
||||
title = dump;
|
||||
if (meta["title"].empty()) {
|
||||
meta["title"] = dump;
|
||||
dump = "";
|
||||
}
|
||||
break;
|
||||
|
||||
@ -22,6 +22,8 @@
|
||||
* USA
|
||||
* -----END-LICENCE-----
|
||||
*/
|
||||
#include <map>
|
||||
using std::map;
|
||||
|
||||
#include "htmlparse.h"
|
||||
|
||||
@ -37,7 +39,8 @@ class MyHtmlParser : public HtmlParser {
|
||||
bool in_body_tag;
|
||||
bool in_pre_tag;
|
||||
bool pending_space;
|
||||
string title, sample, keywords, dump, dmtime, author;
|
||||
map<string,string> meta;
|
||||
string dump, dmtime;
|
||||
string ocharset; // This is the charset our user thinks the doc was
|
||||
string charset; // This is the charset it was supposedly converted to
|
||||
string doccharset; // Set this to value of charset parameter in header
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.20 2007-06-12 13:31:38 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.21 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -400,8 +400,12 @@ QTextEdit *Preview::addEditorTab()
|
||||
void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc,
|
||||
int docnum)
|
||||
{
|
||||
QString title = QString::fromUtf8(doc.title.c_str(),
|
||||
doc.title.length());
|
||||
QString title;
|
||||
map<string,string>::const_iterator meta_it;
|
||||
if ((meta_it = doc.meta.find("title")) != doc.meta.end()) {
|
||||
title = QString::fromUtf8(meta_it->second.c_str(),
|
||||
meta_it->second.length());
|
||||
}
|
||||
if (title.length() > 20) {
|
||||
title = title.left(10) + "..." + title.right(10);
|
||||
}
|
||||
@ -421,8 +425,8 @@ void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc,
|
||||
printableUrl(doc.url, url);
|
||||
string tiptxt = url + string("\n");
|
||||
tiptxt += doc.mimetype + " " + string(datebuf) + "\n";
|
||||
if (!doc.title.empty())
|
||||
tiptxt += doc.title + "\n";
|
||||
if (meta_it != doc.meta.end() && !meta_it->second.empty())
|
||||
tiptxt += meta_it->second + "\n";
|
||||
pvTab->setTabToolTip(w,QString::fromUtf8(tiptxt.c_str(), tiptxt.length()));
|
||||
|
||||
for (list<TabData>::iterator it = tabData.begin();
|
||||
@ -607,8 +611,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
|
||||
Rcl::Doc doc = idoc;
|
||||
bool cancel = false;
|
||||
|
||||
if (doc.title.empty())
|
||||
doc.title = path_getsimple(doc.url);
|
||||
if (doc.meta["title"].empty())
|
||||
doc.meta["title"] = path_getsimple(doc.url);
|
||||
|
||||
setCurTabProps(fn, doc, docnum);
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.26 2007-06-13 17:03:23 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.27 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <time.h>
|
||||
@ -399,7 +399,7 @@ void ResList::resultPageNext()
|
||||
if (percent == -1) {
|
||||
percent = 0;
|
||||
// Document not available, maybe other further, will go on.
|
||||
doc.abstract = string(tr("Unavailable document").utf8());
|
||||
doc.meta["abstract"] = string(tr("Unavailable document").utf8());
|
||||
}
|
||||
|
||||
// Determine icon to display if any
|
||||
@ -426,8 +426,8 @@ void ResList::resultPageNext()
|
||||
printableUrl(doc.url, url);
|
||||
|
||||
// Make title out of file name if none yet
|
||||
if (doc.title.empty()) {
|
||||
doc.title = path_getsimple(url);
|
||||
if (doc.meta["title"].empty()) {
|
||||
doc.meta["title"] = path_getsimple(url);
|
||||
}
|
||||
|
||||
// Result number
|
||||
@ -469,7 +469,7 @@ void ResList::resultPageNext()
|
||||
(doc.syntabs || prefs.queryReplaceAbstract)) {
|
||||
abstract = m_docSource->getAbstract(doc);
|
||||
} else {
|
||||
abstract = doc.abstract;
|
||||
abstract = doc.meta["abstract"];
|
||||
}
|
||||
// No need to call escapeHtml(), plaintorich handles it
|
||||
string richabst;
|
||||
@ -505,14 +505,14 @@ void ResList::resultPageNext()
|
||||
map<char,string> subs;
|
||||
subs['A'] = !richabst.empty() ? richabst + "<br>" : "";
|
||||
subs['D'] = datebuf;
|
||||
subs['K'] = !doc.keywords.empty() ? escapeHtml(doc.keywords) + "<br>"
|
||||
: "";
|
||||
subs['K'] = !doc.meta["keywords"].empty() ?
|
||||
escapeHtml(doc.meta["keywords"]) + "<br>" : "";
|
||||
subs['L'] = linksbuf;
|
||||
subs['N'] = numbuf;
|
||||
subs['M'] = doc.mimetype;
|
||||
subs['R'] = perbuf;
|
||||
subs['S'] = sizebuf;
|
||||
subs['T'] = escapeHtml(doc.title);
|
||||
subs['T'] = escapeHtml(doc.meta["title"]);
|
||||
subs['U'] = url;
|
||||
|
||||
string formatted;
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _DOCSEQ_H_INCLUDED_
|
||||
#define _DOCSEQ_H_INCLUDED_
|
||||
/* @(#$Id: docseq.h,v 1.11 2007-01-19 15:22:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: docseq.h,v 1.12 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
@ -70,7 +70,7 @@ class DocSequence {
|
||||
* The default is to return the input doc's abstract fields, but some
|
||||
* sequences can compute a better value (ie: docseqdb) */
|
||||
virtual string getAbstract(Rcl::Doc& doc) {
|
||||
return doc.abstract;
|
||||
return doc.meta["abstract"];
|
||||
}
|
||||
|
||||
/** Get estimated total count in results */
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.2 2007-01-19 15:22:50 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -42,9 +42,9 @@ int DocSequenceDb::getResCnt()
|
||||
string DocSequenceDb::getAbstract(Rcl::Doc &doc)
|
||||
{
|
||||
if (!m_db)
|
||||
return doc.abstract;
|
||||
return doc.meta["abstract"];
|
||||
string abstract;
|
||||
m_db->makeDocAbstract(doc, abstract);
|
||||
return abstract.empty() ? doc.abstract : abstract;
|
||||
return abstract.empty() ? doc.meta["abstract"] : abstract;
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.114 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.115 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -200,14 +200,14 @@ bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
|
||||
parms.get(string("fmtime"), doc.fmtime);
|
||||
parms.get(string("dmtime"), doc.dmtime);
|
||||
parms.get(string("origcharset"), doc.origcharset);
|
||||
parms.get(string("caption"), doc.title);
|
||||
parms.get(string("keywords"), doc.keywords);
|
||||
parms.get(string("abstract"), doc.abstract);
|
||||
parms.get(string("caption"), doc.meta["title"]);
|
||||
parms.get(string("keywords"), doc.meta["keywords"]);
|
||||
parms.get(string("abstract"), doc.meta["abstract"]);
|
||||
// Possibly remove synthetic abstract indicator (if it's there, we
|
||||
// used to index the beginning of the text as abstract).
|
||||
doc.syntabs = false;
|
||||
if (doc.abstract.find(rclSyntAbs) == 0) {
|
||||
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
||||
if (doc.meta["abstract"].find(rclSyntAbs) == 0) {
|
||||
doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length());
|
||||
doc.syntabs = true;
|
||||
}
|
||||
parms.get(string("ipath"), doc.ipath);
|
||||
@ -743,12 +743,15 @@ bool Db::isopen()
|
||||
// Try to translate field specification into field prefix. We have a
|
||||
// default table used if translations are not in the config for some
|
||||
// reason (old config not updated ?). We use it only if the config
|
||||
// translation fails
|
||||
string Db::fieldToPrefix(const string& fldname)
|
||||
// translation fails. Also we add in there fields which should be
|
||||
// indexed with no prefix (ie: abstract)
|
||||
bool Db::fieldToPrefix(const string& fldname, string &pfx)
|
||||
{
|
||||
// This is the default table
|
||||
static map<string, string> fldToPrefs;
|
||||
if (fldToPrefs.empty()) {
|
||||
fldToPrefs["abstract"] = "";
|
||||
|
||||
fldToPrefs["title"] = "S";
|
||||
fldToPrefs["caption"] = "S";
|
||||
fldToPrefs["subject"] = "S";
|
||||
@ -763,17 +766,19 @@ string Db::fieldToPrefix(const string& fldname)
|
||||
fldToPrefs["tags"] = "K";
|
||||
}
|
||||
|
||||
string fld(fldname), pfx;
|
||||
string fld(fldname);
|
||||
stringtolower(fld);
|
||||
|
||||
RclConfig *config = RclConfig::getMainConfig();
|
||||
if (config)
|
||||
pfx = config->getFieldPrefix(fld);
|
||||
if (pfx.empty()) {
|
||||
map<string, string>::const_iterator it = fldToPrefs.find(fld);
|
||||
if (it != fldToPrefs.end())
|
||||
fld = it->second;
|
||||
if (config && config->getFieldPrefix(fld, pfx))
|
||||
return true;
|
||||
|
||||
map<string, string>::const_iterator it = fldToPrefs.find(fld);
|
||||
if (it != fldToPrefs.end()) {
|
||||
pfx = it->second;
|
||||
return true;
|
||||
}
|
||||
return pfx;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -880,11 +885,12 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
|
||||
LOGDEB1(("Db::add: fn %s\n", fn.c_str()));
|
||||
if (m_ndb == 0)
|
||||
return false;
|
||||
|
||||
static int first = 1;
|
||||
// Check file system full every mbyte of indexed text.
|
||||
if (m_maxFsOccupPc > 0 && (m_curtxtsz - m_occtxtsz) / MB >= 1) {
|
||||
if (m_maxFsOccupPc > 0 && (first || (m_curtxtsz - m_occtxtsz) / MB >= 1)) {
|
||||
LOGDEB(("Db::add: checking file system usage\n"));
|
||||
int pc;
|
||||
first = 0;
|
||||
if (fsocc(m_basedir, &pc) && pc >= m_maxFsOccupPc) {
|
||||
LOGERR(("Db::add: stop indexing: file system "
|
||||
"%d%% full > max %d%%\n", pc, m_maxFsOccupPc));
|
||||
@ -895,37 +901,38 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
|
||||
|
||||
Doc doc = idoc;
|
||||
|
||||
// The title, author, abstract and keywords fields are special, they
|
||||
// get stored in the document data record.
|
||||
// Truncate abstract, title and keywords to reasonable lengths. If
|
||||
// abstract is currently empty, we make up one with the beginning
|
||||
// of the document. This is then not indexed, but part of the doc
|
||||
// data so that we can return it to a query without having to
|
||||
// decode the original file.
|
||||
bool syntabs = false;
|
||||
if (doc.abstract.empty()) {
|
||||
// Note that the map accesses by operator[] create empty entries if they
|
||||
// don't exist yet.
|
||||
if (doc.meta["abstract"].empty()) {
|
||||
syntabs = true;
|
||||
doc.abstract = rclSyntAbs +
|
||||
truncate_to_word(doc.text, m_idxAbsTruncLen);
|
||||
doc.meta["abstract"] = rclSyntAbs +
|
||||
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r");
|
||||
} else {
|
||||
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
|
||||
doc.meta["abstract"] =
|
||||
neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen),
|
||||
"\n\r");
|
||||
}
|
||||
doc.abstract = neutchars(doc.abstract, "\n\r");
|
||||
doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r");
|
||||
doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r");
|
||||
doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r");
|
||||
if (doc.meta["title"].empty())
|
||||
doc.meta["title"] = doc.utf8fn; // file name stands in for a missing title
|
||||
doc.meta["title"] =
|
||||
neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r");
|
||||
doc.meta["author"] =
|
||||
neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r");
|
||||
doc.meta["keywords"] =
|
||||
neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r");
|
||||
|
||||
|
||||
Xapian::Document newdocument;
|
||||
|
||||
mySplitterCB splitData(newdocument, m_stops);
|
||||
|
||||
TextSplit splitter(&splitData);
|
||||
|
||||
// Index the title, document text, keywords and other textual
|
||||
// metadata. These are all indexed as text with positions, as we
|
||||
// may want to do phrase searches with them (this makes no sense
|
||||
// for keywords by the way, but wtf).
|
||||
//
|
||||
// The order has no importance, and we set a position gap of 100
|
||||
// between fields to avoid false proximity matches.
|
||||
string noacc;
|
||||
|
||||
// Split and index file name as document term(s)
|
||||
@ -935,35 +942,39 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
|
||||
// Split and index title. If title is empty here, use file name
|
||||
if (doc.title.empty())
|
||||
doc.title = doc.utf8fn;
|
||||
if (!doc.title.empty()) {
|
||||
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
|
||||
if (!dumb_string(doc.title, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
// Index textual metadata. These are all indexed as text with
|
||||
// positions, as we may want to do phrase searches with them (this
|
||||
// makes no sense for keywords by the way).
|
||||
//
|
||||
// The order has no importance, and we set a position gap of 100
|
||||
// between fields to avoid false proximity matches.
|
||||
map<string,string>::iterator meta_it;
|
||||
string pfx;
|
||||
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
||||
if (!meta_it->second.empty()) {
|
||||
if (meta_it->first == "abstract" && syntabs)
|
||||
continue;
|
||||
if (!fieldToPrefix(meta_it->first, pfx)) {
|
||||
LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
|
||||
meta_it->first.c_str()));
|
||||
continue;
|
||||
}
|
||||
LOGDEB(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
||||
meta_it->first.c_str(), pfx.c_str(),
|
||||
meta_it->second.c_str()));
|
||||
if (!dumb_string(meta_it->second, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitData.setprefix(pfx); // Prefix returned by fieldToPrefix() for this field
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.setprefix(emptystring);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
splitData.setprefix("S"); // Subject
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.setprefix(emptystring);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
|
||||
// Split and index author
|
||||
if (!doc.author.empty()) {
|
||||
LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str()));
|
||||
if (!dumb_string(doc.author, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitData.setprefix("A");
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.setprefix(emptystring);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
|
||||
// Split and index body
|
||||
// Split and index body text
|
||||
LOGDEB2(("Db::add: split body\n"));
|
||||
if (!dumb_string(doc.text, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
@ -972,36 +983,8 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
// Split and index keywords
|
||||
if (!doc.keywords.empty()) {
|
||||
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
|
||||
if (!dumb_string(doc.keywords, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitData.setprefix("K");
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.setprefix(emptystring);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
|
||||
// Split and index abstract. We don't do this if it is synthetic
|
||||
// any more (this used to give a relevance boost to the beginning
|
||||
// of text, why ?)
|
||||
LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
|
||||
if (!syntabs) {
|
||||
// syntabs indicator test kept here in case we want to go back
|
||||
// to indexing synthetic abstracts one day
|
||||
if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) :
|
||||
doc.abstract, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
}
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
////// Special terms for metadata
|
||||
////// Special terms for other metadata. No positions for these.
|
||||
// Mime type
|
||||
newdocument.add_term("T" + doc.mimetype);
|
||||
|
||||
@ -1075,11 +1058,14 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
|
||||
if (!doc.ipath.empty()) {
|
||||
record += "\nipath=" + doc.ipath;
|
||||
}
|
||||
record += "\ncaption=" + doc.title;
|
||||
record += "\nkeywords=" + doc.keywords;
|
||||
record += "\nabstract=" + doc.abstract;
|
||||
if (!doc.author.empty()) {
|
||||
record += "\nauthor=" + doc.author;
|
||||
if (!doc.meta["title"].empty())
|
||||
record += "\ncaption=" + doc.meta["title"];
|
||||
if (!doc.meta["keywords"].empty())
|
||||
record += "\nkeywords=" + doc.meta["keywords"];
|
||||
if (!doc.meta["abstract"].empty())
|
||||
record += "\nabstract=" + doc.meta["abstract"];
|
||||
if (!doc.meta["author"].empty()) {
|
||||
record += "\nauthor=" + doc.meta["author"];
|
||||
}
|
||||
record += "\n";
|
||||
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.51 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.52 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -95,7 +95,7 @@ class Db {
|
||||
const StopList& getStopList() const {return m_stops;}
|
||||
|
||||
/** Field name to prefix translation (ie: author -> 'A') */
|
||||
string fieldToPrefix(const string& fldname);
|
||||
bool fieldToPrefix(const string& fldname, string &pfx);
|
||||
|
||||
/* Update-related methods ******************************************/
|
||||
|
||||
|
||||
@ -16,12 +16,14 @@
|
||||
*/
|
||||
#ifndef _RCLDOC_H_INCLUDED_
|
||||
#define _RCLDOC_H_INCLUDED_
|
||||
/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
/* @(#$Id: rcldoc.h,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::map;
|
||||
namespace Rcl {
|
||||
#endif
|
||||
|
||||
@ -47,12 +49,16 @@ class Doc {
|
||||
// Possibly set by handler
|
||||
string origcharset; // Charset we transcoded from (in case we want back)
|
||||
// Possibly set by handler
|
||||
string title; // Possibly set by handler
|
||||
string author; // Possibly set by handler
|
||||
string keywords; // Possibly set by handler
|
||||
string abstract; // Possibly set by handler
|
||||
bool syntabs; // true if abstract is just the top of doc, not an
|
||||
// explicit document attribute
|
||||
|
||||
// A map for textual metadata like author, keywords, abstract, title.
|
||||
// Entries possibly set by handler. If a field-name to prefix translation
|
||||
// exists, the terms will be indexed with a prefix.
|
||||
map<string, string> meta;
|
||||
|
||||
// Attribute for the "abstract" entry. true if it is just the top
|
||||
// of doc, not a native document attribute
|
||||
bool syntabs;
|
||||
|
||||
string fbytes; // File size. Set by Db::Add
|
||||
string dbytes; // Doc size. Set by Db::Add from text length
|
||||
|
||||
@ -72,9 +78,7 @@ class Doc {
|
||||
fmtime.erase();
|
||||
dmtime.erase();
|
||||
origcharset.erase();
|
||||
title.erase();
|
||||
keywords.erase();
|
||||
abstract.erase();
|
||||
meta.clear();
|
||||
syntabs = false;
|
||||
fbytes.erase();
|
||||
dbytes.erase();
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.15 2007-06-18 13:04:15 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.16 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -487,7 +487,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
}
|
||||
string prefix;
|
||||
if (!m_field.empty())
|
||||
prefix = db.fieldToPrefix(m_field);
|
||||
db.fieldToPrefix(m_field, prefix);
|
||||
list<Xapian::Query> pqueries;
|
||||
|
||||
// We normally boost the original term in the stem expansion list. Don't
|
||||
@ -541,7 +541,7 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
|
||||
string prefix;
|
||||
if (!m_field.empty())
|
||||
prefix = db.fieldToPrefix(m_field);
|
||||
db.fieldToPrefix(m_field, prefix);
|
||||
|
||||
// We normally boost the original term in the stem expansion list. Don't
|
||||
// do it if there are wildcards anywhere, this would skew the results.
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# @(#$Id: mimeconf,v 1.29 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# @(#$Id: mimeconf,v 1.30 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
|
||||
# Recoll : associations of mime types to processing filters.
|
||||
# There are different sections for decompression, 'interning' for indexing
|
||||
@ -144,3 +144,4 @@ keyword = K
|
||||
tag = K
|
||||
keywords = K
|
||||
tags = K
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user