diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index d6aaeba6..c1dc8289 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.46 2007-06-18 13:04:14 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.47 2007-06-19 08:36:23 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -373,13 +373,13 @@ string RclConfig::getMimeHandlerDef(const std::string &mtype) return hs; } -string RclConfig::getFieldPrefix(const string& fld) +bool RclConfig::getFieldPrefix(const string& fld, string &pfx) { - string hs; - if (!mimeconf->get(fld, hs, "prefixes")) { + if (!mimeconf->get(fld, pfx, "prefixes")) { LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str())); + return false; } - return hs; + return true; } string RclConfig::getMimeViewerDef(const string &mtype) diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 181f19e8..665f955d 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -16,7 +16,7 @@ */ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.34 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.35 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -138,7 +138,7 @@ class RclConfig { bool getMimeCatTypes(const string& cat, list&); /** mimeconf: get field prefix from field name */ - string getFieldPrefix(const string& fldname); + bool getFieldPrefix(const string& fldname, string &pfx); /** mimeview: get/set external viewer exec string(s) for mimetype(s) */ string getMimeViewerDef(const string &mimetype); diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index d3062e14..c4f02c8e 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] 
= "@(#$Id: internfile.cpp,v 1.30 2007-05-23 08:29:04 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.31 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -270,12 +270,12 @@ static const string keyab("abstract"); static const string keyau("author"); static const string keycs("charset"); static const string keyct("content"); +static const string keyds("description"); static const string keyfn("filename"); static const string keykw("keywords"); static const string keymd("modificationdate"); static const string keymt("mimetype"); static const string keyoc("origcharset"); -static const string keysm("sample"); static const string keytt("title"); bool FileInterner::dijontorcl(Rcl::Doc& doc) @@ -283,15 +283,24 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) Dijon::Filter *df = m_handlers.back(); const std::map& docdata = df->get_meta_data(); - getKeyValue(docdata, keyau, doc.author); - getKeyValue(docdata, keyoc, doc.origcharset); - getKeyValue(docdata, keyct, doc.text); - getKeyValue(docdata, keytt, doc.title); - getKeyValue(docdata, keykw, doc.keywords); - getKeyValue(docdata, keymd, doc.dmtime); - if (!getKeyValue(docdata, keyab, doc.abstract)) - getKeyValue(docdata, keysm, doc.abstract); - LOGDEB1(("FILENAME: %s\n", doc.utf8fn.c_str())); + for (map::const_iterator it = docdata.begin(); + it != docdata.end(); it++) { + if (it->first == keyct) { + doc.text = it->second; + } else if (it->first == keymd) { + doc.dmtime = it->second; + } else if (it->first == keyoc) { + doc.origcharset = it->second; + } else if (it->first == keymt || it->first == keycs) { + // don't need these. 
+ } else { doc.meta[it->first] = it->second; + } + } + if (doc.meta[keyab].empty() && !doc.meta[keyds].empty()) { + doc.meta[keyab] = doc.meta[keyds]; + doc.meta.erase(keyds); + } return true; } @@ -324,7 +333,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const } else { ipath += isep; } - getKeyValue(docdata, keyau, doc.author); + getKeyValue(docdata, keyau, doc.meta["author"]); getKeyValue(docdata, keymd, doc.dmtime); } @@ -672,7 +681,7 @@ int main(int argc, char **argv) "]]]]\n-----------------------------------------------------\n" << "doc.keywords [[[[" << doc.keywords << "]]]]\n-----------------------------------------------------\n" << - "doc.abstract [[[[" << doc.abstract << + "doc.meta[\"abstract\"] [[[[" << doc.meta["abstract"] << "]]]]\n-----------------------------------------------------\n" << "doc.text [[[[" << doc.text << "]]]]\n"; } diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index e4702e69..788e1993 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -136,15 +136,16 @@ bool MimeHandlerHtml::next_document() m_metaData["origcharset"] = m_defcharset; m_metaData["content"] = result.dump; m_metaData["charset"] = "utf-8"; - m_metaData["title"] = result.title; - m_metaData["keywords"] = result.keywords; // Avoid setting empty values which would crush ones possibly inherited // from parent (if we're an attachment) - if (!result.author.empty()) - m_metaData["author"] = result.author; if (!result.dmtime.empty()) m_metaData["modificationdate"] = result.dmtime; - m_metaData["sample"] = result.sample; m_metaData["mimetype"] = "text/plain"; + + for (map::const_iterator it = result.meta.begin(); + it != result.meta.end(); it++) { + if (!it->second.empty()) + m_metaData[it->first] = it->second; + } return true; } diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index e5343033..12485304 100644 --- a/src/internfile/myhtmlparse.cpp +++ 
b/src/internfile/myhtmlparse.cpp @@ -144,22 +144,7 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) if ((j = p.find("name")) != p.end()) { string name = j->second; lowercase_term(name); - if (name == "description") { - if (sample.empty()) { - sample = i->second; - decode_entities(sample); - } - } else if (name == "keywords") { - if (!keywords.empty()) keywords += ' '; - string tmp = i->second; - decode_entities(tmp); - keywords += tmp; - } else if (name == "author") { - if (!author.empty()) author += ' '; - string tmp = i->second; - decode_entities(tmp); - author += tmp; - } else if (name == "date") { + if (name == "date") { // Yes this doesnt exist. It's output by filters // And the format isn't even standard http/html // FIXME @@ -172,7 +157,14 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) sprintf(ascuxtime, "%ld", (long)mktime(&tm)); dmtime = ascuxtime; } - } + } else if (name == "robots") { + } else { + if (!meta[name].empty()) + meta[name] += ' '; + string tmp = i->second; + decode_entities(tmp); + meta[name] += tmp; + } } else if ((j = p.find("http-equiv")) != p.end()) { string hequiv = j->second; lowercase_term(hequiv); @@ -309,8 +301,8 @@ MyHtmlParser::closing_tag(const string &tag) break; case 't': if (tag == "title") { - if (title.empty()) { - title = dump; + if (meta["title"].empty()) { + meta["title"] = dump; dump = ""; } break; diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 233a5c0c..2abde21e 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -22,6 +22,8 @@ * USA * -----END-LICENCE----- */ +#include +using std::map; #include "htmlparse.h" @@ -37,7 +39,8 @@ class MyHtmlParser : public HtmlParser { bool in_body_tag; bool in_pre_tag; bool pending_space; - string title, sample, keywords, dump, dmtime, author; + map meta; + string dump, dmtime; string ocharset; // This is the charset our user thinks the doc was string charset; // This is the charset it was supposedly 
converted to string doccharset; // Set this to value of charset parameter in header diff --git a/src/qtgui/preview_w.cpp b/src/qtgui/preview_w.cpp index c8012658..8be82962 100644 --- a/src/qtgui/preview_w.cpp +++ b/src/qtgui/preview_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.20 2007-06-12 13:31:38 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.21 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -400,8 +400,12 @@ QTextEdit *Preview::addEditorTab() void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc, int docnum) { - QString title = QString::fromUtf8(doc.title.c_str(), - doc.title.length()); + QString title; + map::const_iterator meta_it; + if ((meta_it = doc.meta.find("title")) != doc.meta.end()) { + title = QString::fromUtf8(meta_it->second.c_str(), + meta_it->second.length()); + } if (title.length() > 20) { title = title.left(10) + "..." 
+ title.right(10); } @@ -421,8 +425,8 @@ void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc, printableUrl(doc.url, url); string tiptxt = url + string("\n"); tiptxt += doc.mimetype + " " + string(datebuf) + "\n"; - if (!doc.title.empty()) - tiptxt += doc.title + "\n"; + if (meta_it != doc.meta.end() && !meta_it->second.empty()) + tiptxt += meta_it->second + "\n"; pvTab->setTabToolTip(w,QString::fromUtf8(tiptxt.c_str(), tiptxt.length())); for (list::iterator it = tabData.begin(); @@ -607,8 +611,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc, Rcl::Doc doc = idoc; bool cancel = false; - if (doc.title.empty()) - doc.title = path_getsimple(doc.url); + if (doc.meta["title"].empty()) + doc.meta["title"] = path_getsimple(doc.url); setCurTabProps(fn, doc, docnum); diff --git a/src/qtgui/reslist.cpp b/src/qtgui/reslist.cpp index 33aa50b9..2b5f7613 100644 --- a/src/qtgui/reslist.cpp +++ b/src/qtgui/reslist.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: reslist.cpp,v 1.26 2007-06-13 17:03:23 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: reslist.cpp,v 1.27 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -399,7 +399,7 @@ void ResList::resultPageNext() if (percent == -1) { percent = 0; // Document not available, maybe other further, will go on. 
- doc.abstract = string(tr("Unavailable document").utf8()); + doc.meta["abstract"] = string(tr("Unavailable document").utf8()); } // Determine icon to display if any @@ -426,8 +426,8 @@ void ResList::resultPageNext() printableUrl(doc.url, url); // Make title out of file name if none yet - if (doc.title.empty()) { - doc.title = path_getsimple(url); + if (doc.meta["title"].empty()) { + doc.meta["title"] = path_getsimple(url); } // Result number @@ -469,7 +469,7 @@ void ResList::resultPageNext() (doc.syntabs || prefs.queryReplaceAbstract)) { abstract = m_docSource->getAbstract(doc); } else { - abstract = doc.abstract; + abstract = doc.meta["abstract"]; } // No need to call escapeHtml(), plaintorich handles it string richabst; @@ -505,14 +505,14 @@ void ResList::resultPageNext() map subs; subs['A'] = !richabst.empty() ? richabst + "
" : ""; subs['D'] = datebuf; - subs['K'] = !doc.keywords.empty() ? escapeHtml(doc.keywords) + "
" - : ""; + subs['K'] = !doc.meta["keywords"].empty() ? + escapeHtml(doc.meta["keywords"]) + "
" : ""; subs['L'] = linksbuf; subs['N'] = numbuf; subs['M'] = doc.mimetype; subs['R'] = perbuf; subs['S'] = sizebuf; - subs['T'] = escapeHtml(doc.title); + subs['T'] = escapeHtml(doc.meta["title"]); subs['U'] = url; string formatted; diff --git a/src/query/docseq.h b/src/query/docseq.h index dda1be88..121fc7dd 100644 --- a/src/query/docseq.h +++ b/src/query/docseq.h @@ -16,7 +16,7 @@ */ #ifndef _DOCSEQ_H_INCLUDED_ #define _DOCSEQ_H_INCLUDED_ -/* @(#$Id: docseq.h,v 1.11 2007-01-19 15:22:50 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: docseq.h,v 1.12 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include #include @@ -70,7 +70,7 @@ class DocSequence { * The default is to return the input doc's abstract fields, but some * sequences can compute a better value (ie: docseqdb) */ virtual string getAbstract(Rcl::Doc& doc) { - return doc.abstract; + return doc.meta["abstract"]; } /** Get estimated total count in results */ diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp index 6754adce..1e45b95d 100644 --- a/src/query/docseqdb.cpp +++ b/src/query/docseqdb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.2 2007-01-19 15:22:50 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -42,9 +42,9 @@ int DocSequenceDb::getResCnt() string DocSequenceDb::getAbstract(Rcl::Doc &doc) { if (!m_db) - return doc.abstract; + return doc.meta["abstract"]; string abstract; m_db->makeDocAbstract(doc, abstract); - return abstract.empty() ? doc.abstract : abstract; + return abstract.empty() ? 
doc.meta["abstract"] : abstract; } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 345fdab8..c98e9a03 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.114 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.115 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -200,14 +200,14 @@ bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc) parms.get(string("fmtime"), doc.fmtime); parms.get(string("dmtime"), doc.dmtime); parms.get(string("origcharset"), doc.origcharset); - parms.get(string("caption"), doc.title); - parms.get(string("keywords"), doc.keywords); - parms.get(string("abstract"), doc.abstract); + parms.get(string("caption"), doc.meta["title"]); + parms.get(string("keywords"), doc.meta["keywords"]); + parms.get(string("abstract"), doc.meta["abstract"]); // Possibly remove synthetic abstract indicator (if it's there, we // used to index the beginning of the text as abstract). doc.syntabs = false; - if (doc.abstract.find(rclSyntAbs) == 0) { - doc.abstract = doc.abstract.substr(rclSyntAbs.length()); + if (doc.meta["abstract"].find(rclSyntAbs) == 0) { + doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length()); doc.syntabs = true; } parms.get(string("ipath"), doc.ipath); @@ -743,12 +743,15 @@ bool Db::isopen() // Try to translate field specification into field prefix. We have a // default table used if translations are not in the config for some // reason (old config not updated ?). We use it only if the config -// translation fails -string Db::fieldToPrefix(const string& fldname) +// translation fails. 
Also we add in there fields which should be +// indexed with no prefix (ie: abstract) +bool Db::fieldToPrefix(const string& fldname, string &pfx) { // This is the default table static map fldToPrefs; if (fldToPrefs.empty()) { + fldToPrefs["abstract"] = ""; + fldToPrefs["title"] = "S"; fldToPrefs["caption"] = "S"; fldToPrefs["subject"] = "S"; @@ -763,17 +766,19 @@ string Db::fieldToPrefix(const string& fldname) fldToPrefs["tags"] = "K"; } - string fld(fldname), pfx; + string fld(fldname); stringtolower(fld); + RclConfig *config = RclConfig::getMainConfig(); - if (config) - pfx = config->getFieldPrefix(fld); - if (pfx.empty()) { - map::const_iterator it = fldToPrefs.find(fld); - if (it != fldToPrefs.end()) - fld = it->second; + if (config && config->getFieldPrefix(fld, pfx)) + return true; + + map::const_iterator it = fldToPrefs.find(fld); + if (it != fldToPrefs.end()) { + pfx = it->second; + return true; } - return pfx; + return false; } @@ -880,11 +885,12 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) LOGDEB1(("Db::add: fn %s\n", fn.c_str())); if (m_ndb == 0) return false; - + static int first = 1; // Check file system full every mbyte of indexed text. - if (m_maxFsOccupPc > 0 && (m_curtxtsz - m_occtxtsz) / MB >= 1) { + if (m_maxFsOccupPc > 0 && (first || (m_curtxtsz - m_occtxtsz) / MB >= 1)) { LOGDEB(("Db::add: checking file system usage\n")); int pc; + first = 0; if (fsocc(m_basedir, &pc) && pc >= m_maxFsOccupPc) { LOGERR(("Db::add: stop indexing: file system " "%d%% full > max %d%%\n", pc, m_maxFsOccupPc)); @@ -895,37 +901,38 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) Doc doc = idoc; + // The title, author, abstract and keywords fields are special, they + // get stored in the document data record. // Truncate abstract, title and keywords to reasonable lengths. If // abstract is currently empty, we make up one with the beginning // of the document. 
This is then not indexed, but part of the doc // data so that we can return it to a query without having to // decode the original file. bool syntabs = false; - if (doc.abstract.empty()) { + // Note that the map accesses by operator[] create empty entries if they + // don't exist yet. + if (doc.meta["abstract"].empty()) { syntabs = true; - doc.abstract = rclSyntAbs + - truncate_to_word(doc.text, m_idxAbsTruncLen); + doc.meta["abstract"] = rclSyntAbs + + neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r"); } else { - doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen); + doc.meta["abstract"] = + neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen), + "\n\r"); } - doc.abstract = neutchars(doc.abstract, "\n\r"); - doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r"); - doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r"); - doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r"); + if (doc.meta["title"].empty()) + doc.meta["title"] = doc.utf8fn; + doc.meta["title"] = + neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r"); + doc.meta["author"] = + neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r"); + doc.meta["keywords"] = + neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r"); + Xapian::Document newdocument; mySplitterCB splitData(newdocument, m_stops); TextSplit splitter(&splitData); - - // Index the title, document text, keywords and other textual - // metadata. These are all indexed as text with positions, as we - // may want to do phrase searches with them (this makes no sense - // for keywords by the way, but wtf). - / - // The order has no importance, and we set a position gap of 100 - // between fields to avoid false proximity matches. 
string noacc; // Split and index file name as document term(s) @@ -935,35 +942,39 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) splitData.basepos += splitData.curpos + 100; } - // Split and index title. If title is empty here, use file name - if (doc.title.empty()) - doc.title = doc.utf8fn; - if (!doc.title.empty()) { - LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str())); - if (!dumb_string(doc.title, noacc)) { - LOGERR(("Db::add: dumb_string failed\n")); - return false; + // Index textual metadata. These are all indexed as text with + // positions, as we may want to do phrase searches with them (this + // makes no sense for keywords by the way). + // + // The order has no importance, and we set a position gap of 100 + // between fields to avoid false proximity matches. + map::iterator meta_it; + string pfx; + for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { + if (!meta_it->second.empty()) { + if (meta_it->first == "abstract" && syntabs) + continue; + if (!fieldToPrefix(meta_it->first, pfx)) { + LOGDEB(("Db::add: no prefix for field [%s], no indexing\n", + meta_it->first.c_str())); + continue; + } + LOGDEB(("Db::add: field [%s] pfx [%s]: [%s]\n", + meta_it->first.c_str(), pfx.c_str(), + meta_it->second.c_str())); + if (!dumb_string(meta_it->second, noacc)) { + LOGERR(("Db::add: dumb_string failed\n")); + return false; + } + splitData.setprefix(pfx); // Subject + splitter.text_to_words(noacc); + splitData.setprefix(emptystring); + splitData.basepos += splitData.curpos + 100; } - splitData.setprefix("S"); // Subject - splitter.text_to_words(noacc); - splitData.setprefix(emptystring); - splitData.basepos += splitData.curpos + 100; } - // Split and index author - if (!doc.author.empty()) { - LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str())); - if (!dumb_string(doc.author, noacc)) { - LOGERR(("Db::add: dumb_string failed\n")); - return false; - } - splitData.setprefix("A"); - 
splitter.text_to_words(noacc); - splitData.setprefix(emptystring); - splitData.basepos += splitData.curpos + 100; - } - // Split and index body + // Split and index body text LOGDEB2(("Db::add: split body\n")); if (!dumb_string(doc.text, noacc)) { LOGERR(("Db::add: dumb_string failed\n")); @@ -972,36 +983,8 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; - // Split and index keywords - if (!doc.keywords.empty()) { - LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str())); - if (!dumb_string(doc.keywords, noacc)) { - LOGERR(("Db::add: dumb_string failed\n")); - return false; - } - splitData.setprefix("K"); - splitter.text_to_words(noacc); - splitData.setprefix(emptystring); - splitData.basepos += splitData.curpos + 100; - } - // Split and index abstract. We don't do this if it is synthetic - // any more (this used to give a relevance boost to the beginning - // of text, why ?) - LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str())); - if (!syntabs) { - // syntabs indicator test kept here in case we want to go back - // to indexing synthetic abstracts one day - if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : - doc.abstract, noacc)) { - LOGERR(("Db::add: dumb_string failed\n")); - return false; - } - splitter.text_to_words(noacc); - } - splitData.basepos += splitData.curpos + 100; - - ////// Special terms for metadata + ////// Special terms for other metadata. No positions for these. 
// Mime type newdocument.add_term("T" + doc.mimetype); @@ -1075,11 +1058,14 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) if (!doc.ipath.empty()) { record += "\nipath=" + doc.ipath; } - record += "\ncaption=" + doc.title; - record += "\nkeywords=" + doc.keywords; - record += "\nabstract=" + doc.abstract; - if (!doc.author.empty()) { - record += "\nauthor=" + doc.author; + if (!doc.meta["title"].empty()) + record += "\ncaption=" + doc.meta["title"]; + if (!doc.meta["keywords"].empty()) + record += "\nkeywords=" + doc.meta["keywords"]; + if (!doc.meta["abstract"].empty()) + record += "\nabstract=" + doc.meta["abstract"]; + if (!doc.meta["author"].empty()) { + record += "\nauthor=" + doc.meta["author"]; } record += "\n"; LOGDEB1(("Newdocument data: %s\n", record.c_str())); diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 981fa2d0..d273f97c 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.51 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.52 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -95,7 +95,7 @@ class Db { const StopList& getStopList() const {return m_stops;} /** Field name to prefix translation (ie: author -> 'A') */ - string fieldToPrefix(const string& fldname); + bool fieldToPrefix(const string& fldname, string &pfx); /* Update-related methods ******************************************/ diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index 1dea0189..80d93b44 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -16,12 +16,14 @@ */ #ifndef _RCLDOC_H_INCLUDED_ #define _RCLDOC_H_INCLUDED_ -/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: rcldoc.h,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes */ #include +#include #ifndef NO_NAMESPACES using std::string; +using std::map; namespace Rcl { #endif @@ 
-47,12 +49,16 @@ class Doc { // Possibly set by handler string origcharset; // Charset we transcoded from (in case we want back) // Possibly set by handler - string title; // Possibly set by handler - string author; // Possibly set by handler - string keywords; // Possibly set by handler - string abstract; // Possibly set by handler - bool syntabs; // true if abstract is just the top of doc, not an - // explicit document attribute + + // A map for textual metadata like, author, keywords, abstract, title + // Entries possibly set by handler. If a field-name to prefix translation + // exists, the terms will be indexed with a prefix. + map meta; + + // Attribute for the "abstract" entry. true if it is just the top + // of doc, not a native document attribute + bool syntabs; + string fbytes; // File size. Set by Db::Add string dbytes; // Doc size. Set by Db::Add from text length @@ -72,9 +78,7 @@ class Doc { fmtime.erase(); dmtime.erase(); origcharset.erase(); - title.erase(); - keywords.erase(); - abstract.erase(); + meta.clear(); syntabs = false; fbytes.erase(); dbytes.erase(); diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 80e4cd51..a0650129 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.15 2007-06-18 13:04:15 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.16 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -487,7 +487,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, } string prefix; if (!m_field.empty()) - prefix = db.fieldToPrefix(m_field); + db.fieldToPrefix(m_field, prefix); list pqueries; // We normally boost the original term in the stem expansion list. 
Don't @@ -541,7 +541,7 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, string prefix; if (!m_field.empty()) - prefix = db.fieldToPrefix(m_field); + db.fieldToPrefix(m_field, prefix); // We normally boost the original term in the stem expansion list. Don't // do it if there are wildcards anywhere, this would skew the results. diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 62a6d125..4d2a6011 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -1,4 +1,4 @@ -# @(#$Id: mimeconf,v 1.29 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimeconf,v 1.30 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes # Recoll : associations of mime types to processing filters. # There are different sections for decompression, 'interning' for indexing @@ -144,3 +144,4 @@ keyword = K tag = K keywords = K tags = K +