compute md5 checksums for all docs and optionally collapse duplicates in results

This commit is contained in:
dockes 2009-01-09 14:56:36 +00:00
parent 9f14bf317c
commit f57d4a91f9
15 changed files with 96 additions and 15 deletions

View File

@ -25,6 +25,7 @@ static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.14 2008-10-09 09:19:37 dockes Exp
#include "cancelcheck.h"
#include "smallut.h"
#include "transcode.h"
#include "md5.h"
#include <sys/types.h>
#include <sys/wait.h>
@ -130,5 +131,14 @@ bool MimeHandlerExec::next_document()
// could still be overridden by the content-type meta tag.
m_metaData["charset"] = charset;
m_metaData["mimetype"] = mt;
string md5, xmd5, reason;
if (MD5File(m_fn, md5, &reason)) {
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
} else {
LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n",
m_fn.c_str(), reason.c_str()));
}
return true;
}

View File

@ -29,6 +29,7 @@ static char rcsid[] = "@(#$Id: mh_html.cpp,v 1.26 2008-10-03 06:17:46 dockes Exp
#include "indextext.h"
#include "mh_html.h"
#include "smallut.h"
#include "md5.h"
#include <iostream>
@ -53,6 +54,12 @@ bool MimeHandlerHtml::set_document_string(const string& htext)
{
m_html = htext;
m_havedoc = true;
// We want to compute the md5 now because we may modify m_html later
string md5, xmd5;
MD5String(htext, md5);
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
return true;
}

View File

@ -38,6 +38,7 @@ static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.35 2008-10-04 14:26:59 dockes Exp
#include "mh_html.h"
#include "rclconfig.h"
#include "mimetype.h"
#include "md5.h"
// binc imap mime definitions
#include "mime.h"
@ -81,6 +82,18 @@ bool MimeHandlerMail::set_document_file(const string &fn)
close(m_fd);
m_fd = -1;
}
// Yes, we read the file twice. It would be possible in theory to add
// the md5 computation to the mime analysis, but ...
string md5, xmd5, reason;
if (MD5File(fn, md5, &reason)) {
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
} else {
LOGERR(("MimeHandlerMail: cant compute md5 for [%s]: %s\n", fn.c_str(),
reason.c_str()));
}
m_fd = open(fn.c_str(), 0);
if (m_fd < 0) {
LOGERR(("MimeHandlerMail::set_document_file: open(%s) errno %d\n",
@ -104,6 +117,11 @@ bool MimeHandlerMail::set_document_string(const string &msgtxt)
LOGDEB1(("MimeHandlerMail::set_document_string\n"));
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
delete m_stream;
string md5, xmd5;
MD5String(msgtxt, md5);
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
m_stream = new stringstream(msgtxt);
delete m_bincdoc;
m_bincdoc = new Binc::MimeDocument;

View File

@ -29,19 +29,26 @@ using namespace std;
#include "debuglog.h"
#include "readfile.h"
#include "transcode.h"
#include "md5.h"
// Process a plain text file: load the whole file into memory, then hand the
// contents over to set_document_string().
// @param fn path of the file to read.
// @return false if the file could not be read (the reason reported by
//         file_to_string() is logged), else the value returned by
//         set_document_string().
bool MimeHandlerText::set_document_file(const string &fn)
{
    string otext;
    string reason;
    // Single read, using the file_to_string() variant which reports why it
    // failed so that the cause ends up in the log.
    if (!file_to_string(fn, otext, &reason)) {
        LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
        return false;
    }
    return set_document_string(otext);
}
bool MimeHandlerText::set_document_string(const string& otext)
{
m_text = otext;
string md5, xmd5;
MD5String(m_text, md5);
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
m_havedoc = true;
return true;
}

View File

@ -175,6 +175,8 @@ void rwSettings(bool writing)
SETTING_RW(prefs.ssearchAutoPhrase,
"/Recoll/prefs/ssearchAutoPhrase", Bool, false);
SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Num, 8);
SETTING_RW(prefs.collapseDuplicates,
"/Recoll/prefs/reslist/collapseDuplicates", Bool, false);
SETTING_RW(prefs.maxhltextmbs, "/Recoll/prefs/preview/maxhltextmbs", Num, 3);
SETTING_RW(prefs.qtermcolor, "/Recoll/prefs/qtermcolor", , "blue");
if (!writing && prefs.qtermcolor == "")

View File

@ -82,6 +82,7 @@ class PrefsPack {
bool startWithAdvSearchOpen;
bool startWithSortToolOpen;
bool previewHtml;
bool collapseDuplicates;
// Extra query indexes. These are encoded to base64 before storing
// to the qt settings file to avoid any binary string / charset conversion issues
list<string> allExtraDbs;

View File

@ -471,6 +471,7 @@ void RclMain::startSearch(RefCntr<Rcl::SearchData> sdata)
sdata->setStemlang(stemLang);
Rcl::Query *query = new Rcl::Query(rcldb);
query->setCollapseDuplicates(prefs.collapseDuplicates);
if (!query || !query->setQuery(sdata)) {
QMessageBox::warning(0, "Recoll", tr("Can't start query: ") +

View File

@ -90,6 +90,20 @@
</widget>
</hbox>
</widget>
<widget class="QCheckBox">
<property name="name">
<cstring>collapseDupsCB</cstring>
</property>
<property name="toolTip" stdset="0">
<string>If checked, results with the same content under different names will only be shown once.</string>
</property>
<property name="text">
<string>Hide duplicate results.</string>
</property>
<property name="checked">
<bool>false</bool>
</property>
</widget>
<widget class="QLayoutWidget">
<property name="name">
<cstring>qtermcolor</cstring>

View File

@ -97,6 +97,7 @@ void UIPrefsDialog::setFromPrefs()
{
// Entries per result page spinbox
pageLenSB->setValue(prefs.respagesize);
collapseDupsCB->setChecked(prefs.collapseDuplicates);
maxHLTSB->setValue(prefs.maxhltextmbs);
autoSearchCB->setChecked(prefs.autoSearchOnWS);
syntlenSB->setValue(prefs.syntAbsLen);
@ -173,6 +174,7 @@ void UIPrefsDialog::accept()
{
prefs.autoSearchOnWS = autoSearchCB->isChecked();
prefs.respagesize = pageLenSB->value();
prefs.collapseDuplicates = collapseDupsCB->isChecked();
prefs.maxhltextmbs = maxHLTSB->value();
prefs.qtermcolor = qtermColorLE->text();

View File

@ -48,7 +48,7 @@ using namespace std;
#include "searchdata.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "md5.h"
#ifndef MAX
#define MAX(A,B) (A>B?A:B)
@ -57,15 +57,6 @@ using namespace std;
#define MIN(A,B) (A<B?A:B)
#endif
// Omega compatible values. We leave a hole for future omega values. Not sure
// it makes any sense to keep any level of omega compat given that the index
// is incompatible anyway.
enum value_slot {
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
VALUE_SIG = 10 // Doc sig as chosen by app (e.g. mtime+size).
};
// Recoll index format version is stored in user metadata. When this changes,
// we can't open the db and will have to reindex.
static const string RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
@ -1078,6 +1069,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
}
}
// If the file's md5 was computed, add value. This is optionally
// used for query result duplicate elimination.
string& md5 = doc.meta[Doc::keymd5];
if (!md5.empty()) {
string digest;
MD5HexScan(md5, digest);
newdocument.add_value(VALUE_MD5, digest);
}
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
newdocument.set_data(record);

View File

@ -56,6 +56,15 @@ class RclConfig;
namespace Rcl {
#endif
// Omega compatible values. We leave a hole for future omega values. Not sure
// it makes any sense to keep any level of omega compat given that the index
// is incompatible anyway.
enum value_slot {
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
VALUE_SIG = 10 // Doc sig as chosen by app (e.g. mtime+size).
};
class SearchData;
class TermIter;
class Query;

View File

@ -22,4 +22,5 @@ namespace Rcl {
const string Doc::keyau("author");
const string Doc::keytt("title");
const string Doc::keykw("keywords");
const string Doc::keymd5("md5");
}

View File

@ -157,6 +157,7 @@ class Doc {
static const string keyau; // author
static const string keytt; // title
static const string keykw; // keywords
static const string keymd5; // file md5 checksum
};

View File

@ -89,7 +89,8 @@ private:
};
// Build a query object attached to the given database.
// Initial state: no sorter object, ascending sort order, and duplicate
// collapsing turned off (it can be enabled with setCollapseDuplicates()).
// @param db the database object which this query will run against.
Query::Query(Db *db)
    : m_nq(new Native(this)), m_db(db), m_sorter(0), m_sortAscending(true),
      m_collapseDuplicates(false)
{
}
@ -155,7 +156,11 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
string d;
try {
m_nq->enquire = new Xapian::Enquire(m_db->m_ndb->db);
m_nq->enquire->set_query(m_nq->query);
if (m_collapseDuplicates) {
m_nq->enquire->set_collapse_key(Rcl::VALUE_MD5);
} else {
m_nq->enquire->set_collapse_key(Xapian::BAD_VALUENO);
}
if (!m_sortField.empty()) {
if (m_sorter) {
delete (QSorter*)m_sorter;
@ -167,6 +172,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
m_nq->enquire->set_sort_by_key((QSorter*)m_sorter,
!m_sortAscending);
}
m_nq->enquire->set_query(m_nq->query);
m_nq->mset = Xapian::MSet();
// Get the query description and trim the "Xapian::Query"
d = m_nq->query.get_description();

View File

@ -54,6 +54,7 @@ class Query {
/** Choose sort order. Must be called before setQuery */
void setSortBy(const string& fld, bool ascending = true);
void setCollapseDuplicates(bool on) {m_collapseDuplicates = on;}
const string& getSortBy() const {return m_sortField;}
bool getSortAscending() const {return m_sortAscending;}
@ -90,8 +91,9 @@ private:
string m_reason; // Error explanation
Db *m_db;
void *m_sorter;
string m_sortField;
bool m_sortAscending;
string m_sortField;
bool m_sortAscending;
bool m_collapseDuplicates;
/* Copy constructor and assignment operator are private and forbidden */
Query(const Query &) {}
Query & operator=(const Query &) {return *this;};