compute md5 checksums for all docs and optionally collapse duplicates in results
This commit is contained in:
parent
9f14bf317c
commit
f57d4a91f9
@ -25,6 +25,7 @@ static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.14 2008-10-09 09:19:37 dockes Exp
|
||||
#include "cancelcheck.h"
|
||||
#include "smallut.h"
|
||||
#include "transcode.h"
|
||||
#include "md5.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
@ -130,5 +131,14 @@ bool MimeHandlerExec::next_document()
|
||||
// could still be overridden by the content-type meta tag.
|
||||
m_metaData["charset"] = charset;
|
||||
m_metaData["mimetype"] = mt;
|
||||
|
||||
string md5, xmd5, reason;
|
||||
if (MD5File(m_fn, md5, &reason)) {
|
||||
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
|
||||
} else {
|
||||
LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n",
|
||||
m_fn.c_str(), reason.c_str()));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -29,6 +29,7 @@ static char rcsid[] = "@(#$Id: mh_html.cpp,v 1.26 2008-10-03 06:17:46 dockes Exp
|
||||
#include "indextext.h"
|
||||
#include "mh_html.h"
|
||||
#include "smallut.h"
|
||||
#include "md5.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
@ -53,6 +54,12 @@ bool MimeHandlerHtml::set_document_string(const string& htext)
|
||||
{
|
||||
m_html = htext;
|
||||
m_havedoc = true;
|
||||
|
||||
// We want to compute the md5 now because we may modify m_html later
|
||||
string md5, xmd5;
|
||||
MD5String(htext, md5);
|
||||
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -38,6 +38,7 @@ static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.35 2008-10-04 14:26:59 dockes Exp
|
||||
#include "mh_html.h"
|
||||
#include "rclconfig.h"
|
||||
#include "mimetype.h"
|
||||
#include "md5.h"
|
||||
|
||||
// binc imap mime definitions
|
||||
#include "mime.h"
|
||||
@ -81,6 +82,18 @@ bool MimeHandlerMail::set_document_file(const string &fn)
|
||||
close(m_fd);
|
||||
m_fd = -1;
|
||||
}
|
||||
|
||||
// Yes, we read the file twice. It would be possible in theory to add
|
||||
// the md5 computation to the mime analysis, but ...
|
||||
string md5, xmd5, reason;
|
||||
if (MD5File(fn, md5, &reason)) {
|
||||
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
|
||||
} else {
|
||||
LOGERR(("MimeHandlerMail: cant compute md5 for [%s]: %s\n", fn.c_str(),
|
||||
reason.c_str()));
|
||||
}
|
||||
|
||||
|
||||
m_fd = open(fn.c_str(), 0);
|
||||
if (m_fd < 0) {
|
||||
LOGERR(("MimeHandlerMail::set_document_file: open(%s) errno %d\n",
|
||||
@ -104,6 +117,11 @@ bool MimeHandlerMail::set_document_string(const string &msgtxt)
|
||||
LOGDEB1(("MimeHandlerMail::set_document_string\n"));
|
||||
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
||||
delete m_stream;
|
||||
|
||||
string md5, xmd5;
|
||||
MD5String(msgtxt, md5);
|
||||
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
|
||||
|
||||
m_stream = new stringstream(msgtxt);
|
||||
delete m_bincdoc;
|
||||
m_bincdoc = new Binc::MimeDocument;
|
||||
|
||||
@ -29,19 +29,26 @@ using namespace std;
|
||||
#include "debuglog.h"
|
||||
#include "readfile.h"
|
||||
#include "transcode.h"
|
||||
#include "md5.h"
|
||||
|
||||
// Process a plain text file
|
||||
bool MimeHandlerText::set_document_file(const string &fn)
|
||||
{
|
||||
string otext;
|
||||
if (!file_to_string(fn, otext))
|
||||
string reason;
|
||||
if (!file_to_string(fn, otext, &reason)) {
|
||||
LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
return set_document_string(otext);
|
||||
}
|
||||
|
||||
bool MimeHandlerText::set_document_string(const string& otext)
|
||||
{
|
||||
m_text = otext;
|
||||
string md5, xmd5;
|
||||
MD5String(m_text, md5);
|
||||
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
|
||||
m_havedoc = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -175,6 +175,8 @@ void rwSettings(bool writing)
|
||||
SETTING_RW(prefs.ssearchAutoPhrase,
|
||||
"/Recoll/prefs/ssearchAutoPhrase", Bool, false);
|
||||
SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Num, 8);
|
||||
SETTING_RW(prefs.collapseDuplicates,
|
||||
"/Recoll/prefs/reslist/collapseDuplicates", Bool, false);
|
||||
SETTING_RW(prefs.maxhltextmbs, "/Recoll/prefs/preview/maxhltextmbs", Num, 3);
|
||||
SETTING_RW(prefs.qtermcolor, "/Recoll/prefs/qtermcolor", , "blue");
|
||||
if (!writing && prefs.qtermcolor == "")
|
||||
|
||||
@ -82,6 +82,7 @@ class PrefsPack {
|
||||
bool startWithAdvSearchOpen;
|
||||
bool startWithSortToolOpen;
|
||||
bool previewHtml;
|
||||
bool collapseDuplicates;
|
||||
// Extra query indexes. These are encoded to base64 before storing
|
||||
// to the qt settings file to avoid any bin string/ charset conv issues
|
||||
list<string> allExtraDbs;
|
||||
|
||||
@ -471,6 +471,7 @@ void RclMain::startSearch(RefCntr<Rcl::SearchData> sdata)
|
||||
sdata->setStemlang(stemLang);
|
||||
|
||||
Rcl::Query *query = new Rcl::Query(rcldb);
|
||||
query->setCollapseDuplicates(prefs.collapseDuplicates);
|
||||
|
||||
if (!query || !query->setQuery(sdata)) {
|
||||
QMessageBox::warning(0, "Recoll", tr("Can't start query: ") +
|
||||
|
||||
@ -90,6 +90,20 @@
|
||||
</widget>
|
||||
</hbox>
|
||||
</widget>
|
||||
<widget class="QCheckBox">
|
||||
<property name="name">
|
||||
<cstring>collapseDupsCB</cstring>
|
||||
</property>
|
||||
<property name="toolTip" stdset="0">
|
||||
<string>If checked, results with the same content under different names will only be shown once.</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Hide duplicate results.</string>
|
||||
</property>
|
||||
<property name="checked">
|
||||
<bool>false</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLayoutWidget">
|
||||
<property name="name">
|
||||
<cstring>qtermcolor</cstring>
|
||||
|
||||
@ -97,6 +97,7 @@ void UIPrefsDialog::setFromPrefs()
|
||||
{
|
||||
// Entries per result page spinbox
|
||||
pageLenSB->setValue(prefs.respagesize);
|
||||
collapseDupsCB->setChecked(prefs.collapseDuplicates);
|
||||
maxHLTSB->setValue(prefs.maxhltextmbs);
|
||||
autoSearchCB->setChecked(prefs.autoSearchOnWS);
|
||||
syntlenSB->setValue(prefs.syntAbsLen);
|
||||
@ -173,6 +174,7 @@ void UIPrefsDialog::accept()
|
||||
{
|
||||
prefs.autoSearchOnWS = autoSearchCB->isChecked();
|
||||
prefs.respagesize = pageLenSB->value();
|
||||
prefs.collapseDuplicates = collapseDupsCB->isChecked();
|
||||
prefs.maxhltextmbs = maxHLTSB->value();
|
||||
|
||||
prefs.qtermcolor = qtermColorLE->text();
|
||||
|
||||
@ -48,7 +48,7 @@ using namespace std;
|
||||
#include "searchdata.h"
|
||||
#include "rclquery.h"
|
||||
#include "rclquery_p.h"
|
||||
|
||||
#include "md5.h"
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(A,B) (A>B?A:B)
|
||||
@ -57,15 +57,6 @@ using namespace std;
|
||||
#define MIN(A,B) (A<B?A:B)
|
||||
#endif
|
||||
|
||||
// Document value slot numbers. VALUE_MD5 is the slot given to add_value()
// when a document's md5 digest is stored, and to set_collapse_key() when
// duplicate results are collapsed.
// Omega compatible values. We leave a hole for future omega values. Not sure
|
||||
// it makes any sense to keep any level of omega compat given that the index
|
||||
// is incompatible anyway.
|
||||
enum value_slot {
|
||||
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
|
||||
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
|
||||
VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size)
|
||||
};
|
||||
|
||||
// Recoll index format version is stored in user metadata. When this changes,
|
||||
// we can't open the db and will have to reindex.
|
||||
static const string RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
||||
@ -1078,6 +1069,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file's md5 was computed, add value. This is optionally
|
||||
// used for query result duplicate elimination.
|
||||
string& md5 = doc.meta[Doc::keymd5];
|
||||
if (!md5.empty()) {
|
||||
string digest;
|
||||
MD5HexScan(md5, digest);
|
||||
newdocument.add_value(VALUE_MD5, digest);
|
||||
}
|
||||
|
||||
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
|
||||
|
||||
@ -56,6 +56,15 @@ class RclConfig;
|
||||
namespace Rcl {
|
||||
#endif
|
||||
|
||||
// Document value slot numbers. VALUE_MD5 is the slot given to add_value()
// when a document's md5 digest is stored, and to set_collapse_key() when
// duplicate results are collapsed.
// Omega compatible values. We leave a hole for future omega values. Not sure
|
||||
// it makes any sense to keep any level of omega compat given that the index
|
||||
// is incompatible anyway.
|
||||
enum value_slot {
|
||||
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
|
||||
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
|
||||
VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size)
|
||||
};
|
||||
|
||||
class SearchData;
|
||||
class TermIter;
|
||||
class Query;
|
||||
|
||||
@ -22,4 +22,5 @@ namespace Rcl {
|
||||
const string Doc::keyau("author");
|
||||
const string Doc::keytt("title");
|
||||
const string Doc::keykw("keywords");
|
||||
const string Doc::keymd5("md5");
|
||||
}
|
||||
|
||||
@ -157,6 +157,7 @@ class Doc {
|
||||
static const string keyau; // author
|
||||
static const string keytt; // title
|
||||
static const string keykw; // keywords
|
||||
static const string keymd5; // file md5 checksum
|
||||
};
|
||||
|
||||
|
||||
|
||||
@ -89,7 +89,8 @@ private:
|
||||
};
|
||||
|
||||
// Build a query object bound to db.
// Starts with no sorter, ascending sort order, and duplicate collapsing
// disabled; the latter can be turned on with setCollapseDuplicates() before
// setQuery() is called.
Query::Query(Db *db)
    : m_nq(new Native(this)), m_db(db), m_sorter(0), m_sortAscending(true),
      m_collapseDuplicates(false)
{
}
|
||||
|
||||
@ -155,7 +156,11 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
|
||||
string d;
|
||||
try {
|
||||
m_nq->enquire = new Xapian::Enquire(m_db->m_ndb->db);
|
||||
m_nq->enquire->set_query(m_nq->query);
|
||||
if (m_collapseDuplicates) {
|
||||
m_nq->enquire->set_collapse_key(Rcl::VALUE_MD5);
|
||||
} else {
|
||||
m_nq->enquire->set_collapse_key(Xapian::BAD_VALUENO);
|
||||
}
|
||||
if (!m_sortField.empty()) {
|
||||
if (m_sorter) {
|
||||
delete (QSorter*)m_sorter;
|
||||
@ -167,6 +172,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
|
||||
m_nq->enquire->set_sort_by_key((QSorter*)m_sorter,
|
||||
!m_sortAscending);
|
||||
}
|
||||
m_nq->enquire->set_query(m_nq->query);
|
||||
m_nq->mset = Xapian::MSet();
|
||||
// Get the query description and trim the "Xapian::Query"
|
||||
d = m_nq->query.get_description();
|
||||
|
||||
@ -54,6 +54,7 @@ class Query {
|
||||
|
||||
/** Choose sort order. Must be called before setQuery */
|
||||
void setSortBy(const string& fld, bool ascending = true);
|
||||
void setCollapseDuplicates(bool on) {m_collapseDuplicates = on;}
|
||||
const string& getSortBy() const {return m_sortField;}
|
||||
bool getSortAscending() const {return m_sortAscending;}
|
||||
|
||||
@ -90,8 +91,9 @@ private:
|
||||
string m_reason; // Error explanation
|
||||
Db *m_db;
|
||||
void *m_sorter;
|
||||
string m_sortField;
|
||||
bool m_sortAscending;
|
||||
string m_sortField;
|
||||
bool m_sortAscending;
|
||||
bool m_collapseDuplicates;
|
||||
/* Copy constructor and assignment operator are private and forbidden */
|
||||
Query(const Query &) {}
|
||||
Query & operator=(const Query &) {return *this;};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user