doc and comments

This commit is contained in:
Jean-Francois Dockes 2020-06-25 16:06:45 +02:00
parent 101a566dec
commit 02556e7d08
4 changed files with 73 additions and 71 deletions

View File

@ -3,7 +3,7 @@
<html>
<head>
<meta name="generator" content=
"HTML Tidy for HTML5 for Linux version 5.6.0">
"HTML Tidy for HTML5 for Linux version 5.2.0">
<meta http-equiv="Content-Type" content=
"text/html; charset=utf-8">
<title>Recoll user manual</title>
@ -1135,8 +1135,8 @@ alink="#0000FF">
different areas of the file system to different
indexes. For example, if you were to issue the
following command:</p>
<pre class=
"programlisting">recoll -c ~/.indexes-email</pre>
<pre class="programlisting">
recoll -c ~/.indexes-email</pre>
<p>Then <span class="application">Recoll</span> would
use configuration files stored in <code class=
"filename">~/.indexes-email/</code> and, (unless
@ -3874,8 +3874,8 @@ fs.inotify.max_user_watches=32768
that every user does not have to do it. The variable
should define a colon-separated list of index
directories, i.e.:</p>
<pre class=
"screen">export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db</pre>
<pre class="screen">
export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db</pre>
<p>Another environment variable, <code class=
"envar">RECOLL_ACTIVE_EXTRA_DBS</code> allows adding to
the active list of indexes. This variable was suggested
@ -4677,8 +4677,8 @@ fs.inotify.max_user_watches=32768
parent folder expansion, usually creating a file
manager window on the folder where the container file
resides. E.g.:</p>
<pre class=
"programlisting">&lt;a href="F%N"&gt;%P&lt;/a&gt;</pre>
<pre class="programlisting">
&lt;a href="F%N"&gt;%P&lt;/a&gt;</pre>
<p>A link target defined as <code class=
"literal">R%N|<em class=
"replaceable"><code>scriptname</code></em></code>
@ -4820,8 +4820,8 @@ fs.inotify.max_user_watches=32768
<span class="application">javascript</span> program to
the documents, like the following example, which would
initiate a search by double-clicking any term:</p>
<pre class=
"programlisting">&lt;script language="JavaScript"&gt;
<pre class="programlisting">
&lt;script language="JavaScript"&gt;
function recollsearch() {
var t = document.getSelection();
window.location.href = 'recoll://search/query?qtp=a&amp;p=0&amp;q=' +
@ -5115,7 +5115,17 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
<li class="listitem">
<p><code class="literal">ext</code> specifies the
file name extension (Ex: <code class=
"literal">ext:html</code>)</p>
"literal">ext:html</code>).</p>
</li>
<li class="listitem">
<p><code class="literal">rclmd5</code> is the MD5
checksum for the document. This is used for
displaying the duplicates of a search result (when
querying with the option to collapse duplicate
results). Incidentally, this could be used to find
the duplicates of any given file by computing its MD5
checksum and executing a query with just the
<code class="literal">rclmd5</code> value.</p>
</li>
</ul>
</div>
@ -10055,8 +10065,8 @@ for i in range(nres):
"filename">.xml</code> extension but should be handled
specially, which is possible because they are usually all
located in one place. Example:</p>
<pre class=
"programlisting">[~/.kde/share/apps/okular/docdata]
<pre class="programlisting">
[~/.kde/share/apps/okular/docdata]
.xml = application/x-okular-notes</pre>
<p>The <code class="varname">recoll_noindex</code>
<code class="filename">mimemap</code> variable has been

View File

@ -3896,27 +3896,36 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
name for an email attachment.</para></listitem>
<listitem><para><literal>containerfilename</literal>. This is
set for all documents, both top-level and contained
sub-documents, and is always the name of the filesystem directory
entry which contains the data. The terms from this field can
only be matched by an explicit field specification (as opposed
to terms from <literal>filename</literal> which are also indexed
as general document content). This avoids getting matches for
all the sub-documents when searching for the container file
name.</para></listitem>
set for all documents, both top-level and contained
sub-documents, and is always the name of the filesystem directory
entry which contains the data. The terms from this field can
only be matched by an explicit field specification (as opposed
to terms from <literal>filename</literal> which are also indexed
as general document content). This avoids getting matches for
all the sub-documents when searching for the container file
name.</para></listitem>
<listitem><para><literal>ext</literal> specifies the file
name extension (Ex: <literal>ext:html</literal>)</para>
name extension
(Ex: <literal>ext:html</literal>).</para></listitem>
<listitem><para><literal>rclmd5</literal> is the MD5 checksum for the
document. This is used for displaying the duplicates of a
search result (when querying with the option to collapse
duplicate results). Incidentally, this could be used to find
the duplicates of any given file by computing its MD5 checksum
and executing a query with just the <literal>rclmd5</literal>
value.</para>
</listitem>
</itemizedlist>
<para>&RCL; 1.20 and later have a way to specify aliases for the
field names, which will save typing, for example by aliasing
<literal>filename</literal> to <replaceable>fn</replaceable> or
<literal>containerfilename</literal> to
<replaceable>cfn</replaceable>. See the
<link linkend="RCL.INSTALL.CONFIG.FIELDS">section about the <filename>fields</filename> file</link>.
field names, which will save typing, for example by aliasing
<literal>filename</literal> to <replaceable>fn</replaceable> or
<literal>containerfilename</literal> to
<replaceable>cfn</replaceable>. See the
<link linkend="RCL.INSTALL.CONFIG.FIELDS">section about the <filename>fields</filename> file</link>.
</para>
<para>The document input handlers used while indexing have the

View File

@ -1866,10 +1866,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
}
// If the file's md5 was computed, add value and term.
// The value is optionally used for query result duplicate elimination,
// and the term to find the duplicates.
// We don't do this for empty docs.
// If the file's md5 was computed, add value and term. The
// value is optionally used for query result duplicate
// elimination, and the term to find the duplicates (XM is the
// prefix for rclmd5 in fields). We don't do this for empty
// docs.
const string *md5;
if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
md5->compare(cstr_md5empty)) {

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2013 J.F.Dockes
/* Copyright (C) 2013-2020 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -35,36 +35,36 @@ using namespace std;
namespace Rcl {
/** Retrieve the dups of a given document. The input has to be a query result
* because we use the xdocid. We get the md5 from this, then the dups */
* because we use the xdocid. We get the md5 from this, then the dups */
bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
{
if (m_ndb == 0) {
LOGERR("Db::docDups: no db\n" );
return false;
LOGERR("Db::docDups: no db\n");
return false;
}
if (idoc.xdocid == 0) {
LOGERR("Db::docDups: null xdocid in input doc\n" );
return false;
LOGERR("Db::docDups: null xdocid in input doc\n");
return false;
}
// Get the xapian doc
Xapian::Document xdoc;
XAPTRY(xdoc = m_ndb->xrdb.get_document(Xapian::docid(idoc.xdocid)),
m_ndb->xrdb, m_reason);
m_ndb->xrdb, m_reason);
if (!m_reason.empty()) {
LOGERR("Db::docDups: xapian error: " << (m_reason) << "\n" );
return false;
LOGERR("Db::docDups: xapian error: " << m_reason << "\n");
return false;
}
// Get the md5
string digest;
XAPTRY(digest = xdoc.get_value(VALUE_MD5), m_ndb->xrdb, m_reason);
if (!m_reason.empty()) {
LOGERR("Db::docDups: xapian error: " << (m_reason) << "\n" );
return false;
LOGERR("Db::docDups: xapian error: " << m_reason << "\n");
return false;
}
if (digest.empty()) {
LOGDEB("Db::docDups: doc has no md5\n" );
return false;
LOGDEB("Db::docDups: doc has no md5\n");
return false;
}
string md5;
MD5HexPrint(digest, md5);
@ -72,45 +72,27 @@ bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
SearchData *sdp = new SearchData();
std::shared_ptr<SearchData> sd(sdp);
SearchDataClauseSimple *sdc =
new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
sdc->addModifier(SearchDataClause::SDCM_CASESENS);
sdc->addModifier(SearchDataClause::SDCM_DIACSENS);
sd->addClause(sdc);
Query query(this);
query.setCollapseDuplicates(0);
if (!query.setQuery(sd)) {
LOGERR("Db::docDups: setQuery failed\n" );
return false;
LOGERR("Db::docDups: setQuery failed\n");
return false;
}
int cnt = query.getResCnt();
for (int i = 0; i < cnt; i++) {
Doc doc;
if (!query.getDoc(i, doc)) {
LOGERR("Db::docDups: getDoc failed at " << (i) << " (cnt " << (cnt) << ")\n" );
return false;
}
odocs.push_back(doc);
Doc doc;
if (!query.getDoc(i, doc)) {
LOGERR("Db::docDups: getDoc failed at " << i << " (cnt " << cnt <<
")\n");
return false;
}
odocs.push_back(doc);
}
return true;
}
#if 0
{
vector<Doc> dups;
bool ret;
LOGDEB("DOCDUPS\n" );
ret = m_db->docDups(doc, dups);
if (!ret) {
LOGDEB("docDups failed\n" );
} else if (dups.size() == 1) {
LOGDEB("No dups\n" );
} else {
for (unsigned int i = 0; i < dups.size(); i++) {
LOGDEB("Dup: " << (dups[i].url) << "\n" );
}
}
}
#endif
}