From 02556e7d088ce630cde956759d0768c0d42fba71 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Thu, 25 Jun 2020 16:06:45 +0200
Subject: [PATCH] doc and comments

---
 src/doc/user/usermanual.html | 34 ++++++++++++-------
 src/doc/user/usermanual.xml  | 37 +++++++++++++--------
 src/rcldb/rcldb.cpp          |  9 ++---
 src/rcldb/rcldups.cpp        | 64 +++++++++++++-----------------------
 4 files changed, 73 insertions(+), 71 deletions(-)
diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html
index 78a7b56c..29c8d24e 100644
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@@ -3,7 +3,7 @@
 <html>
 <head>
   <meta name="generator" content=
-  "HTML Tidy for HTML5 for Linux version 5.6.0">
+  "HTML Tidy for HTML5 for Linux version 5.2.0">
   <meta http-equiv="Content-Type" content=
   "text/html; charset=utf-8">
   <title>Recoll user manual</title>
@@ -1135,8 +1135,8 @@ alink="#0000FF">
               different areas of the file system to different
               indexes. For example, if you were to issue the
               following command:</p>
-              <pre class=
-              "programlisting">recoll -c ~/.indexes-email</pre>
+              <pre class="programlisting">
+              recoll -c ~/.indexes-email</pre>
               <p>Then <span class="application">Recoll</span> would
               use configuration files stored in <code class=
               "filename">~/.indexes-email/</code> and, (unless
@@ -3874,8 +3874,8 @@ fs.inotify.max_user_watches=32768
           that every user does not have to do it. The variable
           should define a colon-separated list of index
           directories, ie:</p>
-          <pre class=
-          "screen">export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db</pre>
+          <pre class="screen">
+          export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db</pre>
           <p>Another environment variable, <code class=
           "envar">RECOLL_ACTIVE_EXTRA_DBS</code> allows adding to
           the active list of indexes. This variable was suggested
@@ -4677,8 +4677,8 @@ fs.inotify.max_user_watches=32768
               parent folder expansion, usually creating a file
               manager window on the folder where the container file
               resides. E.g.:</p>
-              <pre class=
-              "programlisting">&lt;a href="F%N"&gt;%P&lt;/a&gt;</pre>
+              <pre class="programlisting">
+              &lt;a href="F%N"&gt;%P&lt;/a&gt;</pre>
               <p>A link target defined as <code class=
               "literal">R%N|<em class=
               "replaceable"><code>scriptname</code></em></code>
@@ -4820,8 +4820,8 @@ fs.inotify.max_user_watches=32768
           <span class="application">javascript</span> program to
           the documents, like the following example, which would
           initiate a search by double-clicking any term:</p>
-          <pre class=
-          "programlisting">&lt;script language="JavaScript"&gt;
+          <pre class="programlisting">
+          &lt;script language="JavaScript"&gt;
         function recollsearch() {
         var t = document.getSelection();
         window.location.href = 'recoll://search/query?qtp=a&amp;p=0&amp;q=' +
@@ -5115,7 +5115,17 @@ text/html       [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
             <li class="listitem">
               <p><code class="literal">ext</code> specifies the
               file name extension (Ex: <code class=
-              "literal">ext:html</code>)</p>
+              "literal">ext:html</code>).</p>
+            </li>
+            <li class="listitem">
+              <p><code class="literal">rclmd5</code> is the MD5
+              checksum for the document. This is used for
+              displaying the duplicates of a search result (when
+              querying with the option to collapse duplicate
+              results). Incidentally, this could be used to find
+              the duplicates of any given file by computing its MD5
+              checksum and executing a query with just the
+              <code class="literal">rclmd5</code> value.</p>
             </li>
           </ul>
         </div>
@@ -10055,8 +10065,8 @@ for i in range(nres):
           "filename">.xml</code> extension but should be handled
           specially, which is possible because they are usually all
           located in one place. Example:</p>
-          <pre class=
-          "programlisting">[~/.kde/share/apps/okular/docdata]
+          <pre class="programlisting">
+          [~/.kde/share/apps/okular/docdata]
         .xml = application/x-okular-notes</pre>
           <p>The <code class="varname">recoll_noindex</code>
           <code class="filename">mimemap</code> variable has been
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index 66ccd5cb..fefe4b3f 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -3896,27 +3896,36 @@ text/html       [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
         name for an email attachment.</para></listitem> 
 
         <listitem><para><literal>containerfilename</literal>. This is
-        set for all documents, both top-level and contained
-        sub-documents, and is always the name of the filesystem directory
-        entry which contains the data. The terms from this field can
-        only be matched by an explicit field specification (as opposed
-        to terms from <literal>filename</literal> which are also indexed
-        as general document content). This avoids getting matches for
-        all the sub-documents when searching for the container file
-        name.</para></listitem> 
+            set for all documents, both top-level and contained
+            sub-documents, and is always the name of the filesystem directory
+            entry which contains the data. The terms from this field can
+            only be matched by an explicit field specification (as opposed
+            to terms from <literal>filename</literal> which are also indexed
+            as general document content). This avoids getting matches for
+            all the sub-documents when searching for the container file
+            name.</para></listitem> 
         
         <listitem><para><literal>ext</literal> specifies the file
-        name extension (Ex: <literal>ext:html</literal>)</para>
+            name extension
+            (Ex: <literal>ext:html</literal>).</para></listitem>
+
+        <listitem><para><literal>rclmd5</literal> is the MD5 checksum for the
+            document. This is used for displaying the duplicates of a
+            search result (when querying with the option to collapse
+            duplicate results). Incidentally, this could be used to find
+            the duplicates of any given file by computing its MD5 checksum
+            and executing a query with just the <literal>rclmd5</literal>
+            value.</para>
         </listitem>
 
       </itemizedlist>
 
       <para>&RCL; 1.20 and later have a way to specify aliases for the
-      field names, which will save typing, for example by aliasing
-      <literal>filename</literal> to <replaceable>fn</replaceable> or
-      <literal>containerfilename</literal> to
-      <replaceable>cfn</replaceable>. See the
-      <link linkend="RCL.INSTALL.CONFIG.FIELDS">section about the <filename>fields</filename> file</link>.
+        field names, which will save typing, for example by aliasing
+        <literal>filename</literal> to <replaceable>fn</replaceable> or
+        <literal>containerfilename</literal> to
+        <replaceable>cfn</replaceable>. See the
+        <link linkend="RCL.INSTALL.CONFIG.FIELDS">section about the <filename>fields</filename> file</link>.
       </para> 
 
       <para>The document input handlers used while indexing have the
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 6cedda3c..5a94c2f2 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1866,10 +1866,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
             RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
         }
     
-        // If the file's md5 was computed, add value and term. 
-        // The value is optionally used for query result duplicate elimination, 
-        // and the term to find the duplicates.
-        // We don't do this for empty docs.
+        // If the file's md5 was computed, add value and term.  The
+        // value is optionally used for query result duplicate
+        // elimination, and the term to find the duplicates (XM is the
+        // prefix for rclmd5 in fields). We don't do this for empty
+        // docs.
         const string *md5;
         if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
             md5->compare(cstr_md5empty)) {
diff --git a/src/rcldb/rcldups.cpp b/src/rcldb/rcldups.cpp
index 1a8e3780..06f5266b 100644
--- a/src/rcldb/rcldups.cpp
+++ b/src/rcldb/rcldups.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013 J.F.Dockes
+/* Copyright (C) 2013-2020 J.F.Dockes
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation; either version 2 of the License, or
@@ -35,36 +35,36 @@ using namespace std;
 namespace Rcl {
 
 /** Retrieve the dups of a given document. The input has to be a query result
-  * because we use the xdocid. We get the md5 from this, then the dups */
+ * because we use the xdocid. We get the md5 from this, then the dups */
 bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
 {
     if (m_ndb == 0) {
-    LOGERR("Db::docDups: no db\n" );
-    return false;
+        LOGERR("Db::docDups: no db\n");
+        return false;
     }
     if (idoc.xdocid == 0) {
-    LOGERR("Db::docDups: null xdocid in input doc\n" );
-    return false;
+        LOGERR("Db::docDups: null xdocid in input doc\n");
+        return false;
     }
     // Get the xapian doc
     Xapian::Document xdoc;
     XAPTRY(xdoc = m_ndb->xrdb.get_document(Xapian::docid(idoc.xdocid)), 
-       m_ndb->xrdb, m_reason);
+           m_ndb->xrdb, m_reason);
     if (!m_reason.empty()) {
-    LOGERR("Db::docDups: xapian error: "  << (m_reason) << "\n" );
-    return false;
+        LOGERR("Db::docDups: xapian error: " << m_reason << "\n");
+        return false;
     }
 
     // Get the md5
     string digest;
     XAPTRY(digest = xdoc.get_value(VALUE_MD5), m_ndb->xrdb, m_reason);
     if (!m_reason.empty()) {
-    LOGERR("Db::docDups: xapian error: "  << (m_reason) << "\n" );
-    return false;
+        LOGERR("Db::docDups: xapian error: " << m_reason << "\n");
+        return false;
     }
     if (digest.empty()) {
-    LOGDEB("Db::docDups: doc has no md5\n" );
-    return false;
+        LOGDEB("Db::docDups: doc has no md5\n");
+        return false;
     }
     string md5;
     MD5HexPrint(digest, md5);
@@ -72,45 +72,27 @@ bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
     SearchData *sdp = new SearchData();
     std::shared_ptr<SearchData> sd(sdp);
     SearchDataClauseSimple *sdc = 
-    new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
+        new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
     sdc->addModifier(SearchDataClause::SDCM_CASESENS);
     sdc->addModifier(SearchDataClause::SDCM_DIACSENS);
     sd->addClause(sdc);
     Query query(this);
     query.setCollapseDuplicates(0);
     if (!query.setQuery(sd)) {
-    LOGERR("Db::docDups: setQuery failed\n" );
-    return false;
+        LOGERR("Db::docDups: setQuery failed\n");
+        return false;
     }
     int cnt = query.getResCnt();
     for (int i = 0; i < cnt; i++) {
-    Doc doc;
-    if (!query.getDoc(i, doc)) {
-        LOGERR("Db::docDups: getDoc failed at "  << (i) << " (cnt "  << (cnt) << ")\n" );
-        return false;
-    }
-    odocs.push_back(doc);
+        Doc doc;
+        if (!query.getDoc(i, doc)) {
+            LOGERR("Db::docDups: getDoc failed at " << i << " (cnt " << cnt <<
+                   ")\n");
+            return false;
+        }
+        odocs.push_back(doc);
     }
     return true;
 }
 
-#if 0
-    {
-    vector<Doc> dups;
-    bool ret;
-    LOGDEB("DOCDUPS\n" );
-    ret = m_db->docDups(doc, dups);
-    if (!ret) {
-        LOGDEB("docDups failed\n" );
-    } else if (dups.size() == 1) {
-        LOGDEB("No dups\n" );
-    } else {
-        for (unsigned int i = 0; i < dups.size(); i++) {
-        LOGDEB("Dup: "  << (dups[i].url) << "\n" );
-        }
-    }
-    }
-#endif
-
 }
-