doc and comments

2020-06-25 16:06:45 +02:00 · 2020-06-25 16:06:45 +02:00 · 02556e7d08
commit 02556e7d08
parent 101a566dec
4 changed files with 73 additions and 71 deletions
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@ -3,7 +3,7 @@
 <html>
 <head>
  <meta name="generator" content=
-  "HTML Tidy for HTML5 for Linux version 5.6.0">
+  "HTML Tidy for HTML5 for Linux version 5.2.0">
  <meta http-equiv="Content-Type" content=
  "text/html; charset=utf-8">
  <title>Recoll user manual</title>
@ -1135,8 +1135,8 @@ alink="#0000FF">
              different areas of the file system to different
              indexes. For example, if you were to issue the
              following command:</p>
-              <pre class=
-              "programlisting">recoll -c ~/.indexes-email</pre>
+              <pre class="programlisting">
+              recoll -c ~/.indexes-email</pre>
              <p>Then <span class="application">Recoll</span> would
              use configuration files stored in <code class=
              "filename">~/.indexes-email/</code> and, (unless
@ -3874,8 +3874,8 @@ fs.inotify.max_user_watches=32768
          that every user does not have to do it. The variable
          should define a colon-separated list of index
          directories, ie:</p>
-          <pre class=
-          "screen">export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db</pre>
+          <pre class="screen">
+          export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db</pre>
          <p>Another environment variable, <code class=
          "envar">RECOLL_ACTIVE_EXTRA_DBS</code> allows adding to
          the active list of indexes. This variable was suggested
@ -4677,8 +4677,8 @@ fs.inotify.max_user_watches=32768
              parent folder expansion, usually creating a file
              manager window on the folder where the container file
              resides. E.g.:</p>
-              <pre class=
-              "programlisting">&lt;a href="F%N"&gt;%P&lt;/a&gt;</pre>
+              <pre class="programlisting">
+              &lt;a href="F%N"&gt;%P&lt;/a&gt;</pre>
              <p>A link target defined as <code class=
              "literal">R%N|<em class=
              "replaceable"><code>scriptname</code></em></code>
@ -4820,8 +4820,8 @@ fs.inotify.max_user_watches=32768
          <span class="application">javascript</span> program to
          the documents, like the following example, which would
          initiate a search by double-clicking any term:</p>
-          <pre class=
-          "programlisting">&lt;script language="JavaScript"&gt;
+          <pre class="programlisting">
+          &lt;script language="JavaScript"&gt;
        function recollsearch() {
        var t = document.getSelection();
        window.location.href = 'recoll://search/query?qtp=a&amp;p=0&amp;q=' +
@ -5115,7 +5115,17 @@ text/html       [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
            <li class="listitem">
              <p><code class="literal">ext</code> specifies the
              file name extension (Ex: <code class=
-              "literal">ext:html</code>)</p>
+              "literal">ext:html</code>).</p>
+            </li>
+            <li class="listitem">
+              <p><code class="literal">rclmd5</code> is the MD5
+              checksum for the document. This is used for
+              displaying the duplicates of a search result (when
+              querying with the option to collapse duplicate
+              results). Incidentally, this could be used to find
+              the duplicates of any given file by computing its MD5
+              checksum and executing a query with just the
+              <code class="literal">rclmd5</code> value.</p>
            </li>
          </ul>
        </div>
@ -10055,8 +10065,8 @@ for i in range(nres):
          "filename">.xml</code> extension but should be handled
          specially, which is possible because they are usually all
          located in one place. Example:</p>
-          <pre class=
-          "programlisting">[~/.kde/share/apps/okular/docdata]
+          <pre class="programlisting">
+          [~/.kde/share/apps/okular/docdata]
        .xml = application/x-okular-notes</pre>
          <p>The <code class="varname">recoll_noindex</code>
          <code class="filename">mimemap</code> variable has been
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@ -3896,27 +3896,36 @@ text/html       [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
        name for an email attachment.</para></listitem> 

        <listitem><para><literal>containerfilename</literal>. This is
-        set for all documents, both top-level and contained
-        sub-documents, and is always the name of the filesystem directory
-        entry which contains the data. The terms from this field can
-        only be matched by an explicit field specification (as opposed
-        to terms from <literal>filename</literal> which are also indexed
-        as general document content). This avoids getting matches for
-        all the sub-documents when searching for the container file
-        name.</para></listitem> 
+            set for all documents, both top-level and contained
+            sub-documents, and is always the name of the filesystem directory
+            entry which contains the data. The terms from this field can
+            only be matched by an explicit field specification (as opposed
+            to terms from <literal>filename</literal> which are also indexed
+            as general document content). This avoids getting matches for
+            all the sub-documents when searching for the container file
+            name.</para></listitem> 
        
        <listitem><para><literal>ext</literal> specifies the file
-        name extension (Ex: <literal>ext:html</literal>)</para>
+            name extension
+            (Ex: <literal>ext:html</literal>).</para></listitem>
+
+        <listitem><para><literal>rclmd5</literal> is the MD5 checksum for the
+            document. This is used for displaying the duplicates of a
+            search result (when querying with the option to collapse
+            duplicate results). Incidentally, this could be used to find
+            the duplicates of any given file by computing its MD5 checksum
+            and executing a query with just the <literal>rclmd5</literal>
+            value.</para>
        </listitem>

      </itemizedlist>

      <para>&RCL; 1.20 and later have a way to specify aliases for the
-      field names, which will save typing, for example by aliasing
-      <literal>filename</literal> to <replaceable>fn</replaceable> or
-      <literal>containerfilename</literal> to
-      <replaceable>cfn</replaceable>. See the
-      <link linkend="RCL.INSTALL.CONFIG.FIELDS">section about the <filename>fields</filename> file</link>.
+        field names, which will save typing, for example by aliasing
+        <literal>filename</literal> to <replaceable>fn</replaceable> or
+        <literal>containerfilename</literal> to
+        <replaceable>cfn</replaceable>. See the
+        <link linkend="RCL.INSTALL.CONFIG.FIELDS">section about the <filename>fields</filename> file</link>.
      </para> 

      <para>The document input handlers used while indexing have the
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1866,10 +1866,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
            RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
        }
    
-        // If the file's md5 was computed, add value and term. 
-        // The value is optionally used for query result duplicate elimination, 
-        // and the term to find the duplicates.
-        // We don't do this for empty docs.
+        // If the file's md5 was computed, add value and term.  The
+        // value is optionally used for query result duplicate
+        // elimination, and the term to find the duplicates (XM is the
+        // prefix for rclmd5 in fields). We don't do this for empty
+        // docs.
        const string *md5;
        if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
            md5->compare(cstr_md5empty)) {
--- a/src/rcldb/rcldups.cpp
+++ b/src/rcldb/rcldups.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2013 J.F.Dockes
+/* Copyright (C) 2013-2020 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -35,36 +35,36 @@ using namespace std;
 namespace Rcl {

 /** Retrieve the dups of a given document. The input has to be a query result
-  * because we use the xdocid. We get the md5 from this, then the dups */
+ * because we use the xdocid. We get the md5 from this, then the dups */
 bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
 {
    if (m_ndb == 0) {
-    LOGERR("Db::docDups: no db\n" );
-    return false;
+        LOGERR("Db::docDups: no db\n");
+        return false;
    }
    if (idoc.xdocid == 0) {
-    LOGERR("Db::docDups: null xdocid in input doc\n" );
-    return false;
+        LOGERR("Db::docDups: null xdocid in input doc\n");
+        return false;
    }
    // Get the xapian doc
    Xapian::Document xdoc;
    XAPTRY(xdoc = m_ndb->xrdb.get_document(Xapian::docid(idoc.xdocid)), 
-       m_ndb->xrdb, m_reason);
+           m_ndb->xrdb, m_reason);
    if (!m_reason.empty()) {
-    LOGERR("Db::docDups: xapian error: "  << (m_reason) << "\n" );
-    return false;
+        LOGERR("Db::docDups: xapian error: " << m_reason << "\n");
+        return false;
    }

    // Get the md5
    string digest;
    XAPTRY(digest = xdoc.get_value(VALUE_MD5), m_ndb->xrdb, m_reason);
    if (!m_reason.empty()) {
-    LOGERR("Db::docDups: xapian error: "  << (m_reason) << "\n" );
-    return false;
+        LOGERR("Db::docDups: xapian error: " << m_reason << "\n");
+        return false;
    }
    if (digest.empty()) {
-    LOGDEB("Db::docDups: doc has no md5\n" );
-    return false;
+        LOGDEB("Db::docDups: doc has no md5\n");
+        return false;
    }
    string md5;
    MD5HexPrint(digest, md5);
@ -72,45 +72,27 @@ bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
    SearchData *sdp = new SearchData();
    std::shared_ptr<SearchData> sd(sdp);
    SearchDataClauseSimple *sdc = 
-    new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
+        new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
    sdc->addModifier(SearchDataClause::SDCM_CASESENS);
    sdc->addModifier(SearchDataClause::SDCM_DIACSENS);
    sd->addClause(sdc);
    Query query(this);
    query.setCollapseDuplicates(0);
    if (!query.setQuery(sd)) {
-    LOGERR("Db::docDups: setQuery failed\n" );
-    return false;
+        LOGERR("Db::docDups: setQuery failed\n");
+        return false;
    }
    int cnt = query.getResCnt();
    for (int i = 0; i < cnt; i++) {
-    Doc doc;
-    if (!query.getDoc(i, doc)) {
-        LOGERR("Db::docDups: getDoc failed at "  << (i) << " (cnt "  << (cnt) << ")\n" );
-        return false;
-    }
-    odocs.push_back(doc);
+        Doc doc;
+        if (!query.getDoc(i, doc)) {
+            LOGERR("Db::docDups: getDoc failed at " << i << " (cnt " << cnt <<
+                   ")\n");
+            return false;
+        }
+        odocs.push_back(doc);
    }
    return true;
 }

-#if 0
-    {
-    vector<Doc> dups;
-    bool ret;
-    LOGDEB("DOCDUPS\n" );
-    ret = m_db->docDups(doc, dups);
-    if (!ret) {
-        LOGDEB("docDups failed\n" );
-    } else if (dups.size() == 1) {
-        LOGDEB("No dups\n" );
-    } else {
-        for (unsigned int i = 0; i < dups.size(); i++) {
-        LOGDEB("Dup: "  << (dups[i].url) << "\n" );
-        }
-    }
-    }
-#endif
-
 }
-