Added a configuration parameter to define specific unaccenting/lowercasing rules for characters which should be handled differently from what the Unicode database prescribes. Example: "a with ring above" can be preserved unchanged for Swedish speakers

2012-04-09 12:42:23 +02:00 · 2012-04-09 12:42:23 +02:00 · a4c17941b1
commit a4c17941b1
parent 29ef5bd143
14 changed files with 365 additions and 23 deletions
--- a/src/common/rclinit.cpp
+++ b/src/common/rclinit.cpp
@ -107,9 +107,15 @@ RclConfig *recollinit(RclInitFlags flags,
    // Make sure the locale charset is initialized (so that multiple
    // threads don't try to do it at once).
    config->getDefCharset();
+
    // Init unac locking
    unac_init_mt();

+    // Init Unac translation exceptions
+    string unacex;
+    if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty()) 
+	unac_set_except_translations(unacex.c_str());
+
    int flushmb;
    if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
 	LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@ -88,6 +88,7 @@ using namespace std;

 #include "unacpp.h"
 #include "readfile.h"
+#include "rclinit.h"

 int main(int argc, char **argv)
 {
@ -98,8 +99,13 @@ int main(int argc, char **argv)
    }
    const char *encoding = argv[1];
    string ifn = argv[2];
+    if (!ifn.compare("stdin"))
+	ifn.clear();
    const char *ofn = argv[3];

+    string reason;
+    (void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
+
    string odata;
    if (!file_to_string(ifn, odata)) {
 	cerr << "file_to_string: " << odata << endl;
@ -111,7 +117,12 @@ int main(int argc, char **argv)
 	exit(1);
    }
    
-    int fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
+    int fd;
+    if (strcmp(ofn, "stdout")) {
+	fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
+    } else {
+	fd = 1;
+    }
    if (fd < 0) {
 	cerr << "Open/Create " << ofn << " failed: " << strerror(errno) 
 	     << endl;
--- a/src/doc/user/usermanual.sgml
+++ b/src/doc/user/usermanual.sgml
@ -34,6 +34,9 @@
      <para>This document introduces full text search notions
      and describes the installation and use of the &RCL;
      application. It currently describes &RCL; &RCLVERSION;.</para>
+<!--      <para>[ <ulink url="index.html">Split HTML</ulink> / 
+             <ulink url="usermanual-xml.html">Single HTML</ulink> ]</para>
+-->
    </abstract>


@ -3849,6 +3852,32 @@ skippedPaths = ~/somedir/&lowast;.txt
 	   </listitem>
         </varlistentry>

+          <varlistentry><term><literal>unac_except_trans</literal></term>
+            <listitem><para>This is a list of characters which should be
+            handled specially when converting text to unaccented lowercase.
+            For example, in Swedish, the letter <literal>a with diaeresis
+            </literal> has full alphabet citizenship and should not be
+            turned into an <literal>a</literal>. Each element in the
+            space-separated list has the special character as first element
+            and the translation following. The handling of both the
+            lowercase and uppercase versions of a character should be
+            specified, as membership in the list turns off both the
+            standard accent and case processing. Example for Swedish:</para>
+                <programlisting>
+unac_except_trans =  åå Åå ää Ää öö Öö
+            </programlisting>
+
+            <para>Note that the translation is not limited to a single
+            character, you could very well have something like
+            <literal>üue</literal> in the list.</para>
+
+             <para>This parameter can't be defined for subdirectories, it
+             is global, because there is no way to do otherwise when
+             querying. If you have document sets which would need different
+             values, you will have to index and query them separately.</para> 
+              </listitem>
+            </varlistentry>
+
          <varlistentry><term><literal>maildefcharset</literal></term>
            <listitem><para>This can be used to define the default
 		character set specifically for email messages which don't
--- a/src/doc/user/xmlmake.sh
+++ b/src/doc/user/xmlmake.sh
@ -14,6 +14,9 @@
 # Wherever docbook.xsl and chunk.xsl live
 XSLDIR="/usr/local/share/xsl/docbook/"

+dochunky=1
+test $# -eq 1 && dochunky=0
+
 # Remove the SGML header and uncomment the XML one
 sed -e '\!//FreeBSD//DTD!d' \
    -e '\!DTD DocBook XML!s/<!--//' \
@ -31,7 +34,7 @@ commonoptions="--stringparam section.autolabel 1 \
 "

 # Do the chunky thing
-eval xsltproc $commonoptions \
+test $dochunky -ne 0 && eval xsltproc $commonoptions \
    --stringparam use.id.as.filename 1 \
    "$XSLDIR/html/chunk.xsl" \
    usermanual.xml
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -14,7 +14,7 @@ librcl.a : $(DEPS) $(OBJS) unac.o
 	$(RANLIB) librcl.a

 unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $(depth)/mk/localdefs
-	$(CC) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
+	$(CXX) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
 rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
 	$(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp
 beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs
--- a/src/lib/mkMake
+++ b/src/lib/mkMake
@ -121,7 +121,7 @@ librcl.a : \$(DEPS) \$(OBJS) unac.o
 	\$(RANLIB) librcl.a

 unac.o : \$(depth)/unac/unac.c \$(depth)/unac/unac.h $defs
-	\$(CC) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
+	\$(CXX) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
 EOF

 for c in $SRC_CPP;do
--- a/src/unac/unac.c
+++ b/src/unac/unac.c
@ -17,15 +17,57 @@
 */

 #ifdef HAVE_CONFIG_H
+#ifdef RECOLL_DATADIR
+#include "autoconfig.h"
+#else
 #include "config.h"
+#endif /* RECOLL */
 #endif /* HAVE_CONFIG_H */

+#ifdef RECOLL_DATADIR
+/* Yes, recoll unac is actually c++, lets face modernity, I will not be
+   caught writing another binary search  */
+#include <vector>
+#include <map>
+#include <string>
+#include <algorithm>
+using std::string;
+using std::vector;
+using std::map;
+#include "smallut.h"
+
+/* 
+   Storage for the exception translations. These are chars which
+   should not be translated according to what UnicodeData says, but
+   instead according to some local rule. There will usually be very
+   few of them, but they must be looked up for every translated char.
+   
+   We use a sorted vector for fastest elimination by binary search and
+   a vector<string> to store the translations
+ */
+static vector<unsigned short> except_chars;
+static vector<string> except_trans;
+static inline size_t is_except_char(unsigned short c)
+{
+    /* Binary search in the sorted exception list; size_t(-1) means absent */
+    vector<unsigned short>::iterator pos = 
+	std::lower_bound(except_chars.begin(), except_chars.end(), c);
+    if (pos != except_chars.end() && *pos == c)
+	return std::distance(except_chars.begin(), pos);
+    return (size_t(-1));
+}
+#endif /* RECOLL_DATADIR */
+
 /*
 * If configure.in has not defined this symbol, assume const. It
 * does not harm much: a warning will be issued during compilation.
 */
 #ifndef ICONV_CONST
+#ifdef RCL_ICONV_INBUF_CONST
+#define ICONV_CONST const
+#else
 #define ICONV_CONST
+#endif
 #endif /* ICONV_CONST */

 #include <stdlib.h>
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
  char* out;
  int out_size;
  int out_length;
-  int i;
+  unsigned int i;

  out_size = in_length > 0 ? in_length : 1024;

  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char*)realloc(out, out_size + 1);
  if(out == 0) {
      if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size+1);
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
    /*
     * Lookup the tables for decomposition information
     */
-    if (dofold) {
-	unacfold_char_utf16(c, p, l);
+#ifdef RECOLL_DATADIR
+    size_t idx;
+    if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
+	p = (unsigned short *)(except_trans[idx].c_str() + 2);
+	l = (except_trans[idx].size() - 2) / 2;
+	/* unsigned char *cp = (unsigned char *)p;
+	   fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0], 
+	   (unsigned int)cp[1]);*/
    } else {
-	unac_char_utf16(c, p, l);
+#endif /* RECOLL_DATADIR */
+	if (dofold) {
+	    unacfold_char_utf16(c, p, l);
+	} else {
+	    unac_char_utf16(c, p, l);
+	}
+#ifdef RECOLL_DATADIR
    }
+#endif /* RECOLL_DATADIR */
+
    /*
     * Explain what's done in great detail
     */
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
      char *saved;
      out_size += ((l + 1) * 2) + 1024;
      saved = out;
-      out = realloc(out, out_size);
+      out = (char *)realloc(out, out_size);
      if(out == 0) {
 	if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size);
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
  out_size = in_length > 0 ? in_length : 1024;

  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char *)realloc(out, out_size + 1);
  if(out == 0) {
      /* *outp still valid, no freeing */
      if(debug_level >= UNAC_DEBUG_LOW)
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
 	  {
 	      char *saved = out_base;
 	      /* +1 for null */
-	      out_base = realloc(out_base, out_size + 1);
+	      out_base = (char *)realloc(out_base, out_size + 1);
 	      if (out_base == 0) {
 		  /* *outp potentially not valid any more. Free here,
 		   * and zero out */
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
   */
  if (in_length <= 0) {
      if(!*outp) {
-	  if ((*outp = malloc(32)) == 0)
+	  if ((*outp = (char*)malloc(32)) == 0)
 	      return -1;
      }
      (*outp)[0] = '\0';
@ -12975,3 +13031,64 @@ const char* unac_version(void)
  return UNAC_VERSION;
 }

+#ifdef RECOLL_DATADIR
+/* (Re)build the exception tables from spectrans, a blank-separated list of
+ * UTF-8 strings, each made of the exception character followed by its
+ * translation. A null or empty spectrans just clears the tables. */
+void unac_set_except_translations(const char *spectrans)
+{
+    except_chars.clear();
+    except_trans.clear();
+    if (!spectrans || !spectrans[0])
+	return;
+
+    // The translation tables out of Unicode are in machine byte order (we
+    // just let the compiler read the values), so the translations are
+    // stored as UTF-16BE or UTF-16LE depending on the processor.
+    // The byte order is probed on every call: caching only the encoding
+    // name in a static left the flag wrong on big-endian after the 1st call.
+    const unsigned short probe = 0x0102;
+    const bool littleendian = *(const unsigned char *)&probe == 0x02;
+    const char *machinecoding = littleendian ? "UTF-16LE" : "UTF-16BE";
+
+    vector<string> vtrans;
+    stringToStrings(spectrans, vtrans);
+
+    // Keep each char paired with its translation while sorting. Sorting the
+    // two vectors separately is wrong: on little-endian the strings order by
+    // low byte first, which differs from the numeric order of the chars.
+    vector<std::pair<unsigned short, string> > entries;
+    for (vector<string>::iterator it = vtrans.begin();
+	 it != vtrans.end(); it++) {
+	/* Convert the whole thing to utf-16be/le according to endianness */
+	char *out = 0;
+	size_t outsize;
+	if (convert("UTF-8", machinecoding,
+		    it->c_str(), it->size(),
+		    &out, &outsize) != 0)
+	    continue;
+	if (outsize < 2) {
+	    /* Nothing usable was produced, but don't leak the buffer */
+	    free(out);
+	    continue;
+	}
+
+	/* The source char is compared to values decoded from the UTF-16BE
+	   input text, so rebuild its numeric value from the two bytes.
+	   Go through unsigned char to avoid shifting a negative value. */
+	const unsigned char b0 = (unsigned char)out[0];
+	const unsigned char b1 = (unsigned char)out[1];
+	const unsigned short ch = littleendian ?
+	    (unsigned short)((b1 << 8) | b0) : (unsigned short)((b0 << 8) | b1);
+
+	// The char stays as the first 2 bytes of the translation (lookup skips them)
+	entries.push_back(std::make_pair(ch, string((const char *)out, outsize)));
+	free(out);
+    }
+    std::sort(entries.begin(), entries.end());
+    for (size_t i = 0; i < entries.size(); i++) {
+	except_chars.push_back(entries[i].first);
+	except_trans.push_back(entries[i].second);
+    }
+}
+#endif /* RECOLL_DATADIR */
--- a/src/unac/unac.h
+++ b/src/unac/unac.h
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
 /* To be called before starting threads in mt programs */
 void unac_init_mt();

+#ifdef RECOLL_DATADIR
+#include <string>
+/** 
+ * Set exceptions for unaccenting, for characters which should not be
+ * handled according to what the Unicode tables say. For example "a
+ * with circle above" should not be stripped to a in swedish, etc.
+ * 
+ * @param spectrans defines the translations as a blank-separated list of
+ *  UTF-8 strings. Inside each string, the first character is the exception
+ *  and the rest is its translation (which may be empty). You can use double
+ *  quotes for translations which should include white space. As a
+ *  consequence, the double-quote itself can't be an exception character.
+ */
+void unac_set_except_translations(const char *spectrans);
+#endif /* RECOLL_DATADIR */
+
 /*
 * Return unac version number.
 */
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@ -15,6 +15,8 @@ daemSkippedPaths =  \

 defaultcharset = iso-8859-1

+unac_except_trans = åå Åå ää Ää öö Öö
+
 [/home/dockes/projets/fulltext/testrecoll/special]
 defaultcharset = iso-8859-1
 [/home/dockes/projets/fulltext/testrecoll/txt]
--- a/tests/lyx/lyx.sh
+++ b/tests/lyx/lyx.sh
@ -8,7 +8,9 @@ initvariables $0
 (
 recollq 'Bienvenue Dans Univers De Lyx' 
 recollq 'Welcome To Lyx' 
-recollq 'Udvozli Ont A LyX' 
+recollq 'LyX rendkivul jol dokumentalt'
+# can't use this one because o-diaeresis is in the Swedish unacex
+# recollq 'Udvozli Ont A LyX' 
 ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
--- a/tests/unacex/unacex.sh
+++ b/tests/unacex/unacex.sh
@ -0,0 +1,20 @@
+#!/bin/sh
+
+topdir=`dirname $0`/..
+. $topdir/shared.sh
+
+initvariables $0
+
+# We need a UTF-8 locale for the 1st command to properly read its argument
+export LANG=fr_FR.UTF-8
+
+(
+# Should succeed
+recollq '"strippes: UNACEXååääöö"' 
+# Should fail
+recollq '"strippes: UNACEXaaaaoo"'
+)  2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
+
+diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
+
+checkresult
--- a/tests/unacex/unacex.txt
+++ b/tests/unacex/unacex.txt
@ -0,0 +1,3 @@
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/unacex/swedishchars.html]	[Some chars]	293	bytes	
+0 results
--- a/unac/unac.c
+++ b/unac/unac.c
@ -17,15 +17,57 @@
 */

 #ifdef HAVE_CONFIG_H
+#ifdef RECOLL_DATADIR
+#include "autoconfig.h"
+#else
 #include "config.h"
+#endif /* RECOLL */
 #endif /* HAVE_CONFIG_H */

+#ifdef RECOLL_DATADIR
+/* Yes, recoll unac is actually c++, lets face modernity, I will not be
+   caught writing another binary search  */
+#include <vector>
+#include <map>
+#include <string>
+#include <algorithm>
+using std::string;
+using std::vector;
+using std::map;
+#include "smallut.h"
+
+/* 
+   Storage for the exception translations. These are chars which
+   should not be translated according to what UnicodeData says, but
+   instead according to some local rule. There will usually be very
+   few of them, but they must be looked up for every translated char.
+   
+   We use a sorted vector for fastest elimination by binary search and
+   a vector<string> to store the translations
+ */
+static vector<unsigned short> except_chars;
+static vector<string> except_trans;
+static inline size_t is_except_char(unsigned short c)
+{
+    /* Binary search in the sorted exception list; size_t(-1) means absent */
+    vector<unsigned short>::iterator pos = 
+	std::lower_bound(except_chars.begin(), except_chars.end(), c);
+    if (pos != except_chars.end() && *pos == c)
+	return std::distance(except_chars.begin(), pos);
+    return (size_t(-1));
+}
+#endif /* RECOLL_DATADIR */
+
 /*
 * If configure.in has not defined this symbol, assume const. It
 * does not harm much: a warning will be issued during compilation.
 */
 #ifndef ICONV_CONST
+#ifdef RCL_ICONV_INBUF_CONST
+#define ICONV_CONST const
+#else
 #define ICONV_CONST
+#endif
 #endif /* ICONV_CONST */

 #include <stdlib.h>
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
  char* out;
  int out_size;
  int out_length;
-  int i;
+  unsigned int i;

  out_size = in_length > 0 ? in_length : 1024;

  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char*)realloc(out, out_size + 1);
  if(out == 0) {
      if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size+1);
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
    /*
     * Lookup the tables for decomposition information
     */
-    if (dofold) {
-	unacfold_char_utf16(c, p, l);
+#ifdef RECOLL_DATADIR
+    size_t idx;
+    if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
+	p = (unsigned short *)(except_trans[idx].c_str() + 2);
+	l = (except_trans[idx].size() - 2) / 2;
+	/* unsigned char *cp = (unsigned char *)p;
+	   fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0], 
+	   (unsigned int)cp[1]);*/
    } else {
-	unac_char_utf16(c, p, l);
+#endif /* RECOLL_DATADIR */
+	if (dofold) {
+	    unacfold_char_utf16(c, p, l);
+	} else {
+	    unac_char_utf16(c, p, l);
+	}
+#ifdef RECOLL_DATADIR
    }
+#endif /* RECOLL_DATADIR */
+
    /*
     * Explain what's done in great detail
     */
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
      char *saved;
      out_size += ((l + 1) * 2) + 1024;
      saved = out;
-      out = realloc(out, out_size);
+      out = (char *)realloc(out, out_size);
      if(out == 0) {
 	if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size);
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
  out_size = in_length > 0 ? in_length : 1024;

  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char *)realloc(out, out_size + 1);
  if(out == 0) {
      /* *outp still valid, no freeing */
      if(debug_level >= UNAC_DEBUG_LOW)
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
 	  {
 	      char *saved = out_base;
 	      /* +1 for null */
-	      out_base = realloc(out_base, out_size + 1);
+	      out_base = (char *)realloc(out_base, out_size + 1);
 	      if (out_base == 0) {
 		  /* *outp potentially not valid any more. Free here,
 		   * and zero out */
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
   */
  if (in_length <= 0) {
      if(!*outp) {
-	  if ((*outp = malloc(32)) == 0)
+	  if ((*outp = (char*)malloc(32)) == 0)
 	      return -1;
      }
      (*outp)[0] = '\0';
@ -12975,3 +13031,64 @@ const char* unac_version(void)
  return UNAC_VERSION;
 }

+#ifdef RECOLL_DATADIR
+/* (Re)build the exception tables from spectrans, a blank-separated list of
+ * UTF-8 strings, each made of the exception character followed by its
+ * translation. A null or empty spectrans just clears the tables. */
+void unac_set_except_translations(const char *spectrans)
+{
+    except_chars.clear();
+    except_trans.clear();
+    if (!spectrans || !spectrans[0])
+	return;
+
+    // The translation tables out of Unicode are in machine byte order (we
+    // just let the compiler read the values), so the translations are
+    // stored as UTF-16BE or UTF-16LE depending on the processor.
+    // The byte order is probed on every call: caching only the encoding
+    // name in a static left the flag wrong on big-endian after the 1st call.
+    const unsigned short probe = 0x0102;
+    const bool littleendian = *(const unsigned char *)&probe == 0x02;
+    const char *machinecoding = littleendian ? "UTF-16LE" : "UTF-16BE";
+
+    vector<string> vtrans;
+    stringToStrings(spectrans, vtrans);
+
+    // Keep each char paired with its translation while sorting. Sorting the
+    // two vectors separately is wrong: on little-endian the strings order by
+    // low byte first, which differs from the numeric order of the chars.
+    vector<std::pair<unsigned short, string> > entries;
+    for (vector<string>::iterator it = vtrans.begin();
+	 it != vtrans.end(); it++) {
+	/* Convert the whole thing to utf-16be/le according to endianness */
+	char *out = 0;
+	size_t outsize;
+	if (convert("UTF-8", machinecoding,
+		    it->c_str(), it->size(),
+		    &out, &outsize) != 0)
+	    continue;
+	if (outsize < 2) {
+	    /* Nothing usable was produced, but don't leak the buffer */
+	    free(out);
+	    continue;
+	}
+
+	/* The source char is compared to values decoded from the UTF-16BE
+	   input text, so rebuild its numeric value from the two bytes.
+	   Go through unsigned char to avoid shifting a negative value. */
+	const unsigned char b0 = (unsigned char)out[0];
+	const unsigned char b1 = (unsigned char)out[1];
+	const unsigned short ch = littleendian ?
+	    (unsigned short)((b1 << 8) | b0) : (unsigned short)((b0 << 8) | b1);
+
+	// The char stays as the first 2 bytes of the translation (lookup skips them)
+	entries.push_back(std::make_pair(ch, string((const char *)out, outsize)));
+	free(out);
+    }
+    std::sort(entries.begin(), entries.end());
+    for (size_t i = 0; i < entries.size(); i++) {
+	except_chars.push_back(entries[i].first);
+	except_trans.push_back(entries[i].second);
+    }
+}
+#endif /* RECOLL_DATADIR */
--- a/unac/unac.h
+++ b/unac/unac.h
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
 /* To be called before starting threads in mt programs */
 void unac_init_mt();

+#ifdef RECOLL_DATADIR
+#include <string>
+/** 
+ * Set exceptions for unaccenting, for characters which should not be
+ * handled according to what the Unicode tables say. For example "a
+ * with circle above" should not be stripped to a in swedish, etc.
+ * 
+ * @param spectrans defines the translations as a blank-separated list of
+ *  UTF-8 strings. Inside each string, the first character is the exception
+ *  and the rest is its translation (which may be empty). You can use double
+ *  quotes for translations which should include white space. As a
+ *  consequence, the double-quote itself can't be an exception character.
+ */
+void unac_set_except_translations(const char *spectrans);
+#endif /* RECOLL_DATADIR */
+
 /*
 * Return unac version number.
 */