Added a configuration parameter to set specific unaccenting/lowercasing for some characters, so that they are handled differently than would result from using the Unicode database. Example: "a with ring above" could be set to be preserved by a Swedish speaker

2012-04-09 12:42:23 +02:00 · 2012-04-09 12:42:23 +02:00 · a4c17941b1
commit a4c17941b1
parent 29ef5bd143
14 changed files with 365 additions and 23 deletions
--- a/src/common/rclinit.cpp
+++ b/src/common/rclinit.cpp
@ -107,9 +107,15 @@ RclConfig *recollinit(RclInitFlags flags,
    // Make sure the locale charset is initialized (so that multiple
    // threads don't try to do it at once).
    config->getDefCharset();
    // Init unac locking
    unac_init_mt();
    // Init Unac translation exceptions
    string unacex;
    if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty()) 
 	unac_set_except_translations(unacex.c_str());
    int flushmb;
    if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
 	LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@ -88,6 +88,7 @@ using namespace std;
 #include "unacpp.h"
 #include "readfile.h"
 #include "rclinit.h"
 int main(int argc, char **argv)
 {
@ -98,8 +99,13 @@ int main(int argc, char **argv)
    }
    const char *encoding = argv[1];
    string ifn = argv[2];
    if (!ifn.compare("stdin"))
 	ifn.clear();
    const char *ofn = argv[3];
    string reason;
    (void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
    string odata;
    if (!file_to_string(ifn, odata)) {
 	cerr << "file_to_string: " << odata << endl;
@ -111,7 +117,12 @@ int main(int argc, char **argv)
 	exit(1);
    }
-    int fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
+    int fd;
    if (strcmp(ofn, "stdout")) {
 	fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
    } else {
 	fd = 1;
    }
    if (fd < 0) {
 	cerr << "Open/Create " << ofn << " failed: " << strerror(errno) 
 	     << endl;
--- a/src/doc/user/usermanual.sgml
+++ b/src/doc/user/usermanual.sgml
@ -34,6 +34,9 @@
      <para>This document introduces full text search notions
      and describes the installation and use of the &RCL;
      application. It currently describes &RCL; &RCLVERSION;.</para>
 <!--      <para>[ <ulink url="index.html">Split HTML</ulink> / 
             <ulink url="usermanual-xml.html">Single HTML</ulink> ]</para>
 -->
    </abstract>
@ -3849,6 +3852,32 @@ skippedPaths = ~/somedir/&lowast;.txt
 	   </listitem>
         </varlistentry>
          <varlistentry><term><literal>unac_except_trans</literal></term>
            <listitem><para>This is a list of characters which should be
            handled specially when converting text to unaccented lowercase.
            For example, in Swedish, the letter <literal>a with diaeresis
            </literal> has full alphabet citizenship and should not be
            turned into an <literal>a</literal>. Each element in the
            space-separated list has the special character as first element
            and the translation following. The handling of both the
            lowercase and upper-case versions of a character should be
            specified, as membership in the list will turn off both
            standard accent and case processing. Example for Swedish:</para>
                <programlisting>
 unac_except_trans =  åå Åå ää Ää öö Öö
            </programlisting>
            <para>Note that the translation is not limited to a single
            character, you could very well have something like
            <literal>üue</literal> in the list.</para>
             <para>This parameter can't be defined for subdirectories, it
             is global, because there is no way to do otherwise when
             querying. If you have document sets which would need different
             values, you will have to index and query them separately.</para> 
              </listitem>
            </varlistentry>
          <varlistentry><term><literal>maildefcharset</literal></term>
            <listitem><para>This can be used to define the default
 		character set specifically for email messages which don't
--- a/src/doc/user/xmlmake.sh
+++ b/src/doc/user/xmlmake.sh
@ -14,6 +14,9 @@
 # Wherever docbook.xsl and chunk.xsl live
 XSLDIR="/usr/local/share/xsl/docbook/"
 dochunky=1
 test $# -eq 1 && dochunky=0
 # Remove the SGML header and uncomment the XML one
 sed -e '\!//FreeBSD//DTD!d' \
    -e '\!DTD DocBook XML!s/<!--//' \
@ -31,7 +34,7 @@ commonoptions="--stringparam section.autolabel 1 \
 "
 # Do the chunky thing
-eval xsltproc $commonoptions \
+test $dochunky -ne 0 && eval xsltproc $commonoptions \
    --stringparam use.id.as.filename 1 \
    "$XSLDIR/html/chunk.xsl" \
    usermanual.xml
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -14,7 +14,7 @@ librcl.a : $(DEPS) $(OBJS) unac.o
 	$(RANLIB) librcl.a
 unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $(depth)/mk/localdefs
-	$(CC) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
+	$(CXX) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
 rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
 	$(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp
 beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs
--- a/src/lib/mkMake
+++ b/src/lib/mkMake
@ -121,7 +121,7 @@ librcl.a : \$(DEPS) \$(OBJS) unac.o
 	\$(RANLIB) librcl.a
 unac.o : \$(depth)/unac/unac.c \$(depth)/unac/unac.h $defs
-	\$(CC) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
+	\$(CXX) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
 EOF
 for c in $SRC_CPP;do
--- a/src/unac/unac.c
+++ b/src/unac/unac.c
@ -17,15 +17,57 @@
 */
 #ifdef HAVE_CONFIG_H
 #ifdef RECOLL_DATADIR
 #include "autoconfig.h"
 #else
 #include "config.h"
 #endif /* RECOLL */
 #endif /* HAVE_CONFIG_H */
 #ifdef RECOLL_DATADIR
 /* Yes, recoll unac is actually c++, lets face modernity, I will not be
   caught writing another binary search  */
 #include <vector>
 #include <map>
 #include <string>
 #include <algorithm>
 using std::string;
 using std::vector;
 using std::map;
 #include "smallut.h"
 /* 
   Storage for the exception translations. These are chars which
   should not be translated according to what UnicodeData says, but
   instead according to some local rule. There will usually be very
   few of them, but they must be looked up for every translated char.
   We use a sorted vector for fastest elimination by binary search and
   a vector<string> to store the translations
 */
 static vector<unsigned short> except_chars;
 static vector<string> except_trans;
 static inline size_t is_except_char(unsigned short c)
 {
    vector<unsigned short>::iterator it = 
 	std::lower_bound(except_chars.begin(), except_chars.end(), c);
    if (it == except_chars.end() || *it != c) {
 	return (size_t(-1));
    }
    return std::distance(except_chars.begin(), it);
 }
 #endif /* RECOLL_DATADIR */
 /*
 * If configure.in has not defined this symbol, assume const. It
 * does not harm much: a warning will be issued during compilation.
 */
 #ifndef ICONV_CONST
 #ifdef RCL_ICONV_INBUF_CONST
 #define ICONV_CONST const
 #else
 #define ICONV_CONST
 #endif
 #endif /* ICONV_CONST */
 #include <stdlib.h>
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
  char* out;
  int out_size;
  int out_length;
-  int i;
+  unsigned int i;
  out_size = in_length > 0 ? in_length : 1024;
  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char*)realloc(out, out_size + 1);
  if(out == 0) {
      if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size+1);
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
    /*
     * Lookup the tables for decomposition information
     */
-    if (dofold) {
+#ifdef RECOLL_DATADIR
-	unacfold_char_utf16(c, p, l);
+    size_t idx;
    if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
 	p = (unsigned short *)(except_trans[idx].c_str() + 2);
 	l = (except_trans[idx].size() - 2) / 2;
 	/* unsigned char *cp = (unsigned char *)p;
 	   fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0], 
 	   (unsigned int)cp[1]);*/
    } else {
-	unac_char_utf16(c, p, l);
+#endif /* RECOLL_DATADIR */
 	if (dofold) {
 	    unacfold_char_utf16(c, p, l);
 	} else {
 	    unac_char_utf16(c, p, l);
 	}
 #ifdef RECOLL_DATADIR
    }
 #endif /* RECOLL_DATADIR */
    /*
     * Explain what's done in great detail
     */
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
      char *saved;
      out_size += ((l + 1) * 2) + 1024;
      saved = out;
-      out = realloc(out, out_size);
+      out = (char *)realloc(out, out_size);
      if(out == 0) {
 	if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size);
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
  out_size = in_length > 0 ? in_length : 1024;
  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char *)realloc(out, out_size + 1);
  if(out == 0) {
      /* *outp still valid, no freeing */
      if(debug_level >= UNAC_DEBUG_LOW)
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
 	  {
 	      char *saved = out_base;
 	      /* +1 for null */
-	      out_base = realloc(out_base, out_size + 1);
+	      out_base = (char *)realloc(out_base, out_size + 1);
 	      if (out_base == 0) {
 		  /* *outp potentially not valid any more. Free here,
 		   * and zero out */
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
   */
  if (in_length <= 0) {
      if(!*outp) {
-	  if ((*outp = malloc(32)) == 0)
+	  if ((*outp = (char*)malloc(32)) == 0)
 	      return -1;
      }
      (*outp)[0] = '\0';
@ -12975,3 +13031,64 @@ const char* unac_version(void)
  return UNAC_VERSION;
 }
 #ifdef RECOLL_DATADIR
 void unac_set_except_translations(const char *spectrans)
 {
    except_chars.clear();
    except_trans.clear();
    if (!spectrans || !spectrans[0])
 	return;
    // The translation tables out of Unicode are in machine byte order (we
    // just let the compiler read the values). 
    // For the translation part, we need to choose our encoding in accordance )
    // (16BE or 16LE depending on processor)
    // On the contrary, the source char is always to be compared to
    // the input text, which is encoded in UTF-16BE ... What a mess.
    static const char *machinecoding = 0;
    bool littleendian = true;
    if (machinecoding == 0) {
 	const char*  charshort = "\001\002";
 	short *ip = (short *)charshort;
 	if (*ip == 0x0102) {
 	    littleendian = false;
 	    machinecoding = "UTF-16BE";
 	} else {
 	    littleendian = true;
 	    machinecoding = "UTF-16LE";
 	}
    }
    vector<string> vtrans;
    stringToStrings(spectrans, vtrans);
    for (vector<string>::iterator it = vtrans.begin();
 	 it != vtrans.end(); it++) {
 	/* Convert the whole thing to utf-16be/le according to endianness */
 	char *out = 0;
 	size_t outsize;
 	if (convert("UTF-8", machinecoding,
 		    it->c_str(), it->size(),
 		    &out, &outsize) != 0 || outsize < 2)
 	    continue;
 	/* The source char must be utf-16be as this is what we convert the
 	   input text to for internal processing */
 	unsigned short ch;
 	if (littleendian)
 	    ch = (out[1] << 8) | (out[0] & 0xff);
 	else
 	    ch = (out[0] << 8) | (out[1] & 0xff);
 	/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
 	except_chars.push_back(ch);
 	// We keep ch as the first 2 bytes in the translation so that 
 	// both vectors sort identically
 	except_trans.push_back(string((const char *)out, outsize));
 	free(out);
    }
    std::sort(except_chars.begin(), except_chars.end());
    std::sort(except_trans.begin(), except_trans.end());
 }
 #endif /* RECOLL_DATADIR */
--- a/src/unac/unac.h
+++ b/src/unac/unac.h
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
 /* To be called before starting threads in mt programs */
 void unac_init_mt();
 #ifdef RECOLL_DATADIR
 #include <string>
 /** 
 * Set exceptions for unaccenting, for characters which should not be
 * handled according to what the Unicode tables say. For example "a
 * with circle above" should not be stripped to a in swedish, etc.
 * 
 * @param spectrans defines the translations as a blank-separated list of
 *  UTF-8 strings. Inside each string, the first character is the exception,
 *  the rest is the translation (which may be empty). You can use double
 *  quotes for translations which should include white space. As a
 *  consequence, the double-quote can't be an exception character.
 */
 void unac_set_except_translations(const char *spectrans);
 #endif /* RECOLL_DATADIR */
 /*
 * Return unac version number.
 */
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@ -15,6 +15,8 @@ daemSkippedPaths =  \
 defaultcharset = iso-8859-1
 unac_except_trans = åå Åå ää Ää öö Öö
 [/home/dockes/projets/fulltext/testrecoll/special]
 defaultcharset = iso-8859-1
 [/home/dockes/projets/fulltext/testrecoll/txt]
--- a/tests/lyx/lyx.sh
+++ b/tests/lyx/lyx.sh
@ -8,7 +8,9 @@ initvariables $0
 (
 recollq 'Bienvenue Dans Univers De Lyx' 
 recollq 'Welcome To Lyx' 
-recollq 'Udvozli Ont A LyX' 
+recollq 'LyX rendkivul jol dokumentalt'
 # cant use this one because o-diaeresis is in the swedish unacex
 # recollq 'Udvozli Ont A LyX' 
 ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
--- a/tests/unacex/unacex.sh
+++ b/tests/unacex/unacex.sh
@ -0,0 +1,20 @@
 #!/bin/sh
 # Test for the unaccenting exceptions (unac_except_trans setting): a phrase
 # query spelled with the accented letters is expected to find one document,
 # the same phrase spelled with the accents stripped is expected to find none.
 topdir=`dirname $0`/..
 # NOTE(review): initvariables, checkresult and the $my* variables are not
 # defined in this script, presumably they come from shared.sh
 . $topdir/shared.sh
 initvariables $0
 # We need an utf-8 locale for the 1st command to properly read its argument
 export LANG=fr_FR.UTF-8
 # Run both queries in a subshell: their stderr goes to $mystderr, and their
 # stdout, minus the 'Recoll query: ' lines, goes to $mystdout
 (
 # Should succeed
 recollq '"strippes: UNACEXååääöö"' 
 # Should fail
 recollq '"strippes: UNACEXaaaaoo"'
 )  2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 # Compare the filtered output to the reference file, ignoring white space
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
 checkresult
--- a/tests/unacex/unacex.txt
+++ b/tests/unacex/unacex.txt
@ -0,0 +1,3 @@
 1 results
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/unacex/swedishchars.html]	[Some chars]	293	bytes	
 0 results
--- a/unac/unac.c
+++ b/unac/unac.c
@ -17,15 +17,57 @@
 */
 #ifdef HAVE_CONFIG_H
 #ifdef RECOLL_DATADIR
 #include "autoconfig.h"
 #else
 #include "config.h"
 #endif /* RECOLL */
 #endif /* HAVE_CONFIG_H */
 #ifdef RECOLL_DATADIR
 /* Yes, recoll unac is actually c++, lets face modernity, I will not be
   caught writing another binary search  */
 #include <vector>
 #include <map>
 #include <string>
 #include <algorithm>
 using std::string;
 using std::vector;
 using std::map;
 #include "smallut.h"
 /* 
   Storage for the exception translations. These are chars which
   should not be translated according to what UnicodeData says, but
   instead according to some local rule. There will usually be very
   few of them, but they must be looked up for every translated char.
   We use a sorted vector for fastest elimination by binary search and
   a vector<string> to store the translations
 */
 static vector<unsigned short> except_chars;
 static vector<string> except_trans;
 static inline size_t is_except_char(unsigned short c)
 {
    vector<unsigned short>::iterator it = 
 	std::lower_bound(except_chars.begin(), except_chars.end(), c);
    if (it == except_chars.end() || *it != c) {
 	return (size_t(-1));
    }
    return std::distance(except_chars.begin(), it);
 }
 #endif /* RECOLL_DATADIR */
 /*
 * If configure.in has not defined this symbol, assume const. It
 * does not harm much: a warning will be issued during compilation.
 */
 #ifndef ICONV_CONST
 #ifdef RCL_ICONV_INBUF_CONST
 #define ICONV_CONST const
 #else
 #define ICONV_CONST
 #endif
 #endif /* ICONV_CONST */
 #include <stdlib.h>
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
  char* out;
  int out_size;
  int out_length;
-  int i;
+  unsigned int i;
  out_size = in_length > 0 ? in_length : 1024;
  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char*)realloc(out, out_size + 1);
  if(out == 0) {
      if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size+1);
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
    /*
     * Lookup the tables for decomposition information
     */
-    if (dofold) {
+#ifdef RECOLL_DATADIR
-	unacfold_char_utf16(c, p, l);
+    size_t idx;
    if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
 	p = (unsigned short *)(except_trans[idx].c_str() + 2);
 	l = (except_trans[idx].size() - 2) / 2;
 	/* unsigned char *cp = (unsigned char *)p;
 	   fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0], 
 	   (unsigned int)cp[1]);*/
    } else {
-	unac_char_utf16(c, p, l);
+#endif /* RECOLL_DATADIR */
 	if (dofold) {
 	    unacfold_char_utf16(c, p, l);
 	} else {
 	    unac_char_utf16(c, p, l);
 	}
 #ifdef RECOLL_DATADIR
    }
 #endif /* RECOLL_DATADIR */
    /*
     * Explain what's done in great detail
     */
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
      char *saved;
      out_size += ((l + 1) * 2) + 1024;
      saved = out;
-      out = realloc(out, out_size);
+      out = (char *)realloc(out, out_size);
      if(out == 0) {
 	if(debug_level >= UNAC_DEBUG_LOW)
 	  DEBUG("realloc %d bytes failed\n", out_size);
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
  out_size = in_length > 0 ? in_length : 1024;
  out = *outp;
-  out = realloc(out, out_size + 1);
+  out = (char *)realloc(out, out_size + 1);
  if(out == 0) {
      /* *outp still valid, no freeing */
      if(debug_level >= UNAC_DEBUG_LOW)
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
 	  {
 	      char *saved = out_base;
 	      /* +1 for null */
-	      out_base = realloc(out_base, out_size + 1);
+	      out_base = (char *)realloc(out_base, out_size + 1);
 	      if (out_base == 0) {
 		  /* *outp potentially not valid any more. Free here,
 		   * and zero out */
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
   */
  if (in_length <= 0) {
      if(!*outp) {
-	  if ((*outp = malloc(32)) == 0)
+	  if ((*outp = (char*)malloc(32)) == 0)
 	      return -1;
      }
      (*outp)[0] = '\0';
@ -12975,3 +13031,64 @@ const char* unac_version(void)
  return UNAC_VERSION;
 }
 #ifdef RECOLL_DATADIR
 void unac_set_except_translations(const char *spectrans)
 {
    except_chars.clear();
    except_trans.clear();
    if (!spectrans || !spectrans[0])
 	return;
    // The translation tables out of Unicode are in machine byte order (we
    // just let the compiler read the values). 
    // For the translation part, we need to choose our encoding in accordance )
    // (16BE or 16LE depending on processor)
    // On the contrary, the source char is always to be compared to
    // the input text, which is encoded in UTF-16BE ... What a mess.
    static const char *machinecoding = 0;
    bool littleendian = true;
    if (machinecoding == 0) {
 	const char*  charshort = "\001\002";
 	short *ip = (short *)charshort;
 	if (*ip == 0x0102) {
 	    littleendian = false;
 	    machinecoding = "UTF-16BE";
 	} else {
 	    littleendian = true;
 	    machinecoding = "UTF-16LE";
 	}
    }
    vector<string> vtrans;
    stringToStrings(spectrans, vtrans);
    for (vector<string>::iterator it = vtrans.begin();
 	 it != vtrans.end(); it++) {
 	/* Convert the whole thing to utf-16be/le according to endianness */
 	char *out = 0;
 	size_t outsize;
 	if (convert("UTF-8", machinecoding,
 		    it->c_str(), it->size(),
 		    &out, &outsize) != 0 || outsize < 2)
 	    continue;
 	/* The source char must be utf-16be as this is what we convert the
 	   input text to for internal processing */
 	unsigned short ch;
 	if (littleendian)
 	    ch = (out[1] << 8) | (out[0] & 0xff);
 	else
 	    ch = (out[0] << 8) | (out[1] & 0xff);
 	/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
 	except_chars.push_back(ch);
 	// We keep ch as the first 2 bytes in the translation so that 
 	// both vectors sort identically
 	except_trans.push_back(string((const char *)out, outsize));
 	free(out);
    }
    std::sort(except_chars.begin(), except_chars.end());
    std::sort(except_trans.begin(), except_trans.end());
 }
 #endif /* RECOLL_DATADIR */
--- a/unac/unac.h
+++ b/unac/unac.h
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
 /* To be called before starting threads in mt programs */
 void unac_init_mt();
 #ifdef RECOLL_DATADIR
 #include <string>
 /** 
 * Set exceptions for unaccenting, for characters which should not be
 * handled according to what the Unicode tables say. For example "a
 * with circle above" should not be stripped to a in swedish, etc.
 * 
 * @param spectrans defines the translations as a blank-separated list of
 *  UTF-8 strings. Inside each string, the first character is the exception,
 *  the rest is the translation (which may be empty). You can use double
 *  quotes for translations which should include white space. As a
 *  consequence, the double-quote can't be an exception character.
 */
 void unac_set_except_translations(const char *spectrans);
 #endif /* RECOLL_DATADIR */
 /*
 * Return unac version number.
 */