Added a configuration parameter to set specific unaccenting/lowercasing rules for characters that should be handled differently from what the Unicode database prescribes. Example: "a with ring above" could be set to be preserved by a Swedish speaker

This commit is contained in:
Jean-Francois Dockes 2012-04-09 12:42:23 +02:00
parent 29ef5bd143
commit a4c17941b1
14 changed files with 365 additions and 23 deletions

View File

@ -107,9 +107,15 @@ RclConfig *recollinit(RclInitFlags flags,
// Make sure the locale charset is initialized (so that multiple // Make sure the locale charset is initialized (so that multiple
// threads don't try to do it at once). // threads don't try to do it at once).
config->getDefCharset(); config->getDefCharset();
// Init unac locking // Init unac locking
unac_init_mt(); unac_init_mt();
// Init Unac translation exceptions
string unacex;
if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty())
unac_set_except_translations(unacex.c_str());
int flushmb; int flushmb;
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) { if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n", LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",

View File

@ -88,6 +88,7 @@ using namespace std;
#include "unacpp.h" #include "unacpp.h"
#include "readfile.h" #include "readfile.h"
#include "rclinit.h"
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
@ -98,8 +99,13 @@ int main(int argc, char **argv)
} }
const char *encoding = argv[1]; const char *encoding = argv[1];
string ifn = argv[2]; string ifn = argv[2];
if (!ifn.compare("stdin"))
ifn.clear();
const char *ofn = argv[3]; const char *ofn = argv[3];
string reason;
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
string odata; string odata;
if (!file_to_string(ifn, odata)) { if (!file_to_string(ifn, odata)) {
cerr << "file_to_string: " << odata << endl; cerr << "file_to_string: " << odata << endl;
@ -111,7 +117,12 @@ int main(int argc, char **argv)
exit(1); exit(1);
} }
int fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666); int fd;
if (strcmp(ofn, "stdout")) {
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
} else {
fd = 1;
}
if (fd < 0) { if (fd < 0) {
cerr << "Open/Create " << ofn << " failed: " << strerror(errno) cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
<< endl; << endl;

View File

@ -34,6 +34,9 @@
<para>This document introduces full text search notions <para>This document introduces full text search notions
and describes the installation and use of the &RCL; and describes the installation and use of the &RCL;
application. It currently describes &RCL; &RCLVERSION;.</para> application. It currently describes &RCL; &RCLVERSION;.</para>
<!-- <para>[ <ulink url="index.html">Split HTML</ulink> /
<ulink url="usermanual-xml.html">Single HTML</ulink> ]</para>
-->
</abstract> </abstract>
@ -3849,6 +3852,32 @@ skippedPaths = ~/somedir/&lowast;.txt
</listitem> </listitem>
</varlistentry> </varlistentry>
<varlistentry><term><literal>unac_except_trans</literal></term>
<listitem><para>This is a list of characters which should be
handled specially when converting text to unaccented lowercase.
For example, in Swedish, the letter <literal>a with diaeresis
</literal> has full alphabet citizenship and should not be
turned into an <literal>a</literal>. Each element in the
space-separated list has the special character as first element
and the translation following. The handling of both the
lowercase and upper-case versions of a character should be
specified, as membership in the list turns off both
standard accent and case processing. Example for Swedish:</para>
<programlisting>
unac_except_trans = åå Åå ää Ää öö Öö
</programlisting>
<para>Note that the translation is not limited to a single
character, you could very well have something like
<literal>üue</literal> in the list.</para>
<para>This parameter can't be defined for subdirectories, it
is global, because there is no way to do otherwise when
querying. If you have document sets which would need different
values, you will have to index and query them separately.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>maildefcharset</literal></term> <varlistentry><term><literal>maildefcharset</literal></term>
<listitem><para>This can be used to define the default <listitem><para>This can be used to define the default
character set specifically for email messages which don't character set specifically for email messages which don't

View File

@ -14,6 +14,9 @@
# Wherever docbook.xsl and chunk.xsl live # Wherever docbook.xsl and chunk.xsl live
XSLDIR="/usr/local/share/xsl/docbook/" XSLDIR="/usr/local/share/xsl/docbook/"
dochunky=1
test $# -eq 1 && dochunky=0
# Remove the SGML header and uncomment the XML one # Remove the SGML header and uncomment the XML one
sed -e '\!//FreeBSD//DTD!d' \ sed -e '\!//FreeBSD//DTD!d' \
-e '\!DTD DocBook XML!s/<!--//' \ -e '\!DTD DocBook XML!s/<!--//' \
@ -31,7 +34,7 @@ commonoptions="--stringparam section.autolabel 1 \
" "
# Do the chunky thing # Do the chunky thing
eval xsltproc $commonoptions \ test $dochunky -ne 0 && eval xsltproc $commonoptions \
--stringparam use.id.as.filename 1 \ --stringparam use.id.as.filename 1 \
"$XSLDIR/html/chunk.xsl" \ "$XSLDIR/html/chunk.xsl" \
usermanual.xml usermanual.xml

View File

@ -14,7 +14,7 @@ librcl.a : $(DEPS) $(OBJS) unac.o
$(RANLIB) librcl.a $(RANLIB) librcl.a
unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $(depth)/mk/localdefs unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $(depth)/mk/localdefs
$(CC) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c $(CXX) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp $(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp
beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs

View File

@ -121,7 +121,7 @@ librcl.a : \$(DEPS) \$(OBJS) unac.o
\$(RANLIB) librcl.a \$(RANLIB) librcl.a
unac.o : \$(depth)/unac/unac.c \$(depth)/unac/unac.h $defs unac.o : \$(depth)/unac/unac.c \$(depth)/unac/unac.h $defs
\$(CC) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c \$(CXX) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
EOF EOF
for c in $SRC_CPP;do for c in $SRC_CPP;do

View File

@ -17,15 +17,57 @@
*/ */
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
#ifdef RECOLL_DATADIR
#include "autoconfig.h"
#else
#include "config.h" #include "config.h"
#endif /* RECOLL */
#endif /* HAVE_CONFIG_H */ #endif /* HAVE_CONFIG_H */
#ifdef RECOLL_DATADIR
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
caught writing another binary search */
#include <vector>
#include <map>
#include <string>
#include <algorithm>
using std::string;
using std::vector;
using std::map;
#include "smallut.h"
/*
Storage for the exception translations. These are chars which
should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/
static vector<unsigned short> except_chars;
static vector<string> except_trans;

/*
 * Binary-search except_chars (which must be sorted ascending) for c.
 * Returns the position of c inside the vector, or (size_t)-1 when c
 * is not one of the exception characters.
 */
static inline size_t is_except_char(unsigned short c)
{
    vector<unsigned short>::iterator pos =
        std::lower_bound(except_chars.begin(), except_chars.end(), c);
    /* lower_bound yields the first element >= c: only an exact match counts */
    if (pos != except_chars.end() && *pos == c) {
        return pos - except_chars.begin();
    }
    return (size_t)-1;
}
#endif /* RECOLL_DATADIR */
/* /*
* If configure.in has not defined this symbol, assume const. It * If configure.in has not defined this symbol, assume const. It
* does not harm much: a warning will be issued during compilation. * does not harm much: a warning will be issued during compilation.
*/ */
#ifndef ICONV_CONST #ifndef ICONV_CONST
#ifdef RCL_ICONV_INBUF_CONST
#define ICONV_CONST const
#else
#define ICONV_CONST #define ICONV_CONST
#endif
#endif /* ICONV_CONST */ #endif /* ICONV_CONST */
#include <stdlib.h> #include <stdlib.h>
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
char* out; char* out;
int out_size; int out_size;
int out_length; int out_length;
int i; unsigned int i;
out_size = in_length > 0 ? in_length : 1024; out_size = in_length > 0 ? in_length : 1024;
out = *outp; out = *outp;
out = realloc(out, out_size + 1); out = (char*)realloc(out, out_size + 1);
if(out == 0) { if(out == 0) {
if(debug_level >= UNAC_DEBUG_LOW) if(debug_level >= UNAC_DEBUG_LOW)
DEBUG("realloc %d bytes failed\n", out_size+1); DEBUG("realloc %d bytes failed\n", out_size+1);
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
/* /*
* Lookup the tables for decomposition information * Lookup the tables for decomposition information
*/ */
if (dofold) { #ifdef RECOLL_DATADIR
unacfold_char_utf16(c, p, l); size_t idx;
if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
p = (unsigned short *)(except_trans[idx].c_str() + 2);
l = (except_trans[idx].size() - 2) / 2;
/* unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);*/
} else { } else {
unac_char_utf16(c, p, l); #endif /* RECOLL_DATADIR */
if (dofold) {
unacfold_char_utf16(c, p, l);
} else {
unac_char_utf16(c, p, l);
}
#ifdef RECOLL_DATADIR
} }
#endif /* RECOLL_DATADIR */
/* /*
* Explain what's done in great detail * Explain what's done in great detail
*/ */
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
char *saved; char *saved;
out_size += ((l + 1) * 2) + 1024; out_size += ((l + 1) * 2) + 1024;
saved = out; saved = out;
out = realloc(out, out_size); out = (char *)realloc(out, out_size);
if(out == 0) { if(out == 0) {
if(debug_level >= UNAC_DEBUG_LOW) if(debug_level >= UNAC_DEBUG_LOW)
DEBUG("realloc %d bytes failed\n", out_size); DEBUG("realloc %d bytes failed\n", out_size);
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
out_size = in_length > 0 ? in_length : 1024; out_size = in_length > 0 ? in_length : 1024;
out = *outp; out = *outp;
out = realloc(out, out_size + 1); out = (char *)realloc(out, out_size + 1);
if(out == 0) { if(out == 0) {
/* *outp still valid, no freeing */ /* *outp still valid, no freeing */
if(debug_level >= UNAC_DEBUG_LOW) if(debug_level >= UNAC_DEBUG_LOW)
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
{ {
char *saved = out_base; char *saved = out_base;
/* +1 for null */ /* +1 for null */
out_base = realloc(out_base, out_size + 1); out_base = (char *)realloc(out_base, out_size + 1);
if (out_base == 0) { if (out_base == 0) {
/* *outp potentially not valid any more. Free here, /* *outp potentially not valid any more. Free here,
* and zero out */ * and zero out */
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
*/ */
if (in_length <= 0) { if (in_length <= 0) {
if(!*outp) { if(!*outp) {
if ((*outp = malloc(32)) == 0) if ((*outp = (char*)malloc(32)) == 0)
return -1; return -1;
} }
(*outp)[0] = '\0'; (*outp)[0] = '\0';
@ -12975,3 +13031,64 @@ const char* unac_version(void)
return UNAC_VERSION; return UNAC_VERSION;
} }
#ifdef RECOLL_DATADIR
/*
 * Install the exception translations used instead of the Unicode tables.
 *
 * spectrans is a UTF-8 list of entries, split with stringToStrings(). In
 * each entry the first character is the exception character and the rest
 * is its translation. Any previously installed exceptions are discarded;
 * a null or empty spectrans just clears the list. Entries which cannot be
 * converted, or which convert to less than one 16 bit unit, are skipped.
 *
 * On return except_chars is sorted ascending and except_trans[i] is the
 * converted entry (exception character included as its first 2 bytes)
 * belonging to except_chars[i].
 */
void unac_set_except_translations(const char *spectrans)
{
    except_chars.clear();
    except_trans.clear();
    if (!spectrans || !spectrans[0])
        return;

    // The translation tables out of Unicode are in machine byte order (we
    // just let the compiler read the values).
    // For the translation part, we need to choose our encoding accordingly
    // (16BE or 16LE depending on processor).
    // The byte order is determined on every call: caching only the
    // encoding name in a static would leave the flag at its default value
    // on all calls after the first.
    const unsigned short probe = 0x0102;
    const bool littleendian = (*(const unsigned char *)&probe == 0x02);
    const char *machinecoding = littleendian ? "UTF-16LE" : "UTF-16BE";

    vector<string> vtrans;
    stringToStrings(spectrans, vtrans);

    // Collect (character, translation) pairs so that a single sort keeps
    // each translation attached to its character. Sorting the two vectors
    // independently does not work: the strings are in machine byte order
    // and compare low byte first on little-endian processors, which is
    // not the numeric order used for except_chars.
    vector<std::pair<unsigned short, string> > entries;
    for (vector<string>::iterator it = vtrans.begin();
         it != vtrans.end(); it++) {
        /* Convert the whole thing to utf-16be/le according to endianness */
        char *out = 0;
        size_t outsize;
        if (convert("UTF-8", machinecoding,
                    it->c_str(), it->size(),
                    &out, &outsize) != 0)
            continue;
        if (outsize < 2) {
            /* Conversion succeeded, the buffer is ours: don't leak it */
            free(out);
            continue;
        }
        /* The source char is stored as its numeric value, as this is what
           it gets compared to when processing the input text. Go through
           unsigned char to avoid shifting negative values. */
        const unsigned char *b = (const unsigned char *)out;
        unsigned short ch;
        if (littleendian)
            ch = (unsigned short)((b[1] << 8) | b[0]);
        else
            ch = (unsigned short)((b[0] << 8) | b[1]);
        /* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
        // We keep ch as the first 2 bytes in the translation: the lookup
        // code skips them.
        entries.push_back(std::pair<unsigned short, string>(
                              ch, string((const char *)out, outsize)));
        free(out);
    }

    // Pairs order by character value first, then by translation bytes
    std::sort(entries.begin(), entries.end());
    for (vector<std::pair<unsigned short, string> >::iterator it =
             entries.begin(); it != entries.end(); it++) {
        except_chars.push_back(it->first);
        except_trans.push_back(it->second);
    }
}
#endif /* RECOLL_DATADIR */

View File

@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
/* To be called before starting threads in mt programs */ /* To be called before starting threads in mt programs */
void unac_init_mt(); void unac_init_mt();
#ifdef RECOLL_DATADIR
#include <string>
/**
 * Set exceptions for unaccenting, for characters which should not be
 * handled according to what the Unicode tables say. For example, "a
 * with ring above" should not be stripped to "a" in Swedish.
 *
 * Each call replaces the previously set exceptions; a null or empty
 * string clears them.
 *
 * @param spectrans defines the translations as a blank-separated list of
 * UTF-8 strings. Inside each string, the first character is the exception
 * and the rest is the translation (which may be empty). You can use double
 * quotes for translations which should include white space. The double
 * quote itself can't be an exception character.
 */
void unac_set_except_translations(const char *spectrans);
#endif /* RECOLL_DATADIR */
/* /*
* Return unac version number. * Return unac version number.
*/ */

View File

@ -15,6 +15,8 @@ daemSkippedPaths = \
defaultcharset = iso-8859-1 defaultcharset = iso-8859-1
unac_except_trans = åå Åå ää Ää öö Öö
[/home/dockes/projets/fulltext/testrecoll/special] [/home/dockes/projets/fulltext/testrecoll/special]
defaultcharset = iso-8859-1 defaultcharset = iso-8859-1
[/home/dockes/projets/fulltext/testrecoll/txt] [/home/dockes/projets/fulltext/testrecoll/txt]

View File

@ -8,7 +8,9 @@ initvariables $0
( (
recollq 'Bienvenue Dans Univers De Lyx' recollq 'Bienvenue Dans Univers De Lyx'
recollq 'Welcome To Lyx' recollq 'Welcome To Lyx'
recollq 'Udvozli Ont A LyX' recollq 'LyX rendkivul jol dokumentalt'
# cant use this one because o-diaeresis is in the swedish unacex
# recollq 'Udvozli Ont A LyX'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

20
tests/unacex/unacex.sh Executable file
View File

@ -0,0 +1,20 @@
#!/bin/sh
topdir=`dirname $0`/..
. $topdir/shared.sh
initvariables $0
# We need an utf-8 locale for the 1st command to properly read its argument
export LANG=fr_FR.UTF-8
(
# Should succeed
recollq '"strippes: UNACEXååääöö"'
# Should fail
recollq '"strippes: UNACEXaaaaoo"'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
checkresult

3
tests/unacex/unacex.txt Normal file
View File

@ -0,0 +1,3 @@
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/unacex/swedishchars.html] [Some chars] 293 bytes
0 results

View File

@ -17,15 +17,57 @@
*/ */
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
#ifdef RECOLL_DATADIR
#include "autoconfig.h"
#else
#include "config.h" #include "config.h"
#endif /* RECOLL */
#endif /* HAVE_CONFIG_H */ #endif /* HAVE_CONFIG_H */
#ifdef RECOLL_DATADIR
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
caught writing another binary search */
#include <vector>
#include <map>
#include <string>
#include <algorithm>
using std::string;
using std::vector;
using std::map;
#include "smallut.h"
/*
Storage for the exception translations. These are chars which
should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/
static vector<unsigned short> except_chars;
static vector<string> except_trans;

/*
 * Binary-search except_chars (which must be sorted ascending) for c.
 * Returns the position of c inside the vector, or (size_t)-1 when c
 * is not one of the exception characters.
 */
static inline size_t is_except_char(unsigned short c)
{
    vector<unsigned short>::iterator pos =
        std::lower_bound(except_chars.begin(), except_chars.end(), c);
    /* lower_bound yields the first element >= c: only an exact match counts */
    if (pos != except_chars.end() && *pos == c) {
        return pos - except_chars.begin();
    }
    return (size_t)-1;
}
#endif /* RECOLL_DATADIR */
/* /*
* If configure.in has not defined this symbol, assume const. It * If configure.in has not defined this symbol, assume const. It
* does not harm much: a warning will be issued during compilation. * does not harm much: a warning will be issued during compilation.
*/ */
#ifndef ICONV_CONST #ifndef ICONV_CONST
#ifdef RCL_ICONV_INBUF_CONST
#define ICONV_CONST const
#else
#define ICONV_CONST #define ICONV_CONST
#endif
#endif /* ICONV_CONST */ #endif /* ICONV_CONST */
#include <stdlib.h> #include <stdlib.h>
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
char* out; char* out;
int out_size; int out_size;
int out_length; int out_length;
int i; unsigned int i;
out_size = in_length > 0 ? in_length : 1024; out_size = in_length > 0 ? in_length : 1024;
out = *outp; out = *outp;
out = realloc(out, out_size + 1); out = (char*)realloc(out, out_size + 1);
if(out == 0) { if(out == 0) {
if(debug_level >= UNAC_DEBUG_LOW) if(debug_level >= UNAC_DEBUG_LOW)
DEBUG("realloc %d bytes failed\n", out_size+1); DEBUG("realloc %d bytes failed\n", out_size+1);
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
/* /*
* Lookup the tables for decomposition information * Lookup the tables for decomposition information
*/ */
if (dofold) { #ifdef RECOLL_DATADIR
unacfold_char_utf16(c, p, l); size_t idx;
if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
p = (unsigned short *)(except_trans[idx].c_str() + 2);
l = (except_trans[idx].size() - 2) / 2;
/* unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);*/
} else { } else {
unac_char_utf16(c, p, l); #endif /* RECOLL_DATADIR */
if (dofold) {
unacfold_char_utf16(c, p, l);
} else {
unac_char_utf16(c, p, l);
}
#ifdef RECOLL_DATADIR
} }
#endif /* RECOLL_DATADIR */
/* /*
* Explain what's done in great detail * Explain what's done in great detail
*/ */
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
char *saved; char *saved;
out_size += ((l + 1) * 2) + 1024; out_size += ((l + 1) * 2) + 1024;
saved = out; saved = out;
out = realloc(out, out_size); out = (char *)realloc(out, out_size);
if(out == 0) { if(out == 0) {
if(debug_level >= UNAC_DEBUG_LOW) if(debug_level >= UNAC_DEBUG_LOW)
DEBUG("realloc %d bytes failed\n", out_size); DEBUG("realloc %d bytes failed\n", out_size);
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
out_size = in_length > 0 ? in_length : 1024; out_size = in_length > 0 ? in_length : 1024;
out = *outp; out = *outp;
out = realloc(out, out_size + 1); out = (char *)realloc(out, out_size + 1);
if(out == 0) { if(out == 0) {
/* *outp still valid, no freeing */ /* *outp still valid, no freeing */
if(debug_level >= UNAC_DEBUG_LOW) if(debug_level >= UNAC_DEBUG_LOW)
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
{ {
char *saved = out_base; char *saved = out_base;
/* +1 for null */ /* +1 for null */
out_base = realloc(out_base, out_size + 1); out_base = (char *)realloc(out_base, out_size + 1);
if (out_base == 0) { if (out_base == 0) {
/* *outp potentially not valid any more. Free here, /* *outp potentially not valid any more. Free here,
* and zero out */ * and zero out */
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
*/ */
if (in_length <= 0) { if (in_length <= 0) {
if(!*outp) { if(!*outp) {
if ((*outp = malloc(32)) == 0) if ((*outp = (char*)malloc(32)) == 0)
return -1; return -1;
} }
(*outp)[0] = '\0'; (*outp)[0] = '\0';
@ -12975,3 +13031,64 @@ const char* unac_version(void)
return UNAC_VERSION; return UNAC_VERSION;
} }
#ifdef RECOLL_DATADIR
/*
 * Install the exception translations used instead of the Unicode tables.
 *
 * spectrans is a UTF-8 list of entries, split with stringToStrings(). In
 * each entry the first character is the exception character and the rest
 * is its translation. Any previously installed exceptions are discarded;
 * a null or empty spectrans just clears the list. Entries which cannot be
 * converted, or which convert to less than one 16 bit unit, are skipped.
 *
 * On return except_chars is sorted ascending and except_trans[i] is the
 * converted entry (exception character included as its first 2 bytes)
 * belonging to except_chars[i].
 */
void unac_set_except_translations(const char *spectrans)
{
    except_chars.clear();
    except_trans.clear();
    if (!spectrans || !spectrans[0])
        return;

    // The translation tables out of Unicode are in machine byte order (we
    // just let the compiler read the values).
    // For the translation part, we need to choose our encoding accordingly
    // (16BE or 16LE depending on processor).
    // The byte order is determined on every call: caching only the
    // encoding name in a static would leave the flag at its default value
    // on all calls after the first.
    const unsigned short probe = 0x0102;
    const bool littleendian = (*(const unsigned char *)&probe == 0x02);
    const char *machinecoding = littleendian ? "UTF-16LE" : "UTF-16BE";

    vector<string> vtrans;
    stringToStrings(spectrans, vtrans);

    // Collect (character, translation) pairs so that a single sort keeps
    // each translation attached to its character. Sorting the two vectors
    // independently does not work: the strings are in machine byte order
    // and compare low byte first on little-endian processors, which is
    // not the numeric order used for except_chars.
    vector<std::pair<unsigned short, string> > entries;
    for (vector<string>::iterator it = vtrans.begin();
         it != vtrans.end(); it++) {
        /* Convert the whole thing to utf-16be/le according to endianness */
        char *out = 0;
        size_t outsize;
        if (convert("UTF-8", machinecoding,
                    it->c_str(), it->size(),
                    &out, &outsize) != 0)
            continue;
        if (outsize < 2) {
            /* Conversion succeeded, the buffer is ours: don't leak it */
            free(out);
            continue;
        }
        /* The source char is stored as its numeric value, as this is what
           it gets compared to when processing the input text. Go through
           unsigned char to avoid shifting negative values. */
        const unsigned char *b = (const unsigned char *)out;
        unsigned short ch;
        if (littleendian)
            ch = (unsigned short)((b[1] << 8) | b[0]);
        else
            ch = (unsigned short)((b[0] << 8) | b[1]);
        /* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
        // We keep ch as the first 2 bytes in the translation: the lookup
        // code skips them.
        entries.push_back(std::pair<unsigned short, string>(
                              ch, string((const char *)out, outsize)));
        free(out);
    }

    // Pairs order by character value first, then by translation bytes
    std::sort(entries.begin(), entries.end());
    for (vector<std::pair<unsigned short, string> >::iterator it =
             entries.begin(); it != entries.end(); it++) {
        except_chars.push_back(it->first);
        except_trans.push_back(it->second);
    }
}
#endif /* RECOLL_DATADIR */

View File

@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
/* To be called before starting threads in mt programs */ /* To be called before starting threads in mt programs */
void unac_init_mt(); void unac_init_mt();
#ifdef RECOLL_DATADIR
#include <string>
/**
 * Set exceptions for unaccenting, for characters which should not be
 * handled according to what the Unicode tables say. For example, "a
 * with ring above" should not be stripped to "a" in Swedish.
 *
 * Each call replaces the previously set exceptions; a null or empty
 * string clears them.
 *
 * @param spectrans defines the translations as a blank-separated list of
 * UTF-8 strings. Inside each string, the first character is the exception
 * and the rest is the translation (which may be empty). You can use double
 * quotes for translations which should include white space. The double
 * quote itself can't be an exception character.
 */
void unac_set_except_translations(const char *spectrans);
#endif /* RECOLL_DATADIR */
/* /*
* Return unac version number. * Return unac version number.
*/ */