Added a configuration parameter to set specific unaccenting/lowercasing rules for some characters, so that they can be handled differently from what the Unicode database would produce. Example: "a with ring above" could be set to be preserved by a Swedish speaker.
This commit is contained in:
parent
29ef5bd143
commit
a4c17941b1
@ -107,9 +107,15 @@ RclConfig *recollinit(RclInitFlags flags,
|
||||
// Make sure the locale charset is initialized (so that multiple
|
||||
// threads don't try to do it at once).
|
||||
config->getDefCharset();
|
||||
|
||||
// Init unac locking
|
||||
unac_init_mt();
|
||||
|
||||
// Init Unac translation exceptions
|
||||
string unacex;
|
||||
if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty())
|
||||
unac_set_except_translations(unacex.c_str());
|
||||
|
||||
int flushmb;
|
||||
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
|
||||
LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
|
||||
|
||||
@ -88,6 +88,7 @@ using namespace std;
|
||||
|
||||
#include "unacpp.h"
|
||||
#include "readfile.h"
|
||||
#include "rclinit.h"
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@ -98,8 +99,13 @@ int main(int argc, char **argv)
|
||||
}
|
||||
const char *encoding = argv[1];
|
||||
string ifn = argv[2];
|
||||
if (!ifn.compare("stdin"))
|
||||
ifn.clear();
|
||||
const char *ofn = argv[3];
|
||||
|
||||
string reason;
|
||||
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
||||
|
||||
string odata;
|
||||
if (!file_to_string(ifn, odata)) {
|
||||
cerr << "file_to_string: " << odata << endl;
|
||||
@ -111,7 +117,12 @@ int main(int argc, char **argv)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
||||
int fd;
|
||||
if (strcmp(ofn, "stdout")) {
|
||||
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
||||
} else {
|
||||
fd = 1;
|
||||
}
|
||||
if (fd < 0) {
|
||||
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
||||
<< endl;
|
||||
|
||||
@ -34,6 +34,9 @@
|
||||
<para>This document introduces full text search notions
|
||||
and describes the installation and use of the &RCL;
|
||||
application. It currently describes &RCL; &RCLVERSION;.</para>
|
||||
<!-- <para>[ <ulink url="index.html">Split HTML</ulink> /
|
||||
<ulink url="usermanual-xml.html">Single HTML</ulink> ]</para>
|
||||
-->
|
||||
</abstract>
|
||||
|
||||
|
||||
@ -3849,6 +3852,32 @@ skippedPaths = ~/somedir/∗.txt
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>unac_except_trans</literal></term>
|
||||
<listitem><para>This is a list of characters which should be
|
||||
handled specially when converting text to unaccented lowercase.
|
||||
For example, in Swedish, the letter <literal>a with diaeresis
|
||||
</literal> has full alphabet citizenship and should not be
|
||||
turned into an <literal>a</literal>. Each element in the
|
||||
space-separated list has the special character as first element
|
||||
and the translation following. The handling of both the
|
||||
lowercase and upper-case versions of a character should be
|
||||
specified, as membership in the list will turn off both
|
||||
standard accent and case processing. Example for Swedish:</para>
|
||||
<programlisting>
|
||||
unac_except_trans = åå Åå ää Ää öö Öö
|
||||
</programlisting>
|
||||
|
||||
<para>Note that the translation is not limited to a single
|
||||
character, you could very well have something like
|
||||
<literal>üue</literal> in the list.</para>
|
||||
|
||||
<para>This parameter can't be defined for subdirectories, it
|
||||
is global, because there is no way to do otherwise when
|
||||
querying. If you have document sets which would need different
|
||||
values, you will have to index and query them separately.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>maildefcharset</literal></term>
|
||||
<listitem><para>This can be used to define the default
|
||||
character set specifically for email messages which don't
|
||||
|
||||
@ -14,6 +14,9 @@
|
||||
# Wherever docbook.xsl and chunk.xsl live
|
||||
XSLDIR="/usr/local/share/xsl/docbook/"
|
||||
|
||||
dochunky=1
|
||||
test $# -eq 1 && dochunky=0
|
||||
|
||||
# Remove the SGML header and uncomment the XML one
|
||||
sed -e '\!//FreeBSD//DTD!d' \
|
||||
-e '\!DTD DocBook XML!s/<!--//' \
|
||||
@ -31,7 +34,7 @@ commonoptions="--stringparam section.autolabel 1 \
|
||||
"
|
||||
|
||||
# Do the chunky thing
|
||||
eval xsltproc $commonoptions \
|
||||
test $dochunky -ne 0 && eval xsltproc $commonoptions \
|
||||
--stringparam use.id.as.filename 1 \
|
||||
"$XSLDIR/html/chunk.xsl" \
|
||||
usermanual.xml
|
||||
|
||||
@ -14,7 +14,7 @@ librcl.a : $(DEPS) $(OBJS) unac.o
|
||||
$(RANLIB) librcl.a
|
||||
|
||||
unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $(depth)/mk/localdefs
|
||||
$(CC) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
|
||||
$(CXX) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
|
||||
rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp
|
||||
beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs
|
||||
|
||||
@ -121,7 +121,7 @@ librcl.a : \$(DEPS) \$(OBJS) unac.o
|
||||
\$(RANLIB) librcl.a
|
||||
|
||||
unac.o : \$(depth)/unac/unac.c \$(depth)/unac/unac.h $defs
|
||||
\$(CC) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
|
||||
\$(CXX) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
|
||||
EOF
|
||||
|
||||
for c in $SRC_CPP;do
|
||||
|
||||
135
src/unac/unac.c
135
src/unac/unac.c
@ -17,15 +17,57 @@
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#ifdef RECOLL_DATADIR
|
||||
#include "autoconfig.h"
|
||||
#else
|
||||
#include "config.h"
|
||||
#endif /* RECOLL */
|
||||
#endif /* HAVE_CONFIG_H */
|
||||
|
||||
#ifdef RECOLL_DATADIR
|
||||
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
|
||||
caught writing another binary search */
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::map;
|
||||
#include "smallut.h"
|
||||
|
||||
/*
|
||||
Storage for the exception translations. These are chars which
|
||||
should not be translated according to what UnicodeData says, but
|
||||
instead according to some local rule. There will usually be very
|
||||
few of them, but they must be looked up for every translated char.
|
||||
|
||||
We use a sorted vector for fastest elimination by binary search and
|
||||
a vector<string> to store the translations
|
||||
*/
|
||||
/* Exception characters (UTF-16 code units), kept sorted in ascending order. */
static std::vector<unsigned short> except_chars;
/* Translations, stored in the same order as except_chars. */
static std::vector<std::string> except_trans;

/*
 * Look up a character in the sorted exception list.
 *
 * Returns the position of c inside except_chars, or (size_t)-1 when c
 * is not an exception character.
 */
static inline size_t is_except_char(unsigned short c)
{
    const std::vector<unsigned short>::const_iterator first = except_chars.begin();
    const std::vector<unsigned short>::const_iterator last = except_chars.end();

    /* Binary search: pos is the first element which is not less than c */
    const std::vector<unsigned short>::const_iterator pos =
        std::lower_bound(first, last, c);

    const bool found = (pos != last) && (*pos == c);
    return found ? static_cast<size_t>(pos - first) : static_cast<size_t>(-1);
}
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
/*
|
||||
* If configure.in has not defined this symbol, assume const. It
|
||||
* does not harm much: a warning will be issued during compilation.
|
||||
*/
|
||||
#ifndef ICONV_CONST
|
||||
#ifdef RCL_ICONV_INBUF_CONST
|
||||
#define ICONV_CONST const
|
||||
#else
|
||||
#define ICONV_CONST
|
||||
#endif
|
||||
#endif /* ICONV_CONST */
|
||||
|
||||
#include <stdlib.h>
|
||||
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
char* out;
|
||||
int out_size;
|
||||
int out_length;
|
||||
int i;
|
||||
unsigned int i;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = realloc(out, out_size + 1);
|
||||
out = (char*)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
/*
|
||||
* Lookup the tables for decomposition information
|
||||
*/
|
||||
if (dofold) {
|
||||
unacfold_char_utf16(c, p, l);
|
||||
#ifdef RECOLL_DATADIR
|
||||
size_t idx;
|
||||
if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
|
||||
p = (unsigned short *)(except_trans[idx].c_str() + 2);
|
||||
l = (except_trans[idx].size() - 2) / 2;
|
||||
/* unsigned char *cp = (unsigned char *)p;
|
||||
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
|
||||
(unsigned int)cp[1]);*/
|
||||
} else {
|
||||
unac_char_utf16(c, p, l);
|
||||
#endif /* RECOLL_DATADIR */
|
||||
if (dofold) {
|
||||
unacfold_char_utf16(c, p, l);
|
||||
} else {
|
||||
unac_char_utf16(c, p, l);
|
||||
}
|
||||
#ifdef RECOLL_DATADIR
|
||||
}
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
/*
|
||||
* Explain what's done in great detail
|
||||
*/
|
||||
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
char *saved;
|
||||
out_size += ((l + 1) * 2) + 1024;
|
||||
saved = out;
|
||||
out = realloc(out, out_size);
|
||||
out = (char *)realloc(out, out_size);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size);
|
||||
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = realloc(out, out_size + 1);
|
||||
out = (char *)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
/* *outp still valid, no freeing */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
|
||||
{
|
||||
char *saved = out_base;
|
||||
/* +1 for null */
|
||||
out_base = realloc(out_base, out_size + 1);
|
||||
out_base = (char *)realloc(out_base, out_size + 1);
|
||||
if (out_base == 0) {
|
||||
/* *outp potentially not valid any more. Free here,
|
||||
* and zero out */
|
||||
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
|
||||
*/
|
||||
if (in_length <= 0) {
|
||||
if(!*outp) {
|
||||
if ((*outp = malloc(32)) == 0)
|
||||
if ((*outp = (char*)malloc(32)) == 0)
|
||||
return -1;
|
||||
}
|
||||
(*outp)[0] = '\0';
|
||||
@ -12975,3 +13031,64 @@ const char* unac_version(void)
|
||||
return UNAC_VERSION;
|
||||
}
|
||||
|
||||
#ifdef RECOLL_DATADIR
|
||||
void unac_set_except_translations(const char *spectrans)
|
||||
{
|
||||
except_chars.clear();
|
||||
except_trans.clear();
|
||||
if (!spectrans || !spectrans[0])
|
||||
return;
|
||||
|
||||
// The translation tables out of Unicode are in machine byte order (we
|
||||
// just let the compiler read the values).
|
||||
// For the translation part, we need to choose our encoding in accordance )
|
||||
// (16BE or 16LE depending on processor)
|
||||
// On the contrary, the source char is always to be compared to
|
||||
// the input text, which is encoded in UTF-16BE ... What a mess.
|
||||
static const char *machinecoding = 0;
|
||||
bool littleendian = true;
|
||||
if (machinecoding == 0) {
|
||||
const char* charshort = "\001\002";
|
||||
short *ip = (short *)charshort;
|
||||
if (*ip == 0x0102) {
|
||||
littleendian = false;
|
||||
machinecoding = "UTF-16BE";
|
||||
} else {
|
||||
littleendian = true;
|
||||
machinecoding = "UTF-16LE";
|
||||
}
|
||||
}
|
||||
|
||||
vector<string> vtrans;
|
||||
stringToStrings(spectrans, vtrans);
|
||||
|
||||
for (vector<string>::iterator it = vtrans.begin();
|
||||
it != vtrans.end(); it++) {
|
||||
|
||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||
char *out = 0;
|
||||
size_t outsize;
|
||||
if (convert("UTF-8", machinecoding,
|
||||
it->c_str(), it->size(),
|
||||
&out, &outsize) != 0 || outsize < 2)
|
||||
continue;
|
||||
|
||||
/* The source char must be utf-16be as this is what we convert the
|
||||
input text to for internal processing */
|
||||
unsigned short ch;
|
||||
if (littleendian)
|
||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||
else
|
||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||
|
||||
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
|
||||
except_chars.push_back(ch);
|
||||
// We keep ch as the first 2 bytes in the translation so that
|
||||
// both vectors sort identically
|
||||
except_trans.push_back(string((const char *)out, outsize));
|
||||
free(out);
|
||||
}
|
||||
std::sort(except_chars.begin(), except_chars.end());
|
||||
std::sort(except_trans.begin(), except_trans.end());
|
||||
}
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
|
||||
/* To be called before starting threads in mt programs */
|
||||
void unac_init_mt();
|
||||
|
||||
#ifdef RECOLL_DATADIR
|
||||
#include <string>
|
||||
/**
|
||||
* Set exceptions for unaccenting, for characters which should not be
|
||||
* handled according to what the Unicode tables say. For example "a
|
||||
* with circle above" should not be stripped to a in swedish, etc.
|
||||
*
|
||||
* @param spectrans defines the translations as a blank separated list of
|
||||
* UTF-8 strings. Inside each string, the first character is the exception,
|
||||
* the rest is the translation (which may be empty). You can use double
|
||||
* quotes for translations which should include white space. The double-quote
|
||||
* can't be an exception character, deal with it...
|
||||
*/
|
||||
void unac_set_except_translations(const char *spectrans);
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
/*
|
||||
* Return unac version number.
|
||||
*/
|
||||
|
||||
@ -15,6 +15,8 @@ daemSkippedPaths = \
|
||||
|
||||
defaultcharset = iso-8859-1
|
||||
|
||||
unac_except_trans = åå Åå ää Ää öö Öö
|
||||
|
||||
[/home/dockes/projets/fulltext/testrecoll/special]
|
||||
defaultcharset = iso-8859-1
|
||||
[/home/dockes/projets/fulltext/testrecoll/txt]
|
||||
|
||||
@ -8,7 +8,9 @@ initvariables $0
|
||||
(
|
||||
recollq 'Bienvenue Dans Univers De Lyx'
|
||||
recollq 'Welcome To Lyx'
|
||||
recollq 'Udvozli Ont A LyX'
|
||||
recollq 'LyX rendkivul jol dokumentalt'
|
||||
# cant use this one because o-diaeresis is in the swedish unacex
|
||||
# recollq 'Udvozli Ont A LyX'
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
|
||||
20
tests/unacex/unacex.sh
Executable file
20
tests/unacex/unacex.sh
Executable file
@ -0,0 +1,20 @@
|
||||
#!/bin/sh
|
||||
|
||||
topdir=`dirname $0`/..
|
||||
. $topdir/shared.sh
|
||||
|
||||
initvariables $0
|
||||
|
||||
# We need an utf-8 locale for the 1st command to properly read its argument
|
||||
export LANG=fr_FR.UTF-8
|
||||
|
||||
(
|
||||
# Should succeed
|
||||
recollq '"strippes: UNACEXååääöö"'
|
||||
# Should fail
|
||||
recollq '"strippes: UNACEXaaaaoo"'
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
|
||||
checkresult
|
||||
3
tests/unacex/unacex.txt
Normal file
3
tests/unacex/unacex.txt
Normal file
@ -0,0 +1,3 @@
|
||||
1 results
|
||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/unacex/swedishchars.html] [Some chars] 293 bytes
|
||||
0 results
|
||||
135
unac/unac.c
135
unac/unac.c
@ -17,15 +17,57 @@
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#ifdef RECOLL_DATADIR
|
||||
#include "autoconfig.h"
|
||||
#else
|
||||
#include "config.h"
|
||||
#endif /* RECOLL */
|
||||
#endif /* HAVE_CONFIG_H */
|
||||
|
||||
#ifdef RECOLL_DATADIR
|
||||
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
|
||||
caught writing another binary search */
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::map;
|
||||
#include "smallut.h"
|
||||
|
||||
/*
|
||||
Storage for the exception translations. These are chars which
|
||||
should not be translated according to what UnicodeData says, but
|
||||
instead according to some local rule. There will usually be very
|
||||
few of them, but they must be looked up for every translated char.
|
||||
|
||||
We use a sorted vector for fastest elimination by binary search and
|
||||
a vector<string> to store the translations
|
||||
*/
|
||||
/* Exception characters (UTF-16 code units), kept sorted in ascending order. */
static std::vector<unsigned short> except_chars;
/* Translations, stored in the same order as except_chars. */
static std::vector<std::string> except_trans;

/*
 * Look up a character in the sorted exception list.
 *
 * Returns the position of c inside except_chars, or (size_t)-1 when c
 * is not an exception character.
 */
static inline size_t is_except_char(unsigned short c)
{
    const std::vector<unsigned short>::const_iterator first = except_chars.begin();
    const std::vector<unsigned short>::const_iterator last = except_chars.end();

    /* Binary search: pos is the first element which is not less than c */
    const std::vector<unsigned short>::const_iterator pos =
        std::lower_bound(first, last, c);

    const bool found = (pos != last) && (*pos == c);
    return found ? static_cast<size_t>(pos - first) : static_cast<size_t>(-1);
}
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
/*
|
||||
* If configure.in has not defined this symbol, assume const. It
|
||||
* does not harm much: a warning will be issued during compilation.
|
||||
*/
|
||||
#ifndef ICONV_CONST
|
||||
#ifdef RCL_ICONV_INBUF_CONST
|
||||
#define ICONV_CONST const
|
||||
#else
|
||||
#define ICONV_CONST
|
||||
#endif
|
||||
#endif /* ICONV_CONST */
|
||||
|
||||
#include <stdlib.h>
|
||||
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
char* out;
|
||||
int out_size;
|
||||
int out_length;
|
||||
int i;
|
||||
unsigned int i;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = realloc(out, out_size + 1);
|
||||
out = (char*)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
/*
|
||||
* Lookup the tables for decomposition information
|
||||
*/
|
||||
if (dofold) {
|
||||
unacfold_char_utf16(c, p, l);
|
||||
#ifdef RECOLL_DATADIR
|
||||
size_t idx;
|
||||
if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
|
||||
p = (unsigned short *)(except_trans[idx].c_str() + 2);
|
||||
l = (except_trans[idx].size() - 2) / 2;
|
||||
/* unsigned char *cp = (unsigned char *)p;
|
||||
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
|
||||
(unsigned int)cp[1]);*/
|
||||
} else {
|
||||
unac_char_utf16(c, p, l);
|
||||
#endif /* RECOLL_DATADIR */
|
||||
if (dofold) {
|
||||
unacfold_char_utf16(c, p, l);
|
||||
} else {
|
||||
unac_char_utf16(c, p, l);
|
||||
}
|
||||
#ifdef RECOLL_DATADIR
|
||||
}
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
/*
|
||||
* Explain what's done in great detail
|
||||
*/
|
||||
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
char *saved;
|
||||
out_size += ((l + 1) * 2) + 1024;
|
||||
saved = out;
|
||||
out = realloc(out, out_size);
|
||||
out = (char *)realloc(out, out_size);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size);
|
||||
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = realloc(out, out_size + 1);
|
||||
out = (char *)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
/* *outp still valid, no freeing */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
|
||||
{
|
||||
char *saved = out_base;
|
||||
/* +1 for null */
|
||||
out_base = realloc(out_base, out_size + 1);
|
||||
out_base = (char *)realloc(out_base, out_size + 1);
|
||||
if (out_base == 0) {
|
||||
/* *outp potentially not valid any more. Free here,
|
||||
* and zero out */
|
||||
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
|
||||
*/
|
||||
if (in_length <= 0) {
|
||||
if(!*outp) {
|
||||
if ((*outp = malloc(32)) == 0)
|
||||
if ((*outp = (char*)malloc(32)) == 0)
|
||||
return -1;
|
||||
}
|
||||
(*outp)[0] = '\0';
|
||||
@ -12975,3 +13031,64 @@ const char* unac_version(void)
|
||||
return UNAC_VERSION;
|
||||
}
|
||||
|
||||
#ifdef RECOLL_DATADIR
|
||||
void unac_set_except_translations(const char *spectrans)
|
||||
{
|
||||
except_chars.clear();
|
||||
except_trans.clear();
|
||||
if (!spectrans || !spectrans[0])
|
||||
return;
|
||||
|
||||
// The translation tables out of Unicode are in machine byte order (we
|
||||
// just let the compiler read the values).
|
||||
// For the translation part, we need to choose our encoding in accordance )
|
||||
// (16BE or 16LE depending on processor)
|
||||
// On the contrary, the source char is always to be compared to
|
||||
// the input text, which is encoded in UTF-16BE ... What a mess.
|
||||
static const char *machinecoding = 0;
|
||||
bool littleendian = true;
|
||||
if (machinecoding == 0) {
|
||||
const char* charshort = "\001\002";
|
||||
short *ip = (short *)charshort;
|
||||
if (*ip == 0x0102) {
|
||||
littleendian = false;
|
||||
machinecoding = "UTF-16BE";
|
||||
} else {
|
||||
littleendian = true;
|
||||
machinecoding = "UTF-16LE";
|
||||
}
|
||||
}
|
||||
|
||||
vector<string> vtrans;
|
||||
stringToStrings(spectrans, vtrans);
|
||||
|
||||
for (vector<string>::iterator it = vtrans.begin();
|
||||
it != vtrans.end(); it++) {
|
||||
|
||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||
char *out = 0;
|
||||
size_t outsize;
|
||||
if (convert("UTF-8", machinecoding,
|
||||
it->c_str(), it->size(),
|
||||
&out, &outsize) != 0 || outsize < 2)
|
||||
continue;
|
||||
|
||||
/* The source char must be utf-16be as this is what we convert the
|
||||
input text to for internal processing */
|
||||
unsigned short ch;
|
||||
if (littleendian)
|
||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||
else
|
||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||
|
||||
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
|
||||
except_chars.push_back(ch);
|
||||
// We keep ch as the first 2 bytes in the translation so that
|
||||
// both vectors sort identically
|
||||
except_trans.push_back(string((const char *)out, outsize));
|
||||
free(out);
|
||||
}
|
||||
std::sort(except_chars.begin(), except_chars.end());
|
||||
std::sort(except_trans.begin(), except_trans.end());
|
||||
}
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
16
unac/unac.h
16
unac/unac.h
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
|
||||
/* To be called before starting threads in mt programs */
|
||||
void unac_init_mt();
|
||||
|
||||
#ifdef RECOLL_DATADIR
|
||||
#include <string>
|
||||
/**
|
||||
* Set exceptions for unaccenting, for characters which should not be
|
||||
* handled according to what the Unicode tables say. For example "a
|
||||
* with circle above" should not be stripped to a in swedish, etc.
|
||||
*
|
||||
* @param spectrans defines the translations as a blank separated list of
|
||||
* UTF-8 strings. Inside each string, the first character is the exception,
|
||||
* the rest is the translation (which may be empty). You can use double
|
||||
* quotes for translations which should include white space. The double-quote
|
||||
* can't be an exception character, deal with it...
|
||||
*/
|
||||
void unac_set_except_translations(const char *spectrans);
|
||||
#endif /* RECOLL_DATADIR */
|
||||
|
||||
/*
|
||||
* Return unac version number.
|
||||
*/
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user