Added a configuration parameter to set specific unaccenting/lowercasing rules for some characters, so that they are handled differently from what the Unicode database would yield. Example: "a with ring above" could be set to be preserved by a Swedish speaker
This commit is contained in:
parent
29ef5bd143
commit
a4c17941b1
@ -107,9 +107,15 @@ RclConfig *recollinit(RclInitFlags flags,
|
|||||||
// Make sure the locale charset is initialized (so that multiple
|
// Make sure the locale charset is initialized (so that multiple
|
||||||
// threads don't try to do it at once).
|
// threads don't try to do it at once).
|
||||||
config->getDefCharset();
|
config->getDefCharset();
|
||||||
|
|
||||||
// Init unac locking
|
// Init unac locking
|
||||||
unac_init_mt();
|
unac_init_mt();
|
||||||
|
|
||||||
|
// Init Unac translation exceptions
|
||||||
|
string unacex;
|
||||||
|
if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty())
|
||||||
|
unac_set_except_translations(unacex.c_str());
|
||||||
|
|
||||||
int flushmb;
|
int flushmb;
|
||||||
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
|
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
|
||||||
LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
|
LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
|
||||||
|
|||||||
@ -88,6 +88,7 @@ using namespace std;
|
|||||||
|
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
|
#include "rclinit.h"
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@ -98,8 +99,13 @@ int main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
const char *encoding = argv[1];
|
const char *encoding = argv[1];
|
||||||
string ifn = argv[2];
|
string ifn = argv[2];
|
||||||
|
if (!ifn.compare("stdin"))
|
||||||
|
ifn.clear();
|
||||||
const char *ofn = argv[3];
|
const char *ofn = argv[3];
|
||||||
|
|
||||||
|
string reason;
|
||||||
|
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
||||||
|
|
||||||
string odata;
|
string odata;
|
||||||
if (!file_to_string(ifn, odata)) {
|
if (!file_to_string(ifn, odata)) {
|
||||||
cerr << "file_to_string: " << odata << endl;
|
cerr << "file_to_string: " << odata << endl;
|
||||||
@ -111,7 +117,12 @@ int main(int argc, char **argv)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
int fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
int fd;
|
||||||
|
if (strcmp(ofn, "stdout")) {
|
||||||
|
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
||||||
|
} else {
|
||||||
|
fd = 1;
|
||||||
|
}
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
||||||
<< endl;
|
<< endl;
|
||||||
|
|||||||
@ -34,6 +34,9 @@
|
|||||||
<para>This document introduces full text search notions
|
<para>This document introduces full text search notions
|
||||||
and describes the installation and use of the &RCL;
|
and describes the installation and use of the &RCL;
|
||||||
application. It currently describes &RCL; &RCLVERSION;.</para>
|
application. It currently describes &RCL; &RCLVERSION;.</para>
|
||||||
|
<!-- <para>[ <ulink url="index.html">Split HTML</ulink> /
|
||||||
|
<ulink url="usermanual-xml.html">Single HTML</ulink> ]</para>
|
||||||
|
-->
|
||||||
</abstract>
|
</abstract>
|
||||||
|
|
||||||
|
|
||||||
@ -3849,6 +3852,32 @@ skippedPaths = ~/somedir/∗.txt
|
|||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry><term><literal>unac_except_trans</literal></term>
|
||||||
|
<listitem><para>This is a list of characters which should be
|
||||||
|
handled specially when converting text to unaccented lowercase.
|
||||||
|
For example, in Swedish, the letter <literal>a with diaeresis
|
||||||
|
</literal> has full alphabet citizenship and should not be
|
||||||
|
turned into an <literal>a</literal>. Each element in the
|
||||||
|
space-separated list has the special character as first element
|
||||||
|
and the translation following. The handling of both the
|
||||||
|
lowercase and upper-case versions of a character should be
|
||||||
|
specified, as membership in the list will turn off both
|
||||||
|
standard accent and case processing. Example for Swedish:</para>
|
||||||
|
<programlisting>
|
||||||
|
unac_except_trans = åå Åå ää Ää öö Öö
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
<para>Note that the translation is not limited to a single
|
||||||
|
character, you could very well have something like
|
||||||
|
<literal>üue</literal> in the list.</para>
|
||||||
|
|
||||||
|
<para>This parameter can't be defined for subdirectories, it
|
||||||
|
is global, because there is no way to do otherwise when
|
||||||
|
querying. If you have document sets which would need different
|
||||||
|
values, you will have to index and query them separately.</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
<varlistentry><term><literal>maildefcharset</literal></term>
|
<varlistentry><term><literal>maildefcharset</literal></term>
|
||||||
<listitem><para>This can be used to define the default
|
<listitem><para>This can be used to define the default
|
||||||
character set specifically for email messages which don't
|
character set specifically for email messages which don't
|
||||||
|
|||||||
@ -14,6 +14,9 @@
|
|||||||
# Wherever docbook.xsl and chunk.xsl live
|
# Wherever docbook.xsl and chunk.xsl live
|
||||||
XSLDIR="/usr/local/share/xsl/docbook/"
|
XSLDIR="/usr/local/share/xsl/docbook/"
|
||||||
|
|
||||||
|
dochunky=1
|
||||||
|
test $# -eq 1 && dochunky=0
|
||||||
|
|
||||||
# Remove the SGML header and uncomment the XML one
|
# Remove the SGML header and uncomment the XML one
|
||||||
sed -e '\!//FreeBSD//DTD!d' \
|
sed -e '\!//FreeBSD//DTD!d' \
|
||||||
-e '\!DTD DocBook XML!s/<!--//' \
|
-e '\!DTD DocBook XML!s/<!--//' \
|
||||||
@ -31,7 +34,7 @@ commonoptions="--stringparam section.autolabel 1 \
|
|||||||
"
|
"
|
||||||
|
|
||||||
# Do the chunky thing
|
# Do the chunky thing
|
||||||
eval xsltproc $commonoptions \
|
test $dochunky -ne 0 && eval xsltproc $commonoptions \
|
||||||
--stringparam use.id.as.filename 1 \
|
--stringparam use.id.as.filename 1 \
|
||||||
"$XSLDIR/html/chunk.xsl" \
|
"$XSLDIR/html/chunk.xsl" \
|
||||||
usermanual.xml
|
usermanual.xml
|
||||||
|
|||||||
@ -14,7 +14,7 @@ librcl.a : $(DEPS) $(OBJS) unac.o
|
|||||||
$(RANLIB) librcl.a
|
$(RANLIB) librcl.a
|
||||||
|
|
||||||
unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $(depth)/mk/localdefs
|
unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $(depth)/mk/localdefs
|
||||||
$(CC) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
|
$(CXX) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
|
||||||
rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
|
rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp
|
||||||
beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs
|
beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs
|
||||||
|
|||||||
@ -121,7 +121,7 @@ librcl.a : \$(DEPS) \$(OBJS) unac.o
|
|||||||
\$(RANLIB) librcl.a
|
\$(RANLIB) librcl.a
|
||||||
|
|
||||||
unac.o : \$(depth)/unac/unac.c \$(depth)/unac/unac.h $defs
|
unac.o : \$(depth)/unac/unac.c \$(depth)/unac/unac.h $defs
|
||||||
\$(CC) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
|
\$(CXX) \$(ALL_CXXFLAGS) -c \$(depth)/unac/unac.c
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
for c in $SRC_CPP;do
|
for c in $SRC_CPP;do
|
||||||
|
|||||||
135
src/unac/unac.c
135
src/unac/unac.c
@ -17,15 +17,57 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef HAVE_CONFIG_H
|
#ifdef HAVE_CONFIG_H
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
#include "autoconfig.h"
|
||||||
|
#else
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
#endif /* RECOLL */
|
||||||
#endif /* HAVE_CONFIG_H */
|
#endif /* HAVE_CONFIG_H */
|
||||||
|
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
|
||||||
|
caught writing another binary search */
|
||||||
|
#include <vector>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <algorithm>
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
using std::map;
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
Storage for the exception translations. These are chars which
|
||||||
|
should not be translated according to what UnicodeData says, but
|
||||||
|
instead according to some local rule. There will usually be very
|
||||||
|
few of them, but they must be looked up for every translated char.
|
||||||
|
|
||||||
|
We use a sorted vector for fastest elimination by binary search and
|
||||||
|
a vector<string> to store the translations
|
||||||
|
*/
|
||||||
|
static vector<unsigned short> except_chars;
|
||||||
|
static vector<string> except_trans;
|
||||||
|
static inline size_t is_except_char(unsigned short c)
|
||||||
|
{
|
||||||
|
vector<unsigned short>::iterator it =
|
||||||
|
std::lower_bound(except_chars.begin(), except_chars.end(), c);
|
||||||
|
if (it == except_chars.end() || *it != c) {
|
||||||
|
return (size_t(-1));
|
||||||
|
}
|
||||||
|
return std::distance(except_chars.begin(), it);
|
||||||
|
}
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If configure.in has not defined this symbol, assume const. It
|
* If configure.in has not defined this symbol, assume const. It
|
||||||
* does not harm much: a warning will be issued during compilation.
|
* does not harm much: a warning will be issued during compilation.
|
||||||
*/
|
*/
|
||||||
#ifndef ICONV_CONST
|
#ifndef ICONV_CONST
|
||||||
|
#ifdef RCL_ICONV_INBUF_CONST
|
||||||
|
#define ICONV_CONST const
|
||||||
|
#else
|
||||||
#define ICONV_CONST
|
#define ICONV_CONST
|
||||||
|
#endif
|
||||||
#endif /* ICONV_CONST */
|
#endif /* ICONV_CONST */
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
|||||||
char* out;
|
char* out;
|
||||||
int out_size;
|
int out_size;
|
||||||
int out_length;
|
int out_length;
|
||||||
int i;
|
unsigned int i;
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
|
|
||||||
out = *outp;
|
out = *outp;
|
||||||
out = realloc(out, out_size + 1);
|
out = (char*)realloc(out, out_size + 1);
|
||||||
if(out == 0) {
|
if(out == 0) {
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
|||||||
/*
|
/*
|
||||||
* Lookup the tables for decomposition information
|
* Lookup the tables for decomposition information
|
||||||
*/
|
*/
|
||||||
if (dofold) {
|
#ifdef RECOLL_DATADIR
|
||||||
unacfold_char_utf16(c, p, l);
|
size_t idx;
|
||||||
|
if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
|
||||||
|
p = (unsigned short *)(except_trans[idx].c_str() + 2);
|
||||||
|
l = (except_trans[idx].size() - 2) / 2;
|
||||||
|
/* unsigned char *cp = (unsigned char *)p;
|
||||||
|
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
|
||||||
|
(unsigned int)cp[1]);*/
|
||||||
} else {
|
} else {
|
||||||
unac_char_utf16(c, p, l);
|
#endif /* RECOLL_DATADIR */
|
||||||
|
if (dofold) {
|
||||||
|
unacfold_char_utf16(c, p, l);
|
||||||
|
} else {
|
||||||
|
unac_char_utf16(c, p, l);
|
||||||
|
}
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
}
|
}
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Explain what's done in great detail
|
* Explain what's done in great detail
|
||||||
*/
|
*/
|
||||||
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
|||||||
char *saved;
|
char *saved;
|
||||||
out_size += ((l + 1) * 2) + 1024;
|
out_size += ((l + 1) * 2) + 1024;
|
||||||
saved = out;
|
saved = out;
|
||||||
out = realloc(out, out_size);
|
out = (char *)realloc(out, out_size);
|
||||||
if(out == 0) {
|
if(out == 0) {
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
DEBUG("realloc %d bytes failed\n", out_size);
|
DEBUG("realloc %d bytes failed\n", out_size);
|
||||||
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
|
|||||||
out_size = in_length > 0 ? in_length : 1024;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
|
|
||||||
out = *outp;
|
out = *outp;
|
||||||
out = realloc(out, out_size + 1);
|
out = (char *)realloc(out, out_size + 1);
|
||||||
if(out == 0) {
|
if(out == 0) {
|
||||||
/* *outp still valid, no freeing */
|
/* *outp still valid, no freeing */
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
|
|||||||
{
|
{
|
||||||
char *saved = out_base;
|
char *saved = out_base;
|
||||||
/* +1 for null */
|
/* +1 for null */
|
||||||
out_base = realloc(out_base, out_size + 1);
|
out_base = (char *)realloc(out_base, out_size + 1);
|
||||||
if (out_base == 0) {
|
if (out_base == 0) {
|
||||||
/* *outp potentially not valid any more. Free here,
|
/* *outp potentially not valid any more. Free here,
|
||||||
* and zero out */
|
* and zero out */
|
||||||
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
|
|||||||
*/
|
*/
|
||||||
if (in_length <= 0) {
|
if (in_length <= 0) {
|
||||||
if(!*outp) {
|
if(!*outp) {
|
||||||
if ((*outp = malloc(32)) == 0)
|
if ((*outp = (char*)malloc(32)) == 0)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
(*outp)[0] = '\0';
|
(*outp)[0] = '\0';
|
||||||
@ -12975,3 +13031,64 @@ const char* unac_version(void)
|
|||||||
return UNAC_VERSION;
|
return UNAC_VERSION;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
void unac_set_except_translations(const char *spectrans)
|
||||||
|
{
|
||||||
|
except_chars.clear();
|
||||||
|
except_trans.clear();
|
||||||
|
if (!spectrans || !spectrans[0])
|
||||||
|
return;
|
||||||
|
|
||||||
|
// The translation tables out of Unicode are in machine byte order (we
|
||||||
|
// just let the compiler read the values).
|
||||||
|
// For the translation part, we need to choose our encoding in accordance )
|
||||||
|
// (16BE or 16LE depending on processor)
|
||||||
|
// On the contrary, the source char is always to be compared to
|
||||||
|
// the input text, which is encoded in UTF-16BE ... What a mess.
|
||||||
|
static const char *machinecoding = 0;
|
||||||
|
bool littleendian = true;
|
||||||
|
if (machinecoding == 0) {
|
||||||
|
const char* charshort = "\001\002";
|
||||||
|
short *ip = (short *)charshort;
|
||||||
|
if (*ip == 0x0102) {
|
||||||
|
littleendian = false;
|
||||||
|
machinecoding = "UTF-16BE";
|
||||||
|
} else {
|
||||||
|
littleendian = true;
|
||||||
|
machinecoding = "UTF-16LE";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<string> vtrans;
|
||||||
|
stringToStrings(spectrans, vtrans);
|
||||||
|
|
||||||
|
for (vector<string>::iterator it = vtrans.begin();
|
||||||
|
it != vtrans.end(); it++) {
|
||||||
|
|
||||||
|
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||||
|
char *out = 0;
|
||||||
|
size_t outsize;
|
||||||
|
if (convert("UTF-8", machinecoding,
|
||||||
|
it->c_str(), it->size(),
|
||||||
|
&out, &outsize) != 0 || outsize < 2)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* The source char must be utf-16be as this is what we convert the
|
||||||
|
input text to for internal processing */
|
||||||
|
unsigned short ch;
|
||||||
|
if (littleendian)
|
||||||
|
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||||
|
else
|
||||||
|
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||||
|
|
||||||
|
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
|
||||||
|
except_chars.push_back(ch);
|
||||||
|
// We keep ch as the first 2 bytes in the translation so that
|
||||||
|
// both vectors sort identically
|
||||||
|
except_trans.push_back(string((const char *)out, outsize));
|
||||||
|
free(out);
|
||||||
|
}
|
||||||
|
std::sort(except_chars.begin(), except_chars.end());
|
||||||
|
std::sort(except_trans.begin(), except_trans.end());
|
||||||
|
}
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|||||||
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
|
|||||||
/* To be called before starting threads in mt programs */
|
/* To be called before starting threads in mt programs */
|
||||||
void unac_init_mt();
|
void unac_init_mt();
|
||||||
|
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
#include <string>
|
||||||
|
/**
|
||||||
|
* Set exceptions for unaccenting, for characters which should not be
|
||||||
|
* handled according to what the Unicode tables say. For example "a
|
||||||
|
* with circle above" should not be stripped to a in swedish, etc.
|
||||||
|
*
|
||||||
|
* @param spectrans defines the translations as a blank separated list of
|
||||||
|
* UTF-8 strings. Inside each string, the first character is the exception
|
||||||
|
* the rest is the translation (which may be empty). You can use double
|
||||||
|
* quotes for translations which should include white space. The double-quote
|
||||||
|
* can't be an exception character, deal with it...
|
||||||
|
*/
|
||||||
|
void unac_set_except_translations(const char *spectrans);
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return unac version number.
|
* Return unac version number.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -15,6 +15,8 @@ daemSkippedPaths = \
|
|||||||
|
|
||||||
defaultcharset = iso-8859-1
|
defaultcharset = iso-8859-1
|
||||||
|
|
||||||
|
unac_except_trans = åå Åå ää Ää öö Öö
|
||||||
|
|
||||||
[/home/dockes/projets/fulltext/testrecoll/special]
|
[/home/dockes/projets/fulltext/testrecoll/special]
|
||||||
defaultcharset = iso-8859-1
|
defaultcharset = iso-8859-1
|
||||||
[/home/dockes/projets/fulltext/testrecoll/txt]
|
[/home/dockes/projets/fulltext/testrecoll/txt]
|
||||||
|
|||||||
@ -8,7 +8,9 @@ initvariables $0
|
|||||||
(
|
(
|
||||||
recollq 'Bienvenue Dans Univers De Lyx'
|
recollq 'Bienvenue Dans Univers De Lyx'
|
||||||
recollq 'Welcome To Lyx'
|
recollq 'Welcome To Lyx'
|
||||||
recollq 'Udvozli Ont A LyX'
|
recollq 'LyX rendkivul jol dokumentalt'
|
||||||
|
# cant use this one because o-diaeresis is in the swedish unacex
|
||||||
|
# recollq 'Udvozli Ont A LyX'
|
||||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|||||||
20
tests/unacex/unacex.sh
Executable file
20
tests/unacex/unacex.sh
Executable file
@ -0,0 +1,20 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
topdir=`dirname $0`/..
|
||||||
|
. $topdir/shared.sh
|
||||||
|
|
||||||
|
initvariables $0
|
||||||
|
|
||||||
|
# We need an utf-8 locale for the 1st command to properly read its argument
|
||||||
|
export LANG=fr_FR.UTF-8
|
||||||
|
|
||||||
|
(
|
||||||
|
# Should succeed
|
||||||
|
recollq '"strippes: UNACEXååääöö"'
|
||||||
|
# Should fail
|
||||||
|
recollq '"strippes: UNACEXaaaaoo"'
|
||||||
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|
||||||
|
checkresult
|
||||||
3
tests/unacex/unacex.txt
Normal file
3
tests/unacex/unacex.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
1 results
|
||||||
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/unacex/swedishchars.html] [Some chars] 293 bytes
|
||||||
|
0 results
|
||||||
135
unac/unac.c
135
unac/unac.c
@ -17,15 +17,57 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef HAVE_CONFIG_H
|
#ifdef HAVE_CONFIG_H
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
#include "autoconfig.h"
|
||||||
|
#else
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
#endif /* RECOLL */
|
||||||
#endif /* HAVE_CONFIG_H */
|
#endif /* HAVE_CONFIG_H */
|
||||||
|
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
|
||||||
|
caught writing another binary search */
|
||||||
|
#include <vector>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <algorithm>
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
using std::map;
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
Storage for the exception translations. These are chars which
|
||||||
|
should not be translated according to what UnicodeData says, but
|
||||||
|
instead according to some local rule. There will usually be very
|
||||||
|
few of them, but they must be looked up for every translated char.
|
||||||
|
|
||||||
|
We use a sorted vector for fastest elimination by binary search and
|
||||||
|
a vector<string> to store the translations
|
||||||
|
*/
|
||||||
|
static vector<unsigned short> except_chars;
|
||||||
|
static vector<string> except_trans;
|
||||||
|
static inline size_t is_except_char(unsigned short c)
|
||||||
|
{
|
||||||
|
vector<unsigned short>::iterator it =
|
||||||
|
std::lower_bound(except_chars.begin(), except_chars.end(), c);
|
||||||
|
if (it == except_chars.end() || *it != c) {
|
||||||
|
return (size_t(-1));
|
||||||
|
}
|
||||||
|
return std::distance(except_chars.begin(), it);
|
||||||
|
}
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If configure.in has not defined this symbol, assume const. It
|
* If configure.in has not defined this symbol, assume const. It
|
||||||
* does not harm much: a warning will be issued during compilation.
|
* does not harm much: a warning will be issued during compilation.
|
||||||
*/
|
*/
|
||||||
#ifndef ICONV_CONST
|
#ifndef ICONV_CONST
|
||||||
|
#ifdef RCL_ICONV_INBUF_CONST
|
||||||
|
#define ICONV_CONST const
|
||||||
|
#else
|
||||||
#define ICONV_CONST
|
#define ICONV_CONST
|
||||||
|
#endif
|
||||||
#endif /* ICONV_CONST */
|
#endif /* ICONV_CONST */
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -12622,12 +12664,12 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
|||||||
char* out;
|
char* out;
|
||||||
int out_size;
|
int out_size;
|
||||||
int out_length;
|
int out_length;
|
||||||
int i;
|
unsigned int i;
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
|
|
||||||
out = *outp;
|
out = *outp;
|
||||||
out = realloc(out, out_size + 1);
|
out = (char*)realloc(out, out_size + 1);
|
||||||
if(out == 0) {
|
if(out == 0) {
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
@ -12646,11 +12688,25 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
|||||||
/*
|
/*
|
||||||
* Lookup the tables for decomposition information
|
* Lookup the tables for decomposition information
|
||||||
*/
|
*/
|
||||||
if (dofold) {
|
#ifdef RECOLL_DATADIR
|
||||||
unacfold_char_utf16(c, p, l);
|
size_t idx;
|
||||||
|
if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
|
||||||
|
p = (unsigned short *)(except_trans[idx].c_str() + 2);
|
||||||
|
l = (except_trans[idx].size() - 2) / 2;
|
||||||
|
/* unsigned char *cp = (unsigned char *)p;
|
||||||
|
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
|
||||||
|
(unsigned int)cp[1]);*/
|
||||||
} else {
|
} else {
|
||||||
unac_char_utf16(c, p, l);
|
#endif /* RECOLL_DATADIR */
|
||||||
|
if (dofold) {
|
||||||
|
unacfold_char_utf16(c, p, l);
|
||||||
|
} else {
|
||||||
|
unac_char_utf16(c, p, l);
|
||||||
|
}
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
}
|
}
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Explain what's done in great detail
|
* Explain what's done in great detail
|
||||||
*/
|
*/
|
||||||
@ -12678,7 +12734,7 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
|||||||
char *saved;
|
char *saved;
|
||||||
out_size += ((l + 1) * 2) + 1024;
|
out_size += ((l + 1) * 2) + 1024;
|
||||||
saved = out;
|
saved = out;
|
||||||
out = realloc(out, out_size);
|
out = (char *)realloc(out, out_size);
|
||||||
if(out == 0) {
|
if(out == 0) {
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
DEBUG("realloc %d bytes failed\n", out_size);
|
DEBUG("realloc %d bytes failed\n", out_size);
|
||||||
@ -12798,7 +12854,7 @@ static int convert(const char* from, const char* to,
|
|||||||
out_size = in_length > 0 ? in_length : 1024;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
|
|
||||||
out = *outp;
|
out = *outp;
|
||||||
out = realloc(out, out_size + 1);
|
out = (char *)realloc(out, out_size + 1);
|
||||||
if(out == 0) {
|
if(out == 0) {
|
||||||
/* *outp still valid, no freeing */
|
/* *outp still valid, no freeing */
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
@ -12884,7 +12940,7 @@ static int convert(const char* from, const char* to,
|
|||||||
{
|
{
|
||||||
char *saved = out_base;
|
char *saved = out_base;
|
||||||
/* +1 for null */
|
/* +1 for null */
|
||||||
out_base = realloc(out_base, out_size + 1);
|
out_base = (char *)realloc(out_base, out_size + 1);
|
||||||
if (out_base == 0) {
|
if (out_base == 0) {
|
||||||
/* *outp potentially not valid any more. Free here,
|
/* *outp potentially not valid any more. Free here,
|
||||||
* and zero out */
|
* and zero out */
|
||||||
@ -12929,7 +12985,7 @@ int unacmaybefold_string(const char* charset,
|
|||||||
*/
|
*/
|
||||||
if (in_length <= 0) {
|
if (in_length <= 0) {
|
||||||
if(!*outp) {
|
if(!*outp) {
|
||||||
if ((*outp = malloc(32)) == 0)
|
if ((*outp = (char*)malloc(32)) == 0)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
(*outp)[0] = '\0';
|
(*outp)[0] = '\0';
|
||||||
@ -12975,3 +13031,64 @@ const char* unac_version(void)
|
|||||||
return UNAC_VERSION;
|
return UNAC_VERSION;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
void unac_set_except_translations(const char *spectrans)
|
||||||
|
{
|
||||||
|
except_chars.clear();
|
||||||
|
except_trans.clear();
|
||||||
|
if (!spectrans || !spectrans[0])
|
||||||
|
return;
|
||||||
|
|
||||||
|
// The translation tables out of Unicode are in machine byte order (we
|
||||||
|
// just let the compiler read the values).
|
||||||
|
// For the translation part, we need to choose our encoding in accordance )
|
||||||
|
// (16BE or 16LE depending on processor)
|
||||||
|
// On the contrary, the source char is always to be compared to
|
||||||
|
// the input text, which is encoded in UTF-16BE ... What a mess.
|
||||||
|
static const char *machinecoding = 0;
|
||||||
|
bool littleendian = true;
|
||||||
|
if (machinecoding == 0) {
|
||||||
|
const char* charshort = "\001\002";
|
||||||
|
short *ip = (short *)charshort;
|
||||||
|
if (*ip == 0x0102) {
|
||||||
|
littleendian = false;
|
||||||
|
machinecoding = "UTF-16BE";
|
||||||
|
} else {
|
||||||
|
littleendian = true;
|
||||||
|
machinecoding = "UTF-16LE";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<string> vtrans;
|
||||||
|
stringToStrings(spectrans, vtrans);
|
||||||
|
|
||||||
|
for (vector<string>::iterator it = vtrans.begin();
|
||||||
|
it != vtrans.end(); it++) {
|
||||||
|
|
||||||
|
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||||
|
char *out = 0;
|
||||||
|
size_t outsize;
|
||||||
|
if (convert("UTF-8", machinecoding,
|
||||||
|
it->c_str(), it->size(),
|
||||||
|
&out, &outsize) != 0 || outsize < 2)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* The source char must be utf-16be as this is what we convert the
|
||||||
|
input text to for internal processing */
|
||||||
|
unsigned short ch;
|
||||||
|
if (littleendian)
|
||||||
|
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||||
|
else
|
||||||
|
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||||
|
|
||||||
|
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
|
||||||
|
except_chars.push_back(ch);
|
||||||
|
// We keep ch as the first 2 bytes in the translation so that
|
||||||
|
// both vectors sort identically
|
||||||
|
except_trans.push_back(string((const char *)out, outsize));
|
||||||
|
free(out);
|
||||||
|
}
|
||||||
|
std::sort(except_chars.begin(), except_chars.end());
|
||||||
|
std::sort(except_trans.begin(), except_trans.end());
|
||||||
|
}
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|||||||
16
unac/unac.h
16
unac/unac.h
@ -116,6 +116,22 @@ int unacfold_string(const char* charset,
|
|||||||
/* To be called before starting threads in mt programs */
|
/* To be called before starting threads in mt programs */
|
||||||
void unac_init_mt();
|
void unac_init_mt();
|
||||||
|
|
||||||
|
#ifdef RECOLL_DATADIR
|
||||||
|
#include <string>
|
||||||
|
/**
|
||||||
|
* Set exceptions for unaccenting, for characters which should not be
|
||||||
|
* handled according to what the Unicode tables say. For example "a
|
||||||
|
* with circle above" should not be stripped to a in swedish, etc.
|
||||||
|
*
|
||||||
|
* @param spectrans defines the translations as a blank separated list of
|
||||||
|
* UTF-8 strings. Inside each string, the first character is the exception
|
||||||
|
* the rest is the translation (which may be empty). You can use double
|
||||||
|
* quotes for translations which should include white space. The double-quote
|
||||||
|
* can't be an exception character, deal with it...
|
||||||
|
*/
|
||||||
|
void unac_set_except_translations(const char *spectrans);
|
||||||
|
#endif /* RECOLL_DATADIR */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return unac version number.
|
* Return unac version number.
|
||||||
*/
|
*/
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user