Simplified the except_trans container; the previous method was buggy

This commit is contained in:
Jean-Francois Dockes 2012-09-20 13:46:09 +02:00
parent de4225e1ae
commit 1d2f93802f
2 changed files with 32 additions and 60 deletions

View File

@ -31,9 +31,9 @@
#include <map>
#include <string>
#include <algorithm>
#include <tr1/unordered_map>
using std::string;
using std::vector;
using std::map;
using std::tr1::unordered_map;
#include "smallut.h"
/*
@ -41,20 +41,16 @@ using std::map;
should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/
static vector<unsigned short> except_chars;
static vector<string> except_trans;
static inline size_t is_except_char(unsigned short c)
unordered_map<unsigned short, string> except_trans;
static inline bool is_except_char(unsigned short c, string& trans)
{
vector<unsigned short>::iterator it =
std::lower_bound(except_chars.begin(), except_chars.end(), c);
if (it == except_chars.end() || *it != c) {
return (size_t(-1));
}
return std::distance(except_chars.begin(), it);
unordered_map<unsigned short, string>::const_iterator it
= except_trans.find(c);
if (it == except_trans.end())
return false;
trans = it->second;
return true;
}
#endif /* RECOLL_DATADIR */
@ -12715,21 +12711,18 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
// - unaccenting: do nothing (copy original char)
// - unac+fold: use table
// - fold: use the unicode data.
size_t idx;
if (what != UNAC_FOLD && except_chars.size() != 0 &&
(idx=is_except_char(c)) != (size_t)-1) {
string trans;
if (what != UNAC_FOLD && except_trans.size() != 0 &&
is_except_char(c, trans)) {
if (what == UNAC_UNAC) {
// Unaccent only. Do nothing
p = 0;
l = 0;
} else {
// Has to be UNAC_UNACFOLD: use table
p = (unsigned short *)(except_trans[idx].c_str() + 2);
l = (except_trans[idx].size() - 2) / 2;
p = (unsigned short *)trans.c_str();
l = trans.size() / 2;
}
/* if (p) {unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);}*/
} else {
#endif /* RECOLL_DATADIR */
unac_uf_char_utf16_(c, p, l, what)
@ -13076,7 +13069,6 @@ const char* unac_version(void)
#ifdef RECOLL_DATADIR
void unac_set_except_translations(const char *spectrans)
{
except_chars.clear();
except_trans.clear();
if (!spectrans || !spectrans[0])
return;
@ -13123,14 +13115,8 @@ void unac_set_except_translations(const char *spectrans)
else
ch = (out[0] << 8) | (out[1] & 0xff);
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
except_chars.push_back(ch);
// We keep ch as the first 2 bytes in the translation so that
// both vectors sort identically
except_trans.push_back(string((const char *)out, outsize));
except_trans[ch] = string((const char *)(out + 2), outsize-2);
free(out);
}
std::sort(except_chars.begin(), except_chars.end());
std::sort(except_trans.begin(), except_trans.end());
}
#endif /* RECOLL_DATADIR */

View File

@ -31,9 +31,9 @@
#include <map>
#include <string>
#include <algorithm>
#include <tr1/unordered_map>
using std::string;
using std::vector;
using std::map;
using std::tr1::unordered_map;
#include "smallut.h"
/*
@ -41,20 +41,16 @@ using std::map;
should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/
static vector<unsigned short> except_chars;
static vector<string> except_trans;
static inline size_t is_except_char(unsigned short c)
unordered_map<unsigned short, string> except_trans;
static inline bool is_except_char(unsigned short c, string& trans)
{
vector<unsigned short>::iterator it =
std::lower_bound(except_chars.begin(), except_chars.end(), c);
if (it == except_chars.end() || *it != c) {
return (size_t(-1));
}
return std::distance(except_chars.begin(), it);
unordered_map<unsigned short, string>::const_iterator it
= except_trans.find(c);
if (it == except_trans.end())
return false;
trans = it->second;
return true;
}
#endif /* RECOLL_DATADIR */
@ -12715,21 +12711,18 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
// - unaccenting: do nothing (copy original char)
// - unac+fold: use table
// - fold: use the unicode data.
size_t idx;
if (what != UNAC_FOLD && except_chars.size() != 0 &&
(idx=is_except_char(c)) != (size_t)-1) {
string trans;
if (what != UNAC_FOLD && except_trans.size() != 0 &&
is_except_char(c, trans)) {
if (what == UNAC_UNAC) {
// Unaccent only. Do nothing
p = 0;
l = 0;
} else {
// Has to be UNAC_UNACFOLD: use table
p = (unsigned short *)(except_trans[idx].c_str() + 2);
l = (except_trans[idx].size() - 2) / 2;
p = (unsigned short *)trans.c_str();
l = trans.size() / 2;
}
/* if (p) {unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);}*/
} else {
#endif /* RECOLL_DATADIR */
unac_uf_char_utf16_(c, p, l, what)
@ -13076,7 +13069,6 @@ const char* unac_version(void)
#ifdef RECOLL_DATADIR
void unac_set_except_translations(const char *spectrans)
{
except_chars.clear();
except_trans.clear();
if (!spectrans || !spectrans[0])
return;
@ -13123,14 +13115,8 @@ void unac_set_except_translations(const char *spectrans)
else
ch = (out[0] << 8) | (out[1] & 0xff);
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
except_chars.push_back(ch);
// We keep ch as the first 2 bytes in the translation so that
// both vectors sort identically
except_trans.push_back(string((const char *)out, outsize));
except_trans[ch] = string((const char *)(out + 2), outsize-2);
free(out);
}
std::sort(except_chars.begin(), except_chars.end());
std::sort(except_trans.begin(), except_trans.end());
}
#endif /* RECOLL_DATADIR */