simplified the except_trans container, previous method was buggy

This commit is contained in:
Jean-Francois Dockes 2012-09-20 13:46:09 +02:00
parent de4225e1ae
commit 1d2f93802f
2 changed files with 32 additions and 60 deletions

View File

@ -31,9 +31,9 @@
#include <map> #include <map>
#include <string> #include <string>
#include <algorithm> #include <algorithm>
#include <tr1/unordered_map>
using std::string; using std::string;
using std::vector; using std::tr1::unordered_map;
using std::map;
#include "smallut.h" #include "smallut.h"
/* /*
@ -41,20 +41,16 @@ using std::map;
should not be translated according to what UnicodeData says, but should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char. few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/ */
static vector<unsigned short> except_chars; unordered_map<unsigned short, string> except_trans;
static vector<string> except_trans; static inline bool is_except_char(unsigned short c, string& trans)
static inline size_t is_except_char(unsigned short c)
{ {
vector<unsigned short>::iterator it = unordered_map<unsigned short, string>::const_iterator it
std::lower_bound(except_chars.begin(), except_chars.end(), c); = except_trans.find(c);
if (it == except_chars.end() || *it != c) { if (it == except_trans.end())
return (size_t(-1)); return false;
} trans = it->second;
return std::distance(except_chars.begin(), it); return true;
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
@ -12715,21 +12711,18 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
// - unaccenting: do nothing (copy original char) // - unaccenting: do nothing (copy original char)
// - unac+fold: use table // - unac+fold: use table
// - fold: use the unicode data. // - fold: use the unicode data.
size_t idx; string trans;
if (what != UNAC_FOLD && except_chars.size() != 0 && if (what != UNAC_FOLD && except_trans.size() != 0 &&
(idx=is_except_char(c)) != (size_t)-1) { is_except_char(c, trans)) {
if (what == UNAC_UNAC) { if (what == UNAC_UNAC) {
// Unaccent only. Do nothing // Unaccent only. Do nothing
p = 0; p = 0;
l = 0; l = 0;
} else { } else {
// Has to be UNAC_UNACFOLD: use table // Has to be UNAC_UNACFOLD: use table
p = (unsigned short *)(except_trans[idx].c_str() + 2); p = (unsigned short *)trans.c_str();
l = (except_trans[idx].size() - 2) / 2; l = trans.size() / 2;
} }
/* if (p) {unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);}*/
} else { } else {
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
unac_uf_char_utf16_(c, p, l, what) unac_uf_char_utf16_(c, p, l, what)
@ -13076,7 +13069,6 @@ const char* unac_version(void)
#ifdef RECOLL_DATADIR #ifdef RECOLL_DATADIR
void unac_set_except_translations(const char *spectrans) void unac_set_except_translations(const char *spectrans)
{ {
except_chars.clear();
except_trans.clear(); except_trans.clear();
if (!spectrans || !spectrans[0]) if (!spectrans || !spectrans[0])
return; return;
@ -13123,14 +13115,8 @@ void unac_set_except_translations(const char *spectrans)
else else
ch = (out[0] << 8) | (out[1] & 0xff); ch = (out[0] << 8) | (out[1] & 0xff);
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/ except_trans[ch] = string((const char *)(out + 2), outsize-2);
except_chars.push_back(ch);
// We keep ch as the first 2 bytes in the translation so that
// both vectors sort identically
except_trans.push_back(string((const char *)out, outsize));
free(out); free(out);
} }
std::sort(except_chars.begin(), except_chars.end());
std::sort(except_trans.begin(), except_trans.end());
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */

View File

@ -31,9 +31,9 @@
#include <map> #include <map>
#include <string> #include <string>
#include <algorithm> #include <algorithm>
#include <tr1/unordered_map>
using std::string; using std::string;
using std::vector; using std::tr1::unordered_map;
using std::map;
#include "smallut.h" #include "smallut.h"
/* /*
@ -41,20 +41,16 @@ using std::map;
should not be translated according to what UnicodeData says, but should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char. few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/ */
static vector<unsigned short> except_chars; unordered_map<unsigned short, string> except_trans;
static vector<string> except_trans; static inline bool is_except_char(unsigned short c, string& trans)
static inline size_t is_except_char(unsigned short c)
{ {
vector<unsigned short>::iterator it = unordered_map<unsigned short, string>::const_iterator it
std::lower_bound(except_chars.begin(), except_chars.end(), c); = except_trans.find(c);
if (it == except_chars.end() || *it != c) { if (it == except_trans.end())
return (size_t(-1)); return false;
} trans = it->second;
return std::distance(except_chars.begin(), it); return true;
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
@ -12715,21 +12711,18 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
// - unaccenting: do nothing (copy original char) // - unaccenting: do nothing (copy original char)
// - unac+fold: use table // - unac+fold: use table
// - fold: use the unicode data. // - fold: use the unicode data.
size_t idx; string trans;
if (what != UNAC_FOLD && except_chars.size() != 0 && if (what != UNAC_FOLD && except_trans.size() != 0 &&
(idx=is_except_char(c)) != (size_t)-1) { is_except_char(c, trans)) {
if (what == UNAC_UNAC) { if (what == UNAC_UNAC) {
// Unaccent only. Do nothing // Unaccent only. Do nothing
p = 0; p = 0;
l = 0; l = 0;
} else { } else {
// Has to be UNAC_UNACFOLD: use table // Has to be UNAC_UNACFOLD: use table
p = (unsigned short *)(except_trans[idx].c_str() + 2); p = (unsigned short *)trans.c_str();
l = (except_trans[idx].size() - 2) / 2; l = trans.size() / 2;
} }
/* if (p) {unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);}*/
} else { } else {
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
unac_uf_char_utf16_(c, p, l, what) unac_uf_char_utf16_(c, p, l, what)
@ -13076,7 +13069,6 @@ const char* unac_version(void)
#ifdef RECOLL_DATADIR #ifdef RECOLL_DATADIR
void unac_set_except_translations(const char *spectrans) void unac_set_except_translations(const char *spectrans)
{ {
except_chars.clear();
except_trans.clear(); except_trans.clear();
if (!spectrans || !spectrans[0]) if (!spectrans || !spectrans[0])
return; return;
@ -13123,14 +13115,8 @@ void unac_set_except_translations(const char *spectrans)
else else
ch = (out[0] << 8) | (out[1] & 0xff); ch = (out[0] << 8) | (out[1] & 0xff);
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/ except_trans[ch] = string((const char *)(out + 2), outsize-2);
except_chars.push_back(ch);
// We keep ch as the first 2 bytes in the translation so that
// both vectors sort identically
except_trans.push_back(string((const char *)out, outsize));
free(out); free(out);
} }
std::sort(except_chars.begin(), except_chars.end());
std::sort(except_trans.begin(), except_trans.end());
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */