added code for unac to perform pure case-folding

This commit is contained in:
Jean-Francois Dockes 2012-08-27 12:40:57 +02:00
parent 9327527101
commit 913dffc597
12 changed files with 2133 additions and 1972 deletions

View File

@ -28,14 +28,27 @@
#include "utf8iter.h"
bool unacmaybefold(const string &in, string &out,
const char *encoding, bool dofold)
const char *encoding, UnacOp what)
{
char *cout = 0;
size_t out_len;
int status;
status = dofold ?
unacfold_string(encoding, in.c_str(), in.length(), &cout, &out_len) :
unac_string(encoding, in.c_str(), in.length(), &cout, &out_len);
int status = -1;
switch (what) {
case UNACOP_UNAC:
status = unac_string(encoding, in.c_str(), in.length(),
&cout, &out_len);
break;
case UNACOP_UNACFOLD:
status = unacfold_string(encoding, in.c_str(), in.length(),
&cout, &out_len);
break;
case UNACOP_FOLD:
status = fold_string(encoding, in.c_str(), in.length(),
&cout, &out_len);
break;
}
if (status < 0) {
if (cout)
free(cout);
@ -59,11 +72,11 @@ bool unaciscapital(const string& in)
it.appendchartostring(shorter);
string noacterm, noaclowterm;
if (!unacmaybefold(shorter, noacterm, "UTF-8", false)) {
if (!unacmaybefold(shorter, noacterm, "UTF-8", UNACOP_UNAC)) {
LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str()));
return false;
}
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str()));
return false;
}
@ -90,31 +103,76 @@ using namespace std;
#include "readfile.h"
#include "rclinit.h"
/* Program name as invoked (argv[0]); assigned at the top of main() and
 * printed as the prefix of the usage message. */
static char *thisprog;
/* Command-line synopsis for the test driver. It is printed verbatim after
 * the "<prog>: usage: " prefix, which is why the text starts with a newline.
 * The array is never modified, so it is declared const. */
static const char usage[] = "\n"
"[-c|-C] <encoding> <infile> <outfile>\n"
" Default : unaccent\n"
" -c : unaccent and casefold\n"
" -C : casefold only\n"
"\n";
/* Write "<program name>: usage: <synopsis>" to stderr, then terminate the
 * process with exit status 1. This function does not return. */
static void Usage()
{
    fprintf(stderr, "%s: usage: %s\n", thisprog, usage);
    exit(1);
}
/* Bit set of the option letters seen on the command line (OPT_* values),
 * filled in by the option-parsing loop in main(). */
static int op_flags;
/* -c: unaccent and casefold (main() maps this to UNACOP_UNACFOLD) */
#define OPT_c 0x2
/* -C: casefold only (main() maps this to UNACOP_FOLD when -c is not set) */
#define OPT_C 0x4
int main(int argc, char **argv)
{
bool dofold = true;
if (argc != 4) {
cerr << "Usage: unacpp <encoding> <infile> <outfile>" << endl;
exit(1);
UnacOp op = UNACOP_UNAC;
thisprog = argv[0];
argc--; argv++;
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* Case of a lone "-" argument (as in "adb - core") */
Usage();
while (**argv)
switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break;
case 'C': op_flags |= OPT_C; break;
default: Usage(); break;
}
argc--; argv++;
}
const char *encoding = argv[1];
string ifn = argv[2];
if (op_flags & OPT_c) {
op = UNACOP_UNACFOLD;
} else if (op_flags & OPT_C) {
op = UNACOP_FOLD;
}
if (argc != 3) {
Usage();
}
const char *encoding = *argv++; argc--;
string ifn = *argv++; argc--;
if (!ifn.compare("stdin"))
ifn.clear();
const char *ofn = argv[3];
const char *ofn = *argv++; argc--;
string reason;
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
string odata;
if (!file_to_string(ifn, odata)) {
cerr << "file_to_string: " << odata << endl;
exit(1);
cerr << "file_to_string " << ifn << " : " << odata << endl;
return 1;
}
string ndata;
if (!unacmaybefold(odata, ndata, encoding, dofold)) {
if (!unacmaybefold(odata, ndata, encoding, op)) {
cerr << "unac: " << ndata << endl;
exit(1);
return 1;
}
int fd;
@ -126,14 +184,14 @@ int main(int argc, char **argv)
if (fd < 0) {
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
<< endl;
exit(1);
return 1;
}
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
cerr << "Write(2) failed: " << strerror(errno) << endl;
exit(1);
return 1;
}
close(fd);
exit(0);
return 0;
}
#endif

View File

@ -24,8 +24,9 @@ using std::string;
#endif /* NO_NAMESPACES */
// A small stringified wrapper for unac.c
// Operation selector for unacmaybefold(): UNACOP_UNAC calls unac_string(),
// UNACOP_UNACFOLD calls unacfold_string(), UNACOP_FOLD calls fold_string().
enum UnacOp {UNACOP_UNAC, UNACOP_UNACFOLD, UNACOP_FOLD};
extern bool unacmaybefold(const string& in, string& out,
const char *encoding, bool dofold);
const char *encoding, UnacOp what);
// Utility function to determine if string begins with capital
extern bool unaciscapital(const string& in);

View File

@ -91,7 +91,7 @@ class TextSplitPTR : public TextSplit {
// (phrase or near), update positions list.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
string dumb;
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
term.c_str()));
return true;

View File

@ -117,7 +117,7 @@ public:
// approach to collating...)
string sortterm;
// We're not even sure the term is utf8 here (ie: url)
if (!unacmaybefold(term, sortterm, "UTF-8", true)) {
if (!unacmaybefold(term, sortterm, "UTF-8", UNACOP_UNACFOLD)) {
sortterm = term;
}
// Also remove some common uninteresting starting characters

View File

@ -41,7 +41,7 @@ bool StopList::setFile(const string &filename)
for (set<string>::iterator it = stops.begin();
it != stops.end(); it++) {
string dterm;
unacmaybefold(*it, dterm, "UTF-8", true);
unacmaybefold(*it, dterm, "UTF-8", UNACOP_UNACFOLD);
m_stops.insert(dterm);
}

View File

@ -117,7 +117,7 @@ public:
{
m_totalterms++;
string otrm;
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
m_unacerrors++;
// We don't generate a fatal error because of a bad term,

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 418
#define UNAC_BLOCK_COUNT 422
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -45,38 +45,31 @@ extern "C" {
* by <p> is returned in the <l> argument.
* The C++ prototype of this macro would be:
*
* void unac_char(const unsigned short c, unsigned short*& p, int& l)
* void unac_char(const unsigned short c, unsigned short*& p, int& l, int o)
*
* See unac(3) in IMPLEMENTATION NOTES for more information about the
* tables (unac_data_table, unac_positions) layout.
*
* Each transformed char has 3 possible outputs: unaccented, unaccented and
* folded, or just folded. These are kept at offset 0,1,2 in the position table
*/
#define unac_char_utf16(c,p,l) \
{ \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = 2*((c) & UNAC_BLOCK_MASK); \
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
if((l) == 1 && *(p) == 0xFFFF) { \
(p) = 0; \
(l) = 0; \
} \
}
/*
* Save as unac_ but case-folded
*/
#define unacfold_char_utf16(c,p,l) \
#define unac_uf_char_utf16_(c,p,l,o) \
{ \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = 2*((c) & UNAC_BLOCK_MASK)+1; \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = 3*((c) & UNAC_BLOCK_MASK) + (o); \
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
(l) = unac_positions[index][position + 1] \
- unac_positions[index][position]; \
if((l) == 1 && *(p) == 0xFFFF) { \
(p) = 0; \
(l) = 0; \
} \
}
/* Convenience wrappers around unac_uf_char_utf16_() that select which of
 * the three per-character outputs to fetch from the position table:
 * offset 0 = unaccented, 1 = unaccented and folded, 2 = folded only. */
#define unac_char_utf16(c,p,l) unac_uf_char_utf16_((c),(p),(l),0)
#define unacfold_char_utf16(c,p,l) unac_uf_char_utf16_((c),(p),(l),1)
#define fold_char_utf16(c,p,l) unac_uf_char_utf16_((c),(p),(l),2)
/*
* Return the unaccented equivalent of the UTF-16 string <in> of
* length <in_length> in the pointer <out>. The length of the UTF-16
@ -94,6 +87,8 @@ int unac_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
int unacfold_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
int fold_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
/*
* The semantic of this function is strictly equal to the function
@ -112,6 +107,9 @@ int unac_string(const char* charset,
int unacfold_string(const char* charset,
const char* in, size_t in_length,
char** out, size_t* out_length);
int fold_string(const char* charset,
const char* in, size_t in_length,
char** out, size_t* out_length);
/* To be called before starting threads in mt programs */
void unac_init_mt();
@ -180,7 +178,7 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data);
/* Generated by builder. Do not modify. Start declarations */
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1];
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][3*UNAC_BLOCK_SIZE + 1];
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
extern unsigned short unac_data0[];
extern unsigned short unac_data1[];
@ -600,6 +598,10 @@ extern unsigned short unac_data414[];
extern unsigned short unac_data415[];
extern unsigned short unac_data416[];
extern unsigned short unac_data417[];
extern unsigned short unac_data418[];
extern unsigned short unac_data419[];
extern unsigned short unac_data420[];
extern unsigned short unac_data421[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus

View File

@ -1 +1 @@
#define UNAC_VERSION "1.0.7"
#define UNAC_VERSION "1.7.0"

View File

@ -341,36 +341,50 @@ sub source {
@values = ();
}
$code_value = uc(sprintf("%04x", $code_value));
#print "$code_value UNAC ";
if(exists($decomposition->{$code_value})) {
push(@values, $decomposition->{$code_value});
#print "$decomposition->{$code_value} ";
} else {
push(@values, "FFFF");
#print "FFFF ";
}
# We also push the case-folded version of the unaccented char
# Note that by pushing the case-folded version of the original
# char, we'd have the possibility of independant unaccenting and
# case folding, but with less performance.
# We could also keep the three chunks, using a little more memory
# We push both the case-folded version of the unaccented char
# and the case-folded version of the original one. This
# makes the table a little bigger, but allows
# independently unaccenting, folding or both
#print "UNACFOLD ";
if(exists($decomposition->{$code_value})) {
my($cv);
my(@vl);
foreach $cv (split(' ', $decomposition->{$code_value})) {
if(exists($casefold->{$cv})) {
push(@vl, $casefold->{$cv});
#print "$casefold->{$cv} ";
} else {
push(@vl, $cv);
#print "$cv ";
}
}
#print STDERR "Pushing " . join(" ", @vl) . " for " .
#$code_value . "\n";
push(@values, join(" ", @vl));
} else {
if(exists($casefold->{$code_value})) {
push(@values, $casefold->{$code_value});
#print "$casefold->{$code_value} ";
} else {
push(@values, "FFFF");
#print "FFFF ";
}
}
#print "FOLD ";
if(exists($casefold->{$code_value})) {
push(@values, $casefold->{$code_value});
#print "$casefold->{$code_value} ";
} else {
push(@values, "FFFF");
#print "FFFF ";
}
#print "\n";
}
print STDERR scalar(@blocks) . " blocks of " . $block_count . " entries, factorized $duplicate blocks\n\t" if($verbose);
my($block_size) = 0;
@ -466,7 +480,7 @@ EOF
$block_number++;
}
my($position_type) = $highest_position >= 256 ? "short" : "char";
my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1] = {\n";
my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][3*UNAC_BLOCK_SIZE + 1] = {\n";
$positions_out .= join(",\n", @positions_out);
$positions_out .= "\n};\n";
@ -481,7 +495,7 @@ EOF
my($declarations);
$declarations = <<EOF;
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
extern unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1];
extern unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][3*UNAC_BLOCK_SIZE + 1];
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
EOF
for($block_number = 0; $block_number < $block_count; $block_number++) {

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 418
#define UNAC_BLOCK_COUNT 422
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -45,38 +45,31 @@ extern "C" {
* by <p> is returned in the <l> argument.
* The C++ prototype of this macro would be:
*
* void unac_char(const unsigned short c, unsigned short*& p, int& l)
* void unac_char(const unsigned short c, unsigned short*& p, int& l, int o)
*
* See unac(3) in IMPLEMENTATION NOTES for more information about the
* tables (unac_data_table, unac_positions) layout.
*
* Each transformed char has 3 possible outputs: unaccented, unaccented and
* folded, or just folded. These are kept at offset 0,1,2 in the position table
*/
#define unac_char_utf16(c,p,l) \
{ \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = 2*((c) & UNAC_BLOCK_MASK); \
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
if((l) == 1 && *(p) == 0xFFFF) { \
(p) = 0; \
(l) = 0; \
} \
}
/*
* Save as unac_ but case-folded
*/
#define unacfold_char_utf16(c,p,l) \
#define unac_uf_char_utf16_(c,p,l,o) \
{ \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = 2*((c) & UNAC_BLOCK_MASK)+1; \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = 3*((c) & UNAC_BLOCK_MASK) + (o); \
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
(l) = unac_positions[index][position + 1] \
- unac_positions[index][position]; \
if((l) == 1 && *(p) == 0xFFFF) { \
(p) = 0; \
(l) = 0; \
} \
}
/* Convenience wrappers around unac_uf_char_utf16_() that select which of
 * the three per-character outputs to fetch from the position table:
 * offset 0 = unaccented, 1 = unaccented and folded, 2 = folded only. */
#define unac_char_utf16(c,p,l) unac_uf_char_utf16_((c),(p),(l),0)
#define unacfold_char_utf16(c,p,l) unac_uf_char_utf16_((c),(p),(l),1)
#define fold_char_utf16(c,p,l) unac_uf_char_utf16_((c),(p),(l),2)
/*
* Return the unaccented equivalent of the UTF-16 string <in> of
* length <in_length> in the pointer <out>. The length of the UTF-16
@ -94,6 +87,8 @@ int unac_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
int unacfold_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
int fold_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
/*
* The semantic of this function is strictly equal to the function
@ -112,6 +107,9 @@ int unac_string(const char* charset,
int unacfold_string(const char* charset,
const char* in, size_t in_length,
char** out, size_t* out_length);
int fold_string(const char* charset,
const char* in, size_t in_length,
char** out, size_t* out_length);
/* To be called before starting threads in mt programs */
void unac_init_mt();
@ -180,7 +178,7 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data);
/* Generated by builder. Do not modify. Start declarations */
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1];
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][3*UNAC_BLOCK_SIZE + 1];
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
extern unsigned short unac_data0[];
extern unsigned short unac_data1[];
@ -600,6 +598,10 @@ extern unsigned short unac_data414[];
extern unsigned short unac_data415[];
extern unsigned short unac_data416[];
extern unsigned short unac_data417[];
extern unsigned short unac_data418[];
extern unsigned short unac_data419[];
extern unsigned short unac_data420[];
extern unsigned short unac_data421[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus