integrated case-folding into unac for better performance

This commit is contained in:
dockes 2006-01-06 13:19:38 +00:00
parent 9a9ce93728
commit 33f54536ed
9 changed files with 1034 additions and 3269 deletions

View File

@ -5,7 +5,7 @@ LIBS = librcl.a
all: depend $(LIBS)
OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \
OBJS = base64.o conftree.o csguess.o debuglog.o \
execmd.o wipedir.o \
fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o history.o \
htmlparse.o \
@ -15,7 +15,7 @@ OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \
textsplit.o transcode.o \
unacpp.o unac.o docseq.o sortseq.o copyfile.o
SRCS = ../utils/caseconvert.cpp ../utils/conftree.cpp \
SRCS = ../utils/conftree.cpp \
../index/csguess.cpp ../utils/debuglog.cpp \
../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
../utils/wipedir.cpp ../utils/fstreewalk.cpp \
@ -46,8 +46,6 @@ debuglog.o : ../utils/debuglog.cpp
$(CXX) $(CXXFLAGS) -c $<
execmd.o : ../utils/execmd.cpp
$(CXX) $(CXXFLAGS) -c $<
caseconvert.o : ../utils/caseconvert.cpp
$(CXX) $(CXXFLAGS) -c $<
wipedir.o : ../utils/wipedir.cpp
$(CXX) $(CXXFLAGS) -c $<
fstreewalk.o : ../utils/fstreewalk.cpp

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.47 2006-01-06 13:19:38 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -22,7 +22,6 @@ using namespace std;
#include "pathut.h"
#include "smallut.h"
#include "pathhash.h"
#include "caseconvert.h"
#include "xapian.h"
#include <xapian/stem.h>

File diff suppressed because it is too large Load Diff

View File

@ -32,10 +32,10 @@ extern "C" {
#endif
/* Generated by builder. Do not modify. Start defines */
#define UNAC_BLOCK_SHIFT 5
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 178
#define UNAC_BLOCK_COUNT 315
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -53,7 +53,7 @@ extern "C" {
#define unac_char_utf16(c,p,l) \
{ \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = (c) & UNAC_BLOCK_MASK; \
unsigned char position = 2*((c) & UNAC_BLOCK_MASK); \
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
if((l) == 1 && *(p) == 0xFFFF) { \
@ -62,6 +62,21 @@ extern "C" {
} \
}
/*
* Same as unac_char_utf16 but case-folded.
*
* Look up the replacement for the UTF-16 character <c>. On return <p>
* points to the replacement inside the unac data tables and <l> is its
* length, counted in 16-bit units. Each character owns two consecutive
* slots in unac_positions: this macro reads the odd one (2*offset + 1),
* while unac_char_utf16 reads the even one (2*offset).
* If the entry is the single value 0xFFFF, <p> is set to 0 and <l> to 0
* (presumably meaning that the character has no replacement and is to be
* kept as is; confirm against the callers).
*
* Note: <c>, <p> and <l> are evaluated several times, and the macro
* declares locals named index and position.
*/
#define unacfold_char_utf16(c,p,l) \
{ \
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
unsigned char position = 2*((c) & UNAC_BLOCK_MASK)+1; \
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
if((l) == 1 && *(p) == 0xFFFF) { \
(p) = 0; \
(l) = 0; \
} \
}
/*
* Return the unaccented equivalent of the UTF-16 string <in> of
* length <in_length> in the pointer <out>. The length of the UTF-16
@ -77,6 +92,8 @@ extern "C" {
*/
int unac_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
int unacfold_string_utf16(const char* in, size_t in_length,
char** out, size_t* out_length);
/*
* The semantic of this function is strictly equal to the function
@ -92,6 +109,9 @@ int unac_string_utf16(const char* in, size_t in_length,
int unac_string(const char* charset,
const char* in, size_t in_length,
char** out, size_t* out_length);
int unacfold_string(const char* charset,
const char* in, size_t in_length,
char** out, size_t* out_length);
/*
* Return unac version number.
@ -141,7 +161,7 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data);
/* Generated by builder. Do not modify. Start declarations */
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1];
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1];
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
extern unsigned short unac_data0[];
extern unsigned short unac_data1[];
@ -321,6 +341,143 @@ extern unsigned short unac_data174[];
extern unsigned short unac_data175[];
extern unsigned short unac_data176[];
extern unsigned short unac_data177[];
extern unsigned short unac_data178[];
extern unsigned short unac_data179[];
extern unsigned short unac_data180[];
extern unsigned short unac_data181[];
extern unsigned short unac_data182[];
extern unsigned short unac_data183[];
extern unsigned short unac_data184[];
extern unsigned short unac_data185[];
extern unsigned short unac_data186[];
extern unsigned short unac_data187[];
extern unsigned short unac_data188[];
extern unsigned short unac_data189[];
extern unsigned short unac_data190[];
extern unsigned short unac_data191[];
extern unsigned short unac_data192[];
extern unsigned short unac_data193[];
extern unsigned short unac_data194[];
extern unsigned short unac_data195[];
extern unsigned short unac_data196[];
extern unsigned short unac_data197[];
extern unsigned short unac_data198[];
extern unsigned short unac_data199[];
extern unsigned short unac_data200[];
extern unsigned short unac_data201[];
extern unsigned short unac_data202[];
extern unsigned short unac_data203[];
extern unsigned short unac_data204[];
extern unsigned short unac_data205[];
extern unsigned short unac_data206[];
extern unsigned short unac_data207[];
extern unsigned short unac_data208[];
extern unsigned short unac_data209[];
extern unsigned short unac_data210[];
extern unsigned short unac_data211[];
extern unsigned short unac_data212[];
extern unsigned short unac_data213[];
extern unsigned short unac_data214[];
extern unsigned short unac_data215[];
extern unsigned short unac_data216[];
extern unsigned short unac_data217[];
extern unsigned short unac_data218[];
extern unsigned short unac_data219[];
extern unsigned short unac_data220[];
extern unsigned short unac_data221[];
extern unsigned short unac_data222[];
extern unsigned short unac_data223[];
extern unsigned short unac_data224[];
extern unsigned short unac_data225[];
extern unsigned short unac_data226[];
extern unsigned short unac_data227[];
extern unsigned short unac_data228[];
extern unsigned short unac_data229[];
extern unsigned short unac_data230[];
extern unsigned short unac_data231[];
extern unsigned short unac_data232[];
extern unsigned short unac_data233[];
extern unsigned short unac_data234[];
extern unsigned short unac_data235[];
extern unsigned short unac_data236[];
extern unsigned short unac_data237[];
extern unsigned short unac_data238[];
extern unsigned short unac_data239[];
extern unsigned short unac_data240[];
extern unsigned short unac_data241[];
extern unsigned short unac_data242[];
extern unsigned short unac_data243[];
extern unsigned short unac_data244[];
extern unsigned short unac_data245[];
extern unsigned short unac_data246[];
extern unsigned short unac_data247[];
extern unsigned short unac_data248[];
extern unsigned short unac_data249[];
extern unsigned short unac_data250[];
extern unsigned short unac_data251[];
extern unsigned short unac_data252[];
extern unsigned short unac_data253[];
extern unsigned short unac_data254[];
extern unsigned short unac_data255[];
extern unsigned short unac_data256[];
extern unsigned short unac_data257[];
extern unsigned short unac_data258[];
extern unsigned short unac_data259[];
extern unsigned short unac_data260[];
extern unsigned short unac_data261[];
extern unsigned short unac_data262[];
extern unsigned short unac_data263[];
extern unsigned short unac_data264[];
extern unsigned short unac_data265[];
extern unsigned short unac_data266[];
extern unsigned short unac_data267[];
extern unsigned short unac_data268[];
extern unsigned short unac_data269[];
extern unsigned short unac_data270[];
extern unsigned short unac_data271[];
extern unsigned short unac_data272[];
extern unsigned short unac_data273[];
extern unsigned short unac_data274[];
extern unsigned short unac_data275[];
extern unsigned short unac_data276[];
extern unsigned short unac_data277[];
extern unsigned short unac_data278[];
extern unsigned short unac_data279[];
extern unsigned short unac_data280[];
extern unsigned short unac_data281[];
extern unsigned short unac_data282[];
extern unsigned short unac_data283[];
extern unsigned short unac_data284[];
extern unsigned short unac_data285[];
extern unsigned short unac_data286[];
extern unsigned short unac_data287[];
extern unsigned short unac_data288[];
extern unsigned short unac_data289[];
extern unsigned short unac_data290[];
extern unsigned short unac_data291[];
extern unsigned short unac_data292[];
extern unsigned short unac_data293[];
extern unsigned short unac_data294[];
extern unsigned short unac_data295[];
extern unsigned short unac_data296[];
extern unsigned short unac_data297[];
extern unsigned short unac_data298[];
extern unsigned short unac_data299[];
extern unsigned short unac_data300[];
extern unsigned short unac_data301[];
extern unsigned short unac_data302[];
extern unsigned short unac_data303[];
extern unsigned short unac_data304[];
extern unsigned short unac_data305[];
extern unsigned short unac_data306[];
extern unsigned short unac_data307[];
extern unsigned short unac_data308[];
extern unsigned short unac_data309[];
extern unsigned short unac_data310[];
extern unsigned short unac_data311[];
extern unsigned short unac_data312[];
extern unsigned short unac_data313[];
extern unsigned short unac_data314[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
PROGS = trcaseconvert trconftree wipedir smallut trfstreewalk trpathut \
PROGS = trconftree wipedir smallut trfstreewalk trpathut \
transcode \
trmimeparse trexecmd utf8iter idfile
@ -21,12 +21,6 @@ trpathut : $(PATHUT_OBJS)
trpathut.o : pathut.cpp pathut.h
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
CASECONVERT_OBJS= trcaseconvert.o caseconvert.o $(BIGLIB)
trcaseconvert : $(CASECONVERT_OBJS)
$(CXX) $(CXXFLAGS) -o trcaseconvert $(CASECONVERT_OBJS)
trcaseconvert.o : caseconvert.cpp caseconvert.h
$(CXX) -o trcaseconvert.o -c $(CXXFLAGS) -DTEST_CASECONVERT caseconvert.cpp
EXECMD_OBJS= trexecmd.o $(BIGLIB)
trexecmd : $(EXECMD_OBJS)
$(CXX) $(CXXFLAGS) -o trexecmd $(EXECMD_OBJS)

File diff suppressed because it is too large Load Diff

View File

@ -1,10 +0,0 @@
#ifndef _CASECONVERT_H_INCLUDED_
#define _CASECONVERT_H_INCLUDED_
/* @(#$Id: caseconvert.h,v 1.1 2006-01-05 16:16:14 dockes Exp $ (C) 2005 J.F.Dockes */
#include <string>
// Lower-case string
// Input and output must be utf-16be
extern bool ucs2lower(const std::string &in, std::string &out);
#endif /* _CASECONVERT_H_INCLUDED_ */

View File

@ -1,121 +0,0 @@
#!/bin/sh
###############
## Use awk and gperf to generate an efficient case-folding function.
## awk reads CaseFolding.txt (fields separated by ";") and writes a gperf
## input file on its standard output: a declaration section, then one
## "code, 0xmapping" keyword line per retained entry. gperf turns this
## into a C++ lookup class (-LC++) using the struct declared in the input
## (-t), and its output becomes the start of caseconvert.cpp.
awk -F';' \
'
# Emit the gperf declaration section. The #ifndef printed here is left
# open: the matching #else and #endif come from the wrapper code that
# is appended to caseconvert.cpp later in this script.
BEGIN {
printf "%%{\n"
printf "// Automatically generated by gencasefold.sh, do not edit\n"
printf "#ifndef TEST_CASECONVERT\n"
printf "%%}\n"
printf "struct mapping { char *name; unsigned short value; };\n\n"
printf("%%%%\n");
}
# Skip comment lines and empty lines
/^#/{next}
/^$/{next}
{
# Keep only the entries whose code is at most 4 characters long and
# whose status field contains C or S. Spaces are removed from the
# mapping field before it is printed as a hexadecimal constant.
if (length($1) <= 4 && ($2 ~ "C" || $2 ~ "S")) {
gsub(" ", "", $3);
printf "%s, 0x%s\n", $1, $3
}
#else {printf "T/F/higher plane line: %s\n", $0}
}
' \
< CaseFolding.txt | \
gperf -I -n -LC++ -t > caseconvert.cpp
#############
## Append wrapper function
cat <<EOF >> caseconvert.cpp
#include <stdio.h>
#include <string>
#include "caseconvert.h"
using std::string;
// Lower-case (case-fold) a string of 16 bits characters.
// Input and output must be utf-16be: two bytes per character, high byte
// first.
// The folded characters are appended to out; out is only cleared when
// the input holds less than one full character. A trailing odd byte in
// the input is ignored. Always returns true.
bool ucs2lower(const string &in, string &out)
{
    // Less than one 16 bits unit: nothing to convert
    if (in.length() < 2) {
        out.erase();
        return true;
    }

    static const char hex[] = "0123456789ABCDEF";
    char key[5];
    key[4] = 0;
    for (string::size_type i = 0; i + 1 < in.length(); i += 2) {
        // The generated table is keyed by the character code written as
        // 4 upper-case hexadecimal digits: convert 16 bits to 4 hex chars
        key[0] = hex[(in[i] & 0xf0) >> 4];
        key[1] = hex[in[i] & 0x0f];
        key[2] = hex[(in[i+1] & 0xf0) >> 4];
        key[3] = hex[in[i+1] & 0x0f];

        struct mapping *m = Perfect_Hash::in_word_set(key, 4);
        if (m && m->name[0]) {
            // Emit the folded value high byte first. The high byte lives
            // in bits 8-15, so it has to be shifted right by 8: shifting
            // by 16 always yielded 0 and lost the high byte.
            out += char((m->value & 0xff00) >> 8);
            out += char(m->value & 0x00ff);
        } else {
            // No folding known for this character: copy it unchanged
            out += in[i];
            out += in[i+1];
        }
    }
    return true;
}
#else // !TESTING->
#include <errno.h>
#include <string>
#include <iostream>
#include <unistd.h>
#include <fcntl.h>
using namespace std;
#include "readfile.h"
#include "caseconvert.h"
// Test driver: read a utf16be file, case-fold it with ucs2lower() and
// write the result to another file. Exits with 1 on any failure.
int main(int argc, char **argv)
{
    // Exactly two arguments are expected: input and output file names
    if (argc != 3) {
        cerr << "Usage: trcaseconvert ifilename ofilename" << endl;
        cerr << "Input and output must be utf16be" << endl;
        exit(1);
    }
    const string inpath = argv[1];
    const string outpath = argv[2];

    // Load the whole input file in memory
    string raw;
    if (!file_to_string(inpath, raw)) {
        cerr << "Couldnt read file, errno " << errno << endl;
        exit(1);
    }

    string lowered;
    if (!ucs2lower(raw, lowered)) {
        cerr << "ucs2lower failed" << endl;
        exit(1);
    }

    // Create or truncate the output file, then write the result in one go
    int outfd = open(outpath.c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0666);
    if (outfd < 0) {
        perror("Open/create output");
        exit(1);
    }
    const int expected = (int)lowered.length();
    if (write(outfd, lowered.c_str(), lowered.length()) != expected) {
        perror("write");
        exit(1);
    }
    close(outfd);
    exit(0);
}
#endif // TEST_CASEFOLDING
EOF