integrated case-folding into unac for better performance
This commit is contained in:
parent
9a9ce93728
commit
33f54536ed
@ -5,7 +5,7 @@ LIBS = librcl.a
|
||||
|
||||
all: depend $(LIBS)
|
||||
|
||||
OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \
|
||||
OBJS = base64.o conftree.o csguess.o debuglog.o \
|
||||
execmd.o wipedir.o \
|
||||
fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o history.o \
|
||||
htmlparse.o \
|
||||
@ -15,7 +15,7 @@ OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \
|
||||
textsplit.o transcode.o \
|
||||
unacpp.o unac.o docseq.o sortseq.o copyfile.o
|
||||
|
||||
SRCS = ../utils/caseconvert.cpp ../utils/conftree.cpp \
|
||||
SRCS = ../utils/conftree.cpp \
|
||||
../index/csguess.cpp ../utils/debuglog.cpp \
|
||||
../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
|
||||
../utils/wipedir.cpp ../utils/fstreewalk.cpp \
|
||||
@ -46,8 +46,6 @@ debuglog.o : ../utils/debuglog.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
execmd.o : ../utils/execmd.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
caseconvert.o : ../utils/caseconvert.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
wipedir.o : ../utils/wipedir.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
fstreewalk.o : ../utils/fstreewalk.cpp
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.47 2006-01-06 13:19:38 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
@ -22,7 +22,6 @@ using namespace std;
|
||||
#include "pathut.h"
|
||||
#include "smallut.h"
|
||||
#include "pathhash.h"
|
||||
#include "caseconvert.h"
|
||||
|
||||
#include "xapian.h"
|
||||
#include <xapian/stem.h>
|
||||
|
||||
1161
src/unac/unac.c
1161
src/unac/unac.c
File diff suppressed because it is too large
Load Diff
165
src/unac/unac.h
165
src/unac/unac.h
@ -32,10 +32,10 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
/* Generated by builder. Do not modify. Start defines */
|
||||
#define UNAC_BLOCK_SHIFT 5
|
||||
#define UNAC_BLOCK_SHIFT 4
|
||||
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
||||
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
||||
#define UNAC_BLOCK_COUNT 178
|
||||
#define UNAC_BLOCK_COUNT 315
|
||||
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
||||
/* Generated by builder. Do not modify. End defines */
|
||||
|
||||
@ -53,7 +53,7 @@ extern "C" {
|
||||
#define unac_char_utf16(c,p,l) \
|
||||
{ \
|
||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
|
||||
unsigned char position = (c) & UNAC_BLOCK_MASK; \
|
||||
unsigned char position = 2*((c) & UNAC_BLOCK_MASK); \
|
||||
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
|
||||
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
|
||||
if((l) == 1 && *(p) == 0xFFFF) { \
|
||||
@ -62,6 +62,21 @@ extern "C" {
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
* Same as unac_char_utf16 but case-folded: sets <p> and <l> to the
* replacement data and length stored for the UTF-16 character <c>,
* read from the odd slot (2 * offset-in-block + 1) of the block's
* unac_positions row. <p> and <l> are both set to 0 when the stored
* replacement is the single value 0xFFFF.
|
||||
*/
|
||||
#define unacfold_char_utf16(c,p,l) \
|
||||
{ \
|
||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT]; \
|
||||
unsigned char position = 2*((c) & UNAC_BLOCK_MASK)+1; \
|
||||
(p) = &(unac_data_table[index][unac_positions[index][position]]); \
|
||||
(l) = unac_positions[index][position + 1] - unac_positions[index][position]; \
|
||||
if((l) == 1 && *(p) == 0xFFFF) { \
|
||||
(p) = 0; \
|
||||
(l) = 0; \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the unaccented equivalent of the UTF-16 string <in> of
|
||||
* length <in_length> in the pointer <out>. The length of the UTF-16
|
||||
@ -77,6 +92,8 @@ extern "C" {
|
||||
*/
|
||||
int unac_string_utf16(const char* in, size_t in_length,
|
||||
char** out, size_t* out_length);
|
||||
int unacfold_string_utf16(const char* in, size_t in_length,
|
||||
char** out, size_t* out_length);
|
||||
|
||||
/*
|
||||
* The semantic of this function is strictly equal to the function
|
||||
@ -92,6 +109,9 @@ int unac_string_utf16(const char* in, size_t in_length,
|
||||
int unac_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** out, size_t* out_length);
|
||||
int unacfold_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** out, size_t* out_length);
|
||||
|
||||
/*
|
||||
* Return unac version number.
|
||||
@ -141,7 +161,7 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data);
|
||||
|
||||
/* Generated by builder. Do not modify. Start declarations */
|
||||
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
|
||||
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1];
|
||||
extern unsigned char unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1];
|
||||
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
|
||||
extern unsigned short unac_data0[];
|
||||
extern unsigned short unac_data1[];
|
||||
@ -321,6 +341,143 @@ extern unsigned short unac_data174[];
|
||||
extern unsigned short unac_data175[];
|
||||
extern unsigned short unac_data176[];
|
||||
extern unsigned short unac_data177[];
|
||||
extern unsigned short unac_data178[];
|
||||
extern unsigned short unac_data179[];
|
||||
extern unsigned short unac_data180[];
|
||||
extern unsigned short unac_data181[];
|
||||
extern unsigned short unac_data182[];
|
||||
extern unsigned short unac_data183[];
|
||||
extern unsigned short unac_data184[];
|
||||
extern unsigned short unac_data185[];
|
||||
extern unsigned short unac_data186[];
|
||||
extern unsigned short unac_data187[];
|
||||
extern unsigned short unac_data188[];
|
||||
extern unsigned short unac_data189[];
|
||||
extern unsigned short unac_data190[];
|
||||
extern unsigned short unac_data191[];
|
||||
extern unsigned short unac_data192[];
|
||||
extern unsigned short unac_data193[];
|
||||
extern unsigned short unac_data194[];
|
||||
extern unsigned short unac_data195[];
|
||||
extern unsigned short unac_data196[];
|
||||
extern unsigned short unac_data197[];
|
||||
extern unsigned short unac_data198[];
|
||||
extern unsigned short unac_data199[];
|
||||
extern unsigned short unac_data200[];
|
||||
extern unsigned short unac_data201[];
|
||||
extern unsigned short unac_data202[];
|
||||
extern unsigned short unac_data203[];
|
||||
extern unsigned short unac_data204[];
|
||||
extern unsigned short unac_data205[];
|
||||
extern unsigned short unac_data206[];
|
||||
extern unsigned short unac_data207[];
|
||||
extern unsigned short unac_data208[];
|
||||
extern unsigned short unac_data209[];
|
||||
extern unsigned short unac_data210[];
|
||||
extern unsigned short unac_data211[];
|
||||
extern unsigned short unac_data212[];
|
||||
extern unsigned short unac_data213[];
|
||||
extern unsigned short unac_data214[];
|
||||
extern unsigned short unac_data215[];
|
||||
extern unsigned short unac_data216[];
|
||||
extern unsigned short unac_data217[];
|
||||
extern unsigned short unac_data218[];
|
||||
extern unsigned short unac_data219[];
|
||||
extern unsigned short unac_data220[];
|
||||
extern unsigned short unac_data221[];
|
||||
extern unsigned short unac_data222[];
|
||||
extern unsigned short unac_data223[];
|
||||
extern unsigned short unac_data224[];
|
||||
extern unsigned short unac_data225[];
|
||||
extern unsigned short unac_data226[];
|
||||
extern unsigned short unac_data227[];
|
||||
extern unsigned short unac_data228[];
|
||||
extern unsigned short unac_data229[];
|
||||
extern unsigned short unac_data230[];
|
||||
extern unsigned short unac_data231[];
|
||||
extern unsigned short unac_data232[];
|
||||
extern unsigned short unac_data233[];
|
||||
extern unsigned short unac_data234[];
|
||||
extern unsigned short unac_data235[];
|
||||
extern unsigned short unac_data236[];
|
||||
extern unsigned short unac_data237[];
|
||||
extern unsigned short unac_data238[];
|
||||
extern unsigned short unac_data239[];
|
||||
extern unsigned short unac_data240[];
|
||||
extern unsigned short unac_data241[];
|
||||
extern unsigned short unac_data242[];
|
||||
extern unsigned short unac_data243[];
|
||||
extern unsigned short unac_data244[];
|
||||
extern unsigned short unac_data245[];
|
||||
extern unsigned short unac_data246[];
|
||||
extern unsigned short unac_data247[];
|
||||
extern unsigned short unac_data248[];
|
||||
extern unsigned short unac_data249[];
|
||||
extern unsigned short unac_data250[];
|
||||
extern unsigned short unac_data251[];
|
||||
extern unsigned short unac_data252[];
|
||||
extern unsigned short unac_data253[];
|
||||
extern unsigned short unac_data254[];
|
||||
extern unsigned short unac_data255[];
|
||||
extern unsigned short unac_data256[];
|
||||
extern unsigned short unac_data257[];
|
||||
extern unsigned short unac_data258[];
|
||||
extern unsigned short unac_data259[];
|
||||
extern unsigned short unac_data260[];
|
||||
extern unsigned short unac_data261[];
|
||||
extern unsigned short unac_data262[];
|
||||
extern unsigned short unac_data263[];
|
||||
extern unsigned short unac_data264[];
|
||||
extern unsigned short unac_data265[];
|
||||
extern unsigned short unac_data266[];
|
||||
extern unsigned short unac_data267[];
|
||||
extern unsigned short unac_data268[];
|
||||
extern unsigned short unac_data269[];
|
||||
extern unsigned short unac_data270[];
|
||||
extern unsigned short unac_data271[];
|
||||
extern unsigned short unac_data272[];
|
||||
extern unsigned short unac_data273[];
|
||||
extern unsigned short unac_data274[];
|
||||
extern unsigned short unac_data275[];
|
||||
extern unsigned short unac_data276[];
|
||||
extern unsigned short unac_data277[];
|
||||
extern unsigned short unac_data278[];
|
||||
extern unsigned short unac_data279[];
|
||||
extern unsigned short unac_data280[];
|
||||
extern unsigned short unac_data281[];
|
||||
extern unsigned short unac_data282[];
|
||||
extern unsigned short unac_data283[];
|
||||
extern unsigned short unac_data284[];
|
||||
extern unsigned short unac_data285[];
|
||||
extern unsigned short unac_data286[];
|
||||
extern unsigned short unac_data287[];
|
||||
extern unsigned short unac_data288[];
|
||||
extern unsigned short unac_data289[];
|
||||
extern unsigned short unac_data290[];
|
||||
extern unsigned short unac_data291[];
|
||||
extern unsigned short unac_data292[];
|
||||
extern unsigned short unac_data293[];
|
||||
extern unsigned short unac_data294[];
|
||||
extern unsigned short unac_data295[];
|
||||
extern unsigned short unac_data296[];
|
||||
extern unsigned short unac_data297[];
|
||||
extern unsigned short unac_data298[];
|
||||
extern unsigned short unac_data299[];
|
||||
extern unsigned short unac_data300[];
|
||||
extern unsigned short unac_data301[];
|
||||
extern unsigned short unac_data302[];
|
||||
extern unsigned short unac_data303[];
|
||||
extern unsigned short unac_data304[];
|
||||
extern unsigned short unac_data305[];
|
||||
extern unsigned short unac_data306[];
|
||||
extern unsigned short unac_data307[];
|
||||
extern unsigned short unac_data308[];
|
||||
extern unsigned short unac_data309[];
|
||||
extern unsigned short unac_data310[];
|
||||
extern unsigned short unac_data311[];
|
||||
extern unsigned short unac_data312[];
|
||||
extern unsigned short unac_data313[];
|
||||
extern unsigned short unac_data314[];
|
||||
/* Generated by builder. Do not modify. End declarations */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,5 @@
|
||||
|
||||
PROGS = trcaseconvert trconftree wipedir smallut trfstreewalk trpathut \
|
||||
PROGS = trconftree wipedir smallut trfstreewalk trpathut \
|
||||
transcode \
|
||||
trmimeparse trexecmd utf8iter idfile
|
||||
|
||||
@ -21,12 +21,6 @@ trpathut : $(PATHUT_OBJS)
|
||||
trpathut.o : pathut.cpp pathut.h
|
||||
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
|
||||
|
||||
CASECONVERT_OBJS= trcaseconvert.o caseconvert.o $(BIGLIB)
|
||||
trcaseconvert : $(CASECONVERT_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o trcaseconvert $(CASECONVERT_OBJS)
|
||||
trcaseconvert.o : caseconvert.cpp caseconvert.h
|
||||
$(CXX) -o trcaseconvert.o -c $(CXXFLAGS) -DTEST_CASECONVERT caseconvert.cpp
|
||||
|
||||
EXECMD_OBJS= trexecmd.o $(BIGLIB)
|
||||
trexecmd : $(EXECMD_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o trexecmd $(EXECMD_OBJS)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,10 +0,0 @@
|
||||
#ifndef _CASECONVERT_H_INCLUDED_
|
||||
#define _CASECONVERT_H_INCLUDED_
|
||||
/* @(#$Id: caseconvert.h,v 1.1 2006-01-05 16:16:14 dockes Exp $ (C) 2005 J.F.Dockes */
|
||||
#include <string>
|
||||
|
||||
// Lower-case string
|
||||
// Input and output must be utf-16be
|
||||
extern bool ucs2lower(const std::string &in, std::string &out);
|
||||
|
||||
#endif /* _CASECONVERT_H_INCLUDED_ */
|
||||
@ -1,121 +0,0 @@
|
||||
#!/bin/sh
|
||||
|
||||
###############
|
||||
## Use awk and gperf to generate an efficient case-folding lookup function.
## awk turns CaseFolding.txt (read from the current directory) into gperf
## input; gperf output overwrites caseconvert.cpp.
## gperf options (see the gperf manual): -I include <string.h>,
## -n leave the keyword length out of the hash, -LC++ generate C++,
## -t the input starts with a user struct declaration.
|
||||
awk -F';' \
|
||||
'
|
||||
# Emit the gperf declarations section: verbatim C++ prologue between
# %{ and %}, the struct used for table entries, then the %% separator.
BEGIN {
|
||||
printf "%%{\n"
|
||||
printf "// Automatically generated by gencasefold.sh, do not edit\n"
|
||||
printf "#ifndef TEST_CASECONVERT\n"
|
||||
printf "%%}\n"
|
||||
printf "struct mapping { char *name; unsigned short value; };\n\n"
|
||||
printf("%%%%\n");
|
||||
}
|
||||
# Skip lines starting with # and empty lines.
/^#/{next}
|
||||
/^$/{next}
|
||||
# Keep entries whose first field is at most 4 characters long and whose
# second field contains C or S (presumably the common/simple status
# codes of CaseFolding.txt); print "code, 0xfolded" with spaces removed
# from the folded value.
{
|
||||
if (length($1) <= 4 && ($2 ~ "C" || $2 ~ "S")) {
|
||||
gsub(" ", "", $3);
|
||||
printf "%s, 0x%s\n", $1, $3
|
||||
}
|
||||
#else {printf "T/F/higher plane line: %s\n", $0}
|
||||
}
|
||||
' \
|
||||
< CaseFolding.txt | \
|
||||
gperf -I -n -LC++ -t > caseconvert.cpp
|
||||
|
||||
|
||||
#############
|
||||
## Append wrapper function
|
||||
|
||||
cat <<EOF >> caseconvert.cpp
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include "caseconvert.h"
|
||||
|
||||
using std::string;
|
||||
|
||||
// Input and output must be utf-16
|
||||
bool ucs2lower(const string &in, string &out)
|
||||
{
|
||||
if (in.length() < 2) {
|
||||
out.erase();
|
||||
return true;
|
||||
}
|
||||
static const char hex[]="0123456789ABCDEF";
|
||||
char key[5];
|
||||
key[4] = 0;
|
||||
for (unsigned int i = 0; i < in.length() - 1; i += 2) {
|
||||
struct mapping *m;
|
||||
// Convert 16 bits to 4 hex chars as key
|
||||
key[0] = hex[(in[i]&0xf0) >> 4];
|
||||
key[1] = hex[in[i] & 0x0f];
|
||||
key[2] = hex[(in[i+1]&0xf0) >> 4];
|
||||
key[3] = hex[in[i+1] & 0x0f];
|
||||
//fprintf(stderr, "Key: %s\n", key);
|
||||
if ((m = Perfect_Hash::in_word_set(key, 4)) && m->name[0]) {
|
||||
#if 0
|
||||
char sval[50];
|
||||
sprintf(sval, "%X", (unsigned int)(m->value));
|
||||
fprintf(stderr, "svalue: %s\n", sval);
|
||||
#endif
|
||||
out += char((m->value & 0xff00) >> 16);
|
||||
out += char(m->value & 0x00ff);
|
||||
} else
|
||||
{
|
||||
out += in[i];
|
||||
out += in[i+1];
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#else // !TESTING->
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include "readfile.h"
|
||||
#include "caseconvert.h"
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 3) {
|
||||
cerr << "Usage: trcaseconvert ifilename ofilename" << endl;
|
||||
cerr << "Input and output must be utf16be" << endl;
|
||||
exit(1);
|
||||
}
|
||||
const string ifilename = argv[1];
|
||||
const string ofilename = argv[2];
|
||||
|
||||
string text;
|
||||
if (!file_to_string(ifilename, text)) {
|
||||
cerr << "Couldnt read file, errno " << errno << endl;
|
||||
exit(1);
|
||||
}
|
||||
string out;
|
||||
if (!ucs2lower(text, out)) {
|
||||
cerr << "ucs2lower failed" << endl;
|
||||
exit(1);
|
||||
}
|
||||
int fd = open(ofilename.c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0666);
|
||||
if (fd < 0) {
|
||||
perror("Open/create output");
|
||||
exit(1);
|
||||
}
|
||||
if (write(fd, out.c_str(), out.length()) != (int)out.length()) {
|
||||
perror("write");
|
||||
exit(1);
|
||||
}
|
||||
close(fd);
|
||||
exit(0);
|
||||
}
|
||||
#endif // TEST_CASEFOLDING
|
||||
EOF
|
||||
Loading…
x
Reference in New Issue
Block a user