Use proper unicode lowercasing

This commit is contained in:
dockes 2006-01-05 16:37:27 +00:00
parent 66878ddf15
commit 50501c6953
5 changed files with 67 additions and 24 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.4 2005-11-24 07:16:15 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.5 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_UNACPP
@ -33,6 +33,22 @@ bool unac_cpp(const std::string &in, std::string &out, const char *encoding)
return true;
}
// Wrapper around unac_string_utf16(): runs unac on the bytes of 'in'
// (expected to be UTF-16BE, per the function name).
// Returns true and stores the converted bytes in 'out' on success.
// Returns false and stores an error message (including the current errno
// value) in 'out' on failure.
bool unac_cpp_utf16be(const std::string &in, std::string &out)
{
    char *converted = 0;
    size_t converted_len;
    if (unac_string_utf16(in.c_str(), in.length(),
                          &converted, &converted_len) >= 0) {
        // Copy the result into the caller's string, then release the
        // buffer that was handed back through 'converted'.
        out.assign(converted, converted_len);
        free(converted);
        return true;
    }

    // Failure: report the errno value as text in the output string.
    char errnum[20];
    sprintf(errnum, "%d", errno);
    out = std::string("unac_string failed, errno : ") + errnum;
    return false;
}
#else // not testing
#include <unistd.h>

View File

@ -1,11 +1,12 @@
#ifndef _UNACPP_H_INCLUDED_
#define _UNACPP_H_INCLUDED_
/* @(#$Id: unacpp.h,v 1.1 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: unacpp.h,v 1.2 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
// A small wrapper for unac.c
extern bool unac_cpp(const std::string &in, std::string &out,
const char *encoding = "UTF-8");
extern bool unac_cpp_utf16be(const std::string &in, std::string &out);
#endif /* _UNACPP_H_INCLUDED_ */

View File

@ -5,7 +5,7 @@ LIBS = librcl.a
all: depend $(LIBS)
OBJS = base64.o conftree.o csguess.o debuglog.o \
OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \
execmd.o wipedir.o \
fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o history.o \
htmlparse.o \
@ -15,7 +15,8 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \
textsplit.o transcode.o \
unacpp.o unac.o docseq.o sortseq.o copyfile.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
SRCS = ../utils/caseconvert.cpp ../utils/conftree.cpp \
../index/csguess.cpp ../utils/debuglog.cpp \
../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
../utils/wipedir.cpp ../utils/fstreewalk.cpp \
../common/mh_html.cpp ../common/mh_mail.cpp ../common/mh_exec.cpp \
@ -45,6 +46,8 @@ debuglog.o : ../utils/debuglog.cpp
$(CXX) $(CXXFLAGS) -c $<
execmd.o : ../utils/execmd.cpp
$(CXX) $(CXXFLAGS) -c $<
caseconvert.o : ../utils/caseconvert.cpp
$(CXX) $(CXXFLAGS) -c $<
wipedir.o : ../utils/wipedir.cpp
$(CXX) $(CXXFLAGS) -c $<
fstreewalk.o : ../utils/fstreewalk.cpp

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.44 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -22,6 +22,7 @@ using namespace std;
#include "pathut.h"
#include "smallut.h"
#include "pathhash.h"
#include "caseconvert.h"
#include "xapian.h"
#include <xapian/stem.h>
@ -210,31 +211,46 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
return true;
}
// Unaccent and lowercase data: use unac
// for accents, and do it by hand for upper / lower.
// TOBEDONE: lowercasing is done only for ascii letters, just A-Z -> a-z
// Unaccent and lowercase data, replace \n\r with spaces
// Removing crlfs is so that we can use the text in the document data fields.
// Use unac for removing accents
// Use our own lower-casing function (built from Unicode tables)
// Everything is converted to/from UTF-16BE at begin/end, as this is the
// internal format used by the processing functions.
//
// A possible optimization would be to remove accented characters from
// the lowercasing function tables, as we execute unac first. It
// might even be possible, but probably non-trivial, to combine both
// conversions.
bool Rcl::dumb_string(const string &in, string &out)
{
string inter;
out.erase();
if (in.empty())
return true;
if (!unac_cpp(in, inter)) {
LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
// Ok, no need to stop the whole show
inter = "";
string s1, s2;
for (unsigned int i = 0; i < in.length(); i++) {
if (in[i] == '\n' || in[i] == '\r')
s1 += ' ';
else
s1 += in[i];
}
out.reserve(inter.length());
for (unsigned int i = 0; i < inter.length(); i++) {
if (inter[i] >= 'A' && inter[i] <= 'Z') {
out += inter[i] + 'a' - 'A';
} else {
if (inter[i] == '\n' || inter[i] == '\r')
out += ' ';
else
out += inter[i];
}
if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {
LOGERR(("dumb_string: convert to utf-16be failed\n"));
return false;
}
if (!unac_cpp_utf16be(s2, s1)) {
LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
return false;
}
if (!ucs2lower(s1, s2)) {
LOGERR(("dumb_string: ucs2lower failed\n"));
return false;
}
if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
LOGERR(("dumb_string: convert back to utf-8 failed\n"));
return false;
}
return true;
}

View File

@ -1,5 +1,6 @@
PROGS = trconftree wipedir smallut trfstreewalk trpathut transcode \
PROGS = trcaseconvert trconftree wipedir smallut trfstreewalk trpathut \
transcode \
trmimeparse trexecmd utf8iter idfile
all: $(PROGS) $(BIGLIB)
@ -20,6 +21,12 @@ trpathut : $(PATHUT_OBJS)
trpathut.o : pathut.cpp pathut.h
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
CASECONVERT_OBJS= trcaseconvert.o caseconvert.o $(BIGLIB)
trcaseconvert : $(CASECONVERT_OBJS)
$(CXX) $(CXXFLAGS) -o trcaseconvert $(CASECONVERT_OBJS)
trcaseconvert.o : caseconvert.cpp caseconvert.h
$(CXX) -o trcaseconvert.o -c $(CXXFLAGS) -DTEST_CASECONVERT caseconvert.cpp
EXECMD_OBJS= trexecmd.o $(BIGLIB)
trexecmd : $(EXECMD_OBJS)
$(CXX) $(CXXFLAGS) -o trexecmd $(EXECMD_OBJS)