From 50501c69538db585fc94c8315dec36154a4ab42b Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 5 Jan 2006 16:37:27 +0000 Subject: [PATCH] Use proper unicode lowercasing --- src/common/unacpp.cpp | 18 ++++++++++++++- src/common/unacpp.h | 3 ++- src/lib/Makefile | 7 ++++-- src/rcldb/rcldb.cpp | 54 ++++++++++++++++++++++++++++--------------- src/utils/Makefile | 9 +++++++- 5 files changed, 67 insertions(+), 24 deletions(-) diff --git a/src/common/unacpp.cpp b/src/common/unacpp.cpp index 5ea46699..7f309e3c 100644 --- a/src/common/unacpp.cpp +++ b/src/common/unacpp.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.4 2005-11-24 07:16:15 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.5 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_UNACPP @@ -33,6 +33,22 @@ bool unac_cpp(const std::string &in, std::string &out, const char *encoding) return true; } +bool unac_cpp_utf16be(const std::string &in, std::string &out) +{ + char *cout = 0; + size_t out_len; + + if (unac_string_utf16(in.c_str(), in.length(), &cout, &out_len) < 0) { + char cerrno[20]; + sprintf(cerrno, "%d", errno); + out = string("unac_string failed, errno : ") + cerrno; + return false; + } + out.assign(cout, out_len); + free(cout); + return true; +} + #else // not testing #include diff --git a/src/common/unacpp.h b/src/common/unacpp.h index 90bad9c9..9d38698e 100644 --- a/src/common/unacpp.h +++ b/src/common/unacpp.h @@ -1,11 +1,12 @@ #ifndef _UNACPP_H_INCLUDED_ #define _UNACPP_H_INCLUDED_ -/* @(#$Id: unacpp.h,v 1.1 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: unacpp.h,v 1.2 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes */ #include // A small wrapper for unac.c extern bool unac_cpp(const std::string &in, std::string &out, const char *encoding = "UTF-8"); +extern bool unac_cpp_utf16be(const std::string &in, std::string &out); #endif /* _UNACPP_H_INCLUDED_ */ diff --git a/src/lib/Makefile 
b/src/lib/Makefile index 88e09ec1..a605ce2b 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -5,7 +5,7 @@ LIBS = librcl.a all: depend $(LIBS) -OBJS = base64.o conftree.o csguess.o debuglog.o \ +OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \ execmd.o wipedir.o \ fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o history.o \ htmlparse.o \ @@ -15,7 +15,8 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \ textsplit.o transcode.o \ unacpp.o unac.o docseq.o sortseq.o copyfile.o -SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ +SRCS = ../utils/caseconvert.cpp ../utils/conftree.cpp \ + ../index/csguess.cpp ../utils/debuglog.cpp \ ../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \ ../utils/wipedir.cpp ../utils/fstreewalk.cpp \ ../common/mh_html.cpp ../common/mh_mail.cpp ../common/mh_exec.cpp \ @@ -45,6 +46,8 @@ debuglog.o : ../utils/debuglog.cpp $(CXX) $(CXXFLAGS) -c $< execmd.o : ../utils/execmd.cpp $(CXX) $(CXXFLAGS) -c $< +caseconvert.o : ../utils/caseconvert.cpp + $(CXX) $(CXXFLAGS) -c $< wipedir.o : ../utils/wipedir.cpp $(CXX) $(CXXFLAGS) -c $< fstreewalk.o : ../utils/fstreewalk.cpp diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index bd606bb2..b1a06d83 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.44 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -22,6 +22,7 @@ using namespace std; #include "pathut.h" #include "smallut.h" #include "pathhash.h" +#include "caseconvert.h" #include "xapian.h" #include @@ -210,31 +211,46 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int) return true; } -// Unaccent and lowercase data: use unac -// for accents, and do it by hand for upper / lower. 
-// TOBEDONE: lowercasing is done only for ascii letters, just A-Z -> a-z +// Unaccent and lowercase data, replace \n\r with spaces // Removing crlfs is so that we can use the text in the document data fields. +// Use unac for removing accents +// Use our own lower-casing function (built from Unicode tables) +// Everything is converted to/from UTF-16BE at begin/end as this is the internal +// format used by the processing functions. +// +// A possible optimization would be to remove accented characters from +// the lowercasing function tables, as we execute unac first. It +// might even be possible, but probably non-trivial, to combine both +// conversions bool Rcl::dumb_string(const string &in, string &out) { - string inter; out.erase(); if (in.empty()) return true; - if (!unac_cpp(in, inter)) { - LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str())); - // Ok, no need to stop the whole show - inter = ""; + + string s1, s2; + for (unsigned int i = 0; i < in.length(); i++) { + if (in[i] == '\n' || in[i] == '\r') + s1 += ' '; + else + s1 += in[i]; } - out.reserve(inter.length()); - for (unsigned int i = 0; i < inter.length(); i++) { - if (inter[i] >= 'A' && inter[i] <= 'Z') { - out += inter[i] + 'a' - 'A'; - } else { - if (inter[i] == '\n' || inter[i] == '\r') - out += ' '; - else - out += inter[i]; - } + if (!transcode(s1, s2, "UTF-8","UTF-16BE")) { + LOGERR(("dumb_string: convert to utf-16be failed\n")); + return false; + } + + if (!unac_cpp_utf16be(s2, s1)) { + LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str())); + return false; + } + if (!ucs2lower(s1, s2)) { + LOGERR(("dumb_string: ucs2lower failed\n")); + return false; + } + if (!transcode(s2, out, "UTF-16BE", "UTF-8")) { + LOGERR(("dumb_string: convert back to utf-8 failed\n")); + return false; } return true; } diff --git a/src/utils/Makefile b/src/utils/Makefile index 2f27b381..75ceb8ff 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -1,5 +1,6 @@ -PROGS = trconftree wipedir smallut 
trfstreewalk trpathut transcode \ +PROGS = trcaseconvert trconftree wipedir smallut trfstreewalk trpathut \ + transcode \ trmimeparse trexecmd utf8iter idfile all: $(PROGS) $(BIGLIB) @@ -20,6 +21,12 @@ trpathut : $(PATHUT_OBJS) trpathut.o : pathut.cpp pathut.h $(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp +CASECONVERT_OBJS= trcaseconvert.o caseconvert.o $(BIGLIB) +trcaseconvert : $(CASECONVERT_OBJS) + $(CXX) $(CXXFLAGS) -o trcaseconvert $(CASECONVERT_OBJS) +trcaseconvert.o : caseconvert.cpp caseconvert.h + $(CXX) -o trcaseconvert.o -c $(CXXFLAGS) -DTEST_CASECONVERT caseconvert.cpp + EXECMD_OBJS= trexecmd.o $(BIGLIB) trexecmd : $(EXECMD_OBJS) $(CXX) $(CXXFLAGS) -o trexecmd $(EXECMD_OBJS)