From 50501c69538db585fc94c8315dec36154a4ab42b Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Thu, 5 Jan 2006 16:37:27 +0000
Subject: [PATCH] Use proper unicode lowercasing

---
 src/common/unacpp.cpp | 18 ++++++++++++++-
 src/common/unacpp.h   |  3 ++-
 src/lib/Makefile      |  7 ++++--
 src/rcldb/rcldb.cpp   | 54 ++++++++++++++++++++++++++++---------------
 src/utils/Makefile    |  9 +++++++-
 5 files changed, 67 insertions(+), 24 deletions(-)
diff --git a/src/common/unacpp.cpp b/src/common/unacpp.cpp
index 5ea46699..7f309e3c 100644
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.4 2005-11-24 07:16:15 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.5 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #ifndef TEST_UNACPP
@@ -33,6 +33,22 @@ bool unac_cpp(const std::string &in, std::string &out, const char *encoding)
     return true;
 }
 
+bool unac_cpp_utf16be(const std::string &in, std::string &out)
+{ // Runs unac_string_utf16() on the bytes of 'in'; true on success, false on error
+    char *cout = 0; // receives the result buffer; released with free() below
+    size_t out_len; // receives the result length, used below as a char count
+
+    if (unac_string_utf16(in.c_str(), in.length(), &cout, &out_len) < 0) {
+	char cerrno[20];
+	sprintf(cerrno, "%d", errno);
+	out = string("unac_string failed, errno : ") + cerrno; // 'out' carries the error text
+	return false; // NOTE(review): cout is not freed on this path -- confirm unac leaves it unset
+    }
+    out.assign(cout, out_len); // length-based copy, so embedded zero bytes are kept
+    free(cout);
+    return true;
+}
+
 #else // not testing
 
 #include <unistd.h>
diff --git a/src/common/unacpp.h b/src/common/unacpp.h
index 90bad9c9..9d38698e 100644
--- a/src/common/unacpp.h
+++ b/src/common/unacpp.h
@@ -1,11 +1,12 @@
 #ifndef _UNACPP_H_INCLUDED_
 #define _UNACPP_H_INCLUDED_
-/* @(#$Id: unacpp.h,v 1.1 2004-12-17 15:36:13 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: unacpp.h,v 1.2 2006-01-05 16:37:26 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
 
 // A small wrapper for unac.c
 extern bool unac_cpp(const std::string &in, std::string &out, 
 		     const char *encoding = "UTF-8");
+extern bool unac_cpp_utf16be(const std::string &in, std::string &out);
 
 #endif /* _UNACPP_H_INCLUDED_ */
diff --git a/src/lib/Makefile b/src/lib/Makefile
index 88e09ec1..a605ce2b 100644
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@@ -5,7 +5,7 @@ LIBS = librcl.a
 
 all: depend $(LIBS)
 
-OBJS = base64.o conftree.o csguess.o debuglog.o \
+OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \
      execmd.o wipedir.o \
      fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o history.o \
      htmlparse.o \
@@ -15,7 +15,8 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \
      textsplit.o transcode.o \
      unacpp.o unac.o docseq.o sortseq.o copyfile.o
 
-SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
+SRCS = ../utils/caseconvert.cpp ../utils/conftree.cpp \
+     ../index/csguess.cpp ../utils/debuglog.cpp \
      ../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
      ../utils/wipedir.cpp ../utils/fstreewalk.cpp \
      ../common/mh_html.cpp ../common/mh_mail.cpp ../common/mh_exec.cpp \
@@ -45,6 +46,8 @@ debuglog.o : ../utils/debuglog.cpp
 	$(CXX) $(CXXFLAGS) -c $<
 execmd.o : ../utils/execmd.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
+caseconvert.o : ../utils/caseconvert.cpp 
+	$(CXX) $(CXXFLAGS) -c $<
 wipedir.o : ../utils/wipedir.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 fstreewalk.o : ../utils/fstreewalk.cpp 
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index bd606bb2..b1a06d83 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.44 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@@ -22,6 +22,7 @@ using namespace std;
 #include "pathut.h"
 #include "smallut.h"
 #include "pathhash.h"
+#include "caseconvert.h"
 
 #include "xapian.h"
 #include <xapian/stem.h>
@@ -210,31 +211,46 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
     return true;
 }
 
-// Unaccent and lowercase data: use unac 
-// for accents, and do it by hand for upper / lower. 
-// TOBEDONE: lowercasing is done only for ascii letters, just A-Z -> a-z 
+// Unaccent and lowercase data, replace \n\r with spaces
 // Removing crlfs is so that we can use the text in the document data fields.
+// Use unac for removing accents
+// Use our own lower-casing function (built from Unicode tables)
+// Everything is converted to/from UTF-16BE at begin/end as this is the internal
+// format used by the processing functions.
+//
+// A possible optimization would be to remove accented characters from
+// the lowercasing function tables, as we execute unac first.  It
+// might even be possible, but probably non-trivial, to combine both
+// conversions.
 bool Rcl::dumb_string(const string &in, string &out)
 {
-    string inter;
     out.erase();
     if (in.empty())
 	return true;
-    if (!unac_cpp(in, inter)) {
-	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
-	// Ok, no need to stop the whole show
-	inter = "";
+
+    string s1, s2; // scratch buffers: each step reads one and writes the other
+    for (unsigned int i = 0; i < in.length(); i++) { // copy 'in' to s1 char by char
+	if (in[i] == '\n' || in[i] == '\r')
+	    s1 += ' '; // each CR or LF becomes one space
+	else
+	    s1 += in[i];
     }
-    out.reserve(inter.length());
-    for (unsigned int i = 0; i < inter.length(); i++) {
-	if (inter[i] >= 'A' && inter[i] <= 'Z') {
-	    out += inter[i] + 'a' - 'A';
-	} else {
-	    if (inter[i] == '\n' || inter[i] == '\r')
-		out += ' ';
-	    else
-		out += inter[i];
-	}
+    if (!transcode(s1, s2, "UTF-8","UTF-16BE")) { // s1 (UTF-8) -> s2 (UTF-16BE)
+	LOGERR(("dumb_string: convert to utf-16be failed\n"));
+	return false; // 'out' is still empty here (erased on entry)
+    }
+
+    if (!unac_cpp_utf16be(s2, s1)) { // accent removal: s2 -> s1
+	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
+	return false;
+    }
+    if (!ucs2lower(s1, s2)) { // lower-casing: s1 -> s2
+	LOGERR(("dumb_string: ucs2lower failed\n"));
+	return false;
+    }
+    if (!transcode(s2, out, "UTF-16BE", "UTF-8")) { // s2 (UTF-16BE) -> out (UTF-8)
+	LOGERR(("dumb_string: convert back to utf-8 failed\n"));
+	return false;
     }
     return true;
 }
diff --git a/src/utils/Makefile b/src/utils/Makefile
index 2f27b381..75ceb8ff 100644
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@@ -1,5 +1,6 @@
 
-PROGS = trconftree wipedir smallut trfstreewalk trpathut transcode \
+PROGS = trcaseconvert trconftree wipedir smallut trfstreewalk trpathut \
+      transcode \
       trmimeparse trexecmd utf8iter idfile
 
 all: $(PROGS) $(BIGLIB)
@@ -20,6 +21,12 @@ trpathut : $(PATHUT_OBJS)
 trpathut.o : pathut.cpp pathut.h
 	$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
 
+CASECONVERT_OBJS= trcaseconvert.o caseconvert.o  $(BIGLIB)
+trcaseconvert : $(CASECONVERT_OBJS)
+	$(CXX) $(CXXFLAGS) -o trcaseconvert $(CASECONVERT_OBJS)
+trcaseconvert.o : caseconvert.cpp caseconvert.h
+	$(CXX) -o trcaseconvert.o -c $(CXXFLAGS) -DTEST_CASECONVERT caseconvert.cpp
+
 EXECMD_OBJS= trexecmd.o $(BIGLIB)
 trexecmd : $(EXECMD_OBJS)
 	$(CXX) $(CXXFLAGS) -o trexecmd $(EXECMD_OBJS)