From 9a9ce937283489a24b5c5e6b6ee8d6e282538cf4 Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Fri, 6 Jan 2006 13:18:17 +0000
Subject: [PATCH] integrated case-folding into unac for better performance

---
 src/common/Makefile   |  9 +++++----
 src/common/unacpp.cpp | 31 ++++++++++---------------------
 src/common/unacpp.h   | 10 ++++------
 src/rcldb/rcldb.cpp   | 36 ++++++++++--------------------------
 4 files changed, 29 insertions(+), 57 deletions(-)
diff --git a/src/common/Makefile b/src/common/Makefile
index 032c6cda..762c00bc 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -1,5 +1,8 @@
+# @(#$Id: Makefile,v 1.8 2006-01-06 13:18:17 dockes Exp $  (C) 2005 J.F.Dockes
 
-# Only test progs in there
+include ../mk/sysconf
+
+# Only test executables get built in here
 PROGS = internfile unacpp textsplit
 
 all: $(BIGLIB) $(PROGS) 
@@ -29,8 +32,6 @@ trinternfile.o : internfile.cpp
 	$(CXX) $(CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \
 	       internfile.cpp
 
-clean::
+clean:
 	rm -f *.o $(PROGS)
 
-include ../mk/sysconf
-
diff --git a/src/common/unacpp.cpp b/src/common/unacpp.cpp
index 7f309e3c..35eea5fb 100644
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.5 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.6 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #ifndef TEST_UNACPP
@@ -17,28 +17,16 @@ using std::string;
 #include "unac.h"
 
 
-bool unac_cpp(const std::string &in, std::string &out, const char *encoding)
+bool unacmaybefold(const std::string &in, std::string &out, 
+		   const char *encoding, bool dofold)
 {
     char *cout = 0;
     size_t out_len;
-
-    if (unac_string(encoding, in.c_str(), in.length(), &cout, &out_len) < 0) {
-	char cerrno[20];
-	sprintf(cerrno, "%d", errno);
-	out = string("unac_string failed, errno : ") + cerrno;
-	return false;
-    }
-    out.assign(cout, out_len);
-    free(cout);
-    return true;
-}
-
-bool unac_cpp_utf16be(const std::string &in, std::string &out)
-{
-    char *cout = 0;
-    size_t out_len;
-
-    if (unac_string_utf16(in.c_str(), in.length(), &cout, &out_len) < 0) {
+    int status;
+    status = dofold ? 
+	unacfold_string(encoding, in.c_str(), in.length(), &cout, &out_len) :
+	unac_string(encoding, in.c_str(), in.length(), &cout, &out_len);
+    if (status < 0) {
 	char cerrno[20];
 	sprintf(cerrno, "%d", errno);
 	out = string("unac_string failed, errno : ") + cerrno;
@@ -65,6 +53,7 @@ using namespace std;
 
 int main(int argc, char **argv)
 {
+    bool dofold = true;
     if (argc != 4) {
 	cerr << "Usage: unacpp  <encoding> <infile> <outfile>" << endl;
 	exit(1);
@@ -79,7 +68,7 @@ int main(int argc, char **argv)
 	exit(1);
     }
     string ndata;
-    if (!unac_cpp(odata, ndata, encoding)) {
+    if (!unacmaybefold(odata, ndata, encoding, dofold)) {
 	cerr << "unac: " << ndata << endl;
 	exit(1);
     }
diff --git a/src/common/unacpp.h b/src/common/unacpp.h
index 9d38698e..2f51eef9 100644
--- a/src/common/unacpp.h
+++ b/src/common/unacpp.h
@@ -1,12 +1,10 @@
 #ifndef _UNACPP_H_INCLUDED_
 #define _UNACPP_H_INCLUDED_
-/* @(#$Id: unacpp.h,v 1.2 2006-01-05 16:37:26 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: unacpp.h,v 1.3 2006-01-06 13:18:17 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
 
-// A small wrapper for unac.c
-extern bool unac_cpp(const std::string &in, std::string &out, 
-		     const char *encoding = "UTF-8");
-extern bool unac_cpp_utf16be(const std::string &in, std::string &out);
-
+// A small std::string-based wrapper around unac.c
+extern bool unacmaybefold(const std::string &in, std::string &out, 
+			  const char *encoding, bool dofold);
 #endif /* _UNACPP_H_INCLUDED_ */
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index b1a06d83..aafcc5aa 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@@ -213,44 +213,28 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
 
 // Unaccent and lowercase data, replace \n\r with spaces
 // Removing crlfs is so that we can use the text in the document data fields.
-// Use unac for removing accents
-// Use our own lower-casing function (built from Unicode tables)
-// Everything is converted to/from UTF-16BE at begin/end as this the internal
-// format used by the processing functions.
+// Use unac (with folding extension) for removing accents and casefolding
 //
-// A possible optimization would be to remove accented characters from
-// the lowercasing function tables, as we execute unac first.  It
-// might even be possible must probably non trivial to combine both
-// conversions
+// Note that we always return true (but set out to "" on error). We don't
+// want to stop indexing because of a bad string.
 bool Rcl::dumb_string(const string &in, string &out)
 {
     out.erase();
     if (in.empty())
 	return true;
 
-    string s1, s2;
+    string s1;
+    s1.reserve(in.length());
     for (unsigned int i = 0; i < in.length(); i++) {
 	if (in[i] == '\n' || in[i] == '\r')
 	    s1 += ' ';
 	else
 	    s1 += in[i];
     }
-    if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {
-	LOGERR(("dumb_string: convert to utf-16be failed\n"));
-	return false;
-    }
-
-    if (!unac_cpp_utf16be(s2, s1)) {
-	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
-	return false;
-    }
-    if (!ucs2lower(s1, s2)) {
-	LOGERR(("dumb_string: ucs2lower failed\n"));
-	return false;
-    }
-    if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
-	LOGERR(("dumb_string: convert back to utf-8 failed\n"));
-	return false;
+    if (!unacmaybefold(s1, out, "UTF-8", true)) {
+	LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
+	out.erase();
+	return true;
     }
     return true;
 }