Use proper unicode lowercasing

2006-01-05 16:37:27 +00:00 · 2006-01-05 16:37:27 +00:00 · 50501c6953
commit 50501c6953
parent 66878ddf15
5 changed files with 67 additions and 24 deletions
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.4 2005-11-24 07:16:15 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.5 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif

 #ifndef TEST_UNACPP
@ -33,6 +33,22 @@ bool unac_cpp(const std::string &in, std::string &out, const char *encoding)
    return true;
 }

+// Remove accents from 'in' by passing its raw bytes to unac_string_utf16().
+// As the function name says, 'in' is expected to hold UTF-16BE text.
+// On success, 'out' receives the out_len bytes produced by unac and true
+// is returned. On failure, 'out' is set to an error message holding the
+// current errno value and false is returned.
+bool unac_cpp_utf16be(const std::string &in, std::string &out)
+{
+    char *cout = 0;
+    size_t out_len;
+
+    if (unac_string_utf16(in.c_str(), in.length(), &cout, &out_len) < 0) {
+	char cerrno[20];
+	sprintf(cerrno, "%d", errno);
+	out = string("unac_string failed, errno : ") + cerrno;
+	// NOTE(review): cout is not freed on this path; confirm that
+	// unac_string_utf16() cannot leave an allocation in it on error.
+	return false;
+    }
+    // Copy with an explicit length rather than as a C string: UTF-16
+    // data normally contains zero bytes.
+    out.assign(cout, out_len);
+    // The result buffer was returned through &cout; release it here.
+    free(cout);
+    return true;
+}
+
 #else // not testing

 #include <unistd.h>
--- a/src/common/unacpp.h
+++ b/src/common/unacpp.h
@ -1,11 +1,12 @@
 #ifndef _UNACPP_H_INCLUDED_
 #define _UNACPP_H_INCLUDED_
-/* @(#$Id: unacpp.h,v 1.1 2004-12-17 15:36:13 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: unacpp.h,v 1.2 2006-01-05 16:37:26 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

 // A small wrapper for unac.c
 extern bool unac_cpp(const std::string &in, std::string &out, 
 		     const char *encoding = "UTF-8");
+// Variant of unac_cpp() with no encoding parameter: per its name, the
+// input is expected to be UTF-16BE already.
+extern bool unac_cpp_utf16be(const std::string &in, std::string &out);

 #endif /* _UNACPP_H_INCLUDED_ */
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -5,7 +5,7 @@ LIBS = librcl.a

 all: depend $(LIBS)

-OBJS = base64.o conftree.o csguess.o debuglog.o \
+OBJS = base64.o caseconvert.o conftree.o csguess.o debuglog.o \
     execmd.o wipedir.o \
     fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o history.o \
     htmlparse.o \
@ -15,7 +15,8 @@ OBJS = base64.o conftree.o csguess.o debuglog.o \
     textsplit.o transcode.o \
     unacpp.o unac.o docseq.o sortseq.o copyfile.o

-SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
+SRCS = ../utils/caseconvert.cpp ../utils/conftree.cpp \
+     ../index/csguess.cpp ../utils/debuglog.cpp \
     ../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
     ../utils/wipedir.cpp ../utils/fstreewalk.cpp \
     ../common/mh_html.cpp ../common/mh_mail.cpp ../common/mh_exec.cpp \
@ -45,6 +46,8 @@ debuglog.o : ../utils/debuglog.cpp
 	$(CXX) $(CXXFLAGS) -c $<
 execmd.o : ../utils/execmd.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
+caseconvert.o : ../utils/caseconvert.cpp 
+	$(CXX) $(CXXFLAGS) -c $<
 wipedir.o : ../utils/wipedir.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 fstreewalk.o : ../utils/fstreewalk.cpp 
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.44 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -22,6 +22,7 @@ using namespace std;
 #include "pathut.h"
 #include "smallut.h"
 #include "pathhash.h"
+#include "caseconvert.h"

 #include "xapian.h"
 #include <xapian/stem.h>
@ -210,31 +211,46 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
    return true;
 }

-// Unaccent and lowercase data: use unac 
-// for accents, and do it by hand for upper / lower. 
-// TOBEDONE: lowercasing is done only for ascii letters, just A-Z -> a-z 
+// Unaccent and lowercase data, replace \n\r with spaces
 // Removing crlfs is so that we can use the text in the document data fields.
+// Use unac for removing accents
+// Use our own lower-casing function (built from Unicode tables)
+// Everything is converted to/from UTF-16BE at begin/end as this is the
+// internal format used by the processing functions.
+//
+// A possible optimization would be to remove accented characters from
+// the lowercasing function tables, as we execute unac first.  It
+// might even be possible, though probably non-trivial, to combine both
+// conversions.
 bool Rcl::dumb_string(const string &in, string &out)
 {
-    string inter;
     out.erase();
     if (in.empty())
 	return true;
-    if (!unac_cpp(in, inter)) {
-	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
-	// Ok, no need to stop the whole show
-	inter = "";
+
+    // s1 and s2 alternate as input/output buffers for the successive steps
+    string s1, s2;
+    // Step 1: copy the input byte by byte, turning CR and LF into spaces
+    for (unsigned int i = 0; i < in.length(); i++) {
+	if (in[i] == '\n' || in[i] == '\r')
+	    s1 += ' ';
+	else
+	    s1 += in[i];
     }
-    out.reserve(inter.length());
-    for (unsigned int i = 0; i < inter.length(); i++) {
-	if (inter[i] >= 'A' && inter[i] <= 'Z') {
-	    out += inter[i] + 'a' - 'A';
-	} else {
-	    if (inter[i] == '\n' || inter[i] == '\r')
-		out += ' ';
-	    else
-		out += inter[i];
-	}
+    // Step 2: UTF-8 -> UTF-16BE. Unlike the previous version, a failure
+    // in any of the following steps now makes the function return false.
+    if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {
+	LOGERR(("dumb_string: convert to utf-16be failed\n"));
+	return false;
+    }
+
+    // Step 3: remove accents (s2 -> s1)
+    if (!unac_cpp_utf16be(s2, s1)) {
+	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
+	return false;
+    }
+    // Step 4: lowercase (s1 -> s2)
+    if (!ucs2lower(s1, s2)) {
+	LOGERR(("dumb_string: ucs2lower failed\n"));
+	return false;
+    }
+    // Step 5: convert the result back to UTF-8, directly into 'out'
+    if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
+	LOGERR(("dumb_string: convert back to utf-8 failed\n"));
+	return false;
     }
     return true;
 }
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@ -1,5 +1,6 @@

-PROGS = trconftree wipedir smallut trfstreewalk trpathut transcode \
+PROGS = trcaseconvert trconftree wipedir smallut trfstreewalk trpathut \
+      transcode \
      trmimeparse trexecmd utf8iter idfile

 all: $(PROGS) $(BIGLIB)
@ -20,6 +21,12 @@ trpathut : $(PATHUT_OBJS)
 trpathut.o : pathut.cpp pathut.h
 	$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp

+# trcaseconvert: presumably the standalone test driver for caseconvert.
+# caseconvert.cpp is compiled a second time with -DTEST_CASECONVERT into
+# trcaseconvert.o, then linked with the regular caseconvert.o and $(BIGLIB).
+CASECONVERT_OBJS= trcaseconvert.o caseconvert.o  $(BIGLIB)
+trcaseconvert : $(CASECONVERT_OBJS)
+	$(CXX) $(CXXFLAGS) -o trcaseconvert $(CASECONVERT_OBJS)
+trcaseconvert.o : caseconvert.cpp caseconvert.h
+	$(CXX) -o trcaseconvert.o -c $(CXXFLAGS) -DTEST_CASECONVERT caseconvert.cpp
+
 EXECMD_OBJS= trexecmd.o $(BIGLIB)
 trexecmd : $(EXECMD_OBJS)
 	$(CXX) $(CXXFLAGS) -o trexecmd $(EXECMD_OBJS)