From 9a9ce937283489a24b5c5e6b6ee8d6e282538cf4 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 6 Jan 2006 13:18:17 +0000 Subject: [PATCH] integrated case-folding into unac for better performance --- src/common/Makefile | 9 +++++---- src/common/unacpp.cpp | 31 ++++++++++--------------------- src/common/unacpp.h | 10 ++++------ src/rcldb/rcldb.cpp | 36 ++++++++++-------------------------- 4 files changed, 29 insertions(+), 57 deletions(-) diff --git a/src/common/Makefile b/src/common/Makefile index 032c6cda..762c00bc 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -1,5 +1,8 @@ +# @(#$Id: Makefile,v 1.8 2006-01-06 13:18:17 dockes Exp $ (C) 2005 J.F.Dockes -# Only test progs in there +include ../mk/sysconf + +# Only test executables get built in here PROGS = internfile unacpp textsplit all: $(BIGLIB) $(PROGS) @@ -29,8 +32,6 @@ trinternfile.o : internfile.cpp $(CXX) $(CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \ internfile.cpp -clean:: +clean: rm -f *.o $(PROGS) -include ../mk/sysconf - diff --git a/src/common/unacpp.cpp b/src/common/unacpp.cpp index 7f309e3c..35eea5fb 100644 --- a/src/common/unacpp.cpp +++ b/src/common/unacpp.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.5 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.6 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_UNACPP @@ -17,28 +17,16 @@ using std::string; #include "unac.h" -bool unac_cpp(const std::string &in, std::string &out, const char *encoding) +bool unacmaybefold(const std::string &in, std::string &out, + const char *encoding, bool dofold) { char *cout = 0; size_t out_len; - - if (unac_string(encoding, in.c_str(), in.length(), &cout, &out_len) < 0) { - char cerrno[20]; - sprintf(cerrno, "%d", errno); - out = string("unac_string failed, errno : ") + cerrno; - return false; - } - out.assign(cout, out_len); - free(cout); - return true; -} - -bool unac_cpp_utf16be(const 
std::string &in, std::string &out) -{ - char *cout = 0; - size_t out_len; - - if (unac_string_utf16(in.c_str(), in.length(), &cout, &out_len) < 0) { + int status; + status = dofold ? + unacfold_string(encoding, in.c_str(), in.length(), &cout, &out_len) : + unac_string(encoding, in.c_str(), in.length(), &cout, &out_len); + if (status < 0) { char cerrno[20]; sprintf(cerrno, "%d", errno); out = string("unac_string failed, errno : ") + cerrno; @@ -65,6 +53,7 @@ using namespace std; int main(int argc, char **argv) { + bool dofold = true; if (argc != 4) { cerr << "Usage: unacpp " << endl; exit(1); @@ -79,7 +68,7 @@ int main(int argc, char **argv) exit(1); } string ndata; - if (!unac_cpp(odata, ndata, encoding)) { + if (!unacmaybefold(odata, ndata, encoding, dofold)) { cerr << "unac: " << ndata << endl; exit(1); } diff --git a/src/common/unacpp.h b/src/common/unacpp.h index 9d38698e..2f51eef9 100644 --- a/src/common/unacpp.h +++ b/src/common/unacpp.h @@ -1,12 +1,10 @@ #ifndef _UNACPP_H_INCLUDED_ #define _UNACPP_H_INCLUDED_ -/* @(#$Id: unacpp.h,v 1.2 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: unacpp.h,v 1.3 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes */ #include -// A small wrapper for unac.c -extern bool unac_cpp(const std::string &in, std::string &out, - const char *encoding = "UTF-8"); -extern bool unac_cpp_utf16be(const std::string &in, std::string &out); - +// A small stringified wrapper for unac.c +extern bool unacmaybefold(const std::string &in, std::string &out, + const char *encoding, bool dofold); #endif /* _UNACPP_H_INCLUDED_ */ diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index b1a06d83..aafcc5aa 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ 
-213,44 +213,28 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int) // Unaccent and lowercase data, replace \n\r with spaces // Removing crlfs is so that we can use the text in the document data fields. -// Use unac for removing accents -// Use our own lower-casing function (built from Unicode tables) -// Everything is converted to/from UTF-16BE at begin/end as this the internal -// format used by the processing functions. +// Use unac (with folding extension) for removing accents and casefolding // -// A possible optimization would be to remove accented characters from -// the lowercasing function tables, as we execute unac first. It -// might even be possible must probably non trivial to combine both -// conversions +// Note that we always return true (but set out to "" on error). We don't +// want to stop indexing because of a bad string bool Rcl::dumb_string(const string &in, string &out) { out.erase(); if (in.empty()) return true; - string s1, s2; + string s1; + s1.reserve(in.length()); for (unsigned int i = 0; i < in.length(); i++) { if (in[i] == '\n' || in[i] == '\r') s1 += ' '; else s1 += in[i]; } - if (!transcode(s1, s2, "UTF-8","UTF-16BE")) { - LOGERR(("dumb_string: convert to utf-16be failed\n")); - return false; - } - - if (!unac_cpp_utf16be(s2, s1)) { - LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str())); - return false; - } - if (!ucs2lower(s1, s2)) { - LOGERR(("dumb_string: ucs2lower failed\n")); - return false; - } - if (!transcode(s2, out, "UTF-16BE", "UTF-8")) { - LOGERR(("dumb_string: convert back to utf-8 failed\n")); - return false; + if (!unacmaybefold(s1, out, "UTF-8", true)) { + LOGERR(("dumb_string: unac failed for %s\n", in.c_str())); + out.erase(); + return true; } return true; }