integrated case-folding into unac for better performance

This commit is contained in:
dockes 2006-01-06 13:18:17 +00:00
parent 00b954c4ef
commit 9a9ce93728
4 changed files with 29 additions and 57 deletions

View File

@ -1,5 +1,8 @@
# @(#$Id: Makefile,v 1.8 2006-01-06 13:18:17 dockes Exp $ (C) 2005 J.F.Dockes
# Only test progs in there include ../mk/sysconf
# Only test executables get built in here
PROGS = internfile unacpp textsplit PROGS = internfile unacpp textsplit
all: $(BIGLIB) $(PROGS) all: $(BIGLIB) $(PROGS)
@ -29,8 +32,6 @@ trinternfile.o : internfile.cpp
$(CXX) $(CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \ $(CXX) $(CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \
internfile.cpp internfile.cpp
clean:: clean:
rm -f *.o $(PROGS) rm -f *.o $(PROGS)
include ../mk/sysconf

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.5 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.6 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#ifndef TEST_UNACPP #ifndef TEST_UNACPP
@ -17,28 +17,16 @@ using std::string;
#include "unac.h" #include "unac.h"
bool unac_cpp(const std::string &in, std::string &out, const char *encoding) bool unacmaybefold(const std::string &in, std::string &out,
const char *encoding, bool dofold)
{ {
char *cout = 0; char *cout = 0;
size_t out_len; size_t out_len;
int status;
if (unac_string(encoding, in.c_str(), in.length(), &cout, &out_len) < 0) { status = dofold ?
char cerrno[20]; unacfold_string(encoding, in.c_str(), in.length(), &cout, &out_len) :
sprintf(cerrno, "%d", errno); unac_string(encoding, in.c_str(), in.length(), &cout, &out_len);
out = string("unac_string failed, errno : ") + cerrno; if (status < 0) {
return false;
}
out.assign(cout, out_len);
free(cout);
return true;
}
bool unac_cpp_utf16be(const std::string &in, std::string &out)
{
char *cout = 0;
size_t out_len;
if (unac_string_utf16(in.c_str(), in.length(), &cout, &out_len) < 0) {
char cerrno[20]; char cerrno[20];
sprintf(cerrno, "%d", errno); sprintf(cerrno, "%d", errno);
out = string("unac_string failed, errno : ") + cerrno; out = string("unac_string failed, errno : ") + cerrno;
@ -65,6 +53,7 @@ using namespace std;
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
bool dofold = true;
if (argc != 4) { if (argc != 4) {
cerr << "Usage: unacpp <encoding> <infile> <outfile>" << endl; cerr << "Usage: unacpp <encoding> <infile> <outfile>" << endl;
exit(1); exit(1);
@ -79,7 +68,7 @@ int main(int argc, char **argv)
exit(1); exit(1);
} }
string ndata; string ndata;
if (!unac_cpp(odata, ndata, encoding)) { if (!unacmaybefold(odata, ndata, encoding, dofold)) {
cerr << "unac: " << ndata << endl; cerr << "unac: " << ndata << endl;
exit(1); exit(1);
} }

View File

@ -1,12 +1,10 @@
#ifndef _UNACPP_H_INCLUDED_ #ifndef _UNACPP_H_INCLUDED_
#define _UNACPP_H_INCLUDED_ #define _UNACPP_H_INCLUDED_
/* @(#$Id: unacpp.h,v 1.2 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: unacpp.h,v 1.3 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
// A small wrapper for unac.c // A small stringified wrapper for unac.c
extern bool unac_cpp(const std::string &in, std::string &out, extern bool unacmaybefold(const std::string &in, std::string &out,
const char *encoding = "UTF-8"); const char *encoding, bool dofold);
extern bool unac_cpp_utf16be(const std::string &in, std::string &out);
#endif /* _UNACPP_H_INCLUDED_ */ #endif /* _UNACPP_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -213,44 +213,28 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
// Unaccent and lowercase data, replace \n\r with spaces // Unaccent and lowercase data, replace \n\r with spaces
// Removing crlfs is so that we can use the text in the document data fields. // Removing crlfs is so that we can use the text in the document data fields.
// Use unac for removing accents // Use unac (with folding extension) for removing accents and casefolding
// Use our own lower-casing function (built from Unicode tables)
// Everything is converted to/from UTF-16BE at begin/end as this is the internal
// format used by the processing functions.
// //
// A possible optimization would be to remove accented characters from // Note that we always return true (but set out to "" on error). We don't
// the lowercasing function tables, as we execute unac first. It // want to stop indexation because of a bad string
// might even be possible but probably non trivial to combine both
// conversions
bool Rcl::dumb_string(const string &in, string &out) bool Rcl::dumb_string(const string &in, string &out)
{ {
out.erase(); out.erase();
if (in.empty()) if (in.empty())
return true; return true;
string s1, s2; string s1;
s1.reserve(in.length());
for (unsigned int i = 0; i < in.length(); i++) { for (unsigned int i = 0; i < in.length(); i++) {
if (in[i] == '\n' || in[i] == '\r') if (in[i] == '\n' || in[i] == '\r')
s1 += ' '; s1 += ' ';
else else
s1 += in[i]; s1 += in[i];
} }
if (!transcode(s1, s2, "UTF-8","UTF-16BE")) { if (!unacmaybefold(s1, out, "UTF-8", true)) {
LOGERR(("dumb_string: convert to utf-16be failed\n")); LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
return false; out.erase();
} return true;
if (!unac_cpp_utf16be(s2, s1)) {
LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
return false;
}
if (!ucs2lower(s1, s2)) {
LOGERR(("dumb_string: ucs2lower failed\n"));
return false;
}
if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
LOGERR(("dumb_string: convert back to utf-8 failed\n"));
return false;
} }
return true; return true;
} }