From 40a5905b15ba94fb78ee79dbd201dc7a0225dc34 Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 10 Feb 2005 19:52:50 +0000 Subject: [PATCH] *** empty log message *** --- src/common/textsplit.cpp | 19 ++++++++- src/utils/Makefile | 11 ++++- src/utils/utf8iter.cpp | 53 +++++++++++++++++++++++ src/utils/utf8iter.h | 90 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 170 insertions(+), 3 deletions(-) create mode 100644 src/utils/utf8iter.cpp create mode 100644 src/utils/utf8iter.h diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index f0078595..a46ba26b 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.8 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT @@ -22,6 +22,12 @@ using namespace std; * of a 256 slot array). * * We are also not using capitalization information. + * + * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first. + * Then specialcase all 'real' utf chars, by checking for the few + punctuation ones we're interested in (put them in a map). Then + classify all other non-ascii as letter, and use the current method + for chars < 127. */ // Character classes: we have three main groups, and then some chars @@ -117,6 +123,9 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, return true; } +// A routine called from different places in text_to_words(), to adjust +// the current state and call the word handler. 
This is purely for +// factoring common code from different places text_to_words() bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos, bool spanerase, int bp) { @@ -126,19 +135,25 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos, word.erase(); return true; } + + // Emit span or both word and span if they are different if (!emitterm(true, span, spanpos, bp-span.length(), bp)) return false; if (word.length() != span.length() && !fq) if (!emitterm(false, word, wordpos, bp-word.length(), bp)) return false; + + // Adjust state wordpos++; if (spanerase) span.erase(); word.erase(); + return true; } -/* +/** + * Splitting a text into terms to be indexed. * We basically emit a word every time we see a separator, but some chars are * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, * are handled properly, diff --git a/src/utils/Makefile b/src/utils/Makefile index f35fa2ca..0fc42f8b 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -2,7 +2,9 @@ include ../mk/sysconf BIGLIB = ../lib/librcl.a -PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse trexecmd +PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse \ + trexecmd utf8iter + all: $(PROGS) FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o @@ -51,5 +53,12 @@ wipedir : $(WIPEDIR_OBJS) trwipedir.o : ../utils/wipedir.cpp $(CXX) $(CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \ wipedir.cpp + +UTF8ITER_OBJS= trutf8iter.o $(BIGLIB) +utf8iter : $(UTF8ITER_OBJS) + $(CXX) $(CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV) +trutf8iter.o : ../utils/utf8iter.cpp utf8iter.h + $(CXX) $(CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \ + utf8iter.cpp clean: rm -f *.o $(PROGS) diff --git a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp new file mode 100644 index 00000000..a7ba91a0 --- /dev/null +++ b/src/utils/utf8iter.cpp @@ -0,0 +1,53 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 
19:52:50 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif +#include +#include +#include +#include +#include "debuglog.h" +using namespace std; + +#include "utf8iter.h" +#include "readfile.h" + + + +int main(int argc, char **argv) +{ + if (argc != 3) { + cerr << "Usage: utf8iter infile outfile" << endl; + exit(1); + } + const char *infile = argv[1]; + const char *outfile = argv[2]; + string in; + string out; + if (!file_to_string(infile, in)) { + cerr << "Cant read file\n" << endl; + exit(1); + } + Utf8Iter it(in); + FILE *fp = fopen(outfile, "w"); + if (fp == 0) { + fprintf(stderr, "cant create %s\n", outfile); + exit(1); + } + while (!it.eof()) { + unsigned int value = *it; + it.appendchartostring(out); + it++; + fwrite(&value, 4, 1, fp); + } + fclose(fp); + if (it.error()) { + fprintf(stderr, "Conversion error occurred\n"); + exit(1); + } + if (in != out) { + fprintf(stderr, "error: out != in\n"); + exit(1); + } + exit(0); +} + diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h new file mode 100644 index 00000000..513fa228 --- /dev/null +++ b/src/utils/utf8iter.h @@ -0,0 +1,90 @@ +#ifndef _UTF8ITER_H_INCLUDED_ +#define _UTF8ITER_H_INCLUDED_ +/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */ + +/** + * A small helper class to iterate over utf8 strings. 
/**
 * Utf8Iter: a small helper class to iterate over the characters of a
 * UTF-8 encoded string. This is not an STL iterator and it is not well
 * designed, just convenient for some specific uses.
 *
 * The iterator only keeps a reference to the input string (no copy is
 * made), so the string must outlive the iterator.
 *
 * Sequences of 1 to 6 bytes are decoded (lead bytes 0..127 and 192..253).
 * A lead byte outside these ranges, a sequence running past the end of the
 * string, or a continuation byte which is not of the form 10xxxxxx puts the
 * iterator in a sticky error state: eof() and error() then both return
 * true. Overlong encodings and surrogate values are not rejected.
 * Reaching the end of the input is not an error.
 *
 * Typical use:
 *   Utf8Iter it(str);
 *   while (!it.eof()) { unsigned int c = *it; ...; it++; }
 *   if (it.error()) ...
 */
class Utf8Iter {
    // Byte length of the character at pos, or 0 if not computed yet
    unsigned int cl;
    // Input string. Held by reference, never modified
    const std::string &s;
    // Byte offset of the current character inside s
    std::string::size_type pos;
    // Sticky error flag, set when malformed input is met
    bool bad;

    /** Byte at offset i of the input, as an unsigned value */
    unsigned int byteat(std::string::size_type i) const {
        return static_cast<unsigned char>(s[i]);
    }

    /**
     * Compute and cache in cl the byte length of the character at pos.
     * @return 0 on success, -1 if no character is available (end of
     *   input, or malformed sequence, in which case bad is also set).
     */
    int compute_cl() {
        cl = 0;
        // Being at the end of the input is not a decoding error: fail
        // without touching the error flag.
        if (bad || pos >= s.length())
            return -1;

        const unsigned int lead = byteat(pos);
        unsigned int len = 0;
        if (lead <= 127) {
            len = 1;
        } else if (lead >= 192 && lead <= 223) {
            len = 2;
        } else if (lead >= 224 && lead <= 239) {
            len = 3;
        } else if (lead >= 240 && lead <= 247) {
            len = 4;
        } else if (lead >= 248 && lead <= 251) {
            len = 5;
        } else if (lead >= 252 && lead <= 253) {
            len = 6;
        }

        // Invalid lead byte (stray continuation byte, 0xFE, 0xFF) or
        // sequence truncated by the end of the string
        if (len == 0 || s.length() - pos < len) {
            bad = true;
            return -1;
        }
        // Every byte after the lead one must look like 10xxxxxx, else the
        // decoded value would be meaningless
        for (unsigned int i = 1; i < len; i++) {
            if ((byteat(pos + i) & 0xC0) != 0x80) {
                bad = true;
                return -1;
            }
        }
        cl = len;
        return 0;
    }

 public:
    /** Start iterating at the first byte of in. No copy of in is made */
    Utf8Iter(const std::string &in) : cl(0), s(in), pos(0), bad(false) {}

    /**
     * operator* returns the ucs4 value of the current character as a
     * machine integer, or (unsigned int)-1 if there is no valid character
     * at the current position (end of input or error).
     */
    unsigned int operator*() {
        if (!cl && compute_cl() < 0)
            return (unsigned int)-1;
        // Payload bits carried by the lead byte, indexed by sequence length
        static const unsigned char leadmask[7] =
            {0x00, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01};
        unsigned int value = byteat(pos) & leadmask[cl];
        // Each continuation byte contributes its 6 low bits
        for (unsigned int i = 1; i < cl; i++)
            value = (value << 6) | (byteat(pos + i) & 0x3F);
        return value;
    }

    /**
     * Move to the next character.
     * @return the new byte offset inside the string, or string::npos if
     *   the iterator could not advance (end of input or error).
     */
    std::string::size_type operator++(int) {
        if (bad || (!cl && compute_cl() < 0)) {
            return std::string::npos;
        }
        pos += cl;
        // Length of the next character is not known yet
        cl = 0;
        return pos;
    }

    /**
     * Append the bytes of the current character to out.
     * @return false, leaving out untouched, if there is no valid current
     *   character.
     */
    bool appendchartostring(std::string &out) {
        if (bad || (!cl && compute_cl() < 0)) {
            return false;
        }
        out.append(s, pos, cl);
        return true;
    }

    /** True when iteration must stop: end of input reached, or error */
    bool eof() const {
        return bad || pos == s.length();
    }

    /** True if malformed input was met */
    bool error() const {
        return bad;
    }
};