*** empty log message ***

2005-02-10 19:52:50 +00:00 · 2005-02-10 19:52:50 +00:00 · 40a5905b15
commit 40a5905b15
parent 1a897c47b3
4 changed files with 170 additions and 3 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.8 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT

@ -22,6 +22,12 @@ using namespace std;
 * of a 256 slot array).
 *
 * We are also not using capitalization information.
+ *
+ * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
+ * Then special-case all 'real' utf chars, by checking for the few
+ * punctuation ones we're interested in (put them in a map). Then
+ * classify all other non-ascii chars as letters, and use the current
+ * method for chars < 128.
 */

 // Character classes: we have three main groups, and then some chars
@ -117,6 +123,9 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
    return true;
 }

+// A routine called from different places in text_to_words(), to adjust
+// the current state and call the word handler. This is purely for
+// factoring common code out of different places in text_to_words().
 bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
 		       bool spanerase, int bp)
 {
@ -126,19 +135,25 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
 	word.erase();
 	return true;
    }
+
+    // Emit span or both word and span if they are different
    if (!emitterm(true, span, spanpos, bp-span.length(), bp))
 	return false;
    if (word.length() != span.length() && !fq)
 	if (!emitterm(false, word, wordpos, bp-word.length(), bp))
 	    return false;
+
+    // Adjust state
    wordpos++;
    if (spanerase)
 	span.erase();
    word.erase();
+
    return true;
 }

-/* 
+/** 
+ * Splitting a text into terms to be indexed.
 * We basically emit a word every time we see a separator, but some chars are
 * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, 
 * are handled properly,
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@ -2,7 +2,9 @@ include ../mk/sysconf

 BIGLIB = ../lib/librcl.a

-PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse trexecmd
+PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse \
+      trexecmd utf8iter
+
 all: $(PROGS)

 FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o
@ -51,5 +53,12 @@ wipedir : $(WIPEDIR_OBJS)
 trwipedir.o : ../utils/wipedir.cpp 
 	$(CXX) $(CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
 	       wipedir.cpp
+
+UTF8ITER_OBJS= trutf8iter.o  $(BIGLIB)
+utf8iter : $(UTF8ITER_OBJS)
+	$(CXX) $(CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV)
+trutf8iter.o : ../utils/utf8iter.cpp utf8iter.h
+	$(CXX) $(CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
+	       utf8iter.cpp
 clean: 
 	rm -f *.o $(PROGS)
--- a/src/utils/utf8iter.cpp
+++ b/src/utils/utf8iter.cpp
@ -0,0 +1,53 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2005 J.F.Dockes";
+#endif
+#include <stdio.h>
+#include <string>
+#include <iostream>
+#include <list>
+#include "debuglog.h"
+using namespace std;
+
+#include "utf8iter.h"
+#include "readfile.h"
+
+
+/**
+ * Test driver for Utf8Iter.
+ *
+ * Usage: utf8iter infile outfile
+ *
+ * Loads infile into memory, walks it with a Utf8Iter, and for every
+ * character writes the decoded value to outfile as the raw bytes of an
+ * unsigned int. The bytes of each character are also re-appended to a
+ * second string, which must compare equal to the input at the end.
+ *
+ * Exits with status 0 when the whole input was walked without error and
+ * the rebuilt string equals the input; exits with status 1 on a usage
+ * error, an unreadable input, an output file that cannot be created, a
+ * conversion error, or a mismatch between input and rebuilt string.
+ *
+ * file_to_string() is declared elsewhere; presumably it fills the string
+ * with the file contents and returns false on failure - confirm in
+ * readfile.h.
+ */
+int main(int argc, char **argv)
+{
+    if (argc != 3) {
+	cerr << "Usage: utf8iter infile outfile" << endl;
+	exit(1);
+    }
+    const char *infile = argv[1];
+    const char *outfile = argv[2];
+    string in; // Raw bytes of the input file
+    string out; // Input rebuilt character by character, for the round-trip check
+    if (!file_to_string(infile, in)) {
+	cerr << "Cant read file\n" << endl;
+	exit(1);
+    }
+    Utf8Iter it(in);
+    FILE *fp = fopen(outfile, "w");
+    if (fp == 0) {
+	fprintf(stderr, "cant create %s\n", outfile);
+	exit(1);
+    }
+    while (!it.eof()) { // eof() is also true after an error, so a bad sequence ends the loop
+	unsigned int value = *it;
+	it.appendchartostring(out);
+	it++;
+	fwrite(&value, 4, 1, fp); // NOTE(review): size is hard-coded to 4 rather than sizeof(value), and the result is not checked
+    }
+    fclose(fp);
+    if (it.error()) {
+	fprintf(stderr, "Conversion error occurred\n");
+	exit(1);
+    }
+    if (in != out) { // Every byte of the input must have been consumed and copied unchanged
+	fprintf(stderr, "error: out != in\n");
+	exit(1);
+    }
+    exit(0);
+}
+
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -0,0 +1,90 @@
+#ifndef _UTF8ITER_H_INCLUDED_
+#define _UTF8ITER_H_INCLUDED_
+/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $  (C) 2004 J.F.Dockes */
+
+/** 
+ * A small helper class to iterate over utf8 strings. This is not an
+ * STL iterator and this is not well designed, just convenient for
+ * some specific uses.
+ *
+ * The iterator walks the string one encoded character at a time. The
+ * byte length of the current character is derived from its first byte
+ * and cached until the position changes. Lengths of 1 to 6 bytes are
+ * accepted.
+ *
+ * Errors are sticky: once an invalid first byte or a truncated sequence
+ * is seen, the error flag stays set, eof() reports true, and every
+ * other operation fails.
+ *
+ * NOTE(review): this header uses the unqualified name string and does
+ * not include the string header itself, so the including file has to
+ * provide both before including it.
+ */
+class Utf8Iter {
+    unsigned int cl; // Byte length of the sequence at pos; 0 means "not computed yet"
+    const string &s; // Input text, held by reference (not copied)
+    string::size_type pos; // Byte offset in s of the current sequence
+    bool bad; // Set on the first decoding problem and never cleared afterwards
+    int compute_cl() { // Derives cl from the first byte at pos; returns 0 if ok, -1 (with bad set) if not
+	cl = 0;
+	if (bad)
+	    return -1;
+	unsigned int z = (unsigned char)s[pos];
+	if (z <= 127) {
+	    cl = 1;
+	} else if (z>=192 && z <= 223) {
+	    cl = 2;
+	} else if (z >= 224 && z <= 239) {
+	    cl = 3;
+	} else if (z >= 240 && z <= 247) {
+	    cl = 4;
+	} else if (z >= 248 && z <= 251) {
+	    cl = 5;
+	} else if (z >= 252 && z <= 253) {
+	    cl = 6;
+	} 
+	if (!cl || s.length() - pos < cl) { // First byte in 128-191 or 254-255, or the sequence runs past the end of s
+	    bad = true;
+	    cl = 0;
+	    return -1;
+	}
+	return 0;
+    }
+ public:
+    Utf8Iter(const string &in) : cl(0), s(in), pos(0), bad(false) {} // Starts at byte 0; in must outlive the iterator
+
+    /** operator* returns the ucs4 value as a machine integer.
+     *
+     * The value is rebuilt by subtracting the marker value from each
+     * byte (192, 224, ... for the first byte, 128 for the following
+     * ones) and weighting the results by powers of 64.
+     *
+     * Returns (unsigned int)-1 when the length of the current sequence
+     * cannot be determined; the error flag is then set.
+     *
+     * NOTE(review): the bytes after the first one are not checked to be
+     * in the 128-191 range, so malformed input produces a meaningless
+     * value instead of an error.
+     * NOTE(review): when pos is at the end of the string no byte is
+     * left, the length check in compute_cl() fails and the iterator is
+     * flagged bad; callers should test eof() before dereferencing.
+     */
+    unsigned int operator*() {
+	if (!cl && compute_cl() < 0)
+	    return (unsigned int)-1;
+	switch (cl) {
+	case 1: return (unsigned char)s[pos];
+	case 2: return ((unsigned char)s[pos] - 192) * 64 + (unsigned char)s[pos+1] - 128 ;
+	case 3: return ((unsigned char)s[pos]-224)*4096 + ((unsigned char)s[pos+1]-128)*64 + (unsigned char)s[pos+2]-128;
+	case 4: return ((unsigned char)s[pos]-240)*262144 + ((unsigned char)s[pos+1]-128)*4096 + 
+		((unsigned char)s[pos+2]-128)*64 + (unsigned char)s[pos+3]-128;
+	case 5: return ((unsigned char)s[pos]-248)*16777216 + ((unsigned char)s[pos+1]-128)*262144 + 
+		((unsigned char)s[pos+2]-128)*4096 + ((unsigned char)s[pos+3]-128)*64 + (unsigned char)s[pos+4]-128;
+	case 6: return  ((unsigned char)s[pos]-252)*1073741824 + ((unsigned char)s[pos+1]-128)*16777216 + 
+		((unsigned char)s[pos+2]-128)*262144 + ((unsigned char)s[pos+3]-128)*4096 + 
+		((unsigned char)s[pos+4]-128)*64 + (unsigned char)s[pos+5]-128;
+	default: // Safety net: compute_cl() only ever produces lengths 1 to 6
+	    bad = true;
+	    cl = 0;
+	    return (unsigned int)-1;
+	}
+    }
+    /** Moves to the next character by skipping the bytes of the current
+     * one, and forgets the cached length.
+     *
+     * Returns the new byte offset, or string::npos if the iterator is
+     * (or becomes) bad. Note that, although this is the postfix form,
+     * the value returned is the position after the move.
+     */
+    string::size_type operator++(int) {
+	if (bad || (!cl && compute_cl() < 0)) {
+	    return string::npos;
+	}
+	pos += cl;
+	cl = 0;
+	return pos;
+    }
+    /** Appends the bytes of the current character, unchanged, to out.
+     *
+     * Returns false, leaving out untouched, if the iterator is (or
+     * becomes) bad; true otherwise. The position is not moved.
+     */
+    bool appendchartostring(string &out) {
+	if (bad || (!cl && compute_cl() < 0)) {
+	    return false;
+	}
+	out += s.substr(pos, cl);
+	return true;
+    }
+    bool eof() { // True at the end of the string, and also after any error
+	return bad || pos == s.length();
+    }
+    bool error() { // True once an invalid or truncated sequence has been met
+	return bad;
+    }
+};
+
+
+#endif /* _UTF8ITER_H_INCLUDED_ */