*** empty log message ***

This commit is contained in:
dockes 2005-02-10 19:52:50 +00:00
parent 1a897c47b3
commit 40a5905b15
4 changed files with 170 additions and 3 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.8 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_TEXTSPLIT
@ -22,6 +22,12 @@ using namespace std;
* of a 256 slot array).
*
* We are also not using capitalization information.
*
* How to fix: use some kind of UTF-8 aware iterator, or convert to UCS-4 first.
* Then special-case all 'real' UTF-8 (non-ASCII) chars, by checking for the few
punctuation ones we're interested in (put them in a map). Then
classify all other non-ASCII chars as letters, and use the current method
for ASCII chars (<= 127).
*/
// Character classes: we have three main groups, and then some chars
@ -117,6 +123,9 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
return true;
}
// A routine called from different places in text_to_words(), to adjust
// the current state and call the word handler. This is purely for
// factoring out code common to several places in text_to_words().
bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
bool spanerase, int bp)
{
@ -126,19 +135,25 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
word.erase();
return true;
}
// Emit span or both word and span if they are different
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
return false;
if (word.length() != span.length() && !fq)
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
return false;
// Adjust state
wordpos++;
if (spanerase)
span.erase();
word.erase();
return true;
}
/*
/**
* Splitting a text into terms to be indexed.
* We basically emit a word every time we see a separator, but some chars are
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
* are handled properly,

View File

@ -2,7 +2,9 @@ include ../mk/sysconf
BIGLIB = ../lib/librcl.a
PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse trexecmd
PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse \
trexecmd utf8iter
all: $(PROGS)
FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o
@ -51,5 +53,12 @@ wipedir : $(WIPEDIR_OBJS)
trwipedir.o : ../utils/wipedir.cpp
$(CXX) $(CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
wipedir.cpp
# utf8iter program: utf8iter.cpp is compiled with -DTEST_UTF8ITER into
# trutf8iter.o, which is then linked with $(BIGLIB) and $(LIBICONV).
# NOTE(review): the compile command names utf8iter.cpp without the ../utils/
# prefix used in the prerequisite list -- presumably resolved through a
# VPATH-like setting from the included sysconf; confirm.
UTF8ITER_OBJS= trutf8iter.o $(BIGLIB)
utf8iter : $(UTF8ITER_OBJS)
$(CXX) $(CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV)
trutf8iter.o : ../utils/utf8iter.cpp utf8iter.h
$(CXX) $(CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
utf8iter.cpp
clean:
rm -f *.o $(PROGS)

53
src/utils/utf8iter.cpp Normal file
View File

@ -0,0 +1,53 @@
#ifndef lint
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <stdio.h>
#include <string>
#include <iostream>
#include <list>
#include "debuglog.h"
using namespace std;
#include "utf8iter.h"
#include "readfile.h"
/**
 * Test driver for Utf8Iter.
 *
 * Usage: utf8iter infile outfile
 *
 * Reads infile into memory, walks it character by character with Utf8Iter,
 * and writes each decoded value to outfile as a raw unsigned int (host byte
 * order). While iterating, the bytes of every character are also appended
 * to a second string, which must end up identical to the input.
 *
 * @return 0 on success, 1 on usage error, I/O error, decoding error or
 *         round-trip mismatch.
 */
int main(int argc, char **argv)
{
    if (argc != 3) {
        cerr << "Usage: utf8iter infile outfile" << endl;
        return 1;
    }
    const char *infile = argv[1];
    const char *outfile = argv[2];

    string in;
    string out;
    if (!file_to_string(infile, in)) {
        cerr << "Cant read file\n" << endl;
        return 1;
    }

    Utf8Iter it(in);

    // The output is a stream of raw integers, so open it in binary mode.
    FILE *fp = fopen(outfile, "wb");
    if (fp == 0) {
        fprintf(stderr, "cant create %s\n", outfile);
        return 1;
    }

    bool writeok = true;
    while (!it.eof()) {
        unsigned int value = *it;
        // A failed decode returns a sentinel and sets the error flag:
        // do not write the sentinel out as if it were a character.
        if (it.error())
            break;
        it.appendchartostring(out);
        it++;
        if (fwrite(&value, sizeof(value), 1, fp) != 1) {
            writeok = false;
            break;
        }
    }
    // fclose() flushes buffered data, so its result matters too.
    if (fclose(fp) != 0)
        writeok = false;

    if (!writeok) {
        fprintf(stderr, "cant write %s\n", outfile);
        return 1;
    }
    if (it.error()) {
        fprintf(stderr, "Conversion error occurred\n");
        return 1;
    }
    if (in != out) {
        fprintf(stderr, "error: out != in\n");
        return 1;
    }
    return 0;
}

90
src/utils/utf8iter.h Normal file
View File

@ -0,0 +1,90 @@
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */
/**
 * A small helper class to iterate over the characters of a UTF-8 string.
 *
 * This is not an STL iterator and it is not a general-purpose design, just
 * a convenience for some specific uses.
 *
 * The iterator keeps a reference to the string passed to the constructor:
 * that string must stay alive and unmodified for as long as the iterator
 * is used.
 *
 * Once a malformed or truncated sequence is met, the iterator enters a
 * sticky error state: error() and eof() both return true and nothing more
 * can be decoded. Lead bytes and continuation bytes are validated; overlong
 * encodings and surrogate values are not rejected.
 */
class Utf8Iter {
    /** Byte length of the character at pos, or 0 if not computed yet. */
    unsigned int cl;
    /** The string being iterated over (not owned). */
    const std::string &s;
    /** Byte offset of the current character in s. */
    std::string::size_type pos;
    /** Sticky error flag. */
    bool bad;

    /** Byte at offset i of the input, as an unsigned value. */
    unsigned int byteat(std::string::size_type i) const {
        return static_cast<unsigned char>(s[i]);
    }

    /**
     * Compute and validate the byte length of the character at pos.
     *
     * On success cl is set to the length (1 to 6) and 0 is returned.
     * On failure (already in error, nothing left to read, invalid lead
     * byte, truncated sequence or invalid continuation byte) the error
     * flag is set, cl is left at 0 and -1 is returned.
     */
    int compute_cl() {
        cl = 0;
        if (bad)
            return -1;
        if (pos >= s.length()) {
            // Trying to decode past the end of the input is an error
            bad = true;
            return -1;
        }

        const unsigned int z = byteat(pos);
        unsigned int len = 0;
        if (z <= 127) {
            len = 1;
        } else if (z >= 192 && z <= 223) {
            len = 2;
        } else if (z >= 224 && z <= 239) {
            len = 3;
        } else if (z >= 240 && z <= 247) {
            len = 4;
        } else if (z >= 248 && z <= 251) {
            len = 5;
        } else if (z >= 252 && z <= 253) {
            len = 6;
        }
        // len == 0: stray continuation byte (128-191) or invalid 254/255
        if (!len || s.length() - pos < len) {
            bad = true;
            return -1;
        }
        // Every byte after the lead one must look like 10xxxxxx. Without
        // this check a malformed sequence would decode to a garbage value.
        for (unsigned int i = 1; i < len; i++) {
            if ((byteat(pos + i) & 0xC0) != 0x80) {
                bad = true;
                return -1;
            }
        }
        cl = len;
        return 0;
    }

public:
    /** Start iterating at the beginning of in (kept by reference). */
    Utf8Iter(const std::string &in) : cl(0), s(in), pos(0), bad(false) {}

    /**
     * Return the UCS-4 value of the current character as a machine integer,
     * or (unsigned int)-1 if it cannot be decoded (the error flag is then
     * set).
     */
    unsigned int operator*() {
        if (!cl && compute_cl() < 0)
            return (unsigned int)-1;
        // Payload bits of the lead byte: 7 for a single byte character,
        // else 8 - (cl + 1), i.e. 0x1F, 0x0F, 0x07, 0x03, 0x01.
        const unsigned int leadmask = (cl == 1) ? 0x7Fu : (0xFFu >> (cl + 1));
        unsigned int value = byteat(pos) & leadmask;
        // Each continuation byte contributes its 6 low bits
        for (unsigned int i = 1; i < cl; i++)
            value = (value << 6) | (byteat(pos + i) & 0x3Fu);
        return value;
    }

    /**
     * Advance to the next character.
     * @return the new byte offset, or string::npos if the current character
     *         cannot be decoded (the position is then left unchanged).
     */
    std::string::size_type operator++(int) {
        if (bad || (!cl && compute_cl() < 0)) {
            return std::string::npos;
        }
        pos += cl;
        cl = 0;
        return pos;
    }

    /**
     * Append the bytes of the current character to out.
     * @return false (out untouched) if the current character cannot be
     *         decoded, else true.
     */
    bool appendchartostring(std::string &out) {
        if (bad || (!cl && compute_cl() < 0)) {
            return false;
        }
        out.append(s, pos, cl);
        return true;
    }

    /** True when the whole input was consumed or an error occurred. */
    bool eof() const {
        return bad || pos >= s.length();
    }

    /** True if a decoding error occurred. */
    bool error() const {
        return bad;
    }
};
#endif /* _UTF8ITER_H_INCLUDED_ */