*** empty log message ***
This commit is contained in:
parent
1a897c47b3
commit
40a5905b15
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.8 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#ifndef TEST_TEXTSPLIT
|
||||
|
||||
@ -22,6 +22,12 @@ using namespace std;
|
||||
* of a 256 slot array).
|
||||
*
|
||||
* We are also not using capitalization information.
|
||||
*
|
||||
* How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
|
||||
* Then special-case all 'real' utf chars, by checking for the few
*   punctuation ones we're interested in (put them in a map). Then
*   classify all other non-ascii as letter, and use the current method
*   for chars < 127.
|
||||
*/
|
||||
|
||||
// Character classes: we have three main groups, and then some chars
|
||||
@ -117,6 +123,9 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||
return true;
|
||||
}
|
||||
|
||||
// A routine called from different places in text_to_words(), to adjust
|
||||
// the current state and call the word handler. This is purely for
|
||||
// factoring common code from different places text_to_words()
|
||||
bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
|
||||
bool spanerase, int bp)
|
||||
{
|
||||
@ -126,19 +135,25 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
|
||||
word.erase();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Emit span or both word and span if they are different
|
||||
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
||||
return false;
|
||||
if (word.length() != span.length() && !fq)
|
||||
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
|
||||
return false;
|
||||
|
||||
// Adjust state
|
||||
wordpos++;
|
||||
if (spanerase)
|
||||
span.erase();
|
||||
word.erase();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* Splitting a text into terms to be indexed.
|
||||
* We basically emit a word every time we see a separator, but some chars are
|
||||
* handled specially so that special cases, i.e., c++ and dockes@okyz.com etc,
|
||||
* are handled properly,
|
||||
|
||||
@ -2,7 +2,9 @@ include ../mk/sysconf
|
||||
|
||||
BIGLIB = ../lib/librcl.a
|
||||
|
||||
PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse trexecmd
|
||||
PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse \
|
||||
trexecmd utf8iter
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o
|
||||
@ -51,5 +53,12 @@ wipedir : $(WIPEDIR_OBJS)
|
||||
trwipedir.o : ../utils/wipedir.cpp
|
||||
$(CXX) $(CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
|
||||
wipedir.cpp
|
||||
|
||||
# utf8iter test driver: trutf8iter.o is utf8iter.cpp compiled with
# -DTEST_UTF8ITER; the program is linked from it plus $(BIGLIB) and $(LIBICONV).
UTF8ITER_OBJS= trutf8iter.o $(BIGLIB)
|
||||
utf8iter : $(UTF8ITER_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV)
|
||||
trutf8iter.o : ../utils/utf8iter.cpp utf8iter.h
|
||||
$(CXX) $(CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
|
||||
utf8iter.cpp
|
||||
clean:
|
||||
rm -f *.o $(PROGS)
|
||||
|
||||
53
src/utils/utf8iter.cpp
Normal file
53
src/utils/utf8iter.cpp
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include "debuglog.h"
|
||||
using namespace std;
|
||||
|
||||
#include "utf8iter.h"
|
||||
#include "readfile.h"
|
||||
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 3) {
|
||||
cerr << "Usage: utf8iter infile outfile" << endl;
|
||||
exit(1);
|
||||
}
|
||||
const char *infile = argv[1];
|
||||
const char *outfile = argv[2];
|
||||
string in;
|
||||
string out;
|
||||
if (!file_to_string(infile, in)) {
|
||||
cerr << "Cant read file\n" << endl;
|
||||
exit(1);
|
||||
}
|
||||
Utf8Iter it(in);
|
||||
FILE *fp = fopen(outfile, "w");
|
||||
if (fp == 0) {
|
||||
fprintf(stderr, "cant create %s\n", outfile);
|
||||
exit(1);
|
||||
}
|
||||
while (!it.eof()) {
|
||||
unsigned int value = *it;
|
||||
it.appendchartostring(out);
|
||||
it++;
|
||||
fwrite(&value, 4, 1, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
if (it.error()) {
|
||||
fprintf(stderr, "Conversion error occurred\n");
|
||||
exit(1);
|
||||
}
|
||||
if (in != out) {
|
||||
fprintf(stderr, "error: out != in\n");
|
||||
exit(1);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
|
||||
90
src/utils/utf8iter.h
Normal file
90
src/utils/utf8iter.h
Normal file
@ -0,0 +1,90 @@
|
||||
#ifndef _UTF8ITER_H_INCLUDED_
|
||||
#define _UTF8ITER_H_INCLUDED_
|
||||
/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
/**
|
||||
* A small helper class to iterate over utf8 strings. This is not an
|
||||
* STL iterator and this is not well designed, just convenient for
|
||||
* some specific uses.
|
||||
*/
|
||||
class Utf8Iter {
    /** Byte length of the character at pos, or 0 when not computed yet. */
    unsigned int cl;
    /** String being walked. Held by reference: it must outlive the iterator. */
    const std::string &s;
    /** Byte offset of the current character inside s. */
    std::string::size_type pos;
    /** Set once a decoding error was met; the iterator then stays unusable. */
    bool bad;

    /**
     * Compute the byte length of the character starting at pos and store it
     * in cl.
     *
     * The lead byte selects a length from 1 to 6. The sequence is rejected
     * if the lead byte is not a valid one, if the string is too short to
     * hold the whole sequence, or if any following byte is not of the
     * 10xxxxxx continuation form.
     *
     * @return 0 on success, -1 on error (bad is then set and cl is 0).
     */
    int compute_cl() {
        cl = 0;
        if (bad)
            return -1;
        // Nothing to decode at or past the end: flag an error rather than
        // reading s[pos].
        if (pos >= s.length()) {
            bad = true;
            return -1;
        }

        const unsigned int lead = static_cast<unsigned char>(s[pos]);
        unsigned int len = 0;
        if (lead <= 0x7f) {
            len = 1;
        } else if (lead >= 0xc0 && lead <= 0xdf) {
            len = 2;
        } else if (lead >= 0xe0 && lead <= 0xef) {
            len = 3;
        } else if (lead >= 0xf0 && lead <= 0xf7) {
            len = 4;
        } else if (lead >= 0xf8 && lead <= 0xfb) {
            len = 5;
        } else if (lead >= 0xfc && lead <= 0xfd) {
            len = 6;
        }
        // len == 0: stray continuation byte (0x80-0xbf) or 0xfe/0xff.
        if (len == 0 || s.length() - pos < len) {
            bad = true;
            return -1;
        }
        // Without this check a malformed tail would be decoded into a
        // meaningless value instead of being reported.
        for (unsigned int i = 1; i < len; i++) {
            if ((static_cast<unsigned char>(s[pos + i]) & 0xc0) != 0x80) {
                bad = true;
                return -1;
            }
        }
        cl = len;
        return 0;
    }

public:
    /** Start iterating at the first byte of in. No copy of in is made. */
    Utf8Iter(const std::string &in) : cl(0), s(in), pos(0), bad(false) {}

    /**
     * operator* returns the ucs4 value as a machine integer.
     *
     * @return the decoded value of the current character, or
     *         (unsigned int)-1 on error (error() then returns true).
     */
    unsigned int operator*() {
        if (!cl && compute_cl() < 0)
            return (unsigned int)-1;
        // Here cl is in 1..6. Payload bits of the lead byte: 7 for a single
        // byte character, else 8 - (cl + 1).
        const unsigned int leadmask = (cl == 1) ? 0x7fu : (0xffu >> (cl + 1));
        unsigned int value = static_cast<unsigned char>(s[pos]) & leadmask;
        // Each continuation byte contributes its low 6 bits.
        for (unsigned int i = 1; i < cl; i++) {
            value = (value << 6) |
                (static_cast<unsigned char>(s[pos + i]) & 0x3fu);
        }
        return value;
    }

    /**
     * Move to the next character.
     *
     * @return the new byte offset, or string::npos on error.
     */
    std::string::size_type operator++(int) {
        if (bad || (!cl && compute_cl() < 0)) {
            return std::string::npos;
        }
        pos += cl;
        // Length of the next character is computed lazily.
        cl = 0;
        return pos;
    }

    /**
     * Append the bytes of the current character to out.
     *
     * @return false on error (out is left untouched), true otherwise.
     */
    bool appendchartostring(std::string &out) {
        if (bad || (!cl && compute_cl() < 0)) {
            return false;
        }
        out.append(s, pos, cl);
        return true;
    }

    /** True when the end of the string was reached or an error occurred. */
    bool eof() const {
        return bad || pos == s.length();
    }

    /** True once a decoding error was met. */
    bool error() const {
        return bad;
    }
};
|
||||
|
||||
|
||||
#endif /* _UTF8ITER_H_INCLUDED_ */
|
||||
Loading…
x
Reference in New Issue
Block a user