521 lines
13 KiB
C++
521 lines
13 KiB
C++
#ifndef lint
|
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.28 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes";
|
|
#endif
|
|
/*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
#ifndef TEST_TEXTSPLIT
|
|
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <set>
|
|
#include "textsplit.h"
|
|
#include "debuglog.h"
|
|
|
|
//#define UTF8ITER_CHECK
|
|
#include "utf8iter.h"
|
|
|
|
#include "uproplist.h"
|
|
|
|
#ifndef NO_NAMESPACES
|
|
using namespace std;
|
|
#endif /* NO_NAMESPACES */
|
|
|
|
/**
|
|
* Splitting a text into words. The code in this file works with utf-8
|
|
* in a semi-clean way (see uproplist.h)
|
|
*
|
|
* We are also not using capitalization information.
|
|
*
|
|
* There are a few remnants of the initial utf8-ignorant version in this file.
|
|
*/
|
|
|
|
// Character classes: we have three main groups, and then some chars
|
|
// are their own class because they want special handling.
|
|
//
|
|
// We have an array with 256 slots where we keep the character types.
|
|
// The array could be fully static, but we use a small function to fill it
|
|
// once.
|
|
// The array is actually a remnant of the original version which did no utf8
|
|
// It could be reduced to 128, because real (over 128) utf8 chars are now
|
|
// handled with a set holding all the separator values.
|
|
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
|
|
static int charclasses[256];
|
|
|
|
static set<unsigned int> unicign;
|
|
static void setcharclasses()
|
|
{
|
|
static int init = 0;
|
|
if (init)
|
|
return;
|
|
unsigned int i;
|
|
for (i = 0 ; i < 256 ; i ++)
|
|
charclasses[i] = LETTER;
|
|
|
|
for (i = 0; i < ' ';i++)
|
|
charclasses[i] = SPACE;
|
|
|
|
char digits[] = "0123456789";
|
|
for (i = 0; i < strlen(digits); i++)
|
|
charclasses[int(digits[i])] = DIGIT;
|
|
|
|
char blankspace[] = "\t\v\f ";
|
|
for (i = 0; i < strlen(blankspace); i++)
|
|
charclasses[int(blankspace[i])] = SPACE;
|
|
|
|
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;`";
|
|
for (i = 0; i < strlen(seps); i++)
|
|
charclasses[int(seps[i])] = SPACE;
|
|
|
|
char wild[] = "*?";
|
|
for (i = 0; i < strlen(wild); i++)
|
|
charclasses[int(wild[i])] = WILD;
|
|
|
|
char special[] = ".@+-,#'\n\r";
|
|
for (i = 0; i < strlen(special); i++)
|
|
charclasses[int(special[i])] = special[i];
|
|
|
|
for (i = 0; i < sizeof(uniign); i++)
|
|
unicign.insert(uniign[i]);
|
|
unicign.insert((unsigned int)-1);
|
|
|
|
init = 1;
|
|
}
|
|
|
|
// Do some checking (the kind which is simpler to do here than in the
|
|
// main loop), then send term to our client.
|
|
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|
int btstart, int btend)
|
|
{
|
|
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
|
|
|
unsigned int l = w.length();
|
|
if (l > 0 && l < (unsigned)maxWordLength) {
|
|
// 1 char word: we index single letters and digits, but
|
|
// nothing else. We might want to turn this into a test for a single
|
|
// utf8 character instead.
|
|
if (l == 1) {
|
|
int c = (int)w[0];
|
|
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
|
//cerr << "ERASING single letter term " << c << endl;
|
|
return true;
|
|
}
|
|
}
|
|
if (pos != prevpos || l != prevlen) {
|
|
bool ret = cb->takeword(w, pos, btstart, btend);
|
|
prevlen = w.length();
|
|
prevpos = pos;
|
|
return ret;
|
|
}
|
|
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* A routine called from different places in text_to_words(), to
|
|
* adjust the current state of the parser, and call the word
|
|
* handler/emitter. Emit and reset the current word, possibly emit the current
|
|
* span (if different). In query mode, words are not emitted, only final spans
|
|
*
|
|
* This is purely for factoring common code from different places
|
|
* text_to_words().
|
|
*
|
|
* @return true if ok, false for error. Splitting should stop in this case.
|
|
* @param spanerase Set if the current span is at its end. Reset it.
|
|
* @param bp The current BYTE position in the stream
|
|
*/
|
|
inline bool TextSplit::doemit(bool spanerase, int bp)
|
|
{
|
|
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
|
|
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
|
|
|
|
// Emit span. When splitting for query, we only emit final spans
|
|
bool spanemitted = false;
|
|
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
|
// Maybe trim at end These are chars that we would keep inside
|
|
// a span, but not at the end
|
|
while (span.length() > 0) {
|
|
switch (span[span.length()-1]) {
|
|
case '.':
|
|
case ',':
|
|
case '@':
|
|
case '\'':
|
|
span.resize(span.length()-1);
|
|
if (--bp < 0)
|
|
bp=0;
|
|
break;
|
|
default:
|
|
goto breakloop1;
|
|
}
|
|
}
|
|
breakloop1:
|
|
spanemitted = true;
|
|
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
|
return false;
|
|
}
|
|
|
|
// Emit word if different from span and not 'no words' mode
|
|
if (!(m_flags & TXTS_ONLYSPANS) && wordLen &&
|
|
(!spanemitted || wordLen != span.length())) {
|
|
string s(span.substr(wordStart, wordLen));
|
|
if (!emitterm(false, s, wordpos, bp-wordLen, bp))
|
|
return false;
|
|
}
|
|
|
|
// Adjust state
|
|
wordpos++;
|
|
wordLen = 0;
|
|
if (spanerase) {
|
|
span.erase();
|
|
spanpos = wordpos;
|
|
wordStart = 0;
|
|
} else {
|
|
wordStart = span.length();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static inline int whatcc(unsigned int c)
|
|
{
|
|
if (c <= 127) {
|
|
return charclasses[c];
|
|
} else {
|
|
if (unicign.find(c) != unicign.end())
|
|
return SPACE;
|
|
else
|
|
return LETTER;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Splitting a text into terms to be indexed.
|
|
* We basically emit a word every time we see a separator, but some chars are
|
|
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
|
|
* are handled properly,
|
|
*/
|
|
bool TextSplit::text_to_words(const string &in)
|
|
{
|
|
LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb,
|
|
in.substr(0,50).c_str()));
|
|
|
|
setcharclasses();
|
|
|
|
span.erase();
|
|
number = false;
|
|
wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0;
|
|
|
|
Utf8Iter it(in);
|
|
|
|
for (; !it.eof(); it++) {
|
|
unsigned int c = *it;
|
|
|
|
if (c == (unsigned int)-1) {
|
|
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
|
return false;
|
|
}
|
|
int cc = whatcc(c);
|
|
switch (cc) {
|
|
case LETTER:
|
|
wordLen += it.appendchartostring(span);
|
|
break;
|
|
|
|
case DIGIT:
|
|
if (wordLen == 0)
|
|
number = true;
|
|
wordLen += it.appendchartostring(span);
|
|
break;
|
|
|
|
case SPACE:
|
|
SPACE:
|
|
if (wordLen || span.length()) {
|
|
if (!doemit(true, it.getBpos()))
|
|
return false;
|
|
number = false;
|
|
}
|
|
break;
|
|
case WILD:
|
|
if (m_flags & TXTS_KEEPWILD)
|
|
goto NORMALCHAR;
|
|
else
|
|
goto SPACE;
|
|
break;
|
|
case '-':
|
|
case '+':
|
|
if (wordLen == 0) {
|
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
|
number = true;
|
|
wordLen += it.appendchartostring(span);
|
|
} else {
|
|
wordStart += it.appendchartostring(span);
|
|
}
|
|
} else {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
number = false;
|
|
wordStart += it.appendchartostring(span);
|
|
}
|
|
break;
|
|
case '.':
|
|
case ',':
|
|
if (number) {
|
|
// 132.jpg ?
|
|
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
|
goto SPACE;
|
|
wordLen += it.appendchartostring(span);
|
|
break;
|
|
} else {
|
|
// If . inside a word, keep it, else, this is whitespace.
|
|
// We also keep an initial '.' for catching .net, but this adds
|
|
// quite a few spurious terms !
|
|
// Another problem is that something like .x-errs
|
|
// will be split as .x-errs, x, errs but not x-errs
|
|
// A final comma in a word will be removed by doemit
|
|
if (cc == '.') {
|
|
if (wordLen) {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
// span length could have been adjusted by trimming
|
|
// inside doemit
|
|
if (span.length())
|
|
wordStart += it.appendchartostring(span);
|
|
break;
|
|
} else {
|
|
wordStart += it.appendchartostring(span);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
goto SPACE;
|
|
break;
|
|
case '@':
|
|
if (wordLen) {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
number = false;
|
|
}
|
|
wordStart += it.appendchartostring(span);
|
|
break;
|
|
case '\'':
|
|
// If in word, potential span: o'brien, else, this is more
|
|
// whitespace
|
|
if (wordLen) {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
number = false;
|
|
wordStart += it.appendchartostring(span);
|
|
}
|
|
break;
|
|
case '#':
|
|
// Keep it only at end of word ... Special case for c# you see...
|
|
if (wordLen > 0) {
|
|
int w = whatcc(it[it.getCpos()+1]);
|
|
if (w == SPACE || w == '\n' || w == '\r') {
|
|
wordLen += it.appendchartostring(span);
|
|
break;
|
|
}
|
|
}
|
|
goto SPACE;
|
|
break;
|
|
case '\n':
|
|
case '\r':
|
|
if (span.length() && span[span.length() - 1] == '-') {
|
|
// if '-' is the last char before end of line, just
|
|
// ignore the line change. This is the right thing to
|
|
// do almost always. We'd then need a way to check if
|
|
// the - was added as part of the word hyphenation, or was
|
|
// there in the first place, but this would need a dictionary.
|
|
// Also we'd need to check for a soft-hyphen and remove it,
|
|
// but this would require more utf-8 magic
|
|
} else {
|
|
// Handle like a normal separator
|
|
goto SPACE;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
NORMALCHAR:
|
|
wordLen += it.appendchartostring(span);
|
|
break;
|
|
}
|
|
}
|
|
if (wordLen || span.length()) {
|
|
if (!doemit(true, it.getBpos()))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Callback class for utility function usage
|
|
class utSplitterCB : public TextSplitCB {
|
|
public:
|
|
int wcnt;
|
|
utSplitterCB() : wcnt(0) {}
|
|
bool takeword(const string &term, int pos, int bs, int be) {
|
|
wcnt++;
|
|
return true;
|
|
}
|
|
};
|
|
|
|
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
|
{
|
|
utSplitterCB cb;
|
|
TextSplit splitter(&cb, flgs);
|
|
splitter.text_to_words(s);
|
|
return cb.wcnt;
|
|
}
|
|
|
|
#else // TEST driver ->
|
|
|
|
#include <unistd.h>
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <stdio.h>
|
|
|
|
#include <iostream>
|
|
|
|
#include "textsplit.h"
|
|
#include "readfile.h"
|
|
#include "debuglog.h"
|
|
|
|
using namespace std;
|
|
|
|
// A small class to hold state while splitting text
|
|
class mySplitterCB : public TextSplitCB {
|
|
int first;
|
|
bool nooutput;
|
|
public:
|
|
mySplitterCB() : first(1), nooutput(false) {}
|
|
void setNoOut(bool val) {nooutput = val;}
|
|
bool takeword(const string &term, int pos, int bs, int be) {
|
|
if (nooutput)
|
|
return true;
|
|
if (first) {
|
|
printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
|
first = 0;
|
|
}
|
|
printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
|
return true;
|
|
}
|
|
};
|
|
|
|
static string teststring =
|
|
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n"
|
|
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n"
|
|
"n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n"
|
|
"134 +134 -14 -1.5 +1.5 1.54e10 1,2 1,2e30\n"
|
|
"@^#$(#$(*)\n"
|
|
"192.168.4.1 one\n\rtwo\r"
|
|
"Debut-\ncontinue\n"
|
|
"[olala][ululu] (valeur) (23)\n"
|
|
"utf-8 ucs-4© \\nodef\n"
|
|
"','this\n"
|
|
" ,able,test-domain "
|
|
" -wl,--export-dynamic "
|
|
" ~/.xsession-errors "
|
|
;
|
|
static string teststring1 = " nouvel-an ";
|
|
|
|
static string thisprog;
|
|
|
|
static string usage =
|
|
" textsplit [opts] [filename]\n"
|
|
" -S: no output\n"
|
|
" -s: only spans\n"
|
|
" -w: only words\n"
|
|
" -k: preserve wildcards (?*)\n"
|
|
" -c: just count words\n"
|
|
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
|
" \n\n"
|
|
;
|
|
|
|
static void
|
|
Usage(void)
|
|
{
|
|
cerr << thisprog << ": usage:\n" << usage;
|
|
exit(1);
|
|
}
|
|
|
|
static int op_flags;
|
|
#define OPT_s 0x1
|
|
#define OPT_w 0x2
|
|
#define OPT_S 0x4
|
|
#define OPT_c 0x8
|
|
#define OPT_k 0x10
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
thisprog = argv[0];
|
|
argc--; argv++;
|
|
|
|
while (argc > 0 && **argv == '-') {
|
|
(*argv)++;
|
|
if (!(**argv))
|
|
/* Cas du "adb - core" */
|
|
Usage();
|
|
while (**argv)
|
|
switch (*(*argv)++) {
|
|
case 'c': op_flags |= OPT_c; break;
|
|
case 'k': op_flags |= OPT_k; break;
|
|
case 's': op_flags |= OPT_s; break;
|
|
case 'S': op_flags |= OPT_S; break;
|
|
case 'w': op_flags |= OPT_w; break;
|
|
default: Usage(); break;
|
|
}
|
|
argc--; argv++;
|
|
}
|
|
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
|
DebugLog::setfilename("stderr");
|
|
|
|
mySplitterCB cb;
|
|
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
|
|
|
if (op_flags&OPT_S)
|
|
cb.setNoOut(true);
|
|
|
|
if (op_flags&OPT_s)
|
|
flags = TextSplit::TXTS_ONLYSPANS;
|
|
else if (op_flags&OPT_w)
|
|
flags = TextSplit::TXTS_NOSPANS;
|
|
if (op_flags & OPT_k)
|
|
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
|
|
|
string data;
|
|
if (argc == 1) {
|
|
const char *filename = *argv++; argc--;
|
|
if (!strcmp(filename, "stdin")) {
|
|
char buf[1024];
|
|
int nread;
|
|
while ((nread = read(0, buf, 1024)) > 0) {
|
|
data.append(buf, nread);
|
|
}
|
|
} else if (!file_to_string(filename, data))
|
|
exit(1);
|
|
} else {
|
|
cout << endl << teststring << endl << endl;
|
|
data = teststring;
|
|
}
|
|
if (op_flags & OPT_c) {
|
|
int n = TextSplit::countWords(data, flags);
|
|
cout << n << " words" << endl;
|
|
} else {
|
|
TextSplit splitter(&cb, flags);
|
|
splitter.text_to_words(data);
|
|
}
|
|
}
|
|
#endif // TEST
|