#ifndef lint static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.28 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef TEST_TEXTSPLIT #include #include #include #include "textsplit.h" #include "debuglog.h" //#define UTF8ITER_CHECK #include "utf8iter.h" #include "uproplist.h" #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ /** * Splitting a text into words. The code in this file works with utf-8 * in a semi-clean way (see uproplist.h) * * We are also not using capitalization information. * * There are a few remnants of the initial utf8-ignorant version in this file. */ // Character classes: we have three main groups, and then some chars // are their own class because they want special handling. // // We have an array with 256 slots where we keep the character types. // The array could be fully static, but we use a small function to fill it // once. // The array is actually a remnant of the original version which did no utf8 // It could be reduced to 128, because real (over 128) utf8 chars are now // handled with a set holding all the separator values. enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259}; static int charclasses[256]; static set unicign; static void setcharclasses() { static int init = 0; if (init) return; unsigned int i; for (i = 0 ; i < 256 ; i ++) charclasses[i] = LETTER; for (i = 0; i < ' ';i++) charclasses[i] = SPACE; char digits[] = "0123456789"; for (i = 0; i < strlen(digits); i++) charclasses[int(digits[i])] = DIGIT; char blankspace[] = "\t\v\f "; for (i = 0; i < strlen(blankspace); i++) charclasses[int(blankspace[i])] = SPACE; char seps[] = "!\"$%&()/<=>[\\]^{|}~:;`"; for (i = 0; i < strlen(seps); i++) charclasses[int(seps[i])] = SPACE; char wild[] = "*?"; for (i = 0; i < strlen(wild); i++) charclasses[int(wild[i])] = WILD; char special[] = ".@+-,#'\n\r"; for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; for (i = 0; i < sizeof(uniign); i++) unicign.insert(uniign[i]); unicign.insert((unsigned int)-1); init = 1; } // Do some checking (the kind which is simpler to do here than in the // main loop), then send term to our client. inline bool TextSplit::emitterm(bool isspan, string &w, int pos, int btstart, int btend) { LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos)); unsigned int l = w.length(); if (l > 0 && l < (unsigned)maxWordLength) { // 1 char word: we index single letters and digits, but // nothing else. We might want to turn this into a test for a single // utf8 character instead. if (l == 1) { int c = (int)w[0]; if (charclasses[c] != LETTER && charclasses[c] != DIGIT) { //cerr << "ERASING single letter term " << c << endl; return true; } } if (pos != prevpos || l != prevlen) { bool ret = cb->takeword(w, pos, btstart, btend); prevlen = w.length(); prevpos = pos; return ret; } LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos)); } return true; } /** * A routine called from different places in text_to_words(), to * adjust the current state of the parser, and call the word * handler/emitter. Emit and reset the current word, possibly emit the current * span (if different). In query mode, words are not emitted, only final spans * * This is purely for factoring common code from different places * text_to_words(). * * @return true if ok, false for error. Splitting should stop in this case. * @param spanerase Set if the current span is at its end. Reset it. * @param bp The current BYTE position in the stream */ inline bool TextSplit::doemit(bool spanerase, int bp) { LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n", span.c_str(), spanpos, wordStart, wordLen, spanerase, bp)); // Emit span. When splitting for query, we only emit final spans bool spanemitted = false; if (spanerase && !(m_flags & TXTS_NOSPANS)) { // Maybe trim at end These are chars that we would keep inside // a span, but not at the end while (span.length() > 0) { switch (span[span.length()-1]) { case '.': case ',': case '@': case '\'': span.resize(span.length()-1); if (--bp < 0) bp=0; break; default: goto breakloop1; } } breakloop1: spanemitted = true; if (!emitterm(true, span, spanpos, bp-span.length(), bp)) return false; } // Emit word if different from span and not 'no words' mode if (!(m_flags & TXTS_ONLYSPANS) && wordLen && (!spanemitted || wordLen != span.length())) { string s(span.substr(wordStart, wordLen)); if (!emitterm(false, s, wordpos, bp-wordLen, bp)) return false; } // Adjust state wordpos++; wordLen = 0; if (spanerase) { span.erase(); spanpos = wordpos; wordStart = 0; } else { wordStart = span.length(); } return true; } static inline int whatcc(unsigned int c) { if (c <= 127) { return charclasses[c]; } else { if (unicign.find(c) != unicign.end()) return SPACE; else return LETTER; } } /** * Splitting a text into terms to be indexed. * We basically emit a word every time we see a separator, but some chars are * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, * are handled properly, */ bool TextSplit::text_to_words(const string &in) { LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb, in.substr(0,50).c_str())); setcharclasses(); span.erase(); number = false; wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0; Utf8Iter it(in); for (; !it.eof(); it++) { unsigned int c = *it; if (c == (unsigned int)-1) { LOGERR(("Textsplit: error occured while scanning UTF-8 string\n")); return false; } int cc = whatcc(c); switch (cc) { case LETTER: wordLen += it.appendchartostring(span); break; case DIGIT: if (wordLen == 0) number = true; wordLen += it.appendchartostring(span); break; case SPACE: SPACE: if (wordLen || span.length()) { if (!doemit(true, it.getBpos())) return false; number = false; } break; case WILD: if (m_flags & TXTS_KEEPWILD) goto NORMALCHAR; else goto SPACE; break; case '-': case '+': if (wordLen == 0) { if (whatcc(it[it.getCpos()+1]) == DIGIT) { number = true; wordLen += it.appendchartostring(span); } else { wordStart += it.appendchartostring(span); } } else { if (!doemit(false, it.getBpos())) return false; number = false; wordStart += it.appendchartostring(span); } break; case '.': case ',': if (number) { // 132.jpg ? if (whatcc(it[it.getCpos()+1]) != DIGIT) goto SPACE; wordLen += it.appendchartostring(span); break; } else { // If . inside a word, keep it, else, this is whitespace. // We also keep an initial '.' for catching .net, but this adds // quite a few spurious terms ! // Another problem is that something like .x-errs // will be split as .x-errs, x, errs but not x-errs // A final comma in a word will be removed by doemit if (cc == '.') { if (wordLen) { if (!doemit(false, it.getBpos())) return false; // span length could have been adjusted by trimming // inside doemit if (span.length()) wordStart += it.appendchartostring(span); break; } else { wordStart += it.appendchartostring(span); break; } } } goto SPACE; break; case '@': if (wordLen) { if (!doemit(false, it.getBpos())) return false; number = false; } wordStart += it.appendchartostring(span); break; case '\'': // If in word, potential span: o'brien, else, this is more // whitespace if (wordLen) { if (!doemit(false, it.getBpos())) return false; number = false; wordStart += it.appendchartostring(span); } break; case '#': // Keep it only at end of word ... Special case for c# you see... if (wordLen > 0) { int w = whatcc(it[it.getCpos()+1]); if (w == SPACE || w == '\n' || w == '\r') { wordLen += it.appendchartostring(span); break; } } goto SPACE; break; case '\n': case '\r': if (span.length() && span[span.length() - 1] == '-') { // if '-' is the last char before end of line, just // ignore the line change. This is the right thing to // do almost always. We'd then need a way to check if // the - was added as part of the word hyphenation, or was // there in the first place, but this would need a dictionary. // Also we'd need to check for a soft-hyphen and remove it, // but this would require more utf-8 magic } else { // Handle like a normal separator goto SPACE; } break; default: NORMALCHAR: wordLen += it.appendchartostring(span); break; } } if (wordLen || span.length()) { if (!doemit(true, it.getBpos())) return false; } return true; } // Callback class for utility function usage class utSplitterCB : public TextSplitCB { public: int wcnt; utSplitterCB() : wcnt(0) {} bool takeword(const string &term, int pos, int bs, int be) { wcnt++; return true; } }; int TextSplit::countWords(const string& s, TextSplit::Flags flgs) { utSplitterCB cb; TextSplit splitter(&cb, flgs); splitter.text_to_words(s); return cb.wcnt; } #else // TEST driver -> #include #include #include #include #include #include "textsplit.h" #include "readfile.h" #include "debuglog.h" using namespace std; // A small class to hold state while splitting text class mySplitterCB : public TextSplitCB { int first; bool nooutput; public: mySplitterCB() : first(1), nooutput(false) {} void setNoOut(bool val) {nooutput = val;} bool takeword(const string &term, int pos, int bs, int be) { if (nooutput) return true; if (first) { printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); first = 0; } printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); return true; } }; static string teststring = "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n" "\"Jean-Francois Dockes\" \n" "n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n" "134 +134 -14 -1.5 +1.5 1.54e10 1,2 1,2e30\n" "@^#$(#$(*)\n" "192.168.4.1 one\n\rtwo\r" "Debut-\ncontinue\n" "[olala][ululu] (valeur) (23)\n" "utf-8 ucs-4© \\nodef\n" "','this\n" " ,able,test-domain " " -wl,--export-dynamic " " ~/.xsession-errors " ; static string teststring1 = " nouvel-an "; static string thisprog; static string usage = " textsplit [opts] [filename]\n" " -S: no output\n" " -s: only spans\n" " -w: only words\n" " -k: preserve wildcards (?*)\n" " -c: just count words\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" " \n\n" ; static void Usage(void) { cerr << thisprog << ": usage:\n" << usage; exit(1); } static int op_flags; #define OPT_s 0x1 #define OPT_w 0x2 #define OPT_S 0x4 #define OPT_c 0x8 #define OPT_k 0x10 int main(int argc, char **argv) { thisprog = argv[0]; argc--; argv++; while (argc > 0 && **argv == '-') { (*argv)++; if (!(**argv)) /* Cas du "adb - core" */ Usage(); while (**argv) switch (*(*argv)++) { case 'c': op_flags |= OPT_c; break; case 'k': op_flags |= OPT_k; break; case 's': op_flags |= OPT_s; break; case 'S': op_flags |= OPT_S; break; case 'w': op_flags |= OPT_w; break; default: Usage(); break; } argc--; argv++; } DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::setfilename("stderr"); mySplitterCB cb; TextSplit::Flags flags = TextSplit::TXTS_NONE; if (op_flags&OPT_S) cb.setNoOut(true); if (op_flags&OPT_s) flags = TextSplit::TXTS_ONLYSPANS; else if (op_flags&OPT_w) flags = TextSplit::TXTS_NOSPANS; if (op_flags & OPT_k) flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); string data; if (argc == 1) { const char *filename = *argv++; argc--; if (!strcmp(filename, "stdin")) { char buf[1024]; int nread; while ((nread = read(0, buf, 1024)) > 0) { data.append(buf, nread); } } else if (!file_to_string(filename, data)) exit(1); } else { cout << endl << teststring << endl << endl; data = teststring; } if (op_flags & OPT_c) { int n = TextSplit::countWords(data, flags); cout << n << " words" << endl; } else { TextSplit splitter(&cb, flags); splitter.text_to_words(data); } } #endif // TEST