#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.11 2005-09-22 11:10:11 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_TEXTSPLIT

// NOTE(review): the names of the three system headers were lost in this copy
// of the file; they are restored here from the names the code uses (cerr,
// set, string). <cstring> is added because setcharclasses() calls strlen().
#include <cstring>
#include <iostream>
#include <set>
#include <string>

#include "textsplit.h"
#include "debuglog.h"
#include "utf8iter.h"
#include "uproplist.h"

using namespace std;

/**
 * Splitting a text into words. The code in this file will work with any
 * charset where the basic separators (.,- etc.) have their ascii values
 * (ok for UTF-8, ascii, iso8859* and quite a few others).
 *
 * We work in a way which would make it quite difficult to handle non-ascii
 * separator chars (en-dash, etc.). We would then need to actually parse the
 * utf-8 stream, and use a different way to classify the characters (instead
 * of a 256 slot array).
 *
 * We are also not using capitalization information.
 *
 * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
 * Then specialcase all 'real' utf chars, by checking for the few
 * punctuation ones we're interested in (put them in a map). Then
 * classify all other non-ascii as letter, and use the current method
 * for chars < 127.
 */

// Character classes: we have three main groups, and then some chars
// are their own class because they want special handling.
// We have an array with 256 slots where we keep the character types.
// The array could be fully static, but we use a small function to fill it
// once.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258}; static int charclasses[256]; static set unicign; static void setcharclasses() { static int init = 0; if (init) return; unsigned int i; for (i = 0 ; i < 256 ; i ++) charclasses[i] = LETTER; for (i = 0; i < ' ';i++) charclasses[i] = SPACE; char digits[] = "0123456789"; for (i = 0; i < strlen(digits); i++) charclasses[int(digits[i])] = DIGIT; char blankspace[] = "\t\v\f "; for (i = 0; i < strlen(blankspace); i++) charclasses[int(blankspace[i])] = SPACE; char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?"; for (i = 0; i < strlen(seps); i++) charclasses[int(seps[i])] = SPACE; char special[] = ".@+-,#'\n\r"; for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; init = 1; //for (i=0;i<256;i++)cerr< "< 0) { switch (w[w.length()-1]) { case '.': case ',': case '@': case '\'': w.erase(w.length()-1); break; default: goto breakloop1; } } breakloop1: // In addition, it doesn't make sense currently to keep ' at the beginning while (w.length() > 0) { switch (w[0]) { case ',': case '\'': w.erase(w.length()-1); break; default: goto breakloop2; } } breakloop2: // 1 char word: we index single letters, but nothing else if (w.length() == 1) { int c = (int)w[0]; if (charclasses[c] != LETTER && charclasses[c] != DIGIT) { //cerr << "ERASING single letter term " << c << endl; w.erase(); } } if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { if (w != prevterm || pos != prevpos) { bool ret = cb->takeword(w, pos, btstart, btend); prevterm = w; prevpos = pos; return ret; } } return true; } // A routine called from different places in text_to_words(), to adjust // the current state and call the word handler. 
This is purely for // factoring common code from different places text_to_words() bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos, bool spanerase, int bp) { #if 0 cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" << span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp << endl; #endif // When splitting for query, we only emit final spans if (fq && !spanerase) { wordpos++; word.erase(); return true; } // Emit span or both word and span if they are different if (!emitterm(true, span, spanpos, bp-span.length(), bp)) return false; if (word.length() != span.length() && !fq) if (!emitterm(false, word, wordpos, bp-word.length(), bp)) return false; // Adjust state wordpos++; if (spanerase) span.erase(); word.erase(); return true; } static inline int whatcc(unsigned int c) { int cc; if (c <= 127) { cc = charclasses[c]; } else { if (c == (unsigned int)-1) cc = SPACE; else if (unicign.find(c) != unicign.end()) cc = SPACE; else cc = LETTER; } return cc; } /** * Splitting a text into terms to be indexed. * We basically emit a word every time we see a separator, but some chars are * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, * are handled properly, */ bool TextSplit::text_to_words(const string &in) { LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb)); setcharclasses(); string span; // Current span. 
Might be jf.dockes@wanadoo.f string word; // Current word: no punctuation at all in there bool number = false; int wordpos = 0; // Term position of current word int spanpos = 0; // Term position of current span int charpos = 0; // Character position Utf8Iter it(in); for (; !it.eof(); it++, charpos++) { unsigned int c = *it; if (c == (unsigned int)-1) { LOGERR(("Textsplit: error occured while scanning UTF-8 string\n")); return false; } int cc = whatcc(c); switch (cc) { case SPACE: SPACE: if (word.length() || span.length()) { if (!doemit(word, wordpos, span, spanpos, true, it.getBpos())) return false; number = false; } spanpos = wordpos; span.erase(); break; case '-': case '+': if (word.length() == 0) { if (whatcc(it[charpos+1]) == DIGIT) { number = true; word += it; span += it; } else span += it; } else { if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) return false; number = false; span += it; } break; case '@': if (word.length()) { if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) return false; number = false; } else word += it; span += it; break; case '\'': if (word.length()) { if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) return false; number = false; span += it; } break; case '.': if (number) { word += it; } else { //cerr<<"Got . span: '"< 0 && (whatcc(it[charpos+1]) == SPACE || whatcc(it[charpos+1]) == '\n' || whatcc(it[charpos+1]) == '\r')) { word += it; span += it; } break; case '\n': case '\r': if (span.length() && span[span.length() - 1] == '-') { // if '-' is the last char before end of line, just // ignore the line change. This is the right thing to // do almost always. We'd then need a way to check if // the - was added as part of the word hyphenation, or was // there in the first place, but this would need a dictionary. 
} else { // Handle like a normal separator goto SPACE; } break; case LETTER: case DIGIT: default: if (word.length() == 0) { if (cc == DIGIT) number = true; else number = false; } word += it; span += it; break; } } if (word.length() || span.length()) { if (!doemit(word, wordpos, span, spanpos, true, it.getBpos())) return false; } return true; } #else // TEST driver -> #include #include #include #include #include #include "textsplit.h" #include "readfile.h" #include "debuglog.h" using namespace std; // A small class to hold state while splitting text class mySplitterCB : public TextSplitCB { int first; public: mySplitterCB() : first(0) {} bool takeword(const std::string &term, int pos, int bs, int be) { if (first) { printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); first = 0; } printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); return true; } }; static string teststring = "Un bout de texte \n" "normal. " "jfd@okyz.com " "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami " "a 134 +134 -14 -1.5 +1.5 1.54e10 a " "@^#$(#$(*) " "192.168.4.1 " "one\n\rtwo\nthree-\nfour " "[olala][ululu] " "'o'brien' " "utf-8 ucs-4©" "\n" ; static string teststring1 = "c++ "; static string thisprog; static string usage = " textsplit [opts] [filename]\n" " -q: query mode\n" " \n\n" ; static void Usage(void) { cerr << thisprog << ": usage:\n" << usage; exit(1); } static int op_flags; #define OPT_q 0x1 int main(int argc, char **argv) { thisprog = argv[0]; argc--; argv++; while (argc > 0 && **argv == '-') { (*argv)++; if (!(**argv)) /* Cas du "adb - core" */ Usage(); while (**argv) switch (*(*argv)++) { case 'q': op_flags |= OPT_q; break; default: Usage(); break; } argc--; argv++; } DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::setfilename("stderr"); mySplitterCB cb; TextSplit splitter(&cb, (op_flags&OPT_q) ? 
true: false); if (argc == 1) { string data; if (!file_to_string(*argv++, data)) exit(1); argc--; splitter.text_to_words(data); } else { cout << endl << teststring << endl << endl; splitter.text_to_words(teststring); } } #endif // TEST