From 3223d1245a29c9aa95abf97869fe4b39293b1d38 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 9 Oct 2009 13:57:33 +0000 Subject: [PATCH] process camelCase --- src/common/textsplit.cpp | 130 ++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 37 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 8be82ec7..fa14b49b 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -19,6 +19,8 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.38 2008-12-12 11:53:45 dockes E */ #ifndef TEST_TEXTSPLIT +#include + #include #include #include @@ -26,11 +28,8 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.38 2008-12-12 11:53:45 dockes E #include "textsplit.h" #include "debuglog.h" -#include "assert.h" - //#define UTF8ITER_CHECK #include "utf8iter.h" - #include "uproplist.h" #ifndef NO_NAMESPACES @@ -39,11 +38,7 @@ using namespace std; /** * Splitting a text into words. The code in this file works with utf-8 - * in a semi-clean way (see uproplist.h) - * - * We are also not using capitalization information. - * - * There are a few remnants of the initial utf8-ignorant version in this file. + * in a semi-clean way (see uproplist.h). Ascii still gets special treatment. */ // Character classes: we have three main groups, and then some chars @@ -52,37 +47,43 @@ using namespace std; // We have an array with 256 slots where we keep the character types. // The array could be fully static, but we use a small function to fill it // once. -// The array is actually a remnant of the original version which did no utf8 -// It could be reduced to 128, because real (over 128) utf8 chars are now -// handled with a set holding all the separator values. -enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259}; +// The array is actually a remnant of the original version which did no utf8. +// Only the lower 128 slots (0-127) are now used, but keep it at 256 +// because it makes some tests in the code simpler. 
+enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259, + A_ULETTER=260, A_LLETTER=261}; static int charclasses[256]; +// Real UTF-8 characters are handled with sets holding all characters +// with interesting properties. This is far from full-blown management +// of Unicode properties, but seems to do the job well enough in most +// common cases static set unicign; static set visiblewhite; + +// Set up character classes array and the additional unicode sets static void setcharclasses() { static int init = 0; if (init) return; unsigned int i; - for (i = 0 ; i < 256 ; i ++) - charclasses[i] = LETTER; - for (i = 0; i < ' ';i++) + // Set default value for all: SPACE + for (i = 0 ; i < 256 ; i ++) charclasses[i] = SPACE; char digits[] = "0123456789"; for (i = 0; i < strlen(digits); i++) charclasses[int(digits[i])] = DIGIT; - char blankspace[] = "\t\v\f "; - for (i = 0; i < strlen(blankspace); i++) - charclasses[int(blankspace[i])] = SPACE; + char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + for (i = 0; i < strlen(upper); i++) + charclasses[int(upper[i])] = A_ULETTER; - char seps[] = "!\"$%&()/<=>\\^{|}~:;`"; - for (i = 0; i < strlen(seps); i++) - charclasses[int(seps[i])] = SPACE; + char lower[] = "abcdefghijklmnopqrstuvwxyz"; + for (i = 0; i < strlen(lower); i++) + charclasses[int(lower[i])] = A_LLETTER; char wild[] = "*?[]"; for (i = 0; i < strlen(wild); i++) @@ -116,6 +117,9 @@ static inline int whatcc(unsigned int c) } } + +// CJK Unicode character detection: +// // 2E80..2EFF; CJK Radicals Supplement // 3000..303F; CJK Symbols and Punctuation // 3040..309F; Hiragana @@ -168,12 +172,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, unsigned int l = w.length(); if (l > 0 && l < (unsigned)m_maxWordLength) { - // 1 char word: we index single letters and digits, but - // nothing else. We might want to turn this into a test for a single - // utf8 character instead. + // 1 byte word: we index single ascii letters and digits, but + // nothing else. 
We might want to turn this into a test for a + // single utf8 character instead ? if (l == 1) { int c = (int)w[0]; - if (charclasses[c] != LETTER && charclasses[c] != DIGIT) { + if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER && + charclasses[c] != DIGIT) { //cerr << "ERASING single letter term " << c << endl; return true; } @@ -195,7 +200,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, * handler/emitter. Emit and reset the current word, possibly emit the current * span (if different). In query mode, words are not emitted, only final spans * - * This is purely for factoring common code from different places + * This is purely for factoring common code from different places in * text_to_words(). * * @return true if ok, false for error. Splitting should stop in this case. @@ -259,7 +264,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) /** * Splitting a text into terms to be indexed. * We basically emit a word every time we see a separator, but some chars are - * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, + * handled specially so that special cases, ie, c++ and jfd@recoll.com etc, * are handled properly, */ bool TextSplit::text_to_words(const string &in) @@ -310,10 +315,6 @@ bool TextSplit::text_to_words(const string &in) int cc = whatcc(c); switch (cc) { - case LETTER: - m_wordLen += it.appendchartostring(m_span); - break; - case DIGIT: if (m_wordLen == 0) m_inNumber = true; @@ -448,6 +449,41 @@ bool TextSplit::text_to_words(const string &in) } break; + // Camelcase handling. + // If we get uppercase ascii after lowercase ascii, emit word. + // This emits "camel" when hitting the 'C' of camelCase. The last span byte is indexed as unsigned char because it may be a UTF-8 byte above 127 + case A_ULETTER: + if (m_span.length() && + charclasses[(unsigned char)m_span[m_span.length() - 1]] == + A_LLETTER) { + if (m_wordLen) { + if (!doemit(false, it.getBpos())) + return false; + } + } + goto NORMALCHAR; + + // CamelCase handling. 
+ // If we get lowercase after uppercase and the current + // word length is bigger than one, it means we had a + // string of several upper-case letters: an + // acronym (readHTML) or a single letter article (ALittleHelp). + // Emit the uppercase word before proceeding. The last span byte is indexed as unsigned char because it may be a UTF-8 byte above 127 + case A_LLETTER: + if (m_span.length() && + charclasses[(unsigned char)m_span[m_span.length() - 1]] == + A_ULETTER && m_wordLen > 1) { + // Multiple upper-case letters. Single letter word + // or acronym which we want to emit now + m_wordLen--; + if (!doemit(false, it.getBpos())) + return false; + m_wordStart--; + m_wordLen++; + } + goto NORMALCHAR; + + default: NORMALCHAR: m_wordLen += it.appendchartostring(m_span); @@ -678,6 +714,7 @@ bool TextSplit::stringToStrings(const string &s, list &tokens) #include "textsplit.h" #include "readfile.h" #include "debuglog.h" +#include "transcode.h" using namespace std; @@ -711,6 +748,7 @@ static string teststring = "Debut-\ncontinue\n" "[olala][ululu] (valeur) (23)\n" "utf-8 ucs-4© \\nodef\n" + "A b C 2 . 
+" "','this\n" " ,able,test-domain " " -wl,--export-dynamic " @@ -727,6 +765,7 @@ static string usage = " -w: only words\n" " -k: preserve wildcards (?*)\n" " -c: just count words\n" + " -C [charset] : input charset\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" " \n\n" ; @@ -748,6 +787,7 @@ static int op_flags; int main(int argc, char **argv) { + string charset; thisprog = argv[0]; argc--; argv++; @@ -759,14 +799,16 @@ int main(int argc, char **argv) while (**argv) switch (*(*argv)++) { case 'c': op_flags |= OPT_c; break; - case 'C': op_flags |= OPT_C; break; + case 'C': op_flags |= OPT_C; if (argc < 2) Usage(); + charset = *(++argv); argc--; + goto b1; case 'k': op_flags |= OPT_k; break; case 's': op_flags |= OPT_s; break; case 'S': op_flags |= OPT_S; break; case 'w': op_flags |= OPT_w; break; default: Usage(); break; } - argc--; argv++; + b1: argc--; argv++; } DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::setfilename("stderr"); @@ -784,21 +826,35 @@ int main(int argc, char **argv) if (op_flags & OPT_k) flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); - string data; + string odata, reason; if (argc == 1) { const char *filename = *argv++; argc--; if (!strcmp(filename, "stdin")) { char buf[1024]; int nread; while ((nread = read(0, buf, 1024)) > 0) { - data.append(buf, nread); + odata.append(buf, nread); } - } else if (!file_to_string(filename, data)) + } else if (!file_to_string(filename, odata, &reason)) { + cerr << "Failed: file_to_string(" << filename << ") failed: " + << reason << endl; exit(1); + } } else { cout << endl << teststring << endl << endl; - data = teststring; + odata = teststring; } + string& data = odata; + string ndata; + if ((op_flags & OPT_C)) { + if (!transcode(odata, ndata, charset, "UTF-8")) { + cerr << "Failed: transcode error" << endl; + exit(1); + } else { + data = ndata; + } + } + if (op_flags & OPT_c) { int n = TextSplit::countWords(data, flags); cout << n << " words" << endl;