diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 53cd3462..2938f703 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -32,29 +32,23 @@ using namespace std; #endif /* NO_NAMESPACES */ /** - * Splitting a text into words. The code in this file will work with any - * charset where the basic separators (.,- etc.) have their ascii values - * (ok for UTF-8, ascii, iso8859* and quite a few others). - * - * We work in a way which would make it quite difficult to handle non-ascii - * separator chars (en-dash, etc.). We would then need to actually parse the - * utf-8 stream, and use a different way to classify the characters (instead - * of a 256 slot array). + * Splitting a text into words. The code in this file works with utf-8 + * in a semi-clean way (see uproplist.h) * * We are also not using capitalization information. * - * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first. - * Then specialcase all 'real' utf chars, by checking for the few - * punctuation ones we're interested in (put them in a map). Then - * classify all other non-ascii as letter, and use the current method - * for chars < 127. + * There are a few remnants of the initial utf8-ignorant version in this file. */ // Character classes: we have three main groups, and then some chars // are their own class because they want special handling. +// // We have an array with 256 slots where we keep the character types. // The array could be fully static, but we use a small function to fill it // once. +// The array is actually a remnant of the original version which did no utf8 +// It could be reduced to 128, because real (over 128) utf8 chars are now +// handled with a set holding all the separator values. enum CharClass {LETTER=256, SPACE=257, DIGIT=258}; static int charclasses[256]; @@ -87,11 +81,11 @@ static void setcharclasses() for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; - init = 1; - //for (i=0;i<256;i++)cerr< "< 0 && l < (unsigned)maxWordLength) { + // 1 char word: we index single letters and digits, but + // nothing else. We might want to turn this into a test for a single + // utf8 character instead. if (l == 1) { - // 1 char word: we index single letters and digits, but - // nothing else int c = (int)w[0]; if (charclasses[c] != LETTER && charclasses[c] != DIGIT) { //cerr << "ERASING single letter term " << c << endl; @@ -227,6 +222,18 @@ bool TextSplit::text_to_words(const string &in) } int cc = whatcc(c); switch (cc) { + case LETTER: + word += it; + span += it; + break; + + case DIGIT: + if (word.length() == 0) + number = true; + word += it; + span += it; + break; + case SPACE: SPACE: if (word.length() || span.length()) { @@ -326,11 +333,7 @@ bool TextSplit::text_to_words(const string &in) goto SPACE; } break; - case DIGIT: - if (word.length() == 0) - number = true; - /* FALLTHROUGH */ - case LETTER: + default: word += it; span += it; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 6664d566..79e3bdcd 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.63 2006-04-07 13:10:22 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.64 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -399,14 +399,7 @@ bool dumb_string(const string &in, string &out) if (in.empty()) return true; - string s1; - s1.reserve(in.length()); - for (unsigned int i = 0; i < in.length(); i++) { - if (in[i] == '\n' || in[i] == '\r') - s1 += ' '; - else - s1 += in[i]; - } + string s1 = neutchars(in, "\n\r"); if (!unacmaybefold(s1, out, "UTF-8", true)) { LOGERR(("dumb_string: unac failed for %s\n", in.c_str())); out.erase(); @@ -416,58 +409,6 @@ bool dumb_string(const string &in, string &out) return true; } -/* From omindex direct */ -/* Truncate a string to a given maxlength, avoiding cutting off midword - * if reasonably possible. */ -static string -truncate_to_word(string & input, string::size_type maxlen) -{ - string output; - if (input.length() <= maxlen) { - output = input; - } else { - output = input.substr(0, maxlen); - const char *SEPAR = " \t\n\r-:.;,/[]{}"; - string::size_type space = output.find_last_of(SEPAR); - // Original version only truncated at space if space was found after - // maxlen/2. But we HAVE to truncate at space, else we'd need to do - // utf8 stuff to avoid truncating at multibyte char. In any case, - // not finding space means that the text probably has no value. - // Except probably for Asian languages, so we may want to fix this - // one day - if (space == string::npos) { - output.erase(); - } else { - output.erase(space); - } - - output += " ..."; - } - return output; -} - -// Remove some chars and replace them with spaces -static string stripchars(const string &str, string delims) -{ - string out; - string::size_type startPos, pos; - - for (pos = 0;;) { - // Skip initial delims, break if this eats all. - if ((startPos = str.find_first_not_of(delims, pos)) == string::npos) - break; - // Find next delimiter or end of string (end of token) - pos = str.find_first_of(delims, startPos); - // Add token to the vector. Note: token cant be empty here - if (pos == string::npos) { - out += str.substr(startPos) + " "; - } else { - out += str.substr(startPos, pos - startPos) + " "; - } - } - return out; -} - // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, // date, mime type ... , create the document data record (more @@ -490,7 +431,7 @@ bool Db::add(const string &fn, const Doc &idoc, } else { doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE); } - doc.abstract = stripchars(doc.abstract, "\n\r"); + doc.abstract = neutchars(doc.abstract, "\n\r"); doc.title = truncate_to_word(doc.title, 100); doc.keywords = truncate_to_word(doc.keywords, 300); @@ -720,14 +661,20 @@ static string stemdbname(const string& basename, string lang) return nm; } -// Deciding if we try to stem the term. If it has numerals or capitals -// we don't -inline static bool -p_notlowerorutf(unsigned int c) +// Return list of existing stem db languages +list Db::getStemLangs() { - if (c < 'a' || (c > 'z' && c < 128)) - return true; - return false; + list dirs; + LOGDEB(("Db::getStemLang\n")); + if (m_ndb == 0) + return dirs; + string pattern = stemdirstem + "*"; + dirs = path_dirglob(m_ndb->m_basedir, pattern); + for (list::iterator it = dirs.begin(); it != dirs.end(); it++) { + *it = path_basename(*it); + *it = it->substr(stemdirstem.length(), string::npos); + } + return dirs; } /** @@ -747,6 +694,16 @@ bool Db::deleteStemDb(const string& lang) return false; } +// Deciding if we try to stem the term. If it has numerals or capitals +// we don't +inline static bool +p_notlowerascii(unsigned int c) +{ + if (c < 'a' || (c > 'z' && c < 128)) + return true; + return false; +} + /** * Create database of stem to parents associations for a given language. * We walk the list of all terms, stem them, and create another Xapian db @@ -780,7 +737,7 @@ bool Db::createStemDb(const string& lang) it != m_ndb->wdb.allterms_end(); it++) { // If it has any non-lowercase 7bit char, cant be stemmable string::iterator sit = (*it).begin(), eit = sit + (*it).length(); - if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) { + if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) { ++nostem; // LOGDEB(("stemskipped: [%s], because of 0x%x\n", // (*it).c_str(), *sit)); @@ -886,26 +843,10 @@ bool Db::createStemDb(const string& lang) return true; } -list Db::getStemLangs() -{ - list dirs; - LOGDEB(("Db::getStemLang\n")); - if (m_ndb == 0) - return dirs; - string pattern = stemdirstem + "*"; - dirs = path_dirglob(m_ndb->m_basedir, pattern); - for (list::iterator it = dirs.begin(); it != dirs.end(); it++) { - *it = path_basename(*it); - *it = it->substr(stemdirstem.length(), string::npos); - } - return dirs; -} - /** * This is called at the end of an indexing session, to delete the - * documents for files that are no longer there. We also build the - * stem database while we are at it. + * documents for files that are no longer there. */ bool Db::purge() { diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index 0d924a3a..4dcc1267 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: smallut.cpp,v 1.15 2006-01-26 12:29:20 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: smallut.cpp,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -331,6 +331,58 @@ void trimstring(string &s, const char *ws) s.replace(pos+1, string::npos, ""); } +// Remove some chars and replace them with spaces +string neutchars(const string &str, string delims) +{ + string out; + string::size_type startPos, pos; + + for (pos = 0;;) { + // Skip initial delims, break if this eats all. + if ((startPos = str.find_first_not_of(delims, pos)) == string::npos) + break; + // Find next delimiter or end of string (end of token) + pos = str.find_first_of(delims, startPos); + // Add token to the output. Note: token cant be empty here + if (pos == string::npos) { + out += str.substr(startPos); + } else { + out += str.substr(startPos, pos - startPos) + " "; + } + } + return out; +} + + +/* Truncate a string to a given maxlength, avoiding cutting off midword + * if reasonably possible. Note: we could also use textsplit, stopping when + * we have enough, this would be cleanly utf8-aware but would remove + * punctuation */ +static const string SEPAR = " \t\n\r-:.;,/[]{}"; +string truncate_to_word(string & input, string::size_type maxlen) +{ + string output; + if (input.length() <= maxlen) { + output = input; + } else { + output = input.substr(0, maxlen); + string::size_type space = output.find_last_of(SEPAR); + // Original version only truncated at space if space was found after + // maxlen/2. But we HAVE to truncate at space, else we'd need to do + // utf8 stuff to avoid truncating at multibyte char. In any case, + // not finding space means that the text probably has no value. + // Except probably for Asian languages, so we may want to fix this + // one day + if (space == string::npos) { + output.erase(); + } else { + output.erase(space); + } + output += " ..."; + } + return output; +} + // Escape things that would look like markup string escapeHtml(const string &in) { diff --git a/src/utils/smallut.h b/src/utils/smallut.h index f82ff755..f72091ca 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -16,7 +16,7 @@ */ #ifndef _SMALLUT_H_INCLUDED_ #define _SMALLUT_H_INCLUDED_ -/* @(#$Id: smallut.h,v 1.15 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: smallut.h,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -59,6 +59,13 @@ extern void trimstring(string &s, const char *ws = " \t"); /** Escape things like < or & by turining them to entities */ extern string escapeHtml(const string &in); +/** Replace some chars with spaces (ie: newline chars). This is not utf8-aware + * so chars should only contain ascii */ +extern string neutchars(const string &str, string chars); + +/** Truncate a string to a given maxlength, avoiding cutting off midword + * if reasonably possible. */ +extern string truncate_to_word(string &input, string::size_type maxlen); class Chrono { public: