comments and moving some util routines out of rcldb.cpp

2006-04-11 06:49:45 +00:00 · 2006-04-11 06:49:45 +00:00 · 930bdc870d
commit 930bdc870d
parent 9001129bf4
4 changed files with 115 additions and 112 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -32,29 +32,23 @@ using namespace std;
 #endif /* NO_NAMESPACES */
 /**
- * Splitting a text into words. The code in this file will work with any 
+ * Splitting a text into words. The code in this file works with utf-8
- * charset where the basic separators (.,- etc.) have their ascii values 
+ * in a semi-clean way (see uproplist.h)
 * (ok for UTF-8, ascii, iso8859* and quite a few others).
 *
 * We work in a way which would make it quite difficult to handle non-ascii
 * separator chars (en-dash, etc.). We would then need to actually parse the 
 * utf-8 stream, and use a different way to classify the characters (instead 
 * of a 256 slot array).
 *
 * We are also not using capitalization information.
 *
- * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
+ * There are a few remnants of the initial utf8-ignorant version in this file.
 * Then specialcase all 'real' utf chars, by checking for the few
 * punctuation ones we're interested in (put them in a map). Then
 * classify all other non-ascii as letter, and use the current method
 * for chars < 127.
 */
 // Character classes: we have three main groups, and then some chars
 // are their own class because they want special handling.
 // 
 // We have an array with 256 slots where we keep the character types. 
 // The array could be fully static, but we use a small function to fill it 
 // once.
 // The array is actually a remnant of the original version which did no utf8
 // It could be reduced to 128, because real (over 128) utf8 chars are now 
 // handled with a set holding all the separator values.
 enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
 static int charclasses[256];
@ -87,11 +81,11 @@ static void setcharclasses()
    for (i = 0; i  < strlen(special); i++)
 	charclasses[int(special[i])] = special[i];
    init = 1;
    //for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
    for (i = 0; i < sizeof(uniign); i++) 
 	unicign.insert(uniign[i]);
    unicign.insert((unsigned int)-1);
    init = 1;
 }
 // Do some checking (the kind which is simpler to do here than in the
@ -103,9 +97,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
    unsigned int l = w.length();
    if (l > 0 && l < (unsigned)maxWordLength) {
 	// 1 char word: we index single letters and digits, but
 	// nothing else. We might want to turn this into a test for a single
 	// utf8 character instead.
 	if (l == 1) {
 	    // 1 char word: we index single letters and digits, but
 	    // nothing else
 	    int c = (int)w[0];
 	    if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
 		//cerr << "ERASING single letter term " << c << endl;
@ -227,6 +222,18 @@ bool TextSplit::text_to_words(const string &in)
 	}
 	int cc = whatcc(c);
 	switch (cc) {
 	case LETTER:
 	    word += it;
 	    span += it;
 	    break;
 	case DIGIT:
 	    if (word.length() == 0)
 		number = true;
 	    word += it;
 	    span += it;
 	    break;
 	case SPACE:
 	SPACE:
 	    if (word.length() || span.length()) {
@ -326,11 +333,7 @@ bool TextSplit::text_to_words(const string &in)
 		goto SPACE;
 	    }
 	    break;
-	case DIGIT:
+
 	    if (word.length() == 0)
 		number = true;
 	    /* FALLTHROUGH */
 	case LETTER:
 	default:
 	    word += it;
 	    span += it;
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.63 2006-04-07 13:10:22 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.64 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -399,14 +399,7 @@ bool dumb_string(const string &in, string &out)
    if (in.empty())
 	return true;
-    string s1;
+    string s1 = neutchars(in, "\n\r");
    s1.reserve(in.length());
    for (unsigned int i = 0; i < in.length(); i++) {
 	if (in[i] == '\n' || in[i] == '\r')
 	    s1 += ' ';
 	else
 	    s1 += in[i];
    }
    if (!unacmaybefold(s1, out, "UTF-8", true)) {
 	LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
 	out.erase();
@ -416,58 +409,6 @@ bool dumb_string(const string &in, string &out)
    return true;
 }
 /* From omindex direct */
 /* Shorten a string so that it fits in maxlen bytes, cutting at a
  * separator character instead of in the middle of a word. A string
  * which already fits is returned as is. */
 static string
 truncate_to_word(string & input, string::size_type maxlen)
 {
     // Nothing to cut: return a plain copy
     if (input.length() <= maxlen)
         return input;
     const char *separators = " \t\n\r-:.;,/[]{}";
     string head(input, 0, maxlen);
     // The original omindex code only cut at a separator found beyond
     // maxlen/2. Here the cut is ALWAYS made at a separator, because any
     // other position could fall inside a multibyte utf8 character. When
     // no separator exists at all the text is dropped (it probably has no
     // value, except maybe for Asian languages: to be revisited one day).
     const string::size_type cut = head.find_last_of(separators);
     head.resize(cut == string::npos ? 0 : cut);
     head.append(" ...");
     return head;
 }
 // Replace every run of 'delims' characters found in 'str' with one space.
 // Leading delimiters disappear, and each token, the last one included,
 // is followed by a space in the result.
 static string stripchars(const string &str, string delims)
 {
     string result;
     string::size_type tokBeg = str.find_first_not_of(delims, 0);
     while (tokBeg != string::npos) {
         // End of the current token: next delimiter, or npos at end of input
         const string::size_type tokEnd = str.find_first_of(delims, tokBeg);
         // A count of npos copies everything up to the end of str
         const string::size_type count =
             (tokEnd == string::npos) ? string::npos : tokEnd - tokBeg;
         result.append(str, tokBeg, count);
         result += ' ';
         // Yields npos (and ends the loop) when only delimiters remain
         tokBeg = str.find_first_not_of(delims, tokEnd);
     }
     return result;
 }
 // Add document in internal form to the database: index the terms in
 // the title abstract and body and add special terms for file name,
 // date, mime type ... , create the document data record (more
@ -490,7 +431,7 @@ bool Db::add(const string &fn, const Doc &idoc,
    } else {
 	doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
    }
-    doc.abstract = stripchars(doc.abstract, "\n\r");
+    doc.abstract = neutchars(doc.abstract, "\n\r");
    doc.title = truncate_to_word(doc.title, 100);
    doc.keywords = truncate_to_word(doc.keywords, 300);
@ -720,14 +661,20 @@ static string stemdbname(const string& basename, string lang)
    return nm;
 }
-// Deciding if we try to stem the term. If it has numerals or capitals
+// Return list of existing stem db languages
-// we don't
+list<string> Db::getStemLangs()
 inline static bool
 p_notlowerorutf(unsigned int c)
 {
-    if (c < 'a' || (c > 'z' && c < 128))
+    list<string> dirs;
-	return true;
+    LOGDEB(("Db::getStemLang\n"));
-    return false;
+    if (m_ndb == 0)
 	return dirs;
    string pattern = stemdirstem + "*";
    dirs = path_dirglob(m_ndb->m_basedir, pattern);
    for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
 	*it = path_basename(*it);
 	*it = it->substr(stemdirstem.length(), string::npos);
    }
    return dirs;
 }
 /**
@ -747,6 +694,16 @@ bool Db::deleteStemDb(const string& lang)
    return false;
 }
 // Predicate for the "do we try to stem this term" decision: true for any
 // 7-bit value which is not a lowercase ascii letter (numerals, capitals,
 // punctuation...). Values of 128 and above are never flagged.
 inline static bool
 p_notlowerascii(unsigned int c)
 {
     const bool beforeLower = c < 'a';
     const bool otherSevenBit = c > 'z' && c < 128;
     return beforeLower || otherSevenBit;
 }
 /**
 * Create database of stem to parents associations for a given language.
 * We walk the list of all terms, stem them, and create another Xapian db
@ -780,7 +737,7 @@ bool Db::createStemDb(const string& lang)
 	     it != m_ndb->wdb.allterms_end(); it++) {
 	    // If it has any non-lowercase 7-bit char, it can't be stemmed
 	    string::iterator sit = (*it).begin(), eit = sit + (*it).length();
-	    if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) {
+	    if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
 		++nostem;
 		// LOGDEB(("stemskipped: [%s], because of 0x%x\n", 
 		// (*it).c_str(), *sit));
@ -886,26 +843,10 @@ bool Db::createStemDb(const string& lang)
    return true;
 }
 // Build a list of names by globbing m_ndb->m_basedir with the pattern
 // stemdirstem + "*", then keeping, for each match, the part of its base
 // name which follows the stemdirstem prefix (presumably the language of
 // a stem database). The list is empty when m_ndb is not set.
 list<string> Db::getStemLangs()
 {
     list<string> langs;
     LOGDEB(("Db::getStemLang\n"));
     if (m_ndb != 0) {
         string globpat = stemdirstem + "*";
         langs = path_dirglob(m_ndb->m_basedir, globpat);
         list<string>::iterator cur = langs.begin();
         while (cur != langs.end()) {
             // Strip the directory part, then the common name prefix
             string name = path_basename(*cur);
             *cur = name.substr(stemdirstem.length());
             ++cur;
         }
     }
     return langs;
 }
 /**
 * This is called at the end of an indexing session, to delete the
- *  documents for files that are no longer there. We also build the
+ * documents for files that are no longer there. 
 *  stem database while we are at it.
 */
 bool Db::purge()
 {
--- a/src/utils/smallut.cpp
+++ b/src/utils/smallut.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: smallut.cpp,v 1.15 2006-01-26 12:29:20 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: smallut.cpp,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -331,6 +331,58 @@ void trimstring(string &s, const char *ws)
 	s.replace(pos+1, string::npos, "");
 }
 // Neutralize the 'delims' characters found in 'str': each run located
 // between two tokens becomes a single space. Leading delimiters are
 // dropped, while a trailing run leaves one space at the end of the result.
 string neutchars(const string &str, string delims)
 {
     string result;
     string::size_type tokBeg = str.find_first_not_of(delims, 0);
     while (tokBeg != string::npos) {
         // The token runs up to the next delimiter, or to the end of input
         const string::size_type tokEnd = str.find_first_of(delims, tokBeg);
         if (tokEnd == string::npos) {
             // Last token with nothing after it: no separator is added
             result.append(str, tokBeg, string::npos);
             break;
         }
         result.append(str, tokBeg, tokEnd - tokBeg);
         result += ' ';
         // Jump over the delimiter run; npos means only delimiters remained
         tokBeg = str.find_first_not_of(delims, tokEnd);
     }
     return result;
 }
 /* Shorten 'input' so that it fits in 'maxlen' bytes, cutting at a
  * separator character rather than in the middle of a word. An input which
  * already fits is returned unchanged. Note: running textsplit and stopping
  * once enough text was gathered would be cleanly utf8-aware, but it would
  * also remove the punctuation. */
 static const string SEPAR = " \t\n\r-:.;,/[]{}";
 string truncate_to_word(string & input, string::size_type maxlen)
 {
     // Short enough already: hand back a plain copy
     if (input.length() <= maxlen)
         return input;
     string head(input, 0, maxlen);
     // The original version only cut at a separator found beyond maxlen/2.
     // We ALWAYS cut at a separator: cutting anywhere else would need utf8
     // processing to avoid splitting a multibyte character. A text without
     // any separator is dropped as probably worthless (Asian languages are
     // the likely exception, so this may need fixing one day).
     const string::size_type cut = head.find_last_of(SEPAR);
     if (cut != string::npos)
         head.resize(cut);
     else
         head.clear();
     head.append(" ...");
     return head;
 }
 // Escape things that would look like markup
 string escapeHtml(const string &in)
 {
--- a/src/utils/smallut.h
+++ b/src/utils/smallut.h
@ -16,7 +16,7 @@
 */
 #ifndef _SMALLUT_H_INCLUDED_
 #define _SMALLUT_H_INCLUDED_
-/* @(#$Id: smallut.h,v 1.15 2006-01-30 11:15:28 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: smallut.h,v 1.16 2006-04-11 06:49:45 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
@ -59,6 +59,13 @@ extern void trimstring(string &s, const char *ws = " \t");
 /** Escape things like < or & by turning them into entities */
 extern string escapeHtml(const string &in);
 /** Replace some chars with spaces (ie: newline chars). This is not utf8-aware
 *  so chars should only contain ascii */
 extern string neutchars(const string &str, string chars);
 /** Truncate a string to a given maxlength, avoiding cutting off midword
 *  if reasonably possible. */
 extern string truncate_to_word(string &input, string::size_type maxlen);
 class Chrono {
 public: