Improve comments and move some util routines out of rcldb.cpp

This commit is contained in:
dockes 2006-04-11 06:49:45 +00:00
parent 9001129bf4
commit 930bdc870d
4 changed files with 115 additions and 112 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -32,29 +32,23 @@ using namespace std;
#endif /* NO_NAMESPACES */
/**
* Splitting a text into words. The code in this file will work with any
* charset where the basic separators (.,- etc.) have their ascii values
* (ok for UTF-8, ascii, iso8859* and quite a few others).
*
* We work in a way which would make it quite difficult to handle non-ascii
* separator chars (en-dash, etc.). We would then need to actually parse the
* utf-8 stream, and use a different way to classify the characters (instead
* of a 256 slot array).
* Splitting a text into words. The code in this file works with utf-8
* in a semi-clean way (see uproplist.h)
*
* We are also not using capitalization information.
*
* How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
* Then specialcase all 'real' utf chars, by checking for the few
* punctuation ones we're interested in (put them in a map). Then
* classify all other non-ascii as letter, and use the current method
* for chars < 127.
* There are a few remnants of the initial utf8-ignorant version in this file.
*/
// Character classes: we have three main groups, and then some chars
// are their own class because they want special handling.
//
// We have an array with 256 slots where we keep the character types.
// The array could be fully static, but we use a small function to fill it
// once.
// The array is actually a remnant of the original version which did no utf8
// It could be reduced to 128, because real (over 128) utf8 chars are now
// handled with a set holding all the separator values.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
static int charclasses[256];
@ -87,11 +81,11 @@ static void setcharclasses()
for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i];
init = 1;
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
for (i = 0; i < sizeof(uniign); i++)
unicign.insert(uniign[i]);
unicign.insert((unsigned int)-1);
init = 1;
}
// Do some checking (the kind which is simpler to do here than in the
@ -103,9 +97,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
unsigned int l = w.length();
if (l > 0 && l < (unsigned)maxWordLength) {
// 1 char word: we index single letters and digits, but
// nothing else. We might want to turn this into a test for a single
// utf8 character instead.
if (l == 1) {
// 1 char word: we index single letters and digits, but
// nothing else
int c = (int)w[0];
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
//cerr << "ERASING single letter term " << c << endl;
@ -227,6 +222,18 @@ bool TextSplit::text_to_words(const string &in)
}
int cc = whatcc(c);
switch (cc) {
case LETTER:
word += it;
span += it;
break;
case DIGIT:
if (word.length() == 0)
number = true;
word += it;
span += it;
break;
case SPACE:
SPACE:
if (word.length() || span.length()) {
@ -326,11 +333,7 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE;
}
break;
case DIGIT:
if (word.length() == 0)
number = true;
/* FALLTHROUGH */
case LETTER:
default:
word += it;
span += it;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.63 2006-04-07 13:10:22 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.64 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -399,14 +399,7 @@ bool dumb_string(const string &in, string &out)
if (in.empty())
return true;
string s1;
s1.reserve(in.length());
for (unsigned int i = 0; i < in.length(); i++) {
if (in[i] == '\n' || in[i] == '\r')
s1 += ' ';
else
s1 += in[i];
}
string s1 = neutchars(in, "\n\r");
if (!unacmaybefold(s1, out, "UTF-8", true)) {
LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
out.erase();
@ -416,58 +409,6 @@ bool dumb_string(const string &in, string &out)
return true;
}
/* From omindex direct */
/* Truncate a string to a given maxlength, avoiding cutting off midword
 * if reasonably possible. */
static string
truncate_to_word(string & input, string::size_type maxlen)
{
    // Short enough already: return unchanged.
    if (input.length() <= maxlen)
        return input;

    string res = input.substr(0, maxlen);
    // Characters we accept to truncate at.
    const char *SEPAR = " \t\n\r-:.;,/[]{}";
    string::size_type cut = res.find_last_of(SEPAR);
    // Original version only truncated at a separator if one was found
    // after maxlen/2. But we HAVE to truncate at a separator, else we'd
    // need to do utf8 stuff to avoid truncating at a multibyte char. In
    // any case, not finding a separator means that the text probably
    // has no value. Except probably for Asian languages, so we may want
    // to fix this one day.
    if (cut == string::npos) {
        res.erase();
    } else {
        res.erase(cut);
    }
    res += " ...";
    return res;
}
// Remove some chars and replace them with spaces.
// Note: consecutive delimiters collapse to a single space, leading
// delimiters are dropped, and every token (including the last) is
// followed by one space.
static string stripchars(const string &str, string delims)
{
    string res;
    string::size_type tokStart = 0, tokEnd = 0;
    for (;;) {
        // Locate start of next token; stop when only delimiters
        // (or nothing at all) remain.
        tokStart = str.find_first_not_of(delims, tokEnd);
        if (tokStart == string::npos)
            break;
        // Locate end of token: next delimiter or end of data.
        // Token cannot be empty here.
        tokEnd = str.find_first_of(delims, tokStart);
        if (tokEnd == string::npos) {
            res += str.substr(tokStart) + " ";
        } else {
            res += str.substr(tokStart, tokEnd - tokStart) + " ";
        }
    }
    return res;
}
// Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
@ -490,7 +431,7 @@ bool Db::add(const string &fn, const Doc &idoc,
} else {
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
}
doc.abstract = stripchars(doc.abstract, "\n\r");
doc.abstract = neutchars(doc.abstract, "\n\r");
doc.title = truncate_to_word(doc.title, 100);
doc.keywords = truncate_to_word(doc.keywords, 300);
@ -720,14 +661,20 @@ static string stemdbname(const string& basename, string lang)
return nm;
}
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerorutf(unsigned int c)
// Return list of existing stem db languages
list<string> Db::getStemLangs()
{
    list<string> langs;
    LOGDEB(("Db::getStemLang\n"));
    if (m_ndb == 0)
        return langs;
    // Stem dbs live in subdirectories whose names are built as
    // <stemdirstem><language>: glob for them, then strip the
    // directory path and the common prefix to keep the language only.
    langs = path_dirglob(m_ndb->m_basedir, stemdirstem + "*");
    for (list<string>::iterator lit = langs.begin();
         lit != langs.end(); lit++) {
        *lit = path_basename(*lit).substr(stemdirstem.length(),
                                          string::npos);
    }
    return langs;
}
/**
@ -747,6 +694,16 @@ bool Db::deleteStemDb(const string& lang)
return false;
}
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerascii(unsigned int c)
{
    // True (not stemmable) for anything below 'a' (digits, capitals,
    // punctuation), and for 7-bit values above 'z'. Bytes >= 128
    // (utf-8 multibyte components) are accepted as stemmable.
    return c < 'a' || (c > 'z' && c < 128);
}
/**
* Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db
@ -780,7 +737,7 @@ bool Db::createStemDb(const string& lang)
it != m_ndb->wdb.allterms_end(); it++) {
// If it has any non-lowercase 7bit char, cant be stemmable
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) {
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem;
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
// (*it).c_str(), *sit));
@ -886,26 +843,10 @@ bool Db::createStemDb(const string& lang)
return true;
}
list<string> Db::getStemLangs()
{
    // Return the list of languages for which a stem expansion db
    // exists, by globbing for the stem db subdirectories inside the
    // index directory.
    list<string> dirs;
    LOGDEB(("Db::getStemLang\n"));
    if (m_ndb == 0)
	return dirs;
    // Stem dbs are stored in subdirs named <stemdirstem><language>
    string pattern = stemdirstem + "*";
    dirs = path_dirglob(m_ndb->m_basedir, pattern);
    for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
	// Strip the directory path, then the common prefix, keeping
	// only the language code part of each name.
	*it = path_basename(*it);
	*it = it->substr(stemdirstem.length(), string::npos);
    }
    return dirs;
}
/**
* This is called at the end of an indexing session, to delete the
* documents for files that are no longer there. We also build the
* stem database while we are at it.
* documents for files that are no longer there.
*/
bool Db::purge()
{

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.15 2006-01-26 12:29:20 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -331,6 +331,58 @@ void trimstring(string &s, const char *ws)
s.replace(pos+1, string::npos, "");
}
// Remove some chars and replace them with spaces.
// Consecutive delimiters collapse into a single space, leading
// delimiters are removed, and no trailing space is appended when the
// input ends with a non-delimiter character.
string neutchars(const string &str, string delims)
{
    string res;
    string::size_type start = 0, stop = 0;
    for (;;) {
        // Locate start of next token; done if only delimiters
        // (or nothing at all) remain.
        start = str.find_first_not_of(delims, stop);
        if (start == string::npos)
            break;
        // Locate end of token. Token cannot be empty here.
        stop = str.find_first_of(delims, start);
        if (stop == string::npos) {
            // Token runs to end of input: no trailing space.
            res += str.substr(start);
        } else {
            res += str.substr(start, stop - start) + " ";
        }
    }
    return res;
}
/* Truncate a string to a given maxlength, avoiding cutting off midword
 * if reasonably possible. Note: we could also use textsplit, stopping when
 * we have enough, this would be cleanly utf8-aware but would remove
 * punctuation */
static const string SEPAR = " \t\n\r-:.;,/[]{}";
string truncate_to_word(string & input, string::size_type maxlen)
{
    // Nothing to do when the input already fits.
    if (input.length() <= maxlen)
        return input;

    string res = input.substr(0, maxlen);
    string::size_type cut = res.find_last_of(SEPAR);
    // The omindex version this came from only truncated at a separator
    // when one was found after maxlen/2. But we HAVE to truncate at a
    // separator, else we'd need to do utf8 stuff to avoid truncating at
    // a multibyte char. In any case, not finding a separator means that
    // the text probably has no value. Except probably for Asian
    // languages, so we may want to fix this one day.
    if (cut == string::npos) {
        res.erase();
    } else {
        res.erase(cut);
    }
    res += " ...";
    return res;
}
// Escape things that would look like markup
string escapeHtml(const string &in)
{

View File

@ -16,7 +16,7 @@
*/
#ifndef _SMALLUT_H_INCLUDED_
#define _SMALLUT_H_INCLUDED_
/* @(#$Id: smallut.h,v 1.15 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: smallut.h,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -59,6 +59,13 @@ extern void trimstring(string &s, const char *ws = " \t");
/** Escape things like < or & by turining them to entities */
extern string escapeHtml(const string &in);
/** Replace some chars with spaces (ie: newline chars). This is not utf8-aware
* so chars should only contain ascii */
extern string neutchars(const string &str, string chars);
/** Truncate a string to a given maxlength, avoiding cutting off midword
* if reasonably possible. */
extern string truncate_to_word(string &input, string::size_type maxlen);
class Chrono {
public: