Improve comments and move some util routines out of rcldb.cpp

This commit is contained in:
dockes 2006-04-11 06:49:45 +00:00
parent 9001129bf4
commit 930bdc870d
4 changed files with 115 additions and 112 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -32,29 +32,23 @@ using namespace std;
#endif /* NO_NAMESPACES */
/**
* Splitting a text into words. The code in this file will work with any
* charset where the basic separators (.,- etc.) have their ascii values
* (ok for UTF-8, ascii, iso8859* and quite a few others).
*
* We work in a way which would make it quite difficult to handle non-ascii
* separator chars (en-dash, etc.). We would then need to actually parse the
* utf-8 stream, and use a different way to classify the characters (instead
* of a 256 slot array).
* Splitting a text into words. The code in this file works with utf-8
* in a semi-clean way (see uproplist.h)
*
* We are also not using capitalization information.
*
* How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
* Then specialcase all 'real' utf chars, by checking for the few
* punctuation ones we're interested in (put them in a map). Then
* classify all other non-ascii as letter, and use the current method
* for chars < 127.
* There are a few remnants of the initial utf8-ignorant version in this file.
*/
// Character classes: we have three main groups, and then some chars
// are their own class because they want special handling.
//
// We have an array with 256 slots where we keep the character types.
// The array could be fully static, but we use a small function to fill it
// once.
// The array is actually a remnant of the original version which did no utf8
// It could be reduced to 128, because real (over 128) utf8 chars are now
// handled with a set holding all the separator values.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
static int charclasses[256];
@ -87,11 +81,11 @@ static void setcharclasses()
for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i];
init = 1;
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
for (i = 0; i < sizeof(uniign); i++)
unicign.insert(uniign[i]);
unicign.insert((unsigned int)-1);
init = 1;
}
// Do some checking (the kind which is simpler to do here than in the
@ -103,9 +97,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
unsigned int l = w.length();
if (l > 0 && l < (unsigned)maxWordLength) {
// 1 char word: we index single letters and digits, but
// nothing else. We might want to turn this into a test for a single
// utf8 character instead.
if (l == 1) {
// 1 char word: we index single letters and digits, but
// nothing else
int c = (int)w[0];
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
//cerr << "ERASING single letter term " << c << endl;
@ -227,6 +222,18 @@ bool TextSplit::text_to_words(const string &in)
}
int cc = whatcc(c);
switch (cc) {
case LETTER:
word += it;
span += it;
break;
case DIGIT:
if (word.length() == 0)
number = true;
word += it;
span += it;
break;
case SPACE:
SPACE:
if (word.length() || span.length()) {
@ -326,11 +333,7 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE;
}
break;
case DIGIT:
if (word.length() == 0)
number = true;
/* FALLTHROUGH */
case LETTER:
default:
word += it;
span += it;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.63 2006-04-07 13:10:22 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.64 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -399,14 +399,7 @@ bool dumb_string(const string &in, string &out)
if (in.empty())
return true;
string s1;
s1.reserve(in.length());
for (unsigned int i = 0; i < in.length(); i++) {
if (in[i] == '\n' || in[i] == '\r')
s1 += ' ';
else
s1 += in[i];
}
string s1 = neutchars(in, "\n\r");
if (!unacmaybefold(s1, out, "UTF-8", true)) {
LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
out.erase();
@ -416,58 +409,6 @@ bool dumb_string(const string &in, string &out)
return true;
}
/* From omindex direct */
/* Truncate a string to a given maxlength, avoiding cutting off midword
 * if reasonably possible. */
static string
truncate_to_word(string & input, string::size_type maxlen)
{
    // Short enough already: return unchanged.
    if (input.length() <= maxlen)
        return input;

    string res = input.substr(0, maxlen);
    // Characters we accept to truncate at.
    const char *SEPAR = " \t\n\r-:.;,/[]{}";
    string::size_type cut = res.find_last_of(SEPAR);
    // Original version only truncated at a separator if one was found
    // after maxlen/2. But we HAVE to truncate at a separator, else we'd
    // need to do utf8 stuff to avoid truncating at a multibyte char. In
    // any case, not finding a separator means that the text probably
    // has no value. Except probably for Asian languages, so we may want
    // to fix this one day.
    if (cut == string::npos) {
        res.erase();
    } else {
        res.erase(cut);
    }
    res += " ...";
    return res;
}
// Remove some chars and replace them with spaces.
// Note: consecutive delimiters collapse to a single space, leading
// delimiters are dropped, and every token (including the last) is
// followed by one space.
static string stripchars(const string &str, string delims)
{
    string res;
    string::size_type tokStart = 0, tokEnd = 0;
    for (;;) {
        // Locate start of next token; stop when only delimiters
        // (or nothing at all) remain.
        tokStart = str.find_first_not_of(delims, tokEnd);
        if (tokStart == string::npos)
            break;
        // Locate end of token: next delimiter or end of data.
        // Token cannot be empty here.
        tokEnd = str.find_first_of(delims, tokStart);
        if (tokEnd == string::npos) {
            res += str.substr(tokStart) + " ";
        } else {
            res += str.substr(tokStart, tokEnd - tokStart) + " ";
        }
    }
    return res;
}
// Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
@ -490,7 +431,7 @@ bool Db::add(const string &fn, const Doc &idoc,
} else {
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
}
doc.abstract = stripchars(doc.abstract, "\n\r");
doc.abstract = neutchars(doc.abstract, "\n\r");
doc.title = truncate_to_word(doc.title, 100);
doc.keywords = truncate_to_word(doc.keywords, 300);
@ -720,14 +661,20 @@ static string stemdbname(const string& basename, string lang)
return nm;
}
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerorutf(unsigned int c)
// Return list of existing stem db languages
list<string> Db::getStemLangs()
{
    list<string> langs;
    LOGDEB(("Db::getStemLang\n"));
    if (m_ndb == 0)
        return langs;
    // Stem dbs live in subdirectories whose names are built as
    // <stemdirstem><language>: glob for them, then strip the
    // directory path and the common prefix to keep the language only.
    langs = path_dirglob(m_ndb->m_basedir, stemdirstem + "*");
    for (list<string>::iterator lit = langs.begin();
         lit != langs.end(); lit++) {
        *lit = path_basename(*lit).substr(stemdirstem.length(),
                                          string::npos);
    }
    return langs;
}
/**
@ -747,6 +694,16 @@ bool Db::deleteStemDb(const string& lang)
return false;
}
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerascii(unsigned int c)
{
    // True (not stemmable) for anything below 'a' (digits, capitals,
    // punctuation), and for 7-bit values above 'z'. Bytes >= 128
    // (utf-8 multibyte components) are accepted as stemmable.
    return c < 'a' || (c > 'z' && c < 128);
}
/**
* Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db
@ -780,7 +737,7 @@ bool Db::createStemDb(const string& lang)
it != m_ndb->wdb.allterms_end(); it++) {
// If it has any non-lowercase 7bit char, cant be stemmable
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) {
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem;
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
// (*it).c_str(), *sit));
@ -886,26 +843,10 @@ bool Db::createStemDb(const string& lang)
return true;
}
list<string> Db::getStemLangs()
{
    // Return the list of languages for which a stem expansion db
    // exists, by globbing for the stem db subdirectories inside the
    // index directory.
    list<string> dirs;
    LOGDEB(("Db::getStemLang\n"));
    if (m_ndb == 0)
	return dirs;
    // Stem dbs are stored in subdirs named <stemdirstem><language>
    string pattern = stemdirstem + "*";
    dirs = path_dirglob(m_ndb->m_basedir, pattern);
    for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
	// Strip the directory path, then the common prefix, keeping
	// only the language code part of each name.
	*it = path_basename(*it);
	*it = it->substr(stemdirstem.length(), string::npos);
    }
    return dirs;
}
/**
* This is called at the end of an indexing session, to delete the
* documents for files that are no longer there. We also build the
* stem database while we are at it.
* documents for files that are no longer there.
*/
bool Db::purge()
{

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.15 2006-01-26 12:29:20 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -331,6 +331,58 @@ void trimstring(string &s, const char *ws)
s.replace(pos+1, string::npos, "");
}
// Remove some chars and replace them with spaces.
// Consecutive delimiters collapse into a single space, leading
// delimiters are removed, and no trailing space is appended when the
// input ends with a non-delimiter character.
string neutchars(const string &str, string delims)
{
    string res;
    string::size_type start = 0, stop = 0;
    for (;;) {
        // Locate start of next token; done if only delimiters
        // (or nothing at all) remain.
        start = str.find_first_not_of(delims, stop);
        if (start == string::npos)
            break;
        // Locate end of token. Token cannot be empty here.
        stop = str.find_first_of(delims, start);
        if (stop == string::npos) {
            // Token runs to end of input: no trailing space.
            res += str.substr(start);
        } else {
            res += str.substr(start, stop - start) + " ";
        }
    }
    return res;
}
/* Truncate a string to a given maxlength, avoiding cutting off midword
 * if reasonably possible. Note: we could also use textsplit, stopping when
 * we have enough, this would be cleanly utf8-aware but would remove
 * punctuation */
static const string SEPAR = " \t\n\r-:.;,/[]{}";
string truncate_to_word(string & input, string::size_type maxlen)
{
    // Nothing to do when the input already fits.
    if (input.length() <= maxlen)
        return input;

    string res = input.substr(0, maxlen);
    string::size_type cut = res.find_last_of(SEPAR);
    // The omindex version this came from only truncated at a separator
    // when one was found after maxlen/2. But we HAVE to truncate at a
    // separator, else we'd need to do utf8 stuff to avoid truncating at
    // a multibyte char. In any case, not finding a separator means that
    // the text probably has no value. Except probably for Asian
    // languages, so we may want to fix this one day.
    if (cut == string::npos) {
        res.erase();
    } else {
        res.erase(cut);
    }
    res += " ...";
    return res;
}
// Escape things that would look like markup
string escapeHtml(const string &in)
{

View File

@ -16,7 +16,7 @@
*/
#ifndef _SMALLUT_H_INCLUDED_
#define _SMALLUT_H_INCLUDED_
/* @(#$Id: smallut.h,v 1.15 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: smallut.h,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -59,6 +59,13 @@ extern void trimstring(string &s, const char *ws = " \t");
/** Escape things like < or & by turining them to entities */
extern string escapeHtml(const string &in);
/** Replace some chars with spaces (ie: newline chars). This is not utf8-aware
* so chars should only contain ascii */
extern string neutchars(const string &str, string chars);
/** Truncate a string to a given maxlength, avoiding cutting off midword
* if reasonably possible. */
extern string truncate_to_word(string &input, string::size_type maxlen);
class Chrono {
public: