comments and moving some util routines out of rcldb.cpp
This commit is contained in:
parent
9001129bf4
commit
930bdc870d
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -32,29 +32,23 @@ using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
/**
|
||||
* Splitting a text into words. The code in this file will work with any
|
||||
* charset where the basic separators (.,- etc.) have their ascii values
|
||||
* (ok for UTF-8, ascii, iso8859* and quite a few others).
|
||||
*
|
||||
* We work in a way which would make it quite difficult to handle non-ascii
|
||||
* separator chars (en-dash, etc.). We would then need to actually parse the
|
||||
* utf-8 stream, and use a different way to classify the characters (instead
|
||||
* of a 256 slot array).
|
||||
* Splitting a text into words. The code in this file works with utf-8
|
||||
* in a semi-clean way (see uproplist.h)
|
||||
*
|
||||
* We are also not using capitalization information.
|
||||
*
|
||||
* How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
|
||||
* Then specialcase all 'real' utf chars, by checking for the few
|
||||
* punctuation ones we're interested in (put them in a map). Then
|
||||
* classify all other non-ascii as letter, and use the current method
|
||||
* for chars < 127.
|
||||
* There are a few remnants of the initial utf8-ignorant version in this file.
|
||||
*/
|
||||
|
||||
// Character classes: we have three main groups, and then some chars
|
||||
// are their own class because they want special handling.
|
||||
//
|
||||
// We have an array with 256 slots where we keep the character types.
|
||||
// The array could be fully static, but we use a small function to fill it
|
||||
// once.
|
||||
// The array is actually a remnant of the original version which did no utf8 handling.
|
||||
// It could be reduced to 128, because real (over 128) utf8 chars are now
|
||||
// handled with a set holding all the separator values.
|
||||
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
||||
static int charclasses[256];
|
||||
|
||||
@ -87,11 +81,11 @@ static void setcharclasses()
|
||||
for (i = 0; i < strlen(special); i++)
|
||||
charclasses[int(special[i])] = special[i];
|
||||
|
||||
init = 1;
|
||||
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
|
||||
for (i = 0; i < sizeof(uniign); i++)
|
||||
unicign.insert(uniign[i]);
|
||||
unicign.insert((unsigned int)-1);
|
||||
|
||||
init = 1;
|
||||
}
|
||||
|
||||
// Do some checking (the kind which is simpler to do here than in the
|
||||
@ -103,9 +97,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||
|
||||
unsigned int l = w.length();
|
||||
if (l > 0 && l < (unsigned)maxWordLength) {
|
||||
// 1 char word: we index single letters and digits, but
|
||||
// nothing else. We might want to turn this into a test for a single
|
||||
// utf8 character instead.
|
||||
if (l == 1) {
|
||||
// 1 char word: we index single letters and digits, but
|
||||
// nothing else
|
||||
int c = (int)w[0];
|
||||
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
||||
//cerr << "ERASING single letter term " << c << endl;
|
||||
@ -227,6 +222,18 @@ bool TextSplit::text_to_words(const string &in)
|
||||
}
|
||||
int cc = whatcc(c);
|
||||
switch (cc) {
|
||||
case LETTER:
|
||||
word += it;
|
||||
span += it;
|
||||
break;
|
||||
|
||||
case DIGIT:
|
||||
if (word.length() == 0)
|
||||
number = true;
|
||||
word += it;
|
||||
span += it;
|
||||
break;
|
||||
|
||||
case SPACE:
|
||||
SPACE:
|
||||
if (word.length() || span.length()) {
|
||||
@ -326,11 +333,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
goto SPACE;
|
||||
}
|
||||
break;
|
||||
case DIGIT:
|
||||
if (word.length() == 0)
|
||||
number = true;
|
||||
/* FALLTHROUGH */
|
||||
case LETTER:
|
||||
|
||||
default:
|
||||
word += it;
|
||||
span += it;
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.63 2006-04-07 13:10:22 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.64 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -399,14 +399,7 @@ bool dumb_string(const string &in, string &out)
|
||||
if (in.empty())
|
||||
return true;
|
||||
|
||||
string s1;
|
||||
s1.reserve(in.length());
|
||||
for (unsigned int i = 0; i < in.length(); i++) {
|
||||
if (in[i] == '\n' || in[i] == '\r')
|
||||
s1 += ' ';
|
||||
else
|
||||
s1 += in[i];
|
||||
}
|
||||
string s1 = neutchars(in, "\n\r");
|
||||
if (!unacmaybefold(s1, out, "UTF-8", true)) {
|
||||
LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
|
||||
out.erase();
|
||||
@ -416,58 +409,6 @@ bool dumb_string(const string &in, string &out)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Adapted from omindex. */
/* Shorten input to at most maxlen bytes, cutting at the last separator
 * character found inside that window so that a word is not split.
 *
 * input:  text to shorten (not modified).
 * maxlen: maximum number of bytes kept before the cut is searched.
 * Returns input unchanged when it already fits; otherwise the part before
 * the last separator (possibly nothing) followed by " ...". */
static string
truncate_to_word(string & input, string::size_type maxlen)
{
    // Short enough already: hand back a plain copy
    if (input.length() <= maxlen)
        return input;

    static const char separators[] = " \t\n\r-:.;,/[]{}";
    const string clipped = input.substr(0, maxlen);
    const string::size_type boundary = clipped.find_last_of(separators);

    // The original omindex code only cut at a separator located after
    // maxlen/2. We always cut at a separator: all separators are single
    // ascii bytes, so this can never split a multibyte utf8 character.
    // Text with no separator at all is dropped as probably worthless
    // (this is unfair to Asian languages and may need revisiting).
    string result;
    if (boundary != string::npos)
        result = clipped.substr(0, boundary);

    result += " ...";
    return result;
}
|
||||
|
||||
// Split str on any character found in delims and glue the pieces back
// together, each piece followed by one space. Runs of delimiters count
// as a single separator and leading delimiters are dropped. Note that
// the result always ends with a space when at least one piece exists.
static string stripchars(const string &str, string delims)
{
    string result;
    string::size_type cursor = 0;

    // Each iteration jumps over a delimiter run, then copies one piece.
    // find_first_not_of yields npos once only delimiters (or nothing)
    // remain, which ends the loop.
    while ((cursor = str.find_first_not_of(delims, cursor)) != string::npos) {
        const string::size_type stop = str.find_first_of(delims, cursor);
        const string::size_type count =
            (stop == string::npos) ? string::npos : stop - cursor;
        result += str.substr(cursor, count);
        result += ' ';
        cursor = stop;
    }
    return result;
}
|
||||
|
||||
// Add document in internal form to the database: index the terms in
|
||||
// the title abstract and body and add special terms for file name,
|
||||
// date, mime type ... , create the document data record (more
|
||||
@ -490,7 +431,7 @@ bool Db::add(const string &fn, const Doc &idoc,
|
||||
} else {
|
||||
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
|
||||
}
|
||||
doc.abstract = stripchars(doc.abstract, "\n\r");
|
||||
doc.abstract = neutchars(doc.abstract, "\n\r");
|
||||
doc.title = truncate_to_word(doc.title, 100);
|
||||
doc.keywords = truncate_to_word(doc.keywords, 300);
|
||||
|
||||
@ -720,14 +661,20 @@ static string stemdbname(const string& basename, string lang)
|
||||
return nm;
|
||||
}
|
||||
|
||||
// Deciding if we try to stem the term. If it has numerals or capitals
|
||||
// we don't
|
||||
inline static bool
|
||||
p_notlowerorutf(unsigned int c)
|
||||
// Return list of existing stem db languages
|
||||
list<string> Db::getStemLangs()
|
||||
{
|
||||
if (c < 'a' || (c > 'z' && c < 128))
|
||||
return true;
|
||||
return false;
|
||||
list<string> dirs;
|
||||
LOGDEB(("Db::getStemLang\n"));
|
||||
if (m_ndb == 0)
|
||||
return dirs;
|
||||
string pattern = stemdirstem + "*";
|
||||
dirs = path_dirglob(m_ndb->m_basedir, pattern);
|
||||
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
||||
*it = path_basename(*it);
|
||||
*it = it->substr(stemdirstem.length(), string::npos);
|
||||
}
|
||||
return dirs;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -747,6 +694,16 @@ bool Db::deleteStemDb(const string& lang)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Predicate used to decide whether a term is worth stemming: true for
// any 7-bit value that is not a lowercase ascii letter (digits, capitals,
// punctuation...). Values of 128 and above are never flagged.
inline static bool
p_notlowerascii(unsigned int c)
{
    return c < 'a' || (c > 'z' && c < 128);
}
|
||||
|
||||
/**
|
||||
* Create database of stem to parents associations for a given language.
|
||||
* We walk the list of all terms, stem them, and create another Xapian db
|
||||
@ -780,7 +737,7 @@ bool Db::createStemDb(const string& lang)
|
||||
it != m_ndb->wdb.allterms_end(); it++) {
|
||||
// If it has any non-lowercase 7bit char, cant be stemmable
|
||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||
if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) {
|
||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||
++nostem;
|
||||
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
||||
// (*it).c_str(), *sit));
|
||||
@ -886,26 +843,10 @@ bool Db::createStemDb(const string& lang)
|
||||
return true;
|
||||
}
|
||||
|
||||
list<string> Db::getStemLangs()
|
||||
{
|
||||
list<string> dirs;
|
||||
LOGDEB(("Db::getStemLang\n"));
|
||||
if (m_ndb == 0)
|
||||
return dirs;
|
||||
string pattern = stemdirstem + "*";
|
||||
dirs = path_dirglob(m_ndb->m_basedir, pattern);
|
||||
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
||||
*it = path_basename(*it);
|
||||
*it = it->substr(stemdirstem.length(), string::npos);
|
||||
}
|
||||
return dirs;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This is called at the end of an indexing session, to delete the
|
||||
* documents for files that are no longer there. We also build the
|
||||
* stem database while we are at it.
|
||||
* documents for files that are no longer there.
|
||||
*/
|
||||
bool Db::purge()
|
||||
{
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.15 2006-01-26 12:29:20 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -331,6 +331,58 @@ void trimstring(string &s, const char *ws)
|
||||
s.replace(pos+1, string::npos, "");
|
||||
}
|
||||
|
||||
// Neutralize the characters listed in delims: every run of them inside
// str is replaced by a single space. Leading runs are dropped entirely,
// and a trailing run leaves one trailing space. The processing is
// byte-wise, so delims should only hold ascii characters.
string neutchars(const string &str, string delims)
{
    string result;
    string::size_type tokBegin = str.find_first_not_of(delims, 0);

    // tokBegin becomes npos as soon as only delimiters (or nothing) remain
    while (tokBegin != string::npos) {
        const string::size_type tokEnd = str.find_first_of(delims, tokBegin);
        if (tokEnd == string::npos) {
            // Final token reaches the end of str: no space is appended
            result += str.substr(tokBegin);
            break;
        }
        // Token followed by at least one delimiter: copy it, add one space
        result += str.substr(tokBegin, tokEnd - tokBegin);
        result += ' ';
        tokBegin = str.find_first_not_of(delims, tokEnd);
    }
    return result;
}
|
||||
|
||||
|
||||
/* Shorten a string to a maximum length while trying not to cut a word
 * in the middle. An alternative would be to run the text through
 * textsplit and stop once enough has been collected: that would be
 * cleanly utf8-aware, but it would also strip the punctuation. */

/* Characters at which the text may be cut. All of them are ascii. */
static const string SEPAR = " \t\n\r-:.;,/[]{}";

/* input:  text to shorten (not modified).
 * maxlen: maximum number of bytes kept before the cut is searched.
 * Returns input unchanged when it already fits; otherwise the part of
 * the first maxlen bytes located before the last separator (possibly
 * nothing), followed by " ...". */
string truncate_to_word(string & input, string::size_type maxlen)
{
    if (input.length() <= maxlen)
        return input;

    string head(input, 0, maxlen);
    const string::size_type cut = head.find_last_of(SEPAR);

    // The original version only cut at a separator located after
    // maxlen/2. We always cut at a separator, otherwise utf8 processing
    // would be needed to avoid splitting a multibyte character. Text
    // holding no separator at all probably has no value and is dropped,
    // except maybe for Asian languages, so this may need fixing one day.
    head.resize(cut == string::npos ? 0 : cut);

    head += " ...";
    return head;
}
|
||||
|
||||
// Escape things that would look like markup
|
||||
string escapeHtml(const string &in)
|
||||
{
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _SMALLUT_H_INCLUDED_
|
||||
#define _SMALLUT_H_INCLUDED_
|
||||
/* @(#$Id: smallut.h,v 1.15 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: smallut.h,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
#include <string>
|
||||
#include <list>
|
||||
|
||||
@ -59,6 +59,13 @@ extern void trimstring(string &s, const char *ws = " \t");
|
||||
/** Escape things like < or & by turning them into entities */
|
||||
extern string escapeHtml(const string &in);
|
||||
|
||||
/** Replace some chars with spaces (e.g. newline chars). This is not utf8-aware
|
||||
* so chars should only contain ascii */
|
||||
extern string neutchars(const string &str, string chars);
|
||||
|
||||
/** Truncate a string to a given maxlength, avoiding cutting off midword
|
||||
* if reasonably possible. */
|
||||
extern string truncate_to_word(string &input, string::size_type maxlen);
|
||||
|
||||
class Chrono {
|
||||
public:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user