comments and moving some util routines out of rcldb.cpp
This commit is contained in:
parent
9001129bf4
commit
930bdc870d
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -32,29 +32,23 @@ using namespace std;
|
|||||||
#endif /* NO_NAMESPACES */
|
#endif /* NO_NAMESPACES */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splitting a text into words. The code in this file will work with any
|
* Splitting a text into words. The code in this file works with utf-8
|
||||||
* charset where the basic separators (.,- etc.) have their ascii values
|
* in a semi-clean way (see uproplist.h)
|
||||||
* (ok for UTF-8, ascii, iso8859* and quite a few others).
|
|
||||||
*
|
|
||||||
* We work in a way which would make it quite difficult to handle non-ascii
|
|
||||||
* separator chars (en-dash, etc.). We would then need to actually parse the
|
|
||||||
* utf-8 stream, and use a different way to classify the characters (instead
|
|
||||||
* of a 256 slot array).
|
|
||||||
*
|
*
|
||||||
* We are also not using capitalization information.
|
* We are also not using capitalization information.
|
||||||
*
|
*
|
||||||
* How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
|
* There are a few remnants of the initial utf8-ignorant version in this file.
|
||||||
* Then specialcase all 'real' utf chars, by checking for the few
|
|
||||||
* punctuation ones we're interested in (put them in a map). Then
|
|
||||||
* classify all other non-ascii as letter, and use the current method
|
|
||||||
* for chars < 127.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Character classes: we have three main groups, and then some chars
|
// Character classes: we have three main groups, and then some chars
|
||||||
// are their own class because they want special handling.
|
// are their own class because they want special handling.
|
||||||
|
//
|
||||||
// We have an array with 256 slots where we keep the character types.
|
// We have an array with 256 slots where we keep the character types.
|
||||||
// The array could be fully static, but we use a small function to fill it
|
// The array could be fully static, but we use a small function to fill it
|
||||||
// once.
|
// once.
|
||||||
|
// The array is actually a remnant of the original version, which was not
|
||||||
|
// utf8-aware. It could be reduced to 128 slots, because real (over 128)
|
||||||
|
// utf8 chars are now handled with a set holding all the separator values.
|
||||||
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
||||||
static int charclasses[256];
|
static int charclasses[256];
|
||||||
|
|
||||||
@ -87,11 +81,11 @@ static void setcharclasses()
|
|||||||
for (i = 0; i < strlen(special); i++)
|
for (i = 0; i < strlen(special); i++)
|
||||||
charclasses[int(special[i])] = special[i];
|
charclasses[int(special[i])] = special[i];
|
||||||
|
|
||||||
init = 1;
|
|
||||||
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
|
|
||||||
for (i = 0; i < sizeof(uniign); i++)
|
for (i = 0; i < sizeof(uniign); i++)
|
||||||
unicign.insert(uniign[i]);
|
unicign.insert(uniign[i]);
|
||||||
unicign.insert((unsigned int)-1);
|
unicign.insert((unsigned int)-1);
|
||||||
|
|
||||||
|
init = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do some checking (the kind which is simpler to do here than in the
|
// Do some checking (the kind which is simpler to do here than in the
|
||||||
@ -103,9 +97,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
|
|
||||||
unsigned int l = w.length();
|
unsigned int l = w.length();
|
||||||
if (l > 0 && l < (unsigned)maxWordLength) {
|
if (l > 0 && l < (unsigned)maxWordLength) {
|
||||||
|
// 1 char word: we index single letters and digits, but
|
||||||
|
// nothing else. We might want to turn this into a test for a single
|
||||||
|
// utf8 character instead.
|
||||||
if (l == 1) {
|
if (l == 1) {
|
||||||
// 1 char word: we index single letters and digits, but
|
|
||||||
// nothing else
|
|
||||||
int c = (int)w[0];
|
int c = (int)w[0];
|
||||||
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
||||||
//cerr << "ERASING single letter term " << c << endl;
|
//cerr << "ERASING single letter term " << c << endl;
|
||||||
@ -227,6 +222,18 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
int cc = whatcc(c);
|
int cc = whatcc(c);
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
|
case LETTER:
|
||||||
|
word += it;
|
||||||
|
span += it;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DIGIT:
|
||||||
|
if (word.length() == 0)
|
||||||
|
number = true;
|
||||||
|
word += it;
|
||||||
|
span += it;
|
||||||
|
break;
|
||||||
|
|
||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
if (word.length() || span.length()) {
|
if (word.length() || span.length()) {
|
||||||
@ -326,11 +333,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
goto SPACE;
|
goto SPACE;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case DIGIT:
|
|
||||||
if (word.length() == 0)
|
|
||||||
number = true;
|
|
||||||
/* FALLTHROUGH */
|
|
||||||
case LETTER:
|
|
||||||
default:
|
default:
|
||||||
word += it;
|
word += it;
|
||||||
span += it;
|
span += it;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.63 2006-04-07 13:10:22 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.64 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -399,14 +399,7 @@ bool dumb_string(const string &in, string &out)
|
|||||||
if (in.empty())
|
if (in.empty())
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
string s1;
|
string s1 = neutchars(in, "\n\r");
|
||||||
s1.reserve(in.length());
|
|
||||||
for (unsigned int i = 0; i < in.length(); i++) {
|
|
||||||
if (in[i] == '\n' || in[i] == '\r')
|
|
||||||
s1 += ' ';
|
|
||||||
else
|
|
||||||
s1 += in[i];
|
|
||||||
}
|
|
||||||
if (!unacmaybefold(s1, out, "UTF-8", true)) {
|
if (!unacmaybefold(s1, out, "UTF-8", true)) {
|
||||||
LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
|
LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
|
||||||
out.erase();
|
out.erase();
|
||||||
@ -416,58 +409,6 @@ bool dumb_string(const string &in, string &out)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* From omindex direct */
|
|
||||||
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
|
||||||
* if reasonably possible. */
|
|
||||||
static string
|
|
||||||
truncate_to_word(string & input, string::size_type maxlen)
|
|
||||||
{
|
|
||||||
string output;
|
|
||||||
if (input.length() <= maxlen) {
|
|
||||||
output = input;
|
|
||||||
} else {
|
|
||||||
output = input.substr(0, maxlen);
|
|
||||||
const char *SEPAR = " \t\n\r-:.;,/[]{}";
|
|
||||||
string::size_type space = output.find_last_of(SEPAR);
|
|
||||||
// Original version only truncated at space if space was found after
|
|
||||||
// maxlen/2. But we HAVE to truncate at space, else we'd need to do
|
|
||||||
// utf8 stuff to avoid truncating at multibyte char. In any case,
|
|
||||||
// not finding space means that the text probably has no value.
|
|
||||||
// Except probably for Asian languages, so we may want to fix this
|
|
||||||
// one day
|
|
||||||
if (space == string::npos) {
|
|
||||||
output.erase();
|
|
||||||
} else {
|
|
||||||
output.erase(space);
|
|
||||||
}
|
|
||||||
|
|
||||||
output += " ...";
|
|
||||||
}
|
|
||||||
return output;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove some chars and replace them with spaces
|
|
||||||
static string stripchars(const string &str, string delims)
|
|
||||||
{
|
|
||||||
string out;
|
|
||||||
string::size_type startPos, pos;
|
|
||||||
|
|
||||||
for (pos = 0;;) {
|
|
||||||
// Skip initial delims, break if this eats all.
|
|
||||||
if ((startPos = str.find_first_not_of(delims, pos)) == string::npos)
|
|
||||||
break;
|
|
||||||
// Find next delimiter or end of string (end of token)
|
|
||||||
pos = str.find_first_of(delims, startPos);
|
|
||||||
// Add token to the vector. Note: token cant be empty here
|
|
||||||
if (pos == string::npos) {
|
|
||||||
out += str.substr(startPos) + " ";
|
|
||||||
} else {
|
|
||||||
out += str.substr(startPos, pos - startPos) + " ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add document in internal form to the database: index the terms in
|
// Add document in internal form to the database: index the terms in
|
||||||
// the title abstract and body and add special terms for file name,
|
// the title abstract and body and add special terms for file name,
|
||||||
// date, mime type ... , create the document data record (more
|
// date, mime type ... , create the document data record (more
|
||||||
@ -490,7 +431,7 @@ bool Db::add(const string &fn, const Doc &idoc,
|
|||||||
} else {
|
} else {
|
||||||
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
|
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
|
||||||
}
|
}
|
||||||
doc.abstract = stripchars(doc.abstract, "\n\r");
|
doc.abstract = neutchars(doc.abstract, "\n\r");
|
||||||
doc.title = truncate_to_word(doc.title, 100);
|
doc.title = truncate_to_word(doc.title, 100);
|
||||||
doc.keywords = truncate_to_word(doc.keywords, 300);
|
doc.keywords = truncate_to_word(doc.keywords, 300);
|
||||||
|
|
||||||
@ -720,14 +661,20 @@ static string stemdbname(const string& basename, string lang)
|
|||||||
return nm;
|
return nm;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deciding if we try to stem the term. If it has numerals or capitals
|
// Return list of existing stem db languages
|
||||||
// we don't
|
list<string> Db::getStemLangs()
|
||||||
inline static bool
|
|
||||||
p_notlowerorutf(unsigned int c)
|
|
||||||
{
|
{
|
||||||
if (c < 'a' || (c > 'z' && c < 128))
|
list<string> dirs;
|
||||||
return true;
|
LOGDEB(("Db::getStemLang\n"));
|
||||||
return false;
|
if (m_ndb == 0)
|
||||||
|
return dirs;
|
||||||
|
string pattern = stemdirstem + "*";
|
||||||
|
dirs = path_dirglob(m_ndb->m_basedir, pattern);
|
||||||
|
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
||||||
|
*it = path_basename(*it);
|
||||||
|
*it = it->substr(stemdirstem.length(), string::npos);
|
||||||
|
}
|
||||||
|
return dirs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -747,6 +694,16 @@ bool Db::deleteStemDb(const string& lang)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Deciding if we try to stem the term. If it has numerals or capitals
|
||||||
|
// we don't.
// This predicate returns true for any 7-bit char which is not a lowercase
// ascii letter (numerals, capitals, punctuation...), and false for 'a'-'z'
// and for values of 128 and over.
|
||||||
|
inline static bool
|
||||||
|
p_notlowerascii(unsigned int c)
|
||||||
|
{
|
||||||
|
// True below 'a', and between 'z' and the end of the 7-bit range
if (c < 'a' || (c > 'z' && c < 128))
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create database of stem to parents associations for a given language.
|
* Create database of stem to parents associations for a given language.
|
||||||
* We walk the list of all terms, stem them, and create another Xapian db
|
* We walk the list of all terms, stem them, and create another Xapian db
|
||||||
@ -780,7 +737,7 @@ bool Db::createStemDb(const string& lang)
|
|||||||
it != m_ndb->wdb.allterms_end(); it++) {
|
it != m_ndb->wdb.allterms_end(); it++) {
|
||||||
// If it has any non-lowercase 7bit char, cant be stemmable
|
// If it has any non-lowercase 7bit char, cant be stemmable
|
||||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||||
if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) {
|
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||||
++nostem;
|
++nostem;
|
||||||
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
||||||
// (*it).c_str(), *sit));
|
// (*it).c_str(), *sit));
|
||||||
@ -886,26 +843,10 @@ bool Db::createStemDb(const string& lang)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
list<string> Db::getStemLangs()
|
|
||||||
{
|
|
||||||
list<string> dirs;
|
|
||||||
LOGDEB(("Db::getStemLang\n"));
|
|
||||||
if (m_ndb == 0)
|
|
||||||
return dirs;
|
|
||||||
string pattern = stemdirstem + "*";
|
|
||||||
dirs = path_dirglob(m_ndb->m_basedir, pattern);
|
|
||||||
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
|
||||||
*it = path_basename(*it);
|
|
||||||
*it = it->substr(stemdirstem.length(), string::npos);
|
|
||||||
}
|
|
||||||
return dirs;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is called at the end of an indexing session, to delete the
|
* This is called at the end of an indexing session, to delete the
|
||||||
* documents for files that are no longer there. We also build the
|
* documents for files that are no longer there.
|
||||||
* stem database while we are at it.
|
|
||||||
*/
|
*/
|
||||||
bool Db::purge()
|
bool Db::purge()
|
||||||
{
|
{
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.15 2006-01-26 12:29:20 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -331,6 +331,58 @@ void trimstring(string &s, const char *ws)
|
|||||||
s.replace(pos+1, string::npos, "");
|
s.replace(pos+1, string::npos, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove some chars and replace them with spaces.
// Returns a copy of str where the chars listed in delims are neutralized:
// leading delims are dropped, and each run of delims following a token is
// replaced by a single space (so a trailing run leaves one trailing space).
// The search works on the individual chars of the string.
|
||||||
|
string neutchars(const string &str, string delims)
|
||||||
|
{
|
||||||
|
string out;
|
||||||
|
string::size_type startPos, pos;
|
||||||
|
|
||||||
|
// Each iteration copies one token (run of non-delims) to the output
for (pos = 0;;) {
|
||||||
|
// Skip initial delims, break if this eats all.
|
||||||
|
if ((startPos = str.find_first_not_of(delims, pos)) == string::npos)
|
||||||
|
break;
|
||||||
|
// Find next delimiter or end of string (end of token)
|
||||||
|
pos = str.find_first_of(delims, startPos);
|
||||||
|
// Add token to the output. Note: token can't be empty here
|
||||||
|
if (pos == string::npos) {
|
||||||
|
// Last token, runs to the end of the string: no space is added
out += str.substr(startPos);
|
||||||
|
} else {
|
||||||
|
// Token followed by at least one delim: the space stands for the run
out += str.substr(startPos, pos - startPos) + " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
||||||
|
* if reasonably possible. Note: we could also use textsplit, stopping when
|
||||||
|
* we have enough, this would be cleanly utf8-aware but would remove
|
||||||
|
* punctuation.
* The input string is only read. If it is no longer than maxlen it is
* returned unchanged. Else the result is cut at the last SEPAR char found
* in the first maxlen chars (or emptied if there is none) and " ..." is
* appended, so the returned string can be up to maxlen + 3 chars long. */
|
||||||
|
// Chars at which truncate_to_word() looks for a place to cut the text
static const string SEPAR = " \t\n\r-:.;,/[]{}";
|
||||||
|
string truncate_to_word(string & input, string::size_type maxlen)
|
||||||
|
{
|
||||||
|
string output;
|
||||||
|
if (input.length() <= maxlen) {
|
||||||
|
// Short enough: plain copy, no truncation marker
output = input;
|
||||||
|
} else {
|
||||||
|
output = input.substr(0, maxlen);
|
||||||
|
// Position of the last separator inside the first maxlen chars
string::size_type space = output.find_last_of(SEPAR);
|
||||||
|
// Original version only truncated at space if space was found after
|
||||||
|
// maxlen/2. But we HAVE to truncate at space, else we'd need to do
|
||||||
|
// utf8 stuff to avoid truncating at multibyte char. In any case,
|
||||||
|
// not finding space means that the text probably has no value.
|
||||||
|
// Except probably for Asian languages, so we may want to fix this
|
||||||
|
// one day
|
||||||
|
if (space == string::npos) {
|
||||||
|
// No separator at all: drop everything
output.erase();
|
||||||
|
} else {
|
||||||
|
// Drop the separator itself and everything after it
output.erase(space);
|
||||||
|
}
|
||||||
|
// Mark the text as truncated
output += " ...";
|
||||||
|
}
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
// Escape things that would look like markup
|
// Escape things that would look like markup
|
||||||
string escapeHtml(const string &in)
|
string escapeHtml(const string &in)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _SMALLUT_H_INCLUDED_
|
#ifndef _SMALLUT_H_INCLUDED_
|
||||||
#define _SMALLUT_H_INCLUDED_
|
#define _SMALLUT_H_INCLUDED_
|
||||||
/* @(#$Id: smallut.h,v 1.15 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: smallut.h,v 1.16 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
|
||||||
@ -59,6 +59,13 @@ extern void trimstring(string &s, const char *ws = " \t");
|
|||||||
/** Escape things like < or & by turning them into entities */
|
/** Escape things like < or & by turning them into entities */
|
||||||
extern string escapeHtml(const string &in);
|
extern string escapeHtml(const string &in);
|
||||||
|
|
||||||
|
/** Replace some chars with spaces (e.g. newline chars). This is not utf8-aware,
|
||||||
|
* so chars should only contain ascii characters */
|
||||||
|
extern string neutchars(const string &str, string chars);
|
||||||
|
|
||||||
|
/** Truncate a string to a given maxlength, avoiding cutting off midword
|
||||||
|
* if reasonably possible. */
|
||||||
|
extern string truncate_to_word(string &input, string::size_type maxlen);
|
||||||
|
|
||||||
class Chrono {
|
class Chrono {
|
||||||
public:
|
public:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user