Improved textsplit speed (needs utf8iter modifications too)

This commit is contained in:
dockes 2006-11-20 11:17:53 +00:00
parent a573fbd1a9
commit 9d6963c95a
3 changed files with 50 additions and 53 deletions

View File

@ -1,9 +1,9 @@
# @(#$Id: Makefile,v 1.13 2006-11-15 14:57:53 dockes Exp $ (C) 2005 J.F.Dockes # @(#$Id: Makefile,v 1.14 2006-11-20 11:17:53 dockes Exp $ (C) 2005 J.F.Dockes
depth = .. depth = ..
include $(depth)/mk/sysconf include $(depth)/mk/sysconf
# Only test executables get build in here # Only test executables get build in here
PROGS = internfile unacpp textsplit rclconfig PROGS = unacpp textsplit rclconfig
all: $(BIGLIB) $(PROGS) all: $(BIGLIB) $(PROGS)

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.26 2006-11-20 11:17:53 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -24,7 +24,10 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes E
#include <set> #include <set>
#include "textsplit.h" #include "textsplit.h"
#include "debuglog.h" #include "debuglog.h"
//#define UTF8ITER_CHECK
#include "utf8iter.h" #include "utf8iter.h"
#include "uproplist.h" #include "uproplist.h"
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -128,18 +131,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
* text_to_words(). * text_to_words().
* *
* @return true if ok, false for error. Splitting should stop in this case. * @return true if ok, false for error. Splitting should stop in this case.
* @param word Word value. This will be empty on return in ALL non-error
* cases
* @param wordpos Term position for word. Always ++ by us.
* @param span Span value
* @param spanpos Term position for the current span
* @param spanerase Set if the current span is at its end. Reset it. * @param spanerase Set if the current span is at its end. Reset it.
* @param bp The current BYTE position in the stream * @param bp The current BYTE position in the stream
*/ */
inline bool TextSplit::doemit(bool spanerase, int bp) inline bool TextSplit::doemit(bool spanerase, int bp)
{ {
LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n", LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp)); span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
// Emit span. When splitting for query, we only emit final spans // Emit span. When splitting for query, we only emit final spans
bool spanemitted = false; bool spanemitted = false;
@ -166,19 +164,23 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
return false; return false;
} }
// Emit word if different from span and not 'no words' mode // Emit word if different from span and not 'no words' mode
if (!(m_flags & TXTS_ONLYSPANS) && if (!(m_flags & TXTS_ONLYSPANS) && wordLen &&
(!spanemitted || word.length() != span.length())) (!spanemitted || wordLen != span.length())) {
if (!emitterm(false, word, wordpos, bp-word.length(), bp)) string s(span.substr(wordStart, wordLen));
if (!emitterm(false, s, wordpos, bp-wordLen, bp))
return false; return false;
}
// Adjust state // Adjust state
wordpos++; wordpos++;
word.erase(); wordLen = 0;
if (spanerase) { if (spanerase) {
span.erase(); span.erase();
spanpos = wordpos; spanpos = wordpos;
wordStart = 0;
} else {
wordStart = span.length();
} }
return true; return true;
@ -210,14 +212,14 @@ bool TextSplit::text_to_words(const string &in)
setcharclasses(); setcharclasses();
span.erase(); span.erase();
word.erase(); // Current word: no punctuation at all in there
number = false; number = false;
prevpos = prevlen = wordpos = spanpos = charpos = 0; wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0;
Utf8Iter it(in); Utf8Iter it(in);
for (; !it.eof(); it++, charpos++) { for (; !it.eof(); it++) {
unsigned int c = *it; unsigned int c = *it;
if (c == (unsigned int)-1) { if (c == (unsigned int)-1) {
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n")); LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
return false; return false;
@ -225,20 +227,18 @@ bool TextSplit::text_to_words(const string &in)
int cc = whatcc(c); int cc = whatcc(c);
switch (cc) { switch (cc) {
case LETTER: case LETTER:
it.appendchartostring(word); wordLen += it.appendchartostring(span);
it.appendchartostring(span);
break; break;
case DIGIT: case DIGIT:
if (word.length() == 0) if (wordLen == 0)
number = true; number = true;
it.appendchartostring(word); wordLen += it.appendchartostring(span);
it.appendchartostring(span);
break; break;
case SPACE: case SPACE:
SPACE: SPACE:
if (word.length() || span.length()) { if (wordLen || span.length()) {
if (!doemit(true, it.getBpos())) if (!doemit(true, it.getBpos()))
return false; return false;
number = false; number = false;
@ -246,28 +246,27 @@ bool TextSplit::text_to_words(const string &in)
break; break;
case '-': case '-':
case '+': case '+':
if (word.length() == 0) { if (wordLen == 0) {
if (whatcc(it[charpos+1]) == DIGIT) { if (whatcc(it[it.getCpos()+1]) == DIGIT) {
number = true; number = true;
it.appendchartostring(word); wordLen += it.appendchartostring(span);
it.appendchartostring(span); } else {
} else wordStart += it.appendchartostring(span);
it.appendchartostring(span); }
} else { } else {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
it.appendchartostring(span); wordStart += it.appendchartostring(span);
} }
break; break;
case '.': case '.':
case ',': case ',':
if (number) { if (number) {
// 132.jpg ? // 132.jpg ?
if (whatcc(it[charpos+1]) != DIGIT) if (whatcc(it[it.getCpos()+1]) != DIGIT)
goto SPACE; goto SPACE;
it.appendchartostring(word); wordLen += it.appendchartostring(span);
it.appendchartostring(span);
break; break;
} else { } else {
// If . inside a word, keep it, else, this is whitespace. // If . inside a word, keep it, else, this is whitespace.
@ -277,16 +276,16 @@ bool TextSplit::text_to_words(const string &in)
// will be split as .x-errs, x, errs but not x-errs // will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit // A final comma in a word will be removed by doemit
if (cc == '.') { if (cc == '.') {
if (word.length()) { if (wordLen) {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
// span length could have been adjusted by trimming // span length could have been adjusted by trimming
// inside doemit // inside doemit
if (span.length()) if (span.length())
it.appendchartostring(span); wordStart += it.appendchartostring(span);
break; break;
} else { } else {
it.appendchartostring(span); wordStart += it.appendchartostring(span);
break; break;
} }
} }
@ -294,30 +293,29 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE; goto SPACE;
break; break;
case '@': case '@':
if (word.length()) { if (wordLen) {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
} }
it.appendchartostring(span); wordStart += it.appendchartostring(span);
break; break;
case '\'': case '\'':
// If in word, potential span: o'brien, else, this is more // If in word, potential span: o'brien, else, this is more
// whitespace // whitespace
if (word.length()) { if (wordLen) {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
it.appendchartostring(span); wordStart += it.appendchartostring(span);
} }
break; break;
case '#': case '#':
// Keep it only at end of word... Special case for c# you see... // Keep it only at end of word ... Special case for c# you see...
if (word.length() > 0) { if (wordLen > 0) {
int w = whatcc(it[charpos+1]); int w = whatcc(it[it.getCpos()+1]);
if (w == SPACE || w == '\n' || w == '\r') { if (w == SPACE || w == '\n' || w == '\r') {
it.appendchartostring(word); wordLen += it.appendchartostring(span);
it.appendchartostring(span);
break; break;
} }
} }
@ -340,12 +338,11 @@ bool TextSplit::text_to_words(const string &in)
break; break;
default: default:
it.appendchartostring(word); wordLen += it.appendchartostring(span);
it.appendchartostring(span);
break; break;
} }
} }
if (word.length() || span.length()) { if (wordLen || span.length()) {
if (!doemit(true, it.getBpos())) if (!doemit(true, it.getBpos()))
return false; return false;
} }
@ -401,7 +398,7 @@ static string teststring =
" -wl,--export-dynamic " " -wl,--export-dynamic "
" ~/.xsession-errors " " ~/.xsession-errors "
; ;
static string teststring1 = " 124, "; static string teststring1 = " nouvel-an ";
static string thisprog; static string thisprog;

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.13 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.14 2006-11-20 11:17:53 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -61,11 +61,11 @@ class TextSplit {
int maxWordLength; int maxWordLength;
string span; // Current span. Might be jf.dockes@wanadoo.f string span; // Current span. Might be jf.dockes@wanadoo.f
string word; // Current word: no punctuation at all in there int wordStart; // Current word: no punctuation at all in there
unsigned int wordLen;
bool number; bool number;
int wordpos; // Term position of current word int wordpos; // Term position of current word
int spanpos; // Term position of current span int spanpos; // Term position of current span
int charpos; // Character position
// It may happen that our cleanup would result in emitting the // It may happen that our cleanup would result in emitting the
// same term twice. We try to avoid this // same term twice. We try to avoid this