improved textsplit speed (needs utf8iter modifs too)
This commit is contained in:
parent
a573fbd1a9
commit
9d6963c95a
@ -1,9 +1,9 @@
|
|||||||
# @(#$Id: Makefile,v 1.13 2006-11-15 14:57:53 dockes Exp $ (C) 2005 J.F.Dockes
|
# @(#$Id: Makefile,v 1.14 2006-11-20 11:17:53 dockes Exp $ (C) 2005 J.F.Dockes
|
||||||
depth = ..
|
depth = ..
|
||||||
include $(depth)/mk/sysconf
|
include $(depth)/mk/sysconf
|
||||||
|
|
||||||
# Only test executables get built in here
|
# Only test executables get built in here
|
||||||
PROGS = internfile unacpp textsplit rclconfig
|
PROGS = unacpp textsplit rclconfig
|
||||||
|
|
||||||
all: $(BIGLIB) $(PROGS)
|
all: $(BIGLIB) $(PROGS)
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.26 2006-11-20 11:17:53 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -24,7 +24,10 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes E
|
|||||||
#include <set>
|
#include <set>
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
|
||||||
|
//#define UTF8ITER_CHECK
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
|
|
||||||
#include "uproplist.h"
|
#include "uproplist.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -128,18 +131,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
* text_to_words().
|
* text_to_words().
|
||||||
*
|
*
|
||||||
* @return true if ok, false for error. Splitting should stop in this case.
|
* @return true if ok, false for error. Splitting should stop in this case.
|
||||||
* @param word Word value. This will be empty on return in ALL non-error
|
|
||||||
* cases
|
|
||||||
* @param wordpos Term position for word. Always ++ by us.
|
|
||||||
* @param span Span value
|
|
||||||
* @param spanpos Term position for the current span
|
|
||||||
* @param spanerase Set if the current span is at its end. Reset it.
|
* @param spanerase Set if the current span is at its end. Reset it.
|
||||||
* @param bp The current BYTE position in the stream
|
* @param bp The current BYTE position in the stream
|
||||||
*/
|
*/
|
||||||
inline bool TextSplit::doemit(bool spanerase, int bp)
|
inline bool TextSplit::doemit(bool spanerase, int bp)
|
||||||
{
|
{
|
||||||
LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n",
|
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
|
||||||
word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp));
|
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
|
||||||
|
|
||||||
// Emit span. When splitting for query, we only emit final spans
|
// Emit span. When splitting for query, we only emit final spans
|
||||||
bool spanemitted = false;
|
bool spanemitted = false;
|
||||||
@ -166,19 +164,23 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Emit word if different from span and not 'no words' mode
|
// Emit word if different from span and not 'no words' mode
|
||||||
if (!(m_flags & TXTS_ONLYSPANS) &&
|
if (!(m_flags & TXTS_ONLYSPANS) && wordLen &&
|
||||||
(!spanemitted || word.length() != span.length()))
|
(!spanemitted || wordLen != span.length())) {
|
||||||
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
|
string s(span.substr(wordStart, wordLen));
|
||||||
|
if (!emitterm(false, s, wordpos, bp-wordLen, bp))
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// Adjust state
|
// Adjust state
|
||||||
wordpos++;
|
wordpos++;
|
||||||
word.erase();
|
wordLen = 0;
|
||||||
if (spanerase) {
|
if (spanerase) {
|
||||||
span.erase();
|
span.erase();
|
||||||
spanpos = wordpos;
|
spanpos = wordpos;
|
||||||
|
wordStart = 0;
|
||||||
|
} else {
|
||||||
|
wordStart = span.length();
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -210,14 +212,14 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
setcharclasses();
|
setcharclasses();
|
||||||
|
|
||||||
span.erase();
|
span.erase();
|
||||||
word.erase(); // Current word: no punctuation at all in there
|
|
||||||
number = false;
|
number = false;
|
||||||
prevpos = prevlen = wordpos = spanpos = charpos = 0;
|
wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0;
|
||||||
|
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
|
|
||||||
for (; !it.eof(); it++, charpos++) {
|
for (; !it.eof(); it++) {
|
||||||
unsigned int c = *it;
|
unsigned int c = *it;
|
||||||
|
|
||||||
if (c == (unsigned int)-1) {
|
if (c == (unsigned int)-1) {
|
||||||
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
||||||
return false;
|
return false;
|
||||||
@ -225,20 +227,18 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
int cc = whatcc(c);
|
int cc = whatcc(c);
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
case LETTER:
|
case LETTER:
|
||||||
it.appendchartostring(word);
|
wordLen += it.appendchartostring(span);
|
||||||
it.appendchartostring(span);
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DIGIT:
|
case DIGIT:
|
||||||
if (word.length() == 0)
|
if (wordLen == 0)
|
||||||
number = true;
|
number = true;
|
||||||
it.appendchartostring(word);
|
wordLen += it.appendchartostring(span);
|
||||||
it.appendchartostring(span);
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
if (word.length() || span.length()) {
|
if (wordLen || span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
@ -246,28 +246,27 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
case '-':
|
case '-':
|
||||||
case '+':
|
case '+':
|
||||||
if (word.length() == 0) {
|
if (wordLen == 0) {
|
||||||
if (whatcc(it[charpos+1]) == DIGIT) {
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
||||||
number = true;
|
number = true;
|
||||||
it.appendchartostring(word);
|
wordLen += it.appendchartostring(span);
|
||||||
it.appendchartostring(span);
|
} else {
|
||||||
} else
|
wordStart += it.appendchartostring(span);
|
||||||
it.appendchartostring(span);
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
it.appendchartostring(span);
|
wordStart += it.appendchartostring(span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '.':
|
case '.':
|
||||||
case ',':
|
case ',':
|
||||||
if (number) {
|
if (number) {
|
||||||
// 132.jpg ?
|
// 132.jpg ?
|
||||||
if (whatcc(it[charpos+1]) != DIGIT)
|
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
it.appendchartostring(word);
|
wordLen += it.appendchartostring(span);
|
||||||
it.appendchartostring(span);
|
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// If . inside a word, keep it, else, this is whitespace.
|
// If . inside a word, keep it, else, this is whitespace.
|
||||||
@ -277,16 +276,16 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
// will be split as .x-errs, x, errs but not x-errs
|
// will be split as .x-errs, x, errs but not x-errs
|
||||||
// A final comma in a word will be removed by doemit
|
// A final comma in a word will be removed by doemit
|
||||||
if (cc == '.') {
|
if (cc == '.') {
|
||||||
if (word.length()) {
|
if (wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
// span length could have been adjusted by trimming
|
// span length could have been adjusted by trimming
|
||||||
// inside doemit
|
// inside doemit
|
||||||
if (span.length())
|
if (span.length())
|
||||||
it.appendchartostring(span);
|
wordStart += it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
it.appendchartostring(span);
|
wordStart += it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -294,30 +293,29 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
goto SPACE;
|
goto SPACE;
|
||||||
break;
|
break;
|
||||||
case '@':
|
case '@':
|
||||||
if (word.length()) {
|
if (wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
it.appendchartostring(span);
|
wordStart += it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
case '\'':
|
case '\'':
|
||||||
// If in word, potential span: o'brien, else, this is more
|
// If in word, potential span: o'brien, else, this is more
|
||||||
// whitespace
|
// whitespace
|
||||||
if (word.length()) {
|
if (wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
it.appendchartostring(span);
|
wordStart += it.appendchartostring(span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '#':
|
case '#':
|
||||||
// Keep it only at end of word... Special case for c# you see...
|
// Keep it only at end of word ... Special case for c# you see...
|
||||||
if (word.length() > 0) {
|
if (wordLen > 0) {
|
||||||
int w = whatcc(it[charpos+1]);
|
int w = whatcc(it[it.getCpos()+1]);
|
||||||
if (w == SPACE || w == '\n' || w == '\r') {
|
if (w == SPACE || w == '\n' || w == '\r') {
|
||||||
it.appendchartostring(word);
|
wordLen += it.appendchartostring(span);
|
||||||
it.appendchartostring(span);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -340,12 +338,11 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
it.appendchartostring(word);
|
wordLen += it.appendchartostring(span);
|
||||||
it.appendchartostring(span);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (word.length() || span.length()) {
|
if (wordLen || span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -401,7 +398,7 @@ static string teststring =
|
|||||||
" -wl,--export-dynamic "
|
" -wl,--export-dynamic "
|
||||||
" ~/.xsession-errors "
|
" ~/.xsession-errors "
|
||||||
;
|
;
|
||||||
static string teststring1 = " 124, ";
|
static string teststring1 = " nouvel-an ";
|
||||||
|
|
||||||
static string thisprog;
|
static string thisprog;
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.13 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.14 2006-11-20 11:17:53 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -61,11 +61,11 @@ class TextSplit {
|
|||||||
int maxWordLength;
|
int maxWordLength;
|
||||||
|
|
||||||
string span; // Current span. Might be jf.dockes@wanadoo.f
|
string span; // Current span. Might be jf.dockes@wanadoo.f
|
||||||
string word; // Current word: no punctuation at all in there
|
int wordStart; // Current word: offset of its first byte inside span
|
||||||
|
unsigned int wordLen;
|
||||||
bool number;
|
bool number;
|
||||||
int wordpos; // Term position of current word
|
int wordpos; // Term position of current word
|
||||||
int spanpos; // Term position of current span
|
int spanpos; // Term position of current span
|
||||||
int charpos; // Character position
|
|
||||||
|
|
||||||
// It may happen that our cleanup would result in emitting the
|
// It may happen that our cleanup would result in emitting the
|
||||||
// same term twice. We try to avoid this
|
// same term twice. We try to avoid this
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user