use m_ prefix for members
This commit is contained in:
parent
45d188f2cf
commit
ba295fae4f
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.29 2007-01-25 15:40:55 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -103,7 +103,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
||||||
|
|
||||||
unsigned int l = w.length();
|
unsigned int l = w.length();
|
||||||
if (l > 0 && l < (unsigned)maxWordLength) {
|
if (l > 0 && l < (unsigned)m_maxWordLength) {
|
||||||
// 1 char word: we index single letters and digits, but
|
// 1 char word: we index single letters and digits, but
|
||||||
// nothing else. We might want to turn this into a test for a single
|
// nothing else. We might want to turn this into a test for a single
|
||||||
// utf8 character instead.
|
// utf8 character instead.
|
||||||
@ -114,10 +114,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (pos != prevpos || l != prevlen) {
|
if (pos != m_prevpos || l != m_prevlen) {
|
||||||
bool ret = cb->takeword(w, pos, btstart, btend);
|
bool ret = m_cb->takeword(w, pos, btstart, btend);
|
||||||
prevlen = w.length();
|
m_prevpos = pos;
|
||||||
prevpos = pos;
|
m_prevlen = w.length();
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
||||||
@ -146,17 +146,17 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
|||||||
// Emit span. When splitting for query, we only emit final spans
|
// Emit span. When splitting for query, we only emit final spans
|
||||||
bool spanemitted = false;
|
bool spanemitted = false;
|
||||||
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
||||||
// Maybe trim at end These are chars that we would keep inside
|
// Maybe trim at end. These are chars that we would keep inside
|
||||||
// a span, but not at the end
|
// a span, but not at the end
|
||||||
while (span.length() > 0) {
|
while (m_span.length() > 0) {
|
||||||
switch (span[span.length()-1]) {
|
switch (m_span[m_span.length()-1]) {
|
||||||
case '.':
|
case '.':
|
||||||
case ',':
|
case ',':
|
||||||
case '@':
|
case '@':
|
||||||
case '\'':
|
case '\'':
|
||||||
span.resize(span.length()-1);
|
m_span.resize(m_span.length()-1);
|
||||||
if (--bp < 0)
|
if (--bp < 0)
|
||||||
bp=0;
|
bp = 0;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
goto breakloop1;
|
goto breakloop1;
|
||||||
@ -164,27 +164,27 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
|||||||
}
|
}
|
||||||
breakloop1:
|
breakloop1:
|
||||||
spanemitted = true;
|
spanemitted = true;
|
||||||
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Emit word if different from span and not 'no words' mode
|
// Emit word if different from span and not 'no words' mode
|
||||||
if (!(m_flags & TXTS_ONLYSPANS) && wordLen &&
|
if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
|
||||||
(!spanemitted || wordLen != span.length())) {
|
(!spanemitted || m_wordLen != m_span.length())) {
|
||||||
string s(span.substr(wordStart, wordLen));
|
string s(m_span.substr(m_wordStart, m_wordLen));
|
||||||
if (!emitterm(false, s, wordpos, bp-wordLen, bp))
|
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adjust state
|
// Adjust state
|
||||||
wordpos++;
|
m_wordpos++;
|
||||||
wordLen = 0;
|
m_wordLen = 0;
|
||||||
if (spanerase) {
|
if (spanerase) {
|
||||||
span.erase();
|
m_span.erase();
|
||||||
spanpos = wordpos;
|
m_spanpos = m_wordpos;
|
||||||
wordStart = 0;
|
m_wordStart = 0;
|
||||||
} else {
|
} else {
|
||||||
wordStart = span.length();
|
m_wordStart = m_span.length();
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -215,9 +215,9 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
|
|
||||||
setcharclasses();
|
setcharclasses();
|
||||||
|
|
||||||
span.erase();
|
m_span.erase();
|
||||||
number = false;
|
m_inNumber = false;
|
||||||
wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0;
|
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
||||||
|
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
|
|
||||||
@ -231,21 +231,21 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
int cc = whatcc(c);
|
int cc = whatcc(c);
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
case LETTER:
|
case LETTER:
|
||||||
wordLen += it.appendchartostring(span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DIGIT:
|
case DIGIT:
|
||||||
if (wordLen == 0)
|
if (m_wordLen == 0)
|
||||||
number = true;
|
m_inNumber = true;
|
||||||
wordLen += it.appendchartostring(span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
if (wordLen || span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
m_inNumber = false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case WILD:
|
case WILD:
|
||||||
@ -256,27 +256,27 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
case '-':
|
case '-':
|
||||||
case '+':
|
case '+':
|
||||||
if (wordLen == 0) {
|
if (m_wordLen == 0) {
|
||||||
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
||||||
number = true;
|
m_inNumber = true;
|
||||||
wordLen += it.appendchartostring(span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
} else {
|
} else {
|
||||||
wordStart += it.appendchartostring(span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
m_inNumber = false;
|
||||||
wordStart += it.appendchartostring(span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '.':
|
case '.':
|
||||||
case ',':
|
case ',':
|
||||||
if (number) {
|
if (m_inNumber) {
|
||||||
// 132.jpg ?
|
// 132.jpg ?
|
||||||
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
wordLen += it.appendchartostring(span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// If . inside a word, keep it, else, this is whitespace.
|
// If . inside a word, keep it, else, this is whitespace.
|
||||||
@ -286,16 +286,16 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
// will be split as .x-errs, x, errs but not x-errs
|
// will be split as .x-errs, x, errs but not x-errs
|
||||||
// A final comma in a word will be removed by doemit
|
// A final comma in a word will be removed by doemit
|
||||||
if (cc == '.') {
|
if (cc == '.') {
|
||||||
if (wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
// span length could have been adjusted by trimming
|
// span length could have been adjusted by trimming
|
||||||
// inside doemit
|
// inside doemit
|
||||||
if (span.length())
|
if (m_span.length())
|
||||||
wordStart += it.appendchartostring(span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
wordStart += it.appendchartostring(span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -303,29 +303,29 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
goto SPACE;
|
goto SPACE;
|
||||||
break;
|
break;
|
||||||
case '@':
|
case '@':
|
||||||
if (wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
m_inNumber = false;
|
||||||
}
|
}
|
||||||
wordStart += it.appendchartostring(span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
case '\'':
|
case '\'':
|
||||||
// If in word, potential span: o'brien, else, this is more
|
// If in word, potential span: o'brien, else, this is more
|
||||||
// whitespace
|
// whitespace
|
||||||
if (wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
m_inNumber = false;
|
||||||
wordStart += it.appendchartostring(span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '#':
|
case '#':
|
||||||
// Keep it only at end of word ... Special case for c# you see...
|
// Keep it only at end of word ... Special case for c# you see...
|
||||||
if (wordLen > 0) {
|
if (m_wordLen > 0) {
|
||||||
int w = whatcc(it[it.getCpos()+1]);
|
int w = whatcc(it[it.getCpos()+1]);
|
||||||
if (w == SPACE || w == '\n' || w == '\r') {
|
if (w == SPACE || w == '\n' || w == '\r') {
|
||||||
wordLen += it.appendchartostring(span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -333,7 +333,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
case '\n':
|
case '\n':
|
||||||
case '\r':
|
case '\r':
|
||||||
if (span.length() && span[span.length() - 1] == '-') {
|
if (m_span.length() && m_span[m_span.length() - 1] == '-') {
|
||||||
// if '-' is the last char before end of line, just
|
// if '-' is the last char before end of line, just
|
||||||
// ignore the line change. This is the right thing to
|
// ignore the line change. This is the right thing to
|
||||||
// do almost always. We'd then need a way to check if
|
// do almost always. We'd then need a way to check if
|
||||||
@ -349,11 +349,11 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
|
|
||||||
default:
|
default:
|
||||||
NORMALCHAR:
|
NORMALCHAR:
|
||||||
wordLen += it.appendchartostring(span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (wordLen || span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.17 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -44,13 +44,18 @@ public:
|
|||||||
*/
|
*/
|
||||||
class TextSplit {
|
class TextSplit {
|
||||||
public:
|
public:
|
||||||
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2,
|
enum Flags {TXTS_NONE = 0,
|
||||||
TXTS_KEEPWILD = 4};
|
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
||||||
|
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
||||||
|
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor: just store callback object
|
* Constructor: just store callback object
|
||||||
*/
|
*/
|
||||||
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
||||||
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
|
: m_flags(flags), m_cb(t), m_maxWordLength(40), m_prevpos(-1) {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Split text, emit words and positions.
|
* Split text, emit words and positions.
|
||||||
*/
|
*/
|
||||||
@ -61,25 +66,34 @@ public:
|
|||||||
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Flags m_flags;
|
Flags m_flags;
|
||||||
TextSplitCB *cb;
|
TextSplitCB *m_cb;
|
||||||
int maxWordLength;
|
int m_maxWordLength;
|
||||||
|
|
||||||
string span; // Current span. Might be jf.dockes@wanadoo.f
|
// Current span. Might be jf.dockes@wanadoo.f
|
||||||
int wordStart; // Current word: no punctuation at all in there
|
string m_span;
|
||||||
unsigned int wordLen;
|
|
||||||
bool number;
|
// Current word: no punctuation at all in there
|
||||||
int wordpos; // Term position of current word
|
int m_wordStart;
|
||||||
int spanpos; // Term position of current span
|
unsigned int m_wordLen;
|
||||||
|
|
||||||
|
// Currently inside number
|
||||||
|
bool m_inNumber;
|
||||||
|
|
||||||
|
// Term position of current word and span
|
||||||
|
int m_wordpos;
|
||||||
|
int m_spanpos;
|
||||||
|
|
||||||
// It may happen that our cleanup would result in emitting the
|
// It may happen that our cleanup would result in emitting the
|
||||||
// same term twice. We try to avoid this
|
// same term twice. We try to avoid this
|
||||||
int prevpos;
|
int m_prevpos;
|
||||||
unsigned int prevlen;
|
unsigned int m_prevlen;
|
||||||
|
|
||||||
|
// This processes cjk text:
|
||||||
|
// bool cjk_to_words();
|
||||||
|
|
||||||
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
||||||
bool doemit(bool spanerase, int bp);
|
bool doemit(bool spanerase, int bp);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user