use m_ prefix for members

This commit is contained in:
dockes 2007-09-18 20:35:31 +00:00
parent 45d188f2cf
commit ba295fae4f
2 changed files with 84 additions and 70 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.29 2007-01-25 15:40:55 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -103,7 +103,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos)); LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
unsigned int l = w.length(); unsigned int l = w.length();
if (l > 0 && l < (unsigned)maxWordLength) { if (l > 0 && l < (unsigned)m_maxWordLength) {
// 1 char word: we index single letters and digits, but // 1 char word: we index single letters and digits, but
// nothing else. We might want to turn this into a test for a single // nothing else. We might want to turn this into a test for a single
// utf8 character instead. // utf8 character instead.
@ -114,10 +114,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
return true; return true;
} }
} }
if (pos != prevpos || l != prevlen) { if (pos != m_prevpos || l != m_prevlen) {
bool ret = cb->takeword(w, pos, btstart, btend); bool ret = m_cb->takeword(w, pos, btstart, btend);
prevlen = w.length(); m_prevpos = pos;
prevpos = pos; m_prevlen = w.length();
return ret; return ret;
} }
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos)); LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
@ -146,17 +146,17 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
// Emit span. When splitting for query, we only emit final spans // Emit span. When splitting for query, we only emit final spans
bool spanemitted = false; bool spanemitted = false;
if (spanerase && !(m_flags & TXTS_NOSPANS)) { if (spanerase && !(m_flags & TXTS_NOSPANS)) {
// Maybe trim at end These are chars that we would keep inside // Maybe trim at end. These are chars that we would keep inside
// a span, but not at the end // a span, but not at the end
while (span.length() > 0) { while (m_span.length() > 0) {
switch (span[span.length()-1]) { switch (m_span[m_span.length()-1]) {
case '.': case '.':
case ',': case ',':
case '@': case '@':
case '\'': case '\'':
span.resize(span.length()-1); m_span.resize(m_span.length()-1);
if (--bp < 0) if (--bp < 0)
bp=0; bp = 0;
break; break;
default: default:
goto breakloop1; goto breakloop1;
@ -164,27 +164,27 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
} }
breakloop1: breakloop1:
spanemitted = true; spanemitted = true;
if (!emitterm(true, span, spanpos, bp-span.length(), bp)) if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
return false; return false;
} }
// Emit word if different from span and not 'no words' mode // Emit word if different from span and not 'no words' mode
if (!(m_flags & TXTS_ONLYSPANS) && wordLen && if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
(!spanemitted || wordLen != span.length())) { (!spanemitted || m_wordLen != m_span.length())) {
string s(span.substr(wordStart, wordLen)); string s(m_span.substr(m_wordStart, m_wordLen));
if (!emitterm(false, s, wordpos, bp-wordLen, bp)) if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
return false; return false;
} }
// Adjust state // Adjust state
wordpos++; m_wordpos++;
wordLen = 0; m_wordLen = 0;
if (spanerase) { if (spanerase) {
span.erase(); m_span.erase();
spanpos = wordpos; m_spanpos = m_wordpos;
wordStart = 0; m_wordStart = 0;
} else { } else {
wordStart = span.length(); m_wordStart = m_span.length();
} }
return true; return true;
@ -215,9 +215,9 @@ bool TextSplit::text_to_words(const string &in)
setcharclasses(); setcharclasses();
span.erase(); m_span.erase();
number = false; m_inNumber = false;
wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0; m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
Utf8Iter it(in); Utf8Iter it(in);
@ -231,21 +231,21 @@ bool TextSplit::text_to_words(const string &in)
int cc = whatcc(c); int cc = whatcc(c);
switch (cc) { switch (cc) {
case LETTER: case LETTER:
wordLen += it.appendchartostring(span); m_wordLen += it.appendchartostring(m_span);
break; break;
case DIGIT: case DIGIT:
if (wordLen == 0) if (m_wordLen == 0)
number = true; m_inNumber = true;
wordLen += it.appendchartostring(span); m_wordLen += it.appendchartostring(m_span);
break; break;
case SPACE: case SPACE:
SPACE: SPACE:
if (wordLen || span.length()) { if (m_wordLen || m_span.length()) {
if (!doemit(true, it.getBpos())) if (!doemit(true, it.getBpos()))
return false; return false;
number = false; m_inNumber = false;
} }
break; break;
case WILD: case WILD:
@ -256,27 +256,27 @@ bool TextSplit::text_to_words(const string &in)
break; break;
case '-': case '-':
case '+': case '+':
if (wordLen == 0) { if (m_wordLen == 0) {
if (whatcc(it[it.getCpos()+1]) == DIGIT) { if (whatcc(it[it.getCpos()+1]) == DIGIT) {
number = true; m_inNumber = true;
wordLen += it.appendchartostring(span); m_wordLen += it.appendchartostring(m_span);
} else { } else {
wordStart += it.appendchartostring(span); m_wordStart += it.appendchartostring(m_span);
} }
} else { } else {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; m_inNumber = false;
wordStart += it.appendchartostring(span); m_wordStart += it.appendchartostring(m_span);
} }
break; break;
case '.': case '.':
case ',': case ',':
if (number) { if (m_inNumber) {
// 132.jpg ? // 132.jpg ?
if (whatcc(it[it.getCpos()+1]) != DIGIT) if (whatcc(it[it.getCpos()+1]) != DIGIT)
goto SPACE; goto SPACE;
wordLen += it.appendchartostring(span); m_wordLen += it.appendchartostring(m_span);
break; break;
} else { } else {
// If . inside a word, keep it, else, this is whitespace. // If . inside a word, keep it, else, this is whitespace.
@ -286,16 +286,16 @@ bool TextSplit::text_to_words(const string &in)
// will be split as .x-errs, x, errs but not x-errs // will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit // A final comma in a word will be removed by doemit
if (cc == '.') { if (cc == '.') {
if (wordLen) { if (m_wordLen) {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
// span length could have been adjusted by trimming // span length could have been adjusted by trimming
// inside doemit // inside doemit
if (span.length()) if (m_span.length())
wordStart += it.appendchartostring(span); m_wordStart += it.appendchartostring(m_span);
break; break;
} else { } else {
wordStart += it.appendchartostring(span); m_wordStart += it.appendchartostring(m_span);
break; break;
} }
} }
@ -303,29 +303,29 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE; goto SPACE;
break; break;
case '@': case '@':
if (wordLen) { if (m_wordLen) {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; m_inNumber = false;
} }
wordStart += it.appendchartostring(span); m_wordStart += it.appendchartostring(m_span);
break; break;
case '\'': case '\'':
// If in word, potential span: o'brien, else, this is more // If in word, potential span: o'brien, else, this is more
// whitespace // whitespace
if (wordLen) { if (m_wordLen) {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; m_inNumber = false;
wordStart += it.appendchartostring(span); m_wordStart += it.appendchartostring(m_span);
} }
break; break;
case '#': case '#':
// Keep it only at end of word ... Special case for c# you see... // Keep it only at end of word ... Special case for c# you see...
if (wordLen > 0) { if (m_wordLen > 0) {
int w = whatcc(it[it.getCpos()+1]); int w = whatcc(it[it.getCpos()+1]);
if (w == SPACE || w == '\n' || w == '\r') { if (w == SPACE || w == '\n' || w == '\r') {
wordLen += it.appendchartostring(span); m_wordLen += it.appendchartostring(m_span);
break; break;
} }
} }
@ -333,7 +333,7 @@ bool TextSplit::text_to_words(const string &in)
break; break;
case '\n': case '\n':
case '\r': case '\r':
if (span.length() && span[span.length() - 1] == '-') { if (m_span.length() && m_span[m_span.length() - 1] == '-') {
// if '-' is the last char before end of line, just // if '-' is the last char before end of line, just
// ignore the line change. This is the right thing to // ignore the line change. This is the right thing to
// do almost always. We'd then need a way to check if // do almost always. We'd then need a way to check if
@ -349,11 +349,11 @@ bool TextSplit::text_to_words(const string &in)
default: default:
NORMALCHAR: NORMALCHAR:
wordLen += it.appendchartostring(span); m_wordLen += it.appendchartostring(m_span);
break; break;
} }
} }
if (wordLen || span.length()) { if (m_wordLen || m_span.length()) {
if (!doemit(true, it.getBpos())) if (!doemit(true, it.getBpos()))
return false; return false;
} }

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.17 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -44,13 +44,18 @@ public:
*/ */
class TextSplit { class TextSplit {
public: public:
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2, enum Flags {TXTS_NONE = 0,
TXTS_KEEPWILD = 4}; TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
TXTS_KEEPWILD = 4 // Handle wildcards as letters
};
/** /**
* Constructor: just store callback object * Constructor: just store callback object
*/ */
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE) TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {} : m_flags(flags), m_cb(t), m_maxWordLength(40), m_prevpos(-1) {}
/** /**
* Split text, emit words and positions. * Split text, emit words and positions.
*/ */
@ -61,25 +66,34 @@ public:
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS); static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
private: private:
Flags m_flags; Flags m_flags;
TextSplitCB *cb; TextSplitCB *m_cb;
int maxWordLength; int m_maxWordLength;
string span; // Current span. Might be jf.dockes@wanadoo.f // Current span. Might be jf.dockes@wanadoo.f
int wordStart; // Current word: no punctuation at all in there string m_span;
unsigned int wordLen;
bool number; // Current word: no punctuation at all in there
int wordpos; // Term position of current word int m_wordStart;
int spanpos; // Term position of current span unsigned int m_wordLen;
// Currently inside number
bool m_inNumber;
// Term position of current word and span
int m_wordpos;
int m_spanpos;
// It may happen that our cleanup would result in emitting the // It may happen that our cleanup would result in emitting the
// same term twice. We try to avoid this // same term twice. We try to avoid this
int prevpos; int m_prevpos;
unsigned int prevlen; unsigned int m_prevlen;
// This processes cjk text:
// bool cjk_to_words();
bool emitterm(bool isspan, string &term, int pos, int bs, int be); bool emitterm(bool isspan, string &term, int pos, int bs, int be);
bool doemit(bool spanerase, int bp); bool doemit(bool spanerase, int bp);
}; };