*** empty log message ***
This commit is contained in:
parent
8c9eb8c6d3
commit
3c78938565
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.17 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.18 2006-01-28 15:36:59 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -163,8 +163,7 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
* @param spanerase Set if the current span is at its end. Reset it.
|
* @param spanerase Set if the current span is at its end. Reset it.
|
||||||
* @param bp The current BYTE position in the stream
|
* @param bp The current BYTE position in the stream
|
||||||
*/
|
*/
|
||||||
inline bool TextSplit::doemit(string &word, int &wordpos, string &span,
|
inline bool TextSplit::doemit(bool spanerase, int bp)
|
||||||
int &spanpos, bool spanerase, int bp)
|
|
||||||
{
|
{
|
||||||
#if 0
|
#if 0
|
||||||
cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
|
cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
|
||||||
@ -216,12 +215,10 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
|
|
||||||
setcharclasses();
|
setcharclasses();
|
||||||
|
|
||||||
string span; // Current span. Might be jf.dockes@wanadoo.f
|
span.clear();
|
||||||
string word; // Current word: no punctuation at all in there
|
word.clear(); // Current word: no punctuation at all in there
|
||||||
bool number = false;
|
number = false;
|
||||||
int wordpos = 0; // Term position of current word
|
wordpos = spanpos = charpos = 0;
|
||||||
int spanpos = 0; // Term position of current span
|
|
||||||
int charpos = 0; // Character position
|
|
||||||
|
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
|
|
||||||
@ -236,7 +233,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
if (word.length() || span.length()) {
|
if (word.length() || span.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
@ -251,7 +248,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
} else
|
} else
|
||||||
span += it;
|
span += it;
|
||||||
} else {
|
} else {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += it;
|
span += it;
|
||||||
@ -265,34 +262,41 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// If . inside a word, keep it, else, this is whitespace.
|
// If . inside a word, keep it, else, this is whitespace.
|
||||||
|
// We also keep an initial '.' for catching .net, but this adds
|
||||||
|
// quite a few spurious terms !
|
||||||
|
// Another problem is that something like .x-errs
|
||||||
|
// will be split as .x-errs, x, errs but not x-errs
|
||||||
// A final comma in a word will be removed by doemit
|
// A final comma in a word will be removed by doemit
|
||||||
if (cc == '.' && word.length()) {
|
if (cc == '.') {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false,
|
if (word.length()) {
|
||||||
it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
// span length could have been adjusted by trimming
|
// span length could have been adjusted by trimming
|
||||||
// inside doemit
|
// inside doemit
|
||||||
if (span.length())
|
if (span.length())
|
||||||
|
span += it;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
span += it;
|
span += it;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
break;
|
break;
|
||||||
case '@':
|
case '@':
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
} else
|
}
|
||||||
word += it;
|
|
||||||
span += it;
|
span += it;
|
||||||
break;
|
break;
|
||||||
case '\'':
|
case '\'':
|
||||||
// If in word, potential span: o'brien, else, this is more
|
// If in word, potential span: o'brien, else, this is more
|
||||||
// whitespace
|
// whitespace
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += it;
|
span += it;
|
||||||
@ -337,7 +341,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (word.length() || span.length()) {
|
if (word.length() || span.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -374,7 +378,7 @@ class mySplitterCB : public TextSplitCB {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static string teststring1 =
|
static string teststring =
|
||||||
"Un bout de texte \nnormal. jfd@okyz.com \n"
|
"Un bout de texte \nnormal. jfd@okyz.com \n"
|
||||||
"Ceci. Est;Oui n@d @net .net t@v@c c# c++ -10 o'brien l'ami \n"
|
"Ceci. Est;Oui n@d @net .net t@v@c c# c++ -10 o'brien l'ami \n"
|
||||||
"a 134 +134 -14 -1.5 +1.5 1.54e10 a @^#$(#$(*) 1,2 1,2e30\n"
|
"a 134 +134 -14 -1.5 +1.5 1.54e10 a @^#$(#$(*) 1,2 1,2e30\n"
|
||||||
@ -384,8 +388,9 @@ static string teststring1 =
|
|||||||
"M9R F($AA;F1L:6YG\"0D)\"0D@(\" @(#4P, T)0W)A=&4)\"0D)\"2 @,C4P#0E3"
|
"M9R F($AA;F1L:6YG\"0D)\"0D@(\" @(#4P, T)0W)A=&4)\"0D)\"2 @,C4P#0E3"
|
||||||
" ,able,test-domain "
|
" ,able,test-domain "
|
||||||
" -wl,--export-dynamic "
|
" -wl,--export-dynamic "
|
||||||
|
" ~/.xsession-errors "
|
||||||
;
|
;
|
||||||
static string teststring = " -wl,--export-dynamic ";
|
static string teststring1 = " ~/.xsession-errors ";
|
||||||
|
|
||||||
static string thisprog;
|
static string thisprog;
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.9 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.10 2006-01-28 15:36:59 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -27,26 +27,36 @@ class TextSplitCB {
|
|||||||
* but 'ts much simpler this way...
|
* but 'ts much simpler this way...
|
||||||
*/
|
*/
|
||||||
class TextSplit {
|
class TextSplit {
|
||||||
bool fq; // for query: Are we splitting for query or index ?
|
|
||||||
// It may happen that our cleanup would result in emitting the
|
|
||||||
// same term twice. We try to avoid this
|
|
||||||
string prevterm;
|
|
||||||
int prevpos;
|
|
||||||
TextSplitCB *cb;
|
|
||||||
int maxWordLength;
|
|
||||||
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
|
|
||||||
bool doemit(string &word, int &wordpos, string &span, int &spanpos,
|
|
||||||
bool spanerase, int bp);
|
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Constructor: just store callback object
|
* Constructor: just store callback object
|
||||||
*/
|
*/
|
||||||
TextSplit(TextSplitCB *t, bool forquery = false)
|
TextSplit(TextSplitCB *t, bool forquery = false)
|
||||||
: fq(forquery), prevpos(-1), cb(t), maxWordLength(40) {}
|
: fq(forquery), cb(t), maxWordLength(40), prevpos(-1) {}
|
||||||
/**
|
/**
|
||||||
* Split text, emit words and positions.
|
* Split text, emit words and positions.
|
||||||
*/
|
*/
|
||||||
bool text_to_words(const std::string &in);
|
bool text_to_words(const std::string &in);
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool fq; // for query: Are we splitting for query or index ?
|
||||||
|
TextSplitCB *cb;
|
||||||
|
int maxWordLength;
|
||||||
|
|
||||||
|
string span; // Current span. Might be jf.dockes@wanadoo.f
|
||||||
|
string word; // Current word: no punctuation at all in there
|
||||||
|
bool number;
|
||||||
|
int wordpos; // Term position of current word
|
||||||
|
int spanpos; // Term position of current span
|
||||||
|
int charpos; // Character position
|
||||||
|
|
||||||
|
// It may happen that our cleanup would result in emitting the
|
||||||
|
// same term twice. We try to avoid this
|
||||||
|
int prevpos;
|
||||||
|
string prevterm;
|
||||||
|
|
||||||
|
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
|
||||||
|
bool doemit(bool spanerase, int bp);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user