only autophrase if query has several terms

This commit is contained in:
dockes 2006-12-08 07:11:17 +00:00
parent b363de63f0
commit 554f75c99c
3 changed files with 55 additions and 20 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.26 2006-11-20 11:17:53 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.27 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -349,6 +349,25 @@ bool TextSplit::text_to_words(const string &in)
return true; return true;
} }
// Word-counting callback used by the TextSplit utility functions: it
// disregards the content of the terms it is handed and only keeps a tally
// of how many times it was invoked.
class utSplitterCB : public TextSplitCB {
 public:
    // Start with an empty tally.
    utSplitterCB()
        : wcnt(0)
    {
    }

    // Invoked for each word detected while splitting text. The term and its
    // position/offset arguments are not used; the call is only counted.
    // Always returns true.
    bool takeword(const string &term, int pos, int bs, int be)
    {
        ++wcnt;
        return true;
    }

    // Number of takeword() calls received so far.
    int wcnt;
};
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
{
utSplitterCB cb;
TextSplit splitter(&cb, flgs);
splitter.text_to_words(s);
return cb.wcnt;
}
#else // TEST driver -> #else // TEST driver ->
#include <unistd.h> #include <unistd.h>
@ -371,7 +390,7 @@ class mySplitterCB : public TextSplitCB {
public: public:
mySplitterCB() : first(1), nooutput(false) {} mySplitterCB() : first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;} void setNoOut(bool val) {nooutput = val;}
bool takeword(const std::string &term, int pos, int bs, int be) { bool takeword(const string &term, int pos, int bs, int be) {
if (nooutput) if (nooutput)
return true; return true;
if (first) { if (first) {
@ -403,10 +422,11 @@ static string teststring1 = " nouvel-an ";
static string thisprog; static string thisprog;
static string usage = static string usage =
" textsplit [opts] [filename]\n" " textsplit [opts] [filename]\n"
" -S: no output\n" " -S: no output\n"
" -s: only spans\n" " -s: only spans\n"
" -w: only words\n" " -w: only words\n"
" -c: just count words\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n" " \n\n"
; ;
@ -422,6 +442,7 @@ static int op_flags;
#define OPT_s 0x1 #define OPT_s 0x1
#define OPT_w 0x2 #define OPT_w 0x2
#define OPT_S 0x4 #define OPT_S 0x4
#define OPT_c 0x8
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
@ -435,6 +456,7 @@ int main(int argc, char **argv)
Usage(); Usage();
while (**argv) while (**argv)
switch (*(*argv)++) { switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break;
case 's': op_flags |= OPT_s; break; case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break; case 'S': op_flags |= OPT_S; break;
case 'w': op_flags |= OPT_w; break; case 'w': op_flags |= OPT_w; break;
@ -455,9 +477,9 @@ int main(int argc, char **argv)
flags = TextSplit::TXTS_ONLYSPANS; flags = TextSplit::TXTS_ONLYSPANS;
else if (op_flags&OPT_w) else if (op_flags&OPT_w)
flags = TextSplit::TXTS_NOSPANS; flags = TextSplit::TXTS_NOSPANS;
TextSplit splitter(&cb, flags);
string data;
if (argc == 1) { if (argc == 1) {
string data;
const char *filename = *argv++; argc--; const char *filename = *argv++; argc--;
if (!strcmp(filename, "stdin")) { if (!strcmp(filename, "stdin")) {
char buf[1024]; char buf[1024];
@ -467,11 +489,16 @@ int main(int argc, char **argv)
} }
} else if (!file_to_string(filename, data)) } else if (!file_to_string(filename, data))
exit(1); exit(1);
splitter.text_to_words(data);
} else { } else {
cout << endl << teststring << endl << endl; cout << endl << teststring << endl << endl;
splitter.text_to_words(teststring); data = teststring;
} }
if (op_flags & OPT_c) {
int n = TextSplit::countWords(data, flags);
cout << n << " words" << endl;
} else {
TextSplit splitter(&cb, flags);
splitter.text_to_words(data);
}
} }
#endif // TEST #endif // TEST

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.14 2006-11-20 11:17:53 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.15 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -27,9 +27,9 @@ using std::string;
* Function class whose takeword method is called for every detected word while * splitting text. * Function class whose takeword method is called for every detected word while * splitting text.
*/ */
class TextSplitCB { class TextSplitCB {
public: public:
virtual ~TextSplitCB() {} virtual ~TextSplitCB() {}
virtual bool takeword(const std::string& term, virtual bool takeword(const string& term,
int pos, // term pos int pos, // term pos
int bts, // byte offset of first char in term int bts, // byte offset of first char in term
int bte // byte offset of first char after term int bte // byte offset of first char after term
@ -43,7 +43,7 @@ class TextSplitCB {
* but 'ts much simpler this way... * but 'ts much simpler this way...
*/ */
class TextSplit { class TextSplit {
public: public:
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2}; enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
/** /**
* Constructor: just store callback object * Constructor: just store callback object
@ -53,9 +53,13 @@ class TextSplit {
/** /**
* Split text, emit words and positions. * Split text, emit words and positions.
*/ */
bool text_to_words(const std::string &in); bool text_to_words(const string &in);
private: // Utility functions: these do not need the user to set up a callback
// etc.
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
private:
Flags m_flags; Flags m_flags;
TextSplitCB *cb; TextSplitCB *cb;
int maxWordLength; int maxWordLength;
@ -72,8 +76,10 @@ class TextSplit {
int prevpos; int prevpos;
unsigned int prevlen; unsigned int prevlen;
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be); bool emitterm(bool isspan, string &term, int pos, int bs, int be);
bool doemit(bool spanerase, int bp); bool doemit(bool spanerase, int bp);
}; };
#endif /* _TEXTSPLIT_H_INCLUDED_ */ #endif /* _TEXTSPLIT_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.14 2006-12-08 06:45:05 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.15 2006-12-08 07:11:17 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -33,6 +33,7 @@ static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.14 2006-12-08 06:45:05 dockes E
#include "searchdata.h" #include "searchdata.h"
#include "ssearch_w.h" #include "ssearch_w.h"
#include "refcntr.h" #include "refcntr.h"
#include "textsplit.h"
enum SSearchType {SST_ANY = 0, SST_ALL = 1, SST_FNM = 2}; enum SSearchType {SST_ANY = 0, SST_ALL = 1, SST_FNM = 2};
@ -82,7 +83,8 @@ void SSearch::startSimpleSearch()
SSearchType tp = (SSearchType)searchTypCMB->currentItem(); SSearchType tp = (SSearchType)searchTypCMB->currentItem();
if (prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) && if (prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) &&
u8.find_first_of("\"") == string::npos) { u8.find_first_of("\"") == string::npos &&
TextSplit::countWords(u8) > 1) {
sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
u8, 0)); u8, 0));
} }