From c9deaa2d31353b64a78c5436f995c71c8eae08b4 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 27 Jan 2015 15:53:39 +0100 Subject: [PATCH 1/4] none --- src/query/wasa.y | 453 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 453 insertions(+) create mode 100644 src/query/wasa.y diff --git a/src/query/wasa.y b/src/query/wasa.y new file mode 100644 index 00000000..54a01bdc --- /dev/null +++ b/src/query/wasa.y @@ -0,0 +1,453 @@ +%{ +#include + +#include +#include + +#include "searchdata.h" + +using namespace std; + +int yylex(void); +void yyerror(char const *); +void logwhere(const char *); +class Expression; +static void qualify(Rcl::SearchDataClauseDist *, const string &); + +string stemlang("english"); + +%} + +%union { + string *str; + Rcl::SearchDataClauseSimple *cl; + Rcl::SearchData *sd; +} + +%type qualquote +%type fieldexpr +%type term +%type orchain +%type query + +%left AND +%right OR + +%token EQUALS +%token CONTAINS +%token SMALLEREQ +%token SMALLER +%token GREATEREQ +%token GREATER + +%token WORD +%token QUOTED +%token QUALIFIERS + +%% + +query: fieldexpr +{ + Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, stemlang); + sd->addClause($1); + $$ = sd; + cerr << "q: fieldexpr" << endl; +} +| query fieldexpr +{ + cerr << "q: query fieldexpr" << endl; + $1->addClause($2); + $$ = $1; +} +| query AND fieldexpr +{ + cerr << "q: query AND fieldexpr" << endl; + $1->addClause($3); + $$ = $1; +} +| query AND orchain +{ + cerr << "q: query AND orchain"; + Rcl::SearchDataClauseSub *sub = + new Rcl::SearchDataClauseSub(RefCntr($1)); + $1->addClause(sub); + $$ = $1; +} +| query orchain +{ + cerr << "q: query orchain" << endl; + Rcl::SearchDataClauseSub *sub = + new Rcl::SearchDataClauseSub(RefCntr($1)); + $1->addClause(sub); + $$ = $1; +} +| orchain +{ + cerr << "q: orchain" << endl; + $$ = $1; +} +| '(' query ')' +{ + cerr << "( query )" << endl; + $$ = $2; +} +; + +orchain: +fieldexpr OR fieldexpr +{ + cerr << "orchain: fieldexpr[" 
<< $1->gettext() << "] OR fieldexpr[" << + $3->gettext() << "]" << endl; + Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, stemlang); + sd->addClause($1); + sd->addClause($3); + $$ = sd; +} +| orchain OR fieldexpr +{ + cerr << "orchain: orchain OR fieldexpr[" << $3->gettext() << "]" << endl; + $1->addClause($3); + $$ = $1; +} +; + +fieldexpr: term +{ + //cerr << "simple fieldexpr: " << $1->gettext() << endl; + $$ = $1; +} +| WORD EQUALS term +{ + //cerr << *$1 << " = " << $3->gettext() << endl; + $3->setfield(*$1); + $3->setrel(Rcl::SearchDataClause::REL_EQUALS); + $$ = $3; +} +| WORD CONTAINS term +{ + //cerr << *$1 << " : " << $3->gettext() << endl; + $3->setfield(*$1); + $3->setrel(Rcl::SearchDataClause::REL_CONTAINS); + $$ = $3; +} +| WORD SMALLER term +{ + //cerr << *$1 << " < " << $3->gettext() << endl; + $3->setfield(*$1); + $3->setrel(Rcl::SearchDataClause::REL_LT); + $$ = $3; +} +| WORD SMALLEREQ term +{ + //cerr << *$1 << " <= " << $3->gettext() << endl; + $3->setfield(*$1); + $3->setrel(Rcl::SearchDataClause::REL_LTE); + $$ = $3; +} +| WORD GREATER term +{ + //cerr << *$1 << " > " << $3->gettext() << endl; + $3->setfield(*$1); + $3->setrel(Rcl::SearchDataClause::REL_GT); + $$ = $3; +} +| WORD GREATEREQ term +{ + //cerr << *$1 << " >= " << $3->gettext() << endl; + $3->setfield(*$1); + $3->setrel(Rcl::SearchDataClause::REL_GTE); + $$ = $3; +} +| '-' fieldexpr +{ + //cerr << "- fieldexpr[" << $2->gettext() << "]" << endl; + $2->setexclude(true); + $$ = $2; +} +; + +term: WORD +{ + //cerr << "term[" << *$1 << "]" << endl; + $$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1); +} +| qualquote +{ + $$ = $1; +} +; + +qualquote: QUOTED +{ + cerr << "QUOTED[" << *$1 << "]" << endl; + $$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0); +} +| QUOTED QUALIFIERS +{ + cerr << "QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl; + Rcl::SearchDataClauseDist *cl = + new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0); + qualify(cl, *$2); 
+ $$ = cl; +} +; + + +%% + +#include +#include + +void yyerror (char const *s) +{ + cerr << s << endl; +} + +void logwhere(const char *s) +{ + cerr << s << endl; +} + +// Look for int at index, skip and return new index found? value. +static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval) +{ + unsigned int ncur = cur; + if (cur < q.size() - 1) { + char *endptr; + int val = strtol(&q[cur + 1], &endptr, 10); + if (endptr != &q[cur + 1]) { + ncur += endptr - &q[cur + 1]; + *pval = val; + } + } + return ncur; +} + +static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals) +{ + cerr << "qualify(" << cl << ", " << quals << ")" << endl; + for (unsigned int i = 0; i < quals.length(); i++) { + //fprintf(stderr, "qual char %c\n", quals[i]); + switch (quals[i]) { + case 'b': + cl->setWeight(10.0); + break; + case 'c': break; + case 'C': + cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS); + break; + case 'd': break; + case 'D': + cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS); + break; + case 'e': + cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS); + cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS); + cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); + break; + case 'l': + cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); + break; + case 'L': break; + case 'o': + { + int slack = 10; + i = qualGetInt(quals, i, &slack); + cl->setslack(slack); + //cerr << "set slack " << cl->getslack() << " done" << endl; + } + break; + case 'p': + cl->setTp(Rcl::SCLT_NEAR); + if (cl->getslack() == 0) { + cl->setslack(10); + //cerr << "set slack " << cl->getslack() << " done" << endl; + } + break; + case '.':case '0':case '1':case '2':case '3':case '4': + case '5':case '6':case '7':case '8':case '9': + { + int n = 0; + float factor = 1.0; + if (sscanf(&(quals[i]), "%f %n", &factor, &n)) { + if (factor != 1.0) { + cl->setWeight(factor); + } + } + if (n > 0) + i += n - 1; + } + default: + break; + } + } +} + + +static stack 
returns; +static string input; +static unsigned int index; + +int GETCHAR() +{ + if (!returns.empty()) { + int c = returns.top(); + returns.pop(); + return c; + } + if (index < input.size()) + return input[index++]; + return 0; +} +static void UNGETCHAR(int c) +{ + returns.push(c); +} + +// Simpler to let the quoted string reader store qualifiers in there, +// because their nature is determined by the absence of white space +// after the closing dquote. e.g "some term"abc. We could avoid this +// by making white space a token +static string qualifiers; + +// specialstartchars are special only at the beginning of a token +// (e.g. doctor-who is a term, not 2 terms separated by '-') +static string specialstartchars("-"); +// specialinchars are special everywhere except inside a quoted string +static string specialinchars(":=<>()"); +static string whites(" \t\n\r"); + +// Called with the first dquote already read +static int parseString() +{ + string* value = new string(); + qualifiers.clear(); + int c; + while ((c = GETCHAR())) { + switch (c) { + case '\\': + /* Escape: get next char */ + c = GETCHAR(); + if (c == 0) { + value->push_back(c); + goto out; + } + value->push_back(c); + break; + case '"': + /* End of string. Look for qualifiers */ + while ((c = GETCHAR()) && whites.find_first_of(c) == string::npos) + qualifiers.push_back(c); + goto out; + default: + value->push_back(c); + } + } +out: + //cerr << "GOT QUOTED ["<swap(qualifiers); + return QUALIFIERS; + } + + int c; + + /* Skip white space. 
*/ + while ((c = GETCHAR ()) && whites.find_first_of(c) != string::npos) + continue; + + if (c == 0) + return 0; + + if (specialstartchars.find_first_of(c) != string::npos) { + //cerr << "yylex: return " << c << endl; + return c; + } + + // field-term relations + switch (c) { + case '=': return EQUALS; + case ':': return CONTAINS; + case '<': { + int c1 = GETCHAR(); + if (c1 == '=') { + return SMALLEREQ; + } else { + UNGETCHAR(c); + return SMALLER; + } + } + case '>': { + int c1 = GETCHAR(); + if (c1 == '=') { + return GREATEREQ; + } else { + UNGETCHAR(c); + return GREATER; + } + } + case '(': case ')': + return c; + } + + if (c == '"') + return parseString(); + + UNGETCHAR(c); + + // Other chars start a term or field name or reserved word + string* word = new string(); + while ((c = GETCHAR())) { + if (whites.find_first_of(c) != string::npos) { + //cerr << "Word broken by whitespace" << endl; + break; + } else if (specialinchars.find_first_of(c) != string::npos) { + //cerr << "Word broken by special char" << endl; + UNGETCHAR(c); + break; + } else if (c == 0) { + //cerr << "Word broken by EOF" << endl; + break; + } else { + word->push_back(c); + } + } + + if (!word->compare("AND") || !word->compare("&&")) { + delete word; + return AND; + } else if (!word->compare("OR") || !word->compare("||")) { + delete word; + return OR; + } + +// cerr << "Got word [" << word << "]" << endl; + yylval.str = word; + return WORD; +} + +int main (int argc, const char *argv[]) +{ + argc--;argv++; + if (argc == 0) + return 1; + while (argc--) { + input += *argv++; + input += " "; + } + + index = 0; + returns = stack(); + + return yyparse(); +} From c01f4c5a9b2c11cf05534610d98f8569783a57ee Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 27 Jan 2015 19:15:41 +0100 Subject: [PATCH 2/4] ckpt --- src/query/{wasa.y => wasaparse.y} | 128 ++++++++++++++++++------------ 1 file changed, 77 insertions(+), 51 deletions(-) rename src/query/{wasa.y => wasaparse.y} (80%) diff --git 
a/src/query/wasa.y b/src/query/wasaparse.y similarity index 80% rename from src/query/wasa.y rename to src/query/wasaparse.y index 54a01bdc..e3701b0d 100644 --- a/src/query/wasa.y +++ b/src/query/wasaparse.y @@ -5,10 +5,12 @@ #include #include "searchdata.h" +#include "wasaparse.h" +#include "wasaparse.tab.h" using namespace std; -int yylex(void); +int yylex(yy::parser::semantic_type *); void yyerror(char const *); void logwhere(const char *); class Expression; @@ -16,8 +18,17 @@ static void qualify(Rcl::SearchDataClauseDist *, const string &); string stemlang("english"); +static void addSubQuery(Rcl::SearchData *sd, Rcl::SearchData *sq) +{ + sd->addClause(new Rcl::SearchDataClauseSub(RefCntr(sq))); +} + +static Rcl::SearchData *g_result; %} +%skeleton "lalr1.cc" +%defines + %union { string *str; Rcl::SearchDataClauseSimple *cl; @@ -48,48 +59,53 @@ string stemlang("english"); query: fieldexpr { + cerr << "q: fieldexpr" << endl; Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, stemlang); sd->addClause($1); $$ = sd; - cerr << "q: fieldexpr" << endl; -} + g_result = sd; + } | query fieldexpr { cerr << "q: query fieldexpr" << endl; $1->addClause($2); $$ = $1; + g_result = $$; } | query AND fieldexpr { cerr << "q: query AND fieldexpr" << endl; $1->addClause($3); $$ = $1; + g_result = $$; } | query AND orchain { cerr << "q: query AND orchain"; - Rcl::SearchDataClauseSub *sub = - new Rcl::SearchDataClauseSub(RefCntr($1)); - $1->addClause(sub); + addSubQuery($1, $3); $$ = $1; + g_result = $$; } | query orchain { cerr << "q: query orchain" << endl; - Rcl::SearchDataClauseSub *sub = - new Rcl::SearchDataClauseSub(RefCntr($1)); - $1->addClause(sub); + addSubQuery($1, $2); $$ = $1; + g_result = $$; } | orchain { cerr << "q: orchain" << endl; - $$ = $1; + Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, stemlang); + addSubQuery(sd, $1); + $$ = sd; + g_result = $$; } | '(' query ')' { cerr << "( query )" << endl; $$ = $2; + g_result = $$; } ; @@ -284,24 +300,24 
@@ static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals) } -static stack returns; -static string input; -static unsigned int index; +static stack g_returns; +static string g_input; +static unsigned int g_index; int GETCHAR() { - if (!returns.empty()) { - int c = returns.top(); - returns.pop(); + if (!g_returns.empty()) { + int c = g_returns.top(); + g_returns.pop(); return c; } - if (index < input.size()) - return input[index++]; + if (g_index < g_input.size()) + return g_input[g_index++]; return 0; } static void UNGETCHAR(int c) { - returns.push(c); + g_returns.push(c); } // Simpler to let the quoted string reader store qualifiers in there, @@ -318,7 +334,7 @@ static string specialinchars(":=<>()"); static string whites(" \t\n\r"); // Called with the first dquote already read -static int parseString() +static int parseString(yy::parser::semantic_type *yylval) { string* value = new string(); qualifiers.clear(); @@ -345,19 +361,19 @@ static int parseString() } out: //cerr << "GOT QUOTED ["<str = value; + return yy::parser::token::QUOTED; } -int yylex(void) +int yylex(yy::parser::semantic_type *yylval) { -// cerr << "yylex: input [" << input.substr(index) << "]" << endl; + //cerr << "yylex: input [" << g_input.substr(g_index) << "]" << endl; if (!qualifiers.empty()) { - yylval.str = new string(); - yylval.str->swap(qualifiers); - return QUALIFIERS; + yylval->str = new string(); + yylval->str->swap(qualifiers); + return yy::parser::token::QUALIFIERS; } int c; @@ -376,24 +392,24 @@ int yylex(void) // field-term relations switch (c) { - case '=': return EQUALS; - case ':': return CONTAINS; + case '=': return yy::parser::token::EQUALS; + case ':': return yy::parser::token::CONTAINS; case '<': { int c1 = GETCHAR(); if (c1 == '=') { - return SMALLEREQ; + return yy::parser::token::SMALLEREQ; } else { UNGETCHAR(c); - return SMALLER; + return yy::parser::token::SMALLER; } } case '>': { int c1 = GETCHAR(); if (c1 == '=') { - return GREATEREQ; + return 
yy::parser::token::GREATEREQ; } else { UNGETCHAR(c); - return GREATER; + return yy::parser::token::GREATER; } } case '(': case ')': @@ -401,7 +417,7 @@ int yylex(void) } if (c == '"') - return parseString(); + return parseString(yylval); UNGETCHAR(c); @@ -425,29 +441,39 @@ int yylex(void) if (!word->compare("AND") || !word->compare("&&")) { delete word; - return AND; + return yy::parser::token::AND; } else if (!word->compare("OR") || !word->compare("||")) { delete word; - return OR; + return yy::parser::token::OR; } // cerr << "Got word [" << word << "]" << endl; - yylval.str = word; - return WORD; + yylval->str = word; + return yy::parser::token::WORD; } -int main (int argc, const char *argv[]) +void yy::parser::error(location_type const&, string const& m) { - argc--;argv++; - if (argc == 0) - return 1; - while (argc--) { - input += *argv++; - input += " "; - } - - index = 0; - returns = stack(); - - return yyparse(); + cerr << m << endl; +} + +Rcl::SearchData *wasaparse(const string& in) +{ + cerr << "wasaparse(" << in << ")" << endl; + + g_index = 0; + g_returns = stack(); + g_input = in; + delete g_result; + g_result = 0; + + yy::parser parser; + if (parser.parse() != 0) { + // Error + cerr << "Parse failed" << endl; + delete g_result; + g_result = 0; + } + cerr << "wasaparse: returning " << g_result << endl; + return g_result; } From b3e2e9d5dd097e19339a2f2f3508db066b26c4ed Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 28 Jan 2015 11:21:03 +0100 Subject: [PATCH 3/4] backout --- src/query/wasaparse.y | 479 ------------------------------------------ 1 file changed, 479 deletions(-) delete mode 100644 src/query/wasaparse.y diff --git a/src/query/wasaparse.y b/src/query/wasaparse.y deleted file mode 100644 index e3701b0d..00000000 --- a/src/query/wasaparse.y +++ /dev/null @@ -1,479 +0,0 @@ -%{ -#include - -#include -#include - -#include "searchdata.h" -#include "wasaparse.h" -#include "wasaparse.tab.h" - -using namespace std; - -int 
yylex(yy::parser::semantic_type *); -void yyerror(char const *); -void logwhere(const char *); -class Expression; -static void qualify(Rcl::SearchDataClauseDist *, const string &); - -string stemlang("english"); - -static void addSubQuery(Rcl::SearchData *sd, Rcl::SearchData *sq) -{ - sd->addClause(new Rcl::SearchDataClauseSub(RefCntr(sq))); -} - -static Rcl::SearchData *g_result; -%} - -%skeleton "lalr1.cc" -%defines - -%union { - string *str; - Rcl::SearchDataClauseSimple *cl; - Rcl::SearchData *sd; -} - -%type qualquote -%type fieldexpr -%type term -%type orchain -%type query - -%left AND -%right OR - -%token EQUALS -%token CONTAINS -%token SMALLEREQ -%token SMALLER -%token GREATEREQ -%token GREATER - -%token WORD -%token QUOTED -%token QUALIFIERS - -%% - -query: fieldexpr -{ - cerr << "q: fieldexpr" << endl; - Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, stemlang); - sd->addClause($1); - $$ = sd; - g_result = sd; - } -| query fieldexpr -{ - cerr << "q: query fieldexpr" << endl; - $1->addClause($2); - $$ = $1; - g_result = $$; -} -| query AND fieldexpr -{ - cerr << "q: query AND fieldexpr" << endl; - $1->addClause($3); - $$ = $1; - g_result = $$; -} -| query AND orchain -{ - cerr << "q: query AND orchain"; - addSubQuery($1, $3); - $$ = $1; - g_result = $$; -} -| query orchain -{ - cerr << "q: query orchain" << endl; - addSubQuery($1, $2); - $$ = $1; - g_result = $$; -} -| orchain -{ - cerr << "q: orchain" << endl; - Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, stemlang); - addSubQuery(sd, $1); - $$ = sd; - g_result = $$; -} -| '(' query ')' -{ - cerr << "( query )" << endl; - $$ = $2; - g_result = $$; -} -; - -orchain: -fieldexpr OR fieldexpr -{ - cerr << "orchain: fieldexpr[" << $1->gettext() << "] OR fieldexpr[" << - $3->gettext() << "]" << endl; - Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, stemlang); - sd->addClause($1); - sd->addClause($3); - $$ = sd; -} -| orchain OR fieldexpr -{ - cerr << "orchain: orchain OR 
fieldexpr[" << $3->gettext() << "]" << endl; - $1->addClause($3); - $$ = $1; -} -; - -fieldexpr: term -{ - //cerr << "simple fieldexpr: " << $1->gettext() << endl; - $$ = $1; -} -| WORD EQUALS term -{ - //cerr << *$1 << " = " << $3->gettext() << endl; - $3->setfield(*$1); - $3->setrel(Rcl::SearchDataClause::REL_EQUALS); - $$ = $3; -} -| WORD CONTAINS term -{ - //cerr << *$1 << " : " << $3->gettext() << endl; - $3->setfield(*$1); - $3->setrel(Rcl::SearchDataClause::REL_CONTAINS); - $$ = $3; -} -| WORD SMALLER term -{ - //cerr << *$1 << " < " << $3->gettext() << endl; - $3->setfield(*$1); - $3->setrel(Rcl::SearchDataClause::REL_LT); - $$ = $3; -} -| WORD SMALLEREQ term -{ - //cerr << *$1 << " <= " << $3->gettext() << endl; - $3->setfield(*$1); - $3->setrel(Rcl::SearchDataClause::REL_LTE); - $$ = $3; -} -| WORD GREATER term -{ - //cerr << *$1 << " > " << $3->gettext() << endl; - $3->setfield(*$1); - $3->setrel(Rcl::SearchDataClause::REL_GT); - $$ = $3; -} -| WORD GREATEREQ term -{ - //cerr << *$1 << " >= " << $3->gettext() << endl; - $3->setfield(*$1); - $3->setrel(Rcl::SearchDataClause::REL_GTE); - $$ = $3; -} -| '-' fieldexpr -{ - //cerr << "- fieldexpr[" << $2->gettext() << "]" << endl; - $2->setexclude(true); - $$ = $2; -} -; - -term: WORD -{ - //cerr << "term[" << *$1 << "]" << endl; - $$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1); -} -| qualquote -{ - $$ = $1; -} -; - -qualquote: QUOTED -{ - cerr << "QUOTED[" << *$1 << "]" << endl; - $$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0); -} -| QUOTED QUALIFIERS -{ - cerr << "QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl; - Rcl::SearchDataClauseDist *cl = - new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0); - qualify(cl, *$2); - $$ = cl; -} -; - - -%% - -#include -#include - -void yyerror (char const *s) -{ - cerr << s << endl; -} - -void logwhere(const char *s) -{ - cerr << s << endl; -} - -// Look for int at index, skip and return new index found? value. 
-static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval) -{ - unsigned int ncur = cur; - if (cur < q.size() - 1) { - char *endptr; - int val = strtol(&q[cur + 1], &endptr, 10); - if (endptr != &q[cur + 1]) { - ncur += endptr - &q[cur + 1]; - *pval = val; - } - } - return ncur; -} - -static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals) -{ - cerr << "qualify(" << cl << ", " << quals << ")" << endl; - for (unsigned int i = 0; i < quals.length(); i++) { - //fprintf(stderr, "qual char %c\n", quals[i]); - switch (quals[i]) { - case 'b': - cl->setWeight(10.0); - break; - case 'c': break; - case 'C': - cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS); - break; - case 'd': break; - case 'D': - cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS); - break; - case 'e': - cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS); - cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS); - cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); - break; - case 'l': - cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); - break; - case 'L': break; - case 'o': - { - int slack = 10; - i = qualGetInt(quals, i, &slack); - cl->setslack(slack); - //cerr << "set slack " << cl->getslack() << " done" << endl; - } - break; - case 'p': - cl->setTp(Rcl::SCLT_NEAR); - if (cl->getslack() == 0) { - cl->setslack(10); - //cerr << "set slack " << cl->getslack() << " done" << endl; - } - break; - case '.':case '0':case '1':case '2':case '3':case '4': - case '5':case '6':case '7':case '8':case '9': - { - int n = 0; - float factor = 1.0; - if (sscanf(&(quals[i]), "%f %n", &factor, &n)) { - if (factor != 1.0) { - cl->setWeight(factor); - } - } - if (n > 0) - i += n - 1; - } - default: - break; - } - } -} - - -static stack g_returns; -static string g_input; -static unsigned int g_index; - -int GETCHAR() -{ - if (!g_returns.empty()) { - int c = g_returns.top(); - g_returns.pop(); - return c; - } - if (g_index < g_input.size()) - return g_input[g_index++]; - 
return 0; -} -static void UNGETCHAR(int c) -{ - g_returns.push(c); -} - -// Simpler to let the quoted string reader store qualifiers in there, -// because their nature is determined by the absence of white space -// after the closing dquote. e.g "some term"abc. We could avoid this -// by making white space a token -static string qualifiers; - -// specialstartchars are special only at the beginning of a token -// (e.g. doctor-who is a term, not 2 terms separated by '-') -static string specialstartchars("-"); -// specialinchars are special everywhere except inside a quoted string -static string specialinchars(":=<>()"); -static string whites(" \t\n\r"); - -// Called with the first dquote already read -static int parseString(yy::parser::semantic_type *yylval) -{ - string* value = new string(); - qualifiers.clear(); - int c; - while ((c = GETCHAR())) { - switch (c) { - case '\\': - /* Escape: get next char */ - c = GETCHAR(); - if (c == 0) { - value->push_back(c); - goto out; - } - value->push_back(c); - break; - case '"': - /* End of string. Look for qualifiers */ - while ((c = GETCHAR()) && whites.find_first_of(c) == string::npos) - qualifiers.push_back(c); - goto out; - default: - value->push_back(c); - } - } -out: - //cerr << "GOT QUOTED ["<str = value; - return yy::parser::token::QUOTED; -} - - -int yylex(yy::parser::semantic_type *yylval) -{ - //cerr << "yylex: input [" << g_input.substr(g_index) << "]" << endl; - - if (!qualifiers.empty()) { - yylval->str = new string(); - yylval->str->swap(qualifiers); - return yy::parser::token::QUALIFIERS; - } - - int c; - - /* Skip white space. 
*/ - while ((c = GETCHAR ()) && whites.find_first_of(c) != string::npos) - continue; - - if (c == 0) - return 0; - - if (specialstartchars.find_first_of(c) != string::npos) { - //cerr << "yylex: return " << c << endl; - return c; - } - - // field-term relations - switch (c) { - case '=': return yy::parser::token::EQUALS; - case ':': return yy::parser::token::CONTAINS; - case '<': { - int c1 = GETCHAR(); - if (c1 == '=') { - return yy::parser::token::SMALLEREQ; - } else { - UNGETCHAR(c); - return yy::parser::token::SMALLER; - } - } - case '>': { - int c1 = GETCHAR(); - if (c1 == '=') { - return yy::parser::token::GREATEREQ; - } else { - UNGETCHAR(c); - return yy::parser::token::GREATER; - } - } - case '(': case ')': - return c; - } - - if (c == '"') - return parseString(yylval); - - UNGETCHAR(c); - - // Other chars start a term or field name or reserved word - string* word = new string(); - while ((c = GETCHAR())) { - if (whites.find_first_of(c) != string::npos) { - //cerr << "Word broken by whitespace" << endl; - break; - } else if (specialinchars.find_first_of(c) != string::npos) { - //cerr << "Word broken by special char" << endl; - UNGETCHAR(c); - break; - } else if (c == 0) { - //cerr << "Word broken by EOF" << endl; - break; - } else { - word->push_back(c); - } - } - - if (!word->compare("AND") || !word->compare("&&")) { - delete word; - return yy::parser::token::AND; - } else if (!word->compare("OR") || !word->compare("||")) { - delete word; - return yy::parser::token::OR; - } - -// cerr << "Got word [" << word << "]" << endl; - yylval->str = word; - return yy::parser::token::WORD; -} - -void yy::parser::error(location_type const&, string const& m) -{ - cerr << m << endl; -} - -Rcl::SearchData *wasaparse(const string& in) -{ - cerr << "wasaparse(" << in << ")" << endl; - - g_index = 0; - g_returns = stack(); - g_input = in; - delete g_result; - g_result = 0; - - yy::parser parser; - if (parser.parse() != 0) { - // Error - cerr << "Parse failed" << endl; - 
delete g_result; - g_result = 0; - } - cerr << "wasaparse: returning " << g_result << endl; - return g_result; -} From 31de532e0e154d2bda67e1e3e2f94e6e63672ff2 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 28 Jan 2015 11:22:01 +0100 Subject: [PATCH 4/4] release 3790 --- src/INSTALL | 11 +++--- src/README | 111 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 72 insertions(+), 50 deletions(-) diff --git a/src/INSTALL b/src/INSTALL index 3af07ae8..d37f6f00 100644 --- a/src/INSTALL +++ b/src/INSTALL @@ -103,8 +103,8 @@ Chapter 5. Installation and configuration o Openoffice files need unzip and xsltproc. - o PDF files need pdftotext which is part of the Xpdf or Poppler - packages. + o PDF files need pdftotext which is part of Poppler (usually comes with + the poppler-utils package). Avoid the original one from Xpdf. o Postscript files need pstotext. The original version has an issue with shell character in file names, which is corrected in recent packages. @@ -121,9 +121,10 @@ Chapter 5. Installation and configuration o Wordperfect files need wpd2html from the libwpd (or libwpd-tools on Ubuntu) package. - o RTF files need unrtf, which, in its standard version, has much trouble - with non-western character sets. Check - http://www.recoll.org/features.html. + o RTF files need unrtf, which, in its older versions, has much trouble + with non-western character sets. Many Linux distributions carry + outdated unrtf versions. Check http://www.recoll.org/features.html for + details. o TeX files need untex or detex. Check http://www.recoll.org/features.html for sources if it's not packaged diff --git a/src/README b/src/README index 34f9f8bd..753301ff 100644 --- a/src/README +++ b/src/README @@ -197,15 +197,17 @@ Chapter 1. Introduction 1.1. Giving it a try - If you do not like reading manuals (who does?) 
and would like to give - Recoll a try, just install the application and start the recoll graphical - user interface (GUI), which will ask to index your home directory by + If you do not like reading manuals (who does?) but wish to give Recoll a + try, just install the application and start the recoll graphical user + interface (GUI), which will ask permission to index your home directory by default, allowing you to search immediately after indexing completes. Do not do this if your home directory contains a huge number of documents and you do not want to wait or are very short on disk space. In this case, you may first want to customize the configuration to restrict the indexed - area. + area (for the very impatient with a completed package install, from the + recoll GUI: Preferences -> Indexing configuration, then adjust the Top + directories section). Also be aware that you may need to install the appropriate supporting applications for document types that need them (for example antiword for @@ -213,49 +215,58 @@ Chapter 1. Introduction 1.2. Full text search - Recoll is a full text search application. Full text search applications - let you find your data by content rather than by external attributes (like - a file name). More specifically, they will let you specify words (terms) - that should or should not appear in the text you are looking for, and - return a list of matching documents, ordered so that the most relevant - documents will appear first. + Recoll is a full text search application. Full text search finds your data + by content rather than by external attributes (like a file name). You + specify words (terms) which should or should not appear in the text you + are looking for, and receive in return a list of matching documents, + ordered so that the most relevant documents will appear first. You do not need to remember in what file or email message you stored a given piece of information. 
You just ask for related terms, and the tool will return a list of documents where these terms are prominent, in a similar way to Internet search engines. - A search application tries to determine which documents are most relevant - to the search terms you provide. Computer algorithms for determining - relevance can be very complex, and in general are inferior to the power of - the human mind to rapidly determine relevance. The quality of relevance - guessing is probably the most important aspect when evaluating a search - application. + Full text search applications try to determine which documents are most + relevant to the search terms you provide. Computer algorithms for + determining relevance can be very complex, and in general are inferior to + the power of the human mind to rapidly determine relevance. The quality of + relevance guessing is probably the most important aspect when evaluating a + search application. - In many cases, you are looking for all the forms of a word, not for a - specific form or spelling. These different forms may include plurals, - different tenses for a verb, or terms derived from the same root or stem - (example: floor, floors, floored, flooring...). Search applications - usually expand queries to all such related terms (words that reduce to the - same stem) and also provide a way to disable this expansion if you are - actually searching for a specific form. + In many cases, you are looking for all the forms of a word, including + plurals, different tenses for a verb, or terms derived from the same root + or stem (example: floor, floors, floored, flooring...). Queries are + usually automatically expanded to all such related terms (words that + reduce to the same stem). This can be prevented for searching for a + specific form. Stemming, by itself, does not accommodate for misspellings or phonetic - searches. 
Recoll supports these features through a specific tool (the term - explorer) which will let you explore the set of index terms along - different modes. + searches. A full text search application may also support this form of + approximation. For example, a search for aliterattion returning no result + may propose, depending on index contents, alliteration alteration + alterations altercation as possible replacement terms. 1.3. Recoll overview Recoll uses the Xapian information retrieval library as its storage and retrieval engine. Xapian is a very mature package using a sophisticated - probabilistic ranking model. Recoll provides the mechanisms and interface - to get data into and out of the system. + probabilistic ranking model. - In practice, Xapian works by remembering where terms appear in your - document files. The acquisition process is called indexing. + The Xapian library manages an index database which describes where terms + appear in your document files. It efficiently processes the complex + queries which are produced by the Recoll query expansion mechanism, and is + in charge of the all-important relevance computation task. - The resulting index can be big (roughly the size of the original document + Recoll provides the mechanisms and interface to get data into and out of + the index. This includes translating the many possible document formats + into pure text, handling term variations (using Xapian stemmers), and + spelling approximations (using the aspell speller), interpreting user + queries and presenting results. + + In a shorter way, Recoll does the dirty footwork, Xapian deals with the + intelligent parts of the process. + + The Xapian index can be big (roughly the size of the original document set), but it is not a document archive. Recoll can only display documents that still exist at the place from which they were indexed. (Actually, there is a way to reconstruct a document from the information in the @@ -263,8 +274,10 @@ Chapter 1. 
Introduction capitalization are lost). Recoll stores all internal data in Unicode UTF-8 format, and it can index - files with different character sets, encodings, and languages into the - same index. It has can process many document types. + files of many types with different character sets, encodings, and + languages into the same index. It can process documents embedded inside + other documents (for example a pdf document stored inside a Zip archive + sent as an email attachment...), down to an arbitrary depth. Stemming is the process by which Recoll reduces words to their radicals so that searching does not depend, for example, on a word being singular or @@ -318,13 +331,15 @@ Chapter 1. Introduction The indexing process is started automatically the first time you execute the recoll GUI. Indexing can also be performed by executing the - recollindex command. + recollindex command. Recoll indexing is multithreaded by default when + appropriate hardware resources are available, and can perform in parallel + multiple tasks among text extraction, segmentation and index updates. Searches are usually performed inside the recoll GUI, which has many options to help you find what you are looking for. However, there are other ways to perform Recoll searches: mostly a command line interface, a - Python programming interface, a KDE KIO slave module, and a Ubuntu Unity - Lens module. + Python programming interface, a KDE KIO slave module, and Ubuntu Unity + Lens (for older versions) or Scope (for current versions) modules. Chapter 2. Indexing @@ -332,10 +347,10 @@ Chapter 2. Indexing Indexing is the process by which the set of documents is analyzed and the data entered into the database. Recoll indexing is normally incremental: - documents will only be processed if they have been modified. On the first - execution, all documents will need processing. A full index build can be - forced later by specifying an option to the indexing command (recollindex - -z or -Z). 
+ documents will only be processed if they have been modified since the last + run. On the first execution, all documents will need processing. A full + index build can be forced later by specifying an option to the indexing + command (recollindex -z or -Z). The following sections give an overview of different aspects of the indexing processes and configuration, with links to detailed sections. @@ -1463,6 +1478,11 @@ Chapter 3. Searching cases where the exact search term is not known. For example, you may not remember the exact spelling, or only know the beginning of the name. + The search will only propose replacement terms with spelling variations + when no matching documents were found. In some cases, both proper spellings + and misspellings are present in the index, and it may be interesting to + look for them explicitly. + The term explorer tool (started from the toolbar icon or from the Term explorer entry of the Tools menu) can be used to search the full index terms list. It has three modes of operations: @@ -3302,8 +3322,8 @@ Chapter 5. Installation and configuration o Openoffice files need unzip and xsltproc. - o PDF files need pdftotext which is part of the Xpdf or Poppler - packages. + o PDF files need pdftotext which is part of Poppler (usually comes with + the poppler-utils package). Avoid the original one from Xpdf. o Postscript files need pstotext. The original version has an issue with shell character in file names, which is corrected in recent packages. @@ -3320,9 +3340,10 @@ Chapter 5. Installation and configuration o Wordperfect files need wpd2html from the libwpd (or libwpd-tools on Ubuntu) package. - o RTF files need unrtf, which, in its standard version, has much trouble - with non-western character sets. Check - http://www.recoll.org/features.html. + o RTF files need unrtf, which, in its older versions, has much trouble + with non-western character sets. Many Linux distributions carry + outdated unrtf versions.
Check http://www.recoll.org/features.html for + details. o TeX files need untex or detex. Check http://www.recoll.org/features.html for sources if it's not packaged