From 10b6069f0de80539282e124122d1ffb594dfed7b Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 12 Nov 2019 09:30:29 +0100 Subject: [PATCH] Add recollq and Python API options to collapse duplicate results --- src/doc/user/usermanual.html | 6 ++-- src/doc/user/usermanual.xml | 4 +-- src/python/recoll/pyrecoll.cpp | 29 ++++++++++++---- src/query/recollq.cpp | 62 +++++++++++++++++++++------------- 4 files changed, 66 insertions(+), 35 deletions(-) diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 9ae4388d..fbdc7975 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -6872,7 +6872,8 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
Query.execute(query_string, stemming=1, - stemlang="english", fetchtext=False)
+ stemlang="english", fetchtext=False, + collapseduplicates=False)

Starts a search for query_string, a @@ -6885,7 +6886,8 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r

Query.executesd(SearchData, - fetchtext=False)
+ fetchtext=False, + collapseduplicates=False)

Starts a search for the query defined by the SearchData object. If the index stores the diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index 6b7d3bc0..e82c8172 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -5271,7 +5271,7 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r -Query.execute(query_string, stemming=1, stemlang="english", fetchtext=False) +Query.execute(query_string, stemming=1, stemlang="english", fetchtext=False, collapseduplicates=False) Starts a search for query_string, a &RCL; search language string. If the index stores the document @@ -5281,7 +5281,7 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r - Query.executesd(SearchData, fetchtext=False) + Query.executesd(SearchData, fetchtext=False, collapseduplicates=False) Starts a search for the query defined by the SearchData object. If the index stores the document texts and fetchtext is True, store the diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index 4e02fe9b..fac7f037 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -958,15 +958,16 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { LOGDEB0("Query_execute\n"); static const char *kwlist[] = {"query_string", "stemming", "stemlang", - "fetchtext", NULL}; + "fetchtext", "collapseduplicates", NULL}; char *sutf8 = 0; // needs freeing char *sstemlang = 0; PyObject *dostemobj = 0; PyObject *fetchtextobj = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OesO:Query_execute", + PyObject *collapseobj = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OesOO:Query_execute", (char**)kwlist, "utf-8", &sutf8, - &dostemobj, - "utf-8", &sstemlang, &fetchtextobj)) { + &dostemobj, "utf-8", &sstemlang, + &fetchtextobj, &collapseobj)) { return 0; } @@ -995,6 +996,12 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) return 0; } + if (collapseobj != 0 && PyObject_IsTrue(collapseobj)) { + self->query->setCollapseDuplicates(true); + } else { + self->query->setCollapseDuplicates(false); + } + // SearchData defaults to stemming in english // Use default for now but need to add way to specify language string reason; @@ -1025,12 +1032,14 @@ static PyObject * Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { LOGDEB0("Query_executeSD\n"); - static const char *kwlist[] = {"searchdata", "fetchtext", NULL}; + static const char *kwlist[] = {"searchdata", "fetchtext", + "collapseduplicates", NULL}; recoll_SearchDataObject *pysd = 0; PyObject *fetchtextobj = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|O:Query_execute", + PyObject *collapseobj = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OO:Query_execute", (char **)kwlist, &recoll_SearchDataType, - &pysd, &fetchtextobj)) { + &pysd, &fetchtextobj, &collapseobj)) { return 0; } if (pysd == 0 || self->query == 0) { @@ -1042,6 +1051,12 @@ Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) } else { self->fetchtext = false; } + if (collapseobj != 0 && PyObject_IsTrue(collapseobj)) { + self->query->setCollapseDuplicates(true); + } else { + self->query->setCollapseDuplicates(false); + } + self->query->setSortBy(*self->sortfield, self->ascending); self->query->setQuery(pysd->sd); int cnt = self->query->getResCnt(); diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index e2483f07..c2b8bf46 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -107,6 +107,7 @@ static char usage [] = " -q is just ignored (compatibility with the recoll GUI command line).\n" "Common options:\n" " -c : specify config directory, overriding $RECOLL_CONFDIR.\n" +" -C : collapse duplicates\n" " -d also dump file contents.\n" " -n [first-] define the result slice. The default value for [first]\n" " is 0. Without the option, the default max count is 2000.\n" @@ -137,34 +138,43 @@ Usage(void) exit(1); } -// BEWARE COMPATIBILITy WITH recoll OPTIONS letters -// -q, -t and -l are accepted and ignored -// -a/f/o -c have the same meaning -// -h and -v -> Usage() - +// BEWARE COMPATIBILITY WITH recoll OPTIONS letters static int op_flags; + #define OPT_A 0x1 +// gui: -a same #define OPT_a 0x2 #define OPT_b 0x4 -#define OPT_c 0x8 -#define OPT_D 0x10 -#define OPT_d 0x20 -#define OPT_e 0x40 -#define OPT_F 0x80 -#define OPT_f 0x100 -#define OPT_i 0x200 -#define OPT_l 0x400 -#define OPT_m 0x800 -#define OPT_N 0x1000 -#define OPT_n 0x2000 -#define OPT_o 0x4000 -#define OPT_P 0x8000 -#define OPT_Q 0x10000 -#define OPT_q 0x20000 -#define OPT_S 0x40000 -#define OPT_s 0x80000 -#define OPT_T 0x100000 -#define OPT_t 0x200000 +#define OPT_C 0x8 +// gui: -c same +#define OPT_c 0x10 +#define OPT_D 0x20 +#define OPT_d 0x40 +#define OPT_e 0x80 +#define OPT_F 0x100 +// gui: -f same +#define OPT_f 0x200 +// gui uses -h for help. us: usage +#define OPT_i 0x400 +// gui uses -L to set language of messages +// gui: -l same +#define OPT_l 0x800 +#define OPT_m 0x1000 +#define OPT_N 0x2000 +#define OPT_n 0x4000 +// gui: -o same +#define OPT_o 0x8000 +#define OPT_P 0x10000 +#define OPT_Q 0x20000 +// gui: -q same +#define OPT_q 0x40000 +#define OPT_S 0x80000 +#define OPT_s 0x100000 +#define OPT_T 0x200000 +// gui: -t use command line, us: ignored +#define OPT_t 0x400000 +// gui uses -v : show version. Us: usage +// gui uses -w : open minimized int recollq(RclConfig **cfp, int argc, char **argv) { @@ -196,6 +206,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) case 'A': op_flags |= OPT_A; break; case 'a': op_flags |= OPT_a; break; case 'b': op_flags |= OPT_b; break; + case 'C': op_flags |= OPT_C; break; case 'c': op_flags |= OPT_c; if (argc < 2) Usage(); a_config = *(++argv); argc--; goto b1; @@ -344,6 +355,9 @@ endopts: std::shared_ptr rq(sd); Rcl::Query query(&rcldb); + if (op_flags & OPT_C) { + query.setCollapseDuplicates(true); + } if (op_flags & OPT_S) { query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true); }