recollq: add option to extract result document to file + doc and test

This commit is contained in:
Jean-Francois Dockes 2022-08-15 17:46:57 +02:00
parent 7d028e7ca9
commit ea133d7d60
6 changed files with 222 additions and 196 deletions

View File

@ -59,6 +59,10 @@ recollq \- command line / standard output Recoll query command.
.B \-F
<quoted space separated field name list>
]
[
.B \--extract-to
<file path>
]
<query string>
.B recollq \-P
@ -120,9 +124,10 @@ sorts the results according to the specified field. Use
for descending order.
.PP
.B \-n
<cnt>
<[first-]cnt>
can be used to set the maximum number of results that should be
printed. The default is 2000. Use a value of 0 for no limit.
printed. The default is 2000. Use a value of 0 for no limit. If the argument is of the form
first-cnt, it also defines the first result to output (from 0).
.PP
.B \-s
<language>
@ -144,6 +149,11 @@ base64 and separated by one space character. Empty fields are indicated by
consecutive space characters. There is one additional space character at
the end of each line.
.PP
.B \--extract-to
<file path>
Will extract the first result document of the query to the argument path, which must not exist. Use
-n first-cnt to select the document.
.PP
.B recollq \-P
(Period) will print the minimum and maximum modification years for
documents in the index.

View File

@ -3852,42 +3852,48 @@ fs.inotify.max_user_watches=32768
The Usage string follows:</para>
<programlisting><![CDATA[
recollq: usage:
-P: Show the date span for all the documents present in the index
-P: Show the date span for all the documents present in the index.
[-o|-a|-f] [-q] <query string>
Runs a recoll query and displays result lines.
Default: will interpret the argument(s) as a xesam query string
Default: will interpret the argument(s) as a xesam query string.
Query elements:
* Implicit AND, exclusion, field spec: t1 -t2 title:t3
* OR has priority: t1 OR t2 t3 OR t4 means (t1 OR t2) AND (t3 OR t4)
* Phrase: "t1 t2" (needs additional quoting on cmd line)
-o Emulate the GUI simple search in ANY TERM mode
-a Emulate the GUI simple search in ALL TERMS mode
-f Emulate the GUI simple search in filename mode
-q is just ignored (compatibility with the recoll GUI command line)
-o Emulate the GUI simple search in ANY TERM mode.
-a Emulate the GUI simple search in ALL TERMS mode.
-f Emulate the GUI simple search in filename mode.
-q is just ignored (compatibility with the recoll GUI command line).
Common options:
-c <configdir> : specify config directory, overriding $RECOLL_CONFDIR
-d also dump file contents
-c <configdir> : specify config directory, overriding $RECOLL_CONFDIR.
-C : collapse duplicates
-d also dump file contents.
-n [first-]<cnt> define the result slice. The default value for [first]
is 0. Without the option, the default max count is 2000.
Use n=0 for no limit
-b : basic. Just output urls, no mime types or titles
-Q : no result lines, just the processed query and result count
-m : dump the whole document meta[] array for each result
-A : output the document abstracts
-S fld : sort by field <fld>
-D : sort descending
-s stemlang : set stemming language to use (must exist in index...)
Use -s "" to turn off stem expansion
-T <synonyms file>: use the parameter (Thesaurus) for word expansion
-i <dbdir> : additional index, several can be given
-e use url encoding (%xx) for urls
Use n=0 for no limit.
-b : basic. Just output urls, no mime types or titles.
-Q : no result lines, just the processed query and result count.
-m : dump the whole document meta[] array for each result.
-A : output the document abstracts.
-p <cnt> : show <cnt> snippets, with page numbers instead of the concatenated abstract.
-g <cnt> : show <cnt> snippets, with line numbers instead of the concatenated abstract.
-S fld : sort by field <fld>.
-D : sort descending.
-s stemlang : set stemming language to use (must exist in index...).
Use -s "" to turn off stem expansion.
-T <synonyms file>: use the parameter (Thesaurus) for word expansion.
-i <dbdir> : additional index, several can be given.
-e use url encoding (%xx) for urls.
-E use exact result count instead of lower bound estimate.
-F <field name list> : output exactly these fields for each result.
The field values are encoded in base64, output in one line and
separated by one space character. This is the recommended format
for use by other programs. Use a normal query with option -m to
see the field names. Use -F '' to output all fields, but you probably
also want option -N in this case
-N : with -F, print the (plain text) field names before the field values
also want option -N in this case.
-N : with -F, print the (plain text) field names before the field values.
--extract_to <filepath> : extract the first result to filepath, which must not exist.
Use a -n option with an offset to select the appropriate result.
]]></programlisting>
<para>Sample execution:</para>

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2006 J.F.Dockes
/* Copyright (C) 2006-2022 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -22,6 +22,7 @@
#include <errno.h>
#include <string.h>
#include <limits.h>
#include <getopt.h>
#include <iostream>
#include <sstream>
@ -42,6 +43,8 @@
#include "smallut.h"
#include "chrono.h"
#include "base64.h"
#include "rclutil.h"
#include "internfile.h"
using namespace std;
@ -51,9 +54,9 @@ bool dump_contents(RclConfig *rclconfig, Rcl::Doc& idoc)
Rcl::Doc fdoc;
string ipath = idoc.ipath;
if (interner.internfile(fdoc, ipath)) {
cout << fdoc.text << endl;
cout << fdoc.text << "\n";
} else {
cout << "Cant turn to text:" << idoc.url << " | " << idoc.ipath << endl;
cout << "Cant turn to text:" << idoc.url << " | " << idoc.ipath << "\n";
}
return true;
}
@ -67,7 +70,7 @@ string make_abstract(Rcl::Doc& doc, Rcl::Query& query, bool asSnippets,
std::ostringstream str;
if (query.makeDocAbstract(doc, snippets, snipcount, -1, true)) {
for (const auto& snippet : snippets) {
str << (showlines ? snippet.line : snippet.page) << " : " << snippet.snippet << endl;
str << (showlines ? snippet.line : snippet.page) << " : " << snippet.snippet << "\n";
}
}
abstract = str.str();
@ -83,7 +86,6 @@ void output_fields(vector<string> fields, Rcl::Doc& doc,
bool asSnippets, int snipcnt, bool showlines)
{
if (fields.empty()) {
map<string,string>::const_iterator it;
for (const auto& entry : doc.meta) {
fields.push_back(entry.first);
}
@ -103,11 +105,13 @@ void output_fields(vector<string> fields, Rcl::Doc& doc,
// fields. This is a problem when printing names and using strtok, but
// have to keep the old behaviour when printnames is not set.
if (!(out.empty() && printnames)) {
if (printnames)
cout << fld << " " << out << " ";
if (printnames) {
cout << fld << " ";
}
cout << out << " ";
}
}
cout << endl;
cout << "\n";
}
static char *thisprog;
@ -151,170 +155,147 @@ static char usage [] =
" for use by other programs. Use a normal query with option -m to \n"
" see the field names. Use -F '' to output all fields, but you probably\n"
" also want option -N in this case.\n"
" -N : with -F, print the (plain text) field names before the field values.\n"
" -N : with -F, print the (plain text) field names before the field values.\n"
" --extract_to <filepath> : extract the first result to filepath, which must not exist.\n"
" Use a -n option with an offset to select the appropriate result.\n"
;
static void
Usage(void)
static void Usage(std::ostream &os = std::cerr)
{
cerr << thisprog << ": usage:" << endl << usage;
os << thisprog << ": usage:" << "\n" << usage;
exit(1);
}
// BEWARE COMPATIBILITY WITH recoll OPTIONS letters
static int op_flags;
#define OPT_A 0x1
// GUI: -a same
#define OPT_a 0x2
#define OPT_b 0x4
#define OPT_C 0x8
// GUI: -c same
#define OPT_c 0x10
#define OPT_D 0x20
#define OPT_d 0x40
#define OPT_e 0x80
#define OPT_F 0x100
#define OPT_g 0x200
// GUI: -f same
#define OPT_f 0x400
// GUI uses -h for help. us: usage
#define OPT_i 0x800
// GUI uses -L to set language of messages
// GUI: -l specifies query language, which is the default. Accept and ignore
#define OPT_l 0x1000
#define OPT_m 0x2000
#define OPT_N 0x4000
#define OPT_n 0x8000
// GUI: -o same
#define OPT_o 0x10000
#define OPT_p 0x20000
#define OPT_P 0x40000
#define OPT_Q 0x80000
// GUI: -q same
#define OPT_q 0x100000
#define OPT_S 0x200000
#define OPT_s 0x400000
#define OPT_T 0x800000
// GUI: -t use command line, us: ignored
#define OPT_t 0x100000
// GUI uses -v : show version. Us: usage
// GUI uses -w : open minimized
#define OPT_E 0x200000
#define OPT_A 0x1
#define OPT_a 0x2
#define OPT_b 0x4
#define OPT_C 0x8
#define OPT_D 0x10
#define OPT_d 0x20
#define OPT_e 0x40
#define OPT_F 0x80
#define OPT_g 0x100
#define OPT_f 0x200
#define OPT_l 0x400
#define OPT_m 0x800
#define OPT_N 0x1000
#define OPT_o 0x2000
#define OPT_p 0x4000
#define OPT_P 0x8000
#define OPT_Q 0x10000
#define OPT_q 0x20000
#define OPT_S 0x40000
#define OPT_E 0x80000
static struct option long_options[] = {
#define OPTION_EXTRACT 1000
{"extract-to", required_argument, 0, OPTION_EXTRACT},
{0, 0, 0, 0}
};
int recollq(RclConfig **cfp, int argc, char **argv)
{
thisprog = argv[0];
string a_config;
string sortfield;
string stemlang("english");
list<string> extra_dbs;
string sf;
vector<string> fields;
string syngroupsfn;
int firstres = 0;
int maxcount = 2000;
int snipcnt = -1;
std::string extractfile;
thisprog = argv[0];
argc--; argv++;
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* Cas du "adb - core" */
Usage();
while (**argv)
switch (*(*argv)++) {
case '-':
// -- : end of options
if (*(*argv) != 0)
Usage();
goto endopts;
case 'A': op_flags |= OPT_A; break;
case 'a': op_flags |= OPT_a; break;
case 'b': op_flags |= OPT_b; break;
case 'C': op_flags |= OPT_C; break;
case 'c': op_flags |= OPT_c; if (argc < 2) Usage();
a_config = *(++argv);
argc--; goto b1;
case 'd': op_flags |= OPT_d; break;
case 'D': op_flags |= OPT_D; break;
case 'E': op_flags |= OPT_E; break;
case 'e': op_flags |= OPT_e; break;
case 'f': op_flags |= OPT_f; break;
case 'F': op_flags |= OPT_F; if (argc < 2) Usage();
sf = *(++argv);
argc--; goto b1;
case 'i': op_flags |= OPT_i; if (argc < 2) Usage();
extra_dbs.push_back(*(++argv));
argc--; goto b1;
case 'l': op_flags |= OPT_l; break;
case 'm': op_flags |= OPT_m; break;
case 'N': op_flags |= OPT_N; break;
case 'n': op_flags |= OPT_n; if (argc < 2) Usage();
{
string rescnt = *(++argv);
string::size_type dash = rescnt.find("-");
if (dash != string::npos) {
firstres = atoi(rescnt.substr(0, dash).c_str());
if (dash < rescnt.size()-1) {
maxcount = atoi(rescnt.substr(dash+1).c_str());
}
} else {
maxcount = atoi(rescnt.c_str());
int ret;
while ((ret = getopt_long(argc, argv, "+AabCc:DdEefF:hi:lmNn:oPp:g:QqS:s:tT:v",
long_options, NULL)) != -1) {
switch (ret) {
case 'A': op_flags |= OPT_A; break;
// GUI: -a same
case 'a': op_flags |= OPT_a; break;
case 'b': op_flags |= OPT_b; break;
case 'C': op_flags |= OPT_C; break;
// GUI: -c same
case 'c': a_config = optarg; break;
case 'd': op_flags |= OPT_d; break;
case 'D': op_flags |= OPT_D; break;
case 'E': op_flags |= OPT_E; break;
case 'e': op_flags |= OPT_e; break;
case 'F': op_flags |= OPT_F; sf = optarg; break;
// GUI: -f same
case 'f': op_flags |= OPT_f; break;
// GUI: -h same
case 'h': Usage(std::cout); // Usage exits
case 'i': extra_dbs.push_back(optarg); break;
// GUI uses -L to set language of messages
// GUI: -l specifies query language, which is the default. Accept and ignore
case 'l': op_flags |= OPT_l; break;
case 'm': op_flags |= OPT_m; break;
case 'N': op_flags |= OPT_N; break;
case 'n':
{
string rescnt = optarg;
string::size_type dash = rescnt.find("-");
if (dash != string::npos) {
firstres = atoi(rescnt.substr(0, dash).c_str());
if (dash < rescnt.size()-1) {
maxcount = atoi(rescnt.substr(dash+1).c_str());
}
if (maxcount <= 0) maxcount = INT_MAX;
} else {
maxcount = atoi(rescnt.c_str());
}
argc--; goto b1;
case 'o': op_flags |= OPT_o; break;
case 'P': op_flags |= OPT_P; break;
case 'p':
op_flags |= OPT_p;
goto porg;
case 'g':
op_flags |= OPT_g;
goto porg;
if (maxcount <= 0) maxcount = INT_MAX;
}
break;
// GUI: -o same
case 'o': op_flags |= OPT_o; break;
case 'P': op_flags |= OPT_P; break;
case 'p':
op_flags |= OPT_p;
goto porg;
case 'g':
op_flags |= OPT_g;
{
porg:
if (argc < 2)
Usage();
const char *cp = *(++argv);
const char *cp = optarg;
char *cpe;
snipcnt = strtol(cp, &cpe, 10);
if (*cpe != 0)
Usage();
argc--; goto b1;
}
case 'q': op_flags |= OPT_q; break;
case 'Q': op_flags |= OPT_Q; break;
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
sortfield = *(++argv);
argc--; goto b1;
case 's': op_flags |= OPT_s; if (argc < 2) Usage();
stemlang = *(++argv);
argc--; goto b1;
case 't': op_flags |= OPT_t; break;
case 'T': op_flags |= OPT_T; if (argc < 2) Usage();
syngroupsfn = *(++argv);
argc--; goto b1;
default: Usage(); break;
}
b1: argc--; argv++;
break;
case 'Q': op_flags |= OPT_Q; break;
// GUI: -q same
case 'q': op_flags |= OPT_q; break;
case 'S': op_flags |= OPT_S; sortfield = optarg; break;
case 's': stemlang = optarg; break;
// GUI: -t use command line, us: ignored
case 't': break;
case 'T': syngroupsfn = optarg; break;
// GUI: -v same
case 'v': std::cout << Rcl::version_string() << "\n"; return 0;
// GUI uses -w : open minimized
case OPTION_EXTRACT: extractfile=optarg;break;
}
}
endopts:
string reason;
*cfp = recollinit(0, 0, 0, reason, &a_config);
RclConfig *rclconfig = *cfp;
if (!rclconfig || !rclconfig->ok()) {
fprintf(stderr, "Recoll init failed: %s\n", reason.c_str());
std::cerr << "Recoll init failed: " << reason << "\n";
exit(1);
}
if (argc < 1 && !(op_flags & OPT_P)) {
Usage();
}
vector<string> fields;
if (op_flags & OPT_F) {
if (op_flags & (OPT_b|OPT_d|OPT_b|OPT_Q|OPT_m|OPT_A))
Usage();
@ -322,44 +303,43 @@ endopts:
}
Rcl::Db rcldb(rclconfig);
if (!extra_dbs.empty()) {
for (list<string>::iterator it = extra_dbs.begin();
it != extra_dbs.end(); it++) {
if (!rcldb.addQueryDb(*it)) {
cerr << "Can't add index: " << *it << endl;
for (const auto& db : extra_dbs) {
if (!rcldb.addQueryDb(db)) {
cerr << "Can't add index: " << db << "\n";
exit(1);
}
}
}
if (!syngroupsfn.empty()) {
if (!rcldb.setSynGroupsFile(syngroupsfn)) {
cerr << "Can't use synonyms file: " << syngroupsfn << endl;
cerr << "Can't use synonyms file: " << syngroupsfn << "\n";
exit(1);
}
}
if (!rcldb.open(Rcl::Db::DbRO)) {
cerr << "Cant open database in " << rclconfig->getDbDir() <<
" reason: " << rcldb.getReason() << endl;
" reason: " << rcldb.getReason() << "\n";
exit(1);
}
if (op_flags & OPT_P) {
int minyear, maxyear;
if (!rcldb.maxYearSpan(&minyear, &maxyear)) {
cerr << "maxYearSpan failed: " << rcldb.getReason() << endl;
exit(1);
cerr << "maxYearSpan failed: " << rcldb.getReason() << "\n";
return 1;
} else {
cout << "Min year " << minyear << " Max year " << maxyear << endl;
exit(0);
cout << "Min year " << minyear << " Max year " << maxyear << "\n";
return 0;
}
}
if (argc < 1) {
if (optind >= argc) {
Usage();
}
string qs = *argv++;argc--;
while (argc > 0) {
qs += string(" ") + *argv++;argc--;
string qs;
while (optind < argc) {
qs += std::string(argv[optind++]) + " ";
}
{
@ -367,11 +347,11 @@ endopts:
string charset = rclconfig->getDefCharset(true);
int ercnt;
if (!transcode(qs, uq, charset, "UTF-8", &ercnt)) {
fprintf(stderr, "Can't convert command line args to utf-8\n");
exit(1);
std::cerr << "Can't convert command line args to utf-8\n";
return 1;
} else if (ercnt) {
fprintf(stderr, "%d errors while converting arguments from %s "
"to utf-8\n", ercnt, charset.c_str());
std::cerr <<
ercnt << " errors while converting arguments from " << charset << "to utf-8\n";
}
qs = uq;
}
@ -380,13 +360,12 @@ endopts:
if (op_flags & (OPT_a|OPT_o|OPT_f)) {
sd = std::make_shared<Rcl::SearchData>(Rcl::SCLT_OR, stemlang);
Rcl::SearchDataClause *clp = 0;
Rcl::SearchDataClause *clp;
if (op_flags & OPT_f) {
clp = new Rcl::SearchDataClauseFilename(qs);
} else {
clp = new Rcl::SearchDataClauseSimple((op_flags & OPT_o)?
Rcl::SCLT_OR : Rcl::SCLT_AND,
qs);
clp = new Rcl::SearchDataClauseSimple(
(op_flags & OPT_o) ? Rcl::SCLT_OR : Rcl::SCLT_AND, qs);
}
if (sd)
sd->addClause(clp);
@ -395,7 +374,7 @@ endopts:
}
if (!sd) {
cerr << "Query string interpretation failed: " << reason << endl;
std::cerr << "Query string interpretation failed: " << reason << "\n";
return 1;
}
@ -409,8 +388,8 @@ endopts:
}
Chrono chron;
if (!query.setQuery(rq)) {
cerr << "Query setup failed: " << query.getReason() << endl;
return(1);
std::cerr << "Query setup failed: " << query.getReason() << "\n";
return 1;
}
int cnt;
if (op_flags & OPT_E) {
@ -419,29 +398,37 @@ endopts:
cnt = query.getResCnt();
}
if (!(op_flags & OPT_b)) {
cout << "Recoll query: " << rq->getDescription() << endl;
std::cout << "Recoll query: " << rq->getDescription() << "\n";
if (firstres == 0) {
if (cnt <= maxcount)
cout << cnt << " results" << endl;
std::cout << cnt << " results" << "\n";
else
cout << cnt << " results (printing " << maxcount << " max):"
<< endl;
std::cout << cnt << " results (printing " << maxcount << " max):" << "\n";
} else {
cout << "Printing at most " << cnt - (firstres+maxcount) <<
" results from first " << firstres << endl;
std::cout << "Printing at most " << cnt - (firstres+maxcount) <<
" results from first " << firstres << "\n";
}
}
if (op_flags & OPT_Q)
cout << "Query setup took " << chron.millis() << " mS" << endl;
if (op_flags & OPT_Q)
return(0);
if (op_flags & OPT_Q) {
std::cout << "Query setup took " << chron.millis() << " mS" << "\n";
return 0;
}
for (int i = firstres; i < firstres + maxcount; i++) {
Rcl::Doc doc;
if (!query.getDoc(i, doc))
break;
if (!extractfile.empty()) {
if (path_exists(extractfile)) {
std::cerr << "Output file must not exist.\n";
return 1;
}
TempFile unused;
auto ret = FileInterner::idocToFile(unused, extractfile, rclconfig, doc, false);
return ret ? 0 : 1;
}
if (op_flags & OPT_F) {
output_fields(fields, doc, query, rcldb,
op_flags & OPT_N, op_flags & (OPT_p|OPT_g), snipcnt, op_flags & OPT_g);
@ -452,7 +439,7 @@ endopts:
doc.url = url_encode(doc.url);
if (op_flags & OPT_b) {
cout << doc.url << endl;
cout << doc.url << "\n";
} else {
string titleorfn = doc.meta[Rcl::Doc::keytt];
if (titleorfn.empty())
@ -470,10 +457,10 @@ endopts:
<< "[" << doc.url << "]" << "\t"
<< "[" << titleorfn << "]" << "\t"
<< doc.fbytes << "\tbytes" << "\t"
<< endl;
<< "\n";
if (op_flags & OPT_m) {
for (const auto& ent : doc.meta) {
cout << ent.first << " = " << ent.second << endl;
cout << ent.first << " = " << ent.second << "\n";
}
}
if (op_flags & OPT_A) {
@ -482,9 +469,9 @@ endopts:
string abstract = make_abstract(doc, query, asSnippets, snipcnt, showlines);
string marker = asSnippets ? "SNIPPETS" : "ABSTRACT";
if (!abstract.empty()) {
cout << marker << endl;
cout << marker << "\n";
cout << abstract;
cout << string("/") + marker << endl;
cout << string("/") + marker << "\n";
}
}
}

21
tests/extracto/extracto.sh Executable file
View File

@ -0,0 +1,21 @@
#!/bin/sh
topdir=`dirname $0`/..
. $topdir/shared.sh
initvariables $0
(
tmpfile=$(
echo 'mkstemp(template)' |
m4 -D template="${TMPDIR:-/tmp}/rcltsttmpXXXXXX"
) || exit 1
trap "rm -f -- '$tmpfile'" EXIT
rm -f "$tmpfile"
recollq --extract-to "$tmpfile" -q HtmlAttachment_uniqueTerm
md5sum "$tmpfile" | awk '{print $1}'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
checkresult

View File

@ -0,0 +1,2 @@
1 results
90c700bb60b7df96e56cca46561f0597

View File

@ -7,8 +7,8 @@ initvariables $0
(
recollq -S url -q '"simulating shock turbulence interactions"'
recollq -S url Utf8pathunique
recollq -S fmtime -q '"simulating shock turbulence interactions"'
recollq -S fmtime Utf8pathunique
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout