From ea133d7d6072a247dffabb7126c682c04b9a8fd0 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 15 Aug 2022 17:46:57 +0200 Subject: [PATCH] recollq: add option to extract result document to file + doc and test --- src/doc/man/recollq.1 | 14 +- src/doc/user/usermanual.xml | 50 +++--- src/query/recollq.cpp | 327 +++++++++++++++++------------------ tests/extracto/extracto.sh | 21 +++ tests/extracto/extracto.txt | 2 + tests/utfInPath/utfInPath.sh | 4 +- 6 files changed, 222 insertions(+), 196 deletions(-) create mode 100755 tests/extracto/extracto.sh create mode 100644 tests/extracto/extracto.txt diff --git a/src/doc/man/recollq.1 b/src/doc/man/recollq.1 index 16885e19..569d557d 100644 --- a/src/doc/man/recollq.1 +++ b/src/doc/man/recollq.1 @@ -59,6 +59,10 @@ recollq \- command line / standard output Recoll query command. .B \-F ] +[ +.B \--extract-to + +] .B recollq \-P @@ -120,9 +124,10 @@ sorts the results according to the specified field. Use for descending order. .PP .B \-n - +<[first-]cnt> can be used to set the maximum number of results that should be -printed. The default is 2000. Use a value of 0 for no limit. +printed. The default is 2000. Use a value of 0 for no limit. If the argument is of the form +first-cnt, it also defines the first result to output (from 0). .PP .B \-s @@ -144,6 +149,11 @@ base64 and separated by one space character. Empty fields are indicated by consecutive space characters. There is one additional space character at the end of each line. .PP +.B \--extract-to + +Will extract the first result document of the query to the argument path, which must not exist. Use +-n first-cnt to select the document. +.PP .B recollq \-P (Period) will print the minimum and maximum modification years for documents in the index. diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index f4b17f4b..d15b9b4f 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -3852,42 +3852,48 @@ fs.inotify.max_user_watches=32768 The Usage string follows: Runs a recoll query and displays result lines. - Default: will interpret the argument(s) as a xesam query string + Default: will interpret the argument(s) as a xesam query string. Query elements: * Implicit AND, exclusion, field spec: t1 -t2 title:t3 * OR has priority: t1 OR t2 t3 OR t4 means (t1 OR t2) AND (t3 OR t4) * Phrase: "t1 t2" (needs additional quoting on cmd line) - -o Emulate the GUI simple search in ANY TERM mode - -a Emulate the GUI simple search in ALL TERMS mode - -f Emulate the GUI simple search in filename mode - -q is just ignored (compatibility with the recoll GUI command line) + -o Emulate the GUI simple search in ANY TERM mode. + -a Emulate the GUI simple search in ALL TERMS mode. + -f Emulate the GUI simple search in filename mode. + -q is just ignored (compatibility with the recoll GUI command line). Common options: - -c : specify config directory, overriding $RECOLL_CONFDIR - -d also dump file contents + -c : specify config directory, overriding $RECOLL_CONFDIR. + -C : collapse duplicates + -d also dump file contents. -n [first-] define the result slice. The default value for [first] is 0. Without the option, the default max count is 2000. - Use n=0 for no limit - -b : basic. Just output urls, no mime types or titles - -Q : no result lines, just the processed query and result count - -m : dump the whole document meta[] array for each result - -A : output the document abstracts - -S fld : sort by field - -D : sort descending - -s stemlang : set stemming language to use (must exist in index...) - Use -s "" to turn off stem expansion - -T : use the parameter (Thesaurus) for word expansion - -i : additional index, several can be given - -e use url encoding (%xx) for urls + Use n=0 for no limit. + -b : basic. Just output urls, no mime types or titles. + -Q : no result lines, just the processed query and result count. + -m : dump the whole document meta[] array for each result. + -A : output the document abstracts. + -p : show snippets, with page numbers instead of the concatenated abstract. + -g : show snippets, with line numbers instead of the concatenated abstract. + -S fld : sort by field . + -D : sort descending. + -s stemlang : set stemming language to use (must exist in index...). + Use -s "" to turn off stem expansion. + -T : use the parameter (Thesaurus) for word expansion. + -i : additional index, several can be given. + -e use url encoding (%xx) for urls. + -E use exact result count instead of lower bound estimate. -F : output exactly these fields for each result. The field values are encoded in base64, output in one line and separated by one space character. This is the recommended format for use by other programs. Use a normal query with option -m to see the field names. Use -F '' to output all fields, but you probably - also want option -N in this case - -N : with -F, print the (plain text) field names before the field values + also want option -N in this case. + -N : with -F, print the (plain text) field names before the field values. + --extract_to : extract the first result to filepath, which must not exist. + Use a -n option with an offset to select the appropriate result. ]]> Sample execution: diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index 939dd8fc..5848d1bd 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2006 J.F.Dockes +/* Copyright (C) 2006-2022 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ #include "smallut.h" #include "chrono.h" #include "base64.h" +#include "rclutil.h" +#include "internfile.h" using namespace std; @@ -51,9 +54,9 @@ bool dump_contents(RclConfig *rclconfig, Rcl::Doc& idoc) Rcl::Doc fdoc; string ipath = idoc.ipath; if (interner.internfile(fdoc, ipath)) { - cout << fdoc.text << endl; + cout << fdoc.text << "\n"; } else { - cout << "Cant turn to text:" << idoc.url << " | " << idoc.ipath << endl; + cout << "Cant turn to text:" << idoc.url << " | " << idoc.ipath << "\n"; } return true; } @@ -67,7 +70,7 @@ string make_abstract(Rcl::Doc& doc, Rcl::Query& query, bool asSnippets, std::ostringstream str; if (query.makeDocAbstract(doc, snippets, snipcount, -1, true)) { for (const auto& snippet : snippets) { - str << (showlines ? snippet.line : snippet.page) << " : " << snippet.snippet << endl; + str << (showlines ? snippet.line : snippet.page) << " : " << snippet.snippet << "\n"; } } abstract = str.str(); @@ -83,7 +86,6 @@ void output_fields(vector fields, Rcl::Doc& doc, bool asSnippets, int snipcnt, bool showlines) { if (fields.empty()) { - map::const_iterator it; for (const auto& entry : doc.meta) { fields.push_back(entry.first); } @@ -103,11 +105,13 @@ void output_fields(vector fields, Rcl::Doc& doc, // fields. This is a problem when printing names and using strtok, but // have to keep the old behaviour when printnames is not set. if (!(out.empty() && printnames)) { - if (printnames) - cout << fld << " " << out << " "; + if (printnames) { + cout << fld << " "; + } + cout << out << " "; } } - cout << endl; + cout << "\n"; } static char *thisprog; @@ -151,170 +155,147 @@ static char usage [] = " for use by other programs. Use a normal query with option -m to \n" " see the field names. Use -F '' to output all fields, but you probably\n" " also want option -N in this case.\n" -" -N : with -F, print the (plain text) field names before the field values.\n" +" -N : with -F, print the (plain text) field names before the field values.\n" +" --extract_to : extract the first result to filepath, which must not exist.\n" +" Use a -n option with an offset to select the appropriate result.\n" ; -static void -Usage(void) +static void Usage(std::ostream &os = std::cerr) { - cerr << thisprog << ": usage:" << endl << usage; + os << thisprog << ": usage:" << "\n" << usage; exit(1); } // BEWARE COMPATIBILITY WITH recoll OPTIONS letters static int op_flags; -#define OPT_A 0x1 -// GUI: -a same -#define OPT_a 0x2 -#define OPT_b 0x4 -#define OPT_C 0x8 -// GUI: -c same -#define OPT_c 0x10 -#define OPT_D 0x20 -#define OPT_d 0x40 -#define OPT_e 0x80 -#define OPT_F 0x100 -#define OPT_g 0x200 -// GUI: -f same -#define OPT_f 0x400 -// GUI uses -h for help. us: usage -#define OPT_i 0x800 -// GUI uses -L to set language of messages -// GUI: -l specifies query language, which is the default. Accept and ignore -#define OPT_l 0x1000 -#define OPT_m 0x2000 -#define OPT_N 0x4000 -#define OPT_n 0x8000 -// GUI: -o same -#define OPT_o 0x10000 -#define OPT_p 0x20000 -#define OPT_P 0x40000 -#define OPT_Q 0x80000 -// GUI: -q same -#define OPT_q 0x100000 -#define OPT_S 0x200000 -#define OPT_s 0x400000 -#define OPT_T 0x800000 -// GUI: -t use command line, us: ignored -#define OPT_t 0x100000 -// GUI uses -v : show version. Us: usage -// GUI uses -w : open minimized -#define OPT_E 0x200000 +#define OPT_A 0x1 +#define OPT_a 0x2 +#define OPT_b 0x4 +#define OPT_C 0x8 +#define OPT_D 0x10 +#define OPT_d 0x20 +#define OPT_e 0x40 +#define OPT_F 0x80 +#define OPT_g 0x100 +#define OPT_f 0x200 +#define OPT_l 0x400 +#define OPT_m 0x800 +#define OPT_N 0x1000 +#define OPT_o 0x2000 +#define OPT_p 0x4000 +#define OPT_P 0x8000 +#define OPT_Q 0x10000 +#define OPT_q 0x20000 +#define OPT_S 0x40000 +#define OPT_E 0x80000 + +static struct option long_options[] = { + #define OPTION_EXTRACT 1000 + {"extract-to", required_argument, 0, OPTION_EXTRACT}, + {0, 0, 0, 0} +}; int recollq(RclConfig **cfp, int argc, char **argv) { + thisprog = argv[0]; + string a_config; string sortfield; string stemlang("english"); list extra_dbs; string sf; - vector fields; string syngroupsfn; - int firstres = 0; int maxcount = 2000; int snipcnt = -1; + std::string extractfile; - thisprog = argv[0]; - argc--; argv++; - - while (argc > 0 && **argv == '-') { - (*argv)++; - if (!(**argv)) - /* Cas du "adb - core" */ - Usage(); - while (**argv) - switch (*(*argv)++) { - case '-': - // -- : end of options - if (*(*argv) != 0) - Usage(); - goto endopts; - case 'A': op_flags |= OPT_A; break; - case 'a': op_flags |= OPT_a; break; - case 'b': op_flags |= OPT_b; break; - case 'C': op_flags |= OPT_C; break; - case 'c': op_flags |= OPT_c; if (argc < 2) Usage(); - a_config = *(++argv); - argc--; goto b1; - case 'd': op_flags |= OPT_d; break; - case 'D': op_flags |= OPT_D; break; - case 'E': op_flags |= OPT_E; break; - case 'e': op_flags |= OPT_e; break; - case 'f': op_flags |= OPT_f; break; - case 'F': op_flags |= OPT_F; if (argc < 2) Usage(); - sf = *(++argv); - argc--; goto b1; - case 'i': op_flags |= OPT_i; if (argc < 2) Usage(); - extra_dbs.push_back(*(++argv)); - argc--; goto b1; - case 'l': op_flags |= OPT_l; break; - case 'm': op_flags |= OPT_m; break; - case 'N': op_flags |= OPT_N; break; - case 'n': op_flags |= OPT_n; if (argc < 2) Usage(); - { - string rescnt = *(++argv); - string::size_type dash = rescnt.find("-"); - if (dash != string::npos) { - firstres = atoi(rescnt.substr(0, dash).c_str()); - if (dash < rescnt.size()-1) { - maxcount = atoi(rescnt.substr(dash+1).c_str()); - } - } else { - maxcount = atoi(rescnt.c_str()); + int ret; + while ((ret = getopt_long(argc, argv, "+AabCc:DdEefF:hi:lmNn:oPp:g:QqS:s:tT:v", + long_options, NULL)) != -1) { + switch (ret) { + case 'A': op_flags |= OPT_A; break; + // GUI: -a same + case 'a': op_flags |= OPT_a; break; + case 'b': op_flags |= OPT_b; break; + case 'C': op_flags |= OPT_C; break; + // GUI: -c same + case 'c': a_config = optarg; break; + case 'd': op_flags |= OPT_d; break; + case 'D': op_flags |= OPT_D; break; + case 'E': op_flags |= OPT_E; break; + case 'e': op_flags |= OPT_e; break; + case 'F': op_flags |= OPT_F; sf = optarg; break; + // GUI: -f same + case 'f': op_flags |= OPT_f; break; + // GUI: -h same + case 'h': Usage(std::cout); // Usage exits + case 'i': extra_dbs.push_back(optarg); break; + // GUI uses -L to set language of messages + // GUI: -l specifies query language, which is the default. Accept and ignore + case 'l': op_flags |= OPT_l; break; + case 'm': op_flags |= OPT_m; break; + case 'N': op_flags |= OPT_N; break; + case 'n': + { + string rescnt = optarg; + string::size_type dash = rescnt.find("-"); + if (dash != string::npos) { + firstres = atoi(rescnt.substr(0, dash).c_str()); + if (dash < rescnt.size()-1) { + maxcount = atoi(rescnt.substr(dash+1).c_str()); } - if (maxcount <= 0) maxcount = INT_MAX; + } else { + maxcount = atoi(rescnt.c_str()); } - argc--; goto b1; - case 'o': op_flags |= OPT_o; break; - case 'P': op_flags |= OPT_P; break; - case 'p': - op_flags |= OPT_p; - goto porg; - case 'g': - op_flags |= OPT_g; - goto porg; + if (maxcount <= 0) maxcount = INT_MAX; + } + break; + // GUI: -o same + case 'o': op_flags |= OPT_o; break; + case 'P': op_flags |= OPT_P; break; + case 'p': + op_flags |= OPT_p; + goto porg; + case 'g': + op_flags |= OPT_g; { porg: - if (argc < 2) - Usage(); - const char *cp = *(++argv); + const char *cp = optarg; char *cpe; snipcnt = strtol(cp, &cpe, 10); if (*cpe != 0) Usage(); - argc--; goto b1; } - case 'q': op_flags |= OPT_q; break; - case 'Q': op_flags |= OPT_Q; break; - case 'S': op_flags |= OPT_S; if (argc < 2) Usage(); - sortfield = *(++argv); - argc--; goto b1; - case 's': op_flags |= OPT_s; if (argc < 2) Usage(); - stemlang = *(++argv); - argc--; goto b1; - case 't': op_flags |= OPT_t; break; - case 'T': op_flags |= OPT_T; if (argc < 2) Usage(); - syngroupsfn = *(++argv); - argc--; goto b1; - default: Usage(); break; - } - b1: argc--; argv++; + break; + case 'Q': op_flags |= OPT_Q; break; + // GUI: -q same + case 'q': op_flags |= OPT_q; break; + case 'S': op_flags |= OPT_S; sortfield = optarg; break; + case 's': stemlang = optarg; break; + // GUI: -t use command line, us: ignored + case 't': break; + case 'T': syngroupsfn = optarg; break; + // GUI: -v same + case 'v': std::cout << Rcl::version_string() << "\n"; return 0; + // GUI uses -w : open minimized + case OPTION_EXTRACT: extractfile=optarg;break; + } } -endopts: string reason; *cfp = recollinit(0, 0, 0, reason, &a_config); RclConfig *rclconfig = *cfp; if (!rclconfig || !rclconfig->ok()) { - fprintf(stderr, "Recoll init failed: %s\n", reason.c_str()); + std::cerr << "Recoll init failed: " << reason << "\n"; exit(1); } if (argc < 1 && !(op_flags & OPT_P)) { Usage(); } + vector fields; if (op_flags & OPT_F) { if (op_flags & (OPT_b|OPT_d|OPT_b|OPT_Q|OPT_m|OPT_A)) Usage(); @@ -322,44 +303,43 @@ endopts: } Rcl::Db rcldb(rclconfig); if (!extra_dbs.empty()) { - for (list::iterator it = extra_dbs.begin(); - it != extra_dbs.end(); it++) { - if (!rcldb.addQueryDb(*it)) { - cerr << "Can't add index: " << *it << endl; + for (const auto& db : extra_dbs) { + if (!rcldb.addQueryDb(db)) { + cerr << "Can't add index: " << db << "\n"; exit(1); } } } if (!syngroupsfn.empty()) { if (!rcldb.setSynGroupsFile(syngroupsfn)) { - cerr << "Can't use synonyms file: " << syngroupsfn << endl; + cerr << "Can't use synonyms file: " << syngroupsfn << "\n"; exit(1); } } if (!rcldb.open(Rcl::Db::DbRO)) { cerr << "Cant open database in " << rclconfig->getDbDir() << - " reason: " << rcldb.getReason() << endl; + " reason: " << rcldb.getReason() << "\n"; exit(1); } if (op_flags & OPT_P) { int minyear, maxyear; if (!rcldb.maxYearSpan(&minyear, &maxyear)) { - cerr << "maxYearSpan failed: " << rcldb.getReason() << endl; - exit(1); + cerr << "maxYearSpan failed: " << rcldb.getReason() << "\n"; + return 1; } else { - cout << "Min year " << minyear << " Max year " << maxyear << endl; - exit(0); + cout << "Min year " << minyear << " Max year " << maxyear << "\n"; + return 0; } } - if (argc < 1) { + if (optind >= argc) { Usage(); } - string qs = *argv++;argc--; - while (argc > 0) { - qs += string(" ") + *argv++;argc--; + string qs; + while (optind < argc) { + qs += std::string(argv[optind++]) + " "; } { @@ -367,11 +347,11 @@ endopts: string charset = rclconfig->getDefCharset(true); int ercnt; if (!transcode(qs, uq, charset, "UTF-8", &ercnt)) { - fprintf(stderr, "Can't convert command line args to utf-8\n"); - exit(1); + std::cerr << "Can't convert command line args to utf-8\n"; + return 1; } else if (ercnt) { - fprintf(stderr, "%d errors while converting arguments from %s " - "to utf-8\n", ercnt, charset.c_str()); + std::cerr << + ercnt << " errors while converting arguments from " << charset << "to utf-8\n"; } qs = uq; } @@ -380,13 +360,12 @@ endopts: if (op_flags & (OPT_a|OPT_o|OPT_f)) { sd = std::make_shared(Rcl::SCLT_OR, stemlang); - Rcl::SearchDataClause *clp = 0; + Rcl::SearchDataClause *clp; if (op_flags & OPT_f) { clp = new Rcl::SearchDataClauseFilename(qs); } else { - clp = new Rcl::SearchDataClauseSimple((op_flags & OPT_o)? - Rcl::SCLT_OR : Rcl::SCLT_AND, - qs); + clp = new Rcl::SearchDataClauseSimple( + (op_flags & OPT_o) ? Rcl::SCLT_OR : Rcl::SCLT_AND, qs); } if (sd) sd->addClause(clp); @@ -395,7 +374,7 @@ endopts: } if (!sd) { - cerr << "Query string interpretation failed: " << reason << endl; + std::cerr << "Query string interpretation failed: " << reason << "\n"; return 1; } @@ -409,8 +388,8 @@ endopts: } Chrono chron; if (!query.setQuery(rq)) { - cerr << "Query setup failed: " << query.getReason() << endl; - return(1); + std::cerr << "Query setup failed: " << query.getReason() << "\n"; + return 1; } int cnt; if (op_flags & OPT_E) { @@ -419,29 +398,37 @@ endopts: cnt = query.getResCnt(); } if (!(op_flags & OPT_b)) { - cout << "Recoll query: " << rq->getDescription() << endl; + std::cout << "Recoll query: " << rq->getDescription() << "\n"; if (firstres == 0) { if (cnt <= maxcount) - cout << cnt << " results" << endl; + std::cout << cnt << " results" << "\n"; else - cout << cnt << " results (printing " << maxcount << " max):" - << endl; + std::cout << cnt << " results (printing " << maxcount << " max):" << "\n"; } else { - cout << "Printing at most " << cnt - (firstres+maxcount) << - " results from first " << firstres << endl; + std::cout << "Printing at most " << cnt - (firstres+maxcount) << + " results from first " << firstres << "\n"; } } - if (op_flags & OPT_Q) - cout << "Query setup took " << chron.millis() << " mS" << endl; - - if (op_flags & OPT_Q) - return(0); + if (op_flags & OPT_Q) { + std::cout << "Query setup took " << chron.millis() << " mS" << "\n"; + return 0; + } for (int i = firstres; i < firstres + maxcount; i++) { Rcl::Doc doc; if (!query.getDoc(i, doc)) break; + if (!extractfile.empty()) { + if (path_exists(extractfile)) { + std::cerr << "Output file must not exist.\n"; + return 1; + } + TempFile unused; + auto ret = FileInterner::idocToFile(unused, extractfile, rclconfig, doc, false); + return ret ? 0 : 1; + } + if (op_flags & OPT_F) { output_fields(fields, doc, query, rcldb, op_flags & OPT_N, op_flags & (OPT_p|OPT_g), snipcnt, op_flags & OPT_g); @@ -452,7 +439,7 @@ endopts: doc.url = url_encode(doc.url); if (op_flags & OPT_b) { - cout << doc.url << endl; + cout << doc.url << "\n"; } else { string titleorfn = doc.meta[Rcl::Doc::keytt]; if (titleorfn.empty()) @@ -470,10 +457,10 @@ endopts: << "[" << doc.url << "]" << "\t" << "[" << titleorfn << "]" << "\t" << doc.fbytes << "\tbytes" << "\t" - << endl; + << "\n"; if (op_flags & OPT_m) { for (const auto& ent : doc.meta) { - cout << ent.first << " = " << ent.second << endl; + cout << ent.first << " = " << ent.second << "\n"; } } if (op_flags & OPT_A) { @@ -482,9 +469,9 @@ endopts: string abstract = make_abstract(doc, query, asSnippets, snipcnt, showlines); string marker = asSnippets ? "SNIPPETS" : "ABSTRACT"; if (!abstract.empty()) { - cout << marker << endl; + cout << marker << "\n"; cout << abstract; - cout << string("/") + marker << endl; + cout << string("/") + marker << "\n"; } } } diff --git a/tests/extracto/extracto.sh b/tests/extracto/extracto.sh new file mode 100755 index 00000000..eee631bc --- /dev/null +++ b/tests/extracto/extracto.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +topdir=`dirname $0`/.. +. $topdir/shared.sh + +initvariables $0 + +( + tmpfile=$( + echo 'mkstemp(template)' | + m4 -D template="${TMPDIR:-/tmp}/rcltsttmpXXXXXX" + ) || exit 1 + trap "rm -f -- '$tmpfile'" EXIT + rm -f "$tmpfile" + recollq --extract-to "$tmpfile" -q HtmlAttachment_uniqueTerm + md5sum "$tmpfile" | awk '{print $1}' +) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout + +diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 + +checkresult diff --git a/tests/extracto/extracto.txt b/tests/extracto/extracto.txt new file mode 100644 index 00000000..80101c2a --- /dev/null +++ b/tests/extracto/extracto.txt @@ -0,0 +1,2 @@ +1 results +90c700bb60b7df96e56cca46561f0597 diff --git a/tests/utfInPath/utfInPath.sh b/tests/utfInPath/utfInPath.sh index 37da7240..00aff986 100755 --- a/tests/utfInPath/utfInPath.sh +++ b/tests/utfInPath/utfInPath.sh @@ -7,8 +7,8 @@ initvariables $0 ( -recollq -S url -q '"simulating shock turbulence interactions"' -recollq -S url Utf8pathunique +recollq -S fmtime -q '"simulating shock turbulence interactions"' +recollq -S fmtime Utf8pathunique ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout