add onlyNames config variable for filtering file names

This commit is contained in:
Jean-Francois Dockes 2019-06-17 08:28:14 +02:00
parent be81082f38
commit 45043b816f
12 changed files with 189 additions and 275 deletions

View File

@ -14,7 +14,7 @@
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef TEST_RCLCONFIG
#include "autoconfig.h"
#include <stdio.h>
@ -186,6 +186,7 @@ RclConfig::RclConfig(const RclConfig &r)
m_stpsuffstate(this, {"noContentSuffixes", "noContentSuffixes+",
"noContentSuffixes-"}),
m_skpnstate(this, {"skippedNames", "skippedNames+", "skippedNames-"}),
m_onlnstate(this, "onlyNames"),
m_rmtstate(this, "indexedmimetypes"),
m_xmtstate(this, "excludedmimetypes"),
m_mdrstate(this, "metadatacmds")
@ -198,6 +199,7 @@ RclConfig::RclConfig(const string *argcnf)
m_stpsuffstate(this, {"noContentSuffixes", "noContentSuffixes+",
"noContentSuffixes-"}),
m_skpnstate(this, {"skippedNames", "skippedNames+", "skippedNames-"}),
m_onlnstate(this, "onlyNames"),
m_rmtstate(this, "indexedmimetypes"),
m_xmtstate(this, "excludedmimetypes"),
m_mdrstate(this, "metadatacmds")
@ -1539,6 +1541,14 @@ vector<string>& RclConfig::getSkippedNames()
return m_skpnlist;
}
/**
 * Return the list of onlyNames patterns (file name filters).
 *
 * The list is cached in m_onlnlist. It is rebuilt from the raw parameter
 * value only when m_onlnstate reports that a recompute is needed; otherwise
 * the cached list is returned untouched.
 *
 * @return reference to the cached pattern list member.
 */
vector<string>& RclConfig::getOnlyNames()
{
    // Fast path: the cached list is still current.
    if (!m_onlnstate.needrecompute()) {
        return m_onlnlist;
    }
    // Re-split the parameter value into individual patterns.
    stringToStrings(m_onlnstate.getvalue(), m_onlnlist);
    return m_onlnlist;
}
vector<string> RclConfig::getSkippedPaths() const
{
vector<string> skpl;
@ -1680,15 +1690,15 @@ bool RclConfig::getUncompressor(const string &mtype, vector<string>& cmd) const
}
// Template for the explanatory comment header of a configuration file.
// It holds a single %s conversion, which stands for the location of the
// system-wide configuration files.
static const char blurb0[] =
    "# The system-wide configuration files for recoll are located in:\n"
    "# %s\n"
    "# The default configuration files are commented, you should take a look\n"
    "# at them for an explanation of what can be set (you could also take a look\n"
    "# at the manual instead).\n"
    "# Values set in this file will override the system-wide values for the file\n"
    "# with the same name in the central directory. The syntax for setting\n"
    "# values is identical.\n"
    ;
// We just use path_max to print the path to /usr/share/recoll/examples
// inside the config file. At worst, the text is truncated (using
// snprintf). But 4096 should be enough :)
@ -1799,6 +1809,7 @@ void RclConfig::initFrom(const RclConfig& r)
m_xattrtofld = r.m_xattrtofld;
m_maxsufflen = r.m_maxsufflen;
m_skpnlist = r.m_skpnlist;
m_onlnlist = r.m_onlnlist;
m_stopsuffixes = r.m_stopsuffixes;
m_defcharset = r.m_defcharset;
m_restrictMTypes = r.m_restrictMTypes;
@ -1829,196 +1840,8 @@ void RclConfig::initParamStale(ConfNull *cnf, ConfNull *mimemap)
m_oldstpsuffstate.init(mimemap);
m_stpsuffstate.init(cnf);
m_skpnstate.init(cnf);
m_onlnstate.init(cnf);
m_rmtstate.init(cnf);
m_xmtstate.init(cnf);
m_mdrstate.init(cnf);
}
#else // -> Test
#include <stdio.h>
#include <signal.h>
#include <iostream>
#include <vector>
#include <string>
using namespace std;
#include "log.h"
#include "rclinit.h"
#include "rclconfig.h"
#include "cstr.h"
// Program name as invoked (main() stores argv[0] here); used as the prefix
// of the usage message.
static char *thisprog;

// Help text listing the supported command line options.
static char usage [] =
    "\n"
    "-c: check a few things in the configuration files\n"
    "[-s subkey] -q param : query parameter value\n"
    "-f : print some field data\n"
    " : default: print parameters\n"
    ;

// Write the usage message to stderr and terminate with exit status 1.
static void Usage()
{
    fprintf(stderr, "%s: usage: %s\n", thisprog, usage);
    exit(1);
}

// Bit mask recording which command line options were given; one OPT_* bit
// per option letter.
static int op_flags;
#define OPT_MOINS 0x1
#define OPT_s     0x2
#define OPT_q     0x4
#define OPT_c     0x8
#define OPT_f     0x10
// Test driver for RclConfig.
//
// Parses the command line options (-c, -f, -s <subkey>, -q <param>), builds
// a configuration object through recollinit(), then runs exactly one of the
// following, in this order of precedence:
//   -q : print the value of one parameter (exit status 1 if the lookup fails)
//   -f : print the stored and indexed field names
//   -c : run consistency checks on the MIME type related configuration data
//   otherwise: dump every parameter name and value
// Exits with status 0 on success, 1 on usage or configuration errors.
int main(int argc, char **argv)
{
string pname, skey;
thisprog = argv[0];
argc--; argv++;
// Process every leading argument which begins with '-'. Several option
// letters may be grouped in one argument.
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* Case of a lone "-" argument, as in "adb - core" */
Usage();
while (**argv)
switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break;
case 'f': op_flags |= OPT_f; break;
// -s and -q take their value from the next argument, so at least two
// arguments must remain. The value is consumed here and the jump to b1
// abandons the rest of the current option group.
case 's': op_flags |= OPT_s; if (argc < 2) Usage();
skey = *(++argv);
argc--;
goto b1;
case 'q': op_flags |= OPT_q; if (argc < 2) Usage();
pname = *(++argv);
argc--;
goto b1;
default: Usage(); break;
}
// Step past the argument which was just processed.
b1: argc--; argv++;
}
// No positional arguments are accepted.
if (argc != 0)
Usage();
string reason;
// Obtain the configuration object; reason receives the error text which is
// printed on failure.
RclConfig *config = recollinit(0, 0, 0, reason);
if (config == 0 || !config->ok()) {
cerr << "Configuration problem: " << reason << endl;
exit(1);
}
// -s: use the given subkey as the key directory for the lookups below.
if (op_flags & OPT_s)
config->setKeyDir(skey);
if (op_flags & OPT_q) {
// -q: look up a single parameter and print it.
string value;
if (!config->getConfParam(pname, value)) {
fprintf(stderr, "getConfParam failed for [%s]\n", pname.c_str());
exit(1);
}
printf("[%s] -> [%s]\n", pname.c_str(), value.c_str());
} else if (op_flags & OPT_f) {
// -f: list the stored fields, then each indexed field with its pfx value.
set<string> stored = config->getStoredFields();
set<string> indexed = config->getIndexedFields();
cout << "Stored fields: ";
for (set<string>::const_iterator it = stored.begin();
it != stored.end(); it++) {
cout << "[" << *it << "] ";
}
cout << endl;
cout << "Indexed fields: ";
for (set<string>::const_iterator it = indexed.begin();
it != indexed.end(); it++) {
const FieldTraits *ftp;
config->getFieldTraits(*it, &ftp);
// A null ftp after the call is reported as "(none)".
if (ftp)
cout << "[" << *it << "]" << " -> [" << ftp->pfx << "] ";
else
cout << "[" << *it << "]" << " -> [" << "(none)" << "] ";
}
cout << endl;
} else if (op_flags & OPT_c) {
// Checking the configuration consistency
// Find and display category names
vector<string> catnames;
config->getMimeCategories(catnames);
cout << "Categories: ";
for (vector<string>::const_iterator it = catnames.begin();
it != catnames.end(); it++) {
cout << *it << " ";
}
cout << endl;
// Compute union of all types from each category. Check that there
// are no duplicates while we are at it.
set<string> allmtsfromcats;
for (vector<string>::const_iterator it = catnames.begin();
it != catnames.end(); it++) {
vector<string> cts;
config->getMimeCatTypes(*it, cts);
for (vector<string>::const_iterator it1 = cts.begin();
it1 != cts.end(); it1++) {
// Already in map -> duplicate
if (allmtsfromcats.find(*it1) != allmtsfromcats.end()) {
cout << "Duplicate: [" << *it1 << "]" << endl;
}
allmtsfromcats.insert(*it1);
}
}
// Retrieve complete list of mime types
vector<string> mtypes = config->getAllMimeTypes();
// And check that each mime type is found in exactly one category
for (vector<string>::const_iterator it = mtypes.begin();
it != mtypes.end(); it++) {
if (allmtsfromcats.find(*it) == allmtsfromcats.end()) {
cout << "Not found in catgs: [" << *it << "]" << endl;
}
}
// List mime types not in mimeview
for (vector<string>::const_iterator it = mtypes.begin();
it != mtypes.end(); it++) {
if (config->getMimeViewerDef(*it, "", false).empty()) {
cout << "No viewer: [" << *it << "]" << endl;
}
}
// Check that each mime type has an indexer
for (vector<string>::const_iterator it = mtypes.begin();
it != mtypes.end(); it++) {
if (config->getMimeHandlerDef(*it, false).empty()) {
cout << "No filter: [" << *it << "]" << endl;
}
}
// Check that each mime type has a defined icon
for (vector<string>::const_iterator it = mtypes.begin();
it != mtypes.end(); it++) {
// The icon name "document" is reported as missing or generic.
if (config->getMimeIconPath(*it, "") == "document") {
cout << "No or generic icon: [" << *it << "]" << endl;
}
}
} else {
// Default action: print every parameter name with its value, after
// setting the key directory to cstr_null.
config->setKeyDir(cstr_null);
vector<string> names = config->getConfNames();
for (vector<string>::iterator it = names.begin();
it != names.end();it++) {
string value;
config->getConfParam(*it, value);
cout << *it << " -> [" << value << "]" << endl;
}
}
exit(0);
}
#endif // TEST_RCLCONFIG

View File

@ -214,6 +214,9 @@ class RclConfig {
/** Get list of skipped file names for current keydir */
vector<string>& getSkippedNames();
/** Get list of file name filters for current keydir (only those
names indexed) */
vector<string>& getOnlyNames();
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
vector<string> getSkippedPaths() const;
@ -390,9 +393,14 @@ class RclConfig {
ParamStale m_stpsuffstate;
vector<string> m_stopsuffvec;
// skippedNames state
ParamStale m_skpnstate;
vector<string> m_skpnlist;
// onlyNames state
ParamStale m_onlnstate;
vector<string> m_onlnlist;
// Original current working directory. Set once at init before we do any
// chdir'ing and used for converting user args to absolute paths.
static string o_origcwd;

View File

@ -360,6 +360,7 @@ bool FsIndexer::indexFiles(list<string>& files, int flags)
bool follow = false;
m_config->getConfParam("followLinks", &follow);
walker.setOnlyNames(m_config->getOnlyNames());
walker.setSkippedNames(m_config->getSkippedNames());
// Check path against indexed areas and skipped names/paths
if (!(flags & ConfIndexer::IxFIgnoreSkip) &&
@ -376,7 +377,13 @@ bool FsIndexer::indexFiles(list<string>& files, int flags)
it++;
continue;
}
if (!(flags & ConfIndexer::IxFIgnoreSkip) &&
(S_ISREG(stb.st_mode) || S_ISLNK(stb.st_mode))) {
if (!walker.inOnlyNames(path_getsimple(*it))) {
it++;
continue;
}
}
if (processone(*it, &stb, FsTreeWalker::FtwRegular) !=
FsTreeWalker::FtwOk) {
LOGERR("FsIndexer::indexFiles: processone failed\n");
@ -583,7 +590,8 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
flg == FsTreeWalker::FtwDirReturn) {
m_config->setKeyDir(fn);
// Set up skipped patterns for this subtree.
// Set up filter/skipped patterns for this subtree.
m_walker.setOnlyNames(m_config->getOnlyNames());
m_walker.setSkippedNames(m_config->getSkippedNames());
// Adjust local fields from config for this subtree

View File

@ -235,6 +235,7 @@ void *rclMonRcvRun(void *q)
// skippedPaths here, this would be incorrect (because a
// topdir can be under a skippedPath and this was handled
// while adding the watches).
// Also we let the other side process onlyNames.
lconfig.setKeyDir(path_getfather(ev.m_path));
walker.setSkippedNames(lconfig.getSkippedNames());
if (walker.inSkippedNames(path_getsimple(ev.m_path)))

View File

@ -60,6 +60,13 @@ skippedNames- =
# list.</brief><descr></descr></var>
skippedNames+ =
# <var name="onlyNames" type="string">
# <brief>Regular file name filter patterns</brief>
# <descr>If this is set, only the file names not in skippedNames and
# matching one of the patterns will be considered for indexing. Can be
# redefined per subtree. Does not apply to directories.</descr></var>
onlyNames =
# <var name="noContentSuffixes" type="string">
#
# <brief>List of name endings (not necessarily dot-separated suffixes) for

View File

@ -37,7 +37,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
-D_GNU_SOURCE \
$(DEFS)
noinst_PROGRAMS = textsplit utf8iter fstreewalk
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig
textsplit_SOURCES = trtextsplit.cpp
textsplit_LDADD = ../librecoll.la
@ -48,3 +48,6 @@ utf8iter_LDADD = ../librecoll.la
fstreewalk_SOURCES = trfstreewalk.cpp
fstreewalk_LDADD = ../librecoll.la
rclconfig_SOURCES = trrclconfig.cpp
rclconfig_LDADD = ../librecoll.la

View File

@ -29,8 +29,8 @@ using namespace std;
static int op_flags;
#define OPT_MOINS 0x1
#define OPT_p 0x2
#define OPT_P 0x4
#define OPT_p 0x2
#define OPT_P 0x4
#define OPT_r 0x8
#define OPT_c 0x10
#define OPT_b 0x20
@ -41,24 +41,31 @@ static int op_flags;
#define OPT_M 0x400
#define OPT_D 0x800
#define OPT_k 0x1000
#define OPT_y 0x2000
#define OPT_s 0x4000
/**
 * Tree walk callback for the test driver: prints what the walker reports
 * to stdout.
 *
 * - Directory entry: with OPT_r set the bare path is printed; otherwise an
 *   "[Entering ...]" line is printed unless OPT_s is set.
 * - Return to a directory: a "[Returning to ...]" line is printed unless
 *   OPT_s is set.
 * - Regular entry: the path is printed.
 * Other flag values print nothing.
 */
class myCB : public FsTreeWalkerCB {
public:
    /**
     * @param path path of the entry being reported.
     * @param st stat data for the entry (unused here).
     * @param flg kind of event reported by the walker.
     * @return always FsTreeWalker::FtwOk, so that the walk goes on.
     */
    FsTreeWalker::Status processone(const string &path,
                                    const struct stat *st,
                                    FsTreeWalker::CbFlag flg) {
        if (flg == FsTreeWalker::FtwDirEnter) {
            if (op_flags & OPT_r) {
                cout << path << endl;
            } else {
                if (!(op_flags&OPT_s)) {
                    cout << "[Entering " << path << "]" << endl;
                }
            }
        } else if (flg == FsTreeWalker::FtwDirReturn) {
            if (!(op_flags&OPT_s)) {
                cout << "[Returning to " << path << "]" << endl;
            }
        } else if (flg == FsTreeWalker::FtwRegular) {
            cout << path << endl;
        }
        return FsTreeWalker::FtwOk;
    }
};
static const char *thisprog;
@ -83,16 +90,20 @@ static const char *thisprog;
// Command line help text: a synopsis line followed by one line per option,
// upper case letters first, each option listed exactly once.
static char usage [] =
    "trfstreewalk [-p pattern] [-P ignpath] [-r] [-c] [-L] topdir\n"
    " -D : skip dotfiles\n"
    " -L : follow symbolic links\n"
    " -M <depth>: limit depth (works with -b/m/d)\n"
    " -P <pattern> : add skippedPaths entry\n"
    " -p <pattern> : add skippedNames entry\n"
    " -b : use breadth first walk\n"
    " -c : no path canonification\n"
    " -d : use almost depth first (dir files, then subdirs)\n"
    " -k : like du\n"
    " -m : use breadth up to 4 deep then switch to -d\n"
    " -r : norecurse\n"
    " -s : don't print dir change info\n"
    " -w : unset default FNM_PATHNAME when using fnmatch() to match skipped paths\n"
    " -y <pattern> : add onlyNames entry\n"
    ;
static void
Usage(void)
@ -103,47 +114,53 @@ Usage(void)
int main(int argc, const char **argv)
{
vector<string> patterns;
vector<string> paths;
vector<string> skpnames;
vector<string> onlynames;
vector<string> skppaths;
int maxdepth = -1;
thisprog = argv[0];
argc--; argv++;
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* Cas du "adb - core" */
Usage();
while (**argv)
switch (*(*argv)++) {
case 'b': op_flags |= OPT_b; break;
case 'c': op_flags |= OPT_c; break;
case 'd': op_flags |= OPT_d; break;
case 'D': op_flags |= OPT_D; break;
case 'k': op_flags |= OPT_k; break;
case 'L': op_flags |= OPT_L; break;
case 'm': op_flags |= OPT_m; break;
case 'M': op_flags |= OPT_M; if (argc < 2) Usage();
maxdepth = atoi(*(++argv));
argc--;
goto b1;
case 'p': op_flags |= OPT_p; if (argc < 2) Usage();
patterns.push_back(*(++argv));
argc--;
goto b1;
case 'P': op_flags |= OPT_P; if (argc < 2) Usage();
paths.push_back(*(++argv));
argc--;
goto b1;
case 'r': op_flags |= OPT_r; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
(*argv)++;
if (!(**argv))
/* Cas du "adb - core" */
Usage();
while (**argv)
switch (*(*argv)++) {
case 'b': op_flags |= OPT_b; break;
case 'c': op_flags |= OPT_c; break;
case 'd': op_flags |= OPT_d; break;
case 'D': op_flags |= OPT_D; break;
case 'k': op_flags |= OPT_k; break;
case 'L': op_flags |= OPT_L; break;
case 'm': op_flags |= OPT_m; break;
case 'M': op_flags |= OPT_M; if (argc < 2) Usage();
maxdepth = atoi(*(++argv));
argc--;
goto b1;
case 'p': op_flags |= OPT_p; if (argc < 2) Usage();
skpnames.push_back(*(++argv));
argc--;
goto b1;
case 'P': op_flags |= OPT_P; if (argc < 2) Usage();
skppaths.push_back(*(++argv));
argc--;
goto b1;
case 'r': op_flags |= OPT_r; break;
case 's': op_flags |= OPT_s; break;
case 'w': op_flags |= OPT_w; break;
case 'y': op_flags |= OPT_y; if (argc < 2) Usage();
onlynames.push_back(*(++argv));
argc--;
goto b1;
default: Usage(); break;
}
b1: argc--; argv++;
}
if (argc != 1)
Usage();
Usage();
string topdir = *argv++;argc--;
if (op_flags & OPT_k) {
@ -159,36 +176,37 @@ int main(int argc, const char **argv)
int opt = 0;
if (op_flags & OPT_r)
opt |= FsTreeWalker::FtwNoRecurse;
opt |= FsTreeWalker::FtwNoRecurse;
if (op_flags & OPT_c)
opt |= FsTreeWalker::FtwNoCanon;
opt |= FsTreeWalker::FtwNoCanon;
if (op_flags & OPT_L)
opt |= FsTreeWalker::FtwFollow;
opt |= FsTreeWalker::FtwFollow;
if (op_flags & OPT_D)
opt |= FsTreeWalker::FtwSkipDotFiles;
opt |= FsTreeWalker::FtwSkipDotFiles;
if (op_flags & OPT_b)
opt |= FsTreeWalker::FtwTravBreadth;
opt |= FsTreeWalker::FtwTravBreadth;
else if (op_flags & OPT_d)
opt |= FsTreeWalker::FtwTravFilesThenDirs;
opt |= FsTreeWalker::FtwTravFilesThenDirs;
else if (op_flags & OPT_m)
opt |= FsTreeWalker::FtwTravBreadthThenDepth;
opt |= FsTreeWalker::FtwTravBreadthThenDepth;
string reason;
if (!recollinit(0, 0, 0, reason)) {
fprintf(stderr, "Init failed: %s\n", reason.c_str());
exit(1);
fprintf(stderr, "Init failed: %s\n", reason.c_str());
exit(1);
}
if (op_flags & OPT_w) {
FsTreeWalker::setNoFnmPathname();
FsTreeWalker::setNoFnmPathname();
}
FsTreeWalker walker;
walker.setOpts(opt);
walker.setMaxDepth(maxdepth);
walker.setSkippedNames(patterns);
walker.setSkippedPaths(paths);
walker.setSkippedNames(skpnames);
walker.setOnlyNames(onlynames);
walker.setSkippedPaths(skppaths);
myCB cb;
walker.walk(topdir, cb);
if (walker.getErrCnt() > 0)
cout << walker.getReason();
cout << walker.getReason();
}

View File

@ -70,6 +70,7 @@ public:
int basedepth;
stringstream reason;
vector<string> skippedNames;
vector<string> onlyNames;
vector<string> skippedPaths;
// When doing Breadth or FilesThenDirs traversal, we keep a list
// of directory paths to be processed, and we do not recurse.
@ -149,9 +150,26 @@ bool FsTreeWalker::setSkippedNames(const vector<string> &patterns)
}
/**
 * Test a file name against the skipped names patterns.
 *
 * Each pattern is tried in turn with fnmatch() and no flags.
 *
 * @param name the name to test.
 * @return true if at least one pattern matches, false otherwise (including
 *     when there are no patterns).
 */
bool FsTreeWalker::inSkippedNames(const string& name)
{
    for (const auto& pattern : data->skippedNames) {
        if (fnmatch(pattern.c_str(), name.c_str(), 0) == 0) {
            return true;
        }
    }
    return false;
}
/**
 * Replace the current onlyNames pattern list with a copy of the given one.
 *
 * @param patterns the new list of patterns; it is copied.
 * @return always true.
 */
bool FsTreeWalker::setOnlyNames(const vector<string> &patterns)
{
data->onlyNames = patterns;
return true;
}
bool FsTreeWalker::inOnlyNames(const string& name)
{
if (data->onlyNames.empty()) {
// Not set: all match
return true;
}
for (const auto& pattern : data->onlyNames) {
if (fnmatch(pattern.c_str(), name.c_str(), 0) == 0) {
return true;
}
}
@ -463,6 +481,11 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top,
& (FtwStop|FtwError))
goto out;
} else if (S_ISREG(st.st_mode) || S_ISLNK(st.st_mode)) {
// Filtering patterns match ?
if (!data->onlyNames.empty()) {
if (!inOnlyNames(dname))
continue;
}
if ((status = cb.processone(fn, &st, FtwRegular)) &
(FtwStop|FtwError)) {
goto out;

View File

@ -108,6 +108,8 @@ class FsTreeWalker {
bool addSkippedName(const string &pattern);
/** Set the ignored patterns set */
bool setSkippedNames(const vector<string> &patterns);
/** Set the exclusive patterns set */
bool setOnlyNames(const vector<string> &patterns);
/** Same for skipped paths: these are paths, not names, under which we
do not descend (ie: /home/me/.recoll) */
@ -119,6 +121,7 @@ class FsTreeWalker {
* an actual tree walk */
bool inSkippedPaths(const string& path, bool ckparents = false);
bool inSkippedNames(const string& name);
bool inOnlyNames(const string& name);
private:
Status iwalk(const string &dir, struct stat *stp, FsTreeWalkerCB& cb);

View File

@ -66,3 +66,6 @@ excludedmimetypes = text/plain
[/home/dockes/projets/fulltext/testrecoll/excludehtml]
indexedmimetypes = application/pdf
[/home/dockes/projets/fulltext/testrecoll/onlynames]
onlyNames = *.matchesonepat *.matchestwopat

14
tests/onlynames/onlynames.sh Executable file
View File

@ -0,0 +1,14 @@
#!/bin/sh
topdir=`dirname $0`/..
. $topdir/shared.sh
initvariables $0
(
recollq -S url -q onlynametestkeyword
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
checkresult

View File

@ -0,0 +1,3 @@
2 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/onlynames/subdir/fn.matchesonepat] [fn.matchesonepat] 20 bytes
text/plain [file:///home/dockes/projets/fulltext/testrecoll/onlynames/subdir/fn.matchestwopat] [fn.matchestwopat] 20 bytes