indents + use range-based for loops in extrameta.cpp

This commit is contained in:
Jean-Francois Dockes 2022-09-22 17:10:07 +02:00
parent 9e0018034c
commit 20c3a7ed12
6 changed files with 315 additions and 333 deletions

View File

@ -30,20 +30,18 @@ using std::string;
using std::map;
// Store one metadata value (obtained from an external command or an
// extended attribute, per the log message below) into doc.
//  cfg:   configuration; only used to canonicalize the field name
//         through fieldCanon().
//  name:  field name as supplied by the caller.
//  value: field value, stored unchanged.
//  doc:   document to update. When the canonical name equals
//         cstr_dj_keymd the value goes to doc.dmtime, otherwise to
//         doc.meta[<canonical name>], replacing any previous entry.
// NOTE(review): this span comes from a rendered commit diff, so the
// removed and the added form of each changed line both appear below.
static void docfieldfrommeta(RclConfig* cfg, const string& name,
const string &value, Rcl::Doc& doc)
const string &value, Rcl::Doc& doc)
{
string fieldname = cfg->fieldCanon(name);
LOGDEB0("Internfile:: setting [" << fieldname <<
"] from cmd/xattr value [" << value << "]\n");
LOGDEB0("Internfile:: setting [" << fieldname << "] from cmd/xattr value [" << value << "]\n");
// The cstr_dj_keymd field is routed to the dedicated dmtime member
// instead of the generic meta map.
if (fieldname == cstr_dj_keymd) {
doc.dmtime = value;
doc.dmtime = value;
} else {
doc.meta[fieldname] = value;
doc.meta[fieldname] = value;
}
}
void reapXAttrs(const RclConfig* cfg, const string& path,
map<string, string>& xfields)
void reapXAttrs(const RclConfig* cfg, const string& path, map<string, string>& xfields)
{
LOGDEB2("reapXAttrs: [" << path << "]\n");
#ifndef _WIN32
@ -51,39 +49,35 @@ void reapXAttrs(const RclConfig* cfg, const string& path,
vector<string> xnames;
if (!pxattr::list(path, &xnames)) {
if (errno == ENOTSUP) {
LOGDEB("FileInterner::reapXattrs: pxattr::list: errno " <<
errno << "\n");
LOGDEB("FileInterner::reapXattrs: pxattr::list: errno " << errno << "\n");
} else {
LOGERR("FileInterner::reapXattrs: pxattr::list: errno " <<
errno << "\n");
LOGSYSERR("FileInterner::reapXattrs", "pxattr::list", path);
}
return;
return;
}
const map<string, string>& xtof = cfg->getXattrToField();
// Record the xattrs: names found in the config are either skipped
// or mapped depending if the translation is empty. Other names
// are recorded as-is
for (vector<string>::const_iterator it = xnames.begin();
it != xnames.end(); it++) {
string key = *it;
map<string, string>::const_iterator mit = xtof.find(*it);
if (mit != xtof.end()) {
if (mit->second.empty()) {
continue;
} else {
key = mit->second;
for (const auto& xkey : xnames) {
string key = xkey;
auto mit = xtof.find(xkey);
if (mit != xtof.end()) {
if (mit->second.empty()) {
continue;
} else {
key = mit->second;
}
}
}
string value;
if (!pxattr::get(path, *it, &value, pxattr::PXATTR_NOFOLLOW)) {
LOGERR("FileInterner::reapXattrs: pxattr::get failed for " << *it
<< ", errno " << errno << "\n");
continue;
}
// Encode should we ?
xfields[key] = value;
LOGDEB2("reapXAttrs: [" << key << "] -> [" << value << "]\n");
string value;
if (!pxattr::get(path, xkey, &value, pxattr::PXATTR_NOFOLLOW)) {
LOGSYSERR("FileInterner::reapXattrs", "pxattr::get", path + " : " + xkey);
continue;
}
// Encode should we ?
xfields[key] = value;
LOGDEB2("reapXAttrs: [" << key << "] -> [" << value << "]\n");
}
#else
PRETEND_USE(cfg);
@ -92,35 +86,30 @@ void reapXAttrs(const RclConfig* cfg, const string& path,
#endif
}
// Copy every (name, value) pair of xfields into doc by calling
// docfieldfrommeta() once per entry, in map iteration order.
//  cfg:     configuration, forwarded to docfieldfrommeta().
//  xfields: name -> value map to transfer (presumably the one filled
//           by reapXAttrs() -- confirm against the callers).
//  doc:     document receiving the fields.
// NOTE(review): this span comes from a rendered commit diff, so the
// removed iterator loop and the added range-based loop both appear below.
void docFieldsFromXattrs(RclConfig *cfg, const map<string, string>& xfields,
Rcl::Doc& doc)
void docFieldsFromXattrs(RclConfig *cfg, const map<string, string>& xfields, Rcl::Doc& doc)
{
for (map<string,string>::const_iterator it = xfields.begin();
it != xfields.end(); it++) {
docfieldfrommeta(cfg, it->first, it->second, doc);
for (const auto& fld : xfields) {
docfieldfrommeta(cfg, fld.first, fld.second, doc);
}
}
// Run the metadata commands listed by cfg->getMDReapers() for the file at
// path and record their outputs.
//  cfg:     configuration providing the list of reapers; each reaper has
//           a command vector (cmdv) and a target field name (fieldname).
//  path:    file path, made available to the commands as the 'f'
//           substitution value.
//  cfields: output map. For every command for which ExecCmd::backtick()
//           returns true, cfields[fieldname] is set to the captured
//           output. Failed commands leave the map untouched.
// Returns immediately when no reapers are configured.
// NOTE(review): this span comes from a rendered commit diff, so the
// removed iterator loops and the added range-based loops both appear below.
void reapMetaCmds(RclConfig* cfg, const string& path,
map<string, string>& cfields)
void reapMetaCmds(RclConfig* cfg, const string& path, map<string, string>& cfields)
{
const vector<MDReaper>& reapers = cfg->getMDReapers();
const auto& reapers = cfg->getMDReapers();
if (reapers.empty())
return;
return;
// Substitution table handed to pcSubst(): key 'f' maps to the file path
// (presumably expanded where the command uses %f -- confirm in pcSubst).
map<char,string> smap = {{'f', path}};
for (vector<MDReaper>::const_iterator rp = reapers.begin();
rp != reapers.end(); rp++) {
vector<string> cmd;
for (vector<string>::const_iterator it = rp->cmdv.begin();
it != rp->cmdv.end(); it++) {
string s;
pcSubst(*it, s, smap);
cmd.push_back(s);
}
string output;
if (ExecCmd::backtick(cmd, output)) {
cfields[rp->fieldname] = output;
}
for (const auto& reaper : reapers) {
// Build the actual argument vector by substituting each element.
vector<string> cmd;
for (const auto& arg : reaper.cmdv) {
string s;
pcSubst(arg, s, smap);
cmd.push_back(s);
}
string output;
if (ExecCmd::backtick(cmd, output)) {
cfields[reaper.fieldname] = output;
}
}
}
@ -132,26 +121,23 @@ void reapMetaCmds(RclConfig* cfg, const string& path,
// "modificationdate" will set mtime instead of an ordinary field,
// and the output from anything beginning with "rclmulti" will be
// interpreted as multiple fields in configuration file format...
// Transfer the command outputs held in cfields into doc.
//  cfg:     configuration, forwarded to docfieldfrommeta().
//  cfields: field name -> command output map.
//  doc:     document receiving the fields.
// An entry whose name starts with "rclmulti" is not stored under that
// name: its value is parsed by ConfSimple and, if the parse is ok(), every
// name returned by getNames("") whose get() succeeds is stored as a
// separate field. A value that fails to parse is dropped silently. All
// other entries are stored directly through docfieldfrommeta().
// NOTE(review): this span comes from a rendered commit diff, so the
// removed iterator loops and the added range-based loops both appear below.
void docFieldsFromMetaCmds(RclConfig *cfg, const map<string, string>& cfields,
Rcl::Doc& doc)
void docFieldsFromMetaCmds(RclConfig *cfg, const map<string, string>& cfields, Rcl::Doc& doc)
{
for (map<string,string>::const_iterator it = cfields.begin();
it != cfields.end(); it++) {
if (!it->first.compare(0, 8, "rclmulti")) {
ConfSimple simple(it->second);
if (simple.ok()) {
vector<string> names = simple.getNames("");
for (vector<string>::const_iterator nm = names.begin();
nm != names.end(); nm++) {
string value;
if (simple.get(*nm, value)) {
docfieldfrommeta(cfg, *nm, value, doc);
for (const auto& cfld : cfields) {
// compare() returns 0 on equality: true when the first 8 characters of
// the field name are exactly "rclmulti".
if (!cfld.first.compare(0, 8, "rclmulti")) {
ConfSimple simple(cfld.second);
if (simple.ok()) {
auto names = simple.getNames("");
for (const auto& nm : names) {
string value;
if (simple.get(nm, value)) {
docfieldfrommeta(cfg, nm, value, doc);
}
}
}
} else {
docfieldfrommeta(cfg, cfld.first, cfld.second, doc);
}
}
}
} else {
docfieldfrommeta(cfg, it->first, it->second, doc);
}
}
}

View File

@ -34,7 +34,7 @@ inline void
lowercase_string(string &str)
{
// Convert str to lowercase in place, one char at a time, using tolower().
// Each char is cast to unsigned char first: the <cctype> functions need a
// value representable as unsigned char, which a negative plain char is not.
// NOTE(review): this span comes from a rendered commit diff; the assignment
// appears twice because only its indentation changed.
for (string::iterator i = str.begin(); i != str.end(); ++i) {
*i = tolower(static_cast<unsigned char>(*i));
*i = tolower(static_cast<unsigned char>(*i));
}
}
@ -68,7 +68,7 @@ inline static bool
p_nottag(char c)
{
// Predicate: true when c is neither alphanumeric (per isalnum) nor one of
// '.', '-' or ':'. parse_html() passes it to find_if() to locate the end of
// a tag name.
// NOTE(review): this span comes from a rendered commit diff; the second
// line of the expression appears twice because only its indentation changed.
return !isalnum(static_cast<unsigned char>(c)) &&
c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
}
inline static bool
@ -99,14 +99,14 @@ HtmlParser::HtmlParser()
#if 0
static const struct ent { const char *n; unsigned int v; } ents[] = {
#include "namedentities.h"
{ NULL, 0 }
{ NULL, 0 }
};
if (named_ents.empty()) {
const struct ent *i = ents;
while (i->n) {
named_ents[string(i->n)] = i->v;
++i;
}
const struct ent *i = ents;
while (i->n) {
named_ents[string(i->n)] = i->v;
++i;
}
}
#endif
}
@ -123,45 +123,45 @@ HtmlParser::decode_entities(string &)
// find() and find_if() templates don't work...
string::const_iterator amp = s.begin(), s_end = s.end();
while ((amp = find(amp, s_end, '&')) != s_end) {
unsigned int val = 0;
string::const_iterator end, p = amp + 1;
if (p != s_end && *p == '#') {
p++;
if (p != s_end && (*p == 'x' || *p == 'X')) {
// hex
p++;
end = find_if(p, s_end, p_notxdigit);
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
unsigned int val = 0;
string::const_iterator end, p = amp + 1;
if (p != s_end && *p == '#') {
p++;
if (p != s_end && (*p == 'x' || *p == 'X')) {
// hex
p++;
end = find_if(p, s_end, p_notxdigit);
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
} else {
// number
end = find_if(p, s_end, p_notdigit);
val = atoi(s.substr(p - s.begin(), end - p).c_str());
}
} else {
// number
end = find_if(p, s_end, p_notdigit);
val = atoi(s.substr(p - s.begin(), end - p).c_str());
end = find_if(p, s_end, p_notalnum);
string code = s.substr(p - s.begin(), end - p);
map<string, unsigned int>::const_iterator i;
i = named_ents.find(code);
if (i != named_ents.end()) val = i->second;
}
} else {
end = find_if(p, s_end, p_notalnum);
string code = s.substr(p - s.begin(), end - p);
map<string, unsigned int>::const_iterator i;
i = named_ents.find(code);
if (i != named_ents.end()) val = i->second;
}
if (end < s_end && *end == ';') end++;
if (val) {
string::size_type amp_pos = amp - s.begin();
if (val < 0x80) {
s.replace(amp_pos, end - amp, 1u, char(val));
if (end < s_end && *end == ';') end++;
if (val) {
string::size_type amp_pos = amp - s.begin();
if (val < 0x80) {
s.replace(amp_pos, end - amp, 1u, char(val));
} else {
// Convert unicode value val to UTF-8.
char seq[4];
unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
s.replace(amp_pos, end - amp, seq, len);
}
s_end = s.end();
// We've modified the string, so the iterators are no longer
// valid...
amp = s.begin() + amp_pos + 1;
} else {
// Convert unicode value val to UTF-8.
char seq[4];
unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
s.replace(amp_pos, end - amp, seq, len);
amp = end;
}
s_end = s.end();
// We've modified the string, so the iterators are no longer
// valid...
amp = s.begin() + amp_pos + 1;
} else {
amp = end;
}
}
#endif
}
@ -175,222 +175,222 @@ HtmlParser::parse_html(const string &body)
string::const_iterator start = body.begin();
while (true) {
// Skip through until we find an HTML tag, a comment, or the end of
// document. Ignore isolated occurrences of `<' which don't start
// a tag or comment.
string::const_iterator p = start;
while (true) {
p = find(p, body.end(), '<');
if (p == body.end()) break;
unsigned char ch = *(p + 1);
// Skip through until we find an HTML tag, a comment, or the end of
// document. Ignore isolated occurrences of `<' which don't start
// a tag or comment.
string::const_iterator p = start;
while (true) {
p = find(p, body.end(), '<');
if (p == body.end()) break;
unsigned char ch = *(p + 1);
// Tag, closing tag, or comment (or SGML declaration).
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
// Tag, closing tag, or comment (or SGML declaration).
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
if (ch == '?') {
// PHP code or XML declaration.
// XML declaration is only valid at the start of the first line.
// FIXME: need to deal with BOMs...
if (p != body.begin() || body.size() < 20) break;
if (ch == '?') {
// PHP code or XML declaration.
// XML declaration is only valid at the start of the first line.
// FIXME: need to deal with BOMs...
if (p != body.begin() || body.size() < 20) break;
// XML declaration looks something like this:
// <?xml version="1.0" encoding="UTF-8"?>
if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
if (strchr(" \t\r\n", p[5]) == NULL) break;
// XML declaration looks something like this:
// <?xml version="1.0" encoding="UTF-8"?>
if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
if (strchr(" \t\r\n", p[5]) == NULL) break;
string::const_iterator decl_end = find(p + 6, body.end(), '?');
if (decl_end == body.end()) break;
string::const_iterator decl_end = find(p + 6, body.end(), '?');
if (decl_end == body.end()) break;
// Default charset for XML is UTF-8.
charset = "utf-8";
// Default charset for XML is UTF-8.
charset = "utf-8";
string decl(p + 6, decl_end);
size_t enc = decl.find("encoding");
if (enc == string::npos) break;
string decl(p + 6, decl_end);
size_t enc = decl.find("encoding");
if (enc == string::npos) break;
enc = decl.find_first_not_of(" \t\r\n", enc + 8);
if (enc == string::npos || enc == decl.size()) break;
enc = decl.find_first_not_of(" \t\r\n", enc + 8);
if (enc == string::npos || enc == decl.size()) break;
if (decl[enc] != '=') break;
if (decl[enc] != '=') break;
enc = decl.find_first_not_of(" \t\r\n", enc + 1);
if (enc == string::npos || enc == decl.size()) break;
enc = decl.find_first_not_of(" \t\r\n", enc + 1);
if (enc == string::npos || enc == decl.size()) break;
if (decl[enc] != '"' && decl[enc] != '\'') break;
if (decl[enc] != '"' && decl[enc] != '\'') break;
char quote = decl[enc++];
size_t enc_end = decl.find(quote, enc);
char quote = decl[enc++];
size_t enc_end = decl.find(quote, enc);
if (enc != string::npos)
charset = decl.substr(enc, enc_end - enc);
if (enc != string::npos)
charset = decl.substr(enc, enc_end - enc);
break;
}
p++;
}
// Process text up to start of tag.
if (p > start || p == body.end()) {
string text = body.substr(start - body.begin(), p - start);
decode_entities(text);
process_text(text);
}
if (p == body.end()) {
do_eof();
break;
}
start = p + 1;
if (start == body.end()) break;
if (*start == '!') {
if (++start == body.end()) break;
if (++start == body.end()) break;
// comment or SGML declaration
if (*(start - 1) == '-' && *start == '-') {
++start;
string::const_iterator close = find(start, body.end(), '>');
// An unterminated comment swallows rest of document
// (like Netscape, but unlike MSIE IIRC)
if (close == body.end()) break;
p = close;
// look for -->
while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
p = find(p + 1, body.end(), '>');
if (p != body.end()) {
// Check for htdig's "ignore this bit" comments.
if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
string::size_type i;
i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
if (i == string::npos) break;
start = body.begin() + i + 21;
continue;
}
// If we found --> skip to there.
start = p;
} else {
// Otherwise skip to the first > we found (as Netscape does).
start = close;
}
} else {
// just an SGML declaration, perhaps giving the DTD - ignore it
start = find(start - 1, body.end(), '>');
if (start == body.end()) break;
}
++start;
} else if (*start == '?') {
if (++start == body.end()) break;
// PHP - swallow until ?> or EOF
start = find(start + 1, body.end(), '>');
// look for ?>
while (start != body.end() && *(start - 1) != '?')
start = find(start + 1, body.end(), '>');
// unterminated PHP swallows rest of document (rather arbitrarily
// but it avoids polluting the database when things go wrong)
if (start != body.end()) ++start;
} else {
// opening or closing tag
int closing = 0;
if (*start == '/') {
closing = 1;
start = find_if(start + 1, body.end(), p_notwhitespace);
}
p = start;
start = find_if(start, body.end(), p_nottag);
string tag = body.substr(p - body.begin(), start - p);
// convert tagname to lowercase
lowercase_string(tag);
if (closing) {
if (!closing_tag(tag))
return;
if (in_script && tag == "script") in_script = false;
/* ignore any bogus parameters on closing tags */
p = find(start, body.end(), '>');
if (p == body.end()) break;
start = p + 1;
} else {
bool empty_element = false;
// FIXME: parse parameters lazily.
while (start < body.end() && *start != '>') {
string name, value;
p = find_if(start, body.end(), p_whitespaceeqgt);
size_t name_len = p - start;
if (name_len == 1) {
if (*start == '/' && p < body.end() && *p == '>') {
// E.g. <tag foo="bar" />
start = p;
empty_element = true;
break;
}
}
name.assign(body, start - body.begin(), name_len);
p = find_if(p, body.end(), p_notwhitespace);
start = p;
if (start != body.end() && *start == '=') {
start = find_if(start + 1, body.end(), p_notwhitespace);
p = body.end();
int quote = *start;
if (quote == '"' || quote == '\'') {
start++;
p = find(start, body.end(), quote);
}
if (p == body.end()) {
// unquoted or no closing quote
p = find_if(start, body.end(), p_whitespacegt);
}
value.assign(body, start - body.begin(), p - start);
start = find_if(p, body.end(), p_notwhitespace);
if (!name.empty()) {
// convert parameter name to lowercase
lowercase_string(name);
// in case of multiple entries, use the first
// (as Netscape does)
parameters.insert(make_pair(name, value));
}
}
p++;
}
// Process text up to start of tag.
if (p > start || p == body.end()) {
string text = body.substr(start - body.begin(), p - start);
decode_entities(text);
process_text(text);
}
if (p == body.end()) {
do_eof();
break;
}
start = p + 1;
if (start == body.end()) break;
if (*start == '!') {
if (++start == body.end()) break;
if (++start == body.end()) break;
// comment or SGML declaration
if (*(start - 1) == '-' && *start == '-') {
++start;
string::const_iterator close = find(start, body.end(), '>');
// An unterminated comment swallows rest of document
// (like Netscape, but unlike MSIE IIRC)
if (close == body.end()) break;
p = close;
// look for -->
while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
p = find(p + 1, body.end(), '>');
if (p != body.end()) {
// Check for htdig's "ignore this bit" comments.
if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
string::size_type i;
i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
if (i == string::npos) break;
start = body.begin() + i + 21;
continue;
}
// If we found --> skip to there.
start = p;
} else {
// Otherwise skip to the first > we found (as Netscape does).
start = close;
}
} else {
// just an SGML declaration, perhaps giving the DTD - ignore it
start = find(start - 1, body.end(), '>');
if (start == body.end()) break;
}
++start;
} else if (*start == '?') {
if (++start == body.end()) break;
// PHP - swallow until ?> or EOF
start = find(start + 1, body.end(), '>');
// look for ?>
while (start != body.end() && *(start - 1) != '?')
start = find(start + 1, body.end(), '>');
// unterminated PHP swallows rest of document (rather arbitrarily
// but it avoids polluting the database when things go wrong)
if (start != body.end()) ++start;
} else {
// opening or closing tag
int closing = 0;
if (*start == '/') {
closing = 1;
start = find_if(start + 1, body.end(), p_notwhitespace);
}
p = start;
start = find_if(start, body.end(), p_nottag);
string tag = body.substr(p - body.begin(), start - p);
// convert tagname to lowercase
lowercase_string(tag);
if (closing) {
if (!closing_tag(tag))
return;
if (in_script && tag == "script") in_script = false;
/* ignore any bogus parameters on closing tags */
p = find(start, body.end(), '>');
if (p == body.end()) break;
start = p + 1;
} else {
bool empty_element = false;
// FIXME: parse parameters lazily.
while (start < body.end() && *start != '>') {
string name, value;
p = find_if(start, body.end(), p_whitespaceeqgt);
size_t name_len = p - start;
if (name_len == 1) {
if (*start == '/' && p < body.end() && *p == '>') {
// E.g. <tag foo="bar" />
start = p;
empty_element = true;
break;
}
}
name.assign(body, start - body.begin(), name_len);
p = find_if(p, body.end(), p_notwhitespace);
start = p;
if (start != body.end() && *start == '=') {
start = find_if(start + 1, body.end(), p_notwhitespace);
p = body.end();
int quote = *start;
if (quote == '"' || quote == '\'') {
start++;
p = find(start, body.end(), quote);
}
if (p == body.end()) {
// unquoted or no closing quote
p = find_if(start, body.end(), p_whitespacegt);
}
value.assign(body, start - body.begin(), p - start);
start = find_if(p, body.end(), p_notwhitespace);
if (!name.empty()) {
// convert parameter name to lowercase
lowercase_string(name);
// in case of multiple entries, use the first
// (as Netscape does)
parameters.insert(make_pair(name, value));
}
}
}
#if 0
cout << "<" << tag;
map<string, string>::const_iterator x;
for (x = parameters.begin(); x != parameters.end(); x++) {
cout << " " << x->first << "=\"" << x->second << "\"";
}
cout << ">\n";
cout << "<" << tag;
map<string, string>::const_iterator x;
for (x = parameters.begin(); x != parameters.end(); x++) {
cout << " " << x->first << "=\"" << x->second << "\"";
}
cout << ">\n";
#endif
if (!opening_tag(tag))
return;
parameters.clear();
if (!opening_tag(tag))
return;
parameters.clear();
if (empty_element) {
if (!closing_tag(tag))
return;
if (empty_element) {
if (!closing_tag(tag))
return;
}
// In <script> tags we ignore opening tags to avoid problems
// with "a<b".
if (tag == "script") in_script = true;
if (start != body.end() && *start == '>') ++start;
}
}
// In <script> tags we ignore opening tags to avoid problems
// with "a<b".
if (tag == "script") in_script = true;
if (start != body.end() && *start == '>') ++start;
}
}
}
}

View File

@ -32,17 +32,17 @@ using std::map;
class HtmlParser {
map<string, string> parameters;
protected:
virtual void decode_entities(string &s);
bool in_script;
string charset;
protected:
virtual void decode_entities(string &s);
bool in_script;
string charset;
static map<string, unsigned int> named_ents;
bool get_parameter(const string & param, string & value) const;
public:
public:
virtual void process_text(const string &/*text*/) { }
virtual bool opening_tag(const string &/*tag*/) { return true; }
virtual bool closing_tag(const string &/*tag*/) { return true; }
virtual bool closing_tag(const string &/*tag*/) { return true; }
virtual void parse_html(const string &text);
virtual void do_eof() {}
HtmlParser();

View File

@ -39,8 +39,7 @@ public:
virtual void clear_impl() override;
protected:
virtual bool set_document_file_impl(const std::string&,
const std::string&) override;
virtual bool set_document_file_impl(const std::string&, const std::string&) override;
class Internal;
private:

View File

@ -33,11 +33,10 @@
/// Associated to application/x-zerosize, so use the following in mimeconf:
/// <mimetype> = internal application/x-zerosize
class MimeHandlerNull : public RecollFilter {
public:
public:
MimeHandlerNull(RclConfig *cnf, const std::string& id)
: RecollFilter(cnf, id) {
}
virtual ~MimeHandlerNull() {}
: RecollFilter(cnf, id) {}
virtual ~MimeHandlerNull() = default;
MimeHandlerNull(const MimeHandlerNull&) = delete;
MimeHandlerNull& operator=(const MimeHandlerNull&) = delete;
@ -45,14 +44,13 @@ class MimeHandlerNull : public RecollFilter {
return true;
}
virtual bool next_document()
{
if (m_havedoc == false)
return false;
m_havedoc = false;
m_metaData[cstr_dj_keycontent] = cstr_null;
m_metaData[cstr_dj_keymt] = cstr_textplain;
return true;
virtual bool next_document() {
if (m_havedoc == false)
return false;
m_havedoc = false;
m_metaData[cstr_dj_keycontent] = cstr_null;
m_metaData[cstr_dj_keymt] = cstr_textplain;
return true;
}
};

View File

@ -36,9 +36,8 @@
class MimeHandlerSymlink : public RecollFilter {
public:
MimeHandlerSymlink(RclConfig *cnf, const std::string& id)
: RecollFilter(cnf, id) {
}
virtual ~MimeHandlerSymlink() {}
: RecollFilter(cnf, id) {}
virtual ~MimeHandlerSymlink() = default;
MimeHandlerSymlink(const MimeHandlerSymlink&) = delete;
MimeHandlerSymlink& operator=(const MimeHandlerSymlink&) = delete;