diff --git a/src/internfile/extrameta.cpp b/src/internfile/extrameta.cpp index c050828a..26e0a057 100644 --- a/src/internfile/extrameta.cpp +++ b/src/internfile/extrameta.cpp @@ -30,20 +30,18 @@ using std::string; using std::map; static void docfieldfrommeta(RclConfig* cfg, const string& name, - const string &value, Rcl::Doc& doc) + const string &value, Rcl::Doc& doc) { string fieldname = cfg->fieldCanon(name); - LOGDEB0("Internfile:: setting [" << fieldname << - "] from cmd/xattr value [" << value << "]\n"); + LOGDEB0("Internfile:: setting [" << fieldname << "] from cmd/xattr value [" << value << "]\n"); if (fieldname == cstr_dj_keymd) { - doc.dmtime = value; + doc.dmtime = value; } else { - doc.meta[fieldname] = value; + doc.meta[fieldname] = value; } } -void reapXAttrs(const RclConfig* cfg, const string& path, - map& xfields) +void reapXAttrs(const RclConfig* cfg, const string& path, map& xfields) { LOGDEB2("reapXAttrs: [" << path << "]\n"); #ifndef _WIN32 @@ -51,39 +49,35 @@ void reapXAttrs(const RclConfig* cfg, const string& path, vector xnames; if (!pxattr::list(path, &xnames)) { if (errno == ENOTSUP) { - LOGDEB("FileInterner::reapXattrs: pxattr::list: errno " << - errno << "\n"); + LOGDEB("FileInterner::reapXattrs: pxattr::list: errno " << errno << "\n"); } else { - LOGERR("FileInterner::reapXattrs: pxattr::list: errno " << - errno << "\n"); + LOGSYSERR("FileInterner::reapXattrs", "pxattr::list", path); } - return; + return; } const map& xtof = cfg->getXattrToField(); // Record the xattrs: names found in the config are either skipped // or mapped depending if the translation is empty. Other names // are recorded as-is - for (vector::const_iterator it = xnames.begin(); - it != xnames.end(); it++) { - string key = *it; - map::const_iterator mit = xtof.find(*it); - if (mit != xtof.end()) { - if (mit->second.empty()) { - continue; - } else { - key = mit->second; + for (const auto& xkey : xnames) { + string key = xkey; + auto mit = xtof.find(xkey); + if (mit != xtof.end()) { + if (mit->second.empty()) { + continue; + } else { + key = mit->second; + } } - } - string value; - if (!pxattr::get(path, *it, &value, pxattr::PXATTR_NOFOLLOW)) { - LOGERR("FileInterner::reapXattrs: pxattr::get failed for " << *it - << ", errno " << errno << "\n"); - continue; - } - // Encode should we ? - xfields[key] = value; - LOGDEB2("reapXAttrs: [" << key << "] -> [" << value << "]\n"); + string value; + if (!pxattr::get(path, xkey, &value, pxattr::PXATTR_NOFOLLOW)) { + LOGSYSERR("FileInterner::reapXattrs", "pxattr::get", path + " : " + xkey); + continue; + } + // Encode should we ? + xfields[key] = value; + LOGDEB2("reapXAttrs: [" << key << "] -> [" << value << "]\n"); } #else PRETEND_USE(cfg); @@ -92,35 +86,30 @@ void reapXAttrs(const RclConfig* cfg, const string& path, #endif } -void docFieldsFromXattrs(RclConfig *cfg, const map& xfields, - Rcl::Doc& doc) +void docFieldsFromXattrs(RclConfig *cfg, const map& xfields, Rcl::Doc& doc) { - for (map::const_iterator it = xfields.begin(); - it != xfields.end(); it++) { - docfieldfrommeta(cfg, it->first, it->second, doc); + for (const auto& fld : xfields) { + docfieldfrommeta(cfg, fld.first, fld.second, doc); } } -void reapMetaCmds(RclConfig* cfg, const string& path, - map& cfields) +void reapMetaCmds(RclConfig* cfg, const string& path, map& cfields) { - const vector& reapers = cfg->getMDReapers(); + const auto& reapers = cfg->getMDReapers(); if (reapers.empty()) - return; + return; map smap = {{'f', path}}; - for (vector::const_iterator rp = reapers.begin(); - rp != reapers.end(); rp++) { - vector cmd; - for (vector::const_iterator it = rp->cmdv.begin(); - it != rp->cmdv.end(); it++) { - string s; - pcSubst(*it, s, smap); - cmd.push_back(s); - } - string output; - if (ExecCmd::backtick(cmd, output)) { - cfields[rp->fieldname] = output; - } + for (const auto& reaper : reapers) { + vector cmd; + for (const auto& arg : reaper.cmdv) { + string s; + pcSubst(arg, s, smap); + cmd.push_back(s); + } + string output; + if (ExecCmd::backtick(cmd, output)) { + cfields[reaper.fieldname] = output; + } } } @@ -132,26 +121,23 @@ void reapMetaCmds(RclConfig* cfg, const string& path, // "modificationdate" will set mtime instead of an ordinary field, // and the output from anything beginning with "rclmulti" will be // interpreted as multiple fields in configuration file format... -void docFieldsFromMetaCmds(RclConfig *cfg, const map& cfields, - Rcl::Doc& doc) +void docFieldsFromMetaCmds(RclConfig *cfg, const map& cfields, Rcl::Doc& doc) { - for (map::const_iterator it = cfields.begin(); - it != cfields.end(); it++) { - if (!it->first.compare(0, 8, "rclmulti")) { - ConfSimple simple(it->second); - if (simple.ok()) { - vector names = simple.getNames(""); - for (vector::const_iterator nm = names.begin(); - nm != names.end(); nm++) { - string value; - if (simple.get(*nm, value)) { - docfieldfrommeta(cfg, *nm, value, doc); + for (const auto& cfld : cfields) { + if (!cfld.first.compare(0, 8, "rclmulti")) { + ConfSimple simple(cfld.second); + if (simple.ok()) { + auto names = simple.getNames(""); + for (const auto& nm : names) { + string value; + if (simple.get(nm, value)) { + docfieldfrommeta(cfg, nm, value, doc); + } + } } + } else { + docfieldfrommeta(cfg, cfld.first, cfld.second, doc); } - } - } else { - docfieldfrommeta(cfg, it->first, it->second, doc); - } } } diff --git a/src/internfile/htmlparse.cpp b/src/internfile/htmlparse.cpp index fd10b897..5ec30be7 100644 --- a/src/internfile/htmlparse.cpp +++ b/src/internfile/htmlparse.cpp @@ -34,7 +34,7 @@ inline void lowercase_string(string &str) { for (string::iterator i = str.begin(); i != str.end(); ++i) { - *i = tolower(static_cast(*i)); + *i = tolower(static_cast(*i)); } } @@ -68,7 +68,7 @@ inline static bool p_nottag(char c) { return !isalnum(static_cast(c)) && - c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. + c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. } inline static bool @@ -99,14 +99,14 @@ HtmlParser::HtmlParser() #if 0 static const struct ent { const char *n; unsigned int v; } ents[] = { #include "namedentities.h" - { NULL, 0 } + { NULL, 0 } }; if (named_ents.empty()) { - const struct ent *i = ents; - while (i->n) { - named_ents[string(i->n)] = i->v; - ++i; - } + const struct ent *i = ents; + while (i->n) { + named_ents[string(i->n)] = i->v; + ++i; + } } #endif } @@ -123,45 +123,45 @@ HtmlParser::decode_entities(string &) // find() and find_if() templates don't work... string::const_iterator amp = s.begin(), s_end = s.end(); while ((amp = find(amp, s_end, '&')) != s_end) { - unsigned int val = 0; - string::const_iterator end, p = amp + 1; - if (p != s_end && *p == '#') { - p++; - if (p != s_end && (*p == 'x' || *p == 'X')) { - // hex - p++; - end = find_if(p, s_end, p_notxdigit); - sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + unsigned int val = 0; + string::const_iterator end, p = amp + 1; + if (p != s_end && *p == '#') { + p++; + if (p != s_end && (*p == 'x' || *p == 'X')) { + // hex + p++; + end = find_if(p, s_end, p_notxdigit); + sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + } else { + // number + end = find_if(p, s_end, p_notdigit); + val = atoi(s.substr(p - s.begin(), end - p).c_str()); + } } else { - // number - end = find_if(p, s_end, p_notdigit); - val = atoi(s.substr(p - s.begin(), end - p).c_str()); + end = find_if(p, s_end, p_notalnum); + string code = s.substr(p - s.begin(), end - p); + map::const_iterator i; + i = named_ents.find(code); + if (i != named_ents.end()) val = i->second; } - } else { - end = find_if(p, s_end, p_notalnum); - string code = s.substr(p - s.begin(), end - p); - map::const_iterator i; - i = named_ents.find(code); - if (i != named_ents.end()) val = i->second; - } - if (end < s_end && *end == ';') end++; - if (val) { - string::size_type amp_pos = amp - s.begin(); - if (val < 0x80) { - s.replace(amp_pos, end - amp, 1u, char(val)); + if (end < s_end && *end == ';') end++; + if (val) { + string::size_type amp_pos = amp - s.begin(); + if (val < 0x80) { + s.replace(amp_pos, end - amp, 1u, char(val)); + } else { + // Convert unicode value val to UTF-8. + char seq[4]; + unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); + s.replace(amp_pos, end - amp, seq, len); + } + s_end = s.end(); + // We've modified the string, so the iterators are no longer + // valid... + amp = s.begin() + amp_pos + 1; } else { - // Convert unicode value val to UTF-8. - char seq[4]; - unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); - s.replace(amp_pos, end - amp, seq, len); + amp = end; } - s_end = s.end(); - // We've modified the string, so the iterators are no longer - // valid... - amp = s.begin() + amp_pos + 1; - } else { - amp = end; - } } #endif } @@ -175,222 +175,222 @@ HtmlParser::parse_html(const string &body) string::const_iterator start = body.begin(); while (true) { - // Skip through until we find an HTML tag, a comment, or the end of - // document. Ignore isolated occurrences of `<' which don't start - // a tag or comment. - string::const_iterator p = start; - while (true) { - p = find(p, body.end(), '<'); - if (p == body.end()) break; - unsigned char ch = *(p + 1); + // Skip through until we find an HTML tag, a comment, or the end of + // document. Ignore isolated occurrences of `<' which don't start + // a tag or comment. + string::const_iterator p = start; + while (true) { + p = find(p, body.end(), '<'); + if (p == body.end()) break; + unsigned char ch = *(p + 1); - // Tag, closing tag, or comment (or SGML declaration). - if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; + // Tag, closing tag, or comment (or SGML declaration). + if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; - if (ch == '?') { - // PHP code or XML declaration. - // XML declaration is only valid at the start of the first line. - // FIXME: need to deal with BOMs... - if (p != body.begin() || body.size() < 20) break; + if (ch == '?') { + // PHP code or XML declaration. + // XML declaration is only valid at the start of the first line. + // FIXME: need to deal with BOMs... + if (p != body.begin() || body.size() < 20) break; - // XML declaration looks something like this: - // - if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; - if (strchr(" \t\r\n", p[5]) == NULL) break; + // XML declaration looks something like this: + // + if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; + if (strchr(" \t\r\n", p[5]) == NULL) break; - string::const_iterator decl_end = find(p + 6, body.end(), '?'); - if (decl_end == body.end()) break; + string::const_iterator decl_end = find(p + 6, body.end(), '?'); + if (decl_end == body.end()) break; - // Default charset for XML is UTF-8. - charset = "utf-8"; + // Default charset for XML is UTF-8. + charset = "utf-8"; - string decl(p + 6, decl_end); - size_t enc = decl.find("encoding"); - if (enc == string::npos) break; + string decl(p + 6, decl_end); + size_t enc = decl.find("encoding"); + if (enc == string::npos) break; - enc = decl.find_first_not_of(" \t\r\n", enc + 8); - if (enc == string::npos || enc == decl.size()) break; + enc = decl.find_first_not_of(" \t\r\n", enc + 8); + if (enc == string::npos || enc == decl.size()) break; - if (decl[enc] != '=') break; + if (decl[enc] != '=') break; - enc = decl.find_first_not_of(" \t\r\n", enc + 1); - if (enc == string::npos || enc == decl.size()) break; + enc = decl.find_first_not_of(" \t\r\n", enc + 1); + if (enc == string::npos || enc == decl.size()) break; - if (decl[enc] != '"' && decl[enc] != '\'') break; + if (decl[enc] != '"' && decl[enc] != '\'') break; - char quote = decl[enc++]; - size_t enc_end = decl.find(quote, enc); + char quote = decl[enc++]; + size_t enc_end = decl.find(quote, enc); - if (enc != string::npos) - charset = decl.substr(enc, enc_end - enc); + if (enc != string::npos) + charset = decl.substr(enc, enc_end - enc); - break; - } - p++; - } - - // Process text up to start of tag. - if (p > start || p == body.end()) { - string text = body.substr(start - body.begin(), p - start); - decode_entities(text); - process_text(text); - } - - if (p == body.end()) { - do_eof(); - break; - } - - start = p + 1; - - if (start == body.end()) break; - - if (*start == '!') { - if (++start == body.end()) break; - if (++start == body.end()) break; - // comment or SGML declaration - if (*(start - 1) == '-' && *start == '-') { - ++start; - string::const_iterator close = find(start, body.end(), '>'); - // An unterminated comment swallows rest of document - // (like Netscape, but unlike MSIE IIRC) - if (close == body.end()) break; - - p = close; - // look for --> - while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) - p = find(p + 1, body.end(), '>'); - - if (p != body.end()) { - // Check for htdig's "ignore this bit" comments. - if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { - string::size_type i; - i = body.find("", p + 1 - body.begin()); - if (i == string::npos) break; - start = body.begin() + i + 21; - continue; - } - // If we found --> skip to there. - start = p; - } else { - // Otherwise skip to the first > we found (as Netscape does). - start = close; - } - } else { - // just an SGML declaration, perhaps giving the DTD - ignore it - start = find(start - 1, body.end(), '>'); - if (start == body.end()) break; - } - ++start; - } else if (*start == '?') { - if (++start == body.end()) break; - // PHP - swallow until ?> or EOF - start = find(start + 1, body.end(), '>'); - - // look for ?> - while (start != body.end() && *(start - 1) != '?') - start = find(start + 1, body.end(), '>'); - - // unterminated PHP swallows rest of document (rather arbitrarily - // but it avoids polluting the database when things go wrong) - if (start != body.end()) ++start; - } else { - // opening or closing tag - int closing = 0; - - if (*start == '/') { - closing = 1; - start = find_if(start + 1, body.end(), p_notwhitespace); - } - - p = start; - start = find_if(start, body.end(), p_nottag); - string tag = body.substr(p - body.begin(), start - p); - // convert tagname to lowercase - lowercase_string(tag); - - if (closing) { - if (!closing_tag(tag)) - return; - if (in_script && tag == "script") in_script = false; - - /* ignore any bogus parameters on closing tags */ - p = find(start, body.end(), '>'); - if (p == body.end()) break; - start = p + 1; - } else { - bool empty_element = false; - // FIXME: parse parameters lazily. - while (start < body.end() && *start != '>') { - string name, value; - - p = find_if(start, body.end(), p_whitespaceeqgt); - - size_t name_len = p - start; - if (name_len == 1) { - if (*start == '/' && p < body.end() && *p == '>') { - // E.g. - start = p; - empty_element = true; break; } - } - - name.assign(body, start - body.begin(), name_len); - - p = find_if(p, body.end(), p_notwhitespace); - - start = p; - if (start != body.end() && *start == '=') { - start = find_if(start + 1, body.end(), p_notwhitespace); - - p = body.end(); - - int quote = *start; - if (quote == '"' || quote == '\'') { - start++; - p = find(start, body.end(), quote); - } - - if (p == body.end()) { - // unquoted or no closing quote - p = find_if(start, body.end(), p_whitespacegt); - } - value.assign(body, start - body.begin(), p - start); - start = find_if(p, body.end(), p_notwhitespace); - - if (!name.empty()) { - // convert parameter name to lowercase - lowercase_string(name); - // in case of multiple entries, use the first - // (as Netscape does) - parameters.insert(make_pair(name, value)); - } - } + p++; } + + // Process text up to start of tag. + if (p > start || p == body.end()) { + string text = body.substr(start - body.begin(), p - start); + decode_entities(text); + process_text(text); + } + + if (p == body.end()) { + do_eof(); + break; + } + + start = p + 1; + + if (start == body.end()) break; + + if (*start == '!') { + if (++start == body.end()) break; + if (++start == body.end()) break; + // comment or SGML declaration + if (*(start - 1) == '-' && *start == '-') { + ++start; + string::const_iterator close = find(start, body.end(), '>'); + // An unterminated comment swallows rest of document + // (like Netscape, but unlike MSIE IIRC) + if (close == body.end()) break; + + p = close; + // look for --> + while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) + p = find(p + 1, body.end(), '>'); + + if (p != body.end()) { + // Check for htdig's "ignore this bit" comments. + if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { + string::size_type i; + i = body.find("", p + 1 - body.begin()); + if (i == string::npos) break; + start = body.begin() + i + 21; + continue; + } + // If we found --> skip to there. + start = p; + } else { + // Otherwise skip to the first > we found (as Netscape does). + start = close; + } + } else { + // just an SGML declaration, perhaps giving the DTD - ignore it + start = find(start - 1, body.end(), '>'); + if (start == body.end()) break; + } + ++start; + } else if (*start == '?') { + if (++start == body.end()) break; + // PHP - swallow until ?> or EOF + start = find(start + 1, body.end(), '>'); + + // look for ?> + while (start != body.end() && *(start - 1) != '?') + start = find(start + 1, body.end(), '>'); + + // unterminated PHP swallows rest of document (rather arbitrarily + // but it avoids polluting the database when things go wrong) + if (start != body.end()) ++start; + } else { + // opening or closing tag + int closing = 0; + + if (*start == '/') { + closing = 1; + start = find_if(start + 1, body.end(), p_notwhitespace); + } + + p = start; + start = find_if(start, body.end(), p_nottag); + string tag = body.substr(p - body.begin(), start - p); + // convert tagname to lowercase + lowercase_string(tag); + + if (closing) { + if (!closing_tag(tag)) + return; + if (in_script && tag == "script") in_script = false; + + /* ignore any bogus parameters on closing tags */ + p = find(start, body.end(), '>'); + if (p == body.end()) break; + start = p + 1; + } else { + bool empty_element = false; + // FIXME: parse parameters lazily. + while (start < body.end() && *start != '>') { + string name, value; + + p = find_if(start, body.end(), p_whitespaceeqgt); + + size_t name_len = p - start; + if (name_len == 1) { + if (*start == '/' && p < body.end() && *p == '>') { + // E.g. + start = p; + empty_element = true; + break; + } + } + + name.assign(body, start - body.begin(), name_len); + + p = find_if(p, body.end(), p_notwhitespace); + + start = p; + if (start != body.end() && *start == '=') { + start = find_if(start + 1, body.end(), p_notwhitespace); + + p = body.end(); + + int quote = *start; + if (quote == '"' || quote == '\'') { + start++; + p = find(start, body.end(), quote); + } + + if (p == body.end()) { + // unquoted or no closing quote + p = find_if(start, body.end(), p_whitespacegt); + } + value.assign(body, start - body.begin(), p - start); + start = find_if(p, body.end(), p_notwhitespace); + + if (!name.empty()) { + // convert parameter name to lowercase + lowercase_string(name); + // in case of multiple entries, use the first + // (as Netscape does) + parameters.insert(make_pair(name, value)); + } + } + } #if 0 - cout << "<" << tag; - map::const_iterator x; - for (x = parameters.begin(); x != parameters.end(); x++) { - cout << " " << x->first << "=\"" << x->second << "\""; - } - cout << ">\n"; + cout << "<" << tag; + map::const_iterator x; + for (x = parameters.begin(); x != parameters.end(); x++) { + cout << " " << x->first << "=\"" << x->second << "\""; + } + cout << ">\n"; #endif - if (!opening_tag(tag)) - return; - parameters.clear(); + if (!opening_tag(tag)) + return; + parameters.clear(); - if (empty_element) { - if (!closing_tag(tag)) - return; + if (empty_element) { + if (!closing_tag(tag)) + return; + } + + // In