indents + use range-base for loops in extrameta.cpp
This commit is contained in:
parent
9e0018034c
commit
20c3a7ed12
@ -30,20 +30,18 @@ using std::string;
|
|||||||
using std::map;
|
using std::map;
|
||||||
|
|
||||||
static void docfieldfrommeta(RclConfig* cfg, const string& name,
|
static void docfieldfrommeta(RclConfig* cfg, const string& name,
|
||||||
const string &value, Rcl::Doc& doc)
|
const string &value, Rcl::Doc& doc)
|
||||||
{
|
{
|
||||||
string fieldname = cfg->fieldCanon(name);
|
string fieldname = cfg->fieldCanon(name);
|
||||||
LOGDEB0("Internfile:: setting [" << fieldname <<
|
LOGDEB0("Internfile:: setting [" << fieldname << "] from cmd/xattr value [" << value << "]\n");
|
||||||
"] from cmd/xattr value [" << value << "]\n");
|
|
||||||
if (fieldname == cstr_dj_keymd) {
|
if (fieldname == cstr_dj_keymd) {
|
||||||
doc.dmtime = value;
|
doc.dmtime = value;
|
||||||
} else {
|
} else {
|
||||||
doc.meta[fieldname] = value;
|
doc.meta[fieldname] = value;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void reapXAttrs(const RclConfig* cfg, const string& path,
|
void reapXAttrs(const RclConfig* cfg, const string& path, map<string, string>& xfields)
|
||||||
map<string, string>& xfields)
|
|
||||||
{
|
{
|
||||||
LOGDEB2("reapXAttrs: [" << path << "]\n");
|
LOGDEB2("reapXAttrs: [" << path << "]\n");
|
||||||
#ifndef _WIN32
|
#ifndef _WIN32
|
||||||
@ -51,39 +49,35 @@ void reapXAttrs(const RclConfig* cfg, const string& path,
|
|||||||
vector<string> xnames;
|
vector<string> xnames;
|
||||||
if (!pxattr::list(path, &xnames)) {
|
if (!pxattr::list(path, &xnames)) {
|
||||||
if (errno == ENOTSUP) {
|
if (errno == ENOTSUP) {
|
||||||
LOGDEB("FileInterner::reapXattrs: pxattr::list: errno " <<
|
LOGDEB("FileInterner::reapXattrs: pxattr::list: errno " << errno << "\n");
|
||||||
errno << "\n");
|
|
||||||
} else {
|
} else {
|
||||||
LOGERR("FileInterner::reapXattrs: pxattr::list: errno " <<
|
LOGSYSERR("FileInterner::reapXattrs", "pxattr::list", path);
|
||||||
errno << "\n");
|
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const map<string, string>& xtof = cfg->getXattrToField();
|
const map<string, string>& xtof = cfg->getXattrToField();
|
||||||
|
|
||||||
// Record the xattrs: names found in the config are either skipped
|
// Record the xattrs: names found in the config are either skipped
|
||||||
// or mapped depending if the translation is empty. Other names
|
// or mapped depending if the translation is empty. Other names
|
||||||
// are recorded as-is
|
// are recorded as-is
|
||||||
for (vector<string>::const_iterator it = xnames.begin();
|
for (const auto& xkey : xnames) {
|
||||||
it != xnames.end(); it++) {
|
string key = xkey;
|
||||||
string key = *it;
|
auto mit = xtof.find(xkey);
|
||||||
map<string, string>::const_iterator mit = xtof.find(*it);
|
if (mit != xtof.end()) {
|
||||||
if (mit != xtof.end()) {
|
if (mit->second.empty()) {
|
||||||
if (mit->second.empty()) {
|
continue;
|
||||||
continue;
|
} else {
|
||||||
} else {
|
key = mit->second;
|
||||||
key = mit->second;
|
}
|
||||||
}
|
}
|
||||||
}
|
string value;
|
||||||
string value;
|
if (!pxattr::get(path, xkey, &value, pxattr::PXATTR_NOFOLLOW)) {
|
||||||
if (!pxattr::get(path, *it, &value, pxattr::PXATTR_NOFOLLOW)) {
|
LOGSYSERR("FileInterner::reapXattrs", "pxattr::get", path + " : " + xkey);
|
||||||
LOGERR("FileInterner::reapXattrs: pxattr::get failed for " << *it
|
continue;
|
||||||
<< ", errno " << errno << "\n");
|
}
|
||||||
continue;
|
// Encode should we ?
|
||||||
}
|
xfields[key] = value;
|
||||||
// Encode should we ?
|
LOGDEB2("reapXAttrs: [" << key << "] -> [" << value << "]\n");
|
||||||
xfields[key] = value;
|
|
||||||
LOGDEB2("reapXAttrs: [" << key << "] -> [" << value << "]\n");
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
PRETEND_USE(cfg);
|
PRETEND_USE(cfg);
|
||||||
@ -92,35 +86,30 @@ void reapXAttrs(const RclConfig* cfg, const string& path,
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void docFieldsFromXattrs(RclConfig *cfg, const map<string, string>& xfields,
|
void docFieldsFromXattrs(RclConfig *cfg, const map<string, string>& xfields, Rcl::Doc& doc)
|
||||||
Rcl::Doc& doc)
|
|
||||||
{
|
{
|
||||||
for (map<string,string>::const_iterator it = xfields.begin();
|
for (const auto& fld : xfields) {
|
||||||
it != xfields.end(); it++) {
|
docfieldfrommeta(cfg, fld.first, fld.second, doc);
|
||||||
docfieldfrommeta(cfg, it->first, it->second, doc);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void reapMetaCmds(RclConfig* cfg, const string& path,
|
void reapMetaCmds(RclConfig* cfg, const string& path, map<string, string>& cfields)
|
||||||
map<string, string>& cfields)
|
|
||||||
{
|
{
|
||||||
const vector<MDReaper>& reapers = cfg->getMDReapers();
|
const auto& reapers = cfg->getMDReapers();
|
||||||
if (reapers.empty())
|
if (reapers.empty())
|
||||||
return;
|
return;
|
||||||
map<char,string> smap = {{'f', path}};
|
map<char,string> smap = {{'f', path}};
|
||||||
for (vector<MDReaper>::const_iterator rp = reapers.begin();
|
for (const auto& reaper : reapers) {
|
||||||
rp != reapers.end(); rp++) {
|
vector<string> cmd;
|
||||||
vector<string> cmd;
|
for (const auto& arg : reaper.cmdv) {
|
||||||
for (vector<string>::const_iterator it = rp->cmdv.begin();
|
string s;
|
||||||
it != rp->cmdv.end(); it++) {
|
pcSubst(arg, s, smap);
|
||||||
string s;
|
cmd.push_back(s);
|
||||||
pcSubst(*it, s, smap);
|
}
|
||||||
cmd.push_back(s);
|
string output;
|
||||||
}
|
if (ExecCmd::backtick(cmd, output)) {
|
||||||
string output;
|
cfields[reaper.fieldname] = output;
|
||||||
if (ExecCmd::backtick(cmd, output)) {
|
}
|
||||||
cfields[rp->fieldname] = output;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -132,26 +121,23 @@ void reapMetaCmds(RclConfig* cfg, const string& path,
|
|||||||
// "modificationdate" will set mtime instead of an ordinary field,
|
// "modificationdate" will set mtime instead of an ordinary field,
|
||||||
// and the output from anything beginning with "rclmulti" will be
|
// and the output from anything beginning with "rclmulti" will be
|
||||||
// interpreted as multiple fields in configuration file format...
|
// interpreted as multiple fields in configuration file format...
|
||||||
void docFieldsFromMetaCmds(RclConfig *cfg, const map<string, string>& cfields,
|
void docFieldsFromMetaCmds(RclConfig *cfg, const map<string, string>& cfields, Rcl::Doc& doc)
|
||||||
Rcl::Doc& doc)
|
|
||||||
{
|
{
|
||||||
for (map<string,string>::const_iterator it = cfields.begin();
|
for (const auto& cfld : cfields) {
|
||||||
it != cfields.end(); it++) {
|
if (!cfld.first.compare(0, 8, "rclmulti")) {
|
||||||
if (!it->first.compare(0, 8, "rclmulti")) {
|
ConfSimple simple(cfld.second);
|
||||||
ConfSimple simple(it->second);
|
if (simple.ok()) {
|
||||||
if (simple.ok()) {
|
auto names = simple.getNames("");
|
||||||
vector<string> names = simple.getNames("");
|
for (const auto& nm : names) {
|
||||||
for (vector<string>::const_iterator nm = names.begin();
|
string value;
|
||||||
nm != names.end(); nm++) {
|
if (simple.get(nm, value)) {
|
||||||
string value;
|
docfieldfrommeta(cfg, nm, value, doc);
|
||||||
if (simple.get(*nm, value)) {
|
}
|
||||||
docfieldfrommeta(cfg, *nm, value, doc);
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
docfieldfrommeta(cfg, cfld.first, cfld.second, doc);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} else {
|
|
||||||
docfieldfrommeta(cfg, it->first, it->second, doc);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,7 +34,7 @@ inline void
|
|||||||
lowercase_string(string &str)
|
lowercase_string(string &str)
|
||||||
{
|
{
|
||||||
for (string::iterator i = str.begin(); i != str.end(); ++i) {
|
for (string::iterator i = str.begin(); i != str.end(); ++i) {
|
||||||
*i = tolower(static_cast<unsigned char>(*i));
|
*i = tolower(static_cast<unsigned char>(*i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,7 +68,7 @@ inline static bool
|
|||||||
p_nottag(char c)
|
p_nottag(char c)
|
||||||
{
|
{
|
||||||
return !isalnum(static_cast<unsigned char>(c)) &&
|
return !isalnum(static_cast<unsigned char>(c)) &&
|
||||||
c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
|
c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static bool
|
inline static bool
|
||||||
@ -99,14 +99,14 @@ HtmlParser::HtmlParser()
|
|||||||
#if 0
|
#if 0
|
||||||
static const struct ent { const char *n; unsigned int v; } ents[] = {
|
static const struct ent { const char *n; unsigned int v; } ents[] = {
|
||||||
#include "namedentities.h"
|
#include "namedentities.h"
|
||||||
{ NULL, 0 }
|
{ NULL, 0 }
|
||||||
};
|
};
|
||||||
if (named_ents.empty()) {
|
if (named_ents.empty()) {
|
||||||
const struct ent *i = ents;
|
const struct ent *i = ents;
|
||||||
while (i->n) {
|
while (i->n) {
|
||||||
named_ents[string(i->n)] = i->v;
|
named_ents[string(i->n)] = i->v;
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -123,45 +123,45 @@ HtmlParser::decode_entities(string &)
|
|||||||
// find() and find_if() templates don't work...
|
// find() and find_if() templates don't work...
|
||||||
string::const_iterator amp = s.begin(), s_end = s.end();
|
string::const_iterator amp = s.begin(), s_end = s.end();
|
||||||
while ((amp = find(amp, s_end, '&')) != s_end) {
|
while ((amp = find(amp, s_end, '&')) != s_end) {
|
||||||
unsigned int val = 0;
|
unsigned int val = 0;
|
||||||
string::const_iterator end, p = amp + 1;
|
string::const_iterator end, p = amp + 1;
|
||||||
if (p != s_end && *p == '#') {
|
if (p != s_end && *p == '#') {
|
||||||
p++;
|
p++;
|
||||||
if (p != s_end && (*p == 'x' || *p == 'X')) {
|
if (p != s_end && (*p == 'x' || *p == 'X')) {
|
||||||
// hex
|
// hex
|
||||||
p++;
|
p++;
|
||||||
end = find_if(p, s_end, p_notxdigit);
|
end = find_if(p, s_end, p_notxdigit);
|
||||||
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
|
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
|
||||||
|
} else {
|
||||||
|
// number
|
||||||
|
end = find_if(p, s_end, p_notdigit);
|
||||||
|
val = atoi(s.substr(p - s.begin(), end - p).c_str());
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// number
|
end = find_if(p, s_end, p_notalnum);
|
||||||
end = find_if(p, s_end, p_notdigit);
|
string code = s.substr(p - s.begin(), end - p);
|
||||||
val = atoi(s.substr(p - s.begin(), end - p).c_str());
|
map<string, unsigned int>::const_iterator i;
|
||||||
|
i = named_ents.find(code);
|
||||||
|
if (i != named_ents.end()) val = i->second;
|
||||||
}
|
}
|
||||||
} else {
|
if (end < s_end && *end == ';') end++;
|
||||||
end = find_if(p, s_end, p_notalnum);
|
if (val) {
|
||||||
string code = s.substr(p - s.begin(), end - p);
|
string::size_type amp_pos = amp - s.begin();
|
||||||
map<string, unsigned int>::const_iterator i;
|
if (val < 0x80) {
|
||||||
i = named_ents.find(code);
|
s.replace(amp_pos, end - amp, 1u, char(val));
|
||||||
if (i != named_ents.end()) val = i->second;
|
} else {
|
||||||
}
|
// Convert unicode value val to UTF-8.
|
||||||
if (end < s_end && *end == ';') end++;
|
char seq[4];
|
||||||
if (val) {
|
unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
|
||||||
string::size_type amp_pos = amp - s.begin();
|
s.replace(amp_pos, end - amp, seq, len);
|
||||||
if (val < 0x80) {
|
}
|
||||||
s.replace(amp_pos, end - amp, 1u, char(val));
|
s_end = s.end();
|
||||||
|
// We've modified the string, so the iterators are no longer
|
||||||
|
// valid...
|
||||||
|
amp = s.begin() + amp_pos + 1;
|
||||||
} else {
|
} else {
|
||||||
// Convert unicode value val to UTF-8.
|
amp = end;
|
||||||
char seq[4];
|
|
||||||
unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
|
|
||||||
s.replace(amp_pos, end - amp, seq, len);
|
|
||||||
}
|
}
|
||||||
s_end = s.end();
|
|
||||||
// We've modified the string, so the iterators are no longer
|
|
||||||
// valid...
|
|
||||||
amp = s.begin() + amp_pos + 1;
|
|
||||||
} else {
|
|
||||||
amp = end;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -175,222 +175,222 @@ HtmlParser::parse_html(const string &body)
|
|||||||
string::const_iterator start = body.begin();
|
string::const_iterator start = body.begin();
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// Skip through until we find an HTML tag, a comment, or the end of
|
// Skip through until we find an HTML tag, a comment, or the end of
|
||||||
// document. Ignore isolated occurrences of `<' which don't start
|
// document. Ignore isolated occurrences of `<' which don't start
|
||||||
// a tag or comment.
|
// a tag or comment.
|
||||||
string::const_iterator p = start;
|
string::const_iterator p = start;
|
||||||
while (true) {
|
while (true) {
|
||||||
p = find(p, body.end(), '<');
|
p = find(p, body.end(), '<');
|
||||||
if (p == body.end()) break;
|
if (p == body.end()) break;
|
||||||
unsigned char ch = *(p + 1);
|
unsigned char ch = *(p + 1);
|
||||||
|
|
||||||
// Tag, closing tag, or comment (or SGML declaration).
|
// Tag, closing tag, or comment (or SGML declaration).
|
||||||
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
||||||
|
|
||||||
if (ch == '?') {
|
if (ch == '?') {
|
||||||
// PHP code or XML declaration.
|
// PHP code or XML declaration.
|
||||||
// XML declaration is only valid at the start of the first line.
|
// XML declaration is only valid at the start of the first line.
|
||||||
// FIXME: need to deal with BOMs...
|
// FIXME: need to deal with BOMs...
|
||||||
if (p != body.begin() || body.size() < 20) break;
|
if (p != body.begin() || body.size() < 20) break;
|
||||||
|
|
||||||
// XML declaration looks something like this:
|
// XML declaration looks something like this:
|
||||||
// <?xml version="1.0" encoding="UTF-8"?>
|
// <?xml version="1.0" encoding="UTF-8"?>
|
||||||
if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
|
if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
|
||||||
if (strchr(" \t\r\n", p[5]) == NULL) break;
|
if (strchr(" \t\r\n", p[5]) == NULL) break;
|
||||||
|
|
||||||
string::const_iterator decl_end = find(p + 6, body.end(), '?');
|
string::const_iterator decl_end = find(p + 6, body.end(), '?');
|
||||||
if (decl_end == body.end()) break;
|
if (decl_end == body.end()) break;
|
||||||
|
|
||||||
// Default charset for XML is UTF-8.
|
// Default charset for XML is UTF-8.
|
||||||
charset = "utf-8";
|
charset = "utf-8";
|
||||||
|
|
||||||
string decl(p + 6, decl_end);
|
string decl(p + 6, decl_end);
|
||||||
size_t enc = decl.find("encoding");
|
size_t enc = decl.find("encoding");
|
||||||
if (enc == string::npos) break;
|
if (enc == string::npos) break;
|
||||||
|
|
||||||
enc = decl.find_first_not_of(" \t\r\n", enc + 8);
|
enc = decl.find_first_not_of(" \t\r\n", enc + 8);
|
||||||
if (enc == string::npos || enc == decl.size()) break;
|
if (enc == string::npos || enc == decl.size()) break;
|
||||||
|
|
||||||
if (decl[enc] != '=') break;
|
if (decl[enc] != '=') break;
|
||||||
|
|
||||||
enc = decl.find_first_not_of(" \t\r\n", enc + 1);
|
enc = decl.find_first_not_of(" \t\r\n", enc + 1);
|
||||||
if (enc == string::npos || enc == decl.size()) break;
|
if (enc == string::npos || enc == decl.size()) break;
|
||||||
|
|
||||||
if (decl[enc] != '"' && decl[enc] != '\'') break;
|
if (decl[enc] != '"' && decl[enc] != '\'') break;
|
||||||
|
|
||||||
char quote = decl[enc++];
|
char quote = decl[enc++];
|
||||||
size_t enc_end = decl.find(quote, enc);
|
size_t enc_end = decl.find(quote, enc);
|
||||||
|
|
||||||
if (enc != string::npos)
|
if (enc != string::npos)
|
||||||
charset = decl.substr(enc, enc_end - enc);
|
charset = decl.substr(enc, enc_end - enc);
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
p++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process text up to start of tag.
|
|
||||||
if (p > start || p == body.end()) {
|
|
||||||
string text = body.substr(start - body.begin(), p - start);
|
|
||||||
decode_entities(text);
|
|
||||||
process_text(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p == body.end()) {
|
|
||||||
do_eof();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
start = p + 1;
|
|
||||||
|
|
||||||
if (start == body.end()) break;
|
|
||||||
|
|
||||||
if (*start == '!') {
|
|
||||||
if (++start == body.end()) break;
|
|
||||||
if (++start == body.end()) break;
|
|
||||||
// comment or SGML declaration
|
|
||||||
if (*(start - 1) == '-' && *start == '-') {
|
|
||||||
++start;
|
|
||||||
string::const_iterator close = find(start, body.end(), '>');
|
|
||||||
// An unterminated comment swallows rest of document
|
|
||||||
// (like Netscape, but unlike MSIE IIRC)
|
|
||||||
if (close == body.end()) break;
|
|
||||||
|
|
||||||
p = close;
|
|
||||||
// look for -->
|
|
||||||
while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
|
|
||||||
p = find(p + 1, body.end(), '>');
|
|
||||||
|
|
||||||
if (p != body.end()) {
|
|
||||||
// Check for htdig's "ignore this bit" comments.
|
|
||||||
if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
|
|
||||||
string::size_type i;
|
|
||||||
i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
|
|
||||||
if (i == string::npos) break;
|
|
||||||
start = body.begin() + i + 21;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// If we found --> skip to there.
|
|
||||||
start = p;
|
|
||||||
} else {
|
|
||||||
// Otherwise skip to the first > we found (as Netscape does).
|
|
||||||
start = close;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// just an SGML declaration, perhaps giving the DTD - ignore it
|
|
||||||
start = find(start - 1, body.end(), '>');
|
|
||||||
if (start == body.end()) break;
|
|
||||||
}
|
|
||||||
++start;
|
|
||||||
} else if (*start == '?') {
|
|
||||||
if (++start == body.end()) break;
|
|
||||||
// PHP - swallow until ?> or EOF
|
|
||||||
start = find(start + 1, body.end(), '>');
|
|
||||||
|
|
||||||
// look for ?>
|
|
||||||
while (start != body.end() && *(start - 1) != '?')
|
|
||||||
start = find(start + 1, body.end(), '>');
|
|
||||||
|
|
||||||
// unterminated PHP swallows rest of document (rather arbitrarily
|
|
||||||
// but it avoids polluting the database when things go wrong)
|
|
||||||
if (start != body.end()) ++start;
|
|
||||||
} else {
|
|
||||||
// opening or closing tag
|
|
||||||
int closing = 0;
|
|
||||||
|
|
||||||
if (*start == '/') {
|
|
||||||
closing = 1;
|
|
||||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
|
||||||
}
|
|
||||||
|
|
||||||
p = start;
|
|
||||||
start = find_if(start, body.end(), p_nottag);
|
|
||||||
string tag = body.substr(p - body.begin(), start - p);
|
|
||||||
// convert tagname to lowercase
|
|
||||||
lowercase_string(tag);
|
|
||||||
|
|
||||||
if (closing) {
|
|
||||||
if (!closing_tag(tag))
|
|
||||||
return;
|
|
||||||
if (in_script && tag == "script") in_script = false;
|
|
||||||
|
|
||||||
/* ignore any bogus parameters on closing tags */
|
|
||||||
p = find(start, body.end(), '>');
|
|
||||||
if (p == body.end()) break;
|
|
||||||
start = p + 1;
|
|
||||||
} else {
|
|
||||||
bool empty_element = false;
|
|
||||||
// FIXME: parse parameters lazily.
|
|
||||||
while (start < body.end() && *start != '>') {
|
|
||||||
string name, value;
|
|
||||||
|
|
||||||
p = find_if(start, body.end(), p_whitespaceeqgt);
|
|
||||||
|
|
||||||
size_t name_len = p - start;
|
|
||||||
if (name_len == 1) {
|
|
||||||
if (*start == '/' && p < body.end() && *p == '>') {
|
|
||||||
// E.g. <tag foo="bar" />
|
|
||||||
start = p;
|
|
||||||
empty_element = true;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
p++;
|
||||||
|
|
||||||
name.assign(body, start - body.begin(), name_len);
|
|
||||||
|
|
||||||
p = find_if(p, body.end(), p_notwhitespace);
|
|
||||||
|
|
||||||
start = p;
|
|
||||||
if (start != body.end() && *start == '=') {
|
|
||||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
|
||||||
|
|
||||||
p = body.end();
|
|
||||||
|
|
||||||
int quote = *start;
|
|
||||||
if (quote == '"' || quote == '\'') {
|
|
||||||
start++;
|
|
||||||
p = find(start, body.end(), quote);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p == body.end()) {
|
|
||||||
// unquoted or no closing quote
|
|
||||||
p = find_if(start, body.end(), p_whitespacegt);
|
|
||||||
}
|
|
||||||
value.assign(body, start - body.begin(), p - start);
|
|
||||||
start = find_if(p, body.end(), p_notwhitespace);
|
|
||||||
|
|
||||||
if (!name.empty()) {
|
|
||||||
// convert parameter name to lowercase
|
|
||||||
lowercase_string(name);
|
|
||||||
// in case of multiple entries, use the first
|
|
||||||
// (as Netscape does)
|
|
||||||
parameters.insert(make_pair(name, value));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Process text up to start of tag.
|
||||||
|
if (p > start || p == body.end()) {
|
||||||
|
string text = body.substr(start - body.begin(), p - start);
|
||||||
|
decode_entities(text);
|
||||||
|
process_text(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p == body.end()) {
|
||||||
|
do_eof();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
start = p + 1;
|
||||||
|
|
||||||
|
if (start == body.end()) break;
|
||||||
|
|
||||||
|
if (*start == '!') {
|
||||||
|
if (++start == body.end()) break;
|
||||||
|
if (++start == body.end()) break;
|
||||||
|
// comment or SGML declaration
|
||||||
|
if (*(start - 1) == '-' && *start == '-') {
|
||||||
|
++start;
|
||||||
|
string::const_iterator close = find(start, body.end(), '>');
|
||||||
|
// An unterminated comment swallows rest of document
|
||||||
|
// (like Netscape, but unlike MSIE IIRC)
|
||||||
|
if (close == body.end()) break;
|
||||||
|
|
||||||
|
p = close;
|
||||||
|
// look for -->
|
||||||
|
while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
|
||||||
|
p = find(p + 1, body.end(), '>');
|
||||||
|
|
||||||
|
if (p != body.end()) {
|
||||||
|
// Check for htdig's "ignore this bit" comments.
|
||||||
|
if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
|
||||||
|
string::size_type i;
|
||||||
|
i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
|
||||||
|
if (i == string::npos) break;
|
||||||
|
start = body.begin() + i + 21;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// If we found --> skip to there.
|
||||||
|
start = p;
|
||||||
|
} else {
|
||||||
|
// Otherwise skip to the first > we found (as Netscape does).
|
||||||
|
start = close;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// just an SGML declaration, perhaps giving the DTD - ignore it
|
||||||
|
start = find(start - 1, body.end(), '>');
|
||||||
|
if (start == body.end()) break;
|
||||||
|
}
|
||||||
|
++start;
|
||||||
|
} else if (*start == '?') {
|
||||||
|
if (++start == body.end()) break;
|
||||||
|
// PHP - swallow until ?> or EOF
|
||||||
|
start = find(start + 1, body.end(), '>');
|
||||||
|
|
||||||
|
// look for ?>
|
||||||
|
while (start != body.end() && *(start - 1) != '?')
|
||||||
|
start = find(start + 1, body.end(), '>');
|
||||||
|
|
||||||
|
// unterminated PHP swallows rest of document (rather arbitrarily
|
||||||
|
// but it avoids polluting the database when things go wrong)
|
||||||
|
if (start != body.end()) ++start;
|
||||||
|
} else {
|
||||||
|
// opening or closing tag
|
||||||
|
int closing = 0;
|
||||||
|
|
||||||
|
if (*start == '/') {
|
||||||
|
closing = 1;
|
||||||
|
start = find_if(start + 1, body.end(), p_notwhitespace);
|
||||||
|
}
|
||||||
|
|
||||||
|
p = start;
|
||||||
|
start = find_if(start, body.end(), p_nottag);
|
||||||
|
string tag = body.substr(p - body.begin(), start - p);
|
||||||
|
// convert tagname to lowercase
|
||||||
|
lowercase_string(tag);
|
||||||
|
|
||||||
|
if (closing) {
|
||||||
|
if (!closing_tag(tag))
|
||||||
|
return;
|
||||||
|
if (in_script && tag == "script") in_script = false;
|
||||||
|
|
||||||
|
/* ignore any bogus parameters on closing tags */
|
||||||
|
p = find(start, body.end(), '>');
|
||||||
|
if (p == body.end()) break;
|
||||||
|
start = p + 1;
|
||||||
|
} else {
|
||||||
|
bool empty_element = false;
|
||||||
|
// FIXME: parse parameters lazily.
|
||||||
|
while (start < body.end() && *start != '>') {
|
||||||
|
string name, value;
|
||||||
|
|
||||||
|
p = find_if(start, body.end(), p_whitespaceeqgt);
|
||||||
|
|
||||||
|
size_t name_len = p - start;
|
||||||
|
if (name_len == 1) {
|
||||||
|
if (*start == '/' && p < body.end() && *p == '>') {
|
||||||
|
// E.g. <tag foo="bar" />
|
||||||
|
start = p;
|
||||||
|
empty_element = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
name.assign(body, start - body.begin(), name_len);
|
||||||
|
|
||||||
|
p = find_if(p, body.end(), p_notwhitespace);
|
||||||
|
|
||||||
|
start = p;
|
||||||
|
if (start != body.end() && *start == '=') {
|
||||||
|
start = find_if(start + 1, body.end(), p_notwhitespace);
|
||||||
|
|
||||||
|
p = body.end();
|
||||||
|
|
||||||
|
int quote = *start;
|
||||||
|
if (quote == '"' || quote == '\'') {
|
||||||
|
start++;
|
||||||
|
p = find(start, body.end(), quote);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p == body.end()) {
|
||||||
|
// unquoted or no closing quote
|
||||||
|
p = find_if(start, body.end(), p_whitespacegt);
|
||||||
|
}
|
||||||
|
value.assign(body, start - body.begin(), p - start);
|
||||||
|
start = find_if(p, body.end(), p_notwhitespace);
|
||||||
|
|
||||||
|
if (!name.empty()) {
|
||||||
|
// convert parameter name to lowercase
|
||||||
|
lowercase_string(name);
|
||||||
|
// in case of multiple entries, use the first
|
||||||
|
// (as Netscape does)
|
||||||
|
parameters.insert(make_pair(name, value));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
#if 0
|
#if 0
|
||||||
cout << "<" << tag;
|
cout << "<" << tag;
|
||||||
map<string, string>::const_iterator x;
|
map<string, string>::const_iterator x;
|
||||||
for (x = parameters.begin(); x != parameters.end(); x++) {
|
for (x = parameters.begin(); x != parameters.end(); x++) {
|
||||||
cout << " " << x->first << "=\"" << x->second << "\"";
|
cout << " " << x->first << "=\"" << x->second << "\"";
|
||||||
}
|
}
|
||||||
cout << ">\n";
|
cout << ">\n";
|
||||||
#endif
|
#endif
|
||||||
if (!opening_tag(tag))
|
if (!opening_tag(tag))
|
||||||
return;
|
return;
|
||||||
parameters.clear();
|
parameters.clear();
|
||||||
|
|
||||||
if (empty_element) {
|
if (empty_element) {
|
||||||
if (!closing_tag(tag))
|
if (!closing_tag(tag))
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// In <script> tags we ignore opening tags to avoid problems
|
||||||
|
// with "a<b".
|
||||||
|
if (tag == "script") in_script = true;
|
||||||
|
|
||||||
|
if (start != body.end() && *start == '>') ++start;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// In <script> tags we ignore opening tags to avoid problems
|
|
||||||
// with "a<b".
|
|
||||||
if (tag == "script") in_script = true;
|
|
||||||
|
|
||||||
if (start != body.end() && *start == '>') ++start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -32,17 +32,17 @@ using std::map;
|
|||||||
|
|
||||||
class HtmlParser {
|
class HtmlParser {
|
||||||
map<string, string> parameters;
|
map<string, string> parameters;
|
||||||
protected:
|
protected:
|
||||||
virtual void decode_entities(string &s);
|
virtual void decode_entities(string &s);
|
||||||
bool in_script;
|
bool in_script;
|
||||||
string charset;
|
string charset;
|
||||||
static map<string, unsigned int> named_ents;
|
static map<string, unsigned int> named_ents;
|
||||||
|
|
||||||
bool get_parameter(const string & param, string & value) const;
|
bool get_parameter(const string & param, string & value) const;
|
||||||
public:
|
public:
|
||||||
virtual void process_text(const string &/*text*/) { }
|
virtual void process_text(const string &/*text*/) { }
|
||||||
virtual bool opening_tag(const string &/*tag*/) { return true; }
|
virtual bool opening_tag(const string &/*tag*/) { return true; }
|
||||||
virtual bool closing_tag(const string &/*tag*/) { return true; }
|
virtual bool closing_tag(const string &/*tag*/) { return true; }
|
||||||
virtual void parse_html(const string &text);
|
virtual void parse_html(const string &text);
|
||||||
virtual void do_eof() {}
|
virtual void do_eof() {}
|
||||||
HtmlParser();
|
HtmlParser();
|
||||||
|
|||||||
@ -39,8 +39,7 @@ public:
|
|||||||
virtual void clear_impl() override;
|
virtual void clear_impl() override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual bool set_document_file_impl(const std::string&,
|
virtual bool set_document_file_impl(const std::string&, const std::string&) override;
|
||||||
const std::string&) override;
|
|
||||||
|
|
||||||
class Internal;
|
class Internal;
|
||||||
private:
|
private:
|
||||||
|
|||||||
@ -33,11 +33,10 @@
|
|||||||
/// Associated to application/x-zerosize, so use the following in mimeconf:
|
/// Associated to application/x-zerosize, so use the following in mimeconf:
|
||||||
/// <mimetype> = internal application/x-zerosize
|
/// <mimetype> = internal application/x-zerosize
|
||||||
class MimeHandlerNull : public RecollFilter {
|
class MimeHandlerNull : public RecollFilter {
|
||||||
public:
|
public:
|
||||||
MimeHandlerNull(RclConfig *cnf, const std::string& id)
|
MimeHandlerNull(RclConfig *cnf, const std::string& id)
|
||||||
: RecollFilter(cnf, id) {
|
: RecollFilter(cnf, id) {}
|
||||||
}
|
virtual ~MimeHandlerNull() = default;
|
||||||
virtual ~MimeHandlerNull() {}
|
|
||||||
MimeHandlerNull(const MimeHandlerNull&) = delete;
|
MimeHandlerNull(const MimeHandlerNull&) = delete;
|
||||||
MimeHandlerNull& operator=(const MimeHandlerNull&) = delete;
|
MimeHandlerNull& operator=(const MimeHandlerNull&) = delete;
|
||||||
|
|
||||||
@ -45,14 +44,13 @@ class MimeHandlerNull : public RecollFilter {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool next_document()
|
virtual bool next_document() {
|
||||||
{
|
if (m_havedoc == false)
|
||||||
if (m_havedoc == false)
|
return false;
|
||||||
return false;
|
m_havedoc = false;
|
||||||
m_havedoc = false;
|
m_metaData[cstr_dj_keycontent] = cstr_null;
|
||||||
m_metaData[cstr_dj_keycontent] = cstr_null;
|
m_metaData[cstr_dj_keymt] = cstr_textplain;
|
||||||
m_metaData[cstr_dj_keymt] = cstr_textplain;
|
return true;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -36,9 +36,8 @@
|
|||||||
class MimeHandlerSymlink : public RecollFilter {
|
class MimeHandlerSymlink : public RecollFilter {
|
||||||
public:
|
public:
|
||||||
MimeHandlerSymlink(RclConfig *cnf, const std::string& id)
|
MimeHandlerSymlink(RclConfig *cnf, const std::string& id)
|
||||||
: RecollFilter(cnf, id) {
|
: RecollFilter(cnf, id) {}
|
||||||
}
|
virtual ~MimeHandlerSymlink() = default;
|
||||||
virtual ~MimeHandlerSymlink() {}
|
|
||||||
MimeHandlerSymlink(const MimeHandlerSymlink&) = delete;
|
MimeHandlerSymlink(const MimeHandlerSymlink&) = delete;
|
||||||
MimeHandlerSymlink& operator=(const MimeHandlerSymlink&) = delete;
|
MimeHandlerSymlink& operator=(const MimeHandlerSymlink&) = delete;
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user