html/xml meta: avoid appending a value that is already present in the string
This commit is contained in:
parent
552510db06
commit
e5af1651fa
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2005 J.F.Dockes
|
||||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
@ -38,10 +38,10 @@ bool MimeHandlerExecMultiple::startCmd()
|
||||
{
|
||||
LOGDEB("MimeHandlerExecMultiple::startCmd\n");
|
||||
if (params.empty()) {
|
||||
// Hu ho
|
||||
LOGERR("MHExecMultiple::startCmd: empty params\n");
|
||||
m_reason = "RECFILTERROR BADCONFIG";
|
||||
return false;
|
||||
// Hu ho
|
||||
LOGERR("MHExecMultiple::startCmd: empty params\n");
|
||||
m_reason = "RECFILTERROR BADCONFIG";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Command name
|
||||
@ -55,7 +55,7 @@ bool MimeHandlerExecMultiple::startCmd()
|
||||
|
||||
m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir());
|
||||
m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
|
||||
"RECOLL_FILTER_FORPREVIEW=no");
|
||||
"RECOLL_FILTER_FORPREVIEW=no");
|
||||
|
||||
m_cmd.setrlimit_as(m_filtermaxmbytes);
|
||||
m_adv.setmaxsecs(m_filtermaxseconds);
|
||||
@ -156,11 +156,11 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
{
|
||||
LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n");
|
||||
if (m_havedoc == false)
|
||||
return false;
|
||||
return false;
|
||||
|
||||
if (missingHelper) {
|
||||
LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
|
||||
return false;
|
||||
LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_cmd.getChildPid() <= 0 && !startCmd()) {
|
||||
@ -178,15 +178,15 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
ostringstream obuf;
|
||||
string file_md5;
|
||||
if (m_filefirst) {
|
||||
if (!m_forPreview && !m_nomd5) {
|
||||
string md5, xmd5, reason;
|
||||
if (MD5File(m_fn, md5, &reason)) {
|
||||
file_md5 = MD5HexPrint(md5, xmd5);
|
||||
} else {
|
||||
LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
|
||||
if (!m_forPreview && !m_nomd5) {
|
||||
string md5, xmd5, reason;
|
||||
if (MD5File(m_fn, md5, &reason)) {
|
||||
file_md5 = MD5HexPrint(md5, xmd5);
|
||||
} else {
|
||||
LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
|
||||
"]: " << reason << "\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
obuf << "FileName: " << m_fn.length() << "\n" << m_fn;
|
||||
// m_filefirst is set to true by set_document_file()
|
||||
m_filefirst = false;
|
||||
@ -194,13 +194,13 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
obuf << "Filename: " << 0 << "\n";
|
||||
}
|
||||
if (!m_ipath.empty()) {
|
||||
LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
|
||||
LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
|
||||
m_ipath << "]\n");
|
||||
obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath;
|
||||
}
|
||||
if (!m_dfltInputCharset.empty()) {
|
||||
obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n"
|
||||
<< m_dfltInputCharset;
|
||||
<< m_dfltInputCharset;
|
||||
}
|
||||
obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
|
||||
obuf << "\n";
|
||||
@ -247,10 +247,10 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
eofnow_received = true;
|
||||
} else if (!stringlowercmp("fileerror:", name)) {
|
||||
LOGDEB("MHExecMultiple: got FILEERROR\n");
|
||||
fileerror_received = true;
|
||||
fileerror_received = true;
|
||||
} else if (!stringlowercmp("subdocerror:", name)) {
|
||||
LOGDEB("MHExecMultiple: got SUBDOCERROR\n");
|
||||
subdocerror_received = true;
|
||||
subdocerror_received = true;
|
||||
} else if (!stringlowercmp("ipath:", name)) {
|
||||
ipath = data;
|
||||
LOGDEB("MHExecMultiple: got ipath [" << data << "]\n");
|
||||
@ -264,7 +264,11 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
string nm = stringtolower((const string&)name);
|
||||
trimstring(nm, ":");
|
||||
LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n");
|
||||
m_metaData[nm] += data;
|
||||
auto it = m_metaData.find(nm);
|
||||
if (it == m_metaData.end() ||
|
||||
it->second.find(data) == std::string::npos) {
|
||||
m_metaData[nm] += data;
|
||||
}
|
||||
}
|
||||
if (loop == 200) {
|
||||
// ??
|
||||
@ -279,7 +283,7 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
return false;
|
||||
}
|
||||
if (subdocerror_received) {
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
// It used to be that eof could be signalled just by an empty document, but
|
||||
@ -291,13 +295,13 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
}
|
||||
|
||||
if (!ipath.empty()) {
|
||||
// If this has an ipath, it is an internal doc from a
|
||||
// multi-document file. In this case, either the filter
|
||||
// supplies the mimetype, or the ipath MUST be a filename-like
|
||||
// string which we can use to compute a mime type
|
||||
// If this has an ipath, it is an internal doc from a
|
||||
// multi-document file. In this case, either the filter
|
||||
// supplies the mimetype, or the ipath MUST be a filename-like
|
||||
// string which we can use to compute a mime type
|
||||
m_metaData[cstr_dj_keyipath] = ipath;
|
||||
if (mtype.empty()) {
|
||||
LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
|
||||
LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
|
||||
"for a guess\n");
|
||||
mtype = mimetype(ipath, 0, m_config, false);
|
||||
if (mtype.empty()) {
|
||||
@ -313,16 +317,16 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
}
|
||||
}
|
||||
m_metaData[cstr_dj_keymt] = mtype;
|
||||
if (!m_forPreview) {
|
||||
string md5, xmd5;
|
||||
MD5String(m_metaData[cstr_dj_keycontent], md5);
|
||||
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
||||
}
|
||||
if (!m_forPreview) {
|
||||
string md5, xmd5;
|
||||
MD5String(m_metaData[cstr_dj_keycontent], md5);
|
||||
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
||||
}
|
||||
} else {
|
||||
// "Self" document.
|
||||
// "Self" document.
|
||||
m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
|
||||
m_metaData.erase(cstr_dj_keyipath);
|
||||
if (!m_forPreview) {
|
||||
if (!m_forPreview) {
|
||||
m_metaData[cstr_dj_keymd5] = file_md5;
|
||||
}
|
||||
}
|
||||
@ -339,4 +343,3 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn)
|
||||
string reason;
|
||||
if (!file_to_string(fn, otext, &reason)) {
|
||||
LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n");
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
m_filename = fn;
|
||||
return set_document_string(mt, otext);
|
||||
@ -51,10 +51,10 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
|
||||
m_havedoc = true;
|
||||
|
||||
if (!m_forPreview) {
|
||||
// We want to compute the md5 now because we may modify m_html later
|
||||
string md5, xmd5;
|
||||
MD5String(htext, md5);
|
||||
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
||||
// We want to compute the md5 now because we may modify m_html later
|
||||
string md5, xmd5;
|
||||
MD5String(htext, md5);
|
||||
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -62,7 +62,7 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
|
||||
bool MimeHandlerHtml::next_document()
|
||||
{
|
||||
if (m_havedoc == false)
|
||||
return false;
|
||||
return false;
|
||||
m_havedoc = false;
|
||||
// If set_doc(fn), take note of file name.
|
||||
string fn = m_filename;
|
||||
@ -70,12 +70,12 @@ bool MimeHandlerHtml::next_document()
|
||||
|
||||
string charset = m_dfltInputCharset;
|
||||
LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset
|
||||
<< "]\n");
|
||||
<< "]\n");
|
||||
// Override default input charset if someone took care to set one:
|
||||
map<string,string>::const_iterator it = m_metaData.find(cstr_dj_keycharset);
|
||||
if (it != m_metaData.end() && !it->second.empty()) {
|
||||
charset = it->second;
|
||||
LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" <<
|
||||
charset = it->second;
|
||||
LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" <<
|
||||
charset << "]\n");
|
||||
}
|
||||
|
||||
@ -88,78 +88,78 @@ bool MimeHandlerHtml::next_document()
|
||||
|
||||
MyHtmlParser result;
|
||||
for (int pass = 0; pass < 2; pass++) {
|
||||
string transcoded;
|
||||
LOGDEB("Html::mkDoc: pass " << pass << "\n");
|
||||
MyHtmlParser p;
|
||||
string transcoded;
|
||||
LOGDEB("Html::mkDoc: pass " << pass << "\n");
|
||||
MyHtmlParser p;
|
||||
|
||||
// Try transcoding. If it fails, use original text.
|
||||
int ecnt;
|
||||
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
|
||||
LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
|
||||
// Try transcoding. If it fails, use original text.
|
||||
int ecnt;
|
||||
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
|
||||
LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
|
||||
charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) <<
|
||||
"]");
|
||||
transcoded = m_html;
|
||||
// We don't know the charset, at all
|
||||
p.reset_charsets();
|
||||
charset.clear();
|
||||
} else {
|
||||
if (ecnt) {
|
||||
if (pass == 0) {
|
||||
LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
|
||||
transcoded = m_html;
|
||||
// We don't know the charset, at all
|
||||
p.reset_charsets();
|
||||
charset.clear();
|
||||
} else {
|
||||
if (ecnt) {
|
||||
if (pass == 0) {
|
||||
LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
|
||||
" errors for ["<<(fn.empty()?"unknown":fn)<< "]\n");
|
||||
} else {
|
||||
LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
|
||||
} else {
|
||||
LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
|
||||
" errors for ["<< (fn.empty()?"unknown":fn)<< "]\n");
|
||||
}
|
||||
}
|
||||
// charset has the putative source charset, transcoded is now
|
||||
// in utf-8
|
||||
p.set_charsets(charset, "utf-8");
|
||||
}
|
||||
}
|
||||
}
|
||||
// charset has the putative source charset, transcoded is now
|
||||
// in utf-8
|
||||
p.set_charsets(charset, "utf-8");
|
||||
}
|
||||
|
||||
try {
|
||||
p.parse_html(transcoded);
|
||||
// No exception: ok? But throw true to use the same
|
||||
// code path as if an exception had been thrown by parse_html
|
||||
throw true;
|
||||
break;
|
||||
} catch (bool diag) {
|
||||
result = p;
|
||||
if (diag == true) {
|
||||
// Parser throws true at end of text. ok
|
||||
try {
|
||||
p.parse_html(transcoded);
|
||||
// No exception: ok? But throw true to use the same
|
||||
// code path as if an exception had been thrown by parse_html
|
||||
throw true;
|
||||
break;
|
||||
} catch (bool diag) {
|
||||
result = p;
|
||||
if (diag == true) {
|
||||
// Parser throws true at end of text. ok
|
||||
|
||||
if (m_forPreview) {
|
||||
// Save the html text
|
||||
m_html = transcoded;
|
||||
// In many cases, we need to change the charset decl,
|
||||
// because the file was transcoded. It seems that just
|
||||
// inserting one is enough (only the 1st one seems to
|
||||
// be used by browsers/qtextedit).
|
||||
if (m_forPreview) {
|
||||
// Save the html text
|
||||
m_html = transcoded;
|
||||
// In many cases, we need to change the charset decl,
|
||||
// because the file was transcoded. It seems that just
|
||||
// inserting one is enough (only the 1st one seems to
|
||||
// be used by browsers/qtextedit).
|
||||
string::size_type idx = m_html.find("<head>");
|
||||
if (idx == string::npos)
|
||||
idx = m_html.find("<HEAD>");
|
||||
if (idx != string::npos)
|
||||
m_html.replace(idx+6, 0,
|
||||
"<meta http-equiv=\"content-type\" "
|
||||
"content=\"text/html; charset=utf-8\">");
|
||||
}
|
||||
if (idx == string::npos)
|
||||
idx = m_html.find("<HEAD>");
|
||||
if (idx != string::npos)
|
||||
m_html.replace(idx+6, 0,
|
||||
"<meta http-equiv=\"content-type\" "
|
||||
"content=\"text/html; charset=utf-8\">");
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<<
|
||||
LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<<
|
||||
result.get_charset() << "]\n");
|
||||
if (!result.get_charset().empty() &&
|
||||
!samecharset(result.get_charset(), result.fromcharset)) {
|
||||
LOGDEB("textHtmlToDoc: reparse for charsets\n");
|
||||
// Set the origin charset as specified in document before
|
||||
// transcoding again
|
||||
charset = result.get_charset();
|
||||
} else {
|
||||
LOGERR("textHtmlToDoc:: error: non charset exception\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!result.get_charset().empty() &&
|
||||
!samecharset(result.get_charset(), result.fromcharset)) {
|
||||
LOGDEB("textHtmlToDoc: reparse for charsets\n");
|
||||
// Set the origin charset as specified in document before
|
||||
// transcoding again
|
||||
charset = result.get_charset();
|
||||
} else {
|
||||
LOGERR("textHtmlToDoc:: error: non charset exception\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
|
||||
@ -168,13 +168,13 @@ bool MimeHandlerHtml::next_document()
|
||||
// Avoid setting empty values which would crush ones possibly inherited
|
||||
// from parent (if we're an attachment)
|
||||
if (!result.dmtime.empty())
|
||||
m_metaData[cstr_dj_keymd] = result.dmtime;
|
||||
m_metaData[cstr_dj_keymd] = result.dmtime;
|
||||
m_metaData[cstr_dj_keymt] = cstr_textplain;
|
||||
|
||||
for (map<string,string>::const_iterator it = result.meta.begin();
|
||||
it != result.meta.end(); it++) {
|
||||
if (!it->second.empty())
|
||||
m_metaData[it->first] = it->second;
|
||||
for (const auto& entry : result.meta) {
|
||||
if (!entry.second.empty()) {
|
||||
m_metaData[entry.first] = entry.second;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -161,19 +161,19 @@ map<string, string> my_named_ents;
|
||||
class NamedEntsInitializer {
|
||||
public:
|
||||
NamedEntsInitializer()
|
||||
{
|
||||
for (int i = 0;;) {
|
||||
const char *ent;
|
||||
const char *val;
|
||||
ent = epairs[i++];
|
||||
if (ent == 0)
|
||||
break;
|
||||
val = epairs[i++];
|
||||
if (val == 0)
|
||||
break;
|
||||
my_named_ents[string(ent)] = val;
|
||||
}
|
||||
}
|
||||
{
|
||||
for (int i = 0;;) {
|
||||
const char *ent;
|
||||
const char *val;
|
||||
ent = epairs[i++];
|
||||
if (ent == 0)
|
||||
break;
|
||||
val = epairs[i++];
|
||||
if (val == 0)
|
||||
break;
|
||||
my_named_ents[string(ent)] = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
static NamedEntsInitializer namedEntsInitializerInstance;
|
||||
|
||||
@ -198,58 +198,58 @@ void MyHtmlParser::decode_entities(string &s)
|
||||
// so don't do it. If charset known, caller has converted text to utf-8,
|
||||
// and this is also how we translate entities
|
||||
// if (tocharset != "utf-8")
|
||||
// return;
|
||||
// return;
|
||||
|
||||
// We need a const_iterator version of s.end() - otherwise the
|
||||
// find() and find_if() templates don't work...
|
||||
string::const_iterator amp = s.begin(), s_end = s.end();
|
||||
while ((amp = find(amp, s_end, '&')) != s_end) {
|
||||
unsigned int val = 0;
|
||||
string::const_iterator end, p = amp + 1;
|
||||
string subs;
|
||||
if (p != s_end && *p == '#') {
|
||||
p++;
|
||||
if (p != s_end && (*p == 'x' || *p == 'X')) {
|
||||
// hex
|
||||
p++;
|
||||
end = find_if(p, s_end, p_notxdigit);
|
||||
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
|
||||
} else {
|
||||
// number
|
||||
end = find_if(p, s_end, p_notdigit);
|
||||
val = atoi(s.substr(p - s.begin(), end - p).c_str());
|
||||
}
|
||||
} else {
|
||||
end = find_if(p, s_end, p_notalnum);
|
||||
string code = s.substr(p - s.begin(), end - p);
|
||||
map<string, string>::const_iterator i;
|
||||
i = my_named_ents.find(code);
|
||||
if (i != my_named_ents.end())
|
||||
subs = i->second;
|
||||
}
|
||||
unsigned int val = 0;
|
||||
string::const_iterator end, p = amp + 1;
|
||||
string subs;
|
||||
if (p != s_end && *p == '#') {
|
||||
p++;
|
||||
if (p != s_end && (*p == 'x' || *p == 'X')) {
|
||||
// hex
|
||||
p++;
|
||||
end = find_if(p, s_end, p_notxdigit);
|
||||
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
|
||||
} else {
|
||||
// number
|
||||
end = find_if(p, s_end, p_notdigit);
|
||||
val = atoi(s.substr(p - s.begin(), end - p).c_str());
|
||||
}
|
||||
} else {
|
||||
end = find_if(p, s_end, p_notalnum);
|
||||
string code = s.substr(p - s.begin(), end - p);
|
||||
map<string, string>::const_iterator i;
|
||||
i = my_named_ents.find(code);
|
||||
if (i != my_named_ents.end())
|
||||
subs = i->second;
|
||||
}
|
||||
|
||||
if (end < s_end && *end == ';')
|
||||
end++;
|
||||
|
||||
if (val) {
|
||||
// The code is the code position for a unicode char. We need
|
||||
// to translate it to an utf-8 string.
|
||||
string utf16be;
|
||||
utf16be += char(val / 256);
|
||||
utf16be += char(val % 256);
|
||||
transcode(utf16be, subs, "UTF-16BE", "UTF-8");
|
||||
}
|
||||
if (end < s_end && *end == ';')
|
||||
end++;
|
||||
|
||||
if (val) {
|
||||
// The code is the code position for a unicode char. We need
|
||||
// to translate it to an utf-8 string.
|
||||
string utf16be;
|
||||
utf16be += char(val / 256);
|
||||
utf16be += char(val % 256);
|
||||
transcode(utf16be, subs, "UTF-16BE", "UTF-8");
|
||||
}
|
||||
|
||||
if (subs.length() > 0) {
|
||||
string::size_type amp_pos = amp - s.begin();
|
||||
s.replace(amp_pos, end - amp, subs);
|
||||
s_end = s.end();
|
||||
// We've modified the string, so the iterators are no longer
|
||||
// valid...
|
||||
amp = s.begin() + amp_pos + subs.length();
|
||||
} else {
|
||||
amp = end;
|
||||
}
|
||||
if (subs.length() > 0) {
|
||||
string::size_type amp_pos = amp - s.begin();
|
||||
s.replace(amp_pos, end - amp, subs);
|
||||
s_end = s.end();
|
||||
// We've modified the string, so the iterators are no longer
|
||||
// valid...
|
||||
amp = s.begin() + amp_pos + subs.length();
|
||||
} else {
|
||||
amp = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -265,35 +265,35 @@ MyHtmlParser::process_text(const string &text)
|
||||
CancelCheck::instance().checkCancel();
|
||||
|
||||
if (!in_script_tag && !in_style_tag) {
|
||||
if (in_title_tag) {
|
||||
titledump += text;
|
||||
} else if (!in_pre_tag) {
|
||||
string::size_type b = 0;
|
||||
bool only_space = true;
|
||||
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
||||
only_space = false;
|
||||
// If space specifically needed or chunk begins with
|
||||
// whitespace, add exactly one space
|
||||
if (pending_space || b != 0) {
|
||||
dump += ' ';
|
||||
}
|
||||
pending_space = true;
|
||||
string::size_type e = text.find_first_of(WHITESPACE, b);
|
||||
if (e == string::npos) {
|
||||
dump += text.substr(b);
|
||||
pending_space = false;
|
||||
break;
|
||||
}
|
||||
dump += text.substr(b, e - b);
|
||||
b = e + 1;
|
||||
}
|
||||
if (only_space)
|
||||
pending_space = true;
|
||||
} else {
|
||||
if (pending_space)
|
||||
dump += ' ';
|
||||
dump += text;
|
||||
}
|
||||
if (in_title_tag) {
|
||||
titledump += text;
|
||||
} else if (!in_pre_tag) {
|
||||
string::size_type b = 0;
|
||||
bool only_space = true;
|
||||
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
||||
only_space = false;
|
||||
// If space specifically needed or chunk begins with
|
||||
// whitespace, add exactly one space
|
||||
if (pending_space || b != 0) {
|
||||
dump += ' ';
|
||||
}
|
||||
pending_space = true;
|
||||
string::size_type e = text.find_first_of(WHITESPACE, b);
|
||||
if (e == string::npos) {
|
||||
dump += text.substr(b);
|
||||
pending_space = false;
|
||||
break;
|
||||
}
|
||||
dump += text.substr(b, e - b);
|
||||
b = e + 1;
|
||||
}
|
||||
if (only_space)
|
||||
pending_space = true;
|
||||
} else {
|
||||
if (pending_space)
|
||||
dump += ' ';
|
||||
dump += text;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -305,175 +305,186 @@ MyHtmlParser::opening_tag(const string &tag)
|
||||
cout << "TAG: " << tag << ": " << endl;
|
||||
map<string, string>::const_iterator x;
|
||||
for (x = p.begin(); x != p.end(); x++) {
|
||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||
}
|
||||
#endif
|
||||
if (tag.empty()) return true;
|
||||
switch (tag[0]) {
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
break;
|
||||
case 'b':
|
||||
// body: some bad docs have several opening body tags and
|
||||
// even text before the body is displayed by Opera and
|
||||
// Firefox. We used to reset the dump each time we saw a
|
||||
// body tag, but I can't see any reason to do so.
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
break;
|
||||
case 'b':
|
||||
// body: some bad docs have several opening body tags and
|
||||
// even text before the body is displayed by Opera and
|
||||
// Firefox. We used to reset the dump each time we saw a
|
||||
// body tag, but I can't see any reason to do so.
|
||||
|
||||
if (tag == "blockquote" || tag == "br") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'c':
|
||||
if (tag == "center") pending_space = true;
|
||||
break;
|
||||
case 'd':
|
||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
||||
tag == "dt") pending_space = true;
|
||||
if (tag == "dt")
|
||||
dump += '\n';
|
||||
break;
|
||||
case 'e':
|
||||
if (tag == "embed") pending_space = true;
|
||||
break;
|
||||
case 'f':
|
||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
||||
break;
|
||||
case 'h':
|
||||
// hr, and h1, ..., h6
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1])) {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'i':
|
||||
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
|
||||
tag == "input") pending_space = true;
|
||||
break;
|
||||
case 'k':
|
||||
if (tag == "keygen") pending_space = true;
|
||||
break;
|
||||
case 'l':
|
||||
if (tag == "legend" || tag == "li" || tag == "listing") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "meta") {
|
||||
string content;
|
||||
if (get_parameter(cstr_html_content, content)) {
|
||||
string name;
|
||||
if (get_parameter("name", name)) {
|
||||
lowercase_term(name);
|
||||
if (name == "date") {
|
||||
// Specific to Recoll filters.
|
||||
decode_entities(content);
|
||||
struct tm tm;
|
||||
memset(&tm, 0, sizeof(tm));
|
||||
if (strptime(content.c_str(),
|
||||
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
||||
strptime(content.c_str(),
|
||||
"%Y-%m-%dT%H:%M:%S", &tm)
|
||||
) {
|
||||
char ascuxtime[100];
|
||||
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
||||
dmtime = ascuxtime;
|
||||
}
|
||||
} else if (name == "robots") {
|
||||
} else {
|
||||
string markup;
|
||||
bool ishtml = false;
|
||||
if (get_parameter("markup", markup)) {
|
||||
if (!stringlowercmp("html", markup)) {
|
||||
ishtml = true;
|
||||
}
|
||||
}
|
||||
if (!meta[name].empty())
|
||||
meta[name] += ' ';
|
||||
decode_entities(content);
|
||||
meta[name] += content;
|
||||
if (ishtml &&
|
||||
meta[name].compare(0, cstr_fldhtm.size(),
|
||||
cstr_fldhtm)) {
|
||||
meta[name].insert(0, cstr_fldhtm);
|
||||
}
|
||||
}
|
||||
}
|
||||
string hdr;
|
||||
if (get_parameter("http-equiv", hdr)) {
|
||||
lowercase_term(hdr);
|
||||
if (hdr == "content-type") {
|
||||
MimeHeaderValue p;
|
||||
parseMimeHeaderValue(content, p);
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find(cstr_html_charset)) !=
|
||||
p.params.end()) {
|
||||
charset = k->second;
|
||||
if (!charset.empty() &&
|
||||
!samecharset(charset, fromcharset)) {
|
||||
LOGDEB1("Doc http-equiv charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
|
||||
throw false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
string newcharset;
|
||||
if (get_parameter(cstr_html_charset, newcharset)) {
|
||||
// HTML5 added: <meta charset="...">
|
||||
lowercase_term(newcharset);
|
||||
charset = newcharset;
|
||||
if (!charset.empty() &&
|
||||
!samecharset(charset, fromcharset)) {
|
||||
LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
|
||||
throw false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
} else if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'o':
|
||||
if (tag == "ol" || tag == "option") pending_space = true;
|
||||
break;
|
||||
case 'p':
|
||||
if (tag == "p" || tag == "plaintext") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
} else if (tag == "pre") {
|
||||
in_pre_tag = true;
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'q':
|
||||
if (tag == "q") pending_space = true;
|
||||
break;
|
||||
case 's':
|
||||
if (tag == "style") {
|
||||
in_style_tag = true;
|
||||
break;
|
||||
} else if (tag == "script") {
|
||||
in_script_tag = true;
|
||||
break;
|
||||
} else if (tag == "select")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 't':
|
||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
||||
tag == "th") {
|
||||
pending_space = true;
|
||||
} else if (tag == "title") {
|
||||
in_title_tag = true;
|
||||
}
|
||||
break;
|
||||
case 'u':
|
||||
if (tag == "ul") pending_space = true;
|
||||
break;
|
||||
case 'x':
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
if (tag == "blockquote" || tag == "br") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'c':
|
||||
if (tag == "center") pending_space = true;
|
||||
break;
|
||||
case 'd':
|
||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
||||
tag == "dt") pending_space = true;
|
||||
if (tag == "dt")
|
||||
dump += '\n';
|
||||
break;
|
||||
case 'e':
|
||||
if (tag == "embed") pending_space = true;
|
||||
break;
|
||||
case 'f':
|
||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
||||
break;
|
||||
case 'h':
|
||||
// hr, and h1, ..., h6
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1])) {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'i':
|
||||
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
|
||||
tag == "input") pending_space = true;
|
||||
break;
|
||||
case 'k':
|
||||
if (tag == "keygen") pending_space = true;
|
||||
break;
|
||||
case 'l':
|
||||
if (tag == "legend" || tag == "li" || tag == "listing") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "meta") {
|
||||
string content;
|
||||
if (get_parameter(cstr_html_content, content)) {
|
||||
string name;
|
||||
if (get_parameter("name", name)) {
|
||||
lowercase_term(name);
|
||||
if (name == "date") {
|
||||
// Specific to Recoll filters.
|
||||
decode_entities(content);
|
||||
struct tm tm;
|
||||
memset(&tm, 0, sizeof(tm));
|
||||
if (strptime(content.c_str(),
|
||||
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
||||
strptime(content.c_str(),
|
||||
"%Y-%m-%dT%H:%M:%S", &tm)
|
||||
) {
|
||||
char ascuxtime[100];
|
||||
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
||||
dmtime = ascuxtime;
|
||||
}
|
||||
} else if (name == "robots") {
|
||||
} else {
|
||||
string markup;
|
||||
bool ishtml = false;
|
||||
if (get_parameter("markup", markup)) {
|
||||
if (!stringlowercmp("html", markup)) {
|
||||
ishtml = true;
|
||||
}
|
||||
}
|
||||
decode_entities(content);
|
||||
// Set metadata field, avoid appending
|
||||
// multiple identical instances.
|
||||
auto it = meta.find(name);
|
||||
if (it == meta.end() || it->second.find(content) ==
|
||||
string::npos) {
|
||||
if (it != meta.end()) {
|
||||
it->second += ' ';
|
||||
it->second += content;
|
||||
} else {
|
||||
meta[name] = content;
|
||||
}
|
||||
}
|
||||
if (ishtml &&
|
||||
meta[name].compare(0, cstr_fldhtm.size(),
|
||||
cstr_fldhtm)) {
|
||||
meta[name].insert(0, cstr_fldhtm);
|
||||
}
|
||||
}
|
||||
}
|
||||
string hdr;
|
||||
if (get_parameter("http-equiv", hdr)) {
|
||||
lowercase_term(hdr);
|
||||
if (hdr == "content-type") {
|
||||
MimeHeaderValue p;
|
||||
parseMimeHeaderValue(content, p);
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find(cstr_html_charset)) !=
|
||||
p.params.end()) {
|
||||
charset = k->second;
|
||||
if (!charset.empty() &&
|
||||
!samecharset(charset, fromcharset)) {
|
||||
LOGDEB1("Doc http-equiv charset '" << charset <<
|
||||
"' differs from dir deflt '" <<
|
||||
fromcharset << "'\n");
|
||||
throw false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
string newcharset;
|
||||
if (get_parameter(cstr_html_charset, newcharset)) {
|
||||
// HTML5 added: <meta charset="...">
|
||||
lowercase_term(newcharset);
|
||||
charset = newcharset;
|
||||
if (!charset.empty() &&
|
||||
!samecharset(charset, fromcharset)) {
|
||||
LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
|
||||
throw false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
} else if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'o':
|
||||
if (tag == "ol" || tag == "option") pending_space = true;
|
||||
break;
|
||||
case 'p':
|
||||
if (tag == "p" || tag == "plaintext") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
} else if (tag == "pre") {
|
||||
in_pre_tag = true;
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'q':
|
||||
if (tag == "q") pending_space = true;
|
||||
break;
|
||||
case 's':
|
||||
if (tag == "style") {
|
||||
in_style_tag = true;
|
||||
break;
|
||||
} else if (tag == "script") {
|
||||
in_script_tag = true;
|
||||
break;
|
||||
} else if (tag == "select")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 't':
|
||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
||||
tag == "th") {
|
||||
pending_space = true;
|
||||
} else if (tag == "title") {
|
||||
in_title_tag = true;
|
||||
}
|
||||
break;
|
||||
case 'u':
|
||||
if (tag == "ul") pending_space = true;
|
||||
break;
|
||||
case 'x':
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -484,85 +495,85 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||
LOGDEB2("closing_tag: [" << (tag) << "]\n" );
|
||||
if (tag.empty()) return true;
|
||||
switch (tag[0]) {
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
break;
|
||||
case 'b':
|
||||
// body: We used to signal and end of doc here by returning
|
||||
// false but the browsers just ignore body and html
|
||||
// closing tags if there is further text, so it seems right
|
||||
// to do the same
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
break;
|
||||
case 'b':
|
||||
// body: We used to signal and end of doc here by returning
|
||||
// false but the browsers just ignore body and html
|
||||
// closing tags if there is further text, so it seems right
|
||||
// to do the same
|
||||
|
||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||
break;
|
||||
case 'c':
|
||||
if (tag == "center") pending_space = true;
|
||||
break;
|
||||
case 'd':
|
||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
||||
tag == "dt") pending_space = true;
|
||||
break;
|
||||
case 'f':
|
||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
||||
break;
|
||||
case 'h':
|
||||
// hr, and h1, ..., h6
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'i':
|
||||
if (tag == "iframe") pending_space = true;
|
||||
break;
|
||||
case 'l':
|
||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "marquee" || tag == "menu") pending_space = true;
|
||||
break;
|
||||
case 'o':
|
||||
if (tag == "ol" || tag == "option") pending_space = true;
|
||||
break;
|
||||
case 'p':
|
||||
if (tag == "p") {
|
||||
pending_space = true;
|
||||
} else if (tag == "pre") {
|
||||
pending_space = true;
|
||||
in_pre_tag = false;
|
||||
}
|
||||
break;
|
||||
case 'q':
|
||||
if (tag == "q") pending_space = true;
|
||||
break;
|
||||
case 's':
|
||||
if (tag == "style") {
|
||||
in_style_tag = false;
|
||||
break;
|
||||
}
|
||||
if (tag == "script") {
|
||||
in_script_tag = false;
|
||||
break;
|
||||
}
|
||||
if (tag == "select") pending_space = true;
|
||||
break;
|
||||
case 't':
|
||||
if (tag == "title") {
|
||||
in_title_tag = false;
|
||||
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
|
||||
meta["title"] = titledump;
|
||||
titledump.clear();
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
||||
tag == "th") pending_space = true;
|
||||
break;
|
||||
case 'u':
|
||||
if (tag == "ul") pending_space = true;
|
||||
break;
|
||||
case 'x':
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||
break;
|
||||
case 'c':
|
||||
if (tag == "center") pending_space = true;
|
||||
break;
|
||||
case 'd':
|
||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
||||
tag == "dt") pending_space = true;
|
||||
break;
|
||||
case 'f':
|
||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
||||
break;
|
||||
case 'h':
|
||||
// hr, and h1, ..., h6
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'i':
|
||||
if (tag == "iframe") pending_space = true;
|
||||
break;
|
||||
case 'l':
|
||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "marquee" || tag == "menu") pending_space = true;
|
||||
break;
|
||||
case 'o':
|
||||
if (tag == "ol" || tag == "option") pending_space = true;
|
||||
break;
|
||||
case 'p':
|
||||
if (tag == "p") {
|
||||
pending_space = true;
|
||||
} else if (tag == "pre") {
|
||||
pending_space = true;
|
||||
in_pre_tag = false;
|
||||
}
|
||||
break;
|
||||
case 'q':
|
||||
if (tag == "q") pending_space = true;
|
||||
break;
|
||||
case 's':
|
||||
if (tag == "style") {
|
||||
in_style_tag = false;
|
||||
break;
|
||||
}
|
||||
if (tag == "script") {
|
||||
in_script_tag = false;
|
||||
break;
|
||||
}
|
||||
if (tag == "select") pending_space = true;
|
||||
break;
|
||||
case 't':
|
||||
if (tag == "title") {
|
||||
in_title_tag = false;
|
||||
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
|
||||
meta["title"] = titledump;
|
||||
titledump.clear();
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
||||
tag == "th") pending_space = true;
|
||||
break;
|
||||
case 'u':
|
||||
if (tag == "ul") pending_space = true;
|
||||
break;
|
||||
case 'x':
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user