Fix indexing crash caused by unexpected "file -i" output for some (binary) file names

This commit is contained in:
Jean-Francois Dockes 2011-10-13 19:33:38 +02:00
parent e8f63ec124
commit 85191eba16

View File

@ -69,18 +69,36 @@ static string mimetypefromdata(const string &fn, bool usfc)
LOGERR(("mimetypefromdata: doexec: status 0x%x\n", status)); LOGERR(("mimetypefromdata: doexec: status 0x%x\n", status));
return string(); return string();
} }
// LOGDEB(("mimetypefromdata: %s [%s]\n", result.c_str(), fn.c_str())); LOGDEB2(("mimetype: [%s] \"file\" output [%s]\n",
result.c_str(), fn.c_str()));
// The normal output from "file -i" looks like the following:
// thefilename.xxx: text/plain; charset=us-ascii
// Sometimes the semi-colon is missing like in:
// mimetype.cpp: text/x-c charset=us-ascii
// And sometimes we only get the mime type. This apparently happens
// when 'file' believes that the file name is binary
// The result of 'file' execution begins with the file name trimstring(result, " \t\n\r");
// which may contain spaces. We happen to know its size, so
// strip it: // If there is no colon and there is a slash, this is hopefully
if (result.size() <= fn.size()) { // the mime type
if (result.find_first_of(":") == string::npos &&
result.find_first_of("/") != string::npos) {
return result;
}
// Else the result should begin with the file name. Get rid of it:
if (result.find(fn) != 0) {
// Garbage "file" output. Maybe the result of a charset // Garbage "file" output. Maybe the result of a charset
// conversion attempt? // conversion attempt?
LOGERR(("mimetype: can't interpret 'file' output: [%s]\n",
result.c_str()));
return string(); return string();
} }
result = result.substr(fn.size()); result = result.substr(fn.size());
// Now looks like ": text/plain; charset=us-ascii"
// Now should look like ": text/plain; charset=us-ascii"
// Split it, and take second field // Split it, and take second field
list<string> res; list<string> res;
stringToStrings(result, res); stringToStrings(result, res);
@ -88,10 +106,9 @@ static string mimetypefromdata(const string &fn, bool usfc)
return string(); return string();
list<string>::iterator it = res.begin(); list<string>::iterator it = res.begin();
mime = *++it; mime = *++it;
// Remove possible punctuation at the end. Note that this mangles // Remove possible semi-colon at the end
// text/x-c++ if there is no semi-colon... handled in mimeconf :( trimstring(mime, " \t;");
if (mime.length() > 0 && !isalpha(mime[mime.length() - 1]))
mime.erase(mime.length() -1);
// File -i will sometimes return strange stuff (e.g. "very small file") // File -i will sometimes return strange stuff (e.g. "very small file")
if(mime.find("/") == string::npos) if(mime.find("/") == string::npos)
mime.clear(); mime.clear();