Moved some recoll-specific code from smallut to rclutil
This commit is contained in:
parent
960a4649d3
commit
225b59e5ee
@ -41,6 +41,7 @@
|
||||
#include <unordered_map>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
|
||||
#include "rclutil.h"
|
||||
#include "pathut.h"
|
||||
@ -648,9 +649,72 @@ bool thumbPathForUrl(const string& url, int size, string& path)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Compare charset names, removing the more common spelling variations:
// the comparison ignores character case and drops every '_' and '-', so
// that e.g. "UTF-8", "utf8" and "Utf_8" are all considered the same name.
//
// @param cs1 first charset name
// @param cs2 second charset name
// @return true if both names are equal once normalized
bool samecharset(const std::string& cs1, const std::string& cs2)
{
    // Build the normalized form in a real std::string. Accumulating into a
    // const char* would do pointer arithmetic and then compare addresses
    // instead of contents.
    const auto normalized = [](const std::string& name) {
        std::string out;
        out.reserve(name.size());
        for (const unsigned char ch : name) {
            if (ch != '_' && ch != '-') {
                // ch is unsigned char, the only range tolower() accepts
                out.push_back(static_cast<char>(::tolower(ch)));
            }
        }
        return out;
    };
    return normalized(cs1) == normalized(cs2);
}
|
||||
|
||||
// Lookup table: two-letter language code -> name of an 8-bit (or legacy
// multibyte) character set associated with that language. Languages that
// are not listed here have no entry; see langtocode() for the fallback.
static const std::unordered_map<std::string, std::string> lang_to_code {
    {"be", "cp1251"},      {"bg", "cp1251"},
    {"cs", "iso-8859-2"},  {"el", "iso-8859-7"},
    {"he", "iso-8859-8"},  {"hr", "iso-8859-2"},
    {"hu", "iso-8859-2"},  {"ja", "eucjp"},
    {"kk", "pt154"},       {"ko", "euckr"},
    {"lt", "iso-8859-13"}, {"lv", "iso-8859-13"},
    {"pl", "iso-8859-2"},  {"rs", "iso-8859-2"},
    {"ro", "iso-8859-2"},  {"ru", "koi8-r"},
    {"sk", "iso-8859-2"},  {"sl", "iso-8859-2"},
    {"sr", "iso-8859-2"},  {"th", "iso-8859-11"},
    {"tr", "iso-8859-9"},  {"uk", "koi8-u"},
};
|
||||
|
||||
string langtocode(const string& lang)
|
||||
{
|
||||
const auto it = lang_to_code.find(lang);
|
||||
|
||||
// Use cp1252 by default...
|
||||
if (it == lang_to_code.end()) {
|
||||
return cstr_cp1252;
|
||||
}
|
||||
|
||||
return it->second;
|
||||
}
|
||||
|
||||
// Divine language from locale.
//
// Reads the LANG environment variable and returns the part that precedes
// the first '_' (the whole value if there is no '_').
// Returns "en" when LANG is unset, empty, "C" or "POSIX".
std::string localelang()
{
    const char *envlang = getenv("LANG");

    const bool missing = (envlang == nullptr) || (*envlang == 0);
    if (missing || strcmp(envlang, "C") == 0 || strcmp(envlang, "POSIX") == 0) {
        return "en";
    }

    const std::string full(envlang);
    // find() yields npos when there is no '_'; substr(0, npos) is then the
    // complete string, so both cases are handled by the same expression.
    return full.substr(0, full.find('_'));
}
|
||||
|
||||
void rclutil_init_mt()
|
||||
{
|
||||
path_pkgdatadir();
|
||||
tmplocation();
|
||||
thumbnailsdir();
|
||||
// Init langtocode() static table
|
||||
langtocode("");
|
||||
}
|
||||
|
||||
@ -122,4 +122,11 @@ template <class T> void map_ss_cp_noshr(T s, T *d);
|
||||
template <class T> void addmeta(T& store, const std::string& nm,
|
||||
const std::string& value);
|
||||
|
||||
// Compare charset names, removing the more common spelling variations
|
||||
extern bool samecharset(const std::string& cs1, const std::string& cs2);
|
||||
// Divine language from locale
|
||||
extern std::string localelang();
|
||||
// Divine 8bit charset from language
|
||||
extern std::string langtocode(const std::string& lang);
|
||||
|
||||
#endif /* _RCLUTIL_H_INCLUDED_ */
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2006-2016 J.F.Dockes
|
||||
/* Copyright (C) 2006-2020 J.F.Dockes
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
@ -15,10 +15,24 @@
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
* 02110-1301 USA
|
||||
*/
|
||||
#include "smallut.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cerrno>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cinttypes>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <numeric>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#ifdef _WIN32
|
||||
// needed for localtime_r under mingw?
|
||||
@ -28,11 +42,6 @@
|
||||
#endif /* _MSC_VER */
|
||||
#endif /* _WIN32 */
|
||||
|
||||
#include <ctime>
|
||||
#include <cctype>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
// Older compilers don't support stdc++ regex, but Windows does not
|
||||
// have the Linux one. Have a simple class to solve the simple cases.
|
||||
@ -44,48 +53,13 @@
|
||||
#include <regex.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <numeric>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "smallut.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Case-insensitive string comparison (byte-wise, using toupper(); no
// knowledge of UTF-8).
//
// The strings are compared over their common length after upper-casing
// each byte. If a difference is found, the result is 1 when the byte from
// s1 is greater (compared as char) and -1 otherwise. If one string is a
// case-insensitive prefix of the other, the shorter one sorts first.
//
// @return 0 if equal ignoring case, -1 if s1 sorts before s2, 1 otherwise
int stringicmp(const std::string& s1, const std::string& s2)
{
    const std::string::size_type size1 = s1.length();
    const std::string::size_type size2 = s2.length();
    const std::string::size_type common = size1 < size2 ? size1 : size2;

    for (std::string::size_type i = 0; i < common; ++i) {
        // toupper() is only defined for unsigned char values (and EOF), so
        // convert before the call; the result is compared as char, as before.
        const char c1 = static_cast<char>(
            ::toupper(static_cast<unsigned char>(s1[i])));
        const char c2 = static_cast<char>(
            ::toupper(static_cast<unsigned char>(s2[i])));
        if (c1 != c2) {
            return c1 > c2 ? 1 : -1;
        }
    }

    // Common part is identical: decide on length
    if (size1 == size2) {
        return 0;
    }
    return size1 < size2 ? -1 : 1;
}
|
||||
|
||||
void stringtolower(string& io)
|
||||
{
|
||||
std::transform(io.begin(), io.end(), io.begin(), [](unsigned char c) { return std::tolower(c); });
|
||||
@ -110,22 +84,6 @@ string stringtoupper(const string& i)
|
||||
return o;
|
||||
}
|
||||
|
||||
// Case-insensitive comparison of two strings starting from their ends.
// Walks both strings backwards over the length of the shorter one,
// upper-casing each character with toupper().
//
// @return 0 if no difference was found before one of the strings was
//         exhausted (i.e. one is a case-insensitive suffix of the other),
//         else 1 / -1 according to the first differing character pair,
//         counted from the end.
extern int stringisuffcmp(const std::string& s1, const std::string& s2)
{
    auto tail1 = s1.rbegin();
    auto tail2 = s2.rbegin();
    for (; tail1 != s1.rend() && tail2 != s2.rend(); ++tail1, ++tail2) {
        const char up1 = ::toupper(*tail1);
        const char up2 = ::toupper(*tail2);
        if (up1 == up2) {
            continue;
        }
        return up1 > up2 ? 1 : -1;
    }
    return 0;
}
|
||||
|
||||
// s1 is already lowercase
|
||||
int stringlowercmp(const string& s1, const string& s2)
|
||||
{
|
||||
@ -193,14 +151,6 @@ bool beginswith(const std::string& big, const std::string& small)
|
||||
return big.compare(0, small.size(), small) == 0;
|
||||
}
|
||||
|
||||
// Compare charset names, removing the more common spelling variations:
// the comparison ignores character case and drops every '_' and '-', so
// that e.g. "UTF-8", "utf8" and "Utf_8" are all considered the same name.
//
// @param cs1 first charset name
// @param cs2 second charset name
// @return true if both names are equal once normalized
bool samecharset(const std::string& cs1, const std::string& cs2)
{
    // Build the normalized form in a real std::string. Accumulating into a
    // const char* would do pointer arithmetic and then compare addresses
    // instead of contents.
    const auto normalized = [](const std::string& name) {
        std::string out;
        out.reserve(name.size());
        for (const unsigned char ch : name) {
            if (ch != '_' && ch != '-') {
                // ch is unsigned char, the only range tolower() accepts
                out.push_back(static_cast<char>(::tolower(ch)));
            }
        }
        return out;
    };
    return normalized(cs1) == normalized(cs2);
}
|
||||
|
||||
template <class T> bool stringToStrings(const string& s, T& tokens,
|
||||
const string& addseps)
|
||||
{
|
||||
@ -310,15 +260,6 @@ template <class T> bool stringToStrings(const string& s, T& tokens,
|
||||
return true;
|
||||
}
|
||||
|
||||
template bool stringToStrings<list<string> >(const string&,
|
||||
list<string>&, const string&);
|
||||
template bool stringToStrings<vector<string> >(const string&,
|
||||
vector<string>&, const string&);
|
||||
template bool stringToStrings<set<string> >(const string&,
|
||||
set<string>&, const string&);
|
||||
template bool stringToStrings<std::unordered_set<string> >
|
||||
(const string&, std::unordered_set<string>&, const string&);
|
||||
|
||||
template <class T> void stringsToString(const T& tokens, string& s)
|
||||
{
|
||||
for (auto it = tokens.begin();
|
||||
@ -347,20 +288,13 @@ template <class T> void stringsToString(const T& tokens, string& s)
|
||||
}
|
||||
}
|
||||
}
|
||||
template void stringsToString<list<string> >(const list<string>&, string&);
|
||||
template void stringsToString<vector<string> >(const vector<string>&, string&);
|
||||
template void stringsToString<set<string> >(const set<string>&, string&);
|
||||
template void stringsToString<unordered_set<string> >(const unordered_set<string>&, string&);
|
||||
|
||||
// Convenience overload: same as the two-argument stringsToString(), but
// the result is returned by value instead of being stored through an
// output parameter.
template <class T> std::string stringsToString(const T& tokens)
{
    std::string joined;
    stringsToString<T>(tokens, joined);
    return joined;
}
|
||||
template string stringsToString<list<string> >(const list<string>&);
|
||||
template string stringsToString<vector<string> >(const vector<string>&);
|
||||
template string stringsToString<set<string> >(const set<string>&);
|
||||
template string stringsToString<unordered_set<string> >(const unordered_set<string>&);
|
||||
|
||||
template <class T> void stringsToCSV(const T& tokens, string& s,
|
||||
char sep)
|
||||
@ -392,9 +326,30 @@ template <class T> void stringsToCSV(const T& tokens, string& s,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SMALLUT_EXTERNAL_INSTANTIATIONS
|
||||
#include "smallut_instantiate.h"
|
||||
#else
|
||||
template bool stringToStrings<list<string> >(const string&,
|
||||
list<string>&, const string&);
|
||||
template bool stringToStrings<vector<string> >(const string&,
|
||||
vector<string>&, const string&);
|
||||
template bool stringToStrings<set<string> >(const string&,
|
||||
set<string>&, const string&);
|
||||
template bool stringToStrings<std::unordered_set<string> >
|
||||
(const string&, std::unordered_set<string>&, const string&);
|
||||
template void stringsToString<list<string> >(const list<string>&, string&);
|
||||
template void stringsToString<vector<string> >(const vector<string>&, string&);
|
||||
template void stringsToString<set<string> >(const set<string>&, string&);
|
||||
template void stringsToString<unordered_set<string> >(const unordered_set<string>&, string&);
|
||||
template string stringsToString<list<string> >(const list<string>&);
|
||||
template string stringsToString<vector<string> >(const vector<string>&);
|
||||
template string stringsToString<set<string> >(const set<string>&);
|
||||
template string stringsToString<unordered_set<string> >(const unordered_set<string>&);
|
||||
template void stringsToCSV<list<string> >(const list<string>&, string&, char);
|
||||
template void stringsToCSV<vector<string> >(const vector<string>&, string&,
|
||||
char);
|
||||
#endif
|
||||
|
||||
void stringToTokens(const string& str, vector<string>& tokens,
|
||||
const string& delims, bool skipinit)
|
||||
@ -1203,61 +1158,7 @@ void catstrerror(string *reason, const char *what, int _errno)
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Lookup table: two-letter language code -> name of an 8-bit (or legacy
// multibyte) character set associated with that language.
static const std::unordered_map<std::string, std::string> lang_to_code {
    {"be", "cp1251"},      {"bg", "cp1251"},
    {"cs", "iso-8859-2"},  {"el", "iso-8859-7"},
    {"he", "iso-8859-8"},  {"hr", "iso-8859-2"},
    {"hu", "iso-8859-2"},  {"ja", "eucjp"},
    {"kk", "pt154"},       {"ko", "euckr"},
    {"lt", "iso-8859-13"}, {"lv", "iso-8859-13"},
    {"pl", "iso-8859-2"},  {"rs", "iso-8859-2"},
    {"ro", "iso-8859-2"},  {"ru", "koi8-r"},
    {"sk", "iso-8859-2"},  {"sl", "iso-8859-2"},
    {"sr", "iso-8859-2"},  {"th", "iso-8859-11"},
    {"tr", "iso-8859-9"},  {"uk", "koi8-u"},
};

// Charset returned for languages which are not in the table
static const std::string cstr_cp1252("CP1252");

// Divine 8bit charset from language.
//
// @param lang language code, looked up as-is in lang_to_code
// @return the charset name registered for lang, or "CP1252" when the
//         language has no entry in the table
std::string langtocode(const std::string& lang)
{
    const auto found = lang_to_code.find(lang);
    // Unknown language: use cp1252 by default
    return found != lang_to_code.end() ? found->second : cstr_cp1252;
}
|
||||
|
||||
// Divine language from locale.
//
// Reads the LANG environment variable and returns the part that precedes
// the first '_' (the whole value if there is no '_').
// Returns "en" when LANG is unset, empty, "C" or "POSIX".
std::string localelang()
{
    const char *envlang = getenv("LANG");

    const bool missing = (envlang == nullptr) || (*envlang == 0);
    if (missing || strcmp(envlang, "C") == 0 || strcmp(envlang, "POSIX") == 0) {
        return "en";
    }

    const std::string full(envlang);
    // find() yields npos when there is no '_'; substr(0, npos) is then the
    // complete string, so both cases are handled by the same expression.
    return full.substr(0, full.find('_'));
}
|
||||
|
||||
#ifndef SMALLUT_NO_REGEX
|
||||
#ifdef USE_STD_REGEX
|
||||
|
||||
class SimpleRegexp::Internal {
|
||||
@ -1283,6 +1184,17 @@ bool SimpleRegexp::simpleMatch(const string& val) const
|
||||
return regex_search(val, m->res, m->expr);
|
||||
}
|
||||
|
||||
// Substitute one instance of regular expression
|
||||
std::string SimpleRegexp::simpleSub(
|
||||
const std::string& in, const std::string& repl)
|
||||
{
|
||||
if (!ok()) {
|
||||
return std::string();
|
||||
}
|
||||
return regex_replace(
|
||||
in, m->expr, repl, std::regex_constants::format_first_only);
|
||||
}
|
||||
|
||||
string SimpleRegexp::getMatch(const string&, int i) const
|
||||
{
|
||||
return m->res.str(i);
|
||||
@ -1309,6 +1221,36 @@ public:
|
||||
vector<regmatch_t> matches;
|
||||
};
|
||||
|
||||
// Substitute one instance of regular expression
|
||||
std::string SimpleRegexp::simpleSub(
|
||||
const std::string& in, const std::string& repl)
|
||||
{
|
||||
if (!ok()) {
|
||||
return std::string();
|
||||
}
|
||||
|
||||
int err;
|
||||
if ((err = regexec(&m->expr, in.c_str(),
|
||||
m->nmatch + 1, &m->matches[0], 0))) {
|
||||
#if SIMPLESUB_DBG
|
||||
const int ERRSIZE = 200;
|
||||
char errbuf[ERRSIZE + 1];
|
||||
regerror(err, &expr, errbuf, ERRSIZE);
|
||||
std::cerr << "simpleSub: regexec(" << sexp << ") failed: "
|
||||
<< errbuf << "\n";
|
||||
#endif
|
||||
return in;
|
||||
}
|
||||
if (m->matches[0].rm_so == -1) {
|
||||
// No match
|
||||
return in;
|
||||
}
|
||||
string out = in.substr(0, m->matches[0].rm_so);
|
||||
out += repl;
|
||||
out += in.substr(m->matches[0].rm_eo);
|
||||
return out;
|
||||
}
|
||||
|
||||
bool SimpleRegexp::simpleMatch(const string& val) const
|
||||
{
|
||||
if (!ok())
|
||||
@ -1325,7 +1267,7 @@ string SimpleRegexp::getMatch(const string& val, int i) const
|
||||
m->matches[i].rm_eo - m->matches[i].rm_so);
|
||||
}
|
||||
|
||||
#endif // win/notwinf
|
||||
#endif // !windows, using C regexps
|
||||
|
||||
SimpleRegexp::SimpleRegexp(const string& exp, int flags, int nmatch)
|
||||
: m(new Internal(exp, flags, nmatch))
|
||||
@ -1346,6 +1288,7 @@ bool SimpleRegexp::operator() (const string& val) const
|
||||
{
|
||||
return simpleMatch(val);
|
||||
}
|
||||
#endif // SMALLUT_NO_REGEX
|
||||
|
||||
string flagsToString(const vector<CharFlags>& flags, unsigned int val)
|
||||
{
|
||||
@ -1386,29 +1329,8 @@ string valToString(const vector<CharFlags>& flags, unsigned int val)
|
||||
return out;
|
||||
}
|
||||
|
||||
unsigned int stringToFlags(const vector<CharFlags>& flags,
|
||||
const string& input, const char *sep)
|
||||
{
|
||||
unsigned int out = 0;
|
||||
|
||||
vector<string> toks;
|
||||
stringToTokens(input, toks, sep);
|
||||
for (auto& tok: toks) {
|
||||
trimstring(tok);
|
||||
out = std::accumulate(
|
||||
flags.begin(), flags.end(), out,
|
||||
[&](unsigned int o, CharFlags flag) {
|
||||
return tok == flag.yesname ? o | flag.value : o;
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
// Initialization for static stuff to be called from main thread before going
|
||||
// multiple
|
||||
void smallut_init_mt()
|
||||
{
|
||||
// Init langtocode() static table
|
||||
langtocode("");
|
||||
}
|
||||
|
||||
@ -23,7 +23,6 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
// Miscellaneous mostly string-oriented small utilities
|
||||
// Note that none of the following code knows about utf-8.
|
||||
@ -71,17 +70,6 @@ extern void stringtoupper(std::string& io);
|
||||
extern std::string stringtoupper(const std::string& io);
|
||||
extern bool beginswith(const std::string& bg, const std::string& sml);
|
||||
|
||||
// Is one string the end part of the other ?
|
||||
extern int stringisuffcmp(const std::string& s1, const std::string& s2);
|
||||
|
||||
// Divine language from locale
|
||||
extern std::string localelang();
|
||||
// Divine 8bit charset from language
|
||||
extern std::string langtocode(const std::string& lang);
|
||||
|
||||
// Compare charset names, removing the more common spelling variations
|
||||
extern bool samecharset(const std::string& cs1, const std::string& cs2);
|
||||
|
||||
// Parse date interval specifier into pair of y,m,d dates. The format
|
||||
// for the time interval is based on a subset of iso 8601 with
|
||||
// the addition of open intervals, and removal of all time indications.
|
||||
@ -104,8 +92,18 @@ struct DateInterval {
|
||||
extern bool parsedateinterval(const std::string& s, DateInterval *di);
|
||||
extern int monthdays(int mon, int year);
|
||||
|
||||
|
||||
/** Note for all templated functions:
|
||||
* By default, smallut.cpp has explicit instantiations for common
|
||||
* containers (list, vector, set, etc.). If this is not enough, or
|
||||
* conversely, if you want to minimize the module size, you can choose
|
||||
* the instantiations by defining the SMALLUT_EXTERNAL_INSTANTIATIONS
|
||||
* compilation flag, and defining the instances in a file named
|
||||
* smallut_instantiate.h
|
||||
*/
|
||||
|
||||
/**
|
||||
* Parse input string into list of strings.
|
||||
* Parse input string into list of strings. See instantiation note above.
|
||||
*
|
||||
* Token delimiter is " \t\n" except inside dquotes. dquote inside
|
||||
* dquotes can be escaped with \ etc...
|
||||
@ -118,7 +116,7 @@ template <class T> bool stringToStrings(const std::string& s, T& tokens,
|
||||
const std::string& addseps = "");
|
||||
|
||||
/**
|
||||
* Inverse operation:
|
||||
* Inverse operation. See instantiation note above.
|
||||
*/
|
||||
template <class T> void stringsToString(const T& tokens, std::string& s);
|
||||
template <class T> std::string stringsToString(const T& tokens);
|
||||
@ -126,12 +124,13 @@ template <class T> std::string stringsToString(const T& tokens);
|
||||
/**
|
||||
* Strings to CSV string. tokens containing the separator are quoted (")
|
||||
* " inside tokens is escaped as "" ([word "quote"] =>["word ""quote"""]
|
||||
* See instantiation note above.
|
||||
*/
|
||||
template <class T> void stringsToCSV(const T& tokens, std::string& s,
|
||||
char sep = ',');
|
||||
|
||||
/**
|
||||
* Split input string. No handling of quoting
|
||||
* Split input string. No handling of quoting.
|
||||
*/
|
||||
extern void stringToTokens(const std::string& s,
|
||||
std::vector<std::string>& tokens,
|
||||
@ -211,6 +210,7 @@ inline void leftzeropad(std::string& s, unsigned len)
|
||||
// (e.g. ac:23:0c:4f:46:fd)
|
||||
extern std::string hexprint(const std::string& in, char separ= 0);
|
||||
|
||||
#ifndef SMALLUT_NO_REGEX
|
||||
// A class to solve platform/compiler issues for simple regex
|
||||
// matches. Uses the appropriate native lib under the hood.
|
||||
// This always uses extended regexp syntax.
|
||||
@ -227,13 +227,19 @@ public:
|
||||
std::string getMatch(const std::string& val, int i) const;
|
||||
/// Calls simpleMatch()
|
||||
bool operator() (const std::string& val) const;
|
||||
|
||||
/// Replace the first occurrence of regexp.
|
||||
std::string simpleSub(const std::string& input, const std::string& repl);
|
||||
|
||||
/// Check after construction
|
||||
bool ok() const;
|
||||
|
||||
|
||||
class Internal;
|
||||
private:
|
||||
Internal *m;
|
||||
};
|
||||
#endif // SMALLUT_NO_REGEX
|
||||
|
||||
/// Utilities for printing names for defined values (Ex: O_RDONLY->"O_RDONLY")
|
||||
|
||||
@ -257,9 +263,4 @@ extern std::string flagsToString(const std::vector<CharFlags>&,
|
||||
/// Translate a value into a name
|
||||
extern std::string valToString(const std::vector<CharFlags>&, unsigned int val);
|
||||
|
||||
/// Reverse operation: translate string into bitfield
|
||||
extern unsigned int
|
||||
stringToFlags(const std::vector<CharFlags>&, const std::string& input,
|
||||
const char *sep = "|");
|
||||
|
||||
#endif /* _SMALLUT_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user