ptrans: do not transform the translations input by the user (no path_canon()),

these could be foreign (windows x unix).
fileurltolocalpath: remove the '/' in front of a windows drive letter path even
when not built on Windows
Move all the dubious recoll-specific url code from the generic pathut to rclutil
This commit is contained in:
Jean-Francois Dockes 2022-05-18 08:47:02 +02:00
parent fbfa818a3a
commit be12db218b
8 changed files with 276 additions and 237 deletions

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2006 J.F.Dockes
/* Copyright (C) 2006-2022 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -44,7 +44,7 @@ void EditTrans::init(const string& dbdir)
{
m_dbdir = path_canon(dbdir);
connect(transTW, SIGNAL(itemDoubleClicked(QTableWidgetItem *)),
this, SLOT(onItemDoubleClicked(QTableWidgetItem *)));
this, SLOT(onItemDoubleClicked(QTableWidgetItem *)));
connect(cancelPB, SIGNAL(clicked()), this, SLOT(close()));
QString lab = whatIdxLA->text();
@ -57,18 +57,17 @@ void EditTrans::init(const string& dbdir)
ConfSimple *conftrans = theconfig->getPTrans();
if (!conftrans)
return;
return;
int row = 0;
vector<string> opaths = conftrans->getNames(m_dbdir);
for (vector<string>::const_iterator it = opaths.begin();
it != opaths.end(); it++) {
transTW->setRowCount(row+1);
transTW->setItem(row, 0, new QTableWidgetItem(path2qs(*it)));
string npath;
conftrans->get(*it, npath, m_dbdir);
transTW->setItem(row, 1, new QTableWidgetItem(path2qs(npath)));
row++;
for (const auto& opath : opaths) {
transTW->setRowCount(row+1);
transTW->setItem(row, 0, new QTableWidgetItem(path2qs(opath)));
string npath;
conftrans->get(opath, npath, m_dbdir);
transTW->setItem(row, 1, new QTableWidgetItem(path2qs(npath)));
row++;
}
resize(QSize(640, 300).expandedTo(minimumSizeHint()));
@ -83,18 +82,18 @@ void EditTrans::on_savePB_clicked()
{
ConfSimple *conftrans = theconfig->getPTrans();
if (!conftrans) {
QMessageBox::warning(0, "Recoll", tr("Config error"));
return;
QMessageBox::warning(0, "Recoll", tr("Config error"));
return;
}
conftrans->holdWrites(true);
conftrans->eraseKey(m_dbdir);
for (int row = 0; row < transTW->rowCount(); row++) {
QTableWidgetItem *item0 = transTW->item(row, 0);
string from = path_canon(qs2path(item0->text()));
QTableWidgetItem *item1 = transTW->item(row, 1);
string to = path_canon(qs2path(item1->text()));
conftrans->set(from, to, m_dbdir);
QTableWidgetItem *item0 = transTW->item(row, 0);
string from = qs2path(item0->text());
QTableWidgetItem *item1 = transTW->item(row, 1);
string to = qs2path(item1->text());
conftrans->set(from, to, m_dbdir);
}
conftrans->holdWrites(false);
// The rcldb does not use the same configuration object, but a
@ -118,12 +117,12 @@ void EditTrans::on_delPB_clicked()
QModelIndexList indexes = transTW->selectionModel()->selectedIndexes();
vector<int> rows;
for (int i = 0; i < indexes.size(); i++) {
rows.push_back(indexes.at(i).row());
rows.push_back(indexes.at(i).row());
}
sort(rows.begin(), rows.end());
rows.resize(unique(rows.begin(), rows.end()) - rows.begin());
for (int i = rows.size()-1; i >= 0; i--) {
transTW->removeRow(rows[i]);
transTW->removeRow(rows[i]);
}
}
@ -131,8 +130,8 @@ void EditTrans::on_transTW_itemSelectionChanged()
{
QModelIndexList indexes = transTW->selectionModel()->selectedIndexes();
if(indexes.size() < 1)
delPB->setEnabled(0);
delPB->setEnabled(0);
else
delPB->setEnabled(1);
delPB->setEnabled(1);
}

View File

@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
$(DEFS)
noinst_PROGRAMS = plaintorich textsplit fstreewalk rclconfig hldata unac mbox \
circache wipedir mimetype fileudi x11mon trqrstore ecrontab rcldb
circache wipedir mimetype fileudi x11mon trqrstore ecrontab rcldb rclutil
ecrontab_SOURCES = trecrontab.cpp
ecrontab_LDADD = ../librecoll.la
@ -68,6 +68,9 @@ rclconfig_LDADD = ../librecoll.la
rcldb_SOURCES = trrcldb.cpp
rcldb_LDADD = ../librecoll.la
rclutil_SOURCES = trrclutil.cpp
rclutil_LDADD = ../librecoll.la
textsplit_SOURCES = trtextsplit.cpp
textsplit_LDADD = ../librecoll.la

View File

@ -1,6 +1,30 @@
#include "rclutil.h"
#include <getopt.h>
#include <iostream>
#include <map>
#include "pathut.h"
using namespace std;
// Long options understood by this test driver, keyed by option name (without
// the leading "--"). The mapped int is the flag whose address main() hands to
// getopt_long(): it starts at 0 and main() requests the value 1 for it when
// the option is found on the command line.
static std::map<std::string, int> options {
{"path_to_thumb", 0},
{"url_encode", 0},
};
// Program name: set from argv[0] in main(), printed by Usage().
static const char *thisprog;
// Print the program name followed by the list of supported long options (one
// "--name" per line) on stderr, then exit with status 1.
static void Usage(void)
{
    string optlist;
    for (auto it = options.begin(); it != options.end(); ++it) {
        optlist.append("--").append(it->first).append("\n");
    }
    fprintf(stderr, "%s: usage: %s\n%s", thisprog, thisprog, optlist.c_str());
    exit(1);
}
void path_to_thumb(const string& _input)
{
@ -17,35 +41,52 @@ void path_to_thumb(const string& _input)
thumbPathForUrl(input, 7, path);
cout << path << endl;
}
const char *thisprog;
int main(int argc, const char **argv)
int main(int argc, char **argv)
{
thisprog = *argv++;
argc--;
thisprog = *argv;
std::vector<struct option> long_options;
string s;
vector<string>::const_iterator it;
#if 0
if (argc > 1) {
cerr << "Usage: thumbpath <filepath>" << endl;
exit(1);
for (auto& entry : options) {
struct option opt;
opt.name = entry.first.c_str();
opt.has_arg = 0;
opt.flag = &entry.second;
opt.val = 1;
long_options.push_back(opt);
}
string input;
if (argc == 1) {
input = *argv++;
if (input.empty()) {
cerr << "Usage: thumbpath <filepath>" << endl;
exit(1);
long_options.push_back({0, 0, 0, 0});
while (getopt_long(argc, argv, "", &long_options[0], nullptr) != -1) {
}
if (options["path_to_thumb"]) {
if (optind >= argc) {
cerr << "Usage: trrcutil --path_to_thumb <filepath>" << "\n";
return 1;
}
string input = argv[optind];
optind++;
if (optind != argc) {
return 1;
}
path_to_thumb(input);
} else {
while (getline(cin, input)) {
path_to_thumb(input);
} else if (options["url_encode"]) {
if (optind >= argc) {
cerr << "Usage: trsmallut --url_encode <arg> [offs=0]\n";
return 1;
}
string s = argv[optind];
optind++;
int offs = 0;
if (optind != argc) {
offs = atoi(argv[optind]);
optind++;
}
if (optind != argc) {
return 1;
}
cout << "url_encode(" << s << ", " << offs << ") -> [" << url_encode(s, offs) << "]\n";
} else {
Usage();
}
exit(0);
#endif
}

View File

@ -1107,179 +1107,6 @@ bool path_access(const std::string& path, int mode)
return ACCESS(syspath, mode) == 0;
}
/* Percent-encode the part of url which begins at byte offset offs. The first
 * offs bytes are copied unchanged.
 *
 * What should or should not be percent-encoded in a file:// url is rather
 * vague. Our constraint is that the encoded URL may be used to compute (MD5)
 * a thumbnail path according to the freedesktop.org thumbnail spec, which
 * does not define the escaped set either. We choose to escape exactly what
 * gio does, as implemented in
 * glib/gconvert.c:g_escape_uri_string(uri, UNSAFE_PATH), hoping that the
 * other desktops use the same set.
 * Note that $ is not encoded, so the value is not shell-safe.
 */
string url_encode(const string& url, string::size_type offs)
{
    static const char hexdigits[] = "0123456789ABCDEF";

    string out;
    out.reserve(url.size());
    out.assign(url, 0, offs);

    for (string::size_type pos = offs; pos < url.size(); ++pos) {
        const unsigned char byte = static_cast<unsigned char>(url[pos]);

        // Controls, space, DEL and all 8 bit values are always escaped
        bool escape = byte <= 0x20 || byte >= 0x7f;
        if (!escape) {
            switch (byte) {
            case '"':
            case '#':
            case '%':
            case ';':
            case '<':
            case '>':
            case '?':
            case '[':
            case '\\':
            case ']':
            case '^':
            case '`':
            case '{':
            case '|':
            case '}':
                escape = true;
                break;
            default:
                break;
            }
        }

        if (escape) {
            out += '%';
            out += hexdigits[byte >> 4];
            out += hexdigits[byte & 0xf];
        } else {
            out += static_cast<char>(byte);
        }
    }
    return out;
}
// Numeric value (0-15) of a hexadecimal digit character, upper or lower case.
// Returns -1 if c is not a hexadecimal digit.
static inline int h2d(int c)
{
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    return -1;
}
// Decode the %XX sequences found in the input. A '%' which is followed by two
// characters which are not both hexadecimal digits is copied to the output
// together with these two characters, and scanning resumes after them. Inputs
// shorter than 3 bytes are returned as is.
string url_decode(const string &in)
{
    const string::size_type len = in.size();
    if (len < 3) {
        return in;
    }

    string out;
    out.reserve(len);

    string::size_type pos = 0;
    // An escape sequence needs 3 bytes: stop when fewer than that remain
    while (pos + 2 < len) {
        const char ch = in[pos];
        if (ch != '%') {
            out.push_back(ch);
            ++pos;
            continue;
        }
        const int hi = h2d(in[pos + 1]);
        const int lo = h2d(in[pos + 2]);
        if (hi == -1 || lo == -1) {
            // Not a valid escape: keep the 3 bytes unchanged
            out.append(in, pos, 3);
        } else {
            out.push_back(static_cast<char>((hi << 4) + lo));
        }
        pos += 3;
    }
    // Copy the (at most 2) remaining bytes
    out.append(in, pos, string::npos);
    return out;
}
// Return the part of the url which follows the scheme, processed by
// path_canon(). The input is returned unchanged if it holds no ':', if the
// ':' is its last character, or if anything else than alphanumeric
// characters precedes the ':' (there probably is no scheme then).
string url_gpath(const string& url)
{
    // Locate the end of the access scheme part
    const string::size_type colon = url.find(':');
    if (colon == string::npos || colon == url.size() - 1) {
        return url;
    }
    // If there are non-alphanum chars before the ':', then there
    // probably is no scheme. Whatever...
    for (string::size_type i = 0; i < colon; i++) {
        // The cast is needed: calling isalnum() with a negative value (a
        // non-ASCII byte when char is signed) is undefined behaviour.
        if (!isalnum(static_cast<unsigned char>(url[i]))) {
            return url;
        }
    }

    // In addition we canonize the path to remove empty host parts
    // (for compatibility with older versions of recoll where file://
    // was hardcoded, but the local path was used for doc
    // identification).
    return path_canon(url.substr(colon + 1));
}
string url_parentfolder(const string& url)
{
// In general, the parent is the directory above the full path
string parenturl = path_getfather(url_gpath(url));
// But if this is http, make sure to keep the host part. Recoll
// only has file or http urls for now.
bool isfileurl = urlisfileurl(url);
if (!isfileurl && parenturl == "/") {
parenturl = url_gpath(url);
}
return isfileurl ? string("file://") + parenturl :
string("http://") + parenturl;
}
// Convert a file:// url to a local file path. An empty string is returned if
// the input does not begin with "file://".
// Note: this only works with our internal pseudo-urls which are not
// encoded/escaped
string fileurltolocalpath(string url)
{
    static const char scheme[] = "file://";
    const string::size_type schemelen = sizeof(scheme) - 1;

    // Anchored comparison: no need to scan the whole string for the prefix
    if (url.compare(0, schemelen, scheme) != 0) {
        return string();
    }
    url.erase(0, schemelen);

#ifdef _WIN32
    // Absolute file urls are like: file:///c:/mydir/...
    // Get rid of the initial '/'
    // The cast is needed: calling isalpha() with a negative value (a
    // non-ASCII byte when char is signed) is undefined behaviour.
    if (url.size() >= 3 && url[0] == '/' &&
        isalpha(static_cast<unsigned char>(url[1])) && url[2] == ':') {
        url.erase(0, 1);
    }
#endif

    // Removing the fragment part. This is exclusively used when
    // executing a viewer for the recoll manual, and we only strip the
    // part after # if it is preceded by .html or .htm
    string::size_type pos;
    if ((pos = url.rfind(".html#")) != string::npos) {
        url.erase(pos + 5);
    } else if ((pos = url.rfind(".htm#")) != string::npos) {
        url.erase(pos + 4);
    }

    return url;
}
static const string cstr_fileu("file://");

// Build a file:// url from a path. We're supposed to receive a canonic
// absolute path: if it does not begin with a '/' (which may happen on windows
// because of the drive spec), one is inserted after the scheme.
string path_pathtofileurl(const string& path)
{
    const bool needslash = path.empty() || path[0] != '/';

    string url = cstr_fileu;
    if (needslash) {
        url += '/';
    }
    url.append(path);
    return url;
}
// Return true if the url begins with "file://"
bool urlisfileurl(const string& url)
{
    static const char prefix[] = "file://";
    return url.compare(0, sizeof(prefix) - 1, prefix) == 0;
}
#ifndef NO_STD_REGEX
static std::regex
re_uriparse("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?",

View File

@ -213,22 +213,6 @@ bool path_utimes(const std::string& path, struct path_timeval times[2]);
* @param mode is an std::fstream mode (ios::in etc.) */
extern bool path_streamopen(const std::string& path, int mode, std::fstream& outstream);
/// Percent-encode the part of url which begins at byte offset offs (the
/// first offs bytes are copied unchanged). The escaped set is the one used
/// by glib's g_escape_uri_string(uri, UNSAFE_PATH): controls, space, all
/// bytes >= 0x7f and "#%;<>?[\]^`{|}. '$' is not escaped, so the result is
/// not shell-safe.
extern std::string url_encode(const std::string& url, std::string::size_type offs = 0);
/// Decode the %XX sequences in the input. A '%' not followed by two
/// hexadecimal digits is copied unchanged along with the next two characters.
extern std::string url_decode(const std::string& encoded);
/// Convert to a local file path if url begins with file://, else return an
/// empty string. The parameter is passed by value: the caller's string is
/// not modified. On Windows builds, the '/' in front of a drive letter is
/// removed. A "#fragment" is stripped only when it follows .html or .htm.
/// This only works with our internal pseudo-urls, which are not escaped.
extern std::string fileurltolocalpath(std::string url);
/// Test if the url begins with file://
extern bool urlisfileurl(const std::string& url);
/// Return the url of the parent folder, prefixed with file:// for file urls
/// and with http:// for everything else. For non-file urls, the host part is
/// kept when the parent would otherwise be "/".
extern std::string url_parentfolder(const std::string& url);
/// Return the host+path part of an url: the part after the scheme, processed
/// by path_canon(). The input is returned unchanged if no scheme is found.
/// This is not a general routine, it does the right thing only in the recoll
/// context
extern std::string url_gpath(const std::string& url);
/// Turn absolute path into file:// url. A '/' is inserted after the scheme
/// if the path does not begin with one (Windows drive spec).
extern std::string path_pathtofileurl(const std::string& path);
/// URI parser, loosely from rfc2396.txt
class ParsedUri {
public:

View File

@ -293,6 +293,175 @@ const string& path_pkgdatadir()
return datadir;
}
/* Percent-encode the part of url which begins at byte offset offs. The first
 * offs bytes are copied unchanged.
 *
 * What should or should not be percent-encoded in a file:// url is rather
 * vague. Our constraint is that the encoded URL may be used to compute (MD5)
 * a thumbnail path according to the freedesktop.org thumbnail spec, which
 * does not define the escaped set either. We choose to escape exactly what
 * gio does, as implemented in
 * glib/gconvert.c:g_escape_uri_string(uri, UNSAFE_PATH), hoping that the
 * other desktops use the same set.
 * Note that $ is not encoded, so the value is not shell-safe.
 */
string url_encode(const string& url, string::size_type offs)
{
    static const char hexdigits[] = "0123456789ABCDEF";

    string out;
    out.reserve(url.size());
    out.assign(url, 0, offs);

    for (string::size_type pos = offs; pos < url.size(); ++pos) {
        const unsigned char byte = static_cast<unsigned char>(url[pos]);

        // Controls, space, DEL and all 8 bit values are always escaped
        bool escape = byte <= 0x20 || byte >= 0x7f;
        if (!escape) {
            switch (byte) {
            case '"':
            case '#':
            case '%':
            case ';':
            case '<':
            case '>':
            case '?':
            case '[':
            case '\\':
            case ']':
            case '^':
            case '`':
            case '{':
            case '|':
            case '}':
                escape = true;
                break;
            default:
                break;
            }
        }

        if (escape) {
            out += '%';
            out += hexdigits[byte >> 4];
            out += hexdigits[byte & 0xf];
        } else {
            out += static_cast<char>(byte);
        }
    }
    return out;
}
// Numeric value (0-15) of a hexadecimal digit character, upper or lower case.
// Returns -1 if c is not a hexadecimal digit.
static inline int h2d(int c)
{
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    return -1;
}
// Decode the %XX sequences found in the input. A '%' which is followed by two
// characters which are not both hexadecimal digits is copied to the output
// together with these two characters, and scanning resumes after them. Inputs
// shorter than 3 bytes are returned as is.
string url_decode(const string &in)
{
    const string::size_type len = in.size();
    if (len < 3) {
        return in;
    }

    string out;
    out.reserve(len);

    string::size_type pos = 0;
    // An escape sequence needs 3 bytes: stop when fewer than that remain
    while (pos + 2 < len) {
        const char ch = in[pos];
        if (ch != '%') {
            out.push_back(ch);
            ++pos;
            continue;
        }
        const int hi = h2d(in[pos + 1]);
        const int lo = h2d(in[pos + 2]);
        if (hi == -1 || lo == -1) {
            // Not a valid escape: keep the 3 bytes unchanged
            out.append(in, pos, 3);
        } else {
            out.push_back(static_cast<char>((hi << 4) + lo));
        }
        pos += 3;
    }
    // Copy the (at most 2) remaining bytes
    out.append(in, pos, string::npos);
    return out;
}
// Return the part of the url which follows the scheme, processed by
// path_canon(). The input is returned unchanged if it holds no ':', if the
// ':' is its last character, or if anything else than alphanumeric
// characters precedes the ':' (there probably is no scheme then).
string url_gpath(const string& url)
{
    // Locate the end of the access scheme part
    const string::size_type colon = url.find(':');
    if (colon == string::npos || colon == url.size() - 1) {
        return url;
    }
    // If there are non-alphanum chars before the ':', then there
    // probably is no scheme. Whatever...
    for (string::size_type i = 0; i < colon; i++) {
        // The cast is needed: calling isalnum() with a negative value (a
        // non-ASCII byte when char is signed) is undefined behaviour.
        if (!isalnum(static_cast<unsigned char>(url[i]))) {
            return url;
        }
    }

    // In addition we canonize the path to remove empty host parts
    // (for compatibility with older versions of recoll where file://
    // was hardcoded, but the local path was used for doc
    // identification).
    return path_canon(url.substr(colon + 1));
}
string url_parentfolder(const string& url)
{
// In general, the parent is the directory above the full path
string parenturl = path_getfather(url_gpath(url));
// But if this is http, make sure to keep the host part. Recoll
// only has file or http urls for now.
bool isfileurl = urlisfileurl(url);
if (!isfileurl && parenturl == "/") {
parenturl = url_gpath(url);
}
return isfileurl ? string("file://") + parenturl :
string("http://") + parenturl;
}
// Convert a file:// url to a local file path. An empty string is returned if
// the input does not begin with "file://".
// Note: this only works with our internal pseudo-urls which are not
// encoded/escaped
string fileurltolocalpath(string url)
{
    static const char scheme[] = "file://";
    const string::size_type schemelen = sizeof(scheme) - 1;

    // Anchored comparison: no need to scan the whole string for the prefix
    if (url.compare(0, schemelen, scheme) != 0) {
        return string();
    }
    url.erase(0, schemelen);

    // If this looks like a Windows path: absolute file urls are like:
    // file:///c:/mydir/...
    // Get rid of the initial '/'. This is done on all platforms because the
    // url could be foreign.
    // The cast is needed: calling isalpha() with a negative value (a
    // non-ASCII byte when char is signed) is undefined behaviour.
    if (url.size() >= 3 && url[0] == '/' &&
        isalpha(static_cast<unsigned char>(url[1])) && url[2] == ':') {
        url.erase(0, 1);
    }

    // Removing the fragment part. This is exclusively used when
    // executing a viewer for the recoll manual, and we only strip the
    // part after # if it is preceded by .html or .htm
    string::size_type pos;
    if ((pos = url.rfind(".html#")) != string::npos) {
        url.erase(pos + 5);
    } else if ((pos = url.rfind(".htm#")) != string::npos) {
        url.erase(pos + 4);
    }

    return url;
}
// Build a file:// url from a path. We're supposed to receive a canonic
// absolute path: if it does not begin with a '/' (which may happen on windows
// because of the drive spec), one is inserted after the scheme.
string path_pathtofileurl(const string& path)
{
    const bool needslash = path.empty() || path[0] != '/';

    string url = cstr_fileu;
    if (needslash) {
        url += '/';
    }
    url.append(path);
    return url;
}
// Return true if the url begins with "file://"
bool urlisfileurl(const string& url)
{
    static const char prefix[] = "file://";
    return url.compare(0, sizeof(prefix) - 1, prefix) == 0;
}
// Printable url: this is used to transcode from the system charset
// into either utf-8 if transcoding succeeds, or url-encoded
bool printableUrl(const string& fcharset, const string& in, string& out)

View File

@ -43,6 +43,22 @@ extern const std::string& path_pkgdatadir();
extern std::string path_thisexecpath();
#endif
/// Percent-encode the part of url which begins at byte offset offs (the
/// first offs bytes are copied unchanged). The escaped set is the one used
/// by glib's g_escape_uri_string(uri, UNSAFE_PATH): controls, space, all
/// bytes >= 0x7f and "#%;<>?[\]^`{|}. '$' is not escaped, so the result is
/// not shell-safe.
extern std::string url_encode(const std::string& url, std::string::size_type offs = 0);
/// Decode the %XX sequences in the input. A '%' not followed by two
/// hexadecimal digits is copied unchanged along with the next two characters.
extern std::string url_decode(const std::string& encoded);
/// Convert to a local file path if url begins with file://, else return an
/// empty string. The parameter is passed by value: the caller's string is
/// not modified. The '/' in front of a Windows drive letter is removed on
/// all platforms. A "#fragment" is stripped only when it follows .html or
/// .htm. This only works with our internal pseudo-urls, which are not escaped.
extern std::string fileurltolocalpath(std::string url);
/// Test if the url begins with file://
extern bool urlisfileurl(const std::string& url);
/// Return the url of the parent folder, prefixed with file:// for file urls
/// and with http:// for everything else. For non-file urls, the host part is
/// kept when the parent would otherwise be "/".
extern std::string url_parentfolder(const std::string& url);
/// Return the host+path part of an url: the part after the scheme, processed
/// by path_canon(). The input is returned unchanged if no scheme is found.
/// This is not a general routine, it does the right thing only in the recoll
/// context
extern std::string url_gpath(const std::string& url);
/// Turn absolute path into file:// url. A '/' is inserted after the scheme
/// if the path does not begin with one (Windows drive spec).
extern std::string path_pathtofileurl(const std::string& path);
/// Transcode to utf-8 if possible or url encoding, for display.
extern bool printableUrl(const std::string& fcharset,
const std::string& in, std::string& out);

View File

@ -26,7 +26,7 @@
#include "cstr.h"
#include "log.h"
#include "pathut.h"
#include "rclutil.h"
using namespace std;