ptrans: do not transform the translations input by the user (no path_canon()),

these could be foreign (windows x unix). fileurltolocalpath: remove the '/' in front of a windows drive letter path even when not built on Windows. Move all the dubious recoll-specific url code from the generic pathut to rclutil.
2022-05-18 08:47:02 +02:00 · 2022-05-18 08:47:02 +02:00 · be12db218b
commit be12db218b
parent fbfa818a3a
8 changed files with 276 additions and 237 deletions
--- a/src/qtgui/ptrans_w.cpp
+++ b/src/qtgui/ptrans_w.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2006 J.F.Dockes 
+/* Copyright (C) 2006-2022 J.F.Dockes 
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -44,7 +44,7 @@ void EditTrans::init(const string& dbdir)
 {
    m_dbdir = path_canon(dbdir);
    connect(transTW, SIGNAL(itemDoubleClicked(QTableWidgetItem *)),
-        this, SLOT(onItemDoubleClicked(QTableWidgetItem *)));
+            this, SLOT(onItemDoubleClicked(QTableWidgetItem *)));
    connect(cancelPB, SIGNAL(clicked()), this, SLOT(close()));

    QString lab = whatIdxLA->text();
@ -57,18 +57,17 @@ void EditTrans::init(const string& dbdir)

    ConfSimple *conftrans = theconfig->getPTrans();
    if (!conftrans)
-    return;
+        return;

    int row = 0;
    vector<string> opaths = conftrans->getNames(m_dbdir);
-    for (vector<string>::const_iterator it = opaths.begin(); 
-     it != opaths.end(); it++) {
-    transTW->setRowCount(row+1);
-    transTW->setItem(row, 0, new QTableWidgetItem(path2qs(*it)));
-    string npath;
-    conftrans->get(*it, npath, m_dbdir);
-    transTW->setItem(row, 1, new QTableWidgetItem(path2qs(npath)));
-    row++;
+    for (const auto& opath : opaths) {
+        transTW->setRowCount(row+1);
+        transTW->setItem(row, 0, new QTableWidgetItem(path2qs(opath)));
+        string npath;
+        conftrans->get(opath, npath, m_dbdir);
+        transTW->setItem(row, 1, new QTableWidgetItem(path2qs(npath)));
+        row++;
    }

    resize(QSize(640, 300).expandedTo(minimumSizeHint()));
@ -83,18 +82,18 @@ void EditTrans::on_savePB_clicked()
 {
    ConfSimple *conftrans = theconfig->getPTrans();
    if (!conftrans) {
-    QMessageBox::warning(0, "Recoll", tr("Config error"));
-    return;
+        QMessageBox::warning(0, "Recoll", tr("Config error"));
+        return;
    }
    conftrans->holdWrites(true);
    conftrans->eraseKey(m_dbdir);

    for (int row = 0; row < transTW->rowCount(); row++) {
-    QTableWidgetItem *item0 = transTW->item(row, 0);
-    string from = path_canon(qs2path(item0->text()));
-    QTableWidgetItem *item1 = transTW->item(row, 1);
-    string to = path_canon(qs2path(item1->text()));
-    conftrans->set(from, to, m_dbdir);
+        QTableWidgetItem *item0 = transTW->item(row, 0);
+        string from = qs2path(item0->text());
+        QTableWidgetItem *item1 = transTW->item(row, 1);
+        string to = qs2path(item1->text());
+        conftrans->set(from, to, m_dbdir);
    }
    conftrans->holdWrites(false);
    // The rcldb does not use the same configuration object, but a
@ -118,12 +117,12 @@ void EditTrans::on_delPB_clicked()
    QModelIndexList indexes = transTW->selectionModel()->selectedIndexes();
    vector<int> rows;
    for (int i = 0; i < indexes.size(); i++) {
-    rows.push_back(indexes.at(i).row());
+        rows.push_back(indexes.at(i).row());
    }
    sort(rows.begin(), rows.end());
    rows.resize(unique(rows.begin(), rows.end()) - rows.begin());
    for (int i = rows.size()-1; i >= 0; i--) {
-    transTW->removeRow(rows[i]);
+        transTW->removeRow(rows[i]);
    }
 }

@ -131,8 +130,8 @@ void EditTrans::on_transTW_itemSelectionChanged()
 {
    QModelIndexList indexes = transTW->selectionModel()->selectedIndexes();
    if(indexes.size() < 1)
-    delPB->setEnabled(0);
+        delPB->setEnabled(0);
    else 
-    delPB->setEnabled(1);
+        delPB->setEnabled(1);
 }

--- a/src/testmains/Makefile.am
+++ b/src/testmains/Makefile.am
@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
    $(DEFS)

 noinst_PROGRAMS = plaintorich textsplit fstreewalk rclconfig hldata unac mbox \
-    circache wipedir mimetype fileudi x11mon trqrstore ecrontab rcldb
+    circache wipedir mimetype fileudi x11mon trqrstore ecrontab rcldb rclutil

 ecrontab_SOURCES = trecrontab.cpp
 ecrontab_LDADD = ../librecoll.la
@ -68,6 +68,9 @@ rclconfig_LDADD = ../librecoll.la
 rcldb_SOURCES = trrcldb.cpp
 rcldb_LDADD = ../librecoll.la

+rclutil_SOURCES = trrclutil.cpp
+rclutil_LDADD = ../librecoll.la
+
 textsplit_SOURCES = trtextsplit.cpp
 textsplit_LDADD = ../librecoll.la

--- a/src/testmains/trrclutil.cpp
+++ b/src/testmains/trrclutil.cpp
@ -1,6 +1,30 @@

 #include "rclutil.h"

+#include <getopt.h>
+
+#include <iostream>
+#include <map>
+
+#include "pathut.h"
+
+using namespace std;
+
+static std::map<std::string, int> options {
+    {"path_to_thumb", 0},
+    {"url_encode", 0},
+        };
+
+static const char *thisprog;
+static void Usage(void)
+{
+    string sopts;
+    for (const auto& opt: options) {
+        sopts += "--" + opt.first + "\n";
+    }
+    fprintf(stderr, "%s: usage: %s\n%s", thisprog, thisprog, sopts.c_str());
+    exit(1);
+}

 void path_to_thumb(const string& _input)
 {
@ -17,35 +41,52 @@ void path_to_thumb(const string& _input)
    thumbPathForUrl(input, 7, path);
    cout << path << endl;
 }
-
-const char *thisprog;
-
-int main(int argc, const char **argv)
+        
+int main(int argc, char **argv)
 {
-    thisprog = *argv++;
-    argc--;
+    thisprog = *argv;
+    std::vector<struct option> long_options;

-    string s;
-    vector<string>::const_iterator it;
-
-#if 0
-    if (argc > 1) {
-        cerr <<  "Usage: thumbpath <filepath>" << endl;
-        exit(1);
+    for (auto& entry : options) {
+        struct option opt;
+        opt.name = entry.first.c_str();
+        opt.has_arg = 0;
+        opt.flag = &entry.second;
+        opt.val = 1;
+        long_options.push_back(opt);
    }
-    string input;
-    if (argc == 1) {
-        input = *argv++;
-        if (input.empty())  {
-            cerr << "Usage: thumbpath <filepath>" << endl;
-            exit(1);
+    long_options.push_back({0, 0, 0, 0});
+
+    while (getopt_long(argc, argv, "", &long_options[0], nullptr) != -1) {
+    }
+    if (options["path_to_thumb"]) {
+        if (optind >= argc) {
+            cerr <<  "Usage: trrcutil --path_to_thumb <filepath>" << "\n";
+            return 1;
+        }
+        string input = argv[optind];
+        optind++;
+        if (optind != argc) {
+            return 1;
        }
        path_to_thumb(input);
-    } else {
-        while (getline(cin, input)) {
-            path_to_thumb(input);
+    } else if (options["url_encode"]) {
+        if (optind >= argc) {
+            cerr << "Usage: trsmallut --url_encode <arg> [offs=0]\n";
+            return 1;
        }
+        string s = argv[optind];
+        optind++;
+        int offs = 0;
+        if (optind != argc) {
+            offs = atoi(argv[optind]);
+            optind++;
+        }
+        if (optind != argc) {
+            return 1;
+        }
+        cout << "url_encode(" << s << ", " << offs << ") -> [" << url_encode(s, offs) << "]\n";
+    } else {
+        Usage();
    }
-    exit(0);
-#endif
 }
--- a/src/utils/pathut.cpp
+++ b/src/utils/pathut.cpp
@ -1107,179 +1107,6 @@ bool path_access(const std::string& path, int mode)
    return ACCESS(syspath, mode) == 0;
 }

-/* There is a lot of vagueness about what should be percent-encoded or
- * not in a file:// url. The constraint that we have is that we may use
- * the encoded URL to compute (MD5) a thumbnail path according to the
- * freedesktop.org thumbnail spec, which itself does not define what
- * should be escaped. We choose to exactly escape what gio does, as
- * implemented in glib/gconvert.c:g_escape_uri_string(uri, UNSAFE_PATH). 
- * Hopefully, the other desktops have the same set of escaped chars. 
- * Note that $ is not encoded, so the value is not shell-safe.
- */
-string url_encode(const string& url, string::size_type offs)
-{
-    string out = url.substr(0, offs);
-    const char *cp = url.c_str();
-    for (string::size_type i = offs; i < url.size(); i++) {
-        unsigned int c;
-        const char *h = "0123456789ABCDEF";
-        c = cp[i];
-        if (c <= 0x20 ||
-            c >= 0x7f ||
-            c == '"' ||
-            c == '#' ||
-            c == '%' ||
-            c == ';' ||
-            c == '<' ||
-            c == '>' ||
-            c == '?' ||
-            c == '[' ||
-            c == '\\' ||
-            c == ']' ||
-            c == '^' ||
-            c == '`' ||
-            c == '{' ||
-            c == '|' ||
-            c == '}') {
-            out += '%';
-            out += h[(c >> 4) & 0xf];
-            out += h[c & 0xf];
-        } else {
-            out += char(c);
-        }
-    }
-    return out;
-}
-
-static inline int h2d(int c) {
-    if ('0' <= c && c <= '9')
-        return c - '0';
-    else if ('A' <= c && c <= 'F')
-        return 10 + c - 'A';
-    else if ('a' <= c && c <= 'f')
-        return 10 + c - 'a';
-    else 
-        return -1;
-}
-
-string url_decode(const string &in)
-{
-    if (in.size() <= 2)
-        return in;
-    string out;
-    out.reserve(in.size());
-    const char *cp = in.c_str();
-    string::size_type i = 0;
-    for (; i < in.size() - 2; i++) {
-        if (cp[i] == '%') {
-            int d1 = h2d(cp[i+1]);
-            int d2 = h2d(cp[i+2]);
-            if (d1 != -1 && d2 != -1) {
-                out += (d1 << 4) + d2;
-            } else {
-                out += '%';
-                out += cp[i+1];
-                out += cp[i+2];
-            }
-            i += 2;
-        } else {
-            out += cp[i];
-        }
-    }
-    while (i < in.size()) {
-        out += cp[i++];
-    }
-    return out;
-}
-
-string url_gpath(const string& url)
-{
-    // Remove the access schema part (or whatever it's called)
-    string::size_type colon = url.find_first_of(":");
-    if (colon == string::npos || colon == url.size() - 1) {
-        return url;
-    }
-    // If there are non-alphanum chars before the ':', then there
-    // probably is no scheme. Whatever...
-    for (string::size_type i = 0; i < colon; i++) {
-        if (!isalnum(url.at(i))) {
-            return url;
-        }
-    }
-
-    // In addition we canonize the path to remove empty host parts
-    // (for compatibility with older versions of recoll where file://
-    // was hardcoded, but the local path was used for doc
-    // identification.
-    return path_canon(url.substr(colon + 1));
-}
-
-string url_parentfolder(const string& url)
-{
-    // In general, the parent is the directory above the full path
-    string parenturl = path_getfather(url_gpath(url));
-    // But if this is http, make sure to keep the host part. Recoll
-    // only has file or http urls for now.
-    bool isfileurl = urlisfileurl(url);
-    if (!isfileurl && parenturl == "/") {
-        parenturl = url_gpath(url);
-    }
-    return isfileurl ? string("file://") + parenturl :
-        string("http://") + parenturl;
-}
-
-
-// Convert to file path if url is like file:
-// Note: this only works with our internal pseudo-urls which are not
-// encoded/escaped
-string fileurltolocalpath(string url)
-{
-    if (url.find("file://") == 0) {
-        url = url.substr(7, string::npos);
-    } else {
-        return string();
-    }
-
-#ifdef _WIN32
-    // Absolute file urls are like: file:///c:/mydir/...
-    // Get rid of the initial '/'
-    if (url.size() >= 3 && url[0] == '/' && isalpha(url[1]) && url[2] == ':') {
-        url = url.substr(1);
-    }
-#endif
-
-    // Removing the fragment part. This is exclusively used when
-    // executing a viewer for the recoll manual, and we only strip the
-    // part after # if it is preceded by .html
-    string::size_type pos;
-    if ((pos = url.rfind(".html#")) != string::npos) {
-        url.erase(pos + 5);
-    } else if ((pos = url.rfind(".htm#")) != string::npos) {
-        url.erase(pos + 4);
-    }
-
-    return url;
-}
-
-static const string cstr_fileu("file://");
-
-string path_pathtofileurl(const string& path)
-{
-    // We're supposed to receive a canonic absolute path, but on windows we
-    // may need to add a '/' in front of the drive spec
-    string url(cstr_fileu);
-    if (path.empty() || path[0] != '/') {
-        url.push_back('/');
-    }
-    url += path;
-    return url;
-}
-
-bool urlisfileurl(const string& url)
-{
-    return url.find("file://") == 0;
-}
-
 #ifndef NO_STD_REGEX
 static std::regex
 re_uriparse("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?",
--- a/src/utils/pathut.h
+++ b/src/utils/pathut.h
@ -213,22 +213,6 @@ bool path_utimes(const std::string& path, struct path_timeval times[2]);
 * @param mode is an std::fstream mode (ios::in etc.) */
 extern bool path_streamopen(const std::string& path, int mode, std::fstream& outstream);

-/// Encode according to rfc 1738
-extern std::string url_encode(const std::string& url, std::string::size_type offs = 0);
-extern std::string url_decode(const std::string& encoded);
-//// Convert to file path if url is like file://. This modifies the
-//// input (and returns a copy for convenience)
-extern std::string fileurltolocalpath(std::string url);
-/// Test for file:/// url
-extern bool urlisfileurl(const std::string& url);
-///
-extern std::string url_parentfolder(const std::string& url);
-/// Return the host+path part of an url. This is not a general
-/// routine, it does the right thing only in the recoll context
-extern std::string url_gpath(const std::string& url);
-/// Turn absolute path into file:// url
-extern std::string path_pathtofileurl(const std::string& path);
-
 /// URI parser, loosely from rfc2396.txt
 class ParsedUri {
 public:
--- a/src/utils/rclutil.cpp
+++ b/src/utils/rclutil.cpp
@ -293,6 +293,175 @@ const string& path_pkgdatadir()
    return datadir;
 }

+/* There is a lot of vagueness about what should be percent-encoded or
+ * not in a file:// url. The constraint that we have is that we may use
+ * the encoded URL to compute (MD5) a thumbnail path according to the
+ * freedesktop.org thumbnail spec, which itself does not define what
+ * should be escaped. We choose to exactly escape what gio does, as
+ * implemented in glib/gconvert.c:g_escape_uri_string(uri, UNSAFE_PATH). 
+ * Hopefully, the other desktops have the same set of escaped chars. 
+ * Note that $ is not encoded, so the value is not shell-safe.
+ */
+string url_encode(const string& url, string::size_type offs)
+{
+    string out = url.substr(0, offs);
+    const char *cp = url.c_str();
+    for (string::size_type i = offs; i < url.size(); i++) {
+        unsigned int c;
+        const char *h = "0123456789ABCDEF";
+        c = cp[i];
+        if (c <= 0x20 ||
+            c >= 0x7f ||
+            c == '"' ||
+            c == '#' ||
+            c == '%' ||
+            c == ';' ||
+            c == '<' ||
+            c == '>' ||
+            c == '?' ||
+            c == '[' ||
+            c == '\\' ||
+            c == ']' ||
+            c == '^' ||
+            c == '`' ||
+            c == '{' ||
+            c == '|' ||
+            c == '}') {
+            out += '%';
+            out += h[(c >> 4) & 0xf];
+            out += h[c & 0xf];
+        } else {
+            out += char(c);
+        }
+    }
+    return out;
+}
+
+static inline int h2d(int c) {
+    if ('0' <= c && c <= '9')
+        return c - '0';
+    else if ('A' <= c && c <= 'F')
+        return 10 + c - 'A';
+    else if ('a' <= c && c <= 'f')
+        return 10 + c - 'a';
+    else 
+        return -1;
+}
+
+string url_decode(const string &in)
+{
+    if (in.size() <= 2)
+        return in;
+    string out;
+    out.reserve(in.size());
+    const char *cp = in.c_str();
+    string::size_type i = 0;
+    for (; i < in.size() - 2; i++) {
+        if (cp[i] == '%') {
+            int d1 = h2d(cp[i+1]);
+            int d2 = h2d(cp[i+2]);
+            if (d1 != -1 && d2 != -1) {
+                out += (d1 << 4) + d2;
+            } else {
+                out += '%';
+                out += cp[i+1];
+                out += cp[i+2];
+            }
+            i += 2;
+        } else {
+            out += cp[i];
+        }
+    }
+    while (i < in.size()) {
+        out += cp[i++];
+    }
+    return out;
+}
+
+string url_gpath(const string& url)
+{
+    // Remove the access schema part (or whatever it's called)
+    string::size_type colon = url.find_first_of(":");
+    if (colon == string::npos || colon == url.size() - 1) {
+        return url;
+    }
+    // If there are non-alphanum chars before the ':', then there
+    // probably is no scheme. Whatever...
+    for (string::size_type i = 0; i < colon; i++) {
+        if (!isalnum(url.at(i))) {
+            return url;
+        }
+    }
+
+    // In addition we canonize the path to remove empty host parts
+    // (for compatibility with older versions of recoll where file://
+    // was hardcoded, but the local path was used for doc
+    // identification.
+    return path_canon(url.substr(colon + 1));
+}
+
+string url_parentfolder(const string& url)
+{
+    // In general, the parent is the directory above the full path
+    string parenturl = path_getfather(url_gpath(url));
+    // But if this is http, make sure to keep the host part. Recoll
+    // only has file or http urls for now.
+    bool isfileurl = urlisfileurl(url);
+    if (!isfileurl && parenturl == "/") {
+        parenturl = url_gpath(url);
+    }
+    return isfileurl ? string("file://") + parenturl :
+        string("http://") + parenturl;
+}
+
+
+// Convert to file path if url is like file:
+// Note: this only works with our internal pseudo-urls which are not
+// encoded/escaped
+string fileurltolocalpath(string url)
+{
+    if (url.find("file://") == 0) {
+        url = url.substr(7, string::npos);
+    } else {
+        return string();
+    }
+
+    // If this looks like a Windows path: absolute file urls are like: file:///c:/mydir/...
+    // Get rid of the initial '/'
+    if (url.size() >= 3 && url[0] == '/' && isalpha(url[1]) && url[2] == ':') {
+        url = url.substr(1);
+    }
+
+    // Removing the fragment part. This is exclusively used when
+    // executing a viewer for the recoll manual, and we only strip the
+    // part after # if it is preceded by .html
+    string::size_type pos;
+    if ((pos = url.rfind(".html#")) != string::npos) {
+        url.erase(pos + 5);
+    } else if ((pos = url.rfind(".htm#")) != string::npos) {
+        url.erase(pos + 4);
+    }
+
+    return url;
+}
+
+string path_pathtofileurl(const string& path)
+{
+    // We're supposed to receive a canonic absolute path, but on windows we
+    // may need to add a '/' in front of the drive spec
+    string url(cstr_fileu);
+    if (path.empty() || path[0] != '/') {
+        url.push_back('/');
+    }
+    url += path;
+    return url;
+}
+
+bool urlisfileurl(const string& url)
+{
+    return url.find("file://") == 0;
+}
+
 // Printable url: this is used to transcode from the system charset
 // into either utf-8 if transcoding succeeds, or url-encoded
 bool printableUrl(const string& fcharset, const string& in, string& out)
--- a/src/utils/rclutil.h
+++ b/src/utils/rclutil.h
@ -43,6 +43,22 @@ extern const std::string& path_pkgdatadir();
 extern std::string path_thisexecpath();
 #endif

+/// Encode according to rfc 1738
+extern std::string url_encode(const std::string& url, std::string::size_type offs = 0);
+extern std::string url_decode(const std::string& encoded);
+//// Convert to file path if url is like file://. This modifies the
+//// input (and returns a copy for convenience)
+extern std::string fileurltolocalpath(std::string url);
+/// Test for file:/// url
+extern bool urlisfileurl(const std::string& url);
+///
+extern std::string url_parentfolder(const std::string& url);
+/// Return the host+path part of an url. This is not a general
+/// routine, it does the right thing only in the recoll context
+extern std::string url_gpath(const std::string& url);
+/// Turn absolute path into file:// url
+extern std::string path_pathtofileurl(const std::string& path);
+
 /// Transcode to utf-8 if possible or url encoding, for display.
 extern bool printableUrl(const std::string& fcharset,
                         const std::string& in, std::string& out);
--- a/src/utils/strmatcher.cpp
+++ b/src/utils/strmatcher.cpp
@ -26,7 +26,7 @@

 #include "cstr.h"
 #include "log.h"
-#include "pathut.h"
+#include "rclutil.h"

 using namespace std;