diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 8081063c..f19cf3b1 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.24 2006-03-22 14:25:46 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.25 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -20,6 +20,8 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.24 2006-03-22 14:25:46 dockes E #include #include #include +#include + #include #include @@ -140,26 +142,40 @@ bool RclConfig::getConfParam(const std::string &name, bool *bvp) return true; } -// If defcharset was set (from the config or a previous call), use it. -// Else, try to guess it from LANG. -// Use iso8859-1 as ultimate default -// defcharset is reset on setKeyDir() -const string& RclConfig::getDefCharset() +// Get charset to be used for transcoding to utf-8 if unspecified by doc +// For document contents: +// If defcharset was set (from the config or a previous call), use it. +// Else, try to guess it from the locale +// Use iso8859-1 as ultimate default +// defcharset is reset on setKeyDir() +// For filenames, same thing except that we do not use the config file value +// (only the locale). +const string& RclConfig::getDefCharset(bool filename) { - if (defcharset.empty()) { + static string localecharset; // This supposedly never changes + if (localecharset.empty()) { const char *cp; - if ((cp = getenv("LANG"))) { - cp = strrchr(cp, '.'); - if (cp) { - cp++; - if (*cp) - defcharset = string(cp); - } + cp = nl_langinfo(CODESET); + // We don't keep US-ASCII. It's better to use a superset + // Ie: me have a C locale and some french file names, and I + // can't imagine a version of iconv that couldn't translate + // from iso8859? + if (cp && *cp && strcmp(cp, "US-ASCII")) { + localecharset = string(cp); + } else { + localecharset = string("ISO8859-1"); } - if (defcharset.empty()) - defcharset = string("ISO8859-1"); } - return defcharset; + + if (defcharset.empty()) { + defcharset = localecharset; + } + + if (filename) { + return localecharset; + } else { + return defcharset; + } } // Get all known document mime values. We get them from the mimeconf diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 9cd30653..2952023e 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -16,7 +16,7 @@ */ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.17 2006-03-22 14:25:46 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.18 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -53,8 +53,9 @@ class RclConfig { bool getConfParam(const std::string &name, int *value); /** Variant with autoconversion to bool */ bool getConfParam(const std::string &name, bool *value); - /** Get default charset for current keydir (was set during setKeydir) */ - const string &getDefCharset(); + /** Get default charset for current keydir (was set during setKeydir) + * filenames are handled differently */ + const string &getDefCharset(bool filename = false); /** Get guessCharset for current keydir (was set during setKeydir) */ bool getGuessCharset() {return guesscharset;} diff --git a/src/common/rclinit.cpp b/src/common/rclinit.cpp index 52d6f59a..abf11fb0 100644 --- a/src/common/rclinit.cpp +++ b/src/common/rclinit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclinit.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclinit.cpp,v 1.5 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -20,6 +20,7 @@ static char rcsid[] = "@(#$Id: rclinit.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp #include #include +#include #include "debuglog.h" #include "rclconfig.h" @@ -59,6 +60,10 @@ RclConfig *recollinit(void (*cleanup)(void), void (*sigcleanup)(int), int lev = atoi(loglevel.c_str()); DebugLog::getdbl()->setloglevel(lev); } - + + // Make sure the locale is set. This is only for converting file names + // to utf8 for indexation. + setlocale(LC_CTYPE, ""); + return config; } diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index df9f2545..a74c241a 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.26 2006-03-22 16:24:41 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.27 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -195,7 +195,7 @@ bool DbIndexer::indexFiles(const list &filenames) /// /// Accent and majuscule handling are performed by the db module when doing /// the actual indexing work. The Rcl::Doc created by internfile() -/// contains pretty raw utf8 data. +/// mostly contains pretty raw utf8 data. FsTreeWalker::Status DbIndexer::processone(const std::string &fn, const struct stat *stp, FsTreeWalker::CbFlag flg) @@ -239,10 +239,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, doc.ipath = ipath; // File name transcoded to utf8 for indexation. - // We actually might want a separate param for the filename charset - string charset = config->getDefCharset(); - // If this fails, the path won't be indexed, no big deal - transcode(fn, doc.utf8fn, charset, "UTF-8"); + string charset = config->getDefCharset(true); + // If this fails, the file name won't be indexed, no big deal + // Note that we used to do the full path here, but I ended up believing + // that it made more sense to use only the file name + transcode(path_getsimple(fn), doc.utf8fn, charset, "UTF-8"); // Do database-specific work to update document data if (!db.add(fn, doc, stp)) return FsTreeWalker::FtwError; diff --git a/src/qtgui/preview/preview.ui.h b/src/qtgui/preview/preview.ui.h index e11e2a32..b9812880 100644 --- a/src/qtgui/preview/preview.ui.h +++ b/src/qtgui/preview/preview.ui.h @@ -351,11 +351,13 @@ class LoadThread : public QThread { string ipath; string *mtype; string tmpdir; - + int loglevel; public: LoadThread(int *stp, Rcl::Doc *odoc, string fn, string ip, string *mt) : statusp(stp), out(odoc), filename(fn), ipath(ip), mtype(mt) - {} + { + loglevel = DebugLog::getdbl()->getlevel(); + } ~LoadThread() { if (tmpdir.length()) { wipedir(tmpdir); @@ -363,7 +365,7 @@ class LoadThread : public QThread { } } virtual void run() { - DebugLog::getdbl()->setloglevel(DEBDEB1); + DebugLog::getdbl()->setloglevel(loglevel); if (!maketmpdir(tmpdir)) { QMessageBox::critical(0, "Recoll", Preview::tr("Cannot create temporary directory")); @@ -389,14 +391,17 @@ class ToRichThread : public QThread { list &terms; string& firstTerm; QString &out; + int loglevel; public: ToRichThread(string &i, list &trms, string& ft, QString &o) : in(i), terms(trms), firstTerm(ft), out(o) - {} + { + loglevel = DebugLog::getdbl()->getlevel(); + } virtual void run() { - DebugLog::getdbl()->setloglevel(DEBDEB1); + DebugLog::getdbl()->setloglevel(loglevel); string rich; try { plaintorich(in, rich, terms, &firstTerm); diff --git a/src/qtgui/rclreslist.cpp b/src/qtgui/rclreslist.cpp index 63cce4ce..690087fe 100644 --- a/src/qtgui/rclreslist.cpp +++ b/src/qtgui/rclreslist.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclreslist.cpp,v 1.5 2006-03-22 11:17:49 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclreslist.cpp,v 1.6 2006-03-29 11:18:14 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -19,6 +19,8 @@ static char rcsid[] = "@(#$Id: rclreslist.cpp,v 1.5 2006-03-22 11:17:49 dockes E #include "guiutils.h" #include "pathut.h" #include "docseq.h" +#include "transcode.h" +#include "pathut.h" #include "rclreslist.h" #include "moc_rclreslist.cpp" @@ -242,7 +244,7 @@ void RclResList::showResultPage() .arg(resCnt); append(chunk); } - + gotone = true; // Determine icon to display if any @@ -265,9 +267,17 @@ void RclResList::showResultPage() sprintf(perbuf, "%3d%% ", percent); // Make title out of file name if none yet - if (doc.title.empty()) - doc.title = path_getsimple(doc.url); + string fcharset = rclconfig->getDefCharset(true); + if (doc.title.empty()) { + transcode(path_getsimple(doc.url), doc.title, fcharset, "UTF-8"); + } + // Printable url: either utf-8 if transcoding succeeds, or url-encoded + string url; int ecnt = 0; + if (!transcode(doc.url, url, fcharset, "UTF-8", &ecnt) || ecnt) { + url = url_encode(doc.url, 7); + } + // Document date: either doc or file modification time char datebuf[100]; datebuf[0] = 0; @@ -317,7 +327,7 @@ void RclResList::showResultPage() if (!img_name.empty()) { result += ""; } - result += "" + doc.url + +"
"; + result += "" + url + +"
"; if (!abst.empty()) result += abst + "
"; if (!doc.keywords.empty()) @@ -417,6 +427,7 @@ QPopupMenu *RclResList::createPopupMenu(const QPoint& pos) popup->insertItem(tr("&Preview"), this, SLOT(menuPreview())); popup->insertItem(tr("&Edit"), this, SLOT(menuEdit())); popup->insertItem(tr("&Copy File Name"), this, SLOT(menuCopyFN())); + popup->insertItem(tr("Copy &Url"), this, SLOT(menuCopyURL())); return popup; } @@ -437,3 +448,12 @@ void RclResList::menuCopyFN() QClipboard::Selection); } } +void RclResList::menuCopyURL() +{ + Rcl::Doc doc; + if (getDoc(m_docnum, doc)) { + string url = url_encode(doc.url, 7); + QApplication::clipboard()->setText(url.c_str(), + QClipboard::Selection); + } +} diff --git a/src/qtgui/rclreslist.h b/src/qtgui/rclreslist.h index 81954060..da739458 100644 --- a/src/qtgui/rclreslist.h +++ b/src/qtgui/rclreslist.h @@ -1,6 +1,6 @@ #ifndef _RCLRESLIST_H_INCLUDED_ #define _RCLRESLIST_H_INCLUDED_ -/* @(#$Id: rclreslist.h,v 1.3 2006-03-21 15:11:30 dockes Exp $ (C) 2005 J.F.Dockes */ +/* @(#$Id: rclreslist.h,v 1.4 2006-03-29 11:18:14 dockes Exp $ (C) 2005 J.F.Dockes */ #include #include @@ -30,6 +30,7 @@ class RclResList : public QTextBrowser virtual void menuPreview(); virtual void menuEdit(); virtual void menuCopyFN(); + virtual void menuCopyURL(); signals: void nextPageAvailable(bool); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index a7e4c32d..76b6935b 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.58 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.59 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -389,8 +389,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc, // /////// Split and index terms in document body and auxiliary fields string noacc; - // Split and index file path. Do we really want to do this? Or do - // it with the simple file name only ? + // Split and index file name as document term(s) if (dumb_string(doc.utf8fn, noacc)) { splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; @@ -432,7 +431,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc, // Mime type newdocument.add_term("T" + doc.mimetype); - // Path name + // Path name term. This is used for existence/uptodate checks string hash; pathHash(fn, hash, PATHHASHLEN); LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str())); @@ -440,11 +439,11 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc, newdocument.add_term(pathterm); // Simple file name. This is used for file name searches only. We index - // it with a term prefix - string sfn = path_getsimple(doc.utf8fn); - if (dumb_string(sfn, noacc) && !noacc.empty()) { - sfn = string("XSFN") + noacc; - newdocument.add_term(sfn); + // it with a term prefix. utf8fn used to be the full path, but it's now + // the simple file name. + if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) { + noacc = string("XSFN") + noacc; + newdocument.add_term(noacc); } // Internal path: with path, makes unique identifier for documents @@ -1047,16 +1046,15 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts, // with XSFN) from the database. // We build an OR query with the expanded values if any. string pattern; - // We take the data either from allwords or orwords to avoid - // interaction with the allwords checkbox dumb_string(sdata.filename, pattern); - // If pattern is not quoted, we add * at each end: match any - // substring - if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') + // If pattern is not quoted, and has no wildcards, we add * at + // each end: match any substring + if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') { pattern = pattern.substr(1, pattern.size() -2); - else + } else if (pattern.find_first_of("*?[") == string::npos) { pattern = "*" + pattern + "*"; + } // else let it be LOGDEB((" pattern: [%s]\n", pattern.c_str())); diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 21c6b5d0..a39517b8 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.26 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.27 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -53,10 +53,14 @@ class Doc { public: // These fields potentially go into the document data record // We indicate the routine that sets them up during indexing - string url; // Computed from fn by Db::add - string utf8fn; // Transcoded version of the file path. + string url; // This is just "file://" + binary filename. + // No transcoding: this is used to access files + // Computed from fn by Db::add + string utf8fn; // Transcoded version of the simple file name for + // SFN-prefixed specific file name indexation + // Set by DbIndexer::processone + string ipath; // Internal path for multi-doc files. Ascii // Set by DbIndexer::processone - string ipath; // Set by DbIndexer::processone string mimetype; // Set by FileInterner::internfile string fmtime; // File modification time as decimal ascii unix time // Set by DbIndexer::processone diff --git a/src/utils/pathut.cpp b/src/utils/pathut.cpp index 0937de13..d1e28357 100644 --- a/src/utils/pathut.cpp +++ b/src/utils/pathut.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: pathut.cpp,v 1.9 2006-02-02 08:58:11 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: pathut.cpp,v 1.10 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -189,6 +189,41 @@ list path_dirglob(const std::string &dir, return res; } +std::string url_encode(const std::string url, string::size_type offs) +{ + string out = url.substr(0, offs); + const char *cp = url.c_str(); + for (string::size_type i = offs; i < url.size(); i++) { + int c; + char *h = "0123456789ABCDEF"; + c = cp[i]; + if(c <= 0x1f || + c >= 0x7f || + c == '<' || + c == '>' || + c == ' ' || + c == '\t'|| + c == '"' || + c == '#' || + c == '%' || + c == '{' || + c == '}' || + c == '|' || + c == '\\' || + c == '^' || + c == '~'|| + c == '[' || + c == ']' || + c == '`') { + out += '%'; + out += h[(c >> 4) & 0xf]; + out += h[c & 0xf]; + } else { + out += char(c); + } + } + return out; +} #else // TEST_PATHUT diff --git a/src/utils/pathut.h b/src/utils/pathut.h index e749b2f0..8012e9a9 100644 --- a/src/utils/pathut.h +++ b/src/utils/pathut.h @@ -16,7 +16,7 @@ */ #ifndef _PATHUT_H_INCLUDED_ #define _PATHUT_H_INCLUDED_ -/* @(#$Id: pathut.h,v 1.7 2006-03-20 09:54:22 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: pathut.h,v 1.8 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -40,4 +40,8 @@ extern std::string path_canon(const std::string &s); /// Use glob(3) to return a list of file names matching pattern inside dir extern std::list path_dirglob(const std::string &dir, const std::string pattern); +/// Encode according to rfc 1738 +extern std::string url_encode(const std::string url, + std::string::size_type offs); + #endif /* _PATHUT_H_INCLUDED_ */ diff --git a/src/utils/transcode.cpp b/src/utils/transcode.cpp index 411d1188..c145d8cb 100644 --- a/src/utils/transcode.cpp +++ b/src/utils/transcode.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: transcode.cpp,v 1.6 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: transcode.cpp,v 1.7 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -40,14 +40,15 @@ using std::string; #endif bool transcode(const string &in, string &out, const string &icode, - const string &ocode) + const string &ocode, int *ecnt) { iconv_t ic; bool ret = false; const int OBSIZ = 8192; char obuf[OBSIZ], *op; bool icopen = false; - + if (ecnt) + *ecnt = 0; out.erase(); size_t isiz = in.length(); out.reserve(isiz); @@ -79,6 +80,8 @@ bool transcode(const string &in, string &out, const string &icode, ip - in.c_str(), out.length() + OBSIZ - osiz)); out.append(obuf, OBSIZ - osiz); out += "?"; + if (ecnt) + (*ecnt)++; ip++;isiz--; continue; } diff --git a/src/utils/transcode.h b/src/utils/transcode.h index 01e03f59..ec274788 100644 --- a/src/utils/transcode.h +++ b/src/utils/transcode.h @@ -16,7 +16,7 @@ */ #ifndef _TRANSCODE_H_INCLUDED_ #define _TRANSCODE_H_INCLUDED_ -/* @(#$Id: transcode.h,v 1.3 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: transcode.h,v 1.4 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes */ /** * A very minimal c++ized interface to iconv */ @@ -24,6 +24,7 @@ extern bool transcode(const std::string &in, std::string &out, const std::string &icode, - const std::string &ocode); + const std::string &ocode, + int *ecnt = 0); #endif /* _TRANSCODE_H_INCLUDED_ */