try to better handle non-ascii file names

This commit is contained in:
dockes 2006-03-29 11:18:15 +00:00
parent d47f70c595
commit 516a588d04
13 changed files with 159 additions and 65 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.24 2006-03-22 14:25:46 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.25 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -20,6 +20,8 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.24 2006-03-22 14:25:46 dockes E
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <langinfo.h>
#include <sys/types.h>
#include <sys/stat.h>
@ -140,26 +142,40 @@ bool RclConfig::getConfParam(const std::string &name, bool *bvp)
return true;
}
// If defcharset was set (from the config or a previous call), use it.
// Else, try to guess it from LANG.
// Use iso8859-1 as ultimate default
// defcharset is reset on setKeyDir()
const string& RclConfig::getDefCharset()
// Get charset to be used for transcoding to utf-8 if unspecified by doc
// For document contents:
// If defcharset was set (from the config or a previous call), use it.
// Else, try to guess it from the locale
// Use iso8859-1 as ultimate default
// defcharset is reset on setKeyDir()
// For filenames, same thing except that we do not use the config file value
// (only the locale).
const string& RclConfig::getDefCharset(bool filename)
{
if (defcharset.empty()) {
static string localecharset; // This supposedly never changes
if (localecharset.empty()) {
const char *cp;
if ((cp = getenv("LANG"))) {
cp = strrchr(cp, '.');
if (cp) {
cp++;
if (*cp)
defcharset = string(cp);
}
cp = nl_langinfo(CODESET);
// We don't keep US-ASCII: it's better to use a superset.
// E.g.: I may have a C locale and some French file names, and I
// can't imagine a version of iconv that couldn't translate
// from iso8859.
if (cp && *cp && strcmp(cp, "US-ASCII")) {
localecharset = string(cp);
} else {
localecharset = string("ISO8859-1");
}
if (defcharset.empty())
defcharset = string("ISO8859-1");
}
return defcharset;
if (defcharset.empty()) {
defcharset = localecharset;
}
if (filename) {
return localecharset;
} else {
return defcharset;
}
}
// Get all known document mime values. We get them from the mimeconf

View File

@ -16,7 +16,7 @@
*/
#ifndef _RCLCONFIG_H_INCLUDED_
#define _RCLCONFIG_H_INCLUDED_
/* @(#$Id: rclconfig.h,v 1.17 2006-03-22 14:25:46 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rclconfig.h,v 1.18 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes */
#include <list>
@ -53,8 +53,9 @@ class RclConfig {
bool getConfParam(const std::string &name, int *value);
/** Variant with autoconversion to bool */
bool getConfParam(const std::string &name, bool *value);
/** Get default charset for current keydir (was set during setKeydir) */
const string &getDefCharset();
/** Get default charset for current keydir (was set during setKeydir)
* filenames are handled differently */
const string &getDefCharset(bool filename = false);
/** Get guessCharset for current keydir (was set during setKeydir) */
bool getGuessCharset() {return guesscharset;}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclinit.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclinit.cpp,v 1.5 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -20,6 +20,7 @@ static char rcsid[] = "@(#$Id: rclinit.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp
#include <stdio.h>
#include <signal.h>
#include <locale.h>
#include "debuglog.h"
#include "rclconfig.h"
@ -59,6 +60,10 @@ RclConfig *recollinit(void (*cleanup)(void), void (*sigcleanup)(int),
int lev = atoi(loglevel.c_str());
DebugLog::getdbl()->setloglevel(lev);
}
// Make sure the locale is set. This is only for converting file names
// to utf8 for indexation.
setlocale(LC_CTYPE, "");
return config;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.26 2006-03-22 16:24:41 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.27 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -195,7 +195,7 @@ bool DbIndexer::indexFiles(const list<string> &filenames)
///
/// Accent and majuscule handling are performed by the db module when doing
/// the actual indexing work. The Rcl::Doc created by internfile()
/// contains pretty raw utf8 data.
/// mostly contains pretty raw utf8 data.
FsTreeWalker::Status
DbIndexer::processone(const std::string &fn, const struct stat *stp,
FsTreeWalker::CbFlag flg)
@ -239,10 +239,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
doc.ipath = ipath;
// File name transcoded to utf8 for indexation.
// We actually might want a separate param for the filename charset
string charset = config->getDefCharset();
// If this fails, the path won't be indexed, no big deal
transcode(fn, doc.utf8fn, charset, "UTF-8");
string charset = config->getDefCharset(true);
// If this fails, the file name won't be indexed, no big deal
// Note that we used to do the full path here, but I ended up believing
// that it made more sense to use only the file name
transcode(path_getsimple(fn), doc.utf8fn, charset, "UTF-8");
// Do database-specific work to update document data
if (!db.add(fn, doc, stp))
return FsTreeWalker::FtwError;

View File

@ -351,11 +351,13 @@ class LoadThread : public QThread {
string ipath;
string *mtype;
string tmpdir;
int loglevel;
public:
LoadThread(int *stp, Rcl::Doc *odoc, string fn, string ip, string *mt)
: statusp(stp), out(odoc), filename(fn), ipath(ip), mtype(mt)
{}
{
loglevel = DebugLog::getdbl()->getlevel();
}
~LoadThread() {
if (tmpdir.length()) {
wipedir(tmpdir);
@ -363,7 +365,7 @@ class LoadThread : public QThread {
}
}
virtual void run() {
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::getdbl()->setloglevel(loglevel);
if (!maketmpdir(tmpdir)) {
QMessageBox::critical(0, "Recoll",
Preview::tr("Cannot create temporary directory"));
@ -389,14 +391,17 @@ class ToRichThread : public QThread {
list<string> &terms;
string& firstTerm;
QString &out;
int loglevel;
public:
ToRichThread(string &i, list<string> &trms,
string& ft, QString &o)
: in(i), terms(trms), firstTerm(ft), out(o)
{}
{
loglevel = DebugLog::getdbl()->getlevel();
}
virtual void run()
{
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::getdbl()->setloglevel(loglevel);
string rich;
try {
plaintorich(in, rich, terms, &firstTerm);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclreslist.cpp,v 1.5 2006-03-22 11:17:49 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclreslist.cpp,v 1.6 2006-03-29 11:18:14 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <time.h>
@ -19,6 +19,8 @@ static char rcsid[] = "@(#$Id: rclreslist.cpp,v 1.5 2006-03-22 11:17:49 dockes E
#include "guiutils.h"
#include "pathut.h"
#include "docseq.h"
#include "transcode.h"
#include "pathut.h"
#include "rclreslist.h"
#include "moc_rclreslist.cpp"
@ -242,7 +244,7 @@ void RclResList::showResultPage()
.arg(resCnt);
append(chunk);
}
gotone = true;
// Determine icon to display if any
@ -265,9 +267,17 @@ void RclResList::showResultPage()
sprintf(perbuf, "%3d%% ", percent);
// Make title out of file name if none yet
if (doc.title.empty())
doc.title = path_getsimple(doc.url);
string fcharset = rclconfig->getDefCharset(true);
if (doc.title.empty()) {
transcode(path_getsimple(doc.url), doc.title, fcharset, "UTF-8");
}
// Printable url: either utf-8 if transcoding succeeds, or url-encoded
string url; int ecnt = 0;
if (!transcode(doc.url, url, fcharset, "UTF-8", &ecnt) || ecnt) {
url = url_encode(doc.url, 7);
}
// Document date: either doc or file modification time
char datebuf[100];
datebuf[0] = 0;
@ -317,7 +327,7 @@ void RclResList::showResultPage()
if (!img_name.empty()) {
result += "<img source=\"" + img_name + "\" align=\"left\">";
}
result += "<i>" + doc.url + +"</i><br>";
result += "<i>" + url + +"</i><br>";
if (!abst.empty())
result += abst + "<br>";
if (!doc.keywords.empty())
@ -417,6 +427,7 @@ QPopupMenu *RclResList::createPopupMenu(const QPoint& pos)
popup->insertItem(tr("&Preview"), this, SLOT(menuPreview()));
popup->insertItem(tr("&Edit"), this, SLOT(menuEdit()));
popup->insertItem(tr("&Copy File Name"), this, SLOT(menuCopyFN()));
popup->insertItem(tr("Copy &Url"), this, SLOT(menuCopyURL()));
return popup;
}
@ -437,3 +448,12 @@ void RclResList::menuCopyFN()
QClipboard::Selection);
}
}
void RclResList::menuCopyURL()
{
Rcl::Doc doc;
if (getDoc(m_docnum, doc)) {
string url = url_encode(doc.url, 7);
QApplication::clipboard()->setText(url.c_str(),
QClipboard::Selection);
}
}

View File

@ -1,6 +1,6 @@
#ifndef _RCLRESLIST_H_INCLUDED_
#define _RCLRESLIST_H_INCLUDED_
/* @(#$Id: rclreslist.h,v 1.3 2006-03-21 15:11:30 dockes Exp $ (C) 2005 J.F.Dockes */
/* @(#$Id: rclreslist.h,v 1.4 2006-03-29 11:18:14 dockes Exp $ (C) 2005 J.F.Dockes */
#include <qtextbrowser.h>
#include <qpopupmenu.h>
@ -30,6 +30,7 @@ class RclResList : public QTextBrowser
virtual void menuPreview();
virtual void menuEdit();
virtual void menuCopyFN();
virtual void menuCopyURL();
signals:
void nextPageAvailable(bool);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.58 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.59 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -389,8 +389,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc,
// /////// Split and index terms in document body and auxiliary fields
string noacc;
// Split and index file path. Do we really want to do this? Or do
// it with the simple file name only ?
// Split and index file name as document term(s)
if (dumb_string(doc.utf8fn, noacc)) {
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
@ -432,7 +431,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc,
// Mime type
newdocument.add_term("T" + doc.mimetype);
// Path name
// Path name term. This is used for existence/uptodate checks
string hash;
pathHash(fn, hash, PATHHASHLEN);
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
@ -440,11 +439,11 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc,
newdocument.add_term(pathterm);
// Simple file name. This is used for file name searches only. We index
// it with a term prefix
string sfn = path_getsimple(doc.utf8fn);
if (dumb_string(sfn, noacc) && !noacc.empty()) {
sfn = string("XSFN") + noacc;
newdocument.add_term(sfn);
// it with a term prefix. utf8fn used to be the full path, but it's now
// the simple file name.
if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) {
noacc = string("XSFN") + noacc;
newdocument.add_term(noacc);
}
// Internal path: with path, makes unique identifier for documents
@ -1047,16 +1046,15 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts,
// with XSFN) from the database.
// We build an OR query with the expanded values if any.
string pattern;
// We take the data either from allwords or orwords to avoid
// interaction with the allwords checkbox
dumb_string(sdata.filename, pattern);
// If pattern is not quoted, we add * at each end: match any
// substring
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"')
// If pattern is not quoted, and has no wildcards, we add * at
// each end: match any substring
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
pattern = pattern.substr(1, pattern.size() -2);
else
} else if (pattern.find_first_of("*?[") == string::npos) {
pattern = "*" + pattern + "*";
} // else let it be
LOGDEB((" pattern: [%s]\n", pattern.c_str()));

View File

@ -16,7 +16,7 @@
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.26 2006-03-20 16:05:41 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.27 2006-03-29 11:18:14 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -53,10 +53,14 @@ class Doc {
public:
// These fields potentially go into the document data record
// We indicate the routine that sets them up during indexing
string url; // Computed from fn by Db::add
string utf8fn; // Transcoded version of the file path.
string url; // This is just "file://" + binary filename.
// No transcoding: this is used to access files
// Computed from fn by Db::add
string utf8fn; // Transcoded version of the simple file name for
// SFN-prefixed specific file name indexation
// Set by DbIndexer::processone
string ipath; // Internal path for multi-doc files. Ascii
// Set by DbIndexer::processone
string ipath; // Set by DbIndexer::processone
string mimetype; // Set by FileInterner::internfile
string fmtime; // File modification time as decimal ascii unix time
// Set by DbIndexer::processone

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.9 2006-02-02 08:58:11 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.10 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -189,6 +189,41 @@ list<std::string> path_dirglob(const std::string &dir,
return res;
}
// Percent-encode a URL, using the rfc 1738 list of unsafe characters.
//
// @param url  the input string. Taken by value to stay identical to the
//             declaration in pathut.h.
// @param offs number of leading bytes to copy unchanged (callers use this
//             to leave a scheme prefix such as "file://" untouched). If
//             offs is >= url.size(), the whole input is returned as is.
// @return the prefix followed by the rest of url where every control
//         character (<= 0x1f), every byte >= 0x7f, and each of
//         < > space tab " # % { } | \ ^ ~ [ ] ` is replaced by %XX
//         (uppercase hexadecimal). All other bytes are copied unchanged.
std::string url_encode(const std::string url, std::string::size_type offs)
{
    // Must be const: a string literal can't be bound to a plain char *.
    static const char hexdigits[] = "0123456789ABCDEF";

    std::string out = url.substr(0, offs);
    out.reserve(url.size());

    for (std::string::size_type i = offs; i < url.size(); i++) {
        // Work on the unsigned byte value so that neither the range tests
        // nor the nibble extraction depend on the signedness of plain char.
        const unsigned char c = static_cast<unsigned char>(url[i]);

        bool unsafe = (c <= 0x1f || c >= 0x7f);
        if (!unsafe) {
            switch (c) {
            case '<': case '>': case ' ': case '\t': case '"':
            case '#': case '%': case '{': case '}': case '|':
            case '\\': case '^': case '~': case '[': case ']':
            case '`':
                unsafe = true;
                break;
            default:
                break;
            }
        }

        if (unsafe) {
            out += '%';
            out += hexdigits[(c >> 4) & 0xf];
            out += hexdigits[c & 0xf];
        } else {
            out += static_cast<char>(c);
        }
    }
    return out;
}
#else // TEST_PATHUT

View File

@ -16,7 +16,7 @@
*/
#ifndef _PATHUT_H_INCLUDED_
#define _PATHUT_H_INCLUDED_
/* @(#$Id: pathut.h,v 1.7 2006-03-20 09:54:22 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: pathut.h,v 1.8 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -40,4 +40,8 @@ extern std::string path_canon(const std::string &s);
/// Use glob(3) to return a list of file names matching pattern inside dir
extern std::list<std::string> path_dirglob(const std::string &dir,
const std::string pattern);
/// Percent-encode url using the rfc 1738 unsafe character list.
/// The first offs bytes of url are copied unchanged; in the rest, control
/// characters, bytes >= 0x7f and the unsafe ascii punctuation are
/// replaced by %XX (uppercase hexadecimal).
extern std::string url_encode(const std::string url,
std::string::size_type offs);
#endif /* _PATHUT_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: transcode.cpp,v 1.6 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: transcode.cpp,v 1.7 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -40,14 +40,15 @@ using std::string;
#endif
bool transcode(const string &in, string &out, const string &icode,
const string &ocode)
const string &ocode, int *ecnt)
{
iconv_t ic;
bool ret = false;
const int OBSIZ = 8192;
char obuf[OBSIZ], *op;
bool icopen = false;
if (ecnt)
*ecnt = 0;
out.erase();
size_t isiz = in.length();
out.reserve(isiz);
@ -79,6 +80,8 @@ bool transcode(const string &in, string &out, const string &icode,
ip - in.c_str(), out.length() + OBSIZ - osiz));
out.append(obuf, OBSIZ - osiz);
out += "?";
if (ecnt)
(*ecnt)++;
ip++;isiz--;
continue;
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _TRANSCODE_H_INCLUDED_
#define _TRANSCODE_H_INCLUDED_
/* @(#$Id: transcode.h,v 1.3 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: transcode.h,v 1.4 2006-03-29 11:18:15 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* A very minimal c++ized interface to iconv
*/
@ -24,6 +24,7 @@
extern bool transcode(const std::string &in, std::string &out,
const std::string &icode,
const std::string &ocode);
const std::string &ocode,
int *ecnt = 0);
#endif /* _TRANSCODE_H_INCLUDED_ */