warnings cleanup

This commit is contained in:
dockes 2004-12-15 15:00:37 +00:00
parent 91df3aef73
commit a43ebc3716
11 changed files with 208 additions and 81 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.1 2004-12-14 17:50:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <iostream>
@ -10,10 +10,6 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.1 2004-12-14 17:50:28 dockes Ex
using namespace std;
ConfTree *getConfig()
{
}
RclConfig::RclConfig()
: m_ok(false), conf(0), mimemap(0), mimeconf(0)
{

View File

@ -1,6 +1,6 @@
#ifndef _RCLCONFIG_H_INCLUDED_
#define _RCLCONFIG_H_INCLUDED_
/* @(#$Id: rclconfig.h,v 1.1 2004-12-14 17:50:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rclconfig.h,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes */
#include "conftree.h"
@ -9,18 +9,30 @@ class RclConfig {
string confdir; // Directory where the files are stored
ConfTree *conf; // Parsed main configuration
string keydir; // Current directory used for parameter fetches.
string defcharset; // These are stored locally to avoid a config lookup
string deflang; // each time.
// Note: this will have to change if/when we support per directory maps
ConfTree *mimemap;
ConfTree *mimeconf;
public:
// Let some parameters be accessed directly
string defcharset; // These are stored locally to avoid a config lookup
string deflang; // each time.
bool guesscharset;
RclConfig();
~RclConfig() {delete conf;delete mimemap;delete mimeconf;}
bool ok() {return m_ok;}
ConfTree *getConfig() {return m_ok ? conf : 0;}
ConfTree *getMimeMap() {return m_ok ? mimemap : 0;}
ConfTree *getMimeConf() {return m_ok ? mimeconf : 0;}
// Set the directory used as key for subsequent parameter fetches, and
// refresh the locally cached per-directory values (default charset,
// default language, charset guessing flag) from the main configuration.
// Only keydir is updated when no configuration is loaded.
void setKeyDir(const string &dir)
{
    keydir = dir;
    // Same guard as getConfParam(): conf stays null if loading failed
    if (conf == 0)
        return;
    conf->get("defaultcharset", defcharset, keydir);
    conf->get("defaultlanguage", deflang, keydir);
    // Fetch the flag text into its own variable, keyed by keydir like the
    // other lookups, so that deflang is not overwritten by this fetch.
    string str;
    conf->get("guesscharset", str, keydir);
    guesscharset = ConfTree::stringToBool(str);
}
bool getConfParam(const string &name, string &value)
{
if (conf == 0)
@ -33,12 +45,6 @@ class RclConfig {
// Accessor for the cached default language (the value setKeyDir() stores
// in deflang). Returns a reference to the member, not a copy.
const string &getDefLang() { return deflang; }
void setKeyDir(const string &dir)
{
keydir = dir;
conf->get("defaultcharset", defcharset, keydir);
conf->get("defaultlanguage", deflang, keydir);
}
};

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.2 2004-12-14 17:49:11 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.3 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_TEXTSPLIT
@ -35,24 +35,24 @@ static void setcharclasses()
static int init = 0;
if (init)
return;
int i;
unsigned int i;
memset(charclasses, LETTER, sizeof(charclasses));
char digits[] = "0123456789";
for (i = 0; i < sizeof(digits); i++)
charclasses[digits[i]] = DIGIT;
charclasses[int(digits[i])] = DIGIT;
char blankspace[] = "\t\v\f ";
for (i = 0; i < sizeof(blankspace); i++)
charclasses[blankspace[i]] = SPACE;
charclasses[int(blankspace[i])] = SPACE;
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*";
for (i = 0; i < sizeof(seps); i++)
charclasses[seps[i]] = SPACE;
charclasses[int(seps[i])] = SPACE;
char special[] = ".@+-,#'\n\r";
for (i = 0; i < sizeof(special); i++)
charclasses[special[i]] = special[i];
charclasses[int(special[i])] = special[i];
init = 1;
}
@ -95,7 +95,7 @@ void TextSplit::text_to_words(const string &in)
int wordpos = 0;
int spanpos = 0;
for (int i = 0; i < in.length(); i++) {
for (unsigned int i = 0; i < in.length(); i++) {
int c = in[i];
int cc = charclasses[c];
switch (cc) {
@ -114,7 +114,7 @@ void TextSplit::text_to_words(const string &in)
case '-':
case '+':
if (word.length() == 0) {
if (i < in.length() || charclasses[in[i+1]] == DIGIT) {
if (i < in.length() || charclasses[int(in[i+1])] == DIGIT) {
number = true;
word += c;
span += c;
@ -155,7 +155,7 @@ void TextSplit::text_to_words(const string &in)
case '#':
// Keep it only at end of word...
if (word.length() > 0 &&
(i == in.length() -1 || charclasses[in[i+1]] == SPACE)) {
(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE)) {
word += c;
span += c;
}

View File

@ -1,18 +1,41 @@
#ifndef lint
static char rcsid[] = "@(#$Id: csguess.cpp,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: csguess.cpp,v 1.2 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
// This code was converted from estraier / qdbm / myconf.c
#ifndef TEST_CSGUESS
// This code was converted from estraier / qdbm / myconf.c:
/**************************************************************************
* Copyright (C) 2000-2004 Mikio Hirabayashi
*
* This file is part of QDBM, Quick Database Manager.
*
* QDBM is free software; you can redistribute it and/or modify it under the
* terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License or any later
* version. QDBM is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details. You should have received a copy of the GNU
* Lesser General Public License along with QDBM; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
* 02111-1307 USA.
* *********************************************************/
#include <errno.h>
#include <iconv.h>
#include "csguess.h"
#include <string>
#include <iostream>
using std::string;
#include <iconv.h>
#include "csguess.h"
// The values from estraier were 32768, 256, 0.001
const int ICONVCHECKSIZ = 4000;
const int ICONVMISSMAX = 10;
const int ICONVCHECKSIZ = 32768;
const int ICONVMISSMAX = 256;
const double ICONVALLWRAT = 0.001;
// Try to transcode and count errors (for charset guessing)
@ -20,17 +43,18 @@ static int transcodeErrCnt(const char *ptr, int size,
const char *icode, const char *ocode)
{
iconv_t ic;
char obuf[ICONVCHECKSIZ], *wp, *rp;
char obuf[2*ICONVCHECKSIZ], *wp, *rp;
size_t isiz, osiz;
int miss;
isiz = size;
if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ICONVMISSMAX;
if((ic = iconv_open(ocode, icode)) == (iconv_t)-1)
return size;
miss = 0;
rp = (char *)ptr;
while(isiz > 0){
osiz = ICONVCHECKSIZ;
osiz = 2*ICONVCHECKSIZ;
wp = obuf;
if(iconv(ic, (const char **)&rp, &isiz, &wp, &osiz) == -1){
if(iconv(ic, (const char **)&rp, &isiz, &wp, &osiz) == (size_t)-1){
if(errno == EILSEQ || errno == EINVAL){
rp++;
isiz--;
@ -38,17 +62,20 @@ static int transcodeErrCnt(const char *ptr, int size,
if(miss >= ICONVMISSMAX)
break;
} else {
miss = size;
break;
}
}
}
if(iconv_close(ic) == -1)
return ICONVMISSMAX;
return size;
return miss;
}
string csguess(const string &in)
// Try to guess character encoding. This could be optimized quite a
// lot by avoiding the multiple passes on the document, to be done
// after usefulness is demonstrated...
string csguess(const string &in, const string &dflt)
{
const char *hypo;
int i, miss;
@ -74,9 +101,10 @@ string csguess(const string &in)
return "UTF-16LE";
}
// Look for iso-2022 specific escape sequences. As iso-2022 begins
// in ascii, these succeed fast for a japanese text, but are quite
// expensive for any other
// Look for iso-2022 (rfc1468) specific escape sequences. As
// iso-2022 begins in ascii, and typically soon escapes, these
// succeed fast for a japanese text, but are quite expensive for
// any other
for (i = 0; i < size - 3; i++) {
if (text[i] == 0x1b) {
i++;
@ -89,7 +117,7 @@ string csguess(const string &in)
// Try conversions from ascii and utf-8. These are unlikely to succeed
// by mistake.
if (transcodeErrCnt(text, size, "US-ASCII", "UTF-16BE") < 1)
if (transcodeErrCnt(text, size, "US-ASCII", "UTF-16BE") < 1)
return "US-ASCII";
if (transcodeErrCnt(text, size, "UTF-8", "UTF-16BE") < 1)
@ -131,5 +159,35 @@ string csguess(const string &in)
if (!hypo && miss / (double)size <= ICONVALLWRAT)
hypo = "CP932";
return hypo ? hypo : "ISO-8859-1";
return hypo ? hypo : dflt;
}
#else
#include <errno.h>
#include <string>
#include <iostream>
using namespace std;
#include "readfile.h"
#include "csguess.h"
// Test driver for csguess(): reads the file named by the first argument,
// and prints the guessed character set, using the second argument as the
// value to fall back on.
// Returns 0 on success, 1 on usage error or if the file can't be read.
int main(int argc, char **argv)
{
    // Two arguments are required (program name + filename + default):
    // argv[2] is read below, so anything but argc == 3 is a usage error.
    if (argc != 3) {
        cerr << "Usage: trcsguess <filename> <default>" << endl;
        return 1;
    }
    const string filename = argv[1];
    const string dflt = argv[2];

    string text;
    if (!file_to_string(filename, text)) {
        cerr << "Couldnt read file, errno " << errno << endl;
        return 1;
    }

    cout << csguess(text, dflt) << endl;
    return 0;
}
#endif

View File

@ -1,12 +1,13 @@
#ifndef _CSGUESS_H_INCLUDED_
#define _CSGUESS_H_INCLUDED_
/* @(#$Id: csguess.h,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: csguess.h,v 1.2 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
// Try to guess the character set. This might guess unicode encodings, and
// some asian charsets, but has no chance, for example, of discriminating
// between the different iso8859-xx charsets.
extern std::string csguess(const std::string &in);
extern std::string csguess(const std::string &in, const std::string &dflt);
#endif /* _CSGUESS_H_INCLUDED_ */

View File

@ -1,12 +1,12 @@
#ifndef _INDEXER_H_INCLUDED_
#define _INDEXER_H_INCLUDED_
/* @(#$Id: indexer.h,v 1.1 2004-12-14 17:53:51 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: indexer.h,v 1.2 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes */
#include "rclconfig.h"
/* Definition for document interner functions */
typedef Rcl::Doc* (*MimeHandlerFunc)(RclConfig *, const string &,
const string &);
typedef bool (*MimeHandlerFunc)(RclConfig *, const string &,
const string &, Rcl::Doc&);
#if 0

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.2 2004-12-14 17:54:16 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <ctype.h>
@ -18,7 +18,7 @@ string mimetype(const string &filename, ConfTree *mtypes)
string::size_type dot = filename.find_last_of(".");
if (dot != string::npos) {
string suff = filename.substr(dot);
for (int i = 0; i < suff.length(); i++)
for (unsigned int i = 0; i < suff.length(); i++)
suff[i] = tolower(suff[i]);
string mtype;

View File

@ -1,7 +1,9 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.2 2004-12-14 17:54:16 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <sys/stat.h>
#include <strings.h>
#include <iostream>
@ -14,25 +16,50 @@ static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.2 2004-12-14 17:54:16 dockes
#include "rcldb.h"
#include "readfile.h"
#include "indexer.h"
#include "csguess.h"
#include "transcode.h"
using namespace std;
Rcl::Doc* textPlainToDoc(RclConfig *conf, const string &fn,
const string &mtype)
// Handler for plain text files: read the file, determine its character
// set, convert the contents to UTF-8 and store the result in docout.
//  conf:   configuration, supplies guesscharset and defcharset
//  fn:     path of the file to read
//  mtype:  mime type (not used by this handler, present because the
//          handler function signature requires it)
//  docout: receives the document; only assigned on success
// Returns true on success, false if the file can't be read or converted.
bool textPlainToDoc(RclConfig *conf, const string &fn,
                    const string &mtype, Rcl::Doc &docout)
{
    string otext;
    if (!file_to_string(fn, otext))
        return false;

    // Source charset: guessed from the data if the configuration asks for
    // it (the configured default is the guesser's fallback), else the
    // configured default.
    string charset;
    if (conf->guesscharset) {
        charset = csguess(otext, conf->defcharset);
    } else
        charset = conf->defcharset;

    // transcode() takes (in, out, input charset, output charset) and
    // returns true on success: a false return is the failure case.
    string utf8;
    if (!transcode(otext, utf8, charset, "UTF-8"))
        return false;

    Rcl::Doc out;
    out.origcharset = charset;
    out.text = utf8;
    docout = out;
    return true;
}
// Map of mime types to internal interner functions. This could just as well
// be an if else if suite inside getMimeHandler(), but this is prettier ?
static map<string, MimeHandlerFunc> ihandlers;
// Helper type whose constructor fills the ihandlers map. The single
// file-scope instance declared right after it gets the map set up
// at program start.
struct IHandler_Init {
    IHandler_Init()
    {
        // Add new associations here when needed
        ihandlers["text/plain"] = &textPlainToDoc;
    }
};
static IHandler_Init ihandleriniter;
/**
* Return handler function for given mime type
*/
@ -75,6 +102,9 @@ MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
}
}
/**
* Bunch holder for data used while indexing a directory tree
*/
class DirIndexer {
FsTreeWalker walker;
RclConfig *config;
@ -95,23 +125,23 @@ class DirIndexer {
void DirIndexer::index()
{
#if 0
if (!db.open(dbdir, Rcl::Db::DbUpd)) {
cerr << "Error opening database in " << dbdir << " for " <<
topdir << endl;
return;
}
#endif
walker.walk(topdir, indexfile, this);
#if 0
if (!db.close()) {
cerr << "Error closing database in " << dbdir << " for " <<
topdir << endl;
return;
}
#endif
}
/**
* This function gets called for every file and directory found by the
* tree walker. Adjust parameters and index files if/when needed.
*/
FsTreeWalker::Status
indexfile(void *cdata, const std::string &fn, const struct stat *stp,
FsTreeWalker::CbFlag flg)
@ -144,26 +174,25 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
return FsTreeWalker::FtwOk;
}
// Check if file has already been indexed, and has changed since
// - Make path term,
// - query db: postlist_begin->docid
// - fetch doc (get_document(docid)
// - check date field, maybe skip
if (!me->db.needUpdate(fn, stp))
return FsTreeWalker::FtwOk;
// Turn file into a document. The document has fields for title, body
// etc., all text converted to utf8
Rcl::Doc *doc = fun(me->config, fn, mime);
Rcl::Doc doc;
if (!fun(me->config, fn, mime, doc))
return FsTreeWalker::FtwOk;
#if 0
// Set up xapian document, add postings and misc fields,
// add to or update database.
dbadd(doc);
#endif
if (!me->db.add(fn, doc))
return FsTreeWalker::FtwError;
return FsTreeWalker::FtwOk;
}
int main(int argc, const char **argv)
{
RclConfig *config = new RclConfig;
@ -180,7 +209,7 @@ int main(int argc, const char **argv)
}
vector<string> tdl;
if (ConfTree::stringToStrings(topdirs, tdl)) {
for (int i = 0; i < tdl.size(); i++) {
for (unsigned int i = 0; i < tdl.size(); i++) {
string topdir = tdl[i];
cout << topdir << endl;
string dbdir;

View File

@ -1,7 +1,9 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.1 2004-12-14 17:50:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <sys/stat.h>
#include <iostream>
#include <string>
#include <vector>
@ -27,7 +29,7 @@ class Native {
Rcl::Db::Db()
{
pdata = new Native;
// pdata = new Native;
}
Rcl::Db::~Db()
@ -56,6 +58,7 @@ Rcl::Db::~Db()
bool Rcl::Db::open(const string& dir, OpenMode mode)
{
return true;
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
@ -89,8 +92,10 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
}
return false;
}
bool Rcl::Db::close()
{
return true;
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
@ -119,3 +124,21 @@ bool Rcl::Db::close()
return true;
return false;
}
// Placeholder: neither fn nor doc is used yet, the function just reports
// success.
// NOTE(review): presumably meant to add/update the index entry for file
// fn from the contents of doc -- confirm when implementing.
bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
{
return true;
}
// Placeholder: always returns true, whatever the file name and stat data
// passed in. The steps planned for the real check are listed in the body.
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
{
return true;
// TOBEDONE: Check if file has already been indexed, and has changed since
// - Make path term,
// - query db: postlist_begin->docid
// - fetch doc (get_document(docid))
// - check date field, maybe skip
}

View File

@ -1,11 +1,25 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.1 2004-12-14 17:50:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
struct stat;
namespace Rcl {
/**
 * Holder for document attributes and data.
 *
 * Plain aggregate of strings, all public and default-constructed empty.
 * The members are spelled std::string so that this header does not depend
 * on a using-declaration being in effect where it is included.
 */
class Doc {
public:
    std::string origcharset; // Character set name recorded for the source data
    std::string title;
    std::string abstract;
    std::string keywords;
    std::string text;        // Document text
};
/**
* Wrapper class for the native database.
*/
@ -17,15 +31,10 @@ class Db {
enum OpenMode {DbRO, DbUpd, DbTrunc};
bool open(const std::string &dbdir, OpenMode mode);
bool close();
bool add(const string &filename, const Doc &doc);
bool needUpdate(const string &filename, const struct stat *stp);
};
class Doc {
public:
string title;
string abstract;
string keywords;
string text;
};
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: transcode.cpp,v 1.1 2004-12-15 09:43:48 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: transcode.cpp,v 1.2 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_TRANSCODE
@ -22,6 +22,7 @@ bool transcode(const string &in, string &out, const string &icode,
bool ret = false;
const int OBSIZ = 8192;
char obuf[OBSIZ], *op;
bool icopen = false;
out.erase();
size_t isiz = in.length();
@ -33,12 +34,13 @@ bool transcode(const string &in, string &out, const string &icode,
+ " -> " + ocode;
goto error;
}
icopen = true;
while (isiz > 0) {
size_t osiz;
op = obuf;
osiz = OBSIZ;
if(iconv(ic, &ip, &isiz, &op, &osiz) == -1 && errno != E2BIG){
if(iconv(ic, &ip, &isiz, &op, &osiz) == (size_t)-1 && errno != E2BIG){
out.erase();
out = string("iconv failed for ") + icode + " -> " + ocode +
" : " + strerror(errno);
@ -53,8 +55,11 @@ bool transcode(const string &in, string &out, const string &icode,
+ " -> " + ocode;
goto error;
}
icopen = false;
ret = true;
error:
if (icopen)
iconv_close(ic);
return ret;
}
@ -100,7 +105,7 @@ int main(int argc, char **argv)
perror("Open/create output");
exit(1);
}
if (write(fd, out.c_str(), out.length()) != out.length()) {
if (write(fd, out.c_str(), out.length()) != (int)out.length()) {
perror("write");
exit(1);
}