warnings cleanup

2004-12-15 15:00:37 +00:00 · 2004-12-15 15:00:37 +00:00 · a43ebc3716
commit a43ebc3716
parent 91df3aef73
11 changed files with 208 additions and 81 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.1 2004-12-14 17:50:28 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <iostream>
@ -10,10 +10,6 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.1 2004-12-14 17:50:28 dockes Ex
 using namespace std;
 ConfTree *getConfig()
 {
 }
 RclConfig::RclConfig()
    : m_ok(false), conf(0), mimemap(0), mimeconf(0)
 {
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -1,6 +1,6 @@
 #ifndef _RCLCONFIG_H_INCLUDED_
 #define _RCLCONFIG_H_INCLUDED_
-/* @(#$Id: rclconfig.h,v 1.1 2004-12-14 17:50:28 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rclconfig.h,v 1.2 2004-12-15 15:00:36 dockes Exp $  (C) 2004 J.F.Dockes */
 #include "conftree.h"
@ -9,18 +9,30 @@ class RclConfig {
    string confdir; // Directory where the files are stored
    ConfTree *conf; // Parsed main configuration
    string keydir;  // Current directory used for parameter fetches.
    string defcharset; // These are stored locally to avoid a config lookup
    string deflang;    // each time.
    // Note: this will have to change if/when we support per directory maps
    ConfTree *mimemap;
    ConfTree *mimeconf;
 public:
    // Let some parameters be accessed directly
    string defcharset; // These are stored locally to avoid a config lookup
    string deflang;    // each time.
    bool   guesscharset;
    // Constructor; its definition lives in rclconfig.cpp.
    RclConfig();
    // Frees the three configuration trees held through conf, mimemap
    // and mimeconf.
    ~RclConfig() {delete conf;delete mimemap;delete mimeconf;}
    // Returns the m_ok status flag.
    bool ok() {return m_ok;}
    // The three getters below hand out the stored tree pointer only
    // while m_ok is set; otherwise they return a null pointer.
    ConfTree *getConfig() {return m_ok ? conf : 0;}
    ConfTree *getMimeMap() {return m_ok ? mimemap : 0;}
    ConfTree *getMimeConf() {return m_ok ? mimeconf : 0;}
    // Select the directory used for subsequent parameter fetches and
    // refresh the locally stored values (default charset, default
    // language, charset guessing flag) for that directory.
    void setKeyDir(const string &dir) 
    {
 	keydir = dir;
 	// Nothing to refresh from when no configuration tree is attached
 	// (same guard as getConfParam).
 	if (conf == 0)
 	    return;
 	conf->get("defaultcharset", defcharset, keydir);
 	conf->get("defaultlanguage", deflang, keydir);
 	// Fetch into a scratch string, keyed on keydir like the two
 	// lookups above. Passing deflang as the output and the empty
 	// scratch string as the key would overwrite the language and
 	// leave the flag computed from an empty string.
 	string str;
 	conf->get("guesscharset", str, keydir);
 	guesscharset = ConfTree::stringToBool(str);
    }
    bool getConfParam(const string &name, string &value) 
    {
 	if (conf == 0)
@ -33,12 +45,6 @@ class RclConfig {
    // Returns a reference to the locally stored default language value.
    const string &getDefLang() {
 	return deflang;
    }
    // Record the directory used for parameter fetches, then reload the
    // default charset and default language values for it from conf.
    // NOTE(review): conf is dereferenced without a null check here --
    // confirm callers only use this after a successful load.
    void setKeyDir(const string &dir) 
    {
 	keydir = dir;
 	conf->get("defaultcharset", defcharset, keydir);
 	conf->get("defaultlanguage", deflang, keydir);
    }
 };
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.2 2004-12-14 17:49:11 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.3 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT
@ -35,24 +35,24 @@ static void setcharclasses()
    static int init = 0;
    if (init)
 	return;
-    int i;
+    unsigned int i;
    memset(charclasses, LETTER, sizeof(charclasses));
    char digits[] = "0123456789";
    for (i = 0; i  < sizeof(digits); i++)
-	charclasses[digits[i]] = DIGIT;
+	charclasses[int(digits[i])] = DIGIT;
    char blankspace[] = "\t\v\f ";
    for (i = 0; i < sizeof(blankspace); i++)
-	charclasses[blankspace[i]] = SPACE;
+	charclasses[int(blankspace[i])] = SPACE;
    char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*";
    for (i = 0; i  < sizeof(seps); i++)
-	charclasses[seps[i]] = SPACE;
+	charclasses[int(seps[i])] = SPACE;
    char special[] = ".@+-,#'\n\r";
    for (i = 0; i  < sizeof(special); i++)
-	charclasses[special[i]] = special[i];
+	charclasses[int(special[i])] = special[i];
    init = 1;
 }
@ -95,7 +95,7 @@ void TextSplit::text_to_words(const string &in)
    int wordpos = 0;
    int spanpos = 0;
-    for (int i = 0; i < in.length(); i++) {
+    for (unsigned int i = 0; i < in.length(); i++) {
 	int c = in[i];
 	int cc = charclasses[c]; 
 	switch (cc) {
@ -114,7 +114,7 @@ void TextSplit::text_to_words(const string &in)
 	case '-':
 	case '+':
 	    if (word.length() == 0) {
-		if (i < in.length() || charclasses[in[i+1]] == DIGIT) {
+		if (i < in.length() || charclasses[int(in[i+1])] == DIGIT) {
 		    number = true;
 		    word += c;
 		    span += c;
@ -155,7 +155,7 @@ void TextSplit::text_to_words(const string &in)
 	case '#': 
 	    // Keep it only at end of word...
 	    if (word.length() > 0 && 
-		(i == in.length() -1 || charclasses[in[i+1]] == SPACE)) {
+		(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE)) {
 		word += c;
 		span += c;
 	    }
--- a/src/index/csguess.cpp
+++ b/src/index/csguess.cpp
@ -1,18 +1,41 @@
 #ifndef lint
-static char	rcsid[] = "@(#$Id: csguess.cpp,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes";
+static char	rcsid[] = "@(#$Id: csguess.cpp,v 1.2 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
-// This code was converted from estraier / qdbm / myconf.c
+
 #ifndef TEST_CSGUESS
 // This code was converted from estraier / qdbm / myconf.c:
 /**************************************************************************
 * Copyright (C) 2000-2004 Mikio Hirabayashi
 * 
 * This file is part of QDBM, Quick Database Manager.  
 * 
 * QDBM is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License or any later
 * version.  QDBM is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 * License for more details.  You should have received a copy of the GNU
 * Lesser General Public License along with QDBM; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA.
 * *********************************************************/
 #include <errno.h>
 #include <iconv.h>
 #include "csguess.h"
 #include <string>
 #include <iostream>
 using std::string;
 #include <iconv.h>
 #include "csguess.h"
 // The values from estraier were 32768, 256, 0.001
-const int ICONVCHECKSIZ = 4000;
+const int ICONVCHECKSIZ = 32768;
-const int ICONVMISSMAX  = 10;
+const int ICONVMISSMAX  = 256;
 const double ICONVALLWRAT = 0.001;
 // Try to transcode and count errors (for charset guessing)
@ -20,17 +43,18 @@ static int transcodeErrCnt(const char *ptr, int size,
 			   const char *icode, const char *ocode)
 {
    iconv_t ic;
-    char obuf[ICONVCHECKSIZ], *wp, *rp;
+    char obuf[2*ICONVCHECKSIZ], *wp, *rp;
    size_t isiz, osiz;
    int miss;
    isiz = size;
-    if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ICONVMISSMAX;
+    if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) 
 	return size;
    miss = 0;
    rp = (char *)ptr;
    while(isiz > 0){
-	osiz = ICONVCHECKSIZ;
+	osiz = 2*ICONVCHECKSIZ;
 	wp = obuf;
-	if(iconv(ic, (const char **)&rp, &isiz, &wp, &osiz) == -1){
+	if(iconv(ic, (const char **)&rp, &isiz, &wp, &osiz) == (size_t)-1){
 	    if(errno == EILSEQ || errno == EINVAL){
 		rp++;
 		isiz--;
@ -38,17 +62,20 @@ static int transcodeErrCnt(const char *ptr, int size,
 		if(miss >= ICONVMISSMAX) 
 		    break;
 	    } else {
 		miss = size;
 		break;
 	    }
 	}
    }
    if(iconv_close(ic) == -1) 
-	return ICONVMISSMAX;
+	return size;
    return miss;
 }
-
+// Try to guess character encoding. This could be optimized quite a
-string csguess(const string &in)
+// lot by avoiding the multiple passes on the document, to be done
 // after usefulness is demonstrated...
 string csguess(const string &in, const string &dflt)
 {
    const char     *hypo;
    int		i, miss;
@ -74,9 +101,10 @@ string csguess(const string &in)
 	    return "UTF-16LE";
    }
-    // Look for iso-2022 specific escape sequences. As iso-2022 begins
+    // Look for iso-2022 (rfc1468) specific escape sequences. As
-    // in ascii, these succeed fast for a japanese text, but are quite
+    // iso-2022 begins in ascii, and typically soon escapes, these
-    // expensive for any other
+    // succeed fast for a japanese text, but are quite expensive for
    // any other
    for (i = 0; i < size - 3; i++) {
 	if (text[i] == 0x1b) {
 	    i++;
@ -131,5 +159,35 @@ string csguess(const string &in)
    if (!hypo && miss / (double)size <= ICONVALLWRAT)
 	hypo = "CP932";
-    return hypo ? hypo : "ISO-8859-1";
+    return hypo ? hypo : dflt;
 }
 #else
 #include <errno.h>
 #include <string>
 #include <iostream>
 using namespace std;
 #include "readfile.h"
 #include "csguess.h"
 // Test driver: read a file and print the charset guessed for its
 // contents, falling back to the charset given on the command line.
 // Usage: trcsguess <filename> <default>
 // Returns 0 on success, 1 on a usage or read error.
 int main(int argc, char **argv)
 {
    // Both arguments are mandatory: argv[2] is read below, so the count
    // (program name included) must be exactly 3.
    if (argc != 3) {
 	cerr << "Usage: trcsguess <filename> <default>" << endl;
 	return 1;
    }
    const string filename = argv[1];
    const string dflt = argv[2];
    string text;
    if (!file_to_string(filename, text)) {
 	cerr << "Couldnt read file, errno " << errno << endl;
 	return 1;
    }
    cout << csguess(text, dflt) << endl;
    // Plain return instead of exit(): no header declaring exit() is
    // included in this section.
    return 0;
 }
 #endif
--- a/src/index/csguess.h
+++ b/src/index/csguess.h
@ -1,12 +1,13 @@
 #ifndef _CSGUESS_H_INCLUDED_
 #define _CSGUESS_H_INCLUDED_
-/* @(#$Id: csguess.h,v 1.1 2004-12-15 08:21:05 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: csguess.h,v 1.2 2004-12-15 15:00:37 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 // Try to guess the character set. This might guess unicode encodings, and
 // some asian charsets, but has no chance, for example, of discriminating
 // between the different iso8859-xx charsets.
-extern std::string csguess(const std::string &in);
+extern std::string csguess(const std::string &in, const std::string &dflt);
 #endif /* _CSGUESS_H_INCLUDED_ */
--- a/src/index/indexer.h
+++ b/src/index/indexer.h
@ -1,12 +1,12 @@
 #ifndef _INDEXER_H_INCLUDED_
 #define _INDEXER_H_INCLUDED_
-/* @(#$Id: indexer.h,v 1.1 2004-12-14 17:53:51 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: indexer.h,v 1.2 2004-12-15 15:00:37 dockes Exp $  (C) 2004 J.F.Dockes */
 #include "rclconfig.h"
 /* Definition for document interner functions */
-typedef Rcl::Doc* (*MimeHandlerFunc)(RclConfig *, const string &, 
+typedef bool (*MimeHandlerFunc)(RclConfig *, const string &, 
-				    const string &);
+				const string &, Rcl::Doc&);
 #if 0
--- a/src/index/mimetype.cpp
+++ b/src/index/mimetype.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.2 2004-12-14 17:54:16 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <ctype.h>
@ -18,7 +18,7 @@ string mimetype(const string &filename, ConfTree *mtypes)
    string::size_type dot = filename.find_last_of(".");
    if (dot != string::npos) {
 	string suff = filename.substr(dot);
-	for (int i = 0; i < suff.length(); i++)
+	for (unsigned int i = 0; i < suff.length(); i++)
 	    suff[i] = tolower(suff[i]);
 	string mtype;
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@ -1,7 +1,9 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.2 2004-12-14 17:54:16 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <sys/stat.h>
 #include <strings.h>
 #include <iostream>
@ -14,25 +16,50 @@ static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.2 2004-12-14 17:54:16 dockes
 #include "rcldb.h"
 #include "readfile.h"
 #include "indexer.h"
 #include "csguess.h"
 #include "transcode.h"
 using namespace std;
-Rcl::Doc* textPlainToDoc(RclConfig *conf, const string &fn, 
+bool textPlainToDoc(RclConfig *conf, const string &fn, 
-			 const string &mtype)
+			 const string &mtype, Rcl::Doc &docout)
 {
-    return 0;
+    string otext;
    if (!file_to_string(fn, otext))
 	return false;
    // Try to guess charset, then convert to utf-8, and fill document fields
    string charset;
    if (conf->guesscharset) {
 	charset = csguess(otext, conf->defcharset);
    } else
 	charset = conf->defcharset;
    string utf8;
    if (transcode(otext, charset, utf8, "UTF-8"))
 	return 0;
    Rcl::Doc out;
    out.origcharset = charset;
    out.text = utf8;
    docout = out;
    return true;
 }
 // Map of mime types to internal interner functions. This could just as well 
 // be an if else if suite inside getMimeHandler(), but this is prettier ?
 static map<string, MimeHandlerFunc> ihandlers;
 // Static object to get the map to be initialized at program start.
 class IHandler_Init {
 public:
    IHandler_Init() {
 	ihandlers["text/plain"] = textPlainToDoc;
 	// Add new associations here when needed
    }
 };
 static IHandler_Init ihandleriniter;
 /**
 * Return handler function for given mime type
 */
@ -75,6 +102,9 @@ MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
    }
 }
 /**
 * Bunch holder for data used while indexing a directory tree
 */
 class DirIndexer {
    FsTreeWalker walker;
    RclConfig *config;
@ -95,23 +125,23 @@ class DirIndexer {
 // Index the tree rooted at topdir: hand the walker the indexfile()
 // callback, with this object as its callback data. The database
 // open and close steps around the walk are currently compiled out.
 void DirIndexer::index()
 {
 #if 0
    // Disabled: open the database in update mode before walking
    if (!db.open(dbdir, Rcl::Db::DbUpd)) {
 	cerr << "Error opening database in " << dbdir << " for " <<
 	    topdir << endl;
 	return;
    }
 #endif
    walker.walk(topdir, indexfile, this);
 #if 0
    // Disabled: close the database once the walk is done
    if (!db.close()) {
 	cerr << "Error closing database in " << dbdir << " for " <<
 	    topdir << endl;
 	return;
    }
 #endif
 }
 /** 
 * This function gets called for every file and directory found by the
 * tree walker. Adjust parameters and index files if/when needed.
 */
 FsTreeWalker::Status 
 indexfile(void *cdata, const std::string &fn, const struct stat *stp, 
 	  FsTreeWalker::CbFlag flg)
@ -144,26 +174,25 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
 	return FsTreeWalker::FtwOk;
    }
-    // Check if file has already been indexed, and has changed since
+    if (!me->db.needUpdate(fn, stp))
-    // - Make path term, 
+	return FsTreeWalker::FtwOk;
    // - query db: postlist_begin->docid
    // - fetch doc (get_document(docid))
    // - check date field, maybe skip
    // Turn file into a document. The document has fields for title, body 
    // etc.,  all text converted to utf8
-    Rcl::Doc *doc = fun(me->config, fn,  mime);
+    Rcl::Doc doc;
    if (!fun(me->config, fn,  mime, doc))
 	return FsTreeWalker::FtwOk;
 #if 0
    // Set up xapian document, add postings and misc fields, 
    // add to or update database.
-    dbadd(doc);
+    if (!me->db.add(fn, doc))
-#endif
+	return FsTreeWalker::FtwError;
    return FsTreeWalker::FtwOk;
 }
 int main(int argc, const char **argv)
 {
    RclConfig *config = new RclConfig;
@ -180,7 +209,7 @@ int main(int argc, const char **argv)
    }
    vector<string> tdl;
    if (ConfTree::stringToStrings(topdirs, tdl)) {
-	for (int i = 0; i < tdl.size(); i++) {
+	for (unsigned int i = 0; i < tdl.size(); i++) {
 	    string topdir = tdl[i];
 	    cout << topdir << endl;
 	    string dbdir;
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,7 +1,9 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.1 2004-12-14 17:50:28 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <sys/stat.h>
 #include <iostream>
 #include <string>
 #include <vector>
@ -27,7 +29,7 @@ class Native {
 // Construct a database wrapper with no native database object attached.
 Rcl::Db::Db() 
 {
-    pdata = new Native;
+    //    pdata = new Native;
+    // With the allocation disabled the pointer must still get a known
+    // value: open() and close() compare it against 0.
+    pdata = 0;
 }
 Rcl::Db::~Db()
@ -56,6 +58,7 @@ Rcl::Db::~Db()
 bool Rcl::Db::open(const string& dir, OpenMode mode)
 {
    return true;
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
@ -89,8 +92,10 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
    }
    return false;
 }
 bool Rcl::Db::close()
 {
    return true;
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
@ -119,3 +124,21 @@ bool Rcl::Db::close()
 	return true;
    return false;
 }
 // Placeholder: reports success without using fn or doc.
 bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
 {
    return true;
 }
 // Tell whether the file needs to be (re)indexed. Currently always
 // answers true, without looking at filename or stp; the intended check
 // is outlined in the note after the return statement.
 bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 {
    return true;
    // TOBEDONE: Check if file has already been indexed, and has changed since
    // - Make path term, 
    // - query db: postlist_begin->docid
    // - fetch doc (get_document(docid))
    // - check date field, maybe skip
 }
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -1,11 +1,25 @@
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.1 2004-12-14 17:50:28 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.2 2004-12-15 15:00:36 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 struct stat;
 namespace Rcl {
 /**
 * Holder for document attributes and data
 */
 // Plain holder for the string attributes of one document.
 // Members are spelled std::string so that this header does not depend
 // on a using-declaration being in effect where it is included.
 class Doc {
 public:
    std::string origcharset; // Charset name recorded for the source data
    std::string title;
    std::string abstract;
    std::string keywords;
    std::string text;        // Document text
 };
 /**
 * Wrapper class for the native database.
 */
@ -17,15 +31,10 @@ class Db {
    enum OpenMode {DbRO, DbUpd, DbTrunc};
    bool open(const std::string &dbdir, OpenMode mode);
    bool close();
    bool add(const string &filename, const Doc &doc);
    bool needUpdate(const string &filename, const struct stat *stp);
 };
 // Plain holder for the string attributes of one document.
 // Members are spelled std::string so that this header does not depend
 // on a using-declaration being in effect where it is included.
 // NOTE(review): a Doc class with an extra origcharset member also
 // appears earlier in this listing -- confirm which one is current.
 class Doc {
 public:
    std::string title;
    std::string abstract;
    std::string keywords;
    std::string text;
 };
 }
--- a/src/utils/transcode.cpp
+++ b/src/utils/transcode.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char	rcsid[] = "@(#$Id: transcode.cpp,v 1.1 2004-12-15 09:43:48 dockes Exp $ (C) 2004 J.F.Dockes";
+static char	rcsid[] = "@(#$Id: transcode.cpp,v 1.2 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TRANSCODE
@ -22,6 +22,7 @@ bool transcode(const string &in, string &out, const string &icode,
    bool ret = false;
    const int OBSIZ = 8192;
    char obuf[OBSIZ], *op;
    bool icopen = false;
    out.erase();
    size_t isiz = in.length();
@ -33,12 +34,13 @@ bool transcode(const string &in, string &out, const string &icode,
 	    + " -> " + ocode;
 	goto error;
    }
    icopen = true;
    while (isiz > 0) {
 	size_t osiz;
 	op = obuf;
 	osiz = OBSIZ;
-	if(iconv(ic, &ip, &isiz, &op, &osiz) == -1 && errno != E2BIG){
+	if(iconv(ic, &ip, &isiz, &op, &osiz) == (size_t)-1 && errno != E2BIG){
 	    out.erase();
 	    out = string("iconv failed for ") + icode + " -> " + ocode +
 		" : " + strerror(errno);
@ -53,8 +55,11 @@ bool transcode(const string &in, string &out, const string &icode,
 	    + " -> " + ocode;
 	goto error;
    }
    icopen = false;
    ret = true;
 error:
    if (icopen)
 	iconv_close(ic);
    return ret;
 }
@ -100,7 +105,7 @@ int main(int argc, char **argv)
 	perror("Open/create output");
 	exit(1);
    }
-    if (write(fd, out.c_str(), out.length()) != out.length()) {
+    if (write(fd, out.c_str(), out.length()) != (int)out.length()) {
 	perror("write");
 	exit(1);
    }