*** empty log message ***

2004-12-13 15:42:16 +00:00 · 2004-12-13 15:42:16 +00:00 · 0786c283ef
commit 0786c283ef
parent 063727df38
4 changed files with 361 additions and 0 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -0,0 +1,220 @@
 // Revision identification string for this file (carries the RCS $Id$
 // keyword). The definition is left out when `lint` is defined.
 #ifndef lint
 static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <iostream>
 #include <string>
 using namespace std;
 // Character classes: we have three main groups, and then some chars
 // are their own class because they want special handling.
 // The group values start at 256 so that they can never be equal to a
 // character that is used as its own class value.
 enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
 // Class of every possible byte value. Filled in by setcharclasses().
 static int charclasses[256];
 // Set cls as the class of each character of the NUL-terminated string
 // chars. The terminator itself is not classified.
 static void setclassfor(const char *chars, int cls)
 {
    for (const char *p = chars; *p != 0; p++)
 	charclasses[static_cast<unsigned char>(*p)] = cls;
 }
 // Initialize the charclasses table. Only the first call does any work.
 static void setcharclasses()
 {
    static int init = 0;
    if (init)
 	return;
    // Anything not listed below is a LETTER. The entries are assigned one
    // by one: memset() works on bytes and cannot store the value 256.
    for (unsigned int i = 0; i < sizeof(charclasses) / sizeof(charclasses[0]); i++)
 	charclasses[i] = LETTER;
    setclassfor("0123456789", DIGIT);
    // White space and plain separators share the SPACE class
    setclassfor("\t\v\f ", SPACE);
    setclassfor("!\"$%&()/<=>[\\]^{|}~:;,*", SPACE);
    // Special characters are their own class. This has to come last: ','
    // is also in the separator list above, and its final class is ','.
    static const char special[] = ".@+-,#'\n\r";
    for (const char *p = special; *p != 0; p++)
 	charclasses[static_cast<unsigned char>(*p)] = *p;
    init = 1;
 }
 // Output one term (a word or a span) on stdout, followed by a newline.
 //  w:       the term. Trailing '.', ',' and '@' characters are removed
 //           from it in place before output.
 //  posp:    if not null, the counter it points to is incremented each
 //           time a non-empty term is output.
 //  doerase: if true (the default), w is emptied before returning.
 // Nothing is output, and the counter is left alone, if w is empty
 // after trimming.
 static void emitterm(std::string &w, int *posp, bool doerase = true)
 {
    // Maybe trim end of word. These are chars that we would keep inside
    // a word or span, but not at the end
    std::string::size_type last = w.find_last_not_of(".,@");
    if (last == std::string::npos)
 	w.erase();
    else
 	w.erase(last + 1);
    if (!w.empty()) {
 	// Increment the counter itself: "*posp++" would move the pointer
 	if (posp)
 	    (*posp)++;
 	std::cout << w << std::endl;
    }
    if (doerase)
 	w.erase();
 }
 // Split the input text into terms and output each of them through
 // emitterm().
 // Two strings are accumulated in parallel while scanning the input:
 //  - word: the current simple word (or number)
 //  - span: the current run of words glued by characters such as
 //    '@', '.', '-', '+' or '\'' (ie: jfd@okyz.com, o'brien).
 // When a word ends, the span is also output if it differs in length
 // from the word.
 void text_to_words(const std::string &in)
 {
    setcharclasses();
    std::string span;
    std::string word;
    // True while the current word started like a number
    bool number = false;
    int pos = 0;
    int spanpos = 0;
    for (std::string::size_type i = 0; i < in.length(); i++) {
 	const char c = in[i];
 	// Index the table with the unsigned byte value: a plain char may
 	// be negative for bytes >= 0x80.
 	int cc = charclasses[static_cast<unsigned char>(c)];
 	if (cc == '\n' || cc == '\r') {
 	    // If '-' is the last char before end of line, just ignore the
 	    // line change. This is the right thing to do almost always.
 	    // Telling a hyphenation dash from a real one would need a
 	    // dictionary.
 	    if (span.length() && span[span.length() - 1] == '-')
 		continue;
 	    // Else handle like a normal separator
 	    cc = SPACE;
 	}
 	switch (cc) {
 	case SPACE:
 	    if (word.length()) {
 		if (span.length() != word.length())
 		    emitterm(span, &spanpos);
 		emitterm(word, &pos);
 		number = false;
 	    }
 	    span.erase();
 	    break;
 	case '-':
 	case '+':
 	    if (word.length() == 0) {
 		// A sign only starts a word if a digit follows (ie: -10).
 		// Else it is dropped.
 		if (i + 1 < in.length() && 
 		    charclasses[static_cast<unsigned char>(in[i+1])] == DIGIT) {
 		    number = true;
 		    word += c;
 		    span += c;
 		}
 	    } else {
 		if (span.length() != word.length())
 		    emitterm(span, &spanpos, false);
 		emitterm(word, &pos);
 		number = false;
 		span += c;
 	    }
 	    break;
 	case '\'':
 	case '@':
 	    if (word.length()) {
 		if (span.length() != word.length())
 		    emitterm(span, &spanpos, false);
 		emitterm(word, &pos);
 		number = false;
 	    } else
 		word += c;
 	    span += c;
 	    break;
 	case '.':
 	    if (number) {
 		// Decimal point inside a number
 		word += c;
 	    } else {
 		if (word.length()) {
 		    emitterm(word, &pos);
 		    number = false;
 		} else
 		    word += c;
 	    }
 	    span += c;
 	    break;
 	case '#':
 	    // Keep it only at end of word...
 	    if (word.length() > 0 && 
 		(i == in.length() - 1 || 
 		 charclasses[static_cast<unsigned char>(in[i+1])] == SPACE)) {
 		word += c;
 		span += c;
 	    }
 	    break;
 	case LETTER:
 	case DIGIT:
 	default:
 	    // Any class without a case of its own (ie: ',') ends up here
 	    // and is accumulated like a letter.
 	    if (word.length() == 0)
 		number = (cc == DIGIT);
 	    word += c;
 	    span += c;
 	    break;
 	}
    }
    // Flush what is left at end of input
    if (word.length()) {
 	if (span.length() != word.length())
 	    emitterm(span, &spanpos);
 	emitterm(word, &pos);
    }
 }
 #if 1 || TEST_TEXTSPLIT
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
 // Append the whole contents of file fn to data.
 // Returns 0 on success. If the file can't be opened or read, a message
 // is printed with perror() and -1 is returned; data then holds whatever
 // was read before the error.
 int
 file_to_string(const std::string &fn, std::string &data)
 {
    int fd = open(fn.c_str(), O_RDONLY);
    if (fd < 0) {
 	perror("open");
 	return -1;
    }
    char buf[4096];
    for (;;) {
 	// read() returns an ssize_t: don't squeeze it into an int
 	ssize_t n = read(fd, buf, sizeof(buf));
 	if (n < 0) {
 	    // An interrupted call is not an error, just try again
 	    if (errno == EINTR)
 		continue;
 	    perror("read");
 	    close(fd);
 	    return -1;
 	}
 	if (n == 0)
 	    break;
 	data.append(buf, static_cast<std::string::size_type>(n));
    }
    close(fd);
    return 0;
 }
 // Built-in sample input: main() prints it and splits it when the
 // program is not called with exactly one argument.
 static string teststring = 
    "jfd@okyz.com "
    "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
    "@^#$(#$(*)"
    "one\n\rtwo\nthree-\nfour"
    "[olala][ululu]"
 ;
 // Test driver. With exactly one argument, the argument is taken as a
 // file name and the file contents are split into words. In all other
 // cases, the built-in test string is printed, then split.
 int main(int argc, char **argv)
 {
    if (argc != 2) {
 	std::cout << teststring << std::endl;
 	text_to_words(teststring);
    } else {
 	std::string contents;
 	const int status = file_to_string(argv[1], contents);
 	// file_to_string() has already printed the reason
 	if (status < 0)
 	    exit(1);
 	text_to_words(contents);
    }
 }
 #endif // TEST
--- a/src/index/mimetype.cpp
+++ b/src/index/mimetype.cpp
@ -0,0 +1,51 @@
 // Revision identification string for this file (carries the RCS $Id$
 // keyword). The definition is left out when `lint` is defined.
 #ifndef lint
 static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <ctype.h>
 #include <string>
 using std::string;
 #include "mimetype.h"
 string mimetype(const string &filename, ConfTree *mtypes)
 {
    // If filename has a suffix and we find it in the map, we're done
    string::size_type dot = filename.find_last_of(".");
    if (dot != string::npos) {
 	string suff = filename.substr(dot);
 	for (int i = 0; i < suff.length(); i++)
 	    suff[i] = tolower(suff[i]);
 	string mtype;
 	if (mtypes->get(suff, mtype, ""))
 	    return mtype;
    }
    // Look at file data
    return "";
 }
 #ifdef _TEST_MIMETYPE_
 #include <iostream>
 // Sample file names for the test driver: no suffix, directory only,
 // lower and upper case suffix, double suffix, unknown suffix, and the
 // empty string.
 const char *tvec[] = {
    "/toto/tutu",
    "/",
    "toto.txt",
    "toto.TXT",
    "toto.C.txt",
    "toto.C1",
    "",
 };
 // Number of entries in tvec
 const int n = sizeof(tvec) / sizeof(char*);
 using namespace std;
 // Test driver, only built when _TEST_MIMETYPE_ is defined: prints each
 // sample name followed by the type returned for it.
 // NOTE(review): mimetype() takes a ConfTree pointer as second argument,
 // but a map object is passed below, and <map> is not included in this
 // file. This driver looks out of date with the function - to be
 // confirmed by trying to build it.
 int main(int argc, const char **argv)
 {
    map<string, string>mtypes;
    mtypes[".txt"] = "text/plain";
    for (int i = 0; i < n; i++) {
 	cout << tvec[i] << " -> " << mimetype(string(tvec[i]), mtypes) << endl;
    }
 }
 #endif
--- a/src/index/mimetype.h
+++ b/src/index/mimetype.h
@ -0,0 +1,16 @@
 #ifndef _MIMETYPE_H_INCLUDED_
 #define _MIMETYPE_H_INCLUDED_
 /* @(#$Id: mimetype.h,v 1.1 2004-12-13 15:42:16 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include "conftree.h"
 /**
 * Try to determine a mime type for filename. 
 * This may imply more than matching the suffix, the name must be usable
 * to actually access file data.
 */
 string mimetype(const std::string &filename, ConfTree *mtypes);
 #endif /* _MIMETYPE_H_INCLUDED_ */
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@ -0,0 +1,74 @@
 // Revision identification string for this file (carries the RCS $Id$
 // keyword). The definition is left out when `lint` is defined.
 #ifndef lint
 static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <iostream>
 #include "pathut.h"
 #include "conftree.h"
 #include "rclconfig.h"
 #include "fstreewalk.h"
 #include "mimetype.h"
 using namespace std;
 // Tree walker callback, defined further down. It has to be declared at
 // namespace scope before the class: the friend declaration inside the
 // class does not make the name visible to the lookup done in index().
 FsTreeWalker::Status 
 indexfile(void *, const std::string &, const struct stat *, 
 	  FsTreeWalker::CbFlag);
 // Indexer for one directory tree: walks the tree under topdir and has
 // indexfile() called back for the entries, with a pointer to this
 // object as callback data.
 class DirIndexer {
    FsTreeWalker walker;
    // Not owned by this object, only the pointer is kept
    RclConfig *config;
    std::string topdir;
 public:
    DirIndexer(RclConfig *cnf, const std::string &top) 
 	: config(cnf), topdir(top)
    {
    }
    // The callback needs to get at the private config pointer
    friend FsTreeWalker::Status 
      indexfile(void *, const std::string &, const struct stat *, 
 		FsTreeWalker::CbFlag);
    // Walk the tree, starting at topdir
    void index()
    {
 	walker.walk(topdir, indexfile, this);
    }
 };
 // Tree walker callback.
 //  cdata: the DirIndexer which started the walk
 //  fn:    name of the current entry
 //  stp:   stat data for the entry (unused for now)
 //  flg:   tells what the call is about
 // For a directory entry or return, the name is printed between square
 // brackets. For anything else, the mime type is computed from the name
 // and printed with the name, "(nomime)" standing for an empty type.
 // Always returns FsTreeWalker::FtwOk.
 FsTreeWalker::Status 
 indexfile(void *cdata, const std::string &fn, 
 	  const struct stat *stp, FsTreeWalker::CbFlag flg)
 {
    DirIndexer *me = static_cast<DirIndexer *>(cdata);
    if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) {
 	// Possibly adjust defaults
 	std::cout << "indexfile: [" << fn << "]" << std::endl;
 	return FsTreeWalker::FtwOk;
    }
    std::string mtype = mimetype(fn, me->config->getMimeMap());
    if (mtype.length() > 0) 
 	std::cout << "indexfile: " << mtype << " " << fn << std::endl;
    else
 	std::cout << "indexfile: " << "(nomime)" << " " << fn << std::endl;
    // A status must be returned on this path too: falling off the end
    // of a non-void function is undefined behaviour
    return FsTreeWalker::FtwOk;
 }
 // Indexer main: build the configuration, get the "topdirs" value from
 // it, split it into a list of directories, then print and index each
 // directory in turn.
 // Returns 1 if the configuration can't be built or has no "topdirs"
 // value, else 0.
 int main(int argc, const char **argv)
 {
    // The configuration is used until the program exits, it is never
    // deleted
    RclConfig *config = new RclConfig;
    if (!config->ok()) {
 	// No point in going further without a configuration
 	std::cerr << "Config could not be built" << std::endl;
 	return 1;
    }
    ConfTree *conf = config->getConfig();
    std::string topdirs;
    if (conf->get("topdirs", topdirs, "") == 0) {
 	std::cerr << "No top directories in configuration" << std::endl;
 	return 1;
    }
    std::list<std::string> tdl;
    if (ConfTree::stringToStrings(topdirs, tdl)) {
 	for (std::list<std::string>::iterator it = tdl.begin(); 
 	     it != tdl.end(); ++it) {
 	    std::cout << *it << std::endl;
 	    DirIndexer indexer(config, *it);
 	    indexer.index();
 	}
    }
    return 0;
 }