*** empty log message ***

2004-12-13 15:42:16 +00:00 · 2004-12-13 15:42:16 +00:00 · 0786c283ef
commit 0786c283ef
parent 063727df38
4 changed files with 361 additions and 0 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -0,0 +1,220 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
+#endif
+
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+// Character classes: we have three main groups, and then some chars
+// are their own class because they want special handling.
+//
+// The group values are above 255, so that they can't be equal to a
+// character which is used as its own class value.
+enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
+
+// Class of each possible byte value. The index is the byte taken as an
+// unsigned char. Filled in by setcharclasses().
+static int charclasses[256];
+
+// Store cls as the class of every char in the C string chars. The
+// terminating NUL is not part of the set. Bytes are converted to
+// unsigned char before indexing so that the index can't be negative.
+static void setclass(const char *chars, int cls)
+{
+    for (const char *cp = chars; *cp; cp++)
+	charclasses[(unsigned char)*cp] = cls;
+}
+
+// Initialize the charclasses table. Only the first call does any work.
+// Later assignments override earlier ones, so that a char listed in
+// several sets (',' is both a separator and a special char) ends up with
+// the class from the last set which lists it.
+static void setcharclasses()
+{
+    static int init = 0;
+    if (init)
+	return;
+
+    // Everything is a LETTER unless stated otherwise. This has to be an
+    // element by element fill: memset() works on bytes and would store
+    // the low byte of LETTER (0) everywhere.
+    for (unsigned int i = 0; i < sizeof(charclasses) / sizeof(charclasses[0]); i++)
+	charclasses[i] = LETTER;
+
+    setclass("0123456789", DIGIT);
+    setclass("\t\v\f ", SPACE);
+    setclass("!\"$%&()/<=>[\\]^{|}~:;,*", SPACE);
+
+    // These chars are their own class
+    const char *special = ".@+-,#'\n\r";
+    for (const char *cp = special; *cp; cp++)
+	charclasses[(unsigned char)*cp] = *cp;
+
+    init = 1;
+}
+
+// Output one term (a word or a span) on stdout, followed by a newline.
+//
+// Trailing '.', ',' and '@' are first removed from w: these are chars
+// that we would keep inside a word or span, but not at the end. If
+// nothing is left after trimming, nothing is printed and the counter is
+// not changed.
+//
+// @param w the term. It is trimmed in place, and emptied before
+//    returning if doerase is set.
+// @param posp if not null, points to a counter which is incremented each
+//    time a term is actually printed.
+// @param doerase if true (the default), w is cleared before returning,
+//    else it keeps its trimmed value.
+static void emitterm(string &w, int *posp, bool doerase = true)
+{
+    // Cut after the last char which may end a term, or cut everything if
+    // there is no such char
+    string::size_type last = w.find_last_not_of(".,@");
+    if (last == string::npos)
+	w.erase();
+    else
+	w.erase(last + 1);
+
+    if (w.length()) {
+	// The parentheses are needed: *posp++ would move the local pointer
+	// and leave the counter untouched
+	if (posp)
+	    (*posp)++;
+	cout << w << endl;
+    }
+    if (doerase)
+	w.erase();
+}
+
+// Split text into terms and print them through emitterm().
+//
+// Two strings are built while walking the input. 'word' holds the
+// current simple term. 'span' holds the current run of words together
+// with the chars which link them ('-', '+', ''', '@', '.', '#'). When a
+// word is emitted at a separator or at end of input, the span is emitted
+// first if its length differs from the word's.
+//
+// A '+' or '-' met while no word is in progress only starts a term if
+// the next char is a digit (ex: -10), else it is dropped.
+//
+// @param in the text to split. It is processed one byte at a time.
+void text_to_words(const string &in)
+{
+    setcharclasses();
+    string span;
+    string word;
+    // True while the current word started like a number
+    bool number = false;
+    int pos = 0;
+    int spanpos = 0;
+    for (string::size_type i = 0; i < in.length(); i++) {
+	char c = in[i];
+	// Index through unsigned char: a plain char may be negative for
+	// bytes above 0x7f, and would then index before the table start
+	int cc = charclasses[(unsigned char)c];
+	// Class of the following char, or -1 (which is no class) at end
+	// of input
+	int nextcc = -1;
+	if (i + 1 < in.length())
+	    nextcc = charclasses[(unsigned char)in[i + 1]];
+
+	switch (cc) {
+	case '\n':
+	case '\r':
+	    if (span.length() && span[span.length() - 1] == '-') {
+		// If '-' is the last char before end of line, just
+		// ignore the line change. This is the right thing to
+		// do almost always. We'd then need a way to check if
+		// the - was added as part of the split or was really
+		// there, but this would need a dictionary.
+		break;
+	    }
+	    // Else handle like a normal separator
+	    // FALLTHROUGH
+	case SPACE:
+	    if (word.length()) {
+		if (span.length() != word.length())
+		    emitterm(span, &spanpos);
+		emitterm(word, &pos);
+		number = false;
+	    }
+	    span.erase();
+	    break;
+	case '-':
+	case '+':
+	    if (word.length() == 0) {
+		// A sign does not start a term, except if this looks like
+		// it is going to be a number
+		if (nextcc == DIGIT) {
+		    number = true;
+		    word += c;
+		    span += c;
+		}
+	    } else {
+		// The sign ends the word but the span goes on
+		if (span.length() != word.length())
+		    emitterm(span, &spanpos, false);
+		emitterm(word, &pos);
+		number = false;
+		span += c;
+	    }
+	    break;
+	case '\'':
+	case '@':
+	    if (word.length()) {
+		if (span.length() != word.length())
+		    emitterm(span, &spanpos, false);
+		emitterm(word, &pos);
+		number = false;
+	    } else
+		word += c;
+	    span += c;
+	    break;
+	case '.':
+	    if (number) {
+		// Decimal point: stays inside the number
+		word += c;
+	    } else {
+		if (word.length()) {
+		    emitterm(word, &pos);
+		    number = false;
+		} else 
+		    word += c;
+	    }
+	    span += c;
+	    break;
+	case '#': 
+	    // Keep it only at end of word...
+	    if (word.length() > 0 && (nextcc == -1 || nextcc == SPACE)) {
+		word += c;
+		span += c;
+	    }
+	    break;
+	case LETTER:
+	case DIGIT:
+	default:
+	    // The first char decides if the word is a number
+	    if (word.length() == 0)
+		number = (cc == DIGIT);
+	    word += c;
+	    span += c;
+	    break;
+	}
+    }
+    // Flush what was being built when the input ended
+    if (word.length()) {
+	if (span.length() != word.length())
+	    emitterm(span, &spanpos);
+	emitterm(word, &pos);
+    }
+}
+
+#if 1 || TEST_TEXTSPLIT
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+// Read the whole contents of a file and append them to a string.
+//
+// @param fn path of the file to read.
+// @param data output. The file bytes are appended to it, it is not
+//    cleared first.
+// @return 0 if the whole file was read, -1 if the file could not be
+//    opened or if a read failed. In the error case a message is printed
+//    with perror(), and data may hold the part which could be read.
+int
+file_to_string(const string &fn, string &data)
+{
+    int fd = open(fn.c_str(), O_RDONLY);
+    if (fd < 0) {
+	perror("open");
+	return -1;
+    }
+    char buf[4096];
+    for (;;) {
+	ssize_t n = read(fd, buf, sizeof(buf));
+	if (n < 0) {
+	    // An interrupted read is not an error, just try again
+	    if (errno == EINTR)
+		continue;
+	    perror("read");
+	    close(fd);
+	    return -1;
+	}
+	if (n == 0)
+	    break;
+	data.append(buf, n);
+    }
+    close(fd);
+    return 0;
+}
+
+// Built-in sample input. main() prints it and splits it when the program
+// is not given exactly one argument.
+static string teststring = 
+    "jfd@okyz.com "
+    "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
+    "@^#$(#$(*)"
+    "one\n\rtwo\nthree-\nfour"
+    "[olala][ululu]"
+
+;
+
+// Test driver. With exactly one argument, the argument is taken as the
+// name of a file, and the file contents are split. Else the built-in
+// test string is printed, then split.
+//
+// @return 1 if the file could not be read, else 0.
+int main(int argc, char **argv)
+{
+    if (argc == 2) {
+	string data;
+	// Return instead of calling exit(): the exit status is the same,
+	// and no declaration from outside this file's includes is needed
+	if (file_to_string(argv[1], data) < 0) 
+	    return 1;
+	text_to_words(data);
+    } else {
+	cout << teststring << endl;
+	text_to_words(teststring);
+    }
+    return 0;
+}
+#endif // TEST
+
--- a/src/index/mimetype.cpp
+++ b/src/index/mimetype.cpp
@ -0,0 +1,51 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
+#endif
+
+#include <ctype.h>
+
+#include <string>
+using std::string;
+
+#include "mimetype.h"
+
+// Try to determine a mime type from the suffix of a file name.
+//
+// The suffix is the part of the last path element which starts at its
+// last '.', dot included (ex: ".txt"). It is changed to lower case, then
+// looked up in mtypes.
+//
+// @param filename the file name or path.
+// @param mtypes the table to look the suffix up in. It is dereferenced
+//    without a check as soon as the name has a suffix.
+// @return the value found for the suffix, or an empty string if the name
+//    has no suffix or if the lookup does not succeed (presumably when
+//    the suffix is not in the table, to be confirmed against ConfTree).
+string mimetype(const string &filename, ConfTree *mtypes)
+{
+    // If filename has a suffix and we find it in the map, we're done.
+    // A dot which comes before the last '/' is part of a directory name,
+    // not of a suffix.
+    string::size_type dot = filename.rfind('.');
+    string::size_type slash = filename.rfind('/');
+    if (dot != string::npos && (slash == string::npos || slash < dot)) {
+	string suff = filename.substr(dot);
+	// tolower() must get an unsigned char value: a negative plain
+	// char is not a valid argument
+	for (string::size_type i = 0; i < suff.length(); i++)
+	    suff[i] = tolower((unsigned char)suff[i]);
+
+	string mtype;
+	if (mtypes->get(suff, mtype, ""))
+	    return mtype;
+    }
+    // TODO: look at file data
+    return "";
+}
+
+#ifdef _TEST_MIMETYPE_
+#include <iostream>
+// Sample file names for the test driver: names with no suffix, with
+// lower and upper case suffixes, with several dots, and the empty string.
+const char *tvec[] = {
+    "/toto/tutu",
+    "/",
+    "toto.txt",
+    "toto.TXT",
+    "toto.C.txt",
+    "toto.C1",
+    "",
+};
+// Number of entries in tvec
+const int n = sizeof(tvec) / sizeof(char*);
+using namespace std;
+// Test driver: prints each name from tvec followed by the result of
+// mimetype() for it.
+// NOTE(review): mtypes is a map<string, string> object, while mimetype()
+// is declared with a ConfTree * second parameter, and <map> is not
+// included here. This probably does not compile as is: confirm, and
+// build a ConfTree instead.
+int main(int argc, const char **argv)
+{
+    map<string, string>mtypes;
+    mtypes[".txt"] = "text/plain";
+
+    for (int i = 0; i < n; i++) {
+	cout << tvec[i] << " -> " << mimetype(string(tvec[i]), mtypes) << endl;
+    }
+}
+#endif
--- a/src/index/mimetype.h
+++ b/src/index/mimetype.h
@ -0,0 +1,16 @@
+#ifndef _MIMETYPE_H_INCLUDED_
+#define _MIMETYPE_H_INCLUDED_
+/* @(#$Id: mimetype.h,v 1.1 2004-12-13 15:42:16 dockes Exp $  (C) 2004 J.F.Dockes */
+
+#include <string>
+#include "conftree.h"
+
+
+/**
+ * Try to determine a mime type for filename. 
+ * This may imply more than matching the suffix, the name must be usable
+ * to actually access file data.
+ *
+ * @param filename name or path of the file.
+ * @param mtypes configuration data used to look up the file name suffix.
+ * @return the mime type, or an empty string if none could be determined.
+ */
+std::string mimetype(const std::string &filename, ConfTree *mtypes);
+
+#endif /* _MIMETYPE_H_INCLUDED_ */
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@ -0,0 +1,74 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
+#endif
+
+#include <iostream>
+
+#include "pathut.h"
+#include "conftree.h"
+#include "rclconfig.h"
+#include "fstreewalk.h"
+#include "mimetype.h"
+
+using namespace std;
+
+// Callback for the tree walk, defined further down. It needs an ordinary
+// declaration before the class: the friend declaration alone does not
+// make the name visible at the place where index() passes it to walk().
+FsTreeWalker::Status 
+indexfile(void *, const std::string &, const struct stat *, 
+	  FsTreeWalker::CbFlag);
+
+// Indexer for one directory tree. It walks the tree from the top
+// directory, and indexfile() is called back for the entries, with the
+// DirIndexer object as client data.
+class DirIndexer {
+    FsTreeWalker walker;
+    // Configuration, as received by the constructor. The pointer is only
+    // stored: this class never deletes it.
+    RclConfig *config;
+    // Where the walk starts
+    string topdir;
+ public:
+    // @param cnf configuration, read by indexfile() during the walk.
+    // @param top path of the directory to walk.
+    DirIndexer(RclConfig *cnf, const string &top) 
+	: config(cnf), topdir(top)
+    {
+    }
+    // The callback reads the private config member
+    friend FsTreeWalker::Status 
+      indexfile(void *, const std::string &, const struct stat *, 
+		FsTreeWalker::CbFlag);
+    // Walk the tree. What walk() returns is not checked.
+    void index()
+    {
+	walker.walk(topdir, indexfile, this);
+    }
+};
+
+// Tree walk callback. For now it only prints what it is called for.
+//
+// @param cdata client data: the DirIndexer which started the walk.
+// @param fn path of the current entry.
+// @param stp stat data for the entry, not used here.
+// @param flg FtwDirEnter or FtwDirReturn for directory events. Any other
+//    value is handled as a file to index.
+// @return FsTreeWalker::FtwOk in all cases.
+FsTreeWalker::Status 
+indexfile(void *cdata, const std::string &fn, 
+	  const struct stat *stp, FsTreeWalker::CbFlag flg)
+{
+    DirIndexer *me = static_cast<DirIndexer *>(cdata);
+    if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) {
+	// Possibly adjust defaults
+	cout << "indexfile: [" << fn << "]" << endl;
+	return FsTreeWalker::FtwOk;
+    }
+
+    string mtype = mimetype(fn, me->config->getMimeMap());
+    if (mtype.length() > 0) 
+	cout << "indexfile: " << mtype << " " << fn << endl;
+    else
+	cout << "indexfile: " << "(nomime)" << " " << fn << endl;
+
+    // A status must be returned on this path too: falling off the end of
+    // a function which returns a value is undefined behavior
+    return FsTreeWalker::FtwOk;
+}
+
+
+// Index driver: builds the configuration, reads the "topdirs" value from
+// it, prints each directory from the list, and runs a DirIndexer on it.
+//
+// @return 1 if the configuration can't be built or if it yields no
+//    "topdirs" value, else 0.
+int main(int argc, const char **argv)
+{
+    // Automatic object: it is released on every way out of main()
+    RclConfig config;
+
+    // Nothing below can work without a configuration, so stop here
+    if (!config.ok()) {
+	cerr << "Config could not be built" << endl;
+	return 1;
+    }
+
+    ConfTree *conf = config.getConfig();
+    
+    string topdirs;
+    if (conf->get("topdirs", topdirs, "") == 0) {
+	cerr << "No top directories in configuration" << endl;
+	return 1;
+    }
+    list<string> tdl;
+    if (ConfTree::stringToStrings(topdirs, tdl)) {
+	for (list<string>::iterator it = tdl.begin(); it != tdl.end(); ++it) {
+	    cout << *it << endl;
+	    DirIndexer indexer(&config, *it);
+	    indexer.index();
+	}
+    }
+    return 0;
+}