*** empty log message ***

This commit is contained in:
dockes 2004-12-13 15:42:16 +00:00
parent 063727df38
commit 0786c283ef
4 changed files with 361 additions and 0 deletions

220
src/common/textsplit.cpp Normal file
View File

@ -0,0 +1,220 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <iostream>
#include <string>
using namespace std;
// Character classes: we have three main groups, and then some chars
// are their own class because they want special handling.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
static int charclasses[256];
static void setcharclasses()
{
static int init = 0;
if (init)
return;
int i;
memset(charclasses, LETTER, sizeof(charclasses));
char digits[] = "0123456789";
for (i = 0; i < sizeof(digits); i++)
charclasses[digits[i]] = DIGIT;
char blankspace[] = "\t\v\f ";
for (i = 0; i < sizeof(blankspace); i++)
charclasses[blankspace[i]] = SPACE;
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*";
for (i = 0; i < sizeof(seps); i++)
charclasses[seps[i]] = SPACE;
char special[] = ".@+-,#'\n\r";
for (i = 0; i < sizeof(special); i++)
charclasses[special[i]] = special[i];
init = 1;
}
static void emitterm(string &w, int *posp, bool doerase = true)
{
// Maybe trim end of word. These are chars that we would keep inside
// a word or span, but not at the end
while (w.length() > 0) {
switch (w[w.length()-1]) {
case '.':
case ',':
case '@':
w.erase(w.length()-1);
break;
default:
goto breakloop;
}
}
breakloop:
if (w.length()) {
if (posp)
*posp++;
cout << w << endl;
}
if (doerase)
w.erase();
}
void text_to_words(const string &in)
{
setcharclasses();
string span;
string word;
bool number = false;
int pos = 0;
int spanpos = 0;
for (int i = 0; i < in.length(); i++) {
int c = in[i];
int cc = charclasses[c];
switch (cc) {
case SPACE:
SPACE:
if (word.length()) {
if (span.length() != word.length())
emitterm(span, &spanpos);
emitterm(word, &pos);
number = false;
}
span.erase();
break;
case '-':
case '+':
if (word.length() == 0) {
if (i < in.length() || charclasses[in[i+1]] == DIGIT) {
number = true;
word += c;
span += c;
}
} else {
if (span.length() != word.length())
emitterm(span, &spanpos, false);
emitterm(word, &pos);
number = false;
span += c;
}
break;
case '\'':
case '@':
if (word.length()) {
if (span.length() != word.length())
emitterm(span, &spanpos, false);
emitterm(word, &pos);
number = false;
} else
word += c;
span += c;
break;
case '.':
if (number) {
word += c;
} else {
if (word.length()) {
emitterm(word, &pos);
number = false;
} else
word += c;
}
span += c;
break;
case '#':
// Keep it only at end of word...
if (word.length() > 0 &&
(i == in.length() -1 || charclasses[in[i+1]] == SPACE)) {
word += c;
span += c;
}
break;
case '\n':
case '\r':
if (span.length() && span[span.length() - 1] == '-') {
// if '-' is the last char before end of line, just
// ignore the line change. This is the right thing to
// do almost always. We'd then need a way to check if
// the - was added as part of the sleep or was really there,
// but this would need a dictionary.
} else {
// Handle like a normal separator
goto SPACE;
}
break;
case LETTER:
case DIGIT:
default:
if (word.length() == 0) {
if (cc == DIGIT)
number = true;
else
number = false;
}
word += (char)c;
span += (char)c;
break;
}
}
if (word.length()) {
if (span.length() != word.length())
emitterm(span, &spanpos);
emitterm(word, &pos);
}
}
#if 1 || TEST_TEXTSPLIT
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
int
file_to_string(const string &fn, string &data)
{
int fd = open(fn.c_str(), 0);
if (fd < 0) {
perror("open");
return -1;
}
char buf[4096];
for (;;) {
int n = read(fd, buf, 4096);
if (n < 0) {
perror("read");
close(fd);
return -1;
}
if (n == 0)
break;
data.append(buf, n);
}
close(fd);
return 0;
}
static string teststring =
"jfd@okyz.com "
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
"@^#$(#$(*)"
"one\n\rtwo\nthree-\nfour"
"[olala][ululu]"
;
int main(int argc, char **argv)
{
if (argc == 2) {
string data;
if (file_to_string(argv[1], data) < 0)
exit(1);
text_to_words(data);
} else {
cout << teststring << endl; text_to_words(teststring);
}
}
#endif // TEST

51
src/index/mimetype.cpp Normal file
View File

@ -0,0 +1,51 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <ctype.h>
#include <string>
using std::string;
#include "mimetype.h"
string mimetype(const string &filename, ConfTree *mtypes)
{
// If filename has a suffix and we find it in the map, we're done
string::size_type dot = filename.find_last_of(".");
if (dot != string::npos) {
string suff = filename.substr(dot);
for (int i = 0; i < suff.length(); i++)
suff[i] = tolower(suff[i]);
string mtype;
if (mtypes->get(suff, mtype, ""))
return mtype;
}
// Look at file data
return "";
}
#ifdef _TEST_MIMETYPE_
#include <iostream>
const char *tvec[] = {
"/toto/tutu",
"/",
"toto.txt",
"toto.TXT",
"toto.C.txt",
"toto.C1",
"",
};
const int n = sizeof(tvec) / sizeof(char*);
using namespace std;
int main(int argc, const char **argv)
{
map<string, string>mtypes;
mtypes[".txt"] = "text/plain";
for (int i = 0; i < n; i++) {
cout << tvec[i] << " -> " << mimetype(string(tvec[i]), mtypes) << endl;
}
}
#endif

16
src/index/mimetype.h Normal file
View File

@ -0,0 +1,16 @@
#ifndef _MIMETYPE_H_INCLUDED_
#define _MIMETYPE_H_INCLUDED_
/* @(#$Id: mimetype.h,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include "conftree.h"
/**
* Try to determine a mime type for filename.
* This may imply more than matching the suffix, the name must be usable
* to actually access file data.
*/
string mimetype(const std::string &filename, ConfTree *mtypes);
#endif /* _MIMETYPE_H_INCLUDED_ */

74
src/index/recollindex.cpp Normal file
View File

@ -0,0 +1,74 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.1 2004-12-13 15:42:16 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <iostream>
#include "pathut.h"
#include "conftree.h"
#include "rclconfig.h"
#include "fstreewalk.h"
#include "mimetype.h"
using namespace std;
class DirIndexer {
FsTreeWalker walker;
RclConfig *config;
string topdir;
public:
DirIndexer(RclConfig *cnf, const string &top)
: config(cnf), topdir(top)
{
}
friend FsTreeWalker::Status
indexfile(void *, const std::string &, const struct stat *,
FsTreeWalker::CbFlag);
void index()
{
walker.walk(topdir, indexfile, this);
}
};
FsTreeWalker::Status
indexfile(void *cdata, const std::string &fn,
const struct stat *stp, FsTreeWalker::CbFlag flg)
{
DirIndexer *me = (DirIndexer *)cdata;
if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) {
// Possibly adjust defaults
cout << "indexfile: [" << fn << "]" << endl;
return FsTreeWalker::FtwOk;
}
string mtype = mimetype(fn, me->config->getMimeMap());
if (mtype.length() > 0)
cout << "indexfile: " << mtype << " " << fn << endl;
else
cout << "indexfile: " << "(nomime)" << " " << fn << endl;
}
int main(int argc, const char **argv)
{
RclConfig *config = new RclConfig;
if (!config->ok())
cerr << "Config could not be built" << endl;
ConfTree *conf = config->getConfig();
string topdirs;
if (conf->get("topdirs", topdirs, "") == 0) {
cerr << "No top directories in configuration" << endl;
exit(1);
}
list<string> tdl;
if (ConfTree::stringToStrings(topdirs, tdl)) {
for (list<string>::iterator it = tdl.begin(); it != tdl.end(); it++) {
cout << *it << endl;
DirIndexer indexer(config, *it);
indexer.index();
}
}
}