*** empty log message ***
This commit is contained in:
parent
a43ebc3716
commit
869b57eb8c
@ -1,6 +1,6 @@
|
|||||||
#ifndef _RCLCONFIG_H_INCLUDED_
|
#ifndef _RCLCONFIG_H_INCLUDED_
|
||||||
#define _RCLCONFIG_H_INCLUDED_
|
#define _RCLCONFIG_H_INCLUDED_
|
||||||
/* @(#$Id: rclconfig.h,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rclconfig.h,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include "conftree.h"
|
#include "conftree.h"
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ class RclConfig {
|
|||||||
conf->get("defaultcharset", defcharset, keydir);
|
conf->get("defaultcharset", defcharset, keydir);
|
||||||
conf->get("defaultlanguage", deflang, keydir);
|
conf->get("defaultlanguage", deflang, keydir);
|
||||||
string str;
|
string str;
|
||||||
conf->get("guesscharset", deflang, str);
|
conf->get("guesscharset", str, keydir);
|
||||||
guesscharset = ConfTree::stringToBool(str);
|
guesscharset = ConfTree::stringToBool(str);
|
||||||
}
|
}
|
||||||
bool getConfParam(const string &name, string &value)
|
bool getConfParam(const string &name, string &value)
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.3 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#ifndef TEST_TEXTSPLIT
|
#ifndef TEST_TEXTSPLIT
|
||||||
|
|
||||||
@ -57,8 +57,11 @@ static void setcharclasses()
|
|||||||
init = 1;
|
init = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void TextSplit::emitterm(string &w, int pos, bool doerase = true)
|
bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
|
||||||
{
|
{
|
||||||
|
if (!termsink)
|
||||||
|
return false;
|
||||||
|
|
||||||
// Maybe trim end of word. These are chars that we would keep inside
|
// Maybe trim end of word. These are chars that we would keep inside
|
||||||
// a word or span, but not at the end
|
// a word or span, but not at the end
|
||||||
while (w.length() > 0) {
|
while (w.length() > 0) {
|
||||||
@ -73,12 +76,13 @@ void TextSplit::emitterm(string &w, int pos, bool doerase = true)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
breakloop:
|
breakloop:
|
||||||
if (w.length()) {
|
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
|
||||||
if (termsink)
|
bool ret = termsink(cdata, w, pos);
|
||||||
termsink(cdata, w, pos);
|
if (doerase)
|
||||||
|
w.erase();
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
if (doerase)
|
return true;
|
||||||
w.erase();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -86,7 +90,7 @@ void TextSplit::emitterm(string &w, int pos, bool doerase = true)
|
|||||||
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
|
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
|
||||||
* are handled properly,
|
* are handled properly,
|
||||||
*/
|
*/
|
||||||
void TextSplit::text_to_words(const string &in)
|
bool TextSplit::text_to_words(const string &in)
|
||||||
{
|
{
|
||||||
setcharclasses();
|
setcharclasses();
|
||||||
string span;
|
string span;
|
||||||
@ -103,9 +107,11 @@ void TextSplit::text_to_words(const string &in)
|
|||||||
SPACE:
|
SPACE:
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (span.length() != word.length()) {
|
if (span.length() != word.length()) {
|
||||||
emitterm(span, spanpos);
|
if (!emitterm(span, spanpos))
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
emitterm(word, wordpos++);
|
if (!emitterm(word, wordpos++))
|
||||||
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
spanpos = wordpos;
|
spanpos = wordpos;
|
||||||
@ -121,9 +127,11 @@ void TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (span.length() != word.length()) {
|
if (span.length() != word.length()) {
|
||||||
emitterm(span, spanpos, false);
|
if (!emitterm(span, spanpos, false))
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
emitterm(word, wordpos++);
|
if (!emitterm(word, wordpos++))
|
||||||
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += c;
|
span += c;
|
||||||
}
|
}
|
||||||
@ -132,9 +140,11 @@ void TextSplit::text_to_words(const string &in)
|
|||||||
case '@':
|
case '@':
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (span.length() != word.length()) {
|
if (span.length() != word.length()) {
|
||||||
emitterm(span, spanpos, false);
|
if (!emitterm(span, spanpos, false))
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
emitterm(word, wordpos++);
|
if (!emitterm(word, wordpos++))
|
||||||
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
} else
|
} else
|
||||||
word += c;
|
word += c;
|
||||||
@ -145,7 +155,8 @@ void TextSplit::text_to_words(const string &in)
|
|||||||
word += c;
|
word += c;
|
||||||
} else {
|
} else {
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
emitterm(word, wordpos++);
|
if (!emitterm(word, wordpos++))
|
||||||
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
} else
|
} else
|
||||||
word += c;
|
word += c;
|
||||||
@ -155,7 +166,8 @@ void TextSplit::text_to_words(const string &in)
|
|||||||
case '#':
|
case '#':
|
||||||
// Keep it only at end of word...
|
// Keep it only at end of word...
|
||||||
if (word.length() > 0 &&
|
if (word.length() > 0 &&
|
||||||
(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE)) {
|
(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE ||
|
||||||
|
in[i+1] == '\n' || in[i+1] == '\r')) {
|
||||||
word += c;
|
word += c;
|
||||||
span += c;
|
span += c;
|
||||||
}
|
}
|
||||||
@ -190,9 +202,11 @@ void TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (span.length() != word.length())
|
if (span.length() != word.length())
|
||||||
emitterm(span, spanpos);
|
if (!emitterm(span, spanpos))
|
||||||
emitterm(word, wordpos);
|
return false;
|
||||||
|
return emitterm(word, wordpos);
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // TEST driver ->
|
#else // TEST driver ->
|
||||||
@ -208,10 +222,10 @@ void TextSplit::text_to_words(const string &in)
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
int termsink(void *, const string &term, int pos)
|
bool termsink(void *, const string &term, int pos)
|
||||||
{
|
{
|
||||||
cout << pos << " " << term << endl;
|
cout << pos << " " << term << endl;
|
||||||
return 0;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.1 2004-12-14 17:49:11 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.2 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -12,20 +12,22 @@
|
|||||||
*/
|
*/
|
||||||
class TextSplit {
|
class TextSplit {
|
||||||
public:
|
public:
|
||||||
typedef int (*TermSink)(void *cdata, const std::string & term, int pos);
|
typedef bool (*TermSink)(void *cdata, const std::string & term, int pos);
|
||||||
private:
|
private:
|
||||||
TermSink termsink;
|
TermSink termsink;
|
||||||
void *cdata;
|
void *cdata;
|
||||||
void emitterm(std::string &term, int pos, bool doerase);
|
int maxWordLength;
|
||||||
|
bool emitterm(std::string &term, int pos, bool doerase);
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Constructor: just store callback and client data
|
* Constructor: just store callback and client data
|
||||||
*/
|
*/
|
||||||
TextSplit(TermSink t, void *c) : termsink(t), cdata(c) {}
|
TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40)
|
||||||
|
{}
|
||||||
/**
|
/**
|
||||||
* Split text, emit words and positions.
|
* Split text, emit words and positions.
|
||||||
*/
|
*/
|
||||||
void text_to_words(const std::string &in);
|
bool text_to_words(const std::string &in);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -29,20 +29,31 @@ bool textPlainToDoc(RclConfig *conf, const string &fn,
|
|||||||
if (!file_to_string(fn, otext))
|
if (!file_to_string(fn, otext))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Try to guess charset, then convert to utf-8, and fill document fields
|
// Try to guess charset, then convert to utf-8, and fill document
|
||||||
|
// fields. The charset guesser really doesn't work well in general
|
||||||
|
// and should be avoided (especially for short documents)
|
||||||
string charset;
|
string charset;
|
||||||
if (conf->guesscharset) {
|
if (conf->guesscharset) {
|
||||||
charset = csguess(otext, conf->defcharset);
|
charset = csguess(otext, conf->defcharset);
|
||||||
} else
|
} else
|
||||||
charset = conf->defcharset;
|
charset = conf->defcharset;
|
||||||
string utf8;
|
string utf8;
|
||||||
if (transcode(otext, charset, utf8, "UTF-8"))
|
cerr << "textPlainToDoc: transcod from " << charset << " to UTF-8"
|
||||||
|
<< endl;
|
||||||
|
|
||||||
|
if (!transcode(otext, utf8, charset, "UTF-8")) {
|
||||||
|
cerr << "textPlainToDoc: transcode failed: charset '" << charset
|
||||||
|
<< "' to UTF-8: "<< utf8 << endl;
|
||||||
|
otext.erase();
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
Rcl::Doc out;
|
Rcl::Doc out;
|
||||||
out.origcharset = charset;
|
out.origcharset = charset;
|
||||||
out.text = utf8;
|
out.text = utf8;
|
||||||
|
//out.text = otext;
|
||||||
docout = out;
|
docout = out;
|
||||||
|
cerr << utf8 << endl;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -183,6 +194,12 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
|
|||||||
if (!fun(me->config, fn, mime, doc))
|
if (!fun(me->config, fn, mime, doc))
|
||||||
return FsTreeWalker::FtwOk;
|
return FsTreeWalker::FtwOk;
|
||||||
|
|
||||||
|
// Set up common fields:
|
||||||
|
doc.mimetype = mime;
|
||||||
|
char ascdate[20];
|
||||||
|
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||||
|
doc.mtime = ascdate;
|
||||||
|
|
||||||
// Set up xapian document, add postings and misc fields,
|
// Set up xapian document, add postings and misc fields,
|
||||||
// add to or update database.
|
// add to or update database.
|
||||||
if (!me->db.add(fn, doc))
|
if (!me->db.add(fn, doc))
|
||||||
|
|||||||
17
src/query/Makefile
Normal file
17
src/query/Makefile
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
|
||||||
|
# Build rules for the query-side tools. Only xadump (a Xapian index
# dumper) is built here; it reuses transcode.cpp from ../index.
CXXFLAGS = -Wall -g -I. -I../index -I../utils -I../common -I/usr/local/include

PROGS = xadump

# 'all' and 'clean' name actions, not files: declare them phony so that a
# stray file with one of these names cannot disable the target.
.PHONY: all clean

all: $(PROGS)

XADUMP_OBJS= xadump.o transcode.o
xadump : $(XADUMP_OBJS)
	$(CXX) $(CXXFLAGS) -o xadump $(XADUMP_OBJS) \
		-L/usr/local/lib -lxapian -liconv

# transcode.cpp lives in the index directory; compile it into a local object
transcode.o : ../index/transcode.cpp ../index/transcode.h
	$(CXX) $(CXXFLAGS) -c -o transcode.o ../index/transcode.cpp

clean:
	rm -f *.o $(PROGS)
|
||||||
117
src/query/xadump.cpp
Normal file
117
src/query/xadump.cpp
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
#ifndef lint
|
||||||
|
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.1 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <strings.h>
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "transcode.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#include "xapian.h"
|
||||||
|
|
||||||
|
static string thisprog;
|
||||||
|
|
||||||
|
// Command line synopsis printed by Usage(). Lists every option that
// main() actually parses (-d, -e, -i, -T, -D).
static string usage =
    " -d <dbdir> -e <output encoding>\n"
    " -T : list terms (all terms, or those of document <docid> with -i)\n"
    " -D : print the data record of document <docid>\n"
    " -i <docid> : document id to use (default 1)\n"
    " \n\n"
    ;
|
||||||
|
|
||||||
|
static void
|
||||||
|
Usage(void)
|
||||||
|
{
|
||||||
|
cerr << thisprog << ": usage:\n" << usage;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int op_flags;
|
||||||
|
#define OPT_d 0x1
|
||||||
|
#define OPT_e 0x2
|
||||||
|
#define OPT_i 0x4
|
||||||
|
#define OPT_T 0x8
|
||||||
|
#define OPT_D 0x10
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
string dbdir = "/home/dockes/tmp/xapiandb";
|
||||||
|
string outencoding = "ISO8859-1";
|
||||||
|
int docid = 1;
|
||||||
|
|
||||||
|
thisprog = argv[0];
|
||||||
|
argc--; argv++;
|
||||||
|
|
||||||
|
while (argc > 0 && **argv == '-') {
|
||||||
|
(*argv)++;
|
||||||
|
if (!(**argv))
|
||||||
|
/* Cas du "adb - core" */
|
||||||
|
Usage();
|
||||||
|
while (**argv)
|
||||||
|
switch (*(*argv)++) {
|
||||||
|
case 'T': op_flags |= OPT_T; break;
|
||||||
|
case 'D': op_flags |= OPT_D; break;
|
||||||
|
case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
|
||||||
|
dbdir = *(++argv);
|
||||||
|
argc--;
|
||||||
|
goto b1;
|
||||||
|
case 'e': op_flags |= OPT_d; if (argc < 2) Usage();
|
||||||
|
outencoding = *(++argv);
|
||||||
|
argc--;
|
||||||
|
goto b1;
|
||||||
|
case 'i': op_flags |= OPT_i; if (argc < 2) Usage();
|
||||||
|
if (sscanf(*(++argv), "%d", &docid) != 1) Usage();
|
||||||
|
argc--;
|
||||||
|
goto b1;
|
||||||
|
default: Usage(); break;
|
||||||
|
}
|
||||||
|
b1: argc--; argv++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc != 0)
|
||||||
|
Usage();
|
||||||
|
|
||||||
|
Xapian::Database db;
|
||||||
|
|
||||||
|
try {
|
||||||
|
db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN);
|
||||||
|
|
||||||
|
if (op_flags & OPT_T) {
|
||||||
|
Xapian::TermIterator term;
|
||||||
|
string printable;
|
||||||
|
if (op_flags & OPT_i) {
|
||||||
|
for (term = db.termlist_begin(docid);
|
||||||
|
term != db.termlist_end(docid);term++) {
|
||||||
|
transcode(*term, printable, "UTF-8", outencoding);
|
||||||
|
cout << printable << endl;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (term = db.allterms_begin();
|
||||||
|
term != db.allterms_end();term++) {
|
||||||
|
transcode(*term, printable, "UTF-8", outencoding);
|
||||||
|
cout << printable << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (op_flags & OPT_D) {
|
||||||
|
Xapian::Document doc = db.get_document(docid);
|
||||||
|
string data = doc.get_data();
|
||||||
|
cout << data << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
cout << "Exception: " << e.get_msg() << endl;
|
||||||
|
} catch (const string &s) {
|
||||||
|
cout << "Exception: " << s << endl;
|
||||||
|
} catch (const char *s) {
|
||||||
|
cout << "Exception: " << s << endl;
|
||||||
|
} catch (...) {
|
||||||
|
cout << "Caught unknown exception" << endl;
|
||||||
|
}
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -11,6 +11,8 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
|
#include "textsplit.h"
|
||||||
|
#include "transcode.h"
|
||||||
|
|
||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
|
|
||||||
@ -29,7 +31,7 @@ class Native {
|
|||||||
|
|
||||||
Rcl::Db::Db()
|
Rcl::Db::Db()
|
||||||
{
|
{
|
||||||
// pdata = new Native;
|
pdata = new Native;
|
||||||
}
|
}
|
||||||
|
|
||||||
Rcl::Db::~Db()
|
Rcl::Db::~Db()
|
||||||
@ -37,6 +39,8 @@ Rcl::Db::~Db()
|
|||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
return;
|
return;
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
|
cerr << "Db::~Db: isopen " << ndb->isopen << " iswritable " <<
|
||||||
|
ndb->iswritable << endl;
|
||||||
try {
|
try {
|
||||||
// There is nothing to do for an ro db.
|
// There is nothing to do for an ro db.
|
||||||
if (ndb->isopen == false || ndb->iswritable == false) {
|
if (ndb->isopen == false || ndb->iswritable == false) {
|
||||||
@ -58,10 +62,11 @@ Rcl::Db::~Db()
|
|||||||
|
|
||||||
bool Rcl::Db::open(const string& dir, OpenMode mode)
|
bool Rcl::Db::open(const string& dir, OpenMode mode)
|
||||||
{
|
{
|
||||||
return true;
|
|
||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
return false;
|
return false;
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
|
cerr << "Db::open: isopen " << ndb->isopen << " iswritable " <<
|
||||||
|
ndb->iswritable << endl;
|
||||||
try {
|
try {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case DbUpd:
|
case DbUpd:
|
||||||
@ -95,10 +100,11 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
|||||||
|
|
||||||
bool Rcl::Db::close()
|
bool Rcl::Db::close()
|
||||||
{
|
{
|
||||||
return true;
|
|
||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
return false;
|
return false;
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
|
cerr << "Db::open: isopen " << ndb->isopen << " iswritable " <<
|
||||||
|
ndb->iswritable << endl;
|
||||||
if (ndb->isopen == false)
|
if (ndb->isopen == false)
|
||||||
return true;
|
return true;
|
||||||
try {
|
try {
|
||||||
@ -125,9 +131,103 @@ bool Rcl::Db::close()
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A small class to hold state while splitting text
|
||||||
|
class wsData {
|
||||||
|
public:
|
||||||
|
Xapian::Document &doc;
|
||||||
|
Xapian::termpos basepos; // Base for document section
|
||||||
|
Xapian::termpos curpos; // Last position sent to callback
|
||||||
|
wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
|
||||||
|
{}
|
||||||
|
};
|
||||||
|
|
||||||
|
bool splitCb(void *cdata, const std::string &term, int pos)
|
||||||
|
{
|
||||||
|
wsData *data = (wsData*)cdata;
|
||||||
|
cerr << "splitCb: term " << term << endl;
|
||||||
|
try {
|
||||||
|
// 1 is the value for wdfinc in index_text when called from omindex
|
||||||
|
// TOBEDONE: check what this is used for
|
||||||
|
data->curpos = pos;
|
||||||
|
data->doc.add_posting(term, data->basepos + data->curpos, 1);
|
||||||
|
string printable;
|
||||||
|
transcode(term, printable, "UTF-8", "ISO8859-1");
|
||||||
|
cerr << "Adding " << printable << endl;
|
||||||
|
} catch (...) {
|
||||||
|
cerr << "Error occurred during add_posting" << endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// Index one document: build a Xapian document for file fn from the fields
// of doc (data record, postings for title/text/keywords/abstract, mime
// type and path terms) and add it to the writable database.
//
// Returns false if there is no native db object, if splitting/posting any
// text section fails, or if Xapian raises an error; true otherwise.
bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
{
    if (pdata == 0)
        return false;
    Native *ndb = (Native *)pdata;

    try {
        Xapian::Document newdocument;

        // Document data record. omindex has the following nl separated fields:
        // - url
        // - sample
        // - caption (title limited to 100 chars)
        // - mime type
        string record = "url=file:/" + fn;
        record += "\nmtime=" + doc.mtime;
        record += "\nsample=";
        record += "\ncaption=" + doc.title;
        record += "\nmtype=" + doc.mimetype;
        record += "\n";
        newdocument.set_data(record);

        // TOBEDONE:
        // Need to add stuff here to unaccent and lowercase the data: use unac
        // for accents, and do it by hand for upper / lower. Note lowercasing is
        // only for ascii letters anyway, so it's just A-Z -> a-z

        wsData splitData(newdocument);

        TextSplit splitter(splitCb, &splitData);

        // Sections are indexed in this order. Before each section after
        // the first, the base position is advanced past the last position
        // emitted so far plus a gap of 100 (presumably so that positional
        // matches cannot span two sections -- TODO confirm).
        const string *sections[] = {
            &doc.title, &doc.text, &doc.keywords, &doc.abstract
        };
        const unsigned int nsections = sizeof(sections) / sizeof(sections[0]);
        for (unsigned int i = 0; i < nsections; i++) {
            if (i > 0)
                splitData.basepos += splitData.curpos + 100;
            // text_to_words() reports a failed callback (posting could
            // not be added): do not store a partially indexed document.
            if (!splitter.text_to_words(*sections[i])) {
                cerr << "Db::add: term generation failed for " << fn << endl;
                return false;
            }
        }

        newdocument.add_term("T" + doc.mimetype);
        newdocument.add_term("P" + fn);

#if 0
        if (dupes == DUPE_replace) {
            // If this document has already been indexed, update the existing
            // entry.
            try {
                Xapian::docid did = db.replace_document(urlterm, newdocument);
                if (did < updated.size()) {
                    updated[did] = true;
                    cout << "updated." << endl;
                } else {
                    cout << "added." << endl;
                }
            } catch (...) {
                // FIXME: is this ever actually needed?
                db.add_document(newdocument);
                cout << "added (failed re-seek for duplicate)." << endl;
            }
        } else
#endif
        {
            ndb->wdb.add_document(newdocument);
            // cout << "added." << endl;
        }
    } catch (const Xapian::Error &e) {
        // Report the failure through the bool return instead of letting
        // the exception escape to the caller.
        cerr << "Db::add: exception: " << e.get_msg() << endl;
        return false;
    }
    return true;
}
|
||||||
|
|
||||||
|
|
||||||
@ -140,5 +240,3 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
// - fetch doc (get_document(docid)
|
// - fetch doc (get_document(docid)
|
||||||
// - check date field, maybe skip
|
// - check date field, maybe skip
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
/* @(#$Id: rcldb.h,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rcldb.h,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -13,11 +13,13 @@ namespace Rcl {
|
|||||||
*/
|
*/
|
||||||
class Doc {
|
class Doc {
|
||||||
public:
|
public:
|
||||||
|
string mimetype;
|
||||||
|
string mtime; // Modification time as decimal ascii
|
||||||
string origcharset;
|
string origcharset;
|
||||||
string title;
|
string title;
|
||||||
string abstract;
|
|
||||||
string keywords;
|
|
||||||
string text;
|
string text;
|
||||||
|
string keywords;
|
||||||
|
string abstract;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user