*** empty log message ***
This commit is contained in:
parent
ab473faa8c
commit
3c66f8e964
81
src/common/unacpp.cpp
Normal file
81
src/common/unacpp.cpp
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
#ifndef lint
|
||||||
|
static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.1 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef TEST_UNACPP
|
||||||
|
|
||||||
|
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#include <string>
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
#include "unacpp.h"
|
||||||
|
#include "unac.h"
|
||||||
|
|
||||||
|
|
||||||
|
bool unac_cpp(const std::string &in, std::string &out, const char *encoding)
|
||||||
|
{
|
||||||
|
char *cout = 0;
|
||||||
|
size_t out_len;
|
||||||
|
|
||||||
|
if (unac_string(encoding, in.c_str(), in.length(), &cout, &out_len) < 0) {
|
||||||
|
char cerrno[20];
|
||||||
|
sprintf(cerrno, "%d", errno);
|
||||||
|
out = string("unac_string failed, errno : ") + cerrno;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
out.assign(cout, out_len);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // not testing
|
||||||
|
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#include "unacpp.h"
|
||||||
|
#include "readfile.h"
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
if (argc != 4) {
|
||||||
|
cerr << "Usage: unacpp <encoding> <infile> <outfile>" << endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
const char *encoding = argv[1];
|
||||||
|
string ifn = argv[2];
|
||||||
|
const char *ofn = argv[3];
|
||||||
|
|
||||||
|
string odata;
|
||||||
|
if (!file_to_string(ifn, odata)) {
|
||||||
|
cerr << "file_to_string: " << odata << endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
string ndata;
|
||||||
|
if (!unac_cpp(odata, ndata, encoding)) {
|
||||||
|
cerr << "unac: " << ndata << endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
int fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
||||||
|
if (fd < 0) {
|
||||||
|
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
||||||
|
<< endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
||||||
|
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
close(fd);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
11
src/common/unacpp.h
Normal file
11
src/common/unacpp.h
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#ifndef _UNACPP_H_INCLUDED_
|
||||||
|
#define _UNACPP_H_INCLUDED_
|
||||||
|
/* @(#$Id: unacpp.h,v 1.1 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
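// A small C++ wrapper around unac_string() from unac.c.
// On success, returns true and stores the converted text in 'out'.
// On failure, returns false and stores an error message in 'out'.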
|
||||||
|
extern bool unac_cpp(const std::string &in, std::string &out,
|
||||||
|
const char *encoding = "UTF-8");
|
||||||
|
|
||||||
|
#endif /* _UNACPP_H_INCLUDED_ */
|
||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.4 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -144,15 +144,17 @@ class wsData {
|
|||||||
bool splitCb(void *cdata, const std::string &term, int pos)
|
bool splitCb(void *cdata, const std::string &term, int pos)
|
||||||
{
|
{
|
||||||
wsData *data = (wsData*)cdata;
|
wsData *data = (wsData*)cdata;
|
||||||
cerr << "splitCb: term " << term << endl;
|
|
||||||
|
// cerr << "splitCb: term " << term << endl;
|
||||||
|
//string printable;
|
||||||
|
//transcode(term, printable, "UTF-8", "ISO8859-1");
|
||||||
|
//cerr << "Adding " << printable << endl;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// 1 is the value for wdfinc in index_text when called from omindex
|
// 1 is the value for wdfinc in index_text when called from omindex
|
||||||
// TOBEDONE: check what this is used for
|
// TOBEDONE: check what this is used for
|
||||||
data->curpos = pos;
|
data->curpos = pos;
|
||||||
data->doc.add_posting(term, data->basepos + data->curpos, 1);
|
data->doc.add_posting(term, data->basepos + data->curpos, 1);
|
||||||
string printable;
|
|
||||||
transcode(term, printable, "UTF-8", "ISO8859-1");
|
|
||||||
cerr << "Adding " << printable << endl;
|
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
cerr << "Error occurred during add_posting" << endl;
|
cerr << "Error occurred during add_posting" << endl;
|
||||||
return false;
|
return false;
|
||||||
@ -202,41 +204,68 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
|||||||
splitter.text_to_words(doc.abstract);
|
splitter.text_to_words(doc.abstract);
|
||||||
|
|
||||||
newdocument.add_term("T" + doc.mimetype);
|
newdocument.add_term("T" + doc.mimetype);
|
||||||
newdocument.add_term("P" + fn);
|
string pathterm = "P" + fn;
|
||||||
|
newdocument.add_term(pathterm);
|
||||||
|
|
||||||
#if 0
|
if (1 /*dupes == DUPE_replace*/) {
|
||||||
if (dupes == DUPE_replace) {
|
|
||||||
// If this document has already been indexed, update the existing
|
// If this document has already been indexed, update the existing
|
||||||
// entry.
|
// entry.
|
||||||
try {
|
try {
|
||||||
Xapian::docid did = db.replace_document(urlterm, newdocument);
|
Xapian::docid did = ndb->wdb.replace_document(pathterm,
|
||||||
|
newdocument);
|
||||||
|
#if 0
|
||||||
if (did < updated.size()) {
|
if (did < updated.size()) {
|
||||||
updated[did] = true;
|
updated[did] = true;
|
||||||
cout << "updated." << endl;
|
//cout << "updated." << endl;
|
||||||
} else {
|
} else {
|
||||||
cout << "added." << endl;
|
//cout << "added." << endl;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
// FIXME: is this ever actually needed?
|
// FIXME: is this ever actually needed?
|
||||||
db.add_document(newdocument);
|
ndb->wdb.add_document(newdocument);
|
||||||
cout << "added (failed re-seek for duplicate)." << endl;
|
//cout << "added (failed re-seek for duplicate)." << endl;
|
||||||
}
|
}
|
||||||
} else
|
} else {
|
||||||
#endif
|
try {
|
||||||
{
|
|
||||||
ndb->wdb.add_document(newdocument);
|
ndb->wdb.add_document(newdocument);
|
||||||
// cout << "added." << endl;
|
// cout << "added." << endl;
|
||||||
|
} catch (...) {
|
||||||
|
cerr << "Got exception while adding doc" << endl;
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
||||||
{
|
{
|
||||||
|
if (pdata == 0)
|
||||||
|
return false;
|
||||||
|
Native *ndb = (Native *)pdata;
|
||||||
|
|
||||||
|
string pathterm = "P" + filename;
|
||||||
|
if (!ndb->wdb.term_exists(pathterm))
|
||||||
|
return true;
|
||||||
|
Xapian::PostingIterator doc;
|
||||||
|
try {
|
||||||
|
Xapian::PostingIterator did = ndb->wdb.postlist_begin(pathterm);
|
||||||
|
if (did == ndb->wdb.postlist_end(pathterm))
|
||||||
|
return true;
|
||||||
|
Xapian::Document doc = ndb->wdb.get_document(*did);
|
||||||
|
string data = doc.get_data();
|
||||||
|
//cout << "DOCUMENT EXISTS " << data << endl;
|
||||||
|
const char *cp = strstr(data.c_str(), "mtime=");
|
||||||
|
cp += 6;
|
||||||
|
long mtime = atol(cp);
|
||||||
|
if (mtime >= stp->st_mtime) {
|
||||||
|
// cerr << "DOCUMENT UP TO DATE" << endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} catch (...) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
// TOBEDONE: Check if file has already been indexed, and has changed since
|
|
||||||
// - Make path term,
|
|
||||||
// - query db: postlist_begin->docid
|
|
||||||
// - fetch doc (get_document(docid)
|
|
||||||
// - check date field, maybe skip
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user