replaced path|ipath with unique doc id in rcldb i/f. Still depends on udi structure for parent/child

This commit is contained in:
dockes 2008-07-28 12:24:15 +00:00
parent 1dd66b5b1d
commit 3109a33f4a
10 changed files with 93 additions and 71 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.66 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.67 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -46,6 +46,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.66 2008-07-28 08:42:52 dockes Exp
#include "internfile.h"
#include "smallut.h"
#include "wipedir.h"
#include "fileudi.h"
#ifdef RCL_USE_ASPELL
#include "rclaspell.h"
@ -335,7 +336,9 @@ bool DbIndexer::purgeFiles(const list<string> &filenames)
list<string>::const_iterator it;
for (it = filenames.begin(); it != filenames.end(); it++) {
if (!m_db.purgeFile(*it)) {
string udi;
make_udi(*it, "", udi);
if (!m_db.purgeFile(udi)) {
LOGERR(("DbIndexer::purgeFiles: Database error\n"));
return false;
}
@ -390,7 +393,9 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// Document signature
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
string sig = cbuf;
if (!m_db.needUpdate(fn, sig)) {
string udi;
make_udi(fn, "", udi);
if (!m_db.needUpdate(udi, sig)) {
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
if (m_updater) {
m_updater->status.fn = fn;
@ -463,6 +468,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
hadNullIpath = true;
else
doc.ipath = ipath;
doc.url = string("file://") + fn;
// Note that the filter may have its own idea of the file name
// (ie: mail attachment)
@ -479,7 +485,9 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
doc.sig = cbuf;
// Add document to database
if (!m_db.add(fn, doc))
string udi;
make_udi(fn, ipath, udi);
if (!m_db.add(udi, doc))
return FsTreeWalker::FtwError;
// Tell what we are doing and check for interrupt request
@ -504,7 +512,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
fileDoc.fmtime = ascdate;
fileDoc.utf8fn = utf8fn;
fileDoc.mimetype = interner.getMimetype();
fileDoc.url = string("file://") + fn;
char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size);
@ -512,7 +520,9 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// Document signature for up to date checks.
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
fileDoc.sig = cbuf;
if (!m_db.add(fn, fileDoc))
string udi;
make_udi(fn, "", udi);
if (!m_db.add(udi, fileDoc))
return FsTreeWalker::FtwError;
}

View File

@ -8,8 +8,8 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
librcl.a : $(DEPS) $(OBJS) unac.o
ar ru librcl.a $(OBJS) unac.o
@ -93,6 +93,8 @@ fstreewalk.o : ../utils/fstreewalk.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../utils/fstreewalk.cpp
idfile.o : ../utils/idfile.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../utils/idfile.cpp
fileudi.o : ../utils/fileudi.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../utils/fileudi.cpp
md5.o : ../utils/md5.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../utils/md5.cpp
mimeparse.o : ../utils/mimeparse.cpp
@ -229,6 +231,9 @@ fstreewalk.dep.stamp : ../utils/fstreewalk.cpp
idfile.dep.stamp : ../utils/idfile.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../utils/idfile.cpp > idfile.dep
touch idfile.dep.stamp
fileudi.dep.stamp : ../utils/fileudi.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../utils/fileudi.cpp > fileudi.dep
touch fileudi.dep.stamp
md5.dep.stamp : ../utils/md5.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../utils/md5.cpp > md5.dep
touch md5.dep.stamp
@ -291,6 +296,7 @@ include debuglog.dep
include execmd.dep
include fstreewalk.dep
include idfile.dep
include fileudi.dep
include md5.dep
include mimeparse.dep
include pathut.dep

View File

@ -42,6 +42,7 @@ ${depth}/utils/debuglog.cpp \
${depth}/utils/execmd.cpp \
${depth}/utils/fstreewalk.cpp \
${depth}/utils/idfile.cpp \
${depth}/utils/fileudi.cpp \
${depth}/utils/md5.cpp \
${depth}/utils/mimeparse.cpp \
${depth}/utils/pathut.cpp \

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: docseqhist.cpp,v 1.2 2007-12-13 06:58:21 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: docseqhist.cpp,v 1.3 2008-07-28 12:24:15 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -23,6 +23,7 @@ static char rcsid[] = "@(#$Id: docseqhist.cpp,v 1.2 2007-12-13 06:58:21 dockes E
#include "docseqhist.h"
#include "rcldb.h"
#include "fileudi.h"
bool DocSequenceHistory::getDoc(int num, Rcl::Doc &doc, int *percent,
string *sh)
@ -58,7 +59,14 @@ bool DocSequenceHistory::getDoc(int num, Rcl::Doc &doc, int *percent,
} else
sh->erase();
}
return m_db->getDoc(m_it->fn, m_it->ipath, doc, percent);
string udi;
make_udi(m_it->fn, m_it->ipath, udi);
bool ret = m_db->getDoc(udi, doc, percent);
if (!ret) {
doc.url = string("file://") + m_it->fn;
doc.ipath = m_it->ipath;
}
return ret;
}
int DocSequenceHistory::getResCnt()

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pathhash.cpp,v 1.5 2007-12-13 06:58:21 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: pathhash.cpp,v 1.6 2008-07-28 12:24:15 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -30,6 +30,7 @@ using std::string;
namespace Rcl {
#endif /* NO_NAMESPACES */
// Debug only
#ifdef PATHHASH_HEX
static void md5hexprint(const unsigned char hash[16], string &out)
{
@ -69,7 +70,7 @@ void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
path.length() - (maxlen - HASHLEN));
MD5Final(chash, &ctx);
#if 0
#ifdef PATHHASH_HEX
string hex;
md5hexprint(chash, hex);
printf("hex [%s]\n", hex.c_str());
@ -83,7 +84,6 @@ void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
// don't need as this won't ever be decoded. Resulting length is 22
hash.resize(hash.length() - 2);
// Truncate path and append hash
phash = path.substr(0, maxlen - HASHLEN) + hash;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.135 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.136 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -80,32 +80,20 @@ namespace Rcl {
// found in document)
const static string rclSyntAbs("?!#@");
// Maximum length for path terms stored for each document. We truncate
// longer paths and uniquize them by appending a hashed value. This
// is done to avoid xapian max term length limitations, not
// to gain space (we gain very little even with very short maxlens
// like 30) Note that Q terms add the ipath to this, and that the
// xapian max key length seems to be around 250.
// The value for PATHHASHLEN includes the length of the hash part.
#define PATHHASHLEN 150
// Compute the unique term used to link documents to their file-system source:
// Hashed path + possible internal path
static inline string make_uniterm(const string& fn, const string& ipath)
static inline string make_uniterm(const string& udi)
{
string hash;
pathHash(fn, hash, PATHHASHLEN);
string s("Q");
s.append(hash);
s.append("|");
s.append(ipath);
return s;
string uniterm("Q");
uniterm.append(udi);
return uniterm;
}
/* See comment in class declaration: return all subdocuments of a
* document given by its unique path id */
bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids)
{
LOGDEB2(("subDocs: [%s]\n", uniterm.c_str()));
docids.clear();
string ermsg;
@ -116,11 +104,11 @@ bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids)
// Don't return the doc itself:
it++;
for (; it != db.allterms_end(); it++) {
LOGDEB2(("Testing [%s]\n", (*it).c_str()));
LOGDEB2(("subDocs: testing [%s]\n", (*it).c_str()));
// If current term does not begin with uniterm or has
// another |, not the same file
if ((*it).find(uniterm) != 0 ||
(*it).find_last_of("|") != uniterm.length() - 1)
(*it).find_last_of("|") != uniterm.length()-1)
break;
docids.push_back(*(db.postlist_begin(*it)));
}
@ -812,9 +800,9 @@ static const int MB = 1024 * 1024;
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
// metadata), and update database
bool Db::add(const string &fn, const Doc &idoc)
bool Db::add(const string &udi, const Doc &idoc)
{
LOGDEB1(("Db::add: fn %s\n", fn.c_str()));
LOGDEB1(("Db::add: udi %s\n", udi.c_str()));
if (m_ndb == 0)
return false;
static int first = 1;
@ -937,7 +925,7 @@ bool Db::add(const string &fn, const Doc &idoc)
// Pathname/ipath unique term: this is used for file existence/uptodate
// checks, and unique id for the replace_document() call.
string uniterm = make_uniterm(fn, doc.ipath);
string uniterm = make_uniterm(udi);
newdocument.add_term(uniterm);
// Dates etc...
@ -957,7 +945,7 @@ bool Db::add(const string &fn, const Doc &idoc)
// - sample
// - caption (title limited to 100 chars)
// - mime type
string record = "url=file://" + fn;
string record = "url=" + doc.url;
record += "\nmtype=" + doc.mimetype;
record += "\nfmtime=" + doc.fmtime;
if (!doc.dmtime.empty()) {
@ -992,7 +980,7 @@ bool Db::add(const string &fn, const Doc &idoc)
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
const char *fnc = fn.c_str();
const char *fnc = udi.c_str();
string ermsg;
// Add db entry or update existing entry:
@ -1001,11 +989,9 @@ bool Db::add(const string &fn, const Doc &idoc)
m_ndb->wdb.replace_document(uniterm, newdocument);
if (did < updated.size()) {
updated[did] = true;
LOGDEB(("Db::add: docid %d updated [%s , %s]\n", did, fnc,
doc.ipath.c_str()));
LOGDEB(("Db::add: docid %d updated [%s]\n", did, fnc));
} else {
LOGDEB(("Db::add: docid %d added [%s , %s]\n", did, fnc,
doc.ipath.c_str()));
LOGDEB(("Db::add: docid %d added [%s]\n", did, fnc));
}
} XCATCHERROR(ermsg);
@ -1044,13 +1030,13 @@ bool Db::add(const string &fn, const Doc &idoc)
return true;
}
// Test if given filename has changed since last indexed:
bool Db::needUpdate(const string &filename, const string& sig)
// Test if doc given by udi has changed since last indexed (test sigs)
bool Db::needUpdate(const string &udi, const string& sig)
{
if (m_ndb == 0)
return false;
string uniterm = make_uniterm(filename, string());
string uniterm = make_uniterm(udi);
string ermsg;
// We look up the document indexed by the uniterm. This is either
@ -1086,8 +1072,8 @@ bool Db::needUpdate(const string &filename, const string& sig)
return true;
string osig = data.substr(i1, i2-i1);
#endif
LOGDEB(("Db::needUpdate: oldsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
LOGDEB2(("Db::needUpdate: oldsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
// Compare new/old sig
if (sig != osig) {
LOGDEB(("Db::needUpdate:yes: olsig [%s] new [%s]\n",
@ -1222,14 +1208,14 @@ bool Db::purge()
return true;
}
/** Delete document(s) for given filename */
bool Db::purgeFile(const string &fn)
/* Delete document(s) for given unique identifier (doc and descendants) */
bool Db::purgeFile(const string &udi)
{
LOGDEB(("Db:purgeFile: [%s]\n", fn.c_str()));
LOGDEB(("Db:purgeFile: [%s]\n", udi.c_str()));
if (m_ndb == 0)
return false;
Xapian::WritableDatabase db = m_ndb->wdb;
string uniterm = make_uniterm(fn, string());
string uniterm = make_uniterm(udi);
string ermsg;
try {
Xapian::PostingIterator docid = db.postlist_begin(uniterm);
@ -1528,21 +1514,18 @@ bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
}
// Retrieve document defined by its unique document identifier (udi).
bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
bool Db::getDoc(const string &udi, Doc &doc, int *pc)
{
LOGDEB(("Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(),
ipath.c_str()));
LOGDEB(("Db:getDoc: [%s]\n", udi.c_str()));
if (m_ndb == 0)
return false;
// Initialize what we can in any case. If this is history, caller
// will make partial display in case of error
doc.ipath = ipath;
doc.url = string("file://") + fn;
if (*pc)
*pc = 100;
string uniterm = make_uniterm(fn, ipath);
string uniterm = make_uniterm(udi);
string ermsg;
try {
if (!m_ndb->db.term_exists(uniterm)) {

View File

@ -16,7 +16,7 @@
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.57 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.58 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -113,7 +113,7 @@ class Db {
bool add(const string &udi, const Doc &doc);
/** Delete document(s) for given UDI, including subdocs */
bool purgeFile(const string &fn);
bool purgeFile(const string &udi);
/** Remove documents that no longer exist in the file system. This
* depends on the update map, which is built during
@ -161,7 +161,7 @@ class Db {
/** Get document for given unique document identifier (udi). Used by the
* 'history' feature (and nothing else?) */
bool getDoc(const string &fn, const string &ipath, Doc &doc, int *percent);
bool getDoc(const string &udi, Doc &doc, int *percent);
/* The following are mainly for the aspell module */
/** Whole term list walking. */

View File

@ -16,7 +16,7 @@
*/
#ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.4 2008-07-28 08:42:52 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: rcldoc.h,v 1.5 2008-07-28 12:24:15 dockes Exp $ (C) 2006 J.F.Dockes */
#include <string>
#include <map>
@ -34,14 +34,21 @@ class Doc {
public:
// These fields potentially go into the document data record
// We indicate the routine that sets them up during indexing
string url; // This is just "file://" + binary filename.
// No transcoding: this is used to access files
// Computed from fn by Db::add
string utf8fn; // Transcoded version of the simple file name for
// SFN-prefixed specific file name indexation
// Set by DbIndexer::processone
string ipath; // Internal path for multi-doc files. Ascii
// Set by DbIndexer::processone
// This is just "file://" + binary filename. No transcoding: this
// is used to access files
// Index: computed from fn by Db::add caller. Query: from doc data.
string url;
// Transcoded version of the simple file name for SFN-prefixed
// specific file name indexation
// Index: set by DbIndexer::processone
string utf8fn;
// Internal path for multi-doc files. Ascii
// Set by DbIndexer::processone
string ipath;
string mimetype; // Set by FileInterner::internfile
string fmtime; // File modification time as decimal ascii unix time
// Set by DbIndexer::processone

View File

@ -1,7 +1,7 @@
depth = ..
include $(depth)/mk/sysconf
PROGS = trconftree wipedir smallut trfstreewalk trpathut \
PROGS = trfileudi trconftree wipedir smallut trfstreewalk trpathut \
transcode trbase64 \
trmimeparse trexecmd utf8iter idfile
@ -24,6 +24,12 @@ trpathut : $(PATHUT_OBJS)
trpathut.o : pathut.cpp pathut.h
$(CXX) -o trpathut.o -c $(ALL_CXXFLAGS) -DTEST_PATHUT pathut.cpp
FILEUDI_OBJS= trfileudi.o $(BIGLIB)
trfileudi : $(FILEUDI_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o trfileudi $(FILEUDI_OBJS)
trfileudi.o : fileudi.cpp fileudi.h
$(CXX) -o trfileudi.o -c $(ALL_CXXFLAGS) -DTEST_FILEUDI fileudi.cpp
EXECMD_OBJS= trexecmd.o $(BIGLIB)
trexecmd : $(EXECMD_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o trexecmd $(EXECMD_OBJS)

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: fileudi.cpp,v 1.1 2008-07-28 10:20:20 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: fileudi.cpp,v 1.2 2008-07-28 12:24:15 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -100,6 +100,7 @@ void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
void make_udi(const string& fn, const string& ipath, string &udi)
{
string s(fn);
// Note that we append a "|" in all cases. Historical, could be removed
s.append("|");
s.append(ipath);
pathHash(s, udi, PATHHASHLEN);