abstract building from position data
This commit is contained in:
parent
44b2aa534c
commit
52aaa52754
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.53 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.54 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -45,6 +45,13 @@ using namespace std;
|
||||
#include "xapian.h"
|
||||
#include <xapian/stem.h>
|
||||
|
||||
// Two-argument maximum / minimum helpers.
// Both the arguments and the whole expansion are parenthesized so that
// expression arguments (e.g. MIN(a | b, c)) are not re-associated by
// operator precedence inside the expansion.
// Note: these are macros, so each argument may be evaluated twice; do not
// pass expressions with side effects.
#ifndef MAX
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#endif
#ifndef MIN
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#endif
|
||||
|
||||
// Data for a xapian database. There could actually be 2 different
|
||||
// ones for indexing or query as there is not much in common.
|
||||
class Native {
|
||||
@ -64,6 +71,12 @@ class Native {
|
||||
Xapian::Enquire *enquire;
|
||||
Xapian::MSet mset;
|
||||
|
||||
string makeAbstract(Xapian::docid id, const list<string>& terms);
|
||||
bool dbDataToRclDoc(std::string &data, Rcl::Doc &doc,
|
||||
int qopts,
|
||||
Xapian::docid docid,
|
||||
const list<string>& terms);
|
||||
|
||||
Native() : isopen(false), iswritable(false), enquire(0) { }
|
||||
~Native() {
|
||||
delete enquire;
|
||||
@ -73,6 +86,7 @@ class Native {
|
||||
// Construct the database wrapper: allocate the private Xapian-side state
// (kept behind the opaque pdata pointer) and start with no query options set.
Rcl::Db::Db()
    : pdata(new Native), m_qOpts(0)
{
}
|
||||
|
||||
Rcl::Db::~Db()
|
||||
@ -105,13 +119,14 @@ Rcl::Db::~Db()
|
||||
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
|
||||
}
|
||||
|
||||
bool Rcl::Db::open(const string& dir, OpenMode mode)
|
||||
bool Rcl::Db::open(const string& dir, OpenMode mode, int qops)
|
||||
{
|
||||
if (pdata == 0)
|
||||
return false;
|
||||
Native *ndb = (Native *)pdata;
|
||||
LOGDEB(("Db::open: isopen %d iswritable %d\n", ndb->isopen,
|
||||
ndb->iswritable));
|
||||
m_qOpts = qops;
|
||||
|
||||
if (ndb->isopen) {
|
||||
LOGERR(("Rcl::Db::open: already open\n"));
|
||||
@ -268,7 +283,7 @@ bool Rcl::dumb_string(const string &in, string &out)
|
||||
/* From omindex direct */
|
||||
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
||||
* if reasonably possible. */
|
||||
string
|
||||
static string
|
||||
truncate_to_word(string & input, string::size_type maxlen)
|
||||
{
|
||||
string output;
|
||||
@ -292,32 +307,63 @@ truncate_to_word(string & input, string::size_type maxlen)
|
||||
|
||||
output += " ...";
|
||||
}
|
||||
// No need to replace newlines with spaces, we do this in dumb_string()
|
||||
return output;
|
||||
}
|
||||
|
||||
// remove some chars and replace them with spaces
|
||||
// Split str on any of the characters in delims and re-join the resulting
// non-empty tokens, appending a single space after each token.
//
// Consequences visible to callers:
//  - a run of consecutive delimiters yields one space, not several;
//  - leading delimiters are dropped;
//  - a non-empty result always ends with a space;
//  - an empty or all-delimiter input yields an empty string.
//
// @param str    input text
// @param delims set of characters to strip
// @return the space-joined tokens
static string stripchars(const string &str, const string &delims)
{
    string out;
    string::size_type pos = 0;

    for (;;) {
        // Skip leading delimiters; stop when nothing but delimiters remains.
        const string::size_type startPos = str.find_first_not_of(delims, pos);
        if (startPos == string::npos)
            break;
        // Find the next delimiter, or npos when the token runs to the end.
        pos = str.find_first_of(delims, startPos);
        // The token cannot be empty here. A count of npos means "up to the
        // end of the string", which covers the last-token case.
        const string::size_type len =
            (pos == string::npos) ? string::npos : pos - startPos;
        out.append(str, startPos, len);
        out += ' ';
    }
    return out;
}
|
||||
|
||||
// Truncate longer path and uniquize with hash . The goal for this is
|
||||
// to avoid xapian max term length limitations, not to gain space (we
|
||||
// gain very little even with very short maxlens like 30)
|
||||
#define PATHHASHLEN 150
|
||||
|
||||
// Maximum length passed to truncate_to_word() when storing a document
// abstract at indexing time.
#define ABSTRACT_SIZE 200
|
||||
// Marker prepended to the stored abstract when it was synthesized from the
// beginning of the document text (the document had no abstract of its own).
// It is detected and stripped again when the record is read back.
const static string rclSyntAbs = "?!#@";
|
||||
|
||||
// Add document in internal form to the database: index the terms in
|
||||
// the title abstract and body and add special terms for file name,
|
||||
// date, mime type ... , create the document data record (more
|
||||
// metadata), and update database
|
||||
bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc,
|
||||
const struct stat *stp)
|
||||
{
|
||||
LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
|
||||
if (pdata == 0)
|
||||
return false;
|
||||
Native *ndb = (Native *)pdata;
|
||||
|
||||
// Truncate abstract, title and keywords to reasonable lengths
|
||||
Rcl::Doc doc = idoc;
|
||||
if (doc.abstract.empty())
|
||||
doc.abstract = truncate_to_word(doc.text, 100);
|
||||
else
|
||||
doc.abstract = truncate_to_word(doc.abstract, 100);
|
||||
|
||||
// Truncate abstract, title and keywords to reasonable lengths. If
|
||||
// abstract is currently empty, we make up one with the beginning
|
||||
// of the document.
|
||||
if (doc.abstract.empty()) {
|
||||
doc.abstract = rclSyntAbs +
|
||||
truncate_to_word(doc.text, ABSTRACT_SIZE);
|
||||
} else {
|
||||
doc.abstract = truncate_to_word(doc.abstract, ABSTRACT_SIZE);
|
||||
}
|
||||
doc.abstract = stripchars(doc.abstract, "\n\r");
|
||||
doc.title = truncate_to_word(doc.title, 100);
|
||||
doc.keywords = truncate_to_word(doc.keywords, 300);
|
||||
|
||||
@ -417,12 +463,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
record += "\ndmtime=" + doc.dmtime;
|
||||
}
|
||||
record += "\norigcharset=" + doc.origcharset;
|
||||
record += "\ncaption=" + doc.title;
|
||||
record += "\nkeywords=" + doc.keywords;
|
||||
record += "\nabstract=" + doc.abstract;
|
||||
char sizebuf[20];
|
||||
sizebuf[0] = 0;
|
||||
if (stp)
|
||||
sprintf(sizebuf, "%ld", (long)stp->st_size);
|
||||
if (sizebuf[0])
|
||||
record += string("\nfbytes=") + sizebuf;
|
||||
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
||||
record += string("\ndbytes=") + sizebuf;
|
||||
if (!doc.ipath.empty()) {
|
||||
record += "\nipath=" + doc.ipath;
|
||||
}
|
||||
record += "\ncaption=" + doc.title;
|
||||
record += "\nkeywords=" + doc.keywords;
|
||||
record += "\nabstract=" + doc.abstract;
|
||||
record += "\n";
|
||||
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
@ -812,6 +866,7 @@ static list<string> stemexpand(Native *ndb, string term, const string& lang)
|
||||
}
|
||||
|
||||
|
||||
// Splitter callback for breaking query into terms
|
||||
class wsQData : public TextSplitCB {
|
||||
public:
|
||||
vector<string> terms;
|
||||
@ -836,7 +891,6 @@ class wsQData : public TextSplitCB {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Turn string into list of xapian queries. There is little
|
||||
// interpretation done on the string (no +term -term or filename:term
|
||||
// stuff). We just separate words and phrases, and interpret
|
||||
@ -927,7 +981,6 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
|
||||
Native *ndb = (Native *)pdata;
|
||||
if (!ndb)
|
||||
return false;
|
||||
|
||||
asdata.erase();
|
||||
dbindices.clear();
|
||||
list<Xapian::Query> pqueries;
|
||||
@ -950,6 +1003,7 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts,
|
||||
LOGDEB((" phrase: %s\n", sdata.phrase.c_str()));
|
||||
LOGDEB((" orwords: %s\n", sdata.orwords.c_str()));
|
||||
LOGDEB((" nowords: %s\n", sdata.nowords.c_str()));
|
||||
|
||||
string ft;
|
||||
for (list<string>::iterator it = sdata.filetypes.begin();
|
||||
it != sdata.filetypes.end(); it++) {ft += *it + " ";}
|
||||
@ -1053,6 +1107,8 @@ bool Rcl::Db::getQueryTerms(list<string>& terms)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Value passed as the second argument of enquire->get_mset() when fetching
// results (presumably the maximum number of matches per request - confirm
// against the Xapian API).
static const int qquantum = 30;
|
||||
|
||||
int Rcl::Db::getResCnt()
|
||||
{
|
||||
Native *ndb = (Native *)pdata;
|
||||
@ -1060,8 +1116,19 @@ int Rcl::Db::getResCnt()
|
||||
LOGERR(("Rcl::Db::getResCnt: no query opened\n"));
|
||||
return -1;
|
||||
}
|
||||
if (ndb->mset.size() <= 0)
|
||||
return -1;
|
||||
if (ndb->mset.size() <= 0) {
|
||||
try {
|
||||
ndb->mset = ndb->enquire->get_mset(0, qquantum);
|
||||
} catch (const Xapian::DatabaseModifiedError &error) {
|
||||
ndb->db.reopen();
|
||||
ndb->mset = ndb->enquire->get_mset(0, qquantum);
|
||||
} catch (const Xapian::Error & error) {
|
||||
LOGERR(("enquire->get_mset: exception: %s\n",
|
||||
error.get_msg().c_str()));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return ndb->mset.get_matches_lower_bound();
|
||||
}
|
||||
|
||||
@ -1085,7 +1152,9 @@ class Rcl::DbPops {
|
||||
}
|
||||
};
|
||||
|
||||
bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
|
||||
bool Native::dbDataToRclDoc(std::string &data, Rcl::Doc &doc,
|
||||
int qopts,
|
||||
Xapian::docid docid, const list<string>& terms)
|
||||
{
|
||||
LOGDEB1(("Rcl::Db::dbDataToRclDoc: data: %s\n", data.c_str()));
|
||||
ConfSimple parms(&data);
|
||||
@ -1099,7 +1168,20 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
|
||||
parms.get(string("caption"), doc.title);
|
||||
parms.get(string("keywords"), doc.keywords);
|
||||
parms.get(string("abstract"), doc.abstract);
|
||||
bool syntabs = false;
|
||||
if (doc.abstract.find(rclSyntAbs) == 0) {
|
||||
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
||||
syntabs = true;
|
||||
}
|
||||
if ((qopts && Rcl::Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
||||
LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n"));
|
||||
if (doc.abstract.empty() || syntabs ||
|
||||
(qopts & Rcl::Db::QO_REPLACE_ABSTRACT))
|
||||
doc.abstract = makeAbstract(docid, terms);
|
||||
}
|
||||
parms.get(string("ipath"), doc.ipath);
|
||||
parms.get(string("fbytes"), doc.fbytes);
|
||||
parms.get(string("dbytes"), doc.dbytes);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1114,7 +1196,6 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
|
||||
// that don't match the filter).
|
||||
bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
|
||||
{
|
||||
const int qquantum = 30;
|
||||
LOGDEB1(("Rcl::Db::getDoc: exti %d\n", exti));
|
||||
Native *ndb = (Native *)pdata;
|
||||
if (!ndb || !ndb->enquire) {
|
||||
@ -1199,12 +1280,15 @@ bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
|
||||
ndb->mset.get_matches_lower_bound()));
|
||||
|
||||
Xapian::Document xdoc = ndb->mset[xapi-first].get_document();
|
||||
Xapian::docid docid = *(ndb->mset[xapi-first]);
|
||||
if (percent)
|
||||
*percent = ndb->mset.convert_to_percent(ndb->mset[xapi-first]);
|
||||
|
||||
// Parse xapian document's data and populate doc fields
|
||||
string data = xdoc.get_data();
|
||||
return dbDataToRclDoc(data, doc);
|
||||
list<string> terms;
|
||||
getQueryTerms(terms);
|
||||
return ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
|
||||
}
|
||||
|
||||
// Retrieve document defined by file name and internal path. Very inefficient,
|
||||
@ -1237,7 +1321,9 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
||||
|
||||
Xapian::Document xdoc = ndb->db.get_document(*docid);
|
||||
string data = xdoc.get_data();
|
||||
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
|
||||
list<string> terms;
|
||||
if (ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms)
|
||||
&& doc.ipath == ipath)
|
||||
return true;
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
@ -1258,3 +1344,123 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Width of a sample extract around a query term
|
||||
//
|
||||
// We build a possibly full size but sparsely populated (only around
|
||||
// the search term) reconstruction of the document. It would be
|
||||
// possible to compress the array, by having only multiple chunks
|
||||
// around the terms, but this would seriously complicate the data
|
||||
// structure.
|
||||
#define EXTRACT_WIDTH 3
|
||||
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
||||
{
|
||||
Chrono chron;
|
||||
// A buffer that we populate with the document terms, at their position
|
||||
vector<string> buf;
|
||||
|
||||
// Go through the list of query terms. For each entry in each
|
||||
// position list, populate the slot in the document buffer, and
|
||||
// remember the position and its neigbours
|
||||
vector<unsigned int> qtermposs; // The term positions
|
||||
set<unsigned int> chunkposs; // All the positions we shall populate
|
||||
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
||||
qit++) {
|
||||
Xapian::PositionIterator pos;
|
||||
// There may be query terms not in this doc. This raises an
|
||||
// exception when requesting the position list, we just catch it.
|
||||
try {
|
||||
unsigned int occurrences = 0;
|
||||
for (pos = db.positionlist_begin(docid, *qit);
|
||||
pos != db.positionlist_end(docid, *qit); pos++) {
|
||||
unsigned int ipos = *pos;
|
||||
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
||||
// Possibly extend the array. Do it in big chunks
|
||||
if (ipos + EXTRACT_WIDTH >= buf.size()) {
|
||||
buf.resize(ipos + EXTRACT_WIDTH + 1000);
|
||||
}
|
||||
buf[ipos] = *qit;
|
||||
// Remember the term position
|
||||
qtermposs.push_back(ipos);
|
||||
// Add adjacent slots to the set to populate at next step
|
||||
for (unsigned int ii = MAX(0, ipos-EXTRACT_WIDTH);
|
||||
ii <= MIN(ipos+EXTRACT_WIDTH, buf.size()-1); ii++) {
|
||||
chunkposs.insert(ii);
|
||||
}
|
||||
// Limit the number of occurences we keep for each
|
||||
// term. The abstract has a finite length anyway !
|
||||
if (occurrences++ > 10)
|
||||
break;
|
||||
}
|
||||
} catch (...) {
|
||||
}
|
||||
}
|
||||
|
||||
LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n",
|
||||
chron.millis(), qtermposs.size()));
|
||||
|
||||
// Walk the full document position list and populate slots around
|
||||
// the query terms. We arbitrarily truncate the list to avoid
|
||||
// taking forever. If we do cutoff, the abstract may be
|
||||
// inconsistant, which is bad...
|
||||
{ Xapian::TermIterator term;
|
||||
int cutoff = 500 * 1000;
|
||||
for (term = db.termlist_begin(docid);
|
||||
term != db.termlist_end(docid); term++) {
|
||||
Xapian::PositionIterator pos;
|
||||
for (pos = db.positionlist_begin(docid, *term);
|
||||
pos != db.positionlist_end(docid, *term); pos++) {
|
||||
if (cutoff-- < 0)
|
||||
break;
|
||||
unsigned int ipos = *pos;
|
||||
if (chunkposs.find(ipos) != chunkposs.end()) {
|
||||
buf[ipos] = *term;
|
||||
}
|
||||
}
|
||||
if (cutoff-- < 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
LOGDEB1(("Abstract:%d: randomizing and extracting\n", chron.millis()));
|
||||
|
||||
// We randomize the selection of term positions, from which we
|
||||
// shall pull, starting at the beginning, until the abstract is
|
||||
// big enough. The abstract is finally built in correct position
|
||||
// order, thanks to the position map.
|
||||
random_shuffle(qtermposs.begin(), qtermposs.end());
|
||||
map<unsigned int, string> mabs;
|
||||
unsigned int abslen = 0;
|
||||
LOGDEB1(("Abstract:%d: extracting\n", chron.millis()));
|
||||
// Extract data around the first (in random order) term positions,
|
||||
// and store the chunks in the map
|
||||
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
|
||||
it != qtermposs.end(); it++) {
|
||||
unsigned int ipos = *it;
|
||||
unsigned int start = MAX(0, ipos-EXTRACT_WIDTH);
|
||||
unsigned int end = MIN(ipos+EXTRACT_WIDTH, buf.size()-1);
|
||||
string chunk;
|
||||
for (unsigned int ii = start; ii <= end; ii++) {
|
||||
if (!buf[ii].empty()) {
|
||||
chunk += buf[ii] + " ";
|
||||
abslen += buf[ii].length();
|
||||
}
|
||||
if (abslen > 300)
|
||||
break;
|
||||
}
|
||||
if (end != buf.size()-1)
|
||||
chunk += "... ";
|
||||
mabs[ipos] = chunk;
|
||||
if (abslen > 300)
|
||||
break;
|
||||
}
|
||||
|
||||
// Build the abstract by walking the map (in order of position)
|
||||
string abstract;
|
||||
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
||||
it != mabs.end(); it++) {
|
||||
abstract += (*it).second;
|
||||
}
|
||||
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
||||
return abstract;
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.22 2006-01-11 15:08:21 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.23 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -31,7 +31,7 @@ namespace Rcl {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Dumb bunch holder for document attributes and data
|
||||
* Dumb holder for document attributes and data
|
||||
*/
|
||||
class Doc {
|
||||
public:
|
||||
@ -45,7 +45,11 @@ class Doc {
|
||||
string title;
|
||||
string keywords;
|
||||
string abstract;
|
||||
string fbytes; // File size
|
||||
string dbytes; // Doc size
|
||||
|
||||
// The following fields don't go to the db. text is only used when
|
||||
// indexing
|
||||
string text;
|
||||
|
||||
int pc; // used by sortseq, convenience
|
||||
@ -60,6 +64,8 @@ class Doc {
|
||||
title.erase();
|
||||
keywords.erase();
|
||||
abstract.erase();
|
||||
fbytes.erase();
|
||||
dbytes.erase();
|
||||
|
||||
text.erase();
|
||||
}
|
||||
@ -79,28 +85,36 @@ class AdvSearchData {
|
||||
string description; // Printable expanded version of the complete query
|
||||
// returned after setQuery.
|
||||
void erase() {
|
||||
allwords.erase();phrase.erase();orwords.erase();nowords.erase();
|
||||
filetypes.clear(); topdir.erase();
|
||||
description.clear();
|
||||
allwords.erase();
|
||||
phrase.erase();
|
||||
orwords.erase();
|
||||
nowords.erase();
|
||||
filetypes.clear();
|
||||
topdir.erase();
|
||||
description.erase();
|
||||
}
|
||||
};
|
||||
|
||||
class DbPops;
|
||||
class DbPops;
|
||||
|
||||
/**
|
||||
* Wrapper class for the native database.
|
||||
*/
|
||||
class Db {
|
||||
public:
|
||||
public:
|
||||
Db();
|
||||
~Db();
|
||||
|
||||
enum OpenMode {DbRO, DbUpd, DbTrunc};
|
||||
bool open(const string &dbdir, OpenMode mode);
|
||||
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_BUILD_ABSTRACT = 2,
|
||||
QO_REPLACE_ABSTRACT = 4};
|
||||
|
||||
bool open(const string &dbdir, OpenMode mode, int qops = 0);
|
||||
bool close();
|
||||
bool isopen();
|
||||
|
||||
// Update-related functions
|
||||
bool add(const string &filename, const Doc &doc);
|
||||
bool add(const string &filename, const Doc &doc, const struct stat *stp);
|
||||
bool needUpdate(const string &filename, const struct stat *stp);
|
||||
bool purge();
|
||||
bool createStemDb(const string &lang);
|
||||
@ -109,7 +123,6 @@ public:
|
||||
// Query-related functions
|
||||
|
||||
// Parse query string and initialize query
|
||||
enum QueryOpts {QO_NONE=0, QO_STEM = 1};
|
||||
bool setQuery(const string &q, QueryOpts opts = QO_NONE,
|
||||
const string& stemlang = "english");
|
||||
bool setQuery(AdvSearchData &q, QueryOpts opts = QO_NONE,
|
||||
@ -143,10 +156,11 @@ private:
|
||||
// db indices that match
|
||||
void *pdata; // Pointer to private data. We don't want db(ie
|
||||
// xapian)-specific defs to show in here
|
||||
unsigned int m_qOpts;
|
||||
|
||||
/* Copy constructor and assignment operator are private and forbidden */
|
||||
Db(const Db &) {}
|
||||
Db & operator=(const Db &) {return *this;};
|
||||
bool dbDataToRclDoc(std::string &data, Doc &doc);
|
||||
};
|
||||
|
||||
// Unaccent and lowercase data.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user