abstract building from position data
This commit is contained in:
parent
44b2aa534c
commit
52aaa52754
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.53 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.54 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -45,6 +45,13 @@ using namespace std;
|
|||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
#include <xapian/stem.h>
|
#include <xapian/stem.h>
|
||||||
|
|
||||||
|
// Classic two-argument maximum/minimum helpers, only defined when the
// platform headers did not already provide them.
//
// Every use of an argument, and the whole expansion, is parenthesized so
// that operands containing lower-precedence operators (e.g. MAX(a & b, c))
// and surrounding expressions (e.g. 2 * MAX(a, b)) group as written.
// Note that each argument may still be evaluated twice: do not pass
// expressions with side effects.
#ifndef MAX
#define MAX(A,B) (((A) > (B)) ? (A) : (B))
#endif
#ifndef MIN
#define MIN(A,B) (((A) < (B)) ? (A) : (B))
#endif
|
||||||
|
|
||||||
// Data for a xapian database. There could actually be 2 different
|
// Data for a xapian database. There could actually be 2 different
|
||||||
// ones for indexing or query as there is not much in common.
|
// ones for indexing or query as there is not much in common.
|
||||||
class Native {
|
class Native {
|
||||||
@ -64,6 +71,12 @@ class Native {
|
|||||||
Xapian::Enquire *enquire;
|
Xapian::Enquire *enquire;
|
||||||
Xapian::MSet mset;
|
Xapian::MSet mset;
|
||||||
|
|
||||||
|
string makeAbstract(Xapian::docid id, const list<string>& terms);
|
||||||
|
bool dbDataToRclDoc(std::string &data, Rcl::Doc &doc,
|
||||||
|
int qopts,
|
||||||
|
Xapian::docid docid,
|
||||||
|
const list<string>& terms);
|
||||||
|
|
||||||
Native() : isopen(false), iswritable(false), enquire(0) { }
|
Native() : isopen(false), iswritable(false), enquire(0) { }
|
||||||
~Native() {
|
~Native() {
|
||||||
delete enquire;
|
delete enquire;
|
||||||
@ -73,6 +86,7 @@ class Native {
|
|||||||
Rcl::Db::Db()
|
Rcl::Db::Db()
|
||||||
{
|
{
|
||||||
pdata = new Native;
|
pdata = new Native;
|
||||||
|
m_qOpts = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
Rcl::Db::~Db()
|
Rcl::Db::~Db()
|
||||||
@ -105,13 +119,14 @@ Rcl::Db::~Db()
|
|||||||
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
|
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Rcl::Db::open(const string& dir, OpenMode mode)
|
bool Rcl::Db::open(const string& dir, OpenMode mode, int qops)
|
||||||
{
|
{
|
||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
return false;
|
return false;
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
LOGDEB(("Db::open: isopen %d iswritable %d\n", ndb->isopen,
|
LOGDEB(("Db::open: isopen %d iswritable %d\n", ndb->isopen,
|
||||||
ndb->iswritable));
|
ndb->iswritable));
|
||||||
|
m_qOpts = qops;
|
||||||
|
|
||||||
if (ndb->isopen) {
|
if (ndb->isopen) {
|
||||||
LOGERR(("Rcl::Db::open: already open\n"));
|
LOGERR(("Rcl::Db::open: already open\n"));
|
||||||
@ -268,7 +283,7 @@ bool Rcl::dumb_string(const string &in, string &out)
|
|||||||
/* From omindex direct */
|
/* From omindex direct */
|
||||||
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
||||||
* if reasonably possible. */
|
* if reasonably possible. */
|
||||||
string
|
static string
|
||||||
truncate_to_word(string & input, string::size_type maxlen)
|
truncate_to_word(string & input, string::size_type maxlen)
|
||||||
{
|
{
|
||||||
string output;
|
string output;
|
||||||
@ -292,32 +307,63 @@ truncate_to_word(string & input, string::size_type maxlen)
|
|||||||
|
|
||||||
output += " ...";
|
output += " ...";
|
||||||
}
|
}
|
||||||
// No need to replace newlines with spaces, we do this in dumb_string()
|
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove every character listed in delims from str.
//
// The input is cut into maximal runs of non-delimiter characters; each run
// is copied to the result followed by exactly one space. Consecutive,
// leading and trailing delimiters therefore collapse, and any non-empty
// result ends with a space. An input made only of delimiters (or an empty
// input) yields an empty string.
static std::string stripchars(const std::string &str, std::string delims)
{
    std::string result;

    // Start of the first run of characters to keep, if there is one
    std::string::size_type runBegin = str.find_first_not_of(delims);
    while (runBegin != std::string::npos) {
        // The run stops at the next delimiter or at the end of the input
        const std::string::size_type runEnd =
            str.find_first_of(delims, runBegin);
        if (runEnd == std::string::npos) {
            // Last run: take everything up to the end and stop
            result += str.substr(runBegin) + " ";
            break;
        }
        result += str.substr(runBegin, runEnd - runBegin) + " ";
        // Skip the delimiters which follow this run
        runBegin = str.find_first_not_of(delims, runEnd);
    }
    return result;
}
|
||||||
|
|
||||||
// Truncate longer path and uniquize with hash . The goal for this is
|
// Truncate longer path and uniquize with hash . The goal for this is
|
||||||
// to avoid xapian max term length limitations, not to gain space (we
|
// to avoid xapian max term length limitations, not to gain space (we
|
||||||
// gain very little even with very short maxlens like 30)
|
// gain very little even with very short maxlens like 30)
|
||||||
#define PATHHASHLEN 150
|
#define PATHHASHLEN 150
|
||||||
|
|
||||||
|
#define ABSTRACT_SIZE 200
|
||||||
|
const static string rclSyntAbs = "?!#@";
|
||||||
|
|
||||||
// Add document in internal form to the database: index the terms in
|
// Add document in internal form to the database: index the terms in
|
||||||
// the title abstract and body and add special terms for file name,
|
// the title abstract and body and add special terms for file name,
|
||||||
// date, mime type ... , create the document data record (more
|
// date, mime type ... , create the document data record (more
|
||||||
// metadata), and update database
|
// metadata), and update database
|
||||||
bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc,
|
||||||
|
const struct stat *stp)
|
||||||
{
|
{
|
||||||
LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
|
LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
|
||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
return false;
|
return false;
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
|
|
||||||
// Truncate abstract, title and keywords to reasonable lengths
|
|
||||||
Rcl::Doc doc = idoc;
|
Rcl::Doc doc = idoc;
|
||||||
if (doc.abstract.empty())
|
|
||||||
doc.abstract = truncate_to_word(doc.text, 100);
|
// Truncate abstract, title and keywords to reasonable lengths. If
|
||||||
else
|
// abstract is currently empty, we make up one with the beginning
|
||||||
doc.abstract = truncate_to_word(doc.abstract, 100);
|
// of the document.
|
||||||
|
if (doc.abstract.empty()) {
|
||||||
|
doc.abstract = rclSyntAbs +
|
||||||
|
truncate_to_word(doc.text, ABSTRACT_SIZE);
|
||||||
|
} else {
|
||||||
|
doc.abstract = truncate_to_word(doc.abstract, ABSTRACT_SIZE);
|
||||||
|
}
|
||||||
|
doc.abstract = stripchars(doc.abstract, "\n\r");
|
||||||
doc.title = truncate_to_word(doc.title, 100);
|
doc.title = truncate_to_word(doc.title, 100);
|
||||||
doc.keywords = truncate_to_word(doc.keywords, 300);
|
doc.keywords = truncate_to_word(doc.keywords, 300);
|
||||||
|
|
||||||
@ -417,12 +463,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
record += "\ndmtime=" + doc.dmtime;
|
record += "\ndmtime=" + doc.dmtime;
|
||||||
}
|
}
|
||||||
record += "\norigcharset=" + doc.origcharset;
|
record += "\norigcharset=" + doc.origcharset;
|
||||||
record += "\ncaption=" + doc.title;
|
char sizebuf[20];
|
||||||
record += "\nkeywords=" + doc.keywords;
|
sizebuf[0] = 0;
|
||||||
record += "\nabstract=" + doc.abstract;
|
if (stp)
|
||||||
|
sprintf(sizebuf, "%ld", (long)stp->st_size);
|
||||||
|
if (sizebuf[0])
|
||||||
|
record += string("\nfbytes=") + sizebuf;
|
||||||
|
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
||||||
|
record += string("\ndbytes=") + sizebuf;
|
||||||
if (!doc.ipath.empty()) {
|
if (!doc.ipath.empty()) {
|
||||||
record += "\nipath=" + doc.ipath;
|
record += "\nipath=" + doc.ipath;
|
||||||
}
|
}
|
||||||
|
record += "\ncaption=" + doc.title;
|
||||||
|
record += "\nkeywords=" + doc.keywords;
|
||||||
|
record += "\nabstract=" + doc.abstract;
|
||||||
record += "\n";
|
record += "\n";
|
||||||
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
||||||
newdocument.set_data(record);
|
newdocument.set_data(record);
|
||||||
@ -812,6 +866,7 @@ static list<string> stemexpand(Native *ndb, string term, const string& lang)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Splitter callback for breaking query into terms
|
||||||
class wsQData : public TextSplitCB {
|
class wsQData : public TextSplitCB {
|
||||||
public:
|
public:
|
||||||
vector<string> terms;
|
vector<string> terms;
|
||||||
@ -836,7 +891,6 @@ class wsQData : public TextSplitCB {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// Turn string into list of xapian queries. There is little
|
// Turn string into list of xapian queries. There is little
|
||||||
// interpretation done on the string (no +term -term or filename:term
|
// interpretation done on the string (no +term -term or filename:term
|
||||||
// stuff). We just separate words and phrases, and interpret
|
// stuff). We just separate words and phrases, and interpret
|
||||||
@ -927,7 +981,6 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
|
|||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
if (!ndb)
|
if (!ndb)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
asdata.erase();
|
asdata.erase();
|
||||||
dbindices.clear();
|
dbindices.clear();
|
||||||
list<Xapian::Query> pqueries;
|
list<Xapian::Query> pqueries;
|
||||||
@ -950,6 +1003,7 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts,
|
|||||||
LOGDEB((" phrase: %s\n", sdata.phrase.c_str()));
|
LOGDEB((" phrase: %s\n", sdata.phrase.c_str()));
|
||||||
LOGDEB((" orwords: %s\n", sdata.orwords.c_str()));
|
LOGDEB((" orwords: %s\n", sdata.orwords.c_str()));
|
||||||
LOGDEB((" nowords: %s\n", sdata.nowords.c_str()));
|
LOGDEB((" nowords: %s\n", sdata.nowords.c_str()));
|
||||||
|
|
||||||
string ft;
|
string ft;
|
||||||
for (list<string>::iterator it = sdata.filetypes.begin();
|
for (list<string>::iterator it = sdata.filetypes.begin();
|
||||||
it != sdata.filetypes.end(); it++) {ft += *it + " ";}
|
it != sdata.filetypes.end(); it++) {ft += *it + " ";}
|
||||||
@ -1053,6 +1107,8 @@ bool Rcl::Db::getQueryTerms(list<string>& terms)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const int qquantum = 30;
|
||||||
|
|
||||||
int Rcl::Db::getResCnt()
|
int Rcl::Db::getResCnt()
|
||||||
{
|
{
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
@ -1060,8 +1116,19 @@ int Rcl::Db::getResCnt()
|
|||||||
LOGERR(("Rcl::Db::getResCnt: no query opened\n"));
|
LOGERR(("Rcl::Db::getResCnt: no query opened\n"));
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (ndb->mset.size() <= 0)
|
if (ndb->mset.size() <= 0) {
|
||||||
return -1;
|
try {
|
||||||
|
ndb->mset = ndb->enquire->get_mset(0, qquantum);
|
||||||
|
} catch (const Xapian::DatabaseModifiedError &error) {
|
||||||
|
ndb->db.reopen();
|
||||||
|
ndb->mset = ndb->enquire->get_mset(0, qquantum);
|
||||||
|
} catch (const Xapian::Error & error) {
|
||||||
|
LOGERR(("enquire->get_mset: exception: %s\n",
|
||||||
|
error.get_msg().c_str()));
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return ndb->mset.get_matches_lower_bound();
|
return ndb->mset.get_matches_lower_bound();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1085,7 +1152,9 @@ class Rcl::DbPops {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
|
bool Native::dbDataToRclDoc(std::string &data, Rcl::Doc &doc,
|
||||||
|
int qopts,
|
||||||
|
Xapian::docid docid, const list<string>& terms)
|
||||||
{
|
{
|
||||||
LOGDEB1(("Rcl::Db::dbDataToRclDoc: data: %s\n", data.c_str()));
|
LOGDEB1(("Rcl::Db::dbDataToRclDoc: data: %s\n", data.c_str()));
|
||||||
ConfSimple parms(&data);
|
ConfSimple parms(&data);
|
||||||
@ -1099,7 +1168,20 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
|
|||||||
parms.get(string("caption"), doc.title);
|
parms.get(string("caption"), doc.title);
|
||||||
parms.get(string("keywords"), doc.keywords);
|
parms.get(string("keywords"), doc.keywords);
|
||||||
parms.get(string("abstract"), doc.abstract);
|
parms.get(string("abstract"), doc.abstract);
|
||||||
|
bool syntabs = false;
|
||||||
|
if (doc.abstract.find(rclSyntAbs) == 0) {
|
||||||
|
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
||||||
|
syntabs = true;
|
||||||
|
}
|
||||||
|
if ((qopts && Rcl::Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
||||||
|
LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n"));
|
||||||
|
if (doc.abstract.empty() || syntabs ||
|
||||||
|
(qopts & Rcl::Db::QO_REPLACE_ABSTRACT))
|
||||||
|
doc.abstract = makeAbstract(docid, terms);
|
||||||
|
}
|
||||||
parms.get(string("ipath"), doc.ipath);
|
parms.get(string("ipath"), doc.ipath);
|
||||||
|
parms.get(string("fbytes"), doc.fbytes);
|
||||||
|
parms.get(string("dbytes"), doc.dbytes);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1114,7 +1196,6 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
|
|||||||
// that dont match the filter).
|
// that dont match the filter).
|
||||||
bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
|
bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
|
||||||
{
|
{
|
||||||
const int qquantum = 30;
|
|
||||||
LOGDEB1(("Rcl::Db::getDoc: exti %d\n", exti));
|
LOGDEB1(("Rcl::Db::getDoc: exti %d\n", exti));
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
if (!ndb || !ndb->enquire) {
|
if (!ndb || !ndb->enquire) {
|
||||||
@ -1199,12 +1280,15 @@ bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
|
|||||||
ndb->mset.get_matches_lower_bound()));
|
ndb->mset.get_matches_lower_bound()));
|
||||||
|
|
||||||
Xapian::Document xdoc = ndb->mset[xapi-first].get_document();
|
Xapian::Document xdoc = ndb->mset[xapi-first].get_document();
|
||||||
|
Xapian::docid docid = *(ndb->mset[xapi-first]);
|
||||||
if (percent)
|
if (percent)
|
||||||
*percent = ndb->mset.convert_to_percent(ndb->mset[xapi-first]);
|
*percent = ndb->mset.convert_to_percent(ndb->mset[xapi-first]);
|
||||||
|
|
||||||
// Parse xapian document's data and populate doc fields
|
// Parse xapian document's data and populate doc fields
|
||||||
string data = xdoc.get_data();
|
string data = xdoc.get_data();
|
||||||
return dbDataToRclDoc(data, doc);
|
list<string> terms;
|
||||||
|
getQueryTerms(terms);
|
||||||
|
return ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Retrieve document defined by file name and internal path. Very inefficient,
|
// Retrieve document defined by file name and internal path. Very inefficient,
|
||||||
@ -1237,7 +1321,9 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
|||||||
|
|
||||||
Xapian::Document xdoc = ndb->db.get_document(*docid);
|
Xapian::Document xdoc = ndb->db.get_document(*docid);
|
||||||
string data = xdoc.get_data();
|
string data = xdoc.get_data();
|
||||||
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
|
list<string> terms;
|
||||||
|
if (ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms)
|
||||||
|
&& doc.ipath == ipath)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
@ -1258,3 +1344,123 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Width of a sample extract around a query term
|
||||||
|
//
|
||||||
|
// We build a possibly full size but sparsely populated (only around
|
||||||
|
// the search term) reconstruction of the document. It would be
|
||||||
|
// possible to compress the array, by having only multiple chunks
|
||||||
|
// around the terms, but this would seriously complicate the data
|
||||||
|
// structure.
|
||||||
|
#define EXTRACT_WIDTH 3
|
||||||
|
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
||||||
|
{
|
||||||
|
Chrono chron;
|
||||||
|
// A buffer that we populate with the document terms, at their position
|
||||||
|
vector<string> buf;
|
||||||
|
|
||||||
|
// Go through the list of query terms. For each entry in each
|
||||||
|
// position list, populate the slot in the document buffer, and
|
||||||
|
// remember the position and its neigbours
|
||||||
|
vector<unsigned int> qtermposs; // The term positions
|
||||||
|
set<unsigned int> chunkposs; // All the positions we shall populate
|
||||||
|
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
||||||
|
qit++) {
|
||||||
|
Xapian::PositionIterator pos;
|
||||||
|
// There may be query terms not in this doc. This raises an
|
||||||
|
// exception when requesting the position list, we just catch it.
|
||||||
|
try {
|
||||||
|
unsigned int occurrences = 0;
|
||||||
|
for (pos = db.positionlist_begin(docid, *qit);
|
||||||
|
pos != db.positionlist_end(docid, *qit); pos++) {
|
||||||
|
unsigned int ipos = *pos;
|
||||||
|
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
||||||
|
// Possibly extend the array. Do it in big chunks
|
||||||
|
if (ipos + EXTRACT_WIDTH >= buf.size()) {
|
||||||
|
buf.resize(ipos + EXTRACT_WIDTH + 1000);
|
||||||
|
}
|
||||||
|
buf[ipos] = *qit;
|
||||||
|
// Remember the term position
|
||||||
|
qtermposs.push_back(ipos);
|
||||||
|
// Add adjacent slots to the set to populate at next step
|
||||||
|
for (unsigned int ii = MAX(0, ipos-EXTRACT_WIDTH);
|
||||||
|
ii <= MIN(ipos+EXTRACT_WIDTH, buf.size()-1); ii++) {
|
||||||
|
chunkposs.insert(ii);
|
||||||
|
}
|
||||||
|
// Limit the number of occurences we keep for each
|
||||||
|
// term. The abstract has a finite length anyway !
|
||||||
|
if (occurrences++ > 10)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch (...) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n",
|
||||||
|
chron.millis(), qtermposs.size()));
|
||||||
|
|
||||||
|
// Walk the full document position list and populate slots around
|
||||||
|
// the query terms. We arbitrarily truncate the list to avoid
|
||||||
|
// taking forever. If we do cutoff, the abstract may be
|
||||||
|
// inconsistant, which is bad...
|
||||||
|
{ Xapian::TermIterator term;
|
||||||
|
int cutoff = 500 * 1000;
|
||||||
|
for (term = db.termlist_begin(docid);
|
||||||
|
term != db.termlist_end(docid); term++) {
|
||||||
|
Xapian::PositionIterator pos;
|
||||||
|
for (pos = db.positionlist_begin(docid, *term);
|
||||||
|
pos != db.positionlist_end(docid, *term); pos++) {
|
||||||
|
if (cutoff-- < 0)
|
||||||
|
break;
|
||||||
|
unsigned int ipos = *pos;
|
||||||
|
if (chunkposs.find(ipos) != chunkposs.end()) {
|
||||||
|
buf[ipos] = *term;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cutoff-- < 0)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOGDEB1(("Abstract:%d: randomizing and extracting\n", chron.millis()));
|
||||||
|
|
||||||
|
// We randomize the selection of term positions, from which we
|
||||||
|
// shall pull, starting at the beginning, until the abstract is
|
||||||
|
// big enough. The abstract is finally built in correct position
|
||||||
|
// order, thanks to the position map.
|
||||||
|
random_shuffle(qtermposs.begin(), qtermposs.end());
|
||||||
|
map<unsigned int, string> mabs;
|
||||||
|
unsigned int abslen = 0;
|
||||||
|
LOGDEB1(("Abstract:%d: extracting\n", chron.millis()));
|
||||||
|
// Extract data around the first (in random order) term positions,
|
||||||
|
// and store the chunks in the map
|
||||||
|
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
|
||||||
|
it != qtermposs.end(); it++) {
|
||||||
|
unsigned int ipos = *it;
|
||||||
|
unsigned int start = MAX(0, ipos-EXTRACT_WIDTH);
|
||||||
|
unsigned int end = MIN(ipos+EXTRACT_WIDTH, buf.size()-1);
|
||||||
|
string chunk;
|
||||||
|
for (unsigned int ii = start; ii <= end; ii++) {
|
||||||
|
if (!buf[ii].empty()) {
|
||||||
|
chunk += buf[ii] + " ";
|
||||||
|
abslen += buf[ii].length();
|
||||||
|
}
|
||||||
|
if (abslen > 300)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (end != buf.size()-1)
|
||||||
|
chunk += "... ";
|
||||||
|
mabs[ipos] = chunk;
|
||||||
|
if (abslen > 300)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the abstract by walking the map (in order of position)
|
||||||
|
string abstract;
|
||||||
|
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
||||||
|
it != mabs.end(); it++) {
|
||||||
|
abstract += (*it).second;
|
||||||
|
}
|
||||||
|
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
||||||
|
return abstract;
|
||||||
|
}
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
/* @(#$Id: rcldb.h,v 1.22 2006-01-11 15:08:21 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rcldb.h,v 1.23 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -31,7 +31,7 @@ namespace Rcl {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Dumb bunch holder for document attributes and data
|
* Dumb holder for document attributes and data
|
||||||
*/
|
*/
|
||||||
class Doc {
|
class Doc {
|
||||||
public:
|
public:
|
||||||
@ -45,7 +45,11 @@ class Doc {
|
|||||||
string title;
|
string title;
|
||||||
string keywords;
|
string keywords;
|
||||||
string abstract;
|
string abstract;
|
||||||
|
string fbytes; // File size
|
||||||
|
string dbytes; // Doc size
|
||||||
|
|
||||||
|
// The following fields don't go to the db. text is only used when
|
||||||
|
// indexing
|
||||||
string text;
|
string text;
|
||||||
|
|
||||||
int pc; // used by sortseq, convenience
|
int pc; // used by sortseq, convenience
|
||||||
@ -60,6 +64,8 @@ class Doc {
|
|||||||
title.erase();
|
title.erase();
|
||||||
keywords.erase();
|
keywords.erase();
|
||||||
abstract.erase();
|
abstract.erase();
|
||||||
|
fbytes.erase();
|
||||||
|
dbytes.erase();
|
||||||
|
|
||||||
text.erase();
|
text.erase();
|
||||||
}
|
}
|
||||||
@ -79,28 +85,36 @@ class AdvSearchData {
|
|||||||
string description; // Printable expanded version of the complete query
|
string description; // Printable expanded version of the complete query
|
||||||
// returned after setQuery.
|
// returned after setQuery.
|
||||||
void erase() {
|
void erase() {
|
||||||
allwords.erase();phrase.erase();orwords.erase();nowords.erase();
|
allwords.erase();
|
||||||
filetypes.clear(); topdir.erase();
|
phrase.erase();
|
||||||
description.clear();
|
orwords.erase();
|
||||||
|
nowords.erase();
|
||||||
|
filetypes.clear();
|
||||||
|
topdir.erase();
|
||||||
|
description.erase();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class DbPops;
|
class DbPops;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper class for the native database.
|
* Wrapper class for the native database.
|
||||||
*/
|
*/
|
||||||
class Db {
|
class Db {
|
||||||
public:
|
public:
|
||||||
Db();
|
Db();
|
||||||
~Db();
|
~Db();
|
||||||
|
|
||||||
enum OpenMode {DbRO, DbUpd, DbTrunc};
|
enum OpenMode {DbRO, DbUpd, DbTrunc};
|
||||||
bool open(const string &dbdir, OpenMode mode);
|
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_BUILD_ABSTRACT = 2,
|
||||||
|
QO_REPLACE_ABSTRACT = 4};
|
||||||
|
|
||||||
|
bool open(const string &dbdir, OpenMode mode, int qops = 0);
|
||||||
bool close();
|
bool close();
|
||||||
bool isopen();
|
bool isopen();
|
||||||
|
|
||||||
// Update-related functions
|
// Update-related functions
|
||||||
bool add(const string &filename, const Doc &doc);
|
bool add(const string &filename, const Doc &doc, const struct stat *stp);
|
||||||
bool needUpdate(const string &filename, const struct stat *stp);
|
bool needUpdate(const string &filename, const struct stat *stp);
|
||||||
bool purge();
|
bool purge();
|
||||||
bool createStemDb(const string &lang);
|
bool createStemDb(const string &lang);
|
||||||
@ -109,7 +123,6 @@ public:
|
|||||||
// Query-related functions
|
// Query-related functions
|
||||||
|
|
||||||
// Parse query string and initialize query
|
// Parse query string and initialize query
|
||||||
enum QueryOpts {QO_NONE=0, QO_STEM = 1};
|
|
||||||
bool setQuery(const string &q, QueryOpts opts = QO_NONE,
|
bool setQuery(const string &q, QueryOpts opts = QO_NONE,
|
||||||
const string& stemlang = "english");
|
const string& stemlang = "english");
|
||||||
bool setQuery(AdvSearchData &q, QueryOpts opts = QO_NONE,
|
bool setQuery(AdvSearchData &q, QueryOpts opts = QO_NONE,
|
||||||
@ -143,10 +156,11 @@ private:
|
|||||||
// db indices that match
|
// db indices that match
|
||||||
void *pdata; // Pointer to private data. We don't want db(ie
|
void *pdata; // Pointer to private data. We don't want db(ie
|
||||||
// xapian)-specific defs to show in here
|
// xapian)-specific defs to show in here
|
||||||
|
unsigned int m_qOpts;
|
||||||
|
|
||||||
/* Copyconst and assignemt private and forbidden */
|
/* Copyconst and assignemt private and forbidden */
|
||||||
Db(const Db &) {}
|
Db(const Db &) {}
|
||||||
Db & operator=(const Db &) {return *this;};
|
Db & operator=(const Db &) {return *this;};
|
||||||
bool dbDataToRclDoc(std::string &data, Doc &doc);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Unaccent and lowercase data.
|
// Unaccent and lowercase data.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user