From 52aaa5275447113a54e07d14427b4413ba43b2c5 Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 26 Jan 2006 12:28:50 +0000 Subject: [PATCH] abstract building from position data --- src/rcldb/rcldb.cpp | 248 ++++++++++++++++++++++++++++++++++++++++---- src/rcldb/rcldb.h | 36 +++++-- 2 files changed, 252 insertions(+), 32 deletions(-) diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 99c0f7ca..a1090610 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.53 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.54 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -45,6 +45,13 @@ using namespace std; #include "xapian.h" #include +#ifndef MAX +#define MAX(A,B) (A>B?A:B) +#endif +#ifndef MIN +#define MIN(A,B) (A& terms); + bool dbDataToRclDoc(std::string &data, Rcl::Doc &doc, + int qopts, + Xapian::docid docid, + const list& terms); + Native() : isopen(false), iswritable(false), enquire(0) { } ~Native() { delete enquire; @@ -73,6 +86,7 @@ class Native { Rcl::Db::Db() { pdata = new Native; + m_qOpts = 0; } Rcl::Db::~Db() @@ -105,13 +119,14 @@ Rcl::Db::~Db() LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg)); } -bool Rcl::Db::open(const string& dir, OpenMode mode) +bool Rcl::Db::open(const string& dir, OpenMode mode, int qops) { if (pdata == 0) return false; Native *ndb = (Native *)pdata; LOGDEB(("Db::open: isopen %d iswritable %d\n", ndb->isopen, ndb->iswritable)); + m_qOpts = qops; if (ndb->isopen) { LOGERR(("Rcl::Db::open: already open\n")); @@ -268,7 +283,7 @@ bool Rcl::dumb_string(const string &in, string &out) /* From omindex direct */ /* Truncate a string to a given maxlength, avoiding cutting off midword * if reasonably possible. */ -string +static string truncate_to_word(string & input, string::size_type maxlen) { string output; @@ -292,32 +307,63 @@ truncate_to_word(string & input, string::size_type maxlen) output += " ..."; } - // No need to replace newlines with spaces, we do this in dumb_string() return output; } +// remove some chars and replace them with spaces +static string stripchars(const string &str, string delims) +{ + string out; + string::size_type startPos, pos; + + for (pos = 0;;) { + // Skip initial delims, break if this eats all. + if ((startPos = str.find_first_not_of(delims, pos)) == string::npos) + break; + // Find next delimiter or end of string (end of token) + pos = str.find_first_of(delims, startPos); + // Add token to the vector. Note: token cant be empty here + if (pos == string::npos) { + out += str.substr(startPos) + " "; + } else { + out += str.substr(startPos, pos - startPos) + " "; + } + } + return out; +} + // Truncate longer path and uniquize with hash . The goal for this is // to avoid xapian max term length limitations, not to gain space (we // gain very little even with very short maxlens like 30) #define PATHHASHLEN 150 +#define ABSTRACT_SIZE 200 +const static string rclSyntAbs = "?!#@"; + // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, // date, mime type ... 
, create the document data record (more // metadata), and update database -bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) +bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc, + const struct stat *stp) { LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str())); if (pdata == 0) return false; Native *ndb = (Native *)pdata; - // Truncate abstract, title and keywords to reasonable lengths Rcl::Doc doc = idoc; - if (doc.abstract.empty()) - doc.abstract = truncate_to_word(doc.text, 100); - else - doc.abstract = truncate_to_word(doc.abstract, 100); + + // Truncate abstract, title and keywords to reasonable lengths. If + // abstract is currently empty, we make up one with the beginning + // of the document. + if (doc.abstract.empty()) { + doc.abstract = rclSyntAbs + + truncate_to_word(doc.text, ABSTRACT_SIZE); + } else { + doc.abstract = truncate_to_word(doc.abstract, ABSTRACT_SIZE); + } + doc.abstract = stripchars(doc.abstract, "\n\r"); doc.title = truncate_to_word(doc.title, 100); doc.keywords = truncate_to_word(doc.keywords, 300); @@ -417,12 +463,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) record += "\ndmtime=" + doc.dmtime; } record += "\norigcharset=" + doc.origcharset; - record += "\ncaption=" + doc.title; - record += "\nkeywords=" + doc.keywords; - record += "\nabstract=" + doc.abstract; + char sizebuf[20]; + sizebuf[0] = 0; + if (stp) + sprintf(sizebuf, "%ld", (long)stp->st_size); + if (sizebuf[0]) + record += string("\nfbytes=") + sizebuf; + sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); + record += string("\ndbytes=") + sizebuf; if (!doc.ipath.empty()) { record += "\nipath=" + doc.ipath; } + record += "\ncaption=" + doc.title; + record += "\nkeywords=" + doc.keywords; + record += "\nabstract=" + doc.abstract; record += "\n"; LOGDEB1(("Newdocument data: %s\n", record.c_str())); newdocument.set_data(record); @@ -812,6 +866,7 @@ static list stemexpand(Native *ndb, string term, const string& lang) } +// Splitter callback for breaking query into terms class wsQData : public TextSplitCB { public: vector terms; @@ -836,7 +891,6 @@ class wsQData : public TextSplitCB { } }; - // Turn string into list of xapian queries. There is little // interpretation done on the string (no +term -term or filename:term // stuff). 
We just separate words and phrases, and interpret @@ -927,7 +981,6 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, Native *ndb = (Native *)pdata; if (!ndb) return false; - asdata.erase(); dbindices.clear(); list pqueries; @@ -950,6 +1003,7 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts, LOGDEB((" phrase: %s\n", sdata.phrase.c_str())); LOGDEB((" orwords: %s\n", sdata.orwords.c_str())); LOGDEB((" nowords: %s\n", sdata.nowords.c_str())); + string ft; for (list::iterator it = sdata.filetypes.begin(); it != sdata.filetypes.end(); it++) {ft += *it + " ";} @@ -1053,6 +1107,8 @@ bool Rcl::Db::getQueryTerms(list& terms) return true; } +static const int qquantum = 30; + int Rcl::Db::getResCnt() { Native *ndb = (Native *)pdata; @@ -1060,8 +1116,19 @@ int Rcl::Db::getResCnt() LOGERR(("Rcl::Db::getResCnt: no query opened\n")); return -1; } - if (ndb->mset.size() <= 0) - return -1; + if (ndb->mset.size() <= 0) { + try { + ndb->mset = ndb->enquire->get_mset(0, qquantum); + } catch (const Xapian::DatabaseModifiedError &error) { + ndb->db.reopen(); + ndb->mset = ndb->enquire->get_mset(0, qquantum); + } catch (const Xapian::Error & error) { + LOGERR(("enquire->get_mset: exception: %s\n", + error.get_msg().c_str())); + return -1; + } + } + return ndb->mset.get_matches_lower_bound(); } @@ -1085,7 +1152,9 @@ class Rcl::DbPops { } }; -bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc) +bool Native::dbDataToRclDoc(std::string &data, Rcl::Doc &doc, + int qopts, + Xapian::docid docid, const list& terms) { LOGDEB1(("Rcl::Db::dbDataToRclDoc: data: %s\n", data.c_str())); ConfSimple parms(&data); @@ -1099,7 +1168,20 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc) parms.get(string("caption"), doc.title); parms.get(string("keywords"), doc.keywords); parms.get(string("abstract"), doc.abstract); + bool syntabs = false; + if (doc.abstract.find(rclSyntAbs) == 0) { + doc.abstract = doc.abstract.substr(rclSyntAbs.length()); + syntabs = true; + } + if ((qopts && Rcl::Db::QO_BUILD_ABSTRACT) && !terms.empty()) { + LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n")); + if (doc.abstract.empty() || syntabs || + (qopts & Rcl::Db::QO_REPLACE_ABSTRACT)) + doc.abstract = makeAbstract(docid, terms); + } parms.get(string("ipath"), doc.ipath); + parms.get(string("fbytes"), doc.fbytes); + parms.get(string("dbytes"), doc.dbytes); return true; } @@ -1114,7 +1196,6 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc) // that dont match the filter). bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent) { - const int qquantum = 30; LOGDEB1(("Rcl::Db::getDoc: exti %d\n", exti)); Native *ndb = (Native *)pdata; if (!ndb || !ndb->enquire) { @@ -1199,12 +1280,15 @@ bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent) ndb->mset.get_matches_lower_bound())); Xapian::Document xdoc = ndb->mset[xapi-first].get_document(); + Xapian::docid docid = *(ndb->mset[xapi-first]); if (percent) *percent = ndb->mset.convert_to_percent(ndb->mset[xapi-first]); // Parse xapian document's data and populate doc fields string data = xdoc.get_data(); - return dbDataToRclDoc(data, doc); + list terms; + getQueryTerms(terms); + return ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms); } // Retrieve document defined by file name and internal path. 
Very inefficient, @@ -1237,7 +1321,9 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc) Xapian::Document xdoc = ndb->db.get_document(*docid); string data = xdoc.get_data(); - if (dbDataToRclDoc(data, doc) && doc.ipath == ipath) + list terms; + if (ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms) + && doc.ipath == ipath) return true; } } catch (const Xapian::Error &e) { @@ -1258,3 +1344,123 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc) } return false; } + +// Width of a sample extract around a query term +// +// We build a possibly full size but sparsely populated (only around +// the search term) reconstruction of the document. It would be +// possible to compress the array, by having only multiple chunks +// around the terms, but this would seriously complicate the data +// structure. +#define EXTRACT_WIDTH 3 +string Native::makeAbstract(Xapian::docid docid, const list& terms) +{ + Chrono chron; + // A buffer that we populate with the document terms, at their position + vector buf; + + // Go through the list of query terms. For each entry in each + // position list, populate the slot in the document buffer, and + // remember the position and its neigbours + vector qtermposs; // The term positions + set chunkposs; // All the positions we shall populate + for (list::const_iterator qit = terms.begin(); qit != terms.end(); + qit++) { + Xapian::PositionIterator pos; + // There may be query terms not in this doc. This raises an + // exception when requesting the position list, we just catch it. + try { + unsigned int occurrences = 0; + for (pos = db.positionlist_begin(docid, *qit); + pos != db.positionlist_end(docid, *qit); pos++) { + unsigned int ipos = *pos; + LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos)); + // Possibly extend the array. Do it in big chunks + if (ipos + EXTRACT_WIDTH >= buf.size()) { + buf.resize(ipos + EXTRACT_WIDTH + 1000); + } + buf[ipos] = *qit; + // Remember the term position + qtermposs.push_back(ipos); + // Add adjacent slots to the set to populate at next step + for (unsigned int ii = MAX(0, ipos-EXTRACT_WIDTH); + ii <= MIN(ipos+EXTRACT_WIDTH, buf.size()-1); ii++) { + chunkposs.insert(ii); + } + // Limit the number of occurences we keep for each + // term. The abstract has a finite length anyway ! + if (occurrences++ > 10) + break; + } + } catch (...) { + } + } + + LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", + chron.millis(), qtermposs.size())); + + // Walk the full document position list and populate slots around + // the query terms. We arbitrarily truncate the list to avoid + // taking forever. If we do cutoff, the abstract may be + // inconsistant, which is bad... + { Xapian::TermIterator term; + int cutoff = 500 * 1000; + for (term = db.termlist_begin(docid); + term != db.termlist_end(docid); term++) { + Xapian::PositionIterator pos; + for (pos = db.positionlist_begin(docid, *term); + pos != db.positionlist_end(docid, *term); pos++) { + if (cutoff-- < 0) + break; + unsigned int ipos = *pos; + if (chunkposs.find(ipos) != chunkposs.end()) { + buf[ipos] = *term; + } + } + if (cutoff-- < 0) + break; + } + } + + LOGDEB1(("Abstract:%d: randomizing and extracting\n", chron.millis())); + + // We randomize the selection of term positions, from which we + // shall pull, starting at the beginning, until the abstract is + // big enough. The abstract is finally built in correct position + // order, thanks to the position map. 
+ random_shuffle(qtermposs.begin(), qtermposs.end()); + map mabs; + unsigned int abslen = 0; + LOGDEB1(("Abstract:%d: extracting\n", chron.millis())); + // Extract data around the first (in random order) term positions, + // and store the chunks in the map + for (vector::const_iterator it = qtermposs.begin(); + it != qtermposs.end(); it++) { + unsigned int ipos = *it; + unsigned int start = MAX(0, ipos-EXTRACT_WIDTH); + unsigned int end = MIN(ipos+EXTRACT_WIDTH, buf.size()-1); + string chunk; + for (unsigned int ii = start; ii <= end; ii++) { + if (!buf[ii].empty()) { + chunk += buf[ii] + " "; + abslen += buf[ii].length(); + } + if (abslen > 300) + break; + } + if (end != buf.size()-1) + chunk += "... "; + mabs[ipos] = chunk; + if (abslen > 300) + break; + } + + // Build the abstract by walking the map (in order of position) + string abstract; + for (map::const_iterator it = mabs.begin(); + it != mabs.end(); it++) { + abstract += (*it).second; + } + LOGDEB(("Abtract: done in %d mS\n", chron.millis())); + return abstract; +} diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index eb04ae7b..c723a9e3 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,6 +1,6 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.22 2006-01-11 15:08:21 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.23 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -31,7 +31,7 @@ namespace Rcl { #endif /** - * Dumb bunch holder for document attributes and data + * Dumb holder for document attributes and data */ class Doc { public: @@ -45,7 +45,11 @@ class Doc { string title; string keywords; string abstract; + string fbytes; // File size + string dbytes; // Doc size + // The following fields don't go to the db. text is only used when + // indexing string text; int pc; // used by sortseq, convenience @@ -60,6 +64,8 @@ class Doc { title.erase(); keywords.erase(); abstract.erase(); + fbytes.erase(); + dbytes.erase(); text.erase(); } @@ -79,28 +85,36 @@ class AdvSearchData { string description; // Printable expanded version of the complete query // returned after setQuery. void erase() { - allwords.erase();phrase.erase();orwords.erase();nowords.erase(); - filetypes.clear(); topdir.erase(); - description.clear(); + allwords.erase(); + phrase.erase(); + orwords.erase(); + nowords.erase(); + filetypes.clear(); + topdir.erase(); + description.erase(); } }; - class DbPops; +class DbPops; /** * Wrapper class for the native database. */ class Db { -public: + public: Db(); ~Db(); + enum OpenMode {DbRO, DbUpd, DbTrunc}; - bool open(const string &dbdir, OpenMode mode); + enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_BUILD_ABSTRACT = 2, + QO_REPLACE_ABSTRACT = 4}; + + bool open(const string &dbdir, OpenMode mode, int qops = 0); bool close(); bool isopen(); // Update-related functions - bool add(const string &filename, const Doc &doc); + bool add(const string &filename, const Doc &doc, const struct stat *stp); bool needUpdate(const string &filename, const struct stat *stp); bool purge(); bool createStemDb(const string &lang); @@ -109,7 +123,6 @@ public: // Query-related functions // Parse query string and initialize query - enum QueryOpts {QO_NONE=0, QO_STEM = 1}; bool setQuery(const string &q, QueryOpts opts = QO_NONE, const string& stemlang = "english"); bool setQuery(AdvSearchData &q, QueryOpts opts = QO_NONE, @@ -143,10 +156,11 @@ private: // db indices that match void *pdata; // Pointer to private data. 
We don't want db(ie // xapian)-specific defs to show in here + unsigned int m_qOpts; + /* Copyconst and assignment private and forbidden */ Db(const Db &) {} Db & operator=(const Db &) {return *this;}; - bool dbDataToRclDoc(std::string &data, Doc &doc); }; // Unaccent and lowercase data.
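
The sketch below shows how a caller might drive the new query options end to end; it is a minimal illustration, not part of the patch. It assumes the rcldb.h declarations from this revision, the getDoc(exti, doc, percent) overload defined in rcldb.cpp, and an existing index; "dbdir", "query" and printFirstAbstract are placeholder names.

// Minimal caller sketch (assumptions as stated above): open an existing
// index read-only with QO_BUILD_ABSTRACT, run a query, and print the
// abstract of the first result.
#include <iostream>
#include <string>
#include "rcldb.h"

bool printFirstAbstract(const std::string &dbdir, const std::string &query)
{
    Rcl::Db db;
    // QO_BUILD_ABSTRACT is remembered in m_qOpts and later handed to
    // Native::dbDataToRclDoc(), which calls makeAbstract() when the stored
    // abstract is empty or synthetic (or when QO_REPLACE_ABSTRACT is set).
    if (!db.open(dbdir, Rcl::Db::DbRO, Rcl::Db::QO_BUILD_ABSTRACT))
        return false;
    if (!db.setQuery(query, Rcl::Db::QO_STEM, "english"))
        return false;
    if (db.getResCnt() <= 0)
        return false;
    Rcl::Doc doc;
    int percent;
    if (!db.getDoc(0, doc, &percent))
        return false;
    std::cout << doc.title << ": " << doc.abstract << std::endl;
    return true;
}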
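
For readers who want the abstract-building technique without the Xapian plumbing, here is a self-contained sketch of the idea behind Native::makeAbstract(): drop the query terms into a sparse, position-indexed buffer, mark a small window around each hit, fill the marked slots from the full (term, position) list, and concatenate the windows in position order. It deliberately omits the length cap and the random sampling of hit positions done in the real code; kwicAbstract and its parameters are hypothetical names.

// Simplified keyword-in-context abstract, standard C++ only.
#include <algorithm>
#include <set>
#include <string>
#include <utility>
#include <vector>

typedef std::vector<std::pair<std::string, unsigned int> > TermPosList;

std::string kwicAbstract(const TermPosList &doctp,
                         const std::set<std::string> &qterms,
                         unsigned int width = 3)
{
    std::vector<std::string> buf;   // sparse reconstruction of the document
    std::set<unsigned int> wanted;  // slots to fill around each hit
    std::set<unsigned int> hits;    // positions where a query term occurs

    // First pass: place the query terms and mark their neighbourhoods.
    for (TermPosList::const_iterator it = doctp.begin(); it != doctp.end(); ++it) {
        if (!qterms.count(it->first))
            continue;
        unsigned int pos = it->second;
        if (pos + width >= buf.size())
            buf.resize(pos + width + 1);
        buf[pos] = it->first;
        hits.insert(pos);
        for (unsigned int ii = (pos > width ? pos - width : 0); ii <= pos + width; ii++)
            wanted.insert(ii);
    }

    // Second pass: fill the marked slots with whatever term occupies them.
    for (TermPosList::const_iterator it = doctp.begin(); it != doctp.end(); ++it) {
        unsigned int pos = it->second;
        if (pos < buf.size() && wanted.count(pos))
            buf[pos] = it->first;
    }

    // Emit one chunk per hit, in position order, separated by ellipses.
    std::string abstract;
    for (std::set<unsigned int>::const_iterator it = hits.begin(); it != hits.end(); ++it) {
        unsigned int start = (*it > width) ? *it - width : 0;
        unsigned int end = std::min<unsigned int>(*it + width, (unsigned int)buf.size() - 1);
        for (unsigned int ii = start; ii <= end; ii++)
            if (!buf[ii].empty())
                abstract += buf[ii] + " ";
        abstract += "... ";
    }
    return abstract;
}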