abstract building from position data

2006-01-26 12:28:50 +00:00 · 2006-01-26 12:28:50 +00:00 · 52aaa52754
commit 52aaa52754
parent 44b2aa534c
2 changed files with 252 additions and 32 deletions
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.53 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.54 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -45,6 +45,13 @@ using namespace std;
 #include "xapian.h"
 #include <xapian/stem.h>
 // Simple two-argument max/min helpers. Arguments and the whole
 // expansion are parenthesized so that operands containing
 // lower-precedence operators (e.g. ?:) are compared as a unit.
 // Note that each argument may still be evaluated twice.
 #ifndef MAX
 #define MAX(A,B) (((A) > (B)) ? (A) : (B))
 #endif
 #ifndef MIN
 #define MIN(A,B) (((A) < (B)) ? (A) : (B))
 #endif
 // Data for a xapian database. There could actually be 2 different
 // ones for indexing or query as there is not much in common.
 class Native {
@ -64,6 +71,12 @@ class Native {
    Xapian::Enquire *enquire;
    Xapian::MSet     mset;
    string makeAbstract(Xapian::docid id, const list<string>& terms);
    bool dbDataToRclDoc(std::string &data, Rcl::Doc &doc, 
 			int qopts,
 			Xapian::docid docid,
 			const list<string>& terms);
    Native() : isopen(false), iswritable(false), enquire(0) { }
    ~Native() {
 	delete enquire;
@ -73,6 +86,7 @@ class Native {
 // Construct the wrapper: allocate the private native data holder and
 // start with no query options set.
 Rcl::Db::Db() 
    : pdata(new Native), m_qOpts(0)
 {
 }
 Rcl::Db::~Db()
@ -105,13 +119,14 @@ Rcl::Db::~Db()
    LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
 }
-bool Rcl::Db::open(const string& dir, OpenMode mode)
+bool Rcl::Db::open(const string& dir, OpenMode mode, int qops)
 {
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
    LOGDEB(("Db::open: isopen %d iswritable %d\n", ndb->isopen, 
 	    ndb->iswritable));
    m_qOpts = qops;
    if (ndb->isopen) {
 	LOGERR(("Rcl::Db::open: already open\n"));
@ -268,7 +283,7 @@ bool Rcl::dumb_string(const string &in, string &out)
 /* From omindex direct */
 /* Truncate a string to a given maxlength, avoiding cutting off midword
 * if reasonably possible. */
-string
+static string
 truncate_to_word(string & input, string::size_type maxlen)
 {
    string output;
@ -292,32 +307,63 @@ truncate_to_word(string & input, string::size_type maxlen)
 	output += " ...";
    }
    // No need to replace newlines with spaces, we do this in dumb_string()
    return output;
 }
 // remove some chars and replace them with spaces
 // Split str on any of the characters in delims and glue the resulting
 // non-empty tokens back together, each one followed by a single space
 // (so a non-empty result always ends with a space).
 static string stripchars(const string &str, string delims)
 {
    string result;
    // Start of the first token, skipping any leading delimiters
    string::size_type tokBeg = str.find_first_not_of(delims, 0);
    while (tokBeg != string::npos) {
        // End of the current token: next delimiter, or end of string
        const string::size_type tokEnd = str.find_first_of(delims, tokBeg);
        if (tokEnd == string::npos) {
            // Last token runs up to the end of the input
            result.append(str, tokBeg, string::npos);
            result += " ";
            break;
        }
        result.append(str, tokBeg, tokEnd - tokBeg);
        result += " ";
        // Skip the delimiter run; npos here means nothing but delimiters
        // was left
        tokBeg = str.find_first_not_of(delims, tokEnd);
    }
    return result;
 }
 // Truncate longer paths and make them unique with a hash. The goal is
 // to avoid xapian max term length limitations, not to gain space (we
 // gain very little even with very short maxlens like 30)
 #define PATHHASHLEN 150
 #define ABSTRACT_SIZE 200
 const static string rclSyntAbs = "?!#@";
 // Add document in internal form to the database: index the terms in
 // the title abstract and body and add special terms for file name,
 // date, mime type ... , create the document data record (more
 // metadata), and update database
-bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
+bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc, 
 		  const struct stat *stp)
 {
    LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
    // Truncate abstract, title and keywords to reasonable lengths
    Rcl::Doc doc = idoc;
-    if (doc.abstract.empty()) 
+
-	doc.abstract = truncate_to_word(doc.text, 100);
+    // Truncate abstract, title and keywords to reasonable lengths. If
-    else 
+    // abstract is currently empty, we make up one with the beginning
-	doc.abstract = truncate_to_word(doc.abstract, 100);
+    // of the document.
    if (doc.abstract.empty()) {
 	doc.abstract = rclSyntAbs + 
 	    truncate_to_word(doc.text, ABSTRACT_SIZE);
    } else {
 	doc.abstract = truncate_to_word(doc.abstract, ABSTRACT_SIZE);
    }
    doc.abstract = stripchars(doc.abstract, "\n\r");
    doc.title = truncate_to_word(doc.title, 100);
    doc.keywords = truncate_to_word(doc.keywords, 300);
@ -417,12 +463,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	record += "\ndmtime=" + doc.dmtime;
    }
    record += "\norigcharset=" + doc.origcharset;
-    record += "\ncaption=" + doc.title;
+    char sizebuf[20]; 
-    record += "\nkeywords=" + doc.keywords;
+    sizebuf[0] = 0;
-    record += "\nabstract=" + doc.abstract;
+    if (stp)
 	sprintf(sizebuf, "%ld", (long)stp->st_size);
    if (sizebuf[0])
 	record += string("\nfbytes=") + sizebuf;
    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
    record += string("\ndbytes=") + sizebuf;
    if (!doc.ipath.empty()) {
 	record += "\nipath=" + doc.ipath;
    }
    record += "\ncaption=" + doc.title;
    record += "\nkeywords=" + doc.keywords;
    record += "\nabstract=" + doc.abstract;
    record += "\n";
    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
    newdocument.set_data(record);
@ -812,6 +866,7 @@ static list<string> stemexpand(Native *ndb, string term, const string& lang)
 }
 // Splitter callback for breaking query into terms
 class wsQData : public TextSplitCB {
 public:
    vector<string> terms;
@ -836,7 +891,6 @@ class wsQData : public TextSplitCB {
    }
 };
 // Turn string into list of xapian queries. There is little
 // interpretation done on the string (no +term -term or filename:term
 // stuff). We just separate words and phrases, and interpret
@ -927,7 +981,6 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
    Native *ndb = (Native *)pdata;
    if (!ndb)
 	return false;
    asdata.erase();
    dbindices.clear();
    list<Xapian::Query> pqueries;
@ -950,6 +1003,7 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts,
    LOGDEB((" phrase:   %s\n", sdata.phrase.c_str()));
    LOGDEB((" orwords:  %s\n", sdata.orwords.c_str()));
    LOGDEB((" nowords:  %s\n", sdata.nowords.c_str()));
    string ft;
    for (list<string>::iterator it = sdata.filetypes.begin(); 
    	 it != sdata.filetypes.end(); it++) {ft += *it + " ";}
@ -1053,6 +1107,8 @@ bool Rcl::Db::getQueryTerms(list<string>& terms)
    return true;
 }
 static const int qquantum = 30;
 int Rcl::Db::getResCnt()
 {
    Native *ndb = (Native *)pdata;
@ -1060,8 +1116,19 @@ int Rcl::Db::getResCnt()
 	LOGERR(("Rcl::Db::getResCnt: no query opened\n"));
 	return -1;
    }
-    if (ndb->mset.size() <= 0)
+    if (ndb->mset.size() <= 0) {
-	return -1;
+	try {
 	    ndb->mset = ndb->enquire->get_mset(0, qquantum);
 	} catch (const Xapian::DatabaseModifiedError &error) {
 	    ndb->db.reopen();
 	    ndb->mset = ndb->enquire->get_mset(0, qquantum);
 	} catch (const Xapian::Error & error) {
 	    LOGERR(("enquire->get_mset: exception: %s\n", 
 		    error.get_msg().c_str()));
 	    return -1;
 	}
    }
    return ndb->mset.get_matches_lower_bound();
 }
@ -1085,7 +1152,9 @@ class Rcl::DbPops {
    }
 };
-bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
+bool Native::dbDataToRclDoc(std::string &data, Rcl::Doc &doc, 
 			    int qopts,
 			    Xapian::docid docid, const list<string>& terms)
 {
    LOGDEB1(("Rcl::Db::dbDataToRclDoc: data: %s\n", data.c_str()));
    ConfSimple parms(&data);
@ -1099,7 +1168,20 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
    parms.get(string("caption"), doc.title);
    parms.get(string("keywords"), doc.keywords);
    parms.get(string("abstract"), doc.abstract);
    bool syntabs = false;
    if (doc.abstract.find(rclSyntAbs) == 0) {
 	doc.abstract = doc.abstract.substr(rclSyntAbs.length());
 	syntabs = true;
    }
    // Build an abstract from the position data when the caller asked
    // for it and there are query terms to center it on. The option is
    // tested with a bitwise AND: a logical && would be true for any
    // non-zero qopts. The stored abstract is only overridden if it is
    // empty, was synthesized at indexing time, or replacement was
    // explicitly requested.
    if ((qopts & Rcl::Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
        LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n"));
        if (doc.abstract.empty() || syntabs || 
            (qopts & Rcl::Db::QO_REPLACE_ABSTRACT))
            doc.abstract = makeAbstract(docid, terms);
    }
    parms.get(string("ipath"), doc.ipath);
    parms.get(string("fbytes"), doc.fbytes);
    parms.get(string("dbytes"), doc.dbytes);
    return true;
 }
@ -1114,7 +1196,6 @@ bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc)
 // that dont match the filter).
 bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
 {
    const int qquantum = 30;
    LOGDEB1(("Rcl::Db::getDoc: exti %d\n", exti));
    Native *ndb = (Native *)pdata;
    if (!ndb || !ndb->enquire) {
@ -1199,12 +1280,15 @@ bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent)
 	     ndb->mset.get_matches_lower_bound()));
    Xapian::Document xdoc = ndb->mset[xapi-first].get_document();
    Xapian::docid docid = *(ndb->mset[xapi-first]);
    if (percent)
 	*percent = ndb->mset.convert_to_percent(ndb->mset[xapi-first]);
    // Parse xapian document's data and populate doc fields
    string data = xdoc.get_data();
-    return dbDataToRclDoc(data, doc);
+    list<string> terms;
    getQueryTerms(terms);
    return ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
 }
 // Retrieve document defined by file name and internal path. Very inefficient,
@ -1237,7 +1321,9 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
 	    Xapian::Document xdoc = ndb->db.get_document(*docid);
 	    string data = xdoc.get_data();
-	    if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
+	    list<string> terms;
 	    if (ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms) 
 		&& doc.ipath == ipath)
 		return true;
 	}
    } catch (const Xapian::Error &e) {
@ -1258,3 +1344,123 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
    }
    return false;
 }
 // Width of a sample extract around a query term
 //
 // We build a possibly full size but sparsely populated (only around
 // the search term) reconstruction of the document. It would be
 // possible to compress the array, by having only multiple chunks
 // around the terms, but this would seriously complicate the data
 // structure.
 #define EXTRACT_WIDTH 3
 // Build an abstract for a document out of the index position data.
 //
 // docid: the xapian document to build the abstract for.
 // terms: the query terms; the abstract is made of small windows of
 //        EXTRACT_WIDTH positions on each side of their occurrences.
 //
 // Returns the concatenation, in document position order, of the
 // selected windows. The result is empty if none of the terms has
 // position data in the document.
 //
 // Exceptions raised while reading the position list of a query term
 // are swallowed (the term is just ignored).
 // NOTE(review): the full term list walk further down is not guarded,
 // so an exception raised there propagates to the caller.
 string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 {
    Chrono chron;
    // Number of positions kept on each side of a query term occurrence
    const unsigned int width = EXTRACT_WIDTH;
    // A buffer that we populate with the document terms, at their position
    vector<string> buf;
    // Go through the list of query terms. For each entry in each
    // position list, populate the slot in the document buffer, and
    // remember the position and its neighbours
    vector<unsigned int> qtermposs; // The term positions
    set<unsigned int> chunkposs; // All the positions we shall populate
    for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
         qit++) {
        Xapian::PositionIterator pos;
        // There may be query terms not in this doc. This raises an
        // exception when requesting the position list, we just catch it.
        try {
            unsigned int occurrences = 0;
            for (pos = db.positionlist_begin(docid, *qit); 
                 pos != db.positionlist_end(docid, *qit); pos++) {
                unsigned int ipos = *pos;
                LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
                // Possibly extend the array. Do it in big chunks
                if (ipos + width >= buf.size()) {
                    buf.resize(ipos + width + 1000);
                }
                buf[ipos] = *qit;
                // Remember the term position
                qtermposs.push_back(ipos);
                // Add adjacent slots to the set to populate at next
                // step. The lower bound is clamped by hand: ipos is
                // unsigned, so ipos - width would wrap around to a huge
                // value for positions below width, and the window
                // would end up empty.
                const unsigned int first = ipos > width ? ipos - width : 0;
                for (unsigned int ii = first; 
                     ii <= MIN(ipos + width, buf.size()-1); ii++) {
                    chunkposs.insert(ii);
                }
                // Limit the number of occurrences we keep for each
                // term. The abstract has a finite length anyway !
                if (occurrences++ > 10)
                    break;
            }
        } catch (...) {
        }
    }
    LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", 
            chron.millis(), qtermposs.size()));
    // Walk the full document position list and populate slots around
    // the query terms. We arbitrarily truncate the list to avoid
    // taking forever. If we do cutoff, the abstract may be
    // inconsistent, which is bad...
    { Xapian::TermIterator term;
        int cutoff = 500 * 1000;
        for (term = db.termlist_begin(docid);
             term != db.termlist_end(docid); term++) {
            Xapian::PositionIterator pos;
            for (pos = db.positionlist_begin(docid, *term); 
                 pos != db.positionlist_end(docid, *term); pos++) {
                if (cutoff-- < 0)
                    break;
                unsigned int ipos = *pos;
                // Only positions inside a window are of interest. All
                // of them are within the buffer, which was sized when
                // the windows were recorded.
                if (chunkposs.find(ipos) != chunkposs.end()) {
                    buf[ipos] = *term;
                }
            }
            if (cutoff-- < 0)
                break;
        }
    }
    LOGDEB1(("Abstract:%d: randomizing and extracting\n", chron.millis()));
    // We randomize the selection of term positions, from which we
    // shall pull, starting at the beginning, until the abstract is
    // big enough. The abstract is finally built in correct position
    // order, thanks to the position map.
    random_shuffle(qtermposs.begin(), qtermposs.end());
    map<unsigned int, string> mabs;
    unsigned int abslen = 0;
    LOGDEB1(("Abstract:%d: extracting\n", chron.millis()));
    // Extract data around the first (in random order) term positions,
    // and store the chunks in the map
    for (vector<unsigned int>::const_iterator it = qtermposs.begin();
         it != qtermposs.end(); it++) {
        unsigned int ipos = *it;
        // Same hand-made clamping as above, to avoid unsigned wrap-around
        // for terms found in the very first positions.
        unsigned int start = ipos > width ? ipos - width : 0;
        unsigned int end = MIN(ipos + width, buf.size()-1);
        string chunk;
        for (unsigned int ii = start; ii <= end; ii++) {
            if (!buf[ii].empty()) {
                chunk += buf[ii] + " ";
                // Only the term lengths are counted, not the separators
                abslen += buf[ii].length();
            }
            if (abslen > 300)
                break;
        }
        // Mark the gap to the next chunk, except when the window ends
        // on the very last slot of the buffer
        if (end != buf.size()-1)
            chunk += "... ";
        mabs[ipos] = chunk;
        if (abslen > 300)
            break;
    }
    // Build the abstract by walking the map (in order of position)
    string abstract;
    for (map<unsigned int, string>::const_iterator it = mabs.begin();
         it != mabs.end(); it++) {
        abstract += (*it).second;
    }
    LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
    return abstract;
 }
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -1,6 +1,6 @@
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.22 2006-01-11 15:08:21 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.23 2006-01-26 12:28:50 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
@ -31,7 +31,7 @@ namespace Rcl {
 #endif
 /**
- * Dumb bunch holder for document attributes and data
+ * Dumb holder for document attributes and data
 */
 class Doc {
 public:
@ -45,7 +45,11 @@ class Doc {
    string title;
    string keywords;
    string abstract;
    string fbytes;        // File size
    string dbytes;        // Doc size
    // The following fields don't go to the db. text is only used when
    // indexing
    string text;
    int pc; // used by sortseq, convenience
@ -60,6 +64,8 @@ class Doc {
 	title.erase();
 	keywords.erase();
 	abstract.erase();
 	fbytes.erase();
 	dbytes.erase();
 	text.erase();
    }
@ -79,28 +85,36 @@ class AdvSearchData {
    string description; // Printable expanded version of the complete query
                        // returned after setQuery.
    void erase() {
-	allwords.erase();phrase.erase();orwords.erase();nowords.erase();
+	allwords.erase();
-	filetypes.clear(); topdir.erase();
+	phrase.erase();
-	description.clear();
+	orwords.erase();
 	nowords.erase();
 	filetypes.clear(); 
 	topdir.erase();
 	description.erase();
    }
 };
- class DbPops;
+class DbPops;
 /**
 * Wrapper class for the native database.
 */
 class Db {
-public:
+ public:
    Db();
    ~Db();
    enum OpenMode {DbRO, DbUpd, DbTrunc};
-    bool open(const string &dbdir, OpenMode mode);
+    enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_BUILD_ABSTRACT = 2,
 		    QO_REPLACE_ABSTRACT = 4};
    bool open(const string &dbdir, OpenMode mode, int qops = 0);
    bool close();
    bool isopen();
    // Update-related functions
-    bool add(const string &filename, const Doc &doc);
+    bool add(const string &filename, const Doc &doc, const struct stat *stp);
    bool needUpdate(const string &filename, const struct stat *stp);
    bool purge();
    bool createStemDb(const string &lang);
@ -109,7 +123,6 @@ public:
    // Query-related functions
    // Parse query string and initialize query
    enum QueryOpts {QO_NONE=0, QO_STEM = 1};
    bool setQuery(const string &q, QueryOpts opts = QO_NONE, 
 		  const string& stemlang = "english");
    bool setQuery(AdvSearchData &q, QueryOpts opts = QO_NONE,
@ -143,10 +156,11 @@ private:
                           // db indices that match
    void *pdata; // Pointer to private data. We don't want db(ie
                 // xapian)-specific defs to show in here
    unsigned int m_qOpts;
    /* Copy constructor and assignment are private and forbidden */
    Db(const Db &) {}
    Db & operator=(const Db &) {return *this;};
    bool dbDataToRclDoc(std::string &data, Doc &doc);
 };
 // Unaccent and lowercase data.