diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 6ac01ecb..35c68d5b 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -250,6 +250,9 @@ void Db::Native::setDbWideQTermsFreqs(Query *query) // Compute query terms quality coefficients for a matched document by // retrieving the Within Document Frequencies and multiplying by // overal term frequency, then using log-based thresholds. +// 2012: it's not too clear to me why exactly we do the log thresholds thing. +// Preferring terms which are rare either or both in the db and the document +// seems reasonable though double Db::Native::qualityTerms(Xapian::docid docid, Query *query, const vector& terms, @@ -350,6 +353,16 @@ bool Db::Native::getPagePositions(Xapian::docid docid, vector& vpos) return true; } +int Db::Native::getPageNumberForPosition(const vector& pbreaks, + unsigned int pos) +{ + if (pos < baseTextPosition) // Not in text body + return -1; + vector::const_iterator it = + upper_bound(pbreaks.begin(), pbreaks.end(), pos); + return it - pbreaks.begin() + 1; +} + // Return page number for first match of "significant" term. int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query) { @@ -383,15 +396,9 @@ int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query) try { for (pos = xrdb.positionlist_begin(docid, qterm); pos != xrdb.positionlist_end(docid, qterm); pos++) { - int ipos = *pos; - if (ipos < int(baseTextPosition)) // Not in text body - continue; - // What page ? - LOGABS(("getFirstPageMatch: search match for [%s] pos %d\n", - qterm.c_str(), ipos)); - vector::const_iterator it = - upper_bound(pagepos.begin(), pagepos.end(), ipos); - return it - pagepos.begin() + 1; + int pagenum = getPageNumberForPosition(pagepos, *pos); + if (pagenum > 0) + return pagenum; } } catch (...) { // Term does not occur. No problem. 
@@ -435,8 +442,8 @@ vector Db::Native::makeAbstract(Xapian::docid docid, Query *query) // TOBEDONE: terms issued from an original one by stem expansion // should be somehow aggregated here, else, it may happen that // such a group prevents displaying matches for other terms (by - // remaining its meaning to the maximum occurrences per term test - // using while walking the list below) + // removing its meaning from the maximum occurrences per term test + // used while walking the list below) multimap byQ; double totalweight = qualityTerms(docid, query, terms, byQ); LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms())); @@ -614,8 +621,11 @@ vector Db::Native::makeAbstract(Xapian::docid docid, Query *query) } #endif - LOGABS(("makeAbstract:%d: extracting\n", chron.millis())); + vector vpbreaks; + getPagePositions(docid, vpbreaks); + LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(), + vpbreaks.size())); // Finally build the abstract by walking the map (in order of position) vector vabs; string chunk; @@ -625,6 +635,12 @@ vector Db::Native::makeAbstract(Xapian::docid docid, Query *query) LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str())); if (!occupiedmarker.compare(it->second)) continue; + if (chunk.empty() && !vpbreaks.empty()) { + int pnum = getPageNumberForPosition(vpbreaks, it->first); + ostringstream ss; + ss << pnum; + chunk += string(" [p ") + ss.str() + "] "; + } Utf8Iter uit(it->second); bool newcjk = false; if (TextSplit::isCJK(*uit)) diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index 6fb64261..133a510c 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -92,6 +92,7 @@ class Db::Native { vector makeAbstract(Xapian::docid id, Query *query); bool getPagePositions(Xapian::docid docid, vector& vpos); int getFirstMatchPage(Xapian::docid docid, Query *query); + int getPageNumberForPosition(const vector& pbreaks, unsigned int pos); bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc 
&doc);