Display page numbers inside abstracts when possible (e.g. for PDFs)
This commit is contained in:
parent
1f1f65a4c0
commit
3dfaa7525b
@ -250,6 +250,9 @@ void Db::Native::setDbWideQTermsFreqs(Query *query)
|
|||||||
// Compute query terms quality coefficients for a matched document by
|
// Compute query terms quality coefficients for a matched document by
|
||||||
// retrieving the Within Document Frequencies and multiplying by
|
// retrieving the Within Document Frequencies and multiplying by
|
||||||
// overall term frequency, then using log-based thresholds.
|
// overall term frequency, then using log-based thresholds.
|
||||||
|
// 2012: it's not too clear to me why exactly we do the log thresholds thing.
|
||||||
|
// Preferring terms which are rare in either or both of the db and the document
|
||||||
|
// seems reasonable though
|
||||||
double Db::Native::qualityTerms(Xapian::docid docid,
|
double Db::Native::qualityTerms(Xapian::docid docid,
|
||||||
Query *query,
|
Query *query,
|
||||||
const vector<string>& terms,
|
const vector<string>& terms,
|
||||||
@ -350,6 +353,16 @@ bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int Db::Native::getPageNumberForPosition(const vector<int>& pbreaks,
|
||||||
|
unsigned int pos)
|
||||||
|
{
|
||||||
|
if (pos < baseTextPosition) // Not in text body
|
||||||
|
return -1;
|
||||||
|
vector<int>::const_iterator it =
|
||||||
|
upper_bound(pbreaks.begin(), pbreaks.end(), pos);
|
||||||
|
return it - pbreaks.begin() + 1;
|
||||||
|
}
|
||||||
|
|
||||||
// Return page number for first match of "significant" term.
|
// Return page number for first match of "significant" term.
|
||||||
int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
|
int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
|
||||||
{
|
{
|
||||||
@ -383,15 +396,9 @@ int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
|
|||||||
try {
|
try {
|
||||||
for (pos = xrdb.positionlist_begin(docid, qterm);
|
for (pos = xrdb.positionlist_begin(docid, qterm);
|
||||||
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
||||||
int ipos = *pos;
|
int pagenum = getPageNumberForPosition(pagepos, *pos);
|
||||||
if (ipos < int(baseTextPosition)) // Not in text body
|
if (pagenum > 0)
|
||||||
continue;
|
return pagenum;
|
||||||
// What page ?
|
|
||||||
LOGABS(("getFirstPageMatch: search match for [%s] pos %d\n",
|
|
||||||
qterm.c_str(), ipos));
|
|
||||||
vector<int>::const_iterator it =
|
|
||||||
upper_bound(pagepos.begin(), pagepos.end(), ipos);
|
|
||||||
return it - pagepos.begin() + 1;
|
|
||||||
}
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
// Term does not occur. No problem.
|
// Term does not occur. No problem.
|
||||||
@ -435,8 +442,8 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
// TOBEDONE: terms issued from an original one by stem expansion
|
// TOBEDONE: terms issued from an original one by stem expansion
|
||||||
// should be somehow aggregated here, else, it may happen that
|
// should be somehow aggregated here, else, it may happen that
|
||||||
// such a group prevents displaying matches for other terms (by
|
// such a group prevents displaying matches for other terms (by
|
||||||
// remaining its meaning to the maximum occurrences per term test
|
// removing its meaning from the maximum occurrences per term test
|
||||||
// using while walking the list below)
|
// used while walking the list below)
|
||||||
multimap<double, string> byQ;
|
multimap<double, string> byQ;
|
||||||
double totalweight = qualityTerms(docid, query, terms, byQ);
|
double totalweight = qualityTerms(docid, query, terms, byQ);
|
||||||
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
|
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
|
||||||
@ -614,8 +621,11 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
LOGABS(("makeAbstract:%d: extracting\n", chron.millis()));
|
vector<int> vpbreaks;
|
||||||
|
getPagePositions(docid, vpbreaks);
|
||||||
|
|
||||||
|
LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),
|
||||||
|
vpbreaks.size()));
|
||||||
// Finally build the abstract by walking the map (in order of position)
|
// Finally build the abstract by walking the map (in order of position)
|
||||||
vector<string> vabs;
|
vector<string> vabs;
|
||||||
string chunk;
|
string chunk;
|
||||||
@ -625,6 +635,12 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||||
if (!occupiedmarker.compare(it->second))
|
if (!occupiedmarker.compare(it->second))
|
||||||
continue;
|
continue;
|
||||||
|
if (chunk.empty() && !vpbreaks.empty()) {
|
||||||
|
int pnum = getPageNumberForPosition(vpbreaks, it->first);
|
||||||
|
ostringstream ss;
|
||||||
|
ss << pnum;
|
||||||
|
chunk += string(" [p ") + ss.str() + "] ";
|
||||||
|
}
|
||||||
Utf8Iter uit(it->second);
|
Utf8Iter uit(it->second);
|
||||||
bool newcjk = false;
|
bool newcjk = false;
|
||||||
if (TextSplit::isCJK(*uit))
|
if (TextSplit::isCJK(*uit))
|
||||||
|
|||||||
@ -92,6 +92,7 @@ class Db::Native {
|
|||||||
vector<string> makeAbstract(Xapian::docid id, Query *query);
|
vector<string> makeAbstract(Xapian::docid id, Query *query);
|
||||||
bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
|
bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
|
||||||
int getFirstMatchPage(Xapian::docid docid, Query *query);
|
int getFirstMatchPage(Xapian::docid docid, Query *query);
|
||||||
|
int getPageNumberForPosition(const vector<int>& pbreaks, unsigned int pos);
|
||||||
|
|
||||||
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
|
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user