rclabstract: fixed log call formats and indentation
This commit is contained in:
parent
653b1fb5a5
commit
207e3d5af7
@ -50,7 +50,7 @@ static void listList(const string& what, const vector<string>&l)
|
|||||||
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
||||||
a = a + *it + " ";
|
a = a + *it + " ";
|
||||||
}
|
}
|
||||||
LOGDEB("" << (what) << ": " << (a) << "\n" );
|
LOGDEB("" << what << ": " << a << "\n");
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
#define LOGABS LOGDEB2
|
#define LOGABS LOGDEB2
|
||||||
@ -67,12 +67,12 @@ static const bool prune_prefixed_terms = true;
|
|||||||
static void noPrefixList(const vector<string>& in, vector<string>& out)
|
static void noPrefixList(const vector<string>& in, vector<string>& out)
|
||||||
{
|
{
|
||||||
for (vector<string>::const_iterator qit = in.begin();
|
for (vector<string>::const_iterator qit = in.begin();
|
||||||
qit != in.end(); qit++) {
|
qit != in.end(); qit++) {
|
||||||
if (prune_prefixed_terms) {
|
if (prune_prefixed_terms) {
|
||||||
if (has_prefix(*qit))
|
if (has_prefix(*qit))
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
out.push_back(strip_prefix(*qit));
|
out.push_back(strip_prefix(*qit));
|
||||||
}
|
}
|
||||||
sort(out.begin(), out.end());
|
sort(out.begin(), out.end());
|
||||||
vector<string>::iterator it = unique(out.begin(), out.end());
|
vector<string>::iterator it = unique(out.begin(), out.end());
|
||||||
@ -82,8 +82,8 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
|
|||||||
bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
|
bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
|
||||||
{
|
{
|
||||||
if (!xenquire) {
|
if (!xenquire) {
|
||||||
LOGERR("Query::getMatchTerms: no query opened\n" );
|
LOGERR("Query::getMatchTerms: no query opened\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
terms.clear();
|
terms.clear();
|
||||||
@ -95,8 +95,8 @@ bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
|
|||||||
xenquire->get_matching_terms_end(id)),
|
xenquire->get_matching_terms_end(id)),
|
||||||
m_q->m_db->m_ndb->xrdb, m_q->m_reason);
|
m_q->m_db->m_ndb->xrdb, m_q->m_reason);
|
||||||
if (!m_q->m_reason.empty()) {
|
if (!m_q->m_reason.empty()) {
|
||||||
LOGERR("getMatchTerms: xapian error: " << (m_q->m_reason) << "\n" );
|
LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
noPrefixList(iterms, terms);
|
noPrefixList(iterms, terms);
|
||||||
return true;
|
return true;
|
||||||
@ -109,25 +109,26 @@ void Query::Native::setDbWideQTermsFreqs()
|
|||||||
{
|
{
|
||||||
// Do it once only for a given query.
|
// Do it once only for a given query.
|
||||||
if (!termfreqs.empty())
|
if (!termfreqs.empty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
vector<string> qterms;
|
vector<string> qterms;
|
||||||
{
|
{
|
||||||
vector<string> iqterms;
|
vector<string> iqterms;
|
||||||
m_q->getQueryTerms(iqterms);
|
m_q->getQueryTerms(iqterms);
|
||||||
noPrefixList(iqterms, qterms);
|
noPrefixList(iqterms, qterms);
|
||||||
}
|
}
|
||||||
// listList("Query terms: ", qterms);
|
// listList("Query terms: ", qterms);
|
||||||
Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
|
Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
|
||||||
|
|
||||||
double doccnt = xrdb.get_doccount();
|
double doccnt = xrdb.get_doccount();
|
||||||
if (doccnt == 0)
|
if (doccnt == 0)
|
||||||
doccnt = 1;
|
doccnt = 1;
|
||||||
|
|
||||||
for (vector<string>::const_iterator qit = qterms.begin();
|
for (vector<string>::const_iterator qit = qterms.begin();
|
||||||
qit != qterms.end(); qit++) {
|
qit != qterms.end(); qit++) {
|
||||||
termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
||||||
LOGABS("setDbWideQTermFreqs: [" << (qit) << "] db freq " << (termfreqs[*qit]) << "\n" );
|
LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
|
||||||
|
termfreqs[*qit] << "\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,10 +144,10 @@ void Query::Native::setDbWideQTermsFreqs()
|
|||||||
// occurrences, and let the frequency for each group member be the
|
// occurrences, and let the frequency for each group member be the
|
||||||
// aggregated frequency.
|
// aggregated frequency.
|
||||||
double Query::Native::qualityTerms(Xapian::docid docid,
|
double Query::Native::qualityTerms(Xapian::docid docid,
|
||||||
const vector<string>& terms,
|
const vector<string>& terms,
|
||||||
multimap<double, vector<string> >& byQ)
|
multimap<double, vector<string> >& byQ)
|
||||||
{
|
{
|
||||||
LOGABS("qualityTerms\n" );
|
LOGABS("qualityTerms\n");
|
||||||
setDbWideQTermsFreqs();
|
setDbWideQTermsFreqs();
|
||||||
|
|
||||||
map<string, double> termQcoefs;
|
map<string, double> termQcoefs;
|
||||||
@ -155,46 +156,46 @@ double Query::Native::qualityTerms(Xapian::docid docid,
|
|||||||
Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
|
Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
|
||||||
double doclen = xrdb.get_doclength(docid);
|
double doclen = xrdb.get_doclength(docid);
|
||||||
if (doclen == 0)
|
if (doclen == 0)
|
||||||
doclen = 1;
|
doclen = 1;
|
||||||
HighlightData hld;
|
HighlightData hld;
|
||||||
if (m_q->m_sd) {
|
if (m_q->m_sd) {
|
||||||
m_q->m_sd->getTerms(hld);
|
m_q->m_sd->getTerms(hld);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUGABSTRACT
|
#ifdef DEBUGABSTRACT
|
||||||
{
|
{
|
||||||
string deb;
|
string deb;
|
||||||
hld.toString(deb);
|
hld.toString(deb);
|
||||||
LOGABS("qualityTerms: hld: " << (deb) << "\n" );
|
LOGABS("qualityTerms: hld: " << deb << "\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Group the input terms by the user term they were possibly expanded from
|
// Group the input terms by the user term they were possibly expanded from
|
||||||
map<string, vector<string> > byRoot;
|
map<string, vector<string> > byRoot;
|
||||||
for (vector<string>::const_iterator qit = terms.begin();
|
for (vector<string>::const_iterator qit = terms.begin();
|
||||||
qit != terms.end(); qit++) {
|
qit != terms.end(); qit++) {
|
||||||
map<string, string>::const_iterator eit = hld.terms.find(*qit);
|
map<string, string>::const_iterator eit = hld.terms.find(*qit);
|
||||||
if (eit != hld.terms.end()) {
|
if (eit != hld.terms.end()) {
|
||||||
byRoot[eit->second].push_back(*qit);
|
byRoot[eit->second].push_back(*qit);
|
||||||
} else {
|
} else {
|
||||||
LOGDEB0("qualityTerms: [" << ((*qit)) << "] not found in hld\n" );
|
LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
|
||||||
byRoot[*qit].push_back(*qit);
|
byRoot[*qit].push_back(*qit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUGABSTRACT
|
#ifdef DEBUGABSTRACT
|
||||||
{
|
{
|
||||||
string byRootstr;
|
string byRootstr;
|
||||||
for (map<string, vector<string> >::const_iterator debit =
|
for (map<string, vector<string> >::const_iterator debit =
|
||||||
byRoot.begin(); debit != byRoot.end(); debit++) {
|
byRoot.begin(); debit != byRoot.end(); debit++) {
|
||||||
byRootstr.append("[").append(debit->first).append("]->");
|
byRootstr.append("[").append(debit->first).append("]->");
|
||||||
for (vector<string>::const_iterator it = debit->second.begin();
|
for (vector<string>::const_iterator it = debit->second.begin();
|
||||||
it != debit->second.end(); it++) {
|
it != debit->second.end(); it++) {
|
||||||
byRootstr.append("[").append(*it).append("] ");
|
byRootstr.append("[").append(*it).append("] ");
|
||||||
}
|
}
|
||||||
byRootstr.append("\n");
|
byRootstr.append("\n");
|
||||||
}
|
}
|
||||||
LOGABS("\nqualityTerms: uterms to terms: " << (byRootstr) << "\n" );
|
LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -202,51 +203,51 @@ double Query::Native::qualityTerms(Xapian::docid docid,
|
|||||||
map<string, double> grpwdfs;
|
map<string, double> grpwdfs;
|
||||||
map<string, double> grptfreqs;
|
map<string, double> grptfreqs;
|
||||||
for (map<string, vector<string> >::const_iterator git = byRoot.begin();
|
for (map<string, vector<string> >::const_iterator git = byRoot.begin();
|
||||||
git != byRoot.end(); git++) {
|
git != byRoot.end(); git++) {
|
||||||
for (vector<string>::const_iterator qit = git->second.begin();
|
for (vector<string>::const_iterator qit = git->second.begin();
|
||||||
qit != git->second.end(); qit++) {
|
qit != git->second.end(); qit++) {
|
||||||
Xapian::TermIterator term = xrdb.termlist_begin(docid);
|
Xapian::TermIterator term = xrdb.termlist_begin(docid);
|
||||||
term.skip_to(*qit);
|
term.skip_to(*qit);
|
||||||
if (term != xrdb.termlist_end(docid) && *term == *qit) {
|
if (term != xrdb.termlist_end(docid) && *term == *qit) {
|
||||||
if (grpwdfs.find(git->first) != grpwdfs.end()) {
|
if (grpwdfs.find(git->first) != grpwdfs.end()) {
|
||||||
grpwdfs[git->first] = term.get_wdf() / doclen;
|
grpwdfs[git->first] = term.get_wdf() / doclen;
|
||||||
grptfreqs[git->first] = termfreqs[*qit];
|
grptfreqs[git->first] = termfreqs[*qit];
|
||||||
} else {
|
} else {
|
||||||
grpwdfs[git->first] += term.get_wdf() / doclen;
|
grpwdfs[git->first] += term.get_wdf() / doclen;
|
||||||
grptfreqs[git->first] += termfreqs[*qit];
|
grptfreqs[git->first] += termfreqs[*qit];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build a sorted by quality container for the groups
|
// Build a sorted by quality container for the groups
|
||||||
for (map<string, vector<string> >::const_iterator git = byRoot.begin();
|
for (map<string, vector<string> >::const_iterator git = byRoot.begin();
|
||||||
git != byRoot.end(); git++) {
|
git != byRoot.end(); git++) {
|
||||||
double q = (grpwdfs[git->first]) * grptfreqs[git->first];
|
double q = (grpwdfs[git->first]) * grptfreqs[git->first];
|
||||||
q = -log10(q);
|
q = -log10(q);
|
||||||
if (q < 3) {
|
if (q < 3) {
|
||||||
q = 0.05;
|
q = 0.05;
|
||||||
} else if (q < 4) {
|
} else if (q < 4) {
|
||||||
q = 0.3;
|
q = 0.3;
|
||||||
} else if (q < 5) {
|
} else if (q < 5) {
|
||||||
q = 0.7;
|
q = 0.7;
|
||||||
} else if (q < 6) {
|
} else if (q < 6) {
|
||||||
q = 0.8;
|
q = 0.8;
|
||||||
} else {
|
} else {
|
||||||
q = 1;
|
q = 1;
|
||||||
}
|
}
|
||||||
totalweight += q;
|
totalweight += q;
|
||||||
byQ.insert(pair<double, vector<string> >(q, git->second));
|
byQ.insert(pair<double, vector<string> >(q, git->second));
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUGABSTRACT
|
#ifdef DEBUGABSTRACT
|
||||||
for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
|
for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
|
||||||
mit != byQ.rend(); mit++) {
|
mit != byQ.rend(); mit++) {
|
||||||
LOGABS("qualityTerms: group\n" );
|
LOGABS("qualityTerms: group\n");
|
||||||
for (vector<string>::const_iterator qit = mit->second.begin();
|
for (vector<string>::const_iterator qit = mit->second.begin();
|
||||||
qit != mit->second.end(); qit++) {
|
qit != mit->second.end(); qit++) {
|
||||||
LOGABS("" << (mit->first) << "->[" << (qit) << "]\n" );
|
LOGABS("" << mit->first << "->[" << *qit << "]\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
return totalweight;
|
return totalweight;
|
||||||
@ -257,8 +258,8 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
|
|||||||
{
|
{
|
||||||
LOGDEB("Query::Native::getFirstMatchPage\n");
|
LOGDEB("Query::Native::getFirstMatchPage\n");
|
||||||
if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
|
if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
|
||||||
LOGERR("Query::getFirstMatchPage: no db\n" );
|
LOGERR("Query::getFirstMatchPage: no db\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
|
Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
|
||||||
Xapian::Database& xrdb(ndb->xrdb);
|
Xapian::Database& xrdb(ndb->xrdb);
|
||||||
@ -267,14 +268,14 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
|
|||||||
getMatchTerms(docid, terms);
|
getMatchTerms(docid, terms);
|
||||||
|
|
||||||
if (terms.empty()) {
|
if (terms.empty()) {
|
||||||
LOGDEB("getFirstMatchPage: empty match term list (field match?)\n" );
|
LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<int> pagepos;
|
vector<int> pagepos;
|
||||||
ndb->getPagePositions(docid, pagepos);
|
ndb->getPagePositions(docid, pagepos);
|
||||||
if (pagepos.empty())
|
if (pagepos.empty())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
setDbWideQTermsFreqs();
|
setDbWideQTermsFreqs();
|
||||||
|
|
||||||
@ -283,25 +284,25 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
|
|||||||
qualityTerms(docid, terms, byQ);
|
qualityTerms(docid, terms, byQ);
|
||||||
|
|
||||||
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
|
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
|
||||||
mit != byQ.rend(); mit++) {
|
mit != byQ.rend(); mit++) {
|
||||||
for (vector<string>::const_iterator qit = mit->second.begin();
|
for (vector<string>::const_iterator qit = mit->second.begin();
|
||||||
qit != mit->second.end(); qit++) {
|
qit != mit->second.end(); qit++) {
|
||||||
string qterm = *qit;
|
string qterm = *qit;
|
||||||
Xapian::PositionIterator pos;
|
Xapian::PositionIterator pos;
|
||||||
string emptys;
|
string emptys;
|
||||||
try {
|
try {
|
||||||
for (pos = xrdb.positionlist_begin(docid, qterm);
|
for (pos = xrdb.positionlist_begin(docid, qterm);
|
||||||
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
||||||
int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
|
int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
|
||||||
if (pagenum > 0) {
|
if (pagenum > 0) {
|
||||||
term = qterm;
|
term = qterm;
|
||||||
return pagenum;
|
return pagenum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
// Term does not occur. No problem.
|
// Term does not occur. No problem.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@ -312,18 +313,19 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
|
|||||||
// DatabaseModified and other general exceptions are caught and
|
// DatabaseModified and other general exceptions are caught and
|
||||||
// possibly retried by our caller
|
// possibly retried by our caller
|
||||||
int Query::Native::makeAbstract(Xapian::docid docid,
|
int Query::Native::makeAbstract(Xapian::docid docid,
|
||||||
vector<Snippet>& vabs,
|
vector<Snippet>& vabs,
|
||||||
int imaxoccs, int ictxwords)
|
int imaxoccs, int ictxwords)
|
||||||
{
|
{
|
||||||
Chrono chron;
|
Chrono chron;
|
||||||
LOGABS("makeAbstract: docid " << (long(docid)) << " imaxoccs " << (imaxoccs) << " ictxwords " << (ictxwords) << "\n" );
|
LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
|
||||||
|
imaxoccs << " ictxwords " << ictxwords << "\n");
|
||||||
|
|
||||||
// The (unprefixed) terms matched by this document
|
// The (unprefixed) terms matched by this document
|
||||||
vector<string> matchedTerms;
|
vector<string> matchedTerms;
|
||||||
getMatchTerms(docid, matchedTerms);
|
getMatchTerms(docid, matchedTerms);
|
||||||
if (matchedTerms.empty()) {
|
if (matchedTerms.empty()) {
|
||||||
LOGDEB("makeAbstract::Empty term list\n" );
|
LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
|
||||||
return ABSRES_ERROR;
|
return ABSRES_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
listList("Match terms: ", matchedTerms);
|
listList("Match terms: ", matchedTerms);
|
||||||
@ -339,11 +341,11 @@ int Query::Native::makeAbstract(Xapian::docid docid,
|
|||||||
// aggregated by the qualityTerms() routine.
|
// aggregated by the qualityTerms() routine.
|
||||||
multimap<double, vector<string> > byQ;
|
multimap<double, vector<string> > byQ;
|
||||||
double totalweight = qualityTerms(docid, matchedTerms, byQ);
|
double totalweight = qualityTerms(docid, matchedTerms, byQ);
|
||||||
LOGABS("makeAbstract:" << (chron.ms()) << ": computed Qcoefs.\n" );
|
LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
|
||||||
// This can't happen, but would crash us
|
// This can't happen, but would crash us
|
||||||
if (totalweight == 0.0) {
|
if (totalweight == 0.0) {
|
||||||
LOGERR("makeAbstract: totalweight == 0.0 !\n" );
|
LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
|
||||||
return ABSRES_ERROR;
|
return ABSRES_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
|
Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
|
||||||
@ -374,124 +376,129 @@ int Query::Native::makeAbstract(Xapian::docid docid,
|
|||||||
// with words. We used to limit the character size at the end, but
|
// with words. We used to limit the character size at the end, but
|
||||||
// this damaged our careful selection of terms
|
// this damaged our careful selection of terms
|
||||||
const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
|
const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
|
||||||
m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
|
m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
|
||||||
int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
|
int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
|
||||||
LOGABS("makeAbstract:" << (chron.ms()) << ": mxttloccs " << (maxtotaloccs) << " ctxwords " << (ctxwords) << "\n" );
|
LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
|
||||||
|
maxtotaloccs << " ctxwords " << ctxwords << "\n");
|
||||||
|
|
||||||
int ret = ABSRES_OK;
|
int ret = ABSRES_OK;
|
||||||
|
|
||||||
// Let's go populate
|
// Let's go populate
|
||||||
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
|
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
|
||||||
mit != byQ.rend(); mit++) {
|
mit != byQ.rend(); mit++) {
|
||||||
unsigned int maxgrpoccs;
|
unsigned int maxgrpoccs;
|
||||||
double q;
|
double q;
|
||||||
if (byQ.size() == 1) {
|
if (byQ.size() == 1) {
|
||||||
maxgrpoccs = maxtotaloccs;
|
maxgrpoccs = maxtotaloccs;
|
||||||
q = 1.0;
|
q = 1.0;
|
||||||
} else {
|
} else {
|
||||||
// We give more slots to the better term groups
|
// We give more slots to the better term groups
|
||||||
q = mit->first / totalweight;
|
q = mit->first / totalweight;
|
||||||
maxgrpoccs = int(ceil(maxtotaloccs * q));
|
maxgrpoccs = int(ceil(maxtotaloccs * q));
|
||||||
}
|
}
|
||||||
unsigned int grpoccs = 0;
|
unsigned int grpoccs = 0;
|
||||||
|
|
||||||
for (vector<string>::const_iterator qit = mit->second.begin();
|
for (vector<string>::const_iterator qit = mit->second.begin();
|
||||||
qit != mit->second.end(); qit++) {
|
qit != mit->second.end(); qit++) {
|
||||||
|
|
||||||
// Group done ?
|
// Group done ?
|
||||||
if (grpoccs >= maxgrpoccs)
|
if (grpoccs >= maxgrpoccs)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
string qterm = *qit;
|
string qterm = *qit;
|
||||||
|
|
||||||
LOGABS("makeAbstract: [" << (qterm) << "] " << (maxgrpoccs) << " max grp occs (coef " << (q) << ")\n" );
|
LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
|
||||||
|
" max grp occs (coef " << q << ")\n");
|
||||||
|
|
||||||
// The match term may span several words
|
// The match term may span several words
|
||||||
int qtrmwrdcnt =
|
int qtrmwrdcnt =
|
||||||
TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
|
TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
|
||||||
|
|
||||||
Xapian::PositionIterator pos;
|
Xapian::PositionIterator pos;
|
||||||
// There may be query terms not in this doc. This raises an
|
// There may be query terms not in this doc. This raises an
|
||||||
// exception when requesting the position list, we catch it ??
|
// exception when requesting the position list, we catch it ??
|
||||||
// Not clear how this can happen because we are walking the
|
// Not clear how this can happen because we are walking the
|
||||||
// match list returned by Xapian. Maybe something with the
|
// match list returned by Xapian. Maybe something with the
|
||||||
// fields?
|
// fields?
|
||||||
string emptys;
|
string emptys;
|
||||||
try {
|
try {
|
||||||
for (pos = xrdb.positionlist_begin(docid, qterm);
|
for (pos = xrdb.positionlist_begin(docid, qterm);
|
||||||
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
||||||
int ipos = *pos;
|
int ipos = *pos;
|
||||||
if (ipos < int(baseTextPosition)) // Not in text body
|
if (ipos < int(baseTextPosition)) // Not in text body
|
||||||
continue;
|
continue;
|
||||||
LOGABS("makeAbstract: [" << (qterm) << "] at pos " << (ipos) << " grpoccs " << (grpoccs) << " maxgrpoccs " << (maxgrpoccs) << "\n" );
|
LOGABS("makeAbstract: [" << qterm << "] at pos " <<
|
||||||
|
ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
|
||||||
|
maxgrpoccs << "\n");
|
||||||
|
|
||||||
totaloccs++;
|
totaloccs++;
|
||||||
grpoccs++;
|
grpoccs++;
|
||||||
|
|
||||||
// Add adjacent slots to the set to populate at next
|
// Add adjacent slots to the set to populate at next
|
||||||
// step by inserting empty strings. Special provisions
|
// step by inserting empty strings. Special provisions
|
||||||
// for adding ellipsis and for positions overlapped by
|
// for adding ellipsis and for positions overlapped by
|
||||||
// the match term.
|
// the match term.
|
||||||
unsigned int sta = MAX(int(baseTextPosition),
|
unsigned int sta = MAX(int(baseTextPosition),
|
||||||
ipos - ctxwords);
|
ipos - ctxwords);
|
||||||
unsigned int sto = ipos + qtrmwrdcnt-1 +
|
unsigned int sto = ipos + qtrmwrdcnt-1 +
|
||||||
m_q->m_db->getAbsCtxLen();
|
m_q->m_db->getAbsCtxLen();
|
||||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||||
if (ii == (unsigned int)ipos) {
|
if (ii == (unsigned int)ipos) {
|
||||||
sparseDoc[ii] = qterm;
|
sparseDoc[ii] = qterm;
|
||||||
searchTermPositions.insert(ii);
|
searchTermPositions.insert(ii);
|
||||||
if (ii > maxpos)
|
if (ii > maxpos)
|
||||||
maxpos = ii;
|
maxpos = ii;
|
||||||
} else if (ii > (unsigned int)ipos &&
|
} else if (ii > (unsigned int)ipos &&
|
||||||
ii < (unsigned int)ipos + qtrmwrdcnt) {
|
ii < (unsigned int)ipos + qtrmwrdcnt) {
|
||||||
sparseDoc[ii] = occupiedmarker;
|
sparseDoc[ii] = occupiedmarker;
|
||||||
} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
|
} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
|
||||||
// For an empty slot, the test has a side
|
// For an empty slot, the test has a side
|
||||||
// effect of inserting an empty string which
|
// effect of inserting an empty string which
|
||||||
// is what we want.
|
// is what we want.
|
||||||
sparseDoc[ii] = emptys;
|
sparseDoc[ii] = emptys;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Add ellipsis at the end. This may be replaced later by
|
// Add ellipsis at the end. This may be replaced later by
|
||||||
// an overlapping extract. Take care not to replace an
|
// an overlapping extract. Take care not to replace an
|
||||||
// empty string here, we really want an empty slot,
|
// empty string here, we really want an empty slot,
|
||||||
// use find()
|
// use find()
|
||||||
if (sparseDoc.find(sto+1) == sparseDoc.end()) {
|
if (sparseDoc.find(sto+1) == sparseDoc.end()) {
|
||||||
sparseDoc[sto+1] = cstr_ellipsis;
|
sparseDoc[sto+1] = cstr_ellipsis;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Group done ?
|
// Group done ?
|
||||||
if (grpoccs >= maxgrpoccs) {
|
if (grpoccs >= maxgrpoccs) {
|
||||||
ret |= ABSRES_TRUNC;
|
ret |= ABSRES_TRUNC;
|
||||||
LOGABS("Db::makeAbstract: max group occs cutoff\n" );
|
LOGABS("Db::makeAbstract: max group occs cutoff\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Global done ?
|
// Global done ?
|
||||||
if (totaloccs >= maxtotaloccs) {
|
if (totaloccs >= maxtotaloccs) {
|
||||||
ret |= ABSRES_TRUNC;
|
ret |= ABSRES_TRUNC;
|
||||||
LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
|
LOGABS("Db::makeAbstract: max occurrences cutoff\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
// Term does not occur. No problem.
|
// Term does not occur. No problem.
|
||||||
}
|
}
|
||||||
|
|
||||||
if (totaloccs >= maxtotaloccs) {
|
if (totaloccs >= maxtotaloccs) {
|
||||||
ret |= ABSRES_TRUNC;
|
ret |= ABSRES_TRUNC;
|
||||||
LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
|
LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
maxpos += ctxwords + 1;
|
maxpos += ctxwords + 1;
|
||||||
|
|
||||||
LOGABS("makeAbstract:" << (chron.millis()) << ":chosen number of positions " << (totaloccs) << "\n" );
|
LOGABS("makeAbstract:" << chron.millis() <<
|
||||||
|
"mS:chosen number of positions " << totaloccs << "\n");
|
||||||
// This can happen if there are term occurences in the keywords
|
// This can happen if there are term occurences in the keywords
|
||||||
// etc. but not elsewhere ?
|
// etc. but not elsewhere ?
|
||||||
if (totaloccs == 0) {
|
if (totaloccs == 0) {
|
||||||
LOGDEB("makeAbstract: no occurrences\n" );
|
LOGDEB("makeAbstract: no occurrences\n");
|
||||||
return ABSRES_OK;
|
return ABSRES_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Walk all document's terms position lists and populate slots
|
// Walk all document's terms position lists and populate slots
|
||||||
@ -500,69 +507,74 @@ int Query::Native::makeAbstract(Xapian::docid docid,
|
|||||||
// inconsistant (missing words, potentially altering meaning),
|
// inconsistant (missing words, potentially altering meaning),
|
||||||
// which is bad.
|
// which is bad.
|
||||||
{
|
{
|
||||||
Xapian::TermIterator term;
|
Xapian::TermIterator term;
|
||||||
int cutoff = m_q->m_snipMaxPosWalk;
|
int cutoff = m_q->m_snipMaxPosWalk;
|
||||||
for (term = xrdb.termlist_begin(docid);
|
for (term = xrdb.termlist_begin(docid);
|
||||||
term != xrdb.termlist_end(docid); term++) {
|
term != xrdb.termlist_end(docid); term++) {
|
||||||
// Ignore prefixed terms
|
// Ignore prefixed terms
|
||||||
if (has_prefix(*term))
|
if (has_prefix(*term))
|
||||||
continue;
|
continue;
|
||||||
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
|
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
|
||||||
ret |= ABSRES_TERMMISS;
|
ret |= ABSRES_TERMMISS;
|
||||||
LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" );
|
LOGDEB0("makeAbstract: max term count cutoff " <<
|
||||||
break;
|
m_q->m_snipMaxPosWalk << "\n");
|
||||||
}
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
map<unsigned int, string>::iterator vit;
|
map<unsigned int, string>::iterator vit;
|
||||||
Xapian::PositionIterator pos;
|
Xapian::PositionIterator pos;
|
||||||
for (pos = xrdb.positionlist_begin(docid, *term);
|
for (pos = xrdb.positionlist_begin(docid, *term);
|
||||||
pos != xrdb.positionlist_end(docid, *term); pos++) {
|
pos != xrdb.positionlist_end(docid, *term); pos++) {
|
||||||
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
|
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
|
||||||
ret |= ABSRES_TERMMISS;
|
ret |= ABSRES_TERMMISS;
|
||||||
LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" );
|
LOGDEB0("makeAbstract: max term count cutoff " <<
|
||||||
break;
|
m_q->m_snipMaxPosWalk << "\n");
|
||||||
}
|
break;
|
||||||
// If we are beyond the max possible position, stop
|
}
|
||||||
// for this term
|
// If we are beyond the max possible position, stop
|
||||||
if (*pos > maxpos) {
|
// for this term
|
||||||
break;
|
if (*pos > maxpos) {
|
||||||
}
|
break;
|
||||||
if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
|
}
|
||||||
// Don't replace a term: the terms list is in
|
if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
|
||||||
// alphabetic order, and we may have several terms
|
// Don't replace a term: the terms list is in
|
||||||
// at the same position, we want to keep only the
|
// alphabetic order, and we may have several terms
|
||||||
// first one (ie: dockes and dockes@wanadoo.fr)
|
// at the same position, we want to keep only the
|
||||||
if (vit->second.empty()) {
|
// first one (ie: dockes and dockes@wanadoo.fr)
|
||||||
LOGDEB2("makeAbstract: populating: [" << ((*term)) << "] at " << (*pos) << "\n" );
|
if (vit->second.empty()) {
|
||||||
sparseDoc[*pos] = *term;
|
LOGDEB2("makeAbstract: populating: [" << *term <<
|
||||||
}
|
"] at " << *pos << "\n");
|
||||||
}
|
sparseDoc[*pos] = *term;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
// Debug only: output the full term[position] vector
|
// Debug only: output the full term[position] vector
|
||||||
bool epty = false;
|
bool epty = false;
|
||||||
int ipos = 0;
|
int ipos = 0;
|
||||||
for (map<unsigned int, string>::iterator it = sparseDoc.begin();
|
for (map<unsigned int, string>::iterator it = sparseDoc.begin();
|
||||||
it != sparseDoc.end();
|
it != sparseDoc.end();
|
||||||
it++, ipos++) {
|
it++, ipos++) {
|
||||||
if (it->empty()) {
|
if (it->empty()) {
|
||||||
if (!epty)
|
if (!epty)
|
||||||
LOGDEB("makeAbstract:vec[" << (ipos) << "]: [" << (it) << "]\n" );
|
LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
|
||||||
epty=true;
|
epty=true;
|
||||||
} else {
|
} else {
|
||||||
epty = false;
|
epty = false;
|
||||||
LOGDEB("makeAbstract:vec[" << (ipos) << "]: [" << (it) << "]\n" );
|
LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
vector<int> vpbreaks;
|
vector<int> vpbreaks;
|
||||||
ndb->getPagePositions(docid, vpbreaks);
|
ndb->getPagePositions(docid, vpbreaks);
|
||||||
|
|
||||||
LOGABS("makeAbstract:" << (chron.millis()) << ": extracting. Got " << (vpbreaks.size()) << " pages\n" );
|
LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
|
||||||
|
vpbreaks.size() << " pages\n");
|
||||||
// Finally build the abstract by walking the map (in order of position)
|
// Finally build the abstract by walking the map (in order of position)
|
||||||
vabs.clear();
|
vabs.clear();
|
||||||
string chunk;
|
string chunk;
|
||||||
@ -570,45 +582,43 @@ int Query::Native::makeAbstract(Xapian::docid docid,
|
|||||||
int page = 0;
|
int page = 0;
|
||||||
string term;
|
string term;
|
||||||
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
||||||
it != sparseDoc.end(); it++) {
|
it != sparseDoc.end(); it++) {
|
||||||
LOGDEB2("Abtract:output " << (it->first) << " -> [" << (it->second) << "]\n" );
|
LOGDEB2("Abtract:output " << it->first << " -> [" << it->second <<
|
||||||
if (!occupiedmarker.compare(it->second)) {
|
"]\n");
|
||||||
LOGDEB("Abstract: qtrm position not filled ??\n" );
|
if (!occupiedmarker.compare(it->second)) {
|
||||||
continue;
|
LOGDEB("Abstract: qtrm position not filled ??\n");
|
||||||
}
|
continue;
|
||||||
if (chunk.empty() && !vpbreaks.empty()) {
|
}
|
||||||
page = ndb->getPageNumberForPosition(vpbreaks, it->first);
|
if (chunk.empty() && !vpbreaks.empty()) {
|
||||||
if (page < 0)
|
page = ndb->getPageNumberForPosition(vpbreaks, it->first);
|
||||||
page = 0;
|
if (page < 0)
|
||||||
term.clear();
|
page = 0;
|
||||||
}
|
term.clear();
|
||||||
Utf8Iter uit(it->second);
|
}
|
||||||
bool newcjk = false;
|
Utf8Iter uit(it->second);
|
||||||
if (TextSplit::isCJK(*uit))
|
bool newcjk = false;
|
||||||
newcjk = true;
|
if (TextSplit::isCJK(*uit))
|
||||||
if (!incjk || (incjk && !newcjk))
|
newcjk = true;
|
||||||
chunk += " ";
|
if (!incjk || (incjk && !newcjk))
|
||||||
incjk = newcjk;
|
chunk += " ";
|
||||||
if (searchTermPositions.find(it->first) != searchTermPositions.end())
|
incjk = newcjk;
|
||||||
term = it->second;
|
if (searchTermPositions.find(it->first) != searchTermPositions.end())
|
||||||
if (it->second == cstr_ellipsis) {
|
term = it->second;
|
||||||
vabs.push_back(Snippet(page, chunk).setTerm(term));
|
if (it->second == cstr_ellipsis) {
|
||||||
chunk.clear();
|
vabs.push_back(Snippet(page, chunk).setTerm(term));
|
||||||
} else {
|
chunk.clear();
|
||||||
if (it->second.compare(end_of_field_term) &&
|
} else {
|
||||||
it->second.compare(start_of_field_term))
|
if (it->second.compare(end_of_field_term) &&
|
||||||
chunk += it->second;
|
it->second.compare(start_of_field_term))
|
||||||
}
|
chunk += it->second;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!chunk.empty())
|
if (!chunk.empty())
|
||||||
vabs.push_back(Snippet(page, chunk).setTerm(term));
|
vabs.push_back(Snippet(page, chunk).setTerm(term));
|
||||||
|
|
||||||
LOGDEB2("makeAbtract: done in " << (chron.millis()) << " mS\n" );
|
LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user