From f6d8c44cc56b24626fd52836691c84c8ed677cc5 Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 25 Jun 2007 10:25:39 +0000 Subject: [PATCH] simplified and hopefully improved abstract generation --- src/rcldb/rcldb.cpp | 226 ++++++++++++++++++++++---------------------- 1 file changed, 114 insertions(+), 112 deletions(-) diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 2e11f7b3..a9389538 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.118 2007-06-22 06:14:04 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.119 2007-06-25 10:25:39 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -56,6 +56,10 @@ using namespace std; #define MIN(A,B) (A m_termfreqs; + // Term frequencies for current query. See makeAbstract, setQuery + map m_termfreqs; Native(Db *db) : m_db(db), @@ -232,12 +236,19 @@ static list noPrefixList(const list& in) return out; } +//#define DEBUGABSTRACT +#ifdef DEBUGABSTRACT +#define LOGABS LOGDEB +#else +#define LOGABS LOGDEB2 +#endif + // Build a document abstract by extracting text chunks around the query terms // This uses the db termlists, not the original document. string Native::makeAbstract(Xapian::docid docid, const list& iterms) { Chrono chron; - LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), + LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen)); list terms = noPrefixList(iterms); @@ -245,90 +256,105 @@ string Native::makeAbstract(Xapian::docid docid, const list& iterms) return ""; } - // We may want to use the db-wide freqs to tune the abstracts one - // day but we currently don't -#if 0 + // Retrieve db-wide frequencies for the query terms if (m_termfreqs.empty()) { + double doccnt = db.get_doccount(); + if (doccnt == 0) doccnt = 1; for (list::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { - m_termfreqs[*qit] = db.get_termfreq(*qit); - LOGDEB(("makeAbstract: [%s] db freq %d\n", qit->c_str(), + m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt; + LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), m_termfreqs[*qit])); } - LOGDEB(("makeAbstract:%d: got termfreqs\n", chron.ms())); + LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms())); } -#endif - // Retrieve the term Within Document Frequencies. We are going to try + // Compute a term quality coefficient by retrieving the term + // Within Document Frequencies and multiplying by overal term + // frequency, then using log-based thresholds. We are going to try // and show text around the less common search terms. - map termwdfs; - int totalqtermoccs = 0; + map termQcoefs; + double totalweight = 0; + double doclen = db.get_doclength(docid); + if (doclen == 0) doclen = 1; for (list::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { Xapian::TermIterator term = db.termlist_begin(docid); term.skip_to(*qit); if (term != db.termlist_end(docid) && *term == *qit) { - int f = term.get_wdf(); - termwdfs[*qit] = f; - totalqtermoccs += f; - LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(), - termwdfs[*qit])); + double q = (term.get_wdf() / doclen) * m_termfreqs[*qit]; + q = -log10(q); + if (q < 3) { + q = 0.05; + } else if (q < 4) { + q = 0.3; + } else if (q < 5) { + q = 0.7; + } else if (q < 6) { + q = 0.8; + } else { + q = 1; + } + termQcoefs[*qit] = q; + totalweight += q; } } - LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n", - chron.ms(), totalqtermoccs)); - if (totalqtermoccs == 0) { - LOGERR(("makeAbstract: no term occurrences !\n")); - return ""; - } + LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms())); - // Build a sorted by frequency term list: it seems reasonable to - // prefer sampling around the less frequent terms: - multimap bywdf; + // Build a sorted by quality term list. + multimap byQ; for (list::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { - if (termwdfs.find(*qit) != termwdfs.end()) - bywdf.insert(pair(termwdfs[*qit], *qit)); + if (termQcoefs.find(*qit) != termQcoefs.end()) + byQ.insert(pair(termQcoefs[*qit], *qit)); } - // For each of the query terms, query xapian for its positions - // list in the document. For each position entry, remember it in qtermposs - // and insert it and its neighbours in the set of 'interesting' positions +#ifdef DEBUGABSTRACT + for (multimap::reverse_iterator qit = byQ.rbegin(); + qit != byQ.rend(); qit++) { + LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str())); + } +#endif + + + // For each of the query terms, ask xapian for its positions list + // in the document. For each position entry, remember it in + // qtermposs and insert it and its neighbours in the set of + // 'interesting' positions // The terms 'array' that we partially populate with the document // terms, at their positions around the search terms positions: map sparseDoc; - // All the query term positions. We remember this mainly because we are - // going to random-shuffle it for selecting the chunks that we actually - // print. + // All the chosen query term positions. vector qtermposs; - // Limit the total number of slots we populate. + // Limit the total number of slots we populate. The 7 is taken as + // average word size. It was a mistake to have the user max + // abstract size parameter in characters, we basically only deal + // with words. We used to limit the character size at the end, but + // this damaged our careful selection of terms const unsigned int maxtotaloccs = - MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1))); - LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n", - chron.ms(), totalqtermoccs, maxtotaloccs)); -#if 0 - for (multimap::iterator qit = bywdf.begin(); - qit != bywdf.end(); qit++) { - LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str())); + m_db->m_synthAbsLen /(7 * (m_db->m_synthAbsWordCtxLen+1)); + LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs)); + // This can't happen, but would crash us + if (totalweight == 0.0) { + LOGERR(("makeAbstract: 0 totalweight!\n")); + return ""; } -#endif - // Find the text positions which we will have to fill with terms - unsigned int totaloccs = 0; - for (multimap::iterator qit = bywdf.begin(); - qit != bywdf.end(); qit++) { + // Let's go populate + for (multimap::reverse_iterator qit = byQ.rbegin(); + qit != byQ.rend(); qit++) { string qterm = qit->second; unsigned int maxoccs; - if (bywdf.size() == 1) { + if (byQ.size() == 1) { maxoccs = maxtotaloccs; } else { - float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) / - (bywdf.size() - 1); + // We give more slots to the better terms + float q = qit->first / totalweight; maxoccs = int(ceil(maxtotaloccs * q)); - LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n", + LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n", qterm.c_str(), maxoccs, q)); } @@ -341,7 +367,10 @@ string Native::makeAbstract(Xapian::docid docid, const list& iterms) for (pos = db.positionlist_begin(docid, qterm); pos != db.positionlist_end(docid, qterm); pos++) { unsigned int ipos = *pos; - LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos)); + if (ipos < baseTextPosition) // Not in text body + continue; + LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n", + qterm.c_str(), ipos, occurrences, maxoccs)); // Remember the term position qtermposs.push_back(ipos); // Add adjacent slots to the set to populate at next step @@ -353,26 +382,30 @@ string Native::makeAbstract(Xapian::docid docid, const list& iterms) else sparseDoc[ii] = emptys; } - // Limit the number of occurences we keep for each - // term. The abstract has a finite length anyway ! - if (occurrences++ > maxoccs) + // Limit to allocated occurences and total size + if (++occurrences >= maxoccs || + qtermposs.size() >= maxtotaloccs) break; } } catch (...) { // Term does not occur. No problem. } - // Limit total size - if (totaloccs++ > maxtotaloccs) + if (qtermposs.size() >= maxtotaloccs) break; } - - LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n", + LOGABS(("makeAbstract:%d:chosen number of positions %d\n", chron.millis(), qtermposs.size())); - // Walk the full document position list (for each term walk - // position list) and populate slots around the query terms. We - // arbitrarily truncate the list to avoid taking forever. If we do - // cutoff, the abstract may be inconsistant, which is bad... + // This can happen if there are term occurences in the keywords + // etc. but not elsewhere ? + if (qtermposs.size() == 0) + return ""; + + // Walk all document's terms position lists and populate slots + // around the query terms. We arbitrarily truncate the list to + // avoid taking forever. If we do cutoff, the abstract may be + // inconsistant (missing words, potentially altering meaning), + // which is bad... { Xapian::TermIterator term; int cutoff = 500 * 1000; @@ -401,7 +434,7 @@ string Native::makeAbstract(Xapian::docid docid, const list& iterms) // at the same position, we want to keep only the // first one (ie: dockes and dockes@wanadoo.fr) if (vit->second.empty()) { - LOGDEB2(("makeAbstract: populating: [%s] at %d\n", + LOGABS(("makeAbstract: populating: [%s] at %d\n", (*term).c_str(), *pos)); sparseDoc[*pos] = *term; } @@ -428,61 +461,29 @@ string Native::makeAbstract(Xapian::docid docid, const list& iterms) } #endif - LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis())); + LOGDEB(("makeAbstract:%d: extracting\n", chron.millis())); - // We randomize the selection of term positions, from which we - // shall pull, starting at the beginning, until the abstract is - // big enough. The abstract is finally built in correct position - // order, thanks to the position map. - random_shuffle(qtermposs.begin(), qtermposs.end()); - map mabs; - unsigned int abslen = 0; - - // Extract data around the N first (in random order) query term - // positions, and store the terms in the map. Don't concatenate - // immediately into chunks because there might be overlaps + // Add "..." at ends of chunks for (vector::const_iterator pos = qtermposs.begin(); pos != qtermposs.end(); pos++) { - - if (int(abslen) > m_db->m_synthAbsLen) - break; - - unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen); unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen; - LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto)); - - for (unsigned int ii = sta; ii <= sto; ii++) { - - if (int(abslen) > m_db->m_synthAbsLen) - break; - map::const_iterator vit = - sparseDoc.find(ii); - if (vit != sparseDoc.end() && !vit->second.empty()) { - LOGDEB2(("makeAbstract: position %d -> [%s]\n", - ii, vit->second.c_str())); - mabs[ii] = vit->second; - abslen += vit->second.length(); - } else { - LOGDEB2(("makeAbstract: empty position at %d\n", ii)); - } - } - // Possibly add a ... at the end of chunk if it's not // overlapping - if (mabs.find(sto+1) == mabs.end()) - mabs[sto+1] = "..."; + if (sparseDoc.find(sto) != sparseDoc.end() && + sparseDoc.find(sto+1) == sparseDoc.end()) + sparseDoc[sto+1] = "..."; } - // Build the abstract by walking the map (in order of position) + // Finally build the abstract by walking the map (in order of position) string abstract; - for (map::const_iterator it = mabs.begin(); - it != mabs.end(); it++) { + for (map::const_iterator it = sparseDoc.begin(); + it != sparseDoc.end(); it++) { LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str())); abstract += it->second + " "; } - // This happens for docs with no terms (only filename) indexed. I'll fix + // This happens for docs with no terms (only filename) indexed? I'll fix // one day (yeah) if (!abstract.compare("... ")) abstract.clear(); @@ -973,16 +974,18 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) } } + if (splitData.curpos < baseTextPosition) + splitData.basepos = baseTextPosition; + else + splitData.basepos += splitData.curpos + 100; - // Split and index body text + // Finally: split and index body text LOGDEB2(("Db::add: split body\n")); if (!dumb_string(doc.text, noacc)) { LOGERR(("Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); - splitData.basepos += splitData.curpos + 100; - ////// Special terms for other metadata. No positions for these. // Mime type @@ -1425,7 +1428,7 @@ bool Db::filenameWildExp(const string& fnexp, list& names) return true; } -// Prepare query out of "advanced search" data +// Prepare query out of user search data bool Db::setQuery(RefCntr sdata, int opts, const string& stemlang) { @@ -1447,7 +1450,6 @@ bool Db::setQuery(RefCntr sdata, int opts, m_reason += sdata->getReason(); return false; } - m_ndb->query = xq; delete m_ndb->enquire; m_ndb->enquire = new Xapian::Enquire(m_ndb->db);