fix abstract generation when the match term is a multiword span (esp. for japanese)

This commit is contained in:
dockes 2008-12-17 14:26:49 +00:00
parent 5463ea258f
commit 4eebdf5708
2 changed files with 34 additions and 20 deletions

View File

@ -1,4 +1,4 @@
# @(#$Id: Makefile,v 1.15 2008-11-24 15:47:40 dockes Exp $ (C) 2005 J.F.Dockes
# @(#$Id: Makefile,v 1.16 2008-12-17 14:26:49 dockes Exp $ (C) 2005 J.F.Dockes
depth = ..
include $(depth)/mk/sysconf
@ -20,7 +20,7 @@ trunacpp.o : unacpp.cpp unacpp.h
TEXTSPLIT_OBJS= trtextsplit.o $(BIGLIB)
textsplit : $(TEXTSPLIT_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS) $(LIBICONV)
trtextsplit.o : textsplit.cpp
$(CXX) $(ALL_CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \
textsplit.cpp

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.152 2008-12-17 08:01:40 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.153 2008-12-17 14:26:49 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -310,6 +310,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
return string();
}
// This is used to mark positions overlapped by a multi-word match term
const string occupiedmarker("?");
const string ellipsis("...");
// Let's go populate
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
qit != byQ.rend(); qit++) {
@ -324,7 +328,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
qterm.c_str(), maxoccs, q));
}
// The match term may span several words
int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
Xapian::PositionIterator pos;
// There may be query terms not in this doc. This raises an
// exception when requesting the position list, we catch it.
@ -340,15 +347,32 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
qterm.c_str(), ipos, occurrences, maxoccs));
// Remember the term position
qtermposs.push_back(ipos);
// Add adjacent slots to the set to populate at next step
// Add adjacent slots to the set to populate at next
// step by inserting empty strings. Special provisions
// for adding ellipsis and for positions overlapped by
// the match term.
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
unsigned int sto = ipos + qtrmwrdcnt-1 +
m_db->m_synthAbsWordCtxLen;
for (unsigned int ii = sta; ii <= sto; ii++) {
if (ii == (unsigned int)ipos)
if (ii == (unsigned int)ipos) {
sparseDoc[ii] = qterm;
else
} else if (ii > (unsigned int)ipos &&
ii < (unsigned int)ipos + qtrmwrdcnt) {
sparseDoc[ii] = occupiedmarker;
} else if (!sparseDoc[ii].compare(ellipsis)) {
// For an empty, slot, the test has a side
// effect of inserting an empty string which
// is what we want
sparseDoc[ii] = emptys;
}
}
// Add ... at the end. This may be replaced later by
// an overlapping extract
if (sparseDoc[sto+1].empty())
sparseDoc[sto+1] = ellipsis;
// Limit to allocated occurences and total size
if (++occurrences >= maxoccs ||
qtermposs.size() >= maxtotaloccs)
@ -430,18 +454,6 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
// Add "..." at ends of chunks
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
pos != qtermposs.end(); pos++) {
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
// Possibly add a ... at the end of chunk if it's not
// overlapping
if (sparseDoc.find(sto) != sparseDoc.end() &&
sparseDoc.find(sto+1) == sparseDoc.end())
sparseDoc[sto+1] = "...";
}
// Finally build the abstract by walking the map (in order of position)
string abstract;
abstract.reserve(sparseDoc.size() * 10);
@ -449,6 +461,8 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
it != sparseDoc.end(); it++) {
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
if (!occupiedmarker.compare(it->second))
continue;
Utf8Iter uit(it->second);
bool newcjk = false;
if (TextSplit::isCJK(*uit))