fix abstract generation when the match term is a multiword span (esp. for japanese)
This commit is contained in:
parent
5463ea258f
commit
4eebdf5708
@ -1,4 +1,4 @@
|
||||
# @(#$Id: Makefile,v 1.15 2008-11-24 15:47:40 dockes Exp $ (C) 2005 J.F.Dockes
|
||||
# @(#$Id: Makefile,v 1.16 2008-12-17 14:26:49 dockes Exp $ (C) 2005 J.F.Dockes
|
||||
depth = ..
|
||||
include $(depth)/mk/sysconf
|
||||
|
||||
@ -20,7 +20,7 @@ trunacpp.o : unacpp.cpp unacpp.h
|
||||
|
||||
TEXTSPLIT_OBJS= trtextsplit.o $(BIGLIB)
|
||||
textsplit : $(TEXTSPLIT_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS) $(LIBICONV)
|
||||
trtextsplit.o : textsplit.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \
|
||||
textsplit.cpp
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.152 2008-12-17 08:01:40 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.153 2008-12-17 14:26:49 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -310,6 +310,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
||||
return string();
|
||||
}
|
||||
|
||||
// This is used to mark positions overlapped by a multi-word match term
|
||||
const string occupiedmarker("?");
|
||||
const string ellipsis("...");
|
||||
|
||||
// Let's go populate
|
||||
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
||||
qit != byQ.rend(); qit++) {
|
||||
@ -324,7 +328,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
||||
LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
|
||||
qterm.c_str(), maxoccs, q));
|
||||
}
|
||||
|
||||
|
||||
// The match term may span several words
|
||||
int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
|
||||
|
||||
Xapian::PositionIterator pos;
|
||||
// There may be query terms not in this doc. This raises an
|
||||
// exception when requesting the position list, we catch it.
|
||||
@ -340,15 +347,32 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
||||
qterm.c_str(), ipos, occurrences, maxoccs));
|
||||
// Remember the term position
|
||||
qtermposs.push_back(ipos);
|
||||
// Add adjacent slots to the set to populate at next step
|
||||
|
||||
// Add adjacent slots to the set to populate at next
|
||||
// step by inserting empty strings. Special provisions
|
||||
// for adding ellipsis and for positions overlapped by
|
||||
// the match term.
|
||||
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
||||
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
|
||||
unsigned int sto = ipos + qtrmwrdcnt-1 +
|
||||
m_db->m_synthAbsWordCtxLen;
|
||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||
if (ii == (unsigned int)ipos)
|
||||
if (ii == (unsigned int)ipos) {
|
||||
sparseDoc[ii] = qterm;
|
||||
else
|
||||
} else if (ii > (unsigned int)ipos &&
|
||||
ii < (unsigned int)ipos + qtrmwrdcnt) {
|
||||
sparseDoc[ii] = occupiedmarker;
|
||||
} else if (!sparseDoc[ii].compare(ellipsis)) {
|
||||
// For an empty, slot, the test has a side
|
||||
// effect of inserting an empty string which
|
||||
// is what we want
|
||||
sparseDoc[ii] = emptys;
|
||||
}
|
||||
}
|
||||
// Add ... at the end. This may be replaced later by
|
||||
// an overlapping extract
|
||||
if (sparseDoc[sto+1].empty())
|
||||
sparseDoc[sto+1] = ellipsis;
|
||||
|
||||
// Limit to allocated occurences and total size
|
||||
if (++occurrences >= maxoccs ||
|
||||
qtermposs.size() >= maxtotaloccs)
|
||||
@ -430,18 +454,6 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
||||
|
||||
LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
|
||||
|
||||
// Add "..." at ends of chunks
|
||||
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
|
||||
pos != qtermposs.end(); pos++) {
|
||||
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
|
||||
|
||||
// Possibly add a ... at the end of chunk if it's not
|
||||
// overlapping
|
||||
if (sparseDoc.find(sto) != sparseDoc.end() &&
|
||||
sparseDoc.find(sto+1) == sparseDoc.end())
|
||||
sparseDoc[sto+1] = "...";
|
||||
}
|
||||
|
||||
// Finally build the abstract by walking the map (in order of position)
|
||||
string abstract;
|
||||
abstract.reserve(sparseDoc.size() * 10);
|
||||
@ -449,6 +461,8 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
||||
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
||||
it != sparseDoc.end(); it++) {
|
||||
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||
if (!occupiedmarker.compare(it->second))
|
||||
continue;
|
||||
Utf8Iter uit(it->second);
|
||||
bool newcjk = false;
|
||||
if (TextSplit::isCJK(*uit))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user