Fix abstract generation when the match term is a multiword span (esp. for Japanese)
This commit is contained in:
parent
5463ea258f
commit
4eebdf5708
@ -1,4 +1,4 @@
|
|||||||
# @(#$Id: Makefile,v 1.15 2008-11-24 15:47:40 dockes Exp $ (C) 2005 J.F.Dockes
|
# @(#$Id: Makefile,v 1.16 2008-12-17 14:26:49 dockes Exp $ (C) 2005 J.F.Dockes
|
||||||
depth = ..
|
depth = ..
|
||||||
include $(depth)/mk/sysconf
|
include $(depth)/mk/sysconf
|
||||||
|
|
||||||
@ -20,7 +20,7 @@ trunacpp.o : unacpp.cpp unacpp.h
|
|||||||
|
|
||||||
TEXTSPLIT_OBJS= trtextsplit.o $(BIGLIB)
|
TEXTSPLIT_OBJS= trtextsplit.o $(BIGLIB)
|
||||||
textsplit : $(TEXTSPLIT_OBJS)
|
textsplit : $(TEXTSPLIT_OBJS)
|
||||||
$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS)
|
$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS) $(LIBICONV)
|
||||||
trtextsplit.o : textsplit.cpp
|
trtextsplit.o : textsplit.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \
|
$(CXX) $(ALL_CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \
|
||||||
textsplit.cpp
|
textsplit.cpp
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.152 2008-12-17 08:01:40 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.153 2008-12-17 14:26:49 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -310,6 +310,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
return string();
|
return string();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This is used to mark positions overlapped by a multi-word match term
|
||||||
|
const string occupiedmarker("?");
|
||||||
|
const string ellipsis("...");
|
||||||
|
|
||||||
// Let's go populate
|
// Let's go populate
|
||||||
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
||||||
qit != byQ.rend(); qit++) {
|
qit != byQ.rend(); qit++) {
|
||||||
@ -325,6 +329,9 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
qterm.c_str(), maxoccs, q));
|
qterm.c_str(), maxoccs, q));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The match term may span several words
|
||||||
|
int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
|
||||||
|
|
||||||
Xapian::PositionIterator pos;
|
Xapian::PositionIterator pos;
|
||||||
// There may be query terms not in this doc. This raises an
|
// There may be query terms not in this doc. This raises an
|
||||||
// exception when requesting the position list, we catch it.
|
// exception when requesting the position list, we catch it.
|
||||||
@ -340,15 +347,32 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
qterm.c_str(), ipos, occurrences, maxoccs));
|
qterm.c_str(), ipos, occurrences, maxoccs));
|
||||||
// Remember the term position
|
// Remember the term position
|
||||||
qtermposs.push_back(ipos);
|
qtermposs.push_back(ipos);
|
||||||
// Add adjacent slots to the set to populate at next step
|
|
||||||
|
// Add adjacent slots to the set to populate at next
|
||||||
|
// step by inserting empty strings. Special provisions
|
||||||
|
// for adding ellipsis and for positions overlapped by
|
||||||
|
// the match term.
|
||||||
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
||||||
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
|
unsigned int sto = ipos + qtrmwrdcnt-1 +
|
||||||
|
m_db->m_synthAbsWordCtxLen;
|
||||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||||
if (ii == (unsigned int)ipos)
|
if (ii == (unsigned int)ipos) {
|
||||||
sparseDoc[ii] = qterm;
|
sparseDoc[ii] = qterm;
|
||||||
else
|
} else if (ii > (unsigned int)ipos &&
|
||||||
|
ii < (unsigned int)ipos + qtrmwrdcnt) {
|
||||||
|
sparseDoc[ii] = occupiedmarker;
|
||||||
|
} else if (!sparseDoc[ii].compare(ellipsis)) {
|
||||||
|
// For an empty slot, the test has a side
|
||||||
|
// effect of inserting an empty string which
|
||||||
|
// is what we want
|
||||||
sparseDoc[ii] = emptys;
|
sparseDoc[ii] = emptys;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// Add ... at the end. This may be replaced later by
|
||||||
|
// an overlapping extract
|
||||||
|
if (sparseDoc[sto+1].empty())
|
||||||
|
sparseDoc[sto+1] = ellipsis;
|
||||||
|
|
||||||
// Limit to allocated occurences and total size
|
// Limit to allocated occurences and total size
|
||||||
if (++occurrences >= maxoccs ||
|
if (++occurrences >= maxoccs ||
|
||||||
qtermposs.size() >= maxtotaloccs)
|
qtermposs.size() >= maxtotaloccs)
|
||||||
@ -430,18 +454,6 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
|
|
||||||
LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
|
LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
|
||||||
|
|
||||||
// Add "..." at ends of chunks
|
|
||||||
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
|
|
||||||
pos != qtermposs.end(); pos++) {
|
|
||||||
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
|
|
||||||
|
|
||||||
// Possibly add a ... at the end of chunk if it's not
|
|
||||||
// overlapping
|
|
||||||
if (sparseDoc.find(sto) != sparseDoc.end() &&
|
|
||||||
sparseDoc.find(sto+1) == sparseDoc.end())
|
|
||||||
sparseDoc[sto+1] = "...";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finally build the abstract by walking the map (in order of position)
|
// Finally build the abstract by walking the map (in order of position)
|
||||||
string abstract;
|
string abstract;
|
||||||
abstract.reserve(sparseDoc.size() * 10);
|
abstract.reserve(sparseDoc.size() * 10);
|
||||||
@ -449,6 +461,8 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
||||||
it != sparseDoc.end(); it++) {
|
it != sparseDoc.end(); it++) {
|
||||||
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||||
|
if (!occupiedmarker.compare(it->second))
|
||||||
|
continue;
|
||||||
Utf8Iter uit(it->second);
|
Utf8Iter uit(it->second);
|
||||||
bool newcjk = false;
|
bool newcjk = false;
|
||||||
if (TextSplit::isCJK(*uit))
|
if (TextSplit::isCJK(*uit))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user