From 4eebdf57084830ec0b6882cbcca38093ec250c85 Mon Sep 17 00:00:00 2001 From: dockes Date: Wed, 17 Dec 2008 14:26:49 +0000 Subject: [PATCH] fix abstract generation when the match term is a multiword span (esp. for japanese) --- src/common/Makefile | 4 ++-- src/rcldb/rcldb.cpp | 50 +++++++++++++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/common/Makefile b/src/common/Makefile index 0d14f629..fcc3a943 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -1,4 +1,4 @@ -# @(#$Id: Makefile,v 1.15 2008-11-24 15:47:40 dockes Exp $ (C) 2005 J.F.Dockes +# @(#$Id: Makefile,v 1.16 2008-12-17 14:26:49 dockes Exp $ (C) 2005 J.F.Dockes depth = .. include $(depth)/mk/sysconf @@ -20,7 +20,7 @@ trunacpp.o : unacpp.cpp unacpp.h TEXTSPLIT_OBJS= trtextsplit.o $(BIGLIB) textsplit : $(TEXTSPLIT_OBJS) - $(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS) + $(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS) $(LIBICONV) trtextsplit.o : textsplit.cpp $(CXX) $(ALL_CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \ textsplit.cpp diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index b585f6f7..2eb7f003 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.152 2008-12-17 08:01:40 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.153 2008-12-17 14:26:49 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -310,6 +310,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) return string(); } + // This is used to mark positions overlapped by a multi-word match term + const string occupiedmarker("?"); + const string ellipsis("..."); + // Let's go populate for (multimap::reverse_iterator qit = byQ.rbegin(); qit != byQ.rend(); qit++) { @@ -324,7 +328,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) 
LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n", qterm.c_str(), maxoccs, q)); } - + + // The match term may span several words + int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS); + Xapian::PositionIterator pos; // There may be query terms not in this doc. This raises an // exception when requesting the position list, we catch it. @@ -340,15 +347,32 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) qterm.c_str(), ipos, occurrences, maxoccs)); // Remember the term position qtermposs.push_back(ipos); - // Add adjacent slots to the set to populate at next step + + // Add adjacent slots to the set to populate at next + // step by inserting empty strings. Special provisions + // for adding ellipsis and for positions overlapped by + // the match term. unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen); - unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen; + unsigned int sto = ipos + qtrmwrdcnt-1 + + m_db->m_synthAbsWordCtxLen; for (unsigned int ii = sta; ii <= sto; ii++) { - if (ii == (unsigned int)ipos) + if (ii == (unsigned int)ipos) { sparseDoc[ii] = qterm; - else + } else if (ii > (unsigned int)ipos && + ii < (unsigned int)ipos + qtrmwrdcnt) { + sparseDoc[ii] = occupiedmarker; + } else if (!sparseDoc[ii].compare(ellipsis)) { + // For an empty slot, the test has a side + // effect of inserting an empty string which + // is what we want sparseDoc[ii] = emptys; + } } + // Add ... at the end. This may be replaced later by + // an overlapping extract + if (sparseDoc[sto+1].empty()) + sparseDoc[sto+1] = ellipsis; + // Limit to allocated occurences and total size if (++occurrences >= maxoccs || qtermposs.size() >= maxtotaloccs) @@ -430,18 +454,6 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) LOGDEB(("makeAbstract:%d: extracting\n", chron.millis())); - // Add "..." 
at ends of chunks - for (vector::const_iterator pos = qtermposs.begin(); - pos != qtermposs.end(); pos++) { - unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen; - - // Possibly add a ... at the end of chunk if it's not - // overlapping - if (sparseDoc.find(sto) != sparseDoc.end() && - sparseDoc.find(sto+1) == sparseDoc.end()) - sparseDoc[sto+1] = "..."; - } - // Finally build the abstract by walking the map (in order of position) string abstract; abstract.reserve(sparseDoc.size() * 10); @@ -449,6 +461,8 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) for (map::const_iterator it = sparseDoc.begin(); it != sparseDoc.end(); it++) { LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str())); + if (!occupiedmarker.compare(it->second)) + continue; Utf8Iter uit(it->second); bool newcjk = false; if (TextSplit::isCJK(*uit))