fix abstract generation when the match term is a multiword span (esp. for Japanese)

2008-12-17 14:26:49 +00:00 · 2008-12-17 14:26:49 +00:00 · 4eebdf5708
commit 4eebdf5708
parent 5463ea258f
2 changed files with 34 additions and 20 deletions
--- a/src/common/Makefile
+++ b/src/common/Makefile
@ -1,4 +1,4 @@
-# @(#$Id: Makefile,v 1.15 2008-11-24 15:47:40 dockes Exp $  (C) 2005 J.F.Dockes
+# @(#$Id: Makefile,v 1.16 2008-12-17 14:26:49 dockes Exp $  (C) 2005 J.F.Dockes
 depth = ..
 include $(depth)/mk/sysconf

@ -20,7 +20,7 @@ trunacpp.o : unacpp.cpp unacpp.h

 TEXTSPLIT_OBJS= trtextsplit.o  $(BIGLIB)
 textsplit : $(TEXTSPLIT_OBJS)
-	$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS)
+	$(CXX) $(ALL_CXXFLAGS) -o textsplit $(TEXTSPLIT_OBJS) $(LIBICONV)
 trtextsplit.o : textsplit.cpp 
 	$(CXX) $(ALL_CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \
 	       textsplit.cpp
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.152 2008-12-17 08:01:40 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.153 2008-12-17 14:26:49 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -310,6 +310,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
 	return string();
    }

+    // This is used to mark positions overlapped by a multi-word match term
+    const string occupiedmarker("?");
+    const string ellipsis("...");
+
    // Let's go populate
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
 	 qit != byQ.rend(); qit++) {
@ -324,7 +328,10 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
 	    LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n", 
 		    qterm.c_str(), maxoccs, q));
 	}
-		
+
+	// The match term may span several words
+	int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
+
 	Xapian::PositionIterator pos;
 	// There may be query terms not in this doc. This raises an
 	// exception when requesting the position list, we catch it.
@ -340,15 +347,32 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
 			qterm.c_str(), ipos, occurrences, maxoccs));
 		// Remember the term position
 		qtermposs.push_back(ipos);
-		// Add adjacent slots to the set to populate at next step
+
+		// Add adjacent slots to the set to populate at next
+		// step by inserting empty strings. Special provisions
+		// for adding ellipsis and for positions overlapped by
+		// the match term.
 		unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
-		unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
+		unsigned int sto = ipos + qtrmwrdcnt-1 + 
+		    m_db->m_synthAbsWordCtxLen;
 		for (unsigned int ii = sta; ii <= sto;  ii++) {
-		    if (ii == (unsigned int)ipos)
+		    if (ii == (unsigned int)ipos) {
 			sparseDoc[ii] = qterm;
-		    else
+		    } else if (ii > (unsigned int)ipos && 
+			       ii < (unsigned int)ipos + qtrmwrdcnt) {
+			sparseDoc[ii] = occupiedmarker;
+		    } else if (!sparseDoc[ii].compare(ellipsis)) {
+			// For an empty slot, the test has the side
+			// effect of inserting an empty string, which
+			// is what we want
 			sparseDoc[ii] = emptys;
+		    }
 		}
+		// Add ... at the end. This may be replaced later by
+		// an overlapping extract
+		if (sparseDoc[sto+1].empty())
+		    sparseDoc[sto+1] = ellipsis;
+
 		// Limit to allocated occurrences and total size
 		if (++occurrences >= maxoccs || 
 		    qtermposs.size() >= maxtotaloccs)
@ -430,18 +454,6 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)

    LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));

-    // Add "..." at ends of chunks
-    for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
-	 pos != qtermposs.end(); pos++) {
-	unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
-
-	// Possibly add a ... at the end of chunk if it's not
-	// overlapping
-	if (sparseDoc.find(sto) != sparseDoc.end() && 
-	    sparseDoc.find(sto+1) == sparseDoc.end())
-	    sparseDoc[sto+1] = "...";
-    }
-
    // Finally build the abstract by walking the map (in order of position)
    string abstract;
    abstract.reserve(sparseDoc.size() * 10);
@ -449,6 +461,8 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
 	 it != sparseDoc.end(); it++) {
 	LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
+	if (!occupiedmarker.compare(it->second))
+	    continue;
 	Utf8Iter uit(it->second);
 	bool newcjk = false;
 	if (TextSplit::isCJK(*uit))