Don't insert spaces in CJK abstracts

This commit is contained in:
dockes 2008-12-12 11:53:45 +00:00
parent d318f506bf
commit 64ef8d0b81
3 changed files with 22 additions and 5 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.38 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -149,6 +149,11 @@ static inline int whatcc(unsigned int c)
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
// Public wrapper around the file-local UNICODE_IS_CJK range test, so that
// code outside this file can ask whether a character value is CJK.
bool TextSplit::isCJK(int c)
{
    if (UNICODE_IS_CJK(c)) {
        return true;
    }
    return false;
}
// Definitions of the class-wide CJK settings with their initial values:
// the processing flag starts out true and the n-gram length starts out at 2.
bool TextSplit::o_processCJK = true;
unsigned int TextSplit::o_CJKNgramLen = 2;

View File

@ -16,7 +16,7 @@
*/
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: textsplit.h,v 1.22 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -96,7 +96,10 @@ public:
* handles all Unicode whitespace, but needs correct utf-8 input
*/
static bool stringToStrings(const string &s, list<string> &tokens);
/** Tell whether the character value c is classified as CJK.
 * Static: can be called without a TextSplit instance.
 * @param c character value to test.
 * @return true if c is CJK, false otherwise.
 */
static bool isCJK(int c);
private:
Flags m_flags;
TextSplitCB *m_cb;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.150 2008-12-12 11:02:20 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.151 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -444,10 +444,19 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
// Finally build the abstract by walking the map (in order of position)
string abstract;
abstract.reserve(sparseDoc.size() * 10);
bool incjk = false;
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
it != sparseDoc.end(); it++) {
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
abstract += it->second + " ";
Utf8Iter uit(it->second);
bool newcjk = false;
if (TextSplit::isCJK(*uit))
newcjk = true;
if (!incjk || (incjk && !newcjk))
abstract += " ";
incjk = newcjk;
abstract += it->second;
}
// This happens for docs with no terms (only filename) indexed? I'll fix