From 64ef8d0b8136d50568cabc38daf49489dc6ff52d Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 12 Dec 2008 11:53:45 +0000 Subject: [PATCH] dont insert space in cjk abstracts --- src/common/textsplit.cpp | 7 ++++++- src/common/textsplit.h | 7 +++++-- src/rcldb/rcldb.cpp | 13 +++++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 4826ff99..4528bb79 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.38 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -149,6 +149,11 @@ static inline int whatcc(unsigned int c) || ((p) >= 0x20000 && (p) <= 0x2A6DF) \ || ((p) >= 0x2F800 && (p) <= 0x2FA1F)) +bool TextSplit::isCJK(int c) +{ + return UNICODE_IS_CJK(c); +} + bool TextSplit::o_processCJK = true; unsigned int TextSplit::o_CJKNgramLen = 2; diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 4c3b1ab2..3727a148 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -16,7 +16,7 @@ */ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.22 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -96,7 +96,10 @@ public: * handles all Unicode whitespace, but needs correct utf-8 input */ static bool stringToStrings(const string &s, list &tokens); - + + /** Is char CJK ? */ + static bool isCJK(int c); + private: Flags m_flags; TextSplitCB *m_cb; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 5eb1e55d..dfbcaa95 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.150 2008-12-12 11:02:20 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.151 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -444,10 +444,19 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) // Finally build the abstract by walking the map (in order of position) string abstract; + abstract.reserve(sparseDoc.size() * 10); + bool incjk = false; for (map::const_iterator it = sparseDoc.begin(); it != sparseDoc.end(); it++) { LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str())); - abstract += it->second + " "; + Utf8Iter uit(it->second); + bool newcjk = false; + if (TextSplit::isCJK(*uit)) + newcjk = true; + if (!incjk || (incjk && !newcjk)) + abstract += " "; + incjk = newcjk; + abstract += it->second; } // This happens for docs with no terms (only filename) indexed? I'll fix