Don't insert spaces in CJK abstracts
This commit is contained in:
parent
d318f506bf
commit
64ef8d0b81
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.38 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -149,6 +149,11 @@ static inline int whatcc(unsigned int c)
|
|||||||
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
||||||
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
||||||
|
|
||||||
|
bool TextSplit::isCJK(int c)
|
||||||
|
{
|
||||||
|
return UNICODE_IS_CJK(c);
|
||||||
|
}
|
||||||
|
|
||||||
bool TextSplit::o_processCJK = true;
|
bool TextSplit::o_processCJK = true;
|
||||||
unsigned int TextSplit::o_CJKNgramLen = 2;
|
unsigned int TextSplit::o_CJKNgramLen = 2;
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.22 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -97,6 +97,9 @@ public:
|
|||||||
*/
|
*/
|
||||||
static bool stringToStrings(const string &s, list<string> &tokens);
|
static bool stringToStrings(const string &s, list<string> &tokens);
|
||||||
|
|
||||||
|
/** Is char CJK ? */
|
||||||
|
static bool isCJK(int c);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Flags m_flags;
|
Flags m_flags;
|
||||||
TextSplitCB *m_cb;
|
TextSplitCB *m_cb;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.150 2008-12-12 11:02:20 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.151 2008-12-12 11:53:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -444,10 +444,19 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||||||
|
|
||||||
// Finally build the abstract by walking the map (in order of position)
|
// Finally build the abstract by walking the map (in order of position)
|
||||||
string abstract;
|
string abstract;
|
||||||
|
abstract.reserve(sparseDoc.size() * 10);
|
||||||
|
bool incjk = false;
|
||||||
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
||||||
it != sparseDoc.end(); it++) {
|
it != sparseDoc.end(); it++) {
|
||||||
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||||
abstract += it->second + " ";
|
Utf8Iter uit(it->second);
|
||||||
|
bool newcjk = false;
|
||||||
|
if (TextSplit::isCJK(*uit))
|
||||||
|
newcjk = true;
|
||||||
|
if (!incjk || (incjk && !newcjk))
|
||||||
|
abstract += " ";
|
||||||
|
incjk = newcjk;
|
||||||
|
abstract += it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This happens for docs with no terms (only filename) indexed? I'll fix
|
// This happens for docs with no terms (only filename) indexed? I'll fix
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user