Partially revert the change treating Katakana as words, going back to n-grams. It did not work well, mostly because of separator-less compounds.
commit f853f39ef3
parent 59e5cc4150
@@ -32,7 +32,6 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
        $(X_CFLAGS) \
        -DRECOLL_DATADIR=\"${pkgdatadir}\" \
        -D_GNU_SOURCE \
-       -DTESTING_XAPIAN_SPELL \
        $(DEFS)

 ACLOCAL_AMFLAGS = -I m4
@@ -33,6 +33,16 @@
 #include "uproplist.h"
 #include "smallut.h"

+// Decide if we treat katakana as western scripts, splitting into
+// words instead of n-grams. This is not absurd (katakana is a kind of
+// alphabet, albeit phonetic and syllabic, and is mostly used to
+// transcribe western words), but it does not work well because
+// Japanese uses separator-less compound katakana words, and because
+// the plural endings are irregular and would need a specialized
+// stemmer. So, for now, we process katakana like the rest of CJK,
+// using n-grams.
+#undef KATAKANA_AS_WORDS
+
 using namespace std;

 /**
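For background on what "using n-grams" means here: a katakana (or other CJK) run is indexed as overlapping character n-grams rather than dictionary words, so separator-less compounds still produce matchable terms. The sketch below only illustrates that idea; it is not Recoll's actual cjk_to_words routine, and the bigram size and std::u32string input are assumptions made for the example.

#include <iostream>
#include <string>
#include <vector>

// Cut a run of CJK/katakana code points into overlapping n-grams
// (bigrams by default). Runs shorter than n are emitted whole.
static std::vector<std::u32string> ngrams(const std::u32string& run, size_t n = 2)
{
    std::vector<std::u32string> out;
    if (run.size() < n) {
        if (!run.empty())
            out.push_back(run);
        return out;
    }
    for (size_t i = 0; i + n <= run.size(); i++)
        out.push_back(run.substr(i, n));
    return out;
}

int main()
{
    // "tekisuto" (text) as four katakana code points: yields 3 bigrams.
    std::u32string run = U"\u30C6\u30AD\u30B9\u30C8";
    std::cout << ngrams(run).size() << " bigrams\n";
    return 0;
}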
@@ -209,11 +219,15 @@ static inline int whatcc(unsigned int c)
 // katakana variants' to something else. Look up "Kuromoji" Lucene
 // filter, KuromojiNormalizeFilter.java
 // 309F is Hiragana.
+#ifdef KATAKANA_AS_WORDS
 #define UNICODE_IS_KATAKANA(p) \
     ((p) != 0x309F && \
      (((p) >= 0x3099 && (p) <= 0x30FF) || \
       ((p) >= 0x31F0 && (p) <= 0x31FF)))
+#else
+#define UNICODE_IS_KATAKANA(p) false
+#endif

 bool TextSplit::isCJK(int c)
 {
     return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
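As a quick sanity check of the ranges used by the macro, here is a standalone copy of the KATAKANA_AS_WORDS variant (the shipped build now defines UNICODE_IS_KATAKANA(p) to false, so this is for illustration only):

#include <cassert>

// Copy of the word-splitting variant of the macro, for testing only.
#define UNICODE_IS_KATAKANA(p) \
    ((p) != 0x309F && \
     (((p) >= 0x3099 && (p) <= 0x30FF) || \
      ((p) >= 0x31F0 && (p) <= 0x31FF)))

int main()
{
    assert(UNICODE_IS_KATAKANA(0x30AB));   // KATAKANA LETTER KA
    assert(UNICODE_IS_KATAKANA(0x31F0));   // Katakana Phonetic Extensions block
    assert(!UNICODE_IS_KATAKANA(0x309F));  // HIRAGANA DIGRAPH YORI, explicitly excluded
    assert(!UNICODE_IS_KATAKANA(0x0041));  // plain ASCII 'A'
    return 0;
}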
@@ -520,6 +534,7 @@ bool TextSplit::text_to_words(const string &in)
            LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
            return false;
        }
+
        CharSpanClass csc;
        if (UNICODE_IS_KATAKANA(c)) {
            csc = CSC_KATAKANA;
@@ -528,6 +543,7 @@ bool TextSplit::text_to_words(const string &in)
        } else {
            csc = CSC_OTHER;
        }
+
        if (o_processCJK && csc == CSC_CJK) {
            // CJK excluding Katakana character hit.
            // Do like at EOF with the current non-cjk data.
@@ -548,6 +564,9 @@ bool TextSplit::text_to_words(const string &in)
                break;
            }

+#ifdef KATAKANA_AS_WORDS
+        // Only needed if we have script transitions inside this
+        // routine, else the call to cjk_to_words does the job.
        if (csc != prev_csc && (m_wordLen || m_span.length())) {
            LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
                   m_wordLen << " spl " << m_span.length() << endl);
@@ -555,8 +574,9 @@ bool TextSplit::text_to_words(const string &in)
                return false;
            }
        }
+#endif

        prev_csc = csc;

        int cc = whatcc(c);

        switch (cc) {
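The block guarded above only matters when katakana is split as words: in that mode a single pass of text_to_words can see a transition between katakana and another script and must flush the span accumulated so far, so that no term straddles the boundary. Below is a minimal, simplified sketch of that flush-on-transition idea; digits versus other characters stand in for the script classes, and all names are illustrative only (Recoll's real loop works on UTF-8 and carries much more state):

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

enum CharSpanClass { CSC_OTHER, CSC_DIGIT };

static CharSpanClass classOf(char c)
{
    return std::isdigit(static_cast<unsigned char>(c)) ? CSC_DIGIT : CSC_OTHER;
}

int main()
{
    std::string in = "abc123def";
    std::vector<std::string> words;
    std::string span;
    CharSpanClass prev_csc = CSC_OTHER;
    for (char c : in) {
        CharSpanClass csc = classOf(c);
        if (csc != prev_csc && !span.empty()) {
            words.push_back(span);  // flush the pending span on a class transition
            span.clear();
        }
        prev_csc = csc;
        span += c;
    }
    if (!span.empty())
        words.push_back(span);
    for (const auto& w : words)
        std::cout << w << "\n";     // prints: abc, 123, def
    return 0;
}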
@@ -2,7 +2,7 @@ TEMPLATE = app
 LANGUAGE = C++

 VPATH = @srcdir@
-DEFINES += BUILDING_RECOLL TESTING_XAPIAN_SPELL
+DEFINES += BUILDING_RECOLL

 @QMAKE_ENABLE_WEBKIT@ QT += webkit
 @QMAKE_DISABLE_WEBKIT@ QMAKE_CXXFLAGS += -DRESLIST_TEXTBROWSER -DSNIPPETS_TEXTBROWSER
@@ -200,21 +200,37 @@ class Db {
     vector<string> getStemLangs();

     /** Test word for spelling correction candidate: not too long, no
-        special chars... */
-    static bool isSpellingCandidate(const string& term, bool aspell=true)
+     * special chars...
+     * @param with_aspell test for use with aspell, else for the Xapian speller
+     */
+    static bool isSpellingCandidate(const string& term, bool with_aspell=true)
     {
         if (term.empty() || term.length() > 50)
             return false;
         if (has_prefix(term))
             return false;
         Utf8Iter u8i(term);
-        if (aspell) {
+        if (with_aspell) {
+            // If spelling with aspell, neither katakana nor other CJK
+            // scripts are candidates.
             if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
                 return false;
         } else {
+#ifdef TESTING_XAPIAN_SPELL
+            // The Xapian speller (purely proximity-based) can be used
+            // for katakana (when split as words, which is not always
+            // completely feasible because of separator-less
+            // compounds). Currently we don't try to use the Xapian
+            // speller with other scripts with which it would be usable
+            // in the absence of aspell (it would indeed be better
+            // than nothing with e.g. European languages). This would
+            // require a few more config variables, maybe one day.
             if (!TextSplit::isKATAKANA(*u8i)) {
                 return false;
             }
+#else
+            return false;
+#endif
         }
         if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
             != string::npos)
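To make the aspell/Xapian distinction concrete, here is a standalone sketch of the policy the comments above describe, applied to the term's first code point as the real method does. isKatakana() and isCJK() are hypothetical stand-ins with approximate ranges, not TextSplit::isKATAKANA()/isCJK():

#include <iostream>

// Rough stand-ins for the TextSplit predicates, for illustration only.
static bool isKatakana(char32_t c) { return c >= 0x30A0 && c <= 0x30FF; }
static bool isCJK(char32_t c)      { return c >= 0x2E80 && c <= 0x9FFF; }

static bool spellingCandidate(char32_t first, bool with_aspell)
{
    if (with_aspell) {
        // aspell: neither katakana nor other CJK scripts are candidates.
        return !(isCJK(first) || isKatakana(first));
    }
    // Xapian speller (TESTING_XAPIAN_SPELL builds): only katakana terms
    // are let through; everything else is rejected.
    return isKatakana(first);
}

int main()
{
    std::cout << spellingCandidate(U'a', true) << "\n";    // 1: Latin term, aspell
    std::cout << spellingCandidate(0x30AB, true) << "\n";  // 0: katakana, aspell
    std::cout << spellingCandidate(0x30AB, false) << "\n"; // 1: katakana, Xapian speller
    return 0;
}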