// Reconstructed from git patch 7287bd4d7e2c6791f071105a53c695be061b0cc0
// From: dockes, Date: Wed, 15 Dec 2004 08:21:05 +0000
// Subject: just converted (indent+comments) from estraier
//
// The patch created src/index/csguess.h and src/index/csguess.cpp. Both are
// reproduced here as one translation unit, header part first, so the
// original `#include "csguess.h"` is satisfied by the section right below.

// ---------------------------------------------------------------------------
// src/index/csguess.h
// ---------------------------------------------------------------------------
#ifndef _CSGUESS_H_INCLUDED_
#define _CSGUESS_H_INCLUDED_
/* @(#$Id: csguess.h,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes */

#include <string>

// Try to guess the character set. This might guess unicode encodings, and
// some asian charsets, but has no chance, for example, of discriminating
// between the different iso8859-xx charsets.
//
// Only the first ICONVCHECKSIZ bytes of `in` are examined. The return value
// is a charset name: one of "UTF-16", "UTF-16BE", "UTF-16LE", "ISO-2022-JP",
// "US-ASCII", "UTF-8", "Shift_JIS", "EUC-JP", "CP932", or "ISO-8859-1" when
// nothing else fits.
extern std::string csguess(const std::string &in);

#endif /* _CSGUESS_H_INCLUDED_ */

// ---------------------------------------------------------------------------
// src/index/csguess.cpp
// ---------------------------------------------------------------------------
#ifndef lint
static char rcsid[] = "@(#$Id: csguess.cpp,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
// This code was converted from estraier / qdbm / myconf.c
#include <errno.h>
#include <string.h>

#include <iconv.h>

#include <string>
using std::string;

// The values from estraier were 32768, 256, 0.001
// Maximum number of input bytes examined, and size of the scratch output
// buffer used for trial conversions.
const int ICONVCHECKSIZ = 4000;
// A trial conversion is abandoned once this many errors were counted. Also
// returned when the converter cannot be opened or closed.
const int ICONVMISSMAX = 10;
// Highest errors/bytes ratio for which a charset is still kept as a
// fallback guess.
const double ICONVALLWRAT = 0.001;

// POSIX declares iconv()'s input buffer argument as `char **`, some older
// implementations as `const char **`. This adapter converts implicitly to
// whichever type the local prototype asks for.
struct IconvInBuf {
    char **p;
    explicit IconvInBuf(char **pp) : p(pp) {}
    operator char **() const { return p; }
    operator const char **() const { return const_cast<const char **>(p); }
};

// Try to transcode and count errors (for charset guessing).
//
// Converts the `size` bytes at `ptr` from charset `icode` to charset `ocode`,
// discarding the output. Each byte at which iconv() reports EILSEQ or EINVAL
// is skipped and counted.
//
// Returns the number of skipped bytes; counting stops at ICONVMISSMAX.
// Returns ICONVMISSMAX if the converter cannot be opened or closed.
static int transcodeErrCnt(const char *ptr, int size,
                           const char *icode, const char *ocode)
{
    iconv_t ic = iconv_open(ocode, icode);
    if (ic == (iconv_t)-1)
        return ICONVMISSMAX;

    char obuf[ICONVCHECKSIZ];
    // iconv() never writes through the input pointer, it only advances it.
    char *rp = const_cast<char *>(ptr);
    size_t isiz = size > 0 ? static_cast<size_t>(size) : 0;
    int miss = 0;

    while (isiz > 0) {
        // The converted text is not needed: reuse the scratch buffer from
        // its start on every round.
        char *wp = obuf;
        size_t osiz = sizeof(obuf);
        if (iconv(ic, IconvInBuf(&rp), &isiz, &wp, &osiz)
            != static_cast<size_t>(-1))
            continue; // Whole remaining input converted, isiz is now 0.

        const int err = errno;
        if (err == E2BIG && wp != obuf) {
            // Scratch buffer full: this says nothing about the validity of
            // the input, so go on with an empty buffer. (Requiring that
            // something was written guarantees progress.)
            continue;
        }
        if (err != EILSEQ && err != EINVAL)
            break;

        // Invalid or truncated sequence: skip one byte and count it.
        rp++;
        isiz--;
        miss++;
        if (miss >= ICONVMISSMAX)
            break;
    }

    if (iconv_close(ic) == -1)
        return ICONVMISSMAX;
    return miss;
}

// True if the two bytes following an ESC form one of the iso-2022 escape
// sequences looked for: ESC ( {B,J,H,I} or ESC $ {@,B,(}.
static bool isIso2022JpEscape(char inter, char fin)
{
    // strchr() also matches the terminating NUL of its first argument, so a
    // zero byte has to be rejected explicitly.
    if (fin == 0)
        return false;
    if (inter == '(')
        return strchr("BJHI", fin) != 0;
    if (inter == '$')
        return strchr("@B(", fin) != 0;
    return false;
}

// True if `miss` errors over `size` bytes is few enough for the charset to
// be kept as a fallback guess.
static bool tolerableErrRatio(int miss, int size)
{
    return size > 0 && miss / (double)size <= ICONVALLWRAT;
}

// Guess the character set of `in`, looking at no more than its first
// ICONVCHECKSIZ bytes. See the declaration above for the possible results.
string csguess(const string &in)
{
    const char *text = in.c_str();

    // Clamp before narrowing so that a huge input cannot overflow the int.
    const string::size_type cap = ICONVCHECKSIZ;
    const int size = static_cast<int>(in.length() < cap ? in.length() : cap);
    int i;

    // UTF-16 with normal prefix ?
    if (size >= 2 && (!memcmp(text, "\xfe\xff", 2) ||
                      !memcmp(text, "\xff\xfe", 2)))
        return "UTF-16";

    // If we find a zero at an appropriate position, guess it's UTF-16
    // anyway. This is a quite expensive test for other texts as we'll
    // have to scan the whole thing.
    for (i = 0; i < size - 1; i += 2) {
        if (text[i] == 0 && text[i + 1] != 0)
            return "UTF-16BE";
        if (text[i + 1] == 0 && text[i] != 0)
            return "UTF-16LE";
    }

    // Look for iso-2022 specific escape sequences. As iso-2022 begins
    // in ascii, these succeed fast for a japanese text, but are quite
    // expensive for any other.
    // The bound lets a 3-byte sequence ending the buffer be seen too.
    for (i = 0; i + 2 < size; i++) {
        if (text[i] == 0x1b && isIso2022JpEscape(text[i + 1], text[i + 2]))
            return "ISO-2022-JP";
    }

    // Try conversions from ascii and utf-8. These are unlikely to succeed
    // by mistake.
    if (transcodeErrCnt(text, size, "US-ASCII", "UTF-16BE") < 1)
        return "US-ASCII";

    // The utf-8 error count is kept: it is used again below to decide
    // whether utf-8 is an acceptable fallback.
    const int utf8Miss = transcodeErrCnt(text, size, "UTF-8", "UTF-16BE");
    if (utf8Miss < 1)
        return "UTF-8";

    // First charset found with a tolerable error ratio, if any.
    const char *hypo = 0;

    // Each row: charset to test, and the charset to transcode it to.
    static const char *const jpTrials[2][2] = {
        {"EUC-JP", "UTF-16BE"},
        {"Shift_JIS", "EUC-JP"},
    };
    // Shift_JIS is tried first if the text holds a carriage return, EUC-JP
    // first otherwise.
    // NOTE(review): presumably because CR line endings hint at a
    // DOS/Windows origin -- confirm against the estraier sources.
    const bool cr = memchr(text, 0xd, size) != 0;
    const int first = cr ? 1 : 0;
    for (int k = 0; k < 2; k++) {
        const char *const *trial = jpTrials[(first + k) % 2];
        const int miss = transcodeErrCnt(text, size, trial[0], trial[1]);
        if (miss < 1)
            return trial[0];
        if (!hypo && tolerableErrRatio(miss, size))
            hypo = trial[0];
    }

    if (!hypo && tolerableErrRatio(utf8Miss, size))
        hypo = "UTF-8";

    const int cp932Miss = transcodeErrCnt(text, size, "CP932", "UTF-16BE");
    if (cp932Miss < 1)
        return "CP932";
    if (!hypo && tolerableErrRatio(cp932Miss, size))
        hypo = "CP932";

    return hypo ? hypo : "ISO-8859-1";
}