just converted (indent+comments) from estraier
This commit is contained in:
parent
5ca462cdff
commit
7287bd4d7e
135
src/index/csguess.cpp
Normal file
135
src/index/csguess.cpp
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
#ifndef lint
|
||||||
|
static char rcsid[] = "@(#$Id: csguess.cpp,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
|
#endif
|
||||||
|
// This code was converted from estraier / qdbm / myconf.c
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include <iconv.h>
|
||||||
|
#include "csguess.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
// The values from estraier were 32768, 256, 0.001
|
||||||
|
const int ICONVCHECKSIZ = 4000;
|
||||||
|
const int ICONVMISSMAX = 10;
|
||||||
|
const double ICONVALLWRAT = 0.001;
|
||||||
|
|
||||||
|
// Try to transcode and count errors (for charset guessing)
|
||||||
|
static int transcodeErrCnt(const char *ptr, int size,
|
||||||
|
const char *icode, const char *ocode)
|
||||||
|
{
|
||||||
|
iconv_t ic;
|
||||||
|
char obuf[ICONVCHECKSIZ], *wp, *rp;
|
||||||
|
size_t isiz, osiz;
|
||||||
|
int miss;
|
||||||
|
isiz = size;
|
||||||
|
if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ICONVMISSMAX;
|
||||||
|
miss = 0;
|
||||||
|
rp = (char *)ptr;
|
||||||
|
while(isiz > 0){
|
||||||
|
osiz = ICONVCHECKSIZ;
|
||||||
|
wp = obuf;
|
||||||
|
if(iconv(ic, (const char **)&rp, &isiz, &wp, &osiz) == -1){
|
||||||
|
if(errno == EILSEQ || errno == EINVAL){
|
||||||
|
rp++;
|
||||||
|
isiz--;
|
||||||
|
miss++;
|
||||||
|
if(miss >= ICONVMISSMAX)
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(iconv_close(ic) == -1)
|
||||||
|
return ICONVMISSMAX;
|
||||||
|
return miss;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
string csguess(const string &in)
|
||||||
|
{
|
||||||
|
const char *hypo;
|
||||||
|
int i, miss;
|
||||||
|
const char *text = in.c_str();
|
||||||
|
bool cr = false;
|
||||||
|
|
||||||
|
int size = in.length();
|
||||||
|
if (size > ICONVCHECKSIZ)
|
||||||
|
size = ICONVCHECKSIZ;
|
||||||
|
|
||||||
|
// UTF-16 with normal prefix ?
|
||||||
|
if (size >= 2 && (!memcmp(text, "\xfe\xff", 2) ||
|
||||||
|
!memcmp(text, "\xff\xfe", 2)))
|
||||||
|
return "UTF-16";
|
||||||
|
|
||||||
|
// If we find a zero at an appropriate position, guess it's UTF-16
|
||||||
|
// anyway. This is a quite expensive test for other texts as we'll
|
||||||
|
// have to scan the whole thing.
|
||||||
|
for (i = 0; i < size - 1; i += 2) {
|
||||||
|
if (text[i] == 0 && text[i + 1] != 0)
|
||||||
|
return "UTF-16BE";
|
||||||
|
if (text[i + 1] == 0 && text[i] != 0)
|
||||||
|
return "UTF-16LE";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for iso-2022 specific escape sequences. As iso-2022 begins
|
||||||
|
// in ascii, these succeed fast for a japanese text, but are quite
|
||||||
|
// expensive for any other
|
||||||
|
for (i = 0; i < size - 3; i++) {
|
||||||
|
if (text[i] == 0x1b) {
|
||||||
|
i++;
|
||||||
|
if (text[i] == '(' && strchr("BJHI", text[i + 1]))
|
||||||
|
return "ISO-2022-JP";
|
||||||
|
if (text[i] == '$' && strchr("@B(", text[i + 1]))
|
||||||
|
return "ISO-2022-JP";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try conversions from ascii and utf-8. These are unlikely to succeed
|
||||||
|
// by mistake.
|
||||||
|
if (transcodeErrCnt(text, size, "US-ASCII", "UTF-16BE") < 1)
|
||||||
|
return "US-ASCII";
|
||||||
|
|
||||||
|
if (transcodeErrCnt(text, size, "UTF-8", "UTF-16BE") < 1)
|
||||||
|
return "UTF-8";
|
||||||
|
|
||||||
|
hypo = 0;
|
||||||
|
for (i = 0; i < size; i++) {
|
||||||
|
if (text[i] == 0xd) {
|
||||||
|
cr = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cr) {
|
||||||
|
if ((miss = transcodeErrCnt(text, size, "Shift_JIS", "EUC-JP")) < 1)
|
||||||
|
return "Shift_JIS";
|
||||||
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
||||||
|
hypo = "Shift_JIS";
|
||||||
|
if ((miss = transcodeErrCnt(text, size, "EUC-JP", "UTF-16BE")) < 1)
|
||||||
|
return "EUC-JP";
|
||||||
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
||||||
|
hypo = "EUC-JP";
|
||||||
|
} else {
|
||||||
|
if ((miss = transcodeErrCnt(text, size, "EUC-JP", "UTF-16BE")) < 1)
|
||||||
|
return "EUC-JP";
|
||||||
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
||||||
|
hypo = "EUC-JP";
|
||||||
|
if ((miss = transcodeErrCnt(text, size, "Shift_JIS", "EUC-JP")) < 1)
|
||||||
|
return "Shift_JIS";
|
||||||
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
||||||
|
hypo = "Shift_JIS";
|
||||||
|
}
|
||||||
|
if ((miss = transcodeErrCnt(text, size, "UTF-8", "UTF-16BE")) < 1)
|
||||||
|
return "UTF-8";
|
||||||
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
||||||
|
hypo = "UTF-8";
|
||||||
|
if ((miss = transcodeErrCnt(text, size, "CP932", "UTF-16BE")) < 1)
|
||||||
|
return "CP932";
|
||||||
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
||||||
|
hypo = "CP932";
|
||||||
|
|
||||||
|
return hypo ? hypo : "ISO-8859-1";
|
||||||
|
}
|
||||||
12
src/index/csguess.h
Normal file
12
src/index/csguess.h
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#ifndef _CSGUESS_H_INCLUDED_
|
||||||
|
#define _CSGUESS_H_INCLUDED_
|
||||||
|
/* @(#$Id: csguess.h,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
// Try to guess the character set. This might guess unicode encodings, and
|
||||||
|
// some asian charsets, but has no chance, for example, of discriminating
|
||||||
|
// betweeen the different iso8859-xx charsets.
|
||||||
|
extern std::string csguess(const std::string &in);
|
||||||
|
|
||||||
|
#endif /* _CSGUESS_H_INCLUDED_ */
|
||||||
Loading…
x
Reference in New Issue
Block a user