just converted (indent+comments) from estraier

2004-12-15 08:21:05 +00:00 · 2004-12-15 08:21:05 +00:00 · 7287bd4d7e
commit 7287bd4d7e
parent 5ca462cdff
2 changed files with 147 additions and 0 deletions
--- a/src/index/csguess.cpp
+++ b/src/index/csguess.cpp
@ -0,0 +1,135 @@
 #ifndef lint
 static char	rcsid[] = "@(#$Id: csguess.cpp,v 1.1 2004-12-15 08:21:05 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 // This code was converted from estraier / qdbm / myconf.c
 #include <errno.h>
 #include <iconv.h>
 #include <string.h>
 #include "csguess.h"
 #include <string>
 using std::string;
 // Tuning parameters for the charset guessing code.
 // The values from estraier were 32768, 256, 0.001
 // ICONVCHECKSIZ: maximum count of input bytes examined by csguess(), and
 // size of the scratch output buffer used by transcodeErrCnt().
 const int ICONVCHECKSIZ = 4000;
 // ICONVMISSMAX: transcodeErrCnt() gives up after this many conversion
 // errors. Also the value it returns when iconv can't be opened or closed.
 const int ICONVMISSMAX  = 10;
 // ICONVALLWRAT: maximum errors/size ratio for which csguess() still keeps a
 // charset as its fallback guess.
 const double ICONVALLWRAT = 0.001;
 // Adapter for the input buffer argument of iconv(). POSIX declares that
 // parameter as "char **", but some iconv implementations declare it as
 // "const char **": the two conversion operators let the same call compile
 // against either prototype.
 struct IconvInbufArg {
     char **m_p;
     explicit IconvInbufArg(char **p) : m_p(p) {}
     operator char **() const { return m_p; }
     operator const char **() const { return const_cast<const char **>(m_p); }
 };
 // Try to transcode and count errors (for charset guessing).
 // The first 'size' bytes at 'ptr' are converted from charset 'icode' to
 // charset 'ocode'. The converted data is discarded, only the errors matter.
 // Each time iconv reports an invalid (EILSEQ) or incomplete (EINVAL) input
 // sequence, one input byte is skipped and counted as an error.
 // Returns the error count. Counting stops at ICONVMISSMAX, which is also
 // the value returned if the converter can't be opened or closed.
 static int transcodeErrCnt(const char *ptr, int size,
                            const char *icode, const char *ocode)
 {
     iconv_t ic = iconv_open(ocode, icode);
     if (ic == (iconv_t)-1)
         return ICONVMISSMAX;
     // iconv() wants a non-const input pointer, but only advances it
     char *rp = const_cast<char *>(ptr);
     // A negative size would wrap to a huge size_t: process nothing instead
     size_t isiz = size > 0 ? size : 0;
     char obuf[ICONVCHECKSIZ];
     int miss = 0;
     while (isiz > 0) {
         // The output is of no interest: always restart at the buffer start
         char *wp = obuf;
         size_t osiz = sizeof(obuf);
         if (iconv(ic, IconvInbufArg(&rp), &isiz, &wp, &osiz) != (size_t)-1)
             continue;
         if (errno == E2BIG) {
             // Output buffer full: this is not an input error, just go on
             // with an empty buffer so that the rest of the input gets
             // checked too. Stop if nothing at all could be written, we
             // would loop forever.
             if (wp == obuf)
                 break;
             continue;
         }
         if (errno != EILSEQ && errno != EINVAL)
             break;
         // Bad or truncated sequence: skip one byte and try to resync
         rp++;
         isiz--;
         miss++;
         if (miss >= ICONVMISSMAX)
             break;
     }
     if (iconv_close(ic) == -1)
         return ICONVMISSMAX;
     return miss;
 }
 // A candidate encoding for csguess(): 'name' is the charset assumed for the
 // input (and the value returned if it is selected), 'target' is the charset
 // which the test conversion is made to.
 struct CsguessCandidate {
     const char *name;
     const char *target;
 };
 // Tell if 'miss' conversion errors over 'size' input bytes is a low enough
 // rate (at most ICONVALLWRAT) to keep the charset as fallback guess.
 static bool csguessTolerable(int miss, int size)
 {
     return miss / (double)size <= ICONVALLWRAT;
 }
 // Guess the character set of 'in', looking at no more than the first
 // ICONVCHECKSIZ bytes. The tests are, in order: UTF-16 byte order mark,
 // zero bytes typical of UTF-16, ISO-2022-JP escape sequences, then trial
 // conversions with iconv. A charset which converts without error is
 // returned at once. Else the first charset tried after US-ASCII with an
 // error rate of at most ICONVALLWRAT is returned, or "ISO-8859-1" if there
 // is none.
 string csguess(const string &in)
 {
     const char *text = in.c_str();
     // Clamp before narrowing to int, so that a huge input can't end up as a
     // negative size.
     const string::size_type maxsize = ICONVCHECKSIZ;
     const int size = int(in.length() < maxsize ? in.length() : maxsize);
     int i;
     // UTF-16 with normal prefix ?
     if (size >= 2 && (!memcmp(text, "\xfe\xff", 2) ||
                       !memcmp(text, "\xff\xfe", 2)))
         return "UTF-16";
     // If we find a zero at an appropriate position, guess it's UTF-16
     // anyway. This is a quite expensive test for other texts as we'll
     // have to scan the whole thing.
     for (i = 0; i < size - 1; i += 2) {
         if (text[i] == 0 && text[i + 1] != 0)
             return "UTF-16BE";
         if (text[i + 1] == 0 && text[i] != 0)
             return "UTF-16LE";
     }
     // Look for iso-2022 specific escape sequences. As iso-2022 begins
     // in ascii, these succeed fast for a japanese text, but are quite
     // expensive for any other. The sequences are 3 bytes long: ESC, then
     // '(' or '$', then a final byte.
     for (i = 0; i + 2 < size; i++) {
         if (text[i] != 0x1b)
             continue;
         const char intro = text[i + 1];
         const char last = text[i + 2];
         // strchr() would match the terminating 0 of the set strings
         if (last == 0)
             continue;
         if ((intro == '(' && strchr("BJHI", last)) ||
             (intro == '$' && strchr("@B(", last)))
             return "ISO-2022-JP";
     }
     // Try conversions from ascii and utf-8. These are unlikely to succeed
     // by mistake.
     if (transcodeErrCnt(text, size, "US-ASCII", "UTF-16BE") < 1)
         return "US-ASCII";
     // The utf-8 error count is kept for the fallback choice further down
     const int utf8miss = transcodeErrCnt(text, size, "UTF-8", "UTF-16BE");
     if (utf8miss < 1)
         return "UTF-8";
     // The presence of a carriage return decides which japanese charset is
     // tried first, so which one wins when both have a tolerable error rate
     // (presumably because CR line ends hint at a platform where Shift_JIS
     // is the more usual one).
     bool cr = false;
     for (i = 0; i < size; i++) {
         if (text[i] == 0xd) {
             cr = true;
             break;
         }
     }
     static const CsguessCandidate sjisFirst[] = {
         {"Shift_JIS", "EUC-JP"}, {"EUC-JP", "UTF-16BE"}
     };
     static const CsguessCandidate eucFirst[] = {
         {"EUC-JP", "UTF-16BE"}, {"Shift_JIS", "EUC-JP"}
     };
     const CsguessCandidate *japanese = cr ? sjisFirst : eucFirst;
     // First charset found with a tolerable error rate, if any
     const char *hypo = 0;
     for (i = 0; i < 2; i++) {
         const int miss = transcodeErrCnt(text, size, japanese[i].name,
                                          japanese[i].target);
         if (miss < 1)
             return japanese[i].name;
         if (!hypo && csguessTolerable(miss, size))
             hypo = japanese[i].name;
     }
     // utf-8 had at least one error, but may still be the best approximation
     if (!hypo && csguessTolerable(utf8miss, size))
         hypo = "UTF-8";
     const int cp932miss = transcodeErrCnt(text, size, "CP932", "UTF-16BE");
     if (cp932miss < 1)
         return "CP932";
     if (!hypo && csguessTolerable(cp932miss, size))
         hypo = "CP932";
     return hypo ? hypo : "ISO-8859-1";
 }
--- a/src/index/csguess.h
+++ b/src/index/csguess.h
@ -0,0 +1,12 @@
 #ifndef _CSGUESS_H_INCLUDED_
 #define _CSGUESS_H_INCLUDED_
 /* @(#$Id: csguess.h,v 1.1 2004-12-15 08:21:05 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 // Try to guess the character set of 'in'. This might guess unicode
 // encodings, and some asian charsets, but has no chance, for example, of
 // discriminating between the different iso8859-xx charsets.
 // Returns a charset name (e.g. "UTF-8", "UTF-16LE", "EUC-JP"), which is
 // "ISO-8859-1" when no better guess could be made.
 extern std::string csguess(const std::string &in);
 #endif /* _CSGUESS_H_INCLUDED_ */