216 lines
6.2 KiB
C++
216 lines
6.2 KiB
C++
/* Copyright (C) 2004 J.F.Dockes
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
#ifndef TEST_CSGUESS
|
|
|
|
// This code was converted from estraier / qdbm / myconf.c:
|
|
|
|
/**************************************************************************
|
|
* Copyright (C) 2000-2004 Mikio Hirabayashi
|
|
*
|
|
* This file is part of QDBM, Quick Database Manager.
|
|
*
|
|
* QDBM is free software; you can redistribute it and/or modify it under the
|
|
* terms of the GNU Lesser General Public License as published by the Free
|
|
* Software Foundation; either version 2.1 of the License or any later
|
|
* version. QDBM is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
* License for more details. You should have received a copy of the GNU
|
|
* Lesser General Public License along with QDBM; if not, write to the Free
|
|
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
|
* 02111-1307 USA.
|
|
* *********************************************************/
|
|
|
|
#include <errno.h>
|
|
#include <cstring>
|
|
#include <iostream>
|
|
|
|
#ifndef NO_NAMESPACES
|
|
using std::string;
|
|
#endif /* NO_NAMESPACES */
|
|
|
|
#include <iconv.h>
|
|
|
|
#include "csguess.h"
|
|
#include "autoconfig.h"
|
|
// Pointer type expected for the second (input buffer) argument of iconv().
// When RCL_ICONV_INBUF_CONST is defined (presumably by autoconfig.h - to be
// confirmed), the const-qualified variant is selected.
#ifndef RCL_ICONV_INBUF_CONST
#define ICV_P2_TYPE char**
#else
#define ICV_P2_TYPE const char**
#endif
|
|
|
|
// Tuning parameters for the charset guesser.
// The values from estraier were 32768, 256, 0.001

// Maximum count of input bytes examined by csguess(). The scratch output
// buffer used by transcodeErrCnt() is twice this size.
static const int ICONVCHECKSIZ = 32 * 1024;

// transcodeErrCnt() stops counting once this many bytes were rejected.
static const int ICONVMISSMAX = 256;

// Highest rejected-bytes / examined-bytes ratio for which csguess() still
// retains a charset as its fallback hypothesis.
static const double ICONVALLWRAT = 1e-3;
|
|
|
|
// Try to transcode and count errors (for charset guessing)
|
|
static int transcodeErrCnt(const char *ptr, int size,
|
|
const char *icode, const char *ocode)
|
|
{
|
|
iconv_t ic;
|
|
char obuf[2*ICONVCHECKSIZ], *wp, *rp;
|
|
size_t isiz, osiz;
|
|
int miss;
|
|
isiz = size;
|
|
if((ic = iconv_open(ocode, icode)) == (iconv_t)-1)
|
|
return size;
|
|
miss = 0;
|
|
rp = (char *)ptr;
|
|
while(isiz > 0){
|
|
osiz = 2*ICONVCHECKSIZ;
|
|
wp = obuf;
|
|
if(iconv(ic, (ICV_P2_TYPE)&rp, &isiz, &wp, &osiz) == (size_t)-1){
|
|
if(errno == EILSEQ || errno == EINVAL){
|
|
rp++;
|
|
isiz--;
|
|
miss++;
|
|
if(miss >= ICONVMISSMAX)
|
|
break;
|
|
} else {
|
|
miss = size;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if(iconv_close(ic) == -1)
|
|
return size;
|
|
return miss;
|
|
}
|
|
|
|
// Try to guess character encoding. This could be optimized quite a
|
|
// lot by avoiding the multiple passes on the document, to be done
|
|
// after usefulness is demonstrated...
|
|
string csguess(const string &in, const string &dflt)
|
|
{
|
|
const char *hypo;
|
|
int i, miss;
|
|
const char *text = in.c_str();
|
|
bool cr = false;
|
|
|
|
int size = in.length();
|
|
if (size > ICONVCHECKSIZ)
|
|
size = ICONVCHECKSIZ;
|
|
|
|
// UTF-16 with normal prefix ?
|
|
if (size >= 2 && (!memcmp(text, "\xfe\xff", 2) ||
|
|
!memcmp(text, "\xff\xfe", 2)))
|
|
return "UTF-16";
|
|
|
|
// If we find a zero at an appropriate position, guess it's UTF-16
|
|
// anyway. This is a quite expensive test for other texts as we'll
|
|
// have to scan the whole thing.
|
|
for (i = 0; i < size - 1; i += 2) {
|
|
if (text[i] == 0 && text[i + 1] != 0)
|
|
return "UTF-16BE";
|
|
if (text[i + 1] == 0 && text[i] != 0)
|
|
return "UTF-16LE";
|
|
}
|
|
|
|
// Look for iso-2022 (rfc1468) specific escape sequences. As
|
|
// iso-2022 begins in ascii, and typically soon escapes, these
|
|
// succeed fast for a japanese text, but are quite expensive for
|
|
// any other
|
|
for (i = 0; i < size - 3; i++) {
|
|
if (text[i] == 0x1b) {
|
|
i++;
|
|
if (text[i] == '(' && strchr("BJHI", text[i + 1]))
|
|
return "ISO-2022-JP";
|
|
if (text[i] == '$' && strchr("@B(", text[i + 1]))
|
|
return "ISO-2022-JP";
|
|
}
|
|
}
|
|
|
|
// Try conversions from ascii and utf-8. These are unlikely to succeed
|
|
// by mistake.
|
|
if (transcodeErrCnt(text, size, "US-ASCII", "UTF-16BE") < 1)
|
|
return "US-ASCII";
|
|
|
|
if (transcodeErrCnt(text, size, "UTF-8", "UTF-16BE") < 1)
|
|
return "UTF-8";
|
|
|
|
hypo = 0;
|
|
for (i = 0; i < size; i++) {
|
|
if (text[i] == 0xd) {
|
|
cr = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (cr) {
|
|
if ((miss = transcodeErrCnt(text, size, "Shift_JIS", "EUC-JP")) < 1)
|
|
return "Shift_JIS";
|
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
|
hypo = "Shift_JIS";
|
|
if ((miss = transcodeErrCnt(text, size, "EUC-JP", "UTF-16BE")) < 1)
|
|
return "EUC-JP";
|
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
|
hypo = "EUC-JP";
|
|
} else {
|
|
if ((miss = transcodeErrCnt(text, size, "EUC-JP", "UTF-16BE")) < 1)
|
|
return "EUC-JP";
|
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
|
hypo = "EUC-JP";
|
|
if ((miss = transcodeErrCnt(text, size, "Shift_JIS", "EUC-JP")) < 1)
|
|
return "Shift_JIS";
|
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
|
hypo = "Shift_JIS";
|
|
}
|
|
if ((miss = transcodeErrCnt(text, size, "UTF-8", "UTF-16BE")) < 1)
|
|
return "UTF-8";
|
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
|
hypo = "UTF-8";
|
|
if ((miss = transcodeErrCnt(text, size, "CP932", "UTF-16BE")) < 1)
|
|
return "CP932";
|
|
if (!hypo && miss / (double)size <= ICONVALLWRAT)
|
|
hypo = "CP932";
|
|
|
|
return hypo ? hypo : dflt;
|
|
}
|
|
|
|
#else
|
|
|
|
#include <errno.h>
|
|
|
|
#include <cstdlib>
|
|
#include <string>
|
|
#include <iostream>
|
|
|
|
using namespace std;
|
|
|
|
#include "readfile.h"
|
|
#include "csguess.h"
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
if (argc != 2) {
|
|
cerr << "Usage: trcsguess <filename> <default>" << endl;
|
|
exit(1);
|
|
}
|
|
const string filename = argv[1];
|
|
const string dflt = argv[2];
|
|
string text;
|
|
if (!file_to_string(filename, text)) {
|
|
cerr << "Couldnt read file, errno " << errno << endl;
|
|
exit(1);
|
|
}
|
|
cout << csguess(text, dflt) << endl;
|
|
exit(0);
|
|
}
|
|
#endif
|