#ifndef lint static char rcsid[] = "@(#$Id: csguess.cpp,v 1.5 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef TEST_CSGUESS // This code was converted from estraier / qdbm / myconf.c: /************************************************************************** * Copyright (C) 2000-2004 Mikio Hirabayashi * * This file is part of QDBM, Quick Database Manager. * * QDBM is free software; you can redistribute it and/or modify it under the * terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License or any later * version. QDBM is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public * License for more details. You should have received a copy of the GNU * Lesser General Public License along with QDBM; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA * 02111-1307 USA. * *********************************************************/ #include #include #include #ifndef NO_NAMESPACES using std::string; #endif /* NO_NAMESPACES */ #include #include "csguess.h" // The values from estraier were 32768, 256, 0.001 const int ICONVCHECKSIZ = 32768; const int ICONVMISSMAX = 256; const double ICONVALLWRAT = 0.001; // Try to transcode and count errors (for charset guessing) static int transcodeErrCnt(const char *ptr, int size, const char *icode, const char *ocode) { iconv_t ic; char obuf[2*ICONVCHECKSIZ], *wp, *rp; size_t isiz, osiz; int miss; isiz = size; if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return size; miss = 0; rp = (char *)ptr; while(isiz > 0){ osiz = 2*ICONVCHECKSIZ; wp = obuf; if(iconv(ic, #if defined(_LIBICONV_VERSION) (const char **)&rp, #else (char **)&rp, #endif &isiz, &wp, &osiz) == (size_t)-1){ if(errno == EILSEQ || errno == EINVAL){ rp++; isiz--; miss++; if(miss >= ICONVMISSMAX) break; } else { miss = size; break; } } } if(iconv_close(ic) == -1) return size; return miss; } // Try to guess character encoding. This could be optimized quite a // lot by avoiding the multiple passes on the document, to be done // after usefulness is demonstrated... string csguess(const string &in, const string &dflt) { const char *hypo; int i, miss; const char *text = in.c_str(); bool cr = false; int size = in.length(); if (size > ICONVCHECKSIZ) size = ICONVCHECKSIZ; // UTF-16 with normal prefix ? if (size >= 2 && (!memcmp(text, "\xfe\xff", 2) || !memcmp(text, "\xff\xfe", 2))) return "UTF-16"; // If we find a zero at an appropriate position, guess it's UTF-16 // anyway. This is a quite expensive test for other texts as we'll // have to scan the whole thing. for (i = 0; i < size - 1; i += 2) { if (text[i] == 0 && text[i + 1] != 0) return "UTF-16BE"; if (text[i + 1] == 0 && text[i] != 0) return "UTF-16LE"; } // Look for iso-2022 (rfc1468) specific escape sequences. As // iso-2022 begins in ascii, and typically soon escapes, these // succeed fast for a japanese text, but are quite expensive for // any other for (i = 0; i < size - 3; i++) { if (text[i] == 0x1b) { i++; if (text[i] == '(' && strchr("BJHI", text[i + 1])) return "ISO-2022-JP"; if (text[i] == '$' && strchr("@B(", text[i + 1])) return "ISO-2022-JP"; } } // Try conversions from ascii and utf-8. These are unlikely to succeed // by mistake. if (transcodeErrCnt(text, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII"; if (transcodeErrCnt(text, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8"; hypo = 0; for (i = 0; i < size; i++) { if (text[i] == 0xd) { cr = true; break; } } if (cr) { if ((miss = transcodeErrCnt(text, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS"; if (!hypo && miss / (double)size <= ICONVALLWRAT) hypo = "Shift_JIS"; if ((miss = transcodeErrCnt(text, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP"; if (!hypo && miss / (double)size <= ICONVALLWRAT) hypo = "EUC-JP"; } else { if ((miss = transcodeErrCnt(text, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP"; if (!hypo && miss / (double)size <= ICONVALLWRAT) hypo = "EUC-JP"; if ((miss = transcodeErrCnt(text, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS"; if (!hypo && miss / (double)size <= ICONVALLWRAT) hypo = "Shift_JIS"; } if ((miss = transcodeErrCnt(text, size, "UTF-8", "UTF-16BE")) < 1) return "UTF-8"; if (!hypo && miss / (double)size <= ICONVALLWRAT) hypo = "UTF-8"; if ((miss = transcodeErrCnt(text, size, "CP932", "UTF-16BE")) < 1) return "CP932"; if (!hypo && miss / (double)size <= ICONVALLWRAT) hypo = "CP932"; return hypo ? hypo : dflt; } #else #include #include #include using namespace std; #include "readfile.h" #include "csguess.h" int main(int argc, char **argv) { if (argc != 2) { cerr << "Usage: trcsguess " << endl; exit(1); } const string filename = argv[1]; const string dflt = argv[2]; string text; if (!file_to_string(filename, text)) { cerr << "Couldnt read file, errno " << errno << endl; exit(1); } cout << csguess(text, dflt) << endl; exit(0); } #endif