diff --git a/src/common/unacpp.cpp b/src/common/unacpp.cpp index c1fecc62..45b08112 100644 --- a/src/common/unacpp.cpp +++ b/src/common/unacpp.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2004 J.F.Dockes +/* Copyright (C) 2004-2019 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -15,7 +15,6 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ -#ifndef TEST_UNACPP #include #include #include @@ -28,7 +27,7 @@ #include "utf8iter.h" bool unacmaybefold(const string &in, string &out, - const char *encoding, UnacOp what) + const char *encoding, UnacOp what) { char *cout = 0; size_t out_len; @@ -36,30 +35,30 @@ bool unacmaybefold(const string &in, string &out, switch (what) { case UNACOP_UNAC: - status = unac_string(encoding, in.c_str(), in.length(), - &cout, &out_len); - break; + status = unac_string(encoding, in.c_str(), in.length(), + &cout, &out_len); + break; case UNACOP_UNACFOLD: - status = unacfold_string(encoding, in.c_str(), in.length(), - &cout, &out_len); - break; + status = unacfold_string(encoding, in.c_str(), in.length(), + &cout, &out_len); + break; case UNACOP_FOLD: - status = fold_string(encoding, in.c_str(), in.length(), - &cout, &out_len); - break; + status = fold_string(encoding, in.c_str(), in.length(), + &cout, &out_len); + break; } if (status < 0) { - if (cout) - free(cout); - char cerrno[20]; - sprintf(cerrno, "%d", errno); - out = string("unac_string failed, errno : ") + cerrno; - return false; + if (cout) + free(cout); + char cerrno[20]; + sprintf(cerrno, "%d", errno); + out = string("unac_string failed, errno : ") + cerrno; + return false; } out.assign(cout, out_len); if (cout) - free(cout); + free(cout); return true; } @@ -68,183 +67,83 @@ bool unacmaybefold(const string &in, string &out, // testing user-entered terms, so we don't really care. 
bool unaciscapital(const string& in) { - LOGDEB2("unaciscapital: [" << (in) << "]\n" ); + LOGDEB2("unaciscapital: [" << in << "]\n"); if (in.empty()) - return false; + return false; Utf8Iter it(in); string shorter; it.appendchartostring(shorter); string lower; if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) { - LOGINFO("unaciscapital: unac/fold failed for [" << (in) << "]\n" ); - return false; + LOGINFO("unaciscapital: unac/fold failed for [" << in << "]\n"); + return false; } Utf8Iter it1(lower); if (*it != *it1) - return true; + return true; else - return false; + return false; } -bool unachasuppercase(const string& in) -{ - LOGDEB2("unachasuppercase: [" << (in) << "]\n" ); - if (in.empty()) - return false; +// Check if input contains upper case characters. We used to case-fold +// the input and look for a difference, but lowercasing and +// casefolding are actually not exactly the same, for example German +// sharp s folds to ss but lowercases to itself, and Greek final sigma +// folds to sigma. So an input containing one of these characters +// would wrongly be detected as containing upper case. We now handle a +// few special cases explicitly, by folding them before performing +// the lowercasing. There are actually quite a few other cases of +// lowercase being transformed by casefolding, check Unicode +// CaseFolding.txt for occurrences of SMALL. One more step towards +// ditching everything and using icu... 
+bool unachasuppercase(const string& _in) +{ + LOGDEB("unachasuppercase: in [" << _in << "]\n"); + if (_in.empty()) + return false; + string in; + Utf8Iter it(_in); + for (; !it.eof(); it++) { + if (*it == 0xdf) { + // s sharp -> ss + in += 's'; + in += 's'; + } else if (*it == 0x3c2) { + // final sigma -> sigma + in.append("\xcf\x83"); + } else { + it.appendchartostring(in); + } + } + LOGDEB("unachasuppercase: folded: [" << in << "]\n"); + string lower; if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) { - LOGINFO("unachasuppercase: unac/fold failed for [" << (in) << "]\n" ); - return false; + LOGINFO("unachasuppercase: unac/fold failed for [" << in << "]\n"); + return false; } + LOGDEB("unachasuppercase: lower [" << lower << "]\n"); if (lower != in) - return true; + return true; else - return false; + return false; } + bool unachasaccents(const string& in) { - LOGDEB2("unachasaccents: [" << (in) << "]\n" ); + LOGDEB("unachasaccents: in [" << in << "]\n"); if (in.empty()) - return false; + return false; string noac; if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) { - LOGINFO("unachasaccents: unac/unac failed for [" << (in) << "]\n" ); - return false; + LOGINFO("unachasaccents: unac/unac failed for [" << (in) << "]\n" ); + return false; } + LOGDEB("unachasaccents: noac [" << noac << "]\n"); if (noac != in) - return true; + return true; else - return false; + return false; } - -#else // not testing - -#include -#include -#include -#include -#include -#include - -#include - -using namespace std; - -#include "unacpp.h" -#include "readfile.h" -#include "rclinit.h" - -static char *thisprog; - -static char usage [] = "\n" - "[-c|-C] \n" - " Default : unaccent\n" - " -c : unaccent and casefold\n" - " -C : casefold only\n" - "-t test string as capitalized, upper-case anywhere, accents\n" - " the parameter is supposedly utf-8 so this can only work in an utf-8\n" - " locale\n" - "\n"; -; - -static void -Usage(void) -{ - fprintf(stderr, "%s: usage: %s\n", thisprog, 
usage); - exit(1); -} - -static int op_flags; -#define OPT_c 0x2 -#define OPT_C 0x4 -#define OPT_t 0x8 - -int main(int argc, char **argv) -{ - UnacOp op = UNACOP_UNAC; - - thisprog = argv[0]; - argc--; argv++; - - while (argc > 0 && **argv == '-') { - (*argv)++; - if (!(**argv)) - /* Cas du "adb - core" */ - Usage(); - while (**argv) - switch (*(*argv)++) { - case 'c': op_flags |= OPT_c; break; - case 'C': op_flags |= OPT_C; break; - case 't': op_flags |= OPT_t; break; - default: Usage(); break; - } - argc--; argv++; - } - - if (op_flags & OPT_t) { - if (argc != 1) - Usage(); - string in = *argv++;argc--; - bool capital, upper, accent; - capital = unaciscapital(in); - upper = unachasuppercase(in); - accent = unachasaccents(in); - cout << "[" << in << "] : " << - "capitalized: " << (capital ? "Yes. " : "No. ") << - "has uppercase: " << (upper ? "Yes. " : "No. ") << - "has accents: " << (accent ? "Yes. " : "No. ") << - endl; - return 0; - } else { - if (argc != 3) - Usage(); - if (op_flags & OPT_c) { - op = UNACOP_UNACFOLD; - } else if (op_flags & OPT_C) { - op = UNACOP_FOLD; - } - - const char *encoding = *argv++; argc--; - string ifn = *argv++; argc--; - if (!ifn.compare("stdin")) - ifn.clear(); - const char *ofn = *argv++; argc--; - - string reason; - (void)recollinit(RCLINIT_NONE, 0, 0, reason, 0); - - string odata; - if (!file_to_string(ifn, odata)) { - cerr << "file_to_string " << ifn << " : " << odata << endl; - return 1; - } - string ndata; - if (!unacmaybefold(odata, ndata, encoding, op)) { - cerr << "unac: " << ndata << endl; - return 1; - } - - int fd; - if (strcmp(ofn, "stdout")) { - fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666); - } else { - fd = 1; - } - if (fd < 0) { - cerr << "Open/Create " << ofn << " failed: " << strerror(errno) - << endl; - return 1; - } - if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) { - cerr << "Write(2) failed: " << strerror(errno) << endl; - return 1; - } - close(fd); - return 0; - } -} - -#endif -