When checking if user input contains capital letters, take care of some lowercase letters that don't casefold to themselves

2019-05-16 15:35:11 +02:00 · 2019-05-16 15:35:11 +02:00 · 780521ec6c
commit 780521ec6c
parent 8428093f6a
1 changed files with 69 additions and 170 deletions
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2004 J.F.Dockes
+/* Copyright (C) 2004-2019 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -15,7 +15,6 @@
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
 #ifndef TEST_UNACPP
 #include <stdio.h>
 #include <cstdlib>
 #include <errno.h>
@ -28,7 +27,7 @@
 #include "utf8iter.h"
 bool unacmaybefold(const string &in, string &out, 
-		   const char *encoding, UnacOp what)
+                   const char *encoding, UnacOp what)
 {
    char *cout = 0;
    size_t out_len;
@ -36,30 +35,30 @@ bool unacmaybefold(const string &in, string &out,
    switch (what) {
    case UNACOP_UNAC:
-	status = unac_string(encoding, in.c_str(), in.length(), 
+        status = unac_string(encoding, in.c_str(), in.length(), 
-			     &cout, &out_len);
+                             &cout, &out_len);
-	break;
+        break;
    case UNACOP_UNACFOLD:
-	status = unacfold_string(encoding, in.c_str(), in.length(), 
+        status = unacfold_string(encoding, in.c_str(), in.length(), 
-				 &cout, &out_len);
+                                 &cout, &out_len);
-	break;
+        break;
    case UNACOP_FOLD:
-	status = fold_string(encoding, in.c_str(), in.length(), 
+        status = fold_string(encoding, in.c_str(), in.length(), 
-			     &cout, &out_len);
+                             &cout, &out_len);
-	break;
+        break;
    }
    if (status < 0) {
-	if (cout)
+        if (cout)
-	    free(cout);
+            free(cout);
-	char cerrno[20];
+        char cerrno[20];
-	sprintf(cerrno, "%d", errno);
+        sprintf(cerrno, "%d", errno);
-	out = string("unac_string failed, errno : ") + cerrno;
+        out = string("unac_string failed, errno : ") + cerrno;
-	return false;
+        return false;
    }
    out.assign(cout, out_len);
    if (cout)
-	free(cout);
+        free(cout);
    return true;
 }
@ -68,183 +67,83 @@ bool unacmaybefold(const string &in, string &out,
 // testing user-entered terms, so we don't really care.
 // Tell whether the character at the start of 'in' is changed by case
 // folding. Returns false for empty input and when the fold call fails.
 bool unaciscapital(const string& in)
 {
-    LOGDEB2("unaciscapital: ["  << (in) << "]\n" );
+    LOGDEB2("unaciscapital: [" << in << "]\n");
    if (in.empty())
-	return false;
+        return false;
    // Copy only the character under the freshly constructed iterator
    // (presumably the first one) so that a single character gets folded.
    Utf8Iter it(in);
    string shorter;
    it.appendchartostring(shorter);
    string lower;
    if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
-	LOGINFO("unaciscapital: unac/fold failed for ["  << (in) << "]\n" );
+        LOGINFO("unaciscapital: unac/fold failed for [" << in << "]\n");
-	return false;
+        return false;
    } 
    // Compare the original character with the start of its folded form:
    // any difference is reported as "capital".
    // NOTE(review): a lowercase character which does not fold to itself
    // would presumably also be reported as capital here -- confirm whether
    // that matters for callers.
    Utf8Iter it1(lower);
    if (*it != *it1)
-	return true;
+        return true;
    else
-	return false;
+        return false;
 }
 bool unachasuppercase(const string& in)
 {
    LOGDEB2("unachasuppercase: ["  << (in) << "]\n" );
    if (in.empty())
 	return false;
 // Check if input contains upper case characters. We used to case-fold
 // the input and look for a difference, but lowercasing and
 // casefolding are actually not exactly the same, for example German
 // sharp s folds to ss but lowercases to itself, and Greek final sigma
 // folds to sigma. So an input containing one of these characters
 // would be wrongly detected as containing upper case. We now handle a
 // few special cases explicitly, by folding them before performing
 // the lowercasing. There are actually quite a few other cases of
 // lowercase being transformed by casefolding, check Unicode
 // CaseFolding.txt for occurrences of SMALL. One more step towards
 // ditching everything and using icu...
 // Returns true if folding the pre-processed input changes it, false for
 // empty input or when the fold call fails.
 // NOTE(review): this function logs with LOGDEB where unaciscapital() uses
 // LOGDEB2 -- confirm that the more verbose level is intended.
 bool unachasuppercase(const string& _in)
 {
    LOGDEB("unachasuppercase: in [" << _in << "]\n");
    if (_in.empty())
        return false;
    // Build a copy of the input in which the specially handled lowercase
    // characters are already replaced by their folded form, so that they
    // do not show up as a difference in the comparison below.
    string in;
    Utf8Iter it(_in);
    for (; !it.eof(); it++) {
        if (*it == 0xdf) {
            // s sharp -> ss
            in += 's';
            in += 's';
        } else if (*it == 0x3c2) {
            // final sigma -> sigma
            in.append("\xcf\x83");
        } else {
            // Any other character is copied unchanged
            it.appendchartostring(in);
        }
    }
    LOGDEB("unachasuppercase: folded: [" << in << "]\n");
    string lower;
    if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
-	LOGINFO("unachasuppercase: unac/fold failed for ["  << (in) << "]\n" );
+        LOGINFO("unachasuppercase: unac/fold failed for [" << in << "]\n");
-	return false;
+        return false;
    } 
    LOGDEB("unachasuppercase: lower [" << lower << "]\n");
    // Whatever the fold still changes is taken as upper case
    if (lower != in)
-	return true;
+        return true;
    else
-	return false;
+        return false;
 }
 // Returns true if the UNACOP_UNAC transformation changes the input, false
 // for empty input or when the transformation fails.
 bool unachasaccents(const string& in)
 {
-    LOGDEB2("unachasaccents: ["  << (in) << "]\n" );
+    LOGDEB("unachasaccents: in [" << in << "]\n");
    if (in.empty())
-	return false;
+        return false;
    string noac;
    if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
-	LOGINFO("unachasaccents: unac/unac failed for ["  << (in) << "]\n" );
+        LOGINFO("unachasaccents: unac/unac failed for ["  << (in) << "]\n" );
-	return false;
+        return false;
    } 
    LOGDEB("unachasaccents: noac [" << noac << "]\n");
    // Any difference between the input and its transformed form is
    // reported as the input having accents.
    if (noac != in)
-	return true;
+        return true;
    else
-	return false;
+        return false;
 }
 #else // not testing
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
 #include <iostream>
 using namespace std;
 #include "unacpp.h"
 #include "readfile.h"
 #include "rclinit.h"
 // Program name, set from argv[0] by main() and printed by Usage().
 static char *thisprog;

 // Command line synopsis printed by Usage(). Read-only, hence const.
 static const char usage[] = "\n"
    "[-c|-C] <encoding> <infile> <outfile>\n"
    "   Default : unaccent\n"
    "   -c : unaccent and casefold\n"
    "   -C : casefold only\n"
    "-t <string> test string as capitalized, upper-case anywhere, accents\n"
    "   the parameter is supposedly utf-8 so this can only work in an utf-8\n"
    "   locale\n"
    "\n";
 // Print the command line synopsis, prefixed with the program name, on
 // stderr, then terminate the process with exit status 1. Never returns.
 static void Usage()
 {
    fprintf(stderr, "%s: usage: %s\n", thisprog, usage);
    exit(1);
 }
 // Bit mask of the options seen on the command line (OPT_* values below),
 // filled in by the option parsing loop in main().
 static int     op_flags;
 // -c : unaccent and casefold
 #define OPT_c	  0x2 
 // -C : casefold only
 #define OPT_C	  0x4 
 // -t : test a string given on the command line
 #define OPT_t     0x8
 // Entry point of the standalone test driver.
 //
 // Two modes, selected by the option flags:
 //  -t <string> : print the results of unaciscapital(), unachasuppercase()
 //     and unachasaccents() for the string.
 //  [-c|-C] <encoding> <infile> <outfile> : transform the contents of
 //     infile into outfile (unaccent by default, -c: unaccent and casefold,
 //     -C: casefold only). "stdin" and "stdout" are accepted as file names.
 //
 // Returns 0 on success and 1 on failure. A bad command line ends up in
 // Usage(), which exits with status 1.
 int main(int argc, char **argv)
 {
    UnacOp op = UNACOP_UNAC;

    thisprog = argv[0];
    argc--; argv++;

    // Each leading argument beginning with '-' is a cluster of option letters
    while (argc > 0 && **argv == '-') {
        (*argv)++;
        if (!(**argv))
            /* Case of a lone "-", as in "adb - core" */
            Usage();
        while (**argv)
            switch (*(*argv)++) {
            case 'c':   op_flags |= OPT_c; break;
            case 'C':   op_flags |= OPT_C; break;
            case 't':   op_flags |= OPT_t; break;
            default: Usage();   break;
            }
        argc--; argv++;
    }

    if (op_flags & OPT_t) {
        // String test mode: exactly one argument, the string to analyse
        if (argc != 1)
            Usage();
        const string in = argv[0];
        const bool capital = unaciscapital(in);
        const bool upper = unachasuppercase(in);
        const bool accent = unachasaccents(in);
        cout << "[" << in << "] : " <<
            "capitalized: " << (capital ? "Yes. " : "No. ") <<
            "has uppercase: " << (upper ? "Yes. " : "No. ") <<
            "has accents: " << (accent ? "Yes. " : "No. ") <<
            endl;
        return 0;
    }

    // File transformation mode: <encoding> <infile> <outfile>
    if (argc != 3)
        Usage();
    // -c takes precedence if both -c and -C were given
    if (op_flags & OPT_c) {
        op = UNACOP_UNACFOLD;
    } else if (op_flags & OPT_C) {
        op = UNACOP_FOLD;
    }

    const char *encoding = argv[0];
    string ifn = argv[1];
    // "stdin" is turned into an empty file name, presumably understood by
    // file_to_string() as standard input -- confirm against readfile.h
    if (!ifn.compare("stdin"))
        ifn.clear();
    const char *ofn = argv[2];

    string reason;
    (void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);

    string odata;
    if (!file_to_string(ifn, odata)) {
        cerr << "file_to_string " << ifn << " : " << odata << endl;
        return 1;
    }

    string ndata;
    if (!unacmaybefold(odata, ndata, encoding, op)) {
        // On failure unacmaybefold() stores an error message in its output
        cerr << "unac: " << ndata << endl;
        return 1;
    }

    // O_EXCL: refuse to overwrite an existing output file
    int fd;
    if (strcmp(ofn, "stdout")) {
        fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
    } else {
        fd = 1;
    }
    if (fd < 0) {
        cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
             << endl;
        return 1;
    }

    // write(2) may transfer less than requested or be interrupted by a
    // signal: loop until everything is written. The byte count is kept in
    // a size_t/ssize_t, not an int, so that large outputs are not truncated.
    const char *cp = ndata.c_str();
    size_t remaining = ndata.length();
    while (remaining > 0) {
        const ssize_t n = write(fd, cp, remaining);
        if (n < 0 && errno == EINTR)
            continue;
        if (n <= 0) {
            cerr << "Write(2) failed: " <<
                (n < 0 ? strerror(errno) : "no data written") << endl;
            return 1;
        }
        cp += n;
        remaining -= static_cast<size_t>(n);
    }
    close(fd);
    return 0;
 }
 #endif