When checking if user input contains capital letters, take care of some lowercase letters which don't casefold to themselves
This commit is contained in:
parent
8428093f6a
commit
780521ec6c
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 2004 J.F.Dockes
|
/* Copyright (C) 2004-2019 J.F.Dockes
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -15,7 +15,6 @@
|
|||||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef TEST_UNACPP
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
@ -28,7 +27,7 @@
|
|||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
|
|
||||||
bool unacmaybefold(const string &in, string &out,
|
bool unacmaybefold(const string &in, string &out,
|
||||||
const char *encoding, UnacOp what)
|
const char *encoding, UnacOp what)
|
||||||
{
|
{
|
||||||
char *cout = 0;
|
char *cout = 0;
|
||||||
size_t out_len;
|
size_t out_len;
|
||||||
@ -36,30 +35,30 @@ bool unacmaybefold(const string &in, string &out,
|
|||||||
|
|
||||||
switch (what) {
|
switch (what) {
|
||||||
case UNACOP_UNAC:
|
case UNACOP_UNAC:
|
||||||
status = unac_string(encoding, in.c_str(), in.length(),
|
status = unac_string(encoding, in.c_str(), in.length(),
|
||||||
&cout, &out_len);
|
&cout, &out_len);
|
||||||
break;
|
break;
|
||||||
case UNACOP_UNACFOLD:
|
case UNACOP_UNACFOLD:
|
||||||
status = unacfold_string(encoding, in.c_str(), in.length(),
|
status = unacfold_string(encoding, in.c_str(), in.length(),
|
||||||
&cout, &out_len);
|
&cout, &out_len);
|
||||||
break;
|
break;
|
||||||
case UNACOP_FOLD:
|
case UNACOP_FOLD:
|
||||||
status = fold_string(encoding, in.c_str(), in.length(),
|
status = fold_string(encoding, in.c_str(), in.length(),
|
||||||
&cout, &out_len);
|
&cout, &out_len);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (status < 0) {
|
if (status < 0) {
|
||||||
if (cout)
|
if (cout)
|
||||||
free(cout);
|
free(cout);
|
||||||
char cerrno[20];
|
char cerrno[20];
|
||||||
sprintf(cerrno, "%d", errno);
|
sprintf(cerrno, "%d", errno);
|
||||||
out = string("unac_string failed, errno : ") + cerrno;
|
out = string("unac_string failed, errno : ") + cerrno;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
out.assign(cout, out_len);
|
out.assign(cout, out_len);
|
||||||
if (cout)
|
if (cout)
|
||||||
free(cout);
|
free(cout);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,183 +67,83 @@ bool unacmaybefold(const string &in, string &out,
|
|||||||
// testing user-entered terms, so we don't really care.
|
// testing user-entered terms, so we don't really care.
|
||||||
bool unaciscapital(const string& in)
|
bool unaciscapital(const string& in)
|
||||||
{
|
{
|
||||||
LOGDEB2("unaciscapital: [" << (in) << "]\n" );
|
LOGDEB2("unaciscapital: [" << in << "]\n");
|
||||||
if (in.empty())
|
if (in.empty())
|
||||||
return false;
|
return false;
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
string shorter;
|
string shorter;
|
||||||
it.appendchartostring(shorter);
|
it.appendchartostring(shorter);
|
||||||
|
|
||||||
string lower;
|
string lower;
|
||||||
if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
|
if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
LOGINFO("unaciscapital: unac/fold failed for [" << (in) << "]\n" );
|
LOGINFO("unaciscapital: unac/fold failed for [" << in << "]\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
Utf8Iter it1(lower);
|
Utf8Iter it1(lower);
|
||||||
if (*it != *it1)
|
if (*it != *it1)
|
||||||
return true;
|
return true;
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool unachasuppercase(const string& in)
|
|
||||||
{
|
|
||||||
LOGDEB2("unachasuppercase: [" << (in) << "]\n" );
|
|
||||||
if (in.empty())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
|
// Check if input contains upper case characters. We used to case-fold
|
||||||
|
// the input and look for a difference, but lowercasing and
|
||||||
|
// casefolding are actually not exactly the same, for example German
|
||||||
|
// sharp s folds to ss but lowercases to itself, and Greek final sigma
|
||||||
|
// folds to sigma. So an input containing one of these characters
|
||||||
|
// would be wrongly detected as containing upper case. We now handle a
|
||||||
|
// few special cases explicitly, by folding them before performing
|
||||||
|
// the lowercasing. There are actually quite a few other cases of
|
||||||
|
// lowercase being transformed by casefolding, check Unicode
|
||||||
|
// CaseFolding.txt for occurrences of SMALL. One more step towards
|
||||||
|
// ditching everything and using icu...
|
||||||
|
bool unachasuppercase(const string& _in)
|
||||||
|
{
|
||||||
|
LOGDEB("unachasuppercase: in [" << _in << "]\n");
|
||||||
|
if (_in.empty())
|
||||||
|
return false;
|
||||||
|
string in;
|
||||||
|
Utf8Iter it(_in);
|
||||||
|
for (; !it.eof(); it++) {
|
||||||
|
if (*it == 0xdf) {
|
||||||
|
// s sharp -> ss
|
||||||
|
in += 's';
|
||||||
|
in += 's';
|
||||||
|
} else if (*it == 0x3c2) {
|
||||||
|
// final sigma -> sigma
|
||||||
|
in.append("\xcf\x83");
|
||||||
|
} else {
|
||||||
|
it.appendchartostring(in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOGDEB("unachasuppercase: folded: [" << in << "]\n");
|
||||||
|
|
||||||
string lower;
|
string lower;
|
||||||
if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
|
if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
LOGINFO("unachasuppercase: unac/fold failed for [" << (in) << "]\n" );
|
LOGINFO("unachasuppercase: unac/fold failed for [" << in << "]\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
LOGDEB("unachasuppercase: lower [" << lower << "]\n");
|
||||||
if (lower != in)
|
if (lower != in)
|
||||||
return true;
|
return true;
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool unachasaccents(const string& in)
|
bool unachasaccents(const string& in)
|
||||||
{
|
{
|
||||||
LOGDEB2("unachasaccents: [" << (in) << "]\n" );
|
LOGDEB("unachasaccents: in [" << in << "]\n");
|
||||||
if (in.empty())
|
if (in.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
string noac;
|
string noac;
|
||||||
if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
|
if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
|
||||||
LOGINFO("unachasaccents: unac/unac failed for [" << (in) << "]\n" );
|
LOGINFO("unachasaccents: unac/unac failed for [" << (in) << "]\n" );
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
LOGDEB("unachasaccents: noac [" << noac << "]\n");
|
||||||
if (noac != in)
|
if (noac != in)
|
||||||
return true;
|
return true;
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // not testing
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <errno.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
#include "unacpp.h"
|
|
||||||
#include "readfile.h"
|
|
||||||
#include "rclinit.h"
|
|
||||||
|
|
||||||
static char *thisprog;
|
|
||||||
|
|
||||||
static char usage [] = "\n"
|
|
||||||
"[-c|-C] <encoding> <infile> <outfile>\n"
|
|
||||||
" Default : unaccent\n"
|
|
||||||
" -c : unaccent and casefold\n"
|
|
||||||
" -C : casefold only\n"
|
|
||||||
"-t <string> test string as capitalized, upper-case anywhere, accents\n"
|
|
||||||
" the parameter is supposedly utf-8 so this can only work in an utf-8\n"
|
|
||||||
" locale\n"
|
|
||||||
"\n";
|
|
||||||
;
|
|
||||||
|
|
||||||
static void
|
|
||||||
Usage(void)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "%s: usage: %s\n", thisprog, usage);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int op_flags;
|
|
||||||
#define OPT_c 0x2
|
|
||||||
#define OPT_C 0x4
|
|
||||||
#define OPT_t 0x8
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
UnacOp op = UNACOP_UNAC;
|
|
||||||
|
|
||||||
thisprog = argv[0];
|
|
||||||
argc--; argv++;
|
|
||||||
|
|
||||||
while (argc > 0 && **argv == '-') {
|
|
||||||
(*argv)++;
|
|
||||||
if (!(**argv))
|
|
||||||
/* Cas du "adb - core" */
|
|
||||||
Usage();
|
|
||||||
while (**argv)
|
|
||||||
switch (*(*argv)++) {
|
|
||||||
case 'c': op_flags |= OPT_c; break;
|
|
||||||
case 'C': op_flags |= OPT_C; break;
|
|
||||||
case 't': op_flags |= OPT_t; break;
|
|
||||||
default: Usage(); break;
|
|
||||||
}
|
|
||||||
argc--; argv++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (op_flags & OPT_t) {
|
|
||||||
if (argc != 1)
|
|
||||||
Usage();
|
|
||||||
string in = *argv++;argc--;
|
|
||||||
bool capital, upper, accent;
|
|
||||||
capital = unaciscapital(in);
|
|
||||||
upper = unachasuppercase(in);
|
|
||||||
accent = unachasaccents(in);
|
|
||||||
cout << "[" << in << "] : " <<
|
|
||||||
"capitalized: " << (capital ? "Yes. " : "No. ") <<
|
|
||||||
"has uppercase: " << (upper ? "Yes. " : "No. ") <<
|
|
||||||
"has accents: " << (accent ? "Yes. " : "No. ") <<
|
|
||||||
endl;
|
|
||||||
return 0;
|
|
||||||
} else {
|
|
||||||
if (argc != 3)
|
|
||||||
Usage();
|
|
||||||
if (op_flags & OPT_c) {
|
|
||||||
op = UNACOP_UNACFOLD;
|
|
||||||
} else if (op_flags & OPT_C) {
|
|
||||||
op = UNACOP_FOLD;
|
|
||||||
}
|
|
||||||
|
|
||||||
const char *encoding = *argv++; argc--;
|
|
||||||
string ifn = *argv++; argc--;
|
|
||||||
if (!ifn.compare("stdin"))
|
|
||||||
ifn.clear();
|
|
||||||
const char *ofn = *argv++; argc--;
|
|
||||||
|
|
||||||
string reason;
|
|
||||||
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
|
||||||
|
|
||||||
string odata;
|
|
||||||
if (!file_to_string(ifn, odata)) {
|
|
||||||
cerr << "file_to_string " << ifn << " : " << odata << endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
string ndata;
|
|
||||||
if (!unacmaybefold(odata, ndata, encoding, op)) {
|
|
||||||
cerr << "unac: " << ndata << endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int fd;
|
|
||||||
if (strcmp(ofn, "stdout")) {
|
|
||||||
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
|
||||||
} else {
|
|
||||||
fd = 1;
|
|
||||||
}
|
|
||||||
if (fd < 0) {
|
|
||||||
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
|
||||||
<< endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
|
||||||
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
close(fd);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user