recoll/unac/unactest1.c

498 lines
14 KiB
C
Raw Permalink Blame History

/*
* Copyright (C) 2000, 2001, 2002 Loic Dachary <loic@senga.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
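/*
* Exercise unac_string on ISO-8859-1 input: two short conversions, then
* an input large enough to trigger re-allocation, converted three times
* while reusing the same out / out_length pair.
*/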
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "unac.h"
/*
 * Reference text for the long-input test: main() compares the output of
 * unac_string() applied to longstr against this string with memcmp().
 * NOTE(review): appears to be the same text as longstr with the
 * non-ASCII characters replaced by plain ASCII (e.g. "General" here) --
 * confirm against the original file.
 */
static char* longstr_expected =
"\n"
"Senga - Catalog software\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
" \n"
" \n"
" \n"
"\n"
" \n"
" \n"
" \n"
" \n"
" \n"
"\n"
" \n"
" \n"
" senga.org\n"
" \n"
"\n"
" \n"
"\n"
"\n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
"\n"
"\n"
"\n"
"\n"
"\n"
"December 28, 2000 \n"
" \n"
" January 27, 2000\n"
" Catalog-1.02 \n"
" is available. \n"
" \n"
" The dmoz loading process has been dramatically simplified. It is\n"
" now only available as a command. No more fancy web interface that\n"
" confuses everyone. In addition the convert_dmoz script now generates\n"
" text files that can be directly loaded into Catalog instead of the\n"
" intermediate XML file. The whole loading process now takes from \n"
" one to two hours depending on your machine. It took around 10 hours\n"
" with the previous version. \n"
" The -exclude option was added to convert_dmoz to get rid of \n"
" a whole branch of the catalog at load time. Typical usage would\n"
" be convert_dmoz -exclude '^/Adult' -what content content.rdf.gz.\n"
" A lot more sanity checks and repair have been added to deal with\n"
" duplicates, category id conflicts and the like.\n"
" Hopefully this new method will also be more understandable and \n"
" generate less traffic on the mailing list. There is room for \n"
" improvements and contributors are welcome. \n"
" \n"
" A new set of software is available in the \n"
" download directory under the RedHat-6.1 section. These\n"
" are the most up to date versions on which Catalog depends. Although the\n"
" binaries depend on RedHat-6.1 the perl modules are source and can be\n"
" used on any platform.\n"
" \n"
" September 7, 1999\n"
" Catalog-1.01 \n"
" is available. \n"
" This is a maintainance release.\n"
" \n"
" Various bug fixes. All easy\n"
" to fix bugs have been fixed. Take a look at Bug Track to see what hasn't been fixed.\n"
" The _PATHTEXT_ and _PATHFILE_ \n"
" tags syntax has been extended to specify a range of path component.\n"
" \n"
" Graham Barr added a recursive\n"
" template feature for a catalog root page. This allows to show sub-categories\n"
" of the root categories in the root page of a catalog.\n"
" \n"
" \n"
" Don't hesitate to submit bugs\n"
" or ideas to bug track. Hopefully the next version of Catalog will have\n"
" a fast full text indexing mechanism and I'll be able to implement new\n"
" functionalities.\n"
" \n"
" Have fun !\n"
" July 3, 1999\n"
" Catalog-1.00 \n"
" is available. \n"
" This release includes PHP3 \n"
" code to display a catalog. The author is Weston Bustraan (weston@infinityteldata.net). \n"
" The main motivation to jump directly to version 1.00 is to avoid version \n"
" number problems on CPAN. \n"
" July 2, 1999\n"
" Catalog-0.19 \n"
" is available. \n"
" This is a minor release. The \n"
" most noticeable addition is the new search mechanism.\n"
" \n"
" Searching : two search modes \n"
" are now available. AltaVista simple syntax and AltaVista advanced syntax. \n"
" Both use the Text-Query and Text-Query-SQL perl modules. \n"
" Dmoz loading is much more \n"
" fault tolerant. In addition it can handle compressed versions of content.rdf \n"
" and structure.rdf. The comments are now stored in text fields instead \n"
" of char(255).\n"
" The template system was \n"
" extended with the pre_fill and post_fill parameters.\n"
" Searching associated to \n"
" a catalog dumped to static pages is now possible using the 'static' \n"
" mode.\n"
" Fixed two security weakness \n"
" in confedit and recursive cgi handling.\n"
" Many sql queries have been \n"
" optimized.\n"
" The configuration was changed \n"
" a bit to fix bugs and to isolate database dependencies.\n"
" The tests were updated to \n"
" isolate database dependencies. \n"
" Fixed numerous minor bugs, \n"
" check ChangeLog if you're interested in details.\n"
" \n"
" Many thanks to Tim Bunce for \n"
" his numerous contributions and ideas. He is the architect of the Text-Query \n"
" and Text-Query-SQL modules, Eric Bohlman and Loic Dachary did the programming. \n"
" \n"
" Thanks to Eric Bohlman for \n"
" his help on the Text-Query module. He was very busy but managed to spend \n"
" the time needed to release it. \n"
" There is not yet anything usable \n"
" for full text indexing but we keep working on it. The storage management \n"
" is now handled by the reiserfs file system thanks to Hans Reiser who is \n"
" working full time on this. Loic Dachary does his best to get something \n"
" working, if you're interested go to http://www.senga.org/mifluz/. \n"
" For some mysterious reason \n"
" CPAN lost track of Catalog name. In order to install catalog you should \n"
" use perl -MCPAN -e 'install Catalog::db'. Weird but temporary.\n"
" Have fun !\n"
" The Senga Team\n"
" Ecila\n"
" 100 Av. du General Leclerc\n"
" 93 500 Pantin\n"
" Tel: 33 1 56 96 09 80\n"
" Fax: 33 1 56 96 09 81\n"
" WEB: http://www.senga.org/\n"
" Mail: senga@senga.org\n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
"\n"
"[\n"
"Catalog |\n"
"webbase |\n"
"mifluz |\n"
"unac |\n"
"Search-Mifluz |\n"
"Text-Query |\n"
"uri |\n"
"Statistics |\n"
"News\n"
"]\n"
"\n"
"\n"
" \n"
" \n"
"\n"
"\n"
"\n"
;
/*
 * Input text for the long-input test: main() passes it to unac_string()
 * three times, declared as ISO-8859-1, with its strlen() as the length.
 * NOTE(review): the "<20>" and "<EFBFBD>" sequences below look like
 * placeholders left by a viewer/export for raw non-ASCII ISO-8859-1
 * bytes rather than intended text -- verify against the original file
 * before relying on this data.
 */
static char* longstr =
"\n"
"Senga - Catalog software\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
" \n"
" \n"
" \n"
"\n"
" \n"
" \n"
" \n"
" \n"
" <20>\n"
"\n"
" \n"
" \n"
" senga.org\n"
" \n"
"\n"
"<EFBFBD> \n"
"\n"
"\n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
" \n"
"\n"
"\n"
"\n"
"\n"
"\n"
"December 28, 2000 \n"
" \n"
" January 27, 2000\n"
" Catalog-1.02 \n"
" is available. \n"
" \n"
" The dmoz loading process has been dramatically simplified. It is\n"
" now only available as a command. No more fancy web interface that\n"
" confuses everyone. In addition the convert_dmoz script now generates\n"
" text files that can be directly loaded into Catalog instead of the\n"
" intermediate XML file. The whole loading process now takes from \n"
" one to two hours depending on your machine. It took around 10 hours\n"
" with the previous version. \n"
" The -exclude option was added to convert_dmoz to get rid of \n"
" a whole branch of the catalog at load time. Typical usage would\n"
" be convert_dmoz -exclude '^/Adult' -what content content.rdf.gz.\n"
" A lot more sanity checks and repair have been added to deal with\n"
" duplicates, category id conflicts and the like.\n"
" Hopefully this new method will also be more understandable and \n"
" generate less traffic on the mailing list. There is room for \n"
" improvements and contributors are welcome. \n"
" \n"
" A new set of software is available in the \n"
" download directory under the RedHat-6.1 section. These\n"
" are the most up to date versions on which Catalog depends. Although the\n"
" binaries depend on RedHat-6.1 the perl modules are source and can be\n"
" used on any platform.\n"
" \n"
" September 7, 1999\n"
" Catalog-1.01 \n"
" is available. \n"
" This is a maintainance release.\n"
" \n"
" Various bug fixes. All easy\n"
" to fix bugs have been fixed. Take a look at Bug Track to see what hasn't been fixed.\n"
" The _PATHTEXT_ and _PATHFILE_ \n"
" tags syntax has been extended to specify a range of path component.\n"
" \n"
" Graham Barr added a recursive\n"
" template feature for a catalog root page. This allows to show sub-categories\n"
" of the root categories in the root page of a catalog.\n"
" \n"
" \n"
" Don't hesitate to submit bugs\n"
" or ideas to bug track. Hopefully the next version of Catalog will have\n"
" a fast full text indexing mechanism and I'll be able to implement new\n"
" functionalities.\n"
" \n"
" Have fun !\n"
" July 3, 1999\n"
" Catalog-1.00 \n"
" is available. \n"
" This release includes PHP3 \n"
" code to display a catalog. The author is Weston Bustraan (weston@infinityteldata.net). \n"
" The main motivation to jump directly to version 1.00 is to avoid version \n"
" number problems on CPAN. \n"
" July 2, 1999\n"
" Catalog-0.19 \n"
" is available. \n"
" This is a minor release. The \n"
" most noticeable addition is the new search mechanism.\n"
" \n"
" Searching : two search modes \n"
" are now available. AltaVista simple syntax and AltaVista advanced syntax. \n"
" Both use the Text-Query and Text-Query-SQL perl modules. \n"
" Dmoz loading is much more \n"
" fault tolerant. In addition it can handle compressed versions of content.rdf \n"
" and structure.rdf. The comments are now stored in text fields instead \n"
" of char(255).\n"
" The template system was \n"
" extended with the pre_fill and post_fill parameters.\n"
" Searching associated to \n"
" a catalog dumped to static pages is now possible using the 'static' \n"
" mode.\n"
" Fixed two security weakness \n"
" in confedit and recursive cgi handling.\n"
" Many sql queries have been \n"
" optimized.\n"
" The configuration was changed \n"
" a bit to fix bugs and to isolate database dependencies.\n"
" The tests were updated to \n"
" isolate database dependencies. \n"
" Fixed numerous minor bugs, \n"
" check ChangeLog if you're interested in details.\n"
" \n"
" Many thanks to Tim Bunce for \n"
" his numerous contributions and ideas. He is the architect of the Text-Query \n"
" and Text-Query-SQL modules, Eric Bohlman and Loic Dachary did the programming. \n"
" \n"
" Thanks to Eric Bohlman for \n"
" his help on the Text-Query module. He was very busy but managed to spend \n"
" the time needed to release it. \n"
" There is not yet anything usable \n"
" for full text indexing but we keep working on it. The storage management \n"
" is now handled by the reiserfs file system thanks to Hans Reiser who is \n"
" working full time on this. Loic Dachary does his best to get something \n"
" working, if you're interested go to http://www.senga.org/mifluz/. \n"
" For some mysterious reason \n"
" CPAN lost track of Catalog name. In order to install catalog you should \n"
" use perl -MCPAN -e 'install Catalog::db'. Weird but temporary.\n"
" Have fun !\n"
" The Senga Team\n"
" Ecila\n"
" 100 Av. du G<>n<EFBFBD>ral Leclerc\n"
" 93 500 Pantin\n"
" Tel: 33 1 56 96 09 80\n"
" Fax: 33 1 56 96 09 81\n"
" WEB: http://www.senga.org/\n"
" Mail: senga@senga.org\n"
" \n"
" \n"
" \n"
" \n"
" <20>\n"
" \n"
" \n"
"\n"
"[\n"
"Catalog |\n"
"webbase |\n"
"mifluz |\n"
"unac |\n"
"Search-Mifluz |\n"
"Text-Query |\n"
"uri |\n"
"Statistics |\n"
"News\n"
"]\n"
"\n"
"\n"
" \n"
" \n"
"\n"
"\n"
"\n"
;
/*
 * Test driver for unac_string() on ISO-8859-1 input.
 *
 * Runs three checks, passing the same out / out_length pair to every
 * call:
 *   1. The three bytes 0xE9 't' 0xE9 (e-acute, t, e-acute) must come
 *      back as "ete".
 *   2. The single byte 0xBC (one quarter sign) must come back as the
 *      three bytes "1 4".
 *   3. longstr, converted three times in a row, must come back with its
 *      length unchanged and its content equal to longstr_expected.
 *
 * On the first failure a diagnostic is written to stderr and the process
 * exits with status 1; returns 0 when every check passes.
 */
int main(void) {
  int i;
  char* out = 0;
  size_t out_length = 0;
  /* Measured once (loop-invariant) and kept as size_t so the comparisons
   * with out_length are unsigned-to-unsigned. */
  const size_t longstr_length = strlen(longstr);
  const size_t expected_length = strlen(longstr_expected);

  /*
   * Check 1: accented letters. The input is spelled with octal escapes
   * so that it does not depend on the encoding of this source file.
   */
  if(unac_string("ISO-8859-1", "\351t\351", 3, &out, &out_length) < 0) {
    perror("unac \351t\351");
    exit(1);
  }
  if(out_length != 3) {
    fprintf(stderr, "out_length == %d instead of 3\n", (int)out_length);
    exit(1);
  }
  if(memcmp("ete", out, out_length)) {
    fprintf(stderr, "out == %.*s instead of ete\n", (int)out_length, out);
    exit(1);
  }

  /*
   * Check 2: one input byte (0xBC, octal 274) that expands to three
   * output bytes.
   */
  if(unac_string("ISO-8859-1", "\274", 1, &out, &out_length) < 0) {
    perror("unac 0xBC (1/4)");
    exit(1);
  }
  if(out_length != 3) {
    fprintf(stderr, "out_length == %d instead of 3\n", (int)out_length);
    exit(1);
  }
  if(memcmp("1 4", out, out_length)) {
    fprintf(stderr, "out == %.*s instead of '1 4'\n", (int)out_length, out);
    exit(1);
  }

  /*
   * Check 3: the long input, repeated so that later passes run with the
   * out buffer left behind by the earlier ones.
   */
  for(i = 0; i < 3; i++) {
    if(unac_string("ISO-8859-1", longstr, longstr_length, &out, &out_length) < 0) {
      perror("unac_string longstr failed");
      exit(1);
    }
    if(out_length != longstr_length) {
      fprintf(stderr, "out_length == %d instead of %d\n", (int)out_length, (int)longstr_length);
      exit(1);
    }
    /* memcmp() below reads out_length bytes from longstr_expected, so
     * make sure the reference really is that long first. */
    if(out_length != expected_length) {
      fprintf(stderr, "out_length == %d but longstr_expected is %d bytes long\n", (int)out_length, (int)expected_length);
      exit(1);
    }
    if(memcmp(longstr_expected, out, out_length)) {
      fprintf(stderr, "out == %.*s instead of longstr_expected\n", (int)out_length, out);
      exit(1);
    }
  }

  free(out);
  return 0;
}