textsplit: process unicode apostrophes and right quotation mark as ascii single quote
This commit is contained in:
parent
b1ff34407d
commit
bbeaebf632
@ -14,7 +14,7 @@
|
|||||||
* Free Software Foundation, Inc.,
|
* Free Software Foundation, Inc.,
|
||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
*/
|
*/
|
||||||
#ifndef TEST_TEXTSPLIT
|
|
||||||
#include "autoconfig.h"
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
@ -80,8 +80,7 @@ static std::unordered_set<unsigned int> sskip;
|
|||||||
|
|
||||||
class CharClassInit {
|
class CharClassInit {
|
||||||
public:
|
public:
|
||||||
CharClassInit()
|
CharClassInit() {
|
||||||
{
|
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
|
||||||
// Set default value for all: SPACE
|
// Set default value for all: SPACE
|
||||||
@ -138,7 +137,7 @@ public:
|
|||||||
};
|
};
|
||||||
static const CharClassInit charClassInitInstance;
|
static const CharClassInit charClassInitInstance;
|
||||||
|
|
||||||
static inline int whatcc(unsigned int c)
|
static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
||||||
{
|
{
|
||||||
if (c <= 127) {
|
if (c <= 127) {
|
||||||
return charclasses[c];
|
return charclasses[c];
|
||||||
@ -146,7 +145,15 @@ static inline int whatcc(unsigned int c)
|
|||||||
if (c == 0x2010) {
|
if (c == 0x2010) {
|
||||||
// Special treatment for hyphen: handle as ascii minus. See
|
// Special treatment for hyphen: handle as ascii minus. See
|
||||||
// doc/notes/minus-hyphen-dash.txt
|
// doc/notes/minus-hyphen-dash.txt
|
||||||
return 0x2010;
|
if (asciirep)
|
||||||
|
*asciirep = '-';
|
||||||
|
return c;
|
||||||
|
} else if (c == 0x2019 || c == 0x275c || c == 0x02bc) {
|
||||||
|
// Things sometimes replacing a single quote. Use single
|
||||||
|
// quote so that span processing works ok
|
||||||
|
if (asciirep)
|
||||||
|
*asciirep = '\'';
|
||||||
|
return c;
|
||||||
} else if (sskip.find(c) != sskip.end()) {
|
} else if (sskip.find(c) != sskip.end()) {
|
||||||
return SKIP;
|
return SKIP;
|
||||||
} else if (spunc.find(c) != spunc.end()) {
|
} else if (spunc.find(c) != spunc.end()) {
|
||||||
@ -623,7 +630,8 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
prev_csc = csc;
|
prev_csc = csc;
|
||||||
int cc = whatcc(c);
|
char asciirep = 0;
|
||||||
|
int cc = whatcc(c, &asciirep);
|
||||||
|
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
case SKIP:
|
case SKIP:
|
||||||
@ -709,20 +717,6 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
break;
|
|
||||||
|
|
||||||
case 0x2010:
|
|
||||||
// Hyphen is replaced with ascii minus
|
|
||||||
if (m_wordLen != 0) {
|
|
||||||
// Treat '-' inside span as glue char
|
|
||||||
if (!doemit(false, it.getBpos()))
|
|
||||||
return false;
|
|
||||||
m_inNumber = false;
|
|
||||||
m_span += '-';
|
|
||||||
m_wordStart++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
goto SPACE;
|
|
||||||
|
|
||||||
case '.':
|
case '.':
|
||||||
{
|
{
|
||||||
@ -768,6 +762,26 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 0x2010:
|
||||||
|
case 0x2019:
|
||||||
|
case 0x275c:
|
||||||
|
case 0x02bc:
|
||||||
|
// Unicode chars which we replace with ascii for
|
||||||
|
// processing (0x2010 -> '-', others -> '\''). It happens that
|
||||||
|
// they all work as glue chars and use the same code, but
|
||||||
|
// there might be cases needing different processing.
|
||||||
|
// asciirep holds the ASCII replacement character set by whatcc()
|
||||||
|
if (m_wordLen) {
|
||||||
|
// Inside span: glue char
|
||||||
|
if (!doemit(false, it.getBpos()))
|
||||||
|
return false;
|
||||||
|
m_inNumber = false;
|
||||||
|
m_span += asciirep;
|
||||||
|
m_wordStart++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
goto SPACE;
|
||||||
|
|
||||||
case '@':
|
case '@':
|
||||||
case '_':
|
case '_':
|
||||||
case '\'':
|
case '\'':
|
||||||
@ -1101,250 +1115,3 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
|
|||||||
return u8stringToStrings<vector<string> >(s, tokens);
|
return u8stringToStrings<vector<string> >(s, tokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // TEST driver ->
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <errno.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <math.h>
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
#include "textsplit.h"
|
|
||||||
#include "readfile.h"
|
|
||||||
#include "log.h"
|
|
||||||
|
|
||||||
#include "transcode.h"
|
|
||||||
#include "unacpp.h"
|
|
||||||
#include "termproc.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class myTermProc : public Rcl::TermProc {
|
|
||||||
int first;
|
|
||||||
bool nooutput;
|
|
||||||
public:
|
|
||||||
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
|
||||||
void setNoOut(bool val) {nooutput = val;}
|
|
||||||
virtual bool takeword(const string &term, int pos, int bs, int be)
|
|
||||||
{
|
|
||||||
if (nooutput)
|
|
||||||
return true;
|
|
||||||
FILE *fp = stdout;
|
|
||||||
if (first) {
|
|
||||||
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
|
||||||
first = 0;
|
|
||||||
}
|
|
||||||
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#define OPT_s 0x1
|
|
||||||
#define OPT_w 0x2
|
|
||||||
#define OPT_q 0x4
|
|
||||||
#define OPT_c 0x8
|
|
||||||
#define OPT_k 0x10
|
|
||||||
#define OPT_C 0x20
|
|
||||||
#define OPT_n 0x40
|
|
||||||
#define OPT_S 0x80
|
|
||||||
#define OPT_u 0x100
|
|
||||||
#define OPT_p 0x200
|
|
||||||
|
|
||||||
bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
|
||||||
{
|
|
||||||
myTermProc printproc;
|
|
||||||
|
|
||||||
Rcl::TermProc *nxt = &printproc;
|
|
||||||
|
|
||||||
// Rcl::TermProcCommongrams commonproc(nxt, stoplist);
|
|
||||||
// if (op_flags & OPT_S)
|
|
||||||
// nxt = &commonproc;
|
|
||||||
|
|
||||||
Rcl::TermProcPrep preproc(nxt);
|
|
||||||
if (op_flags & OPT_u)
|
|
||||||
nxt = &preproc;
|
|
||||||
|
|
||||||
Rcl::TextSplitP splitter(nxt, flags);
|
|
||||||
|
|
||||||
if (op_flags & OPT_q)
|
|
||||||
printproc.setNoOut(true);
|
|
||||||
|
|
||||||
splitter.text_to_words(data);
|
|
||||||
|
|
||||||
#ifdef TEXTSPLIT_STATS
|
|
||||||
TextSplit::Stats::Values v = splitter.getStats();
|
|
||||||
cout << "Average length: "
|
|
||||||
<< v.avglen
|
|
||||||
<< " Standard deviation: "
|
|
||||||
<< v.sigma
|
|
||||||
<< " Coef of variation "
|
|
||||||
<< v.sigma / v.avglen
|
|
||||||
<< endl;
|
|
||||||
#endif
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char *teststrings[] = {
|
|
||||||
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n",
|
|
||||||
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n",
|
|
||||||
"n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'",
|
|
||||||
"_network_ some_span",
|
|
||||||
"data123\n",
|
|
||||||
"134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n",
|
|
||||||
"@^#$(#$(*)\n",
|
|
||||||
"192.168.4.1 one\n\rtwo\r",
|
|
||||||
"[olala][ululu] (valeur) (23)\n",
|
|
||||||
"utf-8 ucs-4© \\nodef\n",
|
|
||||||
"A b C 2 . +",
|
|
||||||
"','this\n",
|
|
||||||
" ,able,test-domain",
|
|
||||||
" -wl,--export-dynamic",
|
|
||||||
" ~/.xsession-errors",
|
|
||||||
"this_very_long_span_this_very_long_span_this_very_long_span",
|
|
||||||
"soft\xc2\xadhyphen",
|
|
||||||
"soft\xc2\xad\nhyphen",
|
|
||||||
"soft\xc2\xad\n\rhyphen",
|
|
||||||
"real\xe2\x80\x90hyphen",
|
|
||||||
"real\xe2\x80\x90\nhyphen",
|
|
||||||
"hyphen-\nminus",
|
|
||||||
};
|
|
||||||
const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
|
|
||||||
|
|
||||||
static string teststring1 = " nouvel-an ";
|
|
||||||
|
|
||||||
static string thisprog;
|
|
||||||
|
|
||||||
static string usage =
|
|
||||||
" textsplit [opts] [filename]\n"
|
|
||||||
" -q : no output\n"
|
|
||||||
" -s : only spans\n"
|
|
||||||
" -w : only words\n"
|
|
||||||
" -n : no numbers\n"
|
|
||||||
" -k : preserve wildcards (?*)\n"
|
|
||||||
" -c : just count words\n"
|
|
||||||
" -u : use unac\n"
|
|
||||||
" -C [charset] : input charset\n"
|
|
||||||
" -S [stopfile] : stopfile to use for commongrams\n"
|
|
||||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
|
|
||||||
" textplit -p somephrase : display results from stringToStrings()\n"
|
|
||||||
" \n"
|
|
||||||
;
|
|
||||||
|
|
||||||
static void
|
|
||||||
Usage(void)
|
|
||||||
{
|
|
||||||
cerr << thisprog << ": usage:\n" << usage;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int op_flags;
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
string charset, stopfile;
|
|
||||||
|
|
||||||
thisprog = argv[0];
|
|
||||||
argc--; argv++;
|
|
||||||
|
|
||||||
while (argc > 0 && **argv == '-') {
|
|
||||||
(*argv)++;
|
|
||||||
if (!(**argv))
|
|
||||||
/* The "adb - core" case: a lone "-" argument is rejected */
|
|
||||||
Usage();
|
|
||||||
while (**argv)
|
|
||||||
switch (*(*argv)++) {
|
|
||||||
case 'c': op_flags |= OPT_c; break;
|
|
||||||
case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
|
|
||||||
charset = *(++argv); argc--;
|
|
||||||
goto b1;
|
|
||||||
case 'k': op_flags |= OPT_k; break;
|
|
||||||
case 'n': op_flags |= OPT_n; break;
|
|
||||||
case 'p': op_flags |= OPT_p; break;
|
|
||||||
case 'q': op_flags |= OPT_q; break;
|
|
||||||
case 's': op_flags |= OPT_s; break;
|
|
||||||
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
|
|
||||||
stopfile = *(++argv); argc--;
|
|
||||||
goto b1;
|
|
||||||
case 'u': op_flags |= OPT_u; break;
|
|
||||||
case 'w': op_flags |= OPT_w; break;
|
|
||||||
default: Usage(); break;
|
|
||||||
}
|
|
||||||
b1: argc--; argv++;
|
|
||||||
}
|
|
||||||
|
|
||||||
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
|
||||||
|
|
||||||
if (op_flags&OPT_s)
|
|
||||||
flags = TextSplit::TXTS_ONLYSPANS;
|
|
||||||
else if (op_flags&OPT_w)
|
|
||||||
flags = TextSplit::TXTS_NOSPANS;
|
|
||||||
if (op_flags & OPT_k)
|
|
||||||
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
|
||||||
if (op_flags & OPT_n)
|
|
||||||
TextSplit::noNumbers();
|
|
||||||
|
|
||||||
Rcl::StopList stoplist;
|
|
||||||
if (op_flags & OPT_S) {
|
|
||||||
if (!stoplist.setFile(stopfile)) {
|
|
||||||
cerr << "Can't read stopfile: " << stopfile << endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
string odata, reason;
|
|
||||||
if (argc == 1) {
|
|
||||||
const char *filename = *argv++; argc--;
|
|
||||||
if (op_flags& OPT_p) {
|
|
||||||
vector<string> tokens;
|
|
||||||
TextSplit::stringToStrings(filename, tokens);
|
|
||||||
for (vector<string>::const_iterator it = tokens.begin();
|
|
||||||
it != tokens.end(); it++) {
|
|
||||||
cout << "[" << *it << "] ";
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
if (!strcmp(filename, "stdin")) {
|
|
||||||
char buf[1024];
|
|
||||||
int nread;
|
|
||||||
while ((nread = read(0, buf, 1024)) > 0) {
|
|
||||||
odata.append(buf, nread);
|
|
||||||
}
|
|
||||||
} else if (!file_to_string(filename, odata, &reason)) {
|
|
||||||
cerr << "Failed: file_to_string(" << filename << ") failed: "
|
|
||||||
<< reason << endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (op_flags & OPT_p)
|
|
||||||
Usage();
|
|
||||||
for (int i = 0; i < teststrings_cnt; i++) {
|
|
||||||
cout << endl << teststrings[i] << endl;
|
|
||||||
dosplit(teststrings[i], flags, op_flags);
|
|
||||||
}
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
string& data = odata;
|
|
||||||
string ndata;
|
|
||||||
if ((op_flags & OPT_C)) {
|
|
||||||
if (!transcode(odata, ndata, charset, "UTF-8")) {
|
|
||||||
cerr << "Failed: transcode error" << endl;
|
|
||||||
exit(1);
|
|
||||||
} else {
|
|
||||||
data = ndata;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (op_flags & OPT_c) {
|
|
||||||
int n = TextSplit::countWords(data, flags);
|
|
||||||
cout << n << " words" << endl;
|
|
||||||
} else {
|
|
||||||
dosplit(data, flags, op_flags);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif // TEST
|
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,13 @@ recollq '"This is the Mysql reference manual"'
|
|||||||
# Tests that the charset spec is correctly recognised inside badhtml.html
|
# Tests that the charset spec is correctly recognised inside badhtml.html
|
||||||
recollq -a 'etonne badhtml'
|
recollq -a 'etonne badhtml'
|
||||||
# Tests field extraction/storage and indexing
|
# Tests field extraction/storage and indexing
|
||||||
recollq -m -q "testfield:testfieldvalue" | egrep 'results|^text/html|^testfield ='
|
recollq -m -q "testfield:testfieldvalue" | \
|
||||||
|
egrep 'results|^text/html|^testfield ='
|
||||||
|
|
||||||
|
# Not specifically HTML. apos.html has text where an apostrophe-like
|
||||||
|
# Unicode character is used in place of the ASCII apostrophe ('). Checks that we
|
||||||
|
# replace the character for proper span processing
|
||||||
|
recollq -q '"'imperfections de l"'"oeil'"'
|
||||||
|
|
||||||
# more unaccenting tests
|
# more unaccenting tests
|
||||||
recollq -q 'effaranteUTF8HTML'
|
recollq -q 'effaranteUTF8HTML'
|
||||||
|
|||||||
@ -11,6 +11,8 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] ["
|
|||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes
|
||||||
testfield = testfieldvalue
|
testfield = testfieldvalue
|
||||||
1 results
|
1 results
|
||||||
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/apos.html] [apos.html] 344 bytes
|
||||||
|
1 results
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes
|
||||||
1 results
|
1 results
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user