textsplit: process unicode apostrophes and right quotation mark as ascii single quote

2019-02-01 16:10:51 +01:00 · 2019-02-01 16:10:51 +01:00 · bbeaebf632
commit bbeaebf632
parent b1ff34407d
3 changed files with 417 additions and 642 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -14,7 +14,7 @@
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
-#ifndef TEST_TEXTSPLIT
+
 #include "autoconfig.h"
 #include <assert.h>
@ -80,8 +80,7 @@ static std::unordered_set<unsigned int> sskip;
 class CharClassInit {
 public:
-    CharClassInit() 
+    CharClassInit() {
    {
        unsigned int i;
        // Set default value for all: SPACE
@ -138,7 +137,7 @@ public:
 };
 static const CharClassInit charClassInitInstance;
-static inline int whatcc(unsigned int c)
+static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 {
    if (c <= 127) {
        return charclasses[c]; 
@ -146,7 +145,15 @@ static inline int whatcc(unsigned int c)
        if (c == 0x2010) {
            // Special treatment for hyphen: handle as ascii minus. See
            // doc/notes/minus-hyphen-dash.txt
-            return 0x2010;
+            if (asciirep)
                *asciirep = '-';
            return c;
        } else if (c == 0x2019 || c == 0x275c || c == 0x02bc) {
            // Things sometimes replacing a single quote. Use single
            // quote so that span processing works ok
            if (asciirep)
                *asciirep = '\'';
            return c;
        } else if (sskip.find(c) != sskip.end()) {
            return SKIP;
        } else if (spunc.find(c) != spunc.end()) {
@ -623,7 +630,8 @@ bool TextSplit::text_to_words(const string &in)
 #endif
        prev_csc = csc;
-	int cc = whatcc(c);
+        char asciirep = 0;
        int cc = whatcc(c, &asciirep);
        switch (cc) {
        case SKIP:
@ -709,20 +717,6 @@ bool TextSplit::text_to_words(const string &in)
                }
            }
            goto SPACE;
 	    break;
 	case 0x2010:
            // Hyphen is replaced with ascii minus
 	    if (m_wordLen != 0) {
                // Treat '-' inside span as glue char
                if (!doemit(false, it.getBpos()))
                    return false;
                m_inNumber = false;
                m_span += '-';
                m_wordStart++;
                break;
            }
            goto SPACE;
        case '.':
        {
@ -768,6 +762,26 @@ bool TextSplit::text_to_words(const string &in)
        }
        break;
        case 0x2010:
        case 0x2019:
        case 0x275c:
        case 0x02bc:
            // Unicode chars which we replace with ascii for
            // processing (2010 -> -,others -> '). It happens that
            // they all work as glue chars and use the same code, but
            // there might be cases needing different processing.
            // Hyphen is replaced with ascii minus
            if (m_wordLen) {
                // Inside span: glue char
                if (!doemit(false, it.getBpos()))
                    return false;
                m_inNumber = false;
                m_span += asciirep;
                m_wordStart++;
                break;
            }
            goto SPACE;
        case '@':
        case '_':
        case '\'':
@ -1101,250 +1115,3 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
    return u8stringToStrings<vector<string> >(s, tokens);
 }
 #else  // TEST driver ->
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <string.h>
 #include <math.h>
 #include <iostream>
 #include "textsplit.h"
 #include "readfile.h"
 #include "log.h"
 #include "transcode.h"
 #include "unacpp.h"
 #include "termproc.h"
 using namespace std;
 class myTermProc : public Rcl::TermProc {
    int first;
    bool nooutput;
 public:
    myTermProc() : TermProc(0), first(1), nooutput(false) {}
    void setNoOut(bool val) {nooutput = val;}
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
 	if (nooutput)
 	    return true;
 	FILE *fp = stdout;
 	if (first) {
 	    fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
 	    first = 0;
 	}
 	fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
 	return true;
    }
 };
 // Command-line option bits, one per single-letter option accepted by
 // main(); they are OR-ed together into op_flags.
 #define OPT_s (1 << 0) // -s: spans only (TXTS_ONLYSPANS)
 #define OPT_w (1 << 1) // -w: words only (TXTS_NOSPANS)
 #define OPT_q (1 << 2) // -q: no output
 #define OPT_c (1 << 3) // -c: just count words
 #define OPT_k (1 << 4) // -k: keep wildcards (TXTS_KEEPWILD)
 #define OPT_C (1 << 5) // -C charset: input charset
 #define OPT_n (1 << 6) // -n: no numbers
 #define OPT_S (1 << 7) // -S stopfile: stop word file
 #define OPT_u (1 << 8) // -u: use unac
 #define OPT_p (1 << 9) // -p: show stringToStrings() results
 bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
 {
    myTermProc printproc;
    Rcl::TermProc *nxt = &printproc;
 //    Rcl::TermProcCommongrams commonproc(nxt, stoplist);
 //    if (op_flags & OPT_S)
 //        nxt = &commonproc;
    Rcl::TermProcPrep preproc(nxt);
    if (op_flags & OPT_u) 
        nxt = &preproc;
    Rcl::TextSplitP splitter(nxt, flags);
    if (op_flags & OPT_q)
        printproc.setNoOut(true);
    splitter.text_to_words(data);
 #ifdef TEXTSPLIT_STATS
 	TextSplit::Stats::Values v = splitter.getStats();
 	cout << "Average length: " 
 	     <<  v.avglen
 	     << " Standard deviation: " 
 	     << v.sigma
 	     << " Coef of variation "
 	     << v.sigma / v.avglen
 	     << endl;
 #endif
    return true;
 }
 // Built-in sample inputs, run through dosplit() one by one when the
 // driver is started without a file name argument.
 static const char *teststrings[] = {
     // Plain text, addresses, spans and numbers
     "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n",
     "\"Jean-Francois Dockes\" <jfd@okyz.com>\n",
     "n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'",
     "_network_ some_span",
     "data123\n",
     "134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n",
     // Punctuation and separators
     "@^#$(#$(*)\n",
     "192.168.4.1 one\n\rtwo\r",
     "[olala][ululu]  (valeur) (23)\n",
     "utf-8 ucs-4© \\nodef\n",
     "A b C 2 . +",
     "','this\n",
     " ,able,test-domain",
     " -wl,--export-dynamic",
     " ~/.xsession-errors",
     "this_very_long_span_this_very_long_span_this_very_long_span",
     // UTF-8 soft hyphen (bytes c2 ad), hyphen U+2010 (bytes e2 80 90)
     // and ascii hyphen-minus, with and without line breaks
     "soft\xc2\xadhyphen",
     "soft\xc2\xad\nhyphen",
     "soft\xc2\xad\n\rhyphen",
     "real\xe2\x80\x90hyphen",
     "real\xe2\x80\x90\nhyphen",
     "hyphen-\nminus",
 };
 // Number of entries in teststrings.
 const int teststrings_cnt = sizeof(teststrings) / sizeof(teststrings[0]);
 // Extra sample input kept alongside the built-in test strings.
 static string teststring1 = " nouvel-an ";
 // Program name as invoked; set from argv[0] at the start of main().
 static string thisprog;
 // Help text printed by Usage(). The last entry describes the separate
 // -p mode; its program name used to be misspelled "textplit".
 static string usage =
     " textsplit [opts] [filename]\n"
     "   -q : no output\n"
     "   -s :  only spans\n"
     "   -w :  only words\n"
     "   -n :  no numbers\n"
     "   -k :  preserve wildcards (?*)\n"
     "   -c : just count words\n"
     "   -u : use unac\n"
     "   -C [charset] : input charset\n"
     "   -S [stopfile] : stopfile to use for commongrams\n"
     " if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
     " textsplit -p somephrase : display results from stringToStrings()\n"
     "  \n"
     ;
 // Write the program name followed by the usage text to stderr, then
 // terminate the process with exit status 1.
 static void Usage()
 {
     cerr << thisprog;
     cerr << ": usage:\n";
     cerr << usage;
     exit(1);
 }
 static int        op_flags;
 int main(int argc, char **argv)
 {
    string charset, stopfile;
    thisprog = argv[0];
    argc--; argv++;
    while (argc > 0 && **argv == '-') {
 	(*argv)++;
 	if (!(**argv))
 	    /* Cas du "adb - core" */
 	    Usage();
 	while (**argv)
 	    switch (*(*argv)++) {
 	    case 'c':	op_flags |= OPT_c; break;
            case 'C':	op_flags |= OPT_C; if (argc < 2)  Usage();
                charset = *(++argv); argc--; 
                goto b1;
 	    case 'k':	op_flags |= OPT_k; break;
 	    case 'n':	op_flags |= OPT_n; break;
 	    case 'p':	op_flags |= OPT_p; break;
 	    case 'q':	op_flags |= OPT_q; break;
 	    case 's':	op_flags |= OPT_s; break;
            case 'S':	op_flags |= OPT_S; if (argc < 2)  Usage();
                stopfile = *(++argv); argc--; 
                goto b1;
 	    case 'u':	op_flags |= OPT_u; break;
 	    case 'w':	op_flags |= OPT_w; break;
 	    default: Usage();	break;
 	    }
    b1: argc--; argv++;
    }
    TextSplit::Flags flags = TextSplit::TXTS_NONE;
    if (op_flags&OPT_s)
 	flags = TextSplit::TXTS_ONLYSPANS;
    else if (op_flags&OPT_w)
 	flags = TextSplit::TXTS_NOSPANS;
    if (op_flags & OPT_k) 
 	flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); 
    if (op_flags & OPT_n)
 	TextSplit::noNumbers();
    Rcl::StopList stoplist;
    if (op_flags & OPT_S) {
 	if (!stoplist.setFile(stopfile)) {
 	    cerr << "Can't read stopfile: " << stopfile << endl;
 	    exit(1);
 	}
    }
    string odata, reason;
    if (argc == 1) {
 	const char *filename = *argv++;	argc--;
        if (op_flags& OPT_p) {
            vector<string> tokens;
            TextSplit::stringToStrings(filename, tokens);
            for (vector<string>::const_iterator it = tokens.begin();
                 it != tokens.end(); it++) {
                cout << "[" << *it << "] ";
            }
            cout << endl;
            exit(0);
        }
 	if (!strcmp(filename, "stdin")) {
 	    char buf[1024];
 	    int nread;
 	    while ((nread = read(0, buf, 1024)) > 0) {
 		odata.append(buf, nread);
 	    }
 	} else if (!file_to_string(filename, odata, &reason)) {
            cerr << "Failed: file_to_string(" << filename << ") failed: " 
                 << reason << endl;
 	    exit(1);
        }
    } else {
        if (op_flags & OPT_p)
            Usage();
        for (int i = 0; i < teststrings_cnt; i++) {
            cout << endl << teststrings[i] << endl;  
            dosplit(teststrings[i], flags, op_flags);
        }
        exit(0);
    }
    string& data = odata;
    string ndata;
    if ((op_flags & OPT_C)) {
        if (!transcode(odata, ndata, charset, "UTF-8")) {
            cerr << "Failed: transcode error" << endl;
            exit(1);
        } else {
            data = ndata;
        }
    }
    if (op_flags & OPT_c) {
 	int n = TextSplit::countWords(data, flags);
 	cout << n << " words" << endl;
    } else {
        dosplit(data, flags, op_flags);
    }    
 }
 #endif // TEST
--- a/tests/html/html.sh
+++ b/tests/html/html.sh
@ -11,7 +11,13 @@ recollq '"This is the Mysql reference manual"'
 # Tests that the charset spec is correctly recognised inside badhtml.html
 recollq -a 'etonne badhtml' 
 # Tests field extraction/storage and indexing
-recollq -m -q "testfield:testfieldvalue" | egrep 'results|^text/html|^testfield ='
+recollq -m -q "testfield:testfieldvalue" | \
    egrep 'results|^text/html|^testfield ='
 # Not specifically HTML. apos.html has text where an apostrophe-like
 # Unicode character is used in place of ASCII ' . Checks that we
 # replace the character for proper span processing
 recollq -q '"'imperfections de l"'"oeil'"'
 # more unaccenting tests
 recollq -q 'effaranteUTF8HTML'
--- a/tests/html/html.txt
+++ b/tests/html/html.txt
@ -11,6 +11,8 @@ text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html]	["
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html]	[htmlfield.html]	137	bytes	
 testfield = testfieldvalue
 1 results
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/apos.html]	[apos.html]	344	bytes	
 1 results
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html]	[Some chars]	330	bytes	
 1 results
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html]	[Some chars]	330	bytes