textsplit: process unicode apostrophes and right quotation mark as ascii single quote

2019-02-01 16:10:51 +01:00 · 2019-02-01 16:10:51 +01:00 · bbeaebf632
commit bbeaebf632
parent b1ff34407d
3 changed files with 417 additions and 642 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -14,7 +14,7 @@
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
-#ifndef TEST_TEXTSPLIT
+
 #include "autoconfig.h"

 #include <assert.h>
@ -80,8 +80,7 @@ static std::unordered_set<unsigned int> sskip;

 class CharClassInit {
 public:
-    CharClassInit() 
-    {
+    CharClassInit() {
        unsigned int i;

        // Set default value for all: SPACE
@ -138,7 +137,7 @@ public:
 };
 static const CharClassInit charClassInitInstance;

-static inline int whatcc(unsigned int c)
+static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 {
    if (c <= 127) {
        return charclasses[c]; 
@ -146,7 +145,15 @@ static inline int whatcc(unsigned int c)
        if (c == 0x2010) {
            // Special treatment for hyphen: handle as ascii minus. See
            // doc/notes/minus-hyphen-dash.txt
-            return 0x2010;
+            if (asciirep)
+                *asciirep = '-';
+            return c;
+        } else if (c == 0x2019 || c == 0x275c || c == 0x02bc) {
+            // Characters sometimes used in place of a single quote.
+            // Report an ascii single quote so that span processing works.
+            if (asciirep)
+                *asciirep = '\'';
+            return c;
        } else if (sskip.find(c) != sskip.end()) {
            return SKIP;
        } else if (spunc.find(c) != spunc.end()) {
@ -623,7 +630,8 @@ bool TextSplit::text_to_words(const string &in)
 #endif

        prev_csc = csc;
-	int cc = whatcc(c);
+        char asciirep = 0;
+        int cc = whatcc(c, &asciirep);

        switch (cc) {
        case SKIP:
@ -709,20 +717,6 @@ bool TextSplit::text_to_words(const string &in)
                }
            }
            goto SPACE;
-	    break;
-
-	case 0x2010:
-            // Hyphen is replaced with ascii minus
-	    if (m_wordLen != 0) {
-                // Treat '-' inside span as glue char
-                if (!doemit(false, it.getBpos()))
-                    return false;
-                m_inNumber = false;
-                m_span += '-';
-                m_wordStart++;
-                break;
-            }
-            goto SPACE;

        case '.':
        {
@ -768,6 +762,26 @@ bool TextSplit::text_to_words(const string &in)
        }
        break;

+        case 0x2010:
+        case 0x2019:
+        case 0x275c:
+        case 0x02bc:
+            // Unicode chars which we replace with ascii for
+            // processing (2010 -> '-', others -> '\''). It happens
+            // that they all work as glue chars and use the same code,
+            // but there might be cases needing different processing.
+            // The ascii replacement was stored in asciirep by whatcc().
+            if (m_wordLen) {
+                // Inside span: glue char
+                if (!doemit(false, it.getBpos()))
+                    return false;
+                m_inNumber = false;
+                m_span += asciirep;
+                m_wordStart++;
+                break;
+            }
+            goto SPACE;
+
        case '@':
        case '_':
        case '\'':
@ -1101,250 +1115,3 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
    return u8stringToStrings<vector<string> >(s, tokens);
 }

-#else  // TEST driver ->
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <math.h>
-
-#include <iostream>
-
-#include "textsplit.h"
-#include "readfile.h"
-#include "log.h"
-
-#include "transcode.h"
-#include "unacpp.h"
-#include "termproc.h"
-
-using namespace std;
-
-class myTermProc : public Rcl::TermProc {
-    int first;
-    bool nooutput;
-public:
-    myTermProc() : TermProc(0), first(1), nooutput(false) {}
-    void setNoOut(bool val) {nooutput = val;}
-    virtual bool takeword(const string &term, int pos, int bs, int be)
-    {
-	if (nooutput)
-	    return true;
-	FILE *fp = stdout;
-	if (first) {
-	    fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
-	    first = 0;
-	}
-	fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
-	return true;
-    }
-};
-
-#define OPT_s	  0x1 
-#define OPT_w	  0x2
-#define OPT_q	  0x4
-#define OPT_c     0x8
-#define OPT_k     0x10
-#define OPT_C     0x20
-#define OPT_n     0x40
-#define OPT_S     0x80
-#define OPT_u     0x100
-#define OPT_p     0x200
-
-bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
-{
-    myTermProc printproc;
-
-    Rcl::TermProc *nxt = &printproc;
-
-//    Rcl::TermProcCommongrams commonproc(nxt, stoplist);
-//    if (op_flags & OPT_S)
-//        nxt = &commonproc;
-
-    Rcl::TermProcPrep preproc(nxt);
-    if (op_flags & OPT_u) 
-        nxt = &preproc;
-
-    Rcl::TextSplitP splitter(nxt, flags);
-
-    if (op_flags & OPT_q)
-        printproc.setNoOut(true);
-
-    splitter.text_to_words(data);
-
-#ifdef TEXTSPLIT_STATS
-	TextSplit::Stats::Values v = splitter.getStats();
-	cout << "Average length: " 
-	     <<  v.avglen
-	     << " Standard deviation: " 
-	     << v.sigma
-	     << " Coef of variation "
-	     << v.sigma / v.avglen
-	     << endl;
-#endif
-    return true;
-}
-
-static const char *teststrings[] = {
-    "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n",
-    "\"Jean-Francois Dockes\" <jfd@okyz.com>\n",
-    "n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'",
-    "_network_ some_span",
-    "data123\n",
-    "134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n",
-    "@^#$(#$(*)\n",
-    "192.168.4.1 one\n\rtwo\r",
-    "[olala][ululu]  (valeur) (23)\n",
-    "utf-8 ucs-4© \\nodef\n",
-    "A b C 2 . +",
-    "','this\n",
-    " ,able,test-domain",
-    " -wl,--export-dynamic",
-    " ~/.xsession-errors",
-    "this_very_long_span_this_very_long_span_this_very_long_span",
-    "soft\xc2\xadhyphen",
-    "soft\xc2\xad\nhyphen",
-    "soft\xc2\xad\n\rhyphen",
-    "real\xe2\x80\x90hyphen",
-    "real\xe2\x80\x90\nhyphen",
-    "hyphen-\nminus",
-};
-const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
-
-static string teststring1 = " nouvel-an ";
-
-static string thisprog;
-
-static string usage =
-    " textsplit [opts] [filename]\n"
-    "   -q : no output\n"
-    "   -s :  only spans\n"
-    "   -w :  only words\n"
-    "   -n :  no numbers\n"
-    "   -k :  preserve wildcards (?*)\n"
-    "   -c : just count words\n"
-    "   -u : use unac\n"
-    "   -C [charset] : input charset\n"
-    "   -S [stopfile] : stopfile to use for commongrams\n"
-    " if filename is 'stdin', will read stdin for data (end with ^D)\n\n"
-    " textplit -p somephrase : display results from stringToStrings()\n"
-    "  \n"
-    ;
-
-static void
-Usage(void)
-{
-    cerr << thisprog  << ": usage:\n" << usage;
-    exit(1);
-}
-
-static int        op_flags;
-
-int main(int argc, char **argv)
-{
-    string charset, stopfile;
-
-    thisprog = argv[0];
-    argc--; argv++;
-
-    while (argc > 0 && **argv == '-') {
-	(*argv)++;
-	if (!(**argv))
-	    /* Cas du "adb - core" */
-	    Usage();
-	while (**argv)
-	    switch (*(*argv)++) {
-	    case 'c':	op_flags |= OPT_c; break;
-            case 'C':	op_flags |= OPT_C; if (argc < 2)  Usage();
-                charset = *(++argv); argc--; 
-                goto b1;
-	    case 'k':	op_flags |= OPT_k; break;
-	    case 'n':	op_flags |= OPT_n; break;
-	    case 'p':	op_flags |= OPT_p; break;
-	    case 'q':	op_flags |= OPT_q; break;
-	    case 's':	op_flags |= OPT_s; break;
-            case 'S':	op_flags |= OPT_S; if (argc < 2)  Usage();
-                stopfile = *(++argv); argc--; 
-                goto b1;
-	    case 'u':	op_flags |= OPT_u; break;
-	    case 'w':	op_flags |= OPT_w; break;
-	    default: Usage();	break;
-	    }
-    b1: argc--; argv++;
-    }
-
-    TextSplit::Flags flags = TextSplit::TXTS_NONE;
-
-    if (op_flags&OPT_s)
-	flags = TextSplit::TXTS_ONLYSPANS;
-    else if (op_flags&OPT_w)
-	flags = TextSplit::TXTS_NOSPANS;
-    if (op_flags & OPT_k) 
-	flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); 
-    if (op_flags & OPT_n)
-	TextSplit::noNumbers();
-
-    Rcl::StopList stoplist;
-    if (op_flags & OPT_S) {
-	if (!stoplist.setFile(stopfile)) {
-	    cerr << "Can't read stopfile: " << stopfile << endl;
-	    exit(1);
-	}
-    }
-    string odata, reason;
-    if (argc == 1) {
-	const char *filename = *argv++;	argc--;
-        if (op_flags& OPT_p) {
-            vector<string> tokens;
-            TextSplit::stringToStrings(filename, tokens);
-            for (vector<string>::const_iterator it = tokens.begin();
-                 it != tokens.end(); it++) {
-                cout << "[" << *it << "] ";
-            }
-            cout << endl;
-            exit(0);
-        }
-	if (!strcmp(filename, "stdin")) {
-	    char buf[1024];
-	    int nread;
-	    while ((nread = read(0, buf, 1024)) > 0) {
-		odata.append(buf, nread);
-	    }
-	} else if (!file_to_string(filename, odata, &reason)) {
-            cerr << "Failed: file_to_string(" << filename << ") failed: " 
-                 << reason << endl;
-	    exit(1);
-        }
-    } else {
-        if (op_flags & OPT_p)
-            Usage();
-        for (int i = 0; i < teststrings_cnt; i++) {
-            cout << endl << teststrings[i] << endl;  
-            dosplit(teststrings[i], flags, op_flags);
-        }
-        exit(0);
-    }
-
-    string& data = odata;
-    string ndata;
-    if ((op_flags & OPT_C)) {
-        if (!transcode(odata, ndata, charset, "UTF-8")) {
-            cerr << "Failed: transcode error" << endl;
-            exit(1);
-        } else {
-            data = ndata;
-        }
-    }
-
-    if (op_flags & OPT_c) {
-	int n = TextSplit::countWords(data, flags);
-	cout << n << " words" << endl;
-    } else {
-        dosplit(data, flags, op_flags);
-    }    
-}
-#endif // TEST
-
--- a/tests/html/html.sh
+++ b/tests/html/html.sh
@ -11,7 +11,13 @@ recollq '"This is the Mysql reference manual"'
 # Tests that the charset spec is correctly recognised inside badhtml.html
 recollq -a 'etonne badhtml' 
 # Tests field extraction/storage and indexing
-recollq -m -q "testfield:testfieldvalue" | egrep 'results|^text/html|^testfield ='
+recollq -m -q "testfield:testfieldvalue" | \
+    egrep 'results|^text/html|^testfield ='
+
+# Not specifically HTML. apos.html has text where an apostrophe-like
+# Unicode character is used in place of the ASCII single quote. Checks
+# that we replace the character so that span processing works properly.
+recollq -q '"'imperfections de l"'"oeil'"'

 # more unaccenting tests
 recollq -q 'effaranteUTF8HTML'
--- a/tests/html/html.txt
+++ b/tests/html/html.txt
@ -11,6 +11,8 @@ text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html]	["
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html]	[htmlfield.html]	137	bytes	
 testfield = testfieldvalue
 1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/apos.html]	[apos.html]	344	bytes	
+1 results
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html]	[Some chars]	330	bytes	
 1 results
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html]	[Some chars]	330	bytes