textsplit: process unicode apostrophes and right quotation mark as ascii single quote

This commit is contained in:
Jean-Francois Dockes 2019-02-01 16:10:51 +01:00
parent b1ff34407d
commit bbeaebf632
3 changed files with 417 additions and 642 deletions

File diff suppressed because it is too large Load Diff

View File

@ -11,7 +11,13 @@ recollq '"This is the Mysql reference manual"'
# Tests that the charset spec is correctly recognised inside badhtml.html # Tests that the charset spec is correctly recognised inside badhtml.html
recollq -a 'etonne badhtml' recollq -a 'etonne badhtml'
# Tests field extraction/storage and indexing # Tests field extraction/storage and indexing
recollq -m -q "testfield:testfieldvalue" | egrep 'results|^text/html|^testfield =' recollq -m -q "testfield:testfieldvalue" | \
egrep 'results|^text/html|^testfield ='
# Not specifically HTML: apos.html contains text where an apostrophe-like
# Unicode character is used in place of the ASCII apostrophe ('). Checks that
# the character is replaced so that span processing works properly.
recollq -q '"'imperfections de l"'"oeil'"'
# more unaccenting tests # more unaccenting tests
recollq -q 'effaranteUTF8HTML' recollq -q 'effaranteUTF8HTML'

View File

@ -11,6 +11,8 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] ["
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes
testfield = testfieldvalue testfield = testfieldvalue
1 results 1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/apos.html] [apos.html] 344 bytes
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes
1 results 1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes