From e0bc65bfddb6cf7285a9074833dba96072513f51 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 13 Sep 2012 12:25:01 +0200 Subject: [PATCH] small mods innocuous or auxiliary to case/diac sensitivity but which can live in main branch --- src/common/autoconfig.h.in | 6 + src/configure | 261 ++++++++++++++++++++------------- src/configure.ac | 39 ++++- src/index/fsindexer.cpp | 2 - src/index/indexer.cpp | 7 +- src/internfile/mimehandler.cpp | 9 -- src/query/recollq.cpp | 2 + src/query/wasatorcl.cpp | 29 ++-- src/rcldb/searchdata.h | 2 +- src/sampleconf/recoll.conf.in | 10 +- src/utils/hldata.h | 4 +- src/utils/utf8iter.h | 35 ++--- 12 files changed, 247 insertions(+), 159 deletions(-) diff --git a/src/common/autoconfig.h.in b/src/common/autoconfig.h.in index 41a7691e..e6b04112 100644 --- a/src/common/autoconfig.h.in +++ b/src/common/autoconfig.h.in @@ -96,6 +96,12 @@ /* Use file extended attributes */ #undef RCL_USE_XATTR +/* Use multiple threads for indexing */ +#undef IDX_THREADS + +/* Remove case and accents from terms */ +#undef RCL_INDEX_STRIPCHARS + /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS diff --git a/src/configure b/src/configure index 22c481f3..894af913 100755 --- a/src/configure +++ b/src/configure @@ -1,11 +1,9 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.68 for Recoll 1.18.0. +# Generated by GNU Autoconf 2.69 for Recoll 1.18.0. # # -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, -# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software -# Foundation, Inc. +# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # # # This configure script is free software; the Free Software Foundation @@ -134,6 +132,31 @@ export LANGUAGE # CDPATH. 
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH +# Use a proper internal environment variable to ensure we don't fall + # into an infinite loop, continuously re-executing ourselves. + if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then + _as_can_reexec=no; export _as_can_reexec; + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +as_fn_exit 255 + fi + # We don't want this to propagate to other subprocesses. + { _as_can_reexec=; unset _as_can_reexec;} if test "x$CONFIG_SHELL" = x; then as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : emulate sh @@ -167,7 +190,8 @@ if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : else exitcode=1; echo positional parameters were not saved. 
fi -test x\$exitcode = x0 || exit 1" +test x\$exitcode = x0 || exit 1 +test -x / || exit 1" as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && @@ -212,21 +236,25 @@ IFS=$as_save_IFS if test "x$CONFIG_SHELL" != x; then : - # We cannot yet assume a decent shell, so we have to provide a - # neutralization value for shells without unset; and this also - # works around shells that cannot unset nonexistent variables. - # Preserve -v and -x to the replacement shell. - BASH_ENV=/dev/null - ENV=/dev/null - (unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV - export CONFIG_SHELL - case $- in # (((( - *v*x* | *x*v* ) as_opts=-vx ;; - *v* ) as_opts=-v ;; - *x* ) as_opts=-x ;; - * ) as_opts= ;; - esac - exec "$CONFIG_SHELL" $as_opts "$as_myself" ${1+"$@"} + export CONFIG_SHELL + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 fi if test x$as_have_required = xno; then : @@ -328,6 +356,14 @@ $as_echo X"$as_dir" | } # as_fn_mkdir_p + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. 
+as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p # as_fn_append VAR VALUE # ---------------------- # Append the text in VALUE to the end of the definition contained in VAR. Take @@ -449,6 +485,10 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits chmod +x "$as_me.lineno" || { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + # If we had to re-execute with $CONFIG_SHELL, we're ensured to have + # already done that, so ensure we don't try to do so again and fall + # in an infinite loop. This has already happened in practice. + _as_can_reexec=no; export _as_can_reexec # Don't try to exec as it changes $[0], causing all sort of problems # (the dirname of $[0] is not the place where we might find the # original and so on. Autoconf is especially sensitive to this). @@ -483,16 +523,16 @@ if (echo >conf$$.file) 2>/dev/null; then # ... but there are two gotchas: # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. + # In both cases, we have to default to `cp -pR'. ln -s conf$$.file conf$$.dir 2>/dev/null && test ! 
-f conf$$.exe || - as_ln_s='cp -p' + as_ln_s='cp -pR' elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else - as_ln_s='cp -p' + as_ln_s='cp -pR' fi else - as_ln_s='cp -p' + as_ln_s='cp -pR' fi rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file rmdir conf$$.dir 2>/dev/null @@ -504,28 +544,8 @@ else as_mkdir_p=false fi -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in #( - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #(( - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x +as_test_x='test -x' +as_executable_p=as_fn_executable_p # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" @@ -693,7 +713,9 @@ with_aspell with_inotify with_fam enable_xattr +enable_idxthreads enable_camelcase +enable_stripchars enable_python_module enable_pic enable_qtgui @@ -1171,8 +1193,6 @@ target=$target_alias if test "x$host_alias" != x; then if test "x$build_alias" = x; then cross_compiling=maybe - $as_echo "$as_me: WARNING: if you wanted to set the --build type, don't use --host. - If a cross compiler is detected then cross compile mode will be used" >&2 elif test "x$build_alias" != "x$host_alias"; then cross_compiling=yes fi @@ -1337,6 +1357,8 @@ Optional Features: creates them on (part of) your data set. You also need to set up appropriate mappings in the configuration. + --enable-idxthreads Enable multithread indexing. This can somewhat boost + indexing performance. --enable-camelcase Enable splitting camelCase words. 
This is not enabled by default as this makes phrase matches more difficult: you need to use matching case in the @@ -1344,12 +1366,16 @@ Optional Features: manual" and "my sql manual" are the same, but not the same as "mysql manual" (in phrases only and you could raise the phrase slack to get a match). + --enable-stripchars Remove diacritics and fold character case in indexed + terms. This will yield less precise searches but the + index will be smaller --disable-python-module Do not build the Python module. - --enable-pic Do not compile library objects as position + --disable-pic Do not compile library objects as position independant code. This is incompatible with the php or python extensions. --disable-qtgui Disable the QT-based graphical user interface. - --disable-webkit Disable use of qt-webkit. + --disable-webkit Disable use of qt-webkit (only meaningful if qtgui + is enabled). --disable-x11mon Disable recollindex support for X11 session monitoring. @@ -1361,7 +1387,7 @@ Optional Packages: --without-aspell Disable use of aspell spelling package to provide term expansion to other spellings --with-inotify Use inotify for almost real time indexing of - modified files. + modified files (the default is yes on Linux). --with-fam Use File Alteration Monitor for almost real time indexing of modified files. Give the fam/gamin library as argument (ie: /usr/lib/libfam.so) if @@ -1451,9 +1477,9 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF Recoll configure 1.18.0 -generated by GNU Autoconf 2.68 +generated by GNU Autoconf 2.69 -Copyright (C) 2010 Free Software Foundation, Inc. +Copyright (C) 2012 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. _ACEOF @@ -1764,7 +1790,7 @@ $as_echo "$ac_try_echo"; } >&5 test ! 
-s conftest.err } && test -s conftest$ac_exeext && { test "$cross_compiling" = yes || - $as_test_x conftest$ac_exeext + test -x conftest$ac_exeext }; then : ac_retval=0 else @@ -1877,7 +1903,7 @@ $as_echo "$ac_try_echo"; } >&5 test ! -s conftest.err } && test -s conftest$ac_exeext && { test "$cross_compiling" = yes || - $as_test_x conftest$ac_exeext + test -x conftest$ac_exeext }; then : ac_retval=0 else @@ -2004,7 +2030,7 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. It was created by Recoll $as_me 1.18.0, which was -generated by GNU Autoconf 2.68. Invocation command line was +generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2384,7 +2410,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -2428,7 +2454,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CXX="$ac_prog" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -2907,7 +2933,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CC="${ac_tool_prefix}gcc" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -2947,7 +2973,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CC="gcc" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -3000,7 +3026,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CC="${ac_tool_prefix}cc" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -3041,7 +3067,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then ac_prog_rejected=yes continue @@ -3099,7 +3125,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CC="$ac_tool_prefix$ac_prog" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -3143,7 +3169,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CC="$ac_prog" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -3339,8 +3365,7 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include #include -#include -#include +struct stat; /* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ struct buf { int x; }; FILE * (*rcsopen) (struct buf *, struct stat *, int); @@ -3622,6 +3647,8 @@ _ACEOF esac rm -rf conftest* fi + + fi @@ -3780,7 +3807,7 @@ do for ac_prog in grep ggrep; do for ac_exec_ext in '' $ac_executable_extensions; do ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" - { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue + as_fn_executable_p "$ac_path_GREP" || continue # Check for GNU ac_path_GREP and select it if it is found. # Check for GNU $ac_path_GREP case `"$ac_path_GREP" --version 2>&1` in @@ -3846,7 +3873,7 @@ do for ac_prog in egrep; do for ac_exec_ext in '' $ac_executable_extensions; do ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" - { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue + as_fn_executable_p "$ac_path_EGREP" || continue # Check for GNU ac_path_EGREP and select it if it is found. # Check for GNU $ac_path_EGREP case `"$ac_path_EGREP" --version 2>&1` in @@ -4084,7 +4111,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_path_fileProg="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -4160,7 +4187,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_path_aspellProg="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -4313,7 +4340,8 @@ _ACEOF fi # Enable use of file extended attributes. -# Not by default as these are little used for now. +# Not by default as extended attributes are little used for now, and +# looking for them is not completely trivial # Check whether --enable-xattr was given. if test "${enable_xattr+set}" = set; then : enableval=$enable_xattr; xattrEnabled=$enableval @@ -4328,6 +4356,26 @@ $as_echo "#define RCL_USE_XATTR 1" >>confdefs.h fi +# Enable use of threads in the indexing pipeline. +# Threads are used in bucket-brigade fashion for the processing steps +# (reading file - text splitting - indexing proper). The performance +# increase can be significant, but this is disabled by default as we +# usually care little about indexing absolute performance (more about +# impact on usability and total resources used). +# Check whether --enable-idxthreads was given. +if test "${enable_idxthreads+set}" = set; then : + enableval=$enable_idxthreads; idxthreadsEnabled=$enableval +else + idxthreadsEnabled=no +fi + + +if test X$idxthreadsEnabled = Xyes ; then + +$as_echo "#define IDX_THREADS 1" >>confdefs.h + +fi + # Enable CamelCase word splitting. 
This is optional because it causes # problems with phrases: with camelcase enabled, "MySQL manual" # will be matched by "MySQL manual" and "my sql manual" but not @@ -4347,6 +4395,21 @@ $as_echo "#define RCL_SPLIT_CAMELCASE 1" >>confdefs.h fi +# Not by default as these are little used for now. +# Check whether --enable-stripchars was given. +if test "${enable_stripchars+set}" = set; then : + enableval=$enable_stripchars; stripcharsEnabled=$enableval +else + stripcharsEnabled=no +fi + + +if test X$stripcharsEnabled = Xyes ; then + +$as_echo "#define RCL_INDEX_STRIPCHARS 1" >>confdefs.h + +fi + # Disable building the python module. This is built by default, because # it's really the easiest way to interface and extend recoll. It forces PIC # objects for everything (indexing performance impact: 1%), because it's @@ -4555,7 +4618,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_path_XAPIAN_CONFIG="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -4676,7 +4739,7 @@ do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_path_QMAKEPATH="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 @@ -4722,7 +4785,7 @@ fi #echo "qmake version: $qmakevers" v4=`expr "$qmakevers" : '.*Qt *version *4.*'` if test X$v4 = X0 ; then - as_fn_error $? "qmake seems to indincate using Qt version 3 which is not supported any more" "$LINENO" 5 + as_fn_error $? 
"qmake seems to be using Qt version 3 which is not supported any more" "$LINENO" 5 QTGUI=qtgui else { $as_echo "$as_me:${as_lineno-$LINENO}: using qt version 4 user interface" >&5 @@ -4769,6 +4832,8 @@ fi QMAKE_DISABLE_WEBKIT="" fi + + ##### Using QZeitGeist lib ? Default no for now # Check whether --with-qzeitgeist was given. @@ -6110,16 +6175,16 @@ if (echo >conf$$.file) 2>/dev/null; then # ... but there are two gotchas: # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. + # In both cases, we have to default to `cp -pR'. ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -p' + as_ln_s='cp -pR' elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else - as_ln_s='cp -p' + as_ln_s='cp -pR' fi else - as_ln_s='cp -p' + as_ln_s='cp -pR' fi rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file rmdir conf$$.dir 2>/dev/null @@ -6179,28 +6244,16 @@ else as_mkdir_p=false fi -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in #( - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #(( - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +as_test_x='test -x' +as_executable_p=as_fn_executable_p # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" @@ -6222,7 +6275,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # values after options handling. 
ac_log=" This file was extended by Recoll $as_me 1.18.0, which was -generated by GNU Autoconf 2.68. Invocation command line was +generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS @@ -6284,10 +6337,10 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ Recoll config.status 1.18.0 -configured by $0, generated by GNU Autoconf 2.68, +configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" -Copyright (C) 2010 Free Software Foundation, Inc. +Copyright (C) 2012 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." @@ -6375,7 +6428,7 @@ fi _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 if \$ac_cs_recheck; then - set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion shift \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 CONFIG_SHELL='$SHELL' diff --git a/src/configure.ac b/src/configure.ac index a74642ce..41c45bb2 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -104,7 +104,8 @@ fi # Real time monitoring with inotify AC_ARG_WITH(inotify, AC_HELP_STRING([--with-inotify], - [Use inotify for almost real time indexing of modified files.]), + [Use inotify for almost real time indexing of modified files (the default + is yes on Linux).]), withInotify=$withval, withInotify=$inot_default) if test X$withInotify != Xno ; then @@ -163,7 +164,8 @@ if test X$withFam != Xno ; then fi # Enable use of file extended attributes. -# Not by default as these are little used for now. 
+# Not by default as extended attributes are little used for now, and +# looking for them is not completely trivial AC_ARG_ENABLE(xattr, AC_HELP_STRING([--enable-xattr], [Enable fetching metadata from file extended attributes. This is only @@ -175,6 +177,22 @@ if test X$xattrEnabled = Xyes ; then AC_DEFINE(RCL_USE_XATTR, 1, [Use file extended attributes]) fi +# Enable use of threads in the indexing pipeline. Threads are used in +# bucket-brigade fashion for the processing steps (reading file - text +# splitting - indexing proper). The performance increase is small in normal +# case (might be a bit more significant if you're using an SSD), and this +# is disabled by default as we usually care little about indexing absolute +# performance (more about impact on usability and total resources used). +AC_ARG_ENABLE(idxthreads, + AC_HELP_STRING([--enable-idxthreads], + [Enable multithread indexing. This can somewhat boost indexing + performance.]), + idxthreadsEnabled=$enableval, idxthreadsEnabled=no) + +if test X$idxthreadsEnabled = Xyes ; then + AC_DEFINE(IDX_THREADS, 1, [Use multiple threads for indexing]) +fi + # Enable CamelCase word splitting. This is optional because it causes # problems with phrases: with camelcase enabled, "MySQL manual" # will be matched by "MySQL manual" and "my sql manual" but not @@ -194,6 +212,17 @@ if test X$camelcaseEnabled = Xyes ; then AC_DEFINE(RCL_SPLIT_CAMELCASE, 1, [Split camelCase words]) fi +# Strip diacritics and fold character case in indexed terms. Not enabled by default. +AC_ARG_ENABLE(stripchars, + AC_HELP_STRING([--enable-stripchars], + [Remove diacritics and fold character case in indexed terms. This will + yield less precise searches but the index will be smaller]), + stripcharsEnabled=$enableval, stripcharsEnabled=no) + +if test X$stripcharsEnabled = Xyes ; then + AC_DEFINE(RCL_INDEX_STRIPCHARS, 1, [Remove case and accents from terms]) +fi + # Disable building the python module. 
This is built by default, because # it's really the easiest way to interface and extend recoll. It forces PIC # objects for everything (indexing performance impact: 1%), because it's @@ -214,7 +243,7 @@ fi # Build PIC objects for the library ? AC_ARG_ENABLE(pic, - AC_HELP_STRING([--enable-pic], + AC_HELP_STRING([--disable-pic], [Do not compile library objects as position independant code. This is incompatible with the php or python extensions.]), picEnabled=$enableval, picEnabled=forpython) @@ -433,7 +462,7 @@ else ##### Using Qt webkit for reslist display? Else Qt textbrowser AC_ARG_ENABLE(webkit, AC_HELP_STRING([--disable-webkit], - [Disable use of qt-webkit.]), + [Disable use of qt-webkit (only meaningful if qtgui is enabled).]), enableWebkit=$enableval, enableWebkit="yes") if test "$enableWebkit" = "yes" ; then @@ -444,6 +473,8 @@ else QMAKE_DISABLE_WEBKIT="" fi + + ##### Using QZeitGeist lib ? Default no for now AC_ARG_WITH(qzeitgeist, AC_HELP_STRING([--with-qzeitgeist], diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 8ff85709..4a40644c 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -14,9 +14,7 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ -#ifdef HAVE_CONFIG_H #include "autoconfig.h" -#endif #include #include diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index bd64c975..da45cffa 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -258,10 +258,11 @@ bool ConfIndexer::createStemmingDatabases() bool ConfIndexer::createStemDb(const string &lang) { - if (!m_db.open(Rcl::Db::DbUpd)) { + if (!m_db.open(Rcl::Db::DbUpd)) return false; - } - return m_db.createStemDbs(vector(1, lang)); + vector langs; + stringToStrings(lang, langs); + return m_db.createStemDbs(langs); } // The language for the aspell dictionary is handled internally by the aspell diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index a9618478..83c464c7 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -274,15 +274,6 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, // We get here if there was no specific error, but there is no // identified mime type, or no handler associated. -#ifdef INDEX_UNKNOWN_TEXT_AS_PLAIN - // If the type is an unknown text/xxx, index as text/plain and - // hope for the best (this wouldn't work too well with text/rtf...) 
- if (mtype.find("text/") == 0) { - h = mhFactory(cstr_textplain); - goto out; - } -#endif - // Finally, unhandled files are either ignored or their name and // generic metadata is indexed, depending on configuration {bool indexunknown = false; diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index 5f6ff546..3ae2f54d 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -102,6 +102,8 @@ static char usage [] = " -m : dump the whole document meta[] array for each result\n" " -A : output the document abstracts\n" " -S fld : sort by field \n" +" -s stemlang : set stemming language to use (must exist in index...)\n" +" Use -s \"\" to turn off stem expansion\n" " -D : sort descending\n" " -i : additional index, several can be given\n" " -e use url encoding (%xx) for urls\n" diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp index a180b519..0cf238fc 100644 --- a/src/query/wasatorcl.cpp +++ b/src/query/wasatorcl.cpp @@ -166,6 +166,9 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, } // "Regular" processing follows: + unsigned int mods = (unsigned int)(*it)->m_modifiers; + nclause = 0; + switch ((*it)->m_op) { case WasaQuery::OP_NULL: case WasaQuery::OP_AND: @@ -192,8 +195,6 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, } } - unsigned int mods = (unsigned int)(*it)->m_modifiers; - // I'm not sure I understand the phrase/near detection // thereafter anymore, maybe it would be better to have an // explicit flag. 
Mods can only be set after a double @@ -216,12 +217,6 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, LOGERR(("wasaQueryToRcl: out of memory\n")); return 0; } - if (mods & WasaQuery::WQM_NOSTEM) { - nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); - } - if ((*it)->m_weight != 1.0) - nclause->setWeight((*it)->m_weight); - sdata->addClause(nclause); } break; @@ -248,11 +243,6 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, LOGERR(("wasaQueryToRcl: out of memory\n")); return 0; } - if ((*it)->m_modifiers & WasaQuery::WQM_NOSTEM) - nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING); - if ((*it)->m_weight != 1.0) - nclause->setWeight((*it)->m_weight); - sdata->addClause(nclause); break; case WasaQuery::OP_OR: @@ -272,10 +262,17 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, reason = "Out of memory"; return 0; } - if ((*it)->m_modifiers & WasaQuery::WQM_NOSTEM) - nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING); - sdata->addClause(nclause); } + + if (mods & WasaQuery::WQM_NOSTEM) + nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); + if (mods & WasaQuery::WQM_DIACSENS) + nclause->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS); + if (mods & WasaQuery::WQM_CASESENS) + nclause->addModifier(Rcl::SearchDataClause::SDCM_CASESENS); + if ((*it)->m_weight != 1.0) + nclause->setWeight((*it)->m_weight); + sdata->addClause(nclause); } return sdata; diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index a619b098..348d581e 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -179,7 +179,7 @@ private: class SearchDataClause { public: enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2, - SDCM_ANCHOREND=4}; + SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16}; SearchDataClause(SClType tp) : m_tp(tp), m_parentSearch(0), m_haveWildCards(0), diff --git a/src/sampleconf/recoll.conf.in 
b/src/sampleconf/recoll.conf.in index 975cc8d5..d79eeb0d 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -16,7 +16,8 @@ topdirs = ~ # ignore. If you need index mozilla/thunderbird mail folders, don't put # ".*" in there (as was the case with an older sample config) # These are simple names, not paths (must contain no / ) -skippedNames = #* bin CVS Cache cache* caughtspam tmp .thumbnails .svn \ +skippedNames = #* bin CVS Cache cache* .cache caughtspam tmp \ + .thumbnails .svn \ *~ .beagle .git .hg .bzr loop.ps .xsession-errors \ .recoll* xapiandb recollrc recoll.conf @@ -73,8 +74,13 @@ indexstemminglanguages = english # first element and the translation following. The handling of both the # lowercase and upper-case versions of a character should be specified, as # appartenance to the list will turn-off both standard accent and case -# processing. Example for Swedish: +# processing. Examples: +# Swedish: # unac_except_trans = åå Åå ää Ää öö Öö +# German: +# unac_except_trans = Ää Öö Üü ää öö üü ßss +# In French, you probably want to decompose oe and ae +# unac_except_trans = œoe Œoe æae Æae # Where to store the database (directory). This may be an absolute path, # else it is taken as relative to the configuration directory (-c argument diff --git a/src/utils/hldata.h b/src/utils/hldata.h index 372c37a7..06b04a17 100644 --- a/src/utils/hldata.h +++ b/src/utils/hldata.h @@ -24,7 +24,9 @@ struct HighlightData { /** Processed/expanded terms and groups. Used for looking for * regions to highlight. Terms are just groups with 1 entry. All - * terms in there are unaccented, and the list may include values + * terms are transformed to be compatible with index content + * (unaccented and lowercased as needed depending on + * configuration), and the list may include values * expanded from the original terms by stem or wildcard expansion. 
*/ std::vector > groups; diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index 335d9da0..cf7e9c5c 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -20,6 +20,7 @@ #ifdef UTF8ITER_CHECK #include "assert.h" #endif +#include /** * A small helper class to iterate over utf8 strings. This is not an @@ -30,13 +31,13 @@ */ class Utf8Iter { public: - Utf8Iter(const string &in) + Utf8Iter(const std::string &in) : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false) { update_cl(); } - const string& buffer() const {return m_s;} + const std::string& buffer() const {return m_s;} void rewind() { @@ -52,7 +53,7 @@ public: * current position */ unsigned int operator[](unsigned int charpos) const { - string::size_type mypos = 0; + std::string::size_type mypos = 0; unsigned int mycp = 0; if (charpos >= m_charpos) { mypos = m_pos; @@ -75,7 +76,7 @@ public: } /** Increment current position to next utf-8 char */ - string::size_type operator++(int) + std::string::size_type operator++(int) { // Note: m_cl may be zero at eof if user's test not right // this shouldn't crash the program until actual data access @@ -83,7 +84,7 @@ public: assert(m_cl != 0); #endif if (m_cl <= 0) - return string::npos; + return std::string::npos; m_pos += m_cl; m_charpos++; @@ -102,7 +103,7 @@ public: /** Append current utf-8 possibly multi-byte character to string param. This needs to be fast. No error checking. 
*/ - unsigned int appendchartostring(string &out) { + unsigned int appendchartostring(std::string &out) { #ifdef UTF8ITER_CHECK assert(m_cl != 0); #endif @@ -111,7 +112,7 @@ public: } /** Return current character as string */ - operator string() { + operator std::string() { #ifdef UTF8ITER_CHECK assert(m_cl != 0); #endif @@ -127,39 +128,39 @@ public: } /** Return current byte offset in input string */ - string::size_type getBpos() const { + std::string::size_type getBpos() const { return m_pos; } /** Return current character length */ - string::size_type getBlen() const { + std::string::size_type getBlen() const { return m_cl; } /** Return current unicode character offset in input string */ - string::size_type getCpos() const { + std::string::size_type getCpos() const { return m_charpos; } private: // String we're working with - const string& m_s; + const std::string& m_s; // Character length at current position. A value of zero indicates // an error. unsigned int m_cl; // Current byte offset in string. - string::size_type m_pos; + std::string::size_type m_pos; // Current character position unsigned int m_charpos; // Am I ok ? mutable bool m_error; // Check position and cl against string length - bool poslok(string::size_type p, int l) const { + bool poslok(std::string::size_type p, int l) const { #ifdef UTF8ITER_CHECK - assert(p != string::npos && l > 0 && p + l <= m_s.length()); + assert(p != std::string::npos && l > 0 && p + l <= m_s.length()); #endif - return p != string::npos && l > 0 && p + l <= m_s.length(); + return p != std::string::npos && l > 0 && p + l <= m_s.length(); } // Update current char length in object state, minimum checking @@ -180,7 +181,7 @@ private: } // Get character byte length at specified position. Returns 0 for error. 
- inline int get_cl(string::size_type p) const + inline int get_cl(std::string::size_type p) const { unsigned int z = (unsigned char)m_s[p]; if (z <= 127) { @@ -200,7 +201,7 @@ private: } // Compute value at given position. No error checking. - inline unsigned int getvalueat(string::size_type p, int l) const + inline unsigned int getvalueat(std::string::size_type p, int l) const { switch (l) { case 1: