diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index b59ce064..424cb5e2 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -1,6 +1,6 @@ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.10 2006-01-04 11:33:44 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.11 2006-01-10 11:07:21 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -37,7 +37,6 @@ class RclConfig { { keydir = dir; conf->get("defaultcharset", defcharset, keydir); - conf->get("defaultlanguage", deflang, keydir); string str; conf->get("guesscharset", str, keydir); guesscharset = stringToBool(str); @@ -63,7 +62,6 @@ class RclConfig { string getMimeIconName(const string &mtype); const string &getDefCharset() {return defcharset;} - const string &getDefLang() {return deflang;} bool getGuessCharset() {return guesscharset;} std::list getAllMimeTypes(); @@ -81,7 +79,6 @@ class RclConfig { // Parameters auto-fetched on setkeydir string defcharset; // These are stored locally to avoid - string deflang; // a config lookup each time. bool guesscharset; // They are fetched initially or on setKeydir() }; diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 603d4f88..3632fc9b 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -24,10 +24,10 @@ Dockes - $Id: usermanual.sgml,v 1.1 2006-01-04 11:09:53 dockes Exp $ + $Id: usermanual.sgml,v 1.2 2006-01-10 11:07:21 dockes Exp $ - The &RCL; user manual introduces full text search notions + This document introduces full text search notions and describes the installation and use of the &RCL; application. @@ -40,8 +40,8 @@ Giving it a try - If you do not like reading manuals and would like to give - &RCL; a try, just perform If you do not like reading manuals (who does?) and would + like to give &RCL; a try, just perform installation and start the recoll user interface, which will index your home directory and let you search it right after. 
@@ -62,13 +62,14 @@ Full text search - Full text search applications allow you to find your data - by content rather than by external attributes (like a file - name). More specifically, they will let you specify words - (terms) that should or should not appear in the text you are - looking for, and return a list of matching documents, ordered - so that the most relevant documents will - appear first. + &RCL; is a full text search application. Full text search + applications let you find your data by content rather + than by external attributes (like a file name). More + specifically, they will let you specify words (terms) that + should or should not appear in the text you are looking for, + and return a list of matching documents, ordered so that the + most relevant documents will appear + first. You do not need to remember in what file or email message you stored a given piece of information. You just ask for related @@ -84,17 +85,17 @@ guess is probably the most important element for a search application. - In many cases, one is looking for all the forms of a word, - not for a specific form or spelling. These different forms may include - plurals, different tenses for a verb, or terms derived from - the same root or stem (exemple: floor, - floors, floored, floorings...). &RCL; will by default expand - queries to all such related terms (words that reduce to the - same stem). This expansion can be disabled at search - time. + In many cases, you are looking for all the forms of a + word, not for a specific form or spelling. These different + forms may include plurals, different tenses for a verb, or + terms derived from the same root or stem + (example: floor, floors, floored, floorings...). &RCL; will by + default expand queries to all such related terms (words that + reduce to the same stem). This expansion can be disabled at + search time. Stemming, by itself, does not provide for misspellings or - phonetic searches. &RCL; does not support these currently. 
+ phonetic searches. &RCL; currently does not support these. @@ -102,7 +103,7 @@ Recoll overview - &RCL; is a full text search application which uses the + &RCL; uses the &XAP; information retrieval library as its storage and retrieval engine. &XAP; is a very mature package using Stemming depends on the document language. &RCL; stores the unstemmed versions of terms and uses auxiliary databases for - term expansion. It can switch stemming languages without reindexing. - Storing documents in different languages in the same - database is possible, and useful in practice, but does introduce - possibilities of confusion. &RCL; makes no attempt at automatic - language recognition. + term expansion. It can switch stemming languages, or add a + language, without reindexing. Storing documents in different + languages in the same database is possible, and useful in + practice, but does introduce possibilities of confusion. &RCL; + makes no attempt at automatic language recognition. &RCL; has many parameters which define exactly what to index, and how to classify and decode the source @@ -146,9 +147,9 @@ Indexation is started automatically the first time you execute the recoll search graphical user interface, or by - executing the recollindex. + executing the recollindex command. - Searches are + Searches are performed inside the recoll program, which has many options to help you find what you are looking for. @@ -168,12 +169,14 @@ is normally incremental: documents will only be processed if they have been modified. On the first execution, of course, all documents will need processing. A full index build can be forced - later on by specifying an option to the indexation command. + later on by specifying an option to the indexation command + (recollindex -z). - &RCL; indexation takes place at discrete times. There is no + &RCL; indexation takes place at discrete times. There is currently no interface to real time file modification monitors. 
The typical usage is to have a nightly indexation run - programmed into your cron file. + programmed into your + cron file. &RCL; knows about quite a few different document types. The parameters for document types recognition and @@ -184,13 +187,19 @@ many individually indexed documents. + &RCL; indexation processes plain text, HTML, openoffice + and e-mail files internally. Other types (ie: postscript, pdf, + ms-word, rtf) need external applications for preprocessing. The + list is in the installation + section. + Without further configuration, &RCL; will index all appropriate files from your home directory, with a reasonable set of defaults, if you live in western Europe or the USA. If your normal character set is not iso8859-1, you almost certainly need to adjust the configuration. - @@ -208,9 +217,9 @@ Cancel when the program asks if it should start initial indexation. - You can also have a look to the configuration overview inside - the installation chapter of this document. + The configuration is also documented inside the installation chapter of + this document, or in the recoll.conf(5) man page. @@ -229,7 +238,7 @@ It is best to avoid interrupting the indexation process, as this may sometimes leave the database in a bad state. This is not a serious problem, as you then just need to clear - everything and restart the indexation. The database files are + everything and restart the indexation: the database files are normally stored in the $HOME/.recoll/xapiandb directory, which you can just delete if needed. Alternatively, you can @@ -259,10 +268,14 @@ - - Searching + + Search - + The recoll program provides the user + interface for searching. It is based on the + QT library. + + Simple search Start the recoll program, then @@ -287,12 +300,12 @@ By default, the document list is presented in order of relevance (how well the system estimates that the document matches the query). 
You can specify a different ordering by - using the Tools - / Sort parameters dialog. + using the Tools + / Sort parameters dialog. - + Complex/advanced search The advanced search dialog has fields that will allow a more @@ -308,10 +321,13 @@ It will let you restrict the search results to a subtree of the indexed area. - In other respects, it works like the simple search. + Click on the Start Search button in + the advanced search dialog to start the search. The button in + the main window always performs a simple search. + - + Document history Documents that you actually view (with the internal preview @@ -322,7 +338,26 @@ - + + Result list sorting + + The documents in a result list are normally sorted in + order of relevance. It is possible to specify different sort + parameters by using the Sort parameters + dialog (located in the Tools + menu). + + The tool sorts a specified number of the most + relevant documents in the result list, according to + specified criteria. The currently available criteria are + date and mime type. + + The sort parameters stay in effect until they are explicitly + reset, or the program exits. + + + + Search tips, shortcuts Disabling stem expansion @@ -331,7 +366,7 @@ gardening if you enter Garden instead of garden). This is the only case where - character case will make a difference for a &RCL; + character case should make a difference for a &RCL; search. Phrases @@ -354,8 +389,28 @@ - + + Customising the search interface + It is possible to customise some aspects of the search + interface by using the Query configuration entry + in the Preferences menu. + + There are two tabs in the dialog, to modify the appearance + of the user interface (result list appearance), or the + parameters used for searching (language used for stem + expansion). + + The stemming language can be chosen among those that were + specified in the configuration file, or later added with + recollindex -s (See the recollindex + manual). 
Stemming languages which are dynamically added will be + deleted at the next indexation pass unless they are also added in + the configuration file. + + + + @@ -493,9 +548,12 @@ Configuration overview - The personal configuration files and the database are kept in - the .recoll directory in your - home. If this directory does not exist when + The personal configuration files and the database are + normally kept in + the .recoll directory in your home (this + can be changed with the RECOLL_CONFDIR + environment variable, and a parameter inside the main + configuration file). If this directory does not exist when recoll or recollindex are started, the directory will be created and the sample configuration files will @@ -504,13 +562,6 @@ indexation. recollindex will proceed immediately. - &RCL; uses text - configuration files. You will have to edit them by hand for - now (there is still some hope for a GUI configuration tool - in the future). The most accurate documentation for the - configuraton parameters is given by comments inside the sample - files, and we will just give a general overview here. - Most of the parameters specific to the recoll GUI are set through the Preferences menu and stored in the @@ -518,15 +569,58 @@ ($HOME/.qt/recollrc). You probably do not want to edit this by hand. + For other options, &RCL; uses text configuration + files. You will have to edit them by hand for + now (there is still some hope for a GUI configuration tool + in the future). The most accurate documentation for the + configuration parameters is given by comments inside the sample + files, and we will just give a general overview here. + + All configuration files share the same format. For + example, a short extract of the main configuration file might + look as follows: + + # Space-separated list of directories to index. 
+ topdirs = ~/docs /usr/share/doc + + [~/somedirectory-with-utf8-txt-files] + defaultcharset = utf-8 + + + There are three kinds of lines: + + Comment (starts with + #) or empty. + + Parameter assignment (name = + value). + + Section definition + ([somedirname]). + + + + Section lines allow redefining some parameters for a + directory subtree. Some of the parameters used for indexation + are looked up hierarchically from the more to the less + specific. Not all parameters can be meaningfully redefined; + this is specified for each in the next section. + + The tilde character (~) is expanded in file names to the + name of the user's home directory. + + White space is used for separation inside lists. + Elements with embedded spaces can be quoted using + double-quotes. + Main configuration file ~/.recoll/recoll.conf is the main - configuration file. It defines + configuration file. It defines things like what to index (top directories and things to ignore), and the - default character set to use (for document types which do not - specify it internally). The default character set can be - specified separately for any directory subtree. + default character set to use for document types which do not + specify it internally. The default configuration will index your home directory. If this is not appropriate, use @@ -535,8 +629,103 @@ the configuration file before restarting the command. This will start the initial indexation, which may take some time. - There are also miscellaneous other parameters inside - recoll.conf. Explore and enjoy :) + Parameters: + + + + topdirs + Specifies the list of directories to index + (recursively). + + + + skippedNames + A space-separated list of patterns for + names of files or directories that should be completely + ignored. 
The list defined in the default file is: + +*~ #* bin CVS Cache caughtspam tmp + + The list can be redefined for subdirectories, but is only + actually changed for the top level ones in + topdirs + + + + loglevel + Verbosity level for recoll and + recollindex. A value of 4 lists quite a lot of + debug/information messages. 3 only lists errors. + + + + logfilename + Where should the messages go. 'stderr' can + be used as a special value. + + + + filtersdir + A directory to search for the external + filter scripts used to index some types of files. The + value should not be changed, except if you want to modify + one of the default scripts. The value can be redefined for + any subdirectory. + + + + indexstemminglanguages + A list of languages for which the stem + expansion databases will be built. See recollindex(1) for + possible values. You can add a stem expansion database for + a different language by using recollindex + -s, but it will be deleted during the next + indexation. Only languages listed in the configuration + file are permanent. + + + + iconsdir + The name of the directory where + recoll result list icons are + stored. You can change this if you want different + images. + + + + dbdir + The name of the Xapian database + directory. It will be created if needed when the database + is initialized. + + + + defaultcharset + The name of the character set used for + files that do not contain a character set definition (ie: + plain text files). This can be redefined for any + subdirectory. + + guesscharset + Decide if we try to guess the character + set of files if no internal value is available (ie: for + plain text files). This does not work well in general, and + should probably not be used. + + + + usesystemfilecommand + Decide if we use the file -i + system command as a final step for determining the mime + type for a file (the main procedure uses suffix + associations as defined in the mimemap + file). 
This can be useful for files with suffixless names, + but it will also cause the indexation of many bogus "text" + files. + + + + diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index d33ba70e..e6e6e87a 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.15 2006-01-10 11:07:21 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -84,22 +84,22 @@ static int op_flags; static const char usage [] = "\n" -"recollindex [-hz] \n" -" Normal index run\n" +"recollindex [-h] \n" +" Print help\n" +"recollindex [-z] \n" +" Index everything according to configuration file\n" +" -z : reset database before starting indexation\n" "recollindex -i \n" -" Index individual files. No db purge or stem database updates\n" +" Index individual files. No database purge or stem database updates\n" "recollindex -s \n" -" Build stem database for language \n" -"Options:\n" -" -h : print this message\n" -" -z : reset database before starting indexation\n\n" +" Build stem database for additional language \n" ; static void Usage(void) { FILE *fp = (op_flags & OPT_h) ? stdout : stderr; - fprintf(fp, "%s: usage: %s", thisprog, usage); + fprintf(fp, "%s: Usage: %s", thisprog, usage); exit((op_flags & OPT_h)==0); } diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index b3bdbdd8..19e0bd7b 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -1,4 +1,4 @@ -# @(#$Id: recoll.conf.in,v 1.5 2005-12-15 14:39:58 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: recoll.conf.in,v 1.6 2006-01-10 11:07:21 dockes Exp $ (C) 2004 J.F.Dockes # # Recoll default configuration file. This should be copied to # ~/.recoll/recoll.conf @@ -41,8 +41,6 @@ dbdir = ~/.recoll/xapiandb # below). 
Used when converting to utf-8 (internal storage format), so it # may be quite important for pure text files. defaultcharset = iso-8859-1 -# defaultlanguage is currently not used by the program. -defaultlanguage = english # Guessing charsets usually does not work well guesscharset = 0