diff --git a/src/VERSION b/src/VERSION index 1c99cf0e..bc80560f 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.4 +1.5.0 diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 067a62c7..256cd321 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -24,7 +24,7 @@ Dockes - $Id: usermanual.sgml,v 1.16 2006-09-11 14:22:15 dockes Exp $ + $Id: usermanual.sgml,v 1.17 2006-09-15 16:50:44 dockes Exp $ This document introduces full text search notions @@ -42,18 +42,18 @@ If you do not like reading manuals (who does?) and would like to give &RCL; a try, just perform installation and start the + linkend="rcl.install.binary">installation and start the recoll user interface, which will index your home directory by default, allowing you to search immediately after indexing completes. - Do not do this if your home has a huge + Do not do this if your home directory contains a huge number of documents and you do not want to wait or are very short on disk space. In this case, you may want to edit the configuration file first to restrict the indexed area. - Also be aware that you will need to install the + Also be aware that you may need to install the appropriate supporting applications for document types that need them (for example antiword for @@ -186,7 +186,7 @@ programmed into your cron file. - Side note: there is nothing in &RCL; and &XAP; + There is nothing in &RCL; and &XAP; that would prevent interfacing with a real time file modification monitor, but this would tend to consume significant system resources for dubious gain, because you rarely need a @@ -196,7 +196,6 @@ the manual page. - &RCL; knows about quite a few different document types. The parameters for document types recognition and processing are set in @@ -209,14 +208,23 @@ &RCL; indexing processes plain text, HTML, openoffice and e-mail files internally. Other types (ie: postscript, pdf, ms-word, rtf) need external applications for preprocessing. 
The - list is in the installation - section. + list is in the + installation section. Without further configuration, &RCL; will index all appropriate files from your home directory, with a reasonable set of defaults. + In some cases, it may be interesting to index different + areas of the file system to separate databases. You can do this + by using multiple configuration directories, each indexing a + file system area to a specific database. You would use the + RECOLL_CONFDIR environment variable or the + -c confdir option + to recollindex to indicate which + configuration to process. The recoll search + program can use any selection of the existing databases for each + search, this is configurable inside the user interface. @@ -227,7 +235,7 @@ be changed by setting the RECOLL_CONFDIR environment variable, or by specifying the dbdir parameter in the configuration file - (see the configuration + (see the configuration section). The size of the index is determined by the size of the set @@ -245,8 +253,9 @@ (2006), that even a big index will be negligible against the total amount of data on the computer. - The index data directory only contains data that will be - rebuilt by an index run, so that it can be destroyed safely. + The index data directory (xapiandb) + only contains data that will be rebuilt by an index run, and it + can always be destroyed safely. Security aspects @@ -258,13 +267,13 @@ As of version 1.4, &RCL; will create the configuration directory with a mode of 0700 (access by owner only). As the - index directory is by default a subdirectory of the + index data directory is by default a subdirectory of the configuration directory, this should result in appropriate - protection. + protection. If you use another setup, you should think of the kind of protection you need for your index, and set the directory - access modes appropriately. + and files access modes appropriately. 
@@ -306,21 +315,25 @@ Indexing is performed either by the recollindex program, or by the indexing thread inside the recoll - program (use the File menu). + program (use the File menu). Both programs + will use the RECOLL_CONFDIR + variable or accept a -c + confdir option to specify the + configuration directory to be used. If the recoll program finds no index - when it starts, it will automatically start indexing (except - if cancelled). + when it starts, it will automatically start indexing (except + if cancelled). It is best to avoid interrupting the indexing process, as - this may sometimes leave the database in a bad state. This is + this may sometimes leave the index in a bad state. This is not a serious problem, as you then just need to clear everything and restart the indexing: the index files are normally stored in the $HOME/.recoll/xapiandb - directory, - which you can just delete if needed. Alternatively, you can - start recollindex -z, which will - reset the database before indexing. + directory, which you can just delete if needed. Alternatively, + you can start recollindex with option + -z, which will reset the database before + indexing. @@ -380,46 +393,153 @@ (*, ? , []). + You can search for exact phrases (adjacent words in a + given order) by enclosing the input inside double quotes. Ex: + "virtual reality". + Character case has no influence on search, except that you + can disable stem expansion for any term by capitalizing it. Ie: + a search for floor will also normally look for + flooring, floored, etc., but + a search for Floor will only look for + floor, in any character case (stemming can + also be disabled globally in the preferences). + &RCL; remembers the last few searches that you - performed. 
You can use the simple search text entry widget (a + combobox) to recall them (click on the thing at the right of the + text field). Please note, however, that only the search texts + are remembered, not the mode (all/any/filename). + + Hitting ^Tab (Ctrl + + Tab) while entering a word in the + simple search entry will open a window with possible completions + for the word. The completions are extracted from the + database. + + Double-clicking on a word in the result list or a preview + window will insert it into the simple search entry field. You can use the Tools / Advanced search dialog for more complex searches. + + + + The result list After starting a search, a list of results will instantly - be displayed in the main list window. Clicking on the - Preview link for an entry will open an - internal preview window for the document. Clicking the - Edit link will attempt to start an external - viewer (have a look at the mimeconf - configuration file to see how these are configured). + be displayed in the main list window. By default, the document list is presented in order of - relevance (how well the system estimates that the document - matches the query). You can specify a different ordering by - using the Tools + relevance (how well the system estimates that the document + matches the query). You can specify a different ordering by + using the Tools / Sort parameters dialog. + Clicking on the + Preview link for an entry will open an + internal preview window for the document. Clicking the + Edit link will attempt to start an external + viewer (have a look at the mimeconf + configuration file to see how these are configured). + The Preview and Edit - edit links may not be present for all entries, meaning that - &RCL; has no configured way to preview a given file type (which - was indexed by name only), or no configured external viewer for - the file type. 
This can sometimes be adjusted simply by tweaking - the + edit links may not be present for all entries, meaning that + &RCL; has no configured way to preview a given file type (which + was indexed by name only), or no configured external viewer for + the file type. This can sometimes be adjusted simply by tweaking + the mimemap and - + mimeconf configuration files. You can click on the Query details link - at the top of the results page to see the query actually - performed, after stem expansion and other processing. + at the top of the results page to see the query actually + performed, after stem expansion and other processing. + + Double-clicking on any word inside the result list or a + preview window will insert it into the simple search text. + + The result list is divided into pages (the size of which + you can change in the preferences). Use the arrow buttons in the + toolbar or the links at the bottom of the page to browse the + results. + + + + The result list right-click menu + + Apart from the preview and edit links, you can display a + popup menu by right-clicking over a paragraph in the result + list. This menu has the following entries: + + + Preview + Edit + Copy File Name + Copy Url + Find similar + + + The Preview and + Edit entries do the same thing as the + corresponding links. The two following entries will copy either + an url or the file path to the clipboard, for pasting into + another application. + + The Find similar entry will select + a number of relevant term from the current document and enter + them into the simple search field. You can then start a simple + search, with a good chance of finding documents related to the + current result. + + + + + + The preview window + + The preview window opens when you first click a + Preview link inside the result list. + + Subsequent preview requests for a given search open new + tabs in the existing window. + + Starting another search and requesting a preview will + create a new preview window. 
The old one stays open until you + close it. + + You can close a preview tab by typing ^W + (Ctrl + W) in the + window. Closing the last tab for a window will also close the + window. + + Of course you can also close a preview window by using the + window manager button in the top of the frame. + + You can display successive or previous documents from the + result list inside a preview tab by typing + Shift+Down or + Shift+Up (Down + and Up are the arrow keys). + + The preview tabs have an internal incremental search + function. You initiate the search either by typing a + / (slash) inside the text area or by clicking + into the Search for: text field and + entering the search string. You can then use the + Next and Previous + buttons to find the next/previous occurrence. You can also type + F3 inside the text area to get to the next + occurrence. + + If you have a search string entered and you use Shift+Up/Shift+Down + to browse the results, the search is initiated for each successive + document. If the string is found, the cursor will be positioned + at the first occurrence of the search string. - + Complex/advanced search The advanced search dialog has fields that will allow a more @@ -427,19 +547,25 @@ given exact phrase, none of the given elements, or a given file name (with wildcard expansion). All relevant fields will be combined by an implicit AND clause. All fields except "Exact - phrase" can accept single words, or phrases enclosed in double - quotes. + phrase" can accept a mix of single words and phrases enclosed + in double quotes. - It will let you search for documents of specific mime + Advanced search will let you search for documents of specific mime types (ie: only text/plain, or text/html or - application/pdf etc...) + application/pdf etc...). The state of the + file type selection can be saved as the default (the file type + filter will not be activated at program startup, but the lists + will be in the restored state). 
- It will let you restrict the search results to a subtree of - the indexed area. + You can also restrict the search results + to a subtree of the indexed area. If you need to do this often, + you may think of setting up multiple indexes instead, as the + performance will be much better. Click on the Start Search button in - the advanced search dialog to start the search. The button in + the advanced search dialog, or type Enter in + any text field to start the search. The button in the main window always performs a simple search. Click on the Show query details link at @@ -450,29 +576,57 @@ Multiple databases - Your &RCL; configuration always defines a main index. This - is what gets updated, for example, when you execute - recollindex. + Multiple &RCL; databases or indexes can be created by + using several configuration directories which are usually set to + index different areas of the file system. A specific index can + be selected for updating or searching, using the + RECOLL_CONFDIR environment variable or the + -c option to recoll and + recollindex. - You can use the search configuration - tool to define additional databases to be searched. These - databases can be made active or inactive at any moment. + A recollindex program instance can only + update one specific index. - The typical use of this feature is for a system - administrator to set up a central index, that you may choose to - search, or not, in addition to your personal data. Of course, - there are other possibilities. + A recoll program instance is also + associated with a specific index, which is the one to be + updated by its indexing thread, but it can use any + number of &RCL; indexes for searching. The external indexes + can be selected through the external + indexes tab in the preferences dialog. - The main index (defined by your personal configuration) is - always active. + Index selection is performed in two phases. 
A set of all + usable indexes must first be defined, and then the subset of + indexes to be used for searching. Of course, these parameters + are retained across program executions (they are kept + separately for each &RCL; configuration). The set of all indexes + is usually quite stable, while the active ones might typically + be adjusted quite frequently. - The list of searchable databases may also be defined by - the RECOLL_EXTRA_DBS environment - variable. This should hold a colon-separated list of index - directories, ie: + The main index (defined by + RECOLL_CONFDIR) is always active. If this is + undesirable, you can set up your base configuration to index + an empty directory. + + As building the set of all indexes can be a little tedious + when done through the user interface, you can use the + RECOLL_EXTRA_DBS environment + variable to provide an initial set. This might typically be + set up by a system administrator so that every user does not + have to do it. The variable should define a colon-separated list + of index directories, ie: + export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db - + + A typical usage scenario for the multiple index feature + would be for a system administrator to set up a central index + for shared data, that you may choose to search, or not, in + addition to your personal data. Of course, there are other + possibilities. There are many cases where you know the subset of + files that you want to be searched for a given query, and where + restricting the query will greatly improve the precision of the + results. This can also be performed with the directory filter in + advanced search, but multiple indexes will have much better + performance and may be worth the trouble. @@ -488,7 +642,7 @@ - Result list sorting + Sorting search results The documents in a result list are normally sorted in order of relevance. 
It is possible to specify different sort @@ -507,35 +661,6 @@ - - Additional result list functionality - - Apart from the preview and edit links, you can display a - popup menu by right-clicking over a paragraph in the result - list. This menu has the following entries: - - - Preview - Edit - Copy File Name - Copy Url - Find similar - - - The Preview and - Edit entries do the same thing as the - corresponding links. The two following entries will copy either - an url or the file path to the clipboard, for pasting into - another application. - - The Find similar entry will select - a number of relevant term from the current document and enter - them into the simple search field. You can then start a simple - search, with a good chance of finding documents related to the - current result. - - - Search tips, shortcuts @@ -555,11 +680,27 @@ only for occurrences of user immediately followed by manual. You can use the This exact phrase field of the advanced - search dialog to the same effect. + search dialog to the same effect. Phrases can be entered along + simple terms in all search entry fields (except This + exact phrase). + AutoPhrases + This option can be set in the preferences dialog. If it is + set, a phrase will be automatically built and added to simple + searches when looking for Any terms. This + will not change radically the results, but will give a relevance + boost to the results where the search terms appear as a + phrase. Ie: searching for virtual reality + will still find all documents where either + virtual or reality or + both appear, but those which contain virtual + reality should appear sooner in the list. 
+ + Term completion - Typing ^TAB (Control+Tab) in the simple + Typing ^TAB (Control + + Tab) in the simple search entry field while entering a word will either complete the current word if its beginning matches a unique term in the index, or open a window to propose a list of completions @@ -572,7 +713,7 @@ Finding related documents - Selecting the More like this entry + Selecting the Find similar documents entry in the result list paragraph right-click menu will select a set of "interesting" terms from the current result, and insert them into the simple search entry field. You can then possibly @@ -591,7 +732,7 @@ specify them as ordinary terms in normal search fields (&RCL; used to index all directories in the file path as terms. This has been abandonned as it did not seem really useful). Alternatively, you - can use specific file name search which will + can use the specific file name search which will only look for file names and can use wildcard expansion. @@ -607,6 +748,14 @@ close it (and, for the last tab, close the preview window). + List browsing in preview + Entering Shift-Down or Shift-Up + (Shift + an arrow key) in a preview window will + display the next or the previous document from the result + list. Any secondary search currently active will be executed on + the new document. + + @@ -664,16 +813,17 @@ Search parameters: + Stemming language: stemming obviously depends on the document's language. This listbox will let you chose among the stemming databases which were built during indexing (this is set in the main configuration file), or later added with - recollindex -s (See the recollindex - manual). Stemming languages which are dynamically added will be - deleted at the next indexing pass unless they are also added in - the configuration file. + recollindex -s (See the recollindex + manual). Stemming languages which are dynamically added will be + deleted at the next indexing pass unless they are also added in + the configuration file. 
Dynamically build @@ -684,29 +834,38 @@ result list display significantly for big documents, and you may want to turn it off. + Replace abstracts from documents: this decides if we should synthesize and display an abstract in place of an explicit abstract found within the document itself. + + Synthetic abstract size: + adjust to taste... + + + Synthetic abstract context + words: how many words should be displayed around + each term occurrence. + + - Extra - databases: - - - This panel will let you browse for additional databases - that you may want to search. Extra databases are designated by + + External indexes: + This panel will let you browse for additional indexes + that you may want to search. External indexes are designated by their database directory (ie: /home/someothergui/.recoll/xapiandb, /usr/local/recollglobal/xapiandb). - Once entered, the databases will appear in the - All extra databases list, and you can + Once entered, the indexes will appear in the + All indexes list, and you can choose which ones you want to use at any moment by transferring - them to/from the Active extra databases + them to/from the Active indexes list. Your main database (the one the current configuration indexes to), is always implicitly active. If this is not @@ -721,6 +880,51 @@ Installation + + Installing a prebuilt copy + + Recoll binary installations are always linked statically + to the xapian libraries, and have no other dependencies. You + will only have to check or install + supporting + applications for the file types that you want to index + beyond text, html and mail files. + + + Installing through a package system + + If you use a BSD-type port system or a + prebuilt package (RPM or other), just follow the usual + procedure, and maybe have a look at the configuration + section (but this may not be necessary for a quick + test with default parameters). 
+ + + + + Installing a prebuilt &RCL; + + The unpackaged binary versions are just compressed tar + files of a build tree, where only the useful parts were kept + (executables and sample configuration). + + The executable binary files are built with a static link to + libxapian and libiconv, to make installation easier (no + dependencies). However, this also means that you cannot change + the versions which are used. + + After extracting the tar file, you can proceed with + installation as + if you had built the package from source. + + The binary trees are built for installation to + /usr/local. + + + + Building from source @@ -815,46 +1019,19 @@ and the sample configuration files, scripts and other shared data to prefix/share/recoll. + If the installation prefix given to + recollinstall is different from what was + specified when executing configure, you + will have to set the RECOLL_DATADIR + environment variable to indicate where the shared data is to + be found. + You can then proceed to configuration. - - Installing a prebuilt copy - - - Installing through a package system - - If you are lucky enough to be using a port system or a - prebuilt package (RPM or other), just follow the usual - procedure, and have a look at the configuration - section. - - - - Installing a prebuilt &RCL; - - The unpackaged binary versions are just compressed tar - files of a build - tree, where only the useful parts were kept (executables and - sample configuration). - - The executable binary files are built with a static link to - libxapian and libiconv, to make installation easier (no - dependencies). However, this also means that you cannot change - the versions which are used. - - After extracting the tar file, you can proceed with - installation as - if you had built the package from source. - - - - @@ -880,6 +1057,11 @@ antiword. + MS Excel and PowerPoint: + + catdoc. + + RTF: unrtf @@ -1012,6 +1194,14 @@ + dbdir + The name of the Xapian data directory. 
It + will be created if needed when the index is + initialized. If this is not an absolute path, it will be + interpreted relative to the configuration directory. + + + skippedNames A space-separated list of patterns for @@ -1074,22 +1264,7 @@ - iconsdir - The name of the directory where - recoll result list icons are - stored. You can change this if you want different - images. - - - - dbdir - The name of the Xapian data directory. It - will be created if needed when the index is - initialized. If this is not an absolute path, it will be - interpreted relative to the configuration directory. - - - + defaultcharset The name of the character set used for files that do not contain a character set definition (ie: @@ -1128,6 +1303,25 @@ + idxabsmlen + &RCL; stores an abstract for each indexed + file inside the database. This is so that they can be + displayed inside the result lists without decoding the + original file. This parameter defines the size of the + stored abstract (which can come from an actual section or + just be the beginning of the text). The default value is 250. + + + + + iconsdir + The name of the directory where + recoll result list icons are + stored. You can change this if you want different + images. + + + diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 8d552517..af972513 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.16 2006-09-05 17:09:30 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.17 2006-09-15 16:50:44 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -216,19 +216,14 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc, } if (doc.h.getFirstHeader("Date", hi)) { rfc2047_decode(hi.getValue(), transcoded); - // Try to set the mtime from the date field. 
- string date = transcoded; - string::size_type pos; - // Possibly get rid of the day - if ((pos = date.find(",")) != string::npos) - date = date.substr(pos+1); - struct tm tm; - if (strptime(date.c_str(), " %d %b %Y %H:%M:%S %z ", &tm)) { + time_t t = rfc2822DateToUxTime(transcoded); + if (t != (time_t)-1) { char ascuxtime[100]; - sprintf(ascuxtime, "%ld", (long)mktime(&tm)); + sprintf(ascuxtime, "%ld", (long)t); docout.dmtime = ascuxtime; } else { - LOGDEB(("strptime failed for [%s]\n", date.c_str())); + // Leave mtime field alone, ftime will be used instead. + LOGDEB(("rfc2822Date...: failed for [%s]\n", transcoded.c_str())); } docout.text += string("Date: ") + transcoded + string("\n"); diff --git a/src/utils/mimeparse.cpp b/src/utils/mimeparse.cpp index 0635435d..96ca82e4 100644 --- a/src/utils/mimeparse.cpp +++ b/src/utils/mimeparse.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.12 2006-09-06 09:14:43 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.13 2006-09-15 16:50:44 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -26,6 +26,7 @@ static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.12 2006-09-06 09:14:43 dockes E #include #include #include +#include #include "mimeparse.h" #include "base64.h" @@ -578,8 +579,159 @@ bool rfc2047_decode(const std::string& in, std::string &out) return true; } +#define DEBUGDATE 1 +#if DEBUGDATE +#define DATEDEB(X) fprintf X +#else +#define DATEDEB(X) +#endif + +// Convert rfc822 date to unix time. 
A date string normally looks like: +// Mon, 3 Jul 2006 09:51:58 +0200 +// But there are many common variations +// +time_t rfc2822DateToUxTime(const string& dt) +{ + // Strip everything up to first comma if any, we don't need weekday, + // then break into tokens + list toks; + string::size_type idx; + if ((idx = dt.find_first_of(",")) != string::npos) { + if (idx == dt.length() - 1) { + DATEDEB((stderr, "Bad rfc822 date format (short1): [%s]\n", + dt.c_str())); + return (time_t)-1; + } + string date = dt.substr(idx+1); + stringToTokens(date, toks, " \t:"); + } else { + stringToTokens(dt, toks, " \t:"); + } + +#if DEBUGDATE + for (list::iterator it = toks.begin(); it != toks.end(); it++) { + DATEDEB((stderr, "[%s] ", it->c_str())); + } + DATEDEB((stderr, "\n")); +#endif + + if (toks.size() == 6) { + // Probably no timezone, sometimes happens + toks.push_back("+0000"); + } + + if (toks.size() < 7) { + DATEDEB((stderr, "Bad rfc822 date format (toks cnt): [%s]\n", + dt.c_str())); + return (time_t)-1; + } + + struct tm tm; + memset(&tm, 0, sizeof(tm)); + + // Load struct tm with appropriate tokens, possibly converting + // when needed + + list::iterator it = toks.begin(); + + // Day of month: no conversion needed + tm.tm_mday = atoi(it->c_str()); + it++; + + // Month. Only Jan-Dec are legal. January, February do happen + // though. 
Convert to 0-11 + if (*it == "Jan" || *it == "January") tm.tm_mon = 0; else if + (*it == "Feb" || *it == "February") tm.tm_mon = 1; else if + (*it == "Mar" || *it == "March") tm.tm_mon = 2; else if + (*it == "Apr" || *it == "April") tm.tm_mon = 3; else if + (*it == "May") tm.tm_mon = 4; else if + (*it == "Jun" || *it == "June") tm.tm_mon = 5; else if + (*it == "Jul" || *it == "July") tm.tm_mon = 6; else if + (*it == "Aug" || *it == "August") tm.tm_mon = 7; else if + (*it == "Sep" || *it == "September") tm.tm_mon = 8; else if + (*it == "Oct" || *it == "October") tm.tm_mon = 9; else if + (*it == "Nov" || *it == "November") tm.tm_mon = 10; else if + (*it == "Dec" || *it == "December") tm.tm_mon = 11; else { + DATEDEB((stderr, "Bad rfc822 date format (month): [%s]\n", + dt.c_str())); + return (time_t)-1; + } + it++; + + // Year. Struct tm counts from 1900 + tm.tm_year = atoi(it->c_str()); + if (tm.tm_year > 1900) + tm.tm_year -= 1900; + it++; + + // Hour minute second need no adjustments + tm.tm_hour = atoi(it->c_str()); it++; + tm.tm_min = atoi(it->c_str()); it++; + tm.tm_sec = atoi(it->c_str()); it++; + + + // Timezone is supposed to be either +-XYZT or a zone name + int zonesecs = 0; + if (it->length() < 1) { + DATEDEB((stderr, "Bad rfc822 date format (zlen): [%s]\n", dt.c_str())); + return (time_t)-1; + } + if (it->at(0) == '-' || it->at(0) == '+') { + // Note that +xy:zt (instead of +xyzt) sometimes happen, we + // may want to process it one day + if (it->length() < 5) { + DATEDEB((stderr, "Bad rfc822 date format (zlen1): [%s]\n", + dt.c_str())); + goto nozone; + } + zonesecs = 3600*((it->at(1)-'0') * 10 + it->at(2)-'0')+ + (it->at(3)-'0')*10 + it->at(4)-'0'; + zonesecs = it->at(0) == '+' ? 
-1 * zonesecs : zonesecs; + } else { + int hours; + if (*it == "A") hours= 1; else if (*it == "B") hours= 2; + else if (*it == "C") hours= 3; else if (*it == "D") hours= 4; + else if (*it == "E") hours= 5; else if (*it == "F") hours= 6; + else if (*it == "G") hours= 7; else if (*it == "H") hours= 8; + else if (*it == "I") hours= 9; else if (*it == "K") hours= 10; + else if (*it == "L") hours= 11; else if (*it == "M") hours= 12; + else if (*it == "N") hours= -1; else if (*it == "O") hours= -2; + else if (*it == "P") hours= -3; else if (*it == "Q") hours= -4; + else if (*it == "R") hours= -5; else if (*it == "S") hours= -6; + else if (*it == "T") hours= -7; else if (*it == "U") hours= -8; + else if (*it == "V") hours= -9; else if (*it == "W") hours= -10; + else if (*it == "X") hours= -11; else if (*it == "Y") hours= -12; + else if (*it == "Z") hours= 0; else if (*it == "UT") hours= 0; + else if (*it == "GMT") hours= 0; else if (*it == "EST") hours= 5; + else if (*it == "EDT") hours= 4; else if (*it == "CST") hours= 6; + else if (*it == "CDT") hours= 5; else if (*it == "MST") hours= 7; + else if (*it == "MDT") hours= 6; else if (*it == "PST") hours= 8; + else if (*it == "PDT") hours= 7; + // Non standard names + // Standard Time (or Irish Summer Time?) 
is actually +5.5 + else if (*it == "CET") hours= -1; else if (*it == "JST") hours= -9; + else if (*it == "IST") hours= -5; else if (*it == "WET") hours= 0; + else if (*it == "MET") hours= -1; + else { + DATEDEB((stderr, "Bad rfc822 date format (zname): [%s]\n", + dt.c_str())); + // Forget tz + goto nozone; + } + zonesecs = 3600 * hours; + } + DATEDEB((stderr, "Tz: [%s] -> %d\n", it->c_str(), zonesecs)); + nozone: + + time_t tim = mktime(&tm); + tim += zonesecs; + DATEDEB((stderr, "Date: %s uxtime %ld \n", ctime(&tim), tim)); + return tim; +} + #else +#include #include #include "mimeparse.h" @@ -588,6 +740,7 @@ bool rfc2047_decode(const std::string& in, std::string &out) using namespace std; extern bool rfc2231_decode(const string& in, string& out, string& charset); +extern time_t rfc2822DateToUxTime(const string& date); int main(int argc, const char **argv) @@ -641,7 +794,7 @@ main(int argc, const char **argv) exit(1); } printf("Decoded: '%s'\n", out.c_str()); -#elif 1 +#elif 0 char line [1024]; string out; bool res; @@ -675,7 +828,22 @@ main(int argc, const char **argv) exit(1); } printf("Decoded: [%s]\n", decoded.c_str()); - +#elif 1 + { + time_t t; + + const char *dates[] = { + " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)", + " Mon, 3 Jul 2006 09:51:58 +0200", + " Wed, 13 Sep 2006 08:19:48 GMT-07:00", + " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)", + " Sat, 23 Dec 89 19:27:12 EST", + " 13 Jan 90 08:23:29 GMT"}; + + for (unsigned int i = 0; i #include #include #include "base64.h" +#ifndef NO_NAMESPACES +using std::string; +#endif + /** A class to represent a MIME header value with parameters */ class MimeHeaderValue { public: - std::string value; - std::map params; + string value; + std::map params; }; /** @@ -36,10 +42,10 @@ class MimeHeaderValue { * @param in the input string should be like: value; pn1=pv1; pn2=pv2. 
* Example: text/plain; charset="iso-8859-1" */ -extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd); +extern bool parseMimeHeaderValue(const string& in, MimeHeaderValue& psd); /** Quoted printable decoding. Doubles up as rfc2231 decoder, hence the esc */ -extern bool qp_decode(const std::string& in, std::string &out, +extern bool qp_decode(const string& in, string &out, char esc = '='); /** Decode an Internet mail field value encoded according to rfc2047 @@ -53,6 +59,14 @@ extern bool qp_decode(const std::string& in, std::string &out, * @param in input string, ascii with rfc2047 markup * @return out output string encoded in utf-8 */ -extern bool rfc2047_decode(const std::string& in, std::string &out); +extern bool rfc2047_decode(const string& in, string &out); + + +/** Decode RFC2822 date to unix time (gmt secs from 1970 + * + * @param dt date string (the part after Date: ) + * @return unix time + */ +time_t rfc2822DateToUxTime(const string& dt); #endif /* _MIME_H_INCLUDED_ */