From ceb996c8fb696592d99e2db7acb5d6cf28dfaaf1 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 11 Sep 2010 12:07:53 +0200 Subject: [PATCH] Implement date: date range filter/searches. Remove restriction on pure negative queries --- src/query/recollq.cpp | 58 +++++--- src/query/wasatorcl.cpp | 20 ++- src/rcldb/rcldb.cpp | 26 ++++ src/rcldb/rcldb.h | 2 + src/rcldb/searchdata.cpp | 165 +++++++++++++++++++--- src/rcldb/searchdata.h | 26 +++- src/utils/smallut.cpp | 292 ++++++++++++++++++++++++++++++++++++++- src/utils/smallut.h | 17 +++ 8 files changed, 551 insertions(+), 55 deletions(-) diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index a9643db3..f74e46c5 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -63,6 +63,7 @@ bool dump_contents(RclConfig *rclconfig, TempDir& tmpdir, Rcl::Doc& idoc) static char *thisprog; static char usage [] = +" -P: Show date span for documents in index\n" " [-o|-a|-f] \n" " Runs a recoll query and displays result lines. \n" " Default: will interpret the argument(s) as a xesam query string\n" @@ -110,6 +111,7 @@ static int op_flags; #define OPT_s 0x4000 #define OPT_A 0x8000 #define OPT_i 0x10000 +#define OPT_P 0x20000 int recollq(RclConfig **cfp, int argc, char **argv) { @@ -148,6 +150,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) if (limit <= 0) limit = INT_MAX; argc--; goto b1; case 'o': op_flags |= OPT_o; break; + case 'P': op_flags |= OPT_P; break; case 'q': op_flags |= OPT_q; break; case 'S': op_flags |= OPT_S; if (argc < 2) Usage(); sortfield = *(++argv); @@ -161,13 +164,6 @@ int recollq(RclConfig **cfp, int argc, char **argv) b1: argc--; argv++; } - if (argc < 1) { - Usage(); - } - string qs = *argv++;argc--; - while (argc > 0) { - qs += string(" ") + *argv++;argc--; - } string reason; *cfp = recollinit(0, 0, reason, &a_config); RclConfig *rclconfig = *cfp; @@ -176,21 +172,10 @@ int recollq(RclConfig **cfp, int argc, char **argv) exit(1); } - { - string uq; - string charset = rclconfig->getDefCharset(true); - int ercnt; - if (!transcode(qs, uq, charset, "UTF-8", &ercnt)) { - fprintf(stderr, "Can't convert command line args to utf-8\n"); - exit(1); - } else if (ercnt) { - fprintf(stderr, "%d errors while converting arguments from %s " - "to utf-8\n", ercnt, charset.c_str()); - } - qs = uq; + if (argc < 1 && !(op_flags & OPT_P)) { + Usage(); } - Rcl::Db rcldb(rclconfig); if (!extra_dbs.empty()) { for (list::iterator it = extra_dbs.begin(); @@ -208,6 +193,39 @@ int recollq(RclConfig **cfp, int argc, char **argv) exit(1); } + if (op_flags & OPT_P) { + int minyear, maxyear; + if (!rcldb.maxYearSpan(&minyear, &maxyear)) { + cerr << "maxYearSpan failed: " << rcldb.getReason() << endl; + exit(1); + } else { + cout << "Min year " << minyear << " Max year " << maxyear << endl; + exit(0); + } + } + + if (argc < 1) { + Usage(); + } + string qs = *argv++;argc--; + while (argc > 0) { + qs += string(" ") + *argv++;argc--; + } + + { + string uq; + string charset = rclconfig->getDefCharset(true); + int ercnt; + if (!transcode(qs, uq, charset, "UTF-8", &ercnt)) { + fprintf(stderr, "Can't convert command line args to utf-8\n"); + exit(1); + } else if (ercnt) { + fprintf(stderr, "%d errors while converting arguments from %s " + "to utf-8\n", ercnt, charset.c_str()); + } + qs = uq; + } + Rcl::SearchData *sd = 0; if (op_flags & (OPT_a|OPT_o|OPT_f)) { diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp index 7214c21f..b1941bc5 100644 --- a/src/query/wasatorcl.cpp +++ b/src/query/wasatorcl.cpp @@ -42,14 +42,13 @@ Rcl::SearchData *wasaStringToRcl(const string &qs, string &reason, return 0; Rcl::SearchData *rq = wasaQueryToRcl(wq, autosuffs); if (rq == 0) { - reason = "Failed translating wasa query structure to recoll"; + reason = "Failed translating xesam query structure to recoll"; return 0; } return rq; } -Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa, - const string& autosuffs) +Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa, const string& autosuffs) { if (wasa == 0) return 0; @@ -119,6 +118,19 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa, } } + // Handle "date" spec + if (!stringicmp("date", (*it)->m_fieldspec)) { + DateInterval di; + if (!parsedateinterval((*it)->m_value, &di)) { + LOGERR(("wasaQueryToRcl: bad date interval format\n")); + // Process rest of query anyway ? + break; + } + LOGDEB(("wasaQueryToRcl:: date span: %d-%d-%d/%d-%d-%d\n", + di.y1,di.m1,di.d1, di.y2,di.m2,di.d2)) + sdata->setDateSpan(&di); + break; + } // "Regular" processing follows: unsigned int mods = (unsigned int)(*it)->m_modifiers; @@ -151,7 +163,7 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa, case WasaQuery::OP_EXCL: LOGDEB2(("wasaQueryToRcl: excl clause [%s]:[%s]\n", - (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str())); + (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str())); if (wasa->m_op != WasaQuery::OP_AND) { LOGERR(("wasaQueryToRcl: negative clause inside OR list!\n")); continue; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 8f938ec4..11d0cfa2 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -767,6 +767,10 @@ bool Db::fieldToPrefix(const string& fld, string &pfx) fldToPrefs["keyword"] = "K"; fldToPrefs["tag"] = "K"; fldToPrefs["tags"] = "K"; + + fldToPrefs["xapyear"] = "Y"; + fldToPrefs["xapyearmon"] = "M"; + fldToPrefs["xapdate"] = "D"; } if (m_config && m_config->getFieldPrefix(fld, pfx)) @@ -1365,6 +1369,28 @@ bool Db::filenameWildExp(const string& fnexp, list& names) return true; } +// Walk the Y terms and return min/max +bool Db::maxYearSpan(int *minyear, int *maxyear) +{ + *minyear = 1000000; + *maxyear = -1000000; + TermMatchResult result; + if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear")) + return false; + for (list::const_iterator it = result.entries.begin(); + it != result.entries.end(); it++) { + if (!it->term.empty()) { + int year = atoi(it->term.c_str()+1); + if (year < *minyear) + *minyear = year; + if (year > *maxyear) + *maxyear = year; + } + } + return true; +} + + class TermMatchCmpByWcf { public: int operator()(const TermMatchEntry& l, const TermMatchEntry& r) { diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index a19316ba..f728a099 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -171,6 +171,8 @@ class Db { const string& field = "", string *prefix = 0 ); + /** Return min and max years for doc mod times in db */ + bool maxYearSpan(int *minyear, int *maxyear); /** Special filename wildcard to XSFN terms expansion. internal/searchdata use only */ diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 9e754f82..5f2332f4 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -46,33 +46,107 @@ typedef vector::const_iterator qlist_cit_t; static const int original_term_wqf_booster = 10; +/* The dates-to-query routine is is lifted quasi-verbatim but + * modified from xapian-omega:date.cc. Copyright info: + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2001 James Aylett + * Copyright 2001,2002 Ananova Ltd + * Copyright 2002 Intercede 1749 Ltd + * Copyright 2002,2003,2006 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +static Xapian::Query +date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2) +{ + // Xapian uses a smallbuf and snprintf. Can't be bothered, we're + // only doing %d's ! + char buf[200]; + sprintf(buf, "D%04d%02d", y1, m1); + vector v; + + int d_last = monthdays(m1, y1); + int d_end = d_last; + if (y1 == y2 && m1 == m2 && d2 < d_last) { + d_end = d2; + } + // Deal with any initial partial month + if (d1 > 1 || d_end < d_last) { + for ( ; d1 <= d_end ; d1++) { + sprintf(buf + 7, "%02d", d1); + v.push_back(Xapian::Query(buf)); + } + } else { + buf[0] = 'M'; + v.push_back(Xapian::Query(buf)); + } + + if (y1 == y2 && m1 == m2) { + return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end()); + } + + int m_last = (y1 < y2) ? 12 : m2 - 1; + while (++m1 <= m_last) { + sprintf(buf + 5, "%02d", m1); + buf[0] = 'M'; + v.push_back(Xapian::Query(buf)); + } + + if (y1 < y2) { + while (++y1 < y2) { + sprintf(buf + 1, "%04d", y1); + buf[0] = 'Y'; + v.push_back(Xapian::Query(buf)); + } + sprintf(buf + 1, "%04d", y2); + buf[0] = 'M'; + for (m1 = 1; m1 < m2; m1++) { + sprintf(buf + 5, "%02d", m1); + v.push_back(Xapian::Query(buf)); + } + } + + sprintf(buf + 5, "%02d", m2); + + // Deal with any final partial month + if (d2 < monthdays(m2, y2)) { + buf[0] = 'D'; + for (d1 = 1 ; d1 <= d2; d1++) { + sprintf(buf + 7, "%02d", d1); + v.push_back(Xapian::Query(buf)); + } + } else { + buf[0] = 'M'; + v.push_back(Xapian::Query(buf)); + } + + return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end()); +} + bool SearchData::toNativeQuery(Rcl::Db &db, void *d) { Xapian::Query xq; m_reason.erase(); - if (m_query.size() < 1) { + if (!m_query.size() && !m_haveDates) { m_reason = "empty query"; return false; } - // It's not allowed to have a pure negative query and also it - // seems that Xapian doesn't like the first element to be AND_NOT - qlist_it_t itnotneg = m_query.end(); - for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) { - if ((*it)->m_tp != SCLT_EXCL) { - itnotneg = it; - break; - } - } - if (itnotneg == m_query.end()) { - LOGERR(("SearchData::toNativeQuery: can't have all negative clauses")); - m_reason = "Can't have only negative clauses"; - return false; - } - if ((*m_query.begin())->m_tp == SCLT_EXCL) - iter_swap(m_query.begin(), itnotneg); - // Walk the clause list translating each in turn and building the // Xapian query tree for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) { @@ -91,12 +165,59 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) // addClause()) Xapian::Query::op op; if (m_tp == SCLT_AND) { - op = (*it)->m_tp == SCLT_EXCL ? - Xapian::Query::OP_AND_NOT: Xapian::Query::OP_AND; + if ((*it)->m_tp == SCLT_EXCL) { + op = Xapian::Query::OP_AND_NOT; + } else { + op = Xapian::Query::OP_AND; + } } else { op = Xapian::Query::OP_OR; } - xq = xq.empty() ? nq : Xapian::Query(op, xq, nq); + if (xq.empty()) { + if (op == Xapian::Query::OP_AND_NOT) + xq = Xapian::Query(op, Xapian::Query::MatchAll, nq); + else + xq = nq; + } else { + xq = Xapian::Query(op, xq, nq); + } + } + + if (m_haveDates) { + // If one of the extremities is unset, compute db extremas + if (m_dates.y1 == 0 || m_dates.y2 == 0) { + int minyear = 1970, maxyear = 2100; + if (!db.maxYearSpan(&minyear, &maxyear)) { + LOGERR(("Can't retrieve index min/max dates\n")); + //whatever, go on. + } + if (m_dates.y1 == 0) { + m_dates.y1 = minyear; + m_dates.m1 = 1; + m_dates.d1 = 1; + } + if (m_dates.y2 == 0) { + m_dates.y2 = maxyear; + m_dates.m2 = 12; + m_dates.d2 = 31; + } + } + LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n", + m_dates.y1, m_dates.m1, m_dates.d1, + m_dates.y2, m_dates.m2, m_dates.d2)); + Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1, + m_dates.y2, m_dates.m2, m_dates.d2); + if (dq.empty()) { + LOGINFO(("Db::toNativeQuery: date filter is empty\n")); + } + // If no probabilistic query is provided then promote the daterange + // filter to be THE query instead of filtering an empty query. + if (xq.empty()) { + LOGINFO(("Db::toNativeQuery: proba query is empty\n")); + xq = dq; + } else { + xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq); + } } // Add the file type filtering clause if any @@ -116,7 +237,6 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) } } - list pqueries; Xapian::Query tq; for (vector::iterator it = exptps.begin(); it != exptps.end(); it++) { @@ -157,6 +277,7 @@ void SearchData::erase() { m_topdir.erase(); m_description.erase(); m_reason.erase(); + m_haveDates = false; } // Am I a file name only search ? This is to turn off term highlighting diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 76818759..9c0f9502 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -30,6 +30,7 @@ #include "rcldb.h" #include "refcntr.h" +#include "smallut.h" #ifndef NO_NAMESPACES using std::vector; @@ -70,7 +71,9 @@ class SearchDataClause; */ class SearchData { public: - SearchData(SClType tp) : m_tp(tp), m_haveWildCards(false) {} + SearchData(SClType tp) + : m_tp(tp), m_haveDates(false), m_haveWildCards(false) + {} ~SearchData() {erase();} /** Make pristine */ @@ -88,6 +91,18 @@ public: /** We become the owner of cl and will delete it */ bool addClause(SearchDataClause *cl); + /** Set/get top subdirectory for filtering results */ + void setTopdir(const string& t) {m_topdir = t;} + string getTopdir() {return m_topdir;} + + /** Set date span for filtering results */ + void setDateSpan(DateInterval *dip) {m_dates = *dip; m_haveDates = true;} + + /** Add file type for filtering results */ + void addFiletype(const string& ft) {m_filetypes.push_back(ft);} + + void setStemlang(const string& lang = "english") {m_stemlang = lang;} + /** Retrieve error description */ string getReason() {return m_reason;} @@ -107,17 +122,14 @@ public: */ string getDescription() {return m_description;} void setDescription(const string& d) {m_description = d;} - /** Get/set top subdirectory for filtering results */ - string getTopdir() {return m_topdir;} - void setTopdir(const string& t) {m_topdir = t;} - /** Add file type for filtering results */ - void addFiletype(const string& ft) {m_filetypes.push_back(ft);} - void setStemlang(const string& lang = "english") {m_stemlang = lang;} + private: SClType m_tp; // Only SCLT_AND or SCLT_OR here vector m_query; vector m_filetypes; // Restrict to filetypes if set. string m_topdir; // Restrict to subtree. + bool m_haveDates; + DateInterval m_dates; // Restrict to date interval // Printable expanded version of the complete query, retrieved/set // from rcldb after the Xapian::setQuery() call string m_description; diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index 847f823a..ad28b499 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -29,6 +29,7 @@ static char rcsid[] = "@(#$Id: smallut.cpp,v 1.35 2008-11-19 10:06:49 dockes Exp #include #include +#include #include "smallut.h" #include "utf8iter.h" @@ -281,6 +282,7 @@ template bool stringToStrings(const string &s, T &tokens, } return true; } + bool stringToStrings(const string &s, list &tokens, const string& as) { @@ -707,10 +709,253 @@ float Chrono::secs(int frozen) gettime(CLOCK_REALTIME, &tv); float secs = (float)(frozen?frozen_tv.tv_sec:tv.tv_sec - m_secs); float nsecs = (float)(frozen?frozen_tv.tv_nsec:tv.tv_nsec - m_nsecs); - //fprintf(stderr, "secs %.2f nsecs %.2f\n", secs, nsecs); return secs + nsecs * 1e-9; } +// Date is Y[-M[-D]] +static bool parsedate(vector::const_iterator& it, + vector::const_iterator end, DateInterval *dip) +{ + dip->y1 = dip->m1 = dip->d1 = dip->y2 = dip->m2 = dip->d2 = 0; + if (it == end || sscanf(it++->c_str(), "%d", &dip->y1) != 1) { + return false; + } + if (it == end || *it == "/") + return true; + if (*it++ != "-") { + return false; + } + + if (it == end || sscanf(it++->c_str(), "%d", &dip->m1) != 1) { + return false; + } + if (it == end || *it == "/") + return true; + if (*it++ != "-") { + return false; + } + + if (it == end || sscanf(it++->c_str(), "%d", &dip->d1) != 1) { + return -1; + } + + return true; +} + +// Called with the 'P' already processed. Period ends at end of string +// or at '/'. We dont' do a lot effort at validation and will happily +// accept 10Y1Y4Y (the last wins) +static bool parseperiod(vector::const_iterator& it, + vector::const_iterator end, DateInterval *dip) +{ + dip->y1 = dip->m1 = dip->d1 = dip->y2 = dip->m2 = dip->d2 = 0; + while (it != end) { + int value; + if (sscanf(it++->c_str(), "%d", &value) != 1) { + return false; + } + if (it == end || it->empty()) + return false; + switch (it->at(0)) { + case 'Y': case 'y': dip->y1 = value;break; + case 'M': case 'm': dip->m1 = value;break; + case 'D': case 'd': dip->d1 = value;break; + default: return false; + } + it++; + if (it == end) + return true; + if (*it == "/") { + return true; + } + } + return true; +} + +static void cerrdip(const string& s, DateInterval *dip) +{ + cerr << s << dip->y1 << "-" << dip->m1 << "-" << dip->d1 << "/" + << dip->y2 << "-" << dip->m2 << "-" << dip->d2 + << endl; +} + +// Compute date + period. Won't work out of the unix era. +// or pre-1970 dates. Just convert everything to unixtime and +// seconds (with average durations for months/years), add and convert +// back +static bool addperiod(DateInterval *dp, DateInterval *pp) +{ + struct tm tm; + // Create a struct tm with possibly non normalized fields and let + // timegm sort it out + memset(&tm, 0, sizeof(tm)); + tm.tm_year = dp->y1 - 1900 + pp->y1; + tm.tm_mon = dp->m1 + pp->m1 -1; + tm.tm_mday = dp->d1 + pp->d1; + time_t tres = timegm(&tm); + // Convert back to normalized tm, then output + gmtime_r(&tres, &tm); + dp->y1 = tm.tm_year + 1900; + dp->m1 = tm.tm_mon + 1; + dp->d1 = tm.tm_mday; + //cerrdip("Addperiod return", dp); + return true; +} +int monthdays(int mon, int year) +{ + switch (mon) { + case 2: return (year % 4) == 0 ? 29 : 28; + case 1:case 3:case 5:case 7: case 8:case 10:case 12: return 31; + default: return 30; + } +} +bool parsedateinterval(const string& s, DateInterval *dip) +{ + vector vs; + dip->y1 = dip->m1 = dip->d1 = dip->y2 = dip->m2 = dip->d2 = 0; + DateInterval p1, p2, d1, d2; + p1 = p2 = d1 = d2 = *dip; + bool hasp1 = false, hasp2 = false, hasd1 = false, hasd2 = false, + hasslash = false; + + if (!stringToStrings(s, vs, "PYMDpymd-/")) { + return false; + } + if (vs.empty()) + return false; + + vector::const_iterator it = vs.begin(); + if (*it == "P" || *it == "p") { + it++; + if (!parseperiod(it, vs.end(), &p1)) { + return false; + } + hasp1 = true; + //cerrdip("p1", &p1); + p1.y1 = -p1.y1; + p1.m1 = -p1.m1; + p1.d1 = -p1.d1; + } else if (*it == "/") { + hasslash = true; + goto secondelt; + } else { + if (!parsedate(it, vs.end(), &d1)) { + return false; + } + hasd1 = true; + } + + // Got one element and/or / +secondelt: + if (it != vs.end()) { + if (*it != "/") { + return false; + } + hasslash = true; + it++; + if (it == vs.end()) { + // ok + } else if (*it == "P" || *it == "p") { + it++; + if (!parseperiod(it, vs.end(), &p2)) { + return false; + } + hasp2 = true; + } else { + if (!parsedate(it, vs.end(), &d2)) { + return false; + } + hasd2 = true; + } + } + + // 2 periods dont' make sense + if (hasp1 && hasp2) { + return false; + } + // Nothing at all doesn't either + if (!hasp1 && !hasd1 && !hasp2 && !hasd2) { + return false; + } + + // Empty part means today IF other part is period, else means + // forever (stays at 0) + time_t now = time(0); + struct tm *tmnow = gmtime(&now); + if ((!hasp1 && !hasd1) && hasp2) { + d1.y1 = 1900 + tmnow->tm_year; + d1.m1 = tmnow->tm_mon + 1; + d1.d1 = tmnow->tm_mday; + hasd1 = true; + } else if ((!hasp2 && !hasd2) && hasp1) { + d2.y1 = 1900 + tmnow->tm_year; + d2.m1 = tmnow->tm_mon + 1; + d2.d1 = tmnow->tm_mday; + hasd2 = true; + } + + // Incomplete dates have different meanings depending if there is + // a period or not (actual or infinite indicated by a / + empty) + // + // If there is no explicit period, an incomplete date indicates a + // period of the size of the uncompleted elements. Ex: 1999 + // actually means 1999/P12M + // + // If there is a period, the incomplete date should be extended + // to the beginning or end of the unspecified portion. Ex: 1999/ + // means 1999-01-01/ and /1999 means /1999-12-31 + if (hasd1) { + if (!(hasslash || hasp2)) { + if (d1.m1 == 0) { + p2.m1 = 12; + d1.m1 = 1; + d1.d1 = 1; + } else if (d1.d1 == 0) { + d1.d1 = 1; + p2.d1 = monthdays(d1.m1, d1.y1); + } + hasp2 = true; + } else { + if (d1.m1 == 0) { + d1.m1 = 1; + d1.d1 = 1; + } else if (d1.d1 == 0) { + d1.d1 = 1; + } + } + } + // if hasd2 is true we had a / + if (hasd2) { + if (d2.m1 == 0) { + d2.m1 = 12; + d2.d1 = 31; + } else if (d2.d1 == 0) { + d2.d1 = monthdays(d2.m1, d2.y1); + } + } + if (hasp1) { + // Compute d1 + d1 = d2; + if (!addperiod(&d1, &p1)) { + return false; + } + } else if (hasp2) { + // Compute d2 + d2 = d1; + if (!addperiod(&d2, &p2)) { + return false; + } + } + + dip->y1 = d1.y1; + dip->m1 = d1.m1; + dip->d1 = d1.d1; + dip->y2 = d2.y1; + dip->m2 = d2.m1; + dip->d2 = d2.d1; + return true; +} + #else #include @@ -750,13 +995,33 @@ struct spair suffpairs[] = { }; int nsuffpairs = sizeof(suffpairs) / sizeof(struct spair); + +// Periods test strings +const char* periods[] = { + "2001", // Year 2001 + "2001/", // 2001 or later + "2001/P3Y", // 2001 -> 2004 or 2005, ambiguous + "2001-01-01/P3Y", // 01-2001 -> 01 2004 + "2001-03-03/2001-05-01", // Explicit one + "P3M/", // 3 months ago to now + "P1Y1M/2001-03-01", // 2000-02-01/2001-03-01 + "/2001", // From the epoch to the end of 2001 +}; +const int nperiods = sizeof(periods) / sizeof(char*); + const char *thisprog; +static void cerrdip(const string& s, DateInterval *dip) +{ + cerr << s << dip->y1 << "-" << dip->m1 << "-" << dip->d1 << "/" + << dip->y2 << "-" << dip->m2 << "-" << dip->d2 + << endl; +} int main(int argc, char **argv) { thisprog = *argv++;argc--; -#if 1 +#if 0 if (argc <=0 ) { cerr << "Usage: smallut " << endl; exit(1); @@ -771,6 +1036,29 @@ int main(int argc, char **argv) cerr << "[" << *it << "] "; cerr << endl; exit(0); +#elif 0 + if (argc <=0 ) { + cerr << "Usage: smallut " << endl; + exit(1); + } + string s = *argv++;argc--; + DateInterval di; + if (!parsedateinterval(s, &di)) { + cerr << "Parse failed" << endl; + exit(1); + } + cerrdip("", &di); + exit(0); +#elif 1 + DateInterval di; + for (int i = 0; i < nperiods; i++) { + if (!parsedateinterval(periods[i], &di)) { + cerr << "Parsing failed for [" << periods[i] << "]" << endl; + } else { + cerrdip(string(periods[i]).append(" : "), &di); + } + } + exit(0); #elif 0 for (int i = 0; i < npairs; i++) { { diff --git a/src/utils/smallut.h b/src/utils/smallut.h index 14f13c9a..2967a0a6 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -56,6 +56,23 @@ extern int stringisuffcmp(const string& s1, const string& s2); // Compare charset names, removing the more common spelling variations extern bool samecharset(const string &cs1, const string &cs2); +// Parse date interval specifier into pair of y,m,d dates. The format +// for the time interval is based on a subset of iso 8601 with +// the addition of open intervals, and removal of all time indications. +// 'P' is the Period indicator, it's followed by a length in +// years/months/days (or any subset thereof) +// Dates: YYYY-MM-DD YYYY-MM YYYY +// Periods: P[nY][nM][nD] where n is an integer value. +// At least one of YMD must be specified +// The separator for the interval is /. Interval examples +// YYYY/ (from YYYY) YYYY-MM-DD/P3Y (3 years after date) etc. +// This returns a pair of y,m,d dates. +struct DateInterval { + int y1;int m1;int d1; int y2;int m2;int d2; +}; +bool parsedateinterval(const string&s, DateInterval *di); +int monthdays(int mon, int year); + /** * Parse input string into list of strings. *