From 8fde38975ac42e822d698eab28e73995055d8232 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 24 Mar 2021 10:58:41 +0100 Subject: [PATCH] Web indexing: add a parameter to specify at what frequency a page should be stored again: none/day/week/month/year. Previously, only one instance could be kept, which is still the default --- src/index/webqueue.cpp | 61 ++++++++++++++++++++++++++---- src/index/webqueue.h | 18 ++++++--- src/qtgui/confgui/confguiindex.cpp | 15 +++++++- src/sampleconf/recoll.conf | 10 +++++ 4 files changed, 89 insertions(+), 15 deletions(-) diff --git a/src/index/webqueue.cpp b/src/index/webqueue.cpp index ac872f80..4ed890e1 100644 --- a/src/index/webqueue.cpp +++ b/src/index/webqueue.cpp @@ -173,14 +173,25 @@ public: // Initialize. Compute paths and create a temporary directory that will be // used by internfile() -WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, - DbIxStatusUpdater *updfunc) - : m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc), - m_nocacheindex(false) +WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) + : m_config(cnf), m_db(db), m_updater(updfunc) { m_queuedir = m_config->getWebQueueDir(); path_catslash(m_queuedir); m_cache = new WebStore(cnf); + string keepinterval; + m_config->getConfParam("webcachekeepinterval", keepinterval); + if (keepinterval == "day") { + m_keepinterval = WQKI_DAY; + } else if (keepinterval == "week") { + m_keepinterval = WQKI_WEEK; + } else if (keepinterval == "month") { + m_keepinterval = WQKI_MONTH; + } else if (keepinterval == "year") { + m_keepinterval = WQKI_YEAR; + } else if (!keepinterval.empty()) { + LOGERR("WebQueueIndexer: bad value for keepinterval: " << keepinterval << "\n"); + } } WebQueueIndexer::~WebQueueIndexer() @@ -361,10 +372,35 @@ bool WebQueueIndexer::indexFiles(list& files) return true; } +static std::string date_string(const char *fmt) +{ + time_t now = time(0); + struct tm tmb; + localtime_r(&now, &tmb); + char buf[200]; + strftime(buf, sizeof(buf)-1, fmt, &tmb); + return buf; +} +static std::string yearday() +{ + return date_string("%Y%j"); +} +static std::string yearweek() +{ + return date_string("%Y%V"); +} +static std::string yearmonth() +{ + return date_string("%Y%m"); +} +static std::string yearyear() +{ + return date_string("%Y"); +} + FsTreeWalker::Status -WebQueueIndexer::processone(const string &path, - const struct PathStat *stp, - FsTreeWalker::CbFlag flg) +WebQueueIndexer::processone( + const string &path, const struct PathStat *stp, FsTreeWalker::CbFlag flg) { if (!m_db) //?? return FsTreeWalker::FtwError; @@ -389,6 +425,17 @@ WebQueueIndexer::processone(const string &path, // Have to use the hit type for the udi, because the same url can exist // as a bookmark or a page. udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url)); + // !! is an arbitrary separator rather unlikely to be found in urls. + switch (m_keepinterval) { + case WQKI_DAY: udipath = udipath + "!!" + yearday(); break; + case WQKI_WEEK: udipath = udipath + "!!" + yearweek(); break; + case WQKI_MONTH: udipath= udipath + "!!" + yearmonth(); break; + case WQKI_YEAR: udipath = udipath + "!!" + yearyear(); break; + default: break; + } + + // Also append the current date (year+day): we store one page copy per day + std::cerr << "UDI: " << udipath << "\n"; make_udi(udipath, cstr_null, udi); LOGDEB("WebQueueIndexer: prc1: udi [" << udi << "]\n"); diff --git a/src/index/webqueue.h b/src/index/webqueue.h index 90940399..279c1dde 100644 --- a/src/index/webqueue.h +++ b/src/index/webqueue.h @@ -65,13 +65,19 @@ public: bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data, std::string *hittype = 0); private: - RclConfig *m_config; - Rcl::Db *m_db; - WebStore *m_cache; + RclConfig *m_config{nullptr}; + Rcl::Db *m_db{nullptr}; + WebStore *m_cache{nullptr}; std::string m_queuedir; - DbIxStatusUpdater *m_updater; - bool m_nocacheindex; - + DbIxStatusUpdater *m_updater{nullptr}; + // Don't process the cache. Set by indexFiles(). + bool m_nocacheindex{false}; + // Config: page erase interval. We normally keep only one + // instance. This can be set to "day", "week", "month", "year" to + // keep more. + enum KeepInterval {WQKI_NONE, WQKI_DAY, WQKI_WEEK, WQKI_MONTH, WQKI_YEAR}; + KeepInterval m_keepinterval{WQKI_NONE}; + bool indexFromCache(const std::string& udi); void updstatus(const std::string& udi); }; diff --git a/src/qtgui/confgui/confguiindex.cpp b/src/qtgui/confgui/confguiindex.cpp index d2af32f6..c666b366 100644 --- a/src/qtgui/confgui/confguiindex.cpp +++ b/src/qtgui/confgui/confguiindex.cpp @@ -347,8 +347,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx) m_w->enableLink(bparam, cparam); cparam = m_w->addParam( - idx, ConfTabsW::CFPT_INT, "webcachemaxmbs", - tr("Max. size for the web store (MB)"), + idx, ConfTabsW::CFPT_INT, "webcachemaxmbs", tr("Max. size for the web store (MB)"), tr("Entries will be recycled once the size is reached." "
" "Only increasing the size really makes sense because " @@ -356,6 +355,18 @@ bool ConfIndexW::setupWebHistoryPanel(int idx) "file (only waste space at the end)." ), -1, 1000*1000); // Max 1TB... m_w->enableLink(bparam, cparam); + + QStringList intervals{"", "day", "week", "month", "year"}; + cparam = m_w->addParam( + idx, ConfTabsW::CFPT_CSTR, "webcachekeepinterval", tr("Page recycle interval"), + tr("

By default, only one instance of an URL is kept in the cache. This " + "can be changed by setting this to a value determining at what frequency " + "we keep multiple instances ('day', 'week', 'month', 'year'). " + "Note that increasing the interval will not erase existing entries."), + 0, 0, &intervals); + m_w->enableLink(bparam, cparam); + + int64_t sz = -1; auto ws = std::unique_ptr(new WebStore(m_rclconf)); sz = ws->cc()->size(); diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index da2baaec..29e3caac 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -545,6 +545,16 @@ webcachemaxmbs = 40 # then moved by a script to webqueuedir. #webdownloadsdir = ~/Downloads +# +# +# Page recycle interval +# By default, only one instance of an URL is kept in the cache. This +# can be changed by setting this to a value determining at what frequency +# we keep multiple instances ('day', 'week', 'month', +# 'year'). Note that increasing the interval will not erase existing +# entries. +#webcachekeepinterval= + # # # Aspell dictionary storage directory location. The