Web indexing: add a parameter to specify at what frequency a page should be stored again: none/day/week/month/year. Previously, only one instance could be kept, which is still the default

This commit is contained in:
Jean-Francois Dockes 2021-03-24 10:58:41 +01:00
parent 9eac638bb9
commit 8fde38975a
4 changed files with 89 additions and 15 deletions

View File

@ -173,14 +173,25 @@ public:
// Initialize. Compute paths and create a temporary directory that will be // Initialize. Compute paths and create a temporary directory that will be
// used by internfile() // used by internfile()
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
DbIxStatusUpdater *updfunc) : m_config(cnf), m_db(db), m_updater(updfunc)
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
m_nocacheindex(false)
{ {
m_queuedir = m_config->getWebQueueDir(); m_queuedir = m_config->getWebQueueDir();
path_catslash(m_queuedir); path_catslash(m_queuedir);
m_cache = new WebStore(cnf); m_cache = new WebStore(cnf);
string keepinterval;
m_config->getConfParam("webcachekeepinterval", keepinterval);
if (keepinterval == "day") {
m_keepinterval = WQKI_DAY;
} else if (keepinterval == "week") {
m_keepinterval = WQKI_WEEK;
} else if (keepinterval == "month") {
m_keepinterval = WQKI_MONTH;
} else if (keepinterval == "year") {
m_keepinterval = WQKI_YEAR;
} else if (!keepinterval.empty()) {
LOGERR("WebQueueIndexer: bad value for keepinterval: " << keepinterval << "\n");
}
} }
WebQueueIndexer::~WebQueueIndexer() WebQueueIndexer::~WebQueueIndexer()
@ -361,10 +372,35 @@ bool WebQueueIndexer::indexFiles(list<string>& files)
return true; return true;
} }
// Format the current local time according to the strftime() format string fmt.
//
// Returns the formatted text, or an empty string if the current time cannot be
// converted to local time or if the result does not fit the internal buffer.
static std::string date_string(const char *fmt)
{
    const time_t now = time(nullptr);
    struct tm tmb;
    if (localtime_r(&now, &tmb) == nullptr) {
        return std::string();
    }
    char buf[200];
    // strftime() returns 0 and leaves the buffer contents unspecified when the
    // output does not fit, so build the result from the returned length rather
    // than relying on a terminating NUL being present.
    const size_t len = strftime(buf, sizeof(buf), fmt, &tmb);
    return std::string(buf, len);
}
// Current local date as the year followed by the day of the year (strftime
// "%Y%j"). Used as the UDI suffix when one page copy is kept per day.
static std::string yearday()
{
    static const char fmt[] = "%Y%j";
    return date_string(fmt);
}
// Current local date as the ISO 8601 week-based year followed by the ISO week
// number. Used as the UDI suffix when one page copy is kept per week.
//
// %V (ISO week number) must be paired with %G (ISO week-based year), not with
// the calendar year %Y: the days around New Year can belong to week 52/53 of
// the previous year or to week 01 of the next one, and %Y%V would then produce
// a key colliding with, or splitting, another week.
static std::string yearweek()
{
    return date_string("%G%V");
}
// Current local date as the year followed by the two-digit month (strftime
// "%Y%m"). Used as the UDI suffix when one page copy is kept per month.
static std::string yearmonth()
{
    static const char fmt[] = "%Y%m";
    return date_string(fmt);
}
// Current local year (strftime "%Y"). Used as the UDI suffix when one page
// copy is kept per year.
static std::string yearyear()
{
    static const char fmt[] = "%Y";
    return date_string(fmt);
}
FsTreeWalker::Status FsTreeWalker::Status
WebQueueIndexer::processone(const string &path, WebQueueIndexer::processone(
const struct PathStat *stp, const string &path, const struct PathStat *stp, FsTreeWalker::CbFlag flg)
FsTreeWalker::CbFlag flg)
{ {
if (!m_db) //?? if (!m_db) //??
return FsTreeWalker::FtwError; return FsTreeWalker::FtwError;
@ -389,6 +425,17 @@ WebQueueIndexer::processone(const string &path,
// Have to use the hit type for the udi, because the same url can exist // Have to use the hit type for the udi, because the same url can exist
// as a bookmark or a page. // as a bookmark or a page.
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url)); udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
// !! is an arbitrary separator rather unlikely to be found in urls.
switch (m_keepinterval) {
case WQKI_DAY: udipath = udipath + "!!" + yearday(); break;
case WQKI_WEEK: udipath = udipath + "!!" + yearweek(); break;
case WQKI_MONTH: udipath= udipath + "!!" + yearmonth(); break;
case WQKI_YEAR: udipath = udipath + "!!" + yearyear(); break;
default: break;
}
// The date suffix appended above (if any) depends on the configured keep interval: we store one page copy per interval
std::cerr << "UDI: " << udipath << "\n";
make_udi(udipath, cstr_null, udi); make_udi(udipath, cstr_null, udi);
LOGDEB("WebQueueIndexer: prc1: udi [" << udi << "]\n"); LOGDEB("WebQueueIndexer: prc1: udi [" << udi << "]\n");

View File

@ -65,12 +65,18 @@ public:
bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data, bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data,
std::string *hittype = 0); std::string *hittype = 0);
private: private:
RclConfig *m_config; RclConfig *m_config{nullptr};
Rcl::Db *m_db; Rcl::Db *m_db{nullptr};
WebStore *m_cache; WebStore *m_cache{nullptr};
std::string m_queuedir; std::string m_queuedir;
DbIxStatusUpdater *m_updater; DbIxStatusUpdater *m_updater{nullptr};
bool m_nocacheindex; // Don't process the cache. Set by indexFiles().
bool m_nocacheindex{false};
// Config: page erase interval. We normally keep only one
// instance. This can be set to "day", "week", "month", "year" to
// keep more.
enum KeepInterval {WQKI_NONE, WQKI_DAY, WQKI_WEEK, WQKI_MONTH, WQKI_YEAR};
KeepInterval m_keepinterval{WQKI_NONE};
bool indexFromCache(const std::string& udi); bool indexFromCache(const std::string& udi);
void updstatus(const std::string& udi); void updstatus(const std::string& udi);

View File

@ -347,8 +347,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
m_w->enableLink(bparam, cparam); m_w->enableLink(bparam, cparam);
cparam = m_w->addParam( cparam = m_w->addParam(
idx, ConfTabsW::CFPT_INT, "webcachemaxmbs", idx, ConfTabsW::CFPT_INT, "webcachemaxmbs", tr("Max. size for the web store (MB)"),
tr("Max. size for the web store (MB)"),
tr("Entries will be recycled once the size is reached." tr("Entries will be recycled once the size is reached."
"<br>" "<br>"
"Only increasing the size really makes sense because " "Only increasing the size really makes sense because "
@ -356,6 +355,18 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
"file (only waste space at the end)." "file (only waste space at the end)."
), -1, 1000*1000); // Max 1TB... ), -1, 1000*1000); // Max 1TB...
m_w->enableLink(bparam, cparam); m_w->enableLink(bparam, cparam);
QStringList intervals{"", "day", "week", "month", "year"};
cparam = m_w->addParam(
idx, ConfTabsW::CFPT_CSTR, "webcachekeepinterval", tr("Page recycle interval"),
tr("<p>By default, only one instance of an URL is kept in the cache. This "
"can be changed by setting this to a value determining at what frequency "
"we keep multiple instances ('day', 'week', 'month', 'year'). "
"Note that increasing the interval will not erase existing entries."),
0, 0, &intervals);
m_w->enableLink(bparam, cparam);
int64_t sz = -1; int64_t sz = -1;
auto ws = std::unique_ptr<WebStore>(new WebStore(m_rclconf)); auto ws = std::unique_ptr<WebStore>(new WebStore(m_rclconf));
sz = ws->cc()->size(); sz = ws->cc()->size();

View File

@ -545,6 +545,16 @@ webcachemaxmbs = 40
# then moved by a script to webqueuedir.</descr></var> # then moved by a script to webqueuedir.</descr></var>
#webdownloadsdir = ~/Downloads #webdownloadsdir = ~/Downloads
# <var name="webcachekeepinterval" type="string">
#
# <brief>Page recycle interval</brief>
# <descr>By default, only one instance of a URL is kept in the cache. This
# can be changed by setting this to 'day', 'week', 'month' or 'year', in
# which case one instance of the page is kept for each such period. Note
# that increasing the interval will not erase existing
# entries.</descr></var>
#webcachekeepinterval=
# <var name="aspellDicDir" type="dfn"> # <var name="aspellDicDir" type="dfn">
# #
# <brief>Aspell dictionary storage directory location.</brief> <descr>The # <brief>Aspell dictionary storage directory location.</brief> <descr>The