Web indexing: add a parameter to specify at what frequency a page should be stored again: none/day/week/month/year. Previously, only one instance could be kept, which is still the default

This commit is contained in:
Jean-Francois Dockes 2021-03-24 10:58:41 +01:00
parent 9eac638bb9
commit 8fde38975a
4 changed files with 89 additions and 15 deletions

View File

@ -173,14 +173,25 @@ public:
// Initialize. Compute paths and create a temporary directory that will be
// used by internfile()
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db,
DbIxStatusUpdater *updfunc)
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
m_nocacheindex(false)
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
    : m_config(cnf), m_db(db), m_updater(updfunc)
{
    // Queue directory, as configured, normalized with path_catslash().
    m_queuedir = m_config->getWebQueueDir();
    path_catslash(m_queuedir);
    m_cache = new WebStore(cnf);

    // Optional "webcachekeepinterval" parameter. An empty value leaves
    // m_keepinterval at its in-class default.
    string keepinterval;
    m_config->getConfParam("webcachekeepinterval", keepinterval);
    if (keepinterval.empty()) {
        return;
    }

    // Accepted spellings and the interval each one selects.
    static const struct {
        const char *name;
        KeepInterval value;
    } known[] = {
        {"day", WQKI_DAY},
        {"week", WQKI_WEEK},
        {"month", WQKI_MONTH},
        {"year", WQKI_YEAR},
    };
    for (const auto& entry : known) {
        if (keepinterval == entry.name) {
            m_keepinterval = entry.value;
            return;
        }
    }

    // Non-empty but unrecognized: report it and keep the default.
    LOGERR("WebQueueIndexer: bad value for keepinterval: " << keepinterval << "\n");
}
WebQueueIndexer::~WebQueueIndexer()
@ -361,10 +372,35 @@ bool WebQueueIndexer::indexFiles(list<string>& files)
return true;
}
// Format the current local time with a strftime() format string.
//
// @param fmt strftime() conversion specification, e.g. "%Y%j".
// @return the formatted text, or an empty string if the local time could
//     not be computed or the result does not fit the internal buffer.
static std::string date_string(const char *fmt)
{
    const time_t now = time(nullptr);
    struct tm tmb;
    if (localtime_r(&now, &tmb) == nullptr) {
        return std::string();
    }
    char buf[200];
    // strftime() returns 0 and leaves the buffer contents unspecified when
    // the output does not fit: only use the bytes it reports as written
    // instead of treating buf as a NUL-terminated string.
    const size_t len = strftime(buf, sizeof(buf), fmt, &tmb);
    return std::string(buf, len);
}
// Current local date as calendar year followed by day of the year
// (strftime "%Y%j").
static std::string yearday()
{
    static const char fmt[] = "%Y%j";
    return date_string(fmt);
}
// Current local date as ISO 8601 week-based year followed by ISO week
// number.
//
// %V (ISO week number) must be paired with %G (ISO week-based year), not
// with %Y (calendar year): around New Year the ISO week can belong to the
// previous or next year, so "%Y%V" would produce a key colliding with a
// different week (e.g. 2024-12-30 would give "202401", the same value as
// the first week of January 2024).
static std::string yearweek()
{
    return date_string("%G%V");
}
// Current local date as calendar year followed by month number
// (strftime "%Y%m").
static std::string yearmonth()
{
    static const char fmt[] = "%Y%m";
    return date_string(fmt);
}
// Current local calendar year (strftime "%Y").
static std::string yearyear()
{
    static const char fmt[] = "%Y";
    return date_string(fmt);
}
FsTreeWalker::Status
WebQueueIndexer::processone(const string &path,
const struct PathStat *stp,
FsTreeWalker::CbFlag flg)
WebQueueIndexer::processone(
const string &path, const struct PathStat *stp, FsTreeWalker::CbFlag flg)
{
if (!m_db) //??
return FsTreeWalker::FtwError;
@ -389,6 +425,17 @@ WebQueueIndexer::processone(const string &path,
// Have to use the hit type for the udi, because the same url can exist
// as a bookmark or a page.
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
// If a keep interval is configured, append a date stamp of the matching
// granularity to the udi path, so that the udi changes from one interval
// to the next. Nothing is appended for the default (WQKI_NONE).
// !! is an arbitrary separator rather unlikely to be found in urls.
switch (m_keepinterval) {
case WQKI_DAY: udipath = udipath + "!!" + yearday(); break;
case WQKI_WEEK: udipath = udipath + "!!" + yearweek(); break;
case WQKI_MONTH: udipath = udipath + "!!" + yearmonth(); break;
case WQKI_YEAR: udipath = udipath + "!!" + yearyear(); break;
default: break;
}
make_udi(udipath, cstr_null, udi);
LOGDEB("WebQueueIndexer: prc1: udi [" << udi << "]\n");

View File

@ -65,13 +65,19 @@ public:
bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data,
std::string *hittype = 0);
private:
RclConfig *m_config;
Rcl::Db *m_db;
WebStore *m_cache;
RclConfig *m_config{nullptr};
Rcl::Db *m_db{nullptr};
WebStore *m_cache{nullptr};
std::string m_queuedir;
DbIxStatusUpdater *m_updater;
bool m_nocacheindex;
DbIxStatusUpdater *m_updater{nullptr};
// Don't process the cache. Set by indexFiles().
bool m_nocacheindex{false};
// Config: page erase interval. We normally keep only one
// instance. This can be set to "day", "week", "month", "year" to
// keep more.
enum KeepInterval {WQKI_NONE, WQKI_DAY, WQKI_WEEK, WQKI_MONTH, WQKI_YEAR};
KeepInterval m_keepinterval{WQKI_NONE};
bool indexFromCache(const std::string& udi);
void updstatus(const std::string& udi);
};

View File

@ -347,8 +347,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
m_w->enableLink(bparam, cparam);
cparam = m_w->addParam(
idx, ConfTabsW::CFPT_INT, "webcachemaxmbs",
tr("Max. size for the web store (MB)"),
idx, ConfTabsW::CFPT_INT, "webcachemaxmbs", tr("Max. size for the web store (MB)"),
tr("Entries will be recycled once the size is reached."
"<br>"
"Only increasing the size really makes sense because "
@ -356,6 +355,18 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
"file (only waste space at the end)."
), -1, 1000*1000); // Max 1TB...
m_w->enableLink(bparam, cparam);
QStringList intervals{"", "day", "week", "month", "year"};
cparam = m_w->addParam(
idx, ConfTabsW::CFPT_CSTR, "webcachekeepinterval", tr("Page recycle interval"),
tr("<p>By default, only one instance of an URL is kept in the cache. This "
"can be changed by setting this to a value determining at what frequency "
"we keep multiple instances ('day', 'week', 'month', 'year'). "
"Note that increasing the interval will not erase existing entries."),
0, 0, &intervals);
m_w->enableLink(bparam, cparam);
int64_t sz = -1;
auto ws = std::unique_ptr<WebStore>(new WebStore(m_rclconf));
sz = ws->cc()->size();

View File

@ -545,6 +545,16 @@ webcachemaxmbs = 40
# then moved by a script to webqueuedir.</descr></var>
#webdownloadsdir = ~/Downloads
# <var name="webcachekeepinterval" type="string">
#
# <brief>Page recycle interval</brief>
# <descr>By default, only one instance of a URL is kept in the cache. This
# can be changed by setting this to a value determining at what frequency
# we keep multiple instances ('day', 'week', 'month',
# 'year'). Leaving the value empty keeps the default. Note that increasing
# the interval will not erase existing entries.</descr></var>
#webcachekeepinterval=
# <var name="aspellDicDir" type="dfn">
#
# <brief>Aspell dictionary storage directory location.</brief> <descr>The