Web indexing: add a parameter to specify at what frequency a page should be stored again: none/day/week/month/year. Previously, only one instance could be kept, which is still the default
This commit is contained in:
parent
9eac638bb9
commit
8fde38975a
@ -173,14 +173,25 @@ public:
|
|||||||
|
|
||||||
// Initialize. Compute paths and create a temporary directory that will be
|
// Initialize. Compute paths and create a temporary directory that will be
|
||||||
// used by internfile()
|
// used by internfile()
|
||||||
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
||||||
DbIxStatusUpdater *updfunc)
|
: m_config(cnf), m_db(db), m_updater(updfunc)
|
||||||
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
|
|
||||||
m_nocacheindex(false)
|
|
||||||
{
|
{
|
||||||
m_queuedir = m_config->getWebQueueDir();
|
m_queuedir = m_config->getWebQueueDir();
|
||||||
path_catslash(m_queuedir);
|
path_catslash(m_queuedir);
|
||||||
m_cache = new WebStore(cnf);
|
m_cache = new WebStore(cnf);
|
||||||
|
string keepinterval;
|
||||||
|
m_config->getConfParam("webcachekeepinterval", keepinterval);
|
||||||
|
if (keepinterval == "day") {
|
||||||
|
m_keepinterval = WQKI_DAY;
|
||||||
|
} else if (keepinterval == "week") {
|
||||||
|
m_keepinterval = WQKI_WEEK;
|
||||||
|
} else if (keepinterval == "month") {
|
||||||
|
m_keepinterval = WQKI_MONTH;
|
||||||
|
} else if (keepinterval == "year") {
|
||||||
|
m_keepinterval = WQKI_YEAR;
|
||||||
|
} else if (!keepinterval.empty()) {
|
||||||
|
LOGERR("WebQueueIndexer: bad value for keepinterval: " << keepinterval << "\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
WebQueueIndexer::~WebQueueIndexer()
|
WebQueueIndexer::~WebQueueIndexer()
|
||||||
@ -361,10 +372,35 @@ bool WebQueueIndexer::indexFiles(list<string>& files)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Format the current local time according to the strftime() format string
// fmt and return the result.
//
// Returns an empty string if fmt is null, if the local time cannot be
// computed, or if the formatted output does not fit in the internal buffer.
// In the last case strftime() returns 0 and leaves the buffer contents
// unspecified, so the buffer must not be returned as is.
static std::string date_string(const char *fmt)
{
    if (fmt == nullptr) {
        return std::string();
    }
    time_t now = time(nullptr);
    struct tm tmb;
    if (localtime_r(&now, &tmb) == nullptr) {
        return std::string();
    }
    char buf[200];
    // The size passed to strftime() includes the terminating null byte, so
    // the whole buffer can be offered.
    size_t len = strftime(buf, sizeof(buf), fmt, &tmb);
    return std::string(buf, len);
}
|
||||||
|
static std::string yearday()
|
||||||
|
{
|
||||||
|
return date_string("%Y%j");
|
||||||
|
}
|
||||||
|
static std::string yearweek()
|
||||||
|
{
|
||||||
|
return date_string("%Y%V");
|
||||||
|
}
|
||||||
|
static std::string yearmonth()
|
||||||
|
{
|
||||||
|
return date_string("%Y%m");
|
||||||
|
}
|
||||||
|
static std::string yearyear()
|
||||||
|
{
|
||||||
|
return date_string("%Y");
|
||||||
|
}
|
||||||
|
|
||||||
FsTreeWalker::Status
|
FsTreeWalker::Status
|
||||||
WebQueueIndexer::processone(const string &path,
|
WebQueueIndexer::processone(
|
||||||
const struct PathStat *stp,
|
const string &path, const struct PathStat *stp, FsTreeWalker::CbFlag flg)
|
||||||
FsTreeWalker::CbFlag flg)
|
|
||||||
{
|
{
|
||||||
if (!m_db) //??
|
if (!m_db) //??
|
||||||
return FsTreeWalker::FtwError;
|
return FsTreeWalker::FtwError;
|
||||||
@ -389,6 +425,17 @@ WebQueueIndexer::processone(const string &path,
|
|||||||
// Have to use the hit type for the udi, because the same url can exist
|
// Have to use the hit type for the udi, because the same url can exist
|
||||||
// as a bookmark or a page.
|
// as a bookmark or a page.
|
||||||
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
|
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
|
||||||
|
// !! is an arbitrary separator rather unlikely to be found in urls.
|
||||||
|
switch (m_keepinterval) {
|
||||||
|
case WQKI_DAY: udipath = udipath + "!!" + yearday(); break;
|
||||||
|
case WQKI_WEEK: udipath = udipath + "!!" + yearweek(); break;
|
||||||
|
case WQKI_MONTH: udipath= udipath + "!!" + yearmonth(); break;
|
||||||
|
case WQKI_YEAR: udipath = udipath + "!!" + yearyear(); break;
|
||||||
|
default: break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The date suffix (if any) depends on the configured keep interval: we store one page copy per day, week, month or year, or a single copy if no interval is set.
|
||||||
|
std::cerr << "UDI: " << udipath << "\n";
|
||||||
make_udi(udipath, cstr_null, udi);
|
make_udi(udipath, cstr_null, udi);
|
||||||
|
|
||||||
LOGDEB("WebQueueIndexer: prc1: udi [" << udi << "]\n");
|
LOGDEB("WebQueueIndexer: prc1: udi [" << udi << "]\n");
|
||||||
|
|||||||
@ -65,12 +65,18 @@ public:
|
|||||||
bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data,
|
bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data,
|
||||||
std::string *hittype = 0);
|
std::string *hittype = 0);
|
||||||
private:
|
private:
|
||||||
RclConfig *m_config;
|
RclConfig *m_config{nullptr};
|
||||||
Rcl::Db *m_db;
|
Rcl::Db *m_db{nullptr};
|
||||||
WebStore *m_cache;
|
WebStore *m_cache{nullptr};
|
||||||
std::string m_queuedir;
|
std::string m_queuedir;
|
||||||
DbIxStatusUpdater *m_updater;
|
DbIxStatusUpdater *m_updater{nullptr};
|
||||||
bool m_nocacheindex;
|
// Don't process the cache. Set by indexFiles().
|
||||||
|
bool m_nocacheindex{false};
|
||||||
|
// Config: page erase interval. We normally keep only one
|
||||||
|
// instance. This can be set to "day", "week", "month", "year" to
|
||||||
|
// keep more.
|
||||||
|
enum KeepInterval {WQKI_NONE, WQKI_DAY, WQKI_WEEK, WQKI_MONTH, WQKI_YEAR};
|
||||||
|
KeepInterval m_keepinterval{WQKI_NONE};
|
||||||
|
|
||||||
bool indexFromCache(const std::string& udi);
|
bool indexFromCache(const std::string& udi);
|
||||||
void updstatus(const std::string& udi);
|
void updstatus(const std::string& udi);
|
||||||
|
|||||||
@ -347,8 +347,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
|
|||||||
m_w->enableLink(bparam, cparam);
|
m_w->enableLink(bparam, cparam);
|
||||||
|
|
||||||
cparam = m_w->addParam(
|
cparam = m_w->addParam(
|
||||||
idx, ConfTabsW::CFPT_INT, "webcachemaxmbs",
|
idx, ConfTabsW::CFPT_INT, "webcachemaxmbs", tr("Max. size for the web store (MB)"),
|
||||||
tr("Max. size for the web store (MB)"),
|
|
||||||
tr("Entries will be recycled once the size is reached."
|
tr("Entries will be recycled once the size is reached."
|
||||||
"<br>"
|
"<br>"
|
||||||
"Only increasing the size really makes sense because "
|
"Only increasing the size really makes sense because "
|
||||||
@ -356,6 +355,18 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
|
|||||||
"file (only waste space at the end)."
|
"file (only waste space at the end)."
|
||||||
), -1, 1000*1000); // Max 1TB...
|
), -1, 1000*1000); // Max 1TB...
|
||||||
m_w->enableLink(bparam, cparam);
|
m_w->enableLink(bparam, cparam);
|
||||||
|
|
||||||
|
QStringList intervals{"", "day", "week", "month", "year"};
|
||||||
|
cparam = m_w->addParam(
|
||||||
|
idx, ConfTabsW::CFPT_CSTR, "webcachekeepinterval", tr("Page recycle interval"),
|
||||||
|
tr("<p>By default, only one instance of an URL is kept in the cache. This "
|
||||||
|
"can be changed by setting this to a value determining at what frequency "
|
||||||
|
"we keep multiple instances ('day', 'week', 'month', 'year'). "
|
||||||
|
"Note that increasing the interval will not erase existing entries."),
|
||||||
|
0, 0, &intervals);
|
||||||
|
m_w->enableLink(bparam, cparam);
|
||||||
|
|
||||||
|
|
||||||
int64_t sz = -1;
|
int64_t sz = -1;
|
||||||
auto ws = std::unique_ptr<WebStore>(new WebStore(m_rclconf));
|
auto ws = std::unique_ptr<WebStore>(new WebStore(m_rclconf));
|
||||||
sz = ws->cc()->size();
|
sz = ws->cc()->size();
|
||||||
|
|||||||
@ -545,6 +545,16 @@ webcachemaxmbs = 40
|
|||||||
# then moved by a script to webqueuedir.</descr></var>
|
# then moved by a script to webqueuedir.</descr></var>
|
||||||
#webdownloadsdir = ~/Downloads
|
#webdownloadsdir = ~/Downloads
|
||||||
|
|
||||||
|
# <var name="webcachekeepinterval" type="string">
|
||||||
|
#
|
||||||
|
# <brief>Page recycle interval</brief>
|
||||||
|
# <descr>By default, only one instance of a URL is kept in the cache. This
|
||||||
|
# can be changed by setting this to a value determining at what frequency
|
||||||
|
# we keep multiple instances ('day', 'week', 'month',
|
||||||
|
# 'year'). Note that increasing the interval will not erase existing
|
||||||
|
# entries.</descr></var>
|
||||||
|
#webcachekeepinterval=
|
||||||
|
|
||||||
# <var name="aspellDicDir" type="dfn">
|
# <var name="aspellDicDir" type="dfn">
|
||||||
#
|
#
|
||||||
# <brief>Aspell dictionary storage directory location.</brief> <descr>The
|
# <brief>Aspell dictionary storage directory location.</brief> <descr>The
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user