Web indexing: add a parameter specifying how often a new copy of a page should be stored: none/day/week/month/year. Previously, only one instance could be kept; this is still the default.
This commit is contained in:
parent
9eac638bb9
commit
8fde38975a
@ -173,14 +173,25 @@ public:
|
||||
|
||||
// Initialize. Compute paths and create a temporary directory that will be
|
||||
// used by internfile()
|
||||
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
||||
DbIxStatusUpdater *updfunc)
|
||||
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
|
||||
m_nocacheindex(false)
|
||||
// Build the indexer: record the configuration, database and status updater,
// compute the queue directory path (with a trailing slash), create the web
// store object, and read the "webcachekeepinterval" configuration parameter.
//
// An empty or unset "webcachekeepinterval" leaves m_keepinterval at its
// in-class default. An unrecognized non-empty value is reported through
// LOGERR and also leaves the default in place.
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
    : m_config(cnf), m_db(db), m_updater(updfunc)
{
    m_queuedir = m_config->getWebQueueDir();
    path_catslash(m_queuedir);
    m_cache = new WebStore(cnf);

    string keepinterval;
    m_config->getConfParam("webcachekeepinterval", keepinterval);
    if (keepinterval.empty()) {
        // Nothing configured: keep the default.
        return;
    }

    // Accepted configuration values and the matching enum constants.
    static const struct {
        const char *name;
        KeepInterval value;
    } known[] = {
        {"day", WQKI_DAY},
        {"week", WQKI_WEEK},
        {"month", WQKI_MONTH},
        {"year", WQKI_YEAR},
    };
    for (const auto& entry : known) {
        if (keepinterval == entry.name) {
            m_keepinterval = entry.value;
            return;
        }
    }
    LOGERR("WebQueueIndexer: bad value for keepinterval: " << keepinterval << "\n");
}
|
||||
|
||||
WebQueueIndexer::~WebQueueIndexer()
|
||||
@ -361,10 +372,35 @@ bool WebQueueIndexer::indexFiles(list<string>& files)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Format the current local time with strftime().
//
// @param fmt strftime() format string.
// @return the formatted date. An empty string is returned if the local time
//     could not be computed, or if strftime() produced nothing (empty
//     result, or result too long for the internal buffer).
static std::string date_string(const char *fmt)
{
    time_t now = time(nullptr);
    struct tm tmb;
    if (localtime_r(&now, &tmb) == nullptr) {
        return std::string();
    }
    char buf[200];
    // strftime() returns 0 and leaves the buffer contents unspecified when
    // the result does not fit: only use the byte count it reports instead of
    // relying on the buffer being null-terminated.
    const size_t len = strftime(buf, sizeof(buf), fmt, &tmb);
    return std::string(buf, len);
}
|
||||
static std::string yearday()
|
||||
{
|
||||
return date_string("%Y%j");
|
||||
}
|
||||
static std::string yearweek()
|
||||
{
|
||||
return date_string("%Y%V");
|
||||
}
|
||||
static std::string yearmonth()
|
||||
{
|
||||
return date_string("%Y%m");
|
||||
}
|
||||
static std::string yearyear()
|
||||
{
|
||||
return date_string("%Y");
|
||||
}
|
||||
|
||||
FsTreeWalker::Status
|
||||
WebQueueIndexer::processone(const string &path,
|
||||
const struct PathStat *stp,
|
||||
FsTreeWalker::CbFlag flg)
|
||||
WebQueueIndexer::processone(
|
||||
const string &path, const struct PathStat *stp, FsTreeWalker::CbFlag flg)
|
||||
{
|
||||
if (!m_db) //??
|
||||
return FsTreeWalker::FtwError;
|
||||
@ -389,6 +425,17 @@ WebQueueIndexer::processone(const string &path,
|
||||
// Have to use the hit type for the udi, because the same url can exist
|
||||
// as a bookmark or a page.
|
||||
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
|
||||
// !! is an arbitrary separator rather unlikely to be found in urls.
|
||||
switch (m_keepinterval) {
|
||||
case WQKI_DAY: udipath = udipath + "!!" + yearday(); break;
|
||||
case WQKI_WEEK: udipath = udipath + "!!" + yearweek(); break;
|
||||
case WQKI_MONTH: udipath= udipath + "!!" + yearmonth(); break;
|
||||
case WQKI_YEAR: udipath = udipath + "!!" + yearyear(); break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
// Debug trace: print the string the UDI is computed from (hit type and url path, plus the keep-interval suffix if one was appended above)
|
||||
std::cerr << "UDI: " << udipath << "\n";
|
||||
make_udi(udipath, cstr_null, udi);
|
||||
|
||||
LOGDEB("WebQueueIndexer: prc1: udi [" << udi << "]\n");
|
||||
|
||||
@ -65,13 +65,19 @@ public:
|
||||
bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data,
|
||||
std::string *hittype = 0);
|
||||
private:
|
||||
RclConfig *m_config;
|
||||
Rcl::Db *m_db;
|
||||
WebStore *m_cache;
|
||||
RclConfig *m_config{nullptr};
|
||||
Rcl::Db *m_db{nullptr};
|
||||
WebStore *m_cache{nullptr};
|
||||
std::string m_queuedir;
|
||||
DbIxStatusUpdater *m_updater;
|
||||
bool m_nocacheindex;
|
||||
|
||||
DbIxStatusUpdater *m_updater{nullptr};
|
||||
// Don't process the cache. Set by indexFiles().
|
||||
bool m_nocacheindex{false};
|
||||
// Config: page erase interval. We normally keep only one
|
||||
// instance. This can be set to "day", "week", "month", "year" to
|
||||
// keep more.
|
||||
enum KeepInterval {WQKI_NONE, WQKI_DAY, WQKI_WEEK, WQKI_MONTH, WQKI_YEAR};
|
||||
KeepInterval m_keepinterval{WQKI_NONE};
|
||||
|
||||
bool indexFromCache(const std::string& udi);
|
||||
void updstatus(const std::string& udi);
|
||||
};
|
||||
|
||||
@ -347,8 +347,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
|
||||
m_w->enableLink(bparam, cparam);
|
||||
|
||||
cparam = m_w->addParam(
|
||||
idx, ConfTabsW::CFPT_INT, "webcachemaxmbs",
|
||||
tr("Max. size for the web store (MB)"),
|
||||
idx, ConfTabsW::CFPT_INT, "webcachemaxmbs", tr("Max. size for the web store (MB)"),
|
||||
tr("Entries will be recycled once the size is reached."
|
||||
"<br>"
|
||||
"Only increasing the size really makes sense because "
|
||||
@ -356,6 +355,18 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
|
||||
"file (only waste space at the end)."
|
||||
), -1, 1000*1000); // Max 1TB...
|
||||
m_w->enableLink(bparam, cparam);
|
||||
|
||||
QStringList intervals{"", "day", "week", "month", "year"};
|
||||
cparam = m_w->addParam(
|
||||
idx, ConfTabsW::CFPT_CSTR, "webcachekeepinterval", tr("Page recycle interval"),
|
||||
tr("<p>By default, only one instance of an URL is kept in the cache. This "
|
||||
"can be changed by setting this to a value determining at what frequency "
|
||||
"we keep multiple instances ('day', 'week', 'month', 'year'). "
|
||||
"Note that increasing the interval will not erase existing entries."),
|
||||
0, 0, &intervals);
|
||||
m_w->enableLink(bparam, cparam);
|
||||
|
||||
|
||||
int64_t sz = -1;
|
||||
auto ws = std::unique_ptr<WebStore>(new WebStore(m_rclconf));
|
||||
sz = ws->cc()->size();
|
||||
|
||||
@ -545,6 +545,16 @@ webcachemaxmbs = 40
|
||||
# then moved by a script to webqueuedir.</descr></var>
|
||||
#webdownloadsdir = ~/Downloads
|
||||
|
||||
# <var name="webcachekeepinterval" type="string">
|
||||
#
|
||||
# <brief>Page recycle interval</brief>
|
||||
# <descr>By default, only one instance of a URL is kept in the cache. This
|
||||
# can be changed by setting this to a value determining at what frequency
|
||||
# we keep multiple instances ('day', 'week', 'month',
|
||||
# 'year'). Note that increasing the interval will not erase existing
|
||||
# entries.</descr></var>
|
||||
#webcachekeepinterval=
|
||||
|
||||
# <var name="aspellDicDir" type="dfn">
|
||||
#
|
||||
# <brief>Aspell dictionary storage directory location.</brief> <descr>The
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user