This commit is contained in:
parent
d14601bde9
commit
0fe1574439
246
src/index/beaglequeue.cpp
Normal file
246
src/index/beaglequeue.cpp
Normal file
@ -0,0 +1,246 @@
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef lint
|
||||
// Revision-control identification string; left out of the build when `lint`
// is defined (see the enclosing #ifndef).
static char rcsid[] = "@(#$Id: $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
#include "autoconfig.h"
|
||||
#include "pathut.h"
|
||||
#include "debuglog.h"
|
||||
#include "fstreewalk.h"
|
||||
#include "beaglequeue.h"
|
||||
#include "smallut.h"
|
||||
#include "fileudi.h"
|
||||
#include "internfile.h"
|
||||
#include "wipedir.h"
|
||||
#include "circache.h"
|
||||
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
using namespace std;
|
||||
|
||||
#include <sys/stat.h>
|
||||
|
||||
// Key under which the hit type read from a beagle dot file (second line of
// the file) is stored in Rcl::Doc::meta.
const string keybght("beagleHitType");
|
||||
|
||||
#define LL 2048
|
||||
|
||||
class BeagleDotFile {
|
||||
public:
|
||||
BeagleDotFile(RclConfig *conf, const string& fn)
|
||||
: m_conf(conf), m_fn(fn)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
bool readLine(string& line)
|
||||
{
|
||||
char cline[LL];
|
||||
cline[0] = 0;
|
||||
m_input.getline(cline, LL-1);
|
||||
if (!m_input.good()) {
|
||||
if (m_input.bad()) {
|
||||
LOGERR(("beagleDotFileRead: input.bad()\n"));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
int ll = strlen(cline);
|
||||
while (ll > 0 && (cline[ll-1] == '\n' || cline[ll-1] == '\r')) {
|
||||
cline[ll-1] = 0;
|
||||
ll--;
|
||||
}
|
||||
line.assign(cline, ll);
|
||||
LOGDEB2(("BeagleDotFile:readLine: [%s]\n", line.c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Process a beagle dot file and set interesting stuff in the doc
|
||||
bool toDoc(Rcl::Doc& doc)
|
||||
{
|
||||
string line;
|
||||
|
||||
m_input.open(m_fn.c_str(), ios::in);
|
||||
if (!m_input.good()) {
|
||||
LOGERR(("BeagleDotFile: open failed for [%s]\n", m_fn.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read the 3 first lines:
|
||||
// - url
|
||||
// - hit type: we only know about Bookmark and WebHistory for now
|
||||
// - content-type.
|
||||
if (!readLine(line))
|
||||
return false;
|
||||
doc.url = line;
|
||||
if (!readLine(line))
|
||||
return false;
|
||||
doc.meta[keybght] = line;
|
||||
if (!readLine(line))
|
||||
return false;
|
||||
doc.mimetype = line;
|
||||
|
||||
if (doc.mimetype.empty() &&
|
||||
!stringlowercmp("bookmark", doc.meta[keybght]))
|
||||
doc.mimetype = "text/plain";
|
||||
|
||||
string confstr;
|
||||
string ss(" ");
|
||||
// Read the rest: fields and keywords
|
||||
for (;;) {
|
||||
if (!readLine(line)) {
|
||||
// Eof hopefully
|
||||
break;
|
||||
}
|
||||
if (line.find("t:") != 0)
|
||||
continue;
|
||||
line = line.substr(2);
|
||||
confstr += line + "\n";
|
||||
}
|
||||
|
||||
ConfSimple fields(confstr, 1);
|
||||
list<string> names = fields.getNames("");
|
||||
for (list<string>::iterator it = names.begin();
|
||||
it != names.end(); it++) {
|
||||
string value;
|
||||
fields.get(*it, value, "");
|
||||
if (!value.compare("undefined") || !value.compare("null"))
|
||||
continue;
|
||||
string caname = m_conf->fieldCanon(*it);
|
||||
doc.meta[caname].append(ss + value);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
RclConfig *m_conf;
|
||||
string m_fn;
|
||||
ifstream m_input;
|
||||
};
|
||||
|
||||
// Sentinel stored in m_tmpdir when the temporary directory could not be
// created: the destructor compares against it to skip the cleanup.
const string badtmpdirname = "/no/such/dir/really/can/exist";
|
||||
/**
 * Set up the queue indexer: determine the queue directory, create a
 * temporary directory and open the index for update.
 *
 * Failures are only logged here: a temporary directory creation failure
 * leaves m_tmpdir set to badtmpdirname, and a database open failure leaves
 * the constructed object with a database which could not be opened.
 *
 * @param cnf configuration object. Not owned.
 */
BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf)
    : m_config(cnf), m_db(cnf)
{
    // Queue location: "beaglequeuedir" configuration value, else the default.
    if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
        m_queuedir = path_tildexpand("~/.beagle/ToIndex");

    // m_tmpdir is default-constructed, so it is empty at this point and the
    // directory always gets created.
    if (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) {
        string reason;
        if (!maketmpdir(m_tmpdir, reason)) {
            LOGERR(("BeagleQueueIndexer: cannot create temporary directory: "
                    "%s\n", reason.c_str()));
            m_tmpdir = badtmpdirname;
        }
    }

    if (!m_db.open(Rcl::Db::DbUpd)) {
        LOGERR(("BeagleQueueIndexer: error opening database %s\n",
                m_config->getDbDir().c_str()));
    }
}
|
||||
|
||||
/**
 * Empty and remove the temporary directory, unless it was never created
 * (empty name or badtmpdirname sentinel), then close the database.
 */
BeagleQueueIndexer::~BeagleQueueIndexer()
{
    LOGDEB(("BeagleQueueIndexer::~\n"));

    const bool havetmpdir = !m_tmpdir.empty() && m_tmpdir != badtmpdirname;
    if (havetmpdir) {
        // Remove the contents first, then the directory itself.
        wipedir(m_tmpdir);
        const int rmres = rmdir(m_tmpdir.c_str());
        if (rmres < 0) {
            LOGERR(("BeagleQueueIndexer::~: cannot clear temp dir %s\n",
                    m_tmpdir.c_str()));
        }
    }

    m_db.close();
}
|
||||
|
||||
bool BeagleQueueIndexer::processqueue()
|
||||
{
|
||||
LOGDEB(("BeagleQueueIndexer::processqueue: dir: [%s]\n",
|
||||
m_queuedir.c_str()));
|
||||
|
||||
FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
|
||||
walker.addSkippedName(".*");
|
||||
FsTreeWalker::Status status =walker.walk(m_queuedir, *this);
|
||||
LOGDEB(("BeagleQueueIndexer::processqueue: done: status %d\n", status));
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Tree walker callback: index one entry of the queue.
 *
 * The metadata is read from the companion dot file (same directory, name
 * prefixed with "."). Only "webhistory" entries with a text/html or
 * text/plain MIME type are currently indexed: "bookmark" entries are
 * recognized but nothing is done with them yet, and everything else is
 * skipped.
 *
 * @param path path of the queue entry.
 * @param stp  stat data for the entry; its mtime and size are recorded in
 *             the document.
 * @param flg  kind of entry: anything other than FtwRegular is ignored.
 * @return FtwError if the index update fails, FtwOk in all other cases
 *         (including skipped or unreadable entries).
 */
FsTreeWalker::Status
BeagleQueueIndexer::processone(const string &path,
                               const struct stat *stp,
                               FsTreeWalker::CbFlag flg)
{
    if (flg != FsTreeWalker::FtwRegular)
        return FsTreeWalker::FtwOk;

    string dotpath = path_cat(path_getfather(path),
                              string(".") + path_getsimple(path));
    LOGDEB(("BeagleQueueIndexer: prc1: [%s]\n", path.c_str()));
    BeagleDotFile dotfile(m_config, dotpath);
    Rcl::Doc dotdoc;
    // Declared before the first goto: a jump may not bypass the
    // initialization of a non-trivial object.
    string udi, udipath;
    if (!dotfile.toDoc(dotdoc))
        goto out;
    //dotdoc.dump(1);

    // Have to use the hit type for the udi, because the same url can exist
    // as a bookmark or a page.
    udipath = path_cat(dotdoc.meta[keybght], url_gpath(dotdoc.url));
    make_udi(udipath, "", udi);

    LOGDEB(("BeagleQueueIndexer: prc1: udi [%s]\n", udi.c_str()));

    // We only process bookmarks or text/html and text/plain files.
    if (!stringlowercmp("bookmark", dotdoc.meta[keybght])) {
        // Bookmark: nothing is indexed for these at the moment.
    } else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
               (dotdoc.mimetype.compare("text/html") &&
                dotdoc.mimetype.compare("text/plain"))) {
        LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
                dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
        goto out;
    } else {
        Rcl::Doc doc;
        FileInterner interner(path, stp, m_config, m_tmpdir,
                              FileInterner::FIF_doUseInputMimetype,
                              &dotdoc.mimetype);
        string ipath;
        FileInterner::Status fis = interner.internfile(doc, ipath);
        if (fis != FileInterner::FIDone) {
            LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
            goto out;
        }

        // Bounded formatting: the buffer size is always passed along, so
        // that no value of the stat fields can overflow it.
        char cbuf[100];
        if (doc.fmtime.empty()) {
            // No date from the document itself: use the file mtime.
            snprintf(cbuf, sizeof(cbuf), "%ld", long(stp->st_mtime));
            doc.fmtime = cbuf;
        }
        snprintf(cbuf, sizeof(cbuf), "%ld", long(stp->st_size));
        doc.fbytes = cbuf;

        // Document signature for up to date checks: none. The file is
        // going to be deleted anyway. We always reindex what comes in
        // the queue. It would probably be possible to extract some
        // http data to avoid this.
        doc.sig = "";
        doc.url = dotdoc.url;
        if (!m_db.addOrUpdate(udi, "", doc))
            return FsTreeWalker::FtwError;
    }

out:
    // Removal of the processed queue files is currently disabled:
    // unlink(path.c_str());
    // unlink(dotpath.c_str());
    return FsTreeWalker::FtwOk;
}
|
||||
48
src/index/beaglequeue.h
Normal file
48
src/index/beaglequeue.h
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _beaglequeue_h_included_
|
||||
#define _beaglequeue_h_included_
|
||||
/* @(#$Id: $ (C) 2009 J.F.Dockes */
|
||||
|
||||
/**
|
||||
* Code to process the Beagle indexing queue. Beagle MUST NOT be
|
||||
* running, else mayhem will ensue. Interesting to reuse the beagle
|
||||
* firefox visited page indexing plugin for example.
|
||||
*/
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "fstreewalk.h"
|
||||
#include "rcldb.h"
|
||||
|
||||
class BeagleQueueIndexer : public FsTreeWalkerCB {
|
||||
public:
|
||||
BeagleQueueIndexer(RclConfig *cnf);
|
||||
~BeagleQueueIndexer();
|
||||
|
||||
bool processqueue();
|
||||
|
||||
FsTreeWalker::Status
|
||||
processone(const string &, const struct stat *, FsTreeWalker::CbFlag);
|
||||
|
||||
private:
|
||||
RclConfig *m_config;
|
||||
Rcl::Db m_db;
|
||||
string m_queuedir;
|
||||
string m_tmpdir;
|
||||
};
|
||||
|
||||
#endif /* _beaglequeue_h_included_ */
|
||||
Loading…
x
Reference in New Issue
Block a user