From 0fe1574439f2a98bf197d089b42a2c1724023c2f Mon Sep 17 00:00:00 2001
From: dockes
Date: Tue, 10 Nov 2009 18:11:35 +0000
Subject: [PATCH]

---
 src/index/beaglequeue.cpp | 306 ++++++++++++++++++++++++++++++++++++++
 src/index/beaglequeue.h   |  53 +++++++
 2 files changed, 359 insertions(+)
 create mode 100644 src/index/beaglequeue.cpp
 create mode 100644 src/index/beaglequeue.h

diff --git a/src/index/beaglequeue.cpp b/src/index/beaglequeue.cpp
new file mode 100644
index 00000000..e323d59b
--- /dev/null
+++ b/src/index/beaglequeue.cpp
@@ -0,0 +1,306 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef lint
+static char rcsid[] = "@(#$Id: $ (C) 2005 J.F.Dockes";
+#endif
+#include "autoconfig.h"
+#include "pathut.h"
+#include "debuglog.h"
+#include "fstreewalk.h"
+#include "beaglequeue.h"
+#include "smallut.h"
+#include "fileudi.h"
+#include "internfile.h"
+#include "wipedir.h"
+#include "circache.h"
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include <fstream>
+#include <list>
+#include <string>
+using namespace std;
+
+// Name of the Rcl::Doc meta entry which stores the hit type read from
+// the second line of a dot file.
+const string keybght("beagleHitType");
+
+/**
+ * Reader for a Beagle queue dot file. As read by toDoc(), the file holds
+ * the url, the hit type and the content type on its first three lines,
+ * followed by field lines prefixed with "t:".
+ */
+class BeagleDotFile {
+public:
+    BeagleDotFile(RclConfig *conf, const string& fn)
+        : m_conf(conf), m_fn(fn)
+    {
+    }
+
+    /**
+     * Read the next line from the dot file, without its end-of-line
+     * characters.
+     *
+     * std::getline() into a string is used so that a final line with no
+     * newline is still returned, and so that the line length is not
+     * limited by a fixed size buffer.
+     *
+     * @param line receives the line contents.
+     * @return false on end of file or read error, else true.
+     */
+    bool readLine(string& line)
+    {
+        if (!std::getline(m_input, line)) {
+            if (m_input.bad()) {
+                LOGERR(("beagleDotFileRead: input.bad()\n"));
+            }
+            return false;
+        }
+        // getline() consumed the '\n'. Also drop the '\r' which would
+        // remain with DOS-style line endings.
+        string::size_type ll = line.length();
+        while (ll > 0 && (line[ll-1] == '\n' || line[ll-1] == '\r'))
+            ll--;
+        line.erase(ll);
+        LOGDEB2(("BeagleDotFile:readLine: [%s]\n", line.c_str()));
+        return true;
+    }
+
+    /**
+     * Process a beagle dot file and set interesting stuff in the doc:
+     * url, hit type (stored as meta[keybght]), MIME type, and the values
+     * from the "t:" lines, appended to the meta entries for their
+     * canonic field names.
+     *
+     * @param doc document to fill in.
+     * @return false if the file can't be opened or if one of the three
+     *   first lines can't be read, else true.
+     */
+    bool toDoc(Rcl::Doc& doc)
+    {
+        string line;
+
+        m_input.open(m_fn.c_str(), ios::in);
+        if (!m_input.good()) {
+            LOGERR(("BeagleDotFile: open failed for [%s]\n", m_fn.c_str()));
+            return false;
+        }
+
+        // Read the 3 first lines:
+        // - url
+        // - hit type: we only know about Bookmark and WebHistory for now
+        // - content-type.
+        if (!readLine(line))
+            return false;
+        doc.url = line;
+        if (!readLine(line))
+            return false;
+        doc.meta[keybght] = line;
+        if (!readLine(line))
+            return false;
+        doc.mimetype = line;
+
+        // Default content type for bookmarks which come without one
+        if (doc.mimetype.empty() &&
+            !stringlowercmp("bookmark", doc.meta[keybght]))
+            doc.mimetype = "text/plain";
+
+        // Read the rest: fields and keywords. Only the lines beginning
+        // with "t:" are used: they are gathered, prefix removed, as
+        // input for a ConfSimple. Stops at eof, hopefully.
+        string confstr;
+        while (readLine(line)) {
+            if (line.compare(0, 2, "t:") != 0)
+                continue;
+            confstr += line.substr(2) + "\n";
+        }
+
+        const string ss(" ");
+        ConfSimple fields(confstr, 1);
+        list<string> names = fields.getNames("");
+        for (list<string>::const_iterator it = names.begin();
+             it != names.end(); ++it) {
+            string value;
+            fields.get(*it, value, "");
+            // Values literally set to "undefined" or "null" are dropped
+            if (!value.compare("undefined") || !value.compare("null"))
+                continue;
+            string caname = m_conf->fieldCanon(*it);
+            doc.meta[caname].append(ss + value);
+        }
+        return true;
+    }
+
+    RclConfig *m_conf;
+    string m_fn;
+    ifstream m_input;
+};
+
+// Value stored in m_tmpdir when the temporary directory could not be
+// created, so that the destructor does not try to remove anything.
+const string badtmpdirname = "/no/such/dir/really/can/exist";
+
+/**
+ * Get the queue directory from the "beaglequeuedir" configuration
+ * parameter (default: ~/.beagle/ToIndex), create the temporary directory
+ * and open the index in Rcl::Db::DbUpd mode. Failures are logged, not
+ * reported to the caller.
+ */
+BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf)
+    : m_config(cnf), m_db(cnf)
+{
+    if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
+        m_queuedir = path_tildexpand("~/.beagle/ToIndex");
+
+    if (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) {
+        string reason;
+        if (!maketmpdir(m_tmpdir, reason)) {
+            LOGERR(("BeagleQueueIndexer: cannot create temporary "
+                    "directory: %s\n", reason.c_str()));
+            m_tmpdir = badtmpdirname;
+        }
+    }
+
+    if (!m_db.open(Rcl::Db::DbUpd)) {
+        LOGERR(("BeagleQueueIndexer: error opening database %s\n",
+                m_config->getDbDir().c_str()));
+    }
+}
+
+/** Remove the temporary directory if one was created, close the index */
+BeagleQueueIndexer::~BeagleQueueIndexer()
+{
+    LOGDEB(("BeagleQueueIndexer::~\n"));
+    if (m_tmpdir.length() && m_tmpdir.compare(badtmpdirname)) {
+        wipedir(m_tmpdir);
+        if (rmdir(m_tmpdir.c_str()) < 0) {
+            LOGERR(("BeagleQueueIndexer::~: cannot clear temp dir %s\n",
+                    m_tmpdir.c_str()));
+        }
+    }
+    m_db.close();
+}
+
+/**
+ * Walk the queue directory (no recursion, names matching ".*" skipped)
+ * with this object as the walker callback.
+ *
+ * @return always true: the walk status is only written to the debug log.
+ */
+bool BeagleQueueIndexer::processqueue()
+{
+    LOGDEB(("BeagleQueueIndexer::processqueue: dir: [%s]\n",
+            m_queuedir.c_str()));
+
+    FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
+    walker.addSkippedName(".*");
+    FsTreeWalker::Status status = walker.walk(m_queuedir, *this);
+    LOGDEB(("BeagleQueueIndexer::processqueue: done: status %d\n", status));
+    return true;
+}
+
+/**
+ * Index one queue entry.
+ *
+ * The entry description is read from the dot file: same directory and
+ * name as the data file, with a '.' prepended to the name. Only
+ * WebHistory entries with a text/html or text/plain content type are
+ * added to the index for now.
+ *
+ * @param path data file path.
+ * @param stp stat data for the file: used for the size, and for the
+ *   modification time if the interner did not set one.
+ * @param flg entry type: only FtwRegular entries are processed.
+ * @return FtwError if the index update fails, else FtwOk (this includes
+ *   skipped entries and entries which could not be read).
+ */
+FsTreeWalker::Status
+BeagleQueueIndexer::processone(const string &path,
+                               const struct stat *stp,
+                               FsTreeWalker::CbFlag flg)
+{
+    if (flg != FsTreeWalker::FtwRegular)
+        return FsTreeWalker::FtwOk;
+
+    string dotpath = path_cat(path_getfather(path),
+                              string(".") + path_getsimple(path));
+    LOGDEB(("BeagleQueueIndexer: prc1: [%s]\n", path.c_str()));
+    BeagleDotFile dotfile(m_config, dotpath);
+    Rcl::Doc dotdoc;
+    string udi, udipath;
+    if (!dotfile.toDoc(dotdoc))
+        goto out;
+    //dotdoc.dump(1);
+
+    // Have to use the hit type for the udi, because the same url can exist
+    // as a bookmark or a page.
+    udipath = path_cat(dotdoc.meta[keybght], url_gpath(dotdoc.url));
+    make_udi(udipath, "", udi);
+
+    LOGDEB(("BeagleQueueIndexer: prc1: udi [%s]\n", udi.c_str()));
+
+    // We only process bookmarks or text/html and text/plain files.
+    if (!stringlowercmp("bookmark", dotdoc.meta[keybght])) {
+        // Bookmarks are let through, but nothing is indexed for them yet.
+    } else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
+               (dotdoc.mimetype.compare("text/html") &&
+                dotdoc.mimetype.compare("text/plain"))) {
+        LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
+                dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
+        goto out;
+    } else {
+        Rcl::Doc doc;
+        FileInterner interner(path, stp, m_config, m_tmpdir,
+                              FileInterner::FIF_doUseInputMimetype,
+                              &dotdoc.mimetype);
+        string ipath;
+        FileInterner::Status fis = interner.internfile(doc, ipath);
+        if (fis != FileInterner::FIDone) {
+            LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
+            goto out;
+        }
+
+        // Times and sizes are stored as decimal strings. snprintf()
+        // is used so that no value can overflow the buffer.
+        char cbuf[100];
+        if (doc.fmtime.empty()) {
+            snprintf(cbuf, sizeof(cbuf), "%ld", long(stp->st_mtime));
+            doc.fmtime = cbuf;
+        }
+        snprintf(cbuf, sizeof(cbuf), "%ld", long(stp->st_size));
+        doc.fbytes = cbuf;
+
+        // Document signature for up to date checks: none. The file is
+        // going to be deleted anyway. We always reindex what comes in
+        // the queue. It would probably be possible to extract some
+        // http data to avoid this.
+        doc.sig = "";
+        doc.url = dotdoc.url;
+        if (!m_db.addOrUpdate(udi, "", doc)) {
+            LOGERR(("BeagleQueueIndexer: index update failed for [%s]\n",
+                    path.c_str()));
+            return FsTreeWalker::FtwError;
+        }
+    }
+out:
+    // The removal of the processed files is disabled for now
+//    unlink(path.c_str());
+//    unlink(dotpath.c_str());
+    return FsTreeWalker::FtwOk;
+}
diff --git a/src/index/beaglequeue.h b/src/index/beaglequeue.h
new file mode 100644
index 00000000..55640eb5
--- /dev/null
+++ b/src/index/beaglequeue.h
@@ -0,0 +1,53 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef _beaglequeue_h_included_
+#define _beaglequeue_h_included_
+/* @(#$Id: $ (C) 2009 J.F.Dockes */
+
+/**
+ * Code to process the Beagle indexing queue. Beagle MUST NOT be
+ * running, else mayhem will ensue. Interesting to reuse the beagle
+ * firefox visited page indexing plugin for example.
+ */
+
+#include "rclconfig.h"
+#include "fstreewalk.h"
+#include "rcldb.h"
+
+class BeagleQueueIndexer : public FsTreeWalkerCB {
+public:
+    /** Read the queue location from the configuration, open the index */
+    BeagleQueueIndexer(RclConfig *cnf);
+    ~BeagleQueueIndexer();
+
+    /** Walk the queue directory and index its entries. Returns true. */
+    bool processqueue();
+
+    /** Tree walker callback: index one queue entry */
+    FsTreeWalker::Status
+    processone(const string &, const struct stat *, FsTreeWalker::CbFlag);
+
+private:
+    RclConfig *m_config;
+    Rcl::Db m_db;
+    // Directory holding the queued files
+    string m_queuedir;
+    // Scratch directory handed to the file interner
+    string m_tmpdir;
+};
+
+#endif /* _beaglequeue_h_included_ */