#ifndef lint static char rcsid[] = "@(#$Id: $ (C) 2009 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifdef HAVE_CONFIG_H #include "autoconfig.h" #endif #include #include #include #include #include #include #include #include #include #include #include "pathut.h" #include "conftree.h" #include "rclconfig.h" #include "fstreewalk.h" #include "rcldb.h" #include "readfile.h" #include "indexer.h" #include "fsindexer.h" #include "csguess.h" #include "transcode.h" #include "debuglog.h" #include "internfile.h" #include "smallut.h" #include "wipedir.h" #include "fileudi.h" #include "cancelcheck.h" // When using extended attributes, we have to use the ctime. // This is quite an expensive price to pay... #ifdef RCL_USE_XATTR #define RCL_STTIME st_ctime #else #define RCL_STTIME st_mtime #endif // RCL_USE_XATTR #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ #ifndef deleteZ #define deleteZ(X) {delete X;X = 0;} #endif FsIndexer::~FsIndexer() { } bool FsIndexer::init() { if (m_tdl.empty()) { m_tdl = m_config->getTopdirs(); if (m_tdl.empty()) { LOGERR(("FsIndexers: no topdirs list defined\n")); return false; } } return true; } // Recursively index each directory in the topdirs: bool FsIndexer::index() { if (!init()) return false; if (m_updater) { m_updater->status.reset(); m_updater->status.dbtotdocs = m_db->docCnt(); } m_walker.setSkippedPaths(m_config->getSkippedPaths()); m_walker.addSkippedPath(path_tildexpand("~/.beagle")); for (list::const_iterator it = m_tdl.begin(); it != m_tdl.end(); it++) { LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(), getDbDir().c_str())); // Set the current directory in config so that subsequent // getConfParams() will get local values m_config->setKeyDir(*it); // Adjust the "follow symlinks" option bool follow; if (m_config->getConfParam("followLinks", &follow) && follow) { m_walker.setOpts(FsTreeWalker::FtwFollow); } else { m_walker.setOpts(FsTreeWalker::FtwOptNone); } int abslen; if (m_config->getConfParam("idxabsmlen", &abslen)) m_db->setAbstractParams(abslen, -1, -1); // Walk the directory tree if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) { LOGERR(("FsIndexer::index: error while indexing %s: %s\n", it->c_str(), m_walker.getReason().c_str())); return false; } } string missing; FileInterner::getMissingDescription(missing); if (!missing.empty()) { LOGINFO(("FsIndexer::index missing helper program(s):\n%s\n", missing.c_str())); } m_config->storeMissingHelperDesc(missing); return true; } static bool matchesSkipped(const list& tdl, FsTreeWalker& walker, const string& path) { // First check what (if any) topdir this is in: string td; for (list::const_iterator it = tdl.begin(); it != tdl.end(); it++) { if (path.find(*it) == 0) { td = *it; break; } } if (td.empty()) { LOGDEB(("FsIndexer::indexFiles: skipping [%s] (ntd)\n", path.c_str())); return true; } // Check path against skippedPaths. if (walker.inSkippedPaths(path)) { LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpp)\n", path.c_str())); return true; } // Then check all path components up to the topdir against skippedNames string mpath = path; while (mpath.length() >= td.length() && mpath.length() > 1) { string fn = path_getsimple(mpath); if (walker.inSkippedNames(fn)) { LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpn)\n", path.c_str())); return true; } string::size_type len = mpath.length(); mpath = path_getfather(mpath); // getfather normally returns a path ending with /, getsimple // would then return '' if (!mpath.empty() && mpath[mpath.size()-1] == '/') mpath.erase(mpath.size()-1); // should not be necessary, but lets be prudent. If the // path did not shorten, something is seriously amiss // (could be an assert actually) if (mpath.length() >= len) return true; } return false; } /** * Index individual files, out of a full tree run. No database purging */ bool FsIndexer::indexFiles(list& files) { if (!init()) return false; // We use an FsTreeWalker just for handling the skipped path/name lists FsTreeWalker walker; walker.setSkippedPaths(m_config->getSkippedPaths()); m_walker.addSkippedPath(path_tildexpand("~/.beagle")); for (list::iterator it = files.begin(); it != files.end(); ) { LOGDEB2(("FsIndexer::indexFiles: [%s]\n", it->c_str())); m_config->setKeyDir(path_getfather(*it)); if (m_havelocalfields) localfieldsfromconf(); walker.setSkippedNames(m_config->getSkippedNames()); // Check path against indexed areas and skipped names/paths if (matchesSkipped(m_tdl, walker, *it)) { it++; continue; } struct stat stb; if (lstat(it->c_str(), &stb) != 0) { LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(), strerror(errno))); it++; continue; } // If we get to indexing directory names one day, will need to test // against dbdir here to avoid modification loops (with rclmon). if (!S_ISREG(stb.st_mode)) { LOGDEB(("FsIndexer::indexFiles: skipping [%s] (nr)\n", it->c_str())); it++; continue; } int abslen; if (m_config->getConfParam("idxabsmlen", &abslen)) m_db->setAbstractParams(abslen, -1, -1); if (processone(*it, &stb, FsTreeWalker::FtwRegular) != FsTreeWalker::FtwOk) { LOGERR(("FsIndexer::indexFiles: processone failed\n")); return false; } it = files.erase(it); } return true; } /** Purge docs for given files out of the database */ bool FsIndexer::purgeFiles(list& files) { if (!init()) return false; for (list::iterator it = files.begin(); it != files.end(); ) { string udi; make_udi(*it, "", udi); // rcldb::purgefile returns true if the udi was either not // found or deleted, false only in case of actual error bool existed; if (!m_db->purgeFile(udi, &existed)) { LOGERR(("FsIndexer::purgeFiles: Database error\n")); return false; } // If we actually deleted something, take it off the list if (existed) { it = files.erase(it); } else { it++; } } return true; } // Local fields can be set for fs subtrees in the configuration file void FsIndexer::localfieldsfromconf() { LOGDEB0(("FsIndexer::localfieldsfromconf\n")); m_localfields.clear(); m_config->addLocalFields(&m_localfields); } // void FsIndexer::setlocalfields(Rcl::Doc& doc) { for (map::const_iterator it = m_localfields.begin(); it != m_localfields.end(); it++) { // Should local fields override those coming from the document // ? I think not, but not too sure if (doc.meta.find(it->second) == doc.meta.end()) { doc.meta[it->first] = it->second; } } } /// This method gets called for every file and directory found by the /// tree walker. /// /// It checks with the db if the file has changed and needs to be /// reindexed. If so, it calls internfile() which will identify the /// file type and call an appropriate handler to convert the document into /// internal format, which we then add to the database. /// /// Accent and majuscule handling are performed by the db module when doing /// the actual indexing work. The Rcl::Doc created by internfile() /// mostly contains pretty raw utf8 data. FsTreeWalker::Status FsIndexer::processone(const std::string &fn, const struct stat *stp, FsTreeWalker::CbFlag flg) { if (m_updater && !m_updater->update()) { return FsTreeWalker::FtwStop; } // If we're changing directories, possibly adjust parameters (set // the current directory in configuration object) if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) { m_config->setKeyDir(fn); // Set up skipped patterns for this subtree. m_walker.setSkippedNames(m_config->getSkippedNames()); int abslen; if (m_config->getConfParam("idxabsmlen", &abslen)) m_db->setAbstractParams(abslen, -1, -1); // Adjust local fields from config for this subtree if (m_havelocalfields) localfieldsfromconf(); if (flg == FsTreeWalker::FtwDirReturn) return FsTreeWalker::FtwOk; } //////////////////// // Check db up to date ? Doing this before file type // identification means that, if usesystemfilecommand is switched // from on to off it may happen that some files which are now // without mime type will not be purged from the db, resulting // in possible 'cannot intern file' messages at query time... // Document signature. This is based on m/ctime and size and used // for the uptodate check (the value computed here is checked // against the stored one). Changing the computation forces a full // reindex of course. char cbuf[100]; sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); string sig = cbuf; string udi; make_udi(fn, "", udi); if (!m_db->needUpdate(udi, sig)) { LOGDEB(("processone: up to date: %s\n", fn.c_str())); if (m_updater) { // Status bar update, abort request etc. m_updater->status.fn = fn; if (!m_updater->update()) { return FsTreeWalker::FtwStop; } } return FsTreeWalker::FtwOk; } LOGDEB0(("processone: processing: [%s] %s\n", displayableBytes(stp->st_size).c_str(), fn.c_str())); FileInterner interner(fn, stp, m_config, m_tmpdir, FileInterner::FIF_none); if (!interner.ok()) { // no indexing whatsoever in this case. This typically means that // indexallfilenames is not set return FsTreeWalker::FtwOk; } // File name transcoded to utf8 for indexing. string charset = m_config->getDefCharset(true); // If this fails, the file name won't be indexed, no big deal // Note that we used to do the full path here, but I ended up believing // that it made more sense to use only the file name string utf8fn; int ercnt; if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) { LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n", charset.c_str(), path_getsimple(fn).c_str())); } else if (ercnt) { LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n", ercnt, charset.c_str(), path_getsimple(fn).c_str())); } LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n", path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), "UTF-8")); string parent_udi; make_udi(fn, "", parent_udi); Rcl::Doc doc; const string plus("+"); char ascdate[30]; sprintf(ascdate, "%ld", long(stp->st_mtime)); FileInterner::Status fis = FileInterner::FIAgain; bool hadNullIpath = false; while (fis == FileInterner::FIAgain) { doc.erase(); try { fis = interner.internfile(doc); } catch (CancelExcept) { LOGERR(("fsIndexer::processone: interrupted\n")); return FsTreeWalker::FtwStop; } // Index at least the file name even if there was an error. // We'll change the signature to ensure that the indexing will // be retried every time. // Internal access path for multi-document files if (doc.ipath.empty()) hadNullIpath = true; // Set file name, mod time and url if not done by filter if (doc.fmtime.empty()) doc.fmtime = ascdate; if (doc.url.empty()) doc.url = string("file://") + fn; if (doc.utf8fn.empty()) doc.utf8fn = utf8fn; char cbuf[100]; sprintf(cbuf, "%ld", (long)stp->st_size); doc.fbytes = cbuf; // Document signature for up to date checks: concatenate // m/ctime and size. Looking for changes only, no need to // parseback so no need for reversible formatting. Also set, // but never used, for subdocs. sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); doc.sig = cbuf; // If there was an error, ensure indexing will be // retried. This is for the once missing, later installed // filter case. It can make indexing much slower (if there are // myriads of such files, the ext script is executed for them // and fails every time) if (fis == FileInterner::FIError) { doc.sig += plus; } // Possibly add fields from local config if (m_havelocalfields) setlocalfields(doc); // Add document to database. If there is an ipath, add it as a children // of the file document. string udi; make_udi(fn, doc.ipath, udi); if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? "" : parent_udi, doc)) return FsTreeWalker::FtwError; // Tell what we are doing and check for interrupt request if (m_updater) { ++(m_updater->status.docsdone); if (m_updater->status.dbtotdocs < m_updater->status.docsdone) m_updater->status.dbtotdocs = m_updater->status.docsdone; m_updater->status.fn = fn; if (!doc.ipath.empty()) m_updater->status.fn += "|" + doc.ipath; if (!m_updater->update()) { return FsTreeWalker::FtwStop; } } } // If we had no instance with a null ipath, we create an empty // document to stand for the file itself, to be used mainly for up // to date checks. Typically this happens for an mbox file. if (hadNullIpath == false) { LOGDEB1(("Creating empty doc for file\n")); Rcl::Doc fileDoc; fileDoc.fmtime = ascdate; fileDoc.utf8fn = utf8fn; fileDoc.mimetype = interner.getMimetype(); fileDoc.url = string("file://") + fn; char cbuf[100]; sprintf(cbuf, "%ld", (long)stp->st_size); fileDoc.fbytes = cbuf; // Document signature for up to date checks. sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); fileDoc.sig = cbuf; if (!m_db->addOrUpdate(parent_udi, "", fileDoc)) return FsTreeWalker::FtwError; } return FsTreeWalker::FtwOk; }