recoll/src/index/fetcher.h

90 lines
3.5 KiB
C++

/* Copyright (C) 2012-2019 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef _FETCHER_H_INCLUDED_
#define _FETCHER_H_INCLUDED_
#include <string>
#include <memory>
#include "rcldoc.h"
#include "pathut.h"
class RclConfig;
/**
* Generic interface to retrieve the data for a document designated by
* its index data (udi/ipath/url). This is used to retrieve the data
* for previewing. The actual implementation is specific to the kind
* of backend (file system, web cache, others?...), and the
* implementation may of course share code with the indexing-time
* functions from the specific backend.
*
* This normally gives access to the raw document container (either as a
* file or as a memory block). The Internfile code will then further
* process it to get to the actual document, especially if
* de-embedding is involved.
*
* The DATADIRECT document kind, which holds final extracted data, is only
* returned when using an external indexer (only the python demo sample at
* this point), in which case the whole extraction is performed by the
* external code.
*/
class DocFetcher {
public:
    /**
     * A RawDoc is the data for a document-holding entity, either as a
     * memory block or pointed to by a file name.
     */
    struct RawDoc {
        enum RawDocKind {RDK_FILENAME, RDK_DATA, RDK_DATADIRECT};
        // Explicit default so that a RawDoc which was never filled in does
        // not carry an indeterminate value. RDK_FILENAME is the first
        // enumerator, i.e. what value-initialization would also yield.
        RawDocKind kind{RDK_FILENAME};
        std::string data; // Doc data or file name
        struct PathStat st; // Only used if RDK_FILENAME
    };

    /**
     * Return the data for the requested document, either as a
     * file-system file or as a memory object (maybe stream too in the
     * future?)
     * @param cnf the global config
     * @param idoc the data gathered from the index for this doc (udi/ipath)
     * @param out we may return either a file name or the document data.
     * @return NOTE(review): presumably true on success; the exact meaning
     *     is defined by the implementations, confirm there.
     */
    virtual bool fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out) = 0;

    /**
     * Return the signature for the requested document. This is used for
     * up-to-date tests performed when not indexing (e.g.: verifying that a
     * document is not stale before previewing it).
     * @param cnf the global config
     * @param idoc the data gathered from the index for this doc (udi/ipath)
     * @param sig output.
     * @return NOTE(review): presumably true on success; the exact meaning
     *     is defined by the implementations, confirm there.
     */
    virtual bool makesig(RclConfig* cnf, const Rcl::Doc& idoc, std::string& sig) = 0;

    /** Result codes returned by testAccess(). */
    enum Reason{FetchOk, FetchNotExist, FetchNoPerm, FetchOther};

    /**
     * Test access to the document. The base implementation ignores both
     * arguments and always answers FetchOther; derived classes may
     * override it.
     */
    virtual Reason testAccess(RclConfig*, const Rcl::Doc&) {
        return FetchOther;
    }

    DocFetcher() = default;
    virtual ~DocFetcher() = default;

    // Fetcher objects are not copyable.
    DocFetcher(const DocFetcher&) = delete;
    DocFetcher& operator=(const DocFetcher&) = delete;
};
/**
 * Build the fetcher object appropriate for a document, chosen from the
 * backend string identifier found inside idoc.
 * @param config the configuration object handed to the factory.
 * @param idoc the index data for the document; its backend identifier
 *     selects the implementation.
 * @return an owning pointer to the fetcher.
 *     NOTE(review): whether this can be null (e.g. unknown backend) is not
 *     visible from this header; check the implementation before relying
 *     on it.
 */
std::unique_ptr<DocFetcher> docFetcherMake(
    RclConfig* config,
    const Rcl::Doc& idoc);
#endif /* _FETCHER_H_INCLUDED_ */