comments and unused defs removal

This commit is contained in:
Jean-Francois Dockes 2018-11-14 09:43:20 +01:00
parent 036e1da6b4
commit f008457493

View File

@ -24,184 +24,163 @@
class RclConfig;
namespace Dijon
{
class Filter;
namespace Dijon {
/** Provides the list of MIME types supported by the filter(s).
* The character string is allocated with new[].
* This function is exported by dynamically loaded filter libraries.
/// Document handler interface.
///
/// A document handler can translate a document format to text
/// (e.g. msdoc->text/plain) and/or extract subdocuments from
/// multi-document formats (e.g. mbox->message, message->attachments,
/// zip, etc.).
class Filter {
public:
Filter() {}
virtual ~Filter() {}
/// Filter objects cannot be copied.
Filter(const Filter &other) = delete;
Filter& operator=(const Filter& other) = delete;
// Gives the filter access to the general configuration. The pointer is
// borrowed: it has to be read/write, but it must not be deleted.
virtual void setConfig(RclConfig *) = 0;
/// MIME type this filter handles, returned by value (a copy of m_mimeType).
std::string get_mime_type() const
{
    return m_mimeType;
}
/** Supported input types. A value of this type is what
 * is_data_input_ok() takes to tell whether a given kind of input is
 * acceptable to the filter.
 * NOTE(review): the enumerators appear to correspond, in order, to
 * set_document_data(), set_document_string(), set_document_file() and
 * set_document_uri() -- confirm against the implementations. */
typedef enum {DOCUMENT_DATA = 0, DOCUMENT_STRING, DOCUMENT_FILE_NAME,
DOCUMENT_URI} DataInput;
virtual bool is_data_input_ok(DataInput input) const = 0;
/* Properties to be set prior to actual operation (see set_property()) */
typedef enum {
// Source encoding to be used for reading/transcoding the
// original data if there is no other way to determine it
// (e.g. for text/plain files).
DEFAULT_CHARSET = 0,
// Either "view" or "index". Some implementations produce
// slightly different data depending on the mode (e.g. avoiding
// repeating some text in index mode).
OPERATING_MODE,
// Unique document identifier. This can be useful if the
// filter wants to manage a persistent cache (e.g. mh_mbox).
DJF_UDI
} Properties;
/** Sets a property, prior to calling set_document_XXX().
* Returns false if the property or value is not supported. */
virtual bool set_property(Properties prop_name,
const std::string &prop_value) = 0;
/** (Re)initializes the filter with the given data.
* Caller should ensure the given pointer is valid until the
* Filter object is destroyed, as some filters may not need to
* do a deep copy of the data.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
typedef bool (get_filter_types_func)(std::set<std::string> &);
/** Returns what data should be passed to the filter(s).
* Output is cast from Filter::DataInput to int for convenience.
* This function is exported by dynamically loaded filter libraries.
* The aim is to let the client application know before-hand whether
* it should load documents or not.
virtual bool set_document_data(const std::string& mtype,
const char *data_ptr,
size_t data_length) = 0;
/** (Re)initializes the filter with the given data.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
typedef bool (check_filter_data_input_func)(int);
/** Returns a Filter that handles the given MIME type.
* The Filter object is allocated with new.
* This function is exported by dynamically loaded filter libraries
* and serves as a factory for Filter objects, so that the client
* application doesn't have to know which Filter sub-types handle
* which MIME types.
virtual bool set_document_string(const std::string& mtype,
const std::string &data_str) = 0;
/** (Re)initializes the filter with the given file.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
typedef Filter *(get_filter_func)(const std::string &);
virtual bool set_document_file(const std::string& mtype,
const std::string &file_path) = 0;
/// Filter interface.
class Filter
{
public:
/// Builds an empty filter.
Filter()
{
}
virtual ~Filter() {}
virtual void setConfig(RclConfig *) = 0;
/** (Re)initializes the filter with the given URI.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
* No implementation supports this at the moment.
*/
virtual bool set_document_uri(const std::string& mtype,
const std::string &uri) = 0;
// Enumerations.
/** Set the document size metadata element. This is the size
of the immediate containing file (e.g. a .doc or a .odt), not
the size of a containing archive or .gz, nor the size
of the extracted text. This is set externally because the
surrounding code quite often has a better idea about it
(having created a temp file, etc.), and this saves more
stat() calls. The value is stored inside metaData, under the
docsize key.
*/
virtual void set_docsize(int64_t size) = 0;
/** What data a filter supports as input.
* It can be either the whole document data, its file name, or its URI.
*/
typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME,
DOCUMENT_URI } DataInput;
// Going from one nested document to the next.
/** Input properties supported by the filter.
*
* - DEFAULT_CHARSET is the source encoding that should be used
* for reading/transcoding the original data if there is no
* other way to determine it (ie: for text/plain files)
* - OPERATING_MODE can be set to either view or index.
* - DJF_UDI Unique document identifier. This can be useful if the
* filter wants to manage a persistent cache.
*/
typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE, DJF_UDI } Properties;
/** Returns true if there are nested documents left to extract.
* Returns false if the end of the parent document was reached
* or an error occurred.
*/
virtual bool has_documents(void) const = 0;
/** Moves to the next nested document.
* Returns false if there are none left.
*/
virtual bool next_document(void) = 0;
// Information.
/** Skips to the nested document with the given ipath.
* Returns false if no such document exists.
*/
virtual bool skip_to_document(const std::string &ipath) = 0;
/// Returns the MIME type handled by the filter.
std::string get_mime_type(void) const
{
return m_mimeType;
}
// Accessing documents' contents.
/// Returns what data the filter requires as input.
virtual bool is_data_input_ok(DataInput input) const = 0;
/// Returns the message for the most recent error that has occurred.
virtual std::string get_error(void) const = 0;
/** Gives read-only access to the metadata dictionary extracted from the
 * current document.
 * Fields may include one or more of the following:
 * content, title, ipath, mimetype, language, charset, author, creator,
 * publisher, modificationdate, creationdate, size
 * Points to keep in mind:
 * - content may contain binary data, so handle it with care.
 * - ipath is an internal path to the nested document, which can later
 *   be passed to skip_to_document(). It may be empty if the parent
 *   document's type doesn't allow embedding, in which case the filter
 *   should only return one document.
 * - mimetype should be text/plain if the document could be handled
 *   internally, and empty if unknown. For any other value, the client
 *   application is expected to pass the nested document's content to
 *   another filter that supports this particular type.
 */
virtual const std::map<std::string, std::string>& get_meta_data() const
{
    return m_metaData;
}
// Initialization.
/// Empties the metadata dictionary.
virtual void clear() { m_metaData.clear(); }
// Hack: tells whether this is the special version used for unknown
// types. This base implementation always answers false.
virtual bool is_unknown() { return false; }
/** Sets a property, prior to calling set_document_XXX().
* Returns false if the property is not supported.
*/
virtual bool set_property(Properties prop_name,
const std::string &prop_value) = 0;
protected:
/// The MIME type handled by the filter.
std::string m_mimeType;
/** (Re)initializes the filter with the given data.
* Caller should ensure the given pointer is valid until the
* Filter object is destroyed, as some filters may not need to
* do a deep copy of the data.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_data(const std::string& mtype,
const char *data_ptr,
size_t data_length) = 0;
/// Current metadata dictionary. For multi-document files,
/// this may be rebuilt for each sub-document. See
/// common/cstr.h for the common key definitions. The document
/// text is stored under the "content" key.
std::map<std::string, std::string> m_metaData;
/** (Re)initializes the filter with the given data.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_string(const std::string& mtype,
const std::string &data_str) = 0;
/** (Re)initializes the filter with the given file.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_file(const std::string& mtype,
const std::string &file_path) = 0;
/** (Re)initializes the filter with the given URI.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_uri(const std::string& mtype,
const std::string &uri) = 0;
/** Set the document size meta_data element. This is the size
of the immediate containing file (ie, a .doc, a .odt), not
the size of, ie, a containing archive or .gz nor the size
of the extracted text. This is set externally, because the
surrounding code quite often has a better idea about it
(having created a temp file, etc.), and this saves more
stat() calls The value is stored inside metaData, docsize
key
*/
virtual void set_docsize(int64_t size) = 0;
// Going from one nested document to the next.
/** Returns true if there are nested documents left to extract.
* Returns false if the end of the parent document was reached
* or an error occurred.
*/
virtual bool has_documents(void) const = 0;
/** Moves to the next nested document.
* Returns false if there are none left.
*/
virtual bool next_document(void) = 0;
/** Skips to the nested document with the given ipath.
* Returns false if no such document exists.
*/
virtual bool skip_to_document(const std::string &ipath) = 0;
// Accessing documents' contents.
/// Returns the message for the most recent error that has occurred.
virtual std::string get_error(void) const = 0;
/** Returns a dictionary of metadata extracted from the current document.
* Metadata fields may include one or more of the following :
* content, title, ipath, mimetype, language, charset, author, creator,
* publisher, modificationdate, creationdate, size
* Special considerations apply :
* - content may contain binary data, watch out !
* - ipath is an internal path to the nested document that can be
* later passed to skip_to_document(). It may be empty if the parent
* document's type doesn't allow embedding, in which case the filter
* should only return one document.
* - mimetype should be text/plain if the document could be handled
* internally, empty if unknown. If any other value, it is expected
* that the client application can pass the nested document's content
* to another filter that supports this particular type.
*/
virtual const std::map<std::string, std::string> &get_meta_data(void) const
{
return m_metaData;
}
virtual void clear() {m_metaData.clear();}
virtual bool is_unknown() {return false;}
protected:
/// The MIME type handled by the filter.
std::string m_mimeType;
/// Metadata dictionary.
std::map<std::string, std::string> m_metaData;
private:
/// Filter objects cannot be copied.
Filter(const Filter &other);
/// Filter objects cannot be copied.
Filter& operator=(const Filter& other);
};
};
}
#endif // _DIJON_FILTER_H