From f008457493a4db7d27b76141cdad2a405e5a6b8d Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 14 Nov 2018 09:43:20 +0100 Subject: [PATCH] comments and unused defs removal --- src/internfile/Filter.h | 301 +++++++++++++++++++--------------------- 1 file changed, 140 insertions(+), 161 deletions(-) diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h index 841175ce..35c5e584 100644 --- a/src/internfile/Filter.h +++ b/src/internfile/Filter.h @@ -24,184 +24,163 @@ class RclConfig; -namespace Dijon -{ - class Filter; +namespace Dijon { - /** Provides the list of MIME types supported by the filter(s). - * The character string is allocated with new[]. - * This function is exported by dynamically loaded filter libraries. +/// Document handler interface. +/// +/// Document handler can either translate the text format +/// (e.g. msdoc->text/plain), or/and extract subdocuments from +/// multidocument formats (e.g. mbox->message, message->attachments, +/// zip etc.) +class Filter { +public: + Filter() {} + virtual ~Filter() {} + /// Filter objects cannot be copied. + Filter(const Filter &other) = delete; + Filter& operator=(const Filter& other) = delete; + + // Allow me to access the general config. This is a borrowed + // pointer. It has to be read/write, but don't delete. + virtual void setConfig(RclConfig *) = 0; + + /// Returns the MIME type handled by the filter. + std::string get_mime_type(void) const { + return m_mimeType; + } + + /** Supported input types */ + typedef enum {DOCUMENT_DATA = 0, DOCUMENT_STRING, DOCUMENT_FILE_NAME, + DOCUMENT_URI} DataInput; + + virtual bool is_data_input_ok(DataInput input) const = 0; + + /* Properties to be set prior to actual operation */ + typedef enum { + // Source encoding to be used for reading/transcoding the + // original data if there is no other way to find + // (e.g. for text/plain files) + DEFAULT_CHARSET = 0, + // Either "view" or "index". Some implementations produce + // slightly different data (e.g. avoiding repeating some + // text in index mode) + OPERATING_MODE, + // Unique document identifier. This can be useful if the + // filter wants to manage a persistent cache (e.g. mh_mbox) + DJF_UDI + } Properties; + + /** Sets a property, prior to calling set_document_XXX(). + * Returns false if the property or value is not supported. */ + virtual bool set_property(Properties prop_name, + const std::string &prop_value) = 0; + + /** (Re)initializes the filter with the given data. + * Caller should ensure the given pointer is valid until the + * Filter object is destroyed, as some filters may not need to + * do a deep copy of the data. + * Call next_document() to position the filter onto the first document. + * Returns false if this input is not supported or an error occurred. */ - typedef bool (get_filter_types_func)(std::set &); - /** Returns what data should be passed to the filter(s). - * Output is cast from Filter::DataInput to int for convenience. - * This function is exported by dynamically loaded filter libraries. - * The aim is to let the client application know before-hand whether - * it should load documents or not. + virtual bool set_document_data(const std::string& mtype, + const char *data_ptr, + size_t data_length) = 0; + + /** (Re)initializes the filter with the given data. + * Call next_document() to position the filter onto the first document. + * Returns false if this input is not supported or an error occurred. */ - typedef bool (check_filter_data_input_func)(int); - /** Returns a Filter that handles the given MIME type. - * The Filter object is allocated with new. - * This function is exported by dynamically loaded filter libraries - * and serves as a factory for Filter objects, so that the client - * application doesn't have to know which Filter sub-types handle - * which MIME types. + virtual bool set_document_string(const std::string& mtype, + const std::string &data_str) = 0; + + /** (Re)initializes the filter with the given file. + * Call next_document() to position the filter onto the first document. + * Returns false if this input is not supported or an error occurred. */ - typedef Filter *(get_filter_func)(const std::string &); + virtual bool set_document_file(const std::string& mtype, + const std::string &file_path) = 0; - /// Filter interface. - class Filter - { - public: - /// Destroys the filter. - Filter() - { - } - virtual ~Filter() {} - virtual void setConfig(RclConfig *) = 0; + /** (Re)initializes the filter with the given URI. + * Call next_document() to position the filter onto the first document. + * Returns false if this input is not supported or an error occurred. + * No implementation supports this at the moment. + */ + virtual bool set_document_uri(const std::string& mtype, + const std::string &uri) = 0; - // Enumerations. + /** Set the document size meta_data element. This is the size + of the immediate containing file (ie, a .doc, a .odt), not + the size of, ie, a containing archive or .gz nor the size + of the extracted text. This is set externally, because the + surrounding code quite often has a better idea about it + (having created a temp file, etc.), and this saves more + stat() calls The value is stored inside metaData, docsize + key + */ + virtual void set_docsize(int64_t size) = 0; - /** What data a filter supports as input. - * It can be either the whole document data, its file name, or its URI. - */ - typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME, - DOCUMENT_URI } DataInput; + // Going from one nested document to the next. - /** Input properties supported by the filter. - * - * - DEFAULT_CHARSET is the source encoding that should be used - * for reading/transcoding the original data if there is no - * other way to determine it (ie: for text/plain files) - * - OPERATING_MODE can be set to either view or index. - * - DJF_UDI Unique document identifier. This can be useful if the - * filter wants to manage a persistent cache. - */ - typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE, DJF_UDI } Properties; + /** Returns true if there are nested documents left to extract. + * Returns false if the end of the parent document was reached + * or an error occurred. + */ + virtual bool has_documents(void) const = 0; + /** Moves to the next nested document. + * Returns false if there are none left. + */ + virtual bool next_document(void) = 0; - // Information. + /** Skips to the nested document with the given ipath. + * Returns false if no such document exists. + */ + virtual bool skip_to_document(const std::string &ipath) = 0; - /// Returns the MIME type handled by the filter. - std::string get_mime_type(void) const - { - return m_mimeType; - } + // Accessing documents' contents. - /// Returns what data the filter requires as input. - virtual bool is_data_input_ok(DataInput input) const = 0; + /// Returns the message for the most recent error that has occurred. + virtual std::string get_error(void) const = 0; + /** Returns a dictionary of metadata extracted from the current document. + * Metadata fields may include one or more of the following : + * content, title, ipath, mimetype, language, charset, author, creator, + * publisher, modificationdate, creationdate, size + * Special considerations apply : + * - content may contain binary data, watch out ! + * - ipath is an internal path to the nested document that can be + * later passed to skip_to_document(). It may be empty if the parent + * document's type doesn't allow embedding, in which case the filter + * should only return one document. + * - mimetype should be text/plain if the document could be handled + * internally, empty if unknown. If any other value, it is expected + * that the client application can pass the nested document's content + * to another filter that supports this particular type. + */ + virtual const std::map& + get_meta_data(void) const { + return m_metaData; + } - // Initialization. + virtual void clear() { + m_metaData.clear(); + } + // Hack: is this the special version used for unknown types? + virtual bool is_unknown() { + return false; + } - /** Sets a property, prior to calling set_document_XXX(). - * Returns false if the property is not supported. - */ - virtual bool set_property(Properties prop_name, - const std::string &prop_value) = 0; +protected: + /// The MIME type handled by the filter. + std::string m_mimeType; - /** (Re)initializes the filter with the given data. - * Caller should ensure the given pointer is valid until the - * Filter object is destroyed, as some filters may not need to - * do a deep copy of the data. - * Call next_document() to position the filter onto the first document. - * Returns false if this input is not supported or an error occurred. - */ - virtual bool set_document_data(const std::string& mtype, - const char *data_ptr, - size_t data_length) = 0; + /// Current Metadata dictionary. For multi-document files, + /// this may be rebuilt for each sub-document. See + /// common/cstr.h for the common key definitions. The document + /// text is "content" + std::map m_metaData; - /** (Re)initializes the filter with the given data. - * Call next_document() to position the filter onto the first document. - * Returns false if this input is not supported or an error occurred. - */ - virtual bool set_document_string(const std::string& mtype, - const std::string &data_str) = 0; - - /** (Re)initializes the filter with the given file. - * Call next_document() to position the filter onto the first document. - * Returns false if this input is not supported or an error occurred. - */ - virtual bool set_document_file(const std::string& mtype, - const std::string &file_path) = 0; - - /** (Re)initializes the filter with the given URI. - * Call next_document() to position the filter onto the first document. - * Returns false if this input is not supported or an error occurred. - */ - virtual bool set_document_uri(const std::string& mtype, - const std::string &uri) = 0; - - /** Set the document size meta_data element. This is the size - of the immediate containing file (ie, a .doc, a .odt), not - the size of, ie, a containing archive or .gz nor the size - of the extracted text. This is set externally, because the - surrounding code quite often has a better idea about it - (having created a temp file, etc.), and this saves more - stat() calls The value is stored inside metaData, docsize - key - */ - virtual void set_docsize(int64_t size) = 0; - - // Going from one nested document to the next. - - /** Returns true if there are nested documents left to extract. - * Returns false if the end of the parent document was reached - * or an error occurred. - */ - virtual bool has_documents(void) const = 0; - - /** Moves to the next nested document. - * Returns false if there are none left. - */ - virtual bool next_document(void) = 0; - - /** Skips to the nested document with the given ipath. - * Returns false if no such document exists. - */ - virtual bool skip_to_document(const std::string &ipath) = 0; - - - // Accessing documents' contents. - - /// Returns the message for the most recent error that has occurred. - virtual std::string get_error(void) const = 0; - - /** Returns a dictionary of metadata extracted from the current document. - * Metadata fields may include one or more of the following : - * content, title, ipath, mimetype, language, charset, author, creator, - * publisher, modificationdate, creationdate, size - * Special considerations apply : - * - content may contain binary data, watch out ! - * - ipath is an internal path to the nested document that can be - * later passed to skip_to_document(). It may be empty if the parent - * document's type doesn't allow embedding, in which case the filter - * should only return one document. - * - mimetype should be text/plain if the document could be handled - * internally, empty if unknown. If any other value, it is expected - * that the client application can pass the nested document's content - * to another filter that supports this particular type. - */ - virtual const std::map &get_meta_data(void) const - { - return m_metaData; - } - - virtual void clear() {m_metaData.clear();} - virtual bool is_unknown() {return false;} - protected: - /// The MIME type handled by the filter. - std::string m_mimeType; - /// Metadata dictionary. - std::map m_metaData; - - private: - /// Filter objects cannot be copied. - Filter(const Filter &other); - /// Filter objects cannot be copied. - Filter& operator=(const Filter& other); - - }; +}; } #endif // _DIJON_FILTER_H