comments and unused defs removal

2018-11-14 09:43:20 +01:00 · 2018-11-14 09:43:20 +01:00 · f008457493
commit f008457493
parent 036e1da6b4
1 changed files with 140 additions and 161 deletions
--- a/src/internfile/Filter.h
+++ b/src/internfile/Filter.h
@ -24,184 +24,163 @@

 class RclConfig;

-namespace Dijon
-{
-    class Filter;
+namespace Dijon {

-    /** Provides the list of MIME types supported by the filter(s).
-     * The character string is allocated with new[].
-     * This function is exported by dynamically loaded filter libraries.
+/// Document handler interface.
+///
+/// A document handler can translate the text format
+/// (e.g. msdoc->text/plain) and/or extract subdocuments from
+/// multidocument formats (e.g. mbox->message, message->attachments,
+/// zip, etc.).
+class Filter {
+public:
+    Filter() {}
+    virtual ~Filter() {}
+    /// Filter objects cannot be copied.
+    Filter(const Filter &other) = delete;
+    Filter& operator=(const Filter& other) = delete;
+
+    // Gives the filter access to the general config. This is a borrowed
+    // pointer: it has to be read/write, but the filter must not delete it.
+    virtual void setConfig(RclConfig *) = 0;
+
+    /// Returns the MIME type handled by the filter.
+    std::string get_mime_type(void) const {
+        return m_mimeType;
+    }
+
+    /** Supported input types */
+    typedef enum {DOCUMENT_DATA = 0, DOCUMENT_STRING, DOCUMENT_FILE_NAME, 
+                  DOCUMENT_URI} DataInput;
+
+    virtual bool is_data_input_ok(DataInput input) const = 0;
+
+    /* Properties to be set prior to actual operation */
+    typedef enum {
+        // Source encoding to be used for reading/transcoding the
+        // original data if there is no other way to determine it
+        // (e.g. for text/plain files)
+        DEFAULT_CHARSET = 0,
+        // Either "view" or "index". Some implementations produce
+        // slightly different data (e.g. avoiding repeating some
+        // text in index mode)
+        OPERATING_MODE,
+        // Unique document identifier. This can be useful if the
+        // filter wants to manage a persistent cache (e.g. mh_mbox)
+        DJF_UDI
+    } Properties;
+
+    /** Sets a property, prior to calling set_document_XXX().
+     * Returns false if the property or value is not supported. */
+    virtual bool set_property(Properties prop_name, 
+                              const std::string &prop_value) = 0;
+
+    /** (Re)initializes the filter with the given data.
+     * Caller should ensure the given pointer is valid until the
+     * Filter object is destroyed, as some filters may not need to
+     * do a deep copy of the data.
+     * Call next_document() to position the filter onto the first document.
+     * Returns false if this input is not supported or an error occurred.
     */
-    typedef bool (get_filter_types_func)(std::set<std::string> &);
-    /** Returns what data should be passed to the filter(s).
-     * Output is cast from Filter::DataInput to int for convenience.
-     * This function is exported by dynamically loaded filter libraries.
-     * The aim is to let the client application know before-hand whether
-     * it should load documents or not.
+    virtual bool set_document_data(const std::string& mtype, 
+                                   const char *data_ptr, 
+                                   size_t data_length) = 0;
+
+    /** (Re)initializes the filter with the given data.
+     * Call next_document() to position the filter onto the first document.
+     * Returns false if this input is not supported or an error occurred.
     */
-    typedef bool (check_filter_data_input_func)(int);
-    /** Returns a Filter that handles the given MIME type.
-     * The Filter object is allocated with new.
-     * This function is exported by dynamically loaded filter libraries
-     * and serves as a factory for Filter objects, so that the client
-     * application doesn't have to know which Filter sub-types handle
-     * which MIME types.
+    virtual bool set_document_string(const std::string& mtype, 
+                                     const std::string &data_str) = 0;
+
+    /** (Re)initializes the filter with the given file.
+     * Call next_document() to position the filter onto the first document.
+     * Returns false if this input is not supported or an error occurred.
     */
-    typedef Filter *(get_filter_func)(const std::string &);
+    virtual bool set_document_file(const std::string& mtype, 
+                                   const std::string &file_path) = 0;

-    /// Filter interface.
-    class Filter
-    {
-    public:
-	/// Destroys the filter.
-	Filter()
-	{
-	}
-	virtual ~Filter() {}
-	virtual void setConfig(RclConfig *) = 0;
+    /** (Re)initializes the filter with the given URI.
+     * Call next_document() to position the filter onto the first document.
+     * Returns false if this input is not supported or an error occurred.
+     * No implementation supports this at the moment.
+     */
+    virtual bool set_document_uri(const std::string& mtype, 
+                                  const std::string &uri) = 0;

-	// Enumerations.
+    /** Set the document size meta_data element. This is the size
+        of the immediate containing file (e.g. a .doc, a .odt), not
+        the size of, e.g., a containing archive or .gz, nor the size
+        of the extracted text. This is set externally, because the
+        surrounding code quite often has a better idea about it
+        (having created a temp file, etc.), and this saves more
+        stat() calls. The value is stored inside metaData, under
+        the docsize key.
+    */
+    virtual void set_docsize(int64_t size) = 0;

-	/** What data a filter supports as input.
-	 * It can be either the whole document data, its file name, or its URI.
-	 */
-	typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME, 
-		       DOCUMENT_URI } DataInput;
+    // Going from one nested document to the next.

-	/** Input properties supported by the filter.
-	 *
-	 * - DEFAULT_CHARSET is the source encoding that should be used
-	 *   for reading/transcoding the original data if there is no
-	 *   other way to determine it (ie: for text/plain files)
-	 * - OPERATING_MODE can be set to either view or index.
-	 * - DJF_UDI Unique document identifier. This can be useful if the
-	 *     filter wants to manage a persistent cache.
-	 */
-	typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE, DJF_UDI } Properties;
+    /** Returns true if there are nested documents left to extract.
+     * Returns false if the end of the parent document was reached
+     * or an error occurred.
+     */
+    virtual bool has_documents(void) const = 0;

+    /** Moves to the next nested document.
+     * Returns false if there are none left.
+     */ 
+    virtual bool next_document(void) = 0;

-	// Information.
+    /** Skips to the nested document with the given ipath.
+     * Returns false if no such document exists.
+     */
+    virtual bool skip_to_document(const std::string &ipath) = 0;

-	/// Returns the MIME type handled by the filter.
-	std::string get_mime_type(void) const
-	{
-		return m_mimeType;
-	}
+    // Accessing documents' contents.

-	/// Returns what data the filter requires as input.
-	virtual bool is_data_input_ok(DataInput input) const = 0;
+    /// Returns the message for the most recent error that has occurred.
+    virtual std::string get_error(void) const = 0;

+    /** Returns a dictionary of metadata extracted from the current document.
+     * Metadata fields may include one or more of the following :
+     * content, title, ipath, mimetype, language, charset, author, creator,
+     * publisher, modificationdate, creationdate, size
+     * Special considerations apply :
+     * - content may contain binary data, watch out !
+     * - ipath is an internal path to the nested document that can be
+     * later passed to skip_to_document(). It may be empty if the parent
+     * document's type doesn't allow embedding, in which case the filter
+     * should only return one document.
+     * - mimetype should be text/plain if the document could be handled
+     * internally, empty if unknown. If any other value, it is expected
+     * that the client application can pass the nested document's content
+     * to another filter that supports this particular type.
+     */
+    virtual const std::map<std::string, std::string>&
+    get_meta_data(void) const {
+        return m_metaData;
+    }

-	// Initialization.
+    virtual void clear() {
+        m_metaData.clear();
+    }
+    // Hack: is this the special version used for unknown types?
+    virtual bool is_unknown() {
+        return false;
+    }

-	/** Sets a property, prior to calling set_document_XXX().
-	 * Returns false if the property is not supported.
-	 */
-	virtual bool set_property(Properties prop_name, 
-				  const std::string &prop_value) = 0;
+protected:
+    /// The MIME type handled by the filter.
+    std::string m_mimeType;

-	/** (Re)initializes the filter with the given data.
-	 * Caller should ensure the given pointer is valid until the
-	 * Filter object is destroyed, as some filters may not need to
-	 * do a deep copy of the data.
-	 * Call next_document() to position the filter onto the first document.
-	 * Returns false if this input is not supported or an error occurred.
-	 */
-	virtual bool set_document_data(const std::string& mtype, 
-				       const char *data_ptr, 
-				       size_t data_length) = 0;
+    /// Current metadata dictionary. For multi-document files,
+    /// this may be rebuilt for each sub-document. See
+    /// common/cstr.h for the common key definitions. The document
+    /// text is stored under the "content" key.
+    std::map<std::string, std::string> m_metaData;

-	/** (Re)initializes the filter with the given data.
-	 * Call next_document() to position the filter onto the first document.
-	 * Returns false if this input is not supported or an error occurred.
-	 */
-	virtual bool set_document_string(const std::string& mtype, 
-					 const std::string &data_str) = 0;
-
-	/** (Re)initializes the filter with the given file.
-	 * Call next_document() to position the filter onto the first document.
-	 * Returns false if this input is not supported or an error occurred.
-	 */
-	virtual bool set_document_file(const std::string& mtype, 
-				       const std::string &file_path) = 0;
-
-	/** (Re)initializes the filter with the given URI.
-	 * Call next_document() to position the filter onto the first document.
-	 * Returns false if this input is not supported or an error occurred.
-	 */
-	virtual bool set_document_uri(const std::string& mtype, 
-				      const std::string &uri) = 0;
-
-	/** Set the document size meta_data element. This is the size
-	    of the immediate containing file (ie, a .doc, a .odt), not
-	    the size of, ie, a containing archive or .gz nor the size
-	    of the extracted text. This is set externally, because the
-	    surrounding code quite often has a better idea about it
-	    (having created a temp file, etc.), and this saves more
-	    stat() calls The value is stored inside metaData, docsize
-	    key
-	*/
-	virtual void set_docsize(int64_t size) = 0;
-
-	// Going from one nested document to the next.
-
-	/** Returns true if there are nested documents left to extract.
-	 * Returns false if the end of the parent document was reached
-	 * or an error occurred.
-	 */
-	virtual bool has_documents(void) const = 0;
-
-	/** Moves to the next nested document.
-	 * Returns false if there are none left.
-	 */ 
-	virtual bool next_document(void) = 0;
-
-	/** Skips to the nested document with the given ipath.
-	 * Returns false if no such document exists.
-	 */
-	virtual bool skip_to_document(const std::string &ipath) = 0;
-
-
-	// Accessing documents' contents.
-
-	/// Returns the message for the most recent error that has occurred.
-	virtual std::string get_error(void) const = 0;
-
-	/** Returns a dictionary of metadata extracted from the current document.
-	 * Metadata fields may include one or more of the following :
-	 * content, title, ipath, mimetype, language, charset, author, creator,
-	 * publisher, modificationdate, creationdate, size
-	 * Special considerations apply :
-	 * - content may contain binary data, watch out !
-	 * - ipath is an internal path to the nested document that can be
-	 * later passed to skip_to_document(). It may be empty if the parent
-	 * document's type doesn't allow embedding, in which case the filter
-	 * should only return one document.
-	 * - mimetype should be text/plain if the document could be handled
-	 * internally, empty if unknown. If any other value, it is expected
-	 * that the client application can pass the nested document's content
-	 * to another filter that supports this particular type.
-	 */
-	virtual const std::map<std::string, std::string> &get_meta_data(void) const
-	{
-	    return m_metaData;
-	}
-
-	virtual void clear() {m_metaData.clear();}
-	virtual bool is_unknown() {return false;}
-    protected:
-	/// The MIME type handled by the filter.
-	std::string m_mimeType;
-	/// Metadata dictionary.
-	std::map<std::string, std::string> m_metaData;
-
-    private:
-	/// Filter objects cannot be copied.
-	Filter(const Filter &other);
-	/// Filter objects cannot be copied.
-	Filter& operator=(const Filter& other);
-
-    };
+};
 }

 #endif // _DIJON_FILTER_H