/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#ifndef _DIJON_FILTER_H
#define _DIJON_FILTER_H

// The C headers are used on purpose: they guarantee that size_t and int64_t
// are declared in the global namespace, which is how this file spells them.
#include <stddef.h>
#include <stdint.h>

#include <map>
#include <set>
#include <string>

class RclConfig;

namespace Dijon
{
    class Filter;

    /** Provides the list of MIME types supported by the filter(s).
     * The types are stored into the set passed by reference.
     * This function is exported by dynamically loaded filter libraries.
     *
     * NOTE(review): the element type is assumed to be std::string, like
     * every other MIME type in this interface - confirm against the
     * filter libraries.
     */
    typedef bool (get_filter_types_func)(std::set<std::string> &);

    /** Tells whether the given kind of input data can be passed to the
     * filter(s).
     * The argument is a Filter::DataInput value cast to int for convenience.
     * This function is exported by dynamically loaded filter libraries.
     * The aim is to let the client application know before-hand whether
     * it should load documents or not.
     */
    typedef bool (check_filter_data_input_func)(int);

    /** Returns a Filter that handles the given MIME type.
     * The Filter object is allocated with new.
     * This function is exported by dynamically loaded filter libraries
     * and serves as a factory for Filter objects, so that the client
     * application doesn't have to know which Filter sub-types handle
     * which MIME types.
     */
    typedef Filter *(get_filter_func)(const std::string &);

    /// Filter interface.
    class Filter
    {
    public:
        /// Builds a filter with an empty MIME type and no metadata.
        Filter() { }

        /// Destroys the filter.
        virtual ~Filter() {}

        /// Hands a configuration object to the filter.
        virtual void setConfig(RclConfig *) = 0;

        // Enumerations.

        /** What data a filter supports as input.
         * It can be either the whole document data (as a memory buffer
         * or as a string), its file name, or its URI.
         */
        typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME,
                       DOCUMENT_URI } DataInput;

        /** Input properties supported by the filter.
         *
         * - DEFAULT_CHARSET is the source encoding that should be used
         *   for reading/transcoding the original data if there is no
         *   other way to determine it (ie: for text/plain files)
         * - OPERATING_MODE can be set to either view or index.
         * - DJF_UDI Unique document identifier. This can be useful if the
         *   filter wants to manage a persistent cache.
         */
        typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE, DJF_UDI } Properties;

        // Information.

        /// Returns the MIME type handled by the filter.
        std::string get_mime_type(void) const
        {
            return m_mimeType;
        }

        /// Tells whether the filter accepts the given kind of input.
        virtual bool is_data_input_ok(DataInput input) const = 0;

        // Initialization.

        /** Sets a property, prior to calling set_document_XXX().
         * Returns false if the property is not supported.
         */
        virtual bool set_property(Properties prop_name,
                                  const std::string &prop_value) = 0;

        /** (Re)initializes the filter with the given data.
         * Caller should ensure the given pointer is valid until the
         * Filter object is destroyed, as some filters may not need to
         * do a deep copy of the data.
         * Call next_document() to position the filter onto the first document.
         * Returns false if this input is not supported or an error occurred.
         */
        virtual bool set_document_data(const std::string& mtype,
                                       const char *data_ptr,
                                       size_t data_length) = 0;

        /** (Re)initializes the filter with the given data.
         * Call next_document() to position the filter onto the first document.
         * Returns false if this input is not supported or an error occurred.
         */
        virtual bool set_document_string(const std::string& mtype,
                                         const std::string &data_str) = 0;

        /** (Re)initializes the filter with the given file.
         * Call next_document() to position the filter onto the first document.
         * Returns false if this input is not supported or an error occurred.
         */
        virtual bool set_document_file(const std::string& mtype,
                                       const std::string &file_path) = 0;

        /** (Re)initializes the filter with the given URI.
         * Call next_document() to position the filter onto the first document.
         * Returns false if this input is not supported or an error occurred.
         */
        virtual bool set_document_uri(const std::string& mtype,
                                      const std::string &uri) = 0;

        /** Set the document size meta_data element. This is the size
         * of the immediate containing file (ie, a .doc, a .odt), not
         * the size of, ie, a containing archive or .gz nor the size
         * of the extracted text. This is set externally, because the
         * surrounding code quite often has a better idea about it
         * (having created a temp file, etc.), and this saves more
         * stat() calls. The value is stored inside metaData, docsize key.
         */
        virtual void set_docsize(int64_t size) = 0;

        // Going from one nested document to the next.

        /** Returns true if there are nested documents left to extract.
         * Returns false if the end of the parent document was reached
         * or an error occurred.
         */
        virtual bool has_documents(void) const = 0;

        /** Moves to the next nested document.
         * Returns false if there are none left.
         */
        virtual bool next_document(void) = 0;

        /** Skips to the nested document with the given ipath.
         * Returns false if no such document exists.
         */
        virtual bool skip_to_document(const std::string &ipath) = 0;

        // Accessing documents' contents.

        /// Returns the message for the most recent error that has occurred.
        virtual std::string get_error(void) const = 0;

        /** Returns a dictionary of metadata extracted from the current document.
         * Metadata fields may include one or more of the following :
         * content, title, ipath, mimetype, language, charset, author, creator,
         * publisher, modificationdate, creationdate, size
         * Special considerations apply :
         * - content may contain binary data, watch out !
         * - ipath is an internal path to the nested document that can be
         *   later passed to skip_to_document(). It may be empty if the parent
         *   document's type doesn't allow embedding, in which case the filter
         *   should only return one document.
         * - mimetype should be text/plain if the document could be handled
         *   internally, empty if unknown. If any other value, it is expected
         *   that the client application can pass the nested document's content
         *   to another filter that supports this particular type.
         */
        virtual const std::map<std::string, std::string> &get_meta_data(void) const
        {
            return m_metaData;
        }

        /// Empties the metadata dictionary.
        virtual void clear()
        {
            m_metaData.clear();
        }

        /** Returns false in this base implementation.
         * NOTE(review): presumably overridden by a filter that stands in
         * for unhandled MIME types - confirm against the sub-classes.
         */
        virtual bool is_unknown()
        {
            return false;
        }

    protected:
        /// The MIME type handled by the filter.
        std::string m_mimeType;

        /** Metadata dictionary.
         * NOTE(review): key and value types are assumed to be std::string,
         * in line with the field list documented on get_meta_data() - confirm.
         */
        std::map<std::string, std::string> m_metaData;

    private:
        /// Filter objects cannot be copied (declared, never defined).
        Filter(const Filter &other);
        /// Filter objects cannot be copied (declared, never defined).
        Filter& operator=(const Filter& other);
    };
}

#endif // _DIJON_FILTER_H