comments and unused defs removal
This commit is contained in:
parent
036e1da6b4
commit
f008457493
@ -24,184 +24,163 @@
|
||||
|
||||
class RclConfig;
|
||||
|
||||
namespace Dijon
|
||||
{
|
||||
class Filter;
|
||||
namespace Dijon {
|
||||
|
||||
/** Provides the list of MIME types supported by the filter(s).
|
||||
* The character string is allocated with new[].
|
||||
* This function is exported by dynamically loaded filter libraries.
|
||||
/// Document handler interface.
|
||||
///
|
||||
/// Document handler can either translate the text format
|
||||
/// (e.g. msdoc->text/plain), or/and extract subdocuments from
|
||||
/// multidocument formats (e.g. mbox->message, message->attachments,
|
||||
/// zip etc.)
|
||||
class Filter {
|
||||
public:
|
||||
Filter() {}
|
||||
virtual ~Filter() {}
|
||||
/// Filter objects cannot be copied.
|
||||
Filter(const Filter &other) = delete;
|
||||
Filter& operator=(const Filter& other) = delete;
|
||||
|
||||
// Allow me to access the general config. This is a borrowed
|
||||
// pointer. It has to be read/write, but don't delete.
|
||||
virtual void setConfig(RclConfig *) = 0;
|
||||
|
||||
/// Returns the MIME type handled by the filter.
|
||||
std::string get_mime_type(void) const {
|
||||
return m_mimeType;
|
||||
}
|
||||
|
||||
/** Supported input types */
|
||||
typedef enum {DOCUMENT_DATA = 0, DOCUMENT_STRING, DOCUMENT_FILE_NAME,
|
||||
DOCUMENT_URI} DataInput;
|
||||
|
||||
virtual bool is_data_input_ok(DataInput input) const = 0;
|
||||
|
||||
/* Properties to be set prior to actual operation */
|
||||
typedef enum {
|
||||
// Source encoding to be used for reading/transcoding the
|
||||
// original data if there is no other way to determine it
|
||||
// (e.g. for text/plain files)
|
||||
DEFAULT_CHARSET = 0,
|
||||
// Either "view" or "index". Some implementations produce
|
||||
// slightly different data (e.g. avoiding repeating some
|
||||
// text in index mode)
|
||||
OPERATING_MODE,
|
||||
// Unique document identifier. This can be useful if the
|
||||
// filter wants to manage a persistent cache (e.g. mh_mbox)
|
||||
DJF_UDI
|
||||
} Properties;
|
||||
|
||||
/** Sets a property, prior to calling set_document_XXX().
|
||||
* Returns false if the property or value is not supported. */
|
||||
virtual bool set_property(Properties prop_name,
|
||||
const std::string &prop_value) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given data.
|
||||
* Caller should ensure the given pointer is valid until the
|
||||
* Filter object is destroyed, as some filters may not need to
|
||||
* do a deep copy of the data.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
typedef bool (get_filter_types_func)(std::set<std::string> &);
|
||||
/** Returns what data should be passed to the filter(s).
|
||||
* Output is cast from Filter::DataInput to int for convenience.
|
||||
* This function is exported by dynamically loaded filter libraries.
|
||||
* The aim is to let the client application know before-hand whether
|
||||
* it should load documents or not.
|
||||
virtual bool set_document_data(const std::string& mtype,
|
||||
const char *data_ptr,
|
||||
size_t data_length) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given data.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
typedef bool (check_filter_data_input_func)(int);
|
||||
/** Returns a Filter that handles the given MIME type.
|
||||
* The Filter object is allocated with new.
|
||||
* This function is exported by dynamically loaded filter libraries
|
||||
* and serves as a factory for Filter objects, so that the client
|
||||
* application doesn't have to know which Filter sub-types handle
|
||||
* which MIME types.
|
||||
virtual bool set_document_string(const std::string& mtype,
|
||||
const std::string &data_str) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given file.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
typedef Filter *(get_filter_func)(const std::string &);
|
||||
virtual bool set_document_file(const std::string& mtype,
|
||||
const std::string &file_path) = 0;
|
||||
|
||||
/// Filter interface.
|
||||
class Filter
|
||||
{
|
||||
public:
|
||||
/// Destroys the filter.
|
||||
Filter()
|
||||
{
|
||||
}
|
||||
virtual ~Filter() {}
|
||||
virtual void setConfig(RclConfig *) = 0;
|
||||
/** (Re)initializes the filter with the given URI.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
* No implementation supports this at the moment.
|
||||
*/
|
||||
virtual bool set_document_uri(const std::string& mtype,
|
||||
const std::string &uri) = 0;
|
||||
|
||||
// Enumerations.
|
||||
/** Set the document size meta_data element. This is the size
|
||||
of the immediate containing file (e.g. a .doc, a .odt), not
|
||||
the size of, e.g., a containing archive or .gz nor the size
|
||||
of the extracted text. This is set externally, because the
|
||||
surrounding code quite often has a better idea about it
|
||||
(having created a temp file, etc.), and this saves more
|
||||
stat() calls. The value is stored inside metaData, docsize
|
||||
key
|
||||
*/
|
||||
virtual void set_docsize(int64_t size) = 0;
|
||||
|
||||
/** What data a filter supports as input.
|
||||
* It can be either the whole document data, its file name, or its URI.
|
||||
*/
|
||||
typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME,
|
||||
DOCUMENT_URI } DataInput;
|
||||
// Going from one nested document to the next.
|
||||
|
||||
/** Input properties supported by the filter.
|
||||
*
|
||||
* - DEFAULT_CHARSET is the source encoding that should be used
|
||||
* for reading/transcoding the original data if there is no
|
||||
* other way to determine it (e.g. for text/plain files)
|
||||
* - OPERATING_MODE can be set to either view or index.
|
||||
* - DJF_UDI Unique document identifier. This can be useful if the
|
||||
* filter wants to manage a persistent cache.
|
||||
*/
|
||||
typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE, DJF_UDI } Properties;
|
||||
/** Returns true if there are nested documents left to extract.
|
||||
* Returns false if the end of the parent document was reached
|
||||
* or an error occurred.
|
||||
*/
|
||||
virtual bool has_documents(void) const = 0;
|
||||
|
||||
/** Moves to the next nested document.
|
||||
* Returns false if there are none left.
|
||||
*/
|
||||
virtual bool next_document(void) = 0;
|
||||
|
||||
// Information.
|
||||
/** Skips to the nested document with the given ipath.
|
||||
* Returns false if no such document exists.
|
||||
*/
|
||||
virtual bool skip_to_document(const std::string &ipath) = 0;
|
||||
|
||||
/// Returns the MIME type handled by the filter.
|
||||
std::string get_mime_type(void) const
|
||||
{
|
||||
return m_mimeType;
|
||||
}
|
||||
// Accessing documents' contents.
|
||||
|
||||
/// Returns what data the filter requires as input.
|
||||
virtual bool is_data_input_ok(DataInput input) const = 0;
|
||||
/// Returns the message for the most recent error that has occurred.
|
||||
virtual std::string get_error(void) const = 0;
|
||||
|
||||
/** Returns a dictionary of metadata extracted from the current document.
|
||||
* Metadata fields may include one or more of the following :
|
||||
* content, title, ipath, mimetype, language, charset, author, creator,
|
||||
* publisher, modificationdate, creationdate, size
|
||||
* Special considerations apply :
|
||||
* - content may contain binary data, watch out !
|
||||
* - ipath is an internal path to the nested document that can be
|
||||
* later passed to skip_to_document(). It may be empty if the parent
|
||||
* document's type doesn't allow embedding, in which case the filter
|
||||
* should only return one document.
|
||||
* - mimetype should be text/plain if the document could be handled
|
||||
* internally, empty if unknown. If any other value, it is expected
|
||||
* that the client application can pass the nested document's content
|
||||
* to another filter that supports this particular type.
|
||||
*/
|
||||
virtual const std::map<std::string, std::string>&
|
||||
get_meta_data(void) const {
|
||||
return m_metaData;
|
||||
}
|
||||
|
||||
// Initialization.
|
||||
virtual void clear() {
|
||||
m_metaData.clear();
|
||||
}
|
||||
// Hack: is this the special version used for unknown types?
|
||||
virtual bool is_unknown() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Sets a property, prior to calling set_document_XXX().
|
||||
* Returns false if the property is not supported.
|
||||
*/
|
||||
virtual bool set_property(Properties prop_name,
|
||||
const std::string &prop_value) = 0;
|
||||
protected:
|
||||
/// The MIME type handled by the filter.
|
||||
std::string m_mimeType;
|
||||
|
||||
/** (Re)initializes the filter with the given data.
|
||||
* Caller should ensure the given pointer is valid until the
|
||||
* Filter object is destroyed, as some filters may not need to
|
||||
* do a deep copy of the data.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_data(const std::string& mtype,
|
||||
const char *data_ptr,
|
||||
size_t data_length) = 0;
|
||||
/// Current Metadata dictionary. For multi-document files,
|
||||
/// this may be rebuilt for each sub-document. See
|
||||
/// common/cstr.h for the common key definitions. The document
|
||||
/// text is "content"
|
||||
std::map<std::string, std::string> m_metaData;
|
||||
|
||||
/** (Re)initializes the filter with the given data.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_string(const std::string& mtype,
|
||||
const std::string &data_str) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given file.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_file(const std::string& mtype,
|
||||
const std::string &file_path) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given URI.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_uri(const std::string& mtype,
|
||||
const std::string &uri) = 0;
|
||||
|
||||
/** Set the document size meta_data element. This is the size
|
||||
of the immediate containing file (e.g. a .doc, a .odt), not
|
||||
the size of, e.g., a containing archive or .gz nor the size
|
||||
of the extracted text. This is set externally, because the
|
||||
surrounding code quite often has a better idea about it
|
||||
(having created a temp file, etc.), and this saves more
|
||||
stat() calls. The value is stored inside metaData, docsize
|
||||
key
|
||||
*/
|
||||
virtual void set_docsize(int64_t size) = 0;
|
||||
|
||||
// Going from one nested document to the next.
|
||||
|
||||
/** Returns true if there are nested documents left to extract.
|
||||
* Returns false if the end of the parent document was reached
|
||||
* or an error occurred.
|
||||
*/
|
||||
virtual bool has_documents(void) const = 0;
|
||||
|
||||
/** Moves to the next nested document.
|
||||
* Returns false if there are none left.
|
||||
*/
|
||||
virtual bool next_document(void) = 0;
|
||||
|
||||
/** Skips to the nested document with the given ipath.
|
||||
* Returns false if no such document exists.
|
||||
*/
|
||||
virtual bool skip_to_document(const std::string &ipath) = 0;
|
||||
|
||||
|
||||
// Accessing documents' contents.
|
||||
|
||||
/// Returns the message for the most recent error that has occurred.
|
||||
virtual std::string get_error(void) const = 0;
|
||||
|
||||
/** Returns a dictionary of metadata extracted from the current document.
|
||||
* Metadata fields may include one or more of the following :
|
||||
* content, title, ipath, mimetype, language, charset, author, creator,
|
||||
* publisher, modificationdate, creationdate, size
|
||||
* Special considerations apply :
|
||||
* - content may contain binary data, watch out !
|
||||
* - ipath is an internal path to the nested document that can be
|
||||
* later passed to skip_to_document(). It may be empty if the parent
|
||||
* document's type doesn't allow embedding, in which case the filter
|
||||
* should only return one document.
|
||||
* - mimetype should be text/plain if the document could be handled
|
||||
* internally, empty if unknown. If any other value, it is expected
|
||||
* that the client application can pass the nested document's content
|
||||
* to another filter that supports this particular type.
|
||||
*/
|
||||
virtual const std::map<std::string, std::string> &get_meta_data(void) const
|
||||
{
|
||||
return m_metaData;
|
||||
}
|
||||
|
||||
virtual void clear() {m_metaData.clear();}
|
||||
virtual bool is_unknown() {return false;}
|
||||
protected:
|
||||
/// The MIME type handled by the filter.
|
||||
std::string m_mimeType;
|
||||
/// Metadata dictionary.
|
||||
std::map<std::string, std::string> m_metaData;
|
||||
|
||||
private:
|
||||
/// Filter objects cannot be copied.
|
||||
Filter(const Filter &other);
|
||||
/// Filter objects cannot be copied.
|
||||
Filter& operator=(const Filter& other);
|
||||
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
#endif // _DIJON_FILTER_H
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user