/* * Functions for simple html parsing */ #ifndef RSPAMD_HTML_H #define RSPAMD_HTML_H #include "config.h" #include "mem_pool.h" /* * HTML content flags */ #define RSPAMD_HTML_FLAG_BAD_START (1 << 0) #define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1) #define RSPAMD_HTML_FLAG_XML (1 << 2) #define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3) #define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4) #define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5) /* * Image flags */ #define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0) #define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1) enum html_component_type { RSPAMD_HTML_COMPONENT_NAME = 0, RSPAMD_HTML_COMPONENT_HREF, RSPAMD_HTML_COMPONENT_COLOR, RSPAMD_HTML_COMPONENT_STYLE, RSPAMD_HTML_COMPONENT_CLASS, RSPAMD_HTML_COMPONENT_WIDTH, RSPAMD_HTML_COMPONENT_HEIGHT }; struct html_tag_component { enum html_component_type type; guint len; const guchar *start; }; struct html_image { guint height; guint width; guint flags; gchar *src; struct html_tag *tag; }; struct html_color { union { struct { #if !defined(BYTE_ORDER) || BYTE_ORDER == LITTLE_ENDIAN guint8 b; guint8 g; guint8 r; guint8 alpha; #else guint8 alpha; guint8 r; guint8 g; guint8 b; #endif } comp; guint32 val; } d; gboolean valid; }; struct html_block { struct html_tag *tag; struct html_color font_color; struct html_color background_color; struct html_tag_component style; guint font_size; gchar *class; }; struct html_tag { gint id; gint flags; struct html_tag_component name; GQueue *params; gpointer extra; /** Additional data associated with tag (e.g. image) */ GNode *parent; }; /* Forwarded declaration */ struct rspamd_task; struct html_content { GNode *html_tags; gint flags; guchar *tags_seen; GPtrArray *images; GPtrArray *blocks; }; /* * Decode HTML entitles in text. Text is modified in place. */ guint rspamd_html_decode_entitles_inplace (gchar *s, guint len); GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in); GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails); /* * Returns true if a specified tag has been seen in a part */ gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname); /** * Returns name for the specified tag id * @param id * @return */ const gchar* rspamd_html_tag_by_id (gint id); /** * Extract URL from HTML tag component and sets component elements if needed * @param pool * @param start * @param len * @param comp * @return */ struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, struct html_tag_component *comp); #endif