123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- /*
- * Functions for simple html parsing
- */
-
- #ifndef RSPAMD_HTML_H
- #define RSPAMD_HTML_H
-
- #include "config.h"
- #include "mem_pool.h"
-
- /*
- * HTML content flags.
- * Each macro is a distinct single-bit mask (bits 0..7), so any combination
- * of them can be OR-ed into one integer flags word.
- * NOTE(review): presumably stored in html_content.flags - confirm in the parser.
- * The exact condition that raises each flag is not visible in this header.
- */
- #define RSPAMD_HTML_FLAG_BAD_START (1 << 0) /* 0x01 */
- #define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1) /* 0x02 */
- #define RSPAMD_HTML_FLAG_XML (1 << 2) /* 0x04 */
- #define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3) /* 0x08 */
- #define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4) /* 0x10 */
- #define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5) /* 0x20 */
- #define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6) /* 0x40 */
- #define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7) /* 0x80 */
-
- /*
- * Image flags.
- * These masks use bit positions 0..2, i.e. the same numeric values as the
- * first three HTML content flags, so the two flag sets cannot be mixed in
- * one field.
- * NOTE(review): presumably stored in html_image.flags - confirm.
- */
- #define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0) /* 0x01 */
- #define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1) /* 0x02 */
- #define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2) /* 0x04 */
-
- enum html_component_type { /* Kind of a tag component; this is the type of html_tag_component.type */
- RSPAMD_HTML_COMPONENT_NAME = 0, /* explicitly 0; each following enumerator is the previous one plus 1 */
- RSPAMD_HTML_COMPONENT_HREF, /* 1 */
- RSPAMD_HTML_COMPONENT_COLOR, /* 2 */
- RSPAMD_HTML_COMPONENT_BGCOLOR, /* 3 */
- RSPAMD_HTML_COMPONENT_STYLE, /* 4 */
- RSPAMD_HTML_COMPONENT_CLASS, /* 5 */
- RSPAMD_HTML_COMPONENT_WIDTH, /* 6 */
- RSPAMD_HTML_COMPONENT_HEIGHT, /* 7 */
- RSPAMD_HTML_COMPONENT_SIZE, /* 8 */
- };
-
- struct html_tag_component { /* Typed (pointer, length) pair; used in this header for html_tag.name and html_block.style */
- enum html_component_type type; /* which kind of component this entry describes */
- guint len; /* NOTE(review): presumably the number of bytes available at `start` - confirm */
- const guchar *start; /* first byte of the component data; read-only through this pointer */
- };
-
-
- struct rspamd_image;
-
- struct html_image { /* Per-image record; the code that fills it is not part of this header */
- guint height; /* NOTE(review): units are not stated here - presumably pixels */
- guint width; /* NOTE(review): units are not stated here - presumably pixels */
- guint flags; /* NOTE(review): presumably RSPAMD_HTML_FLAG_IMAGE_* bits - confirm */
- gchar *src; /* NOTE(review): presumably the image source string - confirm; ownership not visible here */
- struct rspamd_url *url; /* pointer to a type that is not defined in this header */
- struct rspamd_image *embedded_image; /* struct rspamd_image is only forward-declared in this header */
- struct html_tag *tag; /* NOTE(review): presumably the tag this image was found in - confirm */
- };
-
- struct html_color { /* Colour kept as four 8-bit channels overlaid on one 32-bit word */
- union {
- struct {
- #if !defined(BYTE_ORDER) || BYTE_ORDER == LITTLE_ENDIAN /* this layout is also the fallback when BYTE_ORDER is undefined */
- guint8 b; /* lowest address, i.e. least significant byte of val on little-endian */
- guint8 g;
- guint8 r;
- guint8 alpha; /* highest address, i.e. most significant byte of val on little-endian */
- #else /* any other byte order: same channels, declared in the opposite order */
- guint8 alpha;
- guint8 r;
- guint8 g;
- guint8 b;
- #endif
- } comp; /* per-channel view of the colour */
- guint32 val; /* packed view; the channel ordering above makes this read as 0xAARRGGBB on little- and big-endian hosts */
- } d;
- gboolean valid; /* NOTE(review): presumably TRUE only when the colour has been set/parsed - confirm */
- };
-
- struct html_block { /* Groups colour, style, font size, visibility and class data together with a tag pointer */
- struct html_tag *tag; /* NOTE(review): presumably the tag these properties belong to - confirm */
- struct html_color font_color; /* stored by value; check font_color.valid before relying on it (TODO confirm semantics of valid) */
- struct html_color background_color; /* stored by value, same representation as font_color */
- struct html_tag_component style; /* NOTE(review): presumably the raw style attribute span - confirm */
- guint font_size; /* NOTE(review): units are not visible in this header */
- gboolean visible; /* NOTE(review): how visibility is decided is not visible in this header */
- gchar *class; /* field is named `class`, a C++ keyword, so this header is C-only as written */
- };
-
- /* Public tag flags: distinct single-bit masks occupying bits 23..28.
- * NOTE(review): presumably OR-ed into html_tag.flags, with the lower bits
- * used by flags defined outside this header - confirm.
- */
- /* XML tag */
- #define FL_XML (1 << 23)
- /* Closing tag */
- #define FL_CLOSING (1 << 24)
- /* Fully closed tag (e.g. <a attrs />) */
- #define FL_CLOSED (1 << 25)
- #define FL_BROKEN (1 << 26) /* NOTE(review): meaning is not documented in this header */
- #define FL_IGNORE (1 << 27) /* NOTE(review): meaning is not documented in this header */
- #define FL_BLOCK (1 << 28) /* NOTE(review): meaning is not documented in this header; may relate to struct html_block */
-
- struct html_tag { /* Record describing one HTML tag */
- gint id; /* NOTE(review): presumably the id used by rspamd_html_tag_by_id()/rspamd_html_tag_by_name() - confirm */
- gint flags; /* NOTE(review): presumably FL_* bits - confirm */
- guint content_length; /* NOTE(review): presumably the length of `content` - confirm */
- struct html_tag_component name; /* tag name as a (type, start, len) component */
- const gchar *content; /* read-only through this pointer; NOTE(review): termination/ownership not visible here */
- GQueue *params; /* NOTE(review): element type is not visible here - possibly html_tag_component - confirm */
- gpointer extra; /** Additional data associated with tag (e.g. image) */
- GNode *parent; /* NOTE(review): presumably the node of the enclosing tag in the tags tree - confirm */
- };
-
- /* Forward declaration */
- struct rspamd_task;
-
- struct html_content { /* Per-part HTML state; passed as `hc` to the rspamd_html_* functions declared in this header */
- struct rspamd_url *base_url; /* NOTE(review): presumably the document base URL - confirm */
- GNode *html_tags; /* NOTE(review): presumably the root of the tree of parsed tags - confirm */
- gint flags; /* NOTE(review): presumably RSPAMD_HTML_FLAG_* content flags - confirm */
- guint total_tags; /* NOTE(review): presumably the number of tags processed - confirm */
- struct html_color bgcolor; /* stored by value */
- guchar *tags_seen; /* NOTE(review): presumably a per-tag-id bitmap consulted by rspamd_html_tag_seen() - confirm */
- GPtrArray *images; /* NOTE(review): presumably holds struct html_image pointers - confirm */
- GPtrArray *blocks; /* NOTE(review): presumably holds struct html_block pointers - confirm */
- };
-
- /*
- * Decode HTML entities in text. Text is modified in place.
- * The identifier is spelled "entitles"; it is left unchanged because callers use that name.
- * s: buffer to decode (written to by the call).
- * len: NOTE(review): presumably the number of bytes of s to process - confirm.
- * NOTE(review): the guint result is presumably the length of the decoded text - confirm.
- */
- guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len);
-
- GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool, /* Processes one HTML part supplied as a byte array; NOTE(review): pool is presumably used for allocations - confirm */
- struct html_content *hc, /* NOTE(review): presumably filled with the parse results - confirm */
- GByteArray *in); /* input bytes; NOTE(review): contents and ownership of the returned array are not visible here - presumably the extracted text */
-
- GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool, /* Same leading parameters and return type as rspamd_html_process_part, plus three extra arguments */
- struct html_content *hc, /* NOTE(review): presumably filled with the parse results - confirm */
- GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails); /* NOTE(review): exceptions/urls/emails are presumably outputs collected while parsing, and whether NULL is accepted is not visible here - confirm */
-
- /*
- * Returns true if a specified tag has been seen in a part.
- * hc: HTML content of the part to query.
- * tagname: name of the tag to look for.
- * NOTE(review): presumably consults hc->tags_seen; the result for an unknown
- * tag name and any case handling are not visible in this header.
- */
- gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
-
- /**
- * Returns name for the specified tag id
- * @param id numeric tag id
- * @return tag name as a read-only string; NOTE(review): the result for an
- * unknown id is not visible in this header (possibly NULL) - confirm
- */
- const gchar* rspamd_html_tag_by_id (gint id);
-
- /**
- * Returns HTML tag id by name
- * @param name tag name to look up
- * @return tag id; NOTE(review): the value returned for an unknown name is
- * not visible in this header (possibly negative) - confirm
- */
- gint rspamd_html_tag_by_name (const gchar *name);
-
- /**
- * Extracts a URL from an HTML tag component and sets component elements if needed
- * @param pool memory pool; NOTE(review): presumably used to allocate the result - confirm
- * @param start first character of the text to examine
- * @param len length of the text at start
- * @param comp tag component that the call may update
- * @return the extracted URL; NOTE(review): the failure value is not visible
- * in this header (presumably NULL) - confirm
- */
- struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool,
- const gchar *start, guint len,
- struct html_tag_component *comp);
-
- #endif
|