/*
* Functions for simple html parsing
*/
#ifndef RSPAMD_HTML_H
#define RSPAMD_HTML_H
#include "config.h"
#include "libutil/mem_pool.h"
#include "libserver/url.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
 * HTML content flags
 *
 * Single-bit masks occupying bits 0..7, so any combination can be OR-ed
 * together in one integer flags word.
 * NOTE(review): presumably these are the values kept in the `flags` member of
 * struct html_content - confirm against the parser implementation.
 */
#define RSPAMD_HTML_FLAG_BAD_START (1 << 0)
#define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1)
#define RSPAMD_HTML_FLAG_XML (1 << 2)
#define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3)
#define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4)
#define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5)
#define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6)
#define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7)
/*
 * Image flags
 *
 * Single-bit masks occupying bits 0..2. These bit positions are the same as
 * the first three HTML content flags, so the two flag sets must never be mixed
 * in one flags word.
 * NOTE(review): presumably these are the values kept in the `flags` member of
 * struct html_image - confirm against the parser implementation.
 */
#define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0)
#define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1)
#define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2)
/*
 * Kind of a tag component; used as the `type` member of
 * struct html_tag_component.
 * Numbering starts at 0 and is sequential, so new kinds should be appended
 * at the end to keep the existing values stable.
 * NOTE(review): apart from NAME, the enumerators look like HTML attribute
 * names (href, color, bgcolor, ...) - confirm against the parser.
 */
enum html_component_type {
RSPAMD_HTML_COMPONENT_NAME = 0,
RSPAMD_HTML_COMPONENT_HREF,
RSPAMD_HTML_COMPONENT_COLOR,
RSPAMD_HTML_COMPONENT_BGCOLOR,
RSPAMD_HTML_COMPONENT_STYLE,
RSPAMD_HTML_COMPONENT_CLASS,
RSPAMD_HTML_COMPONENT_WIDTH,
RSPAMD_HTML_COMPONENT_HEIGHT,
RSPAMD_HTML_COMPONENT_SIZE,
RSPAMD_HTML_COMPONENT_REL,
};
/*
 * A typed span of bytes belonging to a tag: `start` points at the first byte
 * and `len` gives the span length.
 * NOTE(review): the span is presumably not NUL-terminated and its storage is
 * owned elsewhere - confirm before treating `start` as a C string or freeing it.
 */
struct html_tag_component {
/* Which kind of component this span represents */
enum html_component_type type;
/* Length of the span starting at `start` */
guint len;
/* First byte of the span; read-only through this pointer */
const guchar *start;
};
/* Defined elsewhere; only pointers to it are used in this header */
struct rspamd_image;
/*
 * Description of an image found in HTML content.
 */
struct html_image {
/* Image dimensions. NOTE(review): units are presumably pixels - confirm */
guint height;
guint width;
/* NOTE(review): presumably a mask of RSPAMD_HTML_FLAG_IMAGE_* bits - confirm */
guint flags;
/* Image source string */
gchar *src;
/* URL object associated with the image */
struct rspamd_url *url;
/* Associated embedded image object */
struct rspamd_image *embedded_image;
/* Tag this image is associated with */
struct html_tag *tag;
};
/*
 * A colour that can be accessed either per channel (d.comp) or as a single
 * 32-bit word (d.val).
 *
 * The order of the channel members depends on the host byte order, so that
 * on both little- and big-endian hosts d.val reads as 0xAARRGGBB: alpha in
 * the most significant byte and blue in the least significant one.
 * If BYTE_ORDER is not defined at all, the little-endian layout is used.
 */
struct html_color {
union {
struct {
#if !defined(BYTE_ORDER) || BYTE_ORDER == LITTLE_ENDIAN
/* Little-endian: lowest address holds the least significant byte */
guint8 b;
guint8 g;
guint8 r;
guint8 alpha;
#else
/* Big-endian: lowest address holds the most significant byte */
guint8 alpha;
guint8 r;
guint8 g;
guint8 b;
#endif
} comp;
/* All four channels packed in one word, see the layout note above */
guint32 val;
} d;
/* NOTE(review): presumably TRUE only when `d` holds a parsed colour - confirm */
gboolean valid;
};
/*
 * Presentation properties attached to a tag: colours, style, font size,
 * visibility and class.
 */
struct html_block {
/* Tag this block is associated with */
struct html_tag *tag;
/* Text (foreground) colour */
struct html_color font_color;
/* Background colour */
struct html_color background_color;
/* Style component; a byte span, see struct html_tag_component */
struct html_tag_component style;
/* Font size. NOTE(review): units are not visible in this header */
guint font_size;
/* Visibility flag for the block */
gboolean visible;
/* Class string. NOTE(review): named html_class, presumably because `class`
 * is a C++ keyword and this header is usable from C++ - confirm */
gchar *html_class;
};
/*
 * Public tags flags
 *
 * Single-bit masks occupying bits 23..30.
 * NOTE(review): the lower bits are presumably reserved for parser-private
 * flags kept in the same `flags` word of struct html_tag - confirm.
 * FL_IMAGE already uses bit 30, the highest value bit of a 32-bit signed int;
 * a further flag written as (1 << 31) would overflow a signed int.
 */
/* XML tag */
#define FL_XML (1 << 23)
/* Closing tag */
#define FL_CLOSING (1 << 24)
/* Fully closed tag (e.g. <a attrs />) */
#define FL_CLOSED (1 << 25)
#define FL_BROKEN (1 << 26)
#define FL_IGNORE (1 << 27)
#define FL_BLOCK (1 << 28)
#define FL_HREF (1 << 29)
#define FL_IMAGE (1 << 30)
/*
 * A single HTML tag.
 */
struct html_tag {
/* Numeric tag id. NOTE(review): presumably the id space used by
 * rspamd_html_tag_by_id() / rspamd_html_tag_by_name() - confirm */
gint id;
/* Flags word. NOTE(review): presumably holds the FL_* bits - confirm */
gint flags;
/* Tag name as a byte span */
struct html_tag_component name;
/* Length and offset of the tag content.
 * NOTE(review): the buffer the offset is relative to is not visible here */
guint content_length;
goffset content_offset;
/* Queue of tag parameters. NOTE(review): element type is not visible here,
 * presumably struct html_tag_component - confirm */
GQueue *params;
gpointer extra; /** Additional data associated with tag (e.g. image) */
/* Parent node in the tags tree */
GNode *parent;
};
/* Forward declaration */
struct rspamd_task;
/*
 * State and results for one HTML part; passed as `hc` to
 * rspamd_html_process_part() and rspamd_html_process_part_full().
 */
struct html_content {
/* Base URL of the document */
struct rspamd_url *base_url;
/* Tree of tags */
GNode *html_tags;
/* NOTE(review): presumably a mask of RSPAMD_HTML_FLAG_* content bits - confirm */
gint flags;
/* Number of tags counted for this content */
guint total_tags;
/* Background colour of the document */
struct html_color bgcolor;
/* Per-tag "seen" storage. NOTE(review): presumably what
 * rspamd_html_tag_seen() consults; its encoding is not visible here */
guchar *tags_seen;
/* NOTE(review): presumably elements are struct html_image * - confirm */
GPtrArray *images;
/* NOTE(review): presumably elements are struct html_block * - confirm */
GPtrArray *blocks;
/* Parsed output bytes */
GByteArray *parsed;
};
/*
 * Decode HTML entities in text. Text is modified in place.
 * @param s text buffer to decode; overwritten with the decoded text
 * @param len length of the text in `s`
 * @return NOTE(review): presumably the length of the decoded text - confirm
 *         against the implementation
 */
guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len);
/*
 * Process an HTML part.
 * Takes the same first three arguments as rspamd_html_process_part_full().
 * NOTE(review): presumably a convenience wrapper around that function with
 * the optional arguments omitted - confirm against the implementation.
 * @param pool memory pool
 * @param hc HTML content structure for this part
 * @param in input bytes of the part
 * @return byte array. NOTE(review): presumably the parsed text; ownership is
 *         not visible in this header
 */
GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in);
/*
 * Process an HTML part, with additional URL-related arguments.
 * @param pool memory pool
 * @param hc HTML content structure for this part
 * @param in input bytes of the part
 * @param exceptions pointer to a list head. NOTE(review): presumably an
 *        output list that the parser appends to - confirm
 * @param url_set hash set of URLs
 * @param part_urls array of URLs for this part
 * @return byte array. NOTE(review): presumably the parsed text; ownership is
 *         not visible in this header
 * NOTE(review): whether exceptions, url_set or part_urls may be NULL is not
 * visible in this header.
 */
GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls);
/*
 * Returns true if a specified tag has been seen in a part
 * @param hc HTML content of the part to query
 * @param tagname tag name to look for
 * @return TRUE if the tag has been seen
 * NOTE(review): the result for an unrecognised tag name is not visible in
 * this header.
 */
gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
/**
 * Returns name for the specified tag id
 * @param id numeric tag id
 * @return tag name; the string is const and must not be modified by the caller.
 *         NOTE(review): the result for an unknown id is not visible in this
 *         header (possibly NULL) - confirm before dereferencing
 */
const gchar *rspamd_html_tag_by_id (gint id);
/**
 * Returns HTML tag id by name
 * @param name tag name to look up
 * @return numeric tag id.
 *         NOTE(review): the value returned for an unknown name is not visible
 *         in this header (possibly negative) - confirm against the implementation
 */
gint rspamd_html_tag_by_name (const gchar *name);
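/**
 * Extracts URL from HTML tag component and sets component elements if needed
 * NOTE(review): no declaration follows this comment in this header; the
 * documented function appears to have been moved or removed. Either restore
 * its prototype here or drop this comment.
 * @param pool
 * @param start
 * @param len
 * @param comp
 * @return
 */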
#ifdef __cplusplus
}
#endif
#endif