summaryrefslogtreecommitdiffstats
path: root/src/libserver/html.h
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2014-04-21 16:25:51 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2014-04-21 16:25:51 +0100
commit61555065f3d1c8badcc9573691232f1b6e42988c (patch)
tree563d5b7cb8c468530f7e79c4da0a75267b1184e1 /src/libserver/html.h
parentad5bf825b7f33bc10311673991f0cc888e69c0b1 (diff)
downloadrspamd-61555065f3d1c8badcc9573691232f1b6e42988c.tar.gz
rspamd-61555065f3d1c8badcc9573691232f1b6e42988c.zip
Rework project structure, remove trash files.
Diffstat (limited to 'src/libserver/html.h')
-rw-r--r--src/libserver/html.h226
1 files changed, 226 insertions, 0 deletions
diff --git a/src/libserver/html.h b/src/libserver/html.h
new file mode 100644
index 000000000..3ea758e60
--- /dev/null
+++ b/src/libserver/html.h
@@ -0,0 +1,226 @@
+/*
+ * Functions for simple html parsing
+ */
+
+#ifndef RSPAMD_HTML_H
+#define RSPAMD_HTML_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+/* Known HTML tags */
+typedef enum
+{
+ Tag_UNKNOWN, /**< Unknown tag! */
+ Tag_A, /**< A */
+ Tag_ABBR, /**< ABBR */
+ Tag_ACRONYM, /**< ACRONYM */
+ Tag_ADDRESS, /**< ADDRESS */
+ Tag_ALIGN, /**< ALIGN */
+ Tag_APPLET, /**< APPLET */
+ Tag_AREA, /**< AREA */
+ Tag_B, /**< B */
+ Tag_BASE, /**< BASE */
+ Tag_BASEFONT, /**< BASEFONT */
+ Tag_BDO, /**< BDO */
+ Tag_BGSOUND, /**< BGSOUND */
+ Tag_BIG, /**< BIG */
+ Tag_BLINK, /**< BLINK */
+ Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
+ Tag_BODY, /**< BODY */
+ Tag_BR, /**< BR */
+ Tag_BUTTON, /**< BUTTON */
+ Tag_CAPTION, /**< CAPTION */
+ Tag_CENTER, /**< CENTER */
+ Tag_CITE, /**< CITE */
+ Tag_CODE, /**< CODE */
+ Tag_COL, /**< COL */
+ Tag_COLGROUP, /**< COLGROUP */
+ Tag_COMMENT, /**< COMMENT */
+ Tag_DD, /**< DD */
+ Tag_DEL, /**< DEL */
+ Tag_DFN, /**< DFN */
+ Tag_DIR, /**< DIR */
+ Tag_DIV, /**< DIF */
+ Tag_DL, /**< DL */
+ Tag_DT, /**< DT */
+ Tag_EM, /**< EM */
+ Tag_EMBED, /**< EMBED */
+ Tag_FIELDSET, /**< FIELDSET */
+ Tag_FONT, /**< FONT */
+ Tag_FORM, /**< FORM */
+ Tag_FRAME, /**< FRAME */
+ Tag_FRAMESET, /**< FRAMESET */
+ Tag_H1, /**< H1 */
+ Tag_H2, /**< H2 */
+ Tag_H3, /**< H3 */
+ Tag_H4, /**< H4 */
+ Tag_H5, /**< H5 */
+ Tag_H6, /**< H6 */
+ Tag_HEAD, /**< HEAD */
+ Tag_HR, /**< HR */
+ Tag_HTML, /**< HTML */
+ Tag_I, /**< I */
+ Tag_IFRAME, /**< IFRAME */
+ Tag_ILAYER, /**< ILAYER */
+ Tag_IMG, /**< IMG */
+ Tag_INPUT, /**< INPUT */
+ Tag_INS, /**< INS */
+ Tag_ISINDEX, /**< ISINDEX */
+ Tag_KBD, /**< KBD */
+ Tag_KEYGEN, /**< KEYGEN */
+ Tag_LABEL, /**< LABEL */
+ Tag_LAYER, /**< LAYER */
+ Tag_LEGEND, /**< LEGEND */
+ Tag_LI, /**< LI */
+ Tag_LINK, /**< LINK */
+ Tag_LISTING, /**< LISTING */
+ Tag_MAP, /**< MAP */
+ Tag_MARQUEE, /**< MARQUEE */
+ Tag_MENU, /**< MENU */
+ Tag_META, /**< META */
+ Tag_MULTICOL, /**< MULTICOL */
+ Tag_NOBR, /**< NOBR */
+ Tag_NOEMBED, /**< NOEMBED */
+ Tag_NOFRAMES, /**< NOFRAMES */
+ Tag_NOLAYER, /**< NOLAYER */
+ Tag_NOSAVE, /**< NOSAVE */
+ Tag_NOSCRIPT, /**< NOSCRIPT */
+ Tag_OBJECT, /**< OBJECT */
+ Tag_OL, /**< OL */
+ Tag_OPTGROUP, /**< OPTGROUP */
+ Tag_OPTION, /**< OPTION */
+ Tag_P, /**< P */
+ Tag_PARAM, /**< PARAM */
+ Tag_PLAINTEXT,/**< PLAINTEXT */
+ Tag_PRE, /**< PRE */
+ Tag_Q, /**< Q */
+ Tag_RB, /**< RB */
+ Tag_RBC, /**< RBC */
+ Tag_RP, /**< RP */
+ Tag_RT, /**< RT */
+ Tag_RTC, /**< RTC */
+ Tag_RUBY, /**< RUBY */
+ Tag_S, /**< S */
+ Tag_SAMP, /**< SAMP */
+ Tag_SCRIPT, /**< SCRIPT */
+ Tag_SELECT, /**< SELECT */
+ Tag_SERVER, /**< SERVER */
+ Tag_SERVLET, /**< SERVLET */
+ Tag_SMALL, /**< SMALL */
+ Tag_SPACER, /**< SPACER */
+ Tag_SPAN, /**< SPAN */
+ Tag_STRIKE, /**< STRIKE */
+ Tag_STRONG, /**< STRONG */
+ Tag_STYLE, /**< STYLE */
+ Tag_SUB, /**< SUB */
+ Tag_SUP, /**< SUP */
+ Tag_TABLE, /**< TABLE */
+ Tag_TBODY, /**< TBODY */
+ Tag_TD, /**< TD */
+ Tag_TEXTAREA, /**< TEXTAREA */
+ Tag_TFOOT, /**< TFOOT */
+ Tag_TH, /**< TH */
+ Tag_THEAD, /**< THEAD */
+ Tag_TITLE, /**< TITLE */
+ Tag_TR, /**< TR */
+ Tag_TT, /**< TT */
+ Tag_U, /**< U */
+ Tag_UL, /**< UL */
+ Tag_VAR, /**< VAR */
+ Tag_WBR, /**< WBR */
+ Tag_XMP, /**< XMP */
+ Tag_XML, /**< XML */
+ Tag_NEXTID, /**< NEXTID */
+
+ N_TAGS /**< Must be last */
+} tag_id_t;
+
+#define CM_UNKNOWN 0
+/* Elements with no content. Map to HTML specification. */
+#define CM_EMPTY (1 << 0)
+/* Elements that appear outside of "BODY". */
+#define CM_HTML (1 << 1)
+/* Elements that can appear within HEAD. */
+#define CM_HEAD (1 << 2)
+/* HTML "block" elements. */
+#define CM_BLOCK (1 << 3)
+/* HTML "inline" elements. */
+#define CM_INLINE (1 << 4)
+/* Elements that mark list item ("LI"). */
+#define CM_LIST (1 << 5)
+/* Elements that mark definition list item ("DL", "DT"). */
+#define CM_DEFLIST (1 << 6)
+/* Elements that can appear inside TABLE. */
+#define CM_TABLE (1 << 7)
+/* Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_ROWGRP (1 << 8)
+/* Used for "TD", "TH" */
+#define CM_ROW (1 << 9)
+/* Elements whose content must be protected against white space movement.
+ Includes some elements that can found in forms. */
+#define CM_FIELD (1 << 10)
+/* Used to avoid propagating inline emphasis inside some elements
+ such as OBJECT or APPLET. */
+#define CM_OBJECT (1 << 11)
+/* Elements that allows "PARAM". */
+#define CM_PARAM (1 << 12)
+/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
+#define CM_FRAMES (1 << 13)
+/* Heading elements (h1, h2, ...). */
+#define CM_HEADING (1 << 14)
+/* Elements with an optional end tag. */
+#define CM_OPT (1 << 15)
+/* Elements that use "align" attribute for vertical position. */
+#define CM_IMG (1 << 16)
+/* Elements with inline and block model. Used to avoid calling InlineDup. */
+#define CM_MIXED (1 << 17)
+/* Elements whose content needs to be indented only if containing one
+ CM_BLOCK element. */
+#define CM_NO_INDENT (1 << 18)
+/* Elements that are obsolete (such as "dir", "menu"). */
+#define CM_OBSOLETE (1 << 19)
+/* User defined elements. Used to determine how attributes wihout value
+ should be printed. */
+#define CM_NEW (1 << 20)
+/* Elements that cannot be omitted. */
+#define CM_OMITST (1 << 21)
+
+/* XML tag */
+#define FL_XML (1 << 0)
+/* Closing tag */
+#define FL_CLOSING (1 << 1)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED (1 << 2)
+
+struct html_tag {
+ tag_id_t id;
+ const gchar *name;
+ gint flags;
+};
+
+struct html_node {
+ struct html_tag *tag;
+ gint flags;
+};
+
+/* Forwarded declaration */
+struct rspamd_task;
+
+/*
+ * Add a single node to the tags tree
+ */
+gboolean add_html_node (struct rspamd_task *task, rspamd_mempool_t *pool,
+ struct mime_text_part *part, gchar *tag_text, gsize tag_len, gsize remain, GNode **cur_level);
+
+/*
+ * Get tag structure by its name (binary search is used)
+ */
+struct html_tag * get_tag_by_name (const gchar *name);
+
+/*
+ * Decode HTML entitles in text. Text is modified in place.
+ */
+void decode_entitles (gchar *s, guint *len);
+
+#endif