From 5b5cae6782b1b2844e65aad311b90f0272eeeae1 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 15 Jul 2015 11:49:20 +0100 Subject: [PATCH] Refactor html.h. --- src/libmime/message.c | 4 +- src/libmime/message.h | 3 +- src/libserver/html.c | 194 ++++++++++++++++++++++++++++++++++++++++-- src/libserver/html.h | 191 +---------------------------------------- 4 files changed, 195 insertions(+), 197 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index fde23ccb2..70885a36d 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -236,7 +236,7 @@ unbreak_tag: *rp = ';'; if (rp - estart > 0) { dlen = rp - estart + 1; - decode_entitles (estart, &dlen); + rspamd_html_decode_entitles_inplace (estart, &dlen); rp = estart + dlen; } } @@ -1398,7 +1398,7 @@ process_text_part (struct rspamd_task *task, NULL); if (text_part->html_nodes != NULL) { - decode_entitles (text_part->content->data, + rspamd_html_decode_entitles_inplace (text_part->content->data, &text_part->content->len); } rspamd_url_text_extract (task->task_pool, task, text_part, TRUE); diff --git a/src/libmime/message.h b/src/libmime/message.h index 8ff2a262a..04e7cd5f3 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -11,6 +11,7 @@ struct rspamd_task; struct controller_session; +struct html_content; struct mime_part { GMimeContentType *type; @@ -40,7 +41,7 @@ struct mime_text_part { const gchar *real_charset; GByteArray *orig; GByteArray *content; - GNode *html_nodes; + struct html_content *html; GList *urls_offset; /**< list of offsets of urls */ rspamd_fuzzy_t *fuzzy; rspamd_fuzzy_t *double_fuzzy; diff --git a/src/libserver/html.c b/src/libserver/html.c index 0f9702646..ce89ec741 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -31,6 +31,193 @@ static sig_atomic_t tags_sorted = 0; +/* Known HTML tags */ +typedef enum +{ + Tag_UNKNOWN, /**< Unknown tag! */ + Tag_A, /**< A */ + Tag_ABBR, /**< ABBR */ + Tag_ACRONYM, /**< ACRONYM */ + Tag_ADDRESS, /**< ADDRESS */ + Tag_ALIGN, /**< ALIGN */ + Tag_APPLET, /**< APPLET */ + Tag_AREA, /**< AREA */ + Tag_B, /**< B */ + Tag_BASE, /**< BASE */ + Tag_BASEFONT, /**< BASEFONT */ + Tag_BDO, /**< BDO */ + Tag_BGSOUND, /**< BGSOUND */ + Tag_BIG, /**< BIG */ + Tag_BLINK, /**< BLINK */ + Tag_BLOCKQUOTE, /**< BLOCKQUOTE */ + Tag_BODY, /**< BODY */ + Tag_BR, /**< BR */ + Tag_BUTTON, /**< BUTTON */ + Tag_CAPTION, /**< CAPTION */ + Tag_CENTER, /**< CENTER */ + Tag_CITE, /**< CITE */ + Tag_CODE, /**< CODE */ + Tag_COL, /**< COL */ + Tag_COLGROUP, /**< COLGROUP */ + Tag_COMMENT, /**< COMMENT */ + Tag_DD, /**< DD */ + Tag_DEL, /**< DEL */ + Tag_DFN, /**< DFN */ + Tag_DIR, /**< DIR */ + Tag_DIV, /**< DIF */ + Tag_DL, /**< DL */ + Tag_DT, /**< DT */ + Tag_EM, /**< EM */ + Tag_EMBED, /**< EMBED */ + Tag_FIELDSET, /**< FIELDSET */ + Tag_FONT, /**< FONT */ + Tag_FORM, /**< FORM */ + Tag_FRAME, /**< FRAME */ + Tag_FRAMESET, /**< FRAMESET */ + Tag_H1, /**< H1 */ + Tag_H2, /**< H2 */ + Tag_H3, /**< H3 */ + Tag_H4, /**< H4 */ + Tag_H5, /**< H5 */ + Tag_H6, /**< H6 */ + Tag_HEAD, /**< HEAD */ + Tag_HR, /**< HR */ + Tag_HTML, /**< HTML */ + Tag_I, /**< I */ + Tag_IFRAME, /**< IFRAME */ + Tag_ILAYER, /**< ILAYER */ + Tag_IMG, /**< IMG */ + Tag_INPUT, /**< INPUT */ + Tag_INS, /**< INS */ + Tag_ISINDEX, /**< ISINDEX */ + Tag_KBD, /**< KBD */ + Tag_KEYGEN, /**< KEYGEN */ + Tag_LABEL, /**< LABEL */ + Tag_LAYER, /**< LAYER */ + Tag_LEGEND, /**< LEGEND */ + Tag_LI, /**< LI */ + Tag_LINK, /**< LINK */ + Tag_LISTING, /**< LISTING */ + Tag_MAP, /**< MAP */ + Tag_MARQUEE, /**< MARQUEE */ + Tag_MENU, /**< MENU */ + Tag_META, /**< META */ + Tag_MULTICOL, /**< MULTICOL */ + Tag_NOBR, /**< NOBR */ + Tag_NOEMBED, /**< NOEMBED */ + Tag_NOFRAMES, /**< NOFRAMES */ + Tag_NOLAYER, /**< NOLAYER */ + Tag_NOSAVE, /**< NOSAVE */ + Tag_NOSCRIPT, /**< NOSCRIPT */ + Tag_OBJECT, /**< OBJECT */ + Tag_OL, /**< OL */ + Tag_OPTGROUP, /**< OPTGROUP */ + Tag_OPTION, /**< OPTION */ + Tag_P, /**< P */ + Tag_PARAM, /**< PARAM */ + Tag_PLAINTEXT, /**< PLAINTEXT */ + Tag_PRE, /**< PRE */ + Tag_Q, /**< Q */ + Tag_RB, /**< RB */ + Tag_RBC, /**< RBC */ + Tag_RP, /**< RP */ + Tag_RT, /**< RT */ + Tag_RTC, /**< RTC */ + Tag_RUBY, /**< RUBY */ + Tag_S, /**< S */ + Tag_SAMP, /**< SAMP */ + Tag_SCRIPT, /**< SCRIPT */ + Tag_SELECT, /**< SELECT */ + Tag_SERVER, /**< SERVER */ + Tag_SERVLET, /**< SERVLET */ + Tag_SMALL, /**< SMALL */ + Tag_SPACER, /**< SPACER */ + Tag_SPAN, /**< SPAN */ + Tag_STRIKE, /**< STRIKE */ + Tag_STRONG, /**< STRONG */ + Tag_STYLE, /**< STYLE */ + Tag_SUB, /**< SUB */ + Tag_SUP, /**< SUP */ + Tag_TABLE, /**< TABLE */ + Tag_TBODY, /**< TBODY */ + Tag_TD, /**< TD */ + Tag_TEXTAREA, /**< TEXTAREA */ + Tag_TFOOT, /**< TFOOT */ + Tag_TH, /**< TH */ + Tag_THEAD, /**< THEAD */ + Tag_TITLE, /**< TITLE */ + Tag_TR, /**< TR */ + Tag_TT, /**< TT */ + Tag_U, /**< U */ + Tag_UL, /**< UL */ + Tag_VAR, /**< VAR */ + Tag_WBR, /**< WBR */ + Tag_XMP, /**< XMP */ + Tag_XML, /**< XML */ + Tag_NEXTID, /**< NEXTID */ + + N_TAGS /**< Must be last */ +} tag_id_t; + +#define CM_UNKNOWN 0 +/* Elements with no content. Map to HTML specification. */ +#define CM_EMPTY (1 << 0) +/* Elements that appear outside of "BODY". */ +#define CM_HTML (1 << 1) +/* Elements that can appear within HEAD. */ +#define CM_HEAD (1 << 2) +/* HTML "block" elements. */ +#define CM_BLOCK (1 << 3) +/* HTML "inline" elements. */ +#define CM_INLINE (1 << 4) +/* Elements that mark list item ("LI"). */ +#define CM_LIST (1 << 5) +/* Elements that mark definition list item ("DL", "DT"). */ +#define CM_DEFLIST (1 << 6) +/* Elements that can appear inside TABLE. */ +#define CM_TABLE (1 << 7) +/* Used for "THEAD", "TFOOT" or "TBODY". */ +#define CM_ROWGRP (1 << 8) +/* Used for "TD", "TH" */ +#define CM_ROW (1 << 9) +/* Elements whose content must be protected against white space movement. + Includes some elements that can found in forms. */ +#define CM_FIELD (1 << 10) +/* Used to avoid propagating inline emphasis inside some elements + such as OBJECT or APPLET. */ +#define CM_OBJECT (1 << 11) +/* Elements that allows "PARAM". */ +#define CM_PARAM (1 << 12) +/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ +#define CM_FRAMES (1 << 13) +/* Heading elements (h1, h2, ...). */ +#define CM_HEADING (1 << 14) +/* Elements with an optional end tag. */ +#define CM_OPT (1 << 15) +/* Elements that use "align" attribute for vertical position. */ +#define CM_IMG (1 << 16) +/* Elements with inline and block model. Used to avoid calling InlineDup. */ +#define CM_MIXED (1 << 17) +/* Elements whose content needs to be indented only if containing one + CM_BLOCK element. */ +#define CM_NO_INDENT (1 << 18) +/* Elements that are obsolete (such as "dir", "menu"). */ +#define CM_OBSOLETE (1 << 19) +/* User defined elements. Used to determine how attributes wihout value + should be printed. */ +#define CM_NEW (1 << 20) +/* Elements that cannot be omitted. */ +#define CM_OMITST (1 << 21) + +/* XML tag */ +#define FL_XML (1 << 0) +/* Closing tag */ +#define FL_CLOSING (1 << 1) +/* Fully closed tag (e.g. ) */ +#define FL_CLOSED (1 << 2) +/* task_pool, len + 1); rspamd_strlcpy (url_text, c, len + 1); - decode_entitles (url_text, &len); + rspamd_html_decode_entitles_inplace (url_text, &len); if (g_ascii_strncasecmp (url_text, "http", sizeof ("http") - 1) != 0 && @@ -967,6 +1154,3 @@ add_html_node (struct rspamd_task *task, return TRUE; } -/* - * vi:ts=4 - */ diff --git a/src/libserver/html.h b/src/libserver/html.h index 1760ad0ca..465fe62c7 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -8,195 +8,8 @@ #include "config.h" #include "mem_pool.h" -/* Known HTML tags */ -typedef enum -{ - Tag_UNKNOWN, /**< Unknown tag! */ - Tag_A, /**< A */ - Tag_ABBR, /**< ABBR */ - Tag_ACRONYM, /**< ACRONYM */ - Tag_ADDRESS, /**< ADDRESS */ - Tag_ALIGN, /**< ALIGN */ - Tag_APPLET, /**< APPLET */ - Tag_AREA, /**< AREA */ - Tag_B, /**< B */ - Tag_BASE, /**< BASE */ - Tag_BASEFONT, /**< BASEFONT */ - Tag_BDO, /**< BDO */ - Tag_BGSOUND, /**< BGSOUND */ - Tag_BIG, /**< BIG */ - Tag_BLINK, /**< BLINK */ - Tag_BLOCKQUOTE, /**< BLOCKQUOTE */ - Tag_BODY, /**< BODY */ - Tag_BR, /**< BR */ - Tag_BUTTON, /**< BUTTON */ - Tag_CAPTION, /**< CAPTION */ - Tag_CENTER, /**< CENTER */ - Tag_CITE, /**< CITE */ - Tag_CODE, /**< CODE */ - Tag_COL, /**< COL */ - Tag_COLGROUP, /**< COLGROUP */ - Tag_COMMENT, /**< COMMENT */ - Tag_DD, /**< DD */ - Tag_DEL, /**< DEL */ - Tag_DFN, /**< DFN */ - Tag_DIR, /**< DIR */ - Tag_DIV, /**< DIF */ - Tag_DL, /**< DL */ - Tag_DT, /**< DT */ - Tag_EM, /**< EM */ - Tag_EMBED, /**< EMBED */ - Tag_FIELDSET, /**< FIELDSET */ - Tag_FONT, /**< FONT */ - Tag_FORM, /**< FORM */ - Tag_FRAME, /**< FRAME */ - Tag_FRAMESET, /**< FRAMESET */ - Tag_H1, /**< H1 */ - Tag_H2, /**< H2 */ - Tag_H3, /**< H3 */ - Tag_H4, /**< H4 */ - Tag_H5, /**< H5 */ - Tag_H6, /**< H6 */ - Tag_HEAD, /**< HEAD */ - Tag_HR, /**< HR */ - Tag_HTML, /**< HTML */ - Tag_I, /**< I */ - Tag_IFRAME, /**< IFRAME */ - Tag_ILAYER, /**< ILAYER */ - Tag_IMG, /**< IMG */ - Tag_INPUT, /**< INPUT */ - Tag_INS, /**< INS */ - Tag_ISINDEX, /**< ISINDEX */ - Tag_KBD, /**< KBD */ - Tag_KEYGEN, /**< KEYGEN */ - Tag_LABEL, /**< LABEL */ - Tag_LAYER, /**< LAYER */ - Tag_LEGEND, /**< LEGEND */ - Tag_LI, /**< LI */ - Tag_LINK, /**< LINK */ - Tag_LISTING, /**< LISTING */ - Tag_MAP, /**< MAP */ - Tag_MARQUEE, /**< MARQUEE */ - Tag_MENU, /**< MENU */ - Tag_META, /**< META */ - Tag_MULTICOL, /**< MULTICOL */ - Tag_NOBR, /**< NOBR */ - Tag_NOEMBED, /**< NOEMBED */ - Tag_NOFRAMES, /**< NOFRAMES */ - Tag_NOLAYER, /**< NOLAYER */ - Tag_NOSAVE, /**< NOSAVE */ - Tag_NOSCRIPT, /**< NOSCRIPT */ - Tag_OBJECT, /**< OBJECT */ - Tag_OL, /**< OL */ - Tag_OPTGROUP, /**< OPTGROUP */ - Tag_OPTION, /**< OPTION */ - Tag_P, /**< P */ - Tag_PARAM, /**< PARAM */ - Tag_PLAINTEXT, /**< PLAINTEXT */ - Tag_PRE, /**< PRE */ - Tag_Q, /**< Q */ - Tag_RB, /**< RB */ - Tag_RBC, /**< RBC */ - Tag_RP, /**< RP */ - Tag_RT, /**< RT */ - Tag_RTC, /**< RTC */ - Tag_RUBY, /**< RUBY */ - Tag_S, /**< S */ - Tag_SAMP, /**< SAMP */ - Tag_SCRIPT, /**< SCRIPT */ - Tag_SELECT, /**< SELECT */ - Tag_SERVER, /**< SERVER */ - Tag_SERVLET, /**< SERVLET */ - Tag_SMALL, /**< SMALL */ - Tag_SPACER, /**< SPACER */ - Tag_SPAN, /**< SPAN */ - Tag_STRIKE, /**< STRIKE */ - Tag_STRONG, /**< STRONG */ - Tag_STYLE, /**< STYLE */ - Tag_SUB, /**< SUB */ - Tag_SUP, /**< SUP */ - Tag_TABLE, /**< TABLE */ - Tag_TBODY, /**< TBODY */ - Tag_TD, /**< TD */ - Tag_TEXTAREA, /**< TEXTAREA */ - Tag_TFOOT, /**< TFOOT */ - Tag_TH, /**< TH */ - Tag_THEAD, /**< THEAD */ - Tag_TITLE, /**< TITLE */ - Tag_TR, /**< TR */ - Tag_TT, /**< TT */ - Tag_U, /**< U */ - Tag_UL, /**< UL */ - Tag_VAR, /**< VAR */ - Tag_WBR, /**< WBR */ - Tag_XMP, /**< XMP */ - Tag_XML, /**< XML */ - Tag_NEXTID, /**< NEXTID */ - - N_TAGS /**< Must be last */ -} tag_id_t; - -#define CM_UNKNOWN 0 -/* Elements with no content. Map to HTML specification. */ -#define CM_EMPTY (1 << 0) -/* Elements that appear outside of "BODY". */ -#define CM_HTML (1 << 1) -/* Elements that can appear within HEAD. */ -#define CM_HEAD (1 << 2) -/* HTML "block" elements. */ -#define CM_BLOCK (1 << 3) -/* HTML "inline" elements. */ -#define CM_INLINE (1 << 4) -/* Elements that mark list item ("LI"). */ -#define CM_LIST (1 << 5) -/* Elements that mark definition list item ("DL", "DT"). */ -#define CM_DEFLIST (1 << 6) -/* Elements that can appear inside TABLE. */ -#define CM_TABLE (1 << 7) -/* Used for "THEAD", "TFOOT" or "TBODY". */ -#define CM_ROWGRP (1 << 8) -/* Used for "TD", "TH" */ -#define CM_ROW (1 << 9) -/* Elements whose content must be protected against white space movement. - Includes some elements that can found in forms. */ -#define CM_FIELD (1 << 10) -/* Used to avoid propagating inline emphasis inside some elements - such as OBJECT or APPLET. */ -#define CM_OBJECT (1 << 11) -/* Elements that allows "PARAM". */ -#define CM_PARAM (1 << 12) -/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ -#define CM_FRAMES (1 << 13) -/* Heading elements (h1, h2, ...). */ -#define CM_HEADING (1 << 14) -/* Elements with an optional end tag. */ -#define CM_OPT (1 << 15) -/* Elements that use "align" attribute for vertical position. */ -#define CM_IMG (1 << 16) -/* Elements with inline and block model. Used to avoid calling InlineDup. */ -#define CM_MIXED (1 << 17) -/* Elements whose content needs to be indented only if containing one - CM_BLOCK element. */ -#define CM_NO_INDENT (1 << 18) -/* Elements that are obsolete (such as "dir", "menu"). */ -#define CM_OBSOLETE (1 << 19) -/* User defined elements. Used to determine how attributes wihout value - should be printed. */ -#define CM_NEW (1 << 20) -/* Elements that cannot be omitted. */ -#define CM_OMITST (1 << 21) - -/* XML tag */ -#define FL_XML (1 << 0) -/* Closing tag */ -#define FL_CLOSING (1 << 1) -/* Fully closed tag (e.g. ) */ -#define FL_CLOSED (1 << 2) -/*