1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
|
/*
* Functions for simple html parsing
*/
#ifndef RSPAMD_HTML_H
#define RSPAMD_HTML_H
#include "config.h"
#include "mem_pool.h"
/**
 * Identifiers of the HTML tags known to the parser.
 *
 * Values are assigned sequentially starting from zero, so the order of the
 * entries defines their numeric values; N_TAGS closes the list and therefore
 * equals the number of identifiers declared before it.
 */
typedef enum
{
    Tag_UNKNOWN = 0,  /**< Tag that is not recognised */
    Tag_A,            /**< A tag */
    Tag_ABBR,         /**< ABBR tag */
    Tag_ACRONYM,      /**< ACRONYM tag */
    Tag_ADDRESS,      /**< ADDRESS tag */
    Tag_ALIGN,        /**< ALIGN tag */
    Tag_APPLET,       /**< APPLET tag */
    Tag_AREA,         /**< AREA tag */
    Tag_B,            /**< B tag */
    Tag_BASE,         /**< BASE tag */
    Tag_BASEFONT,     /**< BASEFONT tag */
    Tag_BDO,          /**< BDO tag */
    Tag_BGSOUND,      /**< BGSOUND tag */
    Tag_BIG,          /**< BIG tag */
    Tag_BLINK,        /**< BLINK tag */
    Tag_BLOCKQUOTE,   /**< BLOCKQUOTE tag */
    Tag_BODY,         /**< BODY tag */
    Tag_BR,           /**< BR tag */
    Tag_BUTTON,       /**< BUTTON tag */
    Tag_CAPTION,      /**< CAPTION tag */
    Tag_CENTER,       /**< CENTER tag */
    Tag_CITE,         /**< CITE tag */
    Tag_CODE,         /**< CODE tag */
    Tag_COL,          /**< COL tag */
    Tag_COLGROUP,     /**< COLGROUP tag */
    Tag_COMMENT,      /**< COMMENT tag */
    Tag_DD,           /**< DD tag */
    Tag_DEL,          /**< DEL tag */
    Tag_DFN,          /**< DFN tag */
    Tag_DIR,          /**< DIR tag */
    Tag_DIV,          /**< DIV tag */
    Tag_DL,           /**< DL tag */
    Tag_DT,           /**< DT tag */
    Tag_EM,           /**< EM tag */
    Tag_EMBED,        /**< EMBED tag */
    Tag_FIELDSET,     /**< FIELDSET tag */
    Tag_FONT,         /**< FONT tag */
    Tag_FORM,         /**< FORM tag */
    Tag_FRAME,        /**< FRAME tag */
    Tag_FRAMESET,     /**< FRAMESET tag */
    Tag_H1,           /**< H1 tag */
    Tag_H2,           /**< H2 tag */
    Tag_H3,           /**< H3 tag */
    Tag_H4,           /**< H4 tag */
    Tag_H5,           /**< H5 tag */
    Tag_H6,           /**< H6 tag */
    Tag_HEAD,         /**< HEAD tag */
    Tag_HR,           /**< HR tag */
    Tag_HTML,         /**< HTML tag */
    Tag_I,            /**< I tag */
    Tag_IFRAME,       /**< IFRAME tag */
    Tag_ILAYER,       /**< ILAYER tag */
    Tag_IMG,          /**< IMG tag */
    Tag_INPUT,        /**< INPUT tag */
    Tag_INS,          /**< INS tag */
    Tag_ISINDEX,      /**< ISINDEX tag */
    Tag_KBD,          /**< KBD tag */
    Tag_KEYGEN,       /**< KEYGEN tag */
    Tag_LABEL,        /**< LABEL tag */
    Tag_LAYER,        /**< LAYER tag */
    Tag_LEGEND,       /**< LEGEND tag */
    Tag_LI,           /**< LI tag */
    Tag_LINK,         /**< LINK tag */
    Tag_LISTING,      /**< LISTING tag */
    Tag_MAP,          /**< MAP tag */
    Tag_MARQUEE,      /**< MARQUEE tag */
    Tag_MENU,         /**< MENU tag */
    Tag_META,         /**< META tag */
    Tag_MULTICOL,     /**< MULTICOL tag */
    Tag_NOBR,         /**< NOBR tag */
    Tag_NOEMBED,      /**< NOEMBED tag */
    Tag_NOFRAMES,     /**< NOFRAMES tag */
    Tag_NOLAYER,      /**< NOLAYER tag */
    Tag_NOSAVE,       /**< NOSAVE tag */
    Tag_NOSCRIPT,     /**< NOSCRIPT tag */
    Tag_OBJECT,       /**< OBJECT tag */
    Tag_OL,           /**< OL tag */
    Tag_OPTGROUP,     /**< OPTGROUP tag */
    Tag_OPTION,       /**< OPTION tag */
    Tag_P,            /**< P tag */
    Tag_PARAM,        /**< PARAM tag */
    Tag_PLAINTEXT,    /**< PLAINTEXT tag */
    Tag_PRE,          /**< PRE tag */
    Tag_Q,            /**< Q tag */
    Tag_RB,           /**< RB tag */
    Tag_RBC,          /**< RBC tag */
    Tag_RP,           /**< RP tag */
    Tag_RT,           /**< RT tag */
    Tag_RTC,          /**< RTC tag */
    Tag_RUBY,         /**< RUBY tag */
    Tag_S,            /**< S tag */
    Tag_SAMP,         /**< SAMP tag */
    Tag_SCRIPT,       /**< SCRIPT tag */
    Tag_SELECT,       /**< SELECT tag */
    Tag_SERVER,       /**< SERVER tag */
    Tag_SERVLET,      /**< SERVLET tag */
    Tag_SMALL,        /**< SMALL tag */
    Tag_SPACER,       /**< SPACER tag */
    Tag_SPAN,         /**< SPAN tag */
    Tag_STRIKE,       /**< STRIKE tag */
    Tag_STRONG,       /**< STRONG tag */
    Tag_STYLE,        /**< STYLE tag */
    Tag_SUB,          /**< SUB tag */
    Tag_SUP,          /**< SUP tag */
    Tag_TABLE,        /**< TABLE tag */
    Tag_TBODY,        /**< TBODY tag */
    Tag_TD,           /**< TD tag */
    Tag_TEXTAREA,     /**< TEXTAREA tag */
    Tag_TFOOT,        /**< TFOOT tag */
    Tag_TH,           /**< TH tag */
    Tag_THEAD,        /**< THEAD tag */
    Tag_TITLE,        /**< TITLE tag */
    Tag_TR,           /**< TR tag */
    Tag_TT,           /**< TT tag */
    Tag_U,            /**< U tag */
    Tag_UL,           /**< UL tag */
    Tag_VAR,          /**< VAR tag */
    Tag_WBR,          /**< WBR tag */
    Tag_XMP,          /**< XMP tag */
    Tag_XML,          /**< XML tag */
    Tag_NEXTID,       /**< NEXTID tag */
    N_TAGS            /**< Count of the entries above; must stay last */
} tag_id_t;
/*
 * Content model flags. Each flag occupies its own bit (bits 0..21), so
 * several of them can be OR-ed together in a single integer.
 */

/** No content model information. */
#define CM_UNKNOWN    0
/** Element has no content. Maps to the HTML specification. */
#define CM_EMPTY      0x00000001
/** Element appears outside of "BODY". */
#define CM_HTML       0x00000002
/** Element may appear within HEAD. */
#define CM_HEAD       0x00000004
/** HTML "block" element. */
#define CM_BLOCK      0x00000008
/** HTML "inline" element. */
#define CM_INLINE     0x00000010
/** Element marks a list item ("LI"). */
#define CM_LIST       0x00000020
/** Element marks a definition list item ("DL", "DT"). */
#define CM_DEFLIST    0x00000040
/** Element may appear inside TABLE. */
#define CM_TABLE      0x00000080
/** Row group: "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP     0x00000100
/** Row content: "TD", "TH". */
#define CM_ROW        0x00000200
/** Element whose content must be protected against white space movement.
    Includes some elements that can be found in forms. */
#define CM_FIELD      0x00000400
/** Prevents propagating inline emphasis inside some elements
    such as OBJECT or APPLET. */
#define CM_OBJECT     0x00000800
/** Element allows "PARAM". */
#define CM_PARAM      0x00001000
/** "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES     0x00002000
/** Heading element (h1, h2, ...). */
#define CM_HEADING    0x00004000
/** Element with an optional end tag. */
#define CM_OPT        0x00008000
/** Element uses the "align" attribute for vertical position. */
#define CM_IMG        0x00010000
/** Element with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED      0x00020000
/** Element whose content needs to be indented only if it contains one
    CM_BLOCK element. */
#define CM_NO_INDENT  0x00040000
/** Obsolete element (such as "dir", "menu"). */
#define CM_OBSOLETE   0x00080000
/** User defined element. Used to determine how attributes without a value
    should be printed. */
#define CM_NEW        0x00100000
/** Element that cannot be omitted. */
#define CM_OMITST     0x00200000
/** Tag is an XML tag. */
#define FL_XML      0x01
/** Tag is a closing tag. */
#define FL_CLOSING  0x02
/** Tag is fully closed (e.g. <a attrs />). */
#define FL_CLOSED   0x04
/*
 * Description of a single HTML tag: its numeric identifier, its name and
 * a set of integer flags.
 */
struct html_tag {
tag_id_t id; /* One of the Tag_* identifiers */
const gchar *name; /* Tag name; presumably the key looked up by get_tag_by_name() - TODO confirm */
gint flags; /* Bit flags; presumably a combination of CM_* values - TODO confirm */
};
/*
 * A node that refers to a tag description and carries its own flags.
 * NOTE(review): presumably this is the payload stored in the tags tree
 * built by add_html_node() - confirm against the implementation.
 */
struct html_node {
struct html_tag *tag; /* Tag description this node refers to */
gint flags; /* Bit flags; presumably a combination of FL_* values - TODO confirm */
};
/*
 * Forward declarations. Only pointers to these structures are used in this
 * header, so incomplete types are sufficient. Declaring both tags at file
 * scope prevents `struct mime_text_part` in the prototype below from being
 * given prototype-only scope when the including file has not declared it yet
 * (which would make the parameter type incompatible with the real structure).
 */
struct worker_task;
struct mime_text_part;
/*
 * Add a single node to the tags tree
 *
 * NOTE(review): the parameter descriptions below are inferred from the
 * names and types only - confirm against the implementation.
 * @param task task being processed
 * @param pool memory pool passed to the parser
 * @param part text part the tag belongs to
 * @param tag_text text of the tag
 * @param tag_len length of tag_text
 * @param remain presumably the amount of input remaining after the tag
 * @param cur_level presumably the current level of the tags tree, passed by
 *        address so that it can be updated
 * @return gboolean status; presumably TRUE on success
 */
gboolean add_html_node (struct worker_task *task, rspamd_mempool_t *pool,
		struct mime_text_part *part, gchar *tag_text, gsize tag_len, gsize remain, GNode **cur_level);
/*
 * Get tag structure by its name (binary search is used)
 * @param name name of the tag to look up
 * @return pointer to the matching tag structure
 * NOTE(review): the result for an unknown name is not visible from this
 * header (presumably NULL) - confirm against the implementation.
 */
struct html_tag * get_tag_by_name (const gchar *name);
/*
 * Decode HTML entities in text. Text is modified in place.
 * @param s text to decode; overwritten with the decoded text
 * @param len pointer to the text length; presumably updated to the length
 *        of the decoded text - TODO confirm against the implementation
 */
void decode_entitles (gchar *s, guint *len);
#endif
|