aboutsummaryrefslogtreecommitdiffstats
path: root/src/libserver/html.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 09:32:28 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 09:32:28 +0100
commit: 757d70003805aebf8bff4acdd9c1a8a273065649 (patch)
tree: 2cd9d832762a2998255bc3ae45055c628ce7e706 /src/libserver/html.c
parent: a76fd88266a59d7b59438a3f1985ccf2c33e7e0a (diff)
download: rspamd-757d70003805aebf8bff4acdd9c1a8a273065649.tar.gz
download: rspamd-757d70003805aebf8bff4acdd9c1a8a273065649.zip
Further rework of html parser.
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- src/libserver/html.c 109
1 files changed, 85 insertions, 24 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index fb450c823..6af61a199 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -705,19 +705,19 @@ get_tag_by_name (const gchar *name)
}
/* Decode HTML entitles in text */
-void
-rspamd_html_decode_entitles_inplace (gchar *s, guint * len)
+guint
+rspamd_html_decode_entitles_inplace (gchar *s, guint len)
{
guint l, rep_len;
gchar *t = s, *h = s, *e = s, *end_ptr;
gint state = 0, val, base;
entity *found, key;
- if (len == NULL || *len == 0) {
+ if (len == 0) {
l = strlen (s);
}
else {
- l = *len;
+ l = len;
}
while (h - s < (gint)l) {
@@ -796,11 +796,8 @@ rspamd_html_decode_entitles_inplace (gchar *s, guint * len)
break;
}
}
- *t = '\0';
- if (len != NULL) {
- *len = t - s;
- }
+ return (t - s);
}
static void
@@ -965,7 +962,7 @@ parse_tag_url (struct rspamd_task *task,
url_text = rspamd_mempool_alloc (task->task_pool, len + 1);
rspamd_strlcpy (url_text, c, len + 1);
- rspamd_html_decode_entitles_inplace (url_text, &len);
+ len = rspamd_html_decode_entitles_inplace (url_text, len);
if (g_ascii_strncasecmp (url_text, "http",
sizeof ("http") - 1) != 0 &&
@@ -1064,10 +1061,46 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
return FALSE;
}
+static gboolean
+rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
+ const guchar *begin, const guchar *end,
+ struct html_tag *tag)
+{
+ struct html_tag_component *comp;
+ gint len;
+ gboolean ret = FALSE;
+
+ g_assert (end >= begin);
+ len = rspamd_html_decode_entitles_inplace ((gchar *)begin, end - begin);
+
+ if (len == 3) {
+ if (g_ascii_strncasecmp (begin, "src", len) == 0) {
+ comp = rspamd_mempool_alloc (pool, sizeof (*comp));
+ comp->type = RSPAMD_HTML_COMPONENT_HREF;
+ comp->start = NULL;
+ comp->len = 0;
+ tag->params = g_list_prepend (tag->params, comp);
+ ret = TRUE;
+ }
+ }
+ else if (len == 4) {
+ if (g_ascii_strncasecmp (begin, "href", len) == 0) {
+ comp = rspamd_mempool_alloc (pool, sizeof (*comp));
+ comp->type = RSPAMD_HTML_COMPONENT_HREF;
+ comp->start = NULL;
+ comp->len = 0;
+ tag->params = g_list_prepend (tag->params, comp);
+ ret = TRUE;
+ }
+ }
+
+ return ret;
+}
+
static void
rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
struct html_content *hc, struct html_tag *tag, const guchar *in,
- gint *statep, guchar **savep)
+ gint *statep, guchar const **savep)
{
enum {
parse_start = 0,
@@ -1088,8 +1121,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
ignore_bad_tag
} state;
struct html_tag_def *found;
- struct html_tag_component *comp;
gboolean store = FALSE;
+ struct html_tag_component *comp;
state = *statep;
@@ -1121,6 +1154,11 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
state = ignore_bad_tag;
}
else {
+ /* We can safely modify tag's name here, as it is already parsed */
+ tag->name.len = rspamd_html_decode_entitles_inplace (
+ (gchar *)tag->name.start,
+ tag->name.len);
+
found = bsearch (tag, tag_defs, G_N_ELEMENTS (tag_defs),
sizeof (tag_defs[0]), tag_find);
if (found == NULL) {
@@ -1155,14 +1193,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
return;
}
- if (g_ascii_strncasecmp (*savep, "href", in - *savep) == 0 ||
- g_ascii_strncasecmp (*savep, "src", in - *savep) == 0) {
- comp->type = RSPAMD_HTML_COMPONENT_HREF;
- comp->start = NULL;
- comp->len = 0;
- tag->params = g_list_prepend (tag->params, comp);
- }
- else {
+ if (!rspamd_html_parse_tag_component (pool, *savep, in, tag)) {
/* Ignore unknown params */
*savep = NULL;
}
@@ -1320,6 +1351,13 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
}
break;
+ case spaces_after_param:
+ if (!g_ascii_isspace (*in)) {
+ state = parse_attr_name;
+ *savep = in;
+ }
+ break;
+
case ignore_bad_tag:
break;
}
@@ -1331,13 +1369,13 @@ gboolean
rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
GByteArray *in)
{
- const guchar *p, *c, *end, *tag_start = NULL;
- guchar *savep = NULL, t;
- gboolean closing = FALSE;
+ const guchar *p, *c, *end, *tag_start = NULL, *savep = NULL;
+ guchar t;
+ gboolean closing = FALSE, need_decode = FALSE;
GByteArray *dest;
guint obrace = 0, ebrace = 0;
GNode *cur_level = NULL;
- gint substate;
+ gint substate, len;
struct html_tag *cur_tag;
enum {
parse_start = 0,
@@ -1521,16 +1559,38 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
case content_write:
if (t != '<') {
p ++;
+
+ if (t == '&') {
+ need_decode = TRUE;
+ }
}
else {
if (c != p) {
- g_byte_array_append (dest, c, p - c);
+
+ if (need_decode) {
+ len = rspamd_html_decode_entitles_inplace ((gchar *)c,
+ p - c);
+ }
+ else {
+ len = p - c;
+ }
+
+ g_byte_array_append (dest, c, len);
}
state = tag_begin;
}
break;
+ case sgml_content:
+ /* TODO: parse DOCTYPE here */
+ if (t == '>') {
+ state = tag_end;
+ continue;
+ }
+ p ++;
+ break;
+
case tag_content:
rspamd_html_parse_tag_content (pool, hc, cur_tag,
p, &substate, &savep);
@@ -1560,6 +1620,7 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
if (cur_tag != NULL) {
if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level)) {
state = content_write;
+ need_decode = FALSE;
}
else {
state = content_ignore;