From 0bcc686d0f5a1b3470104b6311dd85d39fa31c88 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 25 Jun 2018 17:59:04 +0100 Subject: [PATCH] [Feature] Support base tag in HTML --- src/libserver/html.c | 65 +++++++++++++++++++++++++++++++++++++++++--- src/libserver/html.h | 1 + 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index 4b7f28e9b..4b3eb9d7f 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1629,11 +1629,14 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, } static struct rspamd_url * -rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag) +rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag, + struct html_content *hc) { struct html_tag_component *comp; GList *cur; struct rspamd_url *url; + const gchar *start; + gsize len; cur = tag->params->head; @@ -1641,7 +1644,40 @@ rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag) comp = cur->data; if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { - url = rspamd_html_process_url (pool, comp->start, comp->len, comp); + start = comp->start; + len = comp->len; + + /* Check base url */ + if (hc && hc->base_url && comp->len > 0) { + /* + * Relative url cannot start from the following: + * schema:// + * slash + */ + + if (comp->start[0] != '/' && + rspamd_substring_search (start, len, "://", 3) == -1) { + /* Assume relative url */ + gchar *buf; + gboolean need_slash = FALSE; + + len += hc->base_url->urllen; + + if (hc->base_url->string[hc->base_url->urllen - 1] != '/') { + need_slash = TRUE; + len ++; + } + + buf = rspamd_mempool_alloc (pool, len + 1); + rspamd_snprintf (buf, len + 1, "%*s%s%*s", + hc->base_url->urllen, hc->base_url->string, + need_slash ? 
"/" : "", + (gint)len, start); + start = buf; + } + } + + url = rspamd_html_process_url (pool, start, len, comp); if (url && tag->extra == NULL) { tag->extra = url; @@ -2889,7 +2925,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) { if (!(cur_tag->flags & (FL_CLOSING))) { - url = rspamd_html_process_url_tag (pool, cur_tag); + url = rspamd_html_process_url_tag (pool, cur_tag, hc); if (url != NULL) { @@ -2958,7 +2994,28 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, } } else if (cur_tag->id == Tag_LINK) { - url = rspamd_html_process_url_tag (pool, cur_tag); + url = rspamd_html_process_url_tag (pool, cur_tag, hc); + } + else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) { + struct html_tag *prev_tag = NULL; + + if (cur_level && cur_level->parent) { + prev_tag = cur_level->parent->data; + } + + /* + * Base is allowed only within head tag but we slightly + * relax that + */ + if (!prev_tag || prev_tag->id == Tag_HEAD || + prev_tag->id == Tag_HTML) { + url = rspamd_html_process_url_tag (pool, cur_tag, hc); + + if (url != NULL) { + /* We have a base tag available */ + hc->base_url = url; + } + } } if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { diff --git a/src/libserver/html.h b/src/libserver/html.h index 0414f48d9..c7534d309 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -107,6 +107,7 @@ struct html_tag { struct rspamd_task; struct html_content { + struct rspamd_url *base_url; GNode *html_tags; gint flags; guint total_tags; -- 2.39.5