diff options
Diffstat (limited to 'src/libserver/html/html.cxx')
-rw-r--r-- | src/libserver/html/html.cxx | 403 |
1 files changed, 71 insertions, 332 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index c167b004f..c384a9023 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -30,6 +30,7 @@ #include "html_tag_defs.hxx" #include "html_entities.hxx" #include "html_tag.hxx" +#include "html_url.hxx" #include <vector> #include <frozen/unordered_map.h> @@ -633,273 +634,76 @@ parse_tag_content(rspamd_mempool_t *pool, parser_env.cur_state = state; } -} - -/* Unconverted C part */ - -static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool, - const gchar *start, guint len, - struct html_tag_component *comp); - - - - -struct rspamd_url * -rspamd_html_process_url(rspamd_mempool_t *pool, const gchar *start, guint len, - struct html_tag_component *comp) { - struct rspamd_url *url; - guint saved_flags = 0; - gchar *decoded; - gint rc; - gsize decoded_len; - const gchar *p, *s, *prefix = "http://"; - gchar *d; - guint i; - gsize dlen; - gboolean has_bad_chars = FALSE, no_prefix = FALSE; - static const gchar hexdigests[] = "0123456789abcdef"; - - p = start; - - /* Strip spaces from the url */ - /* Head spaces */ - while (p < start + len && g_ascii_isspace (*p)) { - p++; - start++; - len--; - } - - if (comp) { - comp->start = (guchar *)p; - comp->len = len; - } - - /* Trailing spaces */ - p = start + len - 1; - - while (p >= start && g_ascii_isspace (*p)) { - p--; - len--; - - if (comp) { - comp->len--; - } - } +static auto +html_process_url_tag(rspamd_mempool_t *pool, + struct html_tag *tag, + struct html_content *hc) -> std::optional<struct rspamd_url *> +{ + auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF); - s = start; - dlen = 0; + if (found_href_it != tag->parameters.end()) { + /* Check base url */ + auto &href_value = found_href_it->second; - for (i = 0; i < len; i++) { - if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { - dlen += 3; - } - else { - dlen++; - } - } + if (hc && hc->base_url && 
href_value.size() > 2) { + /* + * Relative url cannot start from the following: + * schema:// + * data: + * slash + */ - if (rspamd_substring_search(start, len, "://", 3) == -1) { - if (len >= sizeof("mailto:") && - (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 || - memcmp(start, "tel:", sizeof("tel:") - 1) == 0 || - memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) { - /* Exclusion, has valid but 'strange' prefix */ - } - else { - for (i = 0; i < len; i++) { - if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) { - if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') { - prefix = "http:"; - dlen += sizeof("http:") - 1; - no_prefix = TRUE; - } - else if (s[i] == '@') { - /* Likely email prefix */ - prefix = "mailto://"; - dlen += sizeof("mailto://") - 1; - no_prefix = TRUE; - } - else if (s[i] == ':' && i != 0) { - /* Special case */ - no_prefix = FALSE; - } - else { - if (i == 0) { - /* No valid data */ - return NULL; - } - else { - no_prefix = TRUE; - dlen += strlen(prefix); - } - } + if (rspamd_substring_search(href_value.data(), href_value.size(), "://", 3) == -1) { - break; + if (href_value.size() >= sizeof("data:") && + g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) { + /* Image data url, never insert as url */ + return std::nullopt; } - } - } - } - - decoded = (char *)rspamd_mempool_alloc (pool, dlen + 1); - d = decoded; - - if (no_prefix) { - gsize plen = strlen(prefix); - memcpy(d, prefix, plen); - d += plen; - } - - /* - * We also need to remove all internal newlines, spaces - * and encode unsafe characters - */ - for (i = 0; i < len; i++) { - if (G_UNLIKELY (g_ascii_isspace(s[i]))) { - continue; - } - else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { - /* URL encode */ - *d++ = '%'; - *d++ = hexdigests[(s[i] >> 4) & 0xf]; - *d++ = hexdigests[s[i] & 0xf]; - has_bad_chars = TRUE; - } - else { - *d++ = s[i]; - } - } - - *d = '\0'; - dlen = d - decoded; - url = rspamd_mempool_alloc0_type(pool, 
struct rspamd_url); + /* Assume relative url */ + auto need_slash = false; - rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags); + auto orig_len = href_value.size(); + auto len = orig_len + hc->base_url->urllen; - rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); - - /* Filter some completely damaged urls */ - if (rc == URI_ERRNO_OK && url->hostlen > 0 && - !((url->protocol & PROTOCOL_UNKNOWN))) { - url->flags |= saved_flags; - - if (has_bad_chars) { - url->flags |= RSPAMD_URL_FLAG_OBSCURED; - } - - if (no_prefix) { - url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; + if (hc->base_url->datalen == 0) { + need_slash = true; + len++; + } - if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) { - /* Ignore urls with both no schema and no tld */ - return NULL; + auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1); + auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1, + "%*s%s%*s", + hc->base_url->urllen, hc->base_url->string, + need_slash ? 
"/" : "", + (gint) orig_len, href_value.data()); + href_value = {buf, nlen}; + } + else if (href_value[0] == '/' && href_value[1] != '/') { + /* Relative to the hostname */ + auto orig_len = href_value.size(); + auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen + + 3 /* for :// */; + auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1); + auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s", + hc->base_url->protocollen, hc->base_url->string, + hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url), + (gint)orig_len, href_value.data()); + href_value = {buf, nlen}; } } - decoded = url->string; - decoded_len = url->urllen; + auto url = html_process_url(pool, href_value); - if (comp) { - comp->start = (guchar *)decoded; - comp->len = decoded_len; - } - /* Spaces in href usually mean an attempt to obfuscate URL */ - /* See https://github.com/vstakhov/rspamd/issues/593 */ -#if 0 - if (has_spaces) { - url->flags |= RSPAMD_URL_FLAG_OBSCURED; + if (url && tag->extra == nullptr) { + tag->extra = url.value(); } -#endif return url; } - return NULL; -} - -static struct rspamd_url * -rspamd_html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag, - struct html_content *hc) { - struct html_tag_component *comp; - GList *cur; - struct rspamd_url *url; - const gchar *start; - gsize len; - - cur = tag->params->head; - - while (cur) { - comp = (struct html_tag_component *)cur->data; - - if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { - start = (char *)comp->start; - len = comp->len; - - /* Check base url */ - if (hc && hc->base_url && comp->len > 2) { - /* - * Relative url cannot start from the following: - * schema:// - * data: - * slash - */ - gchar *buf; - gsize orig_len; - - if (rspamd_substring_search(start, len, "://", 3) == -1) { - - if (len >= sizeof("data:") && - g_ascii_strncasecmp(start, "data:", sizeof("data:") - 1) == 0) { - /* Image data url, never insert as url */ - return NULL; - } - - /* Assume 
relative url */ - - gboolean need_slash = FALSE; - - orig_len = len; - len += hc->base_url->urllen; - - if (hc->base_url->datalen == 0) { - need_slash = TRUE; - len++; - } - - buf = (char *)rspamd_mempool_alloc (pool, len + 1); - rspamd_snprintf(buf, len + 1, "%*s%s%*s", - hc->base_url->urllen, hc->base_url->string, - need_slash ? "/" : "", - (gint) orig_len, start); - start = buf; - } - else if (start[0] == '/' && start[1] != '/') { - /* Relative to the hostname */ - orig_len = len; - len += hc->base_url->hostlen + hc->base_url->protocollen + - 3 /* for :// */; - buf = (char *)rspamd_mempool_alloc (pool, len + 1); - rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s", - hc->base_url->protocollen, hc->base_url->string, - hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url), - (gint) orig_len, start); - start = buf; - } - } - - url = rspamd_html_process_url(pool, start, len, comp); - - if (url && tag->extra == NULL) { - tag->extra = url; - } - - return url; - } - - cur = g_list_next (cur); - } - - return NULL; + return std::nullopt; } struct rspamd_html_url_query_cbd { @@ -910,8 +714,9 @@ struct rspamd_html_url_query_cbd { }; static gboolean -rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset, - gsize end_offset, gpointer ud) { +html_url_query_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ struct rspamd_html_url_query_cbd *cbd = (struct rspamd_html_url_query_cbd *) ud; rspamd_mempool_t *pool; @@ -939,9 +744,10 @@ rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset, } static void -rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url, - khash_t (rspamd_url_hash) *url_set, - GPtrArray *part_urls) { +process_html_query_url(rspamd_mempool_t *pool, struct rspamd_url *url, + khash_t (rspamd_url_hash) *url_set, + GPtrArray *part_urls) +{ if (url->querylen > 0) { struct rspamd_html_url_query_cbd qcbd; @@ -953,7 +759,7 @@ rspamd_process_html_url(rspamd_mempool_t *pool, 
struct rspamd_url *url, rspamd_url_find_multiple(pool, rspamd_url_query_unsafe (url), url->querylen, RSPAMD_URL_FIND_ALL, NULL, - rspamd_html_url_query_callback, &qcbd); + html_url_query_callback, &qcbd); } if (part_urls) { @@ -1013,10 +819,12 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool, } static void -rspamd_html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag, - struct html_content *hc, khash_t (rspamd_url_hash) *url_set, - GPtrArray *part_urls, - GByteArray *dest) { +html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag, + struct html_content *hc, + khash_t (rspamd_url_hash) *url_set, + GPtrArray *part_urls, + GByteArray *dest) +{ struct html_tag_component *comp; struct html_image *img; rspamd_ftok_t fstr; @@ -1205,6 +1013,10 @@ rspamd_html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, } } +} + +/* Unconverted C part */ + static void rspamd_html_process_color(const gchar *line, guint len, struct html_color *cl) { @@ -1764,80 +1576,7 @@ rspamd_html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, tag->extra = bl; } -static void -rspamd_html_check_displayed_url(rspamd_mempool_t *pool, - GList **exceptions, - khash_t (rspamd_url_hash) *url_set, - GByteArray *dest, - gint href_offset, - struct rspamd_url *url) { - struct rspamd_url *displayed_url = NULL; - struct rspamd_url *turl; - gboolean url_found = FALSE; - struct rspamd_process_exception *ex; - guint saved_flags = 0; - gsize dlen; - - if (href_offset < 0) { - /* No dispalyed url, just some text within <a> tag */ - return; - } - - url->visible_part = (gchar *)rspamd_mempool_alloc (pool, dest->len - href_offset + 1); - rspamd_strlcpy(url->visible_part, - reinterpret_cast<const gchar *>(dest->data + href_offset), - dest->len - href_offset + 1); - dlen = dest->len - href_offset; - - /* Strip unicode spaces from the start and the end */ - url->visible_part = rspamd_string_unicode_trim_inplace(url->visible_part, - &dlen); - 
rspamd_html_url_is_phished(pool, url, - reinterpret_cast<const guchar *>(url->visible_part), - dlen, - &url_found, &displayed_url); - - if (url_found) { - url->flags |= saved_flags | RSPAMD_URL_FLAG_DISPLAY_URL; - } - - if (exceptions && url_found) { - ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception); - ex->pos = href_offset; - ex->len = dest->len - href_offset; - ex->type = RSPAMD_EXCEPTION_URL; - ex->ptr = url; - - *exceptions = g_list_prepend(*exceptions, - ex); - } - - if (displayed_url && url_set) { - turl = rspamd_url_set_add_or_return(url_set, - displayed_url); - if (turl != NULL) { - /* Here, we assume the following: - * if we have a URL in the text part which - * is the same as displayed URL in the - * HTML part, we assume that it is also - * hint only. - */ - if (turl->flags & - RSPAMD_URL_FLAG_FROM_TEXT) { - turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; - turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; - } - - turl->count++; - } - else { - /* Already inserted by `rspamd_url_set_add_or_return` */ - } - } - - rspamd_normalise_unicode_inplace(url->visible_part, &dlen); -} static gboolean rspamd_html_propagate_lengths(GNode *node, gpointer _unused) { |