From b8b418eddc2eb371570e7c4099084255804d0484 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 22 May 2021 13:24:05 +0100 Subject: [PATCH] [Rework] Move HTML url functions and rework them --- src/libserver/CMakeLists.txt | 1 + src/libserver/html/html_url.cxx | 200 ++++++++++++++++++++++++++++++++ src/libserver/html/html_url.hxx | 44 +++++++ 3 files changed, 245 insertions(+) create mode 100644 src/libserver/html/html_url.cxx create mode 100644 src/libserver/html/html_url.hxx diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt index 3a4bae81f..9a191870e 100644 --- a/src/libserver/CMakeLists.txt +++ b/src/libserver/CMakeLists.txt @@ -35,6 +35,7 @@ SET(LIBRSPAMDSERVERSRC ${CMAKE_CURRENT_SOURCE_DIR}/maps/map.c ${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c ${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx ${LIBCSSSRC}) diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx new file mode 100644 index 000000000..93728119b --- /dev/null +++ b/src/libserver/html/html_url.cxx @@ -0,0 +1,200 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "html_url.hxx" +#include "libutil/str_util.h" +#include "libserver/url.h" +#include "libserver/logger.h" + +#include + +namespace rspamd::html { + +static auto +rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool +{ + const auto *p1 = t1.data() + t1.size() - 1; + const auto *p2 = t2.data() + t2.size() - 1; + + /* Skip trailing dots */ + while (p1 > t1.data()) { + if (*p1 != '.') { + break; + } + + p1--; + } + + while (p2 > t2.data()) { + if (*p2 != '.') { + break; + } + + p2--; + } + + while (p1 > t1.data() && p2 > t2.data()) { + if (*p1 != *p2) { + break; + } + + p1--; + p2--; + } + + if (p2 == t2.data()) { + /* p2 can be subdomain of p1 if *p1 is '.' */ + if (p1 != t1.data() && *(p1 - 1) == '.') { + return true; + } + } + else if (p1 == t1.data()) { + if (p2 != t2.data() && *(p2 - 1) == '.') { + return true; + } + } + + return false; +} + + +static auto +get_icu_idna_instance(void) -> auto +{ + auto uc_err = U_ZERO_ERROR; + static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err); + + return udn; +} + +static auto +convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld) + -> std::string_view +{ + std::string_view ret = use_tld ? + std::string_view{rspamd_url_tld_unsafe (url), url->tldlen} : + std::string_view {rspamd_url_host_unsafe (url), url->hostlen}; + + /* Handle IDN url's */ + if (ret.size() > 4 && + rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) { + const auto buf_capacity = ret.size() * 2 + 1; + auto *idn_hbuf = (char *)rspamd_mempool_alloc (pool, buf_capacity); + icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int)buf_capacity}; + /* We need to convert it to the normal value first */ + icu::IDNAInfo info; + auto uc_err = U_ZERO_ERROR; + auto *udn = get_icu_idna_instance(); + udn->nameToASCII_UTF8(ret,byte_sink, info, uc_err); + + if (uc_err == U_ZERO_ERROR && !info.hasErrors()) { + ret = std::string_view{idn_hbuf, (std::size_t)byte_sink.NumberOfBytesWritten()}; + } + else { + msg_err_pool ("cannot convert to IDN: %s (0x%xd)", + u_errorName(uc_err), info.getErrors()); + } + } + + return ret; +}; + +constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto { + return (s1.size() == s2.size()) && + std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(), + [](const auto c1, const auto c2) { + return g_ascii_tolower(c1) == g_ascii_tolower(c2); + }); +} + +auto +html_url_is_phished(rspamd_mempool_t *pool, + struct rspamd_url *href_url, + std::string_view text_data) -> std::optional +{ + struct rspamd_url *text_url; + std::string_view disp_tok, href_tok; + goffset url_pos; + gchar *url_str = NULL; + + auto sz = text_data.size(); + const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz); + text_data = std::string_view(trimmed, sz); + + if (text_data.size() > 4 && + rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str, + RSPAMD_URL_FIND_ALL, + &url_pos, NULL) && url_str != NULL) { + + text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url); + auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool, + RSPAMD_URL_PARSE_TEXT); + + if (rc == URI_ERRNO_OK) { + disp_tok = convert_idna_hostname_maybe(pool, text_url, false); + href_tok = convert_idna_hostname_maybe(pool, href_url, false); + + if (!sv_equals(disp_tok, href_tok) && + text_url->tldlen > 0 && href_url->tldlen > 0) { + + /* Apply the same logic for TLD */ + disp_tok = convert_idna_hostname_maybe(pool, text_url, true); + href_tok = convert_idna_hostname_maybe(pool, href_url, true); + + if (!sv_equals(disp_tok, href_tok)) { + /* Check if one url is a subdomain for another */ + + if (!rspamd_url_is_subdomain(disp_tok, href_tok)) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED; + href_url->linked_url = text_url; + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + } + } + } + + return text_url; + } + else { + /* + * We have found something that looks like an url but it was + * not parsed correctly. + * Sometimes it means an obfuscation attempt, so we have to check + * what's inside of the text + */ + gboolean obfuscation_found = FALSE; + + if (text_data.size() > 4 + && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 && + rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) { + /* Clearly an obfuscation attempt */ + obfuscation_found = TRUE; + } + + msg_info_pool ("extract of url '%s' failed: %s; obfuscation detected: %s", + url_str, + rspamd_url_strerror(rc), + obfuscation_found ? "yes" : "no"); + + if (obfuscation_found) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED; + } + } + } + + return std::nullopt; +} + +} \ No newline at end of file diff --git a/src/libserver/html/html_url.hxx b/src/libserver/html/html_url.hxx new file mode 100644 index 000000000..7bf81b7d7 --- /dev/null +++ b/src/libserver/html/html_url.hxx @@ -0,0 +1,44 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_URL_HXX +#define RSPAMD_HTML_URL_HXX +#pragma once + +#include "libutil/mem_pool.h" +#include +#include + +struct rspamd_url; /* Forward declaration */ + +namespace rspamd::html { + + +/** + * Checks if an html url is likely phished by some displayed url + * @param pool + * @param href_url + * @param text_data + * @return + */ +auto html_url_is_phished(rspamd_mempool_t *pool, + struct rspamd_url *href_url, + std::string_view text_data) -> std::optional; + + +} + +#endif //RSPAMD_HTML_URL_HXX \ No newline at end of file -- 2.39.5