[Rework] Move HTML url functions and rework them

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 22 May 2021 12:24:05 +0000 (13:24 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 27 May 2021 14:05:21 +0000 (15:05 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 22 May 2021 12:24:05 +0000 (13:24 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 27 May 2021 14:05:21 +0000 (15:05 +0100)
diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt

index 3a4bae81f7234ebfe40967d2f66a8c9639d00589..9a191870e4c5a9dc29719f9cb4d7f1ea04fd1160 100644 (file)
--- a/src/libserver/CMakeLists.txt
+++ b/src/libserver/CMakeLists.txt
@@ -35,6 +35,7 @@ SET(LIBRSPAMDSERVERSRC
                                 ${CMAKE_CURRENT_SOURCE_DIR}/maps/map.c
                                 ${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c
                                 ${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx
+                               ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx
                                 ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx
                                 ${LIBCSSSRC})
  
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx

new file mode 100644 (file)

index 0000000..9372811
--- /dev/null
+++ b/src/libserver/html/html_url.cxx
@@ -0,0 +1,200 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "html_url.hxx"
+#include "libutil/str_util.h"
+#include "libserver/url.h"
+#include "libserver/logger.h"
+
+#include <unicode/idna.h>
+
+namespace rspamd::html {
+
+/**
+ * Checks whether one of two hostnames is a proper subdomain of the other,
+ * e.g. "mail.example.com" and "example.com", in either argument order.
+ * Trailing dots are ignored and characters are compared case-sensitively.
+ * Names such as "a.b.com" and "x.com" are unrelated and yield false.
+ * @param t1 first hostname
+ * @param t2 second hostname
+ * @return true if the shorter name is a whole-label suffix of the longer one;
+ * false if the names are equal, unrelated, or one of them is empty
+ */
+static auto
+rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
+{
+       /* Skip trailing dots; a name made only of dots becomes empty */
+       const auto strip_dots = [](std::string_view name) -> std::string_view {
+               const auto last = name.find_last_not_of('.');
+
+               if (last == std::string_view::npos) {
+                       return std::string_view{};
+               }
+
+               return name.substr(0, last + 1);
+       };
+
+       auto longer = strip_dots(t1);
+       auto shorter = strip_dots(t2);
+
+       if (longer.size() < shorter.size()) {
+               longer.swap(shorter);
+       }
+
+       if (shorter.empty() || shorter.size() == longer.size()) {
+               /* Nothing to match, or names of the same length that cannot nest */
+               return false;
+       }
+
+       const auto boundary = longer.size() - shorter.size();
+
+       /* Every character of the shorter name has to match, the first one included */
+       if (longer.substr(boundary) != shorter) {
+               return false;
+       }
+
+       /* The match must begin right after a label separator */
+       return longer[boundary - 1] == '.';
+}
+
+
+static auto
+get_icu_idna_instance(void) -> auto
+{
+       /* ICU status of the one-off construction below; it is not examined */
+       UErrorCode status = U_ZERO_ERROR;
+       static icu::IDNA *shared_idna = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, status);
+       return shared_idna;
+}
+
+/* Returns the host (or the tld if use_tld is set) of url; a name with an "xn--" label goes through ICU IDNA */
+static auto
+convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld) -> std::string_view
+{
+       std::string_view ret = use_tld ?
+                       std::string_view{rspamd_url_tld_unsafe (url), url->tldlen} :
+                       std::string_view {rspamd_url_host_unsafe (url), url->hostlen};
+       /* Handle IDN url's */
+       if (ret.size() > 4 &&
+               rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
+               auto *udn = get_icu_idna_instance();
+               if (udn == nullptr) {
+                       return ret; /* no IDNA instance could be created: keep the raw name */
+               }
+               const auto buf_capacity = ret.size() * 2 + 1;
+               auto *idn_hbuf = (char *)rspamd_mempool_alloc (pool, buf_capacity);
+               icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int)buf_capacity};
+               icu::IDNAInfo info;
+               auto uc_err = U_ZERO_ERROR;
+               /* NOTE(review): input already has "xn--" labels; confirm nameToUnicodeUTF8 was not intended */
+               udn->nameToASCII_UTF8(ret,byte_sink, info, uc_err);
+               if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
+                       ret = std::string_view{idn_hbuf, (std::size_t)byte_sink.NumberOfBytesWritten()};
+               }
+               else {
+                       msg_err_pool ("cannot convert to IDN: %s (0x%xd)", u_errorName(uc_err), info.getErrors());
+               }
+       }
+
+       return ret;
+}
+
+constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto {
+       auto same = (s1.size() == s2.size()); /* lengths must agree first */
+       for (std::size_t i = 0; same && i < s1.size(); i++) {
+               same = g_ascii_tolower(s1[i]) == g_ascii_tolower(s2[i]); /* ASCII-only case folding */
+       }
+       return same;
+}
+
+/**
+ * Compares the url displayed in the text of a link with the url of its href.
+ * href_url is flagged as phished when hosts and tlds are unrelated, or when
+ * the text starts with "http", contains "://" and fails to parse as an url.
+ * @param pool memory pool used for allocations
+ * @param href_url url of the link; its flags and linked_url may be updated
+ * @param text_data displayed text of the link that is searched for an url
+ * @return the url parsed from text_data, or nullopt if none was found or parsed
+ */
+auto
+html_url_is_phished(rspamd_mempool_t *pool,
+                                       struct rspamd_url *href_url,
+                                       std::string_view text_data) -> std::optional<rspamd_url *>
+{
+       struct rspamd_url *text_url;
+       std::string_view disp_tok, href_tok;
+       goffset url_pos;
+       gchar *url_str = nullptr;
+
+       auto sz = text_data.size();
+       const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
+       text_data = std::string_view(trimmed, sz);
+
+       if (text_data.size() > 4 &&
+               rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
+                               RSPAMD_URL_FIND_ALL, &url_pos, nullptr) && url_str != nullptr) {
+               text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
+               auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool, RSPAMD_URL_PARSE_TEXT);
+
+               if (rc == URI_ERRNO_OK) {
+                       disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
+                       href_tok = convert_idna_hostname_maybe(pool, href_url, false);
+
+                       if (!sv_equals(disp_tok, href_tok) &&
+                               text_url->tldlen > 0 && href_url->tldlen > 0) {
+                               /* Apply the same logic for TLD */
+                               disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
+                               href_tok = convert_idna_hostname_maybe(pool, href_url, true);
+
+                               /* Different tlds are fine if one is a subdomain for another */
+                               if (!sv_equals(disp_tok, href_tok) &&
+                                       !rspamd_url_is_subdomain(disp_tok, href_tok)) {
+                                       href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
+                                       href_url->linked_url = text_url;
+                                       text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+                               }
+                       }
+
+                       return text_url;
+               }
+               else {
+                       /*
+                        * We have found something that looks like an url but it was not parsed correctly.
+                        * Sometimes it means an obfuscation attempt, so we have to check
+                        * what's inside of the text
+                        */
+                       gboolean obfuscation_found = FALSE;
+
+                       if (g_ascii_strncasecmp(text_data.data(), "http", 4) == 0 &&
+                               rspamd_substring_search(text_data.data(), text_data.size(), "://", 3) != -1) {
+                               /* Clearly an obfuscation attempt */
+                               obfuscation_found = TRUE;
+                       }
+
+                       msg_info_pool ("extract of url '%s' failed: %s; obfuscation detected: %s",
+                                       url_str, rspamd_url_strerror(rc),
+                                       obfuscation_found ? "yes" : "no");
+
+                       if (obfuscation_found) {
+                               href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
+                       }
+               }
+       }
+
+       return std::nullopt;
+}
+
+}
+\ No newline at end of file
diff --git a/src/libserver/html/html_url.hxx b/src/libserver/html/html_url.hxx

new file mode 100644 (file)

index 0000000..7bf81b7
--- /dev/null
+++ b/src/libserver/html/html_url.hxx
@@ -0,0 +1,44 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_URL_HXX
+#define RSPAMD_HTML_URL_HXX
+#pragma once
+
+#include "libutil/mem_pool.h"
+#include <string_view>
+#include <optional>
+
+struct rspamd_url; /* Forward declaration */
+
+namespace rspamd::html {
+
+
+/**
+ * Checks if an html url is likely phished by some displayed url
+ * @param pool memory pool used for allocations made during the check
+ * @param href_url url of the link target; its flags and linked_url may be updated by the check
+ * @param text_data text displayed for the link, which is searched for an url
+ * @return the url parsed from the displayed text, or std::nullopt if no url was found or it failed to parse
+ */
+auto html_url_is_phished(rspamd_mempool_t *pool,
+                                       struct rspamd_url *href_url,
+                                       std::string_view text_data) -> std::optional<rspamd_url *>;
+
+
+}
+
+#endif //RSPAMD_HTML_URL_HXX
+\ No newline at end of file
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 22 May 2021 12:24:05 +0000 (13:24 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 27 May 2021 14:05:21 +0000 (15:05 +0100)
src/libserver/CMakeLists.txt		patch \| blob \| history
src/libserver/html/html_url.cxx	[new file with mode: 0644]	patch \| blob
src/libserver/html/html_url.hxx	[new file with mode: 0644]	patch \| blob