123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496 |
- /*-
- * Copyright 2021 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #include "html_url.hxx"
- #include "libutil/str_util.h"
- #include "libserver/url.h"
- #include "libserver/logger.h"
- #include "rspamd.h"
-
- #include <unicode/idna.h>
-
- namespace rspamd::html {
-
/**
 * Checks whether one of two hostnames is a proper subdomain of the other.
 *
 * Trailing dots are ignored. The relation is symmetric: the function returns
 * true when the shorter name is a suffix of the longer one and that suffix
 * starts right after a '.' label separator. The comparison is case-sensitive.
 *
 * @param t1 first hostname
 * @param t2 second hostname
 * @return true if t1 is a subdomain of t2 or vice versa; false for equal,
 *         unrelated or empty names
 */
static auto
rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
{
	/* Trailing dots (fully qualified form) do not take part in the comparison */
	const auto strip_trailing_dots = [](std::string_view name) -> std::string_view {
		while (!name.empty() && name.back() == '.') {
			name.remove_suffix(1);
		}

		return name;
	};

	t1 = strip_trailing_dots(t1);
	t2 = strip_trailing_dots(t2);

	/*
	 * Names of the same length are either identical or unrelated, and an
	 * empty name cannot be a parent or a child of anything.
	 */
	if (t1.empty() || t2.empty() || t1.size() == t2.size()) {
		return false;
	}

	const std::string_view longer = t1.size() > t2.size() ? t1 : t2;
	const std::string_view shorter = t1.size() > t2.size() ? t2 : t1;
	const auto extra_len = longer.size() - shorter.size();

	/*
	 * The whole shorter name (including its first character) must match the
	 * tail of the longer one, and the match must start at a label boundary.
	 */
	return longer[extra_len - 1] == '.' && longer.substr(extra_len) == shorter;
}
-
-
- static auto
- get_icu_idna_instance(void) -> auto
- {
- auto uc_err = U_ZERO_ERROR;
- static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err);
-
- return udn;
- }
-
- static auto
- convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
- -> std::string_view
- {
- std::string_view ret = use_tld ? std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen};
-
- /* Handle IDN url's */
- if (ret.size() > 4 &&
- rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
-
- const auto buf_capacity = ret.size() * 2 + 1;
- auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity);
- icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity};
-
- /* We need to convert it to the normal value first */
- icu::IDNAInfo info;
- auto uc_err = U_ZERO_ERROR;
- auto *udn = get_icu_idna_instance();
- udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()),
- byte_sink, info, uc_err);
-
- if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
- /* idn_hbuf is allocated in mempool, so it is safe to use */
- ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()};
- }
- else {
- msg_err_pool("cannot convert to IDN: %s (0x%xd)",
- u_errorName(uc_err), info.getErrors());
- }
- }
-
- return ret;
- };
-
- constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto
- {
- return (s1.size() == s2.size()) &&
- std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
- [](const auto c1, const auto c2) {
- return g_ascii_tolower(c1) == g_ascii_tolower(c2);
- });
- }
-
- constexpr auto
- is_transfer_proto(struct rspamd_url *u) -> bool
- {
- return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0;
- }
-
- auto html_url_is_phished(rspamd_mempool_t *pool,
- struct rspamd_url *href_url,
- std::string_view text_data) -> std::optional<rspamd_url *>
- {
- struct rspamd_url *text_url;
- std::string_view disp_tok, href_tok;
- goffset url_pos;
- char *url_str = NULL;
-
- auto sz = text_data.size();
- const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
- text_data = std::string_view(trimmed, sz);
-
- if (text_data.size() > 4 &&
- rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
- RSPAMD_URL_FIND_ALL,
- &url_pos, NULL) &&
- url_str != nullptr) {
-
- if (url_pos > 0) {
- /*
- * We have some url at some offset, so we need to check what is
- * at the start of the text
- */
- return std::nullopt;
- }
-
- text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
- auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
- RSPAMD_URL_PARSE_TEXT);
-
- if (rc == URI_ERRNO_OK) {
- text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
-
- /* Check for phishing */
- if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
- disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
- href_tok = convert_idna_hostname_maybe(pool, href_url, false);
-
- if (!sv_equals(disp_tok, href_tok) &&
- text_url->tldlen > 0 && href_url->tldlen > 0) {
-
- /* Apply the same logic for TLD */
- disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
- href_tok = convert_idna_hostname_maybe(pool, href_url, true);
-
- if (!sv_equals(disp_tok, href_tok)) {
- /* Check if one url is a subdomain for another */
-
- if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
- href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
- text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
-
- if (href_url->ext == nullptr) {
- href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
- }
- href_url->ext->linked_url = text_url;
- }
- }
- }
- }
-
- return text_url;
- }
- else {
- /*
- * We have found something that looks like an url but it was
- * not parsed correctly.
- * Sometimes it means an obfuscation attempt, so we have to check
- * what's inside of the text
- */
- gboolean obfuscation_found = FALSE;
-
- if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 &&
- rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) {
- /* Clearly an obfuscation attempt */
- obfuscation_found = TRUE;
- }
-
- msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s",
- url_str,
- rspamd_url_strerror(rc),
- obfuscation_found ? "yes" : "no");
-
- if (obfuscation_found) {
- href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
- }
- }
- }
-
- return std::nullopt;
- }
-
- void html_check_displayed_url(rspamd_mempool_t *pool,
- GList **exceptions,
- void *url_set,
- std::string_view visible_part,
- goffset href_offset,
- struct rspamd_url *url)
- {
- struct rspamd_url *displayed_url = nullptr;
- struct rspamd_url *turl;
- struct rspamd_process_exception *ex;
- unsigned int saved_flags = 0;
- gsize dlen;
-
- if (visible_part.empty()) {
- /* No displayed url, just some text within <a> tag */
- return;
- }
-
- if (url->ext == nullptr) {
- url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
- }
- url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
- rspamd_strlcpy(url->ext->visible_part,
- visible_part.data(),
- visible_part.size() + 1);
- dlen = visible_part.size();
-
- /* Strip unicode spaces from the start and the end */
- url->ext->visible_part = const_cast<char *>(
- rspamd_string_unicode_trim_inplace(url->ext->visible_part,
- &dlen));
- auto maybe_url = html_url_is_phished(pool, url,
- {url->ext->visible_part, dlen});
-
- if (maybe_url) {
- url->flags |= saved_flags;
- displayed_url = maybe_url.value();
- }
-
- if (exceptions && displayed_url != nullptr) {
- ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception);
- ex->pos = href_offset;
- ex->len = dlen;
- ex->type = RSPAMD_EXCEPTION_URL;
- ex->ptr = url;
-
- *exceptions = g_list_prepend(*exceptions, ex);
- }
-
- if (displayed_url && url_set) {
- turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url);
-
- if (turl != nullptr) {
- /* Here, we assume the following:
- * if we have a URL in the text part which
- * is the same as displayed URL in the
- * HTML part, we assume that it is also
- * hint only.
- */
- if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
-
- /*
- * We have the same URL for href and displayed url, so we
- * know that this url cannot be both target and display (as
- * it breaks logic in many places), so we do not
- * propagate html flags
- */
- if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) {
- turl->flags |= displayed_url->flags;
- }
- turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
- }
-
- turl->count++;
- }
- else {
- /* Already inserted by `rspamd_url_set_add_or_return` */
- }
- }
-
- rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen);
- }
-
- auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
- -> std::optional<struct rspamd_url *>
- {
- struct rspamd_url *url;
- unsigned int saved_flags = 0;
- int rc;
- const char *s, *prefix = "http://";
- char *d;
- gsize dlen;
- gboolean has_bad_chars = FALSE, no_prefix = FALSE;
- static const char hexdigests[] = "0123456789abcdef";
-
- auto sz = input.length();
- const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
- input = {trimmed, sz};
-
- const auto *start = input.data();
- s = start;
- dlen = 0;
-
- for (auto i = 0; i < sz; i++) {
- if (G_UNLIKELY(((unsigned int) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
- dlen += 3;
- }
- else {
- dlen++;
- }
- }
-
- if (rspamd_substring_search(start, sz, "://", 3) == -1) {
- if (sz >= sizeof("mailto:") &&
- (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
- memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
- memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
- /* Exclusion, has valid but 'strange' prefix */
- }
- else {
- for (auto i = 0; i < sz; i++) {
- if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) {
- if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
- prefix = "http:";
- dlen += sizeof("http:") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == '@') {
- /* Likely email prefix */
- prefix = "mailto://";
- dlen += sizeof("mailto://") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == ':' && i != 0) {
- /* Special case */
- no_prefix = FALSE;
- }
- else {
- if (i == 0) {
- /* No valid data */
- return std::nullopt;
- }
- else {
- no_prefix = TRUE;
- dlen += strlen(prefix);
- }
- }
-
- break;
- }
- }
- }
- }
-
- auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
- d = decoded;
-
- if (no_prefix) {
- gsize plen = strlen(prefix);
- memcpy(d, prefix, plen);
- d += plen;
- }
-
- /*
- * We also need to remove all internal newlines, spaces
- * and encode unsafe characters
- * Another obfuscation find in the wild was encoding of the SAFE url characters,
- * including essential ones
- */
- for (auto i = 0; i < sz; i++) {
- if (G_UNLIKELY(g_ascii_isspace(s[i]))) {
- continue;
- }
- else if (G_UNLIKELY(((unsigned int) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
- /* URL encode */
- *d++ = '%';
- *d++ = hexdigests[(s[i] >> 4) & 0xf];
- *d++ = hexdigests[s[i] & 0xf];
- has_bad_chars = TRUE;
- }
- else if (G_UNLIKELY(s[i] == '%')) {
- if (i + 2 < sz) {
- auto c1 = s[i + 1];
- auto c2 = s[i + 2];
-
- if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) {
- auto codepoint = 0;
-
- if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0';
- else if (c1 >= 'A' && c1 <= 'F')
- codepoint = c1 - 'A' + 10;
- else if (c1 >= 'a' && c1 <= 'f')
- codepoint = c1 - 'a' + 10;
-
- codepoint <<= 4;
-
- if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0';
- else if (c2 >= 'A' && c2 <= 'F')
- codepoint += c2 - 'A' + 10;
- else if (c2 >= 'a' && c2 <= 'f')
- codepoint += c2 - 'a' + 10;
-
- /* Now check for 'interesting' codepoints */
- if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
- codepoint == '?' || codepoint == '\\' || codepoint == '/') {
- /* Replace it back */
- *d++ = (char) (codepoint & 0xff);
- i += 2;
- }
- else {
- *d++ = s[i];
- }
- }
- else {
- *d++ = s[i];
- }
- }
- else {
- *d++ = s[i];
- }
- }
- else {
- *d++ = s[i];
- }
- }
-
- *d = '\0';
- dlen = d - decoded;
-
- url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
- rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
- rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
-
- /* Filter some completely damaged urls */
- if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
- !((url->protocol & PROTOCOL_UNKNOWN))) {
- url->flags |= saved_flags;
-
- if (has_bad_chars) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if (no_prefix) {
- url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
-
- if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
- /* Ignore urls with both no schema and no tld */
- return std::nullopt;
- }
- }
-
- decoded = url->string;
-
- input = {decoded, url->urllen};
-
- /* Spaces in href usually mean an attempt to obfuscate URL */
- /* See https://github.com/vstakhov/rspamd/issues/593 */
- #if 0
- if (has_spaces) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
- #endif
-
- return url;
- }
-
- return std::nullopt;
- }
-
- }// namespace rspamd::html
|