You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html_url.cxx 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "html_url.hxx"
  17. #include "libutil/str_util.h"
  18. #include "libserver/url.h"
  19. #include "libserver/logger.h"
  20. #include "rspamd.h"
  21. #include <unicode/idna.h>
  22. namespace rspamd::html {
  /**
   * Checks whether one of the two host names is a proper subdomain of the other.
   *
   * The relation is symmetric: the result is `true` when `t1` is a subdomain of
   * `t2` or when `t2` is a subdomain of `t1`. Trailing dots are ignored and the
   * comparison is case-sensitive.
   *
   * @param t1 first host name
   * @param t2 second host name
   * @return `true` if the longer name ends with the shorter one and the match
   *         starts right after a '.' in the longer name; `false` otherwise
   *         (including equal names and empty input)
   */
  static auto
  rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
  {
      /* Skip trailing dots, so "example.com." is handled as "example.com" */
      const auto strip_trailing_dots = [](std::string_view sv) -> std::string_view {
          const auto last = sv.find_last_not_of('.');

          return last == std::string_view::npos ? std::string_view{} : sv.substr(0, last + 1);
      };

      const auto h1 = strip_trailing_dots(t1);
      const auto h2 = strip_trailing_dots(t2);

      /* Names of the same length are either equal or unrelated */
      if (h1.empty() || h2.empty() || h1.size() == h2.size()) {
          return false;
      }

      const auto &longer = h1.size() > h2.size() ? h1 : h2;
      const auto &shorter = h1.size() > h2.size() ? h2 : h1;
      const auto suffix_pos = longer.size() - shorter.size();

      /*
       * The whole shorter name must match (including its first character) and
       * the match must begin at a label boundary of the longer name
       */
      return longer[suffix_pos - 1] == '.' &&
             longer.compare(suffix_pos, shorter.size(), shorter) == 0;
  }
  61. static auto
  62. get_icu_idna_instance(void) -> auto
  63. {
  64. auto uc_err = U_ZERO_ERROR;
  65. static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err);
  66. return udn;
  67. }
  68. static auto
  69. convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
  70. -> std::string_view
  71. {
  72. std::string_view ret = use_tld ?
  73. std::string_view{rspamd_url_tld_unsafe (url), url->tldlen} :
  74. std::string_view {rspamd_url_host_unsafe (url), url->hostlen};
  75. /* Handle IDN url's */
  76. if (ret.size() > 4 &&
  77. rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
  78. const auto buf_capacity = ret.size() * 2 + 1;
  79. auto *idn_hbuf = (char *)rspamd_mempool_alloc (pool, buf_capacity);
  80. icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int)buf_capacity};
  81. /* We need to convert it to the normal value first */
  82. icu::IDNAInfo info;
  83. auto uc_err = U_ZERO_ERROR;
  84. auto *udn = get_icu_idna_instance();
  85. udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()),
  86. byte_sink, info, uc_err);
  87. if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
  88. /* idn_hbuf is allocated in mempool, so it is safe to use */
  89. ret = std::string_view{idn_hbuf, (std::size_t)byte_sink.NumberOfBytesWritten()};
  90. }
  91. else {
  92. msg_err_pool ("cannot convert to IDN: %s (0x%xd)",
  93. u_errorName(uc_err), info.getErrors());
  94. }
  95. }
  96. return ret;
  97. };
  98. constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto {
  99. return (s1.size() == s2.size()) &&
  100. std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
  101. [](const auto c1, const auto c2) {
  102. return g_ascii_tolower(c1) == g_ascii_tolower(c2);
  103. });
  104. }
  105. constexpr auto
  106. is_transfer_proto(struct rspamd_url *u) -> bool
  107. {
  108. return (u->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_FTP)) != 0;
  109. }
  110. auto
  111. html_url_is_phished(rspamd_mempool_t *pool,
  112. struct rspamd_url *href_url,
  113. std::string_view text_data) -> std::optional<rspamd_url *>
  114. {
  115. struct rspamd_url *text_url;
  116. std::string_view disp_tok, href_tok;
  117. goffset url_pos;
  118. gchar *url_str = NULL;
  119. auto sz = text_data.size();
  120. const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
  121. text_data = std::string_view(trimmed, sz);
  122. if (text_data.size() > 4 &&
  123. rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
  124. RSPAMD_URL_FIND_ALL,
  125. &url_pos, NULL) && url_str != nullptr) {
  126. if (url_pos > 0) {
  127. /*
  128. * We have some url at some offset, so we need to check what is
  129. * at the start of the text
  130. */
  131. return std::nullopt;
  132. }
  133. text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
  134. auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
  135. RSPAMD_URL_PARSE_TEXT);
  136. if (rc == URI_ERRNO_OK) {
  137. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  138. href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
  139. /* Check for phishing */
  140. if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
  141. disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
  142. href_tok = convert_idna_hostname_maybe(pool, href_url, false);
  143. if (!sv_equals(disp_tok, href_tok) &&
  144. text_url->tldlen > 0 && href_url->tldlen > 0) {
  145. /* Apply the same logic for TLD */
  146. disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
  147. href_tok = convert_idna_hostname_maybe(pool, href_url, true);
  148. if (!sv_equals(disp_tok, href_tok)) {
  149. /* Check if one url is a subdomain for another */
  150. if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
  151. href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
  152. href_url->linked_url = text_url;
  153. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  154. }
  155. }
  156. }
  157. }
  158. return text_url;
  159. }
  160. else {
  161. /*
  162. * We have found something that looks like an url but it was
  163. * not parsed correctly.
  164. * Sometimes it means an obfuscation attempt, so we have to check
  165. * what's inside of the text
  166. */
  167. gboolean obfuscation_found = FALSE;
  168. if (text_data.size() > 4
  169. && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 &&
  170. rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) {
  171. /* Clearly an obfuscation attempt */
  172. obfuscation_found = TRUE;
  173. }
  174. msg_info_pool ("extract of url '%s' failed: %s; obfuscation detected: %s",
  175. url_str,
  176. rspamd_url_strerror(rc),
  177. obfuscation_found ? "yes" : "no");
  178. if (obfuscation_found) {
  179. href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
  180. }
  181. }
  182. }
  183. return std::nullopt;
  184. }
  185. void
  186. html_check_displayed_url(rspamd_mempool_t *pool,
  187. GList **exceptions,
  188. void *url_set,
  189. std::string_view visible_part,
  190. goffset href_offset,
  191. struct rspamd_url *url)
  192. {
  193. struct rspamd_url *displayed_url = nullptr;
  194. struct rspamd_url *turl;
  195. struct rspamd_process_exception *ex;
  196. guint saved_flags = 0;
  197. gsize dlen;
  198. if (visible_part.empty()) {
  199. /* No displayed url, just some text within <a> tag */
  200. return;
  201. }
  202. url->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
  203. rspamd_strlcpy(url->visible_part,
  204. visible_part.data(),
  205. visible_part.size() + 1);
  206. dlen = visible_part.size();
  207. /* Strip unicode spaces from the start and the end */
  208. url->visible_part = const_cast<char *>(
  209. rspamd_string_unicode_trim_inplace(url->visible_part,
  210. &dlen));
  211. auto maybe_url = html_url_is_phished(pool, url,
  212. {url->visible_part, dlen});
  213. if (maybe_url) {
  214. url->flags |= saved_flags;
  215. displayed_url = maybe_url.value();
  216. }
  217. if (exceptions && displayed_url != nullptr) {
  218. ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
  219. ex->pos = href_offset;
  220. ex->len = dlen;
  221. ex->type = RSPAMD_EXCEPTION_URL;
  222. ex->ptr = url;
  223. *exceptions = g_list_prepend(*exceptions, ex);
  224. }
  225. if (displayed_url && url_set) {
  226. turl = rspamd_url_set_add_or_return((khash_t (rspamd_url_hash) *)url_set, displayed_url);
  227. if (turl != nullptr) {
  228. /* Here, we assume the following:
  229. * if we have a URL in the text part which
  230. * is the same as displayed URL in the
  231. * HTML part, we assume that it is also
  232. * hint only.
  233. */
  234. if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
  235. /*
  236. * We have the same URL for href and displayed url, so we
  237. * know that this url cannot be both target and display (as
  238. * it breaks logic in many places), so we do not
  239. * propagate html flags
  240. */
  241. if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) {
  242. turl->flags |= displayed_url->flags;
  243. }
  244. turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
  245. }
  246. turl->count++;
  247. }
  248. else {
  249. /* Already inserted by `rspamd_url_set_add_or_return` */
  250. }
  251. }
  252. rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
  253. }
  254. auto
  255. html_process_url(rspamd_mempool_t *pool, std::string_view &input)
  256. -> std::optional<struct rspamd_url *>
  257. {
  258. struct rspamd_url *url;
  259. guint saved_flags = 0;
  260. gint rc;
  261. const gchar *s, *prefix = "http://";
  262. gchar *d;
  263. gsize dlen;
  264. gboolean has_bad_chars = FALSE, no_prefix = FALSE;
  265. static const gchar hexdigests[] = "0123456789abcdef";
  266. auto sz = input.length();
  267. const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
  268. input = {trimmed, sz};
  269. const auto *start = input.data();
  270. s = start;
  271. dlen = 0;
  272. for (auto i = 0; i < sz; i++) {
  273. if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
  274. dlen += 3;
  275. }
  276. else {
  277. dlen++;
  278. }
  279. }
  280. if (rspamd_substring_search(start, sz, "://", 3) == -1) {
  281. if (sz >= sizeof("mailto:") &&
  282. (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
  283. memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
  284. memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
  285. /* Exclusion, has valid but 'strange' prefix */
  286. }
  287. else {
  288. for (auto i = 0; i < sz; i++) {
  289. if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
  290. if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
  291. prefix = "http:";
  292. dlen += sizeof("http:") - 1;
  293. no_prefix = TRUE;
  294. }
  295. else if (s[i] == '@') {
  296. /* Likely email prefix */
  297. prefix = "mailto://";
  298. dlen += sizeof("mailto://") - 1;
  299. no_prefix = TRUE;
  300. }
  301. else if (s[i] == ':' && i != 0) {
  302. /* Special case */
  303. no_prefix = FALSE;
  304. }
  305. else {
  306. if (i == 0) {
  307. /* No valid data */
  308. return std::nullopt;
  309. }
  310. else {
  311. no_prefix = TRUE;
  312. dlen += strlen(prefix);
  313. }
  314. }
  315. break;
  316. }
  317. }
  318. }
  319. }
  320. auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
  321. d = decoded;
  322. if (no_prefix) {
  323. gsize plen = strlen(prefix);
  324. memcpy(d, prefix, plen);
  325. d += plen;
  326. }
  327. /*
  328. * We also need to remove all internal newlines, spaces
  329. * and encode unsafe characters
  330. * Another obfuscation find in the wild was encoding of the SAFE url characters,
  331. * including essential ones
  332. */
  333. for (auto i = 0; i < sz; i++) {
  334. if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
  335. continue;
  336. }
  337. else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
  338. /* URL encode */
  339. *d++ = '%';
  340. *d++ = hexdigests[(s[i] >> 4) & 0xf];
  341. *d++ = hexdigests[s[i] & 0xf];
  342. has_bad_chars = TRUE;
  343. }
  344. else if (G_UNLIKELY (s[i] == '%')) {
  345. if (i + 2 < sz) {
  346. auto c1 = s[i + 1];
  347. auto c2 = s[i + 2];
  348. if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) {
  349. auto codepoint = 0;
  350. if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0';
  351. else if (c1 >= 'A' && c1 <= 'F') codepoint = c1 - 'A' + 10;
  352. else if (c1 >= 'a' && c1 <= 'f') codepoint = c1 - 'a' + 10;
  353. codepoint <<= 4;
  354. if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0';
  355. else if (c2 >= 'A' && c2 <= 'F') codepoint += c2 - 'A' + 10;
  356. else if (c2 >= 'a' && c2 <= 'f') codepoint += c2 - 'a' + 10;
  357. /* Now check for 'interesting' codepoints */
  358. if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
  359. codepoint == '?' || codepoint == '\\' || codepoint == '/') {
  360. /* Replace it back */
  361. *d++ = (char)(codepoint & 0xff);
  362. i += 2;
  363. }
  364. else {
  365. *d++ = s[i];
  366. }
  367. }
  368. else {
  369. *d++ = s[i];
  370. }
  371. }
  372. else {
  373. *d++ = s[i];
  374. }
  375. }
  376. else {
  377. *d++ = s[i];
  378. }
  379. }
  380. *d = '\0';
  381. dlen = d - decoded;
  382. url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
  383. rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
  384. rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
  385. /* Filter some completely damaged urls */
  386. if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
  387. !((url->protocol & PROTOCOL_UNKNOWN))) {
  388. url->flags |= saved_flags;
  389. if (has_bad_chars) {
  390. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  391. }
  392. if (no_prefix) {
  393. url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  394. if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
  395. /* Ignore urls with both no schema and no tld */
  396. return std::nullopt;
  397. }
  398. }
  399. decoded = url->string;
  400. input = {decoded, url->urllen};
  401. /* Spaces in href usually mean an attempt to obfuscate URL */
  402. /* See https://github.com/vstakhov/rspamd/issues/593 */
  403. #if 0
  404. if (has_spaces) {
  405. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  406. }
  407. #endif
  408. return url;
  409. }
  410. return std::nullopt;
  411. }
  412. }