Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

html_url.cxx 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "html_url.hxx"
  17. #include "libutil/str_util.h"
  18. #include "libserver/url.h"
  19. #include "libserver/logger.h"
  20. #include "rspamd.h"
  21. #include <unicode/idna.h>
  22. namespace rspamd::html {
  /**
   * Checks whether one of two host names is a proper subdomain of the other.
   *
   * Trailing dots are ignored on both names. The relation is symmetric: the
   * function returns true when the longer name ends with the shorter one and
   * the character right before that suffix is a '.', i.e. the match falls on a
   * label boundary. The comparison is case sensitive.
   *
   * @param t1 first host name
   * @param t2 second host name
   * @return true if either name is a subdomain of the other; false for equal
   *         names, unrelated names and names that are empty once the trailing
   *         dots are removed
   */
  static auto
  rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
  {
      const auto strip_trailing_dots = [](std::string_view name) -> std::string_view {
          while (!name.empty() && name.back() == '.') {
              name.remove_suffix(1);
          }
          return name;
      };

      t1 = strip_trailing_dots(t1);
      t2 = strip_trailing_dots(t2);

      /* Nothing to compare, or the names can only be equal or unrelated */
      if (t1.empty() || t2.empty() || t1.size() == t2.size()) {
          return false;
      }

      const auto longer = t1.size() > t2.size() ? t1 : t2;
      const auto shorter = t1.size() > t2.size() ? t2 : t1;
      const auto suffix_pos = longer.size() - shorter.size();

      /*
       * The whole shorter name has to match, including its very first
       * character, and the match has to start right after a label separator.
       */
      return longer[suffix_pos - 1] == '.' &&
             longer.compare(suffix_pos, shorter.size(), shorter) == 0;
  }
  61. static auto
  62. get_icu_idna_instance(void) -> auto
  63. {
  64. auto uc_err = U_ZERO_ERROR;
  65. static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err);
  66. return udn;
  67. }
  68. static auto
  69. convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
  70. -> std::string_view
  71. {
  72. std::string_view ret = use_tld ? std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen};
  73. /* Handle IDN url's */
  74. if (ret.size() > 4 &&
  75. rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
  76. const auto buf_capacity = ret.size() * 2 + 1;
  77. auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity);
  78. icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity};
  79. /* We need to convert it to the normal value first */
  80. icu::IDNAInfo info;
  81. auto uc_err = U_ZERO_ERROR;
  82. auto *udn = get_icu_idna_instance();
  83. udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()),
  84. byte_sink, info, uc_err);
  85. if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
  86. /* idn_hbuf is allocated in mempool, so it is safe to use */
  87. ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()};
  88. }
  89. else {
  90. msg_err_pool("cannot convert to IDN: %s (0x%xd)",
  91. u_errorName(uc_err), info.getErrors());
  92. }
  93. }
  94. return ret;
  95. };
  /**
   * Compares two strings for equality ignoring the case of ASCII letters.
   *
   * Only 'A'..'Z' are folded to lower case; all other bytes are compared as
   * they are. The folding is done locally so that the function can really be
   * evaluated in constant expressions.
   *
   * @param s1 first string
   * @param s2 second string
   * @return true if both strings have the same length and the same content up
   *         to ASCII case
   */
  constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> bool
  {
      if (s1.size() != s2.size()) {
          return false;
      }

      for (std::size_t i = 0; i < s1.size(); i++) {
          char c1 = s1[i];
          char c2 = s2[i];

          if (c1 >= 'A' && c1 <= 'Z') {
              c1 = static_cast<char>(c1 - 'A' + 'a');
          }
          if (c2 >= 'A' && c2 <= 'Z') {
              c2 = static_cast<char>(c2 - 'A' + 'a');
          }

          if (c1 != c2) {
              return false;
          }
      }

      return true;
  }
  104. constexpr auto
  105. is_transfer_proto(struct rspamd_url *u) -> bool
  106. {
  107. return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0;
  108. }
  109. auto html_url_is_phished(rspamd_mempool_t *pool,
  110. struct rspamd_url *href_url,
  111. std::string_view text_data) -> std::optional<rspamd_url *>
  112. {
  113. struct rspamd_url *text_url;
  114. std::string_view disp_tok, href_tok;
  115. goffset url_pos;
  116. char *url_str = NULL;
  117. auto sz = text_data.size();
  118. const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
  119. text_data = std::string_view(trimmed, sz);
  120. if (text_data.size() > 4 &&
  121. rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
  122. RSPAMD_URL_FIND_ALL,
  123. &url_pos, NULL) &&
  124. url_str != nullptr) {
  125. if (url_pos > 0) {
  126. /*
  127. * We have some url at some offset, so we need to check what is
  128. * at the start of the text
  129. */
  130. return std::nullopt;
  131. }
  132. text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
  133. auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
  134. RSPAMD_URL_PARSE_TEXT);
  135. if (rc == URI_ERRNO_OK) {
  136. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  137. href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
  138. /* Check for phishing */
  139. if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
  140. disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
  141. href_tok = convert_idna_hostname_maybe(pool, href_url, false);
  142. if (!sv_equals(disp_tok, href_tok) &&
  143. text_url->tldlen > 0 && href_url->tldlen > 0) {
  144. /* Apply the same logic for TLD */
  145. disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
  146. href_tok = convert_idna_hostname_maybe(pool, href_url, true);
  147. if (!sv_equals(disp_tok, href_tok)) {
  148. /* Check if one url is a subdomain for another */
  149. if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
  150. href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
  151. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  152. if (href_url->ext == nullptr) {
  153. href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
  154. }
  155. href_url->ext->linked_url = text_url;
  156. }
  157. }
  158. }
  159. }
  160. return text_url;
  161. }
  162. else {
  163. /*
  164. * We have found something that looks like an url but it was
  165. * not parsed correctly.
  166. * Sometimes it means an obfuscation attempt, so we have to check
  167. * what's inside of the text
  168. */
  169. gboolean obfuscation_found = FALSE;
  170. if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 &&
  171. rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) {
  172. /* Clearly an obfuscation attempt */
  173. obfuscation_found = TRUE;
  174. }
  175. msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s",
  176. url_str,
  177. rspamd_url_strerror(rc),
  178. obfuscation_found ? "yes" : "no");
  179. if (obfuscation_found) {
  180. href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
  181. }
  182. }
  183. }
  184. return std::nullopt;
  185. }
  186. void html_check_displayed_url(rspamd_mempool_t *pool,
  187. GList **exceptions,
  188. void *url_set,
  189. std::string_view visible_part,
  190. goffset href_offset,
  191. struct rspamd_url *url)
  192. {
  193. struct rspamd_url *displayed_url = nullptr;
  194. struct rspamd_url *turl;
  195. struct rspamd_process_exception *ex;
  196. unsigned int saved_flags = 0;
  197. gsize dlen;
  198. if (visible_part.empty()) {
  199. /* No displayed url, just some text within <a> tag */
  200. return;
  201. }
  202. if (url->ext == nullptr) {
  203. url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
  204. }
  205. url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
  206. rspamd_strlcpy(url->ext->visible_part,
  207. visible_part.data(),
  208. visible_part.size() + 1);
  209. dlen = visible_part.size();
  210. /* Strip unicode spaces from the start and the end */
  211. url->ext->visible_part = const_cast<char *>(
  212. rspamd_string_unicode_trim_inplace(url->ext->visible_part,
  213. &dlen));
  214. auto maybe_url = html_url_is_phished(pool, url,
  215. {url->ext->visible_part, dlen});
  216. if (maybe_url) {
  217. url->flags |= saved_flags;
  218. displayed_url = maybe_url.value();
  219. }
  220. if (exceptions && displayed_url != nullptr) {
  221. ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception);
  222. ex->pos = href_offset;
  223. ex->len = dlen;
  224. ex->type = RSPAMD_EXCEPTION_URL;
  225. ex->ptr = url;
  226. *exceptions = g_list_prepend(*exceptions, ex);
  227. }
  228. if (displayed_url && url_set) {
  229. turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url);
  230. if (turl != nullptr) {
  231. /* Here, we assume the following:
  232. * if we have a URL in the text part which
  233. * is the same as displayed URL in the
  234. * HTML part, we assume that it is also
  235. * hint only.
  236. */
  237. if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
  238. /*
  239. * We have the same URL for href and displayed url, so we
  240. * know that this url cannot be both target and display (as
  241. * it breaks logic in many places), so we do not
  242. * propagate html flags
  243. */
  244. if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) {
  245. turl->flags |= displayed_url->flags;
  246. }
  247. turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
  248. }
  249. turl->count++;
  250. }
  251. else {
  252. /* Already inserted by `rspamd_url_set_add_or_return` */
  253. }
  254. }
  255. rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen);
  256. }
  257. auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
  258. -> std::optional<struct rspamd_url *>
  259. {
  260. struct rspamd_url *url;
  261. unsigned int saved_flags = 0;
  262. int rc;
  263. const char *s, *prefix = "http://";
  264. char *d;
  265. gsize dlen;
  266. gboolean has_bad_chars = FALSE, no_prefix = FALSE;
  267. static const char hexdigests[] = "0123456789abcdef";
  268. auto sz = input.length();
  269. const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
  270. input = {trimmed, sz};
  271. const auto *start = input.data();
  272. s = start;
  273. dlen = 0;
  274. for (auto i = 0; i < sz; i++) {
  275. if (G_UNLIKELY(((unsigned int) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
  276. dlen += 3;
  277. }
  278. else {
  279. dlen++;
  280. }
  281. }
  282. if (rspamd_substring_search(start, sz, "://", 3) == -1) {
  283. if (sz >= sizeof("mailto:") &&
  284. (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
  285. memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
  286. memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
  287. /* Exclusion, has valid but 'strange' prefix */
  288. }
  289. else {
  290. for (auto i = 0; i < sz; i++) {
  291. if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) {
  292. if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
  293. prefix = "http:";
  294. dlen += sizeof("http:") - 1;
  295. no_prefix = TRUE;
  296. }
  297. else if (s[i] == '@') {
  298. /* Likely email prefix */
  299. prefix = "mailto://";
  300. dlen += sizeof("mailto://") - 1;
  301. no_prefix = TRUE;
  302. }
  303. else if (s[i] == ':' && i != 0) {
  304. /* Special case */
  305. no_prefix = FALSE;
  306. }
  307. else {
  308. if (i == 0) {
  309. /* No valid data */
  310. return std::nullopt;
  311. }
  312. else {
  313. no_prefix = TRUE;
  314. dlen += strlen(prefix);
  315. }
  316. }
  317. break;
  318. }
  319. }
  320. }
  321. }
  322. auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
  323. d = decoded;
  324. if (no_prefix) {
  325. gsize plen = strlen(prefix);
  326. memcpy(d, prefix, plen);
  327. d += plen;
  328. }
  329. /*
  330. * We also need to remove all internal newlines, spaces
  331. * and encode unsafe characters
  332. * Another obfuscation find in the wild was encoding of the SAFE url characters,
  333. * including essential ones
  334. */
  335. for (auto i = 0; i < sz; i++) {
  336. if (G_UNLIKELY(g_ascii_isspace(s[i]))) {
  337. continue;
  338. }
  339. else if (G_UNLIKELY(((unsigned int) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
  340. /* URL encode */
  341. *d++ = '%';
  342. *d++ = hexdigests[(s[i] >> 4) & 0xf];
  343. *d++ = hexdigests[s[i] & 0xf];
  344. has_bad_chars = TRUE;
  345. }
  346. else if (G_UNLIKELY(s[i] == '%')) {
  347. if (i + 2 < sz) {
  348. auto c1 = s[i + 1];
  349. auto c2 = s[i + 2];
  350. if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) {
  351. auto codepoint = 0;
  352. if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0';
  353. else if (c1 >= 'A' && c1 <= 'F')
  354. codepoint = c1 - 'A' + 10;
  355. else if (c1 >= 'a' && c1 <= 'f')
  356. codepoint = c1 - 'a' + 10;
  357. codepoint <<= 4;
  358. if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0';
  359. else if (c2 >= 'A' && c2 <= 'F')
  360. codepoint += c2 - 'A' + 10;
  361. else if (c2 >= 'a' && c2 <= 'f')
  362. codepoint += c2 - 'a' + 10;
  363. /* Now check for 'interesting' codepoints */
  364. if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
  365. codepoint == '?' || codepoint == '\\' || codepoint == '/') {
  366. /* Replace it back */
  367. *d++ = (char) (codepoint & 0xff);
  368. i += 2;
  369. }
  370. else {
  371. *d++ = s[i];
  372. }
  373. }
  374. else {
  375. *d++ = s[i];
  376. }
  377. }
  378. else {
  379. *d++ = s[i];
  380. }
  381. }
  382. else {
  383. *d++ = s[i];
  384. }
  385. }
  386. *d = '\0';
  387. dlen = d - decoded;
  388. url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
  389. rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
  390. rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
  391. /* Filter some completely damaged urls */
  392. if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
  393. !((url->protocol & PROTOCOL_UNKNOWN))) {
  394. url->flags |= saved_flags;
  395. if (has_bad_chars) {
  396. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  397. }
  398. if (no_prefix) {
  399. url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  400. if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
  401. /* Ignore urls with both no schema and no tld */
  402. return std::nullopt;
  403. }
  404. }
  405. decoded = url->string;
  406. input = {decoded, url->urllen};
  407. /* Spaces in href usually mean an attempt to obfuscate URL */
  408. /* See https://github.com/vstakhov/rspamd/issues/593 */
  409. #if 0
  410. if (has_spaces) {
  411. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  412. }
  413. #endif
  414. return url;
  415. }
  416. return std::nullopt;
  417. }
  418. }// namespace rspamd::html