You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

str_util.c 55KB


  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "cryptobox.h"
  19. #include "url.h"
  20. #include "str_util.h"
  21. #include "logger.h"
  22. #include "contrib/t1ha/t1ha.h"
  23. #include <unicode/uversion.h>
  24. #include <unicode/ucnv.h>
  25. #if U_ICU_VERSION_MAJOR_NUM >= 44
  26. #include <unicode/unorm2.h>
  27. #endif
  28. #include <math.h>
  29. const guchar lc_map[256] = {
  30. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  31. 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  32. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  33. 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  34. 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
  35. 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  36. 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
  37. 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  38. 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
  39. 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
  40. 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
  41. 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  42. 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
  43. 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
  44. 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
  45. 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
  46. 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
  47. 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
  48. 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
  49. 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  50. 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
  51. 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
  52. 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
  53. 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
  54. 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
  55. 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  56. 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
  57. 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
  58. 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
  59. 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
  60. 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
  61. 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
  62. };
  63. void
  64. rspamd_str_lc (gchar *str, guint size)
  65. {
  66. guint leftover = size % 4;
  67. guint fp, i;
  68. const uint8_t* s = (const uint8_t*) str;
  69. gchar *dest = str;
  70. guchar c1, c2, c3, c4;
  71. fp = size - leftover;
  72. for (i = 0; i != fp; i += 4) {
  73. c1 = s[i], c2 = s[i + 1], c3 = s[i + 2], c4 = s[i + 3];
  74. dest[0] = lc_map[c1];
  75. dest[1] = lc_map[c2];
  76. dest[2] = lc_map[c3];
  77. dest[3] = lc_map[c4];
  78. dest += 4;
  79. }
  80. switch (leftover) {
  81. case 3:
  82. *dest++ = lc_map[(guchar)str[i++]];
  83. /* FALLTHRU */
  84. case 2:
  85. *dest++ = lc_map[(guchar)str[i++]];
  86. /* FALLTHRU */
  87. case 1:
  88. *dest = lc_map[(guchar)str[i]];
  89. }
  90. }
  91. gint
  92. rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l)
  93. {
  94. guint fp, i;
  95. guchar c1, c2, c3, c4;
  96. union {
  97. guchar c[4];
  98. guint32 n;
  99. } cmp1, cmp2;
  100. gsize leftover = l % 4;
  101. gint ret = 0;
  102. fp = l - leftover;
  103. for (i = 0; i != fp; i += 4) {
  104. c1 = s[i], c2 = s[i + 1], c3 = s[i + 2], c4 = s[i + 3];
  105. cmp1.c[0] = lc_map[c1];
  106. cmp1.c[1] = lc_map[c2];
  107. cmp1.c[2] = lc_map[c3];
  108. cmp1.c[3] = lc_map[c4];
  109. c1 = d[i], c2 = d[i + 1], c3 = d[i + 2], c4 = d[i + 3];
  110. cmp2.c[0] = lc_map[c1];
  111. cmp2.c[1] = lc_map[c2];
  112. cmp2.c[2] = lc_map[c3];
  113. cmp2.c[3] = lc_map[c4];
  114. if (cmp1.n != cmp2.n) {
  115. return cmp1.n - cmp2.n;
  116. }
  117. }
  118. while (leftover > 0) {
  119. if (g_ascii_tolower (s[i]) != g_ascii_tolower (d[i])) {
  120. return s[i] - d[i];
  121. }
  122. leftover--;
  123. i++;
  124. }
  125. return ret;
  126. }
  127. /*
  128. * The purpose of this function is fast and in place conversion of a unicode
  129. * string to lower case, so some locale peculiarities are simply ignored
  130. * If the target string is longer than initial one, then we just trim it
  131. */
  132. void
  133. rspamd_str_lc_utf8 (gchar *str, guint size)
  134. {
  135. const gchar *s = str, *p;
  136. gchar *d = str, tst[6];
  137. gint remain = size;
  138. gint r;
  139. gunichar uc;
  140. while (remain > 0) {
  141. p = g_utf8_next_char (s);
  142. if (p - s > remain) {
  143. break;
  144. }
  145. uc = g_utf8_get_char (s);
  146. uc = g_unichar_tolower (uc);
  147. if (remain >= 6) {
  148. r = g_unichar_to_utf8 (uc, d);
  149. }
  150. else {
  151. /* We must be cautious here to avoid broken unicode being append */
  152. r = g_unichar_to_utf8 (uc, tst);
  153. if (r > remain) {
  154. break;
  155. }
  156. else {
  157. memcpy (d, tst, r);
  158. }
  159. }
  160. remain -= r;
  161. s = p;
  162. d += r;
  163. }
  164. }
  165. gboolean
  166. rspamd_strcase_equal (gconstpointer v, gconstpointer v2)
  167. {
  168. if (g_ascii_strcasecmp ((const gchar *)v, (const gchar *)v2) == 0) {
  169. return TRUE;
  170. }
  171. return FALSE;
  172. }
  173. guint64
  174. rspamd_icase_hash (const gchar *in, gsize len, guint64 seed)
  175. {
  176. guint leftover = len % sizeof (guint64);
  177. guint fp, i;
  178. const uint8_t* s = (const uint8_t*) in;
  179. union {
  180. struct {
  181. guchar c1, c2, c3, c4, c5, c6, c7, c8;
  182. } c;
  183. guint64 pp;
  184. } u;
  185. guint64 h = seed;
  186. fp = len - leftover;
  187. for (i = 0; i != fp; i += 8) {
  188. u.c.c1 = s[i], u.c.c2 = s[i + 1], u.c.c3 = s[i + 2], u.c.c4 = s[i + 3];
  189. u.c.c5 = s[i + 4], u.c.c6 = s[i + 5], u.c.c7 = s[i + 6], u.c.c8 = s[i + 7];
  190. u.c.c1 = lc_map[u.c.c1];
  191. u.c.c2 = lc_map[u.c.c2];
  192. u.c.c3 = lc_map[u.c.c3];
  193. u.c.c4 = lc_map[u.c.c4];
  194. u.c.c5 = lc_map[u.c.c5];
  195. u.c.c6 = lc_map[u.c.c6];
  196. u.c.c7 = lc_map[u.c.c7];
  197. u.c.c8 = lc_map[u.c.c8];
  198. h = t1ha (&u.pp, sizeof (u), h);
  199. }
  200. u.pp = 0;
  201. switch (leftover) {
  202. case 7:
  203. u.c.c7 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  204. case 6:
  205. u.c.c6 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  206. case 5:
  207. u.c.c5 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  208. case 4:
  209. u.c.c4 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  210. case 3:
  211. u.c.c3 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  212. case 2:
  213. u.c.c2 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  214. case 1:
  215. u.c.c1 = lc_map[(guchar)s[i]];
  216. break;
  217. }
  218. h = t1ha (&u.pp, sizeof (u), h);
  219. return h;
  220. }
  221. guint
  222. rspamd_strcase_hash (gconstpointer key)
  223. {
  224. const gchar *p = key;
  225. gsize len;
  226. len = strlen (p);
  227. return rspamd_icase_hash (p, len, rspamd_hash_seed ());
  228. }
  229. guint
  230. rspamd_str_hash (gconstpointer key)
  231. {
  232. gsize len;
  233. len = strlen ((const gchar *)key);
  234. return rspamd_cryptobox_fast_hash (key, len, rspamd_hash_seed ());
  235. }
  236. gboolean
  237. rspamd_str_equal (gconstpointer v, gconstpointer v2)
  238. {
  239. return strcmp ((const gchar *)v, (const gchar *)v2) == 0;
  240. }
  241. gboolean
  242. rspamd_ftok_icase_equal (gconstpointer v, gconstpointer v2)
  243. {
  244. const rspamd_ftok_t *f1 = v, *f2 = v2;
  245. if (f1->len == f2->len &&
  246. rspamd_lc_cmp (f1->begin, f2->begin, f1->len) == 0) {
  247. return TRUE;
  248. }
  249. return FALSE;
  250. }
  251. guint
  252. rspamd_ftok_icase_hash (gconstpointer key)
  253. {
  254. const rspamd_ftok_t *f = key;
  255. return rspamd_icase_hash (f->begin, f->len, rspamd_hash_seed ());
  256. }
  257. gboolean
  258. rspamd_ftok_equal (gconstpointer v, gconstpointer v2)
  259. {
  260. const rspamd_ftok_t *f1 = v, *f2 = v2;
  261. if (f1->len == f2->len &&
  262. memcmp (f1->begin, f2->begin, f1->len) == 0) {
  263. return TRUE;
  264. }
  265. return FALSE;
  266. }
  267. guint
  268. rspamd_ftok_hash (gconstpointer key)
  269. {
  270. const rspamd_ftok_t *f = key;
  271. return t1ha (f->begin, f->len, rspamd_hash_seed ());
  272. }
  273. gboolean
  274. rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2)
  275. {
  276. const GString *f1 = v, *f2 = v2;
  277. if (f1->len == f2->len &&
  278. rspamd_lc_cmp (f1->str, f2->str, f1->len) == 0) {
  279. return TRUE;
  280. }
  281. return FALSE;
  282. }
  283. guint
  284. rspamd_gstring_icase_hash (gconstpointer key)
  285. {
  286. const GString *f = key;
  287. return rspamd_icase_hash (f->str, f->len, rspamd_hash_seed ());
  288. }
  289. /* https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */
  290. #define MEM_ALIGN (sizeof(gsize)-1)
  291. #if defined(__LP64__) || defined(_LP64)
  292. #define WORD_TYPE guint64
  293. #define ZEROMASK 0x7F7F7F7F7F7F7F7FLLU
  294. #else
  295. #define WORD_TYPE guint32
  296. #define ZEROMASK 0x7F7F7F7FU
  297. #endif
  298. #define HASZERO(x) ~(((((x) & ZEROMASK) + ZEROMASK) | (x)) | ZEROMASK)
  299. gsize
  300. rspamd_strlcpy_fast (gchar *dst, const gchar *src, gsize siz)
  301. {
  302. gchar *d = dst;
  303. const gchar *s = src;
  304. gsize n = siz;
  305. WORD_TYPE *wd;
  306. const WORD_TYPE *ws;
  307. /* Copy as many bytes as will fit */
  308. if (n-- != 0) {
  309. if (((uintptr_t) s & MEM_ALIGN) == ((uintptr_t) d & MEM_ALIGN)) {
  310. /* Init copy byte by byte */
  311. for (; ((uintptr_t) s & MEM_ALIGN) && n && (*d = *s); n--, s++, d++);
  312. if (n && *s) {
  313. wd = (void *) d;
  314. ws = (const void *) s;
  315. /*
  316. * Copy by 32 or 64 bits (causes valgrind warnings)
  317. */
  318. for (; n >= sizeof (WORD_TYPE) && !HASZERO(*ws);
  319. n -= sizeof (WORD_TYPE), ws++, wd++) {
  320. *wd = *ws;
  321. }
  322. d = (void *) wd;
  323. s = (const void *) ws;
  324. }
  325. }
  326. /* Copy the rest */
  327. for (; n && (*d = *s); n--, s++, d++);
  328. *d = 0;
  329. }
  330. else {
  331. return 0;
  332. }
  333. return (d - dst);
  334. }
  335. gsize
  336. rspamd_null_safe_copy (const gchar *src, gsize srclen,
  337. gchar *dest, gsize destlen)
  338. {
  339. gsize copied = 0, si = 0, di = 0;
  340. if (destlen == 0) {
  341. return 0;
  342. }
  343. while (si < srclen && di + 1 < destlen) {
  344. if (src[si] != '\0') {
  345. dest[di++] = src[si++];
  346. copied ++;
  347. }
  348. else {
  349. si ++;
  350. }
  351. }
  352. dest[di] = '\0';
  353. return copied;
  354. }
  355. size_t
  356. rspamd_strlcpy_safe (gchar *dst, const gchar *src, gsize siz)
  357. {
  358. gchar *d = dst;
  359. gsize nleft = siz;
  360. if (nleft != 0) {
  361. while (--nleft != 0) {
  362. if ((*d++ = *src++) == '\0') {
  363. d --;
  364. break;
  365. }
  366. }
  367. }
  368. if (nleft == 0) {
  369. if (siz != 0) {
  370. *d = '\0';
  371. }
  372. }
  373. return (d - dst);
  374. }
  375. /*
  376. * Try to convert string of length to long
  377. */
  378. gboolean
  379. rspamd_strtol (const gchar *s, gsize len, glong *value)
  380. {
  381. const gchar *p = s, *end = s + len;
  382. gchar c;
  383. glong v = 0;
  384. const glong cutoff = G_MAXLONG / 10, cutlim = G_MAXLONG % 10;
  385. gboolean neg;
  386. /* Case negative values */
  387. if (*p == '-') {
  388. neg = TRUE;
  389. p++;
  390. }
  391. else {
  392. neg = FALSE;
  393. }
  394. /* Some preparations for range errors */
  395. while (p < end) {
  396. c = *p;
  397. if (c >= '0' && c <= '9') {
  398. c -= '0';
  399. if (v > cutoff || (v == cutoff && c > cutlim)) {
  400. /* Range error */
  401. *value = neg ? G_MINLONG : G_MAXLONG;
  402. return FALSE;
  403. }
  404. else {
  405. v *= 10;
  406. v += c;
  407. }
  408. }
  409. else {
  410. return FALSE;
  411. }
  412. p++;
  413. }
  414. *value = neg ? -(v) : v;
  415. return TRUE;
  416. }
  417. /*
  418. * Try to convert string of length to long
  419. */
  420. gboolean
  421. rspamd_strtoul (const gchar *s, gsize len, gulong *value)
  422. {
  423. const gchar *p = s, *end = s + len;
  424. gchar c;
  425. gulong v = 0;
  426. const gulong cutoff = G_MAXULONG / 10, cutlim = G_MAXULONG % 10;
  427. /* Some preparations for range errors */
  428. while (p < end) {
  429. c = *p;
  430. if (c >= '0' && c <= '9') {
  431. c -= '0';
  432. if (v > cutoff || (v == cutoff && (guint8)c > cutlim)) {
  433. /* Range error */
  434. *value = G_MAXULONG;
  435. return FALSE;
  436. }
  437. else {
  438. v *= 10;
  439. v += c;
  440. }
  441. }
  442. else {
  443. *value = v;
  444. return FALSE;
  445. }
  446. p++;
  447. }
  448. *value = v;
  449. return TRUE;
  450. }
  451. /**
  452. * Utility function to provide mem_pool copy for rspamd_hash_table_copy function
  453. * @param data string to copy
  454. * @param ud memory pool to use
  455. * @return
  456. */
  457. gpointer
  458. rspamd_str_pool_copy (gconstpointer data, gpointer ud)
  459. {
  460. rspamd_mempool_t *pool = ud;
  461. return data ? rspamd_mempool_strdup (pool, data) : NULL;
  462. }
  463. /*
  464. * We use here z-base32 encoding described here:
  465. * http://philzimmermann.com/docs/human-oriented-base-32-encoding.txt
  466. */
  467. gint
  468. rspamd_encode_base32_buf (const guchar *in, gsize inlen, gchar *out,
  469. gsize outlen)
  470. {
  471. static const char b32[]="ybndrfg8ejkmcpqxot1uwisza345h769";
  472. gchar *o, *end;
  473. gsize i;
  474. gint remain = -1, x;
  475. end = out + outlen;
  476. o = out;
  477. for (i = 0; i < inlen && o < end - 1; i++) {
  478. switch (i % 5) {
  479. case 0:
  480. /* 8 bits of input and 3 to remain */
  481. x = in[i];
  482. remain = in[i] >> 5;
  483. *o++ = b32[x & 0x1F];
  484. break;
  485. case 1:
  486. /* 11 bits of input, 1 to remain */
  487. x = remain | in[i] << 3;
  488. *o++ = b32[x & 0x1F];
  489. *o++ = b32[x >> 5 & 0x1F];
  490. remain = x >> 10;
  491. break;
  492. case 2:
  493. /* 9 bits of input, 4 to remain */
  494. x = remain | in[i] << 1;
  495. *o++ = b32[x & 0x1F];
  496. remain = x >> 5;
  497. break;
  498. case 3:
  499. /* 12 bits of input, 2 to remain */
  500. x = remain | in[i] << 4;
  501. *o++ = b32[x & 0x1F];
  502. *o++ = b32[x >> 5 & 0x1F];
  503. remain = x >> 10 & 0x3;
  504. break;
  505. case 4:
  506. /* 10 bits of output, nothing to remain */
  507. x = remain | in[i] << 2;
  508. *o++ = b32[x & 0x1F];
  509. *o++ = b32[x >> 5 & 0x1F];
  510. remain = -1;
  511. break;
  512. default:
  513. /* Not to be happen */
  514. break;
  515. }
  516. }
  517. if (remain >= 0 && o < end) {
  518. *o++ = b32[remain];
  519. }
  520. if (o <= end) {
  521. return (o - out);
  522. }
  523. return -1;
  524. }
  525. gchar *
  526. rspamd_encode_base32 (const guchar *in, gsize inlen)
  527. {
  528. gsize allocated_len = inlen * 8 / 5 + 2;
  529. gchar *out;
  530. gint outlen;
  531. out = g_malloc (allocated_len);
  532. outlen = rspamd_encode_base32_buf (in, inlen, out, allocated_len - 1);
  533. if (outlen >= 0) {
  534. out[outlen] = 0;
  535. return out;
  536. }
  537. g_free (out);
  538. return NULL;
  539. }
  540. static const guchar b32_dec[] = {
  541. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  542. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  543. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  544. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  545. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  546. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  547. 0xff, 0x12, 0xff, 0x19, 0x1a, 0x1b, 0x1e, 0x1d,
  548. 0x07, 0x1f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  549. 0xff, 0x18, 0x01, 0x0c, 0x03, 0x08, 0x05, 0x06,
  550. 0x1c, 0x15, 0x09, 0x0a, 0xff, 0x0b, 0x02, 0x10,
  551. 0x0d, 0x0e, 0x04, 0x16, 0x11, 0x13, 0xff, 0x14,
  552. 0x0f, 0x00, 0x17, 0xff, 0xff, 0xff, 0xff, 0xff,
  553. 0xff, 0x18, 0x01, 0x0c, 0x03, 0x08, 0x05, 0x06,
  554. 0x1c, 0x15, 0x09, 0x0a, 0xff, 0x0b, 0x02, 0x10,
  555. 0x0d, 0x0e, 0x04, 0x16, 0x11, 0x13, 0xff, 0x14,
  556. 0x0f, 0x00, 0x17, 0xff, 0xff, 0xff, 0xff, 0xff,
  557. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  558. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  559. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  560. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  561. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  562. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  563. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  564. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  565. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  566. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  567. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  568. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  569. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  570. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  571. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  572. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  573. };
  574. gint
  575. rspamd_decode_base32_buf (const gchar *in, gsize inlen,
  576. guchar *out, gsize outlen)
  577. {
  578. guchar *o, *end, decoded;
  579. guchar c;
  580. guint acc = 0U;
  581. guint processed_bits = 0;
  582. gsize i;
  583. end = out + outlen;
  584. o = out;
  585. for (i = 0; i < inlen; i ++) {
  586. c = (guchar)in[i];
  587. if (processed_bits >= 8) {
  588. processed_bits -= 8;
  589. *o++ = acc & 0xFF;
  590. acc >>= 8;
  591. }
  592. decoded = b32_dec[c];
  593. if (decoded == 0xff || o >= end) {
  594. return -1;
  595. }
  596. acc = (decoded << processed_bits) | acc;
  597. processed_bits += 5;
  598. }
  599. if (processed_bits > 0 && o < end) {
  600. *o++ = (acc & 0xFF);
  601. }
  602. else if (o > end) {
  603. return -1;
  604. }
  605. return (o - out);
  606. }
  607. guchar*
  608. rspamd_decode_base32 (const gchar *in, gsize inlen, gsize *outlen)
  609. {
  610. guchar *res;
  611. gsize allocated_len = inlen * 5 / 8 + 2;
  612. gssize olen;
  613. res = g_malloc (allocated_len);
  614. olen = rspamd_decode_base32_buf (in, inlen, res, allocated_len - 1);
  615. if (olen >= 0) {
  616. res[olen] = '\0';
  617. }
  618. else {
  619. g_free (res);
  620. return NULL;
  621. }
  622. if (outlen) {
  623. *outlen = olen;
  624. }
  625. return res;
  626. }
  627. /**
  628. * Decode string using base32 encoding
  629. * @param in input
  630. * @param inlen input length
  631. * @param out output buf (may overlap with `in`)
  632. * @param outlen output buf len
  633. * @return TRUE if in is valid base32 and `outlen` is enough to encode `inlen`
  634. */
  635. static gchar *
  636. rspamd_encode_base64_common (const guchar *in, gsize inlen, gint str_len,
  637. gsize *outlen, gboolean fold, enum rspamd_newlines_type how)
  638. {
  639. #define ADD_SPLIT do { \
  640. if (how == RSPAMD_TASK_NEWLINES_CR || how == RSPAMD_TASK_NEWLINES_CRLF) *o++ = '\r'; \
  641. if (how == RSPAMD_TASK_NEWLINES_LF || how == RSPAMD_TASK_NEWLINES_CRLF) *o++ = '\n'; \
  642. if (fold) *o++ = '\t'; \
  643. } while (0)
  644. #define CHECK_SPLIT \
  645. do { if (str_len > 0 && cols >= str_len) { \
  646. ADD_SPLIT; \
  647. cols = 0; \
  648. } } \
  649. while (0)
  650. gsize allocated_len = (inlen / 3) * 4 + 5;
  651. gchar *out, *o;
  652. guint64 n;
  653. guint32 rem, t, carry;
  654. gint cols, shift;
  655. static const char b64_enc[] =
  656. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  657. "abcdefghijklmnopqrstuvwxyz"
  658. "0123456789+/";
  659. if (str_len > 0) {
  660. g_assert (str_len > 8);
  661. if (fold) {
  662. switch (how) {
  663. case RSPAMD_TASK_NEWLINES_CR:
  664. case RSPAMD_TASK_NEWLINES_LF:
  665. allocated_len += (allocated_len / str_len + 1) * 2 + 1;
  666. break;
  667. default:
  668. allocated_len += (allocated_len / str_len + 1) * 3 + 1;
  669. break;
  670. }
  671. }
  672. else {
  673. switch (how) {
  674. case RSPAMD_TASK_NEWLINES_CR:
  675. case RSPAMD_TASK_NEWLINES_LF:
  676. allocated_len += (allocated_len / str_len + 1) * 1 + 1;
  677. break;
  678. default:
  679. allocated_len += (allocated_len / str_len + 1) * 2 + 1;
  680. break;
  681. }
  682. }
  683. }
  684. out = g_malloc (allocated_len);
  685. o = out;
  686. cols = 0;
  687. while (inlen > 6) {
  688. n = *(guint64 *)in;
  689. n = GUINT64_TO_BE (n);
  690. if (str_len <= 0 || cols <= str_len - 8) {
  691. *o++ = b64_enc[(n >> 58) & 0x3F];
  692. *o++ = b64_enc[(n >> 52) & 0x3F];
  693. *o++ = b64_enc[(n >> 46) & 0x3F];
  694. *o++ = b64_enc[(n >> 40) & 0x3F];
  695. *o++ = b64_enc[(n >> 34) & 0x3F];
  696. *o++ = b64_enc[(n >> 28) & 0x3F];
  697. *o++ = b64_enc[(n >> 22) & 0x3F];
  698. *o++ = b64_enc[(n >> 16) & 0x3F];
  699. cols += 8;
  700. }
  701. else {
  702. cols = str_len - cols;
  703. shift = 58;
  704. while (cols) {
  705. *o++ = b64_enc[(n >> shift) & 0x3F];
  706. shift -= 6;
  707. cols --;
  708. }
  709. ADD_SPLIT;
  710. /* Remaining bytes */
  711. while (shift >= 16) {
  712. *o++ = b64_enc[(n >> shift) & 0x3F];
  713. shift -= 6;
  714. cols ++;
  715. }
  716. }
  717. in += 6;
  718. inlen -= 6;
  719. }
  720. CHECK_SPLIT;
  721. rem = 0;
  722. carry = 0;
  723. for (;;) {
  724. /* Padding + remaining data (0 - 2 bytes) */
  725. switch (rem) {
  726. case 0:
  727. if (inlen-- == 0) {
  728. goto end;
  729. }
  730. t = *in++;
  731. *o++ = b64_enc[t >> 2];
  732. carry = (t << 4) & 0x30;
  733. rem = 1;
  734. cols ++;
  735. case 1:
  736. if (inlen-- == 0) {
  737. goto end;
  738. }
  739. CHECK_SPLIT;
  740. t = *in++;
  741. *o++ = b64_enc[carry | (t >> 4)];
  742. carry = (t << 2) & 0x3C;
  743. rem = 2;
  744. cols ++;
  745. default:
  746. if (inlen-- == 0) {
  747. goto end;
  748. }
  749. CHECK_SPLIT;
  750. t = *in ++;
  751. *o++ = b64_enc[carry | (t >> 6)];
  752. cols ++;
  753. CHECK_SPLIT;
  754. *o++ = b64_enc[t & 0x3F];
  755. cols ++;
  756. CHECK_SPLIT;
  757. rem = 0;
  758. }
  759. }
  760. end:
  761. if (rem == 1) {
  762. *o++ = b64_enc[carry];
  763. cols ++;
  764. CHECK_SPLIT;
  765. *o++ = '=';
  766. cols ++;
  767. CHECK_SPLIT;
  768. *o++ = '=';
  769. cols ++;
  770. CHECK_SPLIT;
  771. }
  772. else if (rem == 2) {
  773. *o++ = b64_enc[carry];
  774. cols ++;
  775. CHECK_SPLIT;
  776. *o++ = '=';
  777. cols ++;
  778. }
  779. CHECK_SPLIT;
  780. *o = '\0';
  781. if (outlen != NULL) {
  782. *outlen = o - out;
  783. }
  784. return out;
  785. }
  786. gchar *
  787. rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len,
  788. gsize *outlen)
  789. {
  790. return rspamd_encode_base64_common (in, inlen, str_len, outlen, FALSE,
  791. RSPAMD_TASK_NEWLINES_CRLF);
  792. }
  793. gchar *
  794. rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
  795. gsize *outlen, enum rspamd_newlines_type how)
  796. {
  797. return rspamd_encode_base64_common (in, inlen, str_len, outlen, TRUE, how);
  798. }
  799. gchar *
  800. rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
  801. gsize *outlen, enum rspamd_newlines_type how)
  802. {
  803. gsize olen = 0, span = 0, i = 0;
  804. gchar *out;
  805. gint ch;
  806. const guchar *end = in + inlen, *p = in;
  807. static const gchar hexdigests[16] = "0123456789ABCDEF";
  808. while (p < end) {
  809. ch = *p;
  810. if (ch < 128 && ch != '\r' && ch != '\n') {
  811. olen ++;
  812. span ++;
  813. }
  814. else {
  815. if (str_len > 0 && span + 5 >= str_len) {
  816. if (how == RSPAMD_TASK_NEWLINES_CRLF) {
  817. /* =\r\n */
  818. olen += 3;
  819. }
  820. else {
  821. olen += 2;
  822. }
  823. span = 0;
  824. }
  825. olen += 3;
  826. span += 3;
  827. }
  828. if (str_len > 0 && span + 3 >= str_len) {
  829. if (how == RSPAMD_TASK_NEWLINES_CRLF) {
  830. /* =\r\n */
  831. olen += 3;
  832. }
  833. else {
  834. olen += 2;
  835. }
  836. span = 0;
  837. }
  838. p ++;
  839. }
  840. out = g_malloc (olen + 1);
  841. p = in;
  842. i = 0;
  843. span = 0;
  844. while (p < end) {
  845. ch = *p;
  846. if (ch < 128 && ch != '\r' && ch != '\n') {
  847. out[i++] = ch;
  848. span ++;
  849. }
  850. else {
  851. if (str_len > 0 && span + 5 >= str_len) {
  852. /* Add new line and then continue */
  853. switch (how) {
  854. default:
  855. case RSPAMD_TASK_NEWLINES_CRLF:
  856. out[i++] = '=';
  857. out[i++] = '\r';
  858. out[i++] = '\n';
  859. break;
  860. case RSPAMD_TASK_NEWLINES_LF:
  861. out[i++] = '=';
  862. out[i++] = '\n';
  863. break;
  864. case RSPAMD_TASK_NEWLINES_CR:
  865. out[i++] = '=';
  866. out[i++] = '\r';
  867. break;
  868. }
  869. span = 0;
  870. }
  871. out[i++] = '=';
  872. out[i++] = hexdigests[((ch >> 4) & 0xF)];
  873. out[i++] = hexdigests[(ch & 0xF)];
  874. span += 3;
  875. }
  876. if (str_len > 0 && span + 3 >= str_len) {
  877. /* Add new line and then continue */
  878. switch (how) {
  879. default:
  880. case RSPAMD_TASK_NEWLINES_CRLF:
  881. out[i++] = '=';
  882. out[i++] = '\r';
  883. out[i++] = '\n';
  884. break;
  885. case RSPAMD_TASK_NEWLINES_LF:
  886. out[i++] = '=';
  887. out[i++] = '\n';
  888. break;
  889. case RSPAMD_TASK_NEWLINES_CR:
  890. out[i++] = '=';
  891. out[i++] = '\r';
  892. break;
  893. }
  894. span = 0;
  895. }
  896. g_assert (i <= olen);
  897. p ++;
  898. }
  899. out[i] = '\0';
  900. if (outlen) {
  901. *outlen = i;
  902. }
  903. return out;
  904. }
  905. #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
  906. gint
  907. rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
  908. const gchar *s2, gsize s2len,
  909. guint replace_cost)
  910. {
  911. gchar c1, c2, last_c2, last_c1;
  912. static GArray *current_row = NULL, *prev_row = NULL, *transp_row = NULL;
  913. gint eq;
  914. static const guint max_cmp = 8192;
  915. gint ret;
  916. g_assert (s1 != NULL);
  917. g_assert (s2 != NULL);
  918. if (s1len == 0) {
  919. s1len = strlen (s1);
  920. }
  921. if (s2len == 0) {
  922. s2len = strlen (s2);
  923. }
  924. if (MAX(s1len, s2len) > max_cmp) {
  925. /* Cannot compare too many characters */
  926. return max_cmp;
  927. }
  928. if (s1len > s2len) {
  929. /* Exchange s1 and s2 */
  930. const gchar *tmp;
  931. gsize tmplen;
  932. tmp = s2;
  933. s2 = s1;
  934. s1 = tmp;
  935. tmplen = s2len;
  936. s2len = s1len;
  937. s1len = tmplen;
  938. }
  939. /* Adjust static space */
  940. if (current_row == NULL) {
  941. current_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
  942. prev_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
  943. transp_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
  944. g_array_set_size (current_row, s1len + 1);
  945. g_array_set_size (prev_row, s1len + 1);
  946. g_array_set_size (transp_row, s1len + 1);
  947. }
  948. else if (current_row->len < s1len + 1) {
  949. g_array_set_size (current_row, s1len + 1);
  950. g_array_set_size (prev_row, s1len + 1);
  951. g_array_set_size (transp_row, s1len + 1);
  952. }
  953. memset (current_row->data, 0, (s1len + 1) * sizeof (gint));
  954. memset (transp_row->data, 0, (s1len + 1) * sizeof (gint));
  955. for (gint i = 0; i <= s1len; i++) {
  956. g_array_index (prev_row, gint, i) = i;
  957. }
  958. last_c2 = '\0';
  959. for (gint i = 1; i <= s2len; i++) {
  960. c2 = s2[i - 1];
  961. g_array_index (current_row, gint, 0) = i;
  962. last_c1 = '\0';
  963. for (gint j = 1; j <= s1len; j++) {
  964. c1 = s1[j - 1];
  965. eq = c1 == c2 ? 0 : replace_cost;
  966. ret = MIN3 (g_array_index (current_row, gint, j - 1) + 1, /* Insert */
  967. g_array_index (prev_row, gint, j) + 1, /* Remove */
  968. g_array_index (prev_row, gint, j - 1) + eq /* Replace */);
  969. /* Take reordering into account */
  970. if (c1 == last_c2 && c2 == last_c1 && j >= 2) {
  971. ret = MIN (ret, g_array_index (transp_row, gint, j - 2) + eq);
  972. }
  973. g_array_index (current_row, gint, j) = ret;
  974. last_c1 = c1;
  975. }
  976. last_c2 = c2;
  977. /* Exchange pointers */
  978. GArray *tmp;
  979. tmp = transp_row;
  980. transp_row = prev_row;
  981. prev_row = current_row;
  982. current_row = tmp;
  983. }
  984. ret = g_array_index (prev_row, gint, s1len);
  985. return ret;
  986. }
  987. GString *
  988. rspamd_header_value_fold (const gchar *name,
  989. const gchar *value,
  990. guint fold_max,
  991. enum rspamd_newlines_type how,
  992. const gchar *fold_on_chars)
  993. {
  994. GString *res;
  995. const guint default_fold_max = 76;
  996. guint cur_len;
  997. const gchar *p, *c;
  998. guint nspaces = 0;
  999. const gchar *last;
  1000. gboolean first_token = TRUE;
  1001. enum {
  1002. fold_before = 0,
  1003. fold_after
  1004. } fold_type = fold_before;
  1005. enum {
  1006. read_token = 0,
  1007. read_quoted,
  1008. after_quote,
  1009. fold_token,
  1010. } state = read_token, next_state = read_token;
  1011. g_assert (name != NULL);
  1012. g_assert (value != NULL);
  1013. /* Filter insane values */
  1014. if (fold_max < 20) {
  1015. fold_max = default_fold_max;
  1016. }
  1017. res = g_string_sized_new (strlen (value));
  1018. c = value;
  1019. p = c;
  1020. /* name:<WSP> */
  1021. cur_len = strlen (name) + 2;
  1022. while (*p) {
  1023. switch (state) {
  1024. case read_token:
  1025. if (fold_on_chars) {
  1026. if (strchr (fold_on_chars, *p) != NULL) {
  1027. fold_type = fold_after;
  1028. state = fold_token;
  1029. next_state = read_token;
  1030. }
  1031. p ++;
  1032. }
  1033. else {
  1034. if (*p == ',' || *p == ';') {
  1035. /* We have something similar to the token's end, so check len */
  1036. if (cur_len > fold_max * 0.8 && cur_len < fold_max) {
  1037. /* We want fold */
  1038. fold_type = fold_after;
  1039. state = fold_token;
  1040. next_state = read_token;
  1041. } else if (cur_len > fold_max && !first_token) {
  1042. fold_type = fold_before;
  1043. state = fold_token;
  1044. next_state = read_token;
  1045. } else {
  1046. g_string_append_len (res, c, p - c + 1);
  1047. c = p + 1;
  1048. first_token = FALSE;
  1049. }
  1050. p++;
  1051. } else if (*p == '"') {
  1052. /* Fold before quoted tokens */
  1053. g_string_append_len (res, c, p - c);
  1054. c = p;
  1055. state = read_quoted;
  1056. } else if (*p == '\r' || *p == '\n') {
  1057. if (cur_len > fold_max && !first_token) {
  1058. fold_type = fold_before;
  1059. state = fold_token;
  1060. next_state = read_token;
  1061. } else {
  1062. /* Reset line length */
  1063. cur_len = 0;
  1064. while (g_ascii_isspace (*p)) {
  1065. p++;
  1066. }
  1067. g_string_append_len (res, c, p - c);
  1068. c = p;
  1069. first_token = TRUE;
  1070. }
  1071. } else if (g_ascii_isspace (*p)) {
  1072. if (cur_len > fold_max * 0.8 && cur_len < fold_max) {
  1073. /* We want fold */
  1074. fold_type = fold_after;
  1075. state = fold_token;
  1076. next_state = read_token;
  1077. } else if (cur_len > fold_max && !first_token) {
  1078. fold_type = fold_before;
  1079. state = fold_token;
  1080. next_state = read_token;
  1081. } else {
  1082. g_string_append_len (res, c, p - c);
  1083. c = p;
  1084. first_token = FALSE;
  1085. p++;
  1086. cur_len++;
  1087. }
  1088. } else {
  1089. p++;
  1090. cur_len++;
  1091. }
  1092. }
  1093. break;
  1094. case fold_token:
  1095. /* Here, we have token start at 'c' and token end at 'p' */
  1096. if (fold_type == fold_after) {
  1097. nspaces = 0;
  1098. if (p > c) {
  1099. g_string_append_len (res, c, p - c);
  1100. /*
  1101. * Check any spaces that are appended to the result
  1102. * before folding
  1103. */
  1104. last = &res->str[res->len - 1];
  1105. while (g_ascii_isspace (*last)) {
  1106. last --;
  1107. nspaces ++;
  1108. res->len --;
  1109. }
  1110. }
  1111. switch (how) {
  1112. case RSPAMD_TASK_NEWLINES_LF:
  1113. g_string_append_len (res, "\n\t", 2);
  1114. break;
  1115. case RSPAMD_TASK_NEWLINES_CR:
  1116. g_string_append_len (res, "\r\t", 2);
  1117. break;
  1118. case RSPAMD_TASK_NEWLINES_CRLF:
  1119. default:
  1120. g_string_append_len (res, "\r\n\t", 3);
  1121. break;
  1122. }
  1123. /* Skip space if needed */
  1124. if (g_ascii_isspace (*p)) {
  1125. p ++;
  1126. }
  1127. /* Move leftover spaces */
  1128. while (nspaces) {
  1129. g_string_append_c (res, ' ');
  1130. nspaces --;
  1131. }
  1132. cur_len = 0;
  1133. }
  1134. else {
  1135. const gchar *last;
  1136. /* Skip space if needed */
  1137. if (g_ascii_isspace (*c) && p > c) {
  1138. c ++;
  1139. }
  1140. /* Avoid double folding */
  1141. last = &res->str[res->len - 1];
  1142. last --;
  1143. if (*last != '\r' && *last != '\n') {
  1144. last ++;
  1145. while (g_ascii_isspace (*last)) {
  1146. last --;
  1147. nspaces ++;
  1148. res->len --;
  1149. }
  1150. switch (how) {
  1151. case RSPAMD_TASK_NEWLINES_LF:
  1152. g_string_append_len (res, "\n\t", 2);
  1153. break;
  1154. case RSPAMD_TASK_NEWLINES_CR:
  1155. g_string_append_len (res, "\r\t", 2);
  1156. break;
  1157. case RSPAMD_TASK_NEWLINES_CRLF:
  1158. default:
  1159. g_string_append_len (res, "\r\n\t", 3);
  1160. break;
  1161. }
  1162. }
  1163. /* Move leftover spaces */
  1164. cur_len = nspaces;
  1165. while (nspaces) {
  1166. g_string_append_c (res, ' ');
  1167. nspaces --;
  1168. }
  1169. if (p > c) {
  1170. g_string_append_len (res, c, p - c);
  1171. cur_len += p - c;
  1172. }
  1173. else {
  1174. cur_len = 0;
  1175. }
  1176. }
  1177. first_token = TRUE;
  1178. c = p;
  1179. state = next_state;
  1180. break;
  1181. case read_quoted:
  1182. if (p != c && *p == '"') {
  1183. state = after_quote;
  1184. }
  1185. p ++;
  1186. cur_len ++;
  1187. break;
  1188. case after_quote:
  1189. state = read_token;
  1190. /* Skip one more character after the quote */
  1191. p ++;
  1192. cur_len ++;
  1193. g_string_append_len (res, c, p - c);
  1194. c = p;
  1195. first_token = TRUE;
  1196. break;
  1197. }
  1198. }
  1199. /* Last token */
  1200. switch (state) {
  1201. case read_token:
  1202. if (!fold_on_chars && cur_len > fold_max && !first_token) {
  1203. if (g_ascii_isspace (*c)) {
  1204. c ++;
  1205. }
  1206. switch (how) {
  1207. case RSPAMD_TASK_NEWLINES_LF:
  1208. g_string_append_len (res, "\n\t", 2);
  1209. break;
  1210. case RSPAMD_TASK_NEWLINES_CR:
  1211. g_string_append_len (res, "\r\t", 2);
  1212. break;
  1213. case RSPAMD_TASK_NEWLINES_CRLF:
  1214. default:
  1215. g_string_append_len (res, "\r\n\t", 3);
  1216. break;
  1217. }
  1218. g_string_append_len (res, c, p - c);
  1219. }
  1220. else {
  1221. g_string_append_len (res, c, p - c);
  1222. }
  1223. break;
  1224. case read_quoted:
  1225. case after_quote:
  1226. g_string_append_len (res, c, p - c);
  1227. break;
  1228. case fold_token:
  1229. /* Here, we have token start at 'c' and token end at 'p' */
  1230. if (g_ascii_isspace (res->str[res->len - 1])) {
  1231. g_string_append_len (res, c, p - c);
  1232. }
  1233. else {
  1234. if (*c != '\r' && *c != '\n') {
  1235. /* We need to add folding as well */
  1236. switch (how) {
  1237. case RSPAMD_TASK_NEWLINES_LF:
  1238. g_string_append_len (res, "\n\t", 2);
  1239. break;
  1240. case RSPAMD_TASK_NEWLINES_CR:
  1241. g_string_append_len (res, "\r\t", 2);
  1242. break;
  1243. case RSPAMD_TASK_NEWLINES_CRLF:
  1244. default:
  1245. g_string_append_len (res, "\r\n\t", 3);
  1246. break;
  1247. }
  1248. g_string_append_len (res, c, p - c);
  1249. }
  1250. else {
  1251. g_string_append_len (res, c, p - c);
  1252. }
  1253. }
  1254. break;
  1255. default:
  1256. g_assert (p == c);
  1257. break;
  1258. }
  1259. return res;
  1260. }
  1261. static inline bool rspamd_substring_cmp_func (guchar a, guchar b) { return a == b; }
  1262. static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) { return lc_map[a] == lc_map[b]; }
  1263. typedef bool (*rspamd_cmpchar_func_t) (guchar a, guchar b);
  1264. static inline void
  1265. rspamd_substring_preprocess_kmp (const gchar *pat, gsize len, goffset *fsm,
  1266. rspamd_cmpchar_func_t f)
  1267. {
  1268. goffset i, j;
  1269. i = 0;
  1270. j = -1;
  1271. fsm[0] = -1;
  1272. while (i < len) {
  1273. while (j > -1 && !f(pat[i], pat[j])) {
  1274. j = fsm[j];
  1275. }
  1276. i++;
  1277. j++;
  1278. if (i < len && j < len && f(pat[i], pat[j])) {
  1279. fsm[i] = fsm[j];
  1280. }
  1281. else {
  1282. fsm[i] = j;
  1283. }
  1284. }
  1285. }
  1286. static inline goffset
  1287. rspamd_substring_search_common (const gchar *in, gsize inlen,
  1288. const gchar *srch, gsize srchlen, rspamd_cmpchar_func_t f)
  1289. {
  1290. static goffset st_fsm[128];
  1291. goffset *fsm;
  1292. goffset i, j, k, ell, ret = -1;
  1293. if (G_LIKELY (srchlen < G_N_ELEMENTS (st_fsm))) {
  1294. fsm = st_fsm;
  1295. }
  1296. else {
  1297. fsm = g_malloc ((srchlen + 1) * sizeof (*fsm));
  1298. }
  1299. rspamd_substring_preprocess_kmp (srch, srchlen, fsm, f);
  1300. for (ell = 1; f(srch[ell - 1], srch[ell]); ell++) {}
  1301. if (ell == srchlen) {
  1302. ell = 0;
  1303. }
  1304. /* Searching */
  1305. i = ell;
  1306. j = k = 0;
  1307. while (j <= inlen - srchlen) {
  1308. while (i < srchlen && f(srch[i], in[i + j])) {
  1309. ++i;
  1310. }
  1311. if (i >= srchlen) {
  1312. while (k < ell && f(srch[k], in[j + k])) {
  1313. ++k;
  1314. }
  1315. if (k >= ell) {
  1316. ret = j;
  1317. goto out;
  1318. }
  1319. }
  1320. j += (i - fsm[i]);
  1321. if (i == ell) {
  1322. k = MAX(0, k - 1);
  1323. }
  1324. else {
  1325. if (fsm[i] <= ell) {
  1326. k = MAX(0, fsm[i]);
  1327. i = ell;
  1328. } else {
  1329. k = ell;
  1330. i = fsm[i];
  1331. }
  1332. }
  1333. }
  1334. out:
  1335. if (G_UNLIKELY (srchlen >= G_N_ELEMENTS (st_fsm))) {
  1336. g_free (fsm);
  1337. }
  1338. return ret;
  1339. }
  1340. goffset
  1341. rspamd_substring_search (const gchar *in, gsize inlen,
  1342. const gchar *srch, gsize srchlen)
  1343. {
  1344. if (inlen > srchlen) {
  1345. if (G_UNLIKELY (srchlen == 1)) {
  1346. const gchar *p;
  1347. p = memchr (in, srch[0], inlen);
  1348. if (p) {
  1349. return p - in;
  1350. }
  1351. return (-1);
  1352. }
  1353. return rspamd_substring_search_common (in, inlen, srch, srchlen,
  1354. rspamd_substring_cmp_func);
  1355. }
  1356. else if (inlen == srchlen) {
  1357. return rspamd_lc_cmp (srch, in, srchlen) == 0;
  1358. }
  1359. else {
  1360. return (-1);
  1361. }
  1362. return (-1);
  1363. }
  1364. goffset
  1365. rspamd_substring_search_caseless (const gchar *in, gsize inlen,
  1366. const gchar *srch, gsize srchlen)
  1367. {
  1368. if (inlen > srchlen) {
  1369. if (G_UNLIKELY (srchlen == 1)) {
  1370. goffset i;
  1371. gchar s = lc_map[(guchar)srch[0]];
  1372. for (i = 0; i < inlen; i++) {
  1373. if (lc_map[(guchar)in[i]] == s) {
  1374. return i;
  1375. }
  1376. }
  1377. return (-1);
  1378. }
  1379. return rspamd_substring_search_common (in, inlen, srch, srchlen,
  1380. rspamd_substring_casecmp_func);
  1381. }
  1382. else if (inlen == srchlen) {
  1383. return rspamd_lc_cmp (srch, in, srchlen) == 0 ? 0 : (-1);
  1384. }
  1385. return (-1);
  1386. }
  1387. goffset
  1388. rspamd_string_find_eoh (GString *input, goffset *body_start)
  1389. {
  1390. const gchar *p, *c = NULL, *end;
  1391. enum {
  1392. skip_char = 0,
  1393. got_cr,
  1394. got_lf,
  1395. got_linebreak,
  1396. got_linebreak_cr,
  1397. got_linebreak_lf,
  1398. obs_fws
  1399. } state = skip_char;
  1400. g_assert (input != NULL);
  1401. p = input->str;
  1402. end = p + input->len;
  1403. while (p < end) {
  1404. switch (state) {
  1405. case skip_char:
  1406. if (*p == '\r') {
  1407. p++;
  1408. state = got_cr;
  1409. }
  1410. else if (*p == '\n') {
  1411. p++;
  1412. state = got_lf;
  1413. }
  1414. else {
  1415. p++;
  1416. }
  1417. break;
  1418. case got_cr:
  1419. if (*p == '\r') {
  1420. /*
  1421. * Double \r\r, so need to check the current char
  1422. * if it is '\n', then we have \r\r\n sequence, that is NOT
  1423. * double end of line
  1424. */
  1425. if (p < end && p[1] == '\n') {
  1426. p++;
  1427. state = got_lf;
  1428. }
  1429. else {
  1430. /* We have \r\r[^\n] */
  1431. if (body_start) {
  1432. *body_start = p - input->str + 1;
  1433. }
  1434. return p - input->str;
  1435. }
  1436. }
  1437. else if (*p == '\n') {
  1438. p++;
  1439. state = got_lf;
  1440. }
  1441. else if (g_ascii_isspace (*p)) {
  1442. /* We have \r<space>*, allow to stay in this state */
  1443. c = p;
  1444. p ++;
  1445. state = obs_fws;
  1446. }
  1447. else {
  1448. p++;
  1449. state = skip_char;
  1450. }
  1451. break;
  1452. case got_lf:
  1453. if (*p == '\n') {
  1454. /* We have \n\n, which is obviously end of headers */
  1455. if (body_start) {
  1456. *body_start = p - input->str + 1;
  1457. }
  1458. return p - input->str;
  1459. }
  1460. else if (*p == '\r') {
  1461. state = got_linebreak;
  1462. }
  1463. else if (g_ascii_isspace (*p)) {
  1464. /* We have \n<space>*, allow to stay in this state */
  1465. c = p;
  1466. p ++;
  1467. state = obs_fws;
  1468. }
  1469. else {
  1470. p++;
  1471. state = skip_char;
  1472. }
  1473. break;
  1474. case got_linebreak:
  1475. if (*p == '\r') {
  1476. c = p;
  1477. p++;
  1478. state = got_linebreak_cr;
  1479. }
  1480. else if (*p == '\n') {
  1481. c = p;
  1482. p++;
  1483. state = got_linebreak_lf;
  1484. }
  1485. else if (g_ascii_isspace (*p)) {
  1486. /* We have <linebreak><space>*, allow to stay in this state */
  1487. c = p;
  1488. p ++;
  1489. state = obs_fws;
  1490. }
  1491. else {
  1492. p++;
  1493. state = skip_char;
  1494. }
  1495. break;
  1496. case got_linebreak_cr:
  1497. if (*p == '\r') {
  1498. /* Got double \r\r after \n, so does not treat it as EOH */
  1499. state = got_linebreak_cr;
  1500. p++;
  1501. }
  1502. else if (*p == '\n') {
  1503. state = got_linebreak_lf;
  1504. p++;
  1505. }
  1506. else if (g_ascii_isspace (*p)) {
  1507. /* We have \r\n<space>*, allow to keep in this state */
  1508. c = p;
  1509. state = obs_fws;
  1510. p ++;
  1511. }
  1512. else {
  1513. p++;
  1514. state = skip_char;
  1515. }
  1516. break;
  1517. case got_linebreak_lf:
  1518. g_assert (c != NULL);
  1519. if (body_start) {
  1520. /* \r\n\r\n */
  1521. *body_start = p - input->str;
  1522. }
  1523. return c - input->str;
  1524. case obs_fws:
  1525. if (*p == ' ' || *p == '\t') {
  1526. p ++;
  1527. }
  1528. else if (*p == '\r') {
  1529. /* Perform lookahead due to #2349 */
  1530. if (end - p > 2) {
  1531. if (p[1] == '\n' && g_ascii_isspace (p[2])) {
  1532. /* Real obs_fws state, switch */
  1533. c = p;
  1534. p ++;
  1535. state = got_cr;
  1536. }
  1537. else if (g_ascii_isspace (p[1])) {
  1538. p ++;
  1539. state = obs_fws;
  1540. }
  1541. else {
  1542. /*
  1543. * newline wsp+ \r <nwsp>, hence:
  1544. * c -> eoh
  1545. * p + 1 -> body start
  1546. */
  1547. if (body_start) {
  1548. /* \r\n\r\n */
  1549. *body_start = p - input->str + 1;
  1550. }
  1551. return c - input->str;
  1552. }
  1553. }
  1554. else {
  1555. /* shortage */
  1556. if (body_start) {
  1557. *body_start = p - input->str + 1;
  1558. }
  1559. return p - input->str;
  1560. }
  1561. }
  1562. else if (*p == '\n') {
  1563. /* Perform lookahead due to #2349 */
  1564. if (end - p > 1) {
  1565. if (p[1] == ' ' || p[1] == '\t') {
  1566. c = p;
  1567. p ++;
  1568. state = obs_fws;
  1569. }
  1570. else if (p[1] == '\r') {
  1571. c = p;
  1572. p ++;
  1573. state = got_lf;
  1574. }
  1575. else if (p[1] == '\n') {
  1576. c = p;
  1577. p ++;
  1578. state = got_lf;
  1579. }
  1580. else {
  1581. /*
  1582. * newline wsp+ \n <nwsp>, hence:
  1583. * c -> eoh
  1584. * p + 1 -> body start
  1585. */
  1586. if (body_start) {
  1587. /* \r\n\r\n */
  1588. *body_start = p - input->str + 1;
  1589. }
  1590. return c - input->str;
  1591. }
  1592. }
  1593. else {
  1594. /* shortage */
  1595. if (body_start) {
  1596. *body_start = p - input->str + 1;
  1597. }
  1598. return p - input->str;
  1599. }
  1600. }
  1601. else {
  1602. p++;
  1603. state = skip_char;
  1604. }
  1605. break;
  1606. }
  1607. }
  1608. if (state == got_linebreak_lf) {
  1609. if (body_start) {
  1610. /* \r\n\r\n */
  1611. *body_start = p - input->str;
  1612. }
  1613. return c - input->str;
  1614. }
  1615. return -1;
  1616. }
  1617. gint
  1618. rspamd_encode_hex_buf (const guchar *in, gsize inlen, gchar *out,
  1619. gsize outlen)
  1620. {
  1621. gchar *o, *end;
  1622. const guchar *p;
  1623. static const gchar hexdigests[16] = "0123456789abcdef";
  1624. end = out + outlen;
  1625. o = out;
  1626. p = in;
  1627. while (inlen > 0 && o < end - 1) {
  1628. *o++ = hexdigests[((*p >> 4) & 0xF)];
  1629. *o++ = hexdigests[((*p++) & 0xF)];
  1630. inlen --;
  1631. }
  1632. if (o <= end) {
  1633. return (o - out);
  1634. }
  1635. return -1;
  1636. }
  1637. gchar *
  1638. rspamd_encode_hex (const guchar *in, gsize inlen)
  1639. {
  1640. gchar *out;
  1641. gsize outlen = inlen * 2 + 1;
  1642. gint olen;
  1643. if (in == NULL) {
  1644. return NULL;
  1645. }
  1646. out = g_malloc (outlen);
  1647. olen = rspamd_encode_hex_buf (in, inlen, out, outlen - 1);
  1648. if (olen >= 0) {
  1649. out[olen] = '\0';
  1650. }
  1651. else {
  1652. g_free (out);
  1653. return NULL;
  1654. }
  1655. return out;
  1656. }
  1657. gssize
  1658. rspamd_decode_hex_buf (const gchar *in, gsize inlen,
  1659. guchar *out, gsize outlen)
  1660. {
  1661. guchar *o, *end, ret = 0;
  1662. const gchar *p;
  1663. gchar c;
  1664. end = out + outlen;
  1665. o = out;
  1666. p = in;
  1667. /* We ignore trailing chars if we have not even input */
  1668. inlen = inlen - inlen % 2;
  1669. while (inlen > 1 && o < end) {
  1670. c = *p++;
  1671. if (c >= '0' && c <= '9') ret = c - '0';
  1672. else if (c >= 'A' && c <= 'F') ret = c - 'A' + 10;
  1673. else if (c >= 'a' && c <= 'f') ret = c - 'a' + 10;
  1674. c = *p++;
  1675. ret *= 16;
  1676. if (c >= '0' && c <= '9') ret += c - '0';
  1677. else if (c >= 'A' && c <= 'F') ret += c - 'A' + 10;
  1678. else if (c >= 'a' && c <= 'f') ret += c - 'a' + 10;
  1679. *o++ = ret;
  1680. inlen -= 2;
  1681. }
  1682. if (o <= end) {
  1683. return (o - out);
  1684. }
  1685. return -1;
  1686. }
  1687. guchar*
  1688. rspamd_decode_hex (const gchar *in, gsize inlen)
  1689. {
  1690. guchar *out;
  1691. gsize outlen = (inlen / 2 + inlen % 2) + 1;
  1692. gint olen;
  1693. if (in == NULL) {
  1694. return NULL;
  1695. }
  1696. out = g_malloc (outlen);
  1697. olen = rspamd_decode_hex_buf (in, inlen, out, outlen - 1);
  1698. if (olen >= 0) {
  1699. out[olen] = '\0';
  1700. return out;
  1701. }
  1702. g_free (out);
  1703. return NULL;
  1704. }
  1705. gssize
  1706. rspamd_decode_qp_buf (const gchar *in, gsize inlen,
  1707. gchar *out, gsize outlen)
  1708. {
  1709. gchar *o, *end, *pos, c;
  1710. const gchar *p;
  1711. guchar ret;
  1712. gssize remain, processed;
  1713. p = in;
  1714. o = out;
  1715. end = out + outlen;
  1716. remain = inlen;
  1717. while (remain > 0 && o < end) {
  1718. if (*p == '=') {
  1719. p ++;
  1720. remain --;
  1721. if (remain == 0) {
  1722. if (end - o > 0) {
  1723. *o++ = *p;
  1724. break;
  1725. }
  1726. }
  1727. decode:
  1728. /* Decode character after '=' */
  1729. c = *p++;
  1730. remain --;
  1731. ret = 0;
  1732. if (c >= '0' && c <= '9') { ret = c - '0'; }
  1733. else if (c >= 'A' && c <= 'F') { ret = c - 'A' + 10; }
  1734. else if (c >= 'a' && c <= 'f') { ret = c - 'a' + 10; }
  1735. else if (c == '\r' || c == '\n') {
  1736. /* Soft line break */
  1737. while (remain > 0 && (*p == '\r' || *p == '\n')) {
  1738. remain --;
  1739. p ++;
  1740. }
  1741. continue;
  1742. }
  1743. else {
  1744. /* Hack, hack, hack, treat =<garbadge> as =<garbadge> */
  1745. if (remain > 0) {
  1746. *o++ = *(p - 1);
  1747. }
  1748. continue;
  1749. }
  1750. if (remain > 0) {
  1751. c = *p++;
  1752. ret *= 16;
  1753. if (c >= '0' && c <= '9') { ret += c - '0'; }
  1754. else if (c >= 'A' && c <= 'F') { ret += c - 'A' + 10; }
  1755. else if (c >= 'a' && c <= 'f') { ret += c - 'a' + 10; }
  1756. if (end - o > 0) {
  1757. *o++ = (gchar)ret;
  1758. }
  1759. else {
  1760. return (-1);
  1761. }
  1762. remain --;
  1763. }
  1764. }
  1765. else {
  1766. if (end - o >= remain) {
  1767. if ((pos = memccpy (o, p, '=', remain)) == NULL) {
  1768. /* All copied */
  1769. o += remain;
  1770. break;
  1771. }
  1772. else {
  1773. processed = pos - o;
  1774. remain -= processed;
  1775. p += processed;
  1776. o = pos - 1;
  1777. /* Skip comparison, as we know that we have found match */
  1778. goto decode;
  1779. }
  1780. }
  1781. else {
  1782. /* Buffer overflow */
  1783. return (-1);
  1784. }
  1785. }
  1786. }
  1787. return (o - out);
  1788. }
  1789. #define BITOP(a,b,op) \
  1790. ((a)[(gsize)(b)/(8*sizeof *(a))] op (gsize)1<<((gsize)(b)%(8*sizeof *(a))))
  1791. gsize
  1792. rspamd_memcspn (const gchar *s, const gchar *e, gsize len)
  1793. {
  1794. gsize byteset[32 / sizeof(gsize)];
  1795. const gchar *p = s, *end = s + len;
  1796. if (!e[1]) {
  1797. for (; p < end && *p != *e; p++);
  1798. return p - s;
  1799. }
  1800. memset (byteset, 0, sizeof byteset);
  1801. for (; *e && BITOP (byteset, *(guchar *)e, |=); e++);
  1802. for (; p < end && !BITOP (byteset, *(guchar *)p, &); p++);
  1803. return p - s;
  1804. }
  1805. gsize
  1806. rspamd_memspn (const gchar *s, const gchar *e, gsize len)
  1807. {
  1808. gsize byteset[32 / sizeof(gsize)];
  1809. const gchar *p = s, *end = s + len;
  1810. if (!e[1]) {
  1811. for (; p < end && *p == *e; p++);
  1812. return p - s;
  1813. }
  1814. memset (byteset, 0, sizeof byteset);
  1815. for (; *e && BITOP (byteset, *(guchar *)e, |=); e++);
  1816. for (; p < end && BITOP (byteset, *(guchar *)p, &); p++);
  1817. return p - s;
  1818. }
  1819. gssize
  1820. rspamd_decode_qp2047_buf (const gchar *in, gsize inlen,
  1821. gchar *out, gsize outlen)
  1822. {
  1823. gchar *o, *end, c;
  1824. const gchar *p;
  1825. guchar ret;
  1826. gsize remain, processed;
  1827. p = in;
  1828. o = out;
  1829. end = out + outlen;
  1830. remain = inlen;
  1831. while (remain > 0 && o < end) {
  1832. if (*p == '=') {
  1833. p ++;
  1834. remain --;
  1835. if (remain == 0) {
  1836. if (end - o > 0) {
  1837. *o++ = *p;
  1838. break;
  1839. }
  1840. }
  1841. decode:
  1842. /* Decode character after '=' */
  1843. c = *p++;
  1844. remain --;
  1845. ret = 0;
  1846. if (c >= '0' && c <= '9') { ret = c - '0'; }
  1847. else if (c >= 'A' && c <= 'F') { ret = c - 'A' + 10; }
  1848. else if (c >= 'a' && c <= 'f') { ret = c - 'a' + 10; }
  1849. else if (c == '\r' || c == '\n') {
  1850. /* Soft line break */
  1851. while (remain > 0 && (*p == '\r' || *p == '\n')) {
  1852. remain --;
  1853. p ++;
  1854. }
  1855. continue;
  1856. }
  1857. if (remain > 0) {
  1858. c = *p++;
  1859. ret *= 16;
  1860. if (c >= '0' && c <= '9') { ret += c - '0'; }
  1861. else if (c >= 'A' && c <= 'F') { ret += c - 'A' + 10; }
  1862. else if (c >= 'a' && c <= 'f') { ret += c - 'a' + 10; }
  1863. if (end - o > 0) {
  1864. *o++ = (gchar)ret;
  1865. }
  1866. else {
  1867. return (-1);
  1868. }
  1869. remain --;
  1870. }
  1871. }
  1872. else {
  1873. if (end - o >= remain) {
  1874. processed = rspamd_memcspn (p, "=_", remain);
  1875. memcpy (o, p, processed);
  1876. o += processed;
  1877. if (processed == remain) {
  1878. break;
  1879. }
  1880. else {
  1881. remain -= processed;
  1882. p += processed;
  1883. if (G_LIKELY (*p == '=')) {
  1884. p ++;
  1885. /* Skip comparison, as we know that we have found match */
  1886. remain --;
  1887. goto decode;
  1888. }
  1889. else {
  1890. *o++ = ' ';
  1891. p ++;
  1892. remain --;
  1893. }
  1894. }
  1895. }
  1896. else {
  1897. /* Buffer overflow */
  1898. return (-1);
  1899. }
  1900. }
  1901. }
  1902. return (o - out);
  1903. }
  1904. gssize
  1905. rspamd_encode_qp2047_buf (const gchar *in, gsize inlen,
  1906. gchar *out, gsize outlen)
  1907. {
  1908. gchar *o = out, *end = out + outlen, c;
  1909. static const gchar hexdigests[16] = "0123456789ABCDEF";
  1910. while (inlen > 0 && o < end) {
  1911. c = *in;
  1912. if (g_ascii_isalnum (c)) {
  1913. *o++ = c;
  1914. }
  1915. else if (c == ' ') {
  1916. *o++ = '_';
  1917. }
  1918. else if (end - o >= 3){
  1919. *o++ = '=';
  1920. *o++ = hexdigests[((c >> 4) & 0xF)];
  1921. *o++ = hexdigests[(c & 0xF)];
  1922. }
  1923. else {
  1924. return (-1);
  1925. }
  1926. in ++;
  1927. inlen --;
  1928. }
  1929. if (inlen != 0) {
  1930. return (-1);
  1931. }
  1932. return (o - out);
  1933. }
  1934. /*
  1935. * GString ucl emitting functions
  1936. */
  1937. static int
  1938. rspamd_gstring_append_character (unsigned char c, size_t len, void *ud)
  1939. {
  1940. GString *buf = ud;
  1941. gsize old_len;
  1942. if (len == 1) {
  1943. g_string_append_c (buf, c);
  1944. }
  1945. else {
  1946. if (buf->allocated_len - buf->len <= len) {
  1947. old_len = buf->len;
  1948. g_string_set_size (buf, buf->len + len + 1);
  1949. buf->len = old_len;
  1950. }
  1951. memset (&buf->str[buf->len], c, len);
  1952. buf->len += len;
  1953. }
  1954. return 0;
  1955. }
  1956. static int
  1957. rspamd_gstring_append_len (const unsigned char *str, size_t len, void *ud)
  1958. {
  1959. GString *buf = ud;
  1960. g_string_append_len (buf, str, len);
  1961. return 0;
  1962. }
  1963. static int
  1964. rspamd_gstring_append_int (int64_t val, void *ud)
  1965. {
  1966. GString *buf = ud;
  1967. rspamd_printf_gstring (buf, "%L", (intmax_t) val);
  1968. return 0;
  1969. }
  1970. static int
  1971. rspamd_gstring_append_double (double val, void *ud)
  1972. {
  1973. GString *buf = ud;
  1974. const double delta = 0.0000001;
  1975. if (isfinite (val)) {
  1976. if (val == (double) (int) val) {
  1977. rspamd_printf_gstring (buf, "%.1f", val);
  1978. } else if (fabs (val - (double) (int) val) < delta) {
  1979. /* Write at maximum precision */
  1980. rspamd_printf_gstring (buf, "%.*g", DBL_DIG, val);
  1981. } else {
  1982. rspamd_printf_gstring (buf, "%f", val);
  1983. }
  1984. }
  1985. else {
  1986. rspamd_printf_gstring (buf, "null");
  1987. }
  1988. return 0;
  1989. }
  1990. void
  1991. rspamd_ucl_emit_gstring_comments (const ucl_object_t *obj,
  1992. enum ucl_emitter emit_type,
  1993. GString *target,
  1994. const ucl_object_t *comments)
  1995. {
  1996. struct ucl_emitter_functions func = {
  1997. .ucl_emitter_append_character = rspamd_gstring_append_character,
  1998. .ucl_emitter_append_len = rspamd_gstring_append_len,
  1999. .ucl_emitter_append_int = rspamd_gstring_append_int,
  2000. .ucl_emitter_append_double = rspamd_gstring_append_double
  2001. };
  2002. func.ud = target;
  2003. ucl_object_emit_full (obj, emit_type, &func, comments);
  2004. }
  2005. /*
  2006. * FString ucl emitting functions
  2007. */
  2008. static int
  2009. rspamd_fstring_emit_append_character (unsigned char c, size_t len, void *ud)
  2010. {
  2011. rspamd_fstring_t **buf = ud;
  2012. *buf = rspamd_fstring_append_chars (*buf, c, len);
  2013. return 0;
  2014. }
  2015. static int
  2016. rspamd_fstring_emit_append_len (const unsigned char *str, size_t len, void *ud)
  2017. {
  2018. rspamd_fstring_t **buf = ud;
  2019. *buf = rspamd_fstring_append (*buf, str, len);
  2020. return 0;
  2021. }
  2022. static int
  2023. rspamd_fstring_emit_append_int (int64_t val, void *ud)
  2024. {
  2025. rspamd_fstring_t **buf = ud;
  2026. rspamd_printf_fstring (buf, "%L", (intmax_t) val);
  2027. return 0;
  2028. }
  2029. static int
  2030. rspamd_fstring_emit_append_double (double val, void *ud)
  2031. {
  2032. rspamd_fstring_t **buf = ud;
  2033. #define MAX_PRECISION 6
  2034. if (isfinite (val)) {
  2035. if (val == (double) ((gint) val)) {
  2036. rspamd_printf_fstring (buf, "%.1f", val);
  2037. } else {
  2038. rspamd_printf_fstring (buf, "%." G_STRINGIFY (MAX_PRECISION) "f",
  2039. val);
  2040. }
  2041. }
  2042. else {
  2043. rspamd_printf_fstring (buf, "null");
  2044. }
  2045. return 0;
  2046. }
  2047. void
  2048. rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
  2049. enum ucl_emitter emit_type,
  2050. rspamd_fstring_t **buf,
  2051. const ucl_object_t *comments)
  2052. {
  2053. struct ucl_emitter_functions func = {
  2054. .ucl_emitter_append_character = rspamd_fstring_emit_append_character,
  2055. .ucl_emitter_append_len = rspamd_fstring_emit_append_len,
  2056. .ucl_emitter_append_int = rspamd_fstring_emit_append_int,
  2057. .ucl_emitter_append_double = rspamd_fstring_emit_append_double
  2058. };
  2059. func.ud = buf;
  2060. ucl_object_emit_full (obj, emit_type, &func, comments);
  2061. }
  2062. const void *
  2063. rspamd_memrchr (const void *m, gint c, gsize len)
  2064. {
  2065. const guint8 *p = m;
  2066. gsize i;
  2067. for (i = len; i > 0; i --) {
  2068. if (p[i - 1] == c) {
  2069. return p + i - 1;
  2070. }
  2071. }
  2072. return NULL;
  2073. }
  2074. struct UConverter *
  2075. rspamd_get_utf8_converter (void)
  2076. {
  2077. static UConverter *utf8_conv = NULL;
  2078. UErrorCode uc_err = U_ZERO_ERROR;
  2079. if (utf8_conv == NULL) {
  2080. utf8_conv = ucnv_open ("UTF-8", &uc_err);
  2081. if (!U_SUCCESS (uc_err)) {
  2082. msg_err ("FATAL error: cannot open converter for utf8: %s",
  2083. u_errorName (uc_err));
  2084. g_assert_not_reached ();
  2085. }
  2086. ucnv_setFromUCallBack (utf8_conv,
  2087. UCNV_FROM_U_CALLBACK_SUBSTITUTE,
  2088. NULL,
  2089. NULL,
  2090. NULL,
  2091. &uc_err);
  2092. ucnv_setToUCallBack (utf8_conv,
  2093. UCNV_TO_U_CALLBACK_SUBSTITUTE,
  2094. NULL,
  2095. NULL,
  2096. NULL,
  2097. &uc_err);
  2098. }
  2099. return utf8_conv;
  2100. }
  /*
   * Returns the "nfkc" compose-mode normalizer instance, fetched from ICU on
   * the first call and cached in a function-static pointer.  Returns NULL when
   * built against ICU older than 44.
   */
  const struct UNormalizer2 *
  rspamd_get_unicode_normalizer (void)
  {
  #if U_ICU_VERSION_MAJOR_NUM >= 44
      static const UNormalizer2 *norm = NULL;

      if (norm == NULL) {
          UErrorCode uc_err = U_ZERO_ERROR;

          norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
          g_assert (U_SUCCESS (uc_err));
      }

      return norm;
  #else
      /* Old libicu */
      return NULL;
  #endif
  }
  2117. enum rspamd_normalise_result
  2118. rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
  2119. guint *len)
  2120. {
  2121. #if U_ICU_VERSION_MAJOR_NUM >= 44
  2122. UErrorCode uc_err = U_ZERO_ERROR;
  2123. UConverter *utf8_conv = rspamd_get_utf8_converter ();
  2124. const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
  2125. gint32 nsym, end;
  2126. UChar *src = NULL, *dest = NULL;
  2127. enum rspamd_normalise_result ret = 0;
  2128. gboolean has_invisible = FALSE;
  2129. /* We first need to convert data to UChars :( */
  2130. src = g_malloc ((*len + 1) * sizeof (*src));
  2131. nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
  2132. start, *len, &uc_err);
  2133. if (!U_SUCCESS (uc_err)) {
  2134. msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
  2135. u_errorName (uc_err));
  2136. ret |= RSPAMD_UNICODE_NORM_ERROR;
  2137. goto out;
  2138. }
  2139. /* We can now check if we need to decompose */
  2140. end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
  2141. if (!U_SUCCESS (uc_err)) {
  2142. msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
  2143. u_errorName (uc_err));
  2144. ret |= RSPAMD_UNICODE_NORM_ERROR;
  2145. goto out;
  2146. }
  2147. for (gint32 i = 0; i < nsym; i ++) {
  2148. if (IS_ZERO_WIDTH_SPACE (src[i])) {
  2149. has_invisible = TRUE;
  2150. break;
  2151. }
  2152. }
  2153. uc_err = U_ZERO_ERROR;
  2154. if (end != nsym) {
  2155. /* No normalisation needed, but we may still have invisible spaces */
  2156. /* We copy sub(src, 0, end) to dest and normalise the rest */
  2157. ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
  2158. dest = g_malloc (nsym * sizeof (*dest));
  2159. memcpy (dest, src, end * sizeof (*dest));
  2160. nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
  2161. src + end, nsym - end, &uc_err);
  2162. if (!U_SUCCESS (uc_err)) {
  2163. if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
  2164. msg_warn_pool_check ("cannot normalise URL: %s",
  2165. u_errorName (uc_err));
  2166. ret |= RSPAMD_UNICODE_NORM_ERROR;
  2167. }
  2168. goto out;
  2169. }
  2170. }
  2171. else if (!has_invisible) {
  2172. goto out;
  2173. }
  2174. else {
  2175. dest = src;
  2176. src = NULL;
  2177. }
  2178. if (has_invisible) {
  2179. /* Also filter zero width spaces */
  2180. gint32 new_len = 0;
  2181. UChar *t = dest, *h = dest;
  2182. ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
  2183. for (gint32 i = 0; i < nsym; i ++) {
  2184. if (!IS_ZERO_WIDTH_SPACE (*h)) {
  2185. *t++ = *h++;
  2186. new_len ++;
  2187. }
  2188. else {
  2189. h ++;
  2190. }
  2191. }
  2192. nsym = new_len;
  2193. }
  2194. /* We now convert it back to utf */
  2195. nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
  2196. if (!U_SUCCESS (uc_err)) {
  2197. msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
  2198. " input length: %d chars, unicode length: %d utf16 symbols",
  2199. u_errorName (uc_err), (gint)*len, (gint)nsym);
  2200. if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
  2201. ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
  2202. }
  2203. else {
  2204. ret |= RSPAMD_UNICODE_NORM_ERROR;
  2205. }
  2206. goto out;
  2207. }
  2208. *len = nsym;
  2209. out:
  2210. if (src) {
  2211. g_free (src);
  2212. }
  2213. if (dest) {
  2214. g_free (dest);
  2215. }
  2216. return ret;
  2217. #else
  2218. /* Kill that with fire please */
  2219. return FALSE;
  2220. #endif
  2221. }
  2222. gchar *
  2223. rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
  2224. gsize *dst_len, enum rspamd_regexp_escape_flags flags)
  2225. {
  2226. const gchar *p, *end = pattern + slen;
  2227. gchar *res, *d, t, *tmp_utf = NULL, *dend;
  2228. gsize len;
  2229. static const gchar hexdigests[16] = "0123456789abcdef";
  2230. len = slen;
  2231. p = pattern;
  2232. /* [-[\]{}()*+?.,\\^$|#\s] need to be escaped */
  2233. while (p < end) {
  2234. t = *p ++;
  2235. switch (t) {
  2236. case '[':
  2237. case ']':
  2238. case '-':
  2239. case '\\':
  2240. case '{':
  2241. case '}':
  2242. case '(':
  2243. case ')':
  2244. case '*':
  2245. case '+':
  2246. case '?':
  2247. case '.':
  2248. case ',':
  2249. case '^':
  2250. case '$':
  2251. case '|':
  2252. case '#':
  2253. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2254. len++;
  2255. }
  2256. break;
  2257. default:
  2258. if (g_ascii_isspace (t)) {
  2259. len ++;
  2260. }
  2261. else {
  2262. if (!g_ascii_isprint (t) || (t & 0x80)) {
  2263. if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
  2264. /* \x{code}, where code can be up to 5 digits */
  2265. len += 4;
  2266. }
  2267. else {
  2268. /* \\xHH -> 4 symbols */
  2269. len += 3;
  2270. }
  2271. }
  2272. }
  2273. break;
  2274. }
  2275. }
  2276. if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
  2277. if (!g_utf8_validate (pattern, slen, NULL)) {
  2278. tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
  2279. }
  2280. }
  2281. if (slen == len) {
  2282. if (dst_len) {
  2283. if (tmp_utf) {
  2284. slen = strlen (tmp_utf);
  2285. }
  2286. *dst_len = slen;
  2287. }
  2288. if (tmp_utf) {
  2289. return tmp_utf;
  2290. }
  2291. else {
  2292. return g_strdup (pattern);
  2293. }
  2294. }
  2295. if (tmp_utf) {
  2296. pattern = tmp_utf;
  2297. }
  2298. res = g_malloc (len + 1);
  2299. p = pattern;
  2300. d = res;
  2301. dend = d + len;
  2302. while (p < end) {
  2303. g_assert (d < dend);
  2304. t = *p ++;
  2305. switch (t) {
  2306. case '[':
  2307. case ']':
  2308. case '-':
  2309. case '\\':
  2310. case '{':
  2311. case '}':
  2312. case '(':
  2313. case ')':
  2314. case '.':
  2315. case ',':
  2316. case '^':
  2317. case '$':
  2318. case '|':
  2319. case '#':
  2320. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2321. *d++ = '\\';
  2322. }
  2323. break;
  2324. case '*':
  2325. case '?':
  2326. case '+':
  2327. if (flags & RSPAMD_REGEXP_ESCAPE_GLOB) {
  2328. /* Treat * as .* and ? as .? */
  2329. *d++ = '.';
  2330. }
  2331. else {
  2332. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2333. *d++ = '\\';
  2334. }
  2335. }
  2336. break;
  2337. default:
  2338. if (g_ascii_isspace (t)) {
  2339. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2340. *d++ = '\\';
  2341. }
  2342. }
  2343. else if (t & 0x80 || !g_ascii_isprint (t)) {
  2344. if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
  2345. *d++ = '\\';
  2346. *d++ = 'x';
  2347. *d++ = hexdigests[((t >> 4) & 0xF)];
  2348. *d++ = hexdigests[((t) & 0xF)];
  2349. continue; /* To avoid *d++ = t; */
  2350. }
  2351. else {
  2352. if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) {
  2353. UChar32 uc;
  2354. gint32 off = p - pattern - 1;
  2355. U8_NEXT (pattern, off, slen, uc);
  2356. if (uc > 0) {
  2357. d += rspamd_snprintf (d, dend - d,
  2358. "\\x{%xd}", uc);
  2359. p = pattern + off;
  2360. }
  2361. continue; /* To avoid *d++ = t; */
  2362. }
  2363. }
  2364. }
  2365. break;
  2366. }
  2367. *d++ = t;
  2368. }
  2369. *d = '\0';
  2370. if (dst_len) {
  2371. *dst_len = d - res;
  2372. }
  2373. if (tmp_utf) {
  2374. g_free (tmp_utf);
  2375. }
  2376. return res;
  2377. }
  2378. gchar *
  2379. rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen)
  2380. {
  2381. GString *dst;
  2382. const gchar *last;
  2383. gchar *dchar;
  2384. gsize valid, prev;
  2385. UChar32 uc;
  2386. gint32 i;
  2387. if (src == NULL) {
  2388. return NULL;
  2389. }
  2390. if (slen == 0) {
  2391. slen = strlen (src);
  2392. }
  2393. dst = g_string_sized_new (slen);
  2394. i = 0;
  2395. last = src;
  2396. valid = 0;
  2397. prev = 0;
  2398. while (i < slen) {
  2399. U8_NEXT (src, i, slen, uc);
  2400. if (uc <= 0) {
  2401. if (valid > 0) {
  2402. g_string_append_len (dst, last, valid);
  2403. }
  2404. /* 0xFFFD in UTF8 */
  2405. g_string_append_len (dst, "\357\277\275", 3);
  2406. valid = 0;
  2407. last = &src[i];
  2408. }
  2409. else {
  2410. valid += i - prev;
  2411. }
  2412. prev = i;
  2413. }
  2414. if (valid > 0) {
  2415. g_string_append_len (dst, last, valid);
  2416. }
  2417. dchar = dst->str;
  2418. if (dstlen) {
  2419. *dstlen = dst->len;
  2420. }
  2421. g_string_free (dst, FALSE);
  2422. return dchar;
  2423. }
  2424. gsize
  2425. rspamd_gstring_strip (GString *s, const gchar *strip_chars)
  2426. {
  2427. const gchar *p, *sc;
  2428. gsize strip_len = 0, total = 0;
  2429. p = s->str + s->len - 1;
  2430. while (p >= s->str) {
  2431. gboolean seen = FALSE;
  2432. sc = strip_chars;
  2433. while (*sc != '\0') {
  2434. if (*p == *sc) {
  2435. strip_len ++;
  2436. seen = TRUE;
  2437. break;
  2438. }
  2439. sc ++;
  2440. }
  2441. if (!seen) {
  2442. break;
  2443. }
  2444. p --;
  2445. }
  2446. if (strip_len > 0) {
  2447. s->len -= strip_len;
  2448. s->str[s->len] = '\0';
  2449. total += strip_len;
  2450. }
  2451. if (s->len > 0) {
  2452. strip_len = rspamd_memspn (s->str, strip_chars, s->len);
  2453. if (strip_len > 0) {
  2454. memmove (s->str, s->str + strip_len, s->len - strip_len);
  2455. s->len -= strip_len;
  2456. total += strip_len;
  2457. }
  2458. }
  2459. return total;
  2460. }
  2461. const gchar* rspamd_string_len_strip (const gchar *in,
  2462. gsize *len,
  2463. const gchar *strip_chars)
  2464. {
  2465. const gchar *p, *sc;
  2466. gsize strip_len = 0, old_len = *len;
  2467. p = in + old_len - 1;
  2468. /* Trail */
  2469. while (p >= in) {
  2470. gboolean seen = FALSE;
  2471. sc = strip_chars;
  2472. while (*sc != '\0') {
  2473. if (*p == *sc) {
  2474. strip_len ++;
  2475. seen = TRUE;
  2476. break;
  2477. }
  2478. sc ++;
  2479. }
  2480. if (!seen) {
  2481. break;
  2482. }
  2483. p --;
  2484. }
  2485. if (strip_len > 0) {
  2486. *len -= strip_len;
  2487. }
  2488. /* Head */
  2489. old_len = *len;
  2490. if (old_len > 0) {
  2491. strip_len = rspamd_memspn (in, strip_chars, old_len);
  2492. if (strip_len > 0) {
  2493. *len -= strip_len;
  2494. return in + strip_len;
  2495. }
  2496. }
  2497. return in;
  2498. }