You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

str_util.c 71KB


  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "cryptobox.h"
  19. #include "url.h"
  20. #include "str_util.h"
  21. #include "logger.h"
  22. #include "contrib/t1ha/t1ha.h"
  23. #include <unicode/uversion.h>
  24. #include <unicode/ucnv.h>
  25. #if U_ICU_VERSION_MAJOR_NUM >= 44
  26. #include <unicode/unorm2.h>
  27. #endif
  28. #include <math.h>
  29. #ifdef __x86_64__
  30. #include <immintrin.h>
  31. #endif
  32. #include "contrib/fastutf8/fastutf8.h"
  /*
   * Byte-wise ASCII lower-casing table, indexed by an unsigned byte value.
   * Entries 0x41..0x5a ('A'..'Z') hold 0x61..0x7a ('a'..'z'); every other
   * entry maps the byte to itself, so bytes >= 0x80 pass through untouched.
   */
  const guchar lc_map[256] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
  0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
  0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
  0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
  0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
  0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
  0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
  0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
  0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
  0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
  0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
  0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
  0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
  0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
  0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
  0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
  };
  67. guint
  68. rspamd_str_lc (gchar *str, guint size)
  69. {
  70. guint leftover = size % 4;
  71. guint fp, i;
  72. const uint8_t* s = (const uint8_t*) str;
  73. gchar *dest = str;
  74. guchar c1, c2, c3, c4;
  75. fp = size - leftover;
  76. for (i = 0; i != fp; i += 4) {
  77. c1 = s[i], c2 = s[i + 1], c3 = s[i + 2], c4 = s[i + 3];
  78. dest[0] = lc_map[c1];
  79. dest[1] = lc_map[c2];
  80. dest[2] = lc_map[c3];
  81. dest[3] = lc_map[c4];
  82. dest += 4;
  83. }
  84. switch (leftover) {
  85. case 3:
  86. *dest++ = lc_map[(guchar)str[i++]];
  87. /* FALLTHRU */
  88. case 2:
  89. *dest++ = lc_map[(guchar)str[i++]];
  90. /* FALLTHRU */
  91. case 1:
  92. *dest = lc_map[(guchar)str[i]];
  93. }
  94. return size;
  95. }
  96. gsize
  97. rspamd_str_copy_lc (const gchar *src, gchar *dst, gsize size)
  98. {
  99. gchar *d = dst;
  100. /* Find aligned start */
  101. while ((0xf & (uintptr_t)src) && size > 0) {
  102. *d++ = lc_map[(guchar)*src++];
  103. size --;
  104. }
  105. /* Aligned start in src */
  106. #ifdef __x86_64__
  107. while (size >= 16) {
  108. __m128i sv = _mm_load_si128((const __m128i*)src);
  109. /* From A */
  110. __m128i rangeshift = _mm_sub_epi8(sv, _mm_set1_epi8((char)('A'+128)));
  111. /* To Z */
  112. __m128i nomodify = _mm_cmpgt_epi8(rangeshift, _mm_set1_epi8(-128 + 25));
  113. /* ^ ' ' */
  114. __m128i flip = _mm_andnot_si128(nomodify, _mm_set1_epi8(0x20));
  115. __m128i uc = _mm_xor_si128(sv, flip);
  116. _mm_storeu_si128((__m128i*)d, uc);
  117. d += 16;
  118. src += 16;
  119. size -= 16;
  120. }
  121. #endif
  122. /* Leftover */
  123. while (size > 0) {
  124. *d++ = lc_map[(guchar)*src++];
  125. size --;
  126. }
  127. return (d - dst);
  128. }
  129. gint
  130. rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l)
  131. {
  132. gsize fp, i;
  133. guchar c1, c2, c3, c4;
  134. union {
  135. guchar c[4];
  136. guint32 n;
  137. } cmp1, cmp2;
  138. gsize leftover = l % 4;
  139. gint ret = 0;
  140. fp = l - leftover;
  141. for (i = 0; i != fp; i += 4) {
  142. c1 = s[i], c2 = s[i + 1], c3 = s[i + 2], c4 = s[i + 3];
  143. cmp1.c[0] = lc_map[c1];
  144. cmp1.c[1] = lc_map[c2];
  145. cmp1.c[2] = lc_map[c3];
  146. cmp1.c[3] = lc_map[c4];
  147. c1 = d[i], c2 = d[i + 1], c3 = d[i + 2], c4 = d[i + 3];
  148. cmp2.c[0] = lc_map[c1];
  149. cmp2.c[1] = lc_map[c2];
  150. cmp2.c[2] = lc_map[c3];
  151. cmp2.c[3] = lc_map[c4];
  152. if (cmp1.n != cmp2.n) {
  153. return cmp1.n - cmp2.n;
  154. }
  155. }
  156. while (leftover > 0) {
  157. if (g_ascii_tolower (s[i]) != g_ascii_tolower (d[i])) {
  158. return s[i] - d[i];
  159. }
  160. leftover--;
  161. i++;
  162. }
  163. return ret;
  164. }
  165. /*
  166. * The purpose of this function is fast and in place conversion of a unicode
  167. * string to lower case, so some locale peculiarities are simply ignored
  168. * If the target string is longer than initial one, then we just trim it
  169. */
  170. guint
  171. rspamd_str_lc_utf8 (gchar *str, guint size)
  172. {
  173. guchar *d = (guchar *)str, tst[6];
  174. gint32 i = 0, prev = 0;
  175. UChar32 uc;
  176. while (i < size) {
  177. prev = i;
  178. U8_NEXT ((guint8*)str, i, size, uc);
  179. uc = u_tolower (uc);
  180. gint32 olen = 0;
  181. U8_APPEND_UNSAFE (tst, olen, uc);
  182. if (olen <= (i - prev)) {
  183. memcpy (d, tst, olen);
  184. d += olen;
  185. }
  186. else {
  187. /* Lowercasing has increased the length, so we need to ignore it */
  188. d += i - prev;
  189. }
  190. }
  191. return d - (guchar *)str;
  192. }
  193. gboolean
  194. rspamd_strcase_equal (gconstpointer v, gconstpointer v2)
  195. {
  196. if (g_ascii_strcasecmp ((const gchar *)v, (const gchar *)v2) == 0) {
  197. return TRUE;
  198. }
  199. return FALSE;
  200. }
  201. guint64
  202. rspamd_icase_hash (const gchar *in, gsize len, guint64 seed)
  203. {
  204. guint leftover = len % sizeof (guint64);
  205. guint fp, i;
  206. const uint8_t* s = (const uint8_t*) in;
  207. union {
  208. struct {
  209. guchar c1, c2, c3, c4, c5, c6, c7, c8;
  210. } c;
  211. guint64 pp;
  212. } u;
  213. guint64 h = seed;
  214. fp = len - leftover;
  215. for (i = 0; i != fp; i += 8) {
  216. u.c.c1 = s[i], u.c.c2 = s[i + 1], u.c.c3 = s[i + 2], u.c.c4 = s[i + 3];
  217. u.c.c5 = s[i + 4], u.c.c6 = s[i + 5], u.c.c7 = s[i + 6], u.c.c8 = s[i + 7];
  218. u.c.c1 = lc_map[u.c.c1];
  219. u.c.c2 = lc_map[u.c.c2];
  220. u.c.c3 = lc_map[u.c.c3];
  221. u.c.c4 = lc_map[u.c.c4];
  222. u.c.c5 = lc_map[u.c.c5];
  223. u.c.c6 = lc_map[u.c.c6];
  224. u.c.c7 = lc_map[u.c.c7];
  225. u.c.c8 = lc_map[u.c.c8];
  226. h = t1ha (&u.pp, sizeof (u), h);
  227. }
  228. u.pp = 0;
  229. switch (leftover) {
  230. case 7:
  231. u.c.c7 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  232. case 6:
  233. u.c.c6 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  234. case 5:
  235. u.c.c5 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  236. case 4:
  237. u.c.c4 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  238. case 3:
  239. u.c.c3 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  240. case 2:
  241. u.c.c2 = lc_map[(guchar)s[i++]]; /* FALLTHRU */
  242. case 1:
  243. u.c.c1 = lc_map[(guchar)s[i]];
  244. break;
  245. }
  246. h = t1ha (&u.pp, sizeof (u), h);
  247. return h;
  248. }
  249. guint
  250. rspamd_strcase_hash (gconstpointer key)
  251. {
  252. const gchar *p = key;
  253. gsize len;
  254. len = strlen (p);
  255. return (guint)rspamd_icase_hash (p, len, rspamd_hash_seed ());
  256. }
  257. guint
  258. rspamd_str_hash (gconstpointer key)
  259. {
  260. gsize len;
  261. len = strlen ((const gchar *)key);
  262. return (guint)rspamd_cryptobox_fast_hash (key, len, rspamd_hash_seed ());
  263. }
  264. gboolean
  265. rspamd_str_equal (gconstpointer v, gconstpointer v2)
  266. {
  267. return strcmp ((const gchar *)v, (const gchar *)v2) == 0;
  268. }
  269. gboolean
  270. rspamd_ftok_icase_equal (gconstpointer v, gconstpointer v2)
  271. {
  272. const rspamd_ftok_t *f1 = v, *f2 = v2;
  273. if (f1->len == f2->len &&
  274. rspamd_lc_cmp (f1->begin, f2->begin, f1->len) == 0) {
  275. return TRUE;
  276. }
  277. return FALSE;
  278. }
  279. guint
  280. rspamd_ftok_icase_hash (gconstpointer key)
  281. {
  282. const rspamd_ftok_t *f = key;
  283. return (guint)rspamd_icase_hash (f->begin, f->len, rspamd_hash_seed ());
  284. }
  285. gboolean
  286. rspamd_ftok_equal (gconstpointer v, gconstpointer v2)
  287. {
  288. const rspamd_ftok_t *f1 = v, *f2 = v2;
  289. if (f1->len == f2->len &&
  290. memcmp (f1->begin, f2->begin, f1->len) == 0) {
  291. return TRUE;
  292. }
  293. return FALSE;
  294. }
  295. guint
  296. rspamd_ftok_hash (gconstpointer key)
  297. {
  298. const rspamd_ftok_t *f = key;
  299. return (guint)rspamd_cryptobox_fast_hash (f->begin, f->len, rspamd_hash_seed ());
  300. }
  301. gboolean
  302. rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2)
  303. {
  304. const GString *f1 = v, *f2 = v2;
  305. if (f1->len == f2->len &&
  306. rspamd_lc_cmp (f1->str, f2->str, f1->len) == 0) {
  307. return TRUE;
  308. }
  309. return FALSE;
  310. }
  311. guint
  312. rspamd_gstring_icase_hash (gconstpointer key)
  313. {
  314. const GString *f = key;
  315. return (guint)rspamd_icase_hash (f->str, f->len, rspamd_hash_seed ());
  316. }
  /*
   * Helpers for word-at-a-time string copying.
   * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
   *
   * MEM_ALIGN  - mask of the low address bits within a gsize-sized word
   * WORD_TYPE  - integer type used for word copies (64 bit on LP64)
   * ZEROMASK   - 0x7f in every byte of WORD_TYPE
   * HASZERO(x) - non-zero iff some byte of x is zero; x is evaluated
   *              more than once, so it must not have side effects.
   *              The expansion is fully parenthesised so that it can be
   *              used safely inside larger expressions.
   */
  #define MEM_ALIGN (sizeof(gsize)-1)
  #if defined(__LP64__) || defined(_LP64)
  #define WORD_TYPE guint64
  #define ZEROMASK 0x7F7F7F7F7F7F7F7FLLU
  #else
  #define WORD_TYPE guint32
  #define ZEROMASK 0x7F7F7F7FU
  #endif
  #define HASZERO(x) (~(((((x) & ZEROMASK) + ZEROMASK) | (x)) | ZEROMASK))
  327. gsize
  328. rspamd_strlcpy_fast (gchar *dst, const gchar *src, gsize siz)
  329. {
  330. gchar *d = dst;
  331. const gchar *s = src;
  332. gsize n = siz;
  333. WORD_TYPE *wd;
  334. const WORD_TYPE *ws;
  335. /* Copy as many bytes as will fit */
  336. if (n-- != 0) {
  337. if (((uintptr_t) s & MEM_ALIGN) == ((uintptr_t) d & MEM_ALIGN)) {
  338. /* Init copy byte by byte */
  339. for (; ((uintptr_t) s & MEM_ALIGN) && n && (*d = *s); n--, s++, d++);
  340. if (n && *s) {
  341. wd = (void *) d;
  342. ws = (const void *) s;
  343. /*
  344. * Copy by 32 or 64 bits (causes valgrind warnings)
  345. */
  346. for (; n >= sizeof (WORD_TYPE) && !HASZERO(*ws);
  347. n -= sizeof (WORD_TYPE), ws++, wd++) {
  348. *wd = *ws;
  349. }
  350. d = (void *) wd;
  351. s = (const void *) ws;
  352. }
  353. }
  354. /* Copy the rest */
  355. for (; n && (*d = *s); n--, s++, d++);
  356. *d = 0;
  357. }
  358. else {
  359. return 0;
  360. }
  361. return (d - dst);
  362. }
  363. gsize
  364. rspamd_null_safe_copy (const gchar *src, gsize srclen,
  365. gchar *dest, gsize destlen)
  366. {
  367. gsize copied = 0, si = 0, di = 0;
  368. if (destlen == 0) {
  369. return 0;
  370. }
  371. while (si < srclen && di + 1 < destlen) {
  372. if (src[si] != '\0') {
  373. dest[di++] = src[si++];
  374. copied ++;
  375. }
  376. else {
  377. si ++;
  378. }
  379. }
  380. dest[di] = '\0';
  381. return copied;
  382. }
  383. size_t
  384. rspamd_strlcpy_safe (gchar *dst, const gchar *src, gsize siz)
  385. {
  386. gchar *d = dst;
  387. gsize nleft = siz;
  388. if (nleft != 0) {
  389. while (--nleft != 0) {
  390. if ((*d++ = *src++) == '\0') {
  391. d --;
  392. break;
  393. }
  394. }
  395. }
  396. if (nleft == 0) {
  397. if (siz != 0) {
  398. *d = '\0';
  399. }
  400. }
  401. return (d - dst);
  402. }
  403. /*
  404. * Try to convert string of length to long
  405. */
  406. gboolean
  407. rspamd_strtol (const gchar *s, gsize len, glong *value)
  408. {
  409. const gchar *p = s, *end = s + len;
  410. gchar c;
  411. glong v = 0;
  412. const glong cutoff = G_MAXLONG / 10, cutlim = G_MAXLONG % 10;
  413. gboolean neg;
  414. /* Case negative values */
  415. if (*p == '-') {
  416. neg = TRUE;
  417. p++;
  418. }
  419. else {
  420. neg = FALSE;
  421. }
  422. /* Some preparations for range errors */
  423. while (p < end) {
  424. c = *p;
  425. if (c >= '0' && c <= '9') {
  426. c -= '0';
  427. if (v > cutoff || (v == cutoff && c > cutlim)) {
  428. /* Range error */
  429. *value = neg ? G_MINLONG : G_MAXLONG;
  430. return FALSE;
  431. }
  432. else {
  433. v *= 10;
  434. v += c;
  435. }
  436. }
  437. else {
  438. return FALSE;
  439. }
  440. p++;
  441. }
  442. *value = neg ? -(v) : v;
  443. return TRUE;
  444. }
  /*
   * Shared body of the unsigned decimal parsers.
   *
   * Expects the following names in the expanding function: p, end (input
   * cursor and limit), c (gchar scratch), v (accumulator), cutoff, cutlim
   * (max / 10 and max % 10) and value (output pointer).
   * On overflow stores max_num in *value and returns FALSE from the caller;
   * on a non-digit stores the value parsed so far and returns FALSE.
   */
  #define CONV_STR_LIM_DECIMAL(max_num) do { \
      for (; p < end; p++) { \
          c = *p; \
          if (c < '0' || c > '9') { \
              *value = v; \
              return FALSE; \
          } \
          c -= '0'; \
          if (v > cutoff || (v == cutoff && (guint8)c > cutlim)) { \
              *value = (max_num); \
              return FALSE; \
          } \
          v = v * 10 + c; \
      } \
  } while(0)
  469. gboolean
  470. rspamd_strtoul (const gchar *s, gsize len, gulong *value)
  471. {
  472. const gchar *p = s, *end = s + len;
  473. gchar c;
  474. gulong v = 0;
  475. const gulong cutoff = G_MAXULONG / 10, cutlim = G_MAXULONG % 10;
  476. /* Some preparations for range errors */
  477. CONV_STR_LIM_DECIMAL(G_MAXULONG);
  478. *value = v;
  479. return TRUE;
  480. }
  481. gboolean
  482. rspamd_strtou64 (const gchar *s, gsize len, guint64 *value)
  483. {
  484. const gchar *p = s, *end = s + len;
  485. gchar c;
  486. guint64 v = 0;
  487. const guint64 cutoff = G_MAXUINT64 / 10, cutlim = G_MAXUINT64 % 10;
  488. /* Some preparations for range errors */
  489. CONV_STR_LIM_DECIMAL(G_MAXUINT64);
  490. *value = v;
  491. return TRUE;
  492. }
  493. gboolean
  494. rspamd_xstrtoul (const gchar *s, gsize len, gulong *value)
  495. {
  496. const gchar *p = s, *end = s + len;
  497. gchar c;
  498. gulong v = 0;
  499. const gulong cutoff = G_MAXULONG / 10, cutlim = G_MAXULONG % 10;
  500. /* Some preparations for range errors */
  501. while (p < end) {
  502. c = g_ascii_tolower (*p);
  503. if (c >= '0' && c <= '9') {
  504. c -= '0';
  505. if (v > cutoff || (v == cutoff && (guint8)c > cutlim)) {
  506. /* Range error */
  507. *value = G_MAXULONG;
  508. return FALSE;
  509. }
  510. else {
  511. v *= 16;
  512. v += c;
  513. }
  514. }
  515. else if (c >= 'a' || c <= 'f') {
  516. c = c - 'a' + 10;
  517. if (v > cutoff || (v == cutoff && (guint8)c > cutlim)) {
  518. /* Range error */
  519. *value = G_MAXULONG;
  520. return FALSE;
  521. }
  522. else {
  523. v *= 16;
  524. v += c;
  525. }
  526. }
  527. else {
  528. *value = v;
  529. return FALSE;
  530. }
  531. p++;
  532. }
  533. *value = v;
  534. return TRUE;
  535. }
  536. /**
  537. * Utility function to provide mem_pool copy for rspamd_hash_table_copy function
  538. * @param data string to copy
  539. * @param ud memory pool to use
  540. * @return
  541. */
  542. gpointer
  543. rspamd_str_pool_copy (gconstpointer data, gpointer ud)
  544. {
  545. rspamd_mempool_t *pool = ud;
  546. return data ? rspamd_mempool_strdup (pool, data) : NULL;
  547. }
  548. /*
  549. * We use here z-base32 encoding described here:
  550. * http://philzimmermann.com/docs/human-oriented-base-32-encoding.txt
  551. */
  552. gint
  553. rspamd_encode_base32_buf (const guchar *in, gsize inlen, gchar *out, gsize outlen,
  554. enum rspamd_base32_type type)
  555. {
  556. static const char b32_default[] = "ybndrfg8ejkmcpqxot1uwisza345h769",
  557. b32_bleach[] = "qpzry9x8gf2tvdw0s3jn54khce6mua7l",
  558. b32_rfc[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567",
  559. *b32;
  560. gchar *o, *end;
  561. gsize i;
  562. gint remain = -1, x;
  563. bool inverse_order = true;
  564. end = out + outlen;
  565. o = out;
  566. switch (type) {
  567. case RSPAMD_BASE32_DEFAULT:
  568. b32 = b32_default;
  569. break;
  570. case RSPAMD_BASE32_BLEACH:
  571. b32 = b32_bleach;
  572. inverse_order = false;
  573. break;
  574. case RSPAMD_BASE32_RFC:
  575. b32 = b32_rfc;
  576. inverse_order = false;
  577. break;
  578. default:
  579. g_assert_not_reached ();
  580. abort ();
  581. }
  582. if (inverse_order) {
  583. /* Zbase32 as used in Rspamd */
  584. for (i = 0; i < inlen && o < end - 1; i++) {
  585. switch (i % 5) {
  586. case 0:
  587. /* 8 bits of input and 3 to remain */
  588. x = in[i];
  589. remain = in[i] >> 5;
  590. *o++ = b32[x & 0x1F];
  591. break;
  592. case 1:
  593. /* 11 bits of input, 1 to remain */
  594. x = remain | in[i] << 3;
  595. *o++ = b32[x & 0x1F];
  596. *o++ = b32[x >> 5 & 0x1F];
  597. remain = x >> 10;
  598. break;
  599. case 2:
  600. /* 9 bits of input, 4 to remain */
  601. x = remain | in[i] << 1;
  602. *o++ = b32[x & 0x1F];
  603. remain = x >> 5;
  604. break;
  605. case 3:
  606. /* 12 bits of input, 2 to remain */
  607. x = remain | in[i] << 4;
  608. *o++ = b32[x & 0x1F];
  609. *o++ = b32[x >> 5 & 0x1F];
  610. remain = x >> 10 & 0x3;
  611. break;
  612. case 4:
  613. /* 10 bits of output, nothing to remain */
  614. x = remain | in[i] << 2;
  615. *o++ = b32[x & 0x1F];
  616. *o++ = b32[x >> 5 & 0x1F];
  617. remain = -1;
  618. break;
  619. default:
  620. /* Not to be happen */
  621. break;
  622. }
  623. }
  624. }
  625. else {
  626. /* Traditional base32 with no bits inversion */
  627. for (i = 0; i < inlen && o < end - 1; i++) {
  628. switch (i % 5) {
  629. case 0:
  630. /* 8 bits of input and 3 to remain */
  631. x = in[i] >> 3;
  632. remain = (in[i] & 7) << 2;
  633. *o++ = b32[x & 0x1F];
  634. break;
  635. case 1:
  636. /* 11 bits of input, 1 to remain */
  637. x = (remain << 6) | in[i];
  638. *o++ = b32[(x >> 6) & 0x1F];
  639. *o++ = b32[(x >> 1) & 0x1F];
  640. remain = (x & 0x1) << 4;
  641. break;
  642. case 2:
  643. /* 9 bits of input, 4 to remain */
  644. x = (remain << 4) | in[i];
  645. *o++ = b32[(x >> 4) & 0x1F];
  646. remain = (x & 15) << 1;
  647. break;
  648. case 3:
  649. /* 12 bits of input, 2 to remain */
  650. x = (remain << 7) | in[i];
  651. *o++ = b32[(x >> 7) & 0x1F];
  652. *o++ = b32[(x >> 2) & 0x1F];
  653. remain = (x & 3) << 3;
  654. break;
  655. case 4:
  656. /* 10 bits of output, nothing to remain */
  657. x = (remain << 5) | in[i];
  658. *o++ = b32[(x >> 5) & 0x1F];
  659. *o++ = b32[x & 0x1F];
  660. remain = -1;
  661. break;
  662. default:
  663. /* Not to be happen */
  664. break;
  665. }
  666. }
  667. }
  668. if (remain >= 0 && o < end) {
  669. *o++ = b32[remain & 0x1F];
  670. }
  671. if (o <= end) {
  672. return (o - out);
  673. }
  674. return -1;
  675. }
  676. gchar *
  677. rspamd_encode_base32 (const guchar *in, gsize inlen, enum rspamd_base32_type type)
  678. {
  679. gsize allocated_len = inlen * 8 / 5 + 2;
  680. gchar *out;
  681. gint outlen;
  682. out = g_malloc (allocated_len);
  683. outlen = rspamd_encode_base32_buf (in, inlen, out,
  684. allocated_len - 1, type);
  685. if (outlen >= 0) {
  686. out[outlen] = 0;
  687. return out;
  688. }
  689. g_free (out);
  690. return NULL;
  691. }
  692. enum rspamd_base32_type
  693. rspamd_base32_decode_type_from_str (const gchar *str)
  694. {
  695. enum rspamd_base32_type ret = RSPAMD_BASE32_INVALID;
  696. if (str == NULL) {
  697. return RSPAMD_BASE32_DEFAULT;
  698. }
  699. if (strcmp (str, "default") == 0 || strcmp (str, "zbase") == 0) {
  700. ret = RSPAMD_BASE32_ZBASE;
  701. }
  702. else if (strcmp (str, "bleach") == 0) {
  703. ret = RSPAMD_BASE32_BLEACH;
  704. }
  705. else if (strcmp (str, "rfc") == 0) {
  706. ret = RSPAMD_BASE32_RFC;
  707. }
  708. return ret;
  709. }
  /*
   * Reverse lookup table for the z-base32 alphabet
   * ("ybndrfg8ejkmcpqxot1uwisza345h769"): indexed by the input byte, gives
   * the 5-bit value of that character; 0xff marks bytes that are not part
   * of the alphabet.
   */
  static const guchar b32_dec_zbase[] = {
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0x12, 0xff, 0x19, 0x1a, 0x1b, 0x1e, 0x1d,
  0x07, 0x1f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0x18, 0x01, 0x0c, 0x03, 0x08, 0x05, 0x06,
  0x1c, 0x15, 0x09, 0x0a, 0xff, 0x0b, 0x02, 0x10,
  0x0d, 0x0e, 0x04, 0x16, 0x11, 0x13, 0xff, 0x14,
  0x0f, 0x00, 0x17, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  };
  /*
   * Reverse lookup table for the "bleach" base32 alphabet
   * ("qpzry9x8gf2tvdw0s3jn54khce6mua7l"): indexed by the input byte, gives
   * the 5-bit value of that character; 0xff marks bytes that are not part
   * of the alphabet.
   */
  static const guchar b32_dec_bleach[] = {
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0x0f, 0xff, 0x0a, 0x11, 0x15, 0x14, 0x1a, 0x1e,
  0x07, 0x05, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0x1d, 0xff, 0x18, 0x0d, 0x19, 0x09, 0x08,
  0x17, 0xff, 0x12, 0x16, 0x1f, 0x1b, 0x13, 0xff,
  0x01, 0x00, 0x03, 0x10, 0x0b, 0x1c, 0x0c, 0x0e,
  0x06, 0x04, 0x02, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  };
  /*
   * Reverse lookup table for the RFC base32 alphabet
   * ("ABCDEFGHIJKLMNOPQRSTUVWXYZ234567", upper case only): indexed by the
   * input byte, gives the 5-bit value of that character; 0xff marks bytes
   * that are not part of the alphabet.
   */
  static const guchar b32_dec_rfc[] = {
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
  0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
  812. gint
  813. rspamd_decode_base32_buf (const gchar *in, gsize inlen, guchar *out, gsize outlen,
  814. enum rspamd_base32_type type)
  815. {
  816. guchar *o, *end, decoded;
  817. guchar c;
  818. guint acc = 0U;
  819. guint processed_bits = 0;
  820. gsize i;
  821. const guchar *b32_dec;
  822. bool inverse_bits = true;
  823. end = out + outlen;
  824. o = out;
  825. switch (type) {
  826. case RSPAMD_BASE32_DEFAULT:
  827. b32_dec = b32_dec_zbase;
  828. break;
  829. case RSPAMD_BASE32_BLEACH:
  830. b32_dec = b32_dec_bleach;
  831. inverse_bits = false;
  832. break;
  833. case RSPAMD_BASE32_RFC:
  834. b32_dec = b32_dec_rfc;
  835. inverse_bits = false;
  836. break;
  837. default:
  838. g_assert_not_reached ();
  839. abort ();
  840. }
  841. if (inverse_bits) {
  842. for (i = 0; i < inlen; i++) {
  843. c = (guchar) in[i];
  844. if (processed_bits >= 8) {
  845. /* Emit from left to right */
  846. processed_bits -= 8;
  847. *o++ = acc & 0xFF;
  848. acc >>= 8;
  849. }
  850. decoded = b32_dec[c];
  851. if (decoded == 0xff || o >= end) {
  852. return -1;
  853. }
  854. acc = (decoded << processed_bits) | acc;
  855. processed_bits += 5;
  856. }
  857. if (processed_bits > 0 && o < end) {
  858. *o++ = (acc & 0xFF);
  859. }
  860. else if (o > end) {
  861. return -1;
  862. }
  863. }
  864. else {
  865. for (i = 0; i < inlen; i++) {
  866. c = (guchar) in[i];
  867. decoded = b32_dec[c];
  868. if (decoded == 0xff) {
  869. return -1;
  870. }
  871. acc = (acc << 5) | decoded;
  872. processed_bits += 5;
  873. if (processed_bits >= 8) {
  874. /* Emit from right to left */
  875. processed_bits -= 8;
  876. /* Output buffer overflow */
  877. if (o >= end) {
  878. return -1;
  879. }
  880. *o++ = (acc >> processed_bits) & 0xFF;
  881. /* Preserve lowers at the higher parts of the input */
  882. acc = (acc & ((1u << processed_bits) - 1));
  883. }
  884. }
  885. if (processed_bits > 0 && o < end) {
  886. *o++ = (acc & 0xFF);
  887. }
  888. else if (o > end) {
  889. return -1;
  890. }
  891. }
  892. return (o - out);
  893. }
  894. guchar *
  895. rspamd_decode_base32 (const gchar *in, gsize inlen, gsize *outlen,
  896. enum rspamd_base32_type type)
  897. {
  898. guchar *res;
  899. gsize allocated_len = inlen * 5 / 8 + 2;
  900. gssize olen;
  901. res = g_malloc (allocated_len);
  902. olen = rspamd_decode_base32_buf (in, inlen, res, allocated_len - 1,
  903. type);
  904. if (olen >= 0) {
  905. res[olen] = '\0';
  906. }
  907. else {
  908. g_free (res);
  909. if (outlen) {
  910. *outlen = 0;
  911. }
  912. return NULL;
  913. }
  914. if (outlen) {
  915. *outlen = olen;
  916. }
  917. return res;
  918. }
  919. gchar *
  920. rspamd_encode_base64_common (const guchar *in, gsize inlen, gint str_len,
  921. gsize *outlen, gboolean fold, enum rspamd_newlines_type how)
  922. {
  923. #define ADD_SPLIT do { \
  924. if (how == RSPAMD_TASK_NEWLINES_CR || how == RSPAMD_TASK_NEWLINES_CRLF) *o++ = '\r'; \
  925. if (how == RSPAMD_TASK_NEWLINES_LF || how == RSPAMD_TASK_NEWLINES_CRLF) *o++ = '\n'; \
  926. if (fold) *o++ = '\t'; \
  927. } while (0)
  928. #define CHECK_SPLIT \
  929. do { if (str_len > 0 && cols >= str_len) { \
  930. ADD_SPLIT; \
  931. cols = 0; \
  932. } } \
  933. while (0)
  934. gsize allocated_len = (inlen / 3) * 4 + 5;
  935. gchar *out, *o;
  936. guint64 n;
  937. guint32 rem, t, carry;
  938. gint cols, shift;
  939. static const char b64_enc[] =
  940. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  941. "abcdefghijklmnopqrstuvwxyz"
  942. "0123456789+/";
  943. if (str_len > 0) {
  944. g_assert (str_len > 8);
  945. if (fold) {
  946. switch (how) {
  947. case RSPAMD_TASK_NEWLINES_CR:
  948. case RSPAMD_TASK_NEWLINES_LF:
  949. allocated_len += (allocated_len / str_len + 1) * 2 + 1;
  950. break;
  951. default:
  952. allocated_len += (allocated_len / str_len + 1) * 3 + 1;
  953. break;
  954. }
  955. }
  956. else {
  957. switch (how) {
  958. case RSPAMD_TASK_NEWLINES_CR:
  959. case RSPAMD_TASK_NEWLINES_LF:
  960. allocated_len += (allocated_len / str_len + 1) * 1 + 1;
  961. break;
  962. default:
  963. allocated_len += (allocated_len / str_len + 1) * 2 + 1;
  964. break;
  965. }
  966. }
  967. }
  968. out = g_malloc (allocated_len);
  969. o = out;
  970. cols = 0;
  971. while (inlen > 6) {
  972. memcpy (&n, in, sizeof (n));
  973. n = GUINT64_TO_BE (n);
  974. if (str_len <= 0 || cols <= str_len - 8) {
  975. *o++ = b64_enc[(n >> 58) & 0x3F];
  976. *o++ = b64_enc[(n >> 52) & 0x3F];
  977. *o++ = b64_enc[(n >> 46) & 0x3F];
  978. *o++ = b64_enc[(n >> 40) & 0x3F];
  979. *o++ = b64_enc[(n >> 34) & 0x3F];
  980. *o++ = b64_enc[(n >> 28) & 0x3F];
  981. *o++ = b64_enc[(n >> 22) & 0x3F];
  982. *o++ = b64_enc[(n >> 16) & 0x3F];
  983. cols += 8;
  984. }
  985. else {
  986. cols = str_len - cols;
  987. shift = 58;
  988. while (cols) {
  989. *o++ = b64_enc[(n >> shift) & 0x3F];
  990. shift -= 6;
  991. cols --;
  992. }
  993. ADD_SPLIT;
  994. /* Remaining bytes */
  995. while (shift >= 16) {
  996. *o++ = b64_enc[(n >> shift) & 0x3F];
  997. shift -= 6;
  998. cols ++;
  999. }
  1000. }
  1001. in += 6;
  1002. inlen -= 6;
  1003. }
  1004. CHECK_SPLIT;
  1005. rem = 0;
  1006. carry = 0;
  1007. for (;;) {
  1008. /* Padding + remaining data (0 - 2 bytes) */
  1009. switch (rem) {
  1010. case 0:
  1011. if (inlen-- == 0) {
  1012. goto end;
  1013. }
  1014. t = *in++;
  1015. *o++ = b64_enc[t >> 2];
  1016. carry = (t << 4) & 0x30;
  1017. rem = 1;
  1018. cols ++;
  1019. case 1:
  1020. if (inlen-- == 0) {
  1021. goto end;
  1022. }
  1023. CHECK_SPLIT;
  1024. t = *in++;
  1025. *o++ = b64_enc[carry | (t >> 4)];
  1026. carry = (t << 2) & 0x3C;
  1027. rem = 2;
  1028. cols ++;
  1029. default:
  1030. if (inlen-- == 0) {
  1031. goto end;
  1032. }
  1033. CHECK_SPLIT;
  1034. t = *in ++;
  1035. *o++ = b64_enc[carry | (t >> 6)];
  1036. cols ++;
  1037. CHECK_SPLIT;
  1038. *o++ = b64_enc[t & 0x3F];
  1039. cols ++;
  1040. CHECK_SPLIT;
  1041. rem = 0;
  1042. }
  1043. }
  1044. end:
  1045. if (rem == 1) {
  1046. *o++ = b64_enc[carry];
  1047. cols ++;
  1048. CHECK_SPLIT;
  1049. *o++ = '=';
  1050. cols ++;
  1051. CHECK_SPLIT;
  1052. *o++ = '=';
  1053. cols ++;
  1054. CHECK_SPLIT;
  1055. }
  1056. else if (rem == 2) {
  1057. *o++ = b64_enc[carry];
  1058. cols ++;
  1059. CHECK_SPLIT;
  1060. *o++ = '=';
  1061. cols ++;
  1062. }
  1063. CHECK_SPLIT;
  1064. *o = '\0';
  1065. if (outlen != NULL) {
  1066. *outlen = o - out;
  1067. }
  1068. return out;
  1069. }
  1070. gchar *
  1071. rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len,
  1072. gsize *outlen)
  1073. {
  1074. return rspamd_encode_base64_common (in, inlen, str_len, outlen, FALSE,
  1075. RSPAMD_TASK_NEWLINES_CRLF);
  1076. }
  1077. gchar *
  1078. rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
  1079. gsize *outlen, enum rspamd_newlines_type how)
  1080. {
  1081. return rspamd_encode_base64_common (in, inlen, str_len, outlen, TRUE, how);
  1082. }
  1083. #define QP_RANGE(x) (((x) >= 33 && (x) <= 60) || ((x) >= 62 && (x) <= 126) \
  1084. || (x) == '\r' || (x) == '\n' || (x) == ' ' || (x) == '\t')
  1085. #define QP_SPAN_NORMAL(span, str_len) ((str_len) > 0 && \
  1086. ((span) + 1) >= (str_len))
  1087. #define QP_SPAN_SPECIAL(span, str_len) ((str_len) > 0 && \
  1088. ((span) + 4) >= (str_len))
  1089. gchar *
  1090. rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
  1091. gsize *outlen, enum rspamd_newlines_type how)
  1092. {
  1093. gsize olen = 0, span = 0, i = 0, seen_spaces = 0;
  1094. gchar *out;
  1095. gint ch, last_sp;
  1096. const guchar *end = in + inlen, *p = in;
  1097. static const gchar hexdigests[16] = "0123456789ABCDEF";
  1098. while (p < end) {
  1099. ch = *p;
  1100. if (QP_RANGE(ch)) {
  1101. olen ++;
  1102. span ++;
  1103. if (ch == '\r' || ch == '\n') {
  1104. if (seen_spaces > 0) {
  1105. /* We must encode spaces at the end of line */
  1106. olen += 3;
  1107. seen_spaces = 0;
  1108. /* Special stuff for space character at the end */
  1109. if (QP_SPAN_SPECIAL(span, str_len)) {
  1110. if (how == RSPAMD_TASK_NEWLINES_CRLF) {
  1111. /* =\r\n */
  1112. olen += 3;
  1113. }
  1114. else {
  1115. olen += 2;
  1116. }
  1117. }
  1118. /* Continue with the same `ch` but without spaces logic */
  1119. continue;
  1120. }
  1121. span = 0;
  1122. }
  1123. else if (ch == ' ' || ch == '\t') {
  1124. seen_spaces ++;
  1125. last_sp = ch;
  1126. }
  1127. else {
  1128. seen_spaces = 0;
  1129. }
  1130. }
  1131. else {
  1132. if (QP_SPAN_SPECIAL(span, str_len)) {
  1133. if (how == RSPAMD_TASK_NEWLINES_CRLF) {
  1134. /* =\r\n */
  1135. olen += 3;
  1136. }
  1137. else {
  1138. olen += 2;
  1139. }
  1140. span = 0;
  1141. }
  1142. olen += 3;
  1143. span += 3;
  1144. }
  1145. if (QP_SPAN_NORMAL(span, str_len)) {
  1146. if (how == RSPAMD_TASK_NEWLINES_CRLF) {
  1147. /* =\r\n */
  1148. olen += 3;
  1149. }
  1150. else {
  1151. olen += 2;
  1152. }
  1153. span = 0;
  1154. }
  1155. p ++;
  1156. }
  1157. if (seen_spaces > 0) {
  1158. /* Reserve length for the last space encoded */
  1159. olen += 3;
  1160. }
  1161. out = g_malloc (olen + 1);
  1162. p = in;
  1163. i = 0;
  1164. span = 0;
  1165. seen_spaces = 0;
  1166. while (p < end) {
  1167. ch = *p;
  1168. if (QP_RANGE (ch)) {
  1169. if (ch == '\r' || ch == '\n') {
  1170. if (seen_spaces > 0) {
  1171. if (QP_SPAN_SPECIAL(span, str_len)) {
  1172. /* Add soft newline */
  1173. i --;
  1174. if (p + 1 < end || span + 3 >= str_len) {
  1175. switch (how) {
  1176. default:
  1177. case RSPAMD_TASK_NEWLINES_CRLF:
  1178. out[i++] = '=';
  1179. out[i++] = '\r';
  1180. out[i++] = '\n';
  1181. break;
  1182. case RSPAMD_TASK_NEWLINES_LF:
  1183. out[i++] = '=';
  1184. out[i++] = '\n';
  1185. break;
  1186. case RSPAMD_TASK_NEWLINES_CR:
  1187. out[i++] = '=';
  1188. out[i++] = '\r';
  1189. break;
  1190. }
  1191. }
  1192. /* Now write encoded `last_sp` but after newline */
  1193. out[i++] = '=';
  1194. out[i++] = hexdigests[((last_sp >> 4) & 0xF)];
  1195. out[i++] = hexdigests[(last_sp & 0xF)];
  1196. span = 0;
  1197. }
  1198. else {
  1199. /* Encode last space */
  1200. --i;
  1201. out[i++] = '=';
  1202. out[i++] = hexdigests[((last_sp >> 4) & 0xF)];
  1203. out[i++] = hexdigests[(last_sp & 0xF)];
  1204. seen_spaces = 0;
  1205. }
  1206. continue;
  1207. }
  1208. span = 0;
  1209. }
  1210. else if (ch == ' ' || ch == '\t') {
  1211. seen_spaces ++;
  1212. last_sp = ch;
  1213. span ++;
  1214. }
  1215. else {
  1216. seen_spaces = 0;
  1217. span ++;
  1218. }
  1219. out[i++] = ch;
  1220. }
  1221. else {
  1222. if (QP_SPAN_SPECIAL(span, str_len)) {
  1223. /* Add new line and then continue */
  1224. if (p + 1 < end || span + 3 >= str_len) {
  1225. switch (how) {
  1226. default:
  1227. case RSPAMD_TASK_NEWLINES_CRLF:
  1228. out[i++] = '=';
  1229. out[i++] = '\r';
  1230. out[i++] = '\n';
  1231. break;
  1232. case RSPAMD_TASK_NEWLINES_LF:
  1233. out[i++] = '=';
  1234. out[i++] = '\n';
  1235. break;
  1236. case RSPAMD_TASK_NEWLINES_CR:
  1237. out[i++] = '=';
  1238. out[i++] = '\r';
  1239. break;
  1240. }
  1241. span = 0;
  1242. }
  1243. }
  1244. out[i++] = '=';
  1245. out[i++] = hexdigests[((ch >> 4) & 0xF)];
  1246. out[i++] = hexdigests[(ch & 0xF)];
  1247. span += 3;
  1248. seen_spaces = 0;
  1249. }
  1250. if (QP_SPAN_NORMAL(span, str_len)) {
  1251. /* Add new line and then continue */
  1252. if (p + 1 < end || span > str_len || seen_spaces) {
  1253. switch (how) {
  1254. default:
  1255. case RSPAMD_TASK_NEWLINES_CRLF:
  1256. out[i++] = '=';
  1257. out[i++] = '\r';
  1258. out[i++] = '\n';
  1259. break;
  1260. case RSPAMD_TASK_NEWLINES_LF:
  1261. out[i++] = '=';
  1262. out[i++] = '\n';
  1263. break;
  1264. case RSPAMD_TASK_NEWLINES_CR:
  1265. out[i++] = '=';
  1266. out[i++] = '\r';
  1267. break;
  1268. }
  1269. span = 0;
  1270. seen_spaces = 0;
  1271. }
  1272. }
  1273. g_assert (i <= olen);
  1274. p ++;
  1275. }
  1276. /* Deal with the last space character */
  1277. if (seen_spaces > 0) {
  1278. i --;
  1279. out[i++] = '=';
  1280. out[i++] = hexdigests[((last_sp >> 4) & 0xF)];
  1281. out[i++] = hexdigests[(last_sp & 0xF)];
  1282. }
  1283. out[i] = '\0';
  1284. if (outlen) {
  1285. *outlen = i;
  1286. }
  1287. return out;
  1288. }
  1289. #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
  1290. gint
  1291. rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
  1292. const gchar *s2, gsize s2len,
  1293. guint replace_cost)
  1294. {
  1295. gchar c1, c2, last_c2, last_c1;
  1296. static GArray *current_row = NULL, *prev_row = NULL, *transp_row = NULL;
  1297. gint eq;
  1298. static const guint max_cmp = 8192;
  1299. gint ret;
  1300. g_assert (s1 != NULL);
  1301. g_assert (s2 != NULL);
  1302. if (s1len == 0) {
  1303. s1len = strlen (s1);
  1304. }
  1305. if (s2len == 0) {
  1306. s2len = strlen (s2);
  1307. }
  1308. if (MAX(s1len, s2len) > max_cmp) {
  1309. /* Cannot compare too many characters */
  1310. return max_cmp;
  1311. }
  1312. if (s1len > s2len) {
  1313. /* Exchange s1 and s2 */
  1314. const gchar *tmp;
  1315. gsize tmplen;
  1316. tmp = s2;
  1317. s2 = s1;
  1318. s1 = tmp;
  1319. tmplen = s2len;
  1320. s2len = s1len;
  1321. s1len = tmplen;
  1322. }
  1323. /* Adjust static space */
  1324. if (current_row == NULL) {
  1325. current_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
  1326. prev_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
  1327. transp_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
  1328. g_array_set_size (current_row, s1len + 1);
  1329. g_array_set_size (prev_row, s1len + 1);
  1330. g_array_set_size (transp_row, s1len + 1);
  1331. }
  1332. else if (current_row->len < s1len + 1) {
  1333. g_array_set_size (current_row, s1len + 1);
  1334. g_array_set_size (prev_row, s1len + 1);
  1335. g_array_set_size (transp_row, s1len + 1);
  1336. }
  1337. memset (current_row->data, 0, (s1len + 1) * sizeof (gint));
  1338. memset (transp_row->data, 0, (s1len + 1) * sizeof (gint));
  1339. for (gint i = 0; i <= s1len; i++) {
  1340. g_array_index (prev_row, gint, i) = i;
  1341. }
  1342. last_c2 = '\0';
  1343. for (gint i = 1; i <= s2len; i++) {
  1344. c2 = s2[i - 1];
  1345. g_array_index (current_row, gint, 0) = i;
  1346. last_c1 = '\0';
  1347. for (gint j = 1; j <= s1len; j++) {
  1348. c1 = s1[j - 1];
  1349. eq = c1 == c2 ? 0 : replace_cost;
  1350. ret = MIN3 (g_array_index (current_row, gint, j - 1) + 1, /* Insert */
  1351. g_array_index (prev_row, gint, j) + 1, /* Remove */
  1352. g_array_index (prev_row, gint, j - 1) + eq /* Replace */);
  1353. /* Take reordering into account */
  1354. if (c1 == last_c2 && c2 == last_c1 && j >= 2) {
  1355. ret = MIN (ret, g_array_index (transp_row, gint, j - 2) + eq);
  1356. }
  1357. g_array_index (current_row, gint, j) = ret;
  1358. last_c1 = c1;
  1359. }
  1360. last_c2 = c2;
  1361. /* Exchange pointers */
  1362. GArray *tmp;
  1363. tmp = transp_row;
  1364. transp_row = prev_row;
  1365. prev_row = current_row;
  1366. current_row = tmp;
  1367. }
  1368. ret = g_array_index (prev_row, gint, s1len);
  1369. return ret;
  1370. }
  1371. GString *
  1372. rspamd_header_value_fold (const gchar *name, gsize name_len,
  1373. const gchar *value,
  1374. gsize value_len,
  1375. guint fold_max,
  1376. enum rspamd_newlines_type how,
  1377. const gchar *fold_on_chars)
  1378. {
  1379. GString *res;
  1380. const guint default_fold_max = 76;
  1381. guint cur_len;
  1382. const gchar *p, *c, *end, *fold_sequence;
  1383. guint nspaces = 0;
  1384. gboolean first_token = TRUE;
  1385. enum {
  1386. fold_before = 0,
  1387. fold_after
  1388. } fold_type = fold_before;
  1389. enum {
  1390. read_token = 0,
  1391. read_quoted,
  1392. after_quote,
  1393. fold_token,
  1394. } state = read_token, next_state = read_token;
  1395. g_assert (name != NULL);
  1396. g_assert (value != NULL);
  1397. /* Filter insane values */
  1398. if (fold_max < 20) {
  1399. fold_max = default_fold_max;
  1400. }
  1401. switch (how) {
  1402. case RSPAMD_TASK_NEWLINES_LF:
  1403. fold_sequence = "\n\t";
  1404. break;
  1405. case RSPAMD_TASK_NEWLINES_CR:
  1406. fold_sequence = "\r\t";
  1407. break;
  1408. case RSPAMD_TASK_NEWLINES_CRLF:
  1409. default:
  1410. fold_sequence ="\r\n\t";
  1411. break;
  1412. }
  1413. res = g_string_sized_new (value_len);
  1414. c = value;
  1415. p = c;
  1416. end = value + value_len;
  1417. /* name:<WSP> */
  1418. cur_len = name_len + 2;
  1419. while (p < end) {
  1420. switch (state) {
  1421. case read_token:
  1422. if (fold_on_chars) {
  1423. if (strchr (fold_on_chars, *p) != NULL) {
  1424. fold_type = fold_after;
  1425. state = fold_token;
  1426. next_state = read_token;
  1427. }
  1428. p ++;
  1429. }
  1430. else {
  1431. if (*p == ',' || *p == ';') {
  1432. /* We have something similar to the token's end, so check len */
  1433. if (cur_len > fold_max * 0.8 && cur_len < fold_max) {
  1434. /* We want fold */
  1435. fold_type = fold_after;
  1436. state = fold_token;
  1437. next_state = read_token;
  1438. } else if (cur_len > fold_max && !first_token) {
  1439. fold_type = fold_before;
  1440. state = fold_token;
  1441. next_state = read_token;
  1442. } else {
  1443. g_string_append_len (res, c, p - c + 1);
  1444. c = p + 1;
  1445. first_token = FALSE;
  1446. }
  1447. p++;
  1448. } else if (*p == '"') {
  1449. /* Fold before quoted tokens */
  1450. g_string_append_len (res, c, p - c);
  1451. c = p;
  1452. state = read_quoted;
  1453. } else if (*p == '\r' || *p == '\n') {
  1454. if (cur_len > fold_max && !first_token) {
  1455. fold_type = fold_before;
  1456. state = fold_token;
  1457. next_state = read_token;
  1458. } else {
  1459. /* We need to ensure that it is a folding and not something else */
  1460. const char *t = p;
  1461. bool seen_fold = false;
  1462. while (t < end) {
  1463. if (*t == ' ' || *t == '\t') {
  1464. seen_fold = true;
  1465. break;
  1466. }
  1467. else if (!g_ascii_isspace(*t)) {
  1468. break;
  1469. }
  1470. t++;
  1471. }
  1472. if (seen_fold) {
  1473. /* Reset line length */
  1474. cur_len = 0;
  1475. while (g_ascii_isspace (*p)) {
  1476. p++;
  1477. }
  1478. g_string_append_len(res, c, p - c);
  1479. c = p;
  1480. first_token = TRUE;
  1481. }
  1482. else {
  1483. /* Not seen folding, inject it */
  1484. g_string_append_len (res, c, p - c);
  1485. g_string_append (res, fold_sequence);
  1486. p = t; /* Adjust p to ensure that we do not append extra stuff */
  1487. state = read_token;
  1488. first_token = TRUE;
  1489. c = p;
  1490. }
  1491. }
  1492. } else if (g_ascii_isspace (*p)) {
  1493. if (cur_len > fold_max * 0.8 && cur_len < fold_max) {
  1494. /* We want fold */
  1495. fold_type = fold_after;
  1496. state = fold_token;
  1497. next_state = read_token;
  1498. } else if (cur_len > fold_max && !first_token) {
  1499. fold_type = fold_before;
  1500. state = fold_token;
  1501. next_state = read_token;
  1502. } else {
  1503. g_string_append_len (res, c, p - c);
  1504. c = p;
  1505. first_token = FALSE;
  1506. p++;
  1507. cur_len++;
  1508. }
  1509. } else {
  1510. p++;
  1511. cur_len++;
  1512. }
  1513. }
  1514. break;
  1515. case fold_token:
  1516. /* Here, we have token start at 'c' and token end at 'p' */
  1517. if (fold_type == fold_after) {
  1518. nspaces = 0;
  1519. if (p > c) {
  1520. g_string_append_len (res, c, p - c);
  1521. /*
  1522. * Check any spaces that are appended to the result
  1523. * before folding
  1524. */
  1525. const gchar *last = &res->str[res->len - 1];
  1526. while (g_ascii_isspace (*last)) {
  1527. last --;
  1528. nspaces ++;
  1529. res->len --;
  1530. }
  1531. }
  1532. g_string_append (res, fold_sequence);
  1533. /* Skip space if needed */
  1534. if (g_ascii_isspace (*p)) {
  1535. p ++;
  1536. }
  1537. /* Move leftover spaces */
  1538. while (nspaces) {
  1539. g_string_append_c (res, ' ');
  1540. nspaces --;
  1541. }
  1542. cur_len = 0;
  1543. }
  1544. else {
  1545. const gchar *last;
  1546. /* Skip space if needed */
  1547. if (g_ascii_isspace (*c) && p > c) {
  1548. c ++;
  1549. }
  1550. /* Avoid double folding */
  1551. last = &res->str[res->len - 1];
  1552. last --;
  1553. if (*last != '\r' && *last != '\n') {
  1554. last ++;
  1555. while (g_ascii_isspace (*last)) {
  1556. last --;
  1557. nspaces ++;
  1558. res->len --;
  1559. }
  1560. g_string_append (res, fold_sequence);
  1561. }
  1562. /* Move leftover spaces */
  1563. cur_len = nspaces;
  1564. while (nspaces) {
  1565. g_string_append_c (res, ' ');
  1566. nspaces --;
  1567. }
  1568. if (p > c) {
  1569. g_string_append_len (res, c, p - c);
  1570. cur_len += p - c;
  1571. }
  1572. else {
  1573. cur_len = 0;
  1574. }
  1575. }
  1576. first_token = TRUE;
  1577. c = p;
  1578. state = next_state;
  1579. break;
  1580. case read_quoted:
  1581. if (p != c && *p == '"') {
  1582. state = after_quote;
  1583. }
  1584. p ++;
  1585. cur_len ++;
  1586. break;
  1587. case after_quote:
  1588. state = read_token;
  1589. /* Skip one more character after the quote */
  1590. p ++;
  1591. cur_len ++;
  1592. g_string_append_len (res, c, p - c);
  1593. c = p;
  1594. first_token = TRUE;
  1595. break;
  1596. }
  1597. }
  1598. /* Last token */
  1599. switch (state) {
  1600. case read_token:
  1601. if (!fold_on_chars && cur_len > fold_max && !first_token) {
  1602. if (g_ascii_isspace (*c)) {
  1603. c ++;
  1604. }
  1605. g_string_append (res, fold_sequence);
  1606. g_string_append_len (res, c, p - c);
  1607. }
  1608. else {
  1609. g_string_append_len (res, c, p - c);
  1610. }
  1611. break;
  1612. case read_quoted:
  1613. case after_quote:
  1614. g_string_append_len (res, c, p - c);
  1615. break;
  1616. case fold_token:
  1617. /* Here, we have token start at 'c' and token end at 'p' */
  1618. if (g_ascii_isspace (res->str[res->len - 1])) {
  1619. g_string_append_len (res, c, p - c);
  1620. }
  1621. else {
  1622. if (*c != '\r' && *c != '\n') {
  1623. /* We need to add folding as well */
  1624. g_string_append (res, fold_sequence);
  1625. g_string_append_len (res, c, p - c);
  1626. }
  1627. else {
  1628. g_string_append_len (res, c, p - c);
  1629. }
  1630. }
  1631. break;
  1632. default:
  1633. g_assert (p == c);
  1634. break;
  1635. }
  1636. return res;
  1637. }
  1638. static inline bool rspamd_substring_cmp_func (guchar a, guchar b) { return a == b; }
  1639. static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) { return lc_map[a] == lc_map[b]; }
  1640. typedef bool (*rspamd_cmpchar_func_t) (guchar a, guchar b);
  1641. static inline void
  1642. rspamd_substring_preprocess_kmp (const gchar *pat, gsize len, goffset *fsm,
  1643. rspamd_cmpchar_func_t f)
  1644. {
  1645. goffset i, j;
  1646. i = 0;
  1647. j = -1;
  1648. fsm[0] = -1;
  1649. while (i < len) {
  1650. while (j > -1 && !f(pat[i], pat[j])) {
  1651. j = fsm[j];
  1652. }
  1653. i++;
  1654. j++;
  1655. if (i < len && j < len && f(pat[i], pat[j])) {
  1656. fsm[i] = fsm[j];
  1657. }
  1658. else {
  1659. fsm[i] = j;
  1660. }
  1661. }
  1662. }
  1663. static inline goffset
  1664. rspamd_substring_search_preprocessed (const gchar *in, gsize inlen,
  1665. const gchar *srch,
  1666. gsize srchlen,
  1667. const goffset *fsm,
  1668. rspamd_cmpchar_func_t f)
  1669. {
  1670. goffset i, j, k, ell;
  1671. for (ell = 1; f(srch[ell - 1], srch[ell]); ell++) {}
  1672. if (ell == srchlen) {
  1673. ell = 0;
  1674. }
  1675. /* Searching */
  1676. i = ell;
  1677. j = k = 0;
  1678. while (j <= inlen - srchlen) {
  1679. while (i < srchlen && f(srch[i], in[i + j])) {
  1680. ++i;
  1681. }
  1682. if (i >= srchlen) {
  1683. while (k < ell && f(srch[k], in[j + k])) {
  1684. ++k;
  1685. }
  1686. if (k >= ell) {
  1687. return j;
  1688. }
  1689. }
  1690. j += (i - fsm[i]);
  1691. if (i == ell) {
  1692. k = MAX(0, k - 1);
  1693. }
  1694. else {
  1695. if (fsm[i] <= ell) {
  1696. k = MAX(0, fsm[i]);
  1697. i = ell;
  1698. } else {
  1699. k = ell;
  1700. i = fsm[i];
  1701. }
  1702. }
  1703. }
  1704. return -1;
  1705. }
  1706. static inline goffset
  1707. rspamd_substring_search_common (const gchar *in, gsize inlen,
  1708. const gchar *srch, gsize srchlen, rspamd_cmpchar_func_t f)
  1709. {
  1710. static goffset st_fsm[128];
  1711. goffset *fsm, ret;
  1712. if (G_LIKELY (srchlen < G_N_ELEMENTS (st_fsm))) {
  1713. fsm = st_fsm;
  1714. }
  1715. else {
  1716. fsm = g_malloc ((srchlen + 1) * sizeof (*fsm));
  1717. }
  1718. rspamd_substring_preprocess_kmp (srch, srchlen, fsm, f);
  1719. ret = rspamd_substring_search_preprocessed (in, inlen, srch, srchlen, fsm, f);
  1720. if (G_UNLIKELY (srchlen >= G_N_ELEMENTS (st_fsm))) {
  1721. g_free (fsm);
  1722. }
  1723. return ret;
  1724. }
  1725. goffset
  1726. rspamd_substring_search (const gchar *in, gsize inlen,
  1727. const gchar *srch, gsize srchlen)
  1728. {
  1729. if (inlen > srchlen) {
  1730. if (G_UNLIKELY (srchlen == 1)) {
  1731. const gchar *p;
  1732. p = memchr (in, srch[0], inlen);
  1733. if (p) {
  1734. return p - in;
  1735. }
  1736. return (-1);
  1737. }
  1738. else if (G_UNLIKELY (srchlen == 0)) {
  1739. return 0;
  1740. }
  1741. return rspamd_substring_search_common (in, inlen, srch, srchlen,
  1742. rspamd_substring_cmp_func);
  1743. }
  1744. else if (inlen == srchlen) {
  1745. return (rspamd_lc_cmp (srch, in, srchlen) == 0 ? 0 : -1);
  1746. }
  1747. else {
  1748. return (-1);
  1749. }
  1750. return (-1);
  1751. }
  1752. goffset
  1753. rspamd_substring_search_caseless (const gchar *in, gsize inlen,
  1754. const gchar *srch, gsize srchlen)
  1755. {
  1756. if (inlen > srchlen) {
  1757. if (G_UNLIKELY (srchlen == 1)) {
  1758. goffset i;
  1759. gchar s = lc_map[(guchar)srch[0]];
  1760. for (i = 0; i < inlen; i++) {
  1761. if (lc_map[(guchar)in[i]] == s) {
  1762. return i;
  1763. }
  1764. }
  1765. return (-1);
  1766. }
  1767. return rspamd_substring_search_common (in, inlen, srch, srchlen,
  1768. rspamd_substring_casecmp_func);
  1769. }
  1770. else if (inlen == srchlen) {
  1771. return rspamd_lc_cmp (srch, in, srchlen) == 0 ? 0 : (-1);
  1772. }
  1773. return (-1);
  1774. }
  1775. goffset
  1776. rspamd_string_find_eoh (GString *input, goffset *body_start)
  1777. {
  1778. const gchar *p, *c = NULL, *end;
  1779. enum {
  1780. skip_char = 0,
  1781. got_cr,
  1782. got_lf,
  1783. got_linebreak,
  1784. got_linebreak_cr,
  1785. got_linebreak_lf,
  1786. obs_fws
  1787. } state = skip_char;
  1788. g_assert (input != NULL);
  1789. p = input->str;
  1790. end = p + input->len;
  1791. while (p < end) {
  1792. switch (state) {
  1793. case skip_char:
  1794. if (*p == '\r') {
  1795. p++;
  1796. state = got_cr;
  1797. }
  1798. else if (*p == '\n') {
  1799. p++;
  1800. state = got_lf;
  1801. }
  1802. else {
  1803. p++;
  1804. }
  1805. break;
  1806. case got_cr:
  1807. if (*p == '\r') {
  1808. /*
  1809. * Double \r\r, so need to check the current char
  1810. * if it is '\n', then we have \r\r\n sequence, that is NOT
  1811. * double end of line
  1812. */
  1813. if (p < end && p[1] == '\n') {
  1814. p++;
  1815. state = got_lf;
  1816. }
  1817. else {
  1818. /* We have \r\r[^\n] */
  1819. if (body_start) {
  1820. *body_start = p - input->str + 1;
  1821. }
  1822. return p - input->str;
  1823. }
  1824. }
  1825. else if (*p == '\n') {
  1826. p++;
  1827. state = got_lf;
  1828. }
  1829. else if (g_ascii_isspace (*p)) {
  1830. /* We have \r<space>*, allow to stay in this state */
  1831. c = p;
  1832. p ++;
  1833. state = obs_fws;
  1834. }
  1835. else {
  1836. p++;
  1837. state = skip_char;
  1838. }
  1839. break;
  1840. case got_lf:
  1841. if (*p == '\n') {
  1842. /* We have \n\n, which is obviously end of headers */
  1843. if (body_start) {
  1844. *body_start = p - input->str + 1;
  1845. }
  1846. return p - input->str;
  1847. }
  1848. else if (*p == '\r') {
  1849. state = got_linebreak;
  1850. }
  1851. else if (g_ascii_isspace (*p)) {
  1852. /* We have \n<space>*, allow to stay in this state */
  1853. c = p;
  1854. p ++;
  1855. state = obs_fws;
  1856. }
  1857. else {
  1858. p++;
  1859. state = skip_char;
  1860. }
  1861. break;
  1862. case got_linebreak:
  1863. if (*p == '\r') {
  1864. c = p;
  1865. p++;
  1866. state = got_linebreak_cr;
  1867. }
  1868. else if (*p == '\n') {
  1869. c = p;
  1870. p++;
  1871. state = got_linebreak_lf;
  1872. }
  1873. else if (g_ascii_isspace (*p)) {
  1874. /* We have <linebreak><space>*, allow to stay in this state */
  1875. c = p;
  1876. p ++;
  1877. state = obs_fws;
  1878. }
  1879. else {
  1880. p++;
  1881. state = skip_char;
  1882. }
  1883. break;
  1884. case got_linebreak_cr:
  1885. if (*p == '\r') {
  1886. /* Got double \r\r after \n, so does not treat it as EOH */
  1887. state = got_linebreak_cr;
  1888. p++;
  1889. }
  1890. else if (*p == '\n') {
  1891. state = got_linebreak_lf;
  1892. p++;
  1893. }
  1894. else if (g_ascii_isspace (*p)) {
  1895. /* We have \r\n<space>*, allow to keep in this state */
  1896. c = p;
  1897. state = obs_fws;
  1898. p ++;
  1899. }
  1900. else {
  1901. p++;
  1902. state = skip_char;
  1903. }
  1904. break;
  1905. case got_linebreak_lf:
  1906. g_assert (c != NULL);
  1907. if (body_start) {
  1908. /* \r\n\r\n */
  1909. *body_start = p - input->str;
  1910. }
  1911. return c - input->str;
  1912. case obs_fws:
  1913. if (*p == ' ' || *p == '\t') {
  1914. p ++;
  1915. }
  1916. else if (*p == '\r') {
  1917. /* Perform lookahead due to #2349 */
  1918. if (end - p > 2) {
  1919. if (p[1] == '\n' && g_ascii_isspace (p[2])) {
  1920. /* Real obs_fws state, switch */
  1921. c = p;
  1922. p ++;
  1923. state = got_cr;
  1924. }
  1925. else if (g_ascii_isspace (p[1])) {
  1926. p ++;
  1927. state = obs_fws;
  1928. }
  1929. else {
  1930. /*
  1931. * <nline> <wsp>+ \r <nwsp>.
  1932. * It is an empty header likely, so we can go further...
  1933. * https://tools.ietf.org/html/rfc2822#section-4.2
  1934. */
  1935. c = p;
  1936. p ++;
  1937. state = got_cr;
  1938. }
  1939. }
  1940. else {
  1941. /* shortage */
  1942. if (body_start) {
  1943. *body_start = p - input->str + 1;
  1944. }
  1945. return p - input->str;
  1946. }
  1947. }
  1948. else if (*p == '\n') {
  1949. /* Perform lookahead due to #2349 */
  1950. if (end - p > 1) {
  1951. /* Continue folding with an empty line */
  1952. if (p[1] == ' ' || p[1] == '\t') {
  1953. c = p;
  1954. p ++;
  1955. state = obs_fws;
  1956. }
  1957. else if (p[1] == '\r') {
  1958. /* WTF state: we have seen spaces, \n and then it follows \r */
  1959. c = p;
  1960. p ++;
  1961. state = got_lf;
  1962. }
  1963. else if (p[1] == '\n') {
  1964. /*
  1965. * Switching to got_lf state here will let us to finish
  1966. * the cycle.
  1967. */
  1968. c = p;
  1969. p ++;
  1970. state = got_lf;
  1971. }
  1972. else {
  1973. /*
  1974. * <nline> <wsp>+ \n <nwsp>.
  1975. * It is an empty header likely, so we can go further...
  1976. * https://tools.ietf.org/html/rfc2822#section-4.2
  1977. */
  1978. c = p;
  1979. p ++;
  1980. state = got_lf;
  1981. }
  1982. }
  1983. else {
  1984. /* shortage */
  1985. if (body_start) {
  1986. *body_start = p - input->str + 1;
  1987. }
  1988. return p - input->str;
  1989. }
  1990. }
  1991. else {
  1992. p++;
  1993. state = skip_char;
  1994. }
  1995. break;
  1996. }
  1997. }
  1998. if (state == got_linebreak_lf) {
  1999. if (body_start) {
  2000. /* \r\n\r\n */
  2001. *body_start = p - input->str;
  2002. }
  2003. return c - input->str;
  2004. }
  2005. return -1;
  2006. }
  2007. gint
  2008. rspamd_encode_hex_buf (const guchar *in, gsize inlen, gchar *out,
  2009. gsize outlen)
  2010. {
  2011. gchar *o, *end;
  2012. const guchar *p;
  2013. static const gchar hexdigests[16] = "0123456789abcdef";
  2014. end = out + outlen;
  2015. o = out;
  2016. p = in;
  2017. while (inlen > 0 && o < end - 1) {
  2018. *o++ = hexdigests[((*p >> 4) & 0xF)];
  2019. *o++ = hexdigests[((*p++) & 0xF)];
  2020. inlen --;
  2021. }
  2022. if (o <= end) {
  2023. return (o - out);
  2024. }
  2025. return -1;
  2026. }
  2027. gchar *
  2028. rspamd_encode_hex (const guchar *in, gsize inlen)
  2029. {
  2030. gchar *out;
  2031. gsize outlen = inlen * 2 + 1;
  2032. gint olen;
  2033. if (in == NULL) {
  2034. return NULL;
  2035. }
  2036. out = g_malloc (outlen);
  2037. olen = rspamd_encode_hex_buf (in, inlen, out, outlen - 1);
  2038. if (olen >= 0) {
  2039. out[olen] = '\0';
  2040. }
  2041. else {
  2042. g_free (out);
  2043. return NULL;
  2044. }
  2045. return out;
  2046. }
  2047. gssize
  2048. rspamd_decode_hex_buf (const gchar *in, gsize inlen,
  2049. guchar *out, gsize outlen)
  2050. {
  2051. guchar *o, *end, ret = 0;
  2052. const gchar *p;
  2053. gchar c;
  2054. end = out + outlen;
  2055. o = out;
  2056. p = in;
  2057. /* We ignore trailing chars if we have not even input */
  2058. inlen = inlen - inlen % 2;
  2059. while (inlen > 1 && o < end) {
  2060. c = *p++;
  2061. if (c >= '0' && c <= '9') ret = c - '0';
  2062. else if (c >= 'A' && c <= 'F') ret = c - 'A' + 10;
  2063. else if (c >= 'a' && c <= 'f') ret = c - 'a' + 10;
  2064. c = *p++;
  2065. ret *= 16;
  2066. if (c >= '0' && c <= '9') ret += c - '0';
  2067. else if (c >= 'A' && c <= 'F') ret += c - 'A' + 10;
  2068. else if (c >= 'a' && c <= 'f') ret += c - 'a' + 10;
  2069. *o++ = ret;
  2070. inlen -= 2;
  2071. }
  2072. if (o <= end) {
  2073. return (o - out);
  2074. }
  2075. return -1;
  2076. }
  2077. guchar*
  2078. rspamd_decode_hex (const gchar *in, gsize inlen)
  2079. {
  2080. guchar *out;
  2081. gsize outlen = (inlen / 2 + inlen % 2) + 1;
  2082. gint olen;
  2083. if (in == NULL) {
  2084. return NULL;
  2085. }
  2086. out = g_malloc (outlen);
  2087. olen = rspamd_decode_hex_buf (in, inlen, out, outlen - 1);
  2088. if (olen >= 0) {
  2089. out[olen] = '\0';
  2090. return out;
  2091. }
  2092. g_free (out);
  2093. return NULL;
  2094. }
  2095. gssize
  2096. rspamd_decode_qp_buf (const gchar *in, gsize inlen,
  2097. gchar *out, gsize outlen)
  2098. {
  2099. gchar *o, *end, *pos, c;
  2100. const gchar *p;
  2101. guchar ret;
  2102. gssize remain, processed;
  2103. p = in;
  2104. o = out;
  2105. end = out + outlen;
  2106. remain = inlen;
  2107. while (remain > 0 && o < end) {
  2108. if (*p == '=') {
  2109. remain --;
  2110. if (remain == 0) {
  2111. /* Last '=' character, bugon */
  2112. if (end - o > 0) {
  2113. *o++ = *p;
  2114. }
  2115. else {
  2116. /* Buffer overflow */
  2117. return (-1);
  2118. }
  2119. break;
  2120. }
  2121. p ++;
  2122. decode:
  2123. /* Decode character after '=' */
  2124. c = *p++;
  2125. remain --;
  2126. ret = 0;
  2127. if (c >= '0' && c <= '9') {
  2128. ret = c - '0';
  2129. }
  2130. else if (c >= 'A' && c <= 'F') {
  2131. ret = c - 'A' + 10;
  2132. }
  2133. else if (c >= 'a' && c <= 'f') {
  2134. ret = c - 'a' + 10;
  2135. }
  2136. else if (c == '\r') {
  2137. /* Eat one more endline */
  2138. if (remain > 0 && *p == '\n') {
  2139. p ++;
  2140. remain --;
  2141. }
  2142. continue;
  2143. }
  2144. else if (c == '\n') {
  2145. /* Soft line break */
  2146. continue;
  2147. }
  2148. else {
  2149. /* Hack, hack, hack, treat =<garbadge> as =<garbadge> */
  2150. if (end - o > 1) {
  2151. *o++ = '=';
  2152. *o++ = *(p - 1);
  2153. }
  2154. else {
  2155. return (-1);
  2156. }
  2157. continue;
  2158. }
  2159. if (remain > 0) {
  2160. c = *p++;
  2161. ret *= 16;
  2162. remain --;
  2163. if (c >= '0' && c <= '9') {
  2164. ret += c - '0';
  2165. }
  2166. else if (c >= 'A' && c <= 'F') {
  2167. ret += c - 'A' + 10;
  2168. }
  2169. else if (c >= 'a' && c <= 'f') {
  2170. ret += c - 'a' + 10;
  2171. }
  2172. else {
  2173. /* Treat =<good><rubbish> as =<good><rubbish> */
  2174. if (end - o > 2) {
  2175. *o++ = '=';
  2176. *o++ = *(p - 2);
  2177. *o++ = *(p - 1);
  2178. }
  2179. else {
  2180. return (-1);
  2181. }
  2182. continue;
  2183. }
  2184. if (end - o > 0) {
  2185. *o++ = (gchar)ret;
  2186. }
  2187. else {
  2188. return (-1);
  2189. }
  2190. }
  2191. }
  2192. else {
  2193. if (end - o >= remain) {
  2194. if ((pos = memccpy (o, p, '=', remain)) == NULL) {
  2195. /* All copied */
  2196. o += remain;
  2197. break;
  2198. }
  2199. else {
  2200. processed = pos - o;
  2201. remain -= processed;
  2202. p += processed;
  2203. if (remain > 0) {
  2204. o = pos - 1;
  2205. /*
  2206. * Skip comparison and jump inside decode branch,
  2207. * as we know that we have found match
  2208. */
  2209. goto decode;
  2210. }
  2211. else {
  2212. /* Last '=' character, bugon */
  2213. o = pos;
  2214. if (end - o > 0) {
  2215. *o = '=';
  2216. }
  2217. else {
  2218. /* Buffer overflow */
  2219. return (-1);
  2220. }
  2221. break;
  2222. }
  2223. }
  2224. }
  2225. else {
  2226. /* Buffer overflow */
  2227. return (-1);
  2228. }
  2229. }
  2230. }
  2231. return (o - out);
  2232. }
  2233. gssize
  2234. rspamd_decode_uue_buf (const gchar *in, gsize inlen,
  2235. gchar *out, gsize outlen)
  2236. {
  2237. gchar *o, *out_end;
  2238. const gchar *p;
  2239. gssize remain;
  2240. gboolean base64 = FALSE;
  2241. goffset pos;
  2242. const gchar *nline = "\r\n";
  2243. p = in;
  2244. o = out;
  2245. out_end = out + outlen;
  2246. remain = inlen;
  2247. /* Skip newlines */
  2248. #define SKIP_NEWLINE do { while (remain > 0 && (*p == '\n' || *p == '\r')) {p ++; remain --; } } while (0)
  2249. SKIP_NEWLINE;
  2250. /* First of all, we need to read the first line (and probably skip it) */
  2251. if (remain < sizeof ("begin-base64 ")) {
  2252. /* Obviously truncated */
  2253. return -1;
  2254. }
  2255. if (memcmp (p, "begin ", sizeof ("begin ") - 1) == 0) {
  2256. p += sizeof ("begin ") - 1;
  2257. remain -= sizeof ("begin ") - 1;
  2258. pos = rspamd_memcspn (p, nline, remain);
  2259. }
  2260. else if (memcmp (p, "begin-base64 ", sizeof ("begin-base64 ") - 1) == 0) {
  2261. base64 = TRUE;
  2262. p += sizeof ("begin-base64 ") - 1;
  2263. remain -= sizeof ("begin-base64 ") - 1;
  2264. pos = rspamd_memcspn (p, nline, remain);
  2265. }
  2266. else {
  2267. /* Crap */
  2268. return (-1);
  2269. }
  2270. if (pos == -1 || remain == 0) {
  2271. /* Crap */
  2272. return (-1);
  2273. }
  2274. #define DEC(c) (((c) - ' ') & 077) /* single character decode */
  2275. #define IS_DEC(c) ( (((c) - ' ') >= 0) && (((c) - ' ') <= 077 + 1) )
  2276. #define CHAR_OUT(c) do { if (o < out_end) { *o++ = c; } else { return (-1); } } while(0)
  2277. remain -= pos;
  2278. p = p + pos;
  2279. SKIP_NEWLINE;
  2280. if (base64) {
  2281. if (!rspamd_cryptobox_base64_decode (p,
  2282. remain,
  2283. out, &outlen)) {
  2284. return (-1);
  2285. }
  2286. return outlen;
  2287. }
  2288. while (remain > 0 && o < out_end) {
  2289. /* Main cycle */
  2290. const gchar *eol;
  2291. gint i, ch;
  2292. pos = rspamd_memcspn (p, nline, remain);
  2293. if (pos == 0) {
  2294. /* Skip empty lines */
  2295. SKIP_NEWLINE;
  2296. if (remain == 0) {
  2297. break;
  2298. }
  2299. }
  2300. eol = p + pos;
  2301. remain -= eol - p;
  2302. if ((i = DEC(*p)) <= 0) {
  2303. /* Last pos */
  2304. break;
  2305. }
  2306. /* i can be less than eol - p, it means uue padding which we ignore */
  2307. for (++p; i > 0 && p < eol; p += 4, i -= 3) {
  2308. if (i >= 3 && p + 3 < eol) {
  2309. /* Process 4 bytes of input */
  2310. if (!IS_DEC(*p)) {
  2311. return (-1);
  2312. }
  2313. if (!IS_DEC(*(p + 1))) {
  2314. return (-1);
  2315. }
  2316. if (!IS_DEC(*(p + 2))) {
  2317. return (-1);
  2318. }
  2319. if (!IS_DEC(*(p + 3))) {
  2320. return (-1);
  2321. }
  2322. ch = DEC(p[0]) << 2 | DEC(p[1]) >> 4;
  2323. CHAR_OUT(ch);
  2324. ch = DEC(p[1]) << 4 | DEC(p[2]) >> 2;
  2325. CHAR_OUT(ch);
  2326. ch = DEC(p[2]) << 6 | DEC(p[3]);
  2327. CHAR_OUT(ch);
  2328. }
  2329. else {
  2330. if (i >= 1 && p + 1 < eol) {
  2331. if (!IS_DEC(*p)) {
  2332. return (-1);
  2333. }
  2334. if (!IS_DEC(*(p + 1))) {
  2335. return (-1);
  2336. }
  2337. ch = DEC(p[0]) << 2 | DEC(p[1]) >> 4;
  2338. CHAR_OUT(ch);
  2339. }
  2340. if (i >= 2 && p + 2 < eol) {
  2341. if (!IS_DEC(*(p + 1))) {
  2342. return (-1);
  2343. }
  2344. if (!IS_DEC(*(p + 2))) {
  2345. return (-1);
  2346. }
  2347. ch = DEC(p[1]) << 4 | DEC(p[2]) >> 2;
  2348. CHAR_OUT(ch);
  2349. }
  2350. }
  2351. }
  2352. /* Skip newline */
  2353. p = eol;
  2354. SKIP_NEWLINE;
  2355. }
  2356. return (o - out);
  2357. }
  2358. #define BITOP(a,b,op) \
  2359. ((a)[(gsize)(b)/(8*sizeof *(a))] op (gsize)1<<((gsize)(b)%(8*sizeof *(a))))
  2360. gsize
  2361. rspamd_memcspn (const gchar *s, const gchar *e, gsize len)
  2362. {
  2363. gsize byteset[32 / sizeof(gsize)];
  2364. const gchar *p = s, *end = s + len;
  2365. if (!e[1]) {
  2366. for (; p < end && *p != *e; p++);
  2367. return p - s;
  2368. }
  2369. memset (byteset, 0, sizeof byteset);
  2370. for (; *e && BITOP (byteset, *(guchar *)e, |=); e++);
  2371. for (; p < end && !BITOP (byteset, *(guchar *)p, &); p++);
  2372. return p - s;
  2373. }
  2374. gsize
  2375. rspamd_memspn (const gchar *s, const gchar *e, gsize len)
  2376. {
  2377. gsize byteset[32 / sizeof(gsize)];
  2378. const gchar *p = s, *end = s + len;
  2379. if (!e[1]) {
  2380. for (; p < end && *p == *e; p++);
  2381. return p - s;
  2382. }
  2383. memset (byteset, 0, sizeof byteset);
  2384. for (; *e && BITOP (byteset, *(guchar *)e, |=); e++);
  2385. for (; p < end && BITOP (byteset, *(guchar *)p, &); p++);
  2386. return p - s;
  2387. }
  2388. gssize
  2389. rspamd_decode_qp2047_buf (const gchar *in, gsize inlen,
  2390. gchar *out, gsize outlen)
  2391. {
  2392. gchar *o, *end, c;
  2393. const gchar *p;
  2394. guchar ret;
  2395. gsize remain, processed;
  2396. p = in;
  2397. o = out;
  2398. end = out + outlen;
  2399. remain = inlen;
  2400. while (remain > 0 && o < end) {
  2401. if (*p == '=') {
  2402. p ++;
  2403. remain --;
  2404. if (remain == 0) {
  2405. if (end - o > 0) {
  2406. *o++ = *p;
  2407. break;
  2408. }
  2409. }
  2410. decode:
  2411. /* Decode character after '=' */
  2412. c = *p++;
  2413. remain --;
  2414. ret = 0;
  2415. if (c >= '0' && c <= '9') { ret = c - '0'; }
  2416. else if (c >= 'A' && c <= 'F') { ret = c - 'A' + 10; }
  2417. else if (c >= 'a' && c <= 'f') { ret = c - 'a' + 10; }
  2418. else if (c == '\r' || c == '\n') {
  2419. /* Soft line break */
  2420. while (remain > 0 && (*p == '\r' || *p == '\n')) {
  2421. remain --;
  2422. p ++;
  2423. }
  2424. continue;
  2425. }
  2426. if (remain > 0) {
  2427. c = *p++;
  2428. ret *= 16;
  2429. if (c >= '0' && c <= '9') { ret += c - '0'; }
  2430. else if (c >= 'A' && c <= 'F') { ret += c - 'A' + 10; }
  2431. else if (c >= 'a' && c <= 'f') { ret += c - 'a' + 10; }
  2432. if (end - o > 0) {
  2433. *o++ = (gchar)ret;
  2434. }
  2435. else {
  2436. return (-1);
  2437. }
  2438. remain --;
  2439. }
  2440. }
  2441. else {
  2442. if (end - o >= remain) {
  2443. processed = rspamd_memcspn (p, "=_", remain);
  2444. memcpy (o, p, processed);
  2445. o += processed;
  2446. if (processed == remain) {
  2447. break;
  2448. }
  2449. else {
  2450. remain -= processed;
  2451. p += processed;
  2452. if (G_LIKELY (*p == '=')) {
  2453. p ++;
  2454. /* Skip comparison, as we know that we have found match */
  2455. remain --;
  2456. goto decode;
  2457. }
  2458. else {
  2459. *o++ = ' ';
  2460. p ++;
  2461. remain --;
  2462. }
  2463. }
  2464. }
  2465. else {
  2466. /* Buffer overflow */
  2467. return (-1);
  2468. }
  2469. }
  2470. }
  2471. return (o - out);
  2472. }
  2473. gssize
  2474. rspamd_encode_qp2047_buf (const gchar *in, gsize inlen,
  2475. gchar *out, gsize outlen)
  2476. {
  2477. gchar *o = out, *end = out + outlen, c;
  2478. static const gchar hexdigests[16] = "0123456789ABCDEF";
  2479. while (inlen > 0 && o < end) {
  2480. c = *in;
  2481. if (g_ascii_isalnum (c)) {
  2482. *o++ = c;
  2483. }
  2484. else if (c == ' ') {
  2485. *o++ = '_';
  2486. }
  2487. else if (end - o >= 3){
  2488. *o++ = '=';
  2489. *o++ = hexdigests[((c >> 4) & 0xF)];
  2490. *o++ = hexdigests[(c & 0xF)];
  2491. }
  2492. else {
  2493. return (-1);
  2494. }
  2495. in ++;
  2496. inlen --;
  2497. }
  2498. if (inlen != 0) {
  2499. return (-1);
  2500. }
  2501. return (o - out);
  2502. }
  2503. /*
  2504. * GString ucl emitting functions
  2505. */
  2506. static int
  2507. rspamd_gstring_append_character (unsigned char c, size_t len, void *ud)
  2508. {
  2509. GString *buf = ud;
  2510. gsize old_len;
  2511. if (len == 1) {
  2512. g_string_append_c (buf, c);
  2513. }
  2514. else {
  2515. if (buf->allocated_len - buf->len <= len) {
  2516. old_len = buf->len;
  2517. g_string_set_size (buf, buf->len + len + 1);
  2518. buf->len = old_len;
  2519. }
  2520. memset (&buf->str[buf->len], c, len);
  2521. buf->len += len;
  2522. }
  2523. return 0;
  2524. }
  2525. static int
  2526. rspamd_gstring_append_len (const unsigned char *str, size_t len, void *ud)
  2527. {
  2528. GString *buf = ud;
  2529. g_string_append_len (buf, str, len);
  2530. return 0;
  2531. }
  2532. static int
  2533. rspamd_gstring_append_int (int64_t val, void *ud)
  2534. {
  2535. GString *buf = ud;
  2536. rspamd_printf_gstring (buf, "%L", (intmax_t) val);
  2537. return 0;
  2538. }
  2539. static int
  2540. rspamd_gstring_append_double (double val, void *ud)
  2541. {
  2542. GString *buf = ud;
  2543. const double delta = 0.0000001;
  2544. if (isfinite (val)) {
  2545. if (val == (double) (int) val) {
  2546. rspamd_printf_gstring (buf, "%.1f", val);
  2547. } else if (fabs (val - (double) (int) val) < delta) {
  2548. /* Write at maximum precision */
  2549. rspamd_printf_gstring (buf, "%.*g", DBL_DIG, val);
  2550. } else {
  2551. rspamd_printf_gstring (buf, "%f", val);
  2552. }
  2553. }
  2554. else {
  2555. rspamd_printf_gstring (buf, "null");
  2556. }
  2557. return 0;
  2558. }
  2559. void
  2560. rspamd_ucl_emit_gstring_comments (const ucl_object_t *obj,
  2561. enum ucl_emitter emit_type,
  2562. GString *target,
  2563. const ucl_object_t *comments)
  2564. {
  2565. struct ucl_emitter_functions func = {
  2566. .ucl_emitter_append_character = rspamd_gstring_append_character,
  2567. .ucl_emitter_append_len = rspamd_gstring_append_len,
  2568. .ucl_emitter_append_int = rspamd_gstring_append_int,
  2569. .ucl_emitter_append_double = rspamd_gstring_append_double
  2570. };
  2571. func.ud = target;
  2572. ucl_object_emit_full (obj, emit_type, &func, comments);
  2573. }
  2574. /*
  2575. * FString ucl emitting functions
  2576. */
  2577. static int
  2578. rspamd_fstring_emit_append_character (unsigned char c, size_t len, void *ud)
  2579. {
  2580. rspamd_fstring_t **buf = ud;
  2581. *buf = rspamd_fstring_append_chars (*buf, c, len);
  2582. return 0;
  2583. }
  2584. static int
  2585. rspamd_fstring_emit_append_len (const unsigned char *str, size_t len, void *ud)
  2586. {
  2587. rspamd_fstring_t **buf = ud;
  2588. *buf = rspamd_fstring_append (*buf, str, len);
  2589. return 0;
  2590. }
  2591. static int
  2592. rspamd_fstring_emit_append_int (int64_t val, void *ud)
  2593. {
  2594. rspamd_fstring_t **buf = ud;
  2595. rspamd_printf_fstring (buf, "%L", (intmax_t) val);
  2596. return 0;
  2597. }
  2598. static int
  2599. rspamd_fstring_emit_append_double (double val, void *ud)
  2600. {
  2601. rspamd_fstring_t **buf = ud;
  2602. #define MAX_PRECISION 6
  2603. if (isfinite (val)) {
  2604. if (val == (double) ((gint) val)) {
  2605. rspamd_printf_fstring (buf, "%.1f", val);
  2606. } else {
  2607. rspamd_printf_fstring (buf, "%." G_STRINGIFY (MAX_PRECISION) "f",
  2608. val);
  2609. }
  2610. }
  2611. else {
  2612. rspamd_printf_fstring (buf, "null");
  2613. }
  2614. return 0;
  2615. }
  2616. void
  2617. rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
  2618. enum ucl_emitter emit_type,
  2619. rspamd_fstring_t **buf,
  2620. const ucl_object_t *comments)
  2621. {
  2622. struct ucl_emitter_functions func = {
  2623. .ucl_emitter_append_character = rspamd_fstring_emit_append_character,
  2624. .ucl_emitter_append_len = rspamd_fstring_emit_append_len,
  2625. .ucl_emitter_append_int = rspamd_fstring_emit_append_int,
  2626. .ucl_emitter_append_double = rspamd_fstring_emit_append_double
  2627. };
  2628. func.ud = buf;
  2629. ucl_object_emit_full (obj, emit_type, &func, comments);
  2630. }
  2631. #ifndef HAVE_MEMRCHR
  2632. void *
  2633. rspamd_memrchr (const void *m, gint c, gsize len)
  2634. {
  2635. const guint8 *p = m;
  2636. for (gsize i = len; i > 0; i --) {
  2637. if (p[i - 1] == c) {
  2638. return (void *)(p + i - 1);
  2639. }
  2640. }
  2641. return NULL;
  2642. }
  2643. #endif
  2644. struct UConverter *
  2645. rspamd_get_utf8_converter (void)
  2646. {
  2647. static UConverter *utf8_conv = NULL;
  2648. UErrorCode uc_err = U_ZERO_ERROR;
  2649. if (utf8_conv == NULL) {
  2650. utf8_conv = ucnv_open ("UTF-8", &uc_err);
  2651. if (!U_SUCCESS (uc_err)) {
  2652. msg_err ("FATAL error: cannot open converter for utf8: %s",
  2653. u_errorName (uc_err));
  2654. g_assert_not_reached ();
  2655. }
  2656. ucnv_setFromUCallBack (utf8_conv,
  2657. UCNV_FROM_U_CALLBACK_SUBSTITUTE,
  2658. NULL,
  2659. NULL,
  2660. NULL,
  2661. &uc_err);
  2662. ucnv_setToUCallBack (utf8_conv,
  2663. UCNV_TO_U_CALLBACK_SUBSTITUTE,
  2664. NULL,
  2665. NULL,
  2666. NULL,
  2667. &uc_err);
  2668. }
  2669. return utf8_conv;
  2670. }
  /**
   * Return the ICU normalizer obtained from unorm2_getInstance with the
   * "nfkc" data and UNORM2_COMPOSE mode; the instance is looked up on the
   * first call and cached in a function-local static.
   *
   * @return the normalizer, or NULL when built against ICU older than 44
   */
  const struct UNormalizer2 *
  rspamd_get_unicode_normalizer (void)
  {
  #if U_ICU_VERSION_MAJOR_NUM >= 44
      static const UNormalizer2 *norm = NULL;

      if (norm == NULL) {
          UErrorCode uc_err = U_ZERO_ERROR;

          norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
          g_assert (U_SUCCESS (uc_err));
      }

      return norm;
  #else
      /* Old libicu */
      return NULL;
  #endif
  }
  2687. gchar *
  2688. rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
  2689. gsize *dst_len, enum rspamd_regexp_escape_flags flags)
  2690. {
  2691. const gchar *p, *end = pattern + slen;
  2692. gchar *res, *d, t, *tmp_utf = NULL, *dend;
  2693. gsize len;
  2694. static const gchar hexdigests[16] = "0123456789abcdef";
  2695. len = 0;
  2696. p = pattern;
  2697. /* [-[\]{}()*+?.,\\^$|#\s] need to be escaped */
  2698. while (p < end) {
  2699. t = *p ++;
  2700. switch (t) {
  2701. case '[':
  2702. case ']':
  2703. case '-':
  2704. case '\\':
  2705. case '{':
  2706. case '}':
  2707. case '(':
  2708. case ')':
  2709. case '*':
  2710. case '+':
  2711. case '?':
  2712. case '.':
  2713. case ',':
  2714. case '^':
  2715. case '$':
  2716. case '|':
  2717. case '#':
  2718. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2719. len++;
  2720. }
  2721. break;
  2722. default:
  2723. if (g_ascii_isspace (t)) {
  2724. len ++;
  2725. }
  2726. else {
  2727. if (!g_ascii_isprint (t) || (t & 0x80)) {
  2728. if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
  2729. /* \x{code}, where code can be up to 5 digits */
  2730. len += 4;
  2731. }
  2732. else {
  2733. /* \\xHH -> 4 symbols */
  2734. len += 3;
  2735. }
  2736. }
  2737. }
  2738. break;
  2739. }
  2740. }
  2741. if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
  2742. if (rspamd_fast_utf8_validate (pattern, slen) != 0) {
  2743. tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL, NULL);
  2744. }
  2745. }
  2746. if (len == 0) {
  2747. /* No need to escape anything */
  2748. if (dst_len) {
  2749. *dst_len = slen;
  2750. }
  2751. if (tmp_utf) {
  2752. return tmp_utf;
  2753. }
  2754. else {
  2755. return g_strdup (pattern);
  2756. }
  2757. }
  2758. /* Escape logic */
  2759. if (tmp_utf) {
  2760. pattern = tmp_utf;
  2761. }
  2762. len = slen + len;
  2763. res = g_malloc (len + 1);
  2764. p = pattern;
  2765. d = res;
  2766. dend = d + len;
  2767. while (p < end) {
  2768. g_assert (d < dend);
  2769. t = *p ++;
  2770. switch (t) {
  2771. case '[':
  2772. case ']':
  2773. case '-':
  2774. case '\\':
  2775. case '{':
  2776. case '}':
  2777. case '(':
  2778. case ')':
  2779. case '.':
  2780. case ',':
  2781. case '^':
  2782. case '$':
  2783. case '|':
  2784. case '#':
  2785. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2786. *d++ = '\\';
  2787. }
  2788. break;
  2789. case '*':
  2790. case '?':
  2791. case '+':
  2792. if (flags & RSPAMD_REGEXP_ESCAPE_GLOB) {
  2793. /* Treat * as .* and ? as .? */
  2794. *d++ = '.';
  2795. }
  2796. else {
  2797. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2798. *d++ = '\\';
  2799. }
  2800. }
  2801. break;
  2802. default:
  2803. if (g_ascii_isspace (t)) {
  2804. if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
  2805. *d++ = '\\';
  2806. }
  2807. }
  2808. else if (t & 0x80 || !g_ascii_isprint (t)) {
  2809. if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
  2810. *d++ = '\\';
  2811. *d++ = 'x';
  2812. *d++ = hexdigests[((t >> 4) & 0xF)];
  2813. *d++ = hexdigests[((t) & 0xF)];
  2814. continue; /* To avoid *d++ = t; */
  2815. }
  2816. else {
  2817. if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) {
  2818. UChar32 uc;
  2819. gint32 off = p - pattern - 1;
  2820. U8_NEXT (pattern, off, slen, uc);
  2821. if (uc > 0) {
  2822. d += rspamd_snprintf (d, dend - d,
  2823. "\\x{%xd}", uc);
  2824. p = pattern + off;
  2825. }
  2826. continue; /* To avoid *d++ = t; */
  2827. }
  2828. }
  2829. }
  2830. break;
  2831. }
  2832. *d++ = t;
  2833. }
  2834. *d = '\0';
  2835. if (dst_len) {
  2836. *dst_len = d - res;
  2837. }
  2838. if (tmp_utf) {
  2839. g_free (tmp_utf);
  2840. }
  2841. return res;
  2842. }
  2843. gchar *
  2844. rspamd_str_make_utf_valid (const guchar *src, gsize slen,
  2845. gsize *dstlen,
  2846. rspamd_mempool_t *pool)
  2847. {
  2848. UChar32 uc;
  2849. goffset err_offset;
  2850. const guchar *p;
  2851. gchar *dst, *d;
  2852. gsize remain = slen, dlen = 0;
  2853. if (src == NULL) {
  2854. return NULL;
  2855. }
  2856. if (slen == 0) {
  2857. if (dstlen) {
  2858. *dstlen = 0;
  2859. }
  2860. return pool ? rspamd_mempool_strdup (pool, "") : g_strdup ("");
  2861. }
  2862. p = src;
  2863. dlen = slen + 1; /* As we add '\0' */
  2864. /* Check space required */
  2865. while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain)) > 0) {
  2866. gint i = 0;
  2867. err_offset --; /* As it returns it 1 indexed */
  2868. p += err_offset;
  2869. remain -= err_offset;
  2870. dlen += err_offset;
  2871. /* Each invalid character of input requires 3 bytes of output (+2 bytes) */
  2872. while (i < remain) {
  2873. U8_NEXT (p, i, remain, uc);
  2874. if (uc < 0) {
  2875. dlen += 2;
  2876. }
  2877. else {
  2878. break;
  2879. }
  2880. }
  2881. p += i;
  2882. remain -= i;
  2883. }
  2884. if (pool) {
  2885. dst = rspamd_mempool_alloc (pool, dlen + 1);
  2886. }
  2887. else {
  2888. dst = g_malloc (dlen + 1);
  2889. }
  2890. p = src;
  2891. d = dst;
  2892. remain = slen;
  2893. while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain)) > 0) {
  2894. /* Copy valid */
  2895. err_offset --; /* As it returns it 1 indexed */
  2896. memcpy (d, p, err_offset);
  2897. d += err_offset;
  2898. /* Append 0xFFFD for each bad character */
  2899. gint i = 0;
  2900. p += err_offset;
  2901. remain -= err_offset;
  2902. while (i < remain) {
  2903. gint old_i = i;
  2904. U8_NEXT (p, i, remain, uc);
  2905. if (uc < 0) {
  2906. *d++ = '\357';
  2907. *d++ = '\277';
  2908. *d++ = '\275';
  2909. }
  2910. else {
  2911. /* Adjust p and remaining stuff and go to the outer cycle */
  2912. i = old_i;
  2913. break;
  2914. }
  2915. }
  2916. /*
  2917. * Now p is the first valid utf8 character and remain is the rest of the string
  2918. * so we can continue our loop
  2919. */
  2920. p += i;
  2921. remain -= i;
  2922. }
  2923. if (err_offset == 0 && remain > 0) {
  2924. /* Last piece */
  2925. memcpy (d, p, remain);
  2926. d += remain;
  2927. }
  2928. /* Last '\0' */
  2929. g_assert (dlen > d - dst);
  2930. *d = '\0';
  2931. if (dstlen) {
  2932. *dstlen = d - dst;
  2933. }
  2934. return dst;
  2935. }
  2936. gsize
  2937. rspamd_gstring_strip (GString *s, const gchar *strip_chars)
  2938. {
  2939. const gchar *p, *sc;
  2940. gsize strip_len = 0, total = 0;
  2941. p = s->str + s->len - 1;
  2942. while (p >= s->str) {
  2943. gboolean seen = FALSE;
  2944. sc = strip_chars;
  2945. while (*sc != '\0') {
  2946. if (*p == *sc) {
  2947. strip_len ++;
  2948. seen = TRUE;
  2949. break;
  2950. }
  2951. sc ++;
  2952. }
  2953. if (!seen) {
  2954. break;
  2955. }
  2956. p --;
  2957. }
  2958. if (strip_len > 0) {
  2959. s->len -= strip_len;
  2960. s->str[s->len] = '\0';
  2961. total += strip_len;
  2962. }
  2963. if (s->len > 0) {
  2964. strip_len = rspamd_memspn (s->str, strip_chars, s->len);
  2965. if (strip_len > 0) {
  2966. memmove (s->str, s->str + strip_len, s->len - strip_len);
  2967. s->len -= strip_len;
  2968. total += strip_len;
  2969. }
  2970. }
  2971. return total;
  2972. }
  2973. const gchar* rspamd_string_len_strip (const gchar *in,
  2974. gsize *len,
  2975. const gchar *strip_chars)
  2976. {
  2977. const gchar *p, *sc;
  2978. gsize strip_len = 0, old_len = *len;
  2979. p = in + old_len - 1;
  2980. /* Trail */
  2981. while (p >= in) {
  2982. gboolean seen = FALSE;
  2983. sc = strip_chars;
  2984. while (*sc != '\0') {
  2985. if (*p == *sc) {
  2986. strip_len ++;
  2987. seen = TRUE;
  2988. break;
  2989. }
  2990. sc ++;
  2991. }
  2992. if (!seen) {
  2993. break;
  2994. }
  2995. p --;
  2996. }
  2997. if (strip_len > 0) {
  2998. *len -= strip_len;
  2999. }
  3000. /* Head */
  3001. old_len = *len;
  3002. if (old_len > 0) {
  3003. strip_len = rspamd_memspn (in, strip_chars, old_len);
  3004. if (strip_len > 0) {
  3005. *len -= strip_len;
  3006. return in + strip_len;
  3007. }
  3008. }
  3009. return in;
  3010. }
  3011. gchar **
  3012. rspamd_string_len_split (const gchar *in, gsize len, const gchar *spill,
  3013. gint max_elts, rspamd_mempool_t *pool)
  3014. {
  3015. const gchar *p = in, *end = in + len;
  3016. gsize detected_elts = 0;
  3017. gchar **res;
  3018. /* Detect number of elements */
  3019. while (p < end) {
  3020. gsize cur_fragment = rspamd_memcspn (p, spill, end - p);
  3021. if (cur_fragment > 0) {
  3022. detected_elts ++;
  3023. p += cur_fragment;
  3024. if (max_elts > 0 && detected_elts >= max_elts) {
  3025. break;
  3026. }
  3027. }
  3028. /* Something like a,,b produces {'a', 'b'} not {'a', '', 'b'} */
  3029. p += rspamd_memspn (p, spill, end - p);
  3030. }
  3031. res = pool ?
  3032. rspamd_mempool_alloc (pool, sizeof (gchar *) * (detected_elts + 1)) :
  3033. g_malloc (sizeof (gchar *) * (detected_elts + 1));
  3034. /* Last one */
  3035. res[detected_elts] = NULL;
  3036. detected_elts = 0;
  3037. p = in;
  3038. while (p < end) {
  3039. gsize cur_fragment = rspamd_memcspn (p, spill, end - p);
  3040. if (cur_fragment > 0) {
  3041. gchar *elt;
  3042. elt = pool ?
  3043. rspamd_mempool_alloc (pool, cur_fragment + 1) :
  3044. g_malloc (cur_fragment + 1);
  3045. memcpy (elt, p, cur_fragment);
  3046. elt[cur_fragment] = '\0';
  3047. res[detected_elts ++] = elt;
  3048. p += cur_fragment;
  3049. if (max_elts > 0 && detected_elts >= max_elts) {
  3050. break;
  3051. }
  3052. }
  3053. p += rspamd_memspn (p, spill, end - p);
  3054. }
  3055. return res;
  3056. }
  3057. #if defined(__x86_64__)
  3058. #include <x86intrin.h>
  3059. #endif
  3060. static inline gboolean
  3061. rspamd_str_has_8bit_u64 (const guchar *beg, gsize len)
  3062. {
  3063. guint8 orb = 0;
  3064. if (len >= 16) {
  3065. const guchar *nextd = beg + sizeof(guint64);
  3066. guint64 n1 = 0, n2 = 0;
  3067. do {
  3068. guint64 t;
  3069. memcpy(&t, beg, sizeof(t));
  3070. n1 |= t;
  3071. memcpy(&t, nextd, sizeof(t));
  3072. n2 |= t;
  3073. beg += 16;
  3074. nextd += 16;
  3075. len -= 16;
  3076. } while (len >= 16);
  3077. /*
  3078. * Idea from Benny Halevy <bhalevy@scylladb.com>
  3079. * - 7-th bit set ==> orb = !(non-zero) - 1 = 0 - 1 = 0xFF
  3080. * - 7-th bit clear ==> orb = !0 - 1 = 1 - 1 = 0x00
  3081. */
  3082. orb = !((n1 | n2) & 0x8080808080808080ULL) - 1;
  3083. }
  3084. while (len--) {
  3085. orb |= *beg++;
  3086. }
  3087. return orb >= 0x80;
  3088. }
  3089. gboolean
  3090. rspamd_str_has_8bit (const guchar *beg, gsize len)
  3091. {
  3092. #if defined(__x86_64__)
  3093. if (len >= 32) {
  3094. const uint8_t *nextd = beg + 16;
  3095. __m128i n1 = _mm_set1_epi8 (0), n2;
  3096. n2 = n1;
  3097. while (len >= 32) {
  3098. __m128i xmm1 = _mm_loadu_si128 ((const __m128i *)beg);
  3099. __m128i xmm2 = _mm_loadu_si128 ((const __m128i *)nextd);
  3100. n1 = _mm_or_si128 (n1, xmm1);
  3101. n2 = _mm_or_si128 (n2, xmm2);
  3102. beg += 32;
  3103. nextd += 32;
  3104. len -= 32;
  3105. }
  3106. n1 = _mm_or_si128 (n1, n2);
  3107. /* We assume 2 complement here */
  3108. if (_mm_movemask_epi8 (n1)) {
  3109. return TRUE;
  3110. }
  3111. }
  3112. #endif
  3113. return rspamd_str_has_8bit_u64 (beg, len);
  3114. }