You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

mime_encoding.c 20KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "libutil/mem_pool.h"
  18. #include "libutil/regexp.h"
  19. #include "libutil/hash.h"
  20. #include "libserver/cfg_file.h"
  21. #include "libserver/task.h"
  22. #include "mime_encoding.h"
  23. #include "message.h"
  24. #include "contrib/fastutf8/fastutf8.h"
  25. #include "contrib/google-ced/ced_c.h"
  26. #include <unicode/ucnv.h>
  27. #if U_ICU_VERSION_MAJOR_NUM >= 44
  28. #include <unicode/unorm2.h>
  29. #endif
  30. #include <math.h>
  /* Canonical charset name returned whenever the input is treated as UTF-8 */
  #define UTF8_CHARSET "UTF-8"

  /*
   * Charset flag bits.
   * NOTE(review): presumably stored in rspamd_charset_substitution.flags by
   * the table in mime_encoding_list.h - confirm against that header.
   */
  #define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
  #define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)

  /* Maximum number of converters kept in the LRU converter cache */
  #define RSPAMD_CHARSET_CACHE_SIZE 32

  /* Size in bytes of each sample passed to the content based detector */
  #define RSPAMD_CHARSET_MAX_CONTENT 512

  /* Clear or set the "content is UTF-8" flag of a text part */
  #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  38. static rspamd_regexp_t *utf_compatible_re = NULL;
  39. struct rspamd_charset_substitution {
  40. const gchar *input;
  41. const gchar *canon;
  42. gint flags;
  43. };
  44. #include "mime_encoding_list.h"
  45. static GHashTable *sub_hash = NULL;
  46. static const UChar iso_8859_16_map[] = {
  47. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  48. 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
  49. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  50. 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
  51. 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
  52. 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
  53. 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
  54. 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
  55. 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
  56. 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
  57. 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
  58. 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
  59. 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
  60. 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
  61. 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
  62. 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
  63. };
  64. struct rspamd_charset_converter {
  65. gchar *canon_name;
  66. union {
  67. UConverter *conv;
  68. const UChar *cnv_table;
  69. } d;
  70. gboolean is_internal;
  71. };
  72. static GQuark
  73. rspamd_charset_conv_error_quark (void)
  74. {
  75. return g_quark_from_static_string ("charset conversion error");
  76. }
  77. static void
  78. rspamd_converter_dtor (gpointer p)
  79. {
  80. struct rspamd_charset_converter *c = (struct rspamd_charset_converter *)p;
  81. if (!c->is_internal) {
  82. ucnv_close (c->d.conv);
  83. }
  84. g_free (c->canon_name);
  85. g_free (c);
  86. }
  87. int32_t
  88. rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
  89. UChar *dest,
  90. int32_t destCapacity,
  91. const char *src,
  92. int32_t srcLength,
  93. UErrorCode *pErrorCode)
  94. {
  95. if (!cnv->is_internal) {
  96. return ucnv_toUChars (cnv->d.conv,
  97. dest, destCapacity,
  98. src, srcLength,
  99. pErrorCode);
  100. }
  101. else {
  102. UChar *d = dest, *dend = dest + destCapacity;
  103. const guchar *p = src, *end = src + srcLength;
  104. while (p < end && d < dend) {
  105. if (*p <= 127) {
  106. *d++ = (UChar)*p;
  107. }
  108. else {
  109. *d++ = cnv->d.cnv_table[*p - 128];
  110. }
  111. p ++;
  112. }
  113. return d - dest;
  114. }
  115. }
  116. struct rspamd_charset_converter *
  117. rspamd_mime_get_converter_cached (const gchar *enc,
  118. rspamd_mempool_t *pool,
  119. gboolean is_canon,
  120. UErrorCode *err)
  121. {
  122. const gchar *canon_name;
  123. static rspamd_lru_hash_t *cache;
  124. struct rspamd_charset_converter *conv;
  125. if (cache == NULL) {
  126. cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, NULL,
  127. rspamd_converter_dtor, rspamd_str_hash,
  128. rspamd_str_equal);
  129. }
  130. if (enc == NULL) {
  131. return NULL;
  132. }
  133. if (!is_canon) {
  134. rspamd_ftok_t cset_tok;
  135. RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
  136. canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
  137. }
  138. else {
  139. canon_name = enc;
  140. }
  141. if (canon_name == NULL) {
  142. return NULL;
  143. }
  144. conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0);
  145. if (conv == NULL) {
  146. if (!(strcmp (canon_name, "ISO-8859-16") == 0 ||
  147. strcmp (canon_name, "latin10") == 0 ||
  148. strcmp (canon_name, "iso-ir-226") == 0)) {
  149. conv = g_malloc0 (sizeof (*conv));
  150. conv->d.conv = ucnv_open (canon_name, err);
  151. conv->canon_name = g_strdup (canon_name);
  152. if (conv->d.conv != NULL) {
  153. ucnv_setToUCallBack (conv->d.conv,
  154. UCNV_TO_U_CALLBACK_SUBSTITUTE,
  155. NULL,
  156. NULL,
  157. NULL,
  158. err);
  159. rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
  160. }
  161. else {
  162. g_free (conv);
  163. conv = NULL;
  164. }
  165. }
  166. else {
  167. /* ISO-8859-16 */
  168. conv = g_malloc0 (sizeof (*conv));
  169. conv->is_internal = TRUE;
  170. conv->d.cnv_table = iso_8859_16_map;
  171. conv->canon_name = g_strdup (canon_name);
  172. rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
  173. }
  174. }
  175. return conv;
  176. }
  177. static void
  178. rspamd_mime_encoding_substitute_init (void)
  179. {
  180. guint i;
  181. sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);
  182. for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
  183. g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
  184. }
  185. }
  186. static void
  187. rspamd_charset_normalize (gchar *in)
  188. {
  189. /*
  190. * This is a simple routine to validate input charset
  191. * we just check that charset starts with alphanumeric and ends
  192. * with alphanumeric
  193. */
  194. gchar *begin, *end;
  195. gboolean changed = FALSE;
  196. begin = in;
  197. while (*begin && !g_ascii_isalnum (*begin)) {
  198. begin ++;
  199. changed = TRUE;
  200. }
  201. end = begin + strlen (begin) - 1;
  202. while (end > begin && !g_ascii_isalnum (*end)) {
  203. end --;
  204. changed = TRUE;
  205. }
  206. if (changed) {
  207. memmove (in, begin, end - begin + 2);
  208. *(end + 1) = '\0';
  209. }
  210. }
  211. const gchar *
  212. rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
  213. {
  214. gchar *ret = NULL, *h, *t;
  215. struct rspamd_charset_substitution *s;
  216. const gchar *cset;
  217. rspamd_ftok_t utf8_tok;
  218. UErrorCode uc_err = U_ZERO_ERROR;
  219. if (sub_hash == NULL) {
  220. rspamd_mime_encoding_substitute_init ();
  221. }
  222. /* Fast path */
  223. RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf-8");
  224. if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
  225. return UTF8_CHARSET;
  226. }
  227. RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf8");
  228. if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
  229. return UTF8_CHARSET;
  230. }
  231. ret = rspamd_mempool_ftokdup (pool, in);
  232. rspamd_charset_normalize (ret);
  233. if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) ||
  234. (in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) {
  235. /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
  236. h = ret;
  237. t = ret;
  238. while (*h != '\0') {
  239. if (*h != '-') {
  240. *t++ = *h;
  241. }
  242. h ++;
  243. }
  244. *t = '\0';
  245. }
  246. s = g_hash_table_lookup (sub_hash, ret);
  247. if (s) {
  248. ret = (char *)s->canon;
  249. }
  250. /* Try different aliases */
  251. cset = ucnv_getCanonicalName (ret, "MIME", &uc_err);
  252. if (cset == NULL) {
  253. uc_err = U_ZERO_ERROR;
  254. cset = ucnv_getCanonicalName (ret, "IANA", &uc_err);
  255. }
  256. if (cset == NULL) {
  257. uc_err = U_ZERO_ERROR;
  258. cset = ucnv_getCanonicalName (ret, "", &uc_err);
  259. }
  260. if (cset == NULL) {
  261. uc_err = U_ZERO_ERROR;
  262. cset = ucnv_getAlias (ret, 0, &uc_err);
  263. }
  264. return cset;
  265. }
  266. gchar *
  267. rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
  268. gchar *input, gsize len, const gchar *in_enc,
  269. gsize *olen, GError **err)
  270. {
  271. gchar *d;
  272. gint32 r, clen, dlen;
  273. UChar *tmp_buf;
  274. UErrorCode uc_err = U_ZERO_ERROR;
  275. UConverter *utf8_converter;
  276. struct rspamd_charset_converter *conv;
  277. rspamd_ftok_t cset_tok;
  278. /* Check if already utf8 */
  279. RSPAMD_FTOK_FROM_STR (&cset_tok, in_enc);
  280. if (rspamd_mime_charset_utf_check (&cset_tok, input, len,
  281. FALSE)) {
  282. d = rspamd_mempool_alloc (pool, len);
  283. memcpy (d, input, len);
  284. if (olen) {
  285. *olen = len;
  286. }
  287. return d;
  288. }
  289. conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
  290. utf8_converter = rspamd_get_utf8_converter ();
  291. if (conv == NULL) {
  292. g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
  293. "cannot open converter for %s: %s",
  294. in_enc, u_errorName (uc_err));
  295. return NULL;
  296. }
  297. tmp_buf = g_new (UChar, len + 1);
  298. uc_err = U_ZERO_ERROR;
  299. r = rspamd_converter_to_uchars (conv, tmp_buf, len + 1, input, len, &uc_err);
  300. if (!U_SUCCESS (uc_err)) {
  301. g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
  302. "cannot convert data to unicode from %s: %s",
  303. in_enc, u_errorName (uc_err));
  304. g_free (tmp_buf);
  305. return NULL;
  306. }
  307. /* Now, convert to utf8 */
  308. clen = ucnv_getMaxCharSize (utf8_converter);
  309. dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
  310. d = rspamd_mempool_alloc (pool, dlen);
  311. r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
  312. if (!U_SUCCESS (uc_err)) {
  313. g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
  314. "cannot convert data from unicode from %s: %s",
  315. in_enc, u_errorName (uc_err));
  316. g_free (tmp_buf);
  317. return NULL;
  318. }
  319. msg_debug_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d",
  320. in_enc, len, r);
  321. g_free (tmp_buf);
  322. if (olen) {
  323. *olen = r;
  324. }
  325. return d;
  326. }
  327. static gboolean
  328. rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
  329. struct rspamd_mime_text_part *text_part,
  330. GByteArray *input,
  331. const gchar *charset,
  332. GError **err)
  333. {
  334. gchar *d;
  335. gint32 r, clen, dlen, uc_len;
  336. UChar *tmp_buf;
  337. UErrorCode uc_err = U_ZERO_ERROR;
  338. UConverter *utf8_converter;
  339. struct rspamd_charset_converter *conv;
  340. conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
  341. TRUE, &uc_err);
  342. utf8_converter = rspamd_get_utf8_converter ();
  343. if (conv == NULL) {
  344. g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
  345. "cannot open converter for %s: %s",
  346. charset, u_errorName (uc_err));
  347. return FALSE;
  348. }
  349. tmp_buf = g_new (UChar, input->len + 1);
  350. uc_err = U_ZERO_ERROR;
  351. uc_len = rspamd_converter_to_uchars (conv,
  352. tmp_buf,
  353. input->len + 1,
  354. input->data,
  355. input->len,
  356. &uc_err);
  357. if (!U_SUCCESS (uc_err)) {
  358. g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
  359. "cannot convert data to unicode from %s: %s",
  360. charset, u_errorName (uc_err));
  361. g_free (tmp_buf);
  362. return FALSE;
  363. }
  364. /* Now, convert to utf8 */
  365. clen = ucnv_getMaxCharSize (utf8_converter);
  366. dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
  367. d = rspamd_mempool_alloc (task->task_pool, dlen);
  368. r = ucnv_fromUChars (utf8_converter, d, dlen,
  369. tmp_buf, uc_len, &uc_err);
  370. if (!U_SUCCESS (uc_err)) {
  371. g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
  372. "cannot convert data from unicode from %s: %s",
  373. charset, u_errorName (uc_err));
  374. g_free (tmp_buf);
  375. return FALSE;
  376. }
  377. if (text_part->mime_part && text_part->mime_part->ct) {
  378. msg_info_task ("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
  379. charset, &text_part->mime_part->ct->charset, input->len, r, uc_len);
  380. }
  381. else {
  382. msg_info_task ("converted text part from %s (no charset announced) to UTF-8 inlen: %d, "
  383. "outlen: %d (%d UTF16 chars)",
  384. charset, input->len, r, uc_len);
  385. }
  386. text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
  387. sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
  388. text_part->utf_raw_content->data = d;
  389. text_part->utf_raw_content->len = r;
  390. g_free (tmp_buf);
  391. return TRUE;
  392. }
  393. gboolean
  394. rspamd_mime_to_utf8_byte_array (GByteArray *in,
  395. GByteArray *out,
  396. rspamd_mempool_t *pool,
  397. const gchar *enc)
  398. {
  399. gint32 r, clen, dlen;
  400. UChar *tmp_buf;
  401. UErrorCode uc_err = U_ZERO_ERROR;
  402. UConverter *utf8_converter;
  403. struct rspamd_charset_converter *conv;
  404. rspamd_ftok_t charset_tok;
  405. if (in == NULL || in->len == 0) {
  406. return FALSE;
  407. }
  408. if (enc == NULL) {
  409. /* Assume utf ? */
  410. if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
  411. g_byte_array_set_size (out, in->len);
  412. memcpy (out->data, in->data, out->len);
  413. return TRUE;
  414. }
  415. else {
  416. /* Bad stuff, keep out */
  417. return FALSE;
  418. }
  419. }
  420. RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
  421. if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
  422. FALSE)) {
  423. g_byte_array_set_size (out, in->len);
  424. memcpy (out->data, in->data, out->len);
  425. return TRUE;
  426. }
  427. utf8_converter = rspamd_get_utf8_converter ();
  428. conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);
  429. if (conv == NULL) {
  430. return FALSE;
  431. }
  432. tmp_buf = g_new (UChar, in->len + 1);
  433. uc_err = U_ZERO_ERROR;
  434. r = rspamd_converter_to_uchars (conv,
  435. tmp_buf, in->len + 1,
  436. in->data, in->len, &uc_err);
  437. if (!U_SUCCESS (uc_err)) {
  438. g_free (tmp_buf);
  439. return FALSE;
  440. }
  441. /* Now, convert to utf8 */
  442. clen = ucnv_getMaxCharSize (utf8_converter);
  443. dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
  444. g_byte_array_set_size (out, dlen);
  445. r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
  446. if (!U_SUCCESS (uc_err)) {
  447. g_free (tmp_buf);
  448. return FALSE;
  449. }
  450. g_free (tmp_buf);
  451. out->len = r;
  452. return TRUE;
  453. }
  454. void
  455. rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
  456. {
  457. gchar *p, *end;
  458. goffset err_offset;
  459. UChar32 uc = 0;
  460. /* Now we validate input and replace bad characters with '?' symbol */
  461. p = in;
  462. end = in + len;
  463. while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
  464. err_offset --; /* As it returns it 1 indexed */
  465. gint32 cur_offset = err_offset;
  466. while (cur_offset < len) {
  467. gint32 tmp = cur_offset;
  468. U8_NEXT (p, cur_offset, len, uc);
  469. if (uc > 0) {
  470. /* Fill string between err_offset and tmp with `?` character */
  471. memset (p + err_offset, '?', tmp - err_offset);
  472. break;
  473. }
  474. }
  475. if (uc < 0) {
  476. /* Fill till the end */
  477. memset (p + err_offset, '?', len - err_offset);
  478. break;
  479. }
  480. p += cur_offset;
  481. len = end - p;
  482. }
  483. }
  484. const char *
  485. rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen,
  486. bool check_utf8)
  487. {
  488. int nconsumed;
  489. bool is_reliable;
  490. const gchar *ced_name;
  491. if (check_utf8) {
  492. if (rspamd_fast_utf8_validate (in, inlen) == 0) {
  493. return UTF8_CHARSET;
  494. }
  495. }
  496. ced_name = ced_encoding_detect (in, inlen, NULL, NULL,
  497. NULL, 0, CED_EMAIL_CORPUS,
  498. false, &nconsumed, &is_reliable);
  499. if (ced_name) {
  500. return ced_name;
  501. }
  502. return NULL;
  503. }
  504. static const char *
  505. rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen)
  506. {
  507. if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) {
  508. return rspamd_mime_charset_find_by_content (in, inlen, false);
  509. }
  510. else {
  511. const gchar *c1, *c2, *c3;
  512. c1 = rspamd_mime_charset_find_by_content (in, RSPAMD_CHARSET_MAX_CONTENT, false);
  513. c2 = rspamd_mime_charset_find_by_content (in + inlen / 2,
  514. RSPAMD_CHARSET_MAX_CONTENT, false);
  515. c3 = rspamd_mime_charset_find_by_content (in + inlen - RSPAMD_CHARSET_MAX_CONTENT,
  516. RSPAMD_CHARSET_MAX_CONTENT, false);
  517. /* 7bit stuff */
  518. if (c1 && strcmp (c1, "US-ASCII") == 0) {
  519. c1 = NULL; /* Invalid - we have 8 bit there */
  520. }
  521. if (c2 && strcmp (c2, "US-ASCII") == 0) {
  522. c2 = NULL; /* Invalid - we have 8 bit there */
  523. }
  524. if (c3 && strcmp (c3, "US-ASCII") == 0) {
  525. c3 = NULL; /* Invalid - we have 8 bit there */
  526. }
  527. if (!c1) {
  528. c1 = c2 ? c2 : c3;
  529. }
  530. if (!c2) {
  531. c2 = c3 ? c3 : c1;
  532. }
  533. if (!c3) {
  534. c3 = c1 ? c2 : c1;
  535. }
  536. if (c1 && c2 && c3) {
  537. /* Quorum */
  538. if (c1 == c2) {
  539. return c1;
  540. }
  541. else if (c2 == c3) {
  542. return c2;
  543. }
  544. else if (c1 == c3) {
  545. return c3;
  546. }
  547. /* All charsets are distinct. Use the one from the top */
  548. return c1;
  549. }
  550. return NULL;
  551. }
  552. }
  553. gboolean
  554. rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
  555. gchar *in, gsize len, gboolean content_check)
  556. {
  557. const gchar *real_charset;
  558. if (utf_compatible_re == NULL) {
  559. utf_compatible_re = rspamd_regexp_new (
  560. "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
  561. "i", NULL);
  562. }
  563. if (charset->len == 0 ||
  564. rspamd_regexp_match (utf_compatible_re,
  565. charset->begin, charset->len, TRUE)) {
  566. /*
  567. * In case of UTF8 charset we still can check the content to find
  568. * corner cases
  569. */
  570. if (content_check) {
  571. if (rspamd_fast_utf8_validate (in, len) != 0) {
  572. real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len);
  573. if (real_charset) {
  574. if (rspamd_regexp_match (utf_compatible_re,
  575. real_charset, strlen (real_charset), TRUE)) {
  576. RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
  577. return TRUE;
  578. }
  579. else {
  580. charset->begin = real_charset;
  581. charset->len = strlen (real_charset);
  582. return FALSE;
  583. }
  584. }
  585. rspamd_mime_charset_utf_enforce (in, len);
  586. }
  587. }
  588. return TRUE;
  589. }
  590. return FALSE;
  591. }
  592. void
  593. rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
  594. struct rspamd_mime_text_part *text_part)
  595. {
  596. GError *err = NULL;
  597. const gchar *charset = NULL;
  598. gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
  599. GByteArray *part_content;
  600. rspamd_ftok_t charset_tok;
  601. struct rspamd_mime_part *part = text_part->mime_part;
  602. if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
  603. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW;
  604. }
  605. /* Allocate copy storage */
  606. part_content = g_byte_array_sized_new (text_part->parsed.len);
  607. memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
  608. part_content->len = text_part->parsed.len;
  609. rspamd_mempool_notify_alloc (task->task_pool,
  610. part_content->len);
  611. rspamd_mempool_add_destructor (task->task_pool,
  612. (rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
  613. if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
  614. if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
  615. /* Valid UTF, likely all good */
  616. need_charset_heuristic = FALSE;
  617. valid_utf8 = TRUE;
  618. checked = TRUE;
  619. }
  620. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
  621. }
  622. else {
  623. /* All 7bit characters, assume it valid utf */
  624. need_charset_heuristic = FALSE;
  625. valid_utf8 = TRUE;
  626. checked = TRUE; /* Already valid utf, no need in further checks */
  627. }
  628. if (part->ct->charset.len == 0) {
  629. if (need_charset_heuristic) {
  630. charset = rspamd_mime_charset_find_by_content_maybe_split (text_part->parsed.begin,
  631. text_part->parsed.len);
  632. if (charset != NULL) {
  633. msg_info_task ("detected charset %s", charset);
  634. }
  635. checked = TRUE;
  636. text_part->real_charset = charset;
  637. }
  638. else if (valid_utf8) {
  639. SET_PART_UTF (text_part);
  640. text_part->utf_raw_content = part_content;
  641. text_part->real_charset = UTF8_CHARSET;
  642. return;
  643. }
  644. }
  645. else {
  646. charset = rspamd_mime_detect_charset (&part->ct->charset,
  647. task->task_pool);
  648. if (charset == NULL) {
  649. /* We don't know the real charset but can try heuristic */
  650. if (need_charset_heuristic) {
  651. charset = rspamd_mime_charset_find_by_content_maybe_split (part_content->data,
  652. part_content->len);
  653. msg_info_task ("detected charset: %s", charset);
  654. checked = TRUE;
  655. text_part->real_charset = charset;
  656. }
  657. else if (valid_utf8) {
  658. /* We already know that the input is valid utf, so skip heuristic */
  659. text_part->real_charset = UTF8_CHARSET;
  660. }
  661. }
  662. else {
  663. text_part->real_charset = charset;
  664. if (strcmp (charset, UTF8_CHARSET) != 0) {
  665. /*
  666. * We have detected some charset, but we don't know which one,
  667. * so we need to reset valid utf8 flag and enforce it later
  668. */
  669. valid_utf8 = FALSE;
  670. }
  671. }
  672. }
  673. if (text_part->real_charset == NULL) {
  674. msg_info_task ("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"",
  675. MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset,
  676. part->ct->cpy);
  677. SET_PART_RAW (text_part);
  678. text_part->utf_raw_content = part_content;
  679. return;
  680. }
  681. RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
  682. if (!valid_utf8) {
  683. if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
  684. part_content->len, !checked)) {
  685. SET_PART_UTF (text_part);
  686. text_part->utf_raw_content = part_content;
  687. text_part->real_charset = UTF8_CHARSET;
  688. return;
  689. }
  690. else {
  691. charset = charset_tok.begin;
  692. if (!rspamd_mime_text_part_utf8_convert (task, text_part,
  693. part_content, charset, &err)) {
  694. msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
  695. MESSAGE_FIELD (task, message_id),
  696. charset,
  697. err ? err->message : "unknown problem");
  698. SET_PART_RAW (text_part);
  699. g_error_free (err);
  700. text_part->utf_raw_content = part_content;
  701. return;
  702. }
  703. SET_PART_UTF (text_part);
  704. text_part->real_charset = charset;
  705. }
  706. }
  707. else {
  708. SET_PART_UTF (text_part);
  709. text_part->utf_raw_content = part_content;
  710. }
  711. }