Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

mime_encoding.c 17KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "libutil/mem_pool.h"
  18. #include "libutil/regexp.h"
  19. #include "libutil/hash.h"
  20. #include "libserver/task.h"
  21. #include "mime_encoding.h"
  22. #include "message.h"
  23. #include <unicode/ucnv.h>
  24. #include <unicode/ucsdet.h>
  25. #if U_ICU_VERSION_MAJOR_NUM >= 44
  26. #include <unicode/unorm2.h>
  27. #endif
  28. #include <math.h>
  /* Charset name reported for content that is treated as UTF-8 */
  29. #define UTF8_CHARSET "UTF-8"
  /*
   * Bit flags for charset properties.
   * NOTE(review): not referenced in this part of the file; presumably used by
   * the table in mime_encoding_list.h - confirm.
   */
  30. #define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
  31. #define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)
  /* Size passed to the LRU cache of converters in rspamd_mime_get_converter_cached */
  32. #define RSPAMD_CHARSET_CACHE_SIZE 32
  /* Upper bound on the number of content bytes handed to charset detection */
  33. #define RSPAMD_CHARSET_MAX_CONTENT 128
  /* Clear / set the RSPAMD_MIME_TEXT_PART_FLAG_UTF bit in a text part's flags */
  34. #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  35. #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  /* Lazily compiled in rspamd_mime_charset_utf_check; matches UTF-8/ASCII compatible charset names */
  36. static rspamd_regexp_t *utf_compatible_re = NULL;
  /*
   * One entry of the charset alias table (`sub`) that is loaded into sub_hash.
   */
  37. struct rspamd_charset_substitution {
  /* Alias as it may appear in a message; used as the (case-insensitive) hash key */
  38. const gchar *input;
  /* Name handed to ucnv_getStandardName () when `input` matches */
  39. const gchar *canon;
  /* NOTE(review): presumably a mask of RSPAMD_CHARSET_FLAG_* - not read in this file section, confirm */
  40. gint flags;
  41. };
  42. #include "mime_encoding_list.h"
  /* Alias -> struct rspamd_charset_substitution map, built on first use by rspamd_mime_encoding_substitute_init () */
  43. static GHashTable *sub_hash = NULL;
  /*
   * Lookup table used by the internal ISO-8859-16 converter: entry [b - 128]
   * is the UTF-16 code unit emitted for input byte `b` in the range 0x80..0xFF
   * (bytes below 0x80 are passed through unchanged by the caller).
   */
  44. static const UChar iso_8859_16_map[] = {
  45. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  46. 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
  47. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  48. 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
  49. 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
  50. 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
  51. 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
  52. 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
  53. 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
  54. 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
  55. 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
  56. 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
  57. 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
  58. 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
  59. 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
  60. 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
  61. };
  /*
   * A cached charset converter: either a wrapper around an ICU converter or
   * an internal table-driven one, selected by `is_internal`.
   */
  62. struct rspamd_charset_converter {
  /* Heap copy of the canonical charset name; used as the cache key and freed by the destructor */
  63. gchar *canon_name;
  64. union {
  /* ICU converter, valid when is_internal is FALSE */
  65. UConverter *conv;
  /* Static high-half lookup table, valid when is_internal is TRUE */
  66. const UChar *cnv_table;
  67. } d;
  /* TRUE when `d.cnv_table` is in use, so there is no ICU handle to close */
  68. gboolean is_internal;
  69. };
  70. static GQuark
  71. rspamd_iconv_error_quark (void)
  72. {
  73. return g_quark_from_static_string ("iconv error");
  74. }
  75. static void
  76. rspamd_converter_dtor (gpointer p)
  77. {
  78. struct rspamd_charset_converter *c = (struct rspamd_charset_converter *)p;
  79. if (!c->is_internal) {
  80. ucnv_close (c->d.conv);
  81. }
  82. g_free (c->canon_name);
  83. g_free (c);
  84. }
  85. int32_t
  86. rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
  87. UChar *dest,
  88. int32_t destCapacity,
  89. const char *src,
  90. int32_t srcLength,
  91. UErrorCode *pErrorCode)
  92. {
  93. if (!cnv->is_internal) {
  94. return ucnv_toUChars (cnv->d.conv,
  95. dest, destCapacity,
  96. src, srcLength,
  97. pErrorCode);
  98. }
  99. else {
  100. UChar *d = dest, *dend = dest + destCapacity;
  101. const guchar *p = src, *end = src + srcLength;
  102. while (p < end && d < dend) {
  103. if (*p <= 127) {
  104. *d++ = (UChar)*p;
  105. }
  106. else {
  107. *d++ = cnv->d.cnv_table[*p - 128];
  108. }
  109. p ++;
  110. }
  111. return d - dest;
  112. }
  113. }
  114. struct rspamd_charset_converter *
  115. rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
  116. {
  117. const gchar *canon_name;
  118. static rspamd_lru_hash_t *cache;
  119. struct rspamd_charset_converter *conv;
  120. if (cache == NULL) {
  121. cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, NULL,
  122. rspamd_converter_dtor, rspamd_str_hash,
  123. rspamd_str_equal);
  124. }
  125. canon_name = ucnv_getStandardName (enc, "IANA", err);
  126. if (canon_name == NULL) {
  127. return NULL;
  128. }
  129. conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0);
  130. if (conv == NULL) {
  131. if (!(strcmp (canon_name, "ISO-8859-16") == 0 ||
  132. strcmp (canon_name, "latin10") == 0 ||
  133. strcmp (canon_name, "iso-ir-226") == 0)) {
  134. conv = g_malloc0 (sizeof (*conv));
  135. conv->d.conv = ucnv_open (canon_name, err);
  136. conv->canon_name = g_strdup (canon_name);
  137. if (conv->d.conv != NULL) {
  138. ucnv_setToUCallBack (conv->d.conv,
  139. UCNV_TO_U_CALLBACK_SUBSTITUTE,
  140. NULL,
  141. NULL,
  142. NULL,
  143. err);
  144. rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
  145. }
  146. else {
  147. g_free (conv);
  148. conv = NULL;
  149. }
  150. }
  151. else {
  152. /* ISO-8859-16 */
  153. conv = g_malloc0 (sizeof (*conv));
  154. conv->is_internal = TRUE;
  155. conv->d.cnv_table = iso_8859_16_map;
  156. conv->canon_name = g_strdup (canon_name);
  157. rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
  158. }
  159. }
  160. return conv;
  161. }
  162. static void
  163. rspamd_mime_encoding_substitute_init (void)
  164. {
  165. guint i;
  166. sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);
  167. for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
  168. g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
  169. }
  170. }
  171. static void
  172. rspamd_charset_normalize (gchar *in)
  173. {
  174. /*
  175. * This is a simple routine to validate input charset
  176. * we just check that charset starts with alphanumeric and ends
  177. * with alphanumeric
  178. */
  179. gchar *begin, *end;
  180. gboolean changed = FALSE;
  181. begin = in;
  182. while (*begin && !g_ascii_isalnum (*begin)) {
  183. begin ++;
  184. changed = TRUE;
  185. }
  186. end = begin + strlen (begin) - 1;
  187. while (end > begin && !g_ascii_isalnum (*end)) {
  188. end --;
  189. changed = TRUE;
  190. }
  191. if (changed) {
  192. memmove (in, begin, end - begin + 2);
  193. *(end + 1) = '\0';
  194. }
  195. }
  196. const gchar *
  197. rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
  198. {
  199. gchar *ret = NULL, *h, *t;
  200. struct rspamd_charset_substitution *s;
  201. UErrorCode uc_err = U_ZERO_ERROR;
  202. if (sub_hash == NULL) {
  203. rspamd_mime_encoding_substitute_init ();
  204. }
  205. ret = rspamd_mempool_ftokdup (pool, in);
  206. rspamd_charset_normalize (ret);
  207. if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) ||
  208. (in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) {
  209. /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
  210. h = ret;
  211. t = ret;
  212. while (*h != '\0') {
  213. if (*h != '-') {
  214. *t++ = *h;
  215. }
  216. h ++;
  217. }
  218. *t = '\0';
  219. }
  220. s = g_hash_table_lookup (sub_hash, ret);
  221. if (s) {
  222. return ucnv_getStandardName (s->canon, "IANA", &uc_err);
  223. }
  224. return ucnv_getStandardName (ret, "IANA", &uc_err);
  225. }
  226. gchar *
  227. rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
  228. gchar *input, gsize len, const gchar *in_enc,
  229. gsize *olen, GError **err)
  230. {
  231. gchar *d;
  232. gint32 r, clen, dlen;
  233. UChar *tmp_buf;
  234. UErrorCode uc_err = U_ZERO_ERROR;
  235. UConverter *utf8_converter;
  236. struct rspamd_charset_converter *conv;
  237. conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
  238. utf8_converter = rspamd_get_utf8_converter ();
  239. if (conv == NULL) {
  240. g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
  241. "cannot open converter for %s: %s",
  242. in_enc, u_errorName (uc_err));
  243. return NULL;
  244. }
  245. tmp_buf = g_new (UChar, len + 1);
  246. uc_err = U_ZERO_ERROR;
  247. r = rspamd_converter_to_uchars (conv, tmp_buf, len + 1, input, len, &uc_err);
  248. if (!U_SUCCESS (uc_err)) {
  249. g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
  250. "cannot convert data to unicode from %s: %s",
  251. in_enc, u_errorName (uc_err));
  252. g_free (tmp_buf);
  253. return NULL;
  254. }
  255. /* Now, convert to utf8 */
  256. clen = ucnv_getMaxCharSize (utf8_converter);
  257. dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
  258. d = rspamd_mempool_alloc (pool, dlen);
  259. r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
  260. if (!U_SUCCESS (uc_err)) {
  261. g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
  262. "cannot convert data from unicode from %s: %s",
  263. in_enc, u_errorName (uc_err));
  264. g_free (tmp_buf);
  265. return NULL;
  266. }
  267. msg_info_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d",
  268. in_enc, len, r);
  269. g_free (tmp_buf);
  270. if (olen) {
  271. *olen = r;
  272. }
  273. return d;
  274. }
  275. static gboolean
  276. rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
  277. struct rspamd_mime_text_part *text_part,
  278. GByteArray *input,
  279. const gchar *charset,
  280. GError **err)
  281. {
  282. gchar *d;
  283. gint32 r, clen, dlen, uc_len;
  284. UChar *tmp_buf;
  285. UErrorCode uc_err = U_ZERO_ERROR;
  286. UConverter *utf8_converter;
  287. struct rspamd_charset_converter *conv;
  288. conv = rspamd_mime_get_converter_cached (charset, &uc_err);
  289. utf8_converter = rspamd_get_utf8_converter ();
  290. if (conv == NULL) {
  291. g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
  292. "cannot open converter for %s: %s",
  293. charset, u_errorName (uc_err));
  294. return FALSE;
  295. }
  296. tmp_buf = g_new (UChar, input->len + 1);
  297. uc_err = U_ZERO_ERROR;
  298. uc_len = rspamd_converter_to_uchars (conv,
  299. tmp_buf,
  300. input->len + 1,
  301. input->data,
  302. input->len,
  303. &uc_err);
  304. if (!U_SUCCESS (uc_err)) {
  305. g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
  306. "cannot convert data to unicode from %s: %s",
  307. charset, u_errorName (uc_err));
  308. g_free (tmp_buf);
  309. return FALSE;
  310. }
  311. /* Now, convert to utf8 */
  312. clen = ucnv_getMaxCharSize (utf8_converter);
  313. dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
  314. d = rspamd_mempool_alloc (task->task_pool, dlen);
  315. r = ucnv_fromUChars (utf8_converter, d, dlen,
  316. tmp_buf, uc_len, &uc_err);
  317. if (!U_SUCCESS (uc_err)) {
  318. g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
  319. "cannot convert data from unicode from %s: %s",
  320. charset, u_errorName (uc_err));
  321. g_free (tmp_buf);
  322. return FALSE;
  323. }
  324. msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
  325. charset, input->len, r, uc_len);
  326. text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
  327. sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
  328. text_part->utf_raw_content->data = d;
  329. text_part->utf_raw_content->len = r;
  330. g_free (tmp_buf);
  331. return TRUE;
  332. }
  333. gboolean
  334. rspamd_mime_to_utf8_byte_array (GByteArray *in,
  335. GByteArray *out,
  336. const gchar *enc)
  337. {
  338. gint32 r, clen, dlen;
  339. UChar *tmp_buf;
  340. UErrorCode uc_err = U_ZERO_ERROR;
  341. UConverter *utf8_converter;
  342. struct rspamd_charset_converter *conv;
  343. rspamd_ftok_t charset_tok;
  344. RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
  345. if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
  346. FALSE)) {
  347. g_byte_array_set_size (out, in->len);
  348. memcpy (out->data, in->data, out->len);
  349. return TRUE;
  350. }
  351. utf8_converter = rspamd_get_utf8_converter ();
  352. conv = rspamd_mime_get_converter_cached (enc, &uc_err);
  353. if (conv == NULL) {
  354. return FALSE;
  355. }
  356. tmp_buf = g_new (UChar, in->len + 1);
  357. uc_err = U_ZERO_ERROR;
  358. r = rspamd_converter_to_uchars (conv,
  359. tmp_buf, in->len + 1,
  360. in->data, in->len, &uc_err);
  361. if (!U_SUCCESS (uc_err)) {
  362. g_free (tmp_buf);
  363. return FALSE;
  364. }
  365. /* Now, convert to utf8 */
  366. clen = ucnv_getMaxCharSize (utf8_converter);
  367. dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
  368. g_byte_array_set_size (out, dlen);
  369. r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
  370. if (!U_SUCCESS (uc_err)) {
  371. g_free (tmp_buf);
  372. return FALSE;
  373. }
  374. g_free (tmp_buf);
  375. out->len = r;
  376. return TRUE;
  377. }
  /*
   * Make the buffer `in` of `len` bytes valid UTF-8 in place: bytes that
   * g_utf8_validate () rejects are overwritten with '?'. The length of the
   * buffer never changes.
   */
  378. void
  379. rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
  380. {
  381. const gchar *end, *p;
  382. gsize remain = len;
  383. /* Now we validate input and replace bad characters with '?' symbol */
  384. p = in;
  /* Each iteration handles one invalid spot; `end` is the position reported by g_utf8_validate () */
  385. while (remain > 0 && !g_utf8_validate (p, remain, &end)) {
  386. gchar *valid;
  /* Reported position is at or past the buffer end: blank everything from `p` to the end and stop */
  387. if (end >= in + len) {
  388. if (p < in + len) {
  389. memset ((gchar *)p, '?', (in + len) - p);
  390. }
  391. break;
  392. }
  /* Look for the start of the next character after the bad byte, bounded by the buffer end */
  393. valid = g_utf8_find_next_char (end, in + len);
  /* NULL result is treated as "no further character": blank up to the buffer end */
  394. if (!valid) {
  395. valid = in + len;
  396. }
  397. if (valid > end) {
  /* Replace the bytes in [end, valid) and continue validation from `valid` */
  398. memset ((gchar *)end, '?', valid - end);
  399. p = valid;
  400. remain = (in + len) - p;
  401. }
  402. else {
  /* No forward progress is possible, so bail out instead of looping forever */
  403. break;
  404. }
  405. }
  406. }
  407. const char *
  408. rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
  409. {
  410. static UCharsetDetector *csd;
  411. const UCharsetMatch **csm, *sel = NULL;
  412. UErrorCode uc_err = U_ZERO_ERROR;
  413. gint32 matches, i, max_conf = G_MININT32, conf;
  414. gdouble mean = 0.0, stddev = 0.0;
  415. if (csd == NULL) {
  416. csd = ucsdet_open (&uc_err);
  417. g_assert (csd != NULL);
  418. }
  419. /* If text is ascii, then we can treat it as utf8 data */
  420. for (i = 0; i < inlen; i++) {
  421. if ((((guchar)in[i]) & 0x80) != 0) {
  422. goto detect;
  423. }
  424. }
  425. return UTF8_CHARSET;
  426. detect:
  427. ucsdet_setText (csd, in, inlen, &uc_err);
  428. csm = ucsdet_detectAll (csd, &matches, &uc_err);
  429. for (i = 0; i < matches; i ++) {
  430. if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) {
  431. max_conf = conf;
  432. sel = csm[i];
  433. }
  434. mean += (conf - mean) / (i + 1);
  435. gdouble err = fabs (conf - mean);
  436. stddev += (err - stddev) / (i + 1);
  437. }
  438. if (sel && ((max_conf > 50) || (max_conf - mean > stddev * 1.25))) {
  439. return ucsdet_getName (sel, &uc_err);
  440. }
  441. return NULL;
  442. }
  443. gboolean
  444. rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
  445. gchar *in, gsize len, gboolean content_check)
  446. {
  447. const gchar *real_charset;
  448. if (utf_compatible_re == NULL) {
  449. utf_compatible_re = rspamd_regexp_new (
  450. "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
  451. "i", NULL);
  452. }
  453. if (charset->len == 0 ||
  454. rspamd_regexp_match (utf_compatible_re,
  455. charset->begin, charset->len, TRUE)) {
  456. /*
  457. * In case of UTF8 charset we still can check the content to find
  458. * corner cases
  459. */
  460. if (content_check) {
  461. real_charset = rspamd_mime_charset_find_by_content (in,
  462. MIN (RSPAMD_CHARSET_MAX_CONTENT, len));
  463. if (real_charset) {
  464. if (rspamd_regexp_match (utf_compatible_re,
  465. real_charset, strlen (real_charset), TRUE)) {
  466. RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
  467. return TRUE;
  468. }
  469. else {
  470. charset->begin = real_charset;
  471. charset->len = strlen (real_charset);
  472. return FALSE;
  473. }
  474. }
  475. }
  476. rspamd_mime_charset_utf_enforce (in, len);
  477. return TRUE;
  478. }
  479. return FALSE;
  480. }
  481. void
  482. rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
  483. struct rspamd_mime_text_part *text_part)
  484. {
  485. GError *err = NULL;
  486. const gchar *charset = NULL;
  487. gboolean checked = FALSE, need_charset_heuristic = TRUE;
  488. GByteArray *part_content;
  489. rspamd_ftok_t charset_tok;
  490. struct rspamd_mime_part *part = text_part->mime_part;
  491. if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
  492. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT;
  493. }
  494. /* Allocate copy storage */
  495. part_content = g_byte_array_sized_new (text_part->parsed.len);
  496. memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
  497. part_content->len = text_part->parsed.len;
  498. rspamd_mempool_add_destructor (task->task_pool,
  499. (rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
  500. if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
  501. text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
  502. }
  503. if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) {
  504. need_charset_heuristic = FALSE;
  505. }
  506. if (task->cfg && task->cfg->raw_mode) {
  507. SET_PART_RAW (text_part);
  508. text_part->utf_raw_content = part_content;
  509. return;
  510. }
  511. if (part->ct->charset.len == 0) {
  512. if (need_charset_heuristic) {
  513. charset = rspamd_mime_charset_find_by_content (part_content->data,
  514. MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
  515. if (charset != NULL) {
  516. msg_info_task ("detected charset %s", charset);
  517. }
  518. checked = TRUE;
  519. text_part->real_charset = charset;
  520. }
  521. else {
  522. SET_PART_UTF (text_part);
  523. text_part->utf_raw_content = part_content;
  524. text_part->real_charset = UTF8_CHARSET;
  525. return;
  526. }
  527. }
  528. else {
  529. charset = rspamd_mime_detect_charset (&part->ct->charset,
  530. task->task_pool);
  531. if (charset == NULL) {
  532. charset = rspamd_mime_charset_find_by_content (part_content->data,
  533. MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
  534. msg_info_task ("detected charset: %s", charset);
  535. checked = TRUE;
  536. text_part->real_charset = charset;
  537. }
  538. }
  539. if (charset == NULL) {
  540. msg_info_task ("<%s>: has invalid charset", task->message_id);
  541. SET_PART_RAW (text_part);
  542. text_part->utf_raw_content = part_content;
  543. return;
  544. }
  545. RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
  546. if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
  547. part_content->len, !checked)) {
  548. SET_PART_UTF (text_part);
  549. text_part->utf_raw_content = part_content;
  550. text_part->real_charset = UTF8_CHARSET;
  551. return;
  552. }
  553. else {
  554. charset = charset_tok.begin;
  555. if (!rspamd_mime_text_part_utf8_convert (task, text_part,
  556. part_content, charset, &err)) {
  557. msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
  558. task->message_id,
  559. charset,
  560. err ? err->message : "unknown problem");
  561. SET_PART_RAW (text_part);
  562. g_error_free (err);
  563. text_part->utf_raw_content = part_content;
  564. return;
  565. }
  566. text_part->real_charset = charset;
  567. }
  568. SET_PART_UTF (text_part);
  569. }