123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #include "config.h"
- #include "libutil/mem_pool.h"
- #include "libutil/regexp.h"
- #include "libutil/hash.h"
- #include "libserver/cfg_file.h"
- #include "libserver/task.h"
- #include "mime_encoding.h"
- #include "message.h"
- #include "contrib/fastutf8/fastutf8.h"
- #include "contrib/google-ced/ced_c.h"
- #include <unicode/ucnv.h>
- #if U_ICU_VERSION_MAJOR_NUM >= 44
- #include <unicode/unorm2.h>
- #endif
- #include <math.h>
-
- #define UTF8_CHARSET "UTF-8"
-
- #define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
- #define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)
-
- #define RSPAMD_CHARSET_CACHE_SIZE 32
- #define RSPAMD_CHARSET_MAX_CONTENT 512
-
- #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
- #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
-
- static rspamd_regexp_t *utf_compatible_re = NULL;
-
- struct rspamd_charset_substitution {
- const gchar *input;
- const gchar *canon;
- gint flags;
- };
-
- #include "mime_encoding_list.h"
-
- static GHashTable *sub_hash = NULL;
-
- static const UChar iso_8859_16_map[] = {
- 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
- 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
- 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
- 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
- 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
- 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
- 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
- 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
- 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
- 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
- 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
- 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
- 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
- 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
- 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
- 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
- };
-
- struct rspamd_charset_converter {
- gchar *canon_name;
- union {
- UConverter *conv;
- const UChar *cnv_table;
- } d;
- gboolean is_internal;
- };
-
- static GQuark
- rspamd_charset_conv_error_quark (void)
- {
- return g_quark_from_static_string ("charset conversion error");
- }
-
- static void
- rspamd_converter_dtor (gpointer p)
- {
- struct rspamd_charset_converter *c = (struct rspamd_charset_converter *)p;
-
- if (!c->is_internal) {
- ucnv_close (c->d.conv);
- }
-
- g_free (c->canon_name);
- g_free (c);
- }
-
- int32_t
- rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
- UChar *dest,
- int32_t destCapacity,
- const char *src,
- int32_t srcLength,
- UErrorCode *pErrorCode)
- {
- if (!cnv->is_internal) {
- return ucnv_toUChars (cnv->d.conv,
- dest, destCapacity,
- src, srcLength,
- pErrorCode);
- }
- else {
- UChar *d = dest, *dend = dest + destCapacity;
- const guchar *p = src, *end = src + srcLength;
-
- while (p < end && d < dend) {
- if (*p <= 127) {
- *d++ = (UChar)*p;
- }
- else {
- *d++ = cnv->d.cnv_table[*p - 128];
- }
-
- p ++;
- }
-
- return d - dest;
- }
- }
-
-
- struct rspamd_charset_converter *
- rspamd_mime_get_converter_cached (const gchar *enc,
- rspamd_mempool_t *pool,
- gboolean is_canon,
- UErrorCode *err)
- {
- const gchar *canon_name;
- static rspamd_lru_hash_t *cache;
- struct rspamd_charset_converter *conv;
-
- if (cache == NULL) {
- cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, NULL,
- rspamd_converter_dtor, rspamd_str_hash,
- rspamd_str_equal);
- }
-
- if (enc == NULL) {
- return NULL;
- }
-
- if (!is_canon) {
- rspamd_ftok_t cset_tok;
-
- RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
- canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
- }
- else {
- canon_name = enc;
- }
-
- if (canon_name == NULL) {
- return NULL;
- }
-
- conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0);
-
- if (conv == NULL) {
- if (!(strcmp (canon_name, "ISO-8859-16") == 0 ||
- strcmp (canon_name, "latin10") == 0 ||
- strcmp (canon_name, "iso-ir-226") == 0)) {
- conv = g_malloc0 (sizeof (*conv));
- conv->d.conv = ucnv_open (canon_name, err);
- conv->canon_name = g_strdup (canon_name);
-
- if (conv->d.conv != NULL) {
- ucnv_setToUCallBack (conv->d.conv,
- UCNV_TO_U_CALLBACK_SUBSTITUTE,
- NULL,
- NULL,
- NULL,
- err);
- rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
- }
- else {
- g_free (conv);
- conv = NULL;
- }
- }
- else {
- /* ISO-8859-16 */
- conv = g_malloc0 (sizeof (*conv));
- conv->is_internal = TRUE;
- conv->d.cnv_table = iso_8859_16_map;
- conv->canon_name = g_strdup (canon_name);
-
- rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
- }
- }
-
- return conv;
- }
-
- static void
- rspamd_mime_encoding_substitute_init (void)
- {
- guint i;
-
- sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);
-
- for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
- g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
- }
- }
-
- static void
- rspamd_charset_normalize (gchar *in)
- {
- /*
- * This is a simple routine to validate input charset
- * we just check that charset starts with alphanumeric and ends
- * with alphanumeric
- */
- gchar *begin, *end;
- gboolean changed = FALSE;
-
- begin = in;
-
- while (*begin && !g_ascii_isalnum (*begin)) {
- begin ++;
- changed = TRUE;
- }
-
- end = begin + strlen (begin) - 1;
-
- while (end > begin && !g_ascii_isalnum (*end)) {
- end --;
- changed = TRUE;
- }
-
- if (changed) {
- memmove (in, begin, end - begin + 2);
- *(end + 1) = '\0';
- }
- }
-
- const gchar *
- rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
- {
- gchar *ret = NULL, *h, *t;
- struct rspamd_charset_substitution *s;
- const gchar *cset;
- rspamd_ftok_t utf8_tok;
- UErrorCode uc_err = U_ZERO_ERROR;
-
- if (sub_hash == NULL) {
- rspamd_mime_encoding_substitute_init ();
- }
-
- /* Fast path */
- RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf-8");
-
- if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
- return UTF8_CHARSET;
- }
-
- RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf8");
-
- if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
- return UTF8_CHARSET;
- }
-
- ret = rspamd_mempool_ftokdup (pool, in);
- rspamd_charset_normalize (ret);
-
- if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) ||
- (in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) {
- /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
- h = ret;
- t = ret;
-
- while (*h != '\0') {
- if (*h != '-') {
- *t++ = *h;
- }
-
- h ++;
- }
-
- *t = '\0';
- }
-
- s = g_hash_table_lookup (sub_hash, ret);
-
- if (s) {
- ret = (char *)s->canon;
- }
-
- /* Try different aliases */
- cset = ucnv_getCanonicalName (ret, "MIME", &uc_err);
-
- if (cset == NULL) {
- uc_err = U_ZERO_ERROR;
- cset = ucnv_getCanonicalName (ret, "IANA", &uc_err);
- }
-
- if (cset == NULL) {
- uc_err = U_ZERO_ERROR;
- cset = ucnv_getCanonicalName (ret, "", &uc_err);
- }
-
- if (cset == NULL) {
- uc_err = U_ZERO_ERROR;
- cset = ucnv_getAlias (ret, 0, &uc_err);
- }
-
- return cset;
- }
-
- gchar *
- rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
- gchar *input, gsize len, const gchar *in_enc,
- gsize *olen, GError **err)
- {
- gchar *d;
- gint32 r, clen, dlen;
- UChar *tmp_buf;
-
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter;
- struct rspamd_charset_converter *conv;
- rspamd_ftok_t cset_tok;
-
- /* Check if already utf8 */
- RSPAMD_FTOK_FROM_STR (&cset_tok, in_enc);
-
- if (rspamd_mime_charset_utf_check (&cset_tok, input, len,
- FALSE)) {
- d = rspamd_mempool_alloc (pool, len);
- memcpy (d, input, len);
- if (olen) {
- *olen = len;
- }
-
- return d;
- }
-
- conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
- utf8_converter = rspamd_get_utf8_converter ();
-
- if (conv == NULL) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot open converter for %s: %s",
- in_enc, u_errorName (uc_err));
-
- return NULL;
- }
-
- tmp_buf = g_new (UChar, len + 1);
- uc_err = U_ZERO_ERROR;
- r = rspamd_converter_to_uchars (conv, tmp_buf, len + 1, input, len, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot convert data to unicode from %s: %s",
- in_enc, u_errorName (uc_err));
- g_free (tmp_buf);
-
- return NULL;
- }
-
- /* Now, convert to utf8 */
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
- d = rspamd_mempool_alloc (pool, dlen);
- r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot convert data from unicode from %s: %s",
- in_enc, u_errorName (uc_err));
- g_free (tmp_buf);
-
- return NULL;
- }
-
- msg_debug_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d",
- in_enc, len, r);
- g_free (tmp_buf);
-
- if (olen) {
- *olen = r;
- }
-
- return d;
- }
-
- static gboolean
- rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part,
- GByteArray *input,
- const gchar *charset,
- GError **err)
- {
- gchar *d;
- gint32 r, clen, dlen, uc_len;
- UChar *tmp_buf;
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter;
- struct rspamd_charset_converter *conv;
-
- conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
- TRUE, &uc_err);
- utf8_converter = rspamd_get_utf8_converter ();
-
- if (conv == NULL) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot open converter for %s: %s",
- charset, u_errorName (uc_err));
-
- return FALSE;
- }
-
- tmp_buf = g_new (UChar, input->len + 1);
- uc_err = U_ZERO_ERROR;
- uc_len = rspamd_converter_to_uchars (conv,
- tmp_buf,
- input->len + 1,
- input->data,
- input->len,
- &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot convert data to unicode from %s: %s",
- charset, u_errorName (uc_err));
- g_free (tmp_buf);
-
- return FALSE;
- }
-
- /* Now, convert to utf8 */
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
- d = rspamd_mempool_alloc (task->task_pool, dlen);
- r = ucnv_fromUChars (utf8_converter, d, dlen,
- tmp_buf, uc_len, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot convert data from unicode from %s: %s",
- charset, u_errorName (uc_err));
- g_free (tmp_buf);
-
- return FALSE;
- }
-
- if (text_part->mime_part && text_part->mime_part->ct) {
- msg_info_task ("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
- charset, &text_part->mime_part->ct->charset, input->len, r, uc_len);
- }
- else {
- msg_info_task ("converted text part from %s (no charset announced) to UTF-8 inlen: %d, "
- "outlen: %d (%d UTF16 chars)",
- charset, input->len, r, uc_len);
- }
-
- text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
- sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
- text_part->utf_raw_content->data = d;
- text_part->utf_raw_content->len = r;
- g_free (tmp_buf);
-
- return TRUE;
- }
-
- gboolean
- rspamd_mime_to_utf8_byte_array (GByteArray *in,
- GByteArray *out,
- rspamd_mempool_t *pool,
- const gchar *enc)
- {
- gint32 r, clen, dlen;
- UChar *tmp_buf;
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter;
- struct rspamd_charset_converter *conv;
- rspamd_ftok_t charset_tok;
-
- if (in == NULL || in->len == 0) {
- return FALSE;
- }
-
- if (enc == NULL) {
- /* Assume utf ? */
- if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
- g_byte_array_set_size (out, in->len);
- memcpy (out->data, in->data, out->len);
-
- return TRUE;
- }
- else {
- /* Bad stuff, keep out */
- return FALSE;
- }
- }
-
- RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
-
- if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
- FALSE)) {
- g_byte_array_set_size (out, in->len);
- memcpy (out->data, in->data, out->len);
-
- return TRUE;
- }
-
- utf8_converter = rspamd_get_utf8_converter ();
- conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);
-
- if (conv == NULL) {
- return FALSE;
- }
-
- tmp_buf = g_new (UChar, in->len + 1);
- uc_err = U_ZERO_ERROR;
- r = rspamd_converter_to_uchars (conv,
- tmp_buf, in->len + 1,
- in->data, in->len, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_free (tmp_buf);
-
- return FALSE;
- }
-
- /* Now, convert to utf8 */
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
- g_byte_array_set_size (out, dlen);
- r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_free (tmp_buf);
-
- return FALSE;
- }
-
- g_free (tmp_buf);
- out->len = r;
-
- return TRUE;
- }
-
- void
- rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
- {
- gchar *p, *end;
- goffset err_offset;
- UChar32 uc = 0;
-
- /* Now we validate input and replace bad characters with '?' symbol */
- p = in;
- end = in + len;
-
- while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
- err_offset --; /* As it returns it 1 indexed */
- gint32 cur_offset = err_offset;
-
- while (cur_offset < len) {
- gint32 tmp = cur_offset;
-
- U8_NEXT (p, cur_offset, len, uc);
-
- if (uc > 0) {
- /* Fill string between err_offset and tmp with `?` character */
- memset (p + err_offset, '?', tmp - err_offset);
- break;
- }
- }
-
- if (uc < 0) {
- /* Fill till the end */
- memset (p + err_offset, '?', len - err_offset);
- break;
- }
-
- p += cur_offset;
- len = end - p;
- }
- }
-
- const char *
- rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen,
- bool check_utf8)
- {
- int nconsumed;
- bool is_reliable;
- const gchar *ced_name;
-
- if (check_utf8) {
- if (rspamd_fast_utf8_validate (in, inlen) == 0) {
- return UTF8_CHARSET;
- }
- }
-
-
- ced_name = ced_encoding_detect (in, inlen, NULL, NULL,
- NULL, 0, CED_EMAIL_CORPUS,
- false, &nconsumed, &is_reliable);
-
- if (ced_name) {
-
- return ced_name;
- }
-
- return NULL;
- }
-
- static const char *
- rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen)
- {
- if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) {
- return rspamd_mime_charset_find_by_content (in, inlen, false);
- }
- else {
- const gchar *c1, *c2, *c3;
-
- c1 = rspamd_mime_charset_find_by_content (in, RSPAMD_CHARSET_MAX_CONTENT, false);
- c2 = rspamd_mime_charset_find_by_content (in + inlen / 2,
- RSPAMD_CHARSET_MAX_CONTENT, false);
- c3 = rspamd_mime_charset_find_by_content (in + inlen - RSPAMD_CHARSET_MAX_CONTENT,
- RSPAMD_CHARSET_MAX_CONTENT, false);
-
- /* 7bit stuff */
- if (c1 && strcmp (c1, "US-ASCII") == 0) {
- c1 = NULL; /* Invalid - we have 8 bit there */
- }
- if (c2 && strcmp (c2, "US-ASCII") == 0) {
- c2 = NULL; /* Invalid - we have 8 bit there */
- }
- if (c3 && strcmp (c3, "US-ASCII") == 0) {
- c3 = NULL; /* Invalid - we have 8 bit there */
- }
-
- if (!c1) {
- c1 = c2 ? c2 : c3;
- }
- if (!c2) {
- c2 = c3 ? c3 : c1;
- }
- if (!c3) {
- c3 = c1 ? c2 : c1;
- }
-
- if (c1 && c2 && c3) {
- /* Quorum */
- if (c1 == c2) {
- return c1;
- }
- else if (c2 == c3) {
- return c2;
- }
- else if (c1 == c3) {
- return c3;
- }
-
- /* All charsets are distinct. Use the one from the top */
- return c1;
- }
-
- return NULL;
- }
- }
-
- gboolean
- rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
- gchar *in, gsize len, gboolean content_check)
- {
- const gchar *real_charset;
-
- if (utf_compatible_re == NULL) {
- utf_compatible_re = rspamd_regexp_new (
- "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
- "i", NULL);
- }
-
- if (charset->len == 0 ||
- rspamd_regexp_match (utf_compatible_re,
- charset->begin, charset->len, TRUE)) {
- /*
- * In case of UTF8 charset we still can check the content to find
- * corner cases
- */
- if (content_check) {
- if (rspamd_fast_utf8_validate (in, len) != 0) {
- real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len);
-
- if (real_charset) {
-
- if (rspamd_regexp_match (utf_compatible_re,
- real_charset, strlen (real_charset), TRUE)) {
- RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
-
- return TRUE;
- }
- else {
- charset->begin = real_charset;
- charset->len = strlen (real_charset);
-
- return FALSE;
- }
- }
-
- rspamd_mime_charset_utf_enforce (in, len);
- }
- }
-
- return TRUE;
- }
-
- return FALSE;
- }
-
- void
- rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
- {
- GError *err = NULL;
- const gchar *charset = NULL;
- gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
- GByteArray *part_content;
- rspamd_ftok_t charset_tok;
- struct rspamd_mime_part *part = text_part->mime_part;
-
- if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW;
- }
-
- /* Allocate copy storage */
- part_content = g_byte_array_sized_new (text_part->parsed.len);
- memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
- part_content->len = text_part->parsed.len;
- rspamd_mempool_notify_alloc (task->task_pool,
- part_content->len);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
-
- if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
- if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
- /* Valid UTF, likely all good */
- need_charset_heuristic = FALSE;
- valid_utf8 = TRUE;
- checked = TRUE;
- }
-
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
- }
- else {
- /* All 7bit characters, assume it valid utf */
- need_charset_heuristic = FALSE;
- valid_utf8 = TRUE;
- checked = TRUE; /* Already valid utf, no need in further checks */
- }
-
- if (part->ct->charset.len == 0) {
- if (need_charset_heuristic) {
- charset = rspamd_mime_charset_find_by_content_maybe_split (text_part->parsed.begin,
- text_part->parsed.len);
-
- if (charset != NULL) {
- msg_info_task ("detected charset %s", charset);
- }
-
- checked = TRUE;
- text_part->real_charset = charset;
- }
- else if (valid_utf8) {
- SET_PART_UTF (text_part);
- text_part->utf_raw_content = part_content;
- text_part->real_charset = UTF8_CHARSET;
-
- return;
- }
- }
- else {
- charset = rspamd_mime_detect_charset (&part->ct->charset,
- task->task_pool);
-
- if (charset == NULL) {
- /* We don't know the real charset but can try heuristic */
- if (need_charset_heuristic) {
- charset = rspamd_mime_charset_find_by_content_maybe_split (part_content->data,
- part_content->len);
- msg_info_task ("detected charset: %s", charset);
- checked = TRUE;
- text_part->real_charset = charset;
- }
- else if (valid_utf8) {
- /* We already know that the input is valid utf, so skip heuristic */
- text_part->real_charset = UTF8_CHARSET;
- }
- }
- else {
- text_part->real_charset = charset;
-
- if (strcmp (charset, UTF8_CHARSET) != 0) {
- /*
- * We have detected some charset, but we don't know which one,
- * so we need to reset valid utf8 flag and enforce it later
- */
- valid_utf8 = FALSE;
- }
- }
- }
-
- if (text_part->real_charset == NULL) {
- msg_info_task ("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"",
- MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset,
- part->ct->cpy);
- SET_PART_RAW (text_part);
- text_part->utf_raw_content = part_content;
-
- return;
- }
-
- RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
-
- if (!valid_utf8) {
- if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
- part_content->len, !checked)) {
- SET_PART_UTF (text_part);
- text_part->utf_raw_content = part_content;
- text_part->real_charset = UTF8_CHARSET;
-
- return;
- }
- else {
- charset = charset_tok.begin;
-
- if (!rspamd_mime_text_part_utf8_convert (task, text_part,
- part_content, charset, &err)) {
- msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
- MESSAGE_FIELD (task, message_id),
- charset,
- err ? err->message : "unknown problem");
- SET_PART_RAW (text_part);
- g_error_free (err);
-
- text_part->utf_raw_content = part_content;
- return;
- }
-
- SET_PART_UTF (text_part);
- text_part->real_charset = charset;
- }
- }
- else {
- SET_PART_UTF (text_part);
- text_part->utf_raw_content = part_content;
- }
- }
|