4 years ago · a455c65422
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -573,17 +573,10 @@ rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
 		g_assert (csd != NULL);
 	}

 	/* If text is ascii, then we can treat it as utf8 data */
 	for (i = 0; i < inlen; i++) {
 		if ((((guchar)in[i]) & 0x80) != 0) {
 			goto detect;
 		}
 	if (rspamd_fast_utf8_validate (in, inlen) == 0) {
 		return UTF8_CHARSET;
 	}

 	return UTF8_CHARSET;

 detect:

 	ucsdet_setText (csd, in, inlen, &uc_err);
 	csm = ucsdet_detectAll (csd, &matches, &uc_err);

@@ -661,15 +654,11 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 {
 	GError *err = NULL;
 	const gchar *charset = NULL;
 	gboolean checked = FALSE, need_charset_heuristic = TRUE;
 	gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
 	GByteArray *part_content;
 	rspamd_ftok_t charset_tok;
 	struct rspamd_mime_part *part = text_part->mime_part;

 	if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT;
 	}

 	/* Allocate copy storage */
 	part_content = g_byte_array_sized_new (text_part->parsed.len);
 	memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
@@ -680,18 +669,20 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 			(rspamd_mempool_destruct_t)g_byte_array_unref, part_content);

 	if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
 		if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
 			/* Valid UTF, likely all good */
 			need_charset_heuristic = FALSE;
 			valid_utf8 = TRUE;
 			checked = TRUE;
 		}

 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
 	}

 	if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) {
 	else {
 		/* All 7bit characters, assume it valid utf */
 		need_charset_heuristic = FALSE;
 	}

 	if (task->cfg && task->cfg->raw_mode) {
 		SET_PART_RAW (text_part);
 		text_part->utf_raw_content = part_content;

 		return;
 		valid_utf8 = TRUE;
 		checked = TRUE; /* Already valid utf, no need in further checks */
 	}

 	if (part->ct->charset.len == 0) {
@@ -706,7 +697,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 			checked = TRUE;
 			text_part->real_charset = charset;
 		}
 		else {
 		else if (valid_utf8) {
 			SET_PART_UTF (text_part);
 			text_part->utf_raw_content = part_content;
 			text_part->real_charset = UTF8_CHARSET;
@@ -719,17 +710,30 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 				task->task_pool);

 		if (charset == NULL) {
 			charset = rspamd_mime_charset_find_by_content (part_content->data,
 					MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
 			msg_info_task ("detected charset: %s", charset);
 			checked = TRUE;
 			text_part->real_charset = charset;
 			/* We don't know the real charset but can try heuristic */
 			if (need_charset_heuristic) {
 				charset = rspamd_mime_charset_find_by_content (part_content->data,
 						MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
 				msg_info_task ("detected charset: %s", charset);
 				checked = TRUE;
 				text_part->real_charset = charset;
 			}
 			else if (valid_utf8) {
 				/* We already know that the input is valid utf, so skip heuristic */
 				text_part->real_charset = UTF8_CHARSET;
 			}
 		}
 		else {
 			/*
 			 * We have detected some charset, but we don't know which one
 			 */
 			valid_utf8 = FALSE;
 		}
 	}

 	if (charset == NULL) {
 		msg_info_task ("<%s>: has invalid charset",
 				MESSAGE_FIELD_CHECK (task, message_id));
 	if (text_part->real_charset == NULL) {
 		msg_info_task ("<%s>: has invalid charset; original: %T",
 				MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset);
 		SET_PART_RAW (text_part);
 		text_part->utf_raw_content = part_content;

@@ -738,32 +742,37 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,

 	RSPAMD_FTOK_FROM_STR (&charset_tok, charset);

 	if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
 			part_content->len, !checked)) {
 		SET_PART_UTF (text_part);
 		text_part->utf_raw_content = part_content;
 		text_part->real_charset = UTF8_CHARSET;

 		return;
 	}
 	else {
 		charset = charset_tok.begin;

 		if (!rspamd_mime_text_part_utf8_convert (task, text_part,
 				part_content, charset, &err)) {
 			msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
 					MESSAGE_FIELD (task, message_id),
 					charset,
 					err ? err->message : "unknown problem");
 			SET_PART_RAW (text_part);
 			g_error_free (err);

 	if (!valid_utf8) {
 		if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
 				part_content->len, !checked)) {
 			SET_PART_UTF (text_part);
 			text_part->utf_raw_content = part_content;
 			text_part->real_charset = UTF8_CHARSET;

 			return;
 		}
 		else {
 			charset = charset_tok.begin;

 			if (!rspamd_mime_text_part_utf8_convert (task, text_part,
 					part_content, charset, &err)) {
 				msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
 						MESSAGE_FIELD (task, message_id),
 						charset,
 						err ? err->message : "unknown problem");
 				SET_PART_RAW (text_part);
 				g_error_free (err);

 				text_part->utf_raw_content = part_content;
 				return;
 			}

 		text_part->real_charset = charset;
 			SET_PART_UTF (text_part);
 			text_part->real_charset = charset;
 		}
 	}
 	else {
 		SET_PART_UTF (text_part);
 		text_part->utf_raw_content = part_content;
 	}

 	SET_PART_UTF (text_part);
 }