aboutsummaryrefslogtreecommitdiffstats
path: root/src/libmime
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-08-23 20:06:34 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-08-23 20:06:34 +0100
commit3944ae59e74c31103d3bd10fe4dd025b05d74679 (patch)
tree6556701646c1d1b3d35d5b8766236a1995ef88bf /src/libmime
parente9c773e6bb0e09b4802f3cb06b93b7a082e464ed (diff)
downloadrspamd-3944ae59e74c31103d3bd10fe4dd025b05d74679.tar.gz
rspamd-3944ae59e74c31103d3bd10fe4dd025b05d74679.zip
[Project] Further changes in unicode operations
* Normalise unicode * Add normality flag for text parts * Store UCS in text parts * Rework unicode conversions and operations
Diffstat (limited to 'src/libmime')
-rw-r--r--src/libmime/message.c15
-rw-r--r--src/libmime/message.h4
-rw-r--r--src/libmime/mime_encoding.c301
-rw-r--r--src/libmime/mime_encoding.h2
4 files changed, 249 insertions, 73 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 5d9cf19d1..e6cb63504 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -661,7 +661,6 @@ rspamd_message_process_text_part (struct rspamd_task *task,
{
struct rspamd_mime_text_part *text_part;
rspamd_ftok_t html_tok, xhtml_tok;
- GByteArray *part_content;
gboolean found_html = FALSE, found_txt = FALSE;
enum rspamd_action_type act;
@@ -756,22 +755,21 @@ rspamd_message_process_text_part (struct rspamd_task *task,
return;
}
- part_content = rspamd_mime_text_part_maybe_convert (task, text_part);
+ rspamd_mime_text_part_maybe_convert (task, text_part);
- if (part_content == NULL) {
+ if (text_part->utf_raw_content == NULL) {
return;
}
text_part->html = rspamd_mempool_alloc0 (task->task_pool,
sizeof (*text_part->html));
text_part->mime_part = mime_part;
- text_part->utf_raw_content = part_content;
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
text_part->content = rspamd_html_process_part_full (
task->task_pool,
text_part->html,
- part_content,
+ text_part->utf_raw_content,
&text_part->exceptions,
task->urls,
task->emails);
@@ -802,15 +800,14 @@ rspamd_message_process_text_part (struct rspamd_task *task,
return;
}
- text_part->content = rspamd_mime_text_part_maybe_convert (task,
- text_part);
- text_part->utf_raw_content = text_part->content;
+ rspamd_mime_text_part_maybe_convert (task, text_part);
- if (text_part->content != NULL) {
+ if (text_part->utf_raw_content != NULL) {
/*
* We ignore unconverted parts from now as it is dangerous
* to treat them as text parts
*/
+ text_part->content = text_part->utf_raw_content;
g_ptr_array_add (task->text_parts, text_part);
}
else {
diff --git a/src/libmime/message.h b/src/libmime/message.h
index b0a7983b4..baabb762a 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -73,6 +73,8 @@ struct rspamd_mime_part {
#define RSPAMD_MIME_TEXT_PART_FLAG_HTML (1 << 3)
#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT (1 << 4)
#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED (1 << 5)
+#define RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL (1 << 6)
+#define RSPAMD_MIME_TEXT_PART_NORMALISED (1 << 7)
#define IS_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY)
#define IS_PART_UTF(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF)
@@ -88,7 +90,7 @@ struct rspamd_mime_text_part {
rspamd_ftok_t parsed; /* decoded from mime encodings */
GByteArray *content; /* utf8 encoded processed content */
- UChar *ucs_raw_content; /* unicode raw content */
+ GArray *ucs_raw_content; /* unicode raw content (of UChar) */
GByteArray *utf_raw_content; /* utf raw content */
GByteArray *stripped_content; /* utf content with no newlines */
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 605ab7649..1e284c6c2 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -23,6 +23,7 @@
#include "message.h"
#include <unicode/ucnv.h>
#include <unicode/ucsdet.h>
+#include <unicode/unorm2.h>
#include <math.h>
#define UTF8_CHARSET "UTF-8"
@@ -39,6 +40,10 @@
static rspamd_regexp_t *utf_compatible_re = NULL;
UConverter *utf8_converter = NULL;
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+static const UNormalizer2 *norm = NULL;
+#endif
+
struct rspamd_charset_substitution {
const gchar *input;
const gchar *canon;
@@ -94,6 +99,36 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
return conv;
}
+static inline void
+rspamd_mime_utf8_conv_init (void)
+{
+ if (utf8_converter == NULL) {
+ UErrorCode uc_err = U_ZERO_ERROR;
+
+ utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ msg_err ("FATAL error: cannot open converter for utf8: %s",
+ u_errorName (uc_err));
+
+ g_assert_not_reached ();
+ }
+
+ ucnv_setFromUCallBack (utf8_converter,
+ UCNV_FROM_U_CALLBACK_SUBSTITUTE,
+ NULL,
+ NULL,
+ NULL,
+ &uc_err);
+ ucnv_setToUCallBack (utf8_converter,
+ UCNV_TO_U_CALLBACK_SUBSTITUTE,
+ NULL,
+ NULL,
+ NULL,
+ &uc_err);
+ }
+}
+
static void
rspamd_mime_encoding_substitute_init (void)
{
@@ -189,25 +224,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
UErrorCode uc_err = U_ZERO_ERROR;
UConverter *conv;
- if (utf8_converter == NULL) {
- utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
- "cannot open converter for utf8: %s",
- u_errorName (uc_err));
-
- return NULL;
- }
-
- ucnv_setFromUCallBack (utf8_converter,
- UCNV_FROM_U_CALLBACK_SUBSTITUTE,
- NULL,
- NULL,
- NULL,
- &uc_err);
- }
-
+ rspamd_mime_utf8_conv_init ();
conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
if (conv == NULL) {
@@ -222,7 +239,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
uc_err = U_ZERO_ERROR;
r = ucnv_toUChars (conv, tmp_buf, len + 1, input, len, &uc_err);
- if (uc_err != U_ZERO_ERROR) {
+ if (!U_SUCCESS (uc_err)) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
"cannot convert data to unicode from %s: %s",
in_enc, u_errorName (uc_err));
@@ -237,7 +254,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
d = rspamd_mempool_alloc (pool, dlen);
r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
- if (uc_err != U_ZERO_ERROR) {
+ if (!U_SUCCESS (uc_err)) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
"cannot convert data from unicode from %s: %s",
in_enc, u_errorName (uc_err));
@@ -257,6 +274,186 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
return d;
}
+static void
+rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part)
+{
+ GByteArray *utf;
+ UErrorCode uc_err = U_ZERO_ERROR;
+
+ rspamd_mime_utf8_conv_init ();
+ utf = text_part->utf_raw_content;
+ text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+ sizeof (UChar), utf->len + 1);
+ text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter,
+ (UChar *)text_part->ucs_raw_content->data,
+ utf->len + 1,
+ utf->data,
+ utf->len,
+ &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ g_array_free (text_part->ucs_raw_content, TRUE);
+ text_part->ucs_raw_content = NULL;
+ }
+}
+
+static void
+rspamd_mime_text_part_normalise (struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part)
+{
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+ UErrorCode uc_err = U_ZERO_ERROR;
+ gint32 nsym, end;
+ UChar *src = NULL, *dest = NULL;
+
+ if (norm == NULL) {
+ norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+ }
+
+ if (!text_part->ucs_raw_content) {
+ return;
+ }
+
+ src = (UChar *)text_part->ucs_raw_content->data;
+ nsym = text_part->ucs_raw_content->len;
+
+ /* We can now check if we need to decompose */
+ end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ msg_warn_task ("cannot normalise URL, cannot check normalisation: %s",
+ u_errorName (uc_err));
+ return;
+ }
+
+ if (end == nsym) {
+ /* Already normalised */
+ return;
+ }
+
+ text_part->flags |= RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL;
+ dest = g_malloc (nsym * sizeof (*dest));
+ memcpy (dest, src, end * sizeof (*dest));
+ nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
+ src + end, nsym - end, &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
+ msg_warn_task ("cannot normalise URL: %s",
+ u_errorName (uc_err));
+ }
+ }
+ else {
+ /* Copy normalised back */
+ memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar));
+ text_part->ucs_raw_content->len = nsym;
+ text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
+ }
+
+ g_free (dest);
+#endif
+}
+
+/*
+ * Recode utf from normalised unichars if needed
+ */
+static void
+rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part)
+{
+ UErrorCode uc_err = U_ZERO_ERROR;
+ guint clen, dlen;
+ gint r;
+
+ rspamd_mime_utf8_conv_init ();
+
+ if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
+ text_part->ucs_raw_content) {
+ clen = ucnv_getMaxCharSize (utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len,
+ clen);
+ g_byte_array_set_size (text_part->utf_raw_content, dlen);
+ r = ucnv_fromUChars (utf8_converter,
+ text_part->utf_raw_content->data,
+ dlen,
+ (UChar *)text_part->ucs_raw_content->data,
+ text_part->ucs_raw_content->len,
+ &uc_err);
+ text_part->utf_raw_content->len = r;
+ }
+}
+
+
+static gboolean
+rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part,
+ GByteArray *input,
+ const gchar *charset,
+ GError **err)
+{
+ gchar *d;
+ gint32 r, clen, dlen;
+
+ UErrorCode uc_err = U_ZERO_ERROR;
+ UConverter *conv;
+
+ rspamd_mime_utf8_conv_init ();
+ conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+
+ if (conv == NULL) {
+ g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
+ "cannot open converter for %s: %s",
+ charset, u_errorName (uc_err));
+
+ return FALSE;
+ }
+
+
+ text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+ sizeof (UChar), input->len + 1);
+ r = ucnv_toUChars (conv,
+ (UChar *)text_part->ucs_raw_content->data,
+ input->len + 1,
+ input->data,
+ input->len,
+ &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
+ "cannot convert data to unicode from %s: %s",
+ charset, u_errorName (uc_err));
+ return FALSE;
+ }
+
+ text_part->ucs_raw_content->len = r;
+ rspamd_mime_text_part_normalise (task, text_part);
+
+ /* Now, convert to utf8 */
+ clen = ucnv_getMaxCharSize (utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
+ d = rspamd_mempool_alloc (task->task_pool, dlen);
+ r = ucnv_fromUChars (utf8_converter, d, dlen,
+ (UChar *)text_part->ucs_raw_content->data, r, &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
+ "cannot convert data from unicode from %s: %s",
+ charset, u_errorName (uc_err));
+
+ return FALSE;
+ }
+
+ msg_info_task ("converted from %s to UTF-8 inlen: %z, outlen: %d",
+ charset, input->len, r);
+ text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
+ sizeof (text_part->utf_raw_content));
+ text_part->utf_raw_content->data = d;
+ text_part->utf_raw_content->len = r;
+
+ return TRUE;
+}
+
gboolean
rspamd_mime_to_utf8_byte_array (GByteArray *in,
GByteArray *out,
@@ -278,24 +475,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
return TRUE;
}
- if (utf8_converter == NULL) {
- utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_warn ("cannot open converter for utf8: %s",
- u_errorName (uc_err));
-
- return FALSE;
- }
-
- ucnv_setFromUCallBack (utf8_converter,
- UCNV_FROM_U_CALLBACK_SUBSTITUTE,
- NULL,
- NULL,
- NULL,
- &uc_err);
- }
-
+ rspamd_mime_utf8_conv_init ();
conv = rspamd_mime_get_converter_cached (enc, &uc_err);
if (conv == NULL) {
@@ -306,7 +486,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
uc_err = U_ZERO_ERROR;
r = ucnv_toUChars (conv, tmp_buf, in->len + 1, in->data, in->len, &uc_err);
- if (uc_err != U_ZERO_ERROR) {
+ if (!U_SUCCESS (uc_err)) {
g_free (tmp_buf);
return FALSE;
@@ -318,7 +498,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
g_byte_array_set_size (out, dlen);
r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
- if (uc_err != U_ZERO_ERROR) {
+ if (!U_SUCCESS (uc_err)) {
g_free (tmp_buf);
return FALSE;
@@ -461,16 +641,14 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
return FALSE;
}
-GByteArray *
+void
rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part)
{
GError *err = NULL;
- gsize write_bytes;
const gchar *charset = NULL;
gboolean checked = FALSE, need_charset_heuristic = TRUE;
- gchar *res_str;
- GByteArray *result_array, *part_content;
+ GByteArray *part_content;
rspamd_ftok_t charset_tok;
struct rspamd_mime_part *part = text_part->mime_part;
@@ -494,8 +672,9 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
if (task->cfg && task->cfg->raw_mode) {
SET_PART_RAW (text_part);
+ text_part->utf_raw_content = part_content;
- return part_content;
+ return;
}
if (part->ct->charset.len == 0) {
@@ -511,8 +690,11 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
}
else {
SET_PART_UTF (text_part);
+ rspamd_mime_text_part_ucs_from_utf (task, text_part);
+ rspamd_mime_text_part_normalise (task, text_part);
+ rspamd_mime_text_part_maybe_renormalise (task, text_part);
- return part_content;
+ return;
}
}
else {
@@ -530,27 +712,26 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
if (charset == NULL) {
msg_info_task ("<%s>: has invalid charset", task->message_id);
SET_PART_RAW (text_part);
+ text_part->utf_raw_content = part_content;
- return NULL;
+ return;
}
RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
part_content->len, !checked)) {
- SET_PART_UTF (text_part);
+ rspamd_mime_text_part_ucs_from_utf (task, text_part);
+ rspamd_mime_text_part_normalise (task, text_part);
+ rspamd_mime_text_part_maybe_renormalise (task, text_part);
- return part_content;
+ return;
}
else {
charset = charset_tok.begin;
- res_str = rspamd_mime_text_to_utf8 (task->task_pool, part_content->data,
- part_content->len,
- charset,
- &write_bytes,
- &err);
- if (res_str == NULL) {
+ if (!rspamd_mime_text_part_utf8_convert (task, text_part,
+ part_content, charset, &err)) {
msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
task->message_id,
charset,
@@ -558,14 +739,10 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
SET_PART_RAW (text_part);
g_error_free (err);
- return NULL;
+ text_part->utf_raw_content = part_content;
+ return;
}
}
- result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
- result_array->data = res_str;
- result_array->len = write_bytes;
SET_PART_UTF (text_part);
-
- return result_array;
}
diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h
index 58a799e45..5e30efdae 100644
--- a/src/libmime/mime_encoding.h
+++ b/src/libmime/mime_encoding.h
@@ -65,7 +65,7 @@ gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in,
* @param text_part
* @return
*/
-GByteArray * rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
+void rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part);
/**