diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-08-22 10:36:00 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-08-22 10:36:00 +0100 |
commit | ac38a392e434721bd80360a78385a069e31e5f09 (patch) | |
tree | 90da122adfe9f8f8a25055dad98c326f358783e6 /src/libmime/mime_encoding.c | |
parent | 124cebf7a0920ed3c74de9ace8ef3ae9cc2b777e (diff) | |
download | rspamd-ac38a392e434721bd80360a78385a069e31e5f09.tar.gz rspamd-ac38a392e434721bd80360a78385a069e31e5f09.zip |
[Minor] Improve charset detection logic
Diffstat (limited to 'src/libmime/mime_encoding.c')
-rw-r--r-- | src/libmime/mime_encoding.c | 8 |
1 files changed, 7 insertions, 1 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index c316b264c..605ab7649 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -23,6 +23,7 @@
 #include "message.h"
 #include <unicode/ucnv.h>
 #include <unicode/ucsdet.h>
+#include <math.h>
 
 #define UTF8_CHARSET "UTF-8"
 
@@ -372,6 +373,7 @@ rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
 	const UCharsetMatch **csm, *sel = NULL;
 	UErrorCode uc_err = U_ZERO_ERROR;
 	gint32 matches, i, max_conf = G_MININT32, conf;
+	gdouble mean = 0.0, stddev = 0.0;
 
 	if (csd == NULL) {
 		csd = ucsdet_open (&uc_err);
@@ -398,9 +400,13 @@ detect:
 			max_conf = conf;
 			sel = csm[i];
 		}
+
+		mean += (conf - mean) / (i + 1);
+		gdouble err = fabs (conf - mean);
+		stddev += (err - stddev) / (i + 1);
 	}
 
-	if (sel && max_conf > 50) {
+	if (sel && ((max_conf > 50) || (max_conf - mean > stddev * 1.25))) {
 		return ucsdet_getName (sel, &uc_err);
 	}
 