From ac38a392e434721bd80360a78385a069e31e5f09 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 22 Aug 2018 10:36:00 +0100 Subject: [PATCH] [Minor] Improve charset detection logic --- src/libmime/mime_encoding.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index c316b264c..605ab7649 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -23,6 +23,7 @@ #include "message.h" #include <unicode/ucnv.h> #include <unicode/ucsdet.h> +#include <math.h> #define UTF8_CHARSET "UTF-8" @@ -372,6 +373,7 @@ rspamd_mime_charset_find_by_content (gchar *in, gsize inlen) const UCharsetMatch **csm, *sel = NULL; UErrorCode uc_err = U_ZERO_ERROR; gint32 matches, i, max_conf = G_MININT32, conf; + gdouble mean = 0.0, stddev = 0.0; if (csd == NULL) { csd = ucsdet_open (&uc_err); @@ -398,9 +400,13 @@ detect: max_conf = conf; sel = csm[i]; } + + mean += (conf - mean) / (i + 1); + gdouble err = fabs (conf - mean); + stddev += (err - stddev) / (i + 1); } - if (sel && max_conf > 50) { + if (sel && ((max_conf > 50) || (max_conf - mean > stddev * 1.25))) { return ucsdet_getName (sel, &uc_err); } -- 2.39.5