diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-05-26 11:31:47 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-05-26 11:31:47 +0100 |
commit | 19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3 (patch) | |
tree | 6d0f43f3cd9ede27eb578562480633e27f042934 /src | |
parent | c11838dcbacbfd0a75e98f95a63a026217c88c51 (diff) | |
download | rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.tar.gz rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.zip |
[Rework] Use google-ced instead of libicu chardet as the latter sucks
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/mime_encoding.c | 33 |
1 file changed, 9 insertions, 24 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index 73b68fe06..8e7e54356 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -23,6 +23,7 @@ #include "mime_encoding.h" #include "message.h" #include "contrib/fastutf8/fastutf8.h" +#include "contrib/google-ced/ced_c.h" #include <unicode/ucnv.h> #include <unicode/ucsdet.h> #if U_ICU_VERSION_MAJOR_NUM >= 44 @@ -561,38 +562,22 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len) const char * rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen) { - static UCharsetDetector *csd; - const UCharsetMatch **csm, *sel = NULL; - UErrorCode uc_err = U_ZERO_ERROR; - gint32 matches, i, max_conf = G_MININT32, conf; - gdouble mean = 0.0, stddev = 0.0; - - if (csd == NULL) { - csd = ucsdet_open (&uc_err); - - g_assert (csd != NULL); - } + int nconsumed; + bool is_reliable; + const gchar *ced_name; if (rspamd_fast_utf8_validate (in, inlen) == 0) { return UTF8_CHARSET; } - ucsdet_setText (csd, in, inlen, &uc_err); - csm = ucsdet_detectAll (csd, &matches, &uc_err); - for (i = 0; i < matches; i ++) { - if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) { - max_conf = conf; - sel = csm[i]; - } + ced_name = ced_encoding_detect (in, inlen, NULL, NULL, + NULL, 0, CED_EMAIL_CORPUS, + false, &nconsumed, &is_reliable); - mean += (conf - mean) / (i + 1); - gdouble err = fabs (conf - mean); - stddev += (err - stddev) / (i + 1); - } + if (ced_name) { - if (sel && ((max_conf > 50) || (max_conf - mean > stddev * 1.25))) { - return ucsdet_getName (sel, &uc_err); + return ced_name; } return NULL; |