about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-05-26 11:31:47 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-05-26 11:31:47 +0100
commit 19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3 (patch)
tree 6d0f43f3cd9ede27eb578562480633e27f042934 /src
parent c11838dcbacbfd0a75e98f95a63a026217c88c51 (diff)
download rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.tar.gz
rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.zip
[Rework] Use google-ced instead of libicu chardet as the former sucks
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/mime_encoding.c 33
1 files changed, 9 insertions, 24 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 73b68fe06..8e7e54356 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -23,6 +23,7 @@
#include "mime_encoding.h"
#include "message.h"
#include "contrib/fastutf8/fastutf8.h"
+#include "contrib/google-ced/ced_c.h"
#include <unicode/ucnv.h>
#include <unicode/ucsdet.h>
#if U_ICU_VERSION_MAJOR_NUM >= 44
@@ -561,38 +562,22 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
const char *
rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
{
- static UCharsetDetector *csd;
- const UCharsetMatch **csm, *sel = NULL;
- UErrorCode uc_err = U_ZERO_ERROR;
- gint32 matches, i, max_conf = G_MININT32, conf;
- gdouble mean = 0.0, stddev = 0.0;
-
- if (csd == NULL) {
- csd = ucsdet_open (&uc_err);
-
- g_assert (csd != NULL);
- }
+ int nconsumed;
+ bool is_reliable;
+ const gchar *ced_name;
if (rspamd_fast_utf8_validate (in, inlen) == 0) {
return UTF8_CHARSET;
}
- ucsdet_setText (csd, in, inlen, &uc_err);
- csm = ucsdet_detectAll (csd, &matches, &uc_err);
- for (i = 0; i < matches; i ++) {
- if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) {
- max_conf = conf;
- sel = csm[i];
- }
+ ced_name = ced_encoding_detect (in, inlen, NULL, NULL,
+ NULL, 0, CED_EMAIL_CORPUS,
+ false, &nconsumed, &is_reliable);
- mean += (conf - mean) / (i + 1);
- gdouble err = fabs (conf - mean);
- stddev += (err - stddev) / (i + 1);
- }
+ if (ced_name) {
- if (sel && ((max_conf > 50) || (max_conf - mean > stddev * 1.25))) {
- return ucsdet_getName (sel, &uc_err);
+ return ced_name;
}
return NULL;