From 891590d6093196348f63655a6c308fdca79ec061 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 3 Feb 2018 15:25:23 +0000 Subject: [PATCH] [Project] Further improvements to language detector --- src/libmime/lang_detection.c | 62 +++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 2420e1c5f..e15f99f76 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include static const gsize default_short_text_limit = 200; @@ -647,7 +647,7 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1), utf_token->begin, utf_token->len, &uc_err); - if (nsym >= 0) { + if (nsym >= 0 && uc_err == U_ZERO_ERROR) { rspamd_language_detector_ucs_lowercase (out, nsym); ucs_token->begin = (const gchar *) out; ucs_token->len = nsym; @@ -664,6 +664,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, guint step_len, remainder, i, out_idx; guint64 coin, sel; goffset tmp; + rspamd_stat_token_t *tok; g_assert (nwords != 0); g_assert (offsets_out != NULL); @@ -694,17 +695,45 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, for (i = step_len + remainder; i < ucs_tokens->len; i += step_len, out_idx ++) { + guint ntries = 0; coin = rspamd_random_uint64_fast (); sel = (coin % step_len) + i; - offsets_out[out_idx] = sel; + + for (;;) { + tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel); + /* Filter bad tokens */ + if (tok->len >= 2 && u_isalpha (*(UChar *)tok->begin) + && u_isalpha (*(((UChar *)tok->begin) + (tok->len - 1)))) { + offsets_out[out_idx] = sel; + break; + } + else { + ntries ++; + coin = rspamd_random_uint64_fast (); + + if (ntries < step_len) { + sel = (coin % step_len) + i; + } + else if (ntries < ucs_tokens->len) { + sel = coin % ucs_tokens->len; + } + else { + offsets_out[out_idx] = sel; + break; + } + } + } } + + /* * Fisher-Yates algorithm: * for i from 0 to n−2 do - * j ← random integer such that i ≤ j < n - * exchange a[i] and a[j] - */ + * j ← random integer such that i ≤ j < n + * exchange a[i] and a[j] + */ +#if 0 if (out_idx > 2) { for (i = 0; i < out_idx - 2; i++) { coin = rspamd_random_uint64_fast (); @@ -715,6 +744,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, offsets_out[sel] = tmp; } } +#endif } enum rspamd_language_gramm_type { @@ -749,10 +779,11 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, /* No more fun */ return -1; } - - /* Normal case */ - for (i = 0; i < wlen; i ++) { - window[i] = *(((UChar *)tok->begin) + cur_off + i); + else { + /* Normal case */ + for (i = 0; i < wlen; i++) { + window[i] = *(((UChar *) tok->begin) + cur_off + i); + } } } else { @@ -780,8 +811,7 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, struct rspamd_ngramm_elt *elt; struct rspamd_lang_detector_res *cand; GHashTable *ngramms; - /* Ignore if ngramm is found in that amount of languages */ - static const guint languages_cutoff = 10; + gdouble mult = 1.0, prob; switch (type) { case rs_unigramm: @@ -795,20 +825,22 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, ar = g_hash_table_lookup (ngramms, window); - if (ar && ar->len < languages_cutoff) { + if (ar) { PTR_ARRAY_FOREACH (ar, i, elt) { cand = g_hash_table_lookup (candidates, elt->elt->name); + prob = elt->prob * mult; + if (cand == NULL) { cand = g_malloc (sizeof (*cand)); cand->elt = elt->elt; cand->lang = elt->elt->name; - cand->prob = elt->prob; + cand->prob = prob; g_hash_table_insert (candidates, (gpointer)cand->lang, cand); } else { /* Update guess */ - cand->prob += elt->prob; + cand->prob += prob; } } } -- 2.39.5