From: Vsevolod Stakhov
Date: Sat, 27 Jan 2018 14:26:45 +0000 (+0000)
Subject: [Project] Detect some languages based on unicode script
X-Git-Tag: 1.7.0~245
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=b777078525df449d31c099d70df07051ebf526b9;p=rspamd.git

[Project] Detect some languages based on unicode script
---

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 64e820b31..98c4a10ef 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 
 static const gsize default_short_text_limit = 200;
@@ -205,7 +206,7 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
 		target = d->unigramms;
 		break;
 	case 2:
-		/* Ignore */
+		g_assert_not_reached ();
 		break;
 	case 3:
 		target = d->trigramms;
@@ -320,6 +321,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 			g_hash_table_insert (d->unicode_scripts, (gpointer)&uc_match->unicode_code,
 					nelt);
 			nelt->flags |= RS_LANGUAGE_UNISCRIPT;
+			msg_info_config ("loaded unicode script only %s language: %d",
+					nelt->name,
+					uc_match->unicode_code);
 		}
 		else {
 			GPtrArray *ngramms;
@@ -414,13 +418,12 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 			}
 
 			g_ptr_array_free (ngramms, TRUE);
+			msg_info_config ("loaded %s language, %d unigramms, %d trigramms",
+					nelt->name,
+					(gint)nelt->unigramms_total,
+					(gint)nelt->trigramms_total);
 		}
 
-		msg_info_config ("loaded %s language, %d unigramms, %d trigramms",
-				nelt->name,
-				(gint)nelt->unigramms_total,
-				(gint)nelt->trigramms_total);
-
 		g_ptr_array_add (d->languages, nelt);
 		ucl_object_unref (top);
 	}
@@ -514,11 +517,13 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
 		g_free (fname);
 	}
 
-	msg_info_config ("loaded %d languages, %d unigramms, "
+	msg_info_config ("loaded %d languages, %d unicode only languages, "
+			"%d unigramms, "
 			"%d trigramms",
 			(gint)ret->languages->len,
-			g_hash_table_size (ret->unigramms),
-			g_hash_table_size (ret->trigramms));
+			(gint)g_hash_table_size (ret->unicode_scripts),
+			(gint)g_hash_table_size (ret->unigramms),
+			(gint)g_hash_table_size (ret->trigramms));
 end:
 	if (gl.gl_pathc > 0) {
 		globfree (&gl);
@@ -788,14 +793,85 @@ rspamd_language_detector_filter_negligible (struct rspamd_task *task,
 	msg_debug_lang_det ("removed %d languages", filtered);
 }
 
+static gboolean
+rspamd_language_detector_is_unicode (struct rspamd_task *task,
+		struct rspamd_lang_detector *d,
+		GArray *ucs_tokens,
+		goffset *selected_words,
+		gsize nparts,
+		GHashTable *candidates)
+{
+	guint i, j, total_found = 0, total_checked = 0;
+	rspamd_stat_token_t *tok;
+	UChar t;
+	gint uc_script;
+	struct rspamd_language_elt *elt;
+	struct rspamd_lang_detector_res *cand;
+	GHashTableIter it;
+	gpointer k, v;
+
+	for (i = 0; i < nparts; i++) {
+		tok = &g_array_index (ucs_tokens, rspamd_stat_token_t,
+				selected_words[i]);
+
+		for (j = 0; j < tok->len; j ++) {
+			t = *(((UChar *)tok->begin) + j);
+
+			uc_script = ublock_getCode (t);
+			elt = g_hash_table_lookup (d->unicode_scripts, &uc_script);
+
+			if (elt) {
+				cand = g_hash_table_lookup (candidates, elt->name);
+
+				if (cand == NULL) {
+					cand = g_malloc (sizeof (*cand));
+					cand->elt = elt;
+					cand->lang = elt->name;
+					cand->prob = 1;
+
+					g_hash_table_insert (candidates, (gpointer)cand->lang, cand);
+				} else {
+					/* Update guess */
+					cand->prob ++;
+				}
+
+				total_found ++;
+			}
+
+			total_checked ++;
+		}
+
+		if (i >= nparts / 2 && total_found == 0) {
+			/* No special scripts found, stop processing */
+			return FALSE;
+		}
+	}
+
+	if (total_found < total_checked / 2) {
+		/* Not enough confidence */
+		return FALSE;
+	}
+	else {
+		/* Filter candidates */
+		g_hash_table_iter_init (&it, candidates);
+
+		while (g_hash_table_iter_next (&it, &k, &v)) {
+			cand = (struct rspamd_lang_detector_res *)v;
+
+			cand->prob = cand->prob / total_checked;
+		}
+	}
+
+	return TRUE;
+}
+
 static void
 rspamd_language_detector_detect_type (struct rspamd_task *task,
 		guint nwords,
 		struct rspamd_lang_detector *d,
 		GArray *ucs_tokens,
 		GHashTable *candidates,
-		enum rspamd_language_gramm_type type)
-{
+		enum rspamd_language_gramm_type type) {
 	guint nparts = MIN (ucs_tokens->len, nwords);
 	goffset *selected_words;
 	rspamd_stat_token_t *tok;
@@ -805,18 +881,22 @@ rspamd_language_detector_detect_type (struct rspamd_task *task,
 	rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words);
 	msg_debug_lang_det ("randomly selected %d words", nparts);
 
-	/* Deal with the first word in a special case */
-	tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[0]);
+	/* Check unicode scripts */
+	if (g_hash_table_size (candidates) != 0 ||
+			!rspamd_language_detector_is_unicode (task, d, ucs_tokens,
+					selected_words, nparts, candidates)) {
 
-	rspamd_language_detector_detect_word (task, d, tok, candidates, type);
+		for (i = 0; i < nparts; i++) {
+			tok = &g_array_index (ucs_tokens, rspamd_stat_token_t,
+					selected_words[i]);
+			rspamd_language_detector_detect_word (task, d, tok, candidates,
+					type);
+		}
 
-	for (i = 1; i < nparts; i ++) {
-		tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[i]);
-		rspamd_language_detector_detect_word (task, d, tok, candidates, type);
+		/* Filter negligible candidates */
+		rspamd_language_detector_filter_negligible (task, candidates);
 	}
 
-	/* Filter negligible candidates */
-	rspamd_language_detector_filter_negligible (task, candidates);
 	g_free (selected_words);
 }
 
@@ -951,8 +1031,7 @@ rspamd_language_detector_detect (struct rspamd_task *task,
 						candidates);
 
 				if (r == rs_detect_none) {
-					msg_debug_lang_det ("short mode; no trigramms found, "
-							"switch to unigramms");
+					msg_debug_lang_det ("no trigramms found, switch to unigramms");
 					r = rspamd_language_detector_try_ngramm (task, default_words, d,
 							ucs_tokens, rs_unigramm,
 							candidates);