From: Vsevolod Stakhov Date: Sat, 13 Jan 2018 17:41:57 +0000 (+0000) Subject: [Project] Preliminary version of ngramms based language detector X-Git-Tag: 1.7.0~283 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=b72c8f94ccbbe8362b38a4a9f35823367ad21a9c;p=rspamd.git [Project] Preliminary version of ngramms based language detector --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index fb9af6df7..66901e6b9 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -263,7 +263,7 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, } static void -rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords, +rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, goffset *offsets_out) { guint step_len, remainder, i, out_idx; @@ -362,7 +362,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, } } else { - if (tok->len >= cur_off) { + if (tok->len <= cur_off) { return -1; } @@ -406,17 +406,21 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d, freq = ((gdouble)GPOINTER_TO_UINT ( g_hash_table_lookup (ngramms, window))) / class_freq; - cand = g_hash_table_lookup (candidates, elt->name); - if (cand == NULL) { - cand = g_malloc (sizeof (*cand)); - cand->elt = elt; - cand->lang = elt->name; - cand->prob = freq; - } - else { - /* Update guess */ - cand->prob += freq; + if (freq > 0) { + cand = g_hash_table_lookup (candidates, elt->name); + + if (cand == NULL) { + cand = g_malloc (sizeof (*cand)); + cand->elt = elt; + cand->lang = elt->name; + cand->prob = freq; + + g_hash_table_insert (candidates, (gpointer)elt->name, cand); + } else { + /* Update guess */ + cand->prob += freq; + } } } } @@ -583,7 +587,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates) static void rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, - GPtrArray *ucs_tokens, + GArray *ucs_tokens, GHashTable *candidates, enum rspamd_language_gramm_type type, gboolean start_over) @@ -597,7 +601,7 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words); /* Deal with the first word in a special case */ - tok = g_ptr_array_index (ucs_tokens, selected_words[0]); + tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[0]); if (start_over) { rspamd_language_detector_detect_word (d, tok, candidates, type); @@ -607,7 +611,7 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, } for (i = 1; i < nparts; i ++) { - tok = g_ptr_array_index (ucs_tokens, selected_words[i]); + tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[i]); rspamd_language_detector_update_guess (d, tok, candidates, type); } @@ -620,13 +624,13 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b) { const struct rspamd_lang_detector_res *canda = *(const struct rspamd_lang_detector_res **)a, - *candb = *(const struct rspamd_lang_detector_res **)a; + *candb = *(const struct rspamd_lang_detector_res **)b; if (canda->prob > candb->prob) { - return 1; + return -1; } else if (candb->prob > canda->prob) { - return -1; + return 1; } return 0; @@ -634,7 +638,7 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b) GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d, - GPtrArray *ucs_tokens, gsize words_len) + GArray *ucs_tokens, gsize words_len) { GHashTable *candidates; GPtrArray *result; @@ -690,6 +694,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, while (g_hash_table_iter_next (&it, &k, &v)) { cand = (struct rspamd_lang_detector_res *) v; + msg_err ("%s -> %.2f", cand->lang, cand->prob); g_ptr_array_add (result, cand); g_hash_table_iter_steal (&it); } diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h index 9373b09f2..048e425f6 100644 --- a/src/libmime/lang_detection.h +++ b/src/libmime/lang_detection.h @@ -55,6 +55,6 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, * @return array of struct rspamd_lang_detector_res sorted by freq descending */ GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d, - GPtrArray *ucs_tokens, gsize words_len); + GArray *ucs_tokens, gsize words_len); #endif diff --git a/src/libmime/message.c b/src/libmime/message.c index af1147770..4bac77062 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -93,6 +93,8 @@ rspamd_extract_words (struct rspamd_task *task, } if (part->ucs32_words) { + struct rspamd_lang_detector_res *lang; + for (i = 0; i < part->normalized_words->len; i++) { w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); @@ -103,6 +105,16 @@ rspamd_extract_words (struct rspamd_task *task, ucs_len += ucs_w.len; } + part->languages = rspamd_language_detector_detect (task->lang_det, + part->ucs32_words, ucs_len); + + if (part->languages->len > 0) { + lang = g_ptr_array_index (part->languages, 0); + part->language = lang->lang; + + msg_info_task ("detected part language: %s", part->language); + } + #ifdef WITH_SNOWBALL static GHashTable *stemmers = NULL; @@ -869,7 +881,6 @@ rspamd_message_parse (struct rspamd_task *task) if (RSPAMD_TASK_IS_EMPTY (task)) { /* Don't do anything with empty task */ - return TRUE; } diff --git a/src/libmime/message.h b/src/libmime/message.h index 90f86b3bd..5ee5b4c43 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -78,10 +78,8 @@ struct rspamd_mime_part { #define IS_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML) struct rspamd_mime_text_part { - guint flags; - GUnicodeScript script; - const gchar *lang_code; const gchar *language; + GPtrArray *languages; const gchar *real_charset; rspamd_ftok_t raw; rspamd_ftok_t parsed; @@ -95,6 +93,7 @@ struct rspamd_mime_text_part { GArray *normalized_words; GArray *ucs32_words; GArray *normalized_hashes; + guint flags; guint nlines; guint spaces; guint non_ascii_chars; diff --git a/src/libserver/task.c b/src/libserver/task.c index 7b665d983..961af1c9f 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -242,6 +242,9 @@ rspamd_task_free (struct rspamd_task *task) if (tp->ucs32_words) { g_array_free (tp->ucs32_words, TRUE); } + if (tp->languages) { + g_ptr_array_free (tp->languages, TRUE); + } } if (task->rcpt_envelope) { diff --git a/src/libstat/backends/sqlite3_backend.c b/src/libstat/backends/sqlite3_backend.c index 6563d0dc1..8682ca73f 100644 --- a/src/libstat/backends/sqlite3_backend.c +++ b/src/libstat/backends/sqlite3_backend.c @@ -365,8 +365,8 @@ rspamd_sqlite3_get_language (struct rspamd_stat_sqlite3_db *db, for (i = 0; i < task->text_parts->len; i++) { tp = g_ptr_array_index (task->text_parts, i); - if (tp->lang_code != NULL && tp->lang_code[0] != '\0' && - strcmp (tp->lang_code, "en") != 0) { + if (tp->language != NULL && tp->language[0] != '\0' && + strcmp (tp->language, "en") != 0) { language = tp->language; break; } diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index cce78ff3a..7fc8f74ac 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -745,8 +745,8 @@ lua_textpart_get_language (lua_State * L) struct rspamd_mime_text_part *part = lua_check_textpart (L); if (part != NULL) { - if (part->lang_code != NULL && part->lang_code[0] != '\0') { - lua_pushstring (L, part->lang_code); + if (part->language != NULL && part->language[0] != '\0') { + lua_pushstring (L, part->language); return 1; } }