From 6e925ae527b87d75c8346d44c27b55252cc78dbf Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 6 Apr 2018 17:32:15 +0100 Subject: [PATCH] [Feature] Do not detect language twice for relative parts --- src/libmime/message.c | 294 +++++++++++++++++++++++++----------------- src/libmime/message.h | 1 + src/libserver/task.c | 2 +- 3 files changed, 179 insertions(+), 118 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index aaae40fa5..1eae26a43 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -55,157 +55,106 @@ free_byte_array_callback (void *pointer) } static void -rspamd_extract_words (struct rspamd_task *task, +rspamd_mime_part_extract_words (struct rspamd_task *task, struct rspamd_mime_text_part *part) { #ifdef WITH_SNOWBALL struct sb_stemmer *stem = NULL; #endif - rspamd_stat_token_t *w, ucs_w; + rspamd_stat_token_t *w; gchar *temp_word; const guchar *r; - guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0; + guint i, nlen, total_len = 0, short_len = 0; gdouble avg_len = 0; - /* Ugly workaround */ - if (IS_PART_HTML (part)) { - part->normalized_words = rspamd_tokenize_text ( - part->stripped_content->data, - part->stripped_content->len, IS_PART_UTF (part), task->cfg, - part->exceptions, FALSE, - NULL); - } - else { - part->normalized_words = rspamd_tokenize_text ( - part->stripped_content->data, - part->stripped_content->len, IS_PART_UTF (part), task->cfg, - part->exceptions, FALSE, - NULL); - } - if (part->normalized_words) { - part->normalized_hashes = g_array_sized_new (FALSE, FALSE, - sizeof (guint64), part->normalized_words->len); - - if (IS_PART_UTF (part) && task->lang_det) { - part->ucs32_words = g_array_sized_new (FALSE, FALSE, - sizeof (rspamd_stat_token_t), part->normalized_words->len); - } - - if (part->ucs32_words) { - struct rspamd_lang_detector_res *lang; - - for (i = 0; i < part->normalized_words->len; i++) { - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); - - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { - rspamd_language_detector_to_ucs (task->lang_det, - task->task_pool, - w, &ucs_w); - g_array_append_val (part->ucs32_words, ucs_w); - ucs_len += ucs_w.len; - } - } - - part->languages = rspamd_language_detector_detect (task, - task->lang_det, - part->ucs32_words, ucs_len); +#ifdef WITH_SNOWBALL + static GHashTable *stemmers = NULL; - if (part->languages->len > 0) { - lang = g_ptr_array_index (part->languages, 0); - part->language = lang->lang; + if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) { - msg_info_task ("detected part language: %s", part->language); + if (!stemmers) { + stemmers = g_hash_table_new (rspamd_strcase_hash, + rspamd_strcase_equal); } -#ifdef WITH_SNOWBALL - static GHashTable *stemmers = NULL; + stem = g_hash_table_lookup (stemmers, part->language); - if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) { + if (stem == NULL) { - if (!stemmers) { - stemmers = g_hash_table_new (rspamd_strcase_hash, - rspamd_strcase_equal); - } - - stem = g_hash_table_lookup (stemmers, part->language); + stem = sb_stemmer_new (part->language, "UTF_8"); if (stem == NULL) { - - stem = sb_stemmer_new (part->language, "UTF_8"); - - if (stem == NULL) { - msg_debug_task ("<%s> cannot create lemmatizer for %s language", - task->message_id, part->language); - } - else { - g_hash_table_insert (stemmers, g_strdup (part->language), - stem); - } + msg_debug_task ("<%s> cannot create lemmatizer for %s language", + task->message_id, part->language); + } + else { + g_hash_table_insert (stemmers, g_strdup (part->language), + stem); } } -#endif } +#endif + } - for (i = 0; i < part->normalized_words->len; i ++) { - guint64 h; + for (i = 0; i < part->normalized_words->len; i ++) { + guint64 h; - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); - r = NULL; + w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + r = NULL; #ifdef WITH_SNOWBALL - if (stem) { - r = sb_stemmer_stem (stem, w->begin, w->len); - } + if (stem) { + r = sb_stemmer_stem (stem, w->begin, w->len); + } #endif - if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { - avg_len = avg_len + (w->len - avg_len) / (double)i; - - if (r != NULL) { - nlen = strlen (r); - nlen = MIN (nlen, w->len); - temp_word = rspamd_mempool_alloc (task->task_pool, nlen); - memcpy (temp_word, r, nlen); + if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { + avg_len = avg_len + (w->len - avg_len) / (double)i; - if (IS_PART_UTF (part)) { - rspamd_str_lc_utf8 (temp_word, nlen); - } - else { - rspamd_str_lc (temp_word, nlen); - } + if (r != NULL) { + nlen = strlen (r); + nlen = MIN (nlen, w->len); + temp_word = rspamd_mempool_alloc (task->task_pool, nlen); + memcpy (temp_word, r, nlen); - w->begin = temp_word; - w->len = nlen; + if (IS_PART_UTF (part)) { + rspamd_str_lc_utf8 (temp_word, nlen); } else { - temp_word = rspamd_mempool_alloc (task->task_pool, w->len); - memcpy (temp_word, w->begin, w->len); + rspamd_str_lc (temp_word, nlen); + } - if (IS_PART_UTF (part)) { - rspamd_str_lc_utf8 (temp_word, w->len); - } - else { - rspamd_str_lc (temp_word, w->len); - } + w->begin = temp_word; + w->len = nlen; + } + else { + temp_word = rspamd_mempool_alloc (task->task_pool, w->len); + memcpy (temp_word, w->begin, w->len); - w->begin = temp_word; + if (IS_PART_UTF (part)) { + rspamd_str_lc_utf8 (temp_word, w->len); + } + else { + rspamd_str_lc (temp_word, w->len); } + + w->begin = temp_word; } + } - if (w->len > 0) { - /* - * We use static hash seed if we would want to use that in shingles - * computation in future - */ - h = rspamd_cryptobox_fast_hash_specific ( - RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, - w->begin, w->len, words_hash_seed); - g_array_append_val (part->normalized_hashes, h); - total_len += w->len; - - if (w->len <= 3) { - short_len ++; - } + if (w->len > 0) { + /* + * We use static hash seed if we would want to use that in shingles + * computation in future + */ + h = rspamd_cryptobox_fast_hash_specific ( + RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + w->begin, w->len, words_hash_seed); + g_array_append_val (part->normalized_hashes, h); + total_len += w->len; + + if (w->len <= 3) { + short_len ++; } } } @@ -239,8 +188,78 @@ rspamd_extract_words (struct rspamd_task *task, *short_len_p += short_len; } } +} + +static guint +rspamd_mime_part_create_words (struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + rspamd_stat_token_t *w, ucs_w; + guint i, ucs_len = 0; + + /* Ugly workaround */ + if (IS_PART_HTML (part)) { + part->normalized_words = rspamd_tokenize_text ( + part->stripped_content->data, + part->stripped_content->len, IS_PART_UTF (part), task->cfg, + part->exceptions, FALSE, + NULL); + } + else { + part->normalized_words = rspamd_tokenize_text ( + part->stripped_content->data, + part->stripped_content->len, IS_PART_UTF (part), task->cfg, + part->exceptions, FALSE, + NULL); + } + if (part->normalized_words) { + part->normalized_hashes = g_array_sized_new (FALSE, FALSE, + sizeof (guint64), part->normalized_words->len); + + if (IS_PART_UTF (part) && task->lang_det) { + part->ucs32_words = g_array_sized_new (FALSE, FALSE, + sizeof (rspamd_stat_token_t), part->normalized_words->len); + } + + if (part->ucs32_words) { + + + for (i = 0; i < part->normalized_words->len; i++) { + w = &g_array_index (part->normalized_words, rspamd_stat_token_t, + i); + + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + rspamd_language_detector_to_ucs (task->lang_det, + task->task_pool, + w, &ucs_w); + g_array_append_val (part->ucs32_words, ucs_w); + ucs_len += ucs_w.len; + } + } + } + } + + return ucs_len; +} + +static void +rspamd_mime_part_detect_language (struct rspamd_task *task, + struct rspamd_mime_text_part *part, guint ucs_len) +{ + struct rspamd_lang_detector_res *lang; + if (part->ucs32_words) { + part->languages = rspamd_language_detector_detect (task, + task->lang_det, + part->ucs32_words, ucs_len); + if (part->languages->len > 0) { + lang = g_ptr_array_index (part->languages, 0); + part->language = lang->lang; + + msg_info_task ("detected part language: %s", part->language); + } + } } static void @@ -833,7 +852,7 @@ rspamd_message_process_text_part (struct rspamd_task *task, text_part->exceptions); } - rspamd_extract_words (task, text_part); + text_part->ucs_len = rspamd_mime_part_create_words (task, text_part); } /* Creates message from various data using libmagic to detect type */ @@ -1128,6 +1147,41 @@ rspamd_message_parse (struct rspamd_task *task) if (rspamd_ftok_cmp (&p1->mime_part->parent_part->ct->subtype, &srch) == 0) { if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) && p1->normalized_hashes && p2->normalized_hashes) { + /* + * We also detect language on one part and propagate it to + * another one + */ + struct rspamd_mime_text_part *sel; + + /* Prefer HTML as text part is not displayed normally */ + if (IS_PART_HTML (p1)) { + sel = p1; + } + else if (IS_PART_HTML (p2)) { + sel = p2; + } + else { + if (p1->ucs_len > p2->ucs_len) { + sel = p1; + } + else { + sel = p2; + } + } + + rspamd_mime_part_detect_language (task, sel, sel->ucs_len); + + if (sel->language && sel->language[0]) { + /* Propagate language */ + if (sel == p1) { + p2->language = sel->language; + p2->languages = g_ptr_array_ref (sel->languages); + } + else { + p1->language = sel->language; + p1->languages = g_ptr_array_ref (sel->languages); + } + } tw = p1->normalized_hashes->len + p2->normalized_hashes->len; @@ -1180,6 +1234,12 @@ rspamd_message_parse (struct rspamd_task *task) guint total_words = 0; PTR_ARRAY_FOREACH (task->text_parts, i, text_part) { + if (!text_part->language) { + rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len); + } + + rspamd_mime_part_extract_words (task, text_part); + if (text_part->normalized_words) { total_words += text_part->normalized_words->len; } diff --git a/src/libmime/message.h b/src/libmime/message.h index 5ee5b4c43..0ed2a5c66 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -103,6 +103,7 @@ struct rspamd_mime_text_part { guint empty_lines; guint capital_letters; guint numeric_characters; + guint ucs_len; }; enum rspamd_received_type { diff --git a/src/libserver/task.c b/src/libserver/task.c index 55adc1a4f..b5594816b 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -245,7 +245,7 @@ rspamd_task_free (struct rspamd_task *task) g_array_free (tp->ucs32_words, TRUE); } if (tp->languages) { - g_ptr_array_free (tp->languages, TRUE); + g_ptr_array_unref (tp->languages); } } -- 2.39.5