]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Do not detect language twice for relative parts
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 6 Apr 2018 16:32:15 +0000 (17:32 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 6 Apr 2018 16:32:15 +0000 (17:32 +0100)
src/libmime/message.c
src/libmime/message.h
src/libserver/task.c

index aaae40fa567a692046f6345a1efc8fc6623f607f..1eae26a4342ef659f25ab5d31c1a77a7e07f6ad7 100644 (file)
@@ -55,157 +55,106 @@ free_byte_array_callback (void *pointer)
 }
 
 static void
-rspamd_extract_words (struct rspamd_task *task,
+rspamd_mime_part_extract_words (struct rspamd_task *task,
                struct rspamd_mime_text_part *part)
 {
 #ifdef WITH_SNOWBALL
        struct sb_stemmer *stem = NULL;
 #endif
-       rspamd_stat_token_t *w, ucs_w;
+       rspamd_stat_token_t *w;
        gchar *temp_word;
        const guchar *r;
-       guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0;
+       guint i, nlen, total_len = 0, short_len = 0;
        gdouble avg_len = 0;
 
-       /* Ugly workaround */
-       if (IS_PART_HTML (part)) {
-               part->normalized_words = rspamd_tokenize_text (
-                               part->stripped_content->data,
-                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
-                               part->exceptions, FALSE,
-                               NULL);
-       }
-       else {
-               part->normalized_words = rspamd_tokenize_text (
-                               part->stripped_content->data,
-                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
-                               part->exceptions, FALSE,
-                               NULL);
-       }
-
        if (part->normalized_words) {
-               part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
-                               sizeof (guint64), part->normalized_words->len);
-
-               if (IS_PART_UTF (part) && task->lang_det) {
-                       part->ucs32_words =  g_array_sized_new (FALSE, FALSE,
-                                       sizeof (rspamd_stat_token_t), part->normalized_words->len);
-               }
-
-               if (part->ucs32_words) {
-                       struct rspamd_lang_detector_res *lang;
-
-                       for (i = 0; i < part->normalized_words->len; i++) {
-                               w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
-
-                               if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
-                                       rspamd_language_detector_to_ucs (task->lang_det,
-                                                       task->task_pool,
-                                                       w, &ucs_w);
-                                       g_array_append_val (part->ucs32_words, ucs_w);
-                                       ucs_len += ucs_w.len;
-                               }
-                       }
-
-                       part->languages = rspamd_language_detector_detect (task,
-                                       task->lang_det,
-                                       part->ucs32_words, ucs_len);
+#ifdef WITH_SNOWBALL
+               static GHashTable *stemmers = NULL;
 
-                       if (part->languages->len > 0) {
-                               lang = g_ptr_array_index (part->languages, 0);
-                               part->language = lang->lang;
+               if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
 
-                               msg_info_task ("detected part language: %s", part->language);
+                       if (!stemmers) {
+                               stemmers = g_hash_table_new (rspamd_strcase_hash,
+                                               rspamd_strcase_equal);
                        }
 
-#ifdef WITH_SNOWBALL
-                       static GHashTable *stemmers = NULL;
+                       stem = g_hash_table_lookup (stemmers, part->language);
 
-                       if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
+                       if (stem == NULL) {
 
-                               if (!stemmers) {
-                                       stemmers = g_hash_table_new (rspamd_strcase_hash,
-                                                       rspamd_strcase_equal);
-                               }
-
-                               stem = g_hash_table_lookup (stemmers, part->language);
+                               stem = sb_stemmer_new (part->language, "UTF_8");
 
                                if (stem == NULL) {
-
-                                       stem = sb_stemmer_new (part->language, "UTF_8");
-
-                                       if (stem == NULL) {
-                                               msg_debug_task ("<%s> cannot create lemmatizer for %s language",
-                                                               task->message_id, part->language);
-                                       }
-                                       else {
-                                               g_hash_table_insert (stemmers, g_strdup (part->language),
-                                                               stem);
-                                       }
+                                       msg_debug_task ("<%s> cannot create lemmatizer for %s language",
+                                                       task->message_id, part->language);
+                               }
+                               else {
+                                       g_hash_table_insert (stemmers, g_strdup (part->language),
+                                                       stem);
                                }
                        }
-#endif
                }
+#endif
+       }
 
-               for (i = 0; i < part->normalized_words->len; i ++) {
-                       guint64 h;
+       for (i = 0; i < part->normalized_words->len; i ++) {
+               guint64 h;
 
-                       w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
-                       r = NULL;
+               w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+               r = NULL;
 #ifdef WITH_SNOWBALL
-                       if (stem) {
-                               r = sb_stemmer_stem (stem, w->begin, w->len);
-                       }
+               if (stem) {
+                       r = sb_stemmer_stem (stem, w->begin, w->len);
+               }
 #endif
 
-                       if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
-                               avg_len = avg_len + (w->len - avg_len) / (double)i;
-
-                               if (r != NULL) {
-                                       nlen = strlen (r);
-                                       nlen = MIN (nlen, w->len);
-                                       temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
-                                       memcpy (temp_word, r, nlen);
+               if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
+                       avg_len = avg_len + (w->len - avg_len) / (double)i;
 
-                                       if (IS_PART_UTF (part)) {
-                                               rspamd_str_lc_utf8 (temp_word, nlen);
-                                       }
-                                       else {
-                                               rspamd_str_lc (temp_word, nlen);
-                                       }
+                       if (r != NULL) {
+                               nlen = strlen (r);
+                               nlen = MIN (nlen, w->len);
+                               temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
+                               memcpy (temp_word, r, nlen);
 
-                                       w->begin = temp_word;
-                                       w->len = nlen;
+                               if (IS_PART_UTF (part)) {
+                                       rspamd_str_lc_utf8 (temp_word, nlen);
                                }
                                else {
-                                       temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
-                                       memcpy (temp_word, w->begin, w->len);
+                                       rspamd_str_lc (temp_word, nlen);
+                               }
 
-                                       if (IS_PART_UTF (part)) {
-                                               rspamd_str_lc_utf8 (temp_word, w->len);
-                                       }
-                                       else {
-                                               rspamd_str_lc (temp_word, w->len);
-                                       }
+                               w->begin = temp_word;
+                               w->len = nlen;
+                       }
+                       else {
+                               temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
+                               memcpy (temp_word, w->begin, w->len);
 
-                                       w->begin = temp_word;
+                               if (IS_PART_UTF (part)) {
+                                       rspamd_str_lc_utf8 (temp_word, w->len);
+                               }
+                               else {
+                                       rspamd_str_lc (temp_word, w->len);
                                }
+
+                               w->begin = temp_word;
                        }
+               }
 
-                       if (w->len > 0) {
-                               /*
-                                * We use static hash seed if we would want to use that in shingles
-                                * computation in future
-                                */
-                               h = rspamd_cryptobox_fast_hash_specific (
-                                               RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
-                                               w->begin, w->len, words_hash_seed);
-                               g_array_append_val (part->normalized_hashes, h);
-                               total_len += w->len;
-
-                               if (w->len <= 3) {
-                                       short_len ++;
-                               }
+               if (w->len > 0) {
+                       /*
+                        * We use static hash seed if we would want to use that in shingles
+                        * computation in future
+                        */
+                       h = rspamd_cryptobox_fast_hash_specific (
+                                       RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+                                       w->begin, w->len, words_hash_seed);
+                       g_array_append_val (part->normalized_hashes, h);
+                       total_len += w->len;
+
+                       if (w->len <= 3) {
+                               short_len ++;
                        }
                }
        }
@@ -239,8 +188,78 @@ rspamd_extract_words (struct rspamd_task *task,
                        *short_len_p += short_len;
                }
        }
+}
+
+static guint
+rspamd_mime_part_create_words (struct rspamd_task *task,
+               struct rspamd_mime_text_part *part)
+{
+       rspamd_stat_token_t *w, ucs_w;
+       guint i, ucs_len = 0;
+
+       /* Ugly workaround */
+       if (IS_PART_HTML (part)) {
+               part->normalized_words = rspamd_tokenize_text (
+                               part->stripped_content->data,
+                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+                               part->exceptions, FALSE,
+                               NULL);
+       }
+       else {
+               part->normalized_words = rspamd_tokenize_text (
+                               part->stripped_content->data,
+                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+                               part->exceptions, FALSE,
+                               NULL);
+       }
+       if (part->normalized_words) {
+               part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
+                               sizeof (guint64), part->normalized_words->len);
+
+               if (IS_PART_UTF (part) && task->lang_det) {
+                       part->ucs32_words = g_array_sized_new (FALSE, FALSE,
+                                       sizeof (rspamd_stat_token_t), part->normalized_words->len);
+               }
+
+               if (part->ucs32_words) {
+
+
+                       for (i = 0; i < part->normalized_words->len; i++) {
+                               w = &g_array_index (part->normalized_words, rspamd_stat_token_t,
+                                               i);
+
+                               if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                                       rspamd_language_detector_to_ucs (task->lang_det,
+                                                       task->task_pool,
+                                                       w, &ucs_w);
+                                       g_array_append_val (part->ucs32_words, ucs_w);
+                                       ucs_len += ucs_w.len;
+                               }
+                       }
+               }
+       }
+
+       return ucs_len;
+}
+
+static void
+rspamd_mime_part_detect_language (struct rspamd_task *task,
+               struct rspamd_mime_text_part *part, guint ucs_len)
+{
+       struct rspamd_lang_detector_res *lang;
 
+       if (part->ucs32_words) {
+               part->languages = rspamd_language_detector_detect (task,
+                               task->lang_det,
+                               part->ucs32_words, ucs_len);
 
+               if (part->languages->len > 0) {
+                       lang = g_ptr_array_index (part->languages, 0);
+                       part->language = lang->lang;
+
+                       msg_info_task ("detected part language: %s", part->language);
+               }
+       }
 }
 
 static void
@@ -833,7 +852,7 @@ rspamd_message_process_text_part (struct rspamd_task *task,
                                text_part->exceptions);
        }
 
-       rspamd_extract_words (task, text_part);
+       text_part->ucs_len = rspamd_mime_part_create_words (task, text_part);
 }
 
 /* Creates message from various data using libmagic to detect type */
@@ -1128,6 +1147,41 @@ rspamd_message_parse (struct rspamd_task *task)
                        if (rspamd_ftok_cmp (&p1->mime_part->parent_part->ct->subtype, &srch) == 0) {
                                if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
                                                p1->normalized_hashes && p2->normalized_hashes) {
+                                       /*
+                                        * We also detect language on one part and propagate it to
+                                        * another one
+                                        */
+                                       struct rspamd_mime_text_part *sel;
+
+                                       /* Prefer HTML as text part is not displayed normally */
+                                       if (IS_PART_HTML (p1)) {
+                                               sel = p1;
+                                       }
+                                       else if (IS_PART_HTML (p2)) {
+                                               sel = p2;
+                                       }
+                                       else {
+                                               if (p1->ucs_len > p2->ucs_len) {
+                                                       sel = p1;
+                                               }
+                                               else {
+                                                       sel = p2;
+                                               }
+                                       }
+
+                                       rspamd_mime_part_detect_language (task, sel, sel->ucs_len);
+
+                                       if (sel->language && sel->language[0]) {
+                                               /* Propagate language */
+                                               if (sel == p1) {
+                                                       p2->language = sel->language;
+                                                       p2->languages = g_ptr_array_ref (sel->languages);
+                                               }
+                                               else {
+                                                       p1->language = sel->language;
+                                                       p1->languages = g_ptr_array_ref (sel->languages);
+                                               }
+                                       }
 
                                        tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
 
@@ -1180,6 +1234,12 @@ rspamd_message_parse (struct rspamd_task *task)
        guint total_words = 0;
 
        PTR_ARRAY_FOREACH (task->text_parts, i, text_part) {
+               if (!text_part->language) {
+                       rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len);
+               }
+
+               rspamd_mime_part_extract_words (task, text_part);
+
                if (text_part->normalized_words) {
                        total_words += text_part->normalized_words->len;
                }
index 5ee5b4c4376fbed009afcff7fb0e30c45fbba14b..0ed2a5c663765936dec38728200ca9b67bf0429b 100644 (file)
@@ -103,6 +103,7 @@ struct rspamd_mime_text_part {
        guint empty_lines;
        guint capital_letters;
        guint numeric_characters;
+       guint ucs_len;
 };
 
 enum rspamd_received_type {
index 55adc1a4f9ac02eb713a68e0da8c09bac9656ccc..b5594816bf3def05ce8bde7b203e74196f93f4ab 100644 (file)
@@ -245,7 +245,7 @@ rspamd_task_free (struct rspamd_task *task)
                                g_array_free (tp->ucs32_words, TRUE);
                        }
                        if (tp->languages) {
-                               g_ptr_array_free (tp->languages, TRUE);
+                               g_ptr_array_unref (tp->languages);
                        }
                }