source.dussan.org Git - rspamd.git/commitdiff
[Minor] Count words based on text words
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 30 Nov 2018 15:03:00 +0000 (15:03 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 30 Nov 2018 15:03:00 +0000 (15:03 +0000)
src/libmime/lang_detection.c
src/libmime/message.c
src/libmime/message.h
src/lua/lua_mimepart.c
src/plugins/chartable.c

index f5a175ad8d892b8c4ac6c6e3c73594b00f27102f..82e5fc2ffbfd7c6a422c0b577392ab9bf48310d7 100644 (file)
@@ -1665,11 +1665,11 @@ rspamd_language_detector_detect (struct rspamd_task *task,
        }
 
        if (!ret) {
-               if (part->utf_words->len < default_short_text_limit) {
+               if (part->nwords < default_short_text_limit) {
                        r = rs_detect_none;
                        msg_debug_lang_det ("text is too short for trigramms detection: "
                                           "%d words; at least %d words required",
-                                       (int)part->utf_words->len,
+                                       (int)part->nwords,
                                        (int)default_short_text_limit);
                        rspamd_language_detector_set_language (task, part, "en");
                        candidates = kh_init (rspamd_candidates_hash);
@@ -1728,7 +1728,7 @@ rspamd_language_detector_detect (struct rspamd_task *task,
                                        cbd.std = std;
                                        cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
 
-                                       if (part->utf_words->len < default_words / 2) {
+                                       if (part->nwords < default_words / 2) {
                                                cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
                                        }
                                }
index 3c29b1170198c62e599e70b410f72ab0045ce0c0..411b3bf03898d78201971dcf45575d946d23afe2 100644 (file)
@@ -89,6 +89,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
                                }
                        }
 
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                               part->nwords ++;
+                       }
+
                        if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE|
                                                RSPAMD_STAT_TOKEN_FLAG_NORMALISED|
                                        RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) {
@@ -96,7 +100,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
                        }
                }
 
-               if (part->utf_words && part->utf_words->len) {
+               if (part->utf_words->len) {
                        gdouble *avg_len_p, *short_len_p;
 
                        avg_len_p = rspamd_mempool_get_variable (task->task_pool,
@@ -1294,7 +1298,7 @@ rspamd_message_process (struct rspamd_task *task)
                rspamd_mime_part_extract_words (task, text_part);
 
                if (text_part->utf_words) {
-                       total_words += text_part->utf_words->len;
+                       total_words += text_part->nwords;
                }
        }
 
index ed9dfef6eff96f78b4a49de36ea8918abc81dab7..29f777c3b18034989e507f3a14cb521ca4d15060 100644 (file)
@@ -112,6 +112,7 @@ struct rspamd_mime_text_part {
        guint flags;
        guint nlines;
        guint spaces;
+       guint nwords;
        guint non_ascii_chars;
        guint ascii_chars;
        guint double_spaces;
index 14111f760f6ba6832212259898d11dd005585655..3617a145bef5218c05571b027f786d7eac9d7d6e 100644 (file)
@@ -756,7 +756,7 @@ lua_textpart_get_words_count (lua_State *L)
                lua_pushinteger (L, 0);
        }
        else {
-               lua_pushinteger (L, part->utf_words->len);
+               lua_pushinteger (L, part->nwords);
        }
 
        return 1;
index e48d19ea2dab49a04f5f4bb88e937adec87f2ee2..b6e42457a2e29ef8fe1fedb2bff98853f43ee3de 100644 (file)
@@ -594,7 +594,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
         */
        part->capital_letters += ncap;
 
-       cur_score /= (gdouble)part->utf_words->len;
+       cur_score /= (gdouble)part->nwords;
 
        if (cur_score > 2.0) {
                cur_score = 2.0;