summaryrefslogtreecommitdiffstats
path: root/src/libmime
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-04-06 17:32:15 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-04-06 17:32:15 +0100
commit6e925ae527b87d75c8346d44c27b55252cc78dbf (patch)
treeb5bb829351ffd70028d8563d25503ef419953b1c /src/libmime
parentb4e985a0b9a0ad8e79a3c5c55191798e3cb32e0d (diff)
downloadrspamd-6e925ae527b87d75c8346d44c27b55252cc78dbf.tar.gz
rspamd-6e925ae527b87d75c8346d44c27b55252cc78dbf.zip
[Feature] Do not detect language twice for relative parts
Diffstat (limited to 'src/libmime')
-rw-r--r--src/libmime/message.c294
-rw-r--r--src/libmime/message.h1
2 files changed, 178 insertions, 117 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index aaae40fa5..1eae26a43 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -55,157 +55,106 @@ free_byte_array_callback (void *pointer)
}
static void
-rspamd_extract_words (struct rspamd_task *task,
+rspamd_mime_part_extract_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
#ifdef WITH_SNOWBALL
struct sb_stemmer *stem = NULL;
#endif
- rspamd_stat_token_t *w, ucs_w;
+ rspamd_stat_token_t *w;
gchar *temp_word;
const guchar *r;
- guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0;
+ guint i, nlen, total_len = 0, short_len = 0;
gdouble avg_len = 0;
- /* Ugly workaround */
- if (IS_PART_HTML (part)) {
- part->normalized_words = rspamd_tokenize_text (
- part->stripped_content->data,
- part->stripped_content->len, IS_PART_UTF (part), task->cfg,
- part->exceptions, FALSE,
- NULL);
- }
- else {
- part->normalized_words = rspamd_tokenize_text (
- part->stripped_content->data,
- part->stripped_content->len, IS_PART_UTF (part), task->cfg,
- part->exceptions, FALSE,
- NULL);
- }
-
if (part->normalized_words) {
- part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
- sizeof (guint64), part->normalized_words->len);
-
- if (IS_PART_UTF (part) && task->lang_det) {
- part->ucs32_words = g_array_sized_new (FALSE, FALSE,
- sizeof (rspamd_stat_token_t), part->normalized_words->len);
- }
-
- if (part->ucs32_words) {
- struct rspamd_lang_detector_res *lang;
-
- for (i = 0; i < part->normalized_words->len; i++) {
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
-
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
- rspamd_language_detector_to_ucs (task->lang_det,
- task->task_pool,
- w, &ucs_w);
- g_array_append_val (part->ucs32_words, ucs_w);
- ucs_len += ucs_w.len;
- }
- }
-
- part->languages = rspamd_language_detector_detect (task,
- task->lang_det,
- part->ucs32_words, ucs_len);
+#ifdef WITH_SNOWBALL
+ static GHashTable *stemmers = NULL;
- if (part->languages->len > 0) {
- lang = g_ptr_array_index (part->languages, 0);
- part->language = lang->lang;
+ if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
- msg_info_task ("detected part language: %s", part->language);
+ if (!stemmers) {
+ stemmers = g_hash_table_new (rspamd_strcase_hash,
+ rspamd_strcase_equal);
}
-#ifdef WITH_SNOWBALL
- static GHashTable *stemmers = NULL;
+ stem = g_hash_table_lookup (stemmers, part->language);
- if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
+ if (stem == NULL) {
- if (!stemmers) {
- stemmers = g_hash_table_new (rspamd_strcase_hash,
- rspamd_strcase_equal);
- }
-
- stem = g_hash_table_lookup (stemmers, part->language);
+ stem = sb_stemmer_new (part->language, "UTF_8");
if (stem == NULL) {
-
- stem = sb_stemmer_new (part->language, "UTF_8");
-
- if (stem == NULL) {
- msg_debug_task ("<%s> cannot create lemmatizer for %s language",
- task->message_id, part->language);
- }
- else {
- g_hash_table_insert (stemmers, g_strdup (part->language),
- stem);
- }
+ msg_debug_task ("<%s> cannot create lemmatizer for %s language",
+ task->message_id, part->language);
+ }
+ else {
+ g_hash_table_insert (stemmers, g_strdup (part->language),
+ stem);
}
}
-#endif
}
+#endif
+ }
- for (i = 0; i < part->normalized_words->len; i ++) {
- guint64 h;
+ for (i = 0; i < part->normalized_words->len; i ++) {
+ guint64 h;
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
- r = NULL;
+ w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ r = NULL;
#ifdef WITH_SNOWBALL
- if (stem) {
- r = sb_stemmer_stem (stem, w->begin, w->len);
- }
+ if (stem) {
+ r = sb_stemmer_stem (stem, w->begin, w->len);
+ }
#endif
- if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
- avg_len = avg_len + (w->len - avg_len) / (double)i;
-
- if (r != NULL) {
- nlen = strlen (r);
- nlen = MIN (nlen, w->len);
- temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
- memcpy (temp_word, r, nlen);
+ if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
+ avg_len = avg_len + (w->len - avg_len) / (double)i;
- if (IS_PART_UTF (part)) {
- rspamd_str_lc_utf8 (temp_word, nlen);
- }
- else {
- rspamd_str_lc (temp_word, nlen);
- }
+ if (r != NULL) {
+ nlen = strlen (r);
+ nlen = MIN (nlen, w->len);
+ temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
+ memcpy (temp_word, r, nlen);
- w->begin = temp_word;
- w->len = nlen;
+ if (IS_PART_UTF (part)) {
+ rspamd_str_lc_utf8 (temp_word, nlen);
}
else {
- temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
- memcpy (temp_word, w->begin, w->len);
+ rspamd_str_lc (temp_word, nlen);
+ }
- if (IS_PART_UTF (part)) {
- rspamd_str_lc_utf8 (temp_word, w->len);
- }
- else {
- rspamd_str_lc (temp_word, w->len);
- }
+ w->begin = temp_word;
+ w->len = nlen;
+ }
+ else {
+ temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
+ memcpy (temp_word, w->begin, w->len);
- w->begin = temp_word;
+ if (IS_PART_UTF (part)) {
+ rspamd_str_lc_utf8 (temp_word, w->len);
+ }
+ else {
+ rspamd_str_lc (temp_word, w->len);
}
+
+ w->begin = temp_word;
}
+ }
- if (w->len > 0) {
- /*
- * We use static hash seed if we would want to use that in shingles
- * computation in future
- */
- h = rspamd_cryptobox_fast_hash_specific (
- RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
- w->begin, w->len, words_hash_seed);
- g_array_append_val (part->normalized_hashes, h);
- total_len += w->len;
-
- if (w->len <= 3) {
- short_len ++;
- }
+ if (w->len > 0) {
+ /*
+ * We use static hash seed if we would want to use that in shingles
+ * computation in future
+ */
+ h = rspamd_cryptobox_fast_hash_specific (
+ RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+ w->begin, w->len, words_hash_seed);
+ g_array_append_val (part->normalized_hashes, h);
+ total_len += w->len;
+
+ if (w->len <= 3) {
+ short_len ++;
}
}
}
@@ -239,8 +188,78 @@ rspamd_extract_words (struct rspamd_task *task,
*short_len_p += short_len;
}
}
+}
+
+static guint
+rspamd_mime_part_create_words (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+ rspamd_stat_token_t *w, ucs_w;
+ guint i, ucs_len = 0;
+
+ /* Ugly workaround */
+ if (IS_PART_HTML (part)) {
+ part->normalized_words = rspamd_tokenize_text (
+ part->stripped_content->data,
+ part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+ part->exceptions, FALSE,
+ NULL);
+ }
+ else {
+ part->normalized_words = rspamd_tokenize_text (
+ part->stripped_content->data,
+ part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+ part->exceptions, FALSE,
+ NULL);
+ }
+ if (part->normalized_words) {
+ part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
+ sizeof (guint64), part->normalized_words->len);
+
+ if (IS_PART_UTF (part) && task->lang_det) {
+ part->ucs32_words = g_array_sized_new (FALSE, FALSE,
+ sizeof (rspamd_stat_token_t), part->normalized_words->len);
+ }
+
+ if (part->ucs32_words) {
+
+
+ for (i = 0; i < part->normalized_words->len; i++) {
+ w = &g_array_index (part->normalized_words, rspamd_stat_token_t,
+ i);
+
+ if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ rspamd_language_detector_to_ucs (task->lang_det,
+ task->task_pool,
+ w, &ucs_w);
+ g_array_append_val (part->ucs32_words, ucs_w);
+ ucs_len += ucs_w.len;
+ }
+ }
+ }
+ }
+
+ return ucs_len;
+}
+
+static void
+rspamd_mime_part_detect_language (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part, guint ucs_len)
+{
+ struct rspamd_lang_detector_res *lang;
+ if (part->ucs32_words) {
+ part->languages = rspamd_language_detector_detect (task,
+ task->lang_det,
+ part->ucs32_words, ucs_len);
+ if (part->languages->len > 0) {
+ lang = g_ptr_array_index (part->languages, 0);
+ part->language = lang->lang;
+
+ msg_info_task ("detected part language: %s", part->language);
+ }
+ }
}
static void
@@ -833,7 +852,7 @@ rspamd_message_process_text_part (struct rspamd_task *task,
text_part->exceptions);
}
- rspamd_extract_words (task, text_part);
+ text_part->ucs_len = rspamd_mime_part_create_words (task, text_part);
}
/* Creates message from various data using libmagic to detect type */
@@ -1128,6 +1147,41 @@ rspamd_message_parse (struct rspamd_task *task)
if (rspamd_ftok_cmp (&p1->mime_part->parent_part->ct->subtype, &srch) == 0) {
if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
p1->normalized_hashes && p2->normalized_hashes) {
+ /*
+ * We also detect language on one part and propagate it to
+ * another one
+ */
+ struct rspamd_mime_text_part *sel;
+
+ /* Prefer HTML as text part is not displayed normally */
+ if (IS_PART_HTML (p1)) {
+ sel = p1;
+ }
+ else if (IS_PART_HTML (p2)) {
+ sel = p2;
+ }
+ else {
+ if (p1->ucs_len > p2->ucs_len) {
+ sel = p1;
+ }
+ else {
+ sel = p2;
+ }
+ }
+
+ rspamd_mime_part_detect_language (task, sel, sel->ucs_len);
+
+ if (sel->language && sel->language[0]) {
+ /* Propagate language */
+ if (sel == p1) {
+ p2->language = sel->language;
+ p2->languages = g_ptr_array_ref (sel->languages);
+ }
+ else {
+ p1->language = sel->language;
+ p1->languages = g_ptr_array_ref (sel->languages);
+ }
+ }
tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
@@ -1180,6 +1234,12 @@ rspamd_message_parse (struct rspamd_task *task)
guint total_words = 0;
PTR_ARRAY_FOREACH (task->text_parts, i, text_part) {
+ if (!text_part->language) {
+ rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len);
+ }
+
+ rspamd_mime_part_extract_words (task, text_part);
+
if (text_part->normalized_words) {
total_words += text_part->normalized_words->len;
}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 5ee5b4c43..0ed2a5c66 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -103,6 +103,7 @@ struct rspamd_mime_text_part {
guint empty_lines;
guint capital_letters;
guint numeric_characters;
+ guint ucs_len;
};
enum rspamd_received_type {