summaryrefslogtreecommitdiffstats
path: root/src/libmime
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-07-14 23:58:56 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-07-14 23:58:56 +0100
commitd32b1c887e86319691da76a6ee8d6ee51b17dcab (patch)
treed33c362721c32ffd60a98dc900164dc0cbb92022 /src/libmime
parentb98130a443131de8ed80cfdddfc70b3c4ccb9fc4 (diff)
downloadrspamd-d32b1c887e86319691da76a6ee8d6ee51b17dcab.tar.gz
rspamd-d32b1c887e86319691da76a6ee8d6ee51b17dcab.zip
Move distance calculation to message parsing.
Diffstat (limited to 'src/libmime')
-rw-r--r--src/libmime/message.c98
-rw-r--r--src/libmime/mime_expressions.c132
2 files changed, 97 insertions, 133 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index b1d80f7e9..fde23ccb2 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1254,6 +1254,49 @@ rspamd_normalize_text_part (struct rspamd_task *task,
}
}
+#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
+
+/**
+ * Word-level Levenshtein (edit) distance between two arrays of words.
+ *
+ * Elements of both arrays are accessed as rspamd_fstring_t and compared with
+ * rspamd_fstring_equal (); inserting, deleting or replacing one word costs 1.
+ * Only one column of the dynamic-programming matrix (s1len + 1 cells) is kept
+ * and it lives on the stack, hence the hard limit on the size of w1.
+ *
+ * @param w1 first array of words
+ * @param w2 second array of words
+ * @return the edit distance in words; 0 is also returned, after logging an
+ * error, when w1 contains more than max_words elements
+ */
+static gint
+rspamd_words_levenshtein_distance (GArray *w1, GArray *w2)
+{
+    guint s1len, s2len, x, y, lastdiag, olddiag;
+    guint *column;
+    rspamd_fstring_t *s1, *s2;
+    gint eq;
+    static const guint max_words = 8192;
+
+    s1len = w1->len;
+    s2len = w2->len;
+
+    /* The working column is allocated with alloca, so bound its size */
+    if (s1len > max_words) {
+        msg_err ("cannot compare parts with more than %ud words: %ud",
+            max_words, s1len);
+        return 0;
+    }
+
+    column = g_alloca ((s1len + 1) * sizeof (guint));
+
+    /*
+     * Distance from the empty prefix of w2 to every prefix of w1.
+     * column[0] must be initialised as well: if both arrays are empty the
+     * loops below never run and column[0] is the value that is returned.
+     */
+    for (y = 0; y <= s1len; y++) {
+        column[y] = y;
+    }
+
+    for (x = 1; x <= s2len; x++) {
+        column[0] = x;
+
+        /* lastdiag holds the cell diagonally up-left of column[y] */
+        for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
+            olddiag = column[y];
+            s1 = &g_array_index (w1, rspamd_fstring_t, y - 1);
+            /* The second word comes from w2, not from w1 */
+            s2 = &g_array_index (w2, rspamd_fstring_t, x - 1);
+            eq = rspamd_fstring_equal (s1, s2) ? 0 : 1;
+            column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
+                lastdiag + (eq));
+            lastdiag = olddiag;
+        }
+    }
+
+    return column[s1len];
+}
+
static int
rspamd_gtube_cb (int strnum, int textpos, void *context)
{
@@ -1624,6 +1667,9 @@ rspamd_message_parse (struct rspamd_task *task)
GList *first, *cur;
GMimePart *part;
GMimeDataWrapper *wrapper;
+ GMimeObject *parent;
+ const GMimeContentType *ct;
+ struct mime_text_part *p1, *p2;
struct mime_foreach_data md;
struct received_header *recv;
gchar *mid, *url_str;
@@ -1631,7 +1677,8 @@ rspamd_message_parse (struct rspamd_task *task)
struct rspamd_url *subject_url;
gsize len;
gint64 hdr_start, hdr_end;
- gint rc, state = 0;
+ gint rc, state = 0, diff, *pdiff;
+ guint tw, dw;
tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
p = task->msg.start;
@@ -1863,6 +1910,55 @@ rspamd_message_parse (struct rspamd_task *task)
}
}
+ /* Calculate distance for 2-parts messages */
+ if (task->text_parts->len == 2) {
+ p1 = g_ptr_array_index (task->text_parts, 0);
+ p2 = g_ptr_array_index (task->text_parts, 1);
+
+ /* First of all check parent object */
+ if (p1->parent && p1->parent == p2->parent) {
+ parent = p1->parent;
+ ct = g_mime_object_get_content_type (parent);
+ if (ct == NULL ||
+ !g_mime_content_type_is_type ((GMimeContentType *)ct,
+ "multipart", "alternative")) {
+ debug_task (
+ "two parts are not belong to multipart/alternative container, skip check");
+ }
+ }
+ else {
+ debug_task (
+ "message contains two parts but they are in different multi-parts");
+ }
+
+ if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
+ p1->normalized_words && p2->normalized_words) {
+
+ tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
+ dw = rspamd_words_levenshtein_distance (p1->normalized_words,
+ p2->normalized_words);
+ diff = tw > 0 ? (100.0 * (gdouble)(tw - dw) / (gdouble)tw) : 100;
+
+ msg_info (
+ "different words: %d, total words: %d, "
+ "got likeliness between parts of %d%%",
+ dw, tw,
+ diff);
+
+ pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
+ *pdiff = diff;
+ rspamd_mempool_set_variable (task->task_pool,
+ "parts_distance",
+ pdiff,
+ NULL);
+ }
+ }
+ else {
+ debug_task (
+ "message has too many text parts, so do not try to compare "
+ "them with each other");
+ }
+
return TRUE;
}
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index 446493b4d..be49f11d8 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -1165,42 +1165,6 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused)
return FALSE;
}
-#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
-
-static gint
-rspamd_words_levenshtein_distance (GArray *w1, GArray *w2)
-{
- guint s1len, s2len, x, y, lastdiag, olddiag;
- guint *column;
- rspamd_fstring_t *s1, *s2;
- gint eq;
-
- s1len = w1->len;
- s2len = w2->len;
-
- column = g_alloca ((s1len + 1) * sizeof (guint));
-
- for (y = 1; y <= s1len; y++) {
- column[y] = y;
- }
-
- for (x = 1; x <= s2len; x++) {
- column[0] = x;
-
- for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
- olddiag = column[y];
- s1 = &g_array_index (w1, rspamd_fstring_t, y - 1);
- s2 = &g_array_index (w1, rspamd_fstring_t, x - 1);
- eq = rspamd_fstring_equal (s1, s2) ? 0 : 1;
- column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
- lastdiag + (eq));
- lastdiag = olddiag;
- }
- }
-
- return column[s1len];
-}
-
/*
* This function is designed to find difference between text/html and text/plain parts
@@ -1212,11 +1176,7 @@ gboolean
rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
{
gint threshold, threshold2 = -1, diff;
- struct mime_text_part *p1, *p2;
struct expression_argument *arg;
- GMimeObject *parent;
- const GMimeContentType *ct;
- guint tw, dw;
gint *pdiff;
if (args == NULL || args->len == 0) {
@@ -1278,98 +1238,6 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
}
}
- if (task->text_parts->len == 2) {
- p1 = g_ptr_array_index (task->text_parts, 0);
- p2 = g_ptr_array_index (task->text_parts, 1);
- pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
- *pdiff = -1;
-
- /* First of all check parent object */
- if (p1->parent && p1->parent == p2->parent) {
- parent = p1->parent;
- ct = g_mime_object_get_content_type (parent);
-#ifndef GMIME24
- if (ct == NULL ||
- !g_mime_content_type_is_type (ct, "multipart", "alternative")) {
-#else
- if (ct == NULL ||
- !g_mime_content_type_is_type ((GMimeContentType *)ct,
- "multipart", "alternative")) {
-#endif
- debug_task (
- "two parts are not belong to multipart/alternative container, skip check");
- rspamd_mempool_set_variable (task->task_pool,
- "parts_distance",
- pdiff,
- NULL);
- return FALSE;
- }
- }
- else {
- debug_task (
- "message contains two parts but they are in different multi-parts");
- rspamd_mempool_set_variable (task->task_pool,
- "parts_distance",
- pdiff,
- NULL);
- return FALSE;
- }
- if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
- p1->normalized_words && p2->normalized_words) {
-
- tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
- dw = rspamd_words_levenshtein_distance (p1->normalized_words,
- p2->normalized_words);
- diff = tw > 0 ? (100.0 * (gdouble)(tw - dw) / (gdouble)tw) : 100;
-
- msg_debug (
- "different words: %d, total words: %d, "
- "got likeliness between parts of %d%%, threshold is %d%%",
- dw, tw,
- diff,
- threshold);
-
- *pdiff = diff;
- rspamd_mempool_set_variable (task->task_pool,
- "parts_distance",
- pdiff,
- NULL);
- if (threshold2 > 0) {
- if (diff >=
- MIN (threshold,
- threshold2) && diff < MAX (threshold, threshold2)) {
- return TRUE;
- }
- }
- else {
- if (diff <= threshold) {
- return TRUE;
- }
- }
- }
- else if ((IS_PART_EMPTY (p1) &&
- !IS_PART_EMPTY (p2)) || (!IS_PART_EMPTY (p1)&& IS_PART_EMPTY (p2))) {
- /* Empty and non empty parts are different */
- *pdiff = 0;
- rspamd_mempool_set_variable (task->task_pool,
- "parts_distance",
- pdiff,
- NULL);
- return TRUE;
- }
- }
- else {
- debug_task (
- "message has too many text parts, so do not try to compare them with each other");
- rspamd_mempool_set_variable (task->task_pool,
- "parts_distance",
- pdiff,
- NULL);
- return FALSE;
- }
-
- rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff,
- NULL);
return FALSE;
}