aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-13 19:39:37 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-13 19:39:37 +0400
commitb0ddff4f0d56a877305649a14b902b3f23140b4b (patch)
tree65a2066d13b480b062b20036280d18321fb91e94
parent091e84951a2b032bb2930b300ffe43eaf01a304e (diff)
downloadrspamd-b0ddff4f0d56a877305649a14b902b3f23140b4b.tar.gz
rspamd-b0ddff4f0d56a877305649a14b902b3f23140b4b.zip
* Add new algorithm based on diff algorithm to compare relatively short text parts
-rw-r--r--lib/CMakeLists.txt1
-rw-r--r--src/cfg_file.h2
-rw-r--r--src/cfg_utils.c3
-rw-r--r--src/cfg_xml.c6
-rw-r--r--src/diff.c379
-rw-r--r--src/diff.h49
-rw-r--r--src/expressions.c8
-rw-r--r--src/fstring.c17
-rw-r--r--src/fstring.h5
-rw-r--r--src/fuzzy.c24
-rw-r--r--src/fuzzy.h2
-rw-r--r--src/lua/lua_task.c8
-rw-r--r--src/message.c4
-rw-r--r--src/message.h1
14 files changed, 498 insertions, 11 deletions
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 2bb4e028e..a8d0527f1 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -28,6 +28,7 @@ SET(RSPAMDLIBSRC ../src/binlog.c
../src/buffer.c
../src/cfg_utils.c
../src/cfg_xml.c
+ ../src/diff.c
../src/dns.c
../src/events.c
../src/expressions.c
diff --git a/src/cfg_file.h b/src/cfg_file.h
index 64a46d149..6a4abb9b8 100644
--- a/src/cfg_file.h
+++ b/src/cfg_file.h
@@ -259,6 +259,8 @@ struct config_file {
gboolean check_text_attachements; /**< check text attachements as text */
gboolean convert_config; /**< convert config to XML format */
+ gsize max_diff; /**< maximum diff size for text parts */
+
enum rspamd_log_type log_type; /**< log type */
gint log_facility; /**< log facility in case of syslog */
gint log_level; /**< log level trigger */
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index 56202d4f1..a34084cce 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -175,6 +175,9 @@ init_defaults (struct config_file *cfg)
cfg->statfile_sync_interval = 60000;
cfg->statfile_sync_timeout = 20000;
+ /* 20 Kb */
+ cfg->max_diff = 20480;
+
cfg->max_statfile_size = DEFAULT_STATFILE_SIZE;
cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal);
cfg->variables = g_hash_table_new (g_str_hash, g_str_equal);
diff --git a/src/cfg_xml.c b/src/cfg_xml.c
index bb23dd641..9920f355c 100644
--- a/src/cfg_xml.c
+++ b/src/cfg_xml.c
@@ -293,6 +293,12 @@ static struct xml_parser_rule grammar[] = {
G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout),
NULL
},
+ {
+ "max_diff",
+ xml_handle_size,
+ G_STRUCT_OFFSET (struct config_file, max_diff),
+ NULL
+ },
NULL_ATTR
},
NULL_DEF_ATTR
diff --git a/src/diff.c b/src/diff.c
new file mode 100644
index 000000000..a2e84349b
--- /dev/null
+++ b/src/diff.c
@@ -0,0 +1,379 @@
+/* diff - compute a shortest edit script (SES) given two sequences
+ * Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>
+ *
+ * The MIT License
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* This algorithm is basically Myers' solution to SES/LCS with
+ * the Hirschberg linear space refinement as described in the
+ * following publication:
+ *
+ * E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
+ * Algorithmica 1, 2 (1986), 251-266.
+ * http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps
+ *
+ * This is the same algorithm used by GNU diff(1).
+ */
+
+/* Copyright (c) 2010, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "diff.h"
+
+
+#define FV(k) _v(ctx, (k), 0)
+#define RV(k) _v(ctx, (k), 1)
+
+#define MAX_DIFF 1024
+
+struct _ctx
+{
+ GArray *buf;
+ GArray *ses;
+ gint si;
+ gint dmax;
+};
+
+struct middle_snake
+{
+ gint x, y, u, v;
+};
+
+static
+void maybe_resize_array(GArray *arr, guint k)
+{
+ if (k > arr->len) {
+ g_array_set_size (arr, k);
+ }
+
+}
+
+static void
+_setv(struct _ctx *ctx, gint k, gint r, gint val)
+{
+ gint j;
+ gint *i;
+ /* Pack -N to N ginto 0 to N * 2
+ */
+ j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
+
+ maybe_resize_array (ctx->buf, j);
+ i = (gint *) &g_array_index (ctx->buf, gint, j);
+ *i = val;
+}
+
+static gint
+_v(struct _ctx *ctx, gint k, gint r)
+{
+ gint j;
+
+ j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
+
+ return *((gint *) &g_array_index (ctx->buf, gint, j));
+}
+
+static gint
+_find_middle_snake(const void *a, gint aoff, gint n, const void *b,
+ gint boff, gint m, struct _ctx *ctx, struct middle_snake *ms)
+{
+ gint delta, odd, mid, d;
+
+ delta = n - m;
+ odd = delta & 1;
+ mid = (n + m) / 2;
+ mid += odd;
+
+ _setv (ctx, 1, 0, 0);
+ _setv (ctx, delta - 1, 1, n);
+
+ for (d = 0; d <= mid; d++) {
+ gint k, x, y;
+
+ if ((2 * d - 1) >= ctx->dmax) {
+ return ctx->dmax;
+ }
+
+ for (k = d; k >= -d; k -= 2) {
+ if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
+ x = FV(k + 1);
+ }
+ else {
+ x = FV(k - 1) + 1;
+ }
+ y = x - k;
+
+ ms->x = x;
+ ms->y = y;
+ const guchar *a0 = (const guchar *) a + aoff;
+ const guchar *b0 = (const guchar *) b + boff;
+ while (x < n && y < m && a0[x] == b0[y]) {
+ x++;
+ y++;
+ }
+ _setv (ctx, k, 0, x);
+
+ if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
+ if (x >= RV(k)) {
+ ms->u = x;
+ ms->v = y;
+ return 2 * d - 1;
+ }
+ }
+ }
+ for (k = d; k >= -d; k -= 2) {
+ gint kr = (n - m) + k;
+
+ if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) {
+ x = RV(kr - 1);
+ }
+ else {
+ x = RV(kr + 1) - 1;
+ }
+ y = x - kr;
+
+ ms->u = x;
+ ms->v = y;
+ const guchar *a0 = (const guchar *) a + aoff;
+ const guchar *b0 = (const guchar *) b + boff;
+ while (x > 0 && y > 0 && a0[x - 1] == b0[y - 1]) {
+ x--;
+ y--;
+ }
+ _setv (ctx, kr, 1, x);
+
+ if (!odd && kr >= -d && kr <= d) {
+ if (x <= FV(kr)) {
+ ms->x = x;
+ ms->y = y;
+ return 2 * d;
+ }
+ }
+ }
+ }
+
+ errno = EFAULT;
+
+ return -1;
+}
+
+static void
+_edit(struct _ctx *ctx, gint op, gint off, gint len)
+{
+ struct diff_edit *e = NULL, newe;
+
+ if (len == 0 || ctx->ses == NULL) {
+ return;
+ }
+ /*
+ * Add an edit to the SES (or
+ * coalesce if the op is the same)
+ */
+ if (ctx->ses->len != 0) {
+ e = &g_array_index (ctx->ses, struct diff_edit, ctx->ses->len - 1);
+ }
+ if (e == NULL || e->op != op) {
+ newe.op = op;
+ newe.off = off;
+ newe.len = len;
+ g_array_append_val (ctx->ses, newe);
+ }
+ else {
+ e->len += len;
+ }
+}
+
+static gint
+_ses(const void *a, gint aoff, gint n, const void *b, gint boff,
+ gint m, struct _ctx *ctx)
+{
+ struct middle_snake ms;
+ gint d;
+
+ if (n == 0) {
+ _edit (ctx, DIFF_INSERT, boff, m);
+ d = m;
+ }
+ else if (m == 0) {
+ _edit (ctx, DIFF_DELETE, aoff, n);
+ d = n;
+ }
+ else {
+ /* Find the middle "snake" around which we
+ * recursively solve the sub-problems.
+ */
+ d = _find_middle_snake (a, aoff, n, b, boff, m, ctx, &ms);
+ if (d == -1) {
+ return -1;
+ }
+ else if (d >= ctx->dmax) {
+ return ctx->dmax;
+ }
+ else if (ctx->ses == NULL) {
+ return d;
+ }
+ else if (d > 1) {
+ if (_ses (a, aoff, ms.x, b, boff, ms.y, ctx) == -1) {
+ return -1;
+ }
+
+ _edit (ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x);
+
+ aoff += ms.u;
+ boff += ms.v;
+ n -= ms.u;
+ m -= ms.v;
+ if (_ses (a, aoff, n, b, boff, m, ctx) == -1) {
+ return -1;
+ }
+ }
+ else {
+ gint x = ms.x;
+ gint u = ms.u;
+
+ /* There are only 4 base cases when the
+ * edit distance is 1.
+ *
+ * n > m m > n
+ *
+ * - |
+ * \ \ x != u
+ * \ \
+ *
+ * \ \
+ * \ \ x == u
+ * - |
+ */
+
+ if (m > n) {
+ if (x == u) {
+ _edit (ctx, DIFF_MATCH, aoff, n);
+ _edit (ctx, DIFF_INSERT, boff + (m - 1), 1);
+ }
+ else {
+ _edit (ctx, DIFF_INSERT, boff, 1);
+ _edit (ctx, DIFF_MATCH, aoff, n);
+ }
+ }
+ else {
+ if (x == u) {
+ _edit (ctx, DIFF_MATCH, aoff, m);
+ _edit (ctx, DIFF_DELETE, aoff + (n - 1), 1);
+ }
+ else {
+ _edit (ctx, DIFF_DELETE, aoff, 1);
+ _edit (ctx, DIFF_MATCH, aoff + 1, m);
+ }
+ }
+ }
+ }
+
+ return d;
+}
+
+gint
+rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m,
+ gint dmax, GArray *ses, gint *sn)
+{
+ struct _ctx ctx;
+ gint d, x, y;
+ struct diff_edit *e = NULL;
+ GArray *tmp;
+
+ tmp = g_array_sized_new (FALSE, TRUE, sizeof(gint), dmax);
+ ctx.buf = tmp;
+ ctx.ses = ses;
+ ctx.si = 0;
+ ctx.dmax = dmax;
+
+ /* The _ses function assumes the SES will begin or end with a delete
+ * or insert. The following will insure this is true by eating any
+ * beginning matches. This is also a quick to process sequences
+ * that match entirely.
+ */
+ x = y = 0;
+ const guchar *a0 = (const guchar *) a + aoff;
+ const guchar *b0 = (const guchar *) b + boff;
+ while (x < n && y < m && a0[x] == b0[y]) {
+ x++;
+ y++;
+ }
+ _edit (&ctx, DIFF_MATCH, aoff, x);
+
+ if ((d = _ses (a, aoff + x, n - x, b, boff + y, m - y, &ctx)) == -1) {
+ g_array_free (tmp, TRUE);
+ return -1;
+ }
+ if (ses && sn) {
+ *sn = e->op ? ctx.si + 1 : 0;
+ }
+
+ g_array_free (tmp, TRUE);
+ return d;
+}
+
+guint32
+compare_diff_distance (f_str_t *s1, f_str_t *s2)
+{
+ GArray *ses;
+ struct diff_edit *e;
+ gint i;
+ guint32 distance = 0;
+
+ ses = g_array_sized_new (FALSE, TRUE, sizeof (struct diff_edit), MAX_DIFF);
+
+ if (rspamd_diff (s1->begin, 0, s1->len,
+ s2->begin, 0, s2->len, MAX_DIFF, ses, NULL) == -1) {
+ /* Diff failed, strings are different */
+ g_array_free (ses, TRUE);
+ return 0;
+ }
+
+ for (i = 0; i < ses->len; i ++) {
+ e = &g_array_index(ses, struct diff_edit, i);
+ if (e->op != DIFF_MATCH) {
+ distance += e->len;
+ }
+ }
+
+ g_array_free (ses, TRUE);
+ return 100 - (2 * distance * 100) / (s1->len + s2->len);
+}
diff --git a/src/diff.h b/src/diff.h
new file mode 100644
index 000000000..b1c5426d2
--- /dev/null
+++ b/src/diff.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef DIFF_H_
+#define DIFF_H_
+
+#include "config.h"
+#include "fstring.h"
+
+typedef enum
+{
+ DIFF_MATCH = 1,
+ DIFF_DELETE,
+ DIFF_INSERT
+} diff_op;
+
+struct diff_edit
+{
+ gshort op;
+ gint off; /* off ginto s1 if MATCH or DELETE but s2 if INSERT */
+ gint len;
+};
+
+gint rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m,
+ gint dmax, GArray *ses, gint *sn);
+guint32 compare_diff_distance (f_str_t *s1, f_str_t *s2);
+
+#endif /* DIFF_H_ */
diff --git a/src/expressions.c b/src/expressions.c
index c2af8288c..3dfd542a4 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -31,6 +31,7 @@
#include "expressions.h"
#include "html.h"
#include "lua/lua_common.h"
+#include "diff.h"
gboolean rspamd_compare_encoding (struct worker_task *task, GList * args, void *unused);
gboolean rspamd_header_exists (struct worker_task *task, GList * args, void *unused);
@@ -1083,7 +1084,12 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused)
return FALSE;
}
if (!p1->is_empty && !p2->is_empty) {
- diff = fuzzy_compare_parts (p1, p2);
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (p1, p2);
+ }
debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold);
*pdiff = diff;
memory_pool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
diff --git a/src/fstring.c b/src/fstring.c
index 84c8c54bd..3f7ae2195 100644
--- a/src/fstring.c
+++ b/src/fstring.c
@@ -241,6 +241,23 @@ fstrpush (f_str_t * dest, gchar c)
}
/*
+ * Push one character to fstr
+ */
+gint
+fstrpush_unichar (f_str_t * dest, gunichar c)
+{
+ int l;
+ if (dest->size < dest->len) {
+ /* Need to reallocate string */
+ return 0;
+ }
+
+ l = g_unichar_to_utf8 (c, dest->begin + dest->len);
+ dest->len += l;
+ return l;
+}
+
+/*
* Allocate memory for f_str_t
*/
f_str_t *
diff --git a/src/fstring.h b/src/fstring.h
index eb32dc285..566cbdc32 100644
--- a/src/fstring.h
+++ b/src/fstring.h
@@ -69,6 +69,11 @@ size_t fstrcat (f_str_t *dest, f_str_t *src);
gint fstrpush (f_str_t *dest, gchar c);
/*
+ * Push one character to fstr
+ */
+gint fstrpush_unichar (f_str_t *dest, gunichar c);
+
+/*
* Allocate memory for f_str_t
*/
f_str_t* fstralloc (memory_pool_t *pool, size_t len);
diff --git a/src/fuzzy.c b/src/fuzzy.c
index 5a2decb34..2639d68a7 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -313,7 +313,7 @@ fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool)
}
void
-fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
+fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff)
{
fuzzy_hash_t *new, *new2;
gchar *c, *end, *begin;
@@ -321,7 +321,7 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
GList *cur_offset;
struct process_exception *cur_ex = NULL;
gunichar uc;
- GString *debug;
+ gboolean write_diff = FALSE;
cur_offset = part->urls_offset;
if (cur_offset != NULL) {
@@ -371,7 +371,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
}
}
- debug = g_string_sized_new (real_len);
+ write_diff = real_len < max_diff;
+
+ if (write_diff) {
+ part->diff_str = fstralloc (pool, real_len);
+ }
+ else {
+ part->diff_str = NULL;
+ }
+
new->block_size = fuzzy_blocksize (real_len);
new2->block_size = new->block_size * 2;
@@ -397,7 +405,9 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
uc = g_utf8_get_char (c);
if (g_unichar_isalnum (uc)) {
fuzzy_update2 (new, new2, uc);
- g_string_append_unichar (debug, uc);
+ if (write_diff) {
+ fstrpush_unichar (part->diff_str, uc);
+ }
}
c = g_utf8_next_char (c);
}
@@ -415,13 +425,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
else {
if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
fuzzy_update2 (new, new2, *c);
- g_string_append_c (debug, *c);
+ if (write_diff) {
+ fstrpush (part->diff_str, *c);
+ }
}
c++;
}
}
}
- msg_info ("make hash of string: %v", debug);
+
/* Check whether we have more bytes in a rolling window */
if (new->rh != 0) {
new->hash_pipe[new->hi] = b64[new->h % 64];
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 271bfee2a..a1daa5107 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -30,7 +30,7 @@ struct mime_text_part;
*/
fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
-void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool);
+void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff);
gint fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2);
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 4916a50ad..f217ee0b2 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -37,6 +37,7 @@
#include "../classifiers/classifiers.h"
#include "../binlog.h"
#include "../statfile_sync.h"
+#include "../diff.h"
extern stat_file_t* get_statfile_by_symbol (statfile_pool_t *pool, struct classifier_config *ccf,
const gchar *symbol, struct statfile **st, gboolean try_create);
@@ -1368,7 +1369,12 @@ lua_textpart_compare_distance (lua_State * L)
}
else {
if (!part->is_empty && !other->is_empty) {
- diff = fuzzy_compare_parts (part, other);
+ if (part->diff_str != NULL && other->diff_str != NULL) {
+ diff = compare_diff_distance (part->diff_str, other->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (part, other);
+ }
}
else if ((part->is_empty && !other->is_empty) || (!part->is_empty && other->is_empty)) {
/* Empty and non empty parts are different */
diff --git a/src/message.c b/src/message.c
index 8ff53ea93..93c9fb622 100644
--- a/src/message.c
+++ b/src/message.c
@@ -786,7 +786,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
#endif
}
- fuzzy_init_part (text_part, task->task_pool);
+ fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff);
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) free_byte_array_callback, text_part->content);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
@@ -806,7 +806,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
text_part->content = text_part->orig;
url_parse_text (task->task_pool, task, text_part, FALSE);
- fuzzy_init_part (text_part, task->task_pool);
+ fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
}
diff --git a/src/message.h b/src/message.h
index 15fd90188..d716f7906 100644
--- a/src/message.h
+++ b/src/message.h
@@ -35,6 +35,7 @@ struct mime_text_part {
fuzzy_hash_t *double_fuzzy;
GMimeObject *parent;
GUnicodeScript script;
+ f_str_t *diff_str;
};
struct received_header {