]> source.dussan.org Git - rspamd.git/commitdiff
* Add new algorithm based on diff algorithm to compare relatively short text parts
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 13 Jul 2011 15:39:37 +0000 (19:39 +0400)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 13 Jul 2011 15:39:37 +0000 (19:39 +0400)
14 files changed:
lib/CMakeLists.txt
src/cfg_file.h
src/cfg_utils.c
src/cfg_xml.c
src/diff.c [new file with mode: 0644]
src/diff.h [new file with mode: 0644]
src/expressions.c
src/fstring.c
src/fstring.h
src/fuzzy.c
src/fuzzy.h
src/lua/lua_task.c
src/message.c
src/message.h

index 2bb4e028ee60ad81c355c0ac0f9874a243729d54..a8d0527f13fd8bb52506ed5fc5cbafb5f983076a 100644 (file)
@@ -28,6 +28,7 @@ SET(RSPAMDLIBSRC ../src/binlog.c
                                ../src/buffer.c
                                ../src/cfg_utils.c
                                ../src/cfg_xml.c
+                               ../src/diff.c
                                ../src/dns.c
                                ../src/events.c
                                ../src/expressions.c
index 64a46d1499d990a56093131b0db5fc5e109b2689..6a4abb9b8975f0d827b8d6aae1978c42c431a54f 100644 (file)
@@ -259,6 +259,8 @@ struct config_file {
        gboolean check_text_attachements;                               /**< check text attachements as text                                    */
        gboolean convert_config;                                                /**< convert config to XML format                                               */
 
+       gsize max_diff;                                                                 /**< maximum diff size for text parts                                   */
+
        enum rspamd_log_type log_type;                                  /**< log type                                                                                   */
        gint log_facility;                                                              /**< log facility in case of syslog                                             */
        gint log_level;                                                                 /**< log level trigger                                                                  */
index 56202d4f11aa6afaab93ba1654b49a392679d1c3..a34084cceae87c02769ae99882a6764a850e9394 100644 (file)
@@ -175,6 +175,9 @@ init_defaults (struct config_file *cfg)
        cfg->statfile_sync_interval = 60000;
        cfg->statfile_sync_timeout = 20000;
 
+       /* 20 Kb */
+       cfg->max_diff = 20480;
+
        cfg->max_statfile_size = DEFAULT_STATFILE_SIZE;
        cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal);
        cfg->variables = g_hash_table_new (g_str_hash, g_str_equal);
index bb23dd64137503c50aca2850a2fd5736fd795252..9920f355c457371d24451a4d6fddf5c5267dbe39 100644 (file)
@@ -293,6 +293,12 @@ static struct xml_parser_rule grammar[] = {
                                G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout),
                                NULL
                        },
+                       {
+                               "max_diff",
+                               xml_handle_size,
+                               G_STRUCT_OFFSET (struct config_file, max_diff),
+                               NULL
+                       },
                        NULL_ATTR
                },
                NULL_DEF_ATTR
diff --git a/src/diff.c b/src/diff.c
new file mode 100644 (file)
index 0000000..a2e8434
--- /dev/null
@@ -0,0 +1,379 @@
+/* diff - compute a shortest edit script (SES) given two sequences
+ * Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>
+ *
+ * The MIT License
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* This algorithm is basically Myers' solution to SES/LCS with
+ * the Hirschberg linear space refinement as described in the
+ * following publication:
+ *
+ *   E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
+ *   Algorithmica 1, 2 (1986), 251-266.
+ *   http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps
+ *
+ * This is the same algorithm used by GNU diff(1).
+ */
+
+/* Copyright (c) 2010, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "diff.h"
+
+
+#define FV(k) _v(ctx, (k), 0)
+#define RV(k) _v(ctx, (k), 1)
+
+#define MAX_DIFF 1024
+
+struct _ctx
+{
+       GArray *buf;
+       GArray *ses;
+       gint si;
+       gint dmax;
+};
+
+struct middle_snake
+{
+       gint x, y, u, v;
+};
+
+static
+void maybe_resize_array(GArray *arr, guint k)
+{
+       if (k > arr->len) {
+               g_array_set_size (arr, k);
+       }
+
+}
+
+static void
+_setv(struct _ctx *ctx, gint k, gint r, gint val)
+{
+       gint j;
+       gint *i;
+       /* Pack -N to N ginto 0 to N * 2
+        */
+       j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
+
+       maybe_resize_array (ctx->buf, j);
+       i = (gint *) &g_array_index (ctx->buf, gint, j);
+       *i = val;
+}
+
+static gint
+_v(struct _ctx *ctx, gint k, gint r)
+{
+       gint j;
+
+       j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);
+
+       return *((gint *) &g_array_index (ctx->buf, gint, j));
+}
+
+static gint
+_find_middle_snake(const void *a, gint aoff, gint n, const void *b,
+               gint boff, gint m, struct _ctx *ctx, struct middle_snake *ms)
+{
+       gint delta, odd, mid, d;
+
+       delta = n - m;
+       odd = delta & 1;
+       mid = (n + m) / 2;
+       mid += odd;
+
+       _setv (ctx, 1, 0, 0);
+       _setv (ctx, delta - 1, 1, n);
+
+       for (d = 0; d <= mid; d++) {
+               gint k, x, y;
+
+               if ((2 * d - 1) >= ctx->dmax) {
+                       return ctx->dmax;
+               }
+
+               for (k = d; k >= -d; k -= 2) {
+                       if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
+                               x = FV(k + 1);
+                       }
+                       else {
+                               x = FV(k - 1) + 1;
+                       }
+                       y = x - k;
+
+                       ms->x = x;
+                       ms->y = y;
+                       const guchar *a0 = (const guchar *) a + aoff;
+                       const guchar *b0 = (const guchar *) b + boff;
+                       while (x < n && y < m && a0[x] == b0[y]) {
+                               x++;
+                               y++;
+                       }
+                       _setv (ctx, k, 0, x);
+
+                       if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
+                               if (x >= RV(k)) {
+                                       ms->u = x;
+                                       ms->v = y;
+                                       return 2 * d - 1;
+                               }
+                       }
+               }
+               for (k = d; k >= -d; k -= 2) {
+                       gint kr = (n - m) + k;
+
+                       if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) {
+                               x = RV(kr - 1);
+                       }
+                       else {
+                               x = RV(kr + 1) - 1;
+                       }
+                       y = x - kr;
+
+                       ms->u = x;
+                       ms->v = y;
+                       const guchar *a0 = (const guchar *) a + aoff;
+                       const guchar *b0 = (const guchar *) b + boff;
+                       while (x > 0 && y > 0 && a0[x - 1] == b0[y - 1]) {
+                               x--;
+                               y--;
+                       }
+                       _setv (ctx, kr, 1, x);
+
+                       if (!odd && kr >= -d && kr <= d) {
+                               if (x <= FV(kr)) {
+                                       ms->x = x;
+                                       ms->y = y;
+                                       return 2 * d;
+                               }
+                       }
+               }
+       }
+
+       errno = EFAULT;
+
+       return -1;
+}
+
+static void
+_edit(struct _ctx *ctx, gint op, gint off, gint len)
+{
+       struct diff_edit *e = NULL, newe;
+
+       if (len == 0 || ctx->ses == NULL) {
+               return;
+       }
+       /*
+        * Add an edit to the SES (or
+        * coalesce if the op is the same)
+        */
+       if (ctx->ses->len != 0) {
+               e = &g_array_index (ctx->ses, struct diff_edit, ctx->ses->len - 1);
+       }
+       if (e == NULL || e->op != op) {
+               newe.op = op;
+               newe.off = off;
+               newe.len = len;
+               g_array_append_val (ctx->ses, newe);
+       }
+       else {
+               e->len += len;
+       }
+}
+
+static gint
+_ses(const void *a, gint aoff, gint n, const void *b, gint boff,
+               gint m, struct _ctx *ctx)
+{
+       struct middle_snake ms;
+       gint d;
+
+       if (n == 0) {
+               _edit (ctx, DIFF_INSERT, boff, m);
+               d = m;
+       }
+       else if (m == 0) {
+               _edit (ctx, DIFF_DELETE, aoff, n);
+               d = n;
+       }
+       else {
+               /* Find the middle "snake" around which we
+                * recursively solve the sub-problems.
+                */
+               d = _find_middle_snake (a, aoff, n, b, boff, m, ctx, &ms);
+               if (d == -1) {
+                       return -1;
+               }
+               else if (d >= ctx->dmax) {
+                       return ctx->dmax;
+               }
+               else if (ctx->ses == NULL) {
+                       return d;
+               }
+               else if (d > 1) {
+                       if (_ses (a, aoff, ms.x, b, boff, ms.y, ctx) == -1) {
+                               return -1;
+                       }
+
+                       _edit (ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x);
+
+                       aoff += ms.u;
+                       boff += ms.v;
+                       n -= ms.u;
+                       m -= ms.v;
+                       if (_ses (a, aoff, n, b, boff, m, ctx) == -1) {
+                               return -1;
+                       }
+               }
+               else {
+                       gint x = ms.x;
+                       gint u = ms.u;
+
+                       /* There are only 4 base cases when the
+                        * edit distance is 1.
+                        *
+                        * n > m   m > n
+                        *
+                        *   -       |
+                        *    \       \    x != u
+                        *     \       \
+                        *
+                        *   \       \
+                        *    \       \    x == u
+                        *     -       |
+                        */
+
+                       if (m > n) {
+                               if (x == u) {
+                                       _edit (ctx, DIFF_MATCH, aoff, n);
+                                       _edit (ctx, DIFF_INSERT, boff + (m - 1), 1);
+                               }
+                               else {
+                                       _edit (ctx, DIFF_INSERT, boff, 1);
+                                       _edit (ctx, DIFF_MATCH, aoff, n);
+                               }
+                       }
+                       else {
+                               if (x == u) {
+                                       _edit (ctx, DIFF_MATCH, aoff, m);
+                                       _edit (ctx, DIFF_DELETE, aoff + (n - 1), 1);
+                               }
+                               else {
+                                       _edit (ctx, DIFF_DELETE, aoff, 1);
+                                       _edit (ctx, DIFF_MATCH, aoff + 1, m);
+                               }
+                       }
+               }
+       }
+
+       return d;
+}
+
+gint
+rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m,
+               gint dmax, GArray *ses, gint *sn)
+{
+       struct _ctx ctx;
+       gint d, x, y;
+       struct diff_edit *e = NULL;
+       GArray *tmp;
+
+       tmp = g_array_sized_new (FALSE, TRUE, sizeof(gint), dmax);
+       ctx.buf = tmp;
+       ctx.ses = ses;
+       ctx.si = 0;
+       ctx.dmax = dmax;
+
+       /* The _ses function assumes the SES will begin or end with a delete
+        * or insert. The following will insure this is true by eating any
+        * beginning matches. This is also a quick to process sequences
+        * that match entirely.
+        */
+       x = y = 0;
+       const guchar *a0 = (const guchar *) a + aoff;
+       const guchar *b0 = (const guchar *) b + boff;
+       while (x < n && y < m && a0[x] == b0[y]) {
+               x++;
+               y++;
+       }
+       _edit (&ctx, DIFF_MATCH, aoff, x);
+
+       if ((d = _ses (a, aoff + x, n - x, b, boff + y, m - y, &ctx)) == -1) {
+               g_array_free (tmp, TRUE);
+               return -1;
+       }
+       if (ses && sn) {
+               *sn = e->op ? ctx.si + 1 : 0;
+       }
+
+       g_array_free (tmp, TRUE);
+       return d;
+}
+
+guint32
+compare_diff_distance (f_str_t *s1, f_str_t *s2)
+{
+       GArray *ses;
+       struct diff_edit *e;
+       gint i;
+       guint32 distance = 0;
+
+       ses = g_array_sized_new (FALSE, TRUE, sizeof (struct diff_edit), MAX_DIFF);
+
+       if (rspamd_diff (s1->begin, 0, s1->len,
+                               s2->begin, 0, s2->len, MAX_DIFF, ses, NULL) == -1) {
+               /* Diff failed, strings are different */
+               g_array_free (ses, TRUE);
+               return 0;
+       }
+
+       for (i = 0; i < ses->len; i ++) {
+               e = &g_array_index(ses, struct diff_edit, i);
+               if (e->op != DIFF_MATCH) {
+                       distance += e->len;
+               }
+       }
+
+       g_array_free (ses, TRUE);
+       return 100 - (2 * distance * 100) / (s1->len + s2->len);
+}
diff --git a/src/diff.h b/src/diff.h
new file mode 100644 (file)
index 0000000..b1c5426
--- /dev/null
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef DIFF_H_
+#define DIFF_H_
+
+#include "config.h"
+#include "fstring.h"
+
+typedef enum
+{
+       DIFF_MATCH = 1,
+       DIFF_DELETE,
+       DIFF_INSERT
+} diff_op;
+
+struct diff_edit
+{
+       gshort op;
+       gint off; /* off ginto s1 if MATCH or DELETE but s2 if INSERT */
+       gint len;
+};
+
+gint rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m,
+               gint dmax, GArray *ses, gint *sn);
+guint32 compare_diff_distance (f_str_t *s1, f_str_t *s2);
+
+#endif /* DIFF_H_ */
index c2af8288c33b0ff6c004861d5528a98cf241903c..3dfd542a48e53bfd61bb3b99306dc007ff9c8d19 100644 (file)
@@ -31,6 +31,7 @@
 #include "expressions.h"
 #include "html.h"
 #include "lua/lua_common.h"
+#include "diff.h"
 
 gboolean                        rspamd_compare_encoding (struct worker_task *task, GList * args, void *unused);
 gboolean                        rspamd_header_exists (struct worker_task *task, GList * args, void *unused);
@@ -1083,7 +1084,12 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused)
                        return FALSE;
                }
                if (!p1->is_empty && !p2->is_empty) {
-                       diff = fuzzy_compare_parts (p1, p2);
+                       if (p1->diff_str != NULL && p2->diff_str != NULL) {
+                               diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+                       }
+                       else {
+                               diff = fuzzy_compare_parts (p1, p2);
+                       }
                        debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold);
                        *pdiff = diff;
                        memory_pool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
index 84c8c54bd3fc2aebf3c99aecf1bab61d0a7c7b4c..3f7ae2195bc94606f6fded075d415bfe70d84517 100644 (file)
@@ -240,6 +240,23 @@ fstrpush (f_str_t * dest, gchar c)
        return 1;
 }
 
+/*
+ * Push one character to fstr
+ */
+gint
+fstrpush_unichar (f_str_t * dest, gunichar c)
+{
+       int                            l;
+       if (dest->size < dest->len) {
+               /* Need to reallocate string */
+               return 0;
+       }
+
+       l = g_unichar_to_utf8 (c, dest->begin + dest->len);
+       dest->len += l;
+       return l;
+}
+
 /*
  * Allocate memory for f_str_t
  */
index eb32dc2858221b0581123aa39dc3c02ba91d1fff..566cbdc324b1a74f3a6116b7ed1b157a890acc2c 100644 (file)
@@ -68,6 +68,11 @@ size_t fstrcat (f_str_t *dest, f_str_t *src);
  */
 gint fstrpush (f_str_t *dest, gchar c);
 
+/*
+ * Push one character to fstr
+ */
+gint fstrpush_unichar (f_str_t *dest, gunichar c);
+
 /*
  * Allocate memory for f_str_t
  */
index 5a2decb345bb487e0ba7e99d65b33e3f5b06e806..2639d68a773d7ae6441bff596b92a0dd23c4914f 100644 (file)
@@ -313,7 +313,7 @@ fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool)
 }
 
 void
-fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
+fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff)
 {
        fuzzy_hash_t                   *new, *new2;
        gchar                          *c, *end, *begin;
@@ -321,7 +321,7 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
        GList                          *cur_offset;
        struct process_exception       *cur_ex = NULL;
        gunichar                        uc;
-       GString *debug;
+       gboolean                        write_diff = FALSE;
 
        cur_offset = part->urls_offset;
        if (cur_offset != NULL) {
@@ -371,7 +371,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
                }
        }
 
-       debug = g_string_sized_new (real_len);
+       write_diff = real_len < max_diff;
+
+       if (write_diff) {
+               part->diff_str = fstralloc (pool, real_len);
+       }
+       else {
+               part->diff_str = NULL;
+       }
+
        new->block_size = fuzzy_blocksize (real_len);
        new2->block_size = new->block_size * 2;
 
@@ -397,7 +405,9 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
                                uc = g_utf8_get_char (c);
                                if (g_unichar_isalnum (uc)) {
                                        fuzzy_update2 (new, new2, uc);
-                                       g_string_append_unichar (debug, uc);
+                                       if (write_diff) {
+                                               fstrpush_unichar (part->diff_str, uc);
+                                       }
                                }
                                c = g_utf8_next_char (c);
                        }
@@ -415,13 +425,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
                        else {
                                if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
                                        fuzzy_update2 (new, new2, *c);
-                                       g_string_append_c (debug, *c);
+                                       if (write_diff) {
+                                               fstrpush (part->diff_str, *c);
+                                       }
                                }
                                c++;
                        }
                }
        }
-       msg_info ("make hash of string: %v", debug);
+
        /* Check whether we have more bytes in a rolling window */
        if (new->rh != 0) {
                new->hash_pipe[new->hi] = b64[new->h % 64];
index 271bfee2a4ec9243afb068166290af96d25caf5e..a1daa51076092bd660a84f66fd681915127eca7f 100644 (file)
@@ -30,7 +30,7 @@ struct mime_text_part;
  */
 fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
 fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
-void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool);
+void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff);
 
 gint fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2);
 
index 4916a50ad18cddc9c0aa6abd9379b581ad2b2418..f217ee0b2c6a8e0548342420b70ce65949a70b2c 100644 (file)
@@ -37,6 +37,7 @@
 #include "../classifiers/classifiers.h"
 #include "../binlog.h"
 #include "../statfile_sync.h"
+#include "../diff.h"
 
 extern stat_file_t* get_statfile_by_symbol (statfile_pool_t *pool, struct classifier_config *ccf,
                const gchar *symbol, struct statfile **st, gboolean try_create);
@@ -1368,7 +1369,12 @@ lua_textpart_compare_distance (lua_State * L)
                }
                else {
                        if (!part->is_empty && !other->is_empty) {
-                               diff = fuzzy_compare_parts (part, other);
+                               if (part->diff_str != NULL && other->diff_str != NULL) {
+                                       diff = compare_diff_distance (part->diff_str, other->diff_str);
+                               }
+                               else {
+                                       diff = fuzzy_compare_parts (part, other);
+                               }
                        }
                        else if ((part->is_empty && !other->is_empty) || (!part->is_empty && other->is_empty)) {
                                /* Empty and non empty parts are different */
index 8ff53ea938a6ef1c410ab9295d474d4257a2eaa6..93c9fb622e4460686adb602c23d43c2dd23e7b84 100644 (file)
@@ -786,7 +786,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 #endif
                }
 
-               fuzzy_init_part (text_part, task->task_pool);
+               fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff);
                memory_pool_add_destructor (task->task_pool, (pool_destruct_func) free_byte_array_callback, text_part->content);
                task->text_parts = g_list_prepend (task->text_parts, text_part);
        }
@@ -806,7 +806,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
                text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
                text_part->content = text_part->orig;
                url_parse_text (task->task_pool, task, text_part, FALSE);
-               fuzzy_init_part (text_part, task->task_pool);
+               fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff);
                task->text_parts = g_list_prepend (task->text_parts, text_part);
        }
 }
index 15fd90188187f66f3648e48835a44b2c7938c628..d716f7906ac4b6c31a7b5ab2db9f7e341ace7907 100644 (file)
@@ -35,6 +35,7 @@ struct mime_text_part {
        fuzzy_hash_t *double_fuzzy;
        GMimeObject *parent;
        GUnicodeScript script;
+       f_str_t *diff_str;
 };
 
 struct received_header {