From: Vsevolod Stakhov Date: Wed, 13 Jul 2011 15:39:37 +0000 (+0400) Subject: * Add new algorithm based on diff algorithm to compare relatively short text parts X-Git-Tag: 0.4.0~27 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=b0ddff4f0d56a877305649a14b902b3f23140b4b;p=rspamd.git * Add new algorithm based on diff algorithm to compare relatively short text parts --- diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 2bb4e028e..a8d0527f1 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -28,6 +28,7 @@ SET(RSPAMDLIBSRC ../src/binlog.c ../src/buffer.c ../src/cfg_utils.c ../src/cfg_xml.c + ../src/diff.c ../src/dns.c ../src/events.c ../src/expressions.c diff --git a/src/cfg_file.h b/src/cfg_file.h index 64a46d149..6a4abb9b8 100644 --- a/src/cfg_file.h +++ b/src/cfg_file.h @@ -259,6 +259,8 @@ struct config_file { gboolean check_text_attachements; /**< check text attachements as text */ gboolean convert_config; /**< convert config to XML format */ + gsize max_diff; /**< maximum diff size for text parts */ + enum rspamd_log_type log_type; /**< log type */ gint log_facility; /**< log facility in case of syslog */ gint log_level; /**< log level trigger */ diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 56202d4f1..a34084cce 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -175,6 +175,9 @@ init_defaults (struct config_file *cfg) cfg->statfile_sync_interval = 60000; cfg->statfile_sync_timeout = 20000; + /* 20 Kb */ + cfg->max_diff = 20480; + cfg->max_statfile_size = DEFAULT_STATFILE_SIZE; cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal); cfg->variables = g_hash_table_new (g_str_hash, g_str_equal); diff --git a/src/cfg_xml.c b/src/cfg_xml.c index bb23dd641..9920f355c 100644 --- a/src/cfg_xml.c +++ b/src/cfg_xml.c @@ -293,6 +293,12 @@ static struct xml_parser_rule grammar[] = { G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout), NULL }, + { + "max_diff", + xml_handle_size, + G_STRUCT_OFFSET (struct config_file, max_diff), + NULL + }, NULL_ATTR }, NULL_DEF_ATTR diff --git a/src/diff.c b/src/diff.c new file mode 100644 index 000000000..a2e84349b --- /dev/null +++ b/src/diff.c @@ -0,0 +1,379 @@ +/* diff - compute a shortest edit script (SES) given two sequences + * Copyright (c) 2004 Michael B. Allen + * + * The MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* This algorithm is basically Myers' solution to SES/LCS with + * the Hirschberg linear space refinement as described in the + * following publication: + * + * E. Myers, ``An O(ND) Difference Algorithm and Its Variations,'' + * Algorithmica 1, 2 (1986), 251-266. + * http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps + * + * This is the same algorithm used by GNU diff(1). + */ + +/* Copyright (c) 2010, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "diff.h" + + +#define FV(k) _v(ctx, (k), 0) +#define RV(k) _v(ctx, (k), 1) + +#define MAX_DIFF 1024 + +struct _ctx +{ + GArray *buf; + GArray *ses; + gint si; + gint dmax; +}; + +struct middle_snake +{ + gint x, y, u, v; +}; + +static +void maybe_resize_array(GArray *arr, guint k) +{ + if (k > arr->len) { + g_array_set_size (arr, k); + } + +} + +static void +_setv(struct _ctx *ctx, gint k, gint r, gint val) +{ + gint j; + gint *i; + /* Pack -N to N ginto 0 to N * 2 + */ + j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2); + + maybe_resize_array (ctx->buf, j); + i = (gint *) &g_array_index (ctx->buf, gint, j); + *i = val; +} + +static gint +_v(struct _ctx *ctx, gint k, gint r) +{ + gint j; + + j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2); + + return *((gint *) &g_array_index (ctx->buf, gint, j)); +} + +static gint +_find_middle_snake(const void *a, gint aoff, gint n, const void *b, + gint boff, gint m, struct _ctx *ctx, struct middle_snake *ms) +{ + gint delta, odd, mid, d; + + delta = n - m; + odd = delta & 1; + mid = (n + m) / 2; + mid += odd; + + _setv (ctx, 1, 0, 0); + _setv (ctx, delta - 1, 1, n); + + for (d = 0; d <= mid; d++) { + gint k, x, y; + + if ((2 * d - 1) >= ctx->dmax) { + return ctx->dmax; + } + + for (k = d; k >= -d; k -= 2) { + if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) { + x = FV(k + 1); + } + else { + x = FV(k - 1) + 1; + } + y = x - k; + + ms->x = x; + ms->y = y; + const guchar *a0 = (const guchar *) a + aoff; + const guchar *b0 = (const guchar *) b + boff; + while (x < n && y < m && a0[x] == b0[y]) { + x++; + y++; + } + _setv (ctx, k, 0, x); + + if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) { + if (x >= RV(k)) { + ms->u = x; + ms->v = y; + return 2 * d - 1; + } + } + } + for (k = d; k >= -d; k -= 2) { + gint kr = (n - m) + k; + + if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) { + x = RV(kr - 1); + } + else { + x = RV(kr + 1) - 1; + } + y = x - kr; + + ms->u = x; + ms->v = y; + const guchar *a0 = (const guchar *) a + aoff; + const guchar *b0 = (const guchar *) b + boff; + while (x > 0 && y > 0 && a0[x - 1] == b0[y - 1]) { + x--; + y--; + } + _setv (ctx, kr, 1, x); + + if (!odd && kr >= -d && kr <= d) { + if (x <= FV(kr)) { + ms->x = x; + ms->y = y; + return 2 * d; + } + } + } + } + + errno = EFAULT; + + return -1; +} + +static void +_edit(struct _ctx *ctx, gint op, gint off, gint len) +{ + struct diff_edit *e = NULL, newe; + + if (len == 0 || ctx->ses == NULL) { + return; + } + /* + * Add an edit to the SES (or + * coalesce if the op is the same) + */ + if (ctx->ses->len != 0) { + e = &g_array_index (ctx->ses, struct diff_edit, ctx->ses->len - 1); + } + if (e == NULL || e->op != op) { + newe.op = op; + newe.off = off; + newe.len = len; + g_array_append_val (ctx->ses, newe); + } + else { + e->len += len; + } +} + +static gint +_ses(const void *a, gint aoff, gint n, const void *b, gint boff, + gint m, struct _ctx *ctx) +{ + struct middle_snake ms; + gint d; + + if (n == 0) { + _edit (ctx, DIFF_INSERT, boff, m); + d = m; + } + else if (m == 0) { + _edit (ctx, DIFF_DELETE, aoff, n); + d = n; + } + else { + /* Find the middle "snake" around which we + * recursively solve the sub-problems. + */ + d = _find_middle_snake (a, aoff, n, b, boff, m, ctx, &ms); + if (d == -1) { + return -1; + } + else if (d >= ctx->dmax) { + return ctx->dmax; + } + else if (ctx->ses == NULL) { + return d; + } + else if (d > 1) { + if (_ses (a, aoff, ms.x, b, boff, ms.y, ctx) == -1) { + return -1; + } + + _edit (ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x); + + aoff += ms.u; + boff += ms.v; + n -= ms.u; + m -= ms.v; + if (_ses (a, aoff, n, b, boff, m, ctx) == -1) { + return -1; + } + } + else { + gint x = ms.x; + gint u = ms.u; + + /* There are only 4 base cases when the + * edit distance is 1. + * + * n > m m > n + * + * - | + * \ \ x != u + * \ \ + * + * \ \ + * \ \ x == u + * - | + */ + + if (m > n) { + if (x == u) { + _edit (ctx, DIFF_MATCH, aoff, n); + _edit (ctx, DIFF_INSERT, boff + (m - 1), 1); + } + else { + _edit (ctx, DIFF_INSERT, boff, 1); + _edit (ctx, DIFF_MATCH, aoff, n); + } + } + else { + if (x == u) { + _edit (ctx, DIFF_MATCH, aoff, m); + _edit (ctx, DIFF_DELETE, aoff + (n - 1), 1); + } + else { + _edit (ctx, DIFF_DELETE, aoff, 1); + _edit (ctx, DIFF_MATCH, aoff + 1, m); + } + } + } + } + + return d; +} + +gint +rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m, + gint dmax, GArray *ses, gint *sn) +{ + struct _ctx ctx; + gint d, x, y; + struct diff_edit *e = NULL; + GArray *tmp; + + tmp = g_array_sized_new (FALSE, TRUE, sizeof(gint), dmax); + ctx.buf = tmp; + ctx.ses = ses; + ctx.si = 0; + ctx.dmax = dmax; + + /* The _ses function assumes the SES will begin or end with a delete + * or insert. The following will insure this is true by eating any + * beginning matches. This is also a quick to process sequences + * that match entirely. + */ + x = y = 0; + const guchar *a0 = (const guchar *) a + aoff; + const guchar *b0 = (const guchar *) b + boff; + while (x < n && y < m && a0[x] == b0[y]) { + x++; + y++; + } + _edit (&ctx, DIFF_MATCH, aoff, x); + + if ((d = _ses (a, aoff + x, n - x, b, boff + y, m - y, &ctx)) == -1) { + g_array_free (tmp, TRUE); + return -1; + } + if (ses && sn) { + *sn = e->op ? ctx.si + 1 : 0; + } + + g_array_free (tmp, TRUE); + return d; +} + +guint32 +compare_diff_distance (f_str_t *s1, f_str_t *s2) +{ + GArray *ses; + struct diff_edit *e; + gint i; + guint32 distance = 0; + + ses = g_array_sized_new (FALSE, TRUE, sizeof (struct diff_edit), MAX_DIFF); + + if (rspamd_diff (s1->begin, 0, s1->len, + s2->begin, 0, s2->len, MAX_DIFF, ses, NULL) == -1) { + /* Diff failed, strings are different */ + g_array_free (ses, TRUE); + return 0; + } + + for (i = 0; i < ses->len; i ++) { + e = &g_array_index(ses, struct diff_edit, i); + if (e->op != DIFF_MATCH) { + distance += e->len; + } + } + + g_array_free (ses, TRUE); + return 100 - (2 * distance * 100) / (s1->len + s2->len); +} diff --git a/src/diff.h b/src/diff.h new file mode 100644 index 000000000..b1c5426d2 --- /dev/null +++ b/src/diff.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2010, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef DIFF_H_ +#define DIFF_H_ + +#include "config.h" +#include "fstring.h" + +typedef enum +{ + DIFF_MATCH = 1, + DIFF_DELETE, + DIFF_INSERT +} diff_op; + +struct diff_edit +{ + gshort op; + gint off; /* off ginto s1 if MATCH or DELETE but s2 if INSERT */ + gint len; +}; + +gint rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m, + gint dmax, GArray *ses, gint *sn); +guint32 compare_diff_distance (f_str_t *s1, f_str_t *s2); + +#endif /* DIFF_H_ */ diff --git a/src/expressions.c b/src/expressions.c index c2af8288c..3dfd542a4 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -31,6 +31,7 @@ #include "expressions.h" #include "html.h" #include "lua/lua_common.h" +#include "diff.h" gboolean rspamd_compare_encoding (struct worker_task *task, GList * args, void *unused); gboolean rspamd_header_exists (struct worker_task *task, GList * args, void *unused); @@ -1083,7 +1084,12 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused) return FALSE; } if (!p1->is_empty && !p2->is_empty) { - diff = fuzzy_compare_parts (p1, p2); + if (p1->diff_str != NULL && p2->diff_str != NULL) { + diff = compare_diff_distance (p1->diff_str, p2->diff_str); + } + else { + diff = fuzzy_compare_parts (p1, p2); + } debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold); *pdiff = diff; memory_pool_set_variable (task->task_pool, "parts_distance", pdiff, NULL); diff --git a/src/fstring.c b/src/fstring.c index 84c8c54bd..3f7ae2195 100644 --- a/src/fstring.c +++ b/src/fstring.c @@ -240,6 +240,23 @@ fstrpush (f_str_t * dest, gchar c) return 1; } +/* + * Push one character to fstr + */ +gint +fstrpush_unichar (f_str_t * dest, gunichar c) +{ + int l; + if (dest->size < dest->len) { + /* Need to reallocate string */ + return 0; + } + + l = g_unichar_to_utf8 (c, dest->begin + dest->len); + dest->len += l; + return l; +} + /* * Allocate memory for f_str_t */ diff --git a/src/fstring.h b/src/fstring.h index eb32dc285..566cbdc32 100644 --- a/src/fstring.h +++ b/src/fstring.h @@ -68,6 +68,11 @@ size_t fstrcat (f_str_t *dest, f_str_t *src); */ gint fstrpush (f_str_t *dest, gchar c); +/* + * Push one character to fstr + */ +gint fstrpush_unichar (f_str_t *dest, gunichar c); + /* * Allocate memory for f_str_t */ diff --git a/src/fuzzy.c b/src/fuzzy.c index 5a2decb34..2639d68a7 100644 --- a/src/fuzzy.c +++ b/src/fuzzy.c @@ -313,7 +313,7 @@ fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool) } void -fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) +fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff) { fuzzy_hash_t *new, *new2; gchar *c, *end, *begin; @@ -321,7 +321,7 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) GList *cur_offset; struct process_exception *cur_ex = NULL; gunichar uc; - GString *debug; + gboolean write_diff = FALSE; cur_offset = part->urls_offset; if (cur_offset != NULL) { @@ -371,7 +371,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) } } - debug = g_string_sized_new (real_len); + write_diff = real_len < max_diff; + + if (write_diff) { + part->diff_str = fstralloc (pool, real_len); + } + else { + part->diff_str = NULL; + } + new->block_size = fuzzy_blocksize (real_len); new2->block_size = new->block_size * 2; @@ -397,7 +405,9 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) uc = g_utf8_get_char (c); if (g_unichar_isalnum (uc)) { fuzzy_update2 (new, new2, uc); - g_string_append_unichar (debug, uc); + if (write_diff) { + fstrpush_unichar (part->diff_str, uc); + } } c = g_utf8_next_char (c); } @@ -415,13 +425,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) else { if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { fuzzy_update2 (new, new2, *c); - g_string_append_c (debug, *c); + if (write_diff) { + fstrpush (part->diff_str, *c); + } } c++; } } } - msg_info ("make hash of string: %v", debug); + /* Check whether we have more bytes in a rolling window */ if (new->rh != 0) { new->hash_pipe[new->hi] = b64[new->h % 64]; diff --git a/src/fuzzy.h b/src/fuzzy.h index 271bfee2a..a1daa5107 100644 --- a/src/fuzzy.h +++ b/src/fuzzy.h @@ -30,7 +30,7 @@ struct mime_text_part; */ fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool); fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool); -void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool); +void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff); gint fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2); diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 4916a50ad..f217ee0b2 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -37,6 +37,7 @@ #include "../classifiers/classifiers.h" #include "../binlog.h" #include "../statfile_sync.h" +#include "../diff.h" extern stat_file_t* get_statfile_by_symbol (statfile_pool_t *pool, struct classifier_config *ccf, const gchar *symbol, struct statfile **st, gboolean try_create); @@ -1368,7 +1369,12 @@ lua_textpart_compare_distance (lua_State * L) } else { if (!part->is_empty && !other->is_empty) { - diff = fuzzy_compare_parts (part, other); + if (part->diff_str != NULL && other->diff_str != NULL) { + diff = compare_diff_distance (part->diff_str, other->diff_str); + } + else { + diff = fuzzy_compare_parts (part, other); + } } else if ((part->is_empty && !other->is_empty) || (!part->is_empty && other->is_empty)) { /* Empty and non empty parts are different */ diff --git a/src/message.c b/src/message.c index 8ff53ea93..93c9fb622 100644 --- a/src/message.c +++ b/src/message.c @@ -786,7 +786,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont #endif } - fuzzy_init_part (text_part, task->task_pool); + fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff); memory_pool_add_destructor (task->task_pool, (pool_destruct_func) free_byte_array_callback, text_part->content); task->text_parts = g_list_prepend (task->text_parts, text_part); } @@ -806,7 +806,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part->orig = convert_text_to_utf (task, part_content, type, text_part); text_part->content = text_part->orig; url_parse_text (task->task_pool, task, text_part, FALSE); - fuzzy_init_part (text_part, task->task_pool); + fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff); task->text_parts = g_list_prepend (task->text_parts, text_part); } } diff --git a/src/message.h b/src/message.h index 15fd90188..d716f7906 100644 --- a/src/message.h +++ b/src/message.h @@ -35,6 +35,7 @@ struct mime_text_part { fuzzy_hash_t *double_fuzzy; GMimeObject *parent; GUnicodeScript script; + f_str_t *diff_str; }; struct received_header {