Browse Source

* Add new algorithm based on diff algorithm to compare relatively short text parts

tags/0.4.0
Vsevolod Stakhov 13 years ago
parent
commit
b0ddff4f0d
14 changed files with 498 additions and 11 deletions
  1. 1
    0
      lib/CMakeLists.txt
  2. 2
    0
      src/cfg_file.h
  3. 3
    0
      src/cfg_utils.c
  4. 6
    0
      src/cfg_xml.c
  5. 379
    0
      src/diff.c
  6. 49
    0
      src/diff.h
  7. 7
    1
      src/expressions.c
  8. 17
    0
      src/fstring.c
  9. 5
    0
      src/fstring.h
  10. 18
    6
      src/fuzzy.c
  11. 1
    1
      src/fuzzy.h
  12. 7
    1
      src/lua/lua_task.c
  13. 2
    2
      src/message.c
  14. 1
    0
      src/message.h

+ 1
- 0
lib/CMakeLists.txt View File

@@ -28,6 +28,7 @@ SET(RSPAMDLIBSRC ../src/binlog.c
../src/buffer.c
../src/cfg_utils.c
../src/cfg_xml.c
../src/diff.c
../src/dns.c
../src/events.c
../src/expressions.c

+ 2
- 0
src/cfg_file.h View File

@@ -259,6 +259,8 @@ struct config_file {
gboolean check_text_attachements; /**< check text attachements as text */
gboolean convert_config; /**< convert config to XML format */

gsize max_diff; /**< maximum diff size for text parts */

enum rspamd_log_type log_type; /**< log type */
gint log_facility; /**< log facility in case of syslog */
gint log_level; /**< log level trigger */

+ 3
- 0
src/cfg_utils.c View File

@@ -175,6 +175,9 @@ init_defaults (struct config_file *cfg)
cfg->statfile_sync_interval = 60000;
cfg->statfile_sync_timeout = 20000;

/* 20 Kb */
cfg->max_diff = 20480;

cfg->max_statfile_size = DEFAULT_STATFILE_SIZE;
cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal);
cfg->variables = g_hash_table_new (g_str_hash, g_str_equal);

+ 6
- 0
src/cfg_xml.c View File

@@ -293,6 +293,12 @@ static struct xml_parser_rule grammar[] = {
G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout),
NULL
},
{
"max_diff",
xml_handle_size,
G_STRUCT_OFFSET (struct config_file, max_diff),
NULL
},
NULL_ATTR
},
NULL_DEF_ATTR

+ 379
- 0
src/diff.c View File

@@ -0,0 +1,379 @@
/* diff - compute a shortest edit script (SES) given two sequences
* Copyright (c) 2004 Michael B. Allen <mba2000 ioplex.com>
*
* The MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

/* This algorithm is basically Myers' solution to SES/LCS with
* the Hirschberg linear space refinement as described in the
* following publication:
*
* E. Myers, ``An O(ND) Difference Algorithm and Its Variations,''
* Algorithmica 1, 2 (1986), 251-266.
* http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps
*
* This is the same algorithm used by GNU diff(1).
*/

/* Copyright (c) 2010, Vsevolod Stakhov
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "config.h"
#include "diff.h"


/* Shorthand readers for the per-diagonal x values kept in ctx->buf:
 * FV(k) reads the slot written by the forward search (r == 0) and
 * RV(k) the slot written by the reverse search (r == 1) for diagonal k.
 * Both expand to _v() and require a `struct _ctx *ctx` in the calling scope. */
#define FV(k) _v(ctx, (k), 0)
#define RV(k) _v(ctx, (k), 1)

/* Edit-distance cap (dmax) and initial edit-script capacity used by
 * compare_diff_distance() */
#define MAX_DIFF 1024

/* Working state shared by all helpers during one rspamd_diff() call */
struct _ctx
{
/* gint array holding the forward/reverse x values, indexed via _setv()/_v() */
GArray *buf;
/* Output edit script (array of struct diff_edit); may be NULL, in which
 * case no edits are recorded and only the distance is computed */
GArray *ses;
/* Set to 0 by rspamd_diff() and not modified by the helpers in this file */
gint si;
/* Upper bound on the edit distance; the search gives up once it is reached */
gint dmax;
};

/* Result of _find_middle_snake(): (x, y) is where the matching run starts
 * and (u, v) is where it ends, both relative to the sub-sequences being
 * compared. _ses() splits the problem at these two points. */
struct middle_snake
{
gint x, y, u, v;
};

/*
 * Make sure `arr` holds at least `k` elements. An array that is already
 * that long (or longer) is left untouched; it is never shrunk.
 */
static void
maybe_resize_array (GArray *arr, guint k)
{
	if (arr->len >= k) {
		/* Already large enough */
		return;
	}

	g_array_set_size (arr, k);
}

/*
 * Store `val` in the slot of ctx->buf that belongs to diagonal `k` of the
 * forward (r == 0) or reverse (r == 1) search.
 *
 * Diagonals -N..N are packed into non-negative indexes: k <= 0 maps to
 * -4k + r and k > 0 maps to 4k + r - 2, so each (k, r) pair has its own slot.
 * The array is grown on demand.
 */
static void
_setv (struct _ctx *ctx, gint k, gint r, gint val)
{
	gint j;

	j = k <= 0 ? -k * 4 + r : k * 4 + (r - 2);

	/*
	 * Index j is valid only when the array holds j + 1 elements. Growing to
	 * just j would leave the write outside the array length, and the next
	 * g_array_set_size() on this zero-clearing array would wipe the value.
	 */
	maybe_resize_array (ctx->buf, (guint) j + 1);
	g_array_index (ctx->buf, gint, j) = val;
}

/*
 * Read the value stored for diagonal `k` of the forward (r == 0) or
 * reverse (r == 1) search. Uses the same index packing as _setv().
 * No bounds check is performed on the computed slot.
 */
static gint
_v (struct _ctx *ctx, gint k, gint r)
{
	gint slot;

	if (k <= 0) {
		slot = -k * 4 + r;
	}
	else {
		slot = k * 4 + (r - 2);
	}

	return g_array_index (ctx->buf, gint, slot);
}

/*
 * Search for the "middle snake" between the byte sequences
 * a[aoff .. aoff + n) and b[boff .. boff + m) by running a forward and a
 * reverse search in lock-step until they overlap.
 *
 * On success the snake is written to `ms` (x, y = start; u, v = end) and the
 * edit distance is returned: 2 * d - 1 when the forward search detects the
 * overlap, 2 * d when the reverse search does.
 * Returns ctx->dmax as soon as 2 * d - 1 reaches ctx->dmax, and -1 with
 * errno set to EFAULT if the loop ends without finding an overlap.
 */
static gint
_find_middle_snake(const void *a, gint aoff, gint n, const void *b,
gint boff, gint m, struct _ctx *ctx, struct middle_snake *ms)
{
gint delta, odd, mid, d;

/* delta is the length difference; its parity selects which of the two
 * searches is allowed to report the overlap (see the `odd` tests below) */
delta = n - m;
odd = delta & 1;
mid = (n + m) / 2;
mid += odd;

/* Seed values: forward diagonal 1 starts at x = 0, reverse diagonal
 * delta - 1 starts at x = n */
_setv (ctx, 1, 0, 0);
_setv (ctx, delta - 1, 1, n);

for (d = 0; d <= mid; d++) {
gint k, x, y;

/* Give up once the distance would reach the configured cap */
if ((2 * d - 1) >= ctx->dmax) {
return ctx->dmax;
}

/* Forward search over diagonals d, d - 2, ..., -d */
for (k = d; k >= -d; k -= 2) {
if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
x = FV(k + 1);
}
else {
x = FV(k - 1) + 1;
}
y = x - k;

/* Remember where the snake starts before following the matching bytes */
ms->x = x;
ms->y = y;
const guchar *a0 = (const guchar *) a + aoff;
const guchar *b0 = (const guchar *) b + boff;
while (x < n && y < m && a0[x] == b0[y]) {
x++;
y++;
}
_setv (ctx, k, 0, x);

/* Overlap check against the reverse search, only for odd delta */
if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
if (x >= RV(k)) {
ms->u = x;
ms->v = y;
return 2 * d - 1;
}
}
}
/* Reverse search; kr is diagonal k shifted by delta */
for (k = d; k >= -d; k -= 2) {
gint kr = (n - m) + k;

if (k == d || (k != -d && RV(kr - 1) < RV(kr + 1))) {
x = RV(kr - 1);
}
else {
x = RV(kr + 1) - 1;
}
y = x - kr;

/* Remember where the snake ends before walking backwards over matches */
ms->u = x;
ms->v = y;
const guchar *a0 = (const guchar *) a + aoff;
const guchar *b0 = (const guchar *) b + boff;
while (x > 0 && y > 0 && a0[x - 1] == b0[y - 1]) {
x--;
y--;
}
_setv (ctx, kr, 1, x);

/* Overlap check against the forward search, only for even delta */
if (!odd && kr >= -d && kr <= d) {
if (x <= FV(kr)) {
ms->x = x;
ms->y = y;
return 2 * d;
}
}
}
}

/* The two searches never met */
errno = EFAULT;

return -1;
}

/*
 * Record an edit of `len` elements of kind `op` starting at offset `off`.
 *
 * Nothing happens for an empty edit or when no edit script was requested
 * (ctx->ses == NULL). If the last recorded edit has the same op, it is
 * simply extended by `len`; otherwise a new entry is appended.
 */
static void
_edit (struct _ctx *ctx, gint op, gint off, gint len)
{
	struct diff_edit *last, fresh;

	if (len == 0 || ctx->ses == NULL) {
		return;
	}

	if (ctx->ses->len != 0) {
		last = &g_array_index (ctx->ses, struct diff_edit, ctx->ses->len - 1);
		if (last->op == op) {
			/* Same operation as the previous entry: coalesce */
			last->len += len;
			return;
		}
	}

	fresh.op = op;
	fresh.off = off;
	fresh.len = len;
	g_array_append_val (ctx->ses, fresh);
}

/*
 * Compute the edit script between a[aoff .. aoff + n) and
 * b[boff .. boff + m), recording edits through _edit().
 *
 * Returns the value obtained for this (sub-)problem: m or n for the trivial
 * cases where one side is empty, otherwise the distance reported by
 * _find_middle_snake(). Returns -1 if the middle-snake search (or a
 * recursive call) fails, and ctx->dmax when the distance cap is reached.
 * When ctx->ses is NULL the distance is returned without recursing.
 */
static gint
_ses(const void *a, gint aoff, gint n, const void *b, gint boff,
gint m, struct _ctx *ctx)
{
struct middle_snake ms;
gint d;

if (n == 0) {
/* Nothing left in a: everything remaining in b is an insertion */
_edit (ctx, DIFF_INSERT, boff, m);
d = m;
}
else if (m == 0) {
/* Nothing left in b: everything remaining in a is a deletion */
_edit (ctx, DIFF_DELETE, aoff, n);
d = n;
}
else {
/* Find the middle "snake" around which we
* recursively solve the sub-problems.
*/
d = _find_middle_snake (a, aoff, n, b, boff, m, ctx, &ms);
if (d == -1) {
return -1;
}
else if (d >= ctx->dmax) {
/* Cap reached: no edits are recorded for this range */
return ctx->dmax;
}
else if (ctx->ses == NULL) {
/* Caller only wants the distance */
return d;
}
else if (d > 1) {
/* Left part, then the snake itself as a match, then the right part.
 * Only a -1 result of the recursive calls is acted upon. */
if (_ses (a, aoff, ms.x, b, boff, ms.y, ctx) == -1) {
return -1;
}

_edit (ctx, DIFF_MATCH, aoff + ms.x, ms.u - ms.x);

aoff += ms.u;
boff += ms.v;
n -= ms.u;
m -= ms.v;
if (_ses (a, aoff, n, b, boff, m, ctx) == -1) {
return -1;
}
}
else {
gint x = ms.x;
gint u = ms.u;

/* There are only 4 base cases when the
* edit distance is 1.
*
* n > m m > n
*
* - |
* \ \ x != u
* \ \
*
* \ \
* \ \ x == u
* - |
*/

if (m > n) {
/* b is longer: a single insertion, after (x == u) or before the match */
if (x == u) {
_edit (ctx, DIFF_MATCH, aoff, n);
_edit (ctx, DIFF_INSERT, boff + (m - 1), 1);
}
else {
_edit (ctx, DIFF_INSERT, boff, 1);
_edit (ctx, DIFF_MATCH, aoff, n);
}
}
else {
/* a is longer: a single deletion, after (x == u) or before the match */
if (x == u) {
_edit (ctx, DIFF_MATCH, aoff, m);
_edit (ctx, DIFF_DELETE, aoff + (n - 1), 1);
}
else {
_edit (ctx, DIFF_DELETE, aoff, 1);
_edit (ctx, DIFF_MATCH, aoff + 1, m);
}
}
}
}

return d;
}

/*
 * Compute the shortest edit script between the byte sequences
 * a[aoff .. aoff + n) and b[boff .. boff + m).
 *
 * @param dmax  cap on the edit distance; it is also used as the initial
 *              capacity of the internal work array
 * @param ses   optional array of struct diff_edit that receives the edits;
 *              pass NULL to compute the distance only
 * @param sn    optional; when both `ses` and `sn` are non-NULL it receives
 *              the number of entries held by `ses` after the call
 * @return the value reported by the edit-script search for the part that
 *         follows the common prefix, `dmax` when the cap was reached (the
 *         script in `ses` is then incomplete), or -1 on failure
 */
gint
rspamd_diff (const void *a, gint aoff, gint n, const void *b, gint boff, gint m,
		gint dmax, GArray *ses, gint *sn)
{
	struct _ctx ctx;
	const guchar *a0, *b0;
	gint d, x, y;
	GArray *tmp;

	tmp = g_array_sized_new (FALSE, TRUE, sizeof (gint), dmax);
	ctx.buf = tmp;
	ctx.ses = ses;
	ctx.si = 0;
	ctx.dmax = dmax;

	/* The _ses function assumes the SES will begin or end with a delete
	 * or insert. The following ensures this is true by eating any
	 * beginning matches. This is also a quick way to process sequences
	 * that match entirely.
	 */
	x = y = 0;
	a0 = (const guchar *) a + aoff;
	b0 = (const guchar *) b + boff;
	while (x < n && y < m && a0[x] == b0[y]) {
		x++;
		y++;
	}
	_edit (&ctx, DIFF_MATCH, aoff, x);

	d = _ses (a, aoff + x, n - x, b, boff + y, m - y, &ctx);
	if (d == -1) {
		g_array_free (tmp, TRUE);
		return -1;
	}

	if (ses && sn) {
		/*
		 * Report the number of recorded edits. The previous code read the
		 * op of an edit pointer that was never assigned (always NULL),
		 * which crashed as soon as a caller passed both ses and sn.
		 */
		*sn = (gint) ses->len;
	}

	g_array_free (tmp, TRUE);
	return d;
}

/*
 * Estimate how similar two strings are, as a percentage in 0..100.
 *
 * The strings are diffed byte by byte; every inserted or deleted byte adds
 * to the distance, and the result is 100 - 2 * distance * 100 / (len1 + len2),
 * clamped at 0.
 *
 * @return 100 for identical (or both empty) strings, 0 when the diff fails
 *         or the strings differ too much
 */
guint32
compare_diff_distance (f_str_t *s1, f_str_t *s2)
{
	GArray *ses;
	struct diff_edit *e;
	guint i;
	gint d;
	gsize total, penalty;
	guint32 distance = 0;

	total = (gsize) s1->len + (gsize) s2->len;
	if (total == 0) {
		/* Two empty strings are identical; also avoids dividing by zero */
		return 100;
	}

	ses = g_array_sized_new (FALSE, TRUE, sizeof (struct diff_edit), MAX_DIFF);

	d = rspamd_diff (s1->begin, 0, s1->len,
			s2->begin, 0, s2->len, MAX_DIFF, ses, NULL);
	if (d == -1) {
		/* Diff failed, strings are different */
		g_array_free (ses, TRUE);
		return 0;
	}

	for (i = 0; i < ses->len; i ++) {
		e = &g_array_index (ses, struct diff_edit, i);
		if (e->op != DIFF_MATCH) {
			distance += e->len;
		}
	}

	g_array_free (ses, TRUE);

	if (d >= MAX_DIFF && distance < (guint32) d) {
		/*
		 * The diff stopped at the distance cap, so the edit script is
		 * incomplete. The real distance is at least the cap; use that
		 * lower bound instead of scoring very different texts as similar.
		 */
		distance = (guint32) d;
	}

	penalty = ((gsize) 2 * distance * 100) / total;
	if (penalty >= 100) {
		/* Clamp: the unsigned subtraction below must not wrap around */
		return 0;
	}

	return (guint32) (100 - penalty);
}

+ 49
- 0
src/diff.h View File

@@ -0,0 +1,49 @@
/* Copyright (c) 2010, Vsevolod Stakhov
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#ifndef DIFF_H_
#define DIFF_H_

#include "config.h"
#include "fstring.h"

/* Kind of a single edit-script entry (stored in diff_edit.op) */
typedef enum
{
DIFF_MATCH = 1, /* run present in both sequences */
DIFF_DELETE, /* run present only in the first sequence */
DIFF_INSERT /* run present only in the second sequence */
} diff_op;

/* One entry of the edit script: `len` consecutive elements sharing one op */
struct diff_edit
{
gshort op; /* one of the diff_op values */
gint off; /* off into s1 if MATCH or DELETE but s2 if INSERT */
gint len; /* number of elements covered by this entry */
};

/*
 * Compute the edit script between a[aoff .. aoff + n) and
 * b[boff .. boff + m), compared byte by byte.
 * dmax caps the edit distance; ses (optional) receives struct diff_edit
 * entries; sn is an optional out-parameter that is only touched when both
 * ses and sn are non-NULL.
 * Returns -1 on failure and dmax when the cap was reached.
 */
gint rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m,
gint dmax, GArray *ses, gint *sn);
/*
 * Compare two strings with rspamd_diff() and return a similarity
 * percentage: 100 for identical strings, 0 when the diff fails.
 */
guint32 compare_diff_distance (f_str_t *s1, f_str_t *s2);

#endif /* DIFF_H_ */

+ 7
- 1
src/expressions.c View File

@@ -31,6 +31,7 @@
#include "expressions.h"
#include "html.h"
#include "lua/lua_common.h"
#include "diff.h"

gboolean rspamd_compare_encoding (struct worker_task *task, GList * args, void *unused);
gboolean rspamd_header_exists (struct worker_task *task, GList * args, void *unused);
@@ -1083,7 +1084,12 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused)
return FALSE;
}
if (!p1->is_empty && !p2->is_empty) {
diff = fuzzy_compare_parts (p1, p2);
if (p1->diff_str != NULL && p2->diff_str != NULL) {
diff = compare_diff_distance (p1->diff_str, p2->diff_str);
}
else {
diff = fuzzy_compare_parts (p1, p2);
}
debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold);
*pdiff = diff;
memory_pool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);

+ 17
- 0
src/fstring.c View File

@@ -240,6 +240,23 @@ fstrpush (f_str_t * dest, gchar c)
return 1;
}

/*
 * Push one unicode character to fstr, encoded as UTF-8.
 * Returns the number of bytes appended, or 0 (leaving fstr untouched) when
 * the encoded character does not fit into the remaining space.
 */
gint
fstrpush_unichar (f_str_t * dest, gunichar c)
{
	gint l;

	/* With a NULL buffer only the encoded length (1..6 bytes) is computed */
	l = g_unichar_to_utf8 (c, NULL);

	if ((gsize) dest->len + (gsize) l > (gsize) dest->size) {
		/*
		 * Need to reallocate string: checking only size against len would
		 * let a multi-byte character be written past the end of the buffer.
		 */
		return 0;
	}

	g_unichar_to_utf8 (c, dest->begin + dest->len);
	dest->len += l;
	return l;
}

/*
* Allocate memory for f_str_t
*/

+ 5
- 0
src/fstring.h View File

@@ -68,6 +68,11 @@ size_t fstrcat (f_str_t *dest, f_str_t *src);
*/
gint fstrpush (f_str_t *dest, gchar c);

/*
* Push one character to fstr
*/
gint fstrpush_unichar (f_str_t *dest, gunichar c);

/*
* Allocate memory for f_str_t
*/

+ 18
- 6
src/fuzzy.c View File

@@ -313,7 +313,7 @@ fuzzy_init_byte_array (GByteArray * in, memory_pool_t * pool)
}

void
fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff)
{
fuzzy_hash_t *new, *new2;
gchar *c, *end, *begin;
@@ -321,7 +321,7 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
GList *cur_offset;
struct process_exception *cur_ex = NULL;
gunichar uc;
GString *debug;
gboolean write_diff = FALSE;

cur_offset = part->urls_offset;
if (cur_offset != NULL) {
@@ -371,7 +371,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
}
}

debug = g_string_sized_new (real_len);
write_diff = real_len < max_diff;

if (write_diff) {
part->diff_str = fstralloc (pool, real_len);
}
else {
part->diff_str = NULL;
}

new->block_size = fuzzy_blocksize (real_len);
new2->block_size = new->block_size * 2;

@@ -397,7 +405,9 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
uc = g_utf8_get_char (c);
if (g_unichar_isalnum (uc)) {
fuzzy_update2 (new, new2, uc);
g_string_append_unichar (debug, uc);
if (write_diff) {
fstrpush_unichar (part->diff_str, uc);
}
}
c = g_utf8_next_char (c);
}
@@ -415,13 +425,15 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
else {
if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
fuzzy_update2 (new, new2, *c);
g_string_append_c (debug, *c);
if (write_diff) {
fstrpush (part->diff_str, *c);
}
}
c++;
}
}
}
msg_info ("make hash of string: %v", debug);
/* Check whether we have more bytes in a rolling window */
if (new->rh != 0) {
new->hash_pipe[new->hi] = b64[new->h % 64];

+ 1
- 1
src/fuzzy.h View File

@@ -30,7 +30,7 @@ struct mime_text_part;
*/
fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool);
void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool, gsize max_diff);

gint fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2);


+ 7
- 1
src/lua/lua_task.c View File

@@ -37,6 +37,7 @@
#include "../classifiers/classifiers.h"
#include "../binlog.h"
#include "../statfile_sync.h"
#include "../diff.h"

extern stat_file_t* get_statfile_by_symbol (statfile_pool_t *pool, struct classifier_config *ccf,
const gchar *symbol, struct statfile **st, gboolean try_create);
@@ -1368,7 +1369,12 @@ lua_textpart_compare_distance (lua_State * L)
}
else {
if (!part->is_empty && !other->is_empty) {
diff = fuzzy_compare_parts (part, other);
if (part->diff_str != NULL && other->diff_str != NULL) {
diff = compare_diff_distance (part->diff_str, other->diff_str);
}
else {
diff = fuzzy_compare_parts (part, other);
}
}
else if ((part->is_empty && !other->is_empty) || (!part->is_empty && other->is_empty)) {
/* Empty and non empty parts are different */

+ 2
- 2
src/message.c View File

@@ -786,7 +786,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
#endif
}

fuzzy_init_part (text_part, task->task_pool);
fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff);
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) free_byte_array_callback, text_part->content);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
@@ -806,7 +806,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
text_part->content = text_part->orig;
url_parse_text (task->task_pool, task, text_part, FALSE);
fuzzy_init_part (text_part, task->task_pool);
fuzzy_init_part (text_part, task->task_pool, task->cfg->max_diff);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
}

+ 1
- 0
src/message.h View File

@@ -35,6 +35,7 @@ struct mime_text_part {
fuzzy_hash_t *double_fuzzy;
GMimeObject *parent;
GUnicodeScript script;
f_str_t *diff_str;
};

struct received_header {

Loading…
Cancel
Save