aboutsummaryrefslogtreecommitdiffstats
path: root/src/diff.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-09 21:15:49 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-09 21:15:49 +0400
commitaa7c8f99ba4860a3cbf2153026cb91f6053fc66c (patch)
treed86f53e884418bf72863b2f60d4fed6ccd531778 /src/diff.c
parent8e399cdba1bba1da8c1de2b8a22efe719aa30cde (diff)
downloadrspamd-aa7c8f99ba4860a3cbf2153026cb91f6053fc66c.tar.gz
rspamd-aa7c8f99ba4860a3cbf2153026cb91f6053fc66c.zip
* Compare the distance between parts on normalized text in the diff algorithm,
which reduces the number of false positives for this function.
Diffstat (limited to 'src/diff.c')
-rw-r--r--src/diff.c90
1 files changed, 86 insertions, 4 deletions
diff --git a/src/diff.c b/src/diff.c
index 9bd1d1d1c..3173b5f19 100644
--- a/src/diff.c
+++ b/src/diff.c
@@ -355,8 +355,8 @@ rspamd_diff(const void *a, gint aoff, gint n, const void *b, gint boff, gint m,
return d;
}
-guint32
-compare_diff_distance (f_str_t *s1, f_str_t *s2)
+static guint32
+compare_diff_distance_unnormalized (f_str_t *s1, f_str_t *s2)
{
GArray *ses;
struct diff_edit *e;
@@ -366,7 +366,7 @@ compare_diff_distance (f_str_t *s1, f_str_t *s2)
ses = g_array_sized_new (FALSE, TRUE, sizeof (struct diff_edit), MAX_DIFF);
if (rspamd_diff (s1->begin, 0, s1->len,
- s2->begin, 0, s2->len, MAX_DIFF, ses, NULL) == -1) {
+ s2->begin, 0, s2->len, MAX_DIFF, ses, NULL) == -1) {
/* Diff failed, strings are different */
g_array_free (ses, TRUE);
return 0;
@@ -380,5 +380,87 @@ compare_diff_distance (f_str_t *s1, f_str_t *s2)
}
g_array_free (ses, TRUE);
- return 100 - (2 * distance * 100) / (s1->len + s2->len);
+
+ return distance;
+}
+
+/*
+ * Similarity of two strings as a percentage in the range 0..100, derived
+ * from the edit distance reported by compare_diff_distance_unnormalized():
+ *
+ *   100 - (2 * distance * 100) / (s1->len + s2->len)
+ *
+ * Two empty strings are treated as identical (100) instead of dividing by
+ * zero, and a penalty above 100 is clamped to 0 so that the unsigned
+ * subtraction cannot wrap around to a huge "similarity".
+ *
+ * NOTE(review): compare_diff_distance_unnormalized() returns 0 when
+ * rspamd_diff() fails; this formula maps that to 100 (identical), although
+ * the comment on that path says the strings are different. Confirm and
+ * handle the failure case explicitly in the helper.
+ */
+guint32
+compare_diff_distance (f_str_t *s1, f_str_t *s2)
+{
+	gsize total;
+	guint64 penalty;
+
+	total = s1->len + s2->len;
+	if (total == 0) {
+		/* Nothing to compare and nothing to divide by */
+		return 100;
+	}
+
+	/* Widen before scaling so that large distances do not overflow 32 bits */
+	penalty = ((guint64)compare_diff_distance_unnormalized (s1, s2) * 200) / total;
+
+	return penalty >= 100 ? 0 : 100 - (guint32)penalty;
+}
+
+
+/*
+ * Copy bytes from *src into dst, skipping ASCII whitespace and lowercasing
+ * everything else, until the input is exhausted or dst is full.
+ * On return *src and *remain are advanced past the consumed input.
+ * Returns the number of bytes stored in dst.
+ */
+static gsize
+diff_normalize_chunk (const gchar **src, gsize *remain, gchar *dst, gsize dstlen)
+{
+	const gchar *in = *src;
+	gsize left = *remain, out = 0;
+
+	/* Bound the loop by the output position, not by the input pointer */
+	while (left > 0 && out < dstlen) {
+		if (!g_ascii_isspace (*in)) {
+			dst[out++] = g_ascii_tolower (*in);
+		}
+		in ++;
+		left --;
+	}
+
+	*src = in;
+	*remain = left;
+
+	return out;
+}
+
+/*
+ * Similarity of two strings as a percentage in the range 0..100, ignoring
+ * ASCII whitespace and ASCII letter case.
+ *
+ * Both inputs are processed in chunks: each chunk is normalized into a
+ * BUFSIZ-sized stack buffer and the chunks are compared pairwise with
+ * compare_diff_distance_unnormalized(). Once one input is exhausted, every
+ * remaining non-whitespace byte of the other input counts as one difference.
+ * The accumulated distance is scaled by the original (non-normalized)
+ * lengths: 100 - (2 * distance * 100) / (s1->len + s2->len).
+ *
+ * Two empty strings yield 100; a penalty above 100 is clamped to 0 so the
+ * unsigned subtraction cannot wrap around.
+ *
+ * NOTE(review): compare_diff_distance_unnormalized() returns 0 when
+ * rspamd_diff() fails, so a failed chunk adds no distance here. Confirm
+ * whether a failure should be counted as a full mismatch instead.
+ */
+guint32
+compare_diff_distance_normalized (f_str_t *s1, f_str_t *s2)
+{
+	gchar b1[BUFSIZ], b2[BUFSIZ];
+	const gchar *p1, *p2, *rest;
+	gsize r1, r2, total;
+	guint64 penalty;
+	f_str_t t1, t2;
+	guint32 cur_diff = 0;
+
+	total = s1->len + s2->len;
+	if (total == 0) {
+		/* Nothing to compare and nothing to divide by */
+		return 100;
+	}
+
+	r1 = s1->len;
+	r2 = s2->len;
+	p1 = s1->begin;
+	p2 = s2->begin;
+
+	while (r1 > 0 && r2 > 0) {
+		/* Normalize the next chunk of each string and diff the chunks */
+		t1.begin = b1;
+		t1.len = diff_normalize_chunk (&p1, &r1, b1, sizeof (b1));
+
+		t2.begin = b2;
+		t2.len = diff_normalize_chunk (&p2, &r2, b2, sizeof (b2));
+
+		cur_diff += compare_diff_distance_unnormalized (&t1, &t2);
+	}
+
+	/* At most one of the inputs still has unread data at this point */
+	if (r1 > 0) {
+		rest = p1;
+	}
+	else {
+		rest = p2;
+		r1 = r2;
+	}
+
+	/* Every leftover non-whitespace byte is one more difference */
+	for (; r1 > 0; r1 --, rest ++) {
+		if (!g_ascii_isspace (*rest)) {
+			cur_diff ++;
+		}
+	}
+
+	/* Widen before scaling so that large distances do not overflow 32 bits */
+	penalty = ((guint64)cur_diff * 200) / total;
+
+	return penalty >= 100 ? 0 : 100 - (guint32)penalty;
}