source.dussan.org Git - rspamd.git/commitdiff
* Make fuzzy hashes utf8 compatible.
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 12 Jul 2011 14:20:21 +0000 (18:20 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 12 Jul 2011 14:20:21 +0000 (18:20 +0400)
src/fuzzy.c
src/html.c

index 0a24494e2ca748191f4265e4e5a2643423d063d0..cdc9922cc27b6a735d08f4acef316be1e34b61da 100644 (file)
@@ -48,7 +48,7 @@ static struct roll_state        rs;
 
 /* Rolling hash function based on Adler-32 checksum */
 static                          guint32
-fuzzy_roll_hash (gchar c)
+fuzzy_roll_hash (guint c)
 {
        /* Check window position */
        if (rs.n == ROLL_WINDOW_SIZE) {
@@ -73,7 +73,7 @@ fuzzy_roll_hash (gchar c)
 
 /* A simple non-rolling hash, based on the FNV hash */
 static                          guint32
-fuzzy_fnv_hash (gchar c, guint32 hval)
+fuzzy_fnv_hash (guint c, guint32 hval)
 {
        hval ^= c;
        hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
@@ -95,7 +95,7 @@ fuzzy_blocksize (guint32 len)
 
 /* Update hash with new symbol */
 static void
-fuzzy_update (fuzzy_hash_t * h, gchar c)
+fuzzy_update (fuzzy_hash_t * h, guint c)
 {
        h->rh = fuzzy_roll_hash (c);
        h->h = fuzzy_fnv_hash (c, h->h);
@@ -110,7 +110,7 @@ fuzzy_update (fuzzy_hash_t * h, gchar c)
 }
 
 static void
-fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, gchar c)
+fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, guint c)
 {
        h1->rh = fuzzy_roll_hash (c);
        h1->h = fuzzy_fnv_hash (c, h1->h);
@@ -316,36 +316,57 @@ void
 fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
 {
        fuzzy_hash_t                   *new, *new2;
-       gint                            i;
-       gchar                          *c;
+       gchar                          *c, *end, *begin;
        gsize                           real_len = 0, len = part->content->len;
        GList                          *cur_offset;
        struct process_exception       *cur_ex = NULL;
+       gunichar                        uc;
 
        cur_offset = part->urls_offset;
        if (cur_offset != NULL) {
                cur_ex = cur_offset->data;
        }
 
-       c = part->content->data;
+       begin = part->content->data;
+       c = begin;
        new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
        new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
        bzero (&rs, sizeof (rs));
-       for (i = 0; i < len;) {
-               if (cur_ex != NULL && cur_ex->pos == i) {
-                       i += cur_ex->len + 1;
-                       c += cur_ex->len + 1;
-                       cur_offset = g_list_next (cur_offset);
-                       if (cur_offset != NULL) {
-                               cur_ex = cur_offset->data;
+       end = c + len;
+
+       if (part->is_utf) {
+               while (c < end) {
+                       if (cur_ex != NULL && cur_ex->pos == c - begin) {
+                               c += cur_ex->len + 1;
+                               cur_offset = g_list_next (cur_offset);
+                               if (cur_offset != NULL) {
+                                       cur_ex = cur_offset->data;
+                               }
+                       }
+                       else {
+                               uc = g_utf8_get_char (c);
+                               if (g_unichar_isalnum (uc)) {
+                                       real_len ++;
+                               }
+                               c = g_utf8_next_char (c);
                        }
                }
-               else {
-                       if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
-                               real_len ++;
+       }
+       else {
+               while (c < end) {
+                       if (cur_ex != NULL && cur_ex->pos == c - begin) {
+                               c += cur_ex->len + 1;
+                               cur_offset = g_list_next (cur_offset);
+                               if (cur_offset != NULL) {
+                                       cur_ex = cur_offset->data;
+                               }
+                       }
+                       else {
+                               if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
+                                       real_len ++;
+                               }
+                               c++;
                        }
-                       c++;
-                       i++;
                }
        }
 
@@ -357,26 +378,45 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
                cur_ex = cur_offset->data;
        }
 
-       c = part->content->data;
-
-       for (i = 0; i < len;) {
-               if (cur_ex != NULL && cur_ex->pos == i) {
-                       i += cur_ex->len + 1;
-                       c += cur_ex->len + 1;
-                       cur_offset = g_list_next (cur_offset);
-                       if (cur_offset != NULL) {
-                               cur_ex = cur_offset->data;
+       begin = part->content->data;
+       c = begin;
+       end = c + len;
+       if (part->is_utf) {
+
+               while (c < end) {
+                       if (cur_ex != NULL && cur_ex->pos == c - begin) {
+                               c += cur_ex->len + 1;
+                               cur_offset = g_list_next (cur_offset);
+                               if (cur_offset != NULL) {
+                                       cur_ex = cur_offset->data;
+                               }
+                       }
+                       else {
+                               uc = g_utf8_get_char (c);
+                               if (g_unichar_isalnum (uc)) {
+                                       fuzzy_update2 (new, new2, uc);
+                               }
+                               c = g_utf8_next_char (c);
                        }
                }
-               else {
-                       if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
-                               fuzzy_update2 (new, new2, *c);
+       }
+       else {
+               while (c < end) {
+                       if (cur_ex != NULL && cur_ex->pos == c - begin) {
+                               c += cur_ex->len + 1;
+                               cur_offset = g_list_next (cur_offset);
+                               if (cur_offset != NULL) {
+                                       cur_ex = cur_offset->data;
+                               }
+                       }
+                       else {
+                               if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
+                                       fuzzy_update2 (new, new2, *c);
+                               }
+                               c++;
                        }
-                       c++;
-                       i++;
                }
        }
-
        /* Check whether we have more bytes in a rolling window */
        if (new->rh != 0) {
                new->hash_pipe[new->hi] = b64[new->h % 64];
index bf261082140d1211f95ca5fe9c11a71a25aa4118..306e1e70059e4b592ab58f53935ca2949a2f27f1 100644 (file)
@@ -334,7 +334,7 @@ static entity                   entities_defs[] = {
        {"upsih", 978, "upsih"},
        {"piv", 982, "piv"},
        {"bull", 8226, "bull"},
-       {"hellip", 8230, "hellip"},
+       {"hellip", 8230, "..."},
        {"prime", 8242, "'"},
        {"Prime", 8243, "'"},
        {"oline", 8254, "-"},