From: Vsevolod Stakhov Date: Tue, 12 Jul 2011 14:20:21 +0000 (+0400) Subject: * Make fuzzy hashes utf8 compatible. X-Git-Tag: 0.4.0~33 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f88f875cf8b89fb9f26a510c66ebf7004ea80d6c;p=rspamd.git * Make fuzzy hashes utf8 compatible. --- diff --git a/src/fuzzy.c b/src/fuzzy.c index 0a24494e2..cdc9922cc 100644 --- a/src/fuzzy.c +++ b/src/fuzzy.c @@ -48,7 +48,7 @@ static struct roll_state rs; /* Rolling hash function based on Adler-32 checksum */ static guint32 -fuzzy_roll_hash (gchar c) +fuzzy_roll_hash (guint c) { /* Check window position */ if (rs.n == ROLL_WINDOW_SIZE) { @@ -73,7 +73,7 @@ fuzzy_roll_hash (gchar c) /* A simple non-rolling hash, based on the FNV hash */ static guint32 -fuzzy_fnv_hash (gchar c, guint32 hval) +fuzzy_fnv_hash (guint c, guint32 hval) { hval ^= c; hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); @@ -95,7 +95,7 @@ fuzzy_blocksize (guint32 len) /* Update hash with new symbol */ static void -fuzzy_update (fuzzy_hash_t * h, gchar c) +fuzzy_update (fuzzy_hash_t * h, guint c) { h->rh = fuzzy_roll_hash (c); h->h = fuzzy_fnv_hash (c, h->h); @@ -110,7 +110,7 @@ fuzzy_update (fuzzy_hash_t * h, gchar c) } static void -fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, gchar c) +fuzzy_update2 (fuzzy_hash_t * h1, fuzzy_hash_t *h2, guint c) { h1->rh = fuzzy_roll_hash (c); h1->h = fuzzy_fnv_hash (c, h1->h); @@ -316,36 +316,57 @@ void fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) { fuzzy_hash_t *new, *new2; - gint i; - gchar *c; + gchar *c, *end, *begin; gsize real_len = 0, len = part->content->len; GList *cur_offset; struct process_exception *cur_ex = NULL; + gunichar uc; cur_offset = part->urls_offset; if (cur_offset != NULL) { cur_ex = cur_offset->data; } - c = part->content->data; + begin = part->content->data; + c = begin; new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); bzero (&rs, sizeof (rs)); - for (i = 0; i < len;) { - if (cur_ex != NULL && cur_ex->pos == i) { - i += cur_ex->len + 1; - c += cur_ex->len + 1; - cur_offset = g_list_next (cur_offset); - if (cur_offset != NULL) { - cur_ex = cur_offset->data; + end = c + len; + + if (part->is_utf) { + while (c < end) { + if (cur_ex != NULL && cur_ex->pos == c - begin) { + c += cur_ex->len + 1; + cur_offset = g_list_next (cur_offset); + if (cur_offset != NULL) { + cur_ex = cur_offset->data; + } + } + else { + uc = g_utf8_get_char (c); + if (g_unichar_isalnum (uc)) { + real_len ++; + } + c = g_utf8_next_char (c); } } - else { - if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { - real_len ++; + } + else { + while (c < end) { + if (cur_ex != NULL && cur_ex->pos == c - begin) { + c += cur_ex->len + 1; + cur_offset = g_list_next (cur_offset); + if (cur_offset != NULL) { + cur_ex = cur_offset->data; + } + } + else { + if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { + real_len ++; + } + c++; } - c++; - i++; } } @@ -357,26 +378,45 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) cur_ex = cur_offset->data; } - c = part->content->data; - - for (i = 0; i < len;) { - if (cur_ex != NULL && cur_ex->pos == i) { - i += cur_ex->len + 1; - c += cur_ex->len + 1; - cur_offset = g_list_next (cur_offset); - if (cur_offset != NULL) { - cur_ex = cur_offset->data; + begin = part->content->data; + c = begin; + end = c + len; + if (part->is_utf) { + + while (c < end) { + if (cur_ex != NULL && cur_ex->pos == c - begin) { + c += cur_ex->len + 1; + cur_offset = g_list_next (cur_offset); + if (cur_offset != NULL) { + cur_ex = cur_offset->data; + } + } + else { + uc = g_utf8_get_char (c); + if (g_unichar_isalnum (uc)) { + fuzzy_update2 (new, new2, uc); + } + c = g_utf8_next_char (c); } } - else { - if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { - fuzzy_update2 (new, new2, *c); + } + else { + while (c < end) { + if (cur_ex != NULL && cur_ex->pos == c - begin) { + c += cur_ex->len + 1; + cur_offset = g_list_next (cur_offset); + if (cur_offset != NULL) { + cur_ex = cur_offset->data; + } + } + else { + if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) { + fuzzy_update2 (new, new2, *c); + } + c++; } - c++; - i++; } } - /* Check whether we have more bytes in a rolling window */ if (new->rh != 0) { new->hash_pipe[new->hi] = b64[new->h % 64]; diff --git a/src/html.c b/src/html.c index bf2610821..306e1e700 100644 --- a/src/html.c +++ b/src/html.c @@ -334,7 +334,7 @@ static entity entities_defs[] = { {"upsih", 978, "upsih"}, {"piv", 982, "piv"}, {"bull", 8226, "bull"}, - {"hellip", 8230, "hellip"}, + {"hellip", 8230, "..."}, {"prime", 8242, "'"}, {"Prime", 8243, "'"}, {"oline", 8254, "-"},