diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
commit | a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch) | |
tree | 352c634bbbc74cf17644545ace66a8feedc841c3 /src/fstring.c | |
parent | 63725086863e4f422340479f83dd7ef374613e76 (diff) | |
download | rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip |
* Welcome 0.4.0
Uncompatible changes:
- Statistics is uncompatible in utf8 mode
Major changes:
- Improved utf8 mode
- Convert all characters to lowercase in statistics
- Skip URL's in statistics
- Improve speed of bayes classifier by using integer arithmetics
- Fixed statfiles synchronization that was broken for a long time
- Synchronization is now configurable
Minor changes:
- Bugfixes
- Removed some of legacy code
- Types polishing
Diffstat (limited to 'src/fstring.c')
-rw-r--r-- | src/fstring.c | 95 |
1 files changed, 72 insertions, 23 deletions
diff --git a/src/fstring.c b/src/fstring.c index 5fcb12bd2..84c8c54bd 100644 --- a/src/fstring.c +++ b/src/fstring.c @@ -297,6 +297,34 @@ fstrgrow (memory_pool_t * pool, f_str_t * orig, size_t newlen) return res; } +static guint32 +fstrhash_c (gchar c, guint32 hval) +{ + guint32 tmp; + /* + * xor in the current byte against each byte of hval + * (which alone gaurantees that every bit of input will have + * an effect on the output) + */ + tmp = c & 0xFF; + tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); + hval ^= tmp; + + /* add some bits out of the middle as low order bits */ + hval = hval + ((hval >> 12) & 0x0000ffff); + + /* swap most and min significative bytes */ + tmp = (hval << 24) | ((hval >> 24) & 0xff); + /* zero most and min significative bytes of hval */ + hval &= 0x00ffff00; + hval |= tmp; + /* + * rotate hval 3 bits to the left (thereby making the + * 3rd msb of the above mess the hsb of the output hash) + */ + return (hval << 3) + (hval >> 29); +} + /* * Return hash value for a string */ @@ -305,7 +333,6 @@ fstrhash (f_str_t * str) { size_t i; guint32 hval; - guint32 tmp; gchar *c = str->begin; if (str == NULL) { @@ -314,32 +341,54 @@ fstrhash (f_str_t * str) hval = str->len; for (i = 0; i < str->len; i++, c++) { - /* - * xor in the current byte against each byte of hval - * (which alone gaurantees that every bit of input will have - * an effect on the output) - */ - tmp = *c & 0xFF; - tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); - hval ^= tmp; - - /* add some bits out of the middle as low order bits */ - hval = hval + ((hval >> 12) & 0x0000ffff); - - /* swap most and min significative bytes */ - tmp = (hval << 24) | ((hval >> 24) & 0xff); - /* zero most and min significative bytes of hval */ - hval &= 0x00ffff00; - hval |= tmp; - /* - * rotate hval 3 bits to the left (thereby making the - * 3rd msb of the above mess the hsb of the output hash) - */ - hval = (hval << 3) + (hval >> 29); + hval = fstrhash_c (*c, hval); } return hval; } +/* + * Return hash value for a string + */ +guint32 +fstrhash_lowercase (f_str_t * str, gboolean is_utf) +{ + gsize i; + guint32 j, hval; + const gchar *p = str->begin, *end = NULL; + gchar t; + gunichar uc; + + if (str == NULL) { + return 0; + } + hval = str->len; + + if (is_utf) { + while (end < str->begin + str->len) { + g_utf8_validate (p, str->len, &end); + while (p < end) { + uc = g_unichar_tolower (g_utf8_get_char (p)); + for (j = 0; j < sizeof (gunichar); j ++) { + t = (uc >> (j * 8)) & 0xff; + if (t != 0) { + hval = fstrhash_c (t, hval); + } + } + p = g_utf8_next_char (p); + } + p = end + 1; + } + + } + else { + for (i = 0; i < str->len; i++, p++) { + hval = fstrhash_c (g_ascii_tolower (*p), hval); + } + } + + return hval; +} + void fstrstrip (f_str_t * str) { |