[Minor] Use a more unified approach to hash strings

author Vsevolod Stakhov <vsevolod@rspamd.com>

Mon, 15 Aug 2022 22:10:36 +0000 (23:10 +0100)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Mon, 15 Aug 2022 22:10:36 +0000 (23:10 +0100)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 15 Aug 2022 22:10:36 +0000 (23:10 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 15 Aug 2022 22:10:36 +0000 (23:10 +0100)
diff --git a/src/libutil/fstring.c b/src/libutil/fstring.c

index 3698bdb3fcbd345ceb2250f6b75d75232226e79d..a4f6b1c7023147b831d453d79efb4a104d1a7910 100644 (file)
--- a/src/libutil/fstring.c
+++ b/src/libutil/fstring.c
@@ -16,6 +16,7 @@
  #include "fstring.h"
  #include "str_util.h"
  #include "contrib/fastutf8/fastutf8.h"
+#include "contrib/mumhash/mum.h"
  
  
  #ifdef WITH_JEMALLOC
@@ -230,32 +231,10 @@ rspamd_fstring_erase (rspamd_fstring_t *str, gsize pos, gsize len)
  }
  
  /* Compat code */
-static guint32
-fstrhash_c (gchar c, guint32 hval)
+static guint64
+fstrhash_c (guint64 c, guint64 hval)
  {
-       guint32 tmp;
-       /*
-        * xor in the current byte against each byte of hval
-        * (which alone guarantees that every bit of input will have
-        * an effect on the output)
-        */
-       tmp = c & 0xFF;
-       tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24);
-       hval ^= tmp;
-
-       /* add some bits out of the middle as low order bits */
-       hval = hval + ((hval >> 12) & 0x0000ffff);
-
-       /* swap most and min significative bytes */
-       tmp = (hval << 24) | ((hval >> 24) & 0xff);
-       /* zero most and min significative bytes of hval */
-       hval &= 0x00ffff00;
-       hval |= tmp;
-       /*
-        * rotate hval 3 bits to the left (thereby making the
-        * 3rd msb of the above mess the hsb of the output hash)
-        */
-       return (hval << 3) + (hval >> 29);
+       return mum_hash_step(hval, c);
  }
  
  
@@ -266,9 +245,8 @@ guint32
  rspamd_fstrhash_lc (const rspamd_ftok_t * str, gboolean is_utf)
  {
         gsize i;
-       guint32 j, hval;
+       guint64 hval;
         const gchar *p, *end = NULL;
-       gchar t;
         gunichar uc;
  
         if (str == NULL) {
@@ -285,18 +263,27 @@ rspamd_fstrhash_lc (const rspamd_ftok_t * str, gboolean is_utf)
                 }
                 while (p < end) {
                         uc = g_unichar_tolower (g_utf8_get_char (p));
-                       for (j = 0; j < sizeof (gunichar); j++) {
-                               t = (uc >> (j * 8)) & 0xff;
-                               if (t != 0) {
-                                       hval = fstrhash_c (t, hval);
-                               }
-                       }
+                       hval = fstrhash_c (uc, hval);
                         p = g_utf8_next_char (p);
                 }
         }
         else {
-               for (i = 0; i < str->len; i++, p++) {
-                       hval = fstrhash_c (g_ascii_tolower (*p), hval);
+               gsize large_steps = str->len / sizeof (guint64);
+               for (i = 0; i < large_steps; i++, p += sizeof (guint64)) {
+                       /* Copy to the uint64 lowercasing each byte */
+                       union {
+                               char c[sizeof(guint64)];
+                               guint64 iu64;
+                       } t;
+                       for (int j = 0; j < sizeof(guint64); j ++) {
+                               t.c[j] = g_ascii_tolower(p[j]);
+                       }
+                       hval = fstrhash_c (t.iu64, hval);
+               }
+
+               gsize remain = str->len % sizeof(guint64);
+               for (i = 0; i < remain; i ++, p ++) {
+                       hval = fstrhash_c (g_ascii_tolower(*p), hval);
                 }
         }
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Mon, 15 Aug 2022 22:10:36 +0000 (23:10 +0100)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Mon, 15 Aug 2022 22:10:36 +0000 (23:10 +0100)