From ddba1860ecead8529f6542bb69c4027e8a7e13a4 Mon Sep 17 00:00:00 2001 From: "cebka@lenovo-laptop" Date: Thu, 18 Mar 2010 19:43:55 +0300 Subject: [PATCH] * Try to speed up fuzzy storage --- CMakeLists.txt | 40 +++++++++--- config.h.in | 2 + src/fuzzy_storage.c | 144 +++++++++++++++++++++++++++++++------------- 3 files changed, 135 insertions(+), 51 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37630d51e..7e866f719 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,8 @@ LINK_DIRECTORIES(${GMIME2_LIBRARY_DIRS}) SET(CMAKE_REQUIRED_LIBRARIES m) SET(CMAKE_REQUIRED_INCLUDES sys/mman.h stdlib.h stdio.h unistd.h time.h sched.h) +# Check for libevent + FIND_LIBRARY(LIBEVENT_LIBRARY NAMES event PATHS /lib /opt/lib /usr/lib @@ -160,6 +162,33 @@ IF(NOT LIBEVENT_LIBRARY) MESSAGE(FATAL_ERROR "libevent is required for building rspamd") ENDIF(NOT LIBEVENT_LIBRARY) +FIND_PATH(LIBEVENT_INCLUDE event.h PATHS /opt/include + /usr/include + /usr/local/include + DOC "Path where the libevent header files can be found") + +GET_FILENAME_COMPONENT(LIBEVENT_PATH "${LIBEVENT_LIBRARY}" PATH) +INCLUDE_DIRECTORIES("${LIBEVENT_INCLUDE}") +LINK_DIRECTORIES("${LIBEVENT_PATH}") + +# Find libjudy +#FIND_LIBRARY(LIBJUDY_LIBRARY NAMES judy PATHS /lib +# /opt/lib +# /usr/lib +# /usr/local/lib +# DOC "Path where the lijudy library can be found") +#IF(LIBJUDY_LIBRARY) +# FIND_PATH(LIBJUDY_INCLUDE Judy.h PATHS /opt/include +# /usr/include +# /usr/local/include +# DOC "Path where the judy header files can be found") +# +# GET_FILENAME_COMPONENT(LIBJUDY_PATH "${LIBJUDY_LIBRARY}" PATH) +# INCLUDE_DIRECTORIES("${LIBJUDY_INCLUDE}") +# LINK_DIRECTORIES("${LIBJUDY_PATH}") +# SET(WITH_JUDY 1) +#ENDIF(LIBJUDY_LIBRARY) + IF(ENABLE_PROFILING MATCHES "ON") SET(WITH_PROFILER 1) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pg") @@ -193,14 +222,6 @@ IF(ENABLE_GPERF_TOOLS MATCHES "ON") ENDIF(ENABLE_GPERF_TOOLS MATCHES "ON") -FIND_PATH(LIBEVENT_INCLUDE event.h PATHS /opt/include - /usr/include - /usr/local/include - DOC "Path where the libevent header files can be found") - -GET_FILENAME_COMPONENT(LIBEVENT_PATH "${LIBEVENT_LIBRARY}" PATH) -INCLUDE_DIRECTORIES("${LIBEVENT_INCLUDE}") -LINK_DIRECTORIES("${LIBEVENT_PATH}") FIND_LIBRARY(LIBUTIL_LIBRARY NAMES util PATHS /lib /opt/lib @@ -532,6 +553,9 @@ TARGET_LINK_LIBRARIES(rspamd m) IF(LIBUTIL_LIBRARY) TARGET_LINK_LIBRARIES(rspamd util) ENDIF(LIBUTIL_LIBRARY) +IF(LIBJUDY_LIBRARY) + TARGET_LINK_LIBRARIES(rspamd judy) +ENDIF(LIBJUDY_LIBRARY) TARGET_LINK_LIBRARIES(rspamd rspamd_evdns) TARGET_LINK_LIBRARIES(rspamd event) TARGET_LINK_LIBRARIES(rspamd rspamd_json) diff --git a/config.h.in b/config.h.in index cdb5680f1..99ea39c5b 100644 --- a/config.h.in +++ b/config.h.in @@ -111,6 +111,8 @@ #cmakedefine WITH_PROFILER 1 +#cmakedefine WITH_JUDY 1 + #cmakedefine WITH_GPERF_TOOLS 1 #cmakedefine HAVE_ASM_PAUSE 1 diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c index f6ed7b0d6..4edc77534 100644 --- a/src/fuzzy_storage.c +++ b/src/fuzzy_storage.c @@ -51,8 +51,11 @@ #define BUCKETS 1024 /* Number of insuccessfull bind retries */ #define MAX_RETRIES 40 +/* Weight of hash to consider it frequent */ +#define FREQUENT_SCORE 100 static GQueue *hashes[BUCKETS]; +static GQueue *frequent; static bloom_filter_t *bf; /* Number of cache modifications */ @@ -90,6 +93,14 @@ sig_handler (int signo, siginfo_t *info, void *unused) } } +static gint +compare_nodes (gconstpointer a, gconstpointer b, gpointer unused) +{ + const struct rspamd_fuzzy_node *n1 = a, *n2 = b; + + return n1->value - n2->value; +} + static void sync_cache (struct rspamd_worker *wrk) { @@ -132,6 +143,14 @@ sync_cache (struct rspamd_worker *wrk) (void)lock_file (fd, FALSE); now = (uint64_t) time (NULL); + cur = frequent->head; + while (cur) { + node = cur->data; + if (write (fd, node, sizeof (struct rspamd_fuzzy_node)) == -1) { + msg_err ("cannot write file %s: %s", filename, strerror (errno)); + } + cur = g_list_next (cur); + } for (i = 0; i < BUCKETS; i++) { cur = hashes[i]->head; while (cur) { @@ -204,6 +223,7 @@ read_hashes_file (struct rspamd_worker *wrk) for (i = 0; i < BUCKETS; i++) { hashes[i] = g_queue_new (); } + frequent = g_queue_new (); filename = g_hash_table_lookup (wrk->cf->params, "hashfile"); if (filename == NULL) { @@ -225,11 +245,22 @@ read_hashes_file (struct rspamd_worker *wrk) if (r != sizeof (struct rspamd_fuzzy_node)) { break; } - g_queue_push_head (hashes[node->h.block_size % BUCKETS], node); + if (node->value > FREQUENT_SCORE) { + g_queue_push_head (frequent, node); + } + else { + g_queue_push_head (hashes[node->h.block_size % BUCKETS], node); + } bloom_add (bf, node->h.hash_pipe); server_stat->fuzzy_hashes ++; } + /* Sort everything */ + g_queue_sort (frequent, compare_nodes, NULL); + for (i = 0; i < BUCKETS; i ++) { + g_queue_sort (hashes[i], compare_nodes, NULL); + } + (void)unlock_file (fd, FALSE); close (fd); @@ -244,60 +275,69 @@ read_hashes_file (struct rspamd_worker *wrk) return TRUE; } -static int -process_check_command (struct fuzzy_cmd *cmd) +static inline int +check_hash_node (GQueue *hash, fuzzy_hash_t *s, int update_value) { GList *cur; struct rspamd_fuzzy_node *h; - fuzzy_hash_t s; int prob = 0; - - if (!bloom_check (bf, cmd->hash)) { - return 0; + + cur = frequent->head; + while (cur) { + h = cur->data; + if ((prob = fuzzy_compare_hashes (&h->h, s)) > LEV_LIMIT) { + msg_info ("fuzzy hash was found, probability %d%%", prob); + return h->value; + } + cur = g_list_next (cur); } - memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe)); - s.block_size = cmd->blocksize; - cur = hashes[cmd->blocksize % BUCKETS]->head; - - /* XXX: too slow way */ + cur = hash->head; while (cur) { h = cur->data; - if ((prob = fuzzy_compare_hashes (&h->h, &s)) > LEV_LIMIT) { + if ((prob = fuzzy_compare_hashes (&h->h, s)) > LEV_LIMIT) { msg_info ("fuzzy hash was found, probability %d%%", prob); + if (update_value) { + h->value += update_value; + } + if (h->value > FREQUENT_SCORE) { + g_queue_unlink (hash, cur); + g_queue_push_head_link (frequent, cur); + } return h->value; } cur = g_list_next (cur); } - msg_debug ("fuzzy hash was NOT found, prob is %d%%", prob); return 0; } +static int +process_check_command (struct fuzzy_cmd *cmd) +{ + fuzzy_hash_t s; + + if (!bloom_check (bf, cmd->hash)) { + return 0; + } + + memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe)); + s.block_size = cmd->blocksize; + + return check_hash_node (hashes[cmd->blocksize % BUCKETS], &s, 0); +} + static gboolean update_hash (struct fuzzy_cmd *cmd) { GList *cur; - struct rspamd_fuzzy_node *h; fuzzy_hash_t s; - int prob = 0; memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe)); s.block_size = cmd->blocksize; cur = hashes[cmd->blocksize % BUCKETS]->head; - /* XXX: too slow way */ - while (cur) { - h = cur->data; - if ((prob = fuzzy_compare_hashes (&h->h, &s)) > LEV_LIMIT) { - h->value += cmd->value; - msg_info ("fuzzy hash was found, probability %d%%, set new value to %d", prob, h->value); - return TRUE; - } - cur = g_list_next (cur); - } - - return FALSE; + return check_hash_node (hashes[cmd->blocksize % BUCKETS], &s, cmd->value); } static gboolean @@ -324,40 +364,58 @@ process_write_command (struct fuzzy_cmd *cmd) return TRUE; } -static gboolean -process_delete_command (struct fuzzy_cmd *cmd) +static gboolean +delete_hash (GQueue *hash, fuzzy_hash_t *s) { GList *cur, *tmp; struct rspamd_fuzzy_node *h; - fuzzy_hash_t s; gboolean res = FALSE; - - if (!bloom_check (bf, cmd->hash)) { - return FALSE; - } - - memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe)); - s.block_size = cmd->blocksize; - cur = hashes[cmd->blocksize % BUCKETS]->head; + + cur = hash->head; /* XXX: too slow way */ while (cur) { h = cur->data; - if (fuzzy_compare_hashes (&h->h, &s) > LEV_LIMIT) { + if (fuzzy_compare_hashes (&h->h, s) > LEV_LIMIT) { g_free (h); tmp = cur; cur = g_list_next (cur); - g_queue_delete_link (hashes[cmd->blocksize % BUCKETS], tmp); - bloom_del (bf, cmd->hash); + g_queue_delete_link (hash, tmp); + bloom_del (bf, s->hash_pipe); msg_info ("fuzzy hash was successfully deleted"); server_stat->fuzzy_hashes --; - res = TRUE; mods++; + res = TRUE; continue; } cur = g_list_next (cur); } + return res; + +} + +static gboolean +process_delete_command (struct fuzzy_cmd *cmd) +{ + fuzzy_hash_t s; + gboolean res = FALSE; + + if (!bloom_check (bf, cmd->hash)) { + return FALSE; + } + + memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe)); + s.block_size = cmd->blocksize; + + res = delete_hash (frequent, &s); + if (!res) { + res = delete_hash (hashes[cmd->blocksize % BUCKETS], &s); + } + else { + (void)delete_hash (hashes[cmd->blocksize % BUCKETS], &s); + } + return res; } -- 2.39.5