aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorcebka@lenovo-laptop <cebka@lenovo-laptop>2010-03-18 19:43:55 +0300
committercebka@lenovo-laptop <cebka@lenovo-laptop>2010-03-18 19:43:55 +0300
commitddba1860ecead8529f6542bb69c4027e8a7e13a4 (patch)
tree8ad0857ad2ca333f7becdf321e7dc49866d5e54b
parentddd99f6762c055a7fb7afc5c03fec941c5671d67 (diff)
downloadrspamd-ddba1860ecead8529f6542bb69c4027e8a7e13a4.tar.gz
rspamd-ddba1860ecead8529f6542bb69c4027e8a7e13a4.zip
* Try to speed up fuzzy storage
-rw-r--r--CMakeLists.txt40
-rw-r--r--config.h.in2
-rw-r--r--src/fuzzy_storage.c144
3 files changed, 135 insertions, 51 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37630d51e..7e866f719 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,6 +151,8 @@ LINK_DIRECTORIES(${GMIME2_LIBRARY_DIRS})
SET(CMAKE_REQUIRED_LIBRARIES m)
SET(CMAKE_REQUIRED_INCLUDES sys/mman.h stdlib.h stdio.h unistd.h time.h sched.h)
+# Check for libevent
+
FIND_LIBRARY(LIBEVENT_LIBRARY NAMES event PATHS /lib
/opt/lib
/usr/lib
@@ -160,6 +162,33 @@ IF(NOT LIBEVENT_LIBRARY)
MESSAGE(FATAL_ERROR "libevent is required for building rspamd")
ENDIF(NOT LIBEVENT_LIBRARY)
+FIND_PATH(LIBEVENT_INCLUDE event.h PATHS /opt/include
+ /usr/include
+ /usr/local/include
+ DOC "Path where the libevent header files can be found")
+
+GET_FILENAME_COMPONENT(LIBEVENT_PATH "${LIBEVENT_LIBRARY}" PATH)
+INCLUDE_DIRECTORIES("${LIBEVENT_INCLUDE}")
+LINK_DIRECTORIES("${LIBEVENT_PATH}")
+
+# Find libjudy
+#FIND_LIBRARY(LIBJUDY_LIBRARY NAMES judy PATHS /lib
+# /opt/lib
+# /usr/lib
+# /usr/local/lib
+# DOC "Path where the lijudy library can be found")
+#IF(LIBJUDY_LIBRARY)
+# FIND_PATH(LIBJUDY_INCLUDE Judy.h PATHS /opt/include
+# /usr/include
+# /usr/local/include
+# DOC "Path where the judy header files can be found")
+#
+# GET_FILENAME_COMPONENT(LIBJUDY_PATH "${LIBJUDY_LIBRARY}" PATH)
+# INCLUDE_DIRECTORIES("${LIBJUDY_INCLUDE}")
+# LINK_DIRECTORIES("${LIBJUDY_PATH}")
+# SET(WITH_JUDY 1)
+#ENDIF(LIBJUDY_LIBRARY)
+
IF(ENABLE_PROFILING MATCHES "ON")
SET(WITH_PROFILER 1)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pg")
@@ -193,14 +222,6 @@ IF(ENABLE_GPERF_TOOLS MATCHES "ON")
ENDIF(ENABLE_GPERF_TOOLS MATCHES "ON")
-FIND_PATH(LIBEVENT_INCLUDE event.h PATHS /opt/include
- /usr/include
- /usr/local/include
- DOC "Path where the libevent header files can be found")
-
-GET_FILENAME_COMPONENT(LIBEVENT_PATH "${LIBEVENT_LIBRARY}" PATH)
-INCLUDE_DIRECTORIES("${LIBEVENT_INCLUDE}")
-LINK_DIRECTORIES("${LIBEVENT_PATH}")
FIND_LIBRARY(LIBUTIL_LIBRARY NAMES util PATHS /lib
/opt/lib
@@ -532,6 +553,9 @@ TARGET_LINK_LIBRARIES(rspamd m)
IF(LIBUTIL_LIBRARY)
TARGET_LINK_LIBRARIES(rspamd util)
ENDIF(LIBUTIL_LIBRARY)
+IF(LIBJUDY_LIBRARY)
+ TARGET_LINK_LIBRARIES(rspamd judy)
+ENDIF(LIBJUDY_LIBRARY)
TARGET_LINK_LIBRARIES(rspamd rspamd_evdns)
TARGET_LINK_LIBRARIES(rspamd event)
TARGET_LINK_LIBRARIES(rspamd rspamd_json)
diff --git a/config.h.in b/config.h.in
index cdb5680f1..99ea39c5b 100644
--- a/config.h.in
+++ b/config.h.in
@@ -111,6 +111,8 @@
#cmakedefine WITH_PROFILER 1
+#cmakedefine WITH_JUDY 1
+
#cmakedefine WITH_GPERF_TOOLS 1
#cmakedefine HAVE_ASM_PAUSE 1
diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c
index f6ed7b0d6..4edc77534 100644
--- a/src/fuzzy_storage.c
+++ b/src/fuzzy_storage.c
@@ -51,8 +51,11 @@
#define BUCKETS 1024
/* Number of insuccessfull bind retries */
#define MAX_RETRIES 40
+/* Weight of hash to consider it frequent */
+#define FREQUENT_SCORE 100
static GQueue *hashes[BUCKETS];
+static GQueue *frequent;
static bloom_filter_t *bf;
/* Number of cache modifications */
@@ -90,6 +93,14 @@ sig_handler (int signo, siginfo_t *info, void *unused)
}
}
+static gint
+compare_nodes (gconstpointer a, gconstpointer b, gpointer unused)
+{
+ const struct rspamd_fuzzy_node *n1 = a, *n2 = b;
+
+ return n1->value - n2->value;
+}
+
static void
sync_cache (struct rspamd_worker *wrk)
{
@@ -132,6 +143,14 @@ sync_cache (struct rspamd_worker *wrk)
(void)lock_file (fd, FALSE);
now = (uint64_t) time (NULL);
+ cur = frequent->head;
+ while (cur) {
+ node = cur->data;
+ if (write (fd, node, sizeof (struct rspamd_fuzzy_node)) == -1) {
+ msg_err ("cannot write file %s: %s", filename, strerror (errno));
+ }
+ cur = g_list_next (cur);
+ }
for (i = 0; i < BUCKETS; i++) {
cur = hashes[i]->head;
while (cur) {
@@ -204,6 +223,7 @@ read_hashes_file (struct rspamd_worker *wrk)
for (i = 0; i < BUCKETS; i++) {
hashes[i] = g_queue_new ();
}
+ frequent = g_queue_new ();
filename = g_hash_table_lookup (wrk->cf->params, "hashfile");
if (filename == NULL) {
@@ -225,11 +245,22 @@ read_hashes_file (struct rspamd_worker *wrk)
if (r != sizeof (struct rspamd_fuzzy_node)) {
break;
}
- g_queue_push_head (hashes[node->h.block_size % BUCKETS], node);
+ if (node->value > FREQUENT_SCORE) {
+ g_queue_push_head (frequent, node);
+ }
+ else {
+ g_queue_push_head (hashes[node->h.block_size % BUCKETS], node);
+ }
bloom_add (bf, node->h.hash_pipe);
server_stat->fuzzy_hashes ++;
}
+ /* Sort everything */
+ g_queue_sort (frequent, compare_nodes, NULL);
+ for (i = 0; i < BUCKETS; i ++) {
+ g_queue_sort (hashes[i], compare_nodes, NULL);
+ }
+
(void)unlock_file (fd, FALSE);
close (fd);
@@ -244,60 +275,69 @@ read_hashes_file (struct rspamd_worker *wrk)
return TRUE;
}
-static int
-process_check_command (struct fuzzy_cmd *cmd)
+static inline int
+check_hash_node (GQueue *hash, fuzzy_hash_t *s, int update_value)
{
GList *cur;
struct rspamd_fuzzy_node *h;
- fuzzy_hash_t s;
int prob = 0;
-
- if (!bloom_check (bf, cmd->hash)) {
- return 0;
+
+ cur = frequent->head;
+ while (cur) {
+ h = cur->data;
+ if ((prob = fuzzy_compare_hashes (&h->h, s)) > LEV_LIMIT) {
+ msg_info ("fuzzy hash was found, probability %d%%", prob);
+ return h->value;
+ }
+ cur = g_list_next (cur);
}
- memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
- s.block_size = cmd->blocksize;
- cur = hashes[cmd->blocksize % BUCKETS]->head;
-
- /* XXX: too slow way */
+ cur = hash->head;
while (cur) {
h = cur->data;
- if ((prob = fuzzy_compare_hashes (&h->h, &s)) > LEV_LIMIT) {
+ if ((prob = fuzzy_compare_hashes (&h->h, s)) > LEV_LIMIT) {
msg_info ("fuzzy hash was found, probability %d%%", prob);
+ if (update_value) {
+ h->value += update_value;
+ }
+ if (h->value > FREQUENT_SCORE) {
+ g_queue_unlink (hash, cur);
+ g_queue_push_head_link (frequent, cur);
+ }
return h->value;
}
cur = g_list_next (cur);
}
- msg_debug ("fuzzy hash was NOT found, prob is %d%%", prob);
return 0;
}
+static int
+process_check_command (struct fuzzy_cmd *cmd)
+{
+ fuzzy_hash_t s;
+
+ if (!bloom_check (bf, cmd->hash)) {
+ return 0;
+ }
+
+ memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
+ s.block_size = cmd->blocksize;
+
+ return check_hash_node (hashes[cmd->blocksize % BUCKETS], &s, 0);
+}
+
static gboolean
update_hash (struct fuzzy_cmd *cmd)
{
GList *cur;
- struct rspamd_fuzzy_node *h;
fuzzy_hash_t s;
- int prob = 0;
memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
s.block_size = cmd->blocksize;
cur = hashes[cmd->blocksize % BUCKETS]->head;
- /* XXX: too slow way */
- while (cur) {
- h = cur->data;
- if ((prob = fuzzy_compare_hashes (&h->h, &s)) > LEV_LIMIT) {
- h->value += cmd->value;
- msg_info ("fuzzy hash was found, probability %d%%, set new value to %d", prob, h->value);
- return TRUE;
- }
- cur = g_list_next (cur);
- }
-
- return FALSE;
+ return check_hash_node (hashes[cmd->blocksize % BUCKETS], &s, cmd->value);
}
static gboolean
@@ -324,41 +364,59 @@ process_write_command (struct fuzzy_cmd *cmd)
return TRUE;
}
-static gboolean
-process_delete_command (struct fuzzy_cmd *cmd)
+static gboolean
+delete_hash (GQueue *hash, fuzzy_hash_t *s)
{
GList *cur, *tmp;
struct rspamd_fuzzy_node *h;
- fuzzy_hash_t s;
gboolean res = FALSE;
-
- if (!bloom_check (bf, cmd->hash)) {
- return FALSE;
- }
-
- memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
- s.block_size = cmd->blocksize;
- cur = hashes[cmd->blocksize % BUCKETS]->head;
+
+ cur = hash->head;
/* XXX: too slow way */
while (cur) {
h = cur->data;
- if (fuzzy_compare_hashes (&h->h, &s) > LEV_LIMIT) {
+ if (fuzzy_compare_hashes (&h->h, s) > LEV_LIMIT) {
g_free (h);
tmp = cur;
cur = g_list_next (cur);
- g_queue_delete_link (hashes[cmd->blocksize % BUCKETS], tmp);
- bloom_del (bf, cmd->hash);
+ g_queue_delete_link (hash, tmp);
+ bloom_del (bf, s->hash_pipe);
msg_info ("fuzzy hash was successfully deleted");
server_stat->fuzzy_hashes --;
- res = TRUE;
mods++;
+ res = TRUE;
continue;
}
cur = g_list_next (cur);
}
return res;
+
+}
+
+static gboolean
+process_delete_command (struct fuzzy_cmd *cmd)
+{
+ fuzzy_hash_t s;
+ gboolean res = FALSE;
+
+ if (!bloom_check (bf, cmd->hash)) {
+ return FALSE;
+ }
+
+ memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
+ s.block_size = cmd->blocksize;
+
+ res = delete_hash (frequent, &s);
+ if (!res) {
+ res = delete_hash (hashes[cmd->blocksize % BUCKETS], &s);
+ }
+ else {
+ (void)delete_hash (hashes[cmd->blocksize % BUCKETS], &s);
+ }
+
+ return res;
}
#define CMD_PROCESS(x) \