]> source.dussan.org Git - rspamd.git/commitdiff
* Try to speed up fuzzy storage
authorcebka@lenovo-laptop <cebka@lenovo-laptop>
Thu, 18 Mar 2010 16:43:55 +0000 (19:43 +0300)
committercebka@lenovo-laptop <cebka@lenovo-laptop>
Thu, 18 Mar 2010 16:43:55 +0000 (19:43 +0300)
CMakeLists.txt
config.h.in
src/fuzzy_storage.c

index 37630d51efbe50727297dd8fb02fadabc065e525..7e866f719ea3da3bddd1241a82aeb36936316893 100644 (file)
@@ -151,6 +151,8 @@ LINK_DIRECTORIES(${GMIME2_LIBRARY_DIRS})
 SET(CMAKE_REQUIRED_LIBRARIES m)
 SET(CMAKE_REQUIRED_INCLUDES sys/mman.h stdlib.h stdio.h unistd.h time.h sched.h)
 
+# Check for libevent
+
 FIND_LIBRARY(LIBEVENT_LIBRARY NAMES event PATHS /lib
                                                        /opt/lib
                                                /usr/lib
@@ -160,6 +162,33 @@ IF(NOT LIBEVENT_LIBRARY)
        MESSAGE(FATAL_ERROR "libevent is required for building rspamd")
 ENDIF(NOT LIBEVENT_LIBRARY)
 
+FIND_PATH(LIBEVENT_INCLUDE event.h PATHS       /opt/include
+                                                                               /usr/include
+                                                                               /usr/local/include
+                                                                               DOC "Path where the libevent header files can be found")
+
+GET_FILENAME_COMPONENT(LIBEVENT_PATH "${LIBEVENT_LIBRARY}" PATH)
+INCLUDE_DIRECTORIES("${LIBEVENT_INCLUDE}")
+LINK_DIRECTORIES("${LIBEVENT_PATH}")
+
+# Find libjudy
+#FIND_LIBRARY(LIBJUDY_LIBRARY NAMES judy PATHS /lib
+#                                                      /opt/lib
+#                                              /usr/lib
+#                                              /usr/local/lib
+#                 DOC "Path where the lijudy library can be found")
+#IF(LIBJUDY_LIBRARY)
+#      FIND_PATH(LIBJUDY_INCLUDE Judy.h PATHS  /opt/include
+#                                                                                      /usr/include
+#                                                                                      /usr/local/include
+#                                                                                      DOC "Path where the judy header files can be found")
+#
+#      GET_FILENAME_COMPONENT(LIBJUDY_PATH "${LIBJUDY_LIBRARY}" PATH)
+#      INCLUDE_DIRECTORIES("${LIBJUDY_INCLUDE}")
+#      LINK_DIRECTORIES("${LIBJUDY_PATH}")
+#      SET(WITH_JUDY 1)
+#ENDIF(LIBJUDY_LIBRARY)
+
 IF(ENABLE_PROFILING MATCHES "ON")
        SET(WITH_PROFILER 1)
        SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pg")
@@ -193,14 +222,6 @@ IF(ENABLE_GPERF_TOOLS MATCHES "ON")
 ENDIF(ENABLE_GPERF_TOOLS MATCHES "ON")
 
 
-FIND_PATH(LIBEVENT_INCLUDE event.h PATHS       /opt/include
-                                                                               /usr/include
-                                                                               /usr/local/include
-                                                                               DOC "Path where the libevent header files can be found")
-
-GET_FILENAME_COMPONENT(LIBEVENT_PATH "${LIBEVENT_LIBRARY}" PATH)
-INCLUDE_DIRECTORIES("${LIBEVENT_INCLUDE}")
-LINK_DIRECTORIES("${LIBEVENT_PATH}")
 
 FIND_LIBRARY(LIBUTIL_LIBRARY NAMES util PATHS  /lib
                                                        /opt/lib
@@ -532,6 +553,9 @@ TARGET_LINK_LIBRARIES(rspamd m)
 IF(LIBUTIL_LIBRARY)
        TARGET_LINK_LIBRARIES(rspamd util)
 ENDIF(LIBUTIL_LIBRARY)
+IF(LIBJUDY_LIBRARY)
+       TARGET_LINK_LIBRARIES(rspamd judy)
+ENDIF(LIBJUDY_LIBRARY)
 TARGET_LINK_LIBRARIES(rspamd rspamd_evdns)
 TARGET_LINK_LIBRARIES(rspamd event)
 TARGET_LINK_LIBRARIES(rspamd rspamd_json)
index cdb5680f1b1795dd73fdeba988eb360bacb656b7..99ea39c5bf5a5a28093e45e52a6e8882eb5c14a7 100644 (file)
 
 #cmakedefine WITH_PROFILER       1
 
+#cmakedefine WITH_JUDY           1
+
 #cmakedefine WITH_GPERF_TOOLS    1
 
 #cmakedefine HAVE_ASM_PAUSE      1
index f6ed7b0d6d185008e15216dbe7e84f07752f1ec0..4edc77534bb97a6101044524203f4d49694cd01b 100644 (file)
 #define BUCKETS 1024
 /* Number of insuccessfull bind retries */
 #define MAX_RETRIES 40
+/* Weight of hash to consider it frequent */
+#define FREQUENT_SCORE 100
 
 static GQueue                  *hashes[BUCKETS];
+static GQueue                  *frequent;
 static bloom_filter_t          *bf;
 
 /* Number of cache modifications */
@@ -90,6 +93,14 @@ sig_handler (int signo, siginfo_t *info, void *unused)
        }
 }
 
+static gint
+compare_nodes (gconstpointer a, gconstpointer b, gpointer unused)
+{
+       const struct rspamd_fuzzy_node *n1 = a, *n2 = b;
+
+       return n1->value - n2->value;
+}
+
 static void
 sync_cache (struct rspamd_worker *wrk)
 {
@@ -132,6 +143,14 @@ sync_cache (struct rspamd_worker *wrk)
        (void)lock_file (fd, FALSE);
 
        now = (uint64_t) time (NULL);
+       cur = frequent->head;
+       while (cur) {
+               node = cur->data;
+               if (write (fd, node, sizeof (struct rspamd_fuzzy_node)) == -1) {
+                       msg_err ("cannot write file %s: %s", filename, strerror (errno));
+               }
+               cur = g_list_next (cur);
+       }
        for (i = 0; i < BUCKETS; i++) {
                cur = hashes[i]->head;
                while (cur) {
@@ -204,6 +223,7 @@ read_hashes_file (struct rspamd_worker *wrk)
        for (i = 0; i < BUCKETS; i++) {
                hashes[i] = g_queue_new ();
        }
+       frequent = g_queue_new ();
 
        filename = g_hash_table_lookup (wrk->cf->params, "hashfile");
        if (filename == NULL) {
@@ -225,11 +245,22 @@ read_hashes_file (struct rspamd_worker *wrk)
                if (r != sizeof (struct rspamd_fuzzy_node)) {
                        break;
                }
-               g_queue_push_head (hashes[node->h.block_size % BUCKETS], node);
+               if (node->value > FREQUENT_SCORE) {
+                       g_queue_push_head (frequent, node);
+               }
+               else {
+                       g_queue_push_head (hashes[node->h.block_size % BUCKETS], node);
+               }
                bloom_add (bf, node->h.hash_pipe);
                server_stat->fuzzy_hashes ++;
        }
 
+       /* Sort everything */
+       g_queue_sort (frequent, compare_nodes, NULL);
+       for (i = 0; i < BUCKETS; i ++) {
+               g_queue_sort (hashes[i], compare_nodes, NULL);
+       }
+
        (void)unlock_file (fd, FALSE);
        close (fd);
 
@@ -244,60 +275,69 @@ read_hashes_file (struct rspamd_worker *wrk)
        return TRUE;
 }
 
-static                          int
-process_check_command (struct fuzzy_cmd *cmd)
+static inline int
+check_hash_node (GQueue *hash, fuzzy_hash_t *s, int update_value)
 {
        GList                          *cur;
        struct rspamd_fuzzy_node       *h;
-       fuzzy_hash_t                    s;
        int                             prob = 0;
-
-       if (!bloom_check (bf, cmd->hash)) {
-               return 0;
+       
+       cur = frequent->head;
+       while (cur) {
+               h = cur->data;
+               if ((prob = fuzzy_compare_hashes (&h->h, s)) > LEV_LIMIT) {
+                       msg_info ("fuzzy hash was found, probability %d%%", prob);
+                       return h->value;
+               }
+               cur = g_list_next (cur);
        }
 
-       memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
-       s.block_size = cmd->blocksize;
-       cur = hashes[cmd->blocksize % BUCKETS]->head;
-
-       /* XXX: too slow way */
+       cur = hash->head;
        while (cur) {
                h = cur->data;
-               if ((prob = fuzzy_compare_hashes (&h->h, &s)) > LEV_LIMIT) {
+               if ((prob = fuzzy_compare_hashes (&h->h, s)) > LEV_LIMIT) {
                        msg_info ("fuzzy hash was found, probability %d%%", prob);
+                       if (update_value) {
+                               h->value += update_value;
+                       }
+                       if (h->value > FREQUENT_SCORE) {
+                               g_queue_unlink (hash, cur);
+                               g_queue_push_head_link (frequent, cur);
+                       }
                        return h->value;
                }
                cur = g_list_next (cur);
        }
-       msg_debug ("fuzzy hash was NOT found, prob is %d%%", prob);
 
        return 0;
 }
 
+static                          int
+process_check_command (struct fuzzy_cmd *cmd)
+{
+       fuzzy_hash_t                    s;
+
+       if (!bloom_check (bf, cmd->hash)) {
+               return 0;
+       }
+
+       memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
+       s.block_size = cmd->blocksize;
+
+       return check_hash_node (hashes[cmd->blocksize % BUCKETS], &s, 0);
+}
+
 static                          gboolean
 update_hash (struct fuzzy_cmd *cmd)
 {
        GList                          *cur;
-       struct rspamd_fuzzy_node       *h;
        fuzzy_hash_t                    s;
-       int                             prob = 0;
 
        memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
        s.block_size = cmd->blocksize;
        cur = hashes[cmd->blocksize % BUCKETS]->head;
 
-       /* XXX: too slow way */
-       while (cur) {
-               h = cur->data;
-               if ((prob = fuzzy_compare_hashes (&h->h, &s)) > LEV_LIMIT) {
-                       h->value += cmd->value;
-                       msg_info ("fuzzy hash was found, probability %d%%, set new value to %d", prob, h->value);
-                       return TRUE;
-               }
-               cur = g_list_next (cur);
-       }
-
-       return FALSE;
+       return check_hash_node (hashes[cmd->blocksize % BUCKETS], &s, cmd->value);
 }
 
 static                          gboolean
@@ -324,40 +364,58 @@ process_write_command (struct fuzzy_cmd *cmd)
        return TRUE;
 }
 
-static                          gboolean
-process_delete_command (struct fuzzy_cmd *cmd)
+static gboolean
+delete_hash (GQueue *hash, fuzzy_hash_t *s)
 {
        GList                          *cur, *tmp;
        struct rspamd_fuzzy_node       *h;
-       fuzzy_hash_t                    s;
        gboolean                        res = FALSE;
-
-       if (!bloom_check (bf, cmd->hash)) {
-               return FALSE;
-       }
-
-       memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
-       s.block_size = cmd->blocksize;
-       cur = hashes[cmd->blocksize % BUCKETS]->head;
+       
+       cur = hash->head;
 
        /* XXX: too slow way */
        while (cur) {
                h = cur->data;
-               if (fuzzy_compare_hashes (&h->h, &s) > LEV_LIMIT) {
+               if (fuzzy_compare_hashes (&h->h, s) > LEV_LIMIT) {
                        g_free (h);
                        tmp = cur;
                        cur = g_list_next (cur);
-                       g_queue_delete_link (hashes[cmd->blocksize % BUCKETS], tmp);
-                       bloom_del (bf, cmd->hash);
+                       g_queue_delete_link (hash, tmp);
+                       bloom_del (bf, s->hash_pipe);
                        msg_info ("fuzzy hash was successfully deleted");
                        server_stat->fuzzy_hashes --;
-                       res = TRUE;
                        mods++;
+                       res = TRUE;
                        continue;
                }
                cur = g_list_next (cur);
        }
 
+       return res;
+
+}
+
+static                          gboolean
+process_delete_command (struct fuzzy_cmd *cmd)
+{
+       fuzzy_hash_t                    s;
+       gboolean                        res = FALSE;
+
+       if (!bloom_check (bf, cmd->hash)) {
+               return FALSE;
+       }
+
+       memcpy (s.hash_pipe, cmd->hash, sizeof (s.hash_pipe));
+       s.block_size = cmd->blocksize;
+
+       res = delete_hash (frequent, &s);
+       if (!res) {
+               res = delete_hash (hashes[cmd->blocksize % BUCKETS], &s);
+       }
+       else {
+               (void)delete_hash (hashes[cmd->blocksize % BUCKETS], &s);
+       }
+
        return res;
 }