From 7b44f3af0b2b7674c4ee14d386022b8d96e4ceb8 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 11 May 2017 14:29:18 +0100 Subject: [PATCH] [Rework] Use LFU algorithm in LRU cache Avoid usage of heap as its updates are not cheap. Use LFU algorithm described here: http://antirez.com/news/109 --- src/fuzzy_storage.c | 6 +- src/libutil/hash.c | 256 +++++++++++++++++++++++++++++++++++++------- src/libutil/hash.h | 20 ++-- 3 files changed, 230 insertions(+), 52 deletions(-) diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c index 50bc864bc..99b98ceff 100644 --- a/src/fuzzy_storage.c +++ b/src/fuzzy_storage.c @@ -1976,7 +1976,8 @@ rspamd_fuzzy_stat_to_ucl (struct rspamd_fuzzy_storage_ctx *ctx, gboolean ip_stat while (g_hash_table_iter_next (&ip_it, &k, &v)) { lru_elt = v; - ip_cur = rspamd_fuzzy_storage_stat_key (lru_elt->data); + ip_cur = rspamd_fuzzy_storage_stat_key ( + rspamd_lru_hash_element_data (lru_elt)); ucl_object_insert_key (ip_elt, ip_cur, rspamd_inet_address_to_string (k), 0, true); } @@ -2019,7 +2020,8 @@ rspamd_fuzzy_stat_to_ucl (struct rspamd_fuzzy_storage_ctx *ctx, gboolean ip_stat lru_elt = v; ucl_object_insert_key (ip_elt, - ucl_object_fromint (*(guint64 *)lru_elt->data), + ucl_object_fromint (*(guint64 *) + rspamd_lru_hash_element_data (lru_elt)), rspamd_inet_address_to_string (k), 0, true); } diff --git a/src/libutil/hash.c b/src/libutil/hash.c index f1279798a..09377f1e5 100644 --- a/src/libutil/hash.c +++ b/src/libutil/hash.c @@ -22,15 +22,32 @@ */ static const guint expire_aggressive_count = 10; +static const guint log_base = 10; +static const guint eviction_candidates = 16; +static const gdouble lfu_base_value = 5.0; struct rspamd_lru_hash_s { guint maxsize; + guint eviction_min_prio; + guint eviction_used; GDestroyNotify value_destroy; GDestroyNotify key_destroy; - struct rspamd_min_heap *heap; + struct rspamd_lru_element_s **eviction_pool; GHashTable *tbl; }; +struct rspamd_lru_element_s { + guint16 ttl; + guint16 last; + guint8 lg_usages; + guint eviction_pos; + gpointer data; + gpointer key; + rspamd_lru_hash_t *hash; +}; + +#define TIME_TO_TS(t) ((guint16)(((t) / 60) & 0xFFFFU)) + static void rspamd_lru_destroy_node (gpointer value) { @@ -48,17 +65,102 @@ rspamd_lru_destroy_node (gpointer value) } } -static inline guint -rspamd_lru_priority (guint ttl, guint usages) +static void +rspamd_lru_hash_remove_evicted (rspamd_lru_hash_t *hash, + rspamd_lru_element_t *elt) +{ + guint i; + + g_assert (elt->eviction_pos < eviction_candidates); + + memmove (&hash->eviction_pool[elt->eviction_pos], + &hash->eviction_pool[elt->eviction_pos + 1], + sizeof (rspamd_lru_element_t *) * + (eviction_candidates - elt->eviction_pos - 1)); + hash->eviction_used --; + + if (elt->lg_usages <= hash->eviction_min_prio) { + /* We also need to update min_prio */ + hash->eviction_min_prio = G_MAXUINT; + + for (i = 0; i < hash->eviction_used; i ++) { + if (hash->eviction_min_prio > hash->eviction_pool[i]->lg_usages) { + hash->eviction_min_prio = hash->eviction_pool[i]->lg_usages; + } + } + } +} + +static void +rspamd_lru_hash_update_counter (rspamd_lru_element_t *elt) +{ + guint8 counter = elt->lg_usages; + + if (counter != 255) { + double r, baseval, p; + + r = rspamd_random_double_fast (); + baseval = counter - lfu_base_value; + + if (baseval < 0) { + baseval = 0; + } + + p = 1.0 / (baseval * log_base + 1); + + if (r < p) { + elt->lg_usages ++; + } + } +} + +static inline void +rspamd_lru_hash_decrease_counter (rspamd_lru_element_t *elt, time_t now) +{ + if (now - elt->last > lfu_base_value) { + /* Penalise counters for outdated records */ + elt->lg_usages /= 2; + } +} + +static gboolean +rspamd_lru_hash_maybe_evict (rspamd_lru_hash_t *hash, + rspamd_lru_element_t *elt) { - /* 1 day */ - static const guint max_ttl = 3600 * 24; + guint i; + rspamd_lru_element_t *cur; + + if (elt->eviction_pos != -1) { + if (hash->eviction_used < eviction_candidates) { + /* There are free places in eviction pool */ + hash->eviction_pool[hash->eviction_used] = elt; + elt->eviction_pos = hash->eviction_used; + hash->eviction_used ++; + + if (hash->eviction_min_prio > elt->lg_usages) { + hash->eviction_min_prio = elt->lg_usages; + } - if (ttl > 0 && usages > 0) { - return G_MAXUINT / (MIN (max_ttl, ttl) * usages); + return TRUE; + } + else if (hash->eviction_min_prio > elt->lg_usages) { + /* Find any candidate that has higher usage count */ + for (i = 0; i < hash->eviction_used; i ++) { + cur = hash->eviction_pool[i]; + + if (cur->lg_usages > elt->lg_usages) { + cur->eviction_pos = -1; + elt->eviction_pos = i; + hash->eviction_pool[i] = elt; + hash->eviction_min_prio = elt->lg_usages; + + return TRUE; + } + } + } } - return G_MAXUINT; + return FALSE; } static rspamd_lru_element_t * @@ -73,16 +175,90 @@ rspamd_lru_create_node (rspamd_lru_hash_t *hash, node = g_slice_alloc (sizeof (rspamd_lru_element_t)); node->data = value; node->key = key; - node->ttl = ttl; - node->helt.pri = now; + node->ttl = TIME_TO_TS (ttl); + + if (node->ttl == 0) { + node->ttl = 1; + } + node->hash = hash; - node->usages = 1; - node->storage = now; - node->helt.pri = rspamd_lru_priority (ttl, 1); + node->lg_usages = lfu_base_value; + node->last = TIME_TO_TS (now); return node; } +static void +rspamd_lru_hash_remove_node (rspamd_lru_hash_t *hash, rspamd_lru_element_t *elt) +{ + if (elt->eviction_pos != -1) { + rspamd_lru_hash_remove_evicted (hash, elt); + } + + g_hash_table_remove (hash->tbl, elt); +} + +static rspamd_lru_element_t * +rspamd_lru_eviction_full_update (rspamd_lru_hash_t *hash, time_t now) +{ + GHashTableIter it; + gpointer k, v; + rspamd_lru_element_t *cur, *selected = NULL; + + g_hash_table_iter_init (&it, hash->tbl); + now = TIME_TO_TS (now); + + while (g_hash_table_iter_next (&it, &k, &v)) { + cur = v; + + rspamd_lru_hash_decrease_counter (cur, now); + + if (rspamd_lru_hash_maybe_evict (hash, cur)) { + + if (selected && cur->lg_usages < selected->lg_usages) { + selected = cur; + } + else if (selected == NULL) { + selected = cur; + } + } + } + + return selected; +} + +static void +rspamd_lru_hash_evict (rspamd_lru_hash_t *hash, time_t now) +{ + double r; + guint i; + rspamd_lru_element_t *elt = NULL; + + /* + * We either evict one node from the eviction list + * or, at some probability scan all table and update eviction + * list first + */ + + r = rspamd_random_double_fast (); + + if (r < ((double)eviction_candidates) / hash->maxsize) { + elt = rspamd_lru_eviction_full_update (hash, now); + } + else { + for (i = 0; i < hash->eviction_used; i ++) { + elt = hash->eviction_pool[i]; + + if (elt->lg_usages <= hash->eviction_min_prio) { + break; + } + } + } + + g_assert (elt != NULL); + rspamd_lru_hash_remove_node (hash, elt); +} + rspamd_lru_hash_t * rspamd_lru_hash_new_full ( gint maxsize, @@ -93,12 +269,18 @@ rspamd_lru_hash_new_full ( { rspamd_lru_hash_t *new; - new = g_slice_alloc (sizeof (rspamd_lru_hash_t)); + if (maxsize < eviction_candidates * 2) { + maxsize = eviction_candidates * 2; + } + + new = g_malloc0 (sizeof (rspamd_lru_hash_t)); new->tbl = g_hash_table_new_full (hf, cmpf, NULL, rspamd_lru_destroy_node); - new->heap = rspamd_min_heap_create (maxsize); + new->eviction_pool = g_malloc0 (sizeof (rspamd_lru_element_t *) * + eviction_candidates); new->maxsize = maxsize; new->value_destroy = value_destroy; new->key_destroy = key_destroy; + new->eviction_min_prio = G_MAXUINT; return new; } @@ -121,16 +303,19 @@ rspamd_lru_hash_lookup (rspamd_lru_hash_t *hash, gconstpointer key, time_t now) res = g_hash_table_lookup (hash->tbl, key); if (res != NULL) { + now = TIME_TO_TS(now); + if (res->ttl != 0) { - if (((guint)now) - res->storage > res->ttl) { - rspamd_min_heap_remove_elt (hash->heap, &res->helt); - g_hash_table_remove (hash->tbl, key); + if (now - res->last > res->ttl) { + rspamd_lru_hash_remove_node (hash, res); + return NULL; } } - rspamd_min_heap_update_elt (hash->heap, &res->helt, - rspamd_lru_priority (res->ttl, ++res->usages)); + res->last = MAX (res->last, now); + rspamd_lru_hash_update_counter (res); + rspamd_lru_hash_maybe_evict (hash, res); return res->data; } @@ -143,42 +328,29 @@ rspamd_lru_hash_insert (rspamd_lru_hash_t *hash, gpointer key, gpointer value, time_t now, guint ttl) { rspamd_lru_element_t *res; - guint i; res = g_hash_table_lookup (hash->tbl, key); if (res != NULL) { - rspamd_min_heap_remove_elt (hash->heap, &res->helt); - g_hash_table_remove (hash->tbl, key); + rspamd_lru_hash_remove_node (hash, res); } else { - if (hash->maxsize > 0 && - g_hash_table_size (hash->tbl) >= hash->maxsize) { - - for (i = 0; i < MIN (hash->maxsize, expire_aggressive_count); i ++) { - res = (rspamd_lru_element_t *)rspamd_min_heap_pop (hash->heap); - - if (res) { - g_hash_table_remove (hash->tbl, res->key); - } - else { - break; - } - } + if (g_hash_table_size (hash->tbl) >= hash->maxsize) { + rspamd_lru_hash_evict (hash, now); } } res = rspamd_lru_create_node (hash, key, value, now, ttl); g_hash_table_insert (hash->tbl, key, res); - rspamd_min_heap_push (hash->heap, &res->helt); + rspamd_lru_hash_maybe_evict (hash, res); } void rspamd_lru_hash_destroy (rspamd_lru_hash_t *hash) { - rspamd_min_heap_destroy (hash->heap); g_hash_table_unref (hash->tbl); - g_slice_free1 (sizeof (rspamd_lru_hash_t), hash); + g_free (hash->eviction_pool); + g_free (hash); } @@ -187,3 +359,9 @@ rspamd_lru_hash_get_htable (rspamd_lru_hash_t *hash) { return hash->tbl; } + +gpointer +rspamd_lru_hash_element_data (rspamd_lru_element_t *elt) +{ + return elt->data; +} \ No newline at end of file diff --git a/src/libutil/hash.h b/src/libutil/hash.h index 30c0c99ac..80bdf8ae3 100644 --- a/src/libutil/hash.h +++ b/src/libutil/hash.h @@ -8,20 +8,11 @@ #define RSPAMD_HASH_H #include "config.h" -#include "heap.h" struct rspamd_lru_hash_s; typedef struct rspamd_lru_hash_s rspamd_lru_hash_t; - -typedef struct rspamd_lru_element_s { - struct rspamd_min_heap_elt helt; - guint ttl; - guint usages; - time_t storage; - gpointer data; - gpointer key; - rspamd_lru_hash_t *hash; -} rspamd_lru_element_t; +struct rspamd_lru_element_s; +typedef struct rspamd_lru_element_s rspamd_lru_element_t; /** @@ -85,6 +76,13 @@ void rspamd_lru_hash_destroy (rspamd_lru_hash_t *hash); * Get hash table for this lru hash (use rspamd_lru_element_t as data) */ GHashTable *rspamd_lru_hash_get_htable (rspamd_lru_hash_t *hash); + +/** + * Get element's data + * @param elt + * @return + */ +gpointer rspamd_lru_hash_element_data (rspamd_lru_element_t *elt); #endif /* -- 2.39.5