]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Rework and improve fuzzy storage
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 May 2016 12:50:10 +0000 (13:50 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 May 2016 12:50:10 +0000 (13:50 +0100)
- Allow multiple algorithms in fuzzy rules
- Cache fuzzy requests to avoid expensive calculations for multiple
  fuzzy storages
- Simplify request generating procedure

src/libmime/message.c
src/plugins/fuzzy_check.c

index d9e8dd18e249f745504fee71aa28aa556683290d..dbc9921d92152f2c25d084f110280fc755132eba 100644 (file)
@@ -1079,7 +1079,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                                 * We use static hash seed if we would want to use that in shingles
                                 * computation in future
                                 */
-                               h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_MUMHASH,
+                               h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
                                                w->begin, w->len, words_hash_seed);
                                g_array_append_val (part->normalized_hashes, h);
                        }
index 304ce37041f0d5eb4c9a846296f37f8c90564894..2596a18019901ad1a63bb4a799d972ed7c834c22 100644 (file)
@@ -69,6 +69,8 @@ struct fuzzy_mime_type {
 struct fuzzy_rule {
        struct upstream_list *servers;
        const gchar *symbol;
+       const gchar *algorithm_str;
+       enum rspamd_shingle_alg alg;
        GHashTable *mappings;
        GList *mime_types;
        GList *fuzzy_headers;
@@ -363,6 +365,7 @@ fuzzy_parse_rule (struct rspamd_config *cfg, const ucl_object_t *obj, gint cb_id
        rule = fuzzy_rule_new (fuzzy_module_ctx->default_symbol,
                        fuzzy_module_ctx->fuzzy_pool);
        rule->learn_condition_cb = -1;
+       rule->alg = RSPAMD_SHINGLES_OLD;
 
        if ((value = ucl_object_lookup (obj, "mime_types")) != NULL) {
                it = NULL;
@@ -410,6 +413,46 @@ fuzzy_parse_rule (struct rspamd_config *cfg, const ucl_object_t *obj, gint cb_id
                rule->skip_unknown = ucl_obj_toboolean (value);
        }
 
+       if ((value = ucl_object_lookup (obj, "algorithm")) != NULL) {
+               rule->algorithm_str = ucl_object_tostring (value);
+
+               if (rule->algorithm_str) {
+                       if (g_ascii_strcasecmp (rule->algorithm_str, "old") == 0 ||
+                                       g_ascii_strcasecmp (rule->algorithm_str, "siphash") == 0) {
+                               rule->alg = RSPAMD_SHINGLES_OLD;
+                       }
+                       else if (g_ascii_strcasecmp (rule->algorithm_str, "xxhash") == 0) {
+                               rule->alg = RSPAMD_SHINGLES_XXHASH;
+                       }
+                       else if (g_ascii_strcasecmp (rule->algorithm_str, "mumhash") == 0) {
+                               rule->alg = RSPAMD_SHINGLES_MUMHASH;
+                       }
+                       else if (g_ascii_strcasecmp (rule->algorithm_str, "fasthash") == 0 ||
+                                       g_ascii_strcasecmp (rule->algorithm_str, "fast") == 0) {
+                               rule->alg = RSPAMD_SHINGLES_FAST;
+                       }
+                       else {
+                               msg_warn_config ("unknown algorithm: %s, use siphash by default");
+                       }
+               }
+       }
+
+       /* Set a consistent and short string name */
+       switch (rule->alg) {
+       case RSPAMD_SHINGLES_OLD:
+               rule->algorithm_str = "sip";
+               break;
+       case RSPAMD_SHINGLES_XXHASH:
+               rule->algorithm_str = "xx";
+               break;
+       case RSPAMD_SHINGLES_MUMHASH:
+               rule->algorithm_str = "mum";
+               break;
+       case RSPAMD_SHINGLES_FAST:
+               rule->algorithm_str = "fast";
+               break;
+       }
+
        if ((value = ucl_object_lookup (obj, "servers")) != NULL) {
                rule->servers = rspamd_upstreams_create (cfg->ups_ctx);
 
@@ -1023,6 +1066,30 @@ fuzzy_cmd_from_task_meta (struct fuzzy_rule *rule,
        return io;
 }
 
+static void *
+fuzzy_cmd_get_cached (struct fuzzy_rule *rule,
+               rspamd_mempool_t *pool,
+               struct mime_text_part *part)
+{
+       gchar key[32];
+
+       rspamd_snprintf (key, sizeof (key), "%p%s", part, rule->algorithm_str);
+       return rspamd_mempool_get_variable (pool, key);
+}
+
+static void
+fuzzy_cmd_set_cached (struct fuzzy_rule *rule,
+               rspamd_mempool_t *pool,
+               struct mime_text_part *part,
+               struct rspamd_fuzzy_encrypted_shingle_cmd *data)
+{
+       gchar key[32];
+
+       rspamd_snprintf (key, sizeof (key), "%p%s", part, rule->algorithm_str);
+       /* Key is copied */
+       rspamd_mempool_set_variable (pool, key, data, NULL);
+}
+
 /*
  * Create fuzzy command from a text part
  */
@@ -1035,7 +1102,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
                struct mime_text_part *part)
 {
        struct rspamd_fuzzy_shingle_cmd *shcmd;
-       struct rspamd_fuzzy_encrypted_shingle_cmd *encshcmd;
+       struct rspamd_fuzzy_encrypted_shingle_cmd *encshcmd, *cached;
        struct rspamd_shingle *sh;
        guint i;
        rspamd_cryptobox_hash_state_t st;
@@ -1043,41 +1110,47 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
        GArray *words;
        struct fuzzy_cmd_io *io;
 
-       if (rule->peer_key) {
-               encshcmd = rspamd_mempool_alloc0 (pool, sizeof (*encshcmd));
+       cached = fuzzy_cmd_get_cached (rule, pool, part);
+
+       if (cached) {
+               /* Copy cached */
+               encshcmd = rspamd_mempool_alloc (pool, sizeof (*encshcmd));
+               memcpy (encshcmd, cached, sizeof (*encshcmd));
                shcmd = &encshcmd->cmd;
        }
        else {
-               shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd));
-               encshcmd = NULL;
-       }
-
-       /*
-        * Generate hash from all words in the part
-        */
-       rspamd_cryptobox_hash_init (&st, rule->hash_key->str, rule->hash_key->len);
-       words = fuzzy_preprocess_words (part, pool);
+               encshcmd = rspamd_mempool_alloc0 (pool, sizeof (*encshcmd));
+               shcmd = &encshcmd->cmd;
 
-       for (i = 0; i < words->len; i ++) {
-               word = &g_array_index (words, rspamd_ftok_t, i);
-               rspamd_cryptobox_hash_update (&st, word->begin, word->len);
-       }
-       rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);
+               /*
+                * Generate hash from all words in the part
+                */
+               rspamd_cryptobox_hash_init (&st, rule->hash_key->str, rule->hash_key->len);
+               words = fuzzy_preprocess_words (part, pool);
 
-       msg_debug_pool ("loading shingles with key %*xs", 16,
-                       rule->shingles_key->str);
-       sh = rspamd_shingles_generate (words,
-                       rule->shingles_key->str, pool,
-                       rspamd_shingles_default_filter, NULL,
-                       RSPAMD_SHINGLES_OLD);
-       if (sh != NULL) {
-               memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
-               shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+               for (i = 0; i < words->len; i ++) {
+                       word = &g_array_index (words, rspamd_ftok_t, i);
+                       rspamd_cryptobox_hash_update (&st, word->begin, word->len);
+               }
+               rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);
+
+               msg_debug_pool ("loading shingles of type %s with key %*xs",
+                               rule->algorithm_str,
+                               16, rule->shingles_key->str);
+               sh = rspamd_shingles_generate (words,
+                               rule->shingles_key->str, pool,
+                               rspamd_shingles_default_filter, NULL,
+                               rule->alg);
+               if (sh != NULL) {
+                       memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
+                       shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+               }
        }
 
        shcmd->basic.tag = ottery_rand_uint32 ();
        shcmd->basic.cmd = c;
        shcmd->basic.version = RSPAMD_FUZZY_VERSION;
+
        if (c != FUZZY_CHECK) {
                shcmd->basic.flag = flag;
                shcmd->basic.value = weight;