aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-05-12 13:50:10 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-05-12 13:50:10 +0100
commit: 274efffcbb21ea6fb32ee509b2449f69bcf98992 (patch)
tree: 373157af635e6810cc106da9e44d2d92ab604b30 /src
parent: c2e0f14cf3b208f75c6a92e8c532223d9ca8db22 (diff)
download: rspamd-274efffcbb21ea6fb32ee509b2449f69bcf98992.tar.gz
download: rspamd-274efffcbb21ea6fb32ee509b2449f69bcf98992.zip
[Feature] Rework and improve fuzzy storage
- Allow multiple algorithms in fuzzy rules
- Cache fuzzy requests to avoid expensive calculations for multiple fuzzy storages
- Simplify the request generation procedure
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/message.c 2
-rw-r--r-- src/plugins/fuzzy_check.c 125
2 files changed, 100 insertions, 27 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index d9e8dd18e..dbc9921d9 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1079,7 +1079,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
* We use static hash seed if we would want to use that in shingles
* computation in future
*/
- h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_MUMHASH,
+ h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
w->begin, w->len, words_hash_seed);
g_array_append_val (part->normalized_hashes, h);
}
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index 304ce3704..2596a1801 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -69,6 +69,8 @@ struct fuzzy_mime_type {
struct fuzzy_rule {
struct upstream_list *servers;
const gchar *symbol;
+ const gchar *algorithm_str;
+ enum rspamd_shingle_alg alg;
GHashTable *mappings;
GList *mime_types;
GList *fuzzy_headers;
@@ -363,6 +365,7 @@ fuzzy_parse_rule (struct rspamd_config *cfg, const ucl_object_t *obj, gint cb_id
rule = fuzzy_rule_new (fuzzy_module_ctx->default_symbol,
fuzzy_module_ctx->fuzzy_pool);
rule->learn_condition_cb = -1;
+ rule->alg = RSPAMD_SHINGLES_OLD;
if ((value = ucl_object_lookup (obj, "mime_types")) != NULL) {
it = NULL;
@@ -410,6 +413,46 @@ fuzzy_parse_rule (struct rspamd_config *cfg, const ucl_object_t *obj, gint cb_id
rule->skip_unknown = ucl_obj_toboolean (value);
}
+	/*
+	 * Optional "algorithm" key selects the shingles hash for this rule.
+	 * Names are matched case-insensitively; when the name is not recognised
+	 * rule->alg is left untouched (it was set to RSPAMD_SHINGLES_OLD when the
+	 * rule was created above) and a warning is logged.
+	 */
+	if ((value = ucl_object_lookup (obj, "algorithm")) != NULL) {
+		rule->algorithm_str = ucl_object_tostring (value);
+
+		if (rule->algorithm_str) {
+			if (g_ascii_strcasecmp (rule->algorithm_str, "old") == 0 ||
+				g_ascii_strcasecmp (rule->algorithm_str, "siphash") == 0) {
+				rule->alg = RSPAMD_SHINGLES_OLD;
+			}
+			else if (g_ascii_strcasecmp (rule->algorithm_str, "xxhash") == 0) {
+				rule->alg = RSPAMD_SHINGLES_XXHASH;
+			}
+			else if (g_ascii_strcasecmp (rule->algorithm_str, "mumhash") == 0) {
+				rule->alg = RSPAMD_SHINGLES_MUMHASH;
+			}
+			else if (g_ascii_strcasecmp (rule->algorithm_str, "fasthash") == 0 ||
+				g_ascii_strcasecmp (rule->algorithm_str, "fast") == 0) {
+				rule->alg = RSPAMD_SHINGLES_FAST;
+			}
+			else {
+				/*
+				 * The format contains a %s conversion, so the offending
+				 * name must be supplied; omitting it is undefined behaviour.
+				 */
+				msg_warn_config ("unknown algorithm: %s, use siphash by default",
+						rule->algorithm_str);
+			}
+		}
+	}
+
+ /* Set a consistent and short string name */
+ switch (rule->alg) {
+ case RSPAMD_SHINGLES_OLD:
+ rule->algorithm_str = "sip";
+ break;
+ case RSPAMD_SHINGLES_XXHASH:
+ rule->algorithm_str = "xx";
+ break;
+ case RSPAMD_SHINGLES_MUMHASH:
+ rule->algorithm_str = "mum";
+ break;
+ case RSPAMD_SHINGLES_FAST:
+ rule->algorithm_str = "fast";
+ break;
+ }
+
if ((value = ucl_object_lookup (obj, "servers")) != NULL) {
rule->servers = rspamd_upstreams_create (cfg->ups_ctx);
@@ -1023,6 +1066,30 @@ fuzzy_cmd_from_task_meta (struct fuzzy_rule *rule,
return io;
}
+/* Size of the buffer holding a cache variable name (see fuzzy_cmd_cache_key) */
+enum { FUZZY_CMD_CACHE_KEY_LEN = 64 };
+
+/*
+ * Build the mempool variable name under which the command generated for
+ * `part` on behalf of `rule` is cached.
+ *
+ * The cached command contains a digest keyed with rule->hash_key and shingles
+ * keyed with rule->shingles_key, so the name must differ between rules whose
+ * keys differ even if they share the same algorithm. The name is therefore
+ * made of the part address, the short algorithm name and a prefix of each key.
+ *
+ * NOTE(review): only the first sizeof (gint) bytes of each key take part in
+ * the name; this presumes the keys are hash-derived so that prefixes differ
+ * whenever the keys differ - confirm against the code that fills the keys.
+ * NOTE(review): rule->shingles_key->str is assumed to hold at least
+ * sizeof (gint) bytes (16 bytes of it are read when it is logged elsewhere
+ * in this file).
+ *
+ * @param rule rule whose algorithm name and keys go into the name
+ * @param part text part whose address goes into the name
+ * @param key output buffer
+ * @param keylen size of the output buffer in bytes
+ */
+static void
+fuzzy_cmd_cache_key (struct fuzzy_rule *rule,
+		struct mime_text_part *part,
+		gchar *key,
+		gsize keylen)
+{
+	gint sgl_tag = 0, hash_tag = 0;
+
+	memcpy (&sgl_tag, rule->shingles_key->str, sizeof (sgl_tag));
+	/* hash_key carries an explicit length, so never read past it */
+	memcpy (&hash_tag, rule->hash_key->str,
+			MIN (sizeof (hash_tag), rule->hash_key->len));
+
+	/* Separators keep adjacent numeric fields from running into each other */
+	rspamd_snprintf (key, keylen, "%p%s:%d:%d", part, rule->algorithm_str,
+			sgl_tag, hash_tag);
+}
+
+/*
+ * Look up a previously cached command for this (rule, part) pair.
+ *
+ * @return whatever rspamd_mempool_get_variable returns for the generated
+ * name (the caller treats a false value as "not cached")
+ */
+static void *
+fuzzy_cmd_get_cached (struct fuzzy_rule *rule,
+		rspamd_mempool_t *pool,
+		struct mime_text_part *part)
+{
+	gchar key[FUZZY_CMD_CACHE_KEY_LEN];
+
+	fuzzy_cmd_cache_key (rule, part, key, sizeof (key));
+
+	return rspamd_mempool_get_variable (pool, key);
+}
+
+/*
+ * Remember `data` as the command generated for this (rule, part) pair.
+ * No destructor is registered for the stored pointer (NULL is passed).
+ */
+static void
+fuzzy_cmd_set_cached (struct fuzzy_rule *rule,
+		rspamd_mempool_t *pool,
+		struct mime_text_part *part,
+		struct rspamd_fuzzy_encrypted_shingle_cmd *data)
+{
+	gchar key[FUZZY_CMD_CACHE_KEY_LEN];
+
+	fuzzy_cmd_cache_key (rule, part, key, sizeof (key));
+	/* Key is copied */
+	rspamd_mempool_set_variable (pool, key, data, NULL);
+}
+
/*
* Create fuzzy command from a text part
*/
@@ -1035,7 +1102,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
struct mime_text_part *part)
{
struct rspamd_fuzzy_shingle_cmd *shcmd;
- struct rspamd_fuzzy_encrypted_shingle_cmd *encshcmd;
+ struct rspamd_fuzzy_encrypted_shingle_cmd *encshcmd, *cached;
struct rspamd_shingle *sh;
guint i;
rspamd_cryptobox_hash_state_t st;
@@ -1043,41 +1110,47 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
GArray *words;
struct fuzzy_cmd_io *io;
- if (rule->peer_key) {
- encshcmd = rspamd_mempool_alloc0 (pool, sizeof (*encshcmd));
+ cached = fuzzy_cmd_get_cached (rule, pool, part);
+
+ if (cached) {
+ /* Copy cached */
+ encshcmd = rspamd_mempool_alloc (pool, sizeof (*encshcmd));
+ memcpy (encshcmd, cached, sizeof (*encshcmd));
shcmd = &encshcmd->cmd;
}
else {
- shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd));
- encshcmd = NULL;
- }
-
- /*
- * Generate hash from all words in the part
- */
- rspamd_cryptobox_hash_init (&st, rule->hash_key->str, rule->hash_key->len);
- words = fuzzy_preprocess_words (part, pool);
+ encshcmd = rspamd_mempool_alloc0 (pool, sizeof (*encshcmd));
+ shcmd = &encshcmd->cmd;
- for (i = 0; i < words->len; i ++) {
- word = &g_array_index (words, rspamd_ftok_t, i);
- rspamd_cryptobox_hash_update (&st, word->begin, word->len);
- }
- rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);
+ /*
+ * Generate hash from all words in the part
+ */
+ rspamd_cryptobox_hash_init (&st, rule->hash_key->str, rule->hash_key->len);
+ words = fuzzy_preprocess_words (part, pool);
- msg_debug_pool ("loading shingles with key %*xs", 16,
- rule->shingles_key->str);
- sh = rspamd_shingles_generate (words,
- rule->shingles_key->str, pool,
- rspamd_shingles_default_filter, NULL,
- RSPAMD_SHINGLES_OLD);
- if (sh != NULL) {
- memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
- shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+ for (i = 0; i < words->len; i ++) {
+ word = &g_array_index (words, rspamd_ftok_t, i);
+ rspamd_cryptobox_hash_update (&st, word->begin, word->len);
+ }
+ rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);
+
+ msg_debug_pool ("loading shingles of type %s with key %*xs",
+ rule->algorithm_str,
+ 16, rule->shingles_key->str);
+ sh = rspamd_shingles_generate (words,
+ rule->shingles_key->str, pool,
+ rspamd_shingles_default_filter, NULL,
+ rule->alg);
+ if (sh != NULL) {
+ memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
+ shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+ }
}
shcmd->basic.tag = ottery_rand_uint32 ();
shcmd->basic.cmd = c;
shcmd->basic.version = RSPAMD_FUZZY_VERSION;
+
if (c != FUZZY_CHECK) {
shcmd->basic.flag = flag;
shcmd->basic.value = weight;