diff options
Diffstat (limited to 'src/libutil')
-rw-r--r-- | src/libutil/expression.c | 123 | ||||
-rw-r--r-- | src/libutil/fstring.h | 14 | ||||
-rw-r--r-- | src/libutil/mem_pool.c | 7 | ||||
-rw-r--r-- | src/libutil/mem_pool.h | 2 | ||||
-rw-r--r-- | src/libutil/radix.c | 37 | ||||
-rw-r--r-- | src/libutil/radix.h | 18 | ||||
-rw-r--r-- | src/libutil/shingles.c | 27 | ||||
-rw-r--r-- | src/libutil/shingles.h | 5 |
8 files changed, 202 insertions, 31 deletions
diff --git a/src/libutil/expression.c b/src/libutil/expression.c index e36964e72..cac7594d6 100644 --- a/src/libutil/expression.c +++ b/src/libutil/expression.c @@ -80,6 +80,10 @@ struct rspamd_expr_process_data { /* != NULL if trace is collected */ GPtrArray *trace; rspamd_expression_process_cb process_closure; + /* Optimization thresholds for arithmetic operations */ + double threshold; + enum rspamd_expression_op threshold_op; + gboolean has_threshold; }; #define msg_debug_expression(...) rspamd_conditional_debug_fast(NULL, NULL, \ @@ -1198,10 +1202,55 @@ error_label: } /* + * Analyze AST node to determine if arithmetic operations can be optimized + * based on comparison context (e.g., A + B + C > 2 can stop at 3) + */ +static void +rspamd_ast_analyze_node(GNode *node, struct rspamd_expr_process_data *process_data) +{ + struct rspamd_expression_elt *elt; + GNode *child; + + if (!node || !node->data) { + return; + } + + elt = (struct rspamd_expression_elt *) node->data; + + /* Check if this is a comparison operation with arithmetic child */ + if (elt->type == ELT_OP && (elt->p.op.op_flags & RSPAMD_EXPRESSION_COMPARISON)) { + /* Look for arithmetic operations in children */ + child = node->children; + if (child && child->next) { + GNode *left = child; + GNode *right = child->next; + struct rspamd_expression_elt *left_elt = left->data; + struct rspamd_expression_elt *right_elt = right->data; + + /* Check if left child is arithmetic operation and right is limit */ + if (left_elt->type == ELT_OP && + (left_elt->p.op.op_flags & RSPAMD_EXPRESSION_ARITHMETIC) && + (left_elt->p.op.op_flags & RSPAMD_EXPRESSION_NARY) && + right_elt->type == ELT_LIMIT) { + + /* Set threshold for arithmetic optimization */ + process_data->has_threshold = TRUE; + process_data->threshold = right_elt->p.lim; + process_data->threshold_op = elt->p.op.op; + + msg_debug_expression_verbose("detected arithmetic optimization: %s %.1f", + rspamd_expr_op_to_str(elt->p.op.op), + right_elt->p.lim); + } + } + } +} + +/* * Node optimizer function: skip nodes that are not relevant */ static gboolean -rspamd_ast_node_done(struct rspamd_expression_elt *elt, double acc) +rspamd_ast_node_done(struct rspamd_expression_elt *elt, double acc, struct rspamd_expr_process_data *process_data) { gboolean ret = FALSE; @@ -1217,6 +1266,47 @@ rspamd_ast_node_done(struct rspamd_expression_elt *elt, double acc) case OP_OR: ret = acc != 0; break; + case OP_PLUS: + case OP_MULT: + /* Handle arithmetic operations with thresholds */ + if (process_data->has_threshold) { + switch (process_data->threshold_op) { + case OP_GT: + /* For A + B + C > 2, stop when acc > threshold */ + ret = acc > process_data->threshold; + break; + case OP_GE: + /* For A + B + C >= 2, stop when acc >= threshold */ + ret = acc >= process_data->threshold; + break; + case OP_LT: + /* For A + B + C < 2, stop when acc >= threshold (result will be false) */ + ret = acc >= process_data->threshold; + break; + case OP_LE: + /* For A + B + C <= 2, stop when acc > threshold (result will be false) */ + ret = acc > process_data->threshold; + break; + case OP_EQ: + /* For A + B + C == 2, stop when acc > threshold (result will be false) */ + ret = acc > process_data->threshold; + break; + case OP_NE: + /* For A + B + C != 2, stop when acc > threshold (result will be true) */ + ret = acc > process_data->threshold; + break; + default: + break; + } + + if (ret) { + msg_debug_expression_verbose("arithmetic optimization triggered: %s %.1f %s %.1f", + rspamd_expr_op_to_str(elt->p.op.op), acc, + rspamd_expr_op_to_str(process_data->threshold_op), + process_data->threshold); + } + } + break; default: break; } @@ -1340,9 +1430,28 @@ rspamd_ast_process_node(struct rspamd_expression *e, GNode *node, double val; gboolean calc_ticks = FALSE; __attribute__((unused)) const char *op_name = NULL; + gboolean saved_has_threshold = FALSE; + double saved_threshold = 0.0; + enum rspamd_expression_op saved_threshold_op = OP_INVALID; elt = node->data; + /* Analyze node for optimization opportunities */ + if (elt->type == ELT_OP && (elt->p.op.op_flags & RSPAMD_EXPRESSION_COMPARISON)) { + /* Save current threshold state */ + saved_has_threshold = process_data->has_threshold; + saved_threshold = process_data->threshold; + saved_threshold_op = process_data->threshold_op; + + /* Reset threshold state */ + process_data->has_threshold = FALSE; + process_data->threshold = 0.0; + process_data->threshold_op = OP_INVALID; + + /* Analyze for optimization opportunities */ + rspamd_ast_analyze_node(node, process_data); + } + switch (elt->type) { case ELT_ATOM: if (!(elt->flags & RSPAMD_EXPR_FLAG_PROCESSED)) { @@ -1400,7 +1509,7 @@ rspamd_ast_process_node(struct rspamd_expression *e, GNode *node, /* Check if we need to process further */ if (!(process_data->flags & RSPAMD_EXPRESSION_FLAG_NOOPT)) { - if (rspamd_ast_node_done(elt, acc)) { + if (rspamd_ast_node_done(elt, acc, process_data)) { msg_debug_expression_verbose("optimizer: done"); return acc; } @@ -1443,6 +1552,13 @@ rspamd_ast_process_node(struct rspamd_expression *e, GNode *node, break; } + /* Restore threshold state if it was saved */ + if (elt->type == ELT_OP && (elt->p.op.op_flags & RSPAMD_EXPRESSION_COMPARISON)) { + process_data->has_threshold = saved_has_threshold; + process_data->threshold = saved_threshold; + process_data->threshold_op = saved_threshold_op; + } + return acc; } @@ -1477,6 +1593,9 @@ rspamd_process_expression_closure(struct rspamd_expression *expr, pd.process_closure = cb; pd.flags = flags; pd.ud = runtime_ud; + pd.has_threshold = FALSE; + pd.threshold = 0.0; + pd.threshold_op = OP_INVALID; if (track) { pd.trace = g_ptr_array_sized_new(32); diff --git a/src/libutil/fstring.h b/src/libutil/fstring.h index 0792ab9fa..ca9f689c8 100644 --- a/src/libutil/fstring.h +++ b/src/libutil/fstring.h @@ -1,11 +1,11 @@ -/*- - * Copyright 2016 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -30,8 +30,8 @@ extern "C" { */ typedef struct f_str_s { - gsize len; - gsize allocated; + size_t len; + size_t allocated; char str[]; } rspamd_fstring_t; @@ -40,12 +40,12 @@ typedef struct f_str_s { #define RSPAMD_FSTRING_LIT(lit) rspamd_fstring_new_init((lit), sizeof(lit) - 1) typedef struct f_str_tok { - gsize len; + size_t len; const char *begin; } rspamd_ftok_t; typedef struct f_str_unicode_tok { - gsize len; /* in UChar32 */ + size_t len; /* in UChar32 */ const UChar32 *begin; } rspamd_ftok_unicode_t; diff --git a/src/libutil/mem_pool.c b/src/libutil/mem_pool.c index 3dc67bc5f..575b4e497 100644 --- a/src/libutil/mem_pool.c +++ b/src/libutil/mem_pool.c @@ -403,9 +403,10 @@ rspamd_mempool_new_(gsize size, const char *tag, int flags, const char *loc) /* Generate new uid */ uint64_t uid = rspamd_random_uint64_fast(); - rspamd_encode_hex_buf((unsigned char *) &uid, sizeof(uid), - new_pool->tag.uid, sizeof(new_pool->tag.uid) - 1); - new_pool->tag.uid[sizeof(new_pool->tag.uid) - 1] = '\0'; + G_STATIC_ASSERT(sizeof(new_pool->tag.uid) >= sizeof(uid) * 2 + 1); + int enc_len = rspamd_encode_hex_buf((unsigned char *) &uid, sizeof(uid), + new_pool->tag.uid, sizeof(new_pool->tag.uid) - 1); + new_pool->tag.uid[enc_len] = '\0'; mem_pool_stat->pools_allocated++; diff --git a/src/libutil/mem_pool.h b/src/libutil/mem_pool.h index 651b44661..00d1a2067 100644 --- a/src/libutil/mem_pool.h +++ b/src/libutil/mem_pool.h @@ -71,7 +71,7 @@ struct f_str_s; #endif #define MEMPOOL_TAG_LEN 16 -#define MEMPOOL_UID_LEN 16 +#define MEMPOOL_UID_LEN 32 /* All pointers are aligned as this variable */ #define MIN_MEM_ALIGNMENT G_MEM_ALIGN diff --git a/src/libutil/radix.c b/src/libutil/radix.c index 2cae8e34a..bdd722b49 100644 --- a/src/libutil/radix.c +++ b/src/libutil/radix.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ radix_find_compressed(radix_compressed_t *tree, const uint8_t *key, gsize keylen uintptr_t radix_insert_compressed(radix_compressed_t *tree, - uint8_t *key, gsize keylen, + const uint8_t *key, gsize keylen, gsize masklen, uintptr_t value) { @@ -128,6 +128,39 @@ radix_insert_compressed(radix_compressed_t *tree, return old; } +uintptr_t +radix_insert_compressed_addr(radix_compressed_t *tree, + const rspamd_inet_addr_t *addr, + uintptr_t value) +{ + const unsigned char *key; + unsigned int klen = 0; + unsigned char buf[16]; + + if (addr == NULL) { + return RADIX_NO_VALUE; + } + + key = rspamd_inet_address_get_hash_key(addr, &klen); + + if (key && klen) { + if (klen == 4) { + /* Map to ipv6 */ + memset(buf, 0, 10); + buf[10] = 0xffu; + buf[11] = 0xffu; + memcpy(buf + 12, key, klen); + + key = buf; + klen = sizeof(buf); + } + + return radix_insert_compressed(tree, key, klen, 0, value); + } + + return RADIX_NO_VALUE; +} + radix_compressed_t * radix_create_compressed(const char *tree_name) diff --git a/src/libutil/radix.h b/src/libutil/radix.h index c4fe96441..8c1224707 100644 --- a/src/libutil/radix.h +++ b/src/libutil/radix.h @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ #include "mem_pool.h" #include "util.h" -#define RADIX_NO_VALUE (uintptr_t) - 1 +#define RADIX_NO_VALUE (uintptr_t) -1 #ifdef __cplusplus extern "C" { @@ -39,11 +39,23 @@ typedef struct radix_tree_compressed radix_compressed_t; */ uintptr_t radix_insert_compressed(radix_compressed_t *tree, - uint8_t *key, gsize keylen, + const uint8_t *key, gsize keylen, gsize masklen, uintptr_t value); /** + * Insert new address to the radix trie (works for IPv4 or IPv6 addresses) + * @param tree radix trie + * @param addr address to insert + * @param value opaque value pointer + * @return previous value of the key or `RADIX_NO_VALUE` + */ +uintptr_t +radix_insert_compressed_addr(radix_compressed_t *tree, + const rspamd_inet_addr_t *addr, + uintptr_t value); + +/** * Find a key in a radix trie * @param tree radix trie * @param key key to find (bitstring) diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index 5fe110eb8..c69c42292 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -18,6 +18,7 @@ #include "cryptobox.h" #include "images.h" #include "libstat/stat_api.h" +#include "libserver/word.h" #define SHINGLES_WINDOW 3 #define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES @@ -112,7 +113,7 @@ rspamd_shingles_get_keys_cached(const unsigned char key[SHINGLES_KEY_SIZE]) } struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") - rspamd_shingles_from_text(GArray *input, + rspamd_shingles_from_text(rspamd_words_t *input, const unsigned char key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, @@ -123,12 +124,16 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") uint64_t **hashes; unsigned char **keys; rspamd_fstring_t *row; - rspamd_stat_token_t *word; + rspamd_word_t *word; uint64_t val; int i, j, k; gsize hlen, ilen = 0, beg = 0, widx = 0; enum rspamd_cryptobox_fast_hash_type ht; + if (!input || !input->a) { + return NULL; + } + if (pool != NULL) { res = rspamd_mempool_alloc(pool, sizeof(*res)); } @@ -138,10 +143,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") row = rspamd_fstring_sized_new(256); - for (i = 0; i < input->len; i++) { - word = &g_array_index(input, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*input); i++) { + word = &kv_A(*input, i); - if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) { + if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) { ilen++; } } @@ -162,10 +167,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") for (j = beg; j < i; j++) { word = NULL; - while (widx < input->len) { - word = &g_array_index(input, rspamd_stat_token_t, widx); + while (widx < kv_size(*input)) { + word = &kv_A(*input, widx); - if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) { + if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) { widx++; } else { @@ -237,10 +242,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") word = NULL; - while (widx < input->len) { - word = &g_array_index(input, rspamd_stat_token_t, widx); + while (widx < kv_size(*input)) { + word = &kv_A(*input, widx); - if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) { + if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) { widx++; } else { diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index fe6f16cf8..1ab2c6842 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -18,6 +18,7 @@ #include "config.h" #include "mem_pool.h" +#include "libserver/word.h" #define RSPAMD_SHINGLE_SIZE 32 @@ -48,14 +49,14 @@ typedef uint64_t (*rspamd_shingles_filter)(uint64_t *input, gsize count, /** * Generate shingles from the input of fixed size strings using lemmatizer * if needed - * @param input array of `rspamd_fstring_t` + * @param input kvec of `rspamd_word_t` * @param key secret key used to generate shingles * @param pool pool to allocate shingles array * @param filter hashes filtering function * @param filterd opaque data for filtering function * @return shingles array */ -struct rspamd_shingle *rspamd_shingles_from_text(GArray *input, +struct rspamd_shingle *rspamd_shingles_from_text(rspamd_words_t *input, const unsigned char key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, |