From: Vsevolod Stakhov Date: Tue, 14 Feb 2017 13:01:08 +0000 (+0000) Subject: [Rework] Use a special structure for stats tokens X-Git-Tag: 1.5.0~106 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=4a17956f7b5b6268859445e1d2369abdb2965ae4;p=rspamd.git [Rework] Use a special structure for stats tokens --- diff --git a/src/libmime/message.c b/src/libmime/message.c index 99384e8a2..c84b63360 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -204,7 +204,7 @@ rspamd_extract_words (struct rspamd_task *task, #ifdef WITH_SNOWBALL struct sb_stemmer *stem = NULL; #endif - rspamd_ftok_t *w; + rspamd_stat_token_t *w; gchar *temp_word; const guchar *r; guint i, nlen; @@ -231,7 +231,7 @@ rspamd_extract_words (struct rspamd_task *task, for (i = 0; i < part->normalized_words->len; i ++) { guint64 h; - w = &g_array_index (part->normalized_words, rspamd_ftok_t, i); + w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); r = NULL; #ifdef WITH_SNOWBALL if (stem) { @@ -239,7 +239,7 @@ rspamd_extract_words (struct rspamd_task *task, } #endif - if (w->len > 0 && !(w->len == 6 && memcmp (w->begin, "!!EX!!", 6) == 0)) { + if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { if (r != NULL) { nlen = strlen (r); nlen = MIN (nlen, w->len); @@ -268,7 +268,8 @@ rspamd_extract_words (struct rspamd_task *task, * We use static hash seed if we would want to use that in shingles * computation in future */ - h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + h = rspamd_cryptobox_fast_hash_specific ( + RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, w->begin, w->len, words_hash_seed); g_array_append_val (part->normalized_hashes, h); } diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 3c42e8622..6c2604e89 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -26,6 +26,18 @@ * High level statistics API */ +#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1 << 0) +#define RSPAMD_STAT_TOKEN_FLAG_META (1 << 1) 
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2) +#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3) +#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4) + +typedef struct rspamd_stat_token_s { + const gchar *begin; + gsize len; + guint flags; +} rspamd_stat_token_t; + /** * The results of statistics processing: * - error diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h index 8f06736bf..36ab6a697 100644 --- a/src/libstat/stat_internal.h +++ b/src/libstat/stat_internal.h @@ -56,6 +56,7 @@ typedef struct token_node_s { guchar data[RSPAMD_MAX_TOKEN_LEN]; guint window_idx; guint datalen; + guint flags; gdouble values[]; } rspamd_token_t; diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 2b87fffc6..00b26ee2e 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -38,9 +38,10 @@ rspamd_stat_tokenize_header (struct rspamd_task *task, struct rspamd_mime_header *cur; GPtrArray *hdrs; guint i; - rspamd_ftok_t str; + rspamd_stat_token_t str; hdrs = g_hash_table_lookup (task->raw_headers, name); + str.flags = RSPAMD_STAT_TOKEN_FLAG_META; if (hdrs != NULL) { @@ -75,12 +76,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, struct rspamd_mime_text_part *tp; GList *cur; GArray *ar; - rspamd_ftok_t elt; + rspamd_stat_token_t elt; guint i; gchar tmpbuf[128]; lua_State *L = task->cfg->lua_state; ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16); + elt.flags = RSPAMD_STAT_TOKEN_FLAG_META; /* Insert images */ for (i = 0; i < task->parts->len; i ++) { @@ -171,6 +173,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, /* Use global metatokens from lua */ lua_getglobal (L, "rspamd_gen_metatokens"); + elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META; if (lua_type (L, -1) == LUA_TFUNCTION) { struct rspamd_task **ptask; @@ -227,6 +230,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task) { struct rspamd_mime_text_part *part; + 
rspamd_stat_token_t *tok; GArray *words; gchar *sub = NULL; guint i, reserved_len = 0; @@ -272,6 +276,12 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE, NULL); if (words != NULL) { + + for (i = 0; i < words->len; i ++) { + tok = &g_array_index (words, rspamd_stat_token_t, i); + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; + } + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, words, diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index c2e050f23..6c8ac354b 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -264,12 +264,12 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, GPtrArray *result) { rspamd_token_t *new_tok = NULL; - rspamd_ftok_t *token; + rspamd_stat_token_t *token; struct rspamd_osb_tokenizer_config *osb_cf; guint64 *hashpipe, cur, seed; guint32 h1, h2; gsize token_size; - guint processed = 0, i, w, window_size; + guint processed = 0, i, w, window_size, token_flags = 0; if (words == NULL) { return FALSE; @@ -292,10 +292,15 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, g_assert (token_size > 0); for (w = 0; w < words->len; w ++) { - token = &g_array_index (words, rspamd_ftok_t, w); + token = &g_array_index (words, rspamd_stat_token_t, w); + token_flags = token->flags; if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { - cur = rspamd_fstrhash_lc (token, is_utf); + rspamd_ftok_t ftok; + + ftok.begin = token->begin; + ftok.len = token->len; + cur = rspamd_fstrhash_lc (&ftok, is_utf); } else { /* We know that the words are normalized */ @@ -316,6 +321,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, #define ADD_TOKEN do {\ new_tok = rspamd_mempool_alloc0 (pool, token_size); \ new_tok->datalen = sizeof (gint64); \ + new_tok->flags = token_flags; \ if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ h1 = ((guint32)hashpipe[0]) * primes[0] + \ ((guint32)hashpipe[i]) * primes[i << 1]; \ diff --git 
a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 6eab11f98..72f7a6bb2 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -22,8 +22,8 @@ #include "stat_internal.h" #include "../../../contrib/mumhash/mum.h" -typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, - rspamd_ftok_t * token, +typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, + rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature); const gchar t_delimiters[255] = { @@ -69,8 +69,8 @@ token_node_compare_func (gconstpointer a, gconstpointer b) /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, - gchar const **cur, rspamd_ftok_t * token, +rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, + gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused) { gsize remain, pos; @@ -92,6 +92,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, if (ex->pos == 0) { token->begin = buf->begin + ex->len; token->len = ex->len; + token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; } else { token->begin = buf->begin; @@ -155,14 +156,16 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, } } + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + *cur = p; return TRUE; } static gboolean -rspamd_tokenizer_get_word (rspamd_ftok_t * buf, - gchar const **cur, rspamd_ftok_t * token, +rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, + gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature) { @@ -219,6 +222,7 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, if (ex->type == RSPAMD_EXCEPTION_URL) { token->begin = "!!EX!!"; token->len = sizeof ("!!EX!!") - 1; + token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; processed = token->len; } state = 
skip_exception; @@ -240,9 +244,11 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, break; case feed_token: if (ex != NULL && p - buf->begin == (gint)ex->pos) { + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; } else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) { + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; } processed ++; @@ -288,7 +294,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, struct rspamd_config *cfg, GList *exceptions, gboolean compat, guint64 *hash) { - rspamd_ftok_t token, buf; + rspamd_stat_token_t token, buf; const gchar *pos = NULL; gsize l; GArray *res; @@ -322,7 +328,8 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, initial_size = word_decay * 2; } - res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); + res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), + initial_size); while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 70ff7560c..530eb40a0 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -29,7 +29,7 @@ struct rspamd_stat_tokenizer { gint token_node_compare_func (gconstpointer a, gconstpointer b); -/* Tokenize text into array of words (rspamd_ftok_t type) */ +/* Tokenize text into array of words (rspamd_stat_token_t type) */ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, struct rspamd_config *cfg, GList *exceptions, gboolean compat, guint64 *hash); diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 5bd59a32d..80baf8b34 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -895,7 +895,7 @@ lua_util_tokenize_text (lua_State *L) struct rspamd_lua_text *t; struct rspamd_process_exception *ex; GArray *res; - rspamd_ftok_t *w; + rspamd_stat_token_t *w; gboolean compat = FALSE; if (lua_type (L, 1) == 
LUA_TSTRING) { @@ -959,7 +959,7 @@ lua_util_tokenize_text (lua_State *L) lua_createtable (L, res->len, 0); for (i = 0; i < res->len; i ++) { - w = &g_array_index (res, rspamd_ftok_t, i); + w = &g_array_index (res, rspamd_stat_token_t, i); lua_pushlstring (L, w->begin, w->len); lua_rawseti (L, -2, i + 1); } diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index 3b5a0f717..172a69261 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -25,6 +25,7 @@ #include "config.h" #include "libmime/message.h" #include "rspamd.h" +#include "libstat/stat_api.h" #define DEFAULT_SYMBOL "R_MIXED_CHARSET" #define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL" @@ -163,7 +164,8 @@ chartable_module_reconfig (struct rspamd_config *cfg) } static gdouble -rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w, +rspamd_chartable_process_word_utf (struct rspamd_task *task, + rspamd_stat_token_t *w, gboolean is_url) { const gchar *p, *end, *c; @@ -258,7 +260,8 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w, } static gdouble -rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w, +rspamd_chartable_process_word_ascii (struct rspamd_task *task, + rspamd_stat_token_t *w, gboolean is_url) { const guchar *p, *end, *c; @@ -343,7 +346,7 @@ static void rspamd_chartable_process_part (struct rspamd_task *task, struct rspamd_mime_text_part *part) { - rspamd_ftok_t *w; + rspamd_stat_token_t *w; guint i; gdouble cur_score = 0.0; @@ -353,9 +356,9 @@ rspamd_chartable_process_part (struct rspamd_task *task, } for (i = 0; i < part->normalized_words->len; i++) { - w = &g_array_index (part->normalized_words, rspamd_ftok_t, i); + w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); - if (w->len > 0) { + if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { if (IS_PART_UTF (part)) { cur_score += rspamd_chartable_process_word_utf (task, w, FALSE); @@ -397,7 +400,7 @@ 
chartable_url_symbol_callback (struct rspamd_task *task, void *unused) struct rspamd_url *u; GHashTableIter it; gpointer k, v; - rspamd_ftok_t w; + rspamd_stat_token_t w; gdouble cur_score = 0.0; g_hash_table_iter_init (&it, task->urls); diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 92930b948..1804e8648 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -43,6 +43,7 @@ #include "lua/lua_common.h" #include "unix-std.h" #include "libutil/http_private.h" +#include "libstat/stat_api.h" #include <math.h> #define DEFAULT_SYMBOL "R_FUZZY_HASH" @@ -1266,7 +1267,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, struct rspamd_shingle *sh; guint i; rspamd_cryptobox_hash_state_t st; - rspamd_ftok_t *word; + rspamd_stat_token_t *word; GArray *words; struct fuzzy_cmd_io *io; @@ -1289,7 +1290,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, words = fuzzy_preprocess_words (part, pool); for (i = 0; i < words->len; i ++) { - word = &g_array_index (words, rspamd_ftok_t, i); + word = &g_array_index (words, rspamd_stat_token_t, i); rspamd_cryptobox_hash_update (&st, word->begin, word->len); } rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);