diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-02-14 13:01:08 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-02-14 13:37:18 +0000 |
commit | 4a17956f7b5b6268859445e1d2369abdb2965ae4 (patch) | |
tree | 7e2849b456ef4f11ae2bde3d3526fb0ecc741b04 /src/libstat | |
parent | da43e0ec3b059752e7d4d4d283e33d568aa110cf (diff) | |
download | rspamd-4a17956f7b5b6268859445e1d2369abdb2965ae4.tar.gz rspamd-4a17956f7b5b6268859445e1d2369abdb2965ae4.zip |
[Rework] Use a special structure for stats tokens
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/stat_api.h | 12 | ||||
-rw-r--r-- | src/libstat/stat_internal.h | 1 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 14 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 14 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 23 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 2 |
6 files changed, 51 insertions, 15 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 3c42e8622..6c2604e89 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -26,6 +26,18 @@ * High level statistics API */ +#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1 << 0) +#define RSPAMD_STAT_TOKEN_FLAG_META (1 << 1) +#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2) +#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3) +#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4) + +typedef struct rspamd_stat_token_s { + const gchar *begin; + gsize len; + guint flags; +} rspamd_stat_token_t; + /** * The results of statistics processing: * - error diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h index 8f06736bf..36ab6a697 100644 --- a/src/libstat/stat_internal.h +++ b/src/libstat/stat_internal.h @@ -56,6 +56,7 @@ typedef struct token_node_s { guchar data[RSPAMD_MAX_TOKEN_LEN]; guint window_idx; guint datalen; + guint flags; gdouble values[]; } rspamd_token_t; diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 2b87fffc6..00b26ee2e 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -38,9 +38,10 @@ rspamd_stat_tokenize_header (struct rspamd_task *task, struct rspamd_mime_header *cur; GPtrArray *hdrs; guint i; - rspamd_ftok_t str; + rspamd_stat_token_t str; hdrs = g_hash_table_lookup (task->raw_headers, name); + str.flags = RSPAMD_STAT_TOKEN_FLAG_META; if (hdrs != NULL) { @@ -75,12 +76,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, struct rspamd_mime_text_part *tp; GList *cur; GArray *ar; - rspamd_ftok_t elt; + rspamd_stat_token_t elt; guint i; gchar tmpbuf[128]; lua_State *L = task->cfg->lua_state; ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16); + elt.flags = RSPAMD_STAT_TOKEN_FLAG_META; /* Insert images */ for (i = 0; i < task->parts->len; i ++) { @@ -171,6 +173,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, /* Use global metatokens from lua */ lua_getglobal (L, "rspamd_gen_metatokens"); + elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META; if (lua_type (L, -1) == LUA_TFUNCTION) { struct rspamd_task **ptask; @@ -227,6 +230,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task) { struct rspamd_mime_text_part *part; + rspamd_stat_token_t *tok; GArray *words; gchar *sub = NULL; guint i, reserved_len = 0; @@ -272,6 +276,12 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE, NULL); if (words != NULL) { + + for (i = 0; i < words->len; i ++) { + tok = &g_array_index (words, rspamd_stat_token_t, i); + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; + } + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, words, diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index c2e050f23..6c8ac354b 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -264,12 +264,12 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, GPtrArray *result) { rspamd_token_t *new_tok = NULL; - rspamd_ftok_t *token; + rspamd_stat_token_t *token; struct rspamd_osb_tokenizer_config *osb_cf; guint64 *hashpipe, cur, seed; guint32 h1, h2; gsize token_size; - guint processed = 0, i, w, window_size; + guint processed = 0, i, w, window_size, token_flags = 0; if (words == NULL) { return FALSE; @@ -292,10 +292,15 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, g_assert (token_size > 0); for (w = 0; w < words->len; w ++) { - token = &g_array_index (words, rspamd_ftok_t, w); + token = &g_array_index (words, rspamd_stat_token_t, w); + token_flags = token->flags; if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { - cur = rspamd_fstrhash_lc (token, is_utf); + rspamd_ftok_t ftok; + + ftok.begin = token->begin; + ftok.len = token->len; + cur = rspamd_fstrhash_lc (&ftok, is_utf); } else { /* We know that the words are normalized */ @@ -316,6 +321,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, #define ADD_TOKEN do {\ new_tok = rspamd_mempool_alloc0 (pool, token_size); \ new_tok->datalen = sizeof (gint64); \ + new_tok->flags = token_flags; \ if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ h1 = ((guint32)hashpipe[0]) * primes[0] + \ ((guint32)hashpipe[i]) * primes[i << 1]; \ diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 6eab11f98..72f7a6bb2 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -22,8 +22,8 @@ #include "stat_internal.h" #include "../../../contrib/mumhash/mum.h" -typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, - rspamd_ftok_t * token, +typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, + rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature); const gchar t_delimiters[255] = { @@ -69,8 +69,8 @@ token_node_compare_func (gconstpointer a, gconstpointer b) /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, - gchar const **cur, rspamd_ftok_t * token, +rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, + gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused) { gsize remain, pos; @@ -92,6 +92,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, if (ex->pos == 0) { token->begin = buf->begin + ex->len; token->len = ex->len; + token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; } else { token->begin = buf->begin; @@ -155,14 +156,16 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, } } + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + *cur = p; return TRUE; } static gboolean -rspamd_tokenizer_get_word (rspamd_ftok_t * buf, - gchar const **cur, rspamd_ftok_t * token, +rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, + gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature) { @@ -219,6 +222,7 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, if (ex->type == RSPAMD_EXCEPTION_URL) { token->begin = "!!EX!!"; token->len = sizeof ("!!EX!!") - 1; + token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; processed = token->len; } state = skip_exception; @@ -240,9 +244,11 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, break; case feed_token: if (ex != NULL && p - buf->begin == (gint)ex->pos) { + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; } else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) { + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; } processed ++; @@ -288,7 +294,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, struct rspamd_config *cfg, GList *exceptions, gboolean compat, guint64 *hash) { - rspamd_ftok_t token, buf; + rspamd_stat_token_t token, buf; const gchar *pos = NULL; gsize l; GArray *res; @@ -322,7 +328,8 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, initial_size = word_decay * 2; } - res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); + res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), + initial_size); while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 70ff7560c..530eb40a0 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -29,7 +29,7 @@ struct rspamd_stat_tokenizer { gint token_node_compare_func (gconstpointer a, gconstpointer b); -/* Tokenize text into array of words (rspamd_ftok_t type) */ +/* Tokenize text into array of words (rspamd_stat_token_t type) */ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, struct rspamd_config *cfg, GList *exceptions, gboolean compat, guint64 *hash); |