diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-02-14 13:01:08 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-02-14 13:37:18 +0000 |
commit | 4a17956f7b5b6268859445e1d2369abdb2965ae4 (patch) | |
tree | 7e2849b456ef4f11ae2bde3d3526fb0ecc741b04 /src/libstat/tokenizers | |
parent | da43e0ec3b059752e7d4d4d283e33d568aa110cf (diff) | |
download | rspamd-4a17956f7b5b6268859445e1d2369abdb2965ae4.tar.gz rspamd-4a17956f7b5b6268859445e1d2369abdb2965ae4.zip |
[Rework] Use a special structure for stats tokens
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/osb.c | 14 |
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 23 |
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 2 |
3 files changed, 26 insertions, 13 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index c2e050f23..6c8ac354b 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -264,12 +264,12 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, GPtrArray *result) { rspamd_token_t *new_tok = NULL; - rspamd_ftok_t *token; + rspamd_stat_token_t *token; struct rspamd_osb_tokenizer_config *osb_cf; guint64 *hashpipe, cur, seed; guint32 h1, h2; gsize token_size; - guint processed = 0, i, w, window_size; + guint processed = 0, i, w, window_size, token_flags = 0; if (words == NULL) { return FALSE; @@ -292,10 +292,15 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, g_assert (token_size > 0); for (w = 0; w < words->len; w ++) { - token = &g_array_index (words, rspamd_ftok_t, w); + token = &g_array_index (words, rspamd_stat_token_t, w); + token_flags = token->flags; if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { - cur = rspamd_fstrhash_lc (token, is_utf); + rspamd_ftok_t ftok; + + ftok.begin = token->begin; + ftok.len = token->len; + cur = rspamd_fstrhash_lc (&ftok, is_utf); } else { /* We know that the words are normalized */ @@ -316,6 +321,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, #define ADD_TOKEN do {\ new_tok = rspamd_mempool_alloc0 (pool, token_size); \ new_tok->datalen = sizeof (gint64); \ + new_tok->flags = token_flags; \ if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ h1 = ((guint32)hashpipe[0]) * primes[0] + \ ((guint32)hashpipe[i]) * primes[i << 1]; \ diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 6eab11f98..72f7a6bb2 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -22,8 +22,8 @@ #include "stat_internal.h" #include "../../../contrib/mumhash/mum.h" -typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, - rspamd_ftok_t * token, +typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, + 
rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature); const gchar t_delimiters[255] = { @@ -69,8 +69,8 @@ token_node_compare_func (gconstpointer a, gconstpointer b) /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, - gchar const **cur, rspamd_ftok_t * token, +rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, + gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused) { gsize remain, pos; @@ -92,6 +92,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, if (ex->pos == 0) { token->begin = buf->begin + ex->len; token->len = ex->len; + token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; } else { token->begin = buf->begin; @@ -155,14 +156,16 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, } } + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + *cur = p; return TRUE; } static gboolean -rspamd_tokenizer_get_word (rspamd_ftok_t * buf, - gchar const **cur, rspamd_ftok_t * token, +rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, + gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature) { @@ -219,6 +222,7 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, if (ex->type == RSPAMD_EXCEPTION_URL) { token->begin = "!!EX!!"; token->len = sizeof ("!!EX!!") - 1; + token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; processed = token->len; } state = skip_exception; @@ -240,9 +244,11 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, break; case feed_token: if (ex != NULL && p - buf->begin == (gint)ex->pos) { + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; } else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) { + token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; } processed ++; @@ -288,7 +294,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, struct rspamd_config *cfg, GList 
*exceptions, gboolean compat, guint64 *hash) { - rspamd_ftok_t token, buf; + rspamd_stat_token_t token, buf; const gchar *pos = NULL; gsize l; GArray *res; @@ -322,7 +328,8 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, initial_size = word_decay * 2; } - res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); + res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), + initial_size); while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 70ff7560c..530eb40a0 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -29,7 +29,7 @@ struct rspamd_stat_tokenizer { gint token_node_compare_func (gconstpointer a, gconstpointer b); -/* Tokenize text into array of words (rspamd_ftok_t type) */ +/* Tokenize text into array of words (rspamd_stat_token_t type) */ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, struct rspamd_config *cfg, GList *exceptions, gboolean compat, guint64 *hash); |