[Rework] Use a special structure for stats tokens

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 14 Feb 2017 13:01:08 +0000 (13:01 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 14 Feb 2017 13:37:18 +0000 (13:37 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Feb 2017 13:01:08 +0000 (13:01 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Feb 2017 13:37:18 +0000 (13:37 +0000)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 99384e8a2376b741e600a8ad76cf85abb3f85bba..c84b63360c2bd30cf1f8917f45c9e18e7d39f825 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -204,7 +204,7 @@ rspamd_extract_words (struct rspamd_task *task,
  #ifdef WITH_SNOWBALL
         struct sb_stemmer *stem = NULL;
  #endif
-       rspamd_ftok_t *w;
+       rspamd_stat_token_t *w;
         gchar *temp_word;
         const guchar *r;
         guint i, nlen;
@@ -231,7 +231,7 @@ rspamd_extract_words (struct rspamd_task *task,
                 for (i = 0; i < part->normalized_words->len; i ++) {
                         guint64 h;
  
-                       w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
+                       w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
                         r = NULL;
  #ifdef WITH_SNOWBALL
                         if (stem) {
@@ -239,7 +239,7 @@ rspamd_extract_words (struct rspamd_task *task,
                         }
  #endif
  
-                       if (w->len > 0 && !(w->len == 6 && memcmp (w->begin, "!!EX!!", 6) == 0)) {
+                       if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
                                 if (r != NULL) {
                                         nlen = strlen (r);
                                         nlen = MIN (nlen, w->len);
@@ -268,7 +268,8 @@ rspamd_extract_words (struct rspamd_task *task,
                                  * We use static hash seed if we would want to use that in shingles
                                  * computation in future
                                  */
-                               h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+                               h = rspamd_cryptobox_fast_hash_specific (
+                                               RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
                                                 w->begin, w->len, words_hash_seed);
                                 g_array_append_val (part->normalized_hashes, h);
                         }
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h

index 3c42e86229333e14cf0ee8a2d6d4780e25e90f83..6c2604e8952b768bd8f20ef03a167013d63de26f 100644 (file)
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -26,6 +26,18 @@
   * High level statistics API
   */
  
+#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1 << 0)
+#define RSPAMD_STAT_TOKEN_FLAG_META (1 << 1)
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2)
+#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3)
+#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4)
+
+typedef struct rspamd_stat_token_s {
+       const gchar *begin;
+       gsize len;
+       guint flags;
+} rspamd_stat_token_t;
+
  /**
   * The results of statistics processing:
   * - error
diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h

index 8f06736bf2edf12faf042cce9c643aaec56bd212..36ab6a6970779ba531758291e6182fd272e81a50 100644 (file)
--- a/src/libstat/stat_internal.h
+++ b/src/libstat/stat_internal.h
@@ -56,6 +56,7 @@ typedef struct token_node_s {
         guchar data[RSPAMD_MAX_TOKEN_LEN];
         guint window_idx;
         guint datalen;
+       guint flags;
         gdouble values[];
  } rspamd_token_t;
  
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index 2b87fffc62ee61561511f9f50ab01c71516b6b51..00b26ee2e3072b03d71a401140cfaa313d2fc2fb 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -38,9 +38,10 @@ rspamd_stat_tokenize_header (struct rspamd_task *task,
         struct rspamd_mime_header *cur;
         GPtrArray *hdrs;
         guint i;
-       rspamd_ftok_t str;
+       rspamd_stat_token_t str;
  
         hdrs = g_hash_table_lookup (task->raw_headers, name);
+       str.flags = RSPAMD_STAT_TOKEN_FLAG_META;
  
         if (hdrs != NULL) {
  
@@ -75,12 +76,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
         struct rspamd_mime_text_part *tp;
         GList *cur;
         GArray *ar;
-       rspamd_ftok_t elt;
+       rspamd_stat_token_t elt;
         guint i;
         gchar tmpbuf[128];
         lua_State *L = task->cfg->lua_state;
  
         ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
+       elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
  
         /* Insert images */
         for (i = 0; i < task->parts->len; i ++) {
@@ -171,6 +173,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
  
         /* Use global metatokens from lua */
         lua_getglobal (L, "rspamd_gen_metatokens");
+       elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META;
  
         if (lua_type (L, -1) == LUA_TFUNCTION) {
                 struct rspamd_task **ptask;
@@ -227,6 +230,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                 struct rspamd_task *task)
  {
         struct rspamd_mime_text_part *part;
+       rspamd_stat_token_t *tok;
         GArray *words;
         gchar *sub = NULL;
         guint i, reserved_len = 0;
@@ -272,6 +276,12 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                 words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE,
                                 NULL);
                 if (words != NULL) {
+
+                       for (i = 0; i < words->len; i ++) {
+                               tok = &g_array_index (words, rspamd_stat_token_t, i);
+                               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+                       }
+
                         st_ctx->tokenizer->tokenize_func (st_ctx,
                                         task->task_pool,
                                         words,
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c

index c2e050f2332af28d136984eac6c9afbf606399a2..6c8ac354b624b4aaffb08f0cffbed8c3059578b4 100644 (file)
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -264,12 +264,12 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
                 GPtrArray *result)
  {
         rspamd_token_t *new_tok = NULL;
-       rspamd_ftok_t *token;
+       rspamd_stat_token_t *token;
         struct rspamd_osb_tokenizer_config *osb_cf;
         guint64 *hashpipe, cur, seed;
         guint32 h1, h2;
         gsize token_size;
-       guint processed = 0, i, w, window_size;
+       guint processed = 0, i, w, window_size, token_flags = 0;
  
         if (words == NULL) {
                 return FALSE;
@@ -292,10 +292,15 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
         g_assert (token_size > 0);
  
         for (w = 0; w < words->len; w ++) {
-               token = &g_array_index (words, rspamd_ftok_t, w);
+               token = &g_array_index (words, rspamd_stat_token_t, w);
+               token_flags = token->flags;
  
                 if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
-                       cur = rspamd_fstrhash_lc (token, is_utf);
+                       rspamd_ftok_t ftok;
+
+                       ftok.begin = token->begin;
+                       ftok.len = token->len;
+                       cur = rspamd_fstrhash_lc (&ftok, is_utf);
                 }
                 else {
                         /* We know that the words are normalized */
@@ -316,6 +321,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
  #define ADD_TOKEN do {\
      new_tok = rspamd_mempool_alloc0 (pool, token_size); \
      new_tok->datalen = sizeof (gint64); \
+    new_tok->flags = token_flags; \
      if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
          h1 = ((guint32)hashpipe[0]) * primes[0] + \
              ((guint32)hashpipe[i]) * primes[i << 1]; \
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index 6eab11f98ecbf6bc49ee79e28cffb40a5d19c785..72f7a6bb27830fbf7799e1184c5a21c961008b22 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -22,8 +22,8 @@
  #include "stat_internal.h"
  #include "../../../contrib/mumhash/mum.h"
  
-typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
-               rspamd_ftok_t * token,
+typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
+               rspamd_stat_token_t * token,
                 GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
  
  const gchar t_delimiters[255] = {
@@ -69,8 +69,8 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
  
  /* Get next word from specified f_str_t buf */
  static gboolean
-rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
-               gchar const **cur, rspamd_ftok_t * token,
+rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
+               gchar const **cur, rspamd_stat_token_t * token,
                 GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
  {
         gsize remain, pos;
@@ -92,6 +92,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
                         if (ex->pos == 0) {
                                 token->begin = buf->begin + ex->len;
                                 token->len = ex->len;
+                               token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
                         }
                         else {
                                 token->begin = buf->begin;
@@ -155,14 +156,16 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
                 }
         }
  
+       token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+
         *cur = p;
  
         return TRUE;
  }
  
  static gboolean
-rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
-               gchar const **cur, rspamd_ftok_t * token,
+rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
+               gchar const **cur, rspamd_stat_token_t * token,
                 GList **exceptions, gboolean is_utf, gsize *rl,
                 gboolean check_signature)
  {
@@ -219,6 +222,7 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
                                 if (ex->type == RSPAMD_EXCEPTION_URL) {
                                         token->begin = "!!EX!!";
                                         token->len = sizeof ("!!EX!!") - 1;
+                                       token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
                                         processed = token->len;
                                 }
                                 state = skip_exception;
@@ -240,9 +244,11 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
                         break;
                 case feed_token:
                         if (ex != NULL && p - buf->begin == (gint)ex->pos) {
+                               token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
                                 goto set_token;
                         }
                         else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
+                               token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
                                 goto set_token;
                         }
                         processed ++;
@@ -288,7 +294,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                 struct rspamd_config *cfg, GList *exceptions, gboolean compat,
                 guint64 *hash)
  {
-       rspamd_ftok_t token, buf;
+       rspamd_stat_token_t token, buf;
         const gchar *pos = NULL;
         gsize l;
         GArray *res;
@@ -322,7 +328,8 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                 initial_size = word_decay * 2;
         }
  
-       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
+       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
+                       initial_size);
  
         while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
                 if (l == 0 || (min_len > 0 && l < min_len) ||
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h

index 70ff7560cdb75de79462417dd51fc6007d945bf3..530eb40a04799bc6f7f6136b56837cea3f0ab912 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -29,7 +29,7 @@ struct rspamd_stat_tokenizer {
  gint token_node_compare_func (gconstpointer a, gconstpointer b);
  
  
-/* Tokenize text into array of words (rspamd_ftok_t type) */
+/* Tokenize text into array of words (rspamd_stat_token_t type) */
  GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                 struct rspamd_config *cfg, GList *exceptions, gboolean compat,
                 guint64 *hash);
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c

index 5bd59a32dcf6132d6693c01c610efadb77188eb5..80baf8b34ad88de30e72f34c48a0bbe8dcd3b08d 100644 (file)
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -895,7 +895,7 @@ lua_util_tokenize_text (lua_State *L)
         struct rspamd_lua_text *t;
         struct rspamd_process_exception *ex;
         GArray *res;
-       rspamd_ftok_t *w;
+       rspamd_stat_token_t *w;
         gboolean compat = FALSE;
  
         if (lua_type (L, 1) == LUA_TSTRING) {
@@ -959,7 +959,7 @@ lua_util_tokenize_text (lua_State *L)
                 lua_createtable (L, res->len, 0);
  
                 for (i = 0; i < res->len; i ++) {
-                       w = &g_array_index (res, rspamd_ftok_t, i);
+                       w = &g_array_index (res, rspamd_stat_token_t, i);
                         lua_pushlstring (L, w->begin, w->len);
                         lua_rawseti (L, -2, i + 1);
                 }
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c

index 3b5a0f717773c1049e5a945866e41be7d3587d8c..172a69261e2900c5700e370808e97e883f31ba90 100644 (file)
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -25,6 +25,7 @@
  #include "config.h"
  #include "libmime/message.h"
  #include "rspamd.h"
+#include "libstat/stat_api.h"
  
  #define DEFAULT_SYMBOL "R_MIXED_CHARSET"
  #define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL"
@@ -163,7 +164,8 @@ chartable_module_reconfig (struct rspamd_config *cfg)
  }
  
  static gdouble
-rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w,
+rspamd_chartable_process_word_utf (struct rspamd_task *task,
+               rspamd_stat_token_t *w,
                 gboolean is_url)
  {
         const gchar *p, *end, *c;
@@ -258,7 +260,8 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w,
  }
  
  static gdouble
-rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w,
+rspamd_chartable_process_word_ascii (struct rspamd_task *task,
+               rspamd_stat_token_t *w,
                 gboolean is_url)
  {
         const guchar *p, *end, *c;
@@ -343,7 +346,7 @@ static void
  rspamd_chartable_process_part (struct rspamd_task *task,
                 struct rspamd_mime_text_part *part)
  {
-       rspamd_ftok_t *w;
+       rspamd_stat_token_t *w;
         guint i;
         gdouble cur_score = 0.0;
  
@@ -353,9 +356,9 @@ rspamd_chartable_process_part (struct rspamd_task *task,
         }
  
         for (i = 0; i < part->normalized_words->len; i++) {
-               w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
+               w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
  
-               if (w->len > 0) {
+               if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
  
                         if (IS_PART_UTF (part)) {
                                 cur_score += rspamd_chartable_process_word_utf (task, w, FALSE);
@@ -397,7 +400,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused)
         struct rspamd_url *u;
         GHashTableIter it;
         gpointer k, v;
-       rspamd_ftok_t w;
+       rspamd_stat_token_t w;
         gdouble cur_score = 0.0;
  
         g_hash_table_iter_init (&it, task->urls);
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c

index 92930b94827146cc7302f458c2b7cc7cef2ce6d2..1804e864870b9cb146e3e576f7d947c2ea694325 100644 (file)
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -43,6 +43,7 @@
  #include "lua/lua_common.h"
  #include "unix-std.h"
  #include "libutil/http_private.h"
+#include "libstat/stat_api.h"
  #include <math.h>
  
  #define DEFAULT_SYMBOL "R_FUZZY_HASH"
@@ -1266,7 +1267,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
         struct rspamd_shingle *sh;
         guint i;
         rspamd_cryptobox_hash_state_t st;
-       rspamd_ftok_t *word;
+       rspamd_stat_token_t *word;
         GArray *words;
         struct fuzzy_cmd_io *io;
  
@@ -1289,7 +1290,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
                 words = fuzzy_preprocess_words (part, pool);
  
                 for (i = 0; i < words->len; i ++) {
-                       word = &g_array_index (words, rspamd_ftok_t, i);
+                       word = &g_array_index (words, rspamd_stat_token_t, i);
                         rspamd_cryptobox_hash_update (&st, word->begin, word->len);
                 }
                 rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 14 Feb 2017 13:01:08 +0000 (13:01 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 14 Feb 2017 13:37:18 +0000 (13:37 +0000)
src/libmime/message.c		patch | blob | history
src/libstat/stat_api.h		patch | blob | history
src/libstat/stat_internal.h		patch | blob | history
src/libstat/stat_process.c		patch | blob | history
src/libstat/tokenizers/osb.c		patch | blob | history
src/libstat/tokenizers/tokenizers.c		patch | blob | history
src/libstat/tokenizers/tokenizers.h		patch | blob | history
src/lua/lua_util.c		patch | blob | history
src/plugins/chartable.c		patch | blob | history
src/plugins/fuzzy_check.c		patch | blob | history