@@ -204,7 +204,7 @@ rspamd_extract_words (struct rspamd_task *task, | |||
#ifdef WITH_SNOWBALL | |||
struct sb_stemmer *stem = NULL; | |||
#endif | |||
rspamd_ftok_t *w; | |||
rspamd_stat_token_t *w; | |||
gchar *temp_word; | |||
const guchar *r; | |||
guint i, nlen; | |||
@@ -231,7 +231,7 @@ rspamd_extract_words (struct rspamd_task *task, | |||
for (i = 0; i < part->normalized_words->len; i ++) { | |||
guint64 h; | |||
w = &g_array_index (part->normalized_words, rspamd_ftok_t, i); | |||
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); | |||
r = NULL; | |||
#ifdef WITH_SNOWBALL | |||
if (stem) { | |||
@@ -239,7 +239,7 @@ rspamd_extract_words (struct rspamd_task *task, | |||
} | |||
#endif | |||
if (w->len > 0 && !(w->len == 6 && memcmp (w->begin, "!!EX!!", 6) == 0)) { | |||
if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { | |||
if (r != NULL) { | |||
nlen = strlen (r); | |||
nlen = MIN (nlen, w->len); | |||
@@ -268,7 +268,8 @@ rspamd_extract_words (struct rspamd_task *task, | |||
* We use a static hash seed in case we want to use it for shingles | |||
* computation in the future | |||
*/ | |||
h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, | |||
h = rspamd_cryptobox_fast_hash_specific ( | |||
RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, | |||
w->begin, w->len, words_hash_seed); | |||
g_array_append_val (part->normalized_hashes, h); | |||
} |
@@ -26,6 +26,18 @@ | |||
* High level statistics API | |||
*/ | |||
/*
 * Bit flags stored in rspamd_stat_token_t.flags. Each flag occupies its own
 * bit, so several of them can be OR-ed together on a single token.
 */
/* Word token produced by the text tokenizers */
#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1 << 0) | |||
/* Metadata token (set on tokens built from headers and parts metadata) */
#define RSPAMD_STAT_TOKEN_FLAG_META (1 << 1) | |||
/* Added on top of META for the Lua `rspamd_gen_metatokens` stage */
#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2) | |||
/* Token emitted for a processing exception (e.g. the "!!EX!!" URL placeholder) */
#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3) | |||
/* Added to tokens obtained from the `sub` string -- presumably the subject; TODO confirm */
#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4) | |||
/*
 * A single token: a (begin, len) view over character data plus a set of
 * RSPAMD_STAT_TOKEN_FLAG_* bits describing where the token came from.
 * NOTE(review): callers pass begin/len explicitly; do not assume that
 * `begin` is NUL-terminated -- confirm against the tokenizer.
 */
typedef struct rspamd_stat_token_s { | |||
/* First byte of the token data */
const gchar *begin; | |||
/* Length of the token data */
gsize len; | |||
/* Combination of RSPAMD_STAT_TOKEN_FLAG_* values */
guint flags; | |||
} rspamd_stat_token_t; | |||
/** | |||
* The results of statistics processing: | |||
* - error |
@@ -56,6 +56,7 @@ typedef struct token_node_s { | |||
guchar data[RSPAMD_MAX_TOKEN_LEN]; | |||
guint window_idx; | |||
guint datalen; | |||
guint flags; | |||
gdouble values[]; | |||
} rspamd_token_t; | |||
@@ -38,9 +38,10 @@ rspamd_stat_tokenize_header (struct rspamd_task *task, | |||
struct rspamd_mime_header *cur; | |||
GPtrArray *hdrs; | |||
guint i; | |||
rspamd_ftok_t str; | |||
rspamd_stat_token_t str; | |||
hdrs = g_hash_table_lookup (task->raw_headers, name); | |||
str.flags = RSPAMD_STAT_TOKEN_FLAG_META; | |||
if (hdrs != NULL) { | |||
@@ -75,12 +76,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, | |||
struct rspamd_mime_text_part *tp; | |||
GList *cur; | |||
GArray *ar; | |||
rspamd_ftok_t elt; | |||
rspamd_stat_token_t elt; | |||
guint i; | |||
gchar tmpbuf[128]; | |||
lua_State *L = task->cfg->lua_state; | |||
ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16); | |||
elt.flags = RSPAMD_STAT_TOKEN_FLAG_META; | |||
/* Insert images */ | |||
for (i = 0; i < task->parts->len; i ++) { | |||
@@ -171,6 +173,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, | |||
/* Use global metatokens from lua */ | |||
lua_getglobal (L, "rspamd_gen_metatokens"); | |||
elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META; | |||
if (lua_type (L, -1) == LUA_TFUNCTION) { | |||
struct rspamd_task **ptask; | |||
@@ -227,6 +230,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, | |||
struct rspamd_task *task) | |||
{ | |||
struct rspamd_mime_text_part *part; | |||
rspamd_stat_token_t *tok; | |||
GArray *words; | |||
gchar *sub = NULL; | |||
guint i, reserved_len = 0; | |||
@@ -272,6 +276,12 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, | |||
words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE, | |||
NULL); | |||
if (words != NULL) { | |||
for (i = 0; i < words->len; i ++) { | |||
tok = &g_array_index (words, rspamd_stat_token_t, i); | |||
tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; | |||
} | |||
st_ctx->tokenizer->tokenize_func (st_ctx, | |||
task->task_pool, | |||
words, |
@@ -264,12 +264,12 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, | |||
GPtrArray *result) | |||
{ | |||
rspamd_token_t *new_tok = NULL; | |||
rspamd_ftok_t *token; | |||
rspamd_stat_token_t *token; | |||
struct rspamd_osb_tokenizer_config *osb_cf; | |||
guint64 *hashpipe, cur, seed; | |||
guint32 h1, h2; | |||
gsize token_size; | |||
guint processed = 0, i, w, window_size; | |||
guint processed = 0, i, w, window_size, token_flags = 0; | |||
if (words == NULL) { | |||
return FALSE; | |||
@@ -292,10 +292,15 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, | |||
g_assert (token_size > 0); | |||
for (w = 0; w < words->len; w ++) { | |||
token = &g_array_index (words, rspamd_ftok_t, w); | |||
token = &g_array_index (words, rspamd_stat_token_t, w); | |||
token_flags = token->flags; | |||
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { | |||
cur = rspamd_fstrhash_lc (token, is_utf); | |||
rspamd_ftok_t ftok; | |||
ftok.begin = token->begin; | |||
ftok.len = token->len; | |||
cur = rspamd_fstrhash_lc (&ftok, is_utf); | |||
} | |||
else { | |||
/* We know that the words are normalized */ | |||
@@ -316,6 +321,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, | |||
#define ADD_TOKEN do {\ | |||
new_tok = rspamd_mempool_alloc0 (pool, token_size); \ | |||
new_tok->datalen = sizeof (gint64); \ | |||
new_tok->flags = token_flags; \ | |||
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ | |||
h1 = ((guint32)hashpipe[0]) * primes[0] + \ | |||
((guint32)hashpipe[i]) * primes[i << 1]; \ |
@@ -22,8 +22,8 @@ | |||
#include "stat_internal.h" | |||
#include "../../../contrib/mumhash/mum.h" | |||
typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, | |||
rspamd_ftok_t * token, | |||
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, | |||
rspamd_stat_token_t * token, | |||
GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature); | |||
const gchar t_delimiters[255] = { | |||
@@ -69,8 +69,8 @@ token_node_compare_func (gconstpointer a, gconstpointer b) | |||
/* Get the next word from the specified rspamd_stat_token_t buf */ | |||
static gboolean | |||
rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, | |||
gchar const **cur, rspamd_ftok_t * token, | |||
rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, | |||
gchar const **cur, rspamd_stat_token_t * token, | |||
GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused) | |||
{ | |||
gsize remain, pos; | |||
@@ -92,6 +92,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, | |||
if (ex->pos == 0) { | |||
token->begin = buf->begin + ex->len; | |||
token->len = ex->len; | |||
token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; | |||
} | |||
else { | |||
token->begin = buf->begin; | |||
@@ -155,14 +156,16 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, | |||
} | |||
} | |||
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; | |||
*cur = p; | |||
return TRUE; | |||
} | |||
static gboolean | |||
rspamd_tokenizer_get_word (rspamd_ftok_t * buf, | |||
gchar const **cur, rspamd_ftok_t * token, | |||
rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, | |||
gchar const **cur, rspamd_stat_token_t * token, | |||
GList **exceptions, gboolean is_utf, gsize *rl, | |||
gboolean check_signature) | |||
{ | |||
@@ -219,6 +222,7 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, | |||
if (ex->type == RSPAMD_EXCEPTION_URL) { | |||
token->begin = "!!EX!!"; | |||
token->len = sizeof ("!!EX!!") - 1; | |||
token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; | |||
processed = token->len; | |||
} | |||
state = skip_exception; | |||
@@ -240,9 +244,11 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, | |||
break; | |||
case feed_token: | |||
if (ex != NULL && p - buf->begin == (gint)ex->pos) { | |||
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; | |||
goto set_token; | |||
} | |||
else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) { | |||
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; | |||
goto set_token; | |||
} | |||
processed ++; | |||
@@ -288,7 +294,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, | |||
struct rspamd_config *cfg, GList *exceptions, gboolean compat, | |||
guint64 *hash) | |||
{ | |||
rspamd_ftok_t token, buf; | |||
rspamd_stat_token_t token, buf; | |||
const gchar *pos = NULL; | |||
gsize l; | |||
GArray *res; | |||
@@ -322,7 +328,8 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, | |||
initial_size = word_decay * 2; | |||
} | |||
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); | |||
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), | |||
initial_size); | |||
while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { | |||
if (l == 0 || (min_len > 0 && l < min_len) || |
@@ -29,7 +29,7 @@ struct rspamd_stat_tokenizer { | |||
gint token_node_compare_func (gconstpointer a, gconstpointer b); | |||
/* Tokenize text into array of words (rspamd_ftok_t type) */ | |||
/* Tokenize text into array of words (rspamd_stat_token_t type) */ | |||
GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, | |||
struct rspamd_config *cfg, GList *exceptions, gboolean compat, | |||
guint64 *hash); |
@@ -895,7 +895,7 @@ lua_util_tokenize_text (lua_State *L) | |||
struct rspamd_lua_text *t; | |||
struct rspamd_process_exception *ex; | |||
GArray *res; | |||
rspamd_ftok_t *w; | |||
rspamd_stat_token_t *w; | |||
gboolean compat = FALSE; | |||
if (lua_type (L, 1) == LUA_TSTRING) { | |||
@@ -959,7 +959,7 @@ lua_util_tokenize_text (lua_State *L) | |||
lua_createtable (L, res->len, 0); | |||
for (i = 0; i < res->len; i ++) { | |||
w = &g_array_index (res, rspamd_ftok_t, i); | |||
w = &g_array_index (res, rspamd_stat_token_t, i); | |||
lua_pushlstring (L, w->begin, w->len); | |||
lua_rawseti (L, -2, i + 1); | |||
} |
@@ -25,6 +25,7 @@ | |||
#include "config.h" | |||
#include "libmime/message.h" | |||
#include "rspamd.h" | |||
#include "libstat/stat_api.h" | |||
#define DEFAULT_SYMBOL "R_MIXED_CHARSET" | |||
#define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL" | |||
@@ -163,7 +164,8 @@ chartable_module_reconfig (struct rspamd_config *cfg) | |||
} | |||
static gdouble | |||
rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w, | |||
rspamd_chartable_process_word_utf (struct rspamd_task *task, | |||
rspamd_stat_token_t *w, | |||
gboolean is_url) | |||
{ | |||
const gchar *p, *end, *c; | |||
@@ -258,7 +260,8 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w, | |||
} | |||
static gdouble | |||
rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w, | |||
rspamd_chartable_process_word_ascii (struct rspamd_task *task, | |||
rspamd_stat_token_t *w, | |||
gboolean is_url) | |||
{ | |||
const guchar *p, *end, *c; | |||
@@ -343,7 +346,7 @@ static void | |||
rspamd_chartable_process_part (struct rspamd_task *task, | |||
struct rspamd_mime_text_part *part) | |||
{ | |||
rspamd_ftok_t *w; | |||
rspamd_stat_token_t *w; | |||
guint i; | |||
gdouble cur_score = 0.0; | |||
@@ -353,9 +356,9 @@ rspamd_chartable_process_part (struct rspamd_task *task, | |||
} | |||
for (i = 0; i < part->normalized_words->len; i++) { | |||
w = &g_array_index (part->normalized_words, rspamd_ftok_t, i); | |||
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); | |||
if (w->len > 0) { | |||
if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { | |||
if (IS_PART_UTF (part)) { | |||
cur_score += rspamd_chartable_process_word_utf (task, w, FALSE); | |||
@@ -397,7 +400,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused) | |||
struct rspamd_url *u; | |||
GHashTableIter it; | |||
gpointer k, v; | |||
rspamd_ftok_t w; | |||
rspamd_stat_token_t w; | |||
gdouble cur_score = 0.0; | |||
g_hash_table_iter_init (&it, task->urls); |
@@ -43,6 +43,7 @@ | |||
#include "lua/lua_common.h" | |||
#include "unix-std.h" | |||
#include "libutil/http_private.h" | |||
#include "libstat/stat_api.h" | |||
#include <math.h> | |||
#define DEFAULT_SYMBOL "R_FUZZY_HASH" | |||
@@ -1266,7 +1267,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, | |||
struct rspamd_shingle *sh; | |||
guint i; | |||
rspamd_cryptobox_hash_state_t st; | |||
rspamd_ftok_t *word; | |||
rspamd_stat_token_t *word; | |||
GArray *words; | |||
struct fuzzy_cmd_io *io; | |||
@@ -1289,7 +1290,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, | |||
words = fuzzy_preprocess_words (part, pool); | |||
for (i = 0; i < words->len; i ++) { | |||
word = &g_array_index (words, rspamd_ftok_t, i); | |||
word = &g_array_index (words, rspamd_stat_token_t, i); | |||
rspamd_cryptobox_hash_update (&st, word->begin, word->len); | |||
} | |||
rspamd_cryptobox_hash_final (&st, shcmd->basic.digest); |