#ifdef WITH_SNOWBALL
struct sb_stemmer *stem = NULL;
#endif
- rspamd_ftok_t *w;
+ rspamd_stat_token_t *w;
gchar *temp_word;
const guchar *r;
guint i, nlen;
for (i = 0; i < part->normalized_words->len; i ++) {
guint64 h;
- w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
+ w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
r = NULL;
#ifdef WITH_SNOWBALL
if (stem) {
}
#endif
- if (w->len > 0 && !(w->len == 6 && memcmp (w->begin, "!!EX!!", 6) == 0)) {
+ if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
if (r != NULL) {
nlen = strlen (r);
nlen = MIN (nlen, w->len);
* We use a static hash seed in case we want to reuse these hashes for
* shingles computation in the future
*/
- h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+ h = rspamd_cryptobox_fast_hash_specific (
+ RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
w->begin, w->len, words_hash_seed);
g_array_append_val (part->normalized_hashes, h);
}
* High level statistics API
*/
+#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1 << 0) /* ordinary word emitted by the text tokenizer */
+#define RSPAMD_STAT_TOKEN_FLAG_META (1 << 1) /* meta token (set on header/meta elements, not message text) */
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2) /* OR-ed onto meta tokens before the Lua rspamd_gen_metatokens step */
+#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3) /* token stands for a processing exception (e.g. the "!!EX!!" URL placeholder) */
+#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4) /* OR-ed onto words tokenized from the subject string */
+
+typedef struct rspamd_stat_token_s { /* a word/token slice plus classification flags; stored by value in GArrays of words */
+ const gchar *begin; /* start of the token bytes; consumers always pass it together with len */
+ gsize len; /* token length in bytes */
+ guint flags; /* bitmask of RSPAMD_STAT_TOKEN_FLAG_* values */
+} rspamd_stat_token_t;
+
/**
* The results of statistics processing:
* - error
guchar data[RSPAMD_MAX_TOKEN_LEN];
guint window_idx;
guint datalen;
+ guint flags;
gdouble values[];
} rspamd_token_t;
struct rspamd_mime_header *cur;
GPtrArray *hdrs;
guint i;
- rspamd_ftok_t str;
+ rspamd_stat_token_t str;
hdrs = g_hash_table_lookup (task->raw_headers, name);
+ str.flags = RSPAMD_STAT_TOKEN_FLAG_META;
if (hdrs != NULL) {
struct rspamd_mime_text_part *tp;
GList *cur;
GArray *ar;
- rspamd_ftok_t elt;
+ rspamd_stat_token_t elt;
guint i;
gchar tmpbuf[128];
lua_State *L = task->cfg->lua_state;
ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
+ elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
/* Insert images */
for (i = 0; i < task->parts->len; i ++) {
/* Use global metatokens from lua */
lua_getglobal (L, "rspamd_gen_metatokens");
+ elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META;
if (lua_type (L, -1) == LUA_TFUNCTION) {
struct rspamd_task **ptask;
struct rspamd_task *task)
{
struct rspamd_mime_text_part *part;
+ rspamd_stat_token_t *tok;
GArray *words;
gchar *sub = NULL;
guint i, reserved_len = 0;
words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE,
NULL);
if (words != NULL) {
+
+ for (i = 0; i < words->len; i ++) {
+ tok = &g_array_index (words, rspamd_stat_token_t, i);
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+ }
+
st_ctx->tokenizer->tokenize_func (st_ctx,
task->task_pool,
words,
GPtrArray *result)
{
rspamd_token_t *new_tok = NULL;
- rspamd_ftok_t *token;
+ rspamd_stat_token_t *token;
struct rspamd_osb_tokenizer_config *osb_cf;
guint64 *hashpipe, cur, seed;
guint32 h1, h2;
gsize token_size;
- guint processed = 0, i, w, window_size;
+ guint processed = 0, i, w, window_size, token_flags = 0;
if (words == NULL) {
return FALSE;
g_assert (token_size > 0);
for (w = 0; w < words->len; w ++) {
- token = &g_array_index (words, rspamd_ftok_t, w);
+ token = &g_array_index (words, rspamd_stat_token_t, w);
+ token_flags = token->flags;
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
- cur = rspamd_fstrhash_lc (token, is_utf);
+ rspamd_ftok_t ftok;
+
+ ftok.begin = token->begin;
+ ftok.len = token->len;
+ cur = rspamd_fstrhash_lc (&ftok, is_utf);
}
else {
/* We know that the words are normalized */
#define ADD_TOKEN do {\
new_tok = rspamd_mempool_alloc0 (pool, token_size); \
new_tok->datalen = sizeof (gint64); \
+ new_tok->flags = token_flags; \
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
h1 = ((guint32)hashpipe[0]) * primes[0] + \
((guint32)hashpipe[i]) * primes[i << 1]; \
#include "stat_internal.h"
#include "../../../contrib/mumhash/mum.h"
-typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
- rspamd_ftok_t * token,
+typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
+ rspamd_stat_token_t * token,
GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
const gchar t_delimiters[255] = {
/* Get the next word from the specified rspamd_stat_token_t buf */
static gboolean
-rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
- gchar const **cur, rspamd_ftok_t * token,
+rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
+ gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
{
gsize remain, pos;
if (ex->pos == 0) {
token->begin = buf->begin + ex->len;
token->len = ex->len;
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
}
else {
token->begin = buf->begin;
}
}
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+
*cur = p;
return TRUE;
}
static gboolean
-rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
- gchar const **cur, rspamd_ftok_t * token,
+rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
+ gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gboolean is_utf, gsize *rl,
gboolean check_signature)
{
if (ex->type == RSPAMD_EXCEPTION_URL) {
token->begin = "!!EX!!";
token->len = sizeof ("!!EX!!") - 1;
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
processed = token->len;
}
state = skip_exception;
break;
case feed_token:
if (ex != NULL && p - buf->begin == (gint)ex->pos) {
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
goto set_token;
}
else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
goto set_token;
}
processed ++;
struct rspamd_config *cfg, GList *exceptions, gboolean compat,
guint64 *hash)
{
- rspamd_ftok_t token, buf;
+ rspamd_stat_token_t token, buf;
const gchar *pos = NULL;
gsize l;
GArray *res;
initial_size = word_decay * 2;
}
- res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
+ res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
+ initial_size);
while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
if (l == 0 || (min_len > 0 && l < min_len) ||
gint token_node_compare_func (gconstpointer a, gconstpointer b);
-/* Tokenize text into array of words (rspamd_ftok_t type) */
+/* Tokenize text into array of words (rspamd_stat_token_t type) */
GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
struct rspamd_config *cfg, GList *exceptions, gboolean compat,
guint64 *hash);
struct rspamd_lua_text *t;
struct rspamd_process_exception *ex;
GArray *res;
- rspamd_ftok_t *w;
+ rspamd_stat_token_t *w;
gboolean compat = FALSE;
if (lua_type (L, 1) == LUA_TSTRING) {
lua_createtable (L, res->len, 0);
for (i = 0; i < res->len; i ++) {
- w = &g_array_index (res, rspamd_ftok_t, i);
+ w = &g_array_index (res, rspamd_stat_token_t, i);
lua_pushlstring (L, w->begin, w->len);
lua_rawseti (L, -2, i + 1);
}
#include "config.h"
#include "libmime/message.h"
#include "rspamd.h"
+#include "libstat/stat_api.h"
#define DEFAULT_SYMBOL "R_MIXED_CHARSET"
#define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL"
}
static gdouble
-rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w,
+rspamd_chartable_process_word_utf (struct rspamd_task *task,
+ rspamd_stat_token_t *w,
gboolean is_url)
{
const gchar *p, *end, *c;
}
static gdouble
-rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w,
+rspamd_chartable_process_word_ascii (struct rspamd_task *task,
+ rspamd_stat_token_t *w,
gboolean is_url)
{
const guchar *p, *end, *c;
rspamd_chartable_process_part (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
- rspamd_ftok_t *w;
+ rspamd_stat_token_t *w;
guint i;
gdouble cur_score = 0.0;
}
for (i = 0; i < part->normalized_words->len; i++) {
- w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
+ w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
- if (w->len > 0) {
+ if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
if (IS_PART_UTF (part)) {
cur_score += rspamd_chartable_process_word_utf (task, w, FALSE);
struct rspamd_url *u;
GHashTableIter it;
gpointer k, v;
- rspamd_ftok_t w;
+ rspamd_stat_token_t w;
gdouble cur_score = 0.0;
g_hash_table_iter_init (&it, task->urls);
#include "lua/lua_common.h"
#include "unix-std.h"
#include "libutil/http_private.h"
+#include "libstat/stat_api.h"
#include <math.h>
#define DEFAULT_SYMBOL "R_FUZZY_HASH"
struct rspamd_shingle *sh;
guint i;
rspamd_cryptobox_hash_state_t st;
- rspamd_ftok_t *word;
+ rspamd_stat_token_t *word;
GArray *words;
struct fuzzy_cmd_io *io;
words = fuzzy_preprocess_words (part, pool);
for (i = 0; i < words->len; i ++) {
- word = &g_array_index (words, rspamd_ftok_t, i);
+ word = &g_array_index (words, rspamd_stat_token_t, i);
rspamd_cryptobox_hash_update (&st, word->begin, word->len);
}
rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);