Przeglądaj źródła

* New system of classifiers interface and statfiles processing

* Fix sample config
* Fix compile warnings
* Fix building without lua support
* Fix bugs with nrcpt header parsing and symbols cache loading (by Anton Nekhoroshikh)
tags/0.2.7
Vsevolod Stakhov 14 lat temu
rodzic
commit
a0f41f7c57

+ 9
- 0
conf/rspamd.conf.sample Wyświetl plik

@@ -306,3 +306,12 @@ view {
# Symbols to check, can also be list of files or regexp:
symbols = "/^[A-Z]{2}_SURBL_MULTI$/i";
};

# Settings files
settings {
# json data for user's settings
#user_settings = "file:///some/json/file";
# json data for domain's settings
#domain_settings = "file:///some/other/json/file";
};

+ 94
- 44
rspamd.conf.sample Wyświetl plik

@@ -39,12 +39,52 @@ worker {
password = "q1";
};

# Settings for fuzzy storage interface
worker {
type = "fuzzy";

# Bind socket for control interface
bind_socket = localhost:11335;

count = 1;
# Path to filesystem storage
hashfile = "/tmp/fuzzy.db";
};

# Options for lmtp worker
#worker {
#type = "lmtp";
# Bind socket for lmtp interface
#bind_socket = localhost:11335;
# Metric that is considered the main one. If this metric yields a spam
# result, lmtp delivery fails
#metric = "default";
# Number of lmtp workers
#count = 1;
#};

#worker {
#type = "delivery";
# Path to delivery agent, %f is expanded as mail from address and %r
# is expanded as recipient address
# Example: agent = "/usr/local/bin/procmail -f %f -d %r"
#agent = "/dev/null";
# Bind socket for lmtp interface
# Example: bind_socket = localhost:25
# Whether we should use lmtp for MTA delivery
#lmtp = no;
#};


# Sample metric definition
metric {
# Name of metric
name = "testmetric";
# Score to count message as spam by this metric
required_score = 10.1;
# Symbols cache path for optimal checks planning
cache_file = "/tmp/symbols.cache";
};

# Logging settings
@@ -64,27 +104,36 @@ logging {
# Default: 100M
statfile_pool_size = 40M;


# Sample statfile definition
#statfile {
# Alias is used for learning and is used as symbol
#alias = "test.spam";
# Pattern is path to file, can include %r - recipient name and %f - mail from value
#pattern = "./test.spam";
# Weight in spam/ham classifier
#weight = 1.0;
# Size of this statfile class
#size = 10M;
# Tokenizer for this statfile
# Default: osb-text
#tokenizer = "osb-text";
#};
#statfile {
#alias = "test.ham";
#pattern = "./test.ham";
#weight = -2.0;
#size = 10M;
#};
# Classifier definition
classifier {
# Type of classifier
type = "winnow";
# Tokenizer used
tokenizer = "osb-text";
# Sample statfile definition
statfile {
# Symbol is used for learning and is inserted as the result symbol
symbol = "WINNOW_SPAM";
# Pattern is path to file, can include %r - recipient name and %f - mail from value
path = "/tmp/test.spam";
# Size of this statfile class
size = 10M;
# Tokenizer for this statfile
# Default: osb-text
#tokenizer = "osb-text";
autolearn {
min_mark = 10.0;
};
};
statfile {
symbol = "WINNOW_HAM";
path = "/tmp/test.ham";
size = 10M;
autolearn {
max_mark = 0.1;
};
};
};

# Factors coefficients
factors {
@@ -159,30 +208,7 @@ factors {
"R_MIXED_CHARSET" = 5;
"R_BAD_EMAIL" = 10.5;
};
# Options for lmtp worker
#worker {
#type = "lmtp";
# Bind socket for lmtp interface
#bind_socket = localhost:11335;
# Metric that is considered the main one. If this metric yields a spam
# result, lmtp delivery fails
#metric = "default";
# Number of lmtp workers
#count = 1;
#};

#worker {
#type = "delivery";
# Path to delivery agent, %f is expanded as mail from address and %r
# is expanded as recipient address
# Example: agent = "/usr/local/bin/procmail -f %f -d %r"
#agent = "/dev/null";
# Bind socket for lmtp interface
# Example: bind_socket = localhost:25
# Whether we should use lmtp for MTA delivery
#lmtp = no;
#};

# SURBL module params, note that single quotes are mandatory here
.module 'surbl' {
@@ -285,6 +311,14 @@ factors {
#blacklist = "file:///some/path/emails.lst";
};

# Module for fuzzy checksum loading
.module 'fuzzy_check' {
metric = "default";
symbol = "R_FUZZY";
# List of fuzzy storage servers, separated by ',' or ';' or simply by spaces
servers = "localhost:11335";
};

# If enabled, treat each regexp as a raw regex and do not try to convert
# each text part to utf8 encoding. Saves a lot of resources but is less
# portable.
@@ -315,3 +349,19 @@ settings {
# json data for domain's settings
#domain_settings = "file:///some/other/json/file";
};

# Example of json config:
# [
# {
# "name": "cebka@test.ru",
# "metrics":
# {
# "default": 5.5
# },
# "factors":
# {
# "R_FUZZY": 10.1
# },
# "want_spam": false
# }
# ]

+ 16
- 7
src/cfg_file.h Wyświetl plik

@@ -130,16 +130,24 @@ struct statfile_autolearn_params {
* Statfile config definition
*/
struct statfile {
char *alias; /**< alias of statfile */
char *pattern; /**< filesystem pattern (with %r or %f) */
double weight; /**< weight scale */
char *metric; /**< metric name */
char *symbol; /**< symbol of statfile */
char *path; /**< filesystem pattern (with %r or %f) */
size_t size; /**< size of statfile */
struct tokenizer *tokenizer; /**< tokenizer used for statfile */
GList *sections; /**< list of sections in statfile */
struct statfile_autolearn_params *autolearn; /**< autolearn params */
};

/**
 * Classifier config definition
 *
 * Describes one `classifier { ... }` section of the config file: the
 * classification algorithm, the tokenizer used for it, the statfiles
 * attached to it and any extra free-form options.
 */
struct classifier_config {
GList *statfiles; /**< list of struct statfile defined inside this classifier */
char *metric; /**< name of the metric this classifier inserts results into */
struct classifier *classifier; /**< classifier interface (algorithm callbacks) */
struct tokenizer *tokenizer; /**< tokenizer used for classifier */
GHashTable *opts; /**< other options, string values indexed by option name */
};

/**
* Config option for importing to script module
*/
@@ -223,7 +231,8 @@ struct config_file {
GHashTable* factors; /**< hash of factors indexed by symbol name */
GHashTable* c_modules; /**< hash of c modules indexed by module name */
GHashTable* composite_symbols; /**< hash of composite symbols indexed by its name */
GHashTable* statfiles; /**< hash of defined statfiles indexed by alias */
GList *classifiers; /**< list of all classifiers defined */
GHashTable *classifiers_symbols; /**< hashtable indexed by symbol name of classifiers */
GHashTable* cfg_params; /**< all cfg params indexed by its name in this structure */
int clock_res; /**< resolution of clock used */
GList *views; /**< views */
@@ -314,7 +323,7 @@ void post_load_config (struct config_file *cfg);
void unescape_quotes (char *line);

GList* parse_comma_list (memory_pool_t *pool, char *line);
/**
 * Make sure that a classifier config object is ready for use.
 * @param cfg config file whose memory pool is used for allocations
 * @param c existing classifier config or NULL to allocate a new zeroed one
 * @return c (or the newly allocated object) with its options hash table created
 */
struct classifier_config* check_classifier_cfg (struct config_file *cfg, struct classifier_config *c);

int yylex (void);
int yyparse (void);

+ 33
- 15
src/cfg_file.l Wyświetl plik

@@ -2,6 +2,7 @@
%x module
%x lua
%x worker
%x classifier

%{

@@ -21,6 +22,7 @@ extern void add_luabuf (const char *line);
YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH];
int line_stack[MAX_INCLUDE_DEPTH];
int include_stack_ptr = 0;
int nested_depth = 0;
extern struct config_file *cfg;

%}
@@ -74,17 +76,7 @@ enabled return ENABLED;
delivery return DELIVERY;
agent return AGENT;

statfile return STATFILE;
alias return ALIAS;
pattern return PATTERN;
weight return WEIGHT;
size return SIZE;
tokenizer return TOKENIZER;
classifier return CLASSIFIER;
section return SECTION;
autolearn return AUTOLEARN;
min_mark return MIN_MARK;
max_mark return MAX_MARK;
classifier BEGIN(classifier); return CLASSIFIER;

logging return LOGGING;

@@ -167,8 +159,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG;
<module>[ \t]+ /* ignore whitespace */;
<module>[ \t]*#.* /* ignore comments */;
<module>\'[a-zA-Z0-9_-]+\' yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; return MODULE_OPT;
<module>\{ return OBRACE;
<module>\} BEGIN(INITIAL); return EBRACE;
<module>\{ nested_depth ++; return OBRACE;
<module>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
<module>\; return SEMICOLON;
<module>= return EQSIGN;
<module>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
@@ -178,8 +170,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG;
<worker>\n /* ignore EOL */;
<worker>[ \t]+ /* ignore whitespace */;
<worker>[ \t]*#.* /* ignore comments */;
<worker>\{ return OBRACE;
<worker>\} BEGIN(INITIAL); return EBRACE;
<worker>\{ nested_depth ++; return OBRACE;
<worker>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
<worker>\; return SEMICOLON;
<worker>= return EQSIGN;
<worker>type return TYPE;
@@ -193,6 +185,32 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG;
<worker>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
<worker>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING;

<classifier>\n /* ignore EOL */;
<classifier>[ \t]+ /* ignore whitespace */;
<classifier>[ \t]*#.* /* ignore comments */;
<classifier>\{ nested_depth ++; return OBRACE;
<classifier>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
<classifier>\; return SEMICOLON;
<classifier>= return EQSIGN;
<classifier>type return TYPE;
<classifier>bind_socket return BINDSOCK;
<classifier>count return COUNT;
<classifier>statfile return STATFILE;
<classifier>symbol return SYMBOL;
<classifier>path return PATH;
<classifier>size return SIZE;
<classifier>tokenizer return TOKENIZER;
<classifier>section return SECTION;
<classifier>autolearn return AUTOLEARN;
<classifier>min_mark return MIN_MARK;
<classifier>max_mark return MAX_MARK;
<classifier>[0-9]+ yylval.number=strtol(yytext, NULL, 10); return NUMBER;
<classifier>-?[0-9]+\.?[0-9]* yylval.fract=strtod(yytext, NULL); return FRACT;
<classifier>[0-9]+[kKmMgG]? yylval.limit=parse_limit(yytext); return SIZELIMIT;
<classifier>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
<classifier>[a-zA-Z0-9_%-]+ yylval.string=strdup(yytext); return PARAM;
<classifier>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING;

<lua>\n /* ignore EOL */;
<lua>[ \t]+ /* ignore whitespace */;
<lua>[ \t]*#.* /* ignore comments */;

+ 83
- 55
src/cfg_file.y Wyświetl plik

@@ -23,6 +23,7 @@ extern char *yytext;

GList *cur_module_opt = NULL;
struct metric *cur_metric = NULL;
struct classifier_config *cur_classifier = NULL;
struct statfile *cur_statfile = NULL;
struct statfile_section *cur_section = NULL;
struct statfile_autolearn_params *cur_autolearn = NULL;
@@ -58,7 +59,7 @@ struct rspamd_view *cur_view = NULL;
%token DELIVERY LMTP ENABLED AGENT SECTION LUACODE RAW_MODE PROFILE_FILE COUNT
%token VIEW IP FROM SYMBOLS
%token AUTOLEARN MIN_MARK MAX_MARK
%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS
%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS SYMBOL PATH

%type <string> STRING
%type <string> VARIABLE
@@ -93,7 +94,7 @@ command :
| metric
| composites
| logging
| statfile
| classifier
| statfile_pool_size
| luacode
| raw_mode
@@ -660,20 +661,81 @@ loggingfile:
}
;


/*
 * Top-level `classifier { ... }` section.
 * When the section is closed the collected cur_classifier is validated,
 * missing metric and tokenizer are replaced by defaults and the object is
 * added to the list of classifiers of the config.
 */
classifier:
CLASSIFIER OBRACE classifierbody EBRACE {
/* A classifier without a resolved type (classifier interface) is an error */
if (cur_classifier == NULL || cur_classifier->classifier == NULL) {
yyerror ("yyparse: invalid classifier definition");
YYERROR;
}
/* Fall back to the default metric name when none was set */
if (cur_classifier->metric == NULL) {
cur_classifier->metric = DEFAULT_METRIC;
}
/* Fall back to the "osb-text" tokenizer when none was set */
if (cur_classifier->tokenizer == NULL) {
cur_classifier->tokenizer = get_tokenizer ("osb-text");
}

cfg->classifiers = g_list_prepend (cfg->classifiers, cur_classifier);
/* Reset the parser state so that the next section starts a new object */
cur_classifier = NULL;
}
;

/* Body of a classifier section: empty, or a sequence of commands each followed by a semicolon */
classifierbody:
| classifiercmd SEMICOLON
| classifierbody classifiercmd SEMICOLON
;

/* Single command inside a classifier section; the empty alternative is accepted as well */
classifiercmd:
| statfile
| classifiertype
| classifiermetric
| classifiertokenizer
| classifieroption
;

/*
 * type = "name";
 * Resolves the classifier interface by name; an unknown name aborts parsing.
 */
classifiertype:
TYPE EQSIGN QUOTEDSTRING {
/* Allocate the current classifier config on first use */
cur_classifier = check_classifier_cfg (cfg, cur_classifier);
if ((cur_classifier->classifier = get_classifier ($3)) == NULL) {
yyerror ("yyparse: unknown classifier type: %s", $3);
YYERROR;
}
}
;
/*
 * tokenizer = "name";
 * Resolves the tokenizer of the classifier by name; an unknown name aborts parsing.
 */
classifiertokenizer:
TOKENIZER EQSIGN QUOTEDSTRING {
/* Allocate the current classifier config on first use */
cur_classifier = check_classifier_cfg (cfg, cur_classifier);
if ((cur_classifier->tokenizer = get_tokenizer ($3)) == NULL) {
yyerror ("yyparse: unknown tokenizer %s", $3);
YYERROR;
}
}
;

/*
 * metric = "name";
 * Stores the metric name of the classifier.
 */
classifiermetric:
METRIC EQSIGN QUOTEDSTRING {
cur_classifier = check_classifier_cfg (cfg, cur_classifier);
/* The token string itself is kept (no copy) and released together with the config pool */
cur_classifier->metric = $3;
memory_pool_add_destructor (cfg->cfg_pool, g_free, cur_classifier->metric);
}
;

/*
 * name = "value";
 * Any other parameter is stored in the options hash of the classifier.
 */
classifieroption:
PARAM EQSIGN QUOTEDSTRING {
cur_classifier = check_classifier_cfg (cfg, cur_classifier);
/* Key and value token strings are stored as is and released together with the config pool */
g_hash_table_insert (cur_classifier->opts, $1, $3);
memory_pool_add_destructor (cfg->cfg_pool, g_free, $1);
memory_pool_add_destructor (cfg->cfg_pool, g_free, $3);
};

statfile:
STATFILE OBRACE statfilebody EBRACE {
if (cur_statfile == NULL || cur_statfile->alias == NULL || cur_statfile->pattern == NULL
|| cur_statfile->weight == 0 || cur_statfile->size == 0) {
if (cur_statfile == NULL || cur_statfile->path == NULL || cur_statfile->size == 0) {
yyerror ("yyparse: not enough arguments in statfile definition");
YYERROR;
}
if (cur_statfile->metric == NULL) {
cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default");
}
if (cur_statfile->tokenizer == NULL) {
cur_statfile->tokenizer = get_tokenizer ("osb-text");
}
g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile);
cur_classifier = check_classifier_cfg (cfg, cur_classifier);
cur_classifier->statfiles = g_list_prepend (cur_classifier->statfiles, cur_statfile);
cur_statfile = NULL;
}
;
@@ -684,48 +746,33 @@ statfilebody:
;

statfilecmd:
| statfilealias
| statfilepattern
| statfileweight
| statfilesymbol
| statfilepath
| statfilesize
| statfilemetric
| statfiletokenizer
| statfilesection
| statfileautolearn
;
statfilealias:
ALIAS EQSIGN QUOTEDSTRING {
statfilesymbol:
SYMBOL EQSIGN QUOTEDSTRING {
cur_classifier = check_classifier_cfg (cfg, cur_classifier);
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
cur_statfile->alias = memory_pool_strdup (cfg->cfg_pool, $3);
cur_statfile->symbol = memory_pool_strdup (cfg->cfg_pool, $3);
g_hash_table_insert (cfg->classifiers_symbols, $3, cur_classifier);
}
;

statfilepattern:
PATTERN EQSIGN QUOTEDSTRING {
statfilepath:
PATH EQSIGN QUOTEDSTRING {
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
cur_statfile->pattern = memory_pool_strdup (cfg->cfg_pool, $3);
cur_statfile->path = memory_pool_strdup (cfg->cfg_pool, $3);
}
;

statfileweight:
WEIGHT EQSIGN NUMBER {
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
cur_statfile->weight = $3;
}
| WEIGHT EQSIGN FRACT {
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
cur_statfile->weight = $3;
}
;

statfilesize:
SIZE EQSIGN NUMBER {
@@ -742,26 +789,7 @@ statfilesize:
}
;

statfilemetric:
METRIC EQSIGN QUOTEDSTRING {
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3);
}
;

statfiletokenizer:
TOKENIZER EQSIGN QUOTEDSTRING {
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
if ((cur_statfile->tokenizer = get_tokenizer ($3)) == NULL) {
yyerror ("yyparse: unknown tokenizer %s", $3);
YYERROR;
}
}
;

statfilesection:
SECTION OBRACE sectionbody EBRACE {

+ 17
- 3
src/cfg_utils.c Wyświetl plik

@@ -186,7 +186,7 @@ init_defaults (struct config_file *cfg)
cfg->factors = g_hash_table_new (g_str_hash, g_str_equal);
cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal);
cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal);
cfg->statfiles = g_hash_table_new (g_str_hash, g_str_equal);
cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal);
cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal);
init_settings (cfg);

@@ -207,10 +207,10 @@ free_config (struct config_file *cfg)
g_hash_table_unref (cfg->c_modules);
g_hash_table_remove_all (cfg->composite_symbols);
g_hash_table_unref (cfg->composite_symbols);
g_hash_table_remove_all (cfg->statfiles);
g_hash_table_unref (cfg->statfiles);
g_hash_table_remove_all (cfg->cfg_params);
g_hash_table_unref (cfg->cfg_params);
g_hash_table_destroy (cfg->classifiers_symbols);
g_list_free (cfg->classifiers);
g_list_free (cfg->metrics_list);
memory_pool_delete (cfg->cfg_pool);
}
@@ -604,6 +604,20 @@ parse_comma_list (memory_pool_t *pool, char *line)
return res;
}

/*
 * Return a usable classifier config: allocate a zeroed one from the config
 * pool when none is passed and make sure its options hash table exists.
 * The hash table is destroyed together with the config pool.
 */
struct classifier_config *
check_classifier_cfg (struct config_file *cfg, struct classifier_config *c)
{
	struct classifier_config *conf = c;

	if (conf == NULL) {
		conf = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config));
	}

	/* Options table is already there, nothing more to prepare */
	if (conf->opts != NULL) {
		return conf;
	}

	conf->opts = g_hash_table_new (g_str_hash, g_str_equal);
	memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, conf->opts);

	return conf;
}

/*
* vi:ts=4
*/

+ 0
- 1
src/classifiers/classifiers.c Wyświetl plik

@@ -35,7 +35,6 @@ struct classifier classifiers[] = {
.init_func = winnow_init,
.classify_func = winnow_classify,
.learn_func = winnow_learn,
.result_file_func = winnow_result_file
},
};


+ 10
- 9
src/classifiers/classifiers.h Wyświetl plik

@@ -6,29 +6,30 @@
#include "../statfile.h"
#include "../tokenizers/tokenizers.h"

struct classifier_config;
struct worker_task;

/* Context of a single classifier run, returned by the classifier's init function */
struct classifier_ctx {
memory_pool_t *pool; /* memory pool passed to the init function */
GHashTable *results; /* hash table created by the init function (string keys in winnow) */
struct classifier_config *cfg; /* config of the classifier this context belongs to */
};
/* Common classifier structure */
struct classifier {
char *name;
struct classifier_ctx* (*init_func)(memory_pool_t *pool);
void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
char *statfile, GTree *input, double scale);
struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf);
void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
char *statfile, GTree *input, int in_class);
char* (*result_file_func)(struct classifier_ctx *ctx, double *probability);
char *symbol, GTree *input, gboolean in_class);
};

/* Get classifier structure by name or return NULL if this name is not found */
struct classifier* get_classifier (char *name);

/* Winnow algorithm */
struct classifier_ctx* winnow_init (memory_pool_t *pool);
void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale);
void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
char* winnow_result_file (struct classifier_ctx* ctx, double *probability);
struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf);
void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *symbol, GTree *input, gboolean in_class);

/* Array of all defined classifiers */
extern struct classifier classifiers[];

+ 56
- 57
src/classifiers/winnow.c Wyświetl plik

@@ -27,6 +27,9 @@
*/

#include "classifiers.h"
#include "../main.h"
#include "../filter.h"
#include "../cfg_file.h"

#define WINNOW_PROMOTION 1.23
#define WINNOW_DEMOTION 0.83
@@ -85,21 +88,23 @@ learn_callback (gpointer key, gpointer value, gpointer data)
}

struct classifier_ctx*
winnow_init (memory_pool_t *pool)
winnow_init (memory_pool_t *pool, struct classifier_config *cfg)
{
struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx));

ctx->pool = pool;
ctx->results = g_hash_table_new (g_str_hash, g_str_equal);
memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results);
ctx->cfg = cfg;

return ctx;
}
void
winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale)
winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task)
{
struct winnow_callback_data data;
double *res = memory_pool_alloc (ctx->pool, sizeof (double));
double max = 0;
GList *cur;
struct statfile *st, *sel = NULL;

g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -109,29 +114,44 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfi
data.count = 0;
data.now = time (NULL);
data.ctx = ctx;

if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
return;
cur = ctx->cfg->statfiles;
while (cur) {
st = cur->data;
if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
msg_warn ("winnow_classify: cannot open %s, skip it", st->path);
cur = g_list_next (cur);
continue;
}
}
}

g_tree_foreach (input, classify_callback, &data);
g_tree_foreach (input, classify_callback, &data);
if (data.count != 0) {
*res = scale * (data.sum / data.count);
if (data.count != 0) {
*res = (data.sum / data.count);
}
else {
*res = 0;
}
if (*res > max) {
max = *res;
sel = st;
}
cur = g_list_next (cur);
}
else {
*res = 0;
if (sel != NULL) {
insert_result (task, ctx->cfg->metric, sel->symbol, 1, NULL);
}

g_hash_table_insert (ctx->results, statfile, res);
}

void
winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class)
{
struct winnow_callback_data data;
GList *cur;
struct statfile *st;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -142,50 +162,29 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile,
data.in_class = in_class;
data.now = time (NULL);
data.ctx = ctx;

if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
return;
cur = g_list_first (ctx->cfg->statfiles);
while (cur) {
st = cur->data;
if (strcmp (symbol, st->symbol) == 0) {
if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
/* Try to create statfile */
if (statfile_pool_create (pool,
st->path, st->size / sizeof (struct stat_file_block)) == -1) {
msg_err ("winnow_learn: cannot create statfile %s", st->path);
return;
}
if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
msg_err ("winnow_learn: cannot create statfile %s", st->path);
return;
}
}
break;
}
cur = g_list_next (cur);
}

statfile_pool_lock_file (pool, data.file);
g_tree_foreach (input, learn_callback, &data);
statfile_pool_unlock_file (pool, data.file);
}

struct winnow_result_data {
char *filename;
double max_score;
double sum;
};

static void
result_file_callback (gpointer key, gpointer value, gpointer data)
{
struct winnow_result_data *d = (struct winnow_result_data *)data;
double w = *((double *)value);

if (fabs (w) > fabs (d->max_score)) {
d->filename = (char *)key;
d->max_score = w;
}
d->sum += fabs (w);
}

char*
winnow_result_file (struct classifier_ctx* ctx, double *probability)
{
struct winnow_result_data data = { NULL, 0, 0 };
g_assert (ctx != NULL);
g_hash_table_foreach (ctx->results, result_file_callback, &data);
if (data.sum != 0) {
*probability = data.max_score / data.sum;
}
else {
*probability = 1;
}

return data.filename;
}

+ 9
- 37
src/controller.c Wyświetl plik

@@ -181,9 +181,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
int r = 0, days, hours, minutes;
time_t uptime;
unsigned long size = 0;
struct statfile *statfile;
stat_file_t *file;
struct metric *metric;
struct classifier_config *cl;
memory_pool_stat_t mem_st;
char *password = g_hash_table_lookup (session->worker->cf->params, "password");

@@ -311,26 +309,16 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
return;
}

statfile = g_hash_table_lookup (session->cfg->statfiles, *cmd_args);
if (statfile == NULL) {
session->learn_symbol = *cmd_args;
cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args);
if (cl == NULL) {
r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args);
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
return;

}
session->learn_classifier = cl;

metric = g_hash_table_lookup (session->cfg->metrics, statfile->metric);

session->learn_rcpt = NULL;
session->learn_from = NULL;
session->learn_filename = NULL;
session->learn_tokenizer = statfile->tokenizer;
if (metric != NULL) {
session->learn_classifier = metric->classifier;
}
else {
session->learn_classifier = get_classifier ("winnow");
}
/* By default learn positive */
session->in_class = 1;
/* Get all arguments */
@@ -366,22 +354,6 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
}
}
}
session->learn_filename = resolve_stat_filename (session->session_pool, statfile->pattern,
session->learn_rcpt, session->learn_from);
if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) {
/* Try to create statfile */
if (statfile_pool_create (session->worker->srv->statfile_pool,
session->learn_filename, statfile->size / sizeof (struct stat_file_block)) == -1) {
r = snprintf (out_buf, sizeof (out_buf), "cannot create statfile %s" CRLF, session->learn_filename);
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
return;
}
if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) {
r = snprintf (out_buf, sizeof (out_buf), "cannot open statfile %s" CRLF, session->learn_filename);
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
return;
}
}
rspamd_set_dispatcher_policy (session->dispatcher, BUFFER_CHARACTER, size);
session->state = STATE_LEARN;
}
@@ -479,7 +451,7 @@ controller_read_socket (f_str_t *in, void *arg)
while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
c.begin = content->data;
c.len = content->len;
if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer,
if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
session->session_pool, &c, &tokens)) {
i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE);
@@ -487,9 +459,9 @@ controller_read_socket (f_str_t *in, void *arg)
return;
}
}
cls_ctx = session->learn_classifier->init_func (session->session_pool);
session->learn_classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
session->learn_filename, tokens, session->in_class);
cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier);
session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
session->learn_symbol, tokens, session->in_class);
session->worker->srv->stat->messages_learned ++;
i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE);

+ 24
- 70
src/filter.c Wyświetl plik

@@ -444,7 +444,7 @@ check_autolearn (struct statfile_autolearn_params *params, struct worker_task *t
return FALSE;
}

static void
void
process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens,
struct classifier *classifier, char *filename, struct classifier_ctx* ctx)
{
@@ -464,7 +464,7 @@ process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens,
}
}

classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, 1);
classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, TRUE);
}
}
}
@@ -488,48 +488,27 @@ make_composites (struct worker_task *task)
g_hash_table_foreach (task->results, composites_metric_callback, task);
}

struct statfile_result_data {
struct metric *metric;
struct classifier_ctx *ctx;
};

struct statfile_callback_data {
GHashTable *tokens;
GHashTable *classifiers;
struct worker_task *task;
};

static void
statfiles_callback (gpointer key, gpointer value, void *arg)
classifiers_callback (gpointer value, void *arg)
{
struct statfile_callback_data *data= (struct statfile_callback_data *)arg;
struct worker_task *task = data->task;
struct statfile *st = (struct statfile *)value;
struct classifier *classifier;
struct statfile_result_data *res_data;
struct metric *metric;
struct classifier_config *cl = value;
struct classifier_ctx *ctx;
struct mime_text_part *text_part;
struct statfile *st;
GTree *tokens = NULL;
GList *cur;

char *filename;
f_str_t c;
if (g_list_length (task->rcpt) == 1) {
filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data);
}
else {
/* XXX: handle multiply recipients correctly */
filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, "");
}
if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL && !check_autolearn (st->autolearn, task)) {
return;
}
cur = g_list_first (task->text_parts);
if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
if ((tokens = g_hash_table_lookup (data->tokens, cl->tokenizer)) == NULL) {
while (cur != NULL) {
text_part = (struct mime_text_part *)cur->data;
if (text_part->is_empty) {
@@ -539,52 +518,32 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens)) {
msg_info ("statfiles_callback: cannot tokenize input");
return;
}
cur = g_list_next (cur);
}
g_hash_table_insert (data->tokens, st->tokenizer, tokens);
g_hash_table_insert (data->tokens, cl->tokenizer, tokens);
}
metric = g_hash_table_lookup (task->cfg->metrics, st->metric);
if (metric == NULL) {
classifier = get_classifier ("winnow");
}
else {
classifier = metric->classifier;
}
if ((res_data = g_hash_table_lookup (data->classifiers, classifier)) == NULL) {
res_data = memory_pool_alloc (task->task_pool, sizeof (struct statfile_result_data));
res_data->ctx = classifier->init_func (task->task_pool);
res_data->metric = metric;
g_hash_table_insert (data->classifiers, classifier, res_data);
}
ctx = cl->classifier->init_func (task->task_pool, cl);
cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task);
classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight);

if (st->autolearn) {
/* Process autolearn */
process_autolearn (st, task, tokens, classifier, filename, res_data->ctx);
/* Autolearning */
cur = g_list_first (cl->statfiles);
while (cur) {
st = cur->data;
if (st->autolearn) {
if (check_autolearn (st->autolearn, task)) {
/* Process autolearn */
process_autolearn (st, task, tokens, cl->classifier, st->path, ctx);
}
}
cur = g_list_next (cur);
}
}

static void
statfiles_results_callback (gpointer key, gpointer value, void *arg)
{
struct worker_task *task = (struct worker_task *)arg;
struct statfile_result_data *res = (struct statfile_result_data *)value;
struct classifier *classifier = (struct classifier *)key;
double *w;
char *filename;

w = memory_pool_alloc (task->task_pool, sizeof (double));
filename = classifier->result_file_func (res->ctx, w);
insert_result (task, res->metric->name, classifier->name, *w, NULL);
msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name);
}


void
process_statfiles (struct worker_task *task)
@@ -593,16 +552,11 @@ process_statfiles (struct worker_task *task)
cd.task = task;
cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal);
cd.classifiers = g_hash_table_new (g_str_hash, g_str_equal);

g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd);
g_hash_table_foreach (cd.classifiers, statfiles_results_callback, task);
g_list_foreach (task->cfg->classifiers, classifiers_callback, &cd);
g_hash_table_destroy (cd.tokens);
g_hash_table_destroy (cd.classifiers);
/* Process results */
g_hash_table_foreach (task->results, metric_process_callback_forced, task);

/* Process results */
task->state = WRITE_REPLY;
}


+ 4
- 1
src/lua/lua_common.h Wyświetl plik

@@ -2,6 +2,8 @@
#define RSPAMD_LUA_H

#include "../config.h"
#ifdef WITH_LUA

#include "../main.h"
#include "../cfg_file.h"
#include <lua.h>
@@ -30,4 +32,5 @@ int lua_call_chain_filter (const char *function, struct worker_task *task, int *
double lua_consolidation_func (struct worker_task *task, const char *metric_name, const char *function_name);
void add_luabuf (const char *line);

#endif
#endif /* WITH_LUA */
#endif /* RSPAMD_LUA_H */

+ 3
- 3
src/main.h Wyświetl plik

@@ -71,6 +71,7 @@ struct pidfh;
struct config_file;
struct tokenizer;
struct classifier;
struct classifier_config;
struct mime_part;
struct rspamd_view;

@@ -140,9 +141,8 @@ struct controller_session {
struct config_file *cfg; /**< pointer to config file */
char *learn_rcpt; /**< recipient for learning */
char *learn_from; /**< from address for learning */
struct tokenizer *learn_tokenizer; /**< tokenizer for learning */
struct classifier *learn_classifier; /**< classifier for learning */
char *learn_filename; /**< real filename for learning */
struct classifier_config *learn_classifier;
char *learn_symbol; /**< symbol to train */
rspamd_io_dispatcher_t *dispatcher; /**< IO dispatcher object */
f_str_t *learn_buf; /**< learn input */
GList *parts; /**< extracted mime parts */

+ 1
- 9
src/protocol.c Wyświetl plik

@@ -313,15 +313,7 @@ parse_header (struct worker_task *task, f_str_t *line)
task->rcpt = g_list_prepend (task->rcpt, tmp);
msg_debug ("parse_header: read rcpt header, value: %s", tmp);
}
else {
msg_info ("parse_header: wrong header: %s", headern);
return -1;
}
break;
case 'n':
case 'N':
/* nrcpt */
if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) {
else if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) {
tmp = memory_pool_fstrdup (task->task_pool, line);
task->nrcpt = strtoul (tmp, &err, 10);
msg_debug ("parse_header: read rcpt header, value: %d", (int)task->nrcpt);

+ 3
- 0
src/symbols_cache.c Wyświetl plik

@@ -360,6 +360,9 @@ call_symbol_callback (struct worker_task *task, struct symbols_cache *cache, str
item = &cache->items[0];
}
else {
if (cache == NULL) {
return FALSE;
}
/* Next pointer */
if (*saved_item - cache->items >= cache->used_items - 1) {
/* No more items in cache */

+ 6
- 5
test/rspamd_statfile_test.c Wyświetl plik

@@ -25,6 +25,7 @@ void
rspamd_statfile_test_func ()
{
statfile_pool_t *pool;
stat_file_t *st;
uint32_t random_hashes[HASHES_NUM], i, v;
time_t now;
@@ -40,17 +41,17 @@ rspamd_statfile_test_func ()

/* Create new file */
g_assert (statfile_pool_create (pool, TEST_FILENAME, 65535) != -1);
g_assert (statfile_pool_open (pool, TEST_FILENAME) != -1);
g_assert ((st = statfile_pool_open (pool, TEST_FILENAME)) != NULL);
/* Get and set random blocks */
statfile_pool_lock_file (pool, TEST_FILENAME);
statfile_pool_lock_file (pool, st);
for (i = 0; i < HASHES_NUM; i ++) {
statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0);
statfile_pool_set_block (pool, st, random_hashes[i], random_hashes[i], now, 1.0);
}
statfile_pool_unlock_file (pool, TEST_FILENAME);
statfile_pool_unlock_file (pool, st);

for (i = 0; i < HASHES_NUM; i ++) {
v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now);
v = statfile_pool_get_block (pool, st, random_hashes[i], random_hashes[i], now);
g_assert(v == 1.0);
}


+ 2
- 0
test/rspamd_test_suite.c Wyświetl plik

@@ -30,4 +30,6 @@ main (int argc, char **argv)
g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func);

g_test_run ();

return 0;
}

+ 2
- 0
utils/expression_parser.c Wyświetl plik

@@ -49,4 +49,6 @@ main (int argc, char **argv)
}

memory_pool_delete (pool);

return 0;
}

Ładowanie…
Anuluj
Zapisz