* Fix sample config * Fix compile warnings * Fix building without lua support * Fix bugs with nrcpt header parsing and symbols cache loading (by Anton Nekhoroshikh)tags/0.2.7
@@ -306,3 +306,12 @@ view { | |||
# Symbols to check, can also be list of files or regexp: | |||
symbols = "/^[A-Z]{2}_SURBL_MULTI$/i"; | |||
}; | |||
# Settings files | |||
settings { | |||
# json data for user's settings | |||
#user_settings = "file:///some/json/file"; | |||
# json data for domain's settings | |||
#domain_settings = "file:///some/other/json/file"; | |||
}; |
@@ -39,12 +39,52 @@ worker { | |||
password = "q1"; | |||
}; | |||
# Settings for fuzzy storage interface | |||
worker { | |||
type = "fuzzy"; | |||
# Bind socket for control interface | |||
bind_socket = localhost:11335; | |||
count = 1; | |||
# Path to filesystem storage | |||
hashfile = "/tmp/fuzzy.db"; | |||
}; | |||
# Options for lmtp worker | |||
#worker { | |||
#type = "lmtp"; | |||
# Bind socket for lmtp interface | |||
#bind_socket = localhost:11335; | |||
# Metric that is considered the main one. If this metric yields a spam | |||
# result, lmtp delivery fails | |||
#metric = "default"; | |||
# Number of lmtp workers | |||
#count = 1; | |||
#}; | |||
#worker { | |||
#type = "delivery"; | |||
# Path to delivery agent, %f is expanded as mail from address and %r | |||
# is expanded as recipient address | |||
# Example: agent = "/usr/local/bin/procmail -f %f -d %r" | |||
#agent = "/dev/null"; | |||
# Bind socket for lmtp interface | |||
# Example: bind_socket = localhost:25 | |||
# Whether we should use lmtp for MTA delivery | |||
#lmtp = no; | |||
#}; | |||
# Sample metric definition | |||
metric { | |||
# Name of metric | |||
name = "testmetric"; | |||
# Score to count message as spam by this metric | |||
required_score = 10.1; | |||
# Symbols cache path for optimal checks planning | |||
cache_file = "/tmp/symbols.cache"; | |||
}; | |||
# Logging settings | |||
@@ -64,27 +104,36 @@ logging { | |||
# Default: 100M | |||
statfile_pool_size = 40M; | |||
# Sample statfile definition | |||
#statfile { | |||
# Alias is used for learning and is used as symbol | |||
#alias = "test.spam"; | |||
# Pattern is path to file, can include %r - recipient name and %f - mail from value | |||
#pattern = "./test.spam"; | |||
# Weight in spam/ham classifier | |||
#weight = 1.0; | |||
# Size of this statfile class | |||
#size = 10M; | |||
# Tokenizer for this statfile | |||
# Default: osb-text | |||
#tokenizer = "osb-text"; | |||
#}; | |||
#statfile { | |||
#alias = "test.ham"; | |||
#pattern = "./test.ham"; | |||
#weight = -2.0; | |||
#size = 10M; | |||
#}; | |||
# Classifier definition | |||
classifier { | |||
# Type of classifier | |||
type = "winnow"; | |||
# Tokenizer used | |||
tokenizer = "osb-text"; | |||
# Sample statfile definition | |||
statfile { | |||
# Symbol identifies this statfile for learning and is reported as the result symbol | |||
symbol = "WINNOW_SPAM"; | |||
# Pattern is path to file, can include %r - recipient name and %f - mail from value | |||
path = "/tmp/test.spam"; | |||
# Size of this statfile class | |||
size = 10M; | |||
# Tokenizer for this statfile | |||
# Default: osb-text | |||
#tokenizer = "osb-text"; | |||
autolearn { | |||
min_mark = 10.0; | |||
}; | |||
}; | |||
statfile { | |||
symbol = "WINNOW_HAM"; | |||
path = "/tmp/test.ham"; | |||
size = 10M; | |||
autolearn { | |||
max_mark = 0.1; | |||
}; | |||
}; | |||
}; | |||
# Factors coefficients | |||
factors { | |||
@@ -159,30 +208,7 @@ factors { | |||
"R_MIXED_CHARSET" = 5; | |||
"R_BAD_EMAIL" = 10.5; | |||
}; | |||
# Options for lmtp worker | |||
#worker { | |||
#type = "lmtp"; | |||
# Bind socket for lmtp interface | |||
#bind_socket = localhost:11335; | |||
# Metric that is considered the main one. If this metric yields a spam | |||
# result, lmtp delivery fails | |||
#metric = "default"; | |||
# Number of lmtp workers | |||
#count = 1; | |||
#}; | |||
#worker { | |||
#type = "delivery"; | |||
# Path to delivery agent, %f is expanded as mail from address and %r | |||
# is expanded as recipient address | |||
# Example: agent = "/usr/local/bin/procmail -f %f -d %r" | |||
#agent = "/dev/null"; | |||
# Bind socket for lmtp interface | |||
# Example: bind_socket = localhost:25 | |||
# Whether we should use lmtp for MTA delivery | |||
#lmtp = no; | |||
#}; | |||
# SURBL module params, note that single quotes are mandatory here | |||
.module 'surbl' { | |||
@@ -285,6 +311,14 @@ factors { | |||
#blacklist = "file:///some/path/emails.lst"; | |||
}; | |||
# Module for fuzzy checksum loading | |||
.module 'fuzzy_check' { | |||
metric = "default"; | |||
symbol = "R_FUZZY"; | |||
# List of fuzzy storage servers, separated by ',' or ';' or simply by spaces | |||
servers = "localhost:11335"; | |||
}; | |||
# If enabled, treat each regexp as a raw regex and do not try to convert | |||
# each text part to utf8 encoding. Saves a lot of resources but is less | |||
# portable. | |||
@@ -315,3 +349,19 @@ settings { | |||
# json data for domain's settings | |||
#domain_settings = "file:///some/other/json/file"; | |||
}; | |||
# Example of json config: | |||
# [ | |||
# { | |||
# "name": "cebka@test.ru", | |||
# "metrics": | |||
# { | |||
# "default": 5.5 | |||
# }, | |||
# "factors": | |||
# { | |||
# "R_FUZZY": 10.1 | |||
# }, | |||
# "want_spam": false | |||
# } | |||
# ] |
@@ -130,16 +130,24 @@ struct statfile_autolearn_params { | |||
* Statfile config definition | |||
*/ | |||
struct statfile { | |||
char *alias; /**< alias of statfile */ | |||
char *pattern; /**< filesystem pattern (with %r or %f) */ | |||
double weight; /**< weight scale */ | |||
char *metric; /**< metric name */ | |||
char *symbol; /**< symbol of statfile */ | |||
char *path; /**< filesystem pattern (with %r or %f) */ | |||
size_t size; /**< size of statfile */ | |||
struct tokenizer *tokenizer; /**< tokenizer used for statfile */ | |||
GList *sections; /**< list of sections in statfile */ | |||
struct statfile_autolearn_params *autolearn; /**< autolearn params */ | |||
}; | |||
/** | |||
* Classifier config definition | |||
*/ | |||
struct classifier_config { | |||
GList *statfiles; /**< statfiles list */ | |||
char *metric; /**< metric of this classifier */ | |||
struct classifier *classifier; /**< classifier interface */ | |||
struct tokenizer *tokenizer; /**< tokenizer used for classifier */ | |||
GHashTable *opts; /**< other options */ | |||
}; | |||
/** | |||
* Config option for importing to script module | |||
*/ | |||
@@ -223,7 +231,8 @@ struct config_file { | |||
GHashTable* factors; /**< hash of factors indexed by symbol name */ | |||
GHashTable* c_modules; /**< hash of c modules indexed by module name */ | |||
GHashTable* composite_symbols; /**< hash of composite symbols indexed by its name */ | |||
GHashTable* statfiles; /**< hash of defined statfiles indexed by alias */ | |||
GList *classifiers; /**< list of all classifiers defined */ | |||
GHashTable *classifiers_symbols; /**< hashtable indexed by symbol name of classifiers */ | |||
GHashTable* cfg_params; /**< all cfg params indexed by its name in this structure */ | |||
int clock_res; /**< resolution of clock used */ | |||
GList *views; /**< views */ | |||
@@ -314,7 +323,7 @@ void post_load_config (struct config_file *cfg); | |||
void unescape_quotes (char *line); | |||
GList* parse_comma_list (memory_pool_t *pool, char *line); | |||
struct classifier_config* check_classifier_cfg (struct config_file *cfg, struct classifier_config *c); | |||
int yylex (void); | |||
int yyparse (void); |
@@ -2,6 +2,7 @@ | |||
%x module | |||
%x lua | |||
%x worker | |||
%x classifier | |||
%{ | |||
@@ -21,6 +22,7 @@ extern void add_luabuf (const char *line); | |||
YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH]; | |||
int line_stack[MAX_INCLUDE_DEPTH]; | |||
int include_stack_ptr = 0; | |||
int nested_depth = 0; | |||
extern struct config_file *cfg; | |||
%} | |||
@@ -74,17 +76,7 @@ enabled return ENABLED; | |||
delivery return DELIVERY; | |||
agent return AGENT; | |||
statfile return STATFILE; | |||
alias return ALIAS; | |||
pattern return PATTERN; | |||
weight return WEIGHT; | |||
size return SIZE; | |||
tokenizer return TOKENIZER; | |||
classifier return CLASSIFIER; | |||
section return SECTION; | |||
autolearn return AUTOLEARN; | |||
min_mark return MIN_MARK; | |||
max_mark return MAX_MARK; | |||
classifier BEGIN(classifier); return CLASSIFIER; | |||
logging return LOGGING; | |||
@@ -167,8 +159,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG; | |||
<module>[ \t]+ /* ignore whitespace */; | |||
<module>[ \t]*#.* /* ignore comments */; | |||
<module>\'[a-zA-Z0-9_-]+\' yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; return MODULE_OPT; | |||
<module>\{ return OBRACE; | |||
<module>\} BEGIN(INITIAL); return EBRACE; | |||
<module>\{ nested_depth ++; return OBRACE; | |||
<module>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE; | |||
<module>\; return SEMICOLON; | |||
<module>= return EQSIGN; | |||
<module>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE; | |||
@@ -178,8 +170,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG; | |||
<worker>\n /* ignore EOL */; | |||
<worker>[ \t]+ /* ignore whitespace */; | |||
<worker>[ \t]*#.* /* ignore comments */; | |||
<worker>\{ return OBRACE; | |||
<worker>\} BEGIN(INITIAL); return EBRACE; | |||
<worker>\{ nested_depth ++; return OBRACE; | |||
<worker>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE; | |||
<worker>\; return SEMICOLON; | |||
<worker>= return EQSIGN; | |||
<worker>type return TYPE; | |||
@@ -193,6 +185,32 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG; | |||
<worker>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE; | |||
<worker>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING; | |||
<classifier>\n /* ignore EOL */; | |||
<classifier>[ \t]+ /* ignore whitespace */; | |||
<classifier>[ \t]*#.* /* ignore comments */; | |||
<classifier>\{ nested_depth ++; return OBRACE; | |||
<classifier>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE; | |||
<classifier>\; return SEMICOLON; | |||
<classifier>= return EQSIGN; | |||
<classifier>type return TYPE; | |||
<classifier>bind_socket return BINDSOCK; | |||
<classifier>count return COUNT; | |||
<classifier>statfile return STATFILE; | |||
<classifier>symbol return SYMBOL; | |||
<classifier>path return PATH; | |||
<classifier>size return SIZE; | |||
<classifier>tokenizer return TOKENIZER; | |||
<classifier>section return SECTION; | |||
<classifier>autolearn return AUTOLEARN; | |||
<classifier>min_mark return MIN_MARK; | |||
<classifier>max_mark return MAX_MARK; | |||
<classifier>[0-9]+ yylval.number=strtol(yytext, NULL, 10); return NUMBER; | |||
<classifier>-?[0-9]+\.?[0-9]* yylval.fract=strtod(yytext, NULL); return FRACT; | |||
<classifier>[0-9]+[kKmMgG]? yylval.limit=parse_limit(yytext); return SIZELIMIT; | |||
<classifier>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE; | |||
<classifier>[a-zA-Z0-9_%-]+ yylval.string=strdup(yytext); return PARAM; | |||
<classifier>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING; | |||
<lua>\n /* ignore EOL */; | |||
<lua>[ \t]+ /* ignore whitespace */; | |||
<lua>[ \t]*#.* /* ignore comments */; |
@@ -23,6 +23,7 @@ extern char *yytext; | |||
GList *cur_module_opt = NULL; | |||
struct metric *cur_metric = NULL; | |||
struct classifier_config *cur_classifier = NULL; | |||
struct statfile *cur_statfile = NULL; | |||
struct statfile_section *cur_section = NULL; | |||
struct statfile_autolearn_params *cur_autolearn = NULL; | |||
@@ -58,7 +59,7 @@ struct rspamd_view *cur_view = NULL; | |||
%token DELIVERY LMTP ENABLED AGENT SECTION LUACODE RAW_MODE PROFILE_FILE COUNT | |||
%token VIEW IP FROM SYMBOLS | |||
%token AUTOLEARN MIN_MARK MAX_MARK | |||
%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS | |||
%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS SYMBOL PATH | |||
%type <string> STRING | |||
%type <string> VARIABLE | |||
@@ -93,7 +94,7 @@ command : | |||
| metric | |||
| composites | |||
| logging | |||
| statfile | |||
| classifier | |||
| statfile_pool_size | |||
| luacode | |||
| raw_mode | |||
@@ -660,20 +661,81 @@ loggingfile: | |||
} | |||
; | |||
/*
 * Top-level "classifier { ... }" section.
 * By the time the closing brace is reduced, the body rules are expected to
 * have filled cur_classifier. The action validates it, applies defaults for
 * the metric and tokenizer, stores it in cfg->classifiers and resets
 * cur_classifier so the next section starts from scratch.
 */
classifier: | |||
CLASSIFIER OBRACE classifierbody EBRACE { | |||
/* A classifier without a resolved type (->classifier) is rejected */
if (cur_classifier == NULL || cur_classifier->classifier == NULL) { | |||
yyerror ("yyparse: invalid classifier definition"); | |||
YYERROR; | |||
} | |||
/* No metric given in the section: fall back to DEFAULT_METRIC */
if (cur_classifier->metric == NULL) { | |||
cur_classifier->metric = DEFAULT_METRIC; | |||
} | |||
/* No tokenizer given in the section: fall back to "osb-text" */
if (cur_classifier->tokenizer == NULL) { | |||
cur_classifier->tokenizer = get_tokenizer ("osb-text"); | |||
} | |||
/* Prepending means cfg->classifiers ends up in reverse definition order */
cfg->classifiers = g_list_prepend (cfg->classifiers, cur_classifier); | |||
cur_classifier = NULL; | |||
} | |||
; | |||
classifierbody: | |||
| classifiercmd SEMICOLON | |||
| classifierbody classifiercmd SEMICOLON | |||
; | |||
classifiercmd: | |||
| statfile | |||
| classifiertype | |||
| classifiermetric | |||
| classifiertokenizer | |||
| classifieroption | |||
; | |||
classifiertype: | |||
TYPE EQSIGN QUOTEDSTRING { | |||
cur_classifier = check_classifier_cfg (cfg, cur_classifier); | |||
if ((cur_classifier->classifier = get_classifier ($3)) == NULL) { | |||
yyerror ("yyparse: unknown classifier type: %s", $3); | |||
YYERROR; | |||
} | |||
} | |||
; | |||
classifiertokenizer: | |||
TOKENIZER EQSIGN QUOTEDSTRING { | |||
cur_classifier = check_classifier_cfg (cfg, cur_classifier); | |||
if ((cur_classifier->tokenizer = get_tokenizer ($3)) == NULL) { | |||
yyerror ("yyparse: unknown tokenizer %s", $3); | |||
YYERROR; | |||
} | |||
} | |||
; | |||
classifiermetric: | |||
METRIC EQSIGN QUOTEDSTRING { | |||
cur_classifier = check_classifier_cfg (cfg, cur_classifier); | |||
cur_classifier->metric = $3; | |||
memory_pool_add_destructor (cfg->cfg_pool, g_free, cur_classifier->metric); | |||
} | |||
; | |||
classifieroption: | |||
PARAM EQSIGN QUOTEDSTRING { | |||
cur_classifier = check_classifier_cfg (cfg, cur_classifier); | |||
g_hash_table_insert (cur_classifier->opts, $1, $3); | |||
memory_pool_add_destructor (cfg->cfg_pool, g_free, $1); | |||
memory_pool_add_destructor (cfg->cfg_pool, g_free, $3); | |||
}; | |||
statfile: | |||
STATFILE OBRACE statfilebody EBRACE { | |||
if (cur_statfile == NULL || cur_statfile->alias == NULL || cur_statfile->pattern == NULL | |||
|| cur_statfile->weight == 0 || cur_statfile->size == 0) { | |||
if (cur_statfile == NULL || cur_statfile->path == NULL || cur_statfile->size == 0) { | |||
yyerror ("yyparse: not enough arguments in statfile definition"); | |||
YYERROR; | |||
} | |||
if (cur_statfile->metric == NULL) { | |||
cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default"); | |||
} | |||
if (cur_statfile->tokenizer == NULL) { | |||
cur_statfile->tokenizer = get_tokenizer ("osb-text"); | |||
} | |||
g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile); | |||
cur_classifier = check_classifier_cfg (cfg, cur_classifier); | |||
cur_classifier->statfiles = g_list_prepend (cur_classifier->statfiles, cur_statfile); | |||
cur_statfile = NULL; | |||
} | |||
; | |||
@@ -684,48 +746,33 @@ statfilebody: | |||
; | |||
statfilecmd: | |||
| statfilealias | |||
| statfilepattern | |||
| statfileweight | |||
| statfilesymbol | |||
| statfilepath | |||
| statfilesize | |||
| statfilemetric | |||
| statfiletokenizer | |||
| statfilesection | |||
| statfileautolearn | |||
; | |||
statfilealias: | |||
ALIAS EQSIGN QUOTEDSTRING { | |||
statfilesymbol: | |||
SYMBOL EQSIGN QUOTEDSTRING { | |||
cur_classifier = check_classifier_cfg (cfg, cur_classifier); | |||
if (cur_statfile == NULL) { | |||
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); | |||
} | |||
cur_statfile->alias = memory_pool_strdup (cfg->cfg_pool, $3); | |||
cur_statfile->symbol = memory_pool_strdup (cfg->cfg_pool, $3); | |||
g_hash_table_insert (cfg->classifiers_symbols, $3, cur_classifier); | |||
} | |||
; | |||
statfilepattern: | |||
PATTERN EQSIGN QUOTEDSTRING { | |||
statfilepath: | |||
PATH EQSIGN QUOTEDSTRING { | |||
if (cur_statfile == NULL) { | |||
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); | |||
} | |||
cur_statfile->pattern = memory_pool_strdup (cfg->cfg_pool, $3); | |||
cur_statfile->path = memory_pool_strdup (cfg->cfg_pool, $3); | |||
} | |||
; | |||
statfileweight: | |||
WEIGHT EQSIGN NUMBER { | |||
if (cur_statfile == NULL) { | |||
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); | |||
} | |||
cur_statfile->weight = $3; | |||
} | |||
| WEIGHT EQSIGN FRACT { | |||
if (cur_statfile == NULL) { | |||
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); | |||
} | |||
cur_statfile->weight = $3; | |||
} | |||
; | |||
statfilesize: | |||
SIZE EQSIGN NUMBER { | |||
@@ -742,26 +789,7 @@ statfilesize: | |||
} | |||
; | |||
statfilemetric: | |||
METRIC EQSIGN QUOTEDSTRING { | |||
if (cur_statfile == NULL) { | |||
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); | |||
} | |||
cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3); | |||
} | |||
; | |||
statfiletokenizer: | |||
TOKENIZER EQSIGN QUOTEDSTRING { | |||
if (cur_statfile == NULL) { | |||
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); | |||
} | |||
if ((cur_statfile->tokenizer = get_tokenizer ($3)) == NULL) { | |||
yyerror ("yyparse: unknown tokenizer %s", $3); | |||
YYERROR; | |||
} | |||
} | |||
; | |||
statfilesection: | |||
SECTION OBRACE sectionbody EBRACE { |
@@ -186,7 +186,7 @@ init_defaults (struct config_file *cfg) | |||
cfg->factors = g_hash_table_new (g_str_hash, g_str_equal); | |||
cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal); | |||
cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal); | |||
cfg->statfiles = g_hash_table_new (g_str_hash, g_str_equal); | |||
cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal); | |||
cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal); | |||
init_settings (cfg); | |||
@@ -207,10 +207,10 @@ free_config (struct config_file *cfg) | |||
g_hash_table_unref (cfg->c_modules); | |||
g_hash_table_remove_all (cfg->composite_symbols); | |||
g_hash_table_unref (cfg->composite_symbols); | |||
g_hash_table_remove_all (cfg->statfiles); | |||
g_hash_table_unref (cfg->statfiles); | |||
g_hash_table_remove_all (cfg->cfg_params); | |||
g_hash_table_unref (cfg->cfg_params); | |||
g_hash_table_destroy (cfg->classifiers_symbols); | |||
g_list_free (cfg->classifiers); | |||
g_list_free (cfg->metrics_list); | |||
memory_pool_delete (cfg->cfg_pool); | |||
} | |||
@@ -604,6 +604,20 @@ parse_comma_list (memory_pool_t *pool, char *line) | |||
return res; | |||
} | |||
/*
 * Make sure a classifier config is ready to be filled in.
 *
 * cfg - configuration whose cfg_pool is used for the allocation and for
 *       registering the destructor
 * c   - classifier config to check, may be NULL
 *
 * When c is NULL a new structure is obtained with memory_pool_alloc0 from
 * cfg->cfg_pool. When the opts table is missing, a string-keyed hash table is
 * created and g_hash_table_destroy is registered for it on cfg->cfg_pool.
 * Returns c, or the newly allocated structure when c was NULL, so callers
 * must use the returned pointer.
 */
struct classifier_config * | |||
check_classifier_cfg (struct config_file *cfg, struct classifier_config *c) | |||
{ | |||
if (c == NULL) { | |||
c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config)); | |||
} | |||
if (c->opts == NULL) { | |||
/* Created without key/value destroy functions: the table does not free its entries */
c->opts = g_hash_table_new (g_str_hash, g_str_equal); | |||
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, c->opts); | |||
} | |||
return c; | |||
} | |||
/* | |||
* vi:ts=4 | |||
*/ |
@@ -35,7 +35,6 @@ struct classifier classifiers[] = { | |||
.init_func = winnow_init, | |||
.classify_func = winnow_classify, | |||
.learn_func = winnow_learn, | |||
.result_file_func = winnow_result_file | |||
}, | |||
}; | |||
@@ -6,29 +6,30 @@ | |||
#include "../statfile.h" | |||
#include "../tokenizers/tokenizers.h" | |||
struct classifier_config; | |||
struct worker_task; | |||
struct classifier_ctx { | |||
memory_pool_t *pool; | |||
GHashTable *results; | |||
struct classifier_config *cfg; | |||
}; | |||
/* Common classifier structure */ | |||
struct classifier { | |||
char *name; | |||
struct classifier_ctx* (*init_func)(memory_pool_t *pool); | |||
void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, | |||
char *statfile, GTree *input, double scale); | |||
struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf); | |||
void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); | |||
void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, | |||
char *statfile, GTree *input, int in_class); | |||
char* (*result_file_func)(struct classifier_ctx *ctx, double *probability); | |||
char *symbol, GTree *input, gboolean in_class); | |||
}; | |||
/* Get classifier structure by name or return NULL if this name is not found */ | |||
struct classifier* get_classifier (char *name); | |||
/* Winnow algorithm */ | |||
struct classifier_ctx* winnow_init (memory_pool_t *pool); | |||
void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale); | |||
void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class); | |||
char* winnow_result_file (struct classifier_ctx* ctx, double *probability); | |||
struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf); | |||
void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); | |||
void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *symbol, GTree *input, gboolean in_class); | |||
/* Array of all defined classifiers */ | |||
extern struct classifier classifiers[]; |
@@ -27,6 +27,9 @@ | |||
*/ | |||
#include "classifiers.h" | |||
#include "../main.h" | |||
#include "../filter.h" | |||
#include "../cfg_file.h" | |||
#define WINNOW_PROMOTION 1.23 | |||
#define WINNOW_DEMOTION 0.83 | |||
@@ -85,21 +88,23 @@ learn_callback (gpointer key, gpointer value, gpointer data) | |||
} | |||
struct classifier_ctx* | |||
winnow_init (memory_pool_t *pool) | |||
winnow_init (memory_pool_t *pool, struct classifier_config *cfg) | |||
{ | |||
struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx)); | |||
ctx->pool = pool; | |||
ctx->results = g_hash_table_new (g_str_hash, g_str_equal); | |||
memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results); | |||
ctx->cfg = cfg; | |||
return ctx; | |||
} | |||
void | |||
winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale) | |||
winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task) | |||
{ | |||
struct winnow_callback_data data; | |||
double *res = memory_pool_alloc (ctx->pool, sizeof (double)); | |||
double max = 0; | |||
GList *cur; | |||
struct statfile *st, *sel = NULL; | |||
g_assert (pool != NULL); | |||
g_assert (ctx != NULL); | |||
@@ -109,29 +114,44 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfi | |||
data.count = 0; | |||
data.now = time (NULL); | |||
data.ctx = ctx; | |||
if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) { | |||
if ((data.file = statfile_pool_open (pool, statfile)) == NULL) { | |||
return; | |||
cur = ctx->cfg->statfiles; | |||
while (cur) { | |||
st = cur->data; | |||
if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { | |||
if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { | |||
msg_warn ("winnow_classify: cannot open %s, skip it", st->path); | |||
cur = g_list_next (cur); | |||
continue; | |||
} | |||
} | |||
} | |||
g_tree_foreach (input, classify_callback, &data); | |||
g_tree_foreach (input, classify_callback, &data); | |||
if (data.count != 0) { | |||
*res = scale * (data.sum / data.count); | |||
if (data.count != 0) { | |||
*res = (data.sum / data.count); | |||
} | |||
else { | |||
*res = 0; | |||
} | |||
if (*res > max) { | |||
max = *res; | |||
sel = st; | |||
} | |||
cur = g_list_next (cur); | |||
} | |||
else { | |||
*res = 0; | |||
if (sel != NULL) { | |||
insert_result (task, ctx->cfg->metric, sel->symbol, 1, NULL); | |||
} | |||
g_hash_table_insert (ctx->results, statfile, res); | |||
} | |||
void | |||
winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class) | |||
winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class) | |||
{ | |||
struct winnow_callback_data data; | |||
GList *cur; | |||
struct statfile *st; | |||
g_assert (pool != NULL); | |||
g_assert (ctx != NULL); | |||
@@ -142,50 +162,29 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, | |||
data.in_class = in_class; | |||
data.now = time (NULL); | |||
data.ctx = ctx; | |||
if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) { | |||
if ((data.file = statfile_pool_open (pool, statfile)) == NULL) { | |||
return; | |||
cur = g_list_first (ctx->cfg->statfiles); | |||
while (cur) { | |||
st = cur->data; | |||
if (strcmp (symbol, st->symbol) == 0) { | |||
if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { | |||
/* Try to create statfile */ | |||
if (statfile_pool_create (pool, | |||
st->path, st->size / sizeof (struct stat_file_block)) == -1) { | |||
msg_err ("winnow_learn: cannot create statfile %s", st->path); | |||
return; | |||
} | |||
if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { | |||
msg_err ("winnow_learn: cannot create statfile %s", st->path); | |||
return; | |||
} | |||
} | |||
break; | |||
} | |||
cur = g_list_next (cur); | |||
} | |||
statfile_pool_lock_file (pool, data.file); | |||
g_tree_foreach (input, learn_callback, &data); | |||
statfile_pool_unlock_file (pool, data.file); | |||
} | |||
struct winnow_result_data { | |||
char *filename; | |||
double max_score; | |||
double sum; | |||
}; | |||
static void | |||
result_file_callback (gpointer key, gpointer value, gpointer data) | |||
{ | |||
struct winnow_result_data *d = (struct winnow_result_data *)data; | |||
double w = *((double *)value); | |||
if (fabs (w) > fabs (d->max_score)) { | |||
d->filename = (char *)key; | |||
d->max_score = w; | |||
} | |||
d->sum += fabs (w); | |||
} | |||
char* | |||
winnow_result_file (struct classifier_ctx* ctx, double *probability) | |||
{ | |||
struct winnow_result_data data = { NULL, 0, 0 }; | |||
g_assert (ctx != NULL); | |||
g_hash_table_foreach (ctx->results, result_file_callback, &data); | |||
if (data.sum != 0) { | |||
*probability = data.max_score / data.sum; | |||
} | |||
else { | |||
*probability = 1; | |||
} | |||
return data.filename; | |||
} |
@@ -181,9 +181,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control | |||
int r = 0, days, hours, minutes; | |||
time_t uptime; | |||
unsigned long size = 0; | |||
struct statfile *statfile; | |||
stat_file_t *file; | |||
struct metric *metric; | |||
struct classifier_config *cl; | |||
memory_pool_stat_t mem_st; | |||
char *password = g_hash_table_lookup (session->worker->cf->params, "password"); | |||
@@ -311,26 +309,16 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control | |||
return; | |||
} | |||
statfile = g_hash_table_lookup (session->cfg->statfiles, *cmd_args); | |||
if (statfile == NULL) { | |||
session->learn_symbol = *cmd_args; | |||
cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args); | |||
if (cl == NULL) { | |||
r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args); | |||
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE); | |||
return; | |||
} | |||
session->learn_classifier = cl; | |||
metric = g_hash_table_lookup (session->cfg->metrics, statfile->metric); | |||
session->learn_rcpt = NULL; | |||
session->learn_from = NULL; | |||
session->learn_filename = NULL; | |||
session->learn_tokenizer = statfile->tokenizer; | |||
if (metric != NULL) { | |||
session->learn_classifier = metric->classifier; | |||
} | |||
else { | |||
session->learn_classifier = get_classifier ("winnow"); | |||
} | |||
/* By default learn positive */ | |||
session->in_class = 1; | |||
/* Get all arguments */ | |||
@@ -366,22 +354,6 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control | |||
} | |||
} | |||
} | |||
session->learn_filename = resolve_stat_filename (session->session_pool, statfile->pattern, | |||
session->learn_rcpt, session->learn_from); | |||
if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) { | |||
/* Try to create statfile */ | |||
if (statfile_pool_create (session->worker->srv->statfile_pool, | |||
session->learn_filename, statfile->size / sizeof (struct stat_file_block)) == -1) { | |||
r = snprintf (out_buf, sizeof (out_buf), "cannot create statfile %s" CRLF, session->learn_filename); | |||
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE); | |||
return; | |||
} | |||
if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) { | |||
r = snprintf (out_buf, sizeof (out_buf), "cannot open statfile %s" CRLF, session->learn_filename); | |||
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE); | |||
return; | |||
} | |||
} | |||
rspamd_set_dispatcher_policy (session->dispatcher, BUFFER_CHARACTER, size); | |||
session->state = STATE_LEARN; | |||
} | |||
@@ -479,7 +451,7 @@ controller_read_socket (f_str_t *in, void *arg) | |||
while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) { | |||
c.begin = content->data; | |||
c.len = content->len; | |||
if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer, | |||
if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, | |||
session->session_pool, &c, &tokens)) { | |||
i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF); | |||
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE); | |||
@@ -487,9 +459,9 @@ controller_read_socket (f_str_t *in, void *arg) | |||
return; | |||
} | |||
} | |||
cls_ctx = session->learn_classifier->init_func (session->session_pool); | |||
session->learn_classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool, | |||
session->learn_filename, tokens, session->in_class); | |||
cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier); | |||
session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool, | |||
session->learn_symbol, tokens, session->in_class); | |||
session->worker->srv->stat->messages_learned ++; | |||
i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF); | |||
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE); |
@@ -444,7 +444,7 @@ check_autolearn (struct statfile_autolearn_params *params, struct worker_task *t | |||
return FALSE; | |||
} | |||
static void | |||
void | |||
process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens, | |||
struct classifier *classifier, char *filename, struct classifier_ctx* ctx) | |||
{ | |||
@@ -464,7 +464,7 @@ process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens, | |||
} | |||
} | |||
classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, 1); | |||
classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, TRUE); | |||
} | |||
} | |||
} | |||
@@ -488,48 +488,27 @@ make_composites (struct worker_task *task) | |||
g_hash_table_foreach (task->results, composites_metric_callback, task); | |||
} | |||
/*
 * Result record stored in statfile_callback_data.classifiers, one per
 * classifier: it pairs the metric found for a statfile with the context
 * obtained from that classifier's init_func.
 */
struct statfile_result_data { | |||
/* Result of looking up st->metric in task->cfg->metrics; stored as-is,
 * so it is NULL when the lookup failed. */
struct metric *metric; | |||
/* Context returned by classifier->init_func (task->task_pool). */
struct classifier_ctx *ctx; | |||
}; | |||
/*
 * State passed as the user argument to the per-item callback invoked from
 * process_statfiles().
 */
struct statfile_callback_data { | |||
/* Token trees already built for this task, looked up and inserted with
 * the tokenizer pointer as key so each tokenizer runs only once. */
GHashTable *tokens; | |||
/* struct statfile_result_data records, looked up and inserted with the
 * classifier pointer as key.
 * NOTE(review): process_statfiles() creates this table with
 * g_str_hash/g_str_equal although the keys are struct pointers - confirm
 * that is intended. */
GHashTable *classifiers; | |||
/* Task whose text parts are being classified. */
struct worker_task *task; | |||
}; | |||
static void | |||
statfiles_callback (gpointer key, gpointer value, void *arg) | |||
classifiers_callback (gpointer value, void *arg) | |||
{ | |||
struct statfile_callback_data *data= (struct statfile_callback_data *)arg; | |||
struct worker_task *task = data->task; | |||
struct statfile *st = (struct statfile *)value; | |||
struct classifier *classifier; | |||
struct statfile_result_data *res_data; | |||
struct metric *metric; | |||
struct classifier_config *cl = value; | |||
struct classifier_ctx *ctx; | |||
struct mime_text_part *text_part; | |||
struct statfile *st; | |||
GTree *tokens = NULL; | |||
GList *cur; | |||
char *filename; | |||
f_str_t c; | |||
if (g_list_length (task->rcpt) == 1) { | |||
filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data); | |||
} | |||
else { | |||
/* XXX: handle multiple recipients correctly */ | |||
filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, ""); | |||
} | |||
if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL && !check_autolearn (st->autolearn, task)) { | |||
return; | |||
} | |||
cur = g_list_first (task->text_parts); | |||
if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) { | |||
if ((tokens = g_hash_table_lookup (data->tokens, cl->tokenizer)) == NULL) { | |||
while (cur != NULL) { | |||
text_part = (struct mime_text_part *)cur->data; | |||
if (text_part->is_empty) { | |||
@@ -539,52 +518,32 @@ statfiles_callback (gpointer key, gpointer value, void *arg) | |||
c.begin = text_part->content->data; | |||
c.len = text_part->content->len; | |||
/* Tree would be freed at task pool freeing */ | |||
if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) { | |||
if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens)) { | |||
msg_info ("statfiles_callback: cannot tokenize input"); | |||
return; | |||
} | |||
cur = g_list_next (cur); | |||
} | |||
g_hash_table_insert (data->tokens, st->tokenizer, tokens); | |||
g_hash_table_insert (data->tokens, cl->tokenizer, tokens); | |||
} | |||
metric = g_hash_table_lookup (task->cfg->metrics, st->metric); | |||
if (metric == NULL) { | |||
classifier = get_classifier ("winnow"); | |||
} | |||
else { | |||
classifier = metric->classifier; | |||
} | |||
if ((res_data = g_hash_table_lookup (data->classifiers, classifier)) == NULL) { | |||
res_data = memory_pool_alloc (task->task_pool, sizeof (struct statfile_result_data)); | |||
res_data->ctx = classifier->init_func (task->task_pool); | |||
res_data->metric = metric; | |||
g_hash_table_insert (data->classifiers, classifier, res_data); | |||
} | |||
ctx = cl->classifier->init_func (task->task_pool, cl); | |||
cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task); | |||
classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight); | |||
if (st->autolearn) { | |||
/* Process autolearn */ | |||
process_autolearn (st, task, tokens, classifier, filename, res_data->ctx); | |||
/* Autolearning */ | |||
cur = g_list_first (cl->statfiles); | |||
while (cur) { | |||
st = cur->data; | |||
if (st->autolearn) { | |||
if (check_autolearn (st->autolearn, task)) { | |||
/* Process autolearn */ | |||
process_autolearn (st, task, tokens, cl->classifier, st->path, ctx); | |||
} | |||
} | |||
cur = g_list_next (cur); | |||
} | |||
} | |||
/*
 * GHashTable foreach callback that reports one classifier's result.
 *
 * key   - struct classifier the result belongs to
 * value - struct statfile_result_data holding that classifier's context
 *         and metric
 * arg   - struct worker_task being processed
 *
 * Calls the classifier's result_file_func to obtain the weight, then passes
 * the metric name, the classifier name and the weight to insert_result().
 *
 * NOTE(review): statfiles_callback() stores the metric lookup result even
 * when it is NULL (it only falls back to the "winnow" classifier), so
 * res->metric->name below may dereference NULL - confirm.
 */
static void | |||
statfiles_results_callback (gpointer key, gpointer value, void *arg) | |||
{ | |||
struct worker_task *task = (struct worker_task *)arg; | |||
struct statfile_result_data *res = (struct statfile_result_data *)value; | |||
struct classifier *classifier = (struct classifier *)key; | |||
double *w; | |||
char *filename; | |||
/* The weight is allocated from the task pool rather than on the stack. */
w = memory_pool_alloc (task->task_pool, sizeof (double)); | |||
/* result_file_func writes the weight through w; the returned filename is
 * not used anywhere else in this function. */
filename = classifier->result_file_func (res->ctx, w); | |||
insert_result (task, res->metric->name, classifier->name, *w, NULL); | |||
msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name); | |||
} | |||
void | |||
process_statfiles (struct worker_task *task) | |||
@@ -593,16 +552,11 @@ process_statfiles (struct worker_task *task) | |||
cd.task = task; | |||
cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal); | |||
cd.classifiers = g_hash_table_new (g_str_hash, g_str_equal); | |||
g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd); | |||
g_hash_table_foreach (cd.classifiers, statfiles_results_callback, task); | |||
g_list_foreach (task->cfg->classifiers, classifiers_callback, &cd); | |||
g_hash_table_destroy (cd.tokens); | |||
g_hash_table_destroy (cd.classifiers); | |||
/* Process results */ | |||
g_hash_table_foreach (task->results, metric_process_callback_forced, task); | |||
/* Process results */ | |||
task->state = WRITE_REPLY; | |||
} | |||
@@ -2,6 +2,8 @@ | |||
#define RSPAMD_LUA_H | |||
#include "../config.h" | |||
#ifdef WITH_LUA | |||
#include "../main.h" | |||
#include "../cfg_file.h" | |||
#include <lua.h> | |||
@@ -30,4 +32,5 @@ int lua_call_chain_filter (const char *function, struct worker_task *task, int * | |||
double lua_consolidation_func (struct worker_task *task, const char *metric_name, const char *function_name); | |||
void add_luabuf (const char *line); | |||
#endif | |||
#endif /* WITH_LUA */ | |||
#endif /* RSPAMD_LUA_H */ |
@@ -71,6 +71,7 @@ struct pidfh; | |||
struct config_file; | |||
struct tokenizer; | |||
struct classifier; | |||
struct classifier_config; | |||
struct mime_part; | |||
struct rspamd_view; | |||
@@ -140,9 +141,8 @@ struct controller_session { | |||
struct config_file *cfg; /**< pointer to config file */ | |||
char *learn_rcpt; /**< recipient for learning */ | |||
char *learn_from; /**< from address for learning */ | |||
struct tokenizer *learn_tokenizer; /**< tokenizer for learning */ | |||
struct classifier *learn_classifier; /**< classifier for learning */ | |||
char *learn_filename; /**< real filename for learning */ | |||
struct classifier_config *learn_classifier; | |||
char *learn_symbol; /**< symbol to train */ | |||
rspamd_io_dispatcher_t *dispatcher; /**< IO dispatcher object */ | |||
f_str_t *learn_buf; /**< learn input */ | |||
GList *parts; /**< extracted mime parts */ |
@@ -313,15 +313,7 @@ parse_header (struct worker_task *task, f_str_t *line) | |||
task->rcpt = g_list_prepend (task->rcpt, tmp); | |||
msg_debug ("parse_header: read rcpt header, value: %s", tmp); | |||
} | |||
else { | |||
msg_info ("parse_header: wrong header: %s", headern); | |||
return -1; | |||
} | |||
break; | |||
case 'n': | |||
case 'N': | |||
/* nrcpt */ | |||
if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) { | |||
else if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) { | |||
tmp = memory_pool_fstrdup (task->task_pool, line); | |||
task->nrcpt = strtoul (tmp, &err, 10); | |||
msg_debug ("parse_header: read rcpt header, value: %d", (int)task->nrcpt); |
@@ -360,6 +360,9 @@ call_symbol_callback (struct worker_task *task, struct symbols_cache *cache, str | |||
item = &cache->items[0]; | |||
} | |||
else { | |||
if (cache == NULL) { | |||
return FALSE; | |||
} | |||
/* Next pointer */ | |||
if (*saved_item - cache->items >= cache->used_items - 1) { | |||
/* No more items in cache */ |
@@ -25,6 +25,7 @@ void | |||
rspamd_statfile_test_func () | |||
{ | |||
statfile_pool_t *pool; | |||
stat_file_t *st; | |||
uint32_t random_hashes[HASHES_NUM], i, v; | |||
time_t now; | |||
@@ -40,17 +41,17 @@ rspamd_statfile_test_func () | |||
/* Create new file */ | |||
g_assert (statfile_pool_create (pool, TEST_FILENAME, 65535) != -1); | |||
g_assert (statfile_pool_open (pool, TEST_FILENAME) != -1); | |||
g_assert ((st = statfile_pool_open (pool, TEST_FILENAME)) != NULL); | |||
/* Get and set random blocks */ | |||
statfile_pool_lock_file (pool, TEST_FILENAME); | |||
statfile_pool_lock_file (pool, st); | |||
for (i = 0; i < HASHES_NUM; i ++) { | |||
statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0); | |||
statfile_pool_set_block (pool, st, random_hashes[i], random_hashes[i], now, 1.0); | |||
} | |||
statfile_pool_unlock_file (pool, TEST_FILENAME); | |||
statfile_pool_unlock_file (pool, st); | |||
for (i = 0; i < HASHES_NUM; i ++) { | |||
v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now); | |||
v = statfile_pool_get_block (pool, st, random_hashes[i], random_hashes[i], now); | |||
g_assert(v == 1.0); | |||
} | |||
@@ -30,4 +30,6 @@ main (int argc, char **argv) | |||
g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func); | |||
g_test_run (); | |||
return 0; | |||
} |
@@ -49,4 +49,6 @@ main (int argc, char **argv) | |||
} | |||
memory_pool_delete (pool); | |||
return 0; | |||
} |