diff options
-rw-r--r-- | conf/rspamd.conf.sample | 9 | ||||
-rw-r--r-- | rspamd.conf.sample | 138 | ||||
-rw-r--r-- | src/cfg_file.h | 23 | ||||
-rw-r--r-- | src/cfg_file.l | 48 | ||||
-rw-r--r-- | src/cfg_file.y | 138 | ||||
-rw-r--r-- | src/cfg_utils.c | 20 | ||||
-rw-r--r-- | src/classifiers/classifiers.c | 1 | ||||
-rw-r--r-- | src/classifiers/classifiers.h | 19 | ||||
-rw-r--r-- | src/classifiers/winnow.c | 113 | ||||
-rw-r--r-- | src/controller.c | 46 | ||||
-rw-r--r-- | src/filter.c | 94 | ||||
-rw-r--r-- | src/lua/lua_common.h | 5 | ||||
-rw-r--r-- | src/main.h | 6 | ||||
-rw-r--r-- | src/protocol.c | 10 | ||||
-rw-r--r-- | src/symbols_cache.c | 3 | ||||
-rw-r--r-- | test/rspamd_statfile_test.c | 11 | ||||
-rw-r--r-- | test/rspamd_test_suite.c | 2 | ||||
-rw-r--r-- | utils/expression_parser.c | 2 |
18 files changed, 372 insertions, 316 deletions
diff --git a/conf/rspamd.conf.sample b/conf/rspamd.conf.sample index 926f9901e..42659259a 100644 --- a/conf/rspamd.conf.sample +++ b/conf/rspamd.conf.sample @@ -306,3 +306,12 @@ view { # Symbols to check, can also be list of files or regexp: symbols = "/^[A-Z]{2}_SURBL_MULTI$/i"; }; + +# Settings files +settings { + # json data for user's settings + #user_settings = "file:///some/json/file"; + + # json data for domain's settings + #domain_settings = "file:///some/other/json/file"; +}; diff --git a/rspamd.conf.sample b/rspamd.conf.sample index 507878431..da67662a1 100644 --- a/rspamd.conf.sample +++ b/rspamd.conf.sample @@ -39,12 +39,52 @@ worker { password = "q1"; }; +# Settings for fuzzy storage interface +worker { + type = "fuzzy"; + + # Bind socket for control interface + bind_socket = localhost:11335; + + count = 1; + # Path to filesystem storage + hashfile = "/tmp/fuzzy.db"; +}; + +# Options for lmtp worker +#worker { + #type = "lmtp"; + # Bind socket for lmtp interface + #bind_socket = localhost:11335; + # Metric that is considered as main. If we have spam result on + # this metric, lmtp delivery would be failed + #metric = "default"; + # Number of lmtp workers + #count = 1; +#}; + +#worker { + #type = "delivery"; + # Path to delivery agent, %f is expanded as mail from address and %r + # is expanded as recipient address + # Expample: agent = "/usr/local/bin/procmail -f %f -d %r" + #agent = "/dev/null"; + # Bind socket for lmtp interface + # Example: bind_socket = localhost:25 + + # Whether we should use lmtp for MTA delivery + #lmtp = no; +#}; + + # Sample metric definition metric { # Name of metric name = "testmetric"; # Score to count message as spam by this metric required_score = 10.1; + # Symbols cache path for optimal checks planning + cache_file = "/tmp/symbols.cache"; }; # Logging settings @@ -64,27 +104,36 @@ logging { # Default: 100M statfile_pool_size = 40M; - -# Sample statfile definition -#statfile { - # Alias is used for learning and is used as symbol - #alias = "test.spam"; - # Pattern is path to file, can include %r - recipient name and %f - mail from value - #pattern = "./test.spam"; - # Weight in spam/ham classifier - #weight = 1.0; - # Size of this statfile class - #size = 10M; - # Tokenizer for this statfile - # Deafault: osb-text - #tokenizer = "osb-text"; -#}; -#statfile { - #alias = "test.ham"; - #pattern = "./test.ham"; - #weight = -2.0; - #size = 10M; -#}; +# Classifier definition +classifier { + # Type of classfier + type = "winnow"; + # Tokenizer used + tokenizer = "osb-text"; + # Sample statfile definition + statfile { + # Alias is used for learning and is used as symbol + symbol = "WINNOW_SPAM"; + # Pattern is path to file, can include %r - recipient name and %f - mail from value + path = "/tmp/test.spam"; + # Size of this statfile class + size = 10M; + # Tokenizer for this statfile + # Deafault: osb-text + #tokenizer = "osb-text"; + autolearn { + min_mark = 10.0; + }; + }; + statfile { + symbol = "WINNOW_HAM"; + path = "/tmp/test.ham"; + size = 10M; + autolearn { + max_mark = 0.1; + }; + }; +}; # Factors coefficients factors { @@ -159,30 +208,7 @@ factors { "R_MIXED_CHARSET" = 5; "R_BAD_EMAIL" = 10.5; }; -# Options for lmtp worker -#worker { - #type = "lmtp"; - # Bind socket for lmtp interface - #bind_socket = localhost:11335; - # Metric that is considered as main. If we have spam result on - # this metric, lmtp delivery would be failed - #metric = "default"; - # Number of lmtp workers - #count = 1; -#}; -#worker { - #type = "delivery"; - # Path to delivery agent, %f is expanded as mail from address and %r - # is expanded as recipient address - # Expample: agent = "/usr/local/bin/procmail -f %f -d %r" - #agent = "/dev/null"; - # Bind socket for lmtp interface - # Example: bind_socket = localhost:25 - - # Whether we should use lmtp for MTA delivery - #lmtp = no; -#}; # SURBL module params, note that single quotes are mandatory here .module 'surbl' { @@ -285,6 +311,14 @@ factors { #blacklist = "file:///some/path/emails.lst"; }; +# Module for fuzzy checksum loading +.module 'fuzzy_check' { + metric = "default"; + symbol = "R_FUZZY"; + # List of fuzzy storage servers, separated by ',' or ';' or simple by spaces + servers = "localhost:11335"; +}; + # If enables threat each regexp as raw regex and do not try to convert # each text part to utf8 encoding. Save a lot of resources but less # portable. @@ -315,3 +349,19 @@ settings { # json data for domain's settings #domain_settings = "file:///some/other/json/file"; }; + +# Example of json config: +# [ +# { +# "name": "cebka@test.ru", +# "metrics": +# { +# "default": 5.5 +# }, +# "factors": +# { +# "R_FUZZY": 10.1 +# }, +# "want_spam": false +# } +# ] diff --git a/src/cfg_file.h b/src/cfg_file.h index 3e932fc10..527c3f7c6 100644 --- a/src/cfg_file.h +++ b/src/cfg_file.h @@ -130,17 +130,25 @@ struct statfile_autolearn_params { * Statfile config definition */ struct statfile { - char *alias; /**< alias of statfile */ - char *pattern; /**< filesystem pattern (with %r or %f) */ - double weight; /**< weight scale */ - char *metric; /**< metric name */ + char *symbol; /**< symbol of statfile */ + char *path; /**< filesystem pattern (with %r or %f) */ size_t size; /**< size of statfile */ - struct tokenizer *tokenizer; /**< tokenizer used for statfile */ GList *sections; /**< list of sections in statfile */ struct statfile_autolearn_params *autolearn; /**< autolearn params */ }; /** + * Classifier config definition + */ +struct classifier_config { + GList *statfiles; /**< statfiles list */ + char *metric; /**< metric of this classifier */ + struct classifier *classifier; /**< classifier interface */ + struct tokenizer *tokenizer; /**< tokenizer used for classifier */ + GHashTable *opts; /**< other options */ +}; + +/** * Config option for importing to script module */ struct config_scalar { @@ -223,7 +231,8 @@ struct config_file { GHashTable* factors; /**< hash of factors indexed by symbol name */ GHashTable* c_modules; /**< hash of c modules indexed by module name */ GHashTable* composite_symbols; /**< hash of composite symbols indexed by its name */ - GHashTable* statfiles; /**< hash of defined statfiles indexed by alias */ + GList *classifiers; /**< list of all classifiers defined */ + GHashTable *classifiers_symbols; /**< hashtable indexed by symbol name of classifiers */ GHashTable* cfg_params; /**< all cfg params indexed by its name in this structure */ int clock_res; /**< resolution of clock used */ GList *views; /**< views */ @@ -314,7 +323,7 @@ void post_load_config (struct config_file *cfg); void unescape_quotes (char *line); GList* parse_comma_list (memory_pool_t *pool, char *line); - +struct classifier_config* check_classifier_cfg (struct config_file *cfg, struct classifier_config *c); int yylex (void); int yyparse (void); diff --git a/src/cfg_file.l b/src/cfg_file.l index 64c113a71..5355a7c57 100644 --- a/src/cfg_file.l +++ b/src/cfg_file.l @@ -2,6 +2,7 @@ %x module %x lua %x worker +%x classifier %{ @@ -21,6 +22,7 @@ extern void add_luabuf (const char *line); YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH]; int line_stack[MAX_INCLUDE_DEPTH]; int include_stack_ptr = 0; +int nested_depth = 0; extern struct config_file *cfg; %} @@ -74,17 +76,7 @@ enabled return ENABLED; delivery return DELIVERY; agent return AGENT; -statfile return STATFILE; -alias return ALIAS; -pattern return PATTERN; -weight return WEIGHT; -size return SIZE; -tokenizer return TOKENIZER; -classifier return CLASSIFIER; -section return SECTION; -autolearn return AUTOLEARN; -min_mark return MIN_MARK; -max_mark return MAX_MARK; +classifier BEGIN(classifier); return CLASSIFIER; logging return LOGGING; @@ -167,8 +159,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG; <module>[ \t]+ /* ignore whitespace */; <module>[ \t]*#.* /* ignore comments */; <module>\'[a-zA-Z0-9_-]+\' yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; return MODULE_OPT; -<module>\{ return OBRACE; -<module>\} BEGIN(INITIAL); return EBRACE; +<module>\{ nested_depth ++; return OBRACE; +<module>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE; <module>\; return SEMICOLON; <module>= return EQSIGN; <module>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE; @@ -178,8 +170,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG; <worker>\n /* ignore EOL */; <worker>[ \t]+ /* ignore whitespace */; <worker>[ \t]*#.* /* ignore comments */; -<worker>\{ return OBRACE; -<worker>\} BEGIN(INITIAL); return EBRACE; +<worker>\{ nested_depth ++; return OBRACE; +<worker>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE; <worker>\; return SEMICOLON; <worker>= return EQSIGN; <worker>type return TYPE; @@ -193,6 +185,32 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG; <worker>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE; <worker>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING; +<classifier>\n /* ignore EOL */; +<classifier>[ \t]+ /* ignore whitespace */; +<classifier>[ \t]*#.* /* ignore comments */; +<classifier>\{ nested_depth ++; return OBRACE; +<classifier>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE; +<classifier>\; return SEMICOLON; +<classifier>= return EQSIGN; +<classifier>type return TYPE; +<classifier>bind_socket return BINDSOCK; +<classifier>count return COUNT; +<classifier>statfile return STATFILE; +<classifier>symbol return SYMBOL; +<classifier>path return PATH; +<classifier>size return SIZE; +<classifier>tokenizer return TOKENIZER; +<classifier>section return SECTION; +<classifier>autolearn return AUTOLEARN; +<classifier>min_mark return MIN_MARK; +<classifier>max_mark return MAX_MARK; +<classifier>[0-9]+ yylval.number=strtol(yytext, NULL, 10); return NUMBER; +<classifier>-?[0-9]+\.?[0-9]* yylval.fract=strtod(yytext, NULL); return FRACT; +<classifier>[0-9]+[kKmMgG]? yylval.limit=parse_limit(yytext); return SIZELIMIT; +<classifier>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE; +<classifier>[a-zA-Z0-9_%-]+ yylval.string=strdup(yytext); return PARAM; +<classifier>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING; + <lua>\n /* ignore EOL */; <lua>[ \t]+ /* ignore whitespace */; <lua>[ \t]*#.* /* ignore comments */; diff --git a/src/cfg_file.y b/src/cfg_file.y index 19584be92..84a0a1000 100644 --- a/src/cfg_file.y +++ b/src/cfg_file.y @@ -23,6 +23,7 @@ extern char *yytext; GList *cur_module_opt = NULL; struct metric *cur_metric = NULL; +struct classifier_config *cur_classifier = NULL; struct statfile *cur_statfile = NULL; struct statfile_section *cur_section = NULL; struct statfile_autolearn_params *cur_autolearn = NULL; @@ -58,7 +59,7 @@ struct rspamd_view *cur_view = NULL; %token DELIVERY LMTP ENABLED AGENT SECTION LUACODE RAW_MODE PROFILE_FILE COUNT %token VIEW IP FROM SYMBOLS %token AUTOLEARN MIN_MARK MAX_MARK -%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS +%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS SYMBOL PATH %type <string> STRING %type <string> VARIABLE @@ -93,7 +94,7 @@ command : | metric | composites | logging - | statfile + | classifier | statfile_pool_size | luacode | raw_mode @@ -660,20 +661,81 @@ loggingfile: } ; + +classifier: + CLASSIFIER OBRACE classifierbody EBRACE { + if (cur_classifier == NULL || cur_classifier->classifier == NULL) { + yyerror ("yyparse: invalid classifier definition"); + YYERROR; + } + if (cur_classifier->metric == NULL) { + cur_classifier->metric = DEFAULT_METRIC; + } + if (cur_classifier->tokenizer == NULL) { + cur_classifier->tokenizer = get_tokenizer ("osb-text"); + } + + cfg->classifiers = g_list_prepend (cfg->classifiers, cur_classifier); + cur_classifier = NULL; + } + ; + +classifierbody: + | classifiercmd SEMICOLON + | classifierbody classifiercmd SEMICOLON + ; + +classifiercmd: + | statfile + | classifiertype + | classifiermetric + | classifiertokenizer + | classifieroption + ; + +classifiertype: + TYPE EQSIGN QUOTEDSTRING { + cur_classifier = check_classifier_cfg (cfg, cur_classifier); + if ((cur_classifier->classifier = get_classifier ($3)) == NULL) { + yyerror ("yyparse: unknown classifier type: %s", $3); + YYERROR; + } + } + ; +classifiertokenizer: + TOKENIZER EQSIGN QUOTEDSTRING { + cur_classifier = check_classifier_cfg (cfg, cur_classifier); + if ((cur_classifier->tokenizer = get_tokenizer ($3)) == NULL) { + yyerror ("yyparse: unknown tokenizer %s", $3); + YYERROR; + } + } + ; + +classifiermetric: + METRIC EQSIGN QUOTEDSTRING { + cur_classifier = check_classifier_cfg (cfg, cur_classifier); + cur_classifier->metric = $3; + memory_pool_add_destructor (cfg->cfg_pool, g_free, cur_classifier->metric); + } + ; + +classifieroption: + PARAM EQSIGN QUOTEDSTRING { + cur_classifier = check_classifier_cfg (cfg, cur_classifier); + g_hash_table_insert (cur_classifier->opts, $1, $3); + memory_pool_add_destructor (cfg->cfg_pool, g_free, $1); + memory_pool_add_destructor (cfg->cfg_pool, g_free, $3); + }; + statfile: STATFILE OBRACE statfilebody EBRACE { - if (cur_statfile == NULL || cur_statfile->alias == NULL || cur_statfile->pattern == NULL - || cur_statfile->weight == 0 || cur_statfile->size == 0) { + if (cur_statfile == NULL || cur_statfile->path == NULL || cur_statfile->size == 0) { yyerror ("yyparse: not enough arguments in statfile definition"); YYERROR; } - if (cur_statfile->metric == NULL) { - cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default"); - } - if (cur_statfile->tokenizer == NULL) { - cur_statfile->tokenizer = get_tokenizer ("osb-text"); - } - g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile); + cur_classifier = check_classifier_cfg (cfg, cur_classifier); + cur_classifier->statfiles = g_list_prepend (cur_classifier->statfiles, cur_statfile); cur_statfile = NULL; } ; @@ -684,48 +746,33 @@ statfilebody: ; statfilecmd: - | statfilealias - | statfilepattern - | statfileweight + | statfilesymbol + | statfilepath | statfilesize - | statfilemetric - | statfiletokenizer | statfilesection | statfileautolearn ; -statfilealias: - ALIAS EQSIGN QUOTEDSTRING { +statfilesymbol: + SYMBOL EQSIGN QUOTEDSTRING { + cur_classifier = check_classifier_cfg (cfg, cur_classifier); if (cur_statfile == NULL) { cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); } - cur_statfile->alias = memory_pool_strdup (cfg->cfg_pool, $3); + cur_statfile->symbol = memory_pool_strdup (cfg->cfg_pool, $3); + g_hash_table_insert (cfg->classifiers_symbols, $3, cur_classifier); } ; -statfilepattern: - PATTERN EQSIGN QUOTEDSTRING { +statfilepath: + PATH EQSIGN QUOTEDSTRING { if (cur_statfile == NULL) { cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); } - cur_statfile->pattern = memory_pool_strdup (cfg->cfg_pool, $3); + cur_statfile->path = memory_pool_strdup (cfg->cfg_pool, $3); } ; -statfileweight: - WEIGHT EQSIGN NUMBER { - if (cur_statfile == NULL) { - cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); - } - cur_statfile->weight = $3; - } - | WEIGHT EQSIGN FRACT { - if (cur_statfile == NULL) { - cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); - } - cur_statfile->weight = $3; - } - ; statfilesize: SIZE EQSIGN NUMBER { @@ -742,26 +789,7 @@ statfilesize: } ; -statfilemetric: - METRIC EQSIGN QUOTEDSTRING { - if (cur_statfile == NULL) { - cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); - } - cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3); - } - ; -statfiletokenizer: - TOKENIZER EQSIGN QUOTEDSTRING { - if (cur_statfile == NULL) { - cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); - } - if ((cur_statfile->tokenizer = get_tokenizer ($3)) == NULL) { - yyerror ("yyparse: unknown tokenizer %s", $3); - YYERROR; - } - } - ; statfilesection: SECTION OBRACE sectionbody EBRACE { diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 7d06e662c..0acd50be8 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -186,7 +186,7 @@ init_defaults (struct config_file *cfg) cfg->factors = g_hash_table_new (g_str_hash, g_str_equal); cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal); cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal); - cfg->statfiles = g_hash_table_new (g_str_hash, g_str_equal); + cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal); cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal); init_settings (cfg); @@ -207,10 +207,10 @@ free_config (struct config_file *cfg) g_hash_table_unref (cfg->c_modules); g_hash_table_remove_all (cfg->composite_symbols); g_hash_table_unref (cfg->composite_symbols); - g_hash_table_remove_all (cfg->statfiles); - g_hash_table_unref (cfg->statfiles); g_hash_table_remove_all (cfg->cfg_params); g_hash_table_unref (cfg->cfg_params); + g_hash_table_destroy (cfg->classifiers_symbols); + g_list_free (cfg->classifiers); g_list_free (cfg->metrics_list); memory_pool_delete (cfg->cfg_pool); } @@ -604,6 +604,20 @@ parse_comma_list (memory_pool_t *pool, char *line) return res; } +struct classifier_config * +check_classifier_cfg (struct config_file *cfg, struct classifier_config *c) +{ + if (c == NULL) { + c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config)); + } + if (c->opts == NULL) { + c->opts = g_hash_table_new (g_str_hash, g_str_equal); + memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, c->opts); + } + + return c; +} + /* * vi:ts=4 */ diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c index 283350972..482d111b0 100644 --- a/src/classifiers/classifiers.c +++ b/src/classifiers/classifiers.c @@ -35,7 +35,6 @@ struct classifier classifiers[] = { .init_func = winnow_init, .classify_func = winnow_classify, .learn_func = winnow_learn, - .result_file_func = winnow_result_file }, }; diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h index 13a295724..fcb251da1 100644 --- a/src/classifiers/classifiers.h +++ b/src/classifiers/classifiers.h @@ -6,29 +6,30 @@ #include "../statfile.h" #include "../tokenizers/tokenizers.h" +struct classifier_config; +struct worker_task; + struct classifier_ctx { memory_pool_t *pool; GHashTable *results; + struct classifier_config *cfg; }; /* Common classifier structure */ struct classifier { char *name; - struct classifier_ctx* (*init_func)(memory_pool_t *pool); - void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, - char *statfile, GTree *input, double scale); + struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf); + void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, - char *statfile, GTree *input, int in_class); - char* (*result_file_func)(struct classifier_ctx *ctx, double *probability); + char *symbol, GTree *input, gboolean in_class); }; /* Get classifier structure by name or return NULL if this name is not found */ struct classifier* get_classifier (char *name); /* Winnow algorithm */ -struct classifier_ctx* winnow_init (memory_pool_t *pool); -void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale); -void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class); -char* winnow_result_file (struct classifier_ctx* ctx, double *probability); +struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf); +void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); +void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *symbol, GTree *input, gboolean in_class); /* Array of all defined classifiers */ extern struct classifier classifiers[]; diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index edd929af0..88298faf4 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -27,6 +27,9 @@ */ #include "classifiers.h" +#include "../main.h" +#include "../filter.h" +#include "../cfg_file.h" #define WINNOW_PROMOTION 1.23 #define WINNOW_DEMOTION 0.83 @@ -85,21 +88,23 @@ learn_callback (gpointer key, gpointer value, gpointer data) } struct classifier_ctx* -winnow_init (memory_pool_t *pool) +winnow_init (memory_pool_t *pool, struct classifier_config *cfg) { struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx)); ctx->pool = pool; - ctx->results = g_hash_table_new (g_str_hash, g_str_equal); - memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results); + ctx->cfg = cfg; return ctx; } void -winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale) +winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task) { struct winnow_callback_data data; double *res = memory_pool_alloc (ctx->pool, sizeof (double)); + double max = 0; + GList *cur; + struct statfile *st, *sel = NULL; g_assert (pool != NULL); g_assert (ctx != NULL); @@ -109,29 +114,44 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfi data.count = 0; data.now = time (NULL); data.ctx = ctx; - - if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) { - if ((data.file = statfile_pool_open (pool, statfile)) == NULL) { - return; + + cur = ctx->cfg->statfiles; + while (cur) { + st = cur->data; + if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { + msg_warn ("winnow_classify: cannot open %s, skip it", st->path); + cur = g_list_next (cur); + continue; + } } - } - g_tree_foreach (input, classify_callback, &data); + g_tree_foreach (input, classify_callback, &data); - if (data.count != 0) { - *res = scale * (data.sum / data.count); + if (data.count != 0) { + *res = (data.sum / data.count); + } + else { + *res = 0; + } + if (*res > max) { + max = *res; + sel = st; + } + cur = g_list_next (cur); } - else { - *res = 0; + + if (sel != NULL) { + insert_result (task, ctx->cfg->metric, sel->symbol, 1, NULL); } - - g_hash_table_insert (ctx->results, statfile, res); } void -winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class) +winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class) { struct winnow_callback_data data; + GList *cur; + struct statfile *st; g_assert (pool != NULL); g_assert (ctx != NULL); @@ -142,50 +162,29 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, data.in_class = in_class; data.now = time (NULL); data.ctx = ctx; - - if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) { - if ((data.file = statfile_pool_open (pool, statfile)) == NULL) { - return; + + cur = g_list_first (ctx->cfg->statfiles); + while (cur) { + st = cur->data; + if (strcmp (symbol, st->symbol) == 0) { + if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { + /* Try to create statfile */ + if (statfile_pool_create (pool, + st->path, st->size / sizeof (struct stat_file_block)) == -1) { + msg_err ("winnow_learn: cannot create statfile %s", st->path); + return; + } + if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { + msg_err ("winnow_learn: cannot create statfile %s", st->path); + return; + } + } + break; } + cur = g_list_next (cur); } statfile_pool_lock_file (pool, data.file); g_tree_foreach (input, learn_callback, &data); statfile_pool_unlock_file (pool, data.file); } - -struct winnow_result_data { - char *filename; - double max_score; - double sum; -}; - -static void -result_file_callback (gpointer key, gpointer value, gpointer data) -{ - struct winnow_result_data *d = (struct winnow_result_data *)data; - double w = *((double *)value); - - if (fabs (w) > fabs (d->max_score)) { - d->filename = (char *)key; - d->max_score = w; - } - d->sum += fabs (w); -} - -char* -winnow_result_file (struct classifier_ctx* ctx, double *probability) -{ - struct winnow_result_data data = { NULL, 0, 0 }; - g_assert (ctx != NULL); - - g_hash_table_foreach (ctx->results, result_file_callback, &data); - if (data.sum != 0) { - *probability = data.max_score / data.sum; - } - else { - *probability = 1; - } - - return data.filename; -} diff --git a/src/controller.c b/src/controller.c index 4196e16f0..0aaa8bd99 100644 --- a/src/controller.c +++ b/src/controller.c @@ -181,9 +181,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control int r = 0, days, hours, minutes; time_t uptime; unsigned long size = 0; - struct statfile *statfile; - stat_file_t *file; - struct metric *metric; + struct classifier_config *cl; memory_pool_stat_t mem_st; char *password = g_hash_table_lookup (session->worker->cf->params, "password"); @@ -311,26 +309,16 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control return; } - statfile = g_hash_table_lookup (session->cfg->statfiles, *cmd_args); - if (statfile == NULL) { + session->learn_symbol = *cmd_args; + cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args); + if (cl == NULL) { r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args); rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE); return; } + session->learn_classifier = cl; - metric = g_hash_table_lookup (session->cfg->metrics, statfile->metric); - - session->learn_rcpt = NULL; - session->learn_from = NULL; - session->learn_filename = NULL; - session->learn_tokenizer = statfile->tokenizer; - if (metric != NULL) { - session->learn_classifier = metric->classifier; - } - else { - session->learn_classifier = get_classifier ("winnow"); - } /* By default learn positive */ session->in_class = 1; /* Get all arguments */ @@ -366,22 +354,6 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control } } } - session->learn_filename = resolve_stat_filename (session->session_pool, statfile->pattern, - session->learn_rcpt, session->learn_from); - if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) { - /* Try to create statfile */ - if (statfile_pool_create (session->worker->srv->statfile_pool, - session->learn_filename, statfile->size / sizeof (struct stat_file_block)) == -1) { - r = snprintf (out_buf, sizeof (out_buf), "cannot create statfile %s" CRLF, session->learn_filename); - rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE); - return; - } - if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) { - r = snprintf (out_buf, sizeof (out_buf), "cannot open statfile %s" CRLF, session->learn_filename); - rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE); - return; - } - } rspamd_set_dispatcher_policy (session->dispatcher, BUFFER_CHARACTER, size); session->state = STATE_LEARN; } @@ -479,7 +451,7 @@ controller_read_socket (f_str_t *in, void *arg) while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) { c.begin = content->data; c.len = content->len; - if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer, + if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, session->session_pool, &c, &tokens)) { i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF); rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE); @@ -487,9 +459,9 @@ controller_read_socket (f_str_t *in, void *arg) return; } } - cls_ctx = session->learn_classifier->init_func (session->session_pool); - session->learn_classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool, - session->learn_filename, tokens, session->in_class); + cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier); + session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool, + session->learn_symbol, tokens, session->in_class); session->worker->srv->stat->messages_learned ++; i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF); rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE); diff --git a/src/filter.c b/src/filter.c index 34e487192..c9453dc61 100644 --- a/src/filter.c +++ b/src/filter.c @@ -444,7 +444,7 @@ check_autolearn (struct statfile_autolearn_params *params, struct worker_task *t return FALSE; } -static void +void process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens, struct classifier *classifier, char *filename, struct classifier_ctx* ctx) { @@ -464,7 +464,7 @@ process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens, } } - classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, 1); + classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, TRUE); } } } @@ -488,48 +488,27 @@ make_composites (struct worker_task *task) g_hash_table_foreach (task->results, composites_metric_callback, task); } -struct statfile_result_data { - struct metric *metric; - struct classifier_ctx *ctx; -}; struct statfile_callback_data { GHashTable *tokens; - GHashTable *classifiers; struct worker_task *task; }; static void -statfiles_callback (gpointer key, gpointer value, void *arg) +classifiers_callback (gpointer value, void *arg) { struct statfile_callback_data *data= (struct statfile_callback_data *)arg; struct worker_task *task = data->task; - struct statfile *st = (struct statfile *)value; - struct classifier *classifier; - struct statfile_result_data *res_data; - struct metric *metric; + struct classifier_config *cl = value; + struct classifier_ctx *ctx; struct mime_text_part *text_part; - + struct statfile *st; GTree *tokens = NULL; GList *cur; - - char *filename; f_str_t c; - if (g_list_length (task->rcpt) == 1) { - filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data); - } - else { - /* XXX: handle multiply recipients correctly */ - filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, ""); - } - - if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL && !check_autolearn (st->autolearn, task)) { - return; - } - cur = g_list_first (task->text_parts); - if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) { + if ((tokens = g_hash_table_lookup (data->tokens, cl->tokenizer)) == NULL) { while (cur != NULL) { text_part = (struct mime_text_part *)cur->data; if (text_part->is_empty) { @@ -539,52 +518,32 @@ statfiles_callback (gpointer key, gpointer value, void *arg) c.begin = text_part->content->data; c.len = text_part->content->len; /* Tree would be freed at task pool freeing */ - if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens)) { msg_info ("statfiles_callback: cannot tokenize input"); return; } cur = g_list_next (cur); } - g_hash_table_insert (data->tokens, st->tokenizer, tokens); + g_hash_table_insert (data->tokens, cl->tokenizer, tokens); } - metric = g_hash_table_lookup (task->cfg->metrics, st->metric); - if (metric == NULL) { - classifier = get_classifier ("winnow"); - } - else { - classifier = metric->classifier; - } - if ((res_data = g_hash_table_lookup (data->classifiers, classifier)) == NULL) { - res_data = memory_pool_alloc (task->task_pool, sizeof (struct statfile_result_data)); - res_data->ctx = classifier->init_func (task->task_pool); - res_data->metric = metric; - g_hash_table_insert (data->classifiers, classifier, res_data); - } + ctx = cl->classifier->init_func (task->task_pool, cl); + cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task); - classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight); - - if (st->autolearn) { - /* Process autolearn */ - process_autolearn (st, task, tokens, classifier, filename, res_data->ctx); + /* Autolearning */ + cur = g_list_first (cl->statfiles); + while (cur) { + st = cur->data; + if (st->autolearn) { + if (check_autolearn (st->autolearn, task)) { + /* Process autolearn */ + process_autolearn (st, task, tokens, cl->classifier, st->path, ctx); + } + } + cur = g_list_next (cur); } } -static void -statfiles_results_callback (gpointer key, gpointer value, void *arg) -{ - struct worker_task *task = (struct worker_task *)arg; - struct statfile_result_data *res = (struct statfile_result_data *)value; - struct classifier *classifier = (struct classifier *)key; - double *w; - char *filename; - - w = memory_pool_alloc (task->task_pool, sizeof (double)); - filename = classifier->result_file_func (res->ctx, w); - insert_result (task, res->metric->name, classifier->name, *w, NULL); - msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name); -} - void process_statfiles (struct worker_task *task) @@ -593,16 +552,11 @@ process_statfiles (struct worker_task *task) cd.task = task; cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal); - cd.classifiers = g_hash_table_new (g_str_hash, g_str_equal); - g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd); - g_hash_table_foreach (cd.classifiers, statfiles_results_callback, task); - + g_list_foreach (task->cfg->classifiers, classifiers_callback, &cd); g_hash_table_destroy (cd.tokens); - g_hash_table_destroy (cd.classifiers); - /* Process results */ - g_hash_table_foreach (task->results, metric_process_callback_forced, task); + /* Process results */ task->state = WRITE_REPLY; } diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h index ff475c0a1..ab06166b3 100644 --- a/src/lua/lua_common.h +++ b/src/lua/lua_common.h @@ -2,6 +2,8 @@ #define RSPAMD_LUA_H #include "../config.h" +#ifdef WITH_LUA + #include "../main.h" #include "../cfg_file.h" #include <lua.h> @@ -30,4 +32,5 @@ int lua_call_chain_filter (const char *function, struct worker_task *task, int * double lua_consolidation_func (struct worker_task *task, const char *metric_name, const char *function_name); void add_luabuf (const char *line); -#endif +#endif /* WITH_LUA */ +#endif /* RSPAMD_LUA_H */ diff --git a/src/main.h b/src/main.h index bfc78e888..4c1ab8617 100644 --- a/src/main.h +++ b/src/main.h @@ -71,6 +71,7 @@ struct pidfh; struct config_file; struct tokenizer; struct classifier; +struct classifier_config; struct mime_part; struct rspamd_view; @@ -140,9 +141,8 @@ struct controller_session { struct config_file *cfg; /**< pointer to config file */ char *learn_rcpt; /**< recipient for learning */ char *learn_from; /**< from address for learning */ - struct tokenizer *learn_tokenizer; /**< tokenizer for learning */ - struct classifier *learn_classifier; /**< classifier for learning */ - char *learn_filename; /**< real filename for learning */ + struct classifier_config *learn_classifier; + char *learn_symbol; /**< symbol to train */ rspamd_io_dispatcher_t *dispatcher; /**< IO dispatcher object */ f_str_t *learn_buf; /**< learn input */ GList *parts; /**< extracted mime parts */ diff --git a/src/protocol.c b/src/protocol.c index 176160381..cd5f32424 100644 --- a/src/protocol.c +++ b/src/protocol.c @@ -313,15 +313,7 @@ parse_header (struct worker_task *task, f_str_t *line) task->rcpt = g_list_prepend (task->rcpt, tmp); msg_debug ("parse_header: read rcpt header, value: %s", tmp); } - else { - msg_info ("parse_header: wrong header: %s", headern); - return -1; - } - break; - case 'n': - case 'N': - /* nrcpt */ - if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) { + else if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) { tmp = memory_pool_fstrdup (task->task_pool, line); task->nrcpt = strtoul (tmp, &err, 10); msg_debug ("parse_header: read rcpt header, value: %d", (int)task->nrcpt); diff --git a/src/symbols_cache.c b/src/symbols_cache.c index 1d5d38d4d..cc7e8a1b1 100644 --- a/src/symbols_cache.c +++ b/src/symbols_cache.c @@ -360,6 +360,9 @@ call_symbol_callback (struct worker_task *task, struct symbols_cache *cache, str item = &cache->items[0]; } else { + if (cache == NULL) { + return FALSE; + } /* Next pointer */ if (*saved_item - cache->items >= cache->used_items - 1) { /* No more items in cache */ diff --git a/test/rspamd_statfile_test.c b/test/rspamd_statfile_test.c index 19a6cf7ab..282d4dc1c 100644 --- a/test/rspamd_statfile_test.c +++ b/test/rspamd_statfile_test.c @@ -25,6 +25,7 @@ void rspamd_statfile_test_func () { statfile_pool_t *pool; + stat_file_t *st; uint32_t random_hashes[HASHES_NUM], i, v; time_t now; @@ -40,17 +41,17 @@ rspamd_statfile_test_func () /* Create new file */ g_assert (statfile_pool_create (pool, TEST_FILENAME, 65535) != -1); - g_assert (statfile_pool_open (pool, TEST_FILENAME) != -1); + g_assert ((st = statfile_pool_open (pool, TEST_FILENAME)) != NULL); /* Get and set random blocks */ - statfile_pool_lock_file (pool, TEST_FILENAME); + statfile_pool_lock_file (pool, st); for (i = 0; i < HASHES_NUM; i ++) { - statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0); + statfile_pool_set_block (pool, st, random_hashes[i], random_hashes[i], now, 1.0); } - statfile_pool_unlock_file (pool, TEST_FILENAME); + statfile_pool_unlock_file (pool, st); for (i = 0; i < HASHES_NUM; i ++) { - v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now); + v = statfile_pool_get_block (pool, st, random_hashes[i], random_hashes[i], now); g_assert(v == 1.0); } diff --git a/test/rspamd_test_suite.c b/test/rspamd_test_suite.c index 0f4768e73..24d8e0289 100644 --- a/test/rspamd_test_suite.c +++ b/test/rspamd_test_suite.c @@ -30,4 +30,6 @@ main (int argc, char **argv) g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func); g_test_run (); + + return 0; } diff --git a/utils/expression_parser.c b/utils/expression_parser.c index 4f37ec923..38b52934f 100644 --- a/utils/expression_parser.c +++ b/utils/expression_parser.c @@ -49,4 +49,6 @@ main (int argc, char **argv) } memory_pool_delete (pool); + + return 0; } |