aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2009-09-14 19:11:19 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2009-09-14 19:11:19 +0400
commita0f41f7c5712e73e8aa521f2064bc53be3315d0a (patch)
tree147e4d8956a5a3b85e0ecc15b9fcbe29742e4e5c
parenta90c7d7a12561845e3371efc6803b1ecf6ad7d89 (diff)
downloadrspamd-a0f41f7c5712e73e8aa521f2064bc53be3315d0a.tar.gz
rspamd-a0f41f7c5712e73e8aa521f2064bc53be3315d0a.zip
* New system of classifiers interface and statfiles processing
* Fix sample config * Fix compile warnings * Fix building without lua support * Fix bugs with nrcpt header parsing and symbols cache loading (by Anton Nekhoroshikh)
-rw-r--r--conf/rspamd.conf.sample9
-rw-r--r--rspamd.conf.sample138
-rw-r--r--src/cfg_file.h23
-rw-r--r--src/cfg_file.l48
-rw-r--r--src/cfg_file.y138
-rw-r--r--src/cfg_utils.c20
-rw-r--r--src/classifiers/classifiers.c1
-rw-r--r--src/classifiers/classifiers.h19
-rw-r--r--src/classifiers/winnow.c113
-rw-r--r--src/controller.c46
-rw-r--r--src/filter.c94
-rw-r--r--src/lua/lua_common.h5
-rw-r--r--src/main.h6
-rw-r--r--src/protocol.c10
-rw-r--r--src/symbols_cache.c3
-rw-r--r--test/rspamd_statfile_test.c11
-rw-r--r--test/rspamd_test_suite.c2
-rw-r--r--utils/expression_parser.c2
18 files changed, 372 insertions, 316 deletions
diff --git a/conf/rspamd.conf.sample b/conf/rspamd.conf.sample
index 926f9901e..42659259a 100644
--- a/conf/rspamd.conf.sample
+++ b/conf/rspamd.conf.sample
@@ -306,3 +306,12 @@ view {
# Symbols to check, can also be list of files or regexp:
symbols = "/^[A-Z]{2}_SURBL_MULTI$/i";
};
+
+# Settings files
+settings {
+ # json data for user's settings
+ #user_settings = "file:///some/json/file";
+
+ # json data for domain's settings
+ #domain_settings = "file:///some/other/json/file";
+};
diff --git a/rspamd.conf.sample b/rspamd.conf.sample
index 507878431..da67662a1 100644
--- a/rspamd.conf.sample
+++ b/rspamd.conf.sample
@@ -39,12 +39,52 @@ worker {
password = "q1";
};
+# Settings for fuzzy storage interface
+worker {
+ type = "fuzzy";
+
+ # Bind socket for control interface
+ bind_socket = localhost:11335;
+
+ count = 1;
+ # Path to filesystem storage
+ hashfile = "/tmp/fuzzy.db";
+};
+
+# Options for lmtp worker
+#worker {
+ #type = "lmtp";
+ # Bind socket for lmtp interface
+ #bind_socket = localhost:11335;
+ # Metric that is considered the main one. If this metric yields a spam
+ # result, lmtp delivery fails
+ #metric = "default";
+ # Number of lmtp workers
+ #count = 1;
+#};
+
+#worker {
+ #type = "delivery";
+ # Path to delivery agent, %f is expanded as mail from address and %r
+ # is expanded as recipient address
+ # Example: agent = "/usr/local/bin/procmail -f %f -d %r"
+ #agent = "/dev/null";
+ # Bind socket for lmtp interface
+ # Example: bind_socket = localhost:25
+
+ # Whether we should use lmtp for MTA delivery
+ #lmtp = no;
+#};
+
+
# Sample metric definition
metric {
# Name of metric
name = "testmetric";
# Score to count message as spam by this metric
required_score = 10.1;
+ # Symbols cache path for optimal checks planning
+ cache_file = "/tmp/symbols.cache";
};
# Logging settings
@@ -64,27 +104,36 @@ logging {
# Default: 100M
statfile_pool_size = 40M;
-
-# Sample statfile definition
-#statfile {
- # Alias is used for learning and is used as symbol
- #alias = "test.spam";
- # Pattern is path to file, can include %r - recipient name and %f - mail from value
- #pattern = "./test.spam";
- # Weight in spam/ham classifier
- #weight = 1.0;
- # Size of this statfile class
- #size = 10M;
- # Tokenizer for this statfile
- # Deafault: osb-text
- #tokenizer = "osb-text";
-#};
-#statfile {
- #alias = "test.ham";
- #pattern = "./test.ham";
- #weight = -2.0;
- #size = 10M;
-#};
+# Classifier definition
+classifier {
+ # Type of classifier
+ type = "winnow";
+ # Tokenizer used
+ tokenizer = "osb-text";
+ # Sample statfile definition
+ statfile {
+ # Symbol names this statfile for learning and is inserted as the result symbol
+ symbol = "WINNOW_SPAM";
+ # Pattern is path to file, can include %r - recipient name and %f - mail from value
+ path = "/tmp/test.spam";
+ # Size of this statfile class
+ size = 10M;
+ # Tokenizer for this statfile
+ # Default: osb-text
+ #tokenizer = "osb-text";
+ autolearn {
+ min_mark = 10.0;
+ };
+ };
+ statfile {
+ symbol = "WINNOW_HAM";
+ path = "/tmp/test.ham";
+ size = 10M;
+ autolearn {
+ max_mark = 0.1;
+ };
+ };
+};
# Factors coefficients
factors {
@@ -159,30 +208,7 @@ factors {
"R_MIXED_CHARSET" = 5;
"R_BAD_EMAIL" = 10.5;
};
-# Options for lmtp worker
-#worker {
- #type = "lmtp";
- # Bind socket for lmtp interface
- #bind_socket = localhost:11335;
- # Metric that is considered as main. If we have spam result on
- # this metric, lmtp delivery would be failed
- #metric = "default";
- # Number of lmtp workers
- #count = 1;
-#};
-#worker {
- #type = "delivery";
- # Path to delivery agent, %f is expanded as mail from address and %r
- # is expanded as recipient address
- # Expample: agent = "/usr/local/bin/procmail -f %f -d %r"
- #agent = "/dev/null";
- # Bind socket for lmtp interface
- # Example: bind_socket = localhost:25
-
- # Whether we should use lmtp for MTA delivery
- #lmtp = no;
-#};
# SURBL module params, note that single quotes are mandatory here
.module 'surbl' {
@@ -285,6 +311,14 @@ factors {
#blacklist = "file:///some/path/emails.lst";
};
+# Module for fuzzy checksum loading
+.module 'fuzzy_check' {
+ metric = "default";
+ symbol = "R_FUZZY";
+ # List of fuzzy storage servers, separated by ',' or ';' or simply by spaces
+ servers = "localhost:11335";
+};
+
# If enabled, treat each regexp as a raw regex and do not try to convert
# each text part to utf8 encoding. Saves a lot of resources but is less
# portable.
@@ -315,3 +349,19 @@ settings {
# json data for domain's settings
#domain_settings = "file:///some/other/json/file";
};
+
+# Example of json config:
+# [
+# {
+# "name": "cebka@test.ru",
+# "metrics":
+# {
+# "default": 5.5
+# },
+# "factors":
+# {
+# "R_FUZZY": 10.1
+# },
+# "want_spam": false
+# }
+# ]
diff --git a/src/cfg_file.h b/src/cfg_file.h
index 3e932fc10..527c3f7c6 100644
--- a/src/cfg_file.h
+++ b/src/cfg_file.h
@@ -130,17 +130,25 @@ struct statfile_autolearn_params {
* Statfile config definition
*/
struct statfile {
- char *alias; /**< alias of statfile */
- char *pattern; /**< filesystem pattern (with %r or %f) */
- double weight; /**< weight scale */
- char *metric; /**< metric name */
+ char *symbol; /**< symbol of statfile */
+ char *path; /**< filesystem pattern (with %r or %f) */
size_t size; /**< size of statfile */
- struct tokenizer *tokenizer; /**< tokenizer used for statfile */
GList *sections; /**< list of sections in statfile */
struct statfile_autolearn_params *autolearn; /**< autolearn params */
};
/**
+ * Classifier config definition
+ */
+struct classifier_config {
+ GList *statfiles; /**< statfiles list */
+ char *metric; /**< metric of this classifier */
+ struct classifier *classifier; /**< classifier interface */
+ struct tokenizer *tokenizer; /**< tokenizer used for classifier */
+ GHashTable *opts; /**< other options */
+};
+
+/**
* Config option for importing to script module
*/
struct config_scalar {
@@ -223,7 +231,8 @@ struct config_file {
GHashTable* factors; /**< hash of factors indexed by symbol name */
GHashTable* c_modules; /**< hash of c modules indexed by module name */
GHashTable* composite_symbols; /**< hash of composite symbols indexed by its name */
- GHashTable* statfiles; /**< hash of defined statfiles indexed by alias */
+ GList *classifiers; /**< list of all classifiers defined */
+ GHashTable *classifiers_symbols; /**< hashtable indexed by symbol name of classifiers */
GHashTable* cfg_params; /**< all cfg params indexed by its name in this structure */
int clock_res; /**< resolution of clock used */
GList *views; /**< views */
@@ -314,7 +323,7 @@ void post_load_config (struct config_file *cfg);
void unescape_quotes (char *line);
GList* parse_comma_list (memory_pool_t *pool, char *line);
-
+struct classifier_config* check_classifier_cfg (struct config_file *cfg, struct classifier_config *c);
int yylex (void);
int yyparse (void);
diff --git a/src/cfg_file.l b/src/cfg_file.l
index 64c113a71..5355a7c57 100644
--- a/src/cfg_file.l
+++ b/src/cfg_file.l
@@ -2,6 +2,7 @@
%x module
%x lua
%x worker
+%x classifier
%{
@@ -21,6 +22,7 @@ extern void add_luabuf (const char *line);
YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH];
int line_stack[MAX_INCLUDE_DEPTH];
int include_stack_ptr = 0;
+int nested_depth = 0;
extern struct config_file *cfg;
%}
@@ -74,17 +76,7 @@ enabled return ENABLED;
delivery return DELIVERY;
agent return AGENT;
-statfile return STATFILE;
-alias return ALIAS;
-pattern return PATTERN;
-weight return WEIGHT;
-size return SIZE;
-tokenizer return TOKENIZER;
-classifier return CLASSIFIER;
-section return SECTION;
-autolearn return AUTOLEARN;
-min_mark return MIN_MARK;
-max_mark return MAX_MARK;
+classifier BEGIN(classifier); return CLASSIFIER;
logging return LOGGING;
@@ -167,8 +159,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG;
<module>[ \t]+ /* ignore whitespace */;
<module>[ \t]*#.* /* ignore comments */;
<module>\'[a-zA-Z0-9_-]+\' yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; return MODULE_OPT;
-<module>\{ return OBRACE;
-<module>\} BEGIN(INITIAL); return EBRACE;
+<module>\{ nested_depth ++; return OBRACE;
+<module>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
<module>\; return SEMICOLON;
<module>= return EQSIGN;
<module>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
@@ -178,8 +170,8 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG;
<worker>\n /* ignore EOL */;
<worker>[ \t]+ /* ignore whitespace */;
<worker>[ \t]*#.* /* ignore comments */;
-<worker>\{ return OBRACE;
-<worker>\} BEGIN(INITIAL); return EBRACE;
+<worker>\{ nested_depth ++; return OBRACE;
+<worker>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
<worker>\; return SEMICOLON;
<worker>= return EQSIGN;
<worker>type return TYPE;
@@ -193,6 +185,32 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG;
<worker>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
<worker>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING;
+<classifier>\n /* ignore EOL */;
+<classifier>[ \t]+ /* ignore whitespace */;
+<classifier>[ \t]*#.* /* ignore comments */;
+<classifier>\{ nested_depth ++; return OBRACE;
+<classifier>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
+<classifier>\; return SEMICOLON;
+<classifier>= return EQSIGN;
+<classifier>type return TYPE;
+<classifier>bind_socket return BINDSOCK;
+<classifier>count return COUNT;
+<classifier>statfile return STATFILE;
+<classifier>symbol return SYMBOL;
+<classifier>path return PATH;
+<classifier>size return SIZE;
+<classifier>tokenizer return TOKENIZER;
+<classifier>section return SECTION;
+<classifier>autolearn return AUTOLEARN;
+<classifier>min_mark return MIN_MARK;
+<classifier>max_mark return MAX_MARK;
+<classifier>[0-9]+ yylval.number=strtol(yytext, NULL, 10); return NUMBER;
+<classifier>-?[0-9]+\.?[0-9]* yylval.fract=strtod(yytext, NULL); return FRACT;
+<classifier>[0-9]+[kKmMgG]? yylval.limit=parse_limit(yytext); return SIZELIMIT;
+<classifier>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
+<classifier>[a-zA-Z0-9_%-]+ yylval.string=strdup(yytext); return PARAM;
+<classifier>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING;
+
<lua>\n /* ignore EOL */;
<lua>[ \t]+ /* ignore whitespace */;
<lua>[ \t]*#.* /* ignore comments */;
diff --git a/src/cfg_file.y b/src/cfg_file.y
index 19584be92..84a0a1000 100644
--- a/src/cfg_file.y
+++ b/src/cfg_file.y
@@ -23,6 +23,7 @@ extern char *yytext;
GList *cur_module_opt = NULL;
struct metric *cur_metric = NULL;
+struct classifier_config *cur_classifier = NULL;
struct statfile *cur_statfile = NULL;
struct statfile_section *cur_section = NULL;
struct statfile_autolearn_params *cur_autolearn = NULL;
@@ -58,7 +59,7 @@ struct rspamd_view *cur_view = NULL;
%token DELIVERY LMTP ENABLED AGENT SECTION LUACODE RAW_MODE PROFILE_FILE COUNT
%token VIEW IP FROM SYMBOLS
%token AUTOLEARN MIN_MARK MAX_MARK
-%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS
+%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS SYMBOL PATH
%type <string> STRING
%type <string> VARIABLE
@@ -93,7 +94,7 @@ command :
| metric
| composites
| logging
- | statfile
+ | classifier
| statfile_pool_size
| luacode
| raw_mode
@@ -660,20 +661,81 @@ loggingfile:
}
;
+
+classifier:
+ CLASSIFIER OBRACE classifierbody EBRACE {
+ if (cur_classifier == NULL || cur_classifier->classifier == NULL) {
+ yyerror ("yyparse: invalid classifier definition");
+ YYERROR;
+ }
+ if (cur_classifier->metric == NULL) {
+ cur_classifier->metric = DEFAULT_METRIC;
+ }
+ if (cur_classifier->tokenizer == NULL) {
+ cur_classifier->tokenizer = get_tokenizer ("osb-text");
+ }
+
+ cfg->classifiers = g_list_prepend (cfg->classifiers, cur_classifier);
+ cur_classifier = NULL;
+ }
+ ;
+
+classifierbody:
+ | classifiercmd SEMICOLON
+ | classifierbody classifiercmd SEMICOLON
+ ;
+
+classifiercmd:
+ | statfile
+ | classifiertype
+ | classifiermetric
+ | classifiertokenizer
+ | classifieroption
+ ;
+
+classifiertype:
+ TYPE EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ if ((cur_classifier->classifier = get_classifier ($3)) == NULL) {
+ yyerror ("yyparse: unknown classifier type: %s", $3);
+ YYERROR;
+ }
+ }
+ ;
+classifiertokenizer:
+ TOKENIZER EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ if ((cur_classifier->tokenizer = get_tokenizer ($3)) == NULL) {
+ yyerror ("yyparse: unknown tokenizer %s", $3);
+ YYERROR;
+ }
+ }
+ ;
+
+classifiermetric:
+ METRIC EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ cur_classifier->metric = $3;
+ memory_pool_add_destructor (cfg->cfg_pool, g_free, cur_classifier->metric);
+ }
+ ;
+
+classifieroption:
+ PARAM EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ g_hash_table_insert (cur_classifier->opts, $1, $3);
+ memory_pool_add_destructor (cfg->cfg_pool, g_free, $1);
+ memory_pool_add_destructor (cfg->cfg_pool, g_free, $3);
+ };
+
statfile:
STATFILE OBRACE statfilebody EBRACE {
- if (cur_statfile == NULL || cur_statfile->alias == NULL || cur_statfile->pattern == NULL
- || cur_statfile->weight == 0 || cur_statfile->size == 0) {
+ if (cur_statfile == NULL || cur_statfile->path == NULL || cur_statfile->size == 0) {
yyerror ("yyparse: not enough arguments in statfile definition");
YYERROR;
}
- if (cur_statfile->metric == NULL) {
- cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default");
- }
- if (cur_statfile->tokenizer == NULL) {
- cur_statfile->tokenizer = get_tokenizer ("osb-text");
- }
- g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile);
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ cur_classifier->statfiles = g_list_prepend (cur_classifier->statfiles, cur_statfile);
cur_statfile = NULL;
}
;
@@ -684,48 +746,33 @@ statfilebody:
;
statfilecmd:
- | statfilealias
- | statfilepattern
- | statfileweight
+ | statfilesymbol
+ | statfilepath
| statfilesize
- | statfilemetric
- | statfiletokenizer
| statfilesection
| statfileautolearn
;
-statfilealias:
- ALIAS EQSIGN QUOTEDSTRING {
+statfilesymbol:
+ SYMBOL EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
- cur_statfile->alias = memory_pool_strdup (cfg->cfg_pool, $3);
+ cur_statfile->symbol = memory_pool_strdup (cfg->cfg_pool, $3);
+ g_hash_table_insert (cfg->classifiers_symbols, $3, cur_classifier);
}
;
-statfilepattern:
- PATTERN EQSIGN QUOTEDSTRING {
+statfilepath:
+ PATH EQSIGN QUOTEDSTRING {
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
- cur_statfile->pattern = memory_pool_strdup (cfg->cfg_pool, $3);
+ cur_statfile->path = memory_pool_strdup (cfg->cfg_pool, $3);
}
;
-statfileweight:
- WEIGHT EQSIGN NUMBER {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- cur_statfile->weight = $3;
- }
- | WEIGHT EQSIGN FRACT {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- cur_statfile->weight = $3;
- }
- ;
statfilesize:
SIZE EQSIGN NUMBER {
@@ -742,26 +789,7 @@ statfilesize:
}
;
-statfilemetric:
- METRIC EQSIGN QUOTEDSTRING {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3);
- }
- ;
-statfiletokenizer:
- TOKENIZER EQSIGN QUOTEDSTRING {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- if ((cur_statfile->tokenizer = get_tokenizer ($3)) == NULL) {
- yyerror ("yyparse: unknown tokenizer %s", $3);
- YYERROR;
- }
- }
- ;
statfilesection:
SECTION OBRACE sectionbody EBRACE {
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index 7d06e662c..0acd50be8 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -186,7 +186,7 @@ init_defaults (struct config_file *cfg)
cfg->factors = g_hash_table_new (g_str_hash, g_str_equal);
cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal);
cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->statfiles = g_hash_table_new (g_str_hash, g_str_equal);
+ cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal);
cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal);
init_settings (cfg);
@@ -207,10 +207,10 @@ free_config (struct config_file *cfg)
g_hash_table_unref (cfg->c_modules);
g_hash_table_remove_all (cfg->composite_symbols);
g_hash_table_unref (cfg->composite_symbols);
- g_hash_table_remove_all (cfg->statfiles);
- g_hash_table_unref (cfg->statfiles);
g_hash_table_remove_all (cfg->cfg_params);
g_hash_table_unref (cfg->cfg_params);
+ g_hash_table_destroy (cfg->classifiers_symbols);
+ g_list_free (cfg->classifiers);
g_list_free (cfg->metrics_list);
memory_pool_delete (cfg->cfg_pool);
}
@@ -604,6 +604,20 @@ parse_comma_list (memory_pool_t *pool, char *line)
return res;
}
+struct classifier_config *
+check_classifier_cfg (struct config_file *cfg, struct classifier_config *c)
+{
+ if (c == NULL) {
+ c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config));
+ }
+ if (c->opts == NULL) {
+ c->opts = g_hash_table_new (g_str_hash, g_str_equal);
+ memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, c->opts);
+ }
+
+ return c;
+}
+
/*
* vi:ts=4
*/
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
index 283350972..482d111b0 100644
--- a/src/classifiers/classifiers.c
+++ b/src/classifiers/classifiers.c
@@ -35,7 +35,6 @@ struct classifier classifiers[] = {
.init_func = winnow_init,
.classify_func = winnow_classify,
.learn_func = winnow_learn,
- .result_file_func = winnow_result_file
},
};
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
index 13a295724..fcb251da1 100644
--- a/src/classifiers/classifiers.h
+++ b/src/classifiers/classifiers.h
@@ -6,29 +6,30 @@
#include "../statfile.h"
#include "../tokenizers/tokenizers.h"
+struct classifier_config;
+struct worker_task;
+
struct classifier_ctx {
memory_pool_t *pool;
GHashTable *results;
+ struct classifier_config *cfg;
};
/* Common classifier structure */
struct classifier {
char *name;
- struct classifier_ctx* (*init_func)(memory_pool_t *pool);
- void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- char *statfile, GTree *input, double scale);
+ struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf);
+ void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- char *statfile, GTree *input, int in_class);
- char* (*result_file_func)(struct classifier_ctx *ctx, double *probability);
+ char *symbol, GTree *input, gboolean in_class);
};
/* Get classifier structure by name or return NULL if this name is not found */
struct classifier* get_classifier (char *name);
/* Winnow algorithm */
-struct classifier_ctx* winnow_init (memory_pool_t *pool);
-void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale);
-void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
-char* winnow_result_file (struct classifier_ctx* ctx, double *probability);
+struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf);
+void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
+void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *symbol, GTree *input, gboolean in_class);
/* Array of all defined classifiers */
extern struct classifier classifiers[];
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index edd929af0..88298faf4 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -27,6 +27,9 @@
*/
#include "classifiers.h"
+#include "../main.h"
+#include "../filter.h"
+#include "../cfg_file.h"
#define WINNOW_PROMOTION 1.23
#define WINNOW_DEMOTION 0.83
@@ -85,21 +88,23 @@ learn_callback (gpointer key, gpointer value, gpointer data)
}
struct classifier_ctx*
-winnow_init (memory_pool_t *pool)
+winnow_init (memory_pool_t *pool, struct classifier_config *cfg)
{
struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx));
ctx->pool = pool;
- ctx->results = g_hash_table_new (g_str_hash, g_str_equal);
- memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results);
+ ctx->cfg = cfg;
return ctx;
}
void
-winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale)
+winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task)
{
struct winnow_callback_data data;
double *res = memory_pool_alloc (ctx->pool, sizeof (double));
+ double max = 0;
+ GList *cur;
+ struct statfile *st, *sel = NULL;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -109,29 +114,44 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfi
data.count = 0;
data.now = time (NULL);
data.ctx = ctx;
-
- if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
- if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
- return;
+
+ cur = ctx->cfg->statfiles;
+ while (cur) {
+ st = cur->data;
+ if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ msg_warn ("winnow_classify: cannot open %s, skip it", st->path);
+ cur = g_list_next (cur);
+ continue;
+ }
}
- }
- g_tree_foreach (input, classify_callback, &data);
+ g_tree_foreach (input, classify_callback, &data);
- if (data.count != 0) {
- *res = scale * (data.sum / data.count);
+ if (data.count != 0) {
+ *res = (data.sum / data.count);
+ }
+ else {
+ *res = 0;
+ }
+ if (*res > max) {
+ max = *res;
+ sel = st;
+ }
+ cur = g_list_next (cur);
}
- else {
- *res = 0;
+
+ if (sel != NULL) {
+ insert_result (task, ctx->cfg->metric, sel->symbol, 1, NULL);
}
-
- g_hash_table_insert (ctx->results, statfile, res);
}
void
-winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class)
{
struct winnow_callback_data data;
+ GList *cur;
+ struct statfile *st;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -142,50 +162,29 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile,
data.in_class = in_class;
data.now = time (NULL);
data.ctx = ctx;
-
- if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
- if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
- return;
+
+ cur = g_list_first (ctx->cfg->statfiles);
+ while (cur) {
+ st = cur->data;
+ if (strcmp (symbol, st->symbol) == 0) {
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ /* Try to create statfile */
+ if (statfile_pool_create (pool,
+ st->path, st->size / sizeof (struct stat_file_block)) == -1) {
+ msg_err ("winnow_learn: cannot create statfile %s", st->path);
+ return;
+ }
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ msg_err ("winnow_learn: cannot create statfile %s", st->path);
+ return;
+ }
+ }
+ break;
}
+ cur = g_list_next (cur);
}
statfile_pool_lock_file (pool, data.file);
g_tree_foreach (input, learn_callback, &data);
statfile_pool_unlock_file (pool, data.file);
}
-
-struct winnow_result_data {
- char *filename;
- double max_score;
- double sum;
-};
-
-static void
-result_file_callback (gpointer key, gpointer value, gpointer data)
-{
- struct winnow_result_data *d = (struct winnow_result_data *)data;
- double w = *((double *)value);
-
- if (fabs (w) > fabs (d->max_score)) {
- d->filename = (char *)key;
- d->max_score = w;
- }
- d->sum += fabs (w);
-}
-
-char*
-winnow_result_file (struct classifier_ctx* ctx, double *probability)
-{
- struct winnow_result_data data = { NULL, 0, 0 };
- g_assert (ctx != NULL);
-
- g_hash_table_foreach (ctx->results, result_file_callback, &data);
- if (data.sum != 0) {
- *probability = data.max_score / data.sum;
- }
- else {
- *probability = 1;
- }
-
- return data.filename;
-}
diff --git a/src/controller.c b/src/controller.c
index 4196e16f0..0aaa8bd99 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -181,9 +181,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
int r = 0, days, hours, minutes;
time_t uptime;
unsigned long size = 0;
- struct statfile *statfile;
- stat_file_t *file;
- struct metric *metric;
+ struct classifier_config *cl;
memory_pool_stat_t mem_st;
char *password = g_hash_table_lookup (session->worker->cf->params, "password");
@@ -311,26 +309,16 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
return;
}
- statfile = g_hash_table_lookup (session->cfg->statfiles, *cmd_args);
- if (statfile == NULL) {
+ session->learn_symbol = *cmd_args;
+ cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args);
+ if (cl == NULL) {
r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args);
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
return;
}
+ session->learn_classifier = cl;
- metric = g_hash_table_lookup (session->cfg->metrics, statfile->metric);
-
- session->learn_rcpt = NULL;
- session->learn_from = NULL;
- session->learn_filename = NULL;
- session->learn_tokenizer = statfile->tokenizer;
- if (metric != NULL) {
- session->learn_classifier = metric->classifier;
- }
- else {
- session->learn_classifier = get_classifier ("winnow");
- }
/* By default learn positive */
session->in_class = 1;
/* Get all arguments */
@@ -366,22 +354,6 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
}
}
}
- session->learn_filename = resolve_stat_filename (session->session_pool, statfile->pattern,
- session->learn_rcpt, session->learn_from);
- if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) {
- /* Try to create statfile */
- if (statfile_pool_create (session->worker->srv->statfile_pool,
- session->learn_filename, statfile->size / sizeof (struct stat_file_block)) == -1) {
- r = snprintf (out_buf, sizeof (out_buf), "cannot create statfile %s" CRLF, session->learn_filename);
- rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
- return;
- }
- if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) {
- r = snprintf (out_buf, sizeof (out_buf), "cannot open statfile %s" CRLF, session->learn_filename);
- rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
- return;
- }
- }
rspamd_set_dispatcher_policy (session->dispatcher, BUFFER_CHARACTER, size);
session->state = STATE_LEARN;
}
@@ -479,7 +451,7 @@ controller_read_socket (f_str_t *in, void *arg)
while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
c.begin = content->data;
c.len = content->len;
- if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer,
+ if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
session->session_pool, &c, &tokens)) {
i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE);
@@ -487,9 +459,9 @@ controller_read_socket (f_str_t *in, void *arg)
return;
}
}
- cls_ctx = session->learn_classifier->init_func (session->session_pool);
- session->learn_classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
- session->learn_filename, tokens, session->in_class);
+ cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier);
+ session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
+ session->learn_symbol, tokens, session->in_class);
session->worker->srv->stat->messages_learned ++;
i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE);
diff --git a/src/filter.c b/src/filter.c
index 34e487192..c9453dc61 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -444,7 +444,7 @@ check_autolearn (struct statfile_autolearn_params *params, struct worker_task *t
return FALSE;
}
-static void
+void
process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens,
struct classifier *classifier, char *filename, struct classifier_ctx* ctx)
{
@@ -464,7 +464,7 @@ process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens,
}
}
- classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, 1);
+ classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, TRUE);
}
}
}
@@ -488,48 +488,27 @@ make_composites (struct worker_task *task)
g_hash_table_foreach (task->results, composites_metric_callback, task);
}
-struct statfile_result_data {
- struct metric *metric;
- struct classifier_ctx *ctx;
-};
struct statfile_callback_data {
GHashTable *tokens;
- GHashTable *classifiers;
struct worker_task *task;
};
static void
-statfiles_callback (gpointer key, gpointer value, void *arg)
+classifiers_callback (gpointer value, void *arg)
{
struct statfile_callback_data *data= (struct statfile_callback_data *)arg;
struct worker_task *task = data->task;
- struct statfile *st = (struct statfile *)value;
- struct classifier *classifier;
- struct statfile_result_data *res_data;
- struct metric *metric;
+ struct classifier_config *cl = value;
+ struct classifier_ctx *ctx;
struct mime_text_part *text_part;
-
+ struct statfile *st;
GTree *tokens = NULL;
GList *cur;
-
- char *filename;
f_str_t c;
- if (g_list_length (task->rcpt) == 1) {
- filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data);
- }
- else {
- /* XXX: handle multiply recipients correctly */
- filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, "");
- }
-
- if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL && !check_autolearn (st->autolearn, task)) {
- return;
- }
-
cur = g_list_first (task->text_parts);
- if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
+ if ((tokens = g_hash_table_lookup (data->tokens, cl->tokenizer)) == NULL) {
while (cur != NULL) {
text_part = (struct mime_text_part *)cur->data;
if (text_part->is_empty) {
@@ -539,52 +518,32 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
- if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens)) {
msg_info ("statfiles_callback: cannot tokenize input");
return;
}
cur = g_list_next (cur);
}
- g_hash_table_insert (data->tokens, st->tokenizer, tokens);
+ g_hash_table_insert (data->tokens, cl->tokenizer, tokens);
}
- metric = g_hash_table_lookup (task->cfg->metrics, st->metric);
- if (metric == NULL) {
- classifier = get_classifier ("winnow");
- }
- else {
- classifier = metric->classifier;
- }
- if ((res_data = g_hash_table_lookup (data->classifiers, classifier)) == NULL) {
- res_data = memory_pool_alloc (task->task_pool, sizeof (struct statfile_result_data));
- res_data->ctx = classifier->init_func (task->task_pool);
- res_data->metric = metric;
- g_hash_table_insert (data->classifiers, classifier, res_data);
- }
+ ctx = cl->classifier->init_func (task->task_pool, cl);
+ cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task);
- classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight);
-
- if (st->autolearn) {
- /* Process autolearn */
- process_autolearn (st, task, tokens, classifier, filename, res_data->ctx);
+ /* Autolearning */
+ cur = g_list_first (cl->statfiles);
+ while (cur) {
+ st = cur->data;
+ if (st->autolearn) {
+ if (check_autolearn (st->autolearn, task)) {
+ /* Process autolearn */
+ process_autolearn (st, task, tokens, cl->classifier, st->path, ctx);
+ }
+ }
+ cur = g_list_next (cur);
}
}
-static void
-statfiles_results_callback (gpointer key, gpointer value, void *arg)
-{
- struct worker_task *task = (struct worker_task *)arg;
- struct statfile_result_data *res = (struct statfile_result_data *)value;
- struct classifier *classifier = (struct classifier *)key;
- double *w;
- char *filename;
-
- w = memory_pool_alloc (task->task_pool, sizeof (double));
- filename = classifier->result_file_func (res->ctx, w);
- insert_result (task, res->metric->name, classifier->name, *w, NULL);
- msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name);
-}
-
void
process_statfiles (struct worker_task *task)
@@ -593,16 +552,11 @@ process_statfiles (struct worker_task *task)
cd.task = task;
cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal);
- cd.classifiers = g_hash_table_new (g_str_hash, g_str_equal);
- g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd);
- g_hash_table_foreach (cd.classifiers, statfiles_results_callback, task);
-
+ g_list_foreach (task->cfg->classifiers, classifiers_callback, &cd);
g_hash_table_destroy (cd.tokens);
- g_hash_table_destroy (cd.classifiers);
- /* Process results */
- g_hash_table_foreach (task->results, metric_process_callback_forced, task);
+ /* Process results */
task->state = WRITE_REPLY;
}
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index ff475c0a1..ab06166b3 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -2,6 +2,8 @@
#define RSPAMD_LUA_H
#include "../config.h"
+#ifdef WITH_LUA
+
#include "../main.h"
#include "../cfg_file.h"
#include <lua.h>
@@ -30,4 +32,5 @@ int lua_call_chain_filter (const char *function, struct worker_task *task, int *
double lua_consolidation_func (struct worker_task *task, const char *metric_name, const char *function_name);
void add_luabuf (const char *line);
-#endif
+#endif /* WITH_LUA */
+#endif /* RSPAMD_LUA_H */
diff --git a/src/main.h b/src/main.h
index bfc78e888..4c1ab8617 100644
--- a/src/main.h
+++ b/src/main.h
@@ -71,6 +71,7 @@ struct pidfh;
struct config_file;
struct tokenizer;
struct classifier;
+struct classifier_config;
struct mime_part;
struct rspamd_view;
@@ -140,9 +141,8 @@ struct controller_session {
struct config_file *cfg; /**< pointer to config file */
char *learn_rcpt; /**< recipient for learning */
char *learn_from; /**< from address for learning */
- struct tokenizer *learn_tokenizer; /**< tokenizer for learning */
- struct classifier *learn_classifier; /**< classifier for learning */
- char *learn_filename; /**< real filename for learning */
+ struct classifier_config *learn_classifier;
+ char *learn_symbol; /**< symbol to train */
rspamd_io_dispatcher_t *dispatcher; /**< IO dispatcher object */
f_str_t *learn_buf; /**< learn input */
GList *parts; /**< extracted mime parts */
diff --git a/src/protocol.c b/src/protocol.c
index 176160381..cd5f32424 100644
--- a/src/protocol.c
+++ b/src/protocol.c
@@ -313,15 +313,7 @@ parse_header (struct worker_task *task, f_str_t *line)
task->rcpt = g_list_prepend (task->rcpt, tmp);
msg_debug ("parse_header: read rcpt header, value: %s", tmp);
}
- else {
- msg_info ("parse_header: wrong header: %s", headern);
- return -1;
- }
- break;
- case 'n':
- case 'N':
- /* nrcpt */
- if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) {
+ else if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) {
tmp = memory_pool_fstrdup (task->task_pool, line);
task->nrcpt = strtoul (tmp, &err, 10);
msg_debug ("parse_header: read rcpt header, value: %d", (int)task->nrcpt);
diff --git a/src/symbols_cache.c b/src/symbols_cache.c
index 1d5d38d4d..cc7e8a1b1 100644
--- a/src/symbols_cache.c
+++ b/src/symbols_cache.c
@@ -360,6 +360,9 @@ call_symbol_callback (struct worker_task *task, struct symbols_cache *cache, str
item = &cache->items[0];
}
else {
+ if (cache == NULL) {
+ return FALSE;
+ }
/* Next pointer */
if (*saved_item - cache->items >= cache->used_items - 1) {
/* No more items in cache */
diff --git a/test/rspamd_statfile_test.c b/test/rspamd_statfile_test.c
index 19a6cf7ab..282d4dc1c 100644
--- a/test/rspamd_statfile_test.c
+++ b/test/rspamd_statfile_test.c
@@ -25,6 +25,7 @@ void
rspamd_statfile_test_func ()
{
statfile_pool_t *pool;
+ stat_file_t *st;
uint32_t random_hashes[HASHES_NUM], i, v;
time_t now;
@@ -40,17 +41,17 @@ rspamd_statfile_test_func ()
/* Create new file */
g_assert (statfile_pool_create (pool, TEST_FILENAME, 65535) != -1);
- g_assert (statfile_pool_open (pool, TEST_FILENAME) != -1);
+ g_assert ((st = statfile_pool_open (pool, TEST_FILENAME)) != NULL);
/* Get and set random blocks */
- statfile_pool_lock_file (pool, TEST_FILENAME);
+ statfile_pool_lock_file (pool, st);
for (i = 0; i < HASHES_NUM; i ++) {
- statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0);
+ statfile_pool_set_block (pool, st, random_hashes[i], random_hashes[i], now, 1.0);
}
- statfile_pool_unlock_file (pool, TEST_FILENAME);
+ statfile_pool_unlock_file (pool, st);
for (i = 0; i < HASHES_NUM; i ++) {
- v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now);
+ v = statfile_pool_get_block (pool, st, random_hashes[i], random_hashes[i], now);
g_assert(v == 1.0);
}
diff --git a/test/rspamd_test_suite.c b/test/rspamd_test_suite.c
index 0f4768e73..24d8e0289 100644
--- a/test/rspamd_test_suite.c
+++ b/test/rspamd_test_suite.c
@@ -30,4 +30,6 @@ main (int argc, char **argv)
g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func);
g_test_run ();
+
+ return 0;
}
diff --git a/utils/expression_parser.c b/utils/expression_parser.c
index 4f37ec923..38b52934f 100644
--- a/utils/expression_parser.c
+++ b/utils/expression_parser.c
@@ -49,4 +49,6 @@ main (int argc, char **argv)
}
memory_pool_delete (pool);
+
+ return 0;
}