Browse Source

* Skip short utf words in statistics

tags/0.3.14
Vsevolod Stakhov 13 years ago
parent
commit
92de380c2c

+ 1
- 1
src/controller.c View File

@@ -851,7 +851,7 @@ controller_read_socket (f_str_t * in, void *arg)
c.begin = part->content->data;
c.len = part->content->len;
if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
-			session->session_pool, &c, &tokens, FALSE)) {
+			session->session_pool, &c, &tokens, FALSE, part->is_utf)) {
i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END);
free_task (task, FALSE);
if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {

+ 5
- 3
src/filter.c View File

@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
c.len = strlen (cur->data);
if (c.len > 0) {
c.begin = cur->data;
-		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
msg_info ("cannot tokenize input");
return;
}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
-		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
msg_info ("cannot tokenize input");
return;
}
@@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
stat_file_t *stf;
gdouble sum;
struct mime_text_part *part;
+	gboolean is_utf = FALSE;

/* Load classifier by symbol */
cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
}
c.begin = part->content->data;
c.len = part->content->len;
+		is_utf = part->is_utf;
}
/* Get tokens */
if (!cl->tokenizer->tokenize_func (
cl->tokenizer, task->task_pool,
-			&c, &tokens, FALSE)) {
+			&c, &tokens, FALSE, is_utf)) {
g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
return FALSE;
}

+ 10
- 3
src/tokenizers/osb.c View File

@@ -36,7 +36,7 @@ extern const int primes[];

int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
-		gboolean save_token)
+		gboolean save_token, gboolean is_utf)
{
token_node_t *new = NULL;
f_str_t token = { NULL, 0, 0 }, *res;
@@ -55,8 +55,15 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *

while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
/* Skip small words */
-		if (token.len < MIN_LEN) {
-			continue;
+		if (is_utf) {
+			if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
+				continue;
+			}
+		}
+		else {
+			if (token.len < MIN_LEN) {
+				continue;
+			}
 		}
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {

+ 2
- 2
src/tokenizers/tokenizers.c View File

@@ -239,13 +239,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = task->subject;
subject.len = strlen (task->subject);
-		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
}
if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = (gchar *)sub;
subject.len = strlen (sub);
osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
}
}


+ 2
- 2
src/tokenizers/tokenizers.h View File

@@ -24,7 +24,7 @@ typedef struct token_node_s {
/* Common tokenizer structure */
struct tokenizer {
char *name;
-	int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+	int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
};

@@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
/* Common tokenizer for headers */
int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
/* Make tokens for a subject */

+ 4
- 4
utils/statshow/CMakeLists.txt View File

@@ -17,11 +17,11 @@ ELSE(ENABLE_LUAJIT MATCHES "ON")
ENDIF(ENABLE_LUAJIT MATCHES "ON")
TARGET_LINK_LIBRARIES(statshow ${GLIB2_LIBRARIES})
TARGET_LINK_LIBRARIES(statshow ${CMAKE_REQUIRED_LIBRARIES})
-IF(GMIME2_FOUND)
-	TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
+IF(GMIME24)
 	TARGET_LINK_LIBRARIES(statshow ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
+ELSE(GMIME24)
+	TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
+ENDIF(GMIME24)
IF(ENABLE_STATIC MATCHES "ON")
TARGET_LINK_LIBRARIES(statshow ${PCRE_LIBRARIES})
ENDIF(ENABLE_STATIC MATCHES "ON")

Loading…
Cancel
Save