@@ -851,7 +851,7 @@ controller_read_socket (f_str_t * in, void *arg)
 			c.begin = part->content->data;
 			c.len = part->content->len;
 			if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
-					session->session_pool, &c, &tokens, FALSE)) {
+					session->session_pool, &c, &tokens, FALSE, part->is_utf)) {
 				i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END);
 				free_task (task, FALSE);
 				if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
 			c.len = strlen (cur->data);
 			if (c.len > 0) {
 				c.begin = cur->data;
-				if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+				if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
 					msg_info ("cannot tokenize input");
 					return;
 				}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
 			c.begin = text_part->content->data;
 			c.len = text_part->content->len;
 			/* Tree would be freed at task pool freeing */
-			if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+			if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
 				msg_info ("cannot tokenize input");
 				return;
 			}
@@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 	stat_file_t *stf;
 	gdouble sum;
 	struct mime_text_part *part;
+	gboolean is_utf = FALSE;
 
 	/* Load classifier by symbol */
 	cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 		}
 		c.begin = part->content->data;
 		c.len = part->content->len;
+		is_utf = part->is_utf;
 	}
 	/* Get tokens */
 	if (!cl->tokenizer->tokenize_func (
 			cl->tokenizer, task->task_pool,
-			&c, &tokens, FALSE)) {
+			&c, &tokens, FALSE, is_utf)) {
 		g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
 		return FALSE;
 	}
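
For readers following the control flow: the calling convention the hunks above introduce amounts to the sketch below. The helper name tokenize_text_part and the struct classifier_config type are hypothetical, added only to show the pattern; the flag defaults to FALSE so that non-text input keeps the old byte-based behaviour, and it is taken from the MIME part only when a text part is actually selected.

	/* Hypothetical helper, for illustration only -- not part of this patch */
	static gboolean
	tokenize_text_part (struct classifier_config *cl, struct worker_task *task,
			struct mime_text_part *part, GTree **tokens)
	{
		f_str_t c;
		gboolean is_utf = FALSE;	/* default: treat input as raw bytes */

		if (part == NULL || part->content == NULL) {
			return FALSE;
		}
		c.begin = part->content->data;
		c.len = part->content->len;
		is_utf = part->is_utf;	/* set by charset handling earlier in the pipeline */

		return cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool,
				&c, tokens, FALSE, is_utf);
	}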
@@ -36,7 +36,7 @@ extern const int primes[];
 
 int
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
-		gboolean save_token)
+		gboolean save_token, gboolean is_utf)
 {
 	token_node_t *new = NULL;
 	f_str_t token = { NULL, 0, 0 }, *res;
@@ -55,8 +55,15 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *
 
 	while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
 		/* Skip small words */
-		if (token.len < MIN_LEN) {
-			continue;
-		}
+		if (is_utf) {
+			if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
+				continue;
+			}
+		}
+		else {
+			if (token.len < MIN_LEN) {
+				continue;
+			}
+		}
 		/* Shift hashpipe */
 		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
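
This hunk is the behavioural core of the change: for UTF-8 text the minimum-length filter now counts characters via g_utf8_strlen() instead of bytes, so short multibyte words are no longer kept merely because they encode to several bytes. A minimal standalone sketch of the difference, assuming GLib (which the code above already uses) and a hypothetical MIN_LEN of 3 chosen purely for illustration:

	#include <glib.h>
	#include <stdio.h>
	#include <string.h>

	#define MIN_LEN 3	/* hypothetical threshold, for illustration only */

	int
	main (void)
	{
		const char *word = "да";	/* 2 characters, 4 bytes in UTF-8 */
		gsize bytes = strlen (word);
		glong chars = g_utf8_strlen (word, bytes);

		/* Byte-based check: 4 >= 3, so the 2-character word is wrongly kept */
		printf ("bytes = %lu, kept = %d\n", (unsigned long) bytes, bytes >= MIN_LEN);
		/* Character-based check: 2 < 3, so the word is correctly skipped */
		printf ("chars = %ld, kept = %d\n", chars, chars >= MIN_LEN);
		return 0;
	}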
@@ -239,13 +239,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
 		new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
 		subject.begin = task->subject;
 		subject.len = strlen (task->subject);
-		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
 	}
 
 	if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
 		new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
 		subject.begin = (gchar *)sub;
 		subject.len = strlen (sub);
-		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+		osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
 	}
 }
@@ -24,7 +24,7 @@ typedef struct token_node_s {
 
 /* Common tokenizer structure */
 struct tokenizer {
 	char *name;
-	int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+	int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
 	f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
 };
@@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name);
 /* Get next word from specified f_str_t buf */
 f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
 /* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
 /* Common tokenizer for headers */
 int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
 /* Make tokens for a subject */
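
Because tokenize_func is a function pointer in struct tokenizer, every implementation has to adopt the extended signature even if it ignores the new flag. A hedged sketch of a conforming implementation follows; dummy_tokenize and dummy_tokenizer are hypothetical names, while get_next_word is the real helper declared above.

	/* Hypothetical tokenizer conforming to the extended signature */
	static int
	dummy_tokenize (struct tokenizer *tokenizer, memory_pool_t *pool,
			f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf)
	{
		f_str_t token = { NULL, 0, 0 };

		while (tokenizer->get_next_word (input, &token) != NULL) {
			/* Count characters for UTF-8 input, bytes otherwise */
			glong len = is_utf ? g_utf8_strlen (token.begin, token.len)
					: (glong) token.len;
			(void) len;	/* ... filter, hash and insert into *cur here ... */
		}
		return TRUE;
	}

	static struct tokenizer dummy_tokenizer = {
		"dummy",
		dummy_tokenize,
		get_next_word,
	};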
@@ -17,11 +17,11 @@ ELSE(ENABLE_LUAJIT MATCHES "ON")
 ENDIF(ENABLE_LUAJIT MATCHES "ON")
 TARGET_LINK_LIBRARIES(statshow ${GLIB2_LIBRARIES})
 TARGET_LINK_LIBRARIES(statshow ${CMAKE_REQUIRED_LIBRARIES})
-IF(GMIME2_FOUND)
-	TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
+IF(GMIME24)
 	TARGET_LINK_LIBRARIES(statshow ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
+ELSE(GMIME24)
+	TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
+ENDIF(GMIME24)
 IF(ENABLE_STATIC MATCHES "ON")
 	TARGET_LINK_LIBRARIES(statshow ${PCRE_LIBRARIES})
 ENDIF(ENABLE_STATIC MATCHES "ON")