aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-03 20:23:13 +0400
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-03 20:23:13 +0400
commit 92de380c2c5e8ce7073ce979df4e5c7868e52bb6 (patch)
tree 27be3202d27f129f3d94d90298a4d1e0ecf2c281
parent 83a9452974ec2f9c7be262a77e54a1ea2557c795 (diff)
download rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.tar.gz
rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.zip
* Skip short utf words in statistics
-rw-r--r-- src/controller.c 2
-rw-r--r-- src/filter.c 8
-rw-r--r-- src/tokenizers/osb.c 13
-rw-r--r-- src/tokenizers/tokenizers.c 4
-rw-r--r-- src/tokenizers/tokenizers.h 4
-rw-r--r-- utils/statshow/CMakeLists.txt 8
6 files changed, 24 insertions, 15 deletions
diff --git a/src/controller.c b/src/controller.c
index b29af9ee1..f69a90f8b 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -851,7 +851,7 @@ controller_read_socket (f_str_t * in, void *arg)
c.begin = part->content->data;
c.len = part->content->len;
if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
- session->session_pool, &c, &tokens, FALSE)) {
+ session->session_pool, &c, &tokens, FALSE, part->is_utf)) {
i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END);
free_task (task, FALSE);
if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
diff --git a/src/filter.c b/src/filter.c
index 2f8b27060..753c17952 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
c.len = strlen (cur->data);
if (c.len > 0) {
c.begin = cur->data;
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
msg_info ("cannot tokenize input");
return;
}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
msg_info ("cannot tokenize input");
return;
}
@@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
stat_file_t *stf;
gdouble sum;
struct mime_text_part *part;
+ gboolean is_utf = FALSE;
/* Load classifier by symbol */
cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
}
c.begin = part->content->data;
c.len = part->content->len;
+ is_utf = part->is_utf;
}
/* Get tokens */
if (!cl->tokenizer->tokenize_func (
cl->tokenizer, task->task_pool,
- &c, &tokens, FALSE)) {
+ &c, &tokens, FALSE, is_utf)) {
g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
return FALSE;
}
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 1a04f3464..5f5dfcdcd 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -36,7 +36,7 @@ extern const int primes[];
int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
- gboolean save_token)
+ gboolean save_token, gboolean is_utf)
{
token_node_t *new = NULL;
f_str_t token = { NULL, 0, 0 }, *res;
@@ -55,8 +55,15 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *
while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
/* Skip small words */
- if (token.len < MIN_LEN) {
- continue;
+ if (is_utf) {
+ if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
+ continue;
+ }
+ }
+ else {
+ if (token.len < MIN_LEN) {
+ continue;
+ }
}
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 5af3fe6d5..9e41a9101 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -239,13 +239,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = task->subject;
subject.len = strlen (task->subject);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+ osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
}
if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = (gchar *)sub;
subject.len = strlen (sub);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+ osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
}
}
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index 741753328..df5481a1f 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -24,7 +24,7 @@ typedef struct token_node_s {
/* Common tokenizer structure */
struct tokenizer {
char *name;
- int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+ int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
};
@@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
/* Common tokenizer for headers */
int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
/* Make tokens for a subject */
diff --git a/utils/statshow/CMakeLists.txt b/utils/statshow/CMakeLists.txt
index bf3308814..c6a4fd75d 100644
--- a/utils/statshow/CMakeLists.txt
+++ b/utils/statshow/CMakeLists.txt
@@ -17,11 +17,11 @@ ELSE(ENABLE_LUAJIT MATCHES "ON")
ENDIF(ENABLE_LUAJIT MATCHES "ON")
TARGET_LINK_LIBRARIES(statshow ${GLIB2_LIBRARIES})
TARGET_LINK_LIBRARIES(statshow ${CMAKE_REQUIRED_LIBRARIES})
-IF(GMIME2_FOUND)
- TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
+IF(GMIME24)
TARGET_LINK_LIBRARIES(statshow ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
+ELSE(GMIME24)
+ TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
+ENDIF(GMIME24)
IF(ENABLE_STATIC MATCHES "ON")
TARGET_LINK_LIBRARIES(statshow ${PCRE_LIBRARIES})
ENDIF(ENABLE_STATIC MATCHES "ON")
\ No newline at end of file