source.dussan.org Git - rspamd.git/commitdiff
* Skip short utf words in statistics
author		Vsevolod Stakhov <vsevolod@rambler-co.ru>
		Fri, 3 Jun 2011 16:23:13 +0000 (20:23 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
		Fri, 3 Jun 2011 16:23:13 +0000 (20:23 +0400)
src/controller.c
src/filter.c
src/tokenizers/osb.c
src/tokenizers/tokenizers.c
src/tokenizers/tokenizers.h
utils/statshow/CMakeLists.txt

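The core fix: the OSB tokenizer drops words shorter than MIN_LEN, but until this commit the comparison used the raw byte length (token.len). In UTF-8, non-ASCII characters occupy two or more bytes, so short non-ASCII words slipped past the filter and polluted the statistics. The patch threads an is_utf flag from the message's text parts down to the tokenizer, which then measures UTF-8 text in characters via g_utf8_strlen(). A minimal standalone sketch of the difference (not rspamd code; MIN_LEN is assumed to be 4 here purely for illustration):

/*
 * Build: cc demo.c $(pkg-config --cflags --libs glib-2.0)
 */
#include <glib.h>
#include <stdio.h>
#include <string.h>

#define MIN_LEN 4	/* assumed threshold, for illustration only */

static gboolean
is_short (const gchar *word, gsize byte_len, gboolean is_utf)
{
	/* character count for UTF-8 input, raw byte count otherwise */
	glong len = is_utf ? g_utf8_strlen (word, byte_len) : (glong) byte_len;

	return len < MIN_LEN;
}

int
main (void)
{
	const gchar *word = "да";	/* 2 Cyrillic characters, 4 UTF-8 bytes */

	/* byte length: 4 >= MIN_LEN, so the old check kept this word */
	printf ("by bytes: short = %d\n", is_short (word, strlen (word), FALSE));
	/* character length: 2 < MIN_LEN, so it is now correctly skipped */
	printf ("by chars: short = %d\n", is_short (word, strlen (word), TRUE));

	return 0;
}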
src/controller.c
index b29af9ee18288c69f545a75d07f37056aa5962cf..f69a90f8b8e0e68277de20e622c5056bd9145469 100644 (file)
@@ -851,7 +851,7 @@ controller_read_socket (f_str_t * in, void *arg)
                        c.begin = part->content->data;
                        c.len = part->content->len;
                        if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
-                                       session->session_pool, &c, &tokens, FALSE)) {
+                                       session->session_pool, &c, &tokens, FALSE, part->is_utf)) {
                                i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END);
                                free_task (task, FALSE);
                                if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
src/filter.c
index 2f8b27060bfdc6a6d15080398f72bf67427084c7..753c17952a9ff91958854677adfe8cb97b23e7e2 100644 (file)
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
                                c.len = strlen (cur->data);
                                if (c.len > 0) {
                                        c.begin = cur->data;
-                                       if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+                                       if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
                                                msg_info ("cannot tokenize input");
                                                return;
                                        }
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
                                c.begin = text_part->content->data;
                                c.len = text_part->content->len;
                                /* Tree would be freed at task pool freeing */
-                               if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+                               if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
                                        msg_info ("cannot tokenize input");
                                        return;
                                }
@@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
        stat_file_t                    *stf;
        gdouble                         sum;
        struct mime_text_part          *part;
+       gboolean                        is_utf = FALSE;
 
        /* Load classifier by symbol */
        cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
                        }
                        c.begin = part->content->data;
                        c.len = part->content->len;
+                       is_utf = part->is_utf;
                }
                /* Get tokens */
                if (!cl->tokenizer->tokenize_func (
                                cl->tokenizer, task->task_pool,
-                               &c, &tokens, FALSE)) {
+                               &c, &tokens, FALSE, is_utf)) {
                        g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
                        return FALSE;
                }
src/tokenizers/osb.c
index 1a04f3464efa03c8f799e5e21162afb1aa675a58..5f5dfcdcd88d192b170e2caf26d0041e11b0a60d 100644 (file)
@@ -36,7 +36,7 @@ extern const int                primes[];
 
 int
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
-               gboolean save_token)
+               gboolean save_token, gboolean is_utf)
 {
        token_node_t                   *new = NULL;
        f_str_t                         token = { NULL, 0, 0 }, *res;
@@ -55,8 +55,15 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *
 
        while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
                /* Skip small words */
-               if (token.len < MIN_LEN) {
-                       continue;
+               if (is_utf) {
+                       if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
+                               continue;
+                       }
+               }
+               else {
+                       if (token.len < MIN_LEN) {
+                               continue;
+                       }
                }
                /* Shift hashpipe */
                for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
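Note that f_str_t tokens are length-bounded slices into the input buffer, not NUL-terminated strings, which is why the byte-bounded form g_utf8_strlen (p, max) is used here. The committed branch could also be collapsed into a single guarded expression; a sketch of an equivalent formulation (not the committed code):

		glong wlen;

		/* g_utf8_strlen() honours the byte bound, so it is safe on
		 * tokens pointing into the middle of the input buffer */
		wlen = is_utf ? g_utf8_strlen (token.begin, token.len) : (glong) token.len;
		if (wlen < MIN_LEN) {
			continue;
		}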
src/tokenizers/tokenizers.c
index 5af3fe6d5883046020822d13a6844182eb9784ef..9e41a9101cc48fd9cda09736fe5360b0991856c6 100644 (file)
@@ -239,13 +239,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
                new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
                subject.begin = task->subject;
                subject.len = strlen (task->subject);
-               osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+               osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
        }
        if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
                new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
                subject.begin = (gchar *)sub;
                subject.len = strlen (sub);
-               osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+               osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
        }
 }
 
src/tokenizers/tokenizers.h
index 7417533289da10017cfabd2ce19fb85d725ccf53..df5481a1fea525cf794e35350291a7329256330c 100644 (file)
@@ -24,7 +24,7 @@ typedef struct token_node_s {
 /* Common tokenizer structure */
 struct tokenizer {
        char *name;
-       int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+       int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
        f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
 };
 
@@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name);
 /* Get next word from specified f_str_t buf */
 f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
 /* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
 /* Common tokenizer for headers */
 int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
 /* Make tokens for a subject */
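Since is_utf is a new positional parameter on the tokenize_func pointer, every tokenizer implementation and every call site must change in lockstep, as the hunks above show. A hedged sketch of what a conforming tokenizer instance looks like after this change (the actual registration table lives elsewhere in tokenizers.c and is not shown in this diff):

struct tokenizer osb_tokenizer = {
	"osb-text",
	osb_tokenize_text,	/* now takes the trailing gboolean is_utf */
	get_next_word
};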
utils/statshow/CMakeLists.txt
index bf3308814049252ecbccc94554495e42b5b1adee..c6a4fd75d1c1741b5ac7872877a01d72b12183d7 100644 (file)
@@ -17,11 +17,11 @@ ELSE(ENABLE_LUAJIT MATCHES "ON")
 ENDIF(ENABLE_LUAJIT MATCHES "ON")
 TARGET_LINK_LIBRARIES(statshow ${GLIB2_LIBRARIES})
 TARGET_LINK_LIBRARIES(statshow ${CMAKE_REQUIRED_LIBRARIES})
-IF(GMIME2_FOUND)
-       TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
+IF(GMIME24)
        TARGET_LINK_LIBRARIES(statshow ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
+ELSE(GMIME24)
+    TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
+ENDIF(GMIME24)
 IF(ENABLE_STATIC MATCHES "ON")
        TARGET_LINK_LIBRARIES(statshow ${PCRE_LIBRARIES})
 ENDIF(ENABLE_STATIC MATCHES "ON")
\ No newline at end of file