rspamd (mirror of https://github.com/rspamd/rspamd.git)

commit 92de380c2c (parent 83a9452974)

    Skip short utf words in statistics
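Why the change matters: for UTF-8 text the byte length of a token can exceed its character count, so the old byte-based minimum-length check let short non-ASCII words into the statistics. Below is a minimal standalone sketch of the effect, using GLib's g_utf8_strlen; the MIN_LEN value of 4 here is hypothetical, chosen only for illustration (the real constant lives in the tokenizer sources).

    /* Build (assumption, standard GLib setup):
     *   gcc demo.c $(pkg-config --cflags --libs glib-2.0)
     */
    #include <glib.h>
    #include <stdio.h>
    #include <string.h>

    #define MIN_LEN 4  /* hypothetical threshold, for illustration only */

    int
    main (void)
    {
        const gchar *word = "дом";  /* Russian "house": 3 characters, 6 bytes in UTF-8 */
        gsize bytes = strlen (word);
        glong chars = g_utf8_strlen (word, (gssize) bytes);

        printf ("bytes=%lu chars=%ld\n", (unsigned long) bytes, chars);
        /* old check: bytes (6) >= MIN_LEN, so the short word was NOT skipped */
        printf ("byte check skips: %s\n", bytes < MIN_LEN ? "yes" : "no");
        /* new check: chars (3) < MIN_LEN, so it IS skipped as intended */
        printf ("char check skips: %s\n", chars < MIN_LEN ? "yes" : "no");
        return 0;
    }

Note that the commit keeps the plain byte-length check for non-UTF parts, avoiding the extra UTF-8 scan where it cannot change the result.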
@@ -851,7 +851,7 @@ controller_read_socket (f_str_t * in, void *arg)
             c.begin = part->content->data;
             c.len = part->content->len;
             if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
-                    session->session_pool, &c, &tokens, FALSE)) {
+                    session->session_pool, &c, &tokens, FALSE, part->is_utf)) {
                 i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END);
                 free_task (task, FALSE);
                 if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
         c.len = strlen (cur->data);
         if (c.len > 0) {
             c.begin = cur->data;
-            if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+            if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
                 msg_info ("cannot tokenize input");
                 return;
             }
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
             c.begin = text_part->content->data;
             c.len = text_part->content->len;
             /* Tree would be freed at task pool freeing */
-            if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+            if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
                 msg_info ("cannot tokenize input");
                 return;
             }
@@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
     stat_file_t *stf;
     gdouble sum;
     struct mime_text_part *part;
+    gboolean is_utf = FALSE;

     /* Load classifier by symbol */
     cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
         }
         c.begin = part->content->data;
         c.len = part->content->len;
+        is_utf = part->is_utf;
     }
     /* Get tokens */
     if (!cl->tokenizer->tokenize_func (
             cl->tokenizer, task->task_pool,
-            &c, &tokens, FALSE)) {
+            &c, &tokens, FALSE, is_utf)) {
         g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
         return FALSE;
     }
@@ -36,7 +36,7 @@ extern const int primes[];

 int
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
-        gboolean save_token)
+        gboolean save_token, gboolean is_utf)
 {
     token_node_t *new = NULL;
     f_str_t token = { NULL, 0, 0 }, *res;
@@ -55,8 +55,15 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *

     while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
         /* Skip small words */
-        if (token.len < MIN_LEN) {
-            continue;
+        if (is_utf) {
+            if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
+                continue;
+            }
+        }
+        else {
+            if (token.len < MIN_LEN) {
+                continue;
+            }
         }
         /* Shift hashpipe */
         for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
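The new skip test nests two ifs; an equivalent, more compact formulation is sketched below. This is not what the commit does, just a design alternative; f_str_t and MIN_LEN are the types and constant from the rspamd headers.

    static gboolean
    token_too_short (const f_str_t *token, gboolean is_utf)
    {
        /* measure in characters for UTF-8 parts, in bytes otherwise */
        glong len = is_utf ? g_utf8_strlen (token->begin, token->len)
                : (glong) token->len;
        return len < MIN_LEN;
    }

The loop body would then reduce to: if (token_too_short (&token, is_utf)) { continue; }.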
@@ -239,13 +239,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
         new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
         subject.begin = task->subject;
         subject.len = strlen (task->subject);
-        osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+        osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
     }
     if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
         new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
         subject.begin = (gchar *)sub;
         subject.len = strlen (sub);
-        osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE);
+        osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
     }
 }
@@ -24,7 +24,7 @@ typedef struct token_node_s {
 /* Common tokenizer structure */
 struct tokenizer {
     char *name;
-    int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+    int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
     f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
 };

@@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name);
 /* Get next word from specified f_str_t buf */
 f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
 /* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
 /* Common tokenizer for headers */
 int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
 /* Make tokens for a subject */
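Because tokenize_func is a function pointer in struct tokenizer, every implementation has to grow the extra is_utf argument even if it never uses it. A hypothetical stub (not from the commit) showing the shape a conforming implementation now takes:

    static int
    noop_tokenize (struct tokenizer *tokenizer, memory_pool_t *pool,
            f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf)
    {
        (void) is_utf;  /* a tokenizer that never sees text may ignore the flag */
        return TRUE;    /* callers treat a zero return as a tokenizer error */
    }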
@@ -17,11 +17,11 @@ ELSE(ENABLE_LUAJIT MATCHES "ON")
 ENDIF(ENABLE_LUAJIT MATCHES "ON")
 TARGET_LINK_LIBRARIES(statshow ${GLIB2_LIBRARIES})
 TARGET_LINK_LIBRARIES(statshow ${CMAKE_REQUIRED_LIBRARIES})
-IF(GMIME2_FOUND)
-    TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
+IF(GMIME24)
     TARGET_LINK_LIBRARIES(statshow ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
+ELSE(GMIME24)
+    TARGET_LINK_LIBRARIES(statshow ${GMIME2_LIBRARIES})
+ENDIF(GMIME24)
 IF(ENABLE_STATIC MATCHES "ON")
     TARGET_LINK_LIBRARIES(statshow ${PCRE_LIBRARIES})
 ENDIF(ENABLE_STATIC MATCHES "ON")