author     Vsevolod Stakhov <vsevolod@highsecure.ru>    2014-12-23 14:32:22 +0000
committer  Vsevolod Stakhov <vsevolod@highsecure.ru>    2014-12-23 14:32:22 +0000
commit     39b8dcb94620669ae369ab559175dde1a5c103b7 (patch)
tree       4bcf0d71aa7cbe9520a3aadc2eae1415f5c79610 /src
parent     074de730239d987adcf08a92baa3f4b65e7d63d9 (diff)
download   rspamd-39b8dcb94620669ae369ab559175dde1a5c103b7.tar.gz
           rspamd-39b8dcb94620669ae369ab559175dde1a5c103b7.zip
Rework text tokenization.
Diffstat (limited to 'src')
-rw-r--r--  src/libmime/filter.c          246
-rw-r--r--  src/libmime/filter.h           12
-rw-r--r--  src/tokenizers/osb.c           32
-rw-r--r--  src/tokenizers/tokenizers.c    37
-rw-r--r--  src/tokenizers/tokenizers.h    12
5 files changed, 69 insertions, 270 deletions
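
In short, this commit changes the tokenizer contract: instead of receiving a raw rspamd_fstring_t buffer and splitting it into words itself, a tokenizer now receives a GArray of already extracted words (rspamd_fstring_t elements) produced once by rspamd_tokenize_text. A minimal caller-side sketch of the difference, assuming `pool` and `tok` are an already initialised mempool and tokenizer (names are illustrative); the trailing arguments of rspamd_tokenize_text are copied from the tokenize_subject hunk below and their exact meaning is not spelled out in this diff:

    /* Sketch only, not code from this commit. */
    gchar text[] = "some text to classify";
    GTree *tokens = NULL;

    /* Old interface: the tokenizer walked the raw buffer itself */
    rspamd_fstring_t input;
    input.begin = text;
    input.len = strlen (text);
    tok->tokenize_func (tok, pool, &input, &tokens, FALSE, TRUE, NULL);

    /* New interface: words are split up front and shared by all callers */
    GArray *words = rspamd_tokenize_text (text, strlen (text), TRUE, 0, NULL);
    tok->tokenize_func (tok, pool, words, &tokens, FALSE, TRUE, NULL);
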
diff --git a/src/libmime/filter.c b/src/libmime/filter.c
index c6a4752fa..467f30b9c 100644
--- a/src/libmime/filter.c
+++ b/src/libmime/filter.c
@@ -746,88 +746,58 @@ classifiers_callback (gpointer value, void *arg)
struct rspamd_statfile_config *st;
GTree *tokens = NULL;
GList *cur;
- rspamd_fstring_t c;
- gchar *header = NULL;
gint *dist = NULL, diff;
- gboolean is_twopart = FALSE, is_headers = FALSE;
- struct raw_header *rh;
+ gboolean is_twopart = FALSE;
task = cbdata->task;
- if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) {
- cur =
- message_get_header (task, header, FALSE);
- is_headers = TRUE;
- }
- else {
- cur = g_list_first (task->text_parts);
- dist = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
- if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
- is_twopart = TRUE;
- }
+ cur = g_list_first (task->text_parts);
+ dist = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
+ if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
+ is_twopart = TRUE;
}
ctx = cl->classifier->init_func (task->task_pool, cl);
if ((tokens = g_hash_table_lookup (task->tokens, cl->tokenizer)) == NULL) {
while (cur != NULL) {
- if (is_headers) {
- rh = (struct raw_header *)cur->data;
- if (rh->decoded == NULL) {
- cur = g_list_next (cur);
- continue;
- }
- c.len = strlen (rh->decoded);
- if (c.len > 0) {
- c.begin = rh->decoded;
- if (!cl->tokenizer->tokenize_func (cl->tokenizer,
- task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) {
- msg_info ("cannot tokenize input");
- return;
- }
- }
+ text_part = (struct mime_text_part *)cur->data;
+ if (text_part->is_empty) {
+ cur = g_list_next (cur);
+ continue;
}
- else {
- text_part = (struct mime_text_part *)cur->data;
- if (text_part->is_empty) {
- cur = g_list_next (cur);
- continue;
- }
- if (dist != NULL && cur->next == NULL) {
- /* Compare part's content */
+ if (dist != NULL && cur->next == NULL) {
+ /* Compare part's content */
- if (*dist >= COMMON_PART_FACTOR) {
- msg_info (
+ if (*dist >= COMMON_PART_FACTOR) {
+ msg_info (
"message <%s> has two common text parts, ignore the last one",
task->message_id);
- break;
- }
+ break;
}
- else if (cur->next == NULL && is_twopart) {
- p1 = cur->prev->data;
- p2 = text_part;
- if (p1->diff_str != NULL && p2->diff_str != NULL) {
- diff =
+ }
+ else if (cur->next == NULL && is_twopart) {
+ p1 = cur->prev->data;
+ p2 = text_part;
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff =
rspamd_diff_distance (p1->diff_str, p2->diff_str);
- }
- else {
- diff = rspamd_fuzzy_compare_parts (p1, p2);
- }
- if (diff >= COMMON_PART_FACTOR) {
- msg_info (
+ }
+ else {
+ diff = rspamd_fuzzy_compare_parts (p1, p2);
+ }
+ if (diff >= COMMON_PART_FACTOR) {
+ msg_info (
"message <%s> has two common text parts, ignore the last one",
task->message_id);
- break;
- }
+ break;
}
- c.begin = (gchar *)text_part->content->data;
- c.len = text_part->content->len;
- /* Tree would be freed at task pool freeing */
- if (!cl->tokenizer->tokenize_func (cl->tokenizer,
- task->task_pool, &c, &tokens,
+ }
+ /* Tree would be freed at task pool freeing */
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer,
+ task->task_pool, text_part->words, &tokens,
FALSE, text_part->is_utf, text_part->urls_offset)) {
- msg_info ("cannot tokenize input");
- return;
- }
+ msg_info ("cannot tokenize input");
+ return;
}
cur = g_list_next (cur);
}
@@ -1109,151 +1079,6 @@ rspamd_check_action_metric (struct rspamd_task *task,
}
gboolean
-rspamd_learn_task (const gchar *statfile, struct rspamd_task *task, GError **err)
-{
- GList *cur, *ex;
- struct rspamd_classifier_config *cl;
- struct classifier_ctx *cls_ctx;
- gchar *s;
- rspamd_fstring_t c;
- GTree *tokens = NULL;
- struct rspamd_statfile_config *st;
- stat_file_t *stf;
- gdouble sum;
- struct mime_text_part *part, *p1, *p2;
- gboolean is_utf = FALSE, is_twopart = FALSE;
- gint diff;
- struct raw_header *rh;
-
- /* Load classifier by symbol */
- cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
- if (cl == NULL) {
- g_set_error (err,
- filter_error_quark (), 1, "Statfile %s is not configured in any classifier",
- statfile);
- return FALSE;
- }
-
- /* If classifier has 'header' option just classify header of this type */
- if ((s = g_hash_table_lookup (cl->opts, "header")) != NULL) {
- cur = message_get_header (task, s, FALSE);
- }
- else {
- /* Classify message otherwise */
- cur = g_list_first (task->text_parts);
- if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
- is_twopart = TRUE;
- }
- }
-
- /* Get tokens from each element */
- while (cur) {
- if (s != NULL) {
- rh = (struct raw_header *)cur->data;
- if (rh->decoded == NULL) {
- cur = g_list_next (cur);
- continue;
- }
- c.len = strlen (rh->decoded);
- c.begin = rh->decoded;
- ex = NULL;
- }
- else {
- part = cur->data;
- /* Skip empty parts */
- if (part->is_empty) {
- cur = g_list_next (cur);
- continue;
- }
- c.begin = (gchar *)part->content->data;
- c.len = part->content->len;
- is_utf = part->is_utf;
- ex = part->urls_offset;
- if (is_twopart && cur->next == NULL) {
- /* Compare part's content */
- p1 = cur->prev->data;
- p2 = part;
- if (p1->diff_str != NULL && p2->diff_str != NULL) {
- diff = rspamd_diff_distance (p1->diff_str, p2->diff_str);
- }
- else {
- diff = rspamd_fuzzy_compare_parts (p1, p2);
- }
- if (diff >= COMMON_PART_FACTOR) {
- msg_info (
- "message <%s> has two common text parts, ignore the last one",
- task->message_id);
- break;
- }
- }
- }
- /* Get tokens */
- if (!cl->tokenizer->tokenize_func (
- cl->tokenizer, task->task_pool,
- &c, &tokens, FALSE, is_utf, ex)) {
- g_set_error (err,
- filter_error_quark (), 2, "Cannot tokenize message");
- return FALSE;
- }
- cur = g_list_next (cur);
- }
-
- /* Handle messages without text */
- if (tokens == NULL) {
- g_set_error (err,
- filter_error_quark (), 3, "Cannot tokenize message, no text data");
- msg_info ("learn failed for message <%s>, no tokens to extract",
- task->message_id);
- return FALSE;
- }
-
- /* Take care of subject */
- tokenize_subject (task, &tokens);
-
- /* Init classifier */
- cls_ctx = cl->classifier->init_func (
- task->task_pool, cl);
- /* Get or create statfile */
- stf = get_statfile_by_symbol (task->worker->srv->statfile_pool,
- cl, statfile, &st, TRUE);
-
- /* Learn */
- if (stf== NULL || !cl->classifier->learn_func (
- cls_ctx, task->worker->srv->statfile_pool,
- statfile, tokens, TRUE, &sum,
- 1.0, err)) {
- if (*err) {
- msg_info ("learn failed for message <%s>, learn error: %s",
- task->message_id,
- (*err)->message);
- return FALSE;
- }
- else {
- g_set_error (err,
- filter_error_quark (), 4,
- "Learn failed, unknown learn classifier error");
- msg_info ("learn failed for message <%s>, unknown learn error",
- task->message_id);
- return FALSE;
- }
- }
- /* Increase statistics */
- task->worker->srv->stat->messages_learned++;
-
- maybe_write_binlog (cl, st, stf, tokens);
- msg_info (
- "learn success for message <%s>, for statfile: %s, sum weight: %.2f",
- task->message_id,
- statfile,
- sum);
- statfile_pool_plan_invalidate (task->worker->srv->statfile_pool,
- DEFAULT_STATFILE_INVALIDATE_TIME,
- DEFAULT_STATFILE_INVALIDATE_JITTER);
-
- return TRUE;
-}
-
-gboolean
rspamd_learn_task_spam (struct rspamd_classifier_config *cl,
struct rspamd_task *task,
gboolean is_spam,
@@ -1261,7 +1086,6 @@ rspamd_learn_task_spam (struct rspamd_classifier_config *cl,
{
GList *cur, *ex;
struct classifier_ctx *cls_ctx;
- rspamd_fstring_t c;
GTree *tokens = NULL;
struct mime_text_part *part, *p1, *p2;
gboolean is_utf = FALSE, is_twopart = FALSE;
@@ -1280,8 +1104,6 @@ rspamd_learn_task_spam (struct rspamd_classifier_config *cl,
cur = g_list_next (cur);
continue;
}
- c.begin = (gchar *)part->content->data;
- c.len = part->content->len;
is_utf = part->is_utf;
ex = part->urls_offset;
if (is_twopart && cur->next == NULL) {
@@ -1307,7 +1129,7 @@ rspamd_learn_task_spam (struct rspamd_classifier_config *cl,
/* Get tokens */
if (!cl->tokenizer->tokenize_func (
cl->tokenizer, task->task_pool,
- &c, &tokens, FALSE, is_utf, ex)) {
+ part->words, &tokens, FALSE, is_utf, ex)) {
g_set_error (err,
filter_error_quark (), 2, "Cannot tokenize message");
return FALSE;
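
The net effect on the classification and learning paths in filter.c: the per-part loop no longer copies part content into a local rspamd_fstring_t (and the header-based classification branch is gone); each text part's pre-tokenized word array is handed directly to the tokenizer. A condensed sketch of the resulting loop, with the two-part duplicate checks from the hunks above omitted for brevity; text_part->words is assumed to be the GArray filled when the message is parsed, which is not shown in this diff:

    /* Sketch: simplified per-part tokenization loop after this commit. */
    cur = g_list_first (task->text_parts);
    while (cur != NULL) {
        struct mime_text_part *text_part = cur->data;

        if (text_part->is_empty) {        /* skip parts without text */
            cur = g_list_next (cur);
            continue;
        }

        /* the token tree lives in the task pool and is freed with it */
        if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool,
                text_part->words, &tokens,
                FALSE, text_part->is_utf, text_part->urls_offset)) {
            msg_info ("cannot tokenize input");
            return;
        }
        cur = g_list_next (cur);
    }
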
diff --git a/src/libmime/filter.h b/src/libmime/filter.h
index 3f01f2314..de324caf7 100644
--- a/src/libmime/filter.h
+++ b/src/libmime/filter.h
@@ -150,18 +150,8 @@ double rspamd_factor_consolidation_func (struct rspamd_task *task,
const gchar *metric_name,
const gchar *unused);
-/*
- * Learn specified statfile with message in a task
- * @param statfile symbol of statfile
- * @param task worker's task object
- * @param err pointer to GError
- * @return true if learn succeed
- */
-gboolean rspamd_learn_task (const gchar *statfile,
- struct rspamd_task *task,
- GError **err);
-/*
+/**
* Learn specified statfile with message in a task
* @param statfile symbol of statfile
* @param task worker's task object
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 1ee251f81..b879237c4 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -37,17 +37,17 @@ extern const int primes[];
int
osb_tokenize_text (struct tokenizer *tokenizer,
rspamd_mempool_t * pool,
- rspamd_fstring_t * input,
+ GArray * input,
GTree ** tree,
gboolean save_token,
gboolean is_utf,
GList *exceptions)
{
token_node_t *new = NULL;
- rspamd_fstring_t token = { NULL, 0, 0 };
+ rspamd_fstring_t *token;
guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- gint i, l, processed = 0;
- gchar *res;
+ gint i, processed = 0;
+ guint w;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
@@ -58,31 +58,20 @@ osb_tokenize_text (struct tokenizer *tokenizer,
memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
- while ((res =
- tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
- /* Skip small words */
- if (is_utf) {
- l = g_utf8_strlen (token.begin, token.len);
- }
- else {
- l = token.len;
- }
- if (l < MIN_LEN) {
- token.begin = res;
- continue;
- }
+ for (w = 0; w < input->len; w ++) {
+ token = &g_array_index (input, rspamd_fstring_t, w);
if (processed < FEATURE_WINDOW_SIZE) {
/* Just fill a hashpipe */
hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
- rspamd_fstrhash_lc (&token, is_utf);
+ rspamd_fstrhash_lc (token, is_utf);
}
else {
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
hashpipe[i] = hashpipe[i - 1];
}
- hashpipe[0] = rspamd_fstrhash_lc (&token, is_utf);
+ hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
processed++;
for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
@@ -94,7 +83,7 @@ osb_tokenize_text (struct tokenizer *tokenizer,
new->h2 = h2;
if (save_token) {
new->extra =
- (uintptr_t)rspamd_mempool_fstrdup (pool, &token);
+ (uintptr_t)rspamd_mempool_fstrdup (pool, token);
}
if (g_tree_lookup (*tree, new) == NULL) {
@@ -102,7 +91,6 @@ osb_tokenize_text (struct tokenizer *tokenizer,
}
}
}
- token.begin = res;
}
if (processed <= FEATURE_WINDOW_SIZE) {
@@ -113,7 +101,7 @@ osb_tokenize_text (struct tokenizer *tokenizer,
new->h1 = h1;
new->h2 = h2;
if (save_token) {
- new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, &token);
+ new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
}
if (g_tree_lookup (*tree, new) == NULL) {
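
On the tokenizer side, osb_tokenize_text no longer calls get_next_word or filters out short words itself; it simply walks the supplied array, so any length or UTF-8 filtering now has to happen where the words are extracted. The iteration pattern, restated as a standalone sketch with `words` assumed to be a GArray of rspamd_fstring_t:

    /* Sketch of the new iteration over pre-split words. */
    guint w;
    rspamd_fstring_t *token;

    for (w = 0; w < words->len; w++) {
        token = &g_array_index (words, rspamd_fstring_t, w);
        /* token is hashed with rspamd_fstrhash_lc (token, is_utf) and
         * shifted through the FEATURE_WINDOW_SIZE hashpipe as before */
    }
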
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 01456a11b..8e423a211 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -114,7 +114,7 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
return NULL;
}
- if (*exceptions != NULL) {
+ if (exceptions != NULL && *exceptions != NULL) {
ex = (*exceptions)->data;
}
@@ -220,9 +220,9 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
void
tokenize_subject (struct rspamd_task *task, GTree ** tree)
{
- rspamd_fstring_t subject;
- const gchar *sub;
+ gchar *sub;
struct tokenizer *osb_tokenizer;
+ GArray *words;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
@@ -234,26 +234,21 @@ tokenize_subject (struct rspamd_task *task, GTree ** tree)
/* Try to use pre-defined subject */
if (task->subject != NULL) {
- subject.begin = task->subject;
- subject.len = strlen (task->subject);
- osb_tokenizer->tokenize_func (osb_tokenizer,
- task->task_pool,
- &subject,
- tree,
- FALSE,
- TRUE,
- NULL);
+ sub = task->subject;
+ }
+ else {
+ sub = (gchar *)g_mime_message_get_subject (task->message);
}
- if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
- subject.begin = (gchar *)sub;
- subject.len = strlen (sub);
+
+ if (sub != NULL) {
+ words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
osb_tokenizer->tokenize_func (osb_tokenizer,
- task->task_pool,
- &subject,
- tree,
- FALSE,
- TRUE,
- NULL);
+ task->task_pool,
+ words,
+ tree,
+ FALSE,
+ TRUE,
+ NULL);
}
}
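
In tokenize_subject, the two separate branches that could previously tokenize both the pre-set task->subject and the MIME Subject header collapse into a single path: pick one subject string, split it with rspamd_tokenize_text, and feed the resulting words to the OSB tokenizer. A compact restatement of the new flow:

    /* Sketch: subject handling after this commit. */
    sub = task->subject != NULL
        ? task->subject
        : (gchar *)g_mime_message_get_subject (task->message);

    if (sub != NULL) {
        words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
        osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool,
            words, tree, FALSE, TRUE, NULL);
    }
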
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index 51446b09d..ed47e0add 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -19,9 +19,13 @@ typedef struct token_node_s {
/* Common tokenizer structure */
struct tokenizer {
gchar *name;
- gint (*tokenize_func)(struct tokenizer *tokenizer, rspamd_mempool_t *pool,
- rspamd_fstring_t *input,
- GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
+ gint (*tokenize_func)(struct tokenizer *tokenizer,
+ rspamd_mempool_t *pool,
+ GArray *words,
+ GTree **cur,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions);
gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
};
@@ -42,7 +46,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
/* OSB tokenize function */
int osb_tokenize_text (struct tokenizer *tokenizer,
rspamd_mempool_t *pool,
- rspamd_fstring_t *input,
+ GArray *input,
GTree **cur,
gboolean save_token,
gboolean is_utf,