about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-26 16:40:51 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-26 16:40:51 +0000
commit 2dcb09b6eae0b21d4da68f9323be4cbfeb2b4237 (patch)
tree 0a3751d753308c7b26c9b6495a06ac7d64345184
parent d2f95b4a010032f96a947b8ac63e1da51af14ad5 (diff)
download rspamd-2dcb09b6eae0b21d4da68f9323be4cbfeb2b4237.tar.gz
download rspamd-2dcb09b6eae0b21d4da68f9323be4cbfeb2b4237.zip
[Project] Use more generalised API to produce meta words
-rw-r--r--  src/libmime/message.c                 4
-rw-r--r--  src/libserver/task.c                  4
-rw-r--r--  src/libserver/task.h                  2
-rw-r--r--  src/libstat/stat_api.h                2
-rw-r--r--  src/libstat/stat_process.c           21
-rw-r--r--  src/libstat/tokenizers/tokenizers.c 127
-rw-r--r--  src/libstat/tokenizers/tokenizers.h   5
-rw-r--r--  src/lua/lua_task.c                    4
-rw-r--r--  src/lua/lua_util.c                    2
-rw-r--r--  src/plugins/chartable.c              38
10 files changed, 117 insertions, 92 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 994720f76..46f528ba7 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -173,7 +173,7 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
&part->utf_stripped_text,
tok_type, task->cfg,
part->exceptions,
- NULL);
+ NULL, NULL);
if (part->utf_words) {
@@ -1278,6 +1278,8 @@ rspamd_message_process (struct rspamd_task *task)
*var /= (double)total_words;
}
}
+
+ rspamd_tokenize_meta_words (task);
}
diff --git a/src/libserver/task.c b/src/libserver/task.c
index 6135bced4..664715fea 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -282,6 +282,10 @@ rspamd_task_free (struct rspamd_task *task)
rspamd_email_address_free (task->from_envelope);
}
+ if (task->meta_words) {
+ g_array_free (task->meta_words, TRUE);
+ }
+
ucl_object_unref (task->messages);
if (task->re_rt) {
diff --git a/src/libserver/task.h b/src/libserver/task.h
index 005f6af26..b41b308e4 100644
--- a/src/libserver/task.h
+++ b/src/libserver/task.h
@@ -173,6 +173,8 @@ struct rspamd_task {
struct rspamd_metric_result *result; /**< Metric result */
GHashTable *lua_cache; /**< cache of lua objects */
GPtrArray *tokens; /**< statistics tokens */
+ GArray *meta_words; /**< rspamd_stat_token_t produced from meta headers
+ (e.g. Subject) */
GPtrArray *rcpt_mime;
GPtrArray *rcpt_envelope; /**< array of rspamd_email_address */
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index b912f8d20..ee8db8af2 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -30,7 +30,7 @@
#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
-#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1u << 4)
+#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4)
#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index ed3f78fde..d601dbee9 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -126,7 +126,6 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
struct rspamd_mime_text_part *part;
rspamd_cryptobox_hash_state_t hst;
rspamd_token_t *st_tok;
- GArray *words;
guint i, reserved_len = 0;
gdouble *pdiff;
guchar hout[rspamd_cryptobox_HASHBYTES];
@@ -170,19 +169,13 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
}
- if (task->subject != NULL) {
- words = rspamd_tokenize_subject (task);
- if (words != NULL) {
- st_ctx->tokenizer->tokenize_func (st_ctx,
- task,
- words,
- TRUE,
- "SUBJECT",
- task->tokens);
-
- rspamd_mempool_add_destructor (task->task_pool,
- rspamd_array_free_hard, words);
- }
+ if (task->meta_words != NULL) {
+ st_ctx->tokenizer->tokenize_func (st_ctx,
+ task,
+ task->meta_words,
+ TRUE,
+ "SUBJECT",
+ task->tokens);
}
rspamd_stat_tokenize_parts_metadata (st_ctx, task);
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 604fc070e..dcd556910 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -251,7 +251,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,
- guint64 *hash)
+ guint64 *hash,
+ GArray *cur_words)
{
rspamd_stat_token_t token, buf;
const gchar *pos = NULL;
@@ -265,7 +266,7 @@ rspamd_tokenize_text (const gchar *text, gsize len,
static UBreakIterator* bi = NULL;
if (text == NULL) {
- return NULL;
+ return cur_words;
}
buf.original.begin = text;
@@ -281,8 +282,13 @@ rspamd_tokenize_text (const gchar *text, gsize len,
initial_size = word_decay * 2;
}
- res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
- initial_size);
+ if (!cur_words) {
+ res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
+ initial_size);
+ }
+ else {
+ res = cur_words;
+ }
if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
@@ -474,71 +480,96 @@ start_over:
#undef SHIFT_EX
-GArray *
-rspamd_tokenize_subject (struct rspamd_task *task)
+static void
+rspamd_add_metawords_from_str (const gchar *beg, gsize len,
+ struct rspamd_task *task)
{
UText utxt = UTEXT_INITIALIZER;
UErrorCode uc_err = U_ZERO_ERROR;
- gsize slen;
- gboolean valid_utf = TRUE;
- GArray *words = NULL;
guint i = 0;
- gint32 uc;
- rspamd_stat_token_t *tok;
+ UChar32 uc;
+ gboolean valid_utf = TRUE;
- if (task->subject) {
- const gchar *p = task->subject;
+ while (i < len) {
+ U8_NEXT (beg, i, len, uc);
- slen = strlen (task->subject);
+ if (((gint32) uc) < 0) {
+ valid_utf = FALSE;
+ break;
+ }
- while (i < slen) {
- U8_NEXT (p, i, slen, uc);
+#if U_ICU_VERSION_MAJOR_NUM < 50
+ if (u_isalpha (uc)) {
+ gint32 sc = ublock_getCode (uc);
- if (((gint32) uc) < 0) {
+ if (sc == UBLOCK_THAI) {
valid_utf = FALSE;
+ msg_info_task ("enable workaround for Thai characters for old libicu");
break;
}
-#if U_ICU_VERSION_MAJOR_NUM < 50
- if (u_isalpha (uc)) {
- gint32 sc = ublock_getCode (uc);
-
- if (sc == UBLOCK_THAI) {
- valid_utf = FALSE;
- msg_info_task ("enable workaround for Thai characters for old libicu");
- break;
- }
- }
-#endif
}
+#endif
+ }
- if (valid_utf) {
- utext_openUTF8 (&utxt,
- task->subject,
- slen,
- &uc_err);
+ if (valid_utf) {
+ utext_openUTF8 (&utxt,
+ beg,
+ len,
+ &uc_err);
- words = rspamd_tokenize_text (task->subject, slen,
- &utxt, RSPAMD_TOKENIZE_UTF,
- task->cfg, NULL, NULL);
+ task->meta_words = rspamd_tokenize_text (beg, len,
+ &utxt, RSPAMD_TOKENIZE_UTF,
+ task->cfg, NULL, NULL, task->meta_words);
- utext_close (&utxt);
- }
- else {
- words = rspamd_tokenize_text (task->subject, slen,
- NULL, RSPAMD_TOKENIZE_RAW,
- task->cfg, NULL, NULL);
- }
+ utext_close (&utxt);
}
+ else {
+ task->meta_words = rspamd_tokenize_text (beg, len,
+ NULL, RSPAMD_TOKENIZE_RAW,
+ task->cfg, NULL, NULL, task->meta_words);
+ }
+}
+
+void
+rspamd_tokenize_meta_words (struct rspamd_task *task)
+{
+ guint i = 0;
+ rspamd_stat_token_t *tok;
+
+ if (task->subject) {
+ rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task);
+ }
+
+ if (task->from_mime) {
+ struct rspamd_email_address *addr;
- if (words != NULL) {
+ addr = g_ptr_array_index (task->from_mime, 0);
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index (words, rspamd_stat_token_t, i);
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+ if (addr->name) {
+ rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task);
}
}
- return words;
+ if (task->meta_words != NULL) {
+ const gchar *language = NULL;
+
+ if (task->text_parts && task->text_parts->len > 0) {
+ struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0);
+
+ if (tp->language) {
+ language = tp->language;
+ }
+ }
+
+ rspamd_normalize_words (task->meta_words, task->task_pool);
+ rspamd_stem_words (task->meta_words, task->task_pool, language,
+ task->lang_det);
+
+ for (i = 0; i < task->meta_words->len; i++) {
+ tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
+ }
+ }
}
static inline void
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 683d728ed..784426d31 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -43,7 +43,8 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,
- guint64 *hash);
+ guint64 *hash,
+ GArray *cur_words);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
@@ -64,7 +65,7 @@ void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
const gchar *language,
struct rspamd_lang_detector *d);
-GArray * rspamd_tokenize_subject (struct rspamd_task *task);
+void rspamd_tokenize_meta_words (struct rspamd_task *task);
#endif
/*
* vi:ts=4
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 4f28e9492..1b5f33cb7 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -4796,8 +4796,8 @@ lua_push_stat_token (lua_State *L, rspamd_token_t *tok)
lua_pushboolean (L, true);
lua_settable (L, -3);
}
- if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_SUBJECT) {
- lua_pushstring (L, "subject");
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
+ lua_pushstring (L, "header");
lua_pushboolean (L, true);
lua_settable (L, -3);
}
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index ec22b8a9a..a064bce5b 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1144,7 +1144,7 @@ lua_util_tokenize_text (lua_State *L)
&utxt,
RSPAMD_TOKENIZE_UTF, NULL,
exceptions,
- NULL);
+ NULL, NULL);
if (res == NULL) {
lua_pushnil (L);
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index c566cc517..e48d19ea2 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -621,35 +621,27 @@ chartable_symbol_callback (struct rspamd_task *task,
rspamd_chartable_process_part (task, part, chartable_module_ctx);
}
- if (task->subject != NULL) {
- GArray *words;
+ if (task->meta_words != NULL) {
rspamd_stat_token_t *w;
- gdouble cur_score = 0.0;
+ gdouble cur_score = 0;
+ gsize arlen = task->meta_words->len;
- words = rspamd_tokenize_subject (task);
-
- if (words && words->len > 0) {
- for (i = 0; i < words->len; i++) {
- w = &g_array_index (words, rspamd_stat_token_t, i);
- cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
- NULL, chartable_module_ctx);
- }
-
- cur_score /= (gdouble)words->len;
-
- if (cur_score > 2.0) {
- cur_score = 2.0;
- }
+ for (i = 0; i < arlen; i++) {
+ w = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
+ cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
+ NULL, chartable_module_ctx);
+ }
- if (cur_score > chartable_module_ctx->threshold) {
- rspamd_task_insert_result (task, chartable_module_ctx->symbol,
- cur_score, "subject");
+ cur_score /= (gdouble)arlen;
- }
+ if (cur_score > 2.0) {
+ cur_score = 2.0;
}
- if (words) {
- g_array_free (words, TRUE);
+ if (cur_score > chartable_module_ctx->threshold) {
+ rspamd_task_insert_result (task, chartable_module_ctx->symbol,
+ cur_score, "subject");
+
}
}