diff options
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 127 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 5 |
2 files changed, 82 insertions, 50 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 604fc070e..dcd556910 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -251,7 +251,8 @@ rspamd_tokenize_text (const gchar *text, gsize len, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, - guint64 *hash) + guint64 *hash, + GArray *cur_words) { rspamd_stat_token_t token, buf; const gchar *pos = NULL; @@ -265,7 +266,7 @@ rspamd_tokenize_text (const gchar *text, gsize len, static UBreakIterator* bi = NULL; if (text == NULL) { - return NULL; + return cur_words; } buf.original.begin = text; @@ -281,8 +282,13 @@ rspamd_tokenize_text (const gchar *text, gsize len, initial_size = word_decay * 2; } - res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), - initial_size); + if (!cur_words) { + res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), + initial_size); + } + else { + res = cur_words; + } if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) { @@ -474,71 +480,96 @@ start_over: #undef SHIFT_EX -GArray * -rspamd_tokenize_subject (struct rspamd_task *task) +static void +rspamd_add_metawords_from_str (const gchar *beg, gsize len, + struct rspamd_task *task) { UText utxt = UTEXT_INITIALIZER; UErrorCode uc_err = U_ZERO_ERROR; - gsize slen; - gboolean valid_utf = TRUE; - GArray *words = NULL; guint i = 0; - gint32 uc; - rspamd_stat_token_t *tok; + UChar32 uc; + gboolean valid_utf = TRUE; - if (task->subject) { - const gchar *p = task->subject; + while (i < len) { + U8_NEXT (beg, i, len, uc); - slen = strlen (task->subject); + if (((gint32) uc) < 0) { + valid_utf = FALSE; + break; + } - while (i < slen) { - U8_NEXT (p, i, slen, uc); +#if U_ICU_VERSION_MAJOR_NUM < 50 + if (u_isalpha (uc)) { + gint32 sc = ublock_getCode (uc); - if (((gint32) uc) < 0) { + if (sc == UBLOCK_THAI) { valid_utf = FALSE; + msg_info_task ("enable workaround for Thai characters for old libicu"); break; } -#if U_ICU_VERSION_MAJOR_NUM < 50 - if (u_isalpha (uc)) { - gint32 sc = ublock_getCode (uc); - - if (sc == UBLOCK_THAI) { - valid_utf = FALSE; - msg_info_task ("enable workaround for Thai characters for old libicu"); - break; - } - } -#endif } +#endif + } - if (valid_utf) { - utext_openUTF8 (&utxt, - task->subject, - slen, - &uc_err); + if (valid_utf) { + utext_openUTF8 (&utxt, + beg, + len, + &uc_err); - words = rspamd_tokenize_text (task->subject, slen, - &utxt, RSPAMD_TOKENIZE_UTF, - task->cfg, NULL, NULL); + task->meta_words = rspamd_tokenize_text (beg, len, + &utxt, RSPAMD_TOKENIZE_UTF, + task->cfg, NULL, NULL, task->meta_words); - utext_close (&utxt); - } - else { - words = rspamd_tokenize_text (task->subject, slen, - NULL, RSPAMD_TOKENIZE_RAW, - task->cfg, NULL, NULL); - } + utext_close (&utxt); } + else { + task->meta_words = rspamd_tokenize_text (beg, len, + NULL, RSPAMD_TOKENIZE_RAW, + task->cfg, NULL, NULL, task->meta_words); + } +} + +void +rspamd_tokenize_meta_words (struct rspamd_task *task) +{ + guint i = 0; + rspamd_stat_token_t *tok; + + if (task->subject) { + rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task); + } + + if (task->from_mime) { + struct rspamd_email_address *addr; - if (words != NULL) { + addr = g_ptr_array_index (task->from_mime, 0); - for (i = 0; i < words->len; i++) { - tok = &g_array_index (words, rspamd_stat_token_t, i); - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; + if (addr->name) { + rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task); } } - return words; + if (task->meta_words != NULL) { + const gchar *language = NULL; + + if (task->text_parts && task->text_parts->len > 0) { + struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0); + + if (tp->language) { + language = tp->language; + } + } + + rspamd_normalize_words (task->meta_words, task->task_pool); + rspamd_stem_words (task->meta_words, task->task_pool, language, + task->lang_det); + + for (i = 0; i < task->meta_words->len; i++) { + tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i); + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER; + } + } } static inline void diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 683d728ed..784426d31 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -43,7 +43,8 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, - guint64 *hash); + guint64 *hash, + GArray *cur_words); /* OSB tokenize function */ gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, @@ -64,7 +65,7 @@ void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, const gchar *language, struct rspamd_lang_detector *d); -GArray * rspamd_tokenize_subject (struct rspamd_task *task); +void rspamd_tokenize_meta_words (struct rspamd_task *task); #endif /* * vi:ts=4 |