about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-08 09:41:19 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-08 09:41:57 +0000
commit: 9ebb11d00a678e97f14b4cd33a68c3e5cd385392 (patch)
tree: 2a701df0af31d895766325f8d5d759eec7a50b47
parent: ad33efe0d2a35eee8a3a47a6fdffcae6d6b8e185 (diff)
download: rspamd-9ebb11d00a678e97f14b4cd33a68c3e5cd385392.tar.gz
rspamd-9ebb11d00a678e97f14b4cd33a68c3e5cd385392.zip
[Minor] Move subject tokenisation to a separate routine
Issue: #2623
-rw-r--r-- src/libstat/stat_process.c 27
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 70
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 2
-rw-r--r-- src/plugins/chartable.c 19
4 files changed, 71 insertions, 47 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 2641226d5..228b6b87b 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -313,10 +313,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
{
struct rspamd_mime_text_part *part;
rspamd_cryptobox_hash_state_t hst;
- rspamd_stat_token_t *tok;
rspamd_token_t *st_tok;
GArray *words;
- gchar *sub = NULL;
guint i, reserved_len = 0;
gdouble *pdiff;
guchar hout[rspamd_cryptobox_HASHBYTES];
@@ -361,29 +359,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
if (task->subject != NULL) {
- sub = task->subject;
- }
-
- if (sub != NULL) {
- UText utxt = UTEXT_INITIALIZER;
- UErrorCode uc_err = U_ZERO_ERROR;
- gsize slen = strlen (sub);
-
- utext_openUTF8 (&utxt,
- sub,
- slen,
- &uc_err);
-
- words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
- NULL, NULL, NULL);
-
+ words = rspamd_tokenize_subject (task);
if (words != NULL) {
-
- for (i = 0; i < words->len; i ++) {
- tok = &g_array_index (words, rspamd_stat_token_t, i);
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
- }
-
st_ctx->tokenizer->tokenize_func (st_ctx,
task->task_pool,
words,
@@ -394,8 +371,6 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
rspamd_mempool_add_destructor (task->task_pool,
rspamd_array_free_hard, words);
}
-
- utext_close (&utxt);
}
rspamd_stat_tokenize_parts_metadata (st_ctx, task);
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index c8e8e44df..2ef5c08fb 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -463,6 +463,70 @@ start_over:
#undef SHIFT_EX
-/*
- * vi:ts=4
- */
+GArray *
+rspamd_tokenize_subject (struct rspamd_task *task)
+{
+ UText utxt = UTEXT_INITIALIZER;
+ UErrorCode uc_err = U_ZERO_ERROR;
+ gsize slen;
+ gboolean valid_utf = TRUE;
+ GArray *words = NULL;
+ guint i = 0;
+ gint32 uc;
+ rspamd_stat_token_t *tok;
+
+ if (task->subject) {
+ const gchar *p = task->subject;
+
+ slen = strlen (task->subject);
+
+ while (i < slen) {
+ U8_NEXT (p, i, slen, uc);
+
+ if (((gint32) uc) < 0) {
+ valid_utf = FALSE;
+ break;
+ }
+#if U_ICU_VERSION_MAJOR_NUM < 50
+ if (u_isalpha (uc)) {
+ gint32 sc = ublock_getCode (uc);
+
+ if (sc == UBLOCK_THAI) {
+ valid_utf = FALSE;
+ msg_info_task ("enable workaround for Thai characters for old libicu");
+ break;
+ }
+ }
+#endif
+ }
+
+ if (valid_utf) {
+ utext_openUTF8 (&utxt,
+ task->subject,
+ slen,
+ &uc_err);
+
+ words = rspamd_tokenize_text (task->subject, slen,
+ &utxt, RSPAMD_TOKENIZE_UTF,
+ task->cfg, NULL, NULL);
+
+ utext_close (&utxt);
+ }
+ else {
+ words = rspamd_tokenize_text (task->subject, slen,
+ NULL, RSPAMD_TOKENIZE_RAW,
+ task->cfg, NULL, NULL);
+ }
+ }
+
+ if (words != NULL) {
+
+ for (i = 0; i < words->len; i++) {
+ tok = &g_array_index (words, rspamd_stat_token_t, i);
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+ }
+ }
+
+ return words;
+}
+
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 6c538eafc..bfabde74f 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -57,6 +57,8 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,
gsize *len);
+
+GArray * rspamd_tokenize_subject (struct rspamd_task *task);
#endif
/*
* vi:ts=4
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index ce80b2e66..0efbe55ca 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -622,24 +622,9 @@ chartable_symbol_callback (struct rspamd_task *task,
if (task->subject != NULL) {
GArray *words;
rspamd_stat_token_t *w;
- guint i;
gdouble cur_score = 0.0;
- UText utxt = UTEXT_INITIALIZER;
- UErrorCode uc_err = U_ZERO_ERROR;
- gsize slen = strlen (task->subject);
-
- utext_openUTF8 (&utxt,
- task->subject,
- slen,
- &uc_err);
-
- words = rspamd_tokenize_text (task->subject, slen,
- &utxt,
- RSPAMD_TOKENIZE_UTF,
- NULL,
- NULL,
- NULL);
+ words = rspamd_tokenize_subject (task);
if (words && words->len > 0) {
for (i = 0; i < words->len; i++) {
@@ -664,8 +649,6 @@ chartable_symbol_callback (struct rspamd_task *task,
if (words) {
g_array_free (words, TRUE);
}
-
- utext_close (&utxt);
}
rspamd_symcache_finalize_item (task, item);