diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-08-23 17:27:34 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-08-23 17:27:34 +0100 |
commit | e9c773e6bb0e09b4802f3cb06b93b7a082e464ed (patch) | |
tree | 96347e9b0885687b3ad6de3444c5bc5759f5e58a /src/libstat | |
parent | ed9d4ec8c8b62664f0157ccb6dceaba264e1891b (diff) | |
download | rspamd-e9c773e6bb0e09b4802f3cb06b93b7a082e464ed.tar.gz rspamd-e9c773e6bb0e09b4802f3cb06b93b7a082e464ed.zip |
[Project] Start unicode rework
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/stat_process.c | 4 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 34 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 14 |
3 files changed, 30 insertions, 22 deletions
```diff
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index f58bf6150..540a9e23f 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -365,8 +365,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 		}
 
 		if (sub != NULL) {
-			words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE,
-					NULL);
+			words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+					NULL, NULL, NULL);
 			if (words != NULL) {
 
 				for (i = 0; i < words->len; i ++) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 36861b196..fce98c53f 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -26,7 +26,7 @@
 
 typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
 		rspamd_stat_token_t * token,
-		GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
+		GList **exceptions, gsize *rl, gboolean check_signature);
 
 const gchar t_delimiters[255] = {
 		0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -61,7 +61,7 @@ const gchar t_delimiters[255] = {
 static gboolean
 rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
 		gchar const **cur, rspamd_stat_token_t * token,
-		GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
+		GList **exceptions, gsize *rl, gboolean unused)
 {
 	gsize remain, pos;
 	const gchar *p;
@@ -138,12 +138,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
 	}
 
 	if (rl) {
-		if (is_utf) {
-			*rl = g_utf8_strlen (token->begin, token->len);
-		}
-		else {
-			*rl = token->len;
-		}
+		*rl = token->len;
 	}
 
 	token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -156,7 +151,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
 static gboolean
 rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 		gchar const **cur, rspamd_stat_token_t * token,
-		GList **exceptions, gboolean is_utf, gsize *rl,
+		GList **exceptions, gsize *rl,
 		gboolean check_signature)
 {
 	gint32 i, siglen = 0, remain;
@@ -179,7 +174,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 		ex = (*exceptions)->data;
 	}
 
-	g_assert (is_utf);
 	g_assert (cur != NULL);
 
 	if (*cur == NULL) {
@@ -332,9 +326,10 @@ process_exception:
 }
 
 GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-		struct rspamd_config *cfg, GList *exceptions, gboolean compat,
-		guint64 *hash)
+rspamd_tokenize_text (const gchar *text, gsize len,
+		enum rspamd_tokenize_type how,
+		struct rspamd_config *cfg, GList *exceptions,
+		guint64 *hash)
 {
 	rspamd_stat_token_t token, buf;
 	const gchar *pos = NULL;
@@ -358,11 +353,16 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 	token.len = 0;
 	token.flags = 0;
 
-	if (compat || !is_utf) {
+	switch (how) {
+	case RSPAMD_TOKENIZE_RAW:
 		func = rspamd_tokenizer_get_word_compat;
-	}
-	else {
+		break;
+	case RSPAMD_TOKENIZE_UTF:
 		func = rspamd_tokenizer_get_word;
+		break;
+	default:
+		g_assert_not_reached ();
+		break;
 	}
 
 	if (cfg != NULL) {
@@ -375,7 +375,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 	res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
 			initial_size);
 
-	while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
+	while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
 		if (l == 0 || (min_len > 0 && l < min_len) ||
 					(max_len > 0 && l > max_len)) {
 			token.begin = pos;
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 530eb40a0..8be5f98a8 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -25,14 +25,22 @@ struct rspamd_stat_tokenizer {
 			GPtrArray *result);
 };
 
+enum rspamd_tokenize_type {
+	RSPAMD_TOKENIZE_UTF = 0,
+	RSPAMD_TOKENIZE_RAW,
+	RSPAMD_TOKENIZE_UCS
+};
+
 /* Compare two token nodes */
 gint token_node_compare_func (gconstpointer a, gconstpointer b);
 
 
 /* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-		struct rspamd_config *cfg, GList *exceptions, gboolean compat,
-		guint64 *hash);
+GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+		enum rspamd_tokenize_type how,
+		struct rspamd_config *cfg,
+		GList *exceptions,
+		guint64 *hash);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
```