aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-05 17:43:20 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-05 17:44:35 +0100
commit: a64ce9b4245153e68fbbcd9c6610b9c1ccf76493 (patch)
tree: b9b9798b77974cf8d0793c948966a95963266771 /src/libstat/tokenizers
parent: 3807688a67be66d00a24172c13b00b6fb1816d69 (diff)
download: rspamd-a64ce9b4245153e68fbbcd9c6610b9c1ccf76493.tar.gz
download: rspamd-a64ce9b4245153e68fbbcd9c6610b9c1ccf76493.zip
[Rework] Rework utf content processing in text parts
- Store unicode in UTF parts
- Store unicode for HTML parts
- Rename struct fields and split them into unicode/utf components
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 8
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 2
2 files changed, 5 insertions, 5 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index fce98c53f..5436430fe 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -59,7 +59,7 @@ const gchar t_delimiters[255] = {
/* Get next word from specified f_str_t buf */
static gboolean
-rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gsize *rl, gboolean unused)
{
@@ -149,7 +149,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
}
static gboolean
-rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gsize *rl,
gboolean check_signature)
@@ -355,10 +355,10 @@ rspamd_tokenize_text (const gchar *text, gsize len,
switch (how) {
case RSPAMD_TOKENIZE_RAW:
- func = rspamd_tokenizer_get_word_compat;
+ func = rspamd_tokenizer_get_word_raw;
break;
case RSPAMD_TOKENIZE_UTF:
- func = rspamd_tokenizer_get_word;
+ func = rspamd_tokenizer_get_word_utf8;
break;
default:
g_assert_not_reached ();
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 8be5f98a8..16ab142fd 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -28,7 +28,7 @@ struct rspamd_stat_tokenizer {
enum rspamd_tokenize_type {
RSPAMD_TOKENIZE_UTF = 0,
RSPAMD_TOKENIZE_RAW,
- RSPAMD_TOKENIZE_UCS
+ RSPAMD_TOKENIZE_UNICODE
};
/* Compare two token nodes */