aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-05 17:43:20 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-05 17:44:35 +0100
commit a64ce9b4245153e68fbbcd9c6610b9c1ccf76493 (patch)
tree b9b9798b77974cf8d0793c948966a95963266771 /src/libstat
parent 3807688a67be66d00a24172c13b00b6fb1816d69 (diff)
download rspamd-a64ce9b4245153e68fbbcd9c6610b9c1ccf76493.tar.gz
rspamd-a64ce9b4245153e68fbbcd9c6610b9c1ccf76493.zip
[Rework] Rework utf content processing in text parts
- Store unicode in UTF parts
- Store unicode for HTML parts
- Rename struct fields and split them into unicode/utf components
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/stat_process.c | 8
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 8
-rw-r--r-- src/libstat/tokenizers/tokenizers.h | 2
3 files changed, 9 insertions, 9 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 540a9e23f..394173444 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -331,8 +331,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
for (i = 0; i < task->text_parts->len; i++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
- reserved_len += part->normalized_words->len;
+ if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
+ reserved_len += part->utf_words->len;
}
/* XXX: normal window size */
reserved_len += 5;
@@ -346,9 +346,9 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+ if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
- part->normalized_words, IS_PART_UTF (part),
+ part->utf_words, IS_PART_UTF (part),
NULL, task->tokens);
}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index fce98c53f..5436430fe 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -59,7 +59,7 @@ const gchar t_delimiters[255] = {
/* Get next word from specified f_str_t buf */
static gboolean
-rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gsize *rl, gboolean unused)
{
@@ -149,7 +149,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
}
static gboolean
-rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gsize *rl,
gboolean check_signature)
@@ -355,10 +355,10 @@ rspamd_tokenize_text (const gchar *text, gsize len,
switch (how) {
case RSPAMD_TOKENIZE_RAW:
- func = rspamd_tokenizer_get_word_compat;
+ func = rspamd_tokenizer_get_word_raw;
break;
case RSPAMD_TOKENIZE_UTF:
- func = rspamd_tokenizer_get_word;
+ func = rspamd_tokenizer_get_word_utf8;
break;
default:
g_assert_not_reached ();
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 8be5f98a8..16ab142fd 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -28,7 +28,7 @@ struct rspamd_stat_tokenizer {
enum rspamd_tokenize_type {
RSPAMD_TOKENIZE_UTF = 0,
RSPAMD_TOKENIZE_RAW,
- RSPAMD_TOKENIZE_UCS
+ RSPAMD_TOKENIZE_UNICODE
};
/* Compare two token nodes */