From: Vsevolod Stakhov Date: Thu, 23 Aug 2018 16:27:34 +0000 (+0100) Subject: [Project] Start unicode rework X-Git-Tag: 1.8.0~212 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=e9c773e6bb0e09b4802f3cb06b93b7a082e464ed;p=rspamd.git [Project] Start unicode rework --- diff --git a/src/libmime/message.c b/src/libmime/message.c index e4c59be63..5d9cf19d1 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -194,20 +194,28 @@ rspamd_mime_part_create_words (struct rspamd_task *task, { rspamd_stat_token_t *w, ucs_w; guint i, ucs_len = 0; + enum rspamd_tokenize_type tok_type; + + if (IS_PART_UTF (part)) { + tok_type = RSPAMD_TOKENIZE_UTF; + } + else { + tok_type = RSPAMD_TOKENIZE_RAW; + } /* Ugly workaround */ if (IS_PART_HTML (part)) { part->normalized_words = rspamd_tokenize_text ( part->stripped_content->data, - part->stripped_content->len, IS_PART_UTF (part), task->cfg, - part->exceptions, FALSE, + part->stripped_content->len, tok_type, task->cfg, + part->exceptions, NULL); } else { part->normalized_words = rspamd_tokenize_text ( part->stripped_content->data, - part->stripped_content->len, IS_PART_UTF (part), task->cfg, - part->exceptions, FALSE, + part->stripped_content->len, tok_type, task->cfg, + part->exceptions, NULL); } diff --git a/src/libmime/message.h b/src/libmime/message.h index b16011666..b0a7983b4 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -13,6 +13,8 @@ #include "mime_headers.h" #include "content_type.h" +#include <unicode/uchar.h> + struct rspamd_task; struct controller_session; struct html_content; @@ -77,16 +79,19 @@ struct rspamd_mime_part { #define IS_PART_RAW(part) (!((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF)) #define IS_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML) + struct rspamd_mime_text_part { const gchar *language; GPtrArray *languages; const gchar *real_charset; rspamd_ftok_t raw; - rspamd_ftok_t parsed; - GByteArray *content; - GByteArray *utf_raw_content; - GByteArray *stripped_content; 
- GPtrArray *newlines; /**< positions of newlines in text */ + rspamd_ftok_t parsed; /* decoded from mime encodings */ + GByteArray *content; /* utf8 encoded processed content */ + + UChar *ucs_raw_content; /* unicode raw content */ + GByteArray *utf_raw_content; /* utf raw content */ + GByteArray *stripped_content; /* utf content with no newlines */ + GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ struct html_content *html; GList *exceptions; /**< list of offsets of urls */ struct rspamd_mime_part *mime_part; diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index f58bf6150..540a9e23f 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -365,8 +365,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, } if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE, - NULL); + words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF, + NULL, NULL, NULL); if (words != NULL) { for (i = 0; i < words->len; i ++) { diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 36861b196..fce98c53f 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -26,7 +26,7 @@ typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, rspamd_stat_token_t * token, - GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature); + GList **exceptions, gsize *rl, gboolean check_signature); const gchar t_delimiters[255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, @@ -61,7 +61,7 @@ const gchar t_delimiters[255] = { static gboolean rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, gchar const **cur, rspamd_stat_token_t * token, - GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused) + GList **exceptions, gsize *rl, gboolean unused) { gsize remain, pos; const gchar *p; @@ -138,12 +138,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t 
* buf, } if (rl) { - if (is_utf) { - *rl = g_utf8_strlen (token->begin, token->len); - } - else { - *rl = token->len; - } + *rl = token->len; } token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; @@ -156,7 +151,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, static gboolean rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, gchar const **cur, rspamd_stat_token_t * token, - GList **exceptions, gboolean is_utf, gsize *rl, + GList **exceptions, gsize *rl, gboolean check_signature) { gint32 i, siglen = 0, remain; @@ -179,7 +174,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, ex = (*exceptions)->data; } - g_assert (is_utf); g_assert (cur != NULL); if (*cur == NULL) { @@ -332,9 +326,10 @@ process_exception: } GArray * -rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - struct rspamd_config *cfg, GList *exceptions, gboolean compat, - guint64 *hash) +rspamd_tokenize_text (const gchar *text, gsize len, + enum rspamd_tokenize_type how, + struct rspamd_config *cfg, GList *exceptions, + guint64 *hash) { rspamd_stat_token_t token, buf; const gchar *pos = NULL; @@ -358,11 +353,16 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, token.len = 0; token.flags = 0; - if (compat || !is_utf) { + switch (how) { + case RSPAMD_TOKENIZE_RAW: func = rspamd_tokenizer_get_word_compat; - } - else { + break; + case RSPAMD_TOKENIZE_UTF: func = rspamd_tokenizer_get_word; + break; + default: + g_assert_not_reached (); + break; } if (cfg != NULL) { @@ -375,7 +375,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), initial_size); - while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { + while (func (&buf, &pos, &token, &cur, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || (max_len > 0 && l > max_len)) { token.begin = pos; diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 530eb40a0..8be5f98a8 100644 --- 
a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -25,14 +25,22 @@ struct rspamd_stat_tokenizer { GPtrArray *result); }; +enum rspamd_tokenize_type { + RSPAMD_TOKENIZE_UTF = 0, + RSPAMD_TOKENIZE_RAW, + RSPAMD_TOKENIZE_UCS +}; + /* Compare two token nodes */ gint token_node_compare_func (gconstpointer a, gconstpointer b); /* Tokenize text into array of words (rspamd_stat_token_t type) */ -GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - struct rspamd_config *cfg, GList *exceptions, gboolean compat, - guint64 *hash); +GArray * rspamd_tokenize_text (const gchar *text, gsize len, + enum rspamd_tokenize_type how, + struct rspamd_config *cfg, + GList *exceptions, + guint64 *hash); /* OSB tokenize function */ gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 70e16118d..3de68e60a 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -1080,7 +1080,6 @@ lua_util_tokenize_text (lua_State *L) struct rspamd_process_exception *ex; GArray *res; rspamd_stat_token_t *w; - gboolean compat = FALSE; if (lua_type (L, 1) == LUA_TSTRING) { in = luaL_checklstring (L, 1, &len); @@ -1126,15 +1125,12 @@ lua_util_tokenize_text (lua_State *L) lua_pop (L, 1); } - if (lua_gettop (L) > 2 && lua_type (L, 3) == LUA_TBOOLEAN) { - compat = lua_toboolean (L, 3); - } - if (exceptions) { exceptions = g_list_reverse (exceptions); } - res = rspamd_tokenize_text ((gchar *)in, len, TRUE, NULL, exceptions, compat, + res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL, + exceptions, NULL); if (res == NULL) { diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index 9331e42dd..987879258 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -620,10 +620,9 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused) gdouble cur_score = 0.0; words = rspamd_tokenize_text (task->subject, strlen (task->subject), - TRUE, + 
RSPAMD_TOKENIZE_UTF, NULL, NULL, - FALSE, NULL); if (words && words->len > 0) {