source.dussan.org Git - rspamd.git/commitdiff
[Project] Start unicode rework
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Aug 2018 16:27:34 +0000 (17:27 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Aug 2018 16:27:34 +0000 (17:27 +0100)
src/libmime/message.c
src/libmime/message.h
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/lua/lua_util.c
src/plugins/chartable.c

index e4c59be630b10352c479f190b9ef210d7ed38e14..5d9cf19d1441ec9993579a2c2043e2494847a8b8 100644 (file)
@@ -194,20 +194,28 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
 {
        rspamd_stat_token_t *w, ucs_w;
        guint i, ucs_len = 0;
+       enum rspamd_tokenize_type tok_type;
+
+       if (IS_PART_UTF (part)) {
+               tok_type = RSPAMD_TOKENIZE_UTF;
+       }
+       else {
+               tok_type = RSPAMD_TOKENIZE_RAW;
+       }
 
        /* Ugly workaround */
        if (IS_PART_HTML (part)) {
                part->normalized_words = rspamd_tokenize_text (
                                part->stripped_content->data,
-                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
-                               part->exceptions, FALSE,
+                               part->stripped_content->len, tok_type, task->cfg,
+                               part->exceptions,
                                NULL);
        }
        else {
                part->normalized_words = rspamd_tokenize_text (
                                part->stripped_content->data,
-                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
-                               part->exceptions, FALSE,
+                               part->stripped_content->len, tok_type, task->cfg,
+                               part->exceptions,
                                NULL);
        }
 
index b16011666cb42f71c6c120c2d1a5f601774a80b1..b0a7983b47bb876f7f3bf3785b46631a8f31fb59 100644 (file)
@@ -13,6 +13,8 @@
 #include "mime_headers.h"
 #include "content_type.h"
 
+#include <unicode/uchar.h>
+
 struct rspamd_task;
 struct controller_session;
 struct html_content;
@@ -77,16 +79,19 @@ struct rspamd_mime_part {
 #define IS_PART_RAW(part) (!((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF))
 #define IS_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
 
+
 struct rspamd_mime_text_part {
        const gchar *language;
        GPtrArray *languages;
        const gchar *real_charset;
        rspamd_ftok_t raw;
-       rspamd_ftok_t parsed;
-       GByteArray *content;
-       GByteArray *utf_raw_content;
-       GByteArray *stripped_content;
-       GPtrArray *newlines;    /**< positions of newlines in text                                      */
+       rspamd_ftok_t parsed; /* decoded from mime encodings */
+       GByteArray *content; /* utf8 encoded processed content */
+
+       UChar *ucs_raw_content; /* unicode raw content */
+       GByteArray *utf_raw_content; /* utf raw content */
+       GByteArray *stripped_content; /* utf content with no newlines */
+       GPtrArray *newlines;    /**< positions of newlines in text, relative to content*/
        struct html_content *html;
        GList *exceptions;      /**< list of offsets of urls                                            */
        struct rspamd_mime_part *mime_part;
index f58bf6150d556fff5fd96d4cb43d8249c6e5f676..540a9e23f5252b7151fa0e669b35b923baccea05 100644 (file)
@@ -365,8 +365,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        }
 
        if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE,
-                               NULL);
+               words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+                               NULL, NULL, NULL);
                if (words != NULL) {
 
                        for (i = 0; i < words->len; i ++) {
index 36861b1965cb13104186dc5c2495d989acde6554..fce98c53fcf767fbe4bd201a9052ae45d181e698 100644 (file)
@@ -26,7 +26,7 @@
 
 typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
                rspamd_stat_token_t * token,
-               GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
+               GList **exceptions, gsize *rl, gboolean check_signature);
 
 const gchar t_delimiters[255] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -61,7 +61,7 @@ const gchar t_delimiters[255] = {
 static gboolean
 rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
                gchar const **cur, rspamd_stat_token_t * token,
-               GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
+               GList **exceptions, gsize *rl, gboolean unused)
 {
        gsize remain, pos;
        const gchar *p;
@@ -138,12 +138,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
        }
 
        if (rl) {
-               if (is_utf) {
-                       *rl = g_utf8_strlen (token->begin, token->len);
-               }
-               else {
-                       *rl = token->len;
-               }
+               *rl = token->len;
        }
 
        token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -156,7 +151,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
 static gboolean
 rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
                gchar const **cur, rspamd_stat_token_t * token,
-               GList **exceptions, gboolean is_utf, gsize *rl,
+               GList **exceptions, gsize *rl,
                gboolean check_signature)
 {
        gint32 i, siglen = 0, remain;
@@ -179,7 +174,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
                ex = (*exceptions)->data;
        }
 
-       g_assert (is_utf);
        g_assert (cur != NULL);
 
        if (*cur == NULL) {
@@ -332,9 +326,10 @@ process_exception:
 }
 
 GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               struct rspamd_config *cfg, GList *exceptions, gboolean compat,
-               guint64 *hash)
+rspamd_tokenize_text (const gchar *text, gsize len,
+                                         enum rspamd_tokenize_type how,
+                                         struct rspamd_config *cfg, GList *exceptions,
+                                         guint64 *hash)
 {
        rspamd_stat_token_t token, buf;
        const gchar *pos = NULL;
@@ -358,11 +353,16 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
        token.len = 0;
        token.flags = 0;
 
-       if (compat || !is_utf) {
+       switch (how) {
+       case RSPAMD_TOKENIZE_RAW:
                func = rspamd_tokenizer_get_word_compat;
-       }
-       else {
+               break;
+       case RSPAMD_TOKENIZE_UTF:
                func = rspamd_tokenizer_get_word;
+               break;
+       default:
+               g_assert_not_reached ();
+               break;
        }
 
        if (cfg != NULL) {
@@ -375,7 +375,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
        res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
                        initial_size);
 
-       while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
+       while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
                if (l == 0 || (min_len > 0 && l < min_len) ||
                                        (max_len > 0 && l > max_len)) {
                        token.begin = pos;
index 530eb40a04799bc6f7f6136b56837cea3f0ab912..8be5f98a843f2f794fa8e1b98c73cabc3aee4664 100644 (file)
@@ -25,14 +25,22 @@ struct rspamd_stat_tokenizer {
                        GPtrArray *result);
 };
 
+enum rspamd_tokenize_type { /* tokenization mode passed as `how` to rspamd_tokenize_text */
+       RSPAMD_TOKENIZE_UTF = 0, /* words are extracted with rspamd_tokenizer_get_word */
+       RSPAMD_TOKENIZE_RAW, /* words are extracted with rspamd_tokenizer_get_word_compat */
+       RSPAMD_TOKENIZE_UCS /* NOTE(review): no case for this value in the rspamd_tokenize_text switch shown in this commit; it would fall to default and hit g_assert_not_reached () */
+};
+
 /* Compare two token nodes */
 gint token_node_compare_func (gconstpointer a, gconstpointer b);
 
 
 /* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               struct rspamd_config *cfg, GList *exceptions, gboolean compat,
-               guint64 *hash);
+GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+                                                          enum rspamd_tokenize_type how,
+                                                          struct rspamd_config *cfg,
+                                                          GList *exceptions,
+                                                          guint64 *hash);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
index 70e16118d4ab746a58ac2f75295db0ee170f3fd5..3de68e60a48d35a3a56fb90281affdeac690dee6 100644 (file)
@@ -1080,7 +1080,6 @@ lua_util_tokenize_text (lua_State *L)
        struct rspamd_process_exception *ex;
        GArray *res;
        rspamd_stat_token_t *w;
-       gboolean compat = FALSE;
 
        if (lua_type (L, 1) == LUA_TSTRING) {
                in = luaL_checklstring (L, 1, &len);
@@ -1126,15 +1125,12 @@ lua_util_tokenize_text (lua_State *L)
                lua_pop (L, 1);
        }
 
-       if (lua_gettop (L) > 2 && lua_type (L, 3) == LUA_TBOOLEAN) {
-               compat = lua_toboolean (L, 3);
-       }
-
        if (exceptions) {
                exceptions = g_list_reverse (exceptions);
        }
 
-       res = rspamd_tokenize_text ((gchar *)in, len, TRUE, NULL, exceptions, compat,
+       res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL,
+                       exceptions,
                        NULL);
 
        if (res == NULL) {
index 9331e42ddb290ba59952c6983aba53c7f3c534cf..987879258159d1f3b6dde8a07668bdd886a6e72c 100644 (file)
@@ -620,10 +620,9 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
                gdouble cur_score = 0.0;
 
                words = rspamd_tokenize_text (task->subject, strlen (task->subject),
-                               TRUE,
+                               RSPAMD_TOKENIZE_UTF,
                                NULL,
                                NULL,
-                               FALSE,
                                NULL);
 
                if (words && words->len > 0) {