source.dussan.org Git - rspamd.git/commitdiff
[Project] Various unicode fixes in language detector
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)
src/libmime/lang_detection.c
src/libmime/lang_detection.h
src/libmime/message.c
src/libmime/message.h
src/libserver/task.c
src/libstat/tokenizers/tokenizers.c

index b2a2f1f6cbc9ced0c5edf9984f2b6e6f15c0b6b1..dfcbb527ac33fdb3757801ce9721091333a6db6b 100644 (file)
@@ -24,6 +24,7 @@
 
 #include <glob.h>
 #include <unicode/utf8.h>
+#include <unicode/utf16.h>
 #include <unicode/ucnv.h>
 #include <unicode/uchar.h>
 #include <unicode/ustring.h>
@@ -873,31 +874,6 @@ end:
        return ret;
 }
 
-
-void
-rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
-               rspamd_mempool_t *pool,
-               rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
-{
-       UChar *out;
-       int32_t nsym;
-       UErrorCode uc_err = U_ZERO_ERROR;
-
-       ucs_token->flags = utf_token->flags;
-       out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->normalized.len + 1));
-       nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->normalized.len + 1),
-                       utf_token->normalized.begin, utf_token->normalized.len, &uc_err);
-
-       if (nsym >= 0 && uc_err == U_ZERO_ERROR) {
-               rspamd_language_detector_ucs_lowercase (out, nsym);
-               ucs_token->normalized.begin = (const gchar *) out;
-               ucs_token->normalized.len = nsym;
-       }
-       else {
-               ucs_token->normalized.len = 0;
-       }
-}
-
 static void
 rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
                goffset *offsets_out)
@@ -905,6 +881,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
        guint step_len, remainder, i, out_idx;
        guint64 coin, sel;
        rspamd_stat_token_t *tok;
+       UChar32 first, last;
 
        g_assert (nwords != 0);
        g_assert (offsets_out != NULL);
@@ -942,11 +919,17 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
                for (;;) {
                        tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
                        /* Filter bad tokens */
-                       if (tok->normalized.len >= 2 &&
-                               u_isalpha (*(UChar *)tok->normalized.begin) &&
-                               u_isalpha (*(((UChar *)tok->normalized.begin) + (tok->normalized.len - 1)))) {
-                               offsets_out[out_idx] = sel;
-                               break;
+
+                       if (tok->normalized.len >= 2) {
+                               U16_GET_OR_FFFD (tok->normalized.begin, 0, 0, tok->normalized.len,
+                                               first);
+                               U16_GET_OR_FFFD (tok->normalized.begin, 0, tok->normalized.len - 1,
+                                               tok->normalized.len,
+                                               last);
+                               if (u_isalpha (first) && u_isalpha (last)) {
+                                       offsets_out[out_idx] = sel;
+                                       break;
+                               }
                        }
                        else {
                                ntries ++;
@@ -966,8 +949,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
                }
        }
 
-
-
        /*
         * Fisher-Yates algorithm:
         * for i from 0 to n−2 do
@@ -1001,13 +982,13 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
                        window[0] = (UChar)' ';
 
                        for (i = 0; i < wlen - 1; i ++) {
-                               window[i + 1] = *(((UChar *)tok->normalized.begin) + i);
+                               window[i + 1] = tok->unicode.begin[i];
                        }
                }
                else if (cur_off + wlen == tok->normalized.len + 1) {
                        /* Add trailing space */
                        for (i = 0; i < wlen - 1; i ++) {
-                               window[i] = *(((UChar *)tok->normalized.begin) + cur_off + i);
+                               window[i] = tok->unicode.begin[cur_off + i];
                        }
                        window[wlen - 1] = (UChar)' ';
                }
@@ -1018,7 +999,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
                else {
                        /* Normal case */
                        for (i = 0; i < wlen; i++) {
-                               window[i] = *(((UChar *) tok->normalized.begin) + cur_off + i);
+                               window[i] = tok->unicode.begin[cur_off + i];
                        }
                }
        }
@@ -1027,7 +1008,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
                        return -1;
                }
 
-               window[0] = *(((UChar *)tok->normalized.begin) + cur_off);
+               window[0] = tok->unicode.begin[cur_off];
        }
 
        return cur_off + 1;
@@ -1200,10 +1181,7 @@ rspamd_language_detector_detect_type (struct rspamd_task *task,
        for (i = 0; i < nparts; i++) {
                tok = &g_array_index (words, rspamd_stat_token_t,
                                selected_words[i]);
-               rspamd_language_detector_to_ucs (task->lang_det,
-                               task->task_pool,
-                               tok, &ucs_w);
-               rspamd_language_detector_detect_word (task, d, &ucs_w, candidates,
+               rspamd_language_detector_detect_word (task, d, tok, candidates,
                                d->trigramms[cat]);
        }
 
index 204bdf9afb0adec8066579eecf5e2122852c3014..517ab037efbfaccb4efe70086d667f766c171d7c 100644 (file)
@@ -62,17 +62,6 @@ struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config
 struct rspamd_lang_detector* rspamd_language_detector_ref (struct rspamd_lang_detector* d);
 void rspamd_language_detector_unref (struct rspamd_lang_detector* d);
 
-/**
- * Convert string from utf8 to ucs32
- * @param d
- * @param utf_token
- * @param ucs_token
- */
-void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
-               rspamd_mempool_t *pool,
-               rspamd_stat_token_t *utf_token,
-               rspamd_stat_token_t *ucs_token);
-
 /**
  * Try to detect language of words
  * @param d
index 7572a417866162893c2b1954255dca239d38c57a..4a765643a1e00d6b62e752986a3740f9a291cfb7 100644 (file)
@@ -711,7 +711,6 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
        if (text_part->utf_raw_content != NULL) {
                /* Different from HTML, where we also parse HTML and strip tags */
                text_part->utf_content = text_part->utf_raw_content;
-               text_part->unicode_content = text_part->unicode_raw_content;
        }
        else {
                /*
index 0f5c3dfb7927880c496168ef4dc29417c21fd38a..ed9dfef6eff96f78b4a49de36ea8918abc81dab7 100644 (file)
@@ -104,10 +104,6 @@ struct rspamd_mime_text_part {
        GArray *utf_words;
        UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
 
-       /* Unicode content, used by libicu */
-       GArray *unicode_raw_content; /* unicode raw content (of UChar) */
-       GArray *unicode_content; /* unicode processed content (of UChar) */
-
        GPtrArray *newlines;    /**< positions of newlines in text, relative to content*/
        struct html_content *html;
        GList *exceptions;      /**< list of offsets of urls                                            */
index de2745701939aa984d79899ae9161f5370cc3563..6135bced4265bc168218d72c9dcb18fe709d2664 100644 (file)
@@ -267,9 +267,6 @@ rspamd_task_free (struct rspamd_task *task)
                        if (tp->languages) {
                                g_ptr_array_unref (tp->languages);
                        }
-                       if (tp->unicode_raw_content) {
-                               g_array_free (tp->unicode_raw_content, TRUE);
-                       }
                }
 
                if (task->rcpt_envelope) {
index 9bbe899fbc5a69e4106d5da8342fd317e89b612f..d27d9bc58b09af8c9fa9be3edb9ad742b86d3f56 100644 (file)
@@ -271,9 +271,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
        buf.original.begin = text;
        buf.original.len = len;
        buf.flags = 0;
-       token.original.begin = NULL;
-       token.original.len = 0;
-       token.flags = 0;
+
+       memset (&token, 0, sizeof (token));
 
        if (cfg != NULL) {
                min_len = cfg->min_word_len;