[Project] Various Unicode fixes in language detector

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c

index b2a2f1f6cbc9ced0c5edf9984f2b6e6f15c0b6b1..dfcbb527ac33fdb3757801ce9721091333a6db6b 100644 (file)
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,6 +24,7 @@
  
  #include <glob.h>
  #include <unicode/utf8.h>
+#include <unicode/utf16.h>
  #include <unicode/ucnv.h>
  #include <unicode/uchar.h>
  #include <unicode/ustring.h>
@@ -873,31 +874,6 @@ end:
         return ret;
  }
  
-
-void
-rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
-               rspamd_mempool_t *pool,
-               rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
-{
-       UChar *out;
-       int32_t nsym;
-       UErrorCode uc_err = U_ZERO_ERROR;
-
-       ucs_token->flags = utf_token->flags;
-       out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->normalized.len + 1));
-       nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->normalized.len + 1),
-                       utf_token->normalized.begin, utf_token->normalized.len, &uc_err);
-
-       if (nsym >= 0 && uc_err == U_ZERO_ERROR) {
-               rspamd_language_detector_ucs_lowercase (out, nsym);
-               ucs_token->normalized.begin = (const gchar *) out;
-               ucs_token->normalized.len = nsym;
-       }
-       else {
-               ucs_token->normalized.len = 0;
-       }
-}
-
  static void
  rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
                 goffset *offsets_out)
@@ -905,6 +881,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
         guint step_len, remainder, i, out_idx;
         guint64 coin, sel;
         rspamd_stat_token_t *tok;
+       UChar32 first, last;
  
         g_assert (nwords != 0);
         g_assert (offsets_out != NULL);
@@ -942,11 +919,17 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
                 for (;;) {
                         tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
                         /* Filter bad tokens */
-                       if (tok->normalized.len >= 2 &&
-                               u_isalpha (*(UChar *)tok->normalized.begin) &&
-                               u_isalpha (*(((UChar *)tok->normalized.begin) + (tok->normalized.len - 1)))) {
-                               offsets_out[out_idx] = sel;
-                               break;
+
+                       if (tok->normalized.len >= 2) {
+                               U16_GET_OR_FFFD (tok->normalized.begin, 0, 0, tok->normalized.len,
+                                               first);
+                               U16_GET_OR_FFFD (tok->normalized.begin, 0, tok->normalized.len - 1,
+                                               tok->normalized.len,
+                                               last);
+                               if (u_isalpha (first) && u_isalpha (last)) {
+                                       offsets_out[out_idx] = sel;
+                                       break;
+                               }
                         }
                         else {
                                 ntries ++;
@@ -966,8 +949,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
                 }
         }
  
-
-
         /*
          * Fisher-Yates algorithm:
          * for i from 0 to n−2 do
@@ -1001,13 +982,13 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
                         window[0] = (UChar)' ';
  
                         for (i = 0; i < wlen - 1; i ++) {
-                               window[i + 1] = *(((UChar *)tok->normalized.begin) + i);
+                               window[i + 1] = tok->unicode.begin[i];
                         }
                 }
                 else if (cur_off + wlen == tok->normalized.len + 1) {
                         /* Add trailing space */
                         for (i = 0; i < wlen - 1; i ++) {
-                               window[i] = *(((UChar *)tok->normalized.begin) + cur_off + i);
+                               window[i] = tok->unicode.begin[cur_off + i];
                         }
                         window[wlen - 1] = (UChar)' ';
                 }
@@ -1018,7 +999,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
                 else {
                         /* Normal case */
                         for (i = 0; i < wlen; i++) {
-                               window[i] = *(((UChar *) tok->normalized.begin) + cur_off + i);
+                               window[i] = tok->unicode.begin[cur_off + i];
                         }
                 }
         }
@@ -1027,7 +1008,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
                         return -1;
                 }
  
-               window[0] = *(((UChar *)tok->normalized.begin) + cur_off);
+               window[0] = tok->unicode.begin[cur_off];
         }
  
         return cur_off + 1;
@@ -1200,10 +1181,7 @@ rspamd_language_detector_detect_type (struct rspamd_task *task,
         for (i = 0; i < nparts; i++) {
                 tok = &g_array_index (words, rspamd_stat_token_t,
                                 selected_words[i]);
-               rspamd_language_detector_to_ucs (task->lang_det,
-                               task->task_pool,
-                               tok, &ucs_w);
-               rspamd_language_detector_detect_word (task, d, &ucs_w, candidates,
+               rspamd_language_detector_detect_word (task, d, tok, candidates,
                                 d->trigramms[cat]);
         }
  
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h

index 204bdf9afb0adec8066579eecf5e2122852c3014..517ab037efbfaccb4efe70086d667f766c171d7c 100644 (file)
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -62,17 +62,6 @@ struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config
  struct rspamd_lang_detector* rspamd_language_detector_ref (struct rspamd_lang_detector* d);
  void rspamd_language_detector_unref (struct rspamd_lang_detector* d);
  
-/**
- * Convert string from utf8 to ucs32
- * @param d
- * @param utf_token
- * @param ucs_token
- */
-void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
-               rspamd_mempool_t *pool,
-               rspamd_stat_token_t *utf_token,
-               rspamd_stat_token_t *ucs_token);
-
  /**
   * Try to detect language of words
   * @param d
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 7572a417866162893c2b1954255dca239d38c57a..4a765643a1e00d6b62e752986a3740f9a291cfb7 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -711,7 +711,6 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
         if (text_part->utf_raw_content != NULL) {
                 /* Different from HTML, where we also parse HTML and strip tags */
                 text_part->utf_content = text_part->utf_raw_content;
-               text_part->unicode_content = text_part->unicode_raw_content;
         }
         else {
                 /*
diff --git a/src/libmime/message.h b/src/libmime/message.h

index 0f5c3dfb7927880c496168ef4dc29417c21fd38a..ed9dfef6eff96f78b4a49de36ea8918abc81dab7 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -104,10 +104,6 @@ struct rspamd_mime_text_part {
         GArray *utf_words;
         UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
  
-       /* Unicode content, used by libicu */
-       GArray *unicode_raw_content; /* unicode raw content (of UChar) */
-       GArray *unicode_content; /* unicode processed content (of UChar) */
-
         GPtrArray *newlines;    /**< positions of newlines in text, relative to content*/
         struct html_content *html;
         GList *exceptions;      /**< list of offsets of urls                                            */
diff --git a/src/libserver/task.c b/src/libserver/task.c

index de2745701939aa984d79899ae9161f5370cc3563..6135bced4265bc168218d72c9dcb18fe709d2664 100644 (file)
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -267,9 +267,6 @@ rspamd_task_free (struct rspamd_task *task)
                         if (tp->languages) {
                                 g_ptr_array_unref (tp->languages);
                         }
-                       if (tp->unicode_raw_content) {
-                               g_array_free (tp->unicode_raw_content, TRUE);
-                       }
                 }
  
                 if (task->rcpt_envelope) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index 9bbe899fbc5a69e4106d5da8342fd317e89b612f..d27d9bc58b09af8c9fa9be3edb9ad742b86d3f56 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -271,9 +271,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
         buf.original.begin = text;
         buf.original.len = len;
         buf.flags = 0;
-       token.original.begin = NULL;
-       token.original.len = 0;
-       token.flags = 0;
+
+       memset (&token, 0, sizeof (token));
  
         if (cfg != NULL) {
                 min_len = cfg->min_word_len;
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sun, 25 Nov 2018 12:00:24 +0000 (12:00 +0000)
src/libmime/lang_detection.c		patch \| blob \| history
src/libmime/lang_detection.h		patch \| blob \| history
src/libmime/message.c		patch \| blob \| history
src/libmime/message.h		patch \| blob \| history
src/libserver/task.c		patch \| blob \| history
src/libstat/tokenizers/tokenizers.c		patch \| blob \| history