From 0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sun, 25 Nov 2018 12:00:24 +0000 Subject: [Project] Various unicode fixes in language detector --- src/libmime/lang_detection.c | 58 ++++++++++++------------------------- src/libmime/lang_detection.h | 11 ------- src/libmime/message.c | 1 - src/libmime/message.h | 4 --- src/libserver/task.c | 3 -- src/libstat/tokenizers/tokenizers.c | 5 ++-- 6 files changed, 20 insertions(+), 62 deletions(-) diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index b2a2f1f6c..dfcbb527a 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -873,31 +874,6 @@ end: return ret; } - -void -rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, - rspamd_mempool_t *pool, - rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token) -{ - UChar *out; - int32_t nsym; - UErrorCode uc_err = U_ZERO_ERROR; - - ucs_token->flags = utf_token->flags; - out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->normalized.len + 1)); - nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->normalized.len + 1), - utf_token->normalized.begin, utf_token->normalized.len, &uc_err); - - if (nsym >= 0 && uc_err == U_ZERO_ERROR) { - rspamd_language_detector_ucs_lowercase (out, nsym); - ucs_token->normalized.begin = (const gchar *) out; - ucs_token->normalized.len = nsym; - } - else { - ucs_token->normalized.len = 0; - } -} - static void rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, goffset *offsets_out) @@ -905,6 +881,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, guint step_len, remainder, i, out_idx; guint64 coin, sel; rspamd_stat_token_t *tok; + UChar32 first, last; g_assert (nwords != 0); g_assert (offsets_out != NULL); @@ -942,11 +919,17 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, for (;;) { tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel); /* Filter bad tokens */ - if (tok->normalized.len >= 2 && - u_isalpha (*(UChar *)tok->normalized.begin) && - u_isalpha (*(((UChar *)tok->normalized.begin) + (tok->normalized.len - 1)))) { - offsets_out[out_idx] = sel; - break; + + if (tok->normalized.len >= 2) { + U16_GET_OR_FFFD (tok->normalized.begin, 0, 0, tok->normalized.len, + first); + U16_GET_OR_FFFD (tok->normalized.begin, 0, tok->normalized.len - 1, + tok->normalized.len, + last); + if (u_isalpha (first) && u_isalpha (last)) { + offsets_out[out_idx] = sel; + break; + } } else { ntries ++; @@ -966,8 +949,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, } } - - /* * Fisher-Yates algorithm: * for i from 0 to n−2 do @@ -1001,13 +982,13 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, window[0] = (UChar)' '; for (i = 0; i < wlen - 1; i ++) { - window[i + 1] = *(((UChar *)tok->normalized.begin) + i); + window[i + 1] = tok->unicode.begin[i]; } } else if (cur_off + wlen == tok->normalized.len + 1) { /* Add trailing space */ for (i = 0; i < wlen - 1; i ++) { - window[i] = *(((UChar *)tok->normalized.begin) + cur_off + i); + window[i] = tok->unicode.begin[cur_off + i]; } window[wlen - 1] = (UChar)' '; } @@ -1018,7 +999,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, else { /* Normal case */ for (i = 0; i < wlen; i++) { - window[i] = *(((UChar *) tok->normalized.begin) + cur_off + i); + window[i] = tok->unicode.begin[cur_off + i]; } } } @@ -1027,7 +1008,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, return -1; } - window[0] = *(((UChar *)tok->normalized.begin) + cur_off); + window[0] = tok->unicode.begin[cur_off]; } return cur_off + 1; @@ -1200,10 +1181,7 @@ rspamd_language_detector_detect_type (struct rspamd_task *task, for (i = 0; i < nparts; i++) { tok = &g_array_index (words, rspamd_stat_token_t, selected_words[i]); - rspamd_language_detector_to_ucs (task->lang_det, - task->task_pool, - tok, &ucs_w); - rspamd_language_detector_detect_word (task, d, &ucs_w, candidates, + rspamd_language_detector_detect_word (task, d, tok, candidates, d->trigramms[cat]); } diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h index 204bdf9af..517ab037e 100644 --- a/src/libmime/lang_detection.h +++ b/src/libmime/lang_detection.h @@ -62,17 +62,6 @@ struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config struct rspamd_lang_detector* rspamd_language_detector_ref (struct rspamd_lang_detector* d); void rspamd_language_detector_unref (struct rspamd_lang_detector* d); -/** - * Convert string from utf8 to ucs32 - * @param d - * @param utf_token - * @param ucs_token - */ -void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, - rspamd_mempool_t *pool, - rspamd_stat_token_t *utf_token, - rspamd_stat_token_t *ucs_token); - /** * Try to detect language of words * @param d diff --git a/src/libmime/message.c b/src/libmime/message.c index 7572a4178..4a765643a 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -711,7 +711,6 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task, if (text_part->utf_raw_content != NULL) { /* Different from HTML, where we also parse HTML and strip tags */ text_part->utf_content = text_part->utf_raw_content; - text_part->unicode_content = text_part->unicode_raw_content; } else { /* diff --git a/src/libmime/message.h b/src/libmime/message.h index 0f5c3dfb7..ed9dfef6e 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -104,10 +104,6 @@ struct rspamd_mime_text_part { GArray *utf_words; UText utf_stripped_text; /* Used by libicu to represent the utf8 content */ - /* Unicode content, used by libicu */ - GArray *unicode_raw_content; /* unicode raw content (of UChar) */ - GArray *unicode_content; /* unicode processed content (of UChar) */ - GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ struct html_content *html; GList *exceptions; /**< list of offsets of urls */ diff --git a/src/libserver/task.c b/src/libserver/task.c index de2745701..6135bced4 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -267,9 +267,6 @@ rspamd_task_free (struct rspamd_task *task) if (tp->languages) { g_ptr_array_unref (tp->languages); } - if (tp->unicode_raw_content) { - g_array_free (tp->unicode_raw_content, TRUE); - } } if (task->rcpt_envelope) { diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 9bbe899fb..d27d9bc58 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -271,9 +271,8 @@ rspamd_tokenize_text (const gchar *text, gsize len, buf.original.begin = text; buf.original.len = len; buf.flags = 0; - token.original.begin = NULL; - token.original.len = 0; - token.flags = 0; + + memset (&token, 0, sizeof (token)); if (cfg != NULL) { min_len = cfg->min_word_len; -- cgit v1.2.3