aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-25 12:00:24 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-25 12:00:24 +0000
commit: 0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c (patch)
tree: 68ca9026c8a5b3b63b66957b23163a2a72381fc1
parent: 99b1cf76771eed3824693ed84751ba8054645e18 (diff)
download: rspamd-0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c.tar.gz
download: rspamd-0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c.zip
[Project] Various unicode fixes in language detector
-rw-r--r-- src/libmime/lang_detection.c | 58
-rw-r--r-- src/libmime/lang_detection.h | 11
-rw-r--r-- src/libmime/message.c | 1
-rw-r--r-- src/libmime/message.h | 4
-rw-r--r-- src/libserver/task.c | 3
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 5
6 files changed, 20 insertions, 62 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index b2a2f1f6c..dfcbb527a 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,6 +24,7 @@
#include <glob.h>
#include <unicode/utf8.h>
+#include <unicode/utf16.h>
#include <unicode/ucnv.h>
#include <unicode/uchar.h>
#include <unicode/ustring.h>
@@ -873,31 +874,6 @@ end:
return ret;
}
-
-void
-rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
- rspamd_mempool_t *pool,
- rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
-{
- UChar *out;
- int32_t nsym;
- UErrorCode uc_err = U_ZERO_ERROR;
-
- ucs_token->flags = utf_token->flags;
- out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->normalized.len + 1));
- nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->normalized.len + 1),
- utf_token->normalized.begin, utf_token->normalized.len, &uc_err);
-
- if (nsym >= 0 && uc_err == U_ZERO_ERROR) {
- rspamd_language_detector_ucs_lowercase (out, nsym);
- ucs_token->normalized.begin = (const gchar *) out;
- ucs_token->normalized.len = nsym;
- }
- else {
- ucs_token->normalized.len = 0;
- }
-}
-
static void
rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
goffset *offsets_out)
@@ -905,6 +881,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
guint step_len, remainder, i, out_idx;
guint64 coin, sel;
rspamd_stat_token_t *tok;
+ UChar32 first, last;
g_assert (nwords != 0);
g_assert (offsets_out != NULL);
@@ -942,11 +919,17 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
for (;;) {
tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
/* Filter bad tokens */
- if (tok->normalized.len >= 2 &&
- u_isalpha (*(UChar *)tok->normalized.begin) &&
- u_isalpha (*(((UChar *)tok->normalized.begin) + (tok->normalized.len - 1)))) {
- offsets_out[out_idx] = sel;
- break;
+
+ if (tok->normalized.len >= 2) {
+ U16_GET_OR_FFFD (tok->normalized.begin, 0, 0, tok->normalized.len,
+ first);
+ U16_GET_OR_FFFD (tok->normalized.begin, 0, tok->normalized.len - 1,
+ tok->normalized.len,
+ last);
+ if (u_isalpha (first) && u_isalpha (last)) {
+ offsets_out[out_idx] = sel;
+ break;
+ }
}
else {
ntries ++;
@@ -966,8 +949,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
}
}
-
-
/*
* Fisher-Yates algorithm:
* for i from 0 to n−2 do
@@ -1001,13 +982,13 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
window[0] = (UChar)' ';
for (i = 0; i < wlen - 1; i ++) {
- window[i + 1] = *(((UChar *)tok->normalized.begin) + i);
+ window[i + 1] = tok->unicode.begin[i];
}
}
else if (cur_off + wlen == tok->normalized.len + 1) {
/* Add trailing space */
for (i = 0; i < wlen - 1; i ++) {
- window[i] = *(((UChar *)tok->normalized.begin) + cur_off + i);
+ window[i] = tok->unicode.begin[cur_off + i];
}
window[wlen - 1] = (UChar)' ';
}
@@ -1018,7 +999,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
else {
/* Normal case */
for (i = 0; i < wlen; i++) {
- window[i] = *(((UChar *) tok->normalized.begin) + cur_off + i);
+ window[i] = tok->unicode.begin[cur_off + i];
}
}
}
@@ -1027,7 +1008,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
return -1;
}
- window[0] = *(((UChar *)tok->normalized.begin) + cur_off);
+ window[0] = tok->unicode.begin[cur_off];
}
return cur_off + 1;
@@ -1200,10 +1181,7 @@ rspamd_language_detector_detect_type (struct rspamd_task *task,
for (i = 0; i < nparts; i++) {
tok = &g_array_index (words, rspamd_stat_token_t,
selected_words[i]);
- rspamd_language_detector_to_ucs (task->lang_det,
- task->task_pool,
- tok, &ucs_w);
- rspamd_language_detector_detect_word (task, d, &ucs_w, candidates,
+ rspamd_language_detector_detect_word (task, d, tok, candidates,
d->trigramms[cat]);
}
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index 204bdf9af..517ab037e 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -63,17 +63,6 @@ struct rspamd_lang_detector* rspamd_language_detector_ref (struct rspamd_lang_de
void rspamd_language_detector_unref (struct rspamd_lang_detector* d);
/**
- * Convert string from utf8 to ucs32
- * @param d
- * @param utf_token
- * @param ucs_token
- */
-void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
- rspamd_mempool_t *pool,
- rspamd_stat_token_t *utf_token,
- rspamd_stat_token_t *ucs_token);
-
-/**
* Try to detect language of words
* @param d
* @param ucs_tokens
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 7572a4178..4a765643a 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -711,7 +711,6 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
if (text_part->utf_raw_content != NULL) {
/* Different from HTML, where we also parse HTML and strip tags */
text_part->utf_content = text_part->utf_raw_content;
- text_part->unicode_content = text_part->unicode_raw_content;
}
else {
/*
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 0f5c3dfb7..ed9dfef6e 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -104,10 +104,6 @@ struct rspamd_mime_text_part {
GArray *utf_words;
UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
- /* Unicode content, used by libicu */
- GArray *unicode_raw_content; /* unicode raw content (of UChar) */
- GArray *unicode_content; /* unicode processed content (of UChar) */
-
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
struct html_content *html;
GList *exceptions; /**< list of offsets of urls */
diff --git a/src/libserver/task.c b/src/libserver/task.c
index de2745701..6135bced4 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -267,9 +267,6 @@ rspamd_task_free (struct rspamd_task *task)
if (tp->languages) {
g_ptr_array_unref (tp->languages);
}
- if (tp->unicode_raw_content) {
- g_array_free (tp->unicode_raw_content, TRUE);
- }
}
if (task->rcpt_envelope) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 9bbe899fb..d27d9bc58 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -271,9 +271,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
buf.original.begin = text;
buf.original.len = len;
buf.flags = 0;
- token.original.begin = NULL;
- token.original.len = 0;
- token.flags = 0;
+
+ memset (&token, 0, sizeof (token));
if (cfg != NULL) {
min_len = cfg->min_word_len;