summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/libmime/message.c 92
-rw-r--r-- src/libmime/mime_encoding.c 158
-rw-r--r-- src/libmime/mime_encoding.h 7
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 13
-rw-r--r-- src/lua/lua_mimepart.c 4
5 files changed, 30 insertions, 244 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 4a765643a..b76fa1b23 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -61,9 +61,6 @@ static void
rspamd_mime_part_extract_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
-#ifdef WITH_SNOWBALL
- struct sb_stemmer *stem = NULL;
-#endif
rspamd_stat_token_t *w;
gchar *temp_word;
const guchar *r;
@@ -71,92 +68,26 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
gdouble avg_len = 0;
if (part->utf_words) {
-#ifdef WITH_SNOWBALL
-
-
- if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
-
- if (!stemmers) {
- stemmers = g_hash_table_new (rspamd_strcase_hash,
- rspamd_strcase_equal);
- }
-
- stem = g_hash_table_lookup (stemmers, part->language);
-
- if (stem == NULL) {
-
- stem = sb_stemmer_new (part->language, "UTF_8");
-
- if (stem == NULL) {
- msg_debug_task (
- "<%s> cannot create lemmatizer for %s language",
- task->message_id, part->language);
- } else {
- g_hash_table_insert (stemmers, g_strdup (part->language),
- stem);
- }
- }
- }
-#endif
-
+ rspamd_stem_words (part->utf_words, task->task_pool, part->language,
+ task->lang_det);
for (i = 0; i < part->utf_words->len; i++) {
guint64 h;
w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
- r = NULL;
-#ifdef WITH_SNOWBALL
- if (stem) {
- r = sb_stemmer_stem (stem, w->begin, w->len);
- }
-#endif
- if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
- avg_len = avg_len + (w->len - avg_len) / (double) (i + 1);
-
- if (r != NULL) {
- nlen = strlen (r);
- nlen = MIN (nlen, w->len);
- temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
- memcpy (temp_word, r, nlen);
-
- if (IS_PART_UTF (part)) {
- rspamd_str_lc_utf8 (temp_word, nlen);
- }
- else {
- rspamd_str_lc (temp_word, nlen);
- }
-
- w->begin = temp_word;
- w->len = nlen;
- }
- else {
- temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
- memcpy (temp_word, w->begin, w->len);
-
- if (IS_PART_UTF (part)) {
- rspamd_str_lc_utf8 (temp_word, w->len);
- }
- else {
- rspamd_str_lc (temp_word, w->len);
- }
-
- w->begin = temp_word;
- }
- }
-
- if (w->len > 0) {
+ if (w->stemmed.len > 0) {
/*
* We use static hash seed if we would want to use that in shingles
* computation in future
*/
h = rspamd_cryptobox_fast_hash_specific (
RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
- w->begin, w->len, words_hash_seed);
+ w->stemmed.begin, w->stemmed.len, words_hash_seed);
g_array_append_val (part->normalized_hashes, h);
- total_len += w->len;
+ total_len += w->stemmed.len;
- if (w->len <= 3) {
+ if (w->stemmed.len <= 3) {
short_len++;
}
}
@@ -251,6 +182,7 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
if (part->utf_words) {
part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
sizeof (guint64), part->utf_words->len);
+ rspamd_normalize_words (part->utf_words, task->task_pool);
}
}
@@ -757,17 +689,9 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
}
- /* Also add unicode content */
- text_part->unicode_content = g_array_sized_new (FALSE, FALSE,
- sizeof (UChar), text_part->utf_content->len + 1);
- rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content);
-
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) free_byte_array_callback,
text_part->utf_content);
- rspamd_mempool_add_destructor (task->task_pool,
- rspamd_array_free_hard,
- text_part->unicode_content);
return TRUE;
}
@@ -1265,7 +1189,7 @@ rspamd_message_process (struct rspamd_task *task)
sel = p2;
}
else {
- if (p1->unicode_content->len > p2->unicode_content->len) {
+ if (p1->utf_content->len > p2->utf_content->len) {
sel = p1;
}
else {
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 269166344..e3479c3e7 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -241,115 +241,6 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
return d;
}
-static void
-rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
-{
- GByteArray *utf;
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter = rspamd_get_utf8_converter ();
-
- utf = text_part->utf_raw_content;
- text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
- sizeof (UChar), utf->len + 1);
- text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
- (UChar *)text_part->unicode_raw_content->data,
- utf->len + 1,
- utf->data,
- utf->len,
- &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_array_free (text_part->unicode_raw_content, TRUE);
- text_part->unicode_raw_content = NULL;
- }
-}
-
-static void
-rspamd_mime_text_part_normalise (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
-{
-#if U_ICU_VERSION_MAJOR_NUM >= 44
- UErrorCode uc_err = U_ZERO_ERROR;
- gint32 nsym, end;
- UChar *src = NULL, *dest = NULL;
- const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
-
- if (!text_part->unicode_raw_content) {
- return;
- }
-
- src = (UChar *)text_part->unicode_raw_content->data;
- nsym = text_part->unicode_raw_content->len;
-
- /* We can now check if we need to decompose */
- end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- msg_warn_task ("cannot normalise URL, cannot check normalisation: %s",
- u_errorName (uc_err));
- return;
- }
-
- if (end == nsym) {
- /* Already normalised */
- return;
- }
-
- text_part->flags |= RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL;
- dest = g_malloc (nsym * sizeof (*dest));
- memcpy (dest, src, end * sizeof (*dest));
- nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
- src + end, nsym - end, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
- msg_warn_task ("cannot normalise URL: %s",
- u_errorName (uc_err));
- }
- }
- else {
- /* Copy normalised back */
- memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar));
- text_part->unicode_raw_content->len = nsym;
- text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
- }
-
- g_free (dest);
-#endif
-}
-
-/*
- * Recode utf from normalised unichars if needed
- */
-static void
-rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
-{
- UErrorCode uc_err = U_ZERO_ERROR;
- guint clen, dlen;
- gint r;
- UConverter *utf8_converter;
-
- utf8_converter = rspamd_get_utf8_converter ();
-
- if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
- text_part->unicode_raw_content) {
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len,
- clen);
- g_byte_array_set_size (text_part->utf_raw_content, dlen);
- r = ucnv_fromUChars (utf8_converter,
- text_part->utf_raw_content->data,
- dlen,
- (UChar *)text_part->unicode_raw_content->data,
- text_part->unicode_raw_content->len,
- &uc_err);
- text_part->utf_raw_content->len = r;
- }
-}
-
-
static gboolean
rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part,
@@ -358,8 +249,8 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
GError **err)
{
gchar *d;
- gint32 r, clen, dlen;
-
+ gint32 r, clen, dlen, uc_len;
+ UChar *tmp_buf;
UErrorCode uc_err = U_ZERO_ERROR;
UConverter *conv, *utf8_converter;
@@ -374,11 +265,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
return FALSE;
}
-
- text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
- sizeof (UChar), input->len + 1);
- r = ucnv_toUChars (conv,
- (UChar *)text_part->unicode_raw_content->data,
+ tmp_buf = g_new (UChar, input->len + 1);
+ uc_err = U_ZERO_ERROR;
+ uc_len = ucnv_toUChars (conv,
+ tmp_buf,
input->len + 1,
input->data,
input->len,
@@ -388,33 +278,34 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
"cannot convert data to unicode from %s: %s",
charset, u_errorName (uc_err));
+ g_free (tmp_buf);
+
return FALSE;
}
- text_part->unicode_raw_content->len = r;
- rspamd_mime_text_part_normalise (task, text_part);
-
/* Now, convert to utf8 */
clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
d = rspamd_mempool_alloc (task->task_pool, dlen);
r = ucnv_fromUChars (utf8_converter, d, dlen,
- (UChar *)text_part->unicode_raw_content->data, r, &uc_err);
+ tmp_buf, uc_len, &uc_err);
if (!U_SUCCESS (uc_err)) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
"cannot convert data from unicode from %s: %s",
charset, u_errorName (uc_err));
+ g_free (tmp_buf);
return FALSE;
}
- msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d",
- charset, input->len, r);
+ msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
+ charset, input->len, r, uc_len);
text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
text_part->utf_raw_content->data = d;
text_part->utf_raw_content->len = r;
+ g_free (tmp_buf);
return TRUE;
}
@@ -658,9 +549,6 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
else {
SET_PART_UTF (text_part);
text_part->utf_raw_content = part_content;
- rspamd_mime_text_part_ucs_from_utf (task, text_part);
- rspamd_mime_text_part_normalise (task, text_part);
- rspamd_mime_text_part_maybe_renormalise (task, text_part);
text_part->real_charset = UTF8_CHARSET;
return;
@@ -693,9 +581,6 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
part_content->len, !checked)) {
SET_PART_UTF (text_part);
text_part->utf_raw_content = part_content;
- rspamd_mime_text_part_ucs_from_utf (task, text_part);
- rspamd_mime_text_part_normalise (task, text_part);
- rspamd_mime_text_part_maybe_renormalise (task, text_part);
text_part->real_charset = UTF8_CHARSET;
return;
@@ -721,18 +606,3 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
SET_PART_UTF (text_part);
}
-
-void
-rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
-{
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter = rspamd_get_utf8_converter ();
-
- g_array_set_size (dest, in->len + 1);
- dest->len = ucnv_toUChars (utf8_converter,
- (UChar *)dest->data,
- in->len + 1,
- in->data,
- in->len,
- &uc_err);
-}
diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h
index 0754bb348..5f436d99d 100644
--- a/src/libmime/mime_encoding.h
+++ b/src/libmime/mime_encoding.h
@@ -18,6 +18,7 @@
#include "config.h"
#include "mem_pool.h"
+#include "fstring.h"
struct rspamd_task;
struct rspamd_mime_part;
@@ -86,11 +87,5 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
*/
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
-/**
- * Converts utf8 to libicu unichars
- * @param in
- * @param dest
- */
-void rspamd_utf_to_unicode (GByteArray *in, GArray *dest);
#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 32d9ba0df..9ec0c4315 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -745,28 +745,25 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
if (stem) {
- const gchar *stemmed;
+ const gchar *stemmed = NULL;
stemmed = sb_stemmer_stem (stem,
tok->normalized.begin, tok->normalized.len);
- dlen = strlen (stemmed);
+ dlen = stemmed ? strlen (stemmed) : 0;
if (dlen > 0) {
- dest = rspamd_mempool_alloc (pool, dlen);
+ dest = rspamd_mempool_alloc (pool, dlen + 1);
memcpy (dest, stemmed, dlen);
- rspamd_str_lc_utf8 (dest, dlen);
+ dest[dlen] = '\0';
tok->stemmed.len = dlen;
tok->stemmed.begin = dest;
tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
}
else {
/* Fallback */
- dest = rspamd_mempool_alloc (pool, tok->normalized.len);
- memcpy (dest, tok->normalized.begin, tok->normalized.len);
- rspamd_str_lc_utf8 (dest, tok->normalized.len);
tok->stemmed.len = tok->normalized.len;
- tok->stemmed.begin = dest;
+ tok->stemmed.begin = tok->normalized.begin;
}
}
else {
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index a6fc2bfa5..9e74c87c0 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -923,8 +923,8 @@ struct lua_shingle_data {
#define STORE_TOKEN(i, t) do { \
if ((i) < part->utf_words->len) { \
word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \
- sd->t.begin = word->begin; \
- sd->t.len = word->len; \
+ sd->t.begin = word->stemmed.begin; \
+ sd->t.len = word->stemmed.len; \
} \
}while (0)