rspamd_mime_part_extract_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
-#ifdef WITH_SNOWBALL
- struct sb_stemmer *stem = NULL;
-#endif
rspamd_stat_token_t *w;
gchar *temp_word;
const guchar *r;
gdouble avg_len = 0;
if (part->utf_words) {
-#ifdef WITH_SNOWBALL
-
-
- if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
-
- if (!stemmers) {
- stemmers = g_hash_table_new (rspamd_strcase_hash,
- rspamd_strcase_equal);
- }
-
- stem = g_hash_table_lookup (stemmers, part->language);
-
- if (stem == NULL) {
-
- stem = sb_stemmer_new (part->language, "UTF_8");
-
- if (stem == NULL) {
- msg_debug_task (
- "<%s> cannot create lemmatizer for %s language",
- task->message_id, part->language);
- } else {
- g_hash_table_insert (stemmers, g_strdup (part->language),
- stem);
- }
- }
- }
-#endif
-
+ rspamd_stem_words (part->utf_words, task->task_pool, part->language,
+ task->lang_det);
for (i = 0; i < part->utf_words->len; i++) {
guint64 h;
w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
- r = NULL;
-#ifdef WITH_SNOWBALL
- if (stem) {
- r = sb_stemmer_stem (stem, w->begin, w->len);
- }
-#endif
- if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
- avg_len = avg_len + (w->len - avg_len) / (double) (i + 1);
-
- if (r != NULL) {
- nlen = strlen (r);
- nlen = MIN (nlen, w->len);
- temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
- memcpy (temp_word, r, nlen);
-
- if (IS_PART_UTF (part)) {
- rspamd_str_lc_utf8 (temp_word, nlen);
- }
- else {
- rspamd_str_lc (temp_word, nlen);
- }
-
- w->begin = temp_word;
- w->len = nlen;
- }
- else {
- temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
- memcpy (temp_word, w->begin, w->len);
-
- if (IS_PART_UTF (part)) {
- rspamd_str_lc_utf8 (temp_word, w->len);
- }
- else {
- rspamd_str_lc (temp_word, w->len);
- }
-
- w->begin = temp_word;
- }
- }
-
- if (w->len > 0) {
+ if (w->stemmed.len > 0) {
/*
* We use static hash seed if we would want to use that in shingles
* computation in future
*/
h = rspamd_cryptobox_fast_hash_specific (
RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
- w->begin, w->len, words_hash_seed);
+ w->stemmed.begin, w->stemmed.len, words_hash_seed);
g_array_append_val (part->normalized_hashes, h);
- total_len += w->len;
+ total_len += w->stemmed.len;
- if (w->len <= 3) {
+ if (w->stemmed.len <= 3) {
short_len++;
}
}
if (part->utf_words) {
part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
sizeof (guint64), part->utf_words->len);
+ rspamd_normalize_words (part->utf_words, task->task_pool);
}
}
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
}
- /* Also add unicode content */
- text_part->unicode_content = g_array_sized_new (FALSE, FALSE,
- sizeof (UChar), text_part->utf_content->len + 1);
- rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content);
-
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) free_byte_array_callback,
text_part->utf_content);
- rspamd_mempool_add_destructor (task->task_pool,
- rspamd_array_free_hard,
- text_part->unicode_content);
return TRUE;
}
sel = p2;
}
else {
- if (p1->unicode_content->len > p2->unicode_content->len) {
+ if (p1->utf_content->len > p2->utf_content->len) {
sel = p1;
}
else {
return d;
}
-static void
-rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
-{
- GByteArray *utf;
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter = rspamd_get_utf8_converter ();
-
- utf = text_part->utf_raw_content;
- text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
- sizeof (UChar), utf->len + 1);
- text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
- (UChar *)text_part->unicode_raw_content->data,
- utf->len + 1,
- utf->data,
- utf->len,
- &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_array_free (text_part->unicode_raw_content, TRUE);
- text_part->unicode_raw_content = NULL;
- }
-}
-
-static void
-rspamd_mime_text_part_normalise (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
-{
-#if U_ICU_VERSION_MAJOR_NUM >= 44
- UErrorCode uc_err = U_ZERO_ERROR;
- gint32 nsym, end;
- UChar *src = NULL, *dest = NULL;
- const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
-
- if (!text_part->unicode_raw_content) {
- return;
- }
-
- src = (UChar *)text_part->unicode_raw_content->data;
- nsym = text_part->unicode_raw_content->len;
-
- /* We can now check if we need to decompose */
- end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- msg_warn_task ("cannot normalise URL, cannot check normalisation: %s",
- u_errorName (uc_err));
- return;
- }
-
- if (end == nsym) {
- /* Already normalised */
- return;
- }
-
- text_part->flags |= RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL;
- dest = g_malloc (nsym * sizeof (*dest));
- memcpy (dest, src, end * sizeof (*dest));
- nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
- src + end, nsym - end, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
- msg_warn_task ("cannot normalise URL: %s",
- u_errorName (uc_err));
- }
- }
- else {
- /* Copy normalised back */
- memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar));
- text_part->unicode_raw_content->len = nsym;
- text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
- }
-
- g_free (dest);
-#endif
-}
-
-/*
- * Recode utf from normalised unichars if needed
- */
-static void
-rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
-{
- UErrorCode uc_err = U_ZERO_ERROR;
- guint clen, dlen;
- gint r;
- UConverter *utf8_converter;
-
- utf8_converter = rspamd_get_utf8_converter ();
-
- if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
- text_part->unicode_raw_content) {
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len,
- clen);
- g_byte_array_set_size (text_part->utf_raw_content, dlen);
- r = ucnv_fromUChars (utf8_converter,
- text_part->utf_raw_content->data,
- dlen,
- (UChar *)text_part->unicode_raw_content->data,
- text_part->unicode_raw_content->len,
- &uc_err);
- text_part->utf_raw_content->len = r;
- }
-}
-
-
static gboolean
rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part,
GError **err)
{
gchar *d;
- gint32 r, clen, dlen;
-
+ gint32 r, clen, dlen, uc_len;
+ UChar *tmp_buf;
UErrorCode uc_err = U_ZERO_ERROR;
UConverter *conv, *utf8_converter;
return FALSE;
}
-
- text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
- sizeof (UChar), input->len + 1);
- r = ucnv_toUChars (conv,
- (UChar *)text_part->unicode_raw_content->data,
+ tmp_buf = g_new (UChar, input->len + 1);
+ uc_err = U_ZERO_ERROR;
+ uc_len = ucnv_toUChars (conv,
+ tmp_buf,
input->len + 1,
input->data,
input->len,
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
"cannot convert data to unicode from %s: %s",
charset, u_errorName (uc_err));
+ g_free (tmp_buf);
+
return FALSE;
}
- text_part->unicode_raw_content->len = r;
- rspamd_mime_text_part_normalise (task, text_part);
-
/* Now, convert to utf8 */
clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
d = rspamd_mempool_alloc (task->task_pool, dlen);
r = ucnv_fromUChars (utf8_converter, d, dlen,
- (UChar *)text_part->unicode_raw_content->data, r, &uc_err);
+ tmp_buf, uc_len, &uc_err);
if (!U_SUCCESS (uc_err)) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
"cannot convert data from unicode from %s: %s",
charset, u_errorName (uc_err));
+ g_free (tmp_buf);
return FALSE;
}
- msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d",
- charset, input->len, r);
+ msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
+ charset, input->len, r, uc_len);
text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
text_part->utf_raw_content->data = d;
text_part->utf_raw_content->len = r;
+ g_free (tmp_buf);
return TRUE;
}
else {
SET_PART_UTF (text_part);
text_part->utf_raw_content = part_content;
- rspamd_mime_text_part_ucs_from_utf (task, text_part);
- rspamd_mime_text_part_normalise (task, text_part);
- rspamd_mime_text_part_maybe_renormalise (task, text_part);
text_part->real_charset = UTF8_CHARSET;
return;
part_content->len, !checked)) {
SET_PART_UTF (text_part);
text_part->utf_raw_content = part_content;
- rspamd_mime_text_part_ucs_from_utf (task, text_part);
- rspamd_mime_text_part_normalise (task, text_part);
- rspamd_mime_text_part_maybe_renormalise (task, text_part);
text_part->real_charset = UTF8_CHARSET;
return;
SET_PART_UTF (text_part);
}
-
-void
-rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
-{
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter = rspamd_get_utf8_converter ();
-
- g_array_set_size (dest, in->len + 1);
- dest->len = ucnv_toUChars (utf8_converter,
- (UChar *)dest->data,
- in->len + 1,
- in->data,
- in->len,
- &uc_err);
-}