GPtrArray *
rspamd_language_detector_detect (struct rspamd_task *task,
struct rspamd_lang_detector *d,
- GArray *ucs_tokens, gsize words_len)
+ GArray *ucs_tokens)
{
khash_t(rspamd_candidates_hash) *candidates;
GPtrArray *result;
*/
GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task,
struct rspamd_lang_detector *d,
- GArray *ucs_tokens, gsize words_len);
+ GArray *ucs_tokens);
#endif
guint i, nlen, total_len = 0, short_len = 0;
gdouble avg_len = 0;
- if (part->normalized_words) {
+ if (part->utf_words) {
#ifdef WITH_SNOWBALL
static GHashTable *stemmers = NULL;
#endif
- for (i = 0; i < part->normalized_words->len; i++) {
+ for (i = 0; i < part->utf_words->len; i++) {
guint64 h;
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
r = NULL;
#ifdef WITH_SNOWBALL
if (stem) {
}
}
- if (part->normalized_words && part->normalized_words->len) {
+ if (part->utf_words && part->utf_words->len) {
gdouble *avg_len_p, *short_len_p;
avg_len_p = rspamd_mempool_get_variable (task->task_pool,
/* Ugly workaround */
if (IS_PART_HTML (part)) {
- part->normalized_words = rspamd_tokenize_text (
- part->stripped_content->data,
- part->stripped_content->len, tok_type, task->cfg,
+ part->utf_words = rspamd_tokenize_text (
+ part->utf_stripped_content->data,
+ part->utf_stripped_content->len, tok_type, task->cfg,
part->exceptions,
NULL);
}
else {
- part->normalized_words = rspamd_tokenize_text (
- part->stripped_content->data,
- part->stripped_content->len, tok_type, task->cfg,
+ part->utf_words = rspamd_tokenize_text (
+ part->utf_stripped_content->data,
+ part->utf_stripped_content->len, tok_type, task->cfg,
part->exceptions,
NULL);
}
- if (part->normalized_words) {
+ if (part->utf_words) {
part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
- sizeof (guint64), part->normalized_words->len);
+ sizeof (guint64), part->utf_words->len);
if (IS_PART_UTF (part) && task->lang_det) {
- part->ucs32_words = g_array_sized_new (FALSE, FALSE,
- sizeof (rspamd_stat_token_t), part->normalized_words->len);
+ part->unicode_words = g_array_sized_new (FALSE, FALSE,
+ sizeof (rspamd_stat_token_t), part->utf_words->len);
}
- if (part->ucs32_words) {
+ if (part->unicode_words) {
- for (i = 0; i < part->normalized_words->len; i++) {
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t,
+ for (i = 0; i < part->utf_words->len; i++) {
+ w = &g_array_index (part->utf_words, rspamd_stat_token_t,
i);
if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
rspamd_language_detector_to_ucs (task->lang_det,
task->task_pool,
w, &ucs_w);
- g_array_append_val (part->ucs32_words, ucs_w);
+ g_array_append_val (part->unicode_words, ucs_w);
ucs_len += ucs_w.len;
}
}
static void
rspamd_mime_part_detect_language (struct rspamd_task *task,
- struct rspamd_mime_text_part *part, guint ucs_len)
+ struct rspamd_mime_text_part *part)
{
struct rspamd_lang_detector_res *lang;
- if (part->ucs32_words) {
+ if (part->unicode_words) {
part->languages = rspamd_language_detector_detect (task,
task->lang_det,
- part->ucs32_words, ucs_len);
+ part->unicode_words);
if (part->languages->len > 0) {
lang = g_ptr_array_index (part->languages, 0);
state = seen_cr;
if (p > c) {
last_c = *(p - 1);
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)c, p - c);
}
case seen_cr:
/* Double \r\r */
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
crlf_added = TRUE;
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
part->nlines ++;
if (p > c) {
last_c = *(p - 1);
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)c, p - c);
}
c = p + 1;
if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
crlf_added = TRUE;
}
else {
/* \r\n */
if (!crlf_added) {
if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *) " ", 1);
crlf_added = TRUE;
}
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
c = p + 1;
case seen_lf:
/* Double \n\n */
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
crlf_added = TRUE;
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
part->nlines++;
if (!crlf_added) {
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
/* Skip initial spaces */
if (G_UNLIKELY (*p == ' ')) {
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
}
switch (state) {
case normal_char:
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)c, p - c);
while (c < p) {
default:
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
part->nlines++;
struct rspamd_process_exception *ex;
/* Strip newlines */
- part->stripped_content = g_byte_array_sized_new (part->content->len);
+ part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
part->newlines = g_ptr_array_sized_new (128);
- p = (const gchar *)part->content->data;
- end = p + part->content->len;
+ p = (const gchar *)part->utf_content->data;
+ end = p + part->utf_content->len;
rspamd_strip_newlines_parse (p, end, part);
ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
off = (goffset)g_ptr_array_index (part->newlines, i);
g_ptr_array_index (part->newlines, i) = (gpointer)(goffset)
- (part->stripped_content->data + off);
+ (part->utf_stripped_content->data + off);
ex->pos = off;
ex->len = 0;
ex->type = RSPAMD_EXCEPTION_NEWLINE;
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) free_byte_array_callback,
- part->stripped_content);
+ part->utf_stripped_content);
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
part->newlines);
g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
}
- if (part->content && part->content->len >= sizeof (gtube_pattern_reject) &&
- part->content->len <= max_check_size) {
- if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->content->data,
- part->content->len,
+ if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
+ part->utf_content->len <= max_check_size) {
+ if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
+ part->utf_content->len,
rspamd_multipattern_gtube_cb, NULL, NULL)) > 0) {
switch (ret) {
msg_info_task (
"<%s>: gtube %s pattern has been found in part of length %ud",
task->message_id, rspamd_action_to_str (act),
- part->content->len);
+ part->utf_content->len);
}
}
}
return ea->pos - eb->pos;
}
+/*
+ * Prepares a plain text part for further processing.
+ *
+ * A part without decoded data is only flagged as empty. Any other part is
+ * passed through rspamd_mime_text_part_maybe_convert (); when that leaves
+ * no utf_raw_content the part is rejected, otherwise the raw utf8/unicode
+ * buffers are reused as the processed content.
+ *
+ * Returns TRUE when the part is accepted (including the empty case) and
+ * FALSE when it could not be converted.
+ */
+static gboolean
+rspamd_message_process_plain_text_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *text_part)
+{
+	if (text_part->parsed.len == 0) {
+		/* Nothing to convert: unicode_content/utf_content are left untouched */
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+		return TRUE;
+	}
+
+	rspamd_mime_text_part_maybe_convert (task, text_part);
+
+	if (text_part->utf_raw_content == NULL) {
+		/*
+		 * We ignore unconverted parts from now as it is dangerous
+		 * to treat them as text parts
+		 */
+		return FALSE;
+	}
+
+	/*
+	 * Different from HTML, where we also parse HTML and strip tags:
+	 * plain text shares the raw buffers with the processed content.
+	 *
+	 * NOTE(review): unicode_raw_content looks like it can be NULL while
+	 * utf_raw_content is set (failed utf8 -> unicode conversion); confirm
+	 * that users of unicode_content tolerate NULL.
+	 */
+	text_part->utf_content = text_part->utf_raw_content;
+	text_part->unicode_content = text_part->unicode_raw_content;
+
+	return TRUE;
+}
+
+/*
+ * Prepares an HTML text part for further processing.
+ *
+ * The part is always flagged as HTML. A part without decoded data is
+ * additionally flagged as empty and accepted as is. Otherwise the part is
+ * converted via rspamd_mime_text_part_maybe_convert (), processed by
+ * rspamd_html_process_part_full () and the result is stored both as utf8
+ * (utf_content) and as libicu UChars (unicode_content). Both buffers are
+ * released by destructors attached to the task pool.
+ *
+ * Returns TRUE when the part is accepted (including the empty case) and
+ * FALSE when it could not be converted.
+ */
+static gboolean
+rspamd_message_process_html_text_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *text_part)
+{
+	GByteArray *processed;
+	GArray *unicode;
+
+	text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
+
+	if (text_part->parsed.len == 0) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+		return TRUE;
+	}
+
+	rspamd_mime_text_part_maybe_convert (task, text_part);
+
+	if (!text_part->utf_raw_content) {
+		/* Conversion has not produced any utf8 data */
+		return FALSE;
+	}
+
+	text_part->html = rspamd_mempool_alloc0 (task->task_pool,
+			sizeof (*text_part->html));
+	text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
+
+	/* Process HTML; exceptions, urls and emails are filled by the callee */
+	processed = rspamd_html_process_part_full (
+			task->task_pool,
+			text_part->html,
+			text_part->utf_raw_content,
+			&text_part->exceptions,
+			task->urls,
+			task->emails);
+	text_part->utf_content = processed;
+
+	if (processed->len == 0) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+	}
+
+	/* Also add unicode content */
+	unicode = g_array_sized_new (FALSE, FALSE,
+			sizeof (UChar), processed->len + 1);
+	text_part->unicode_content = unicode;
+	rspamd_utf_to_unicode (processed, unicode);
+
+	/* Tie both buffers to the task pool lifetime */
+	rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) free_byte_array_callback,
+			processed);
+	rspamd_mempool_add_destructor (task->task_pool,
+			rspamd_array_free_hard,
+			unicode);
+
+	return TRUE;
+}
+
static void
-rspamd_message_process_text_part (struct rspamd_task *task,
- struct rspamd_mime_part *mime_part)
+rspamd_message_process_text_part_maybe (struct rspamd_task *task,
+ struct rspamd_mime_part *mime_part)
{
struct rspamd_mime_text_part *text_part;
rspamd_ftok_t html_tok, xhtml_tok;
debug_task ("skip attachments for checking as text parts");
return;
}
-
- if (found_html) {
- text_part = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_text_part));
- text_part->raw.begin = mime_part->raw_data.begin;
- text_part->raw.len = mime_part->raw_data.len;
- text_part->parsed.begin = mime_part->parsed_data.begin;
- text_part->parsed.len = mime_part->parsed_data.len;
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
- text_part->mime_part = mime_part;
-
- if (mime_part->parsed_data.len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
- g_ptr_array_add (task->text_parts, text_part);
- return;
- }
-
- rspamd_mime_text_part_maybe_convert (task, text_part);
-
- if (text_part->utf_raw_content == NULL) {
- return;
- }
-
- text_part->html = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (*text_part->html));
- text_part->mime_part = mime_part;
-
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
- text_part->content = rspamd_html_process_part_full (
- task->task_pool,
- text_part->html,
- text_part->utf_raw_content,
- &text_part->exceptions,
- task->urls,
- task->emails);
-
- if (text_part->content->len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
- }
-
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) free_byte_array_callback,
- text_part->content);
- g_ptr_array_add (task->text_parts, text_part);
+ else if (!(found_txt || found_html)) {
+ /* Not a text part */
+ return;
}
- else if (found_txt) {
- text_part =
- rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_text_part));
- text_part->mime_part = mime_part;
- text_part->raw.begin = mime_part->raw_data.begin;
- text_part->raw.len = mime_part->raw_data.len;
- text_part->parsed.begin = mime_part->parsed_data.begin;
- text_part->parsed.len = mime_part->parsed_data.len;
- text_part->mime_part = mime_part;
-
- if (mime_part->parsed_data.len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
- g_ptr_array_add (task->text_parts, text_part);
- return;
- }
- rspamd_mime_text_part_maybe_convert (task, text_part);
+ text_part = rspamd_mempool_alloc0 (task->task_pool,
+ sizeof (struct rspamd_mime_text_part));
+ text_part->mime_part = mime_part;
+ text_part->raw.begin = mime_part->raw_data.begin;
+ text_part->raw.len = mime_part->raw_data.len;
+ text_part->parsed.begin = mime_part->parsed_data.begin;
+ text_part->parsed.len = mime_part->parsed_data.len;
- if (text_part->utf_raw_content != NULL) {
- /*
- * We ignore unconverted parts from now as it is dangerous
- * to treat them as text parts
- */
- text_part->content = text_part->utf_raw_content;
- g_ptr_array_add (task->text_parts, text_part);
- }
- else {
+ if (found_html) {
+ if (!rspamd_message_process_html_text_part (task, text_part)) {
return;
}
}
else {
- return;
+ if (!rspamd_message_process_plain_text_part (task, text_part)) {
+ return;
+ }
}
-
+ g_ptr_array_add (task->text_parts, text_part);
mime_part->flags |= RSPAMD_MIME_PART_TEXT;
mime_part->specific.txt = text_part;
text_part->exceptions);
}
- text_part->ucs_len = rspamd_mime_part_create_words (task, text_part);
+ rspamd_mime_part_create_words (task, text_part);
}
/* Creates message from various data using libmagic to detect type */
struct rspamd_mime_part *part;
part = g_ptr_array_index (task->parts, i);
- rspamd_message_process_text_part (task, part);
+ rspamd_message_process_text_part_maybe (task, part);
}
rspamd_images_process (task);
sel = p2;
}
else {
- if (p1->ucs_len > p2->ucs_len) {
+ if (p1->unicode_content->len > p2->unicode_content->len) {
sel = p1;
}
else {
}
}
- rspamd_mime_part_detect_language (task, sel, sel->ucs_len);
+ rspamd_mime_part_detect_language (task, sel);
if (sel->language && sel->language[0]) {
/* Propagate language */
PTR_ARRAY_FOREACH (task->text_parts, i, text_part) {
if (!text_part->language) {
- rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len);
+ rspamd_mime_part_detect_language (task, text_part);
}
rspamd_mime_part_extract_words (task, text_part);
- if (text_part->normalized_words) {
- total_words += text_part->normalized_words->len;
+ if (text_part->utf_words) {
+ total_words += text_part->utf_words->len;
}
}
const gchar *language;
GPtrArray *languages;
const gchar *real_charset;
+
+ /* Raw data in native encoding */
rspamd_ftok_t raw;
rspamd_ftok_t parsed; /* decoded from mime encodings */
- GByteArray *content; /* utf8 encoded processed content */
- GArray *ucs_raw_content; /* unicode raw content (of UChar) */
+ /* UTF8 content */
+ GByteArray *utf_content; /* utf8 encoded processed content */
GByteArray *utf_raw_content; /* utf raw content */
- GByteArray *stripped_content; /* utf content with no newlines */
+ GByteArray *utf_stripped_content; /* utf content with no newlines */
+ GArray *normalized_hashes;
+ GArray *utf_words;
+
+ /* Unicode content, used by libicu */
+ GArray *unicode_raw_content; /* unicode raw content (of UChar) */
+ GArray *unicode_content; /* unicode processed content (of UChar) */
+ GArray *unicode_words;
+
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
struct html_content *html;
GList *exceptions; /**< list of offsets of urls */
struct rspamd_mime_part *mime_part;
- GArray *normalized_words;
- GArray *ucs32_words;
- GArray *normalized_hashes;
+
guint flags;
guint nlines;
guint spaces;
guint empty_lines;
guint capital_letters;
guint numeric_characters;
- guint ucs_len;
};
enum rspamd_received_type {
rspamd_mime_utf8_conv_init ();
utf = text_part->utf_raw_content;
- text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+ text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
sizeof (UChar), utf->len + 1);
- text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter,
- (UChar *)text_part->ucs_raw_content->data,
+ text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
+ (UChar *)text_part->unicode_raw_content->data,
utf->len + 1,
utf->data,
utf->len,
&uc_err);
if (!U_SUCCESS (uc_err)) {
- g_array_free (text_part->ucs_raw_content, TRUE);
- text_part->ucs_raw_content = NULL;
+ g_array_free (text_part->unicode_raw_content, TRUE);
+ text_part->unicode_raw_content = NULL;
}
}
norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
}
- if (!text_part->ucs_raw_content) {
+ if (!text_part->unicode_raw_content) {
return;
}
- src = (UChar *)text_part->ucs_raw_content->data;
- nsym = text_part->ucs_raw_content->len;
+ src = (UChar *)text_part->unicode_raw_content->data;
+ nsym = text_part->unicode_raw_content->len;
/* We can now check if we need to decompose */
end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
}
else {
/* Copy normalised back */
- memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar));
- text_part->ucs_raw_content->len = nsym;
+ memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar));
+ text_part->unicode_raw_content->len = nsym;
text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
}
rspamd_mime_utf8_conv_init ();
if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
- text_part->ucs_raw_content) {
+ text_part->unicode_raw_content) {
clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len,
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len,
clen);
g_byte_array_set_size (text_part->utf_raw_content, dlen);
r = ucnv_fromUChars (utf8_converter,
text_part->utf_raw_content->data,
dlen,
- (UChar *)text_part->ucs_raw_content->data,
- text_part->ucs_raw_content->len,
+ (UChar *)text_part->unicode_raw_content->data,
+ text_part->unicode_raw_content->len,
&uc_err);
text_part->utf_raw_content->len = r;
}
}
- text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+ text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
sizeof (UChar), input->len + 1);
r = ucnv_toUChars (conv,
- (UChar *)text_part->ucs_raw_content->data,
+ (UChar *)text_part->unicode_raw_content->data,
input->len + 1,
input->data,
input->len,
return FALSE;
}
- text_part->ucs_raw_content->len = r;
+ text_part->unicode_raw_content->len = r;
rspamd_mime_text_part_normalise (task, text_part);
/* Now, convert to utf8 */
dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
d = rspamd_mempool_alloc (task->task_pool, dlen);
r = ucnv_fromUChars (utf8_converter, d, dlen,
- (UChar *)text_part->ucs_raw_content->data, r, &uc_err);
+ (UChar *)text_part->unicode_raw_content->data, r, &uc_err);
if (!U_SUCCESS (uc_err)) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
SET_PART_UTF (text_part);
}
+
+/*
+ * Converts utf8 data from `in` to libicu UChars stored in `dest`.
+ *
+ * `dest` is resized to in->len + 1 elements before the conversion and its
+ * length is then set to the number of UChars produced. If libicu reports
+ * a conversion error, `dest` is left empty (len == 0) instead of carrying
+ * a length that does not describe valid data.
+ */
+void
+rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+	int32_t nchars;
+
+	/*
+	 * Other users of utf8_converter in this file call the init routine
+	 * before touching the converter, so do the same here.
+	 */
+	rspamd_mime_utf8_conv_init ();
+
+	/* Room for in->len + 1 UChars, matching the capacity passed below */
+	g_array_set_size (dest, in->len + 1);
+	nchars = ucnv_toUChars (utf8_converter,
+			(UChar *)dest->data,
+			in->len + 1,
+			(const char *)in->data,
+			in->len,
+			&uc_err);
+
+	if (!U_SUCCESS (uc_err) || nchars < 0) {
+		/*
+		 * On failure the returned value may be the required length
+		 * rather than the number of written units: never expose it.
+		 */
+		dest->len = 0;
+	}
+	else {
+		dest->len = nchars;
+	}
+}
*/
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
+/**
+ * Converts utf8 to libicu unichars
+ * @param in input data in utf8 encoding
+ * @param dest array of UChar elements; it is resized by this function and
+ * its length is set to the number of converted elements
+ */
+void rspamd_utf_to_unicode (GByteArray *in, GArray *dest);
+
#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
raw = TRUE;
}
- in = part->content->data;
- len = part->content->len;
+ in = part->utf_content->data;
+ len = part->utf_content->len;
}
}
for (i = 0; i < task->text_parts->len; i++) {
part = g_ptr_array_index (task->text_parts, i);
- if (part->stripped_content) {
- scvec[i + 1] = (guchar *)part->stripped_content->data;
- lenvec[i + 1] = part->stripped_content->len;
+ if (part->utf_stripped_content) {
+ scvec[i + 1] = (guchar *)part->utf_stripped_content->data;
+ lenvec[i + 1] = part->utf_stripped_content->len;
}
else {
scvec[i + 1] = (guchar *)"";
for (i = 0; i < task->text_parts->len; i ++) {
tp = g_ptr_array_index (task->text_parts, i);
- if (tp->normalized_words) {
- g_array_free (tp->normalized_words, TRUE);
+ if (tp->utf_words) {
+ g_array_free (tp->utf_words, TRUE);
}
if (tp->normalized_hashes) {
g_array_free (tp->normalized_hashes, TRUE);
}
- if (tp->ucs32_words) {
- g_array_free (tp->ucs32_words, TRUE);
+ if (tp->unicode_words) {
+ g_array_free (tp->unicode_words, TRUE);
}
if (tp->languages) {
g_ptr_array_unref (tp->languages);
}
- if (tp->ucs_raw_content) {
- g_array_free (tp->ucs_raw_content, TRUE);
+ if (tp->unicode_raw_content) {
+ g_array_free (tp->unicode_raw_content, TRUE);
}
}
{
struct rspamd_url_mimepart_cbdata mcbd;
- if (part->stripped_content == NULL || part->stripped_content->len == 0) {
+ if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) {
msg_warn_task ("got empty text part");
return;
}
mcbd.task = task;
mcbd.part = part;
- rspamd_url_find_multiple (task->task_pool, part->stripped_content->data,
- part->stripped_content->len, is_html, part->newlines,
+ rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
+ part->utf_stripped_content->len, is_html, part->newlines,
rspamd_url_text_part_callback, &mcbd);
}
for (i = 0; i < task->text_parts->len; i++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
- reserved_len += part->normalized_words->len;
+ if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
+ reserved_len += part->utf_words->len;
}
/* XXX: normal window size */
reserved_len += 5;
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+ if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
- part->normalized_words, IS_PART_UTF (part),
+ part->utf_words, IS_PART_UTF (part),
NULL, task->tokens);
}
/* Get next word from specified f_str_t buf */
static gboolean
-rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gsize *rl, gboolean unused)
{
}
static gboolean
-rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gsize *rl,
gboolean check_signature)
switch (how) {
case RSPAMD_TOKENIZE_RAW:
- func = rspamd_tokenizer_get_word_compat;
+ func = rspamd_tokenizer_get_word_raw;
break;
case RSPAMD_TOKENIZE_UTF:
- func = rspamd_tokenizer_get_word;
+ func = rspamd_tokenizer_get_word_utf8;
break;
default:
g_assert_not_reached ();
enum rspamd_tokenize_type {
RSPAMD_TOKENIZE_UTF = 0,
RSPAMD_TOKENIZE_RAW,
- RSPAMD_TOKENIZE_UCS
+ RSPAMD_TOKENIZE_UNICODE
};
/* Compare two token nodes */
rspamd_lua_setclass (L, "rspamd{text}", -1);
if (!type) {
- start = part->content->data;
- len = part->content->len;
+ start = part->utf_content->data;
+ len = part->utf_content->len;
}
else if (strcmp (type, "content") == 0) {
- start = part->content->data;
- len = part->content->len;
+ start = part->utf_content->data;
+ len = part->utf_content->len;
}
else if (strcmp (type, "content_oneline") == 0) {
- start = part->stripped_content->data;
- len = part->stripped_content->len;
+ start = part->utf_stripped_content->data;
+ len = part->utf_stripped_content->len;
}
else if (strcmp (type, "raw_parsed") == 0) {
start = part->parsed.begin;
t = lua_newuserdata (L, sizeof (*t));
rspamd_lua_setclass (L, "rspamd{text}", -1);
- t->start = part->stripped_content->data;
- t->len = part->stripped_content->len;
+ t->start = part->utf_stripped_content->data;
+ t->len = part->utf_stripped_content->len;
t->flags = 0;
return 1;
return 1;
}
- if (IS_PART_EMPTY (part) || part->content == NULL) {
+ if (IS_PART_EMPTY (part) || part->utf_content == NULL) {
lua_pushinteger (L, 0);
}
else {
- lua_pushinteger (L, part->content->len);
+ lua_pushinteger (L, part->utf_content->len);
}
return 1;
return 1;
}
- if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
+ if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
lua_pushinteger (L, 0);
}
else {
- lua_pushinteger (L, part->normalized_words->len);
+ lua_pushinteger (L, part->utf_words->len);
}
return 1;
return luaL_error (L, "invalid arguments");
}
- if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
+ if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
lua_createtable (L, 0, 0);
}
else {
- lua_createtable (L, part->normalized_words->len, 0);
+ lua_createtable (L, part->utf_words->len, 0);
- for (i = 0; i < part->normalized_words->len; i ++) {
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ for (i = 0; i < part->utf_words->len; i ++) {
+ w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
lua_pushlstring (L, w->begin, w->len);
lua_rawseti (L, -2, i + 1);
};
#define STORE_TOKEN(i, t) do { \
- if ((i) < part->normalized_words->len) { \
- word = &g_array_index (part->normalized_words, rspamd_stat_token_t, (i)); \
+ if ((i) < part->utf_words->len) { \
+ word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \
sd->t.begin = word->begin; \
sd->t.len = word->len; \
} \
/* Calculate direct hash */
rspamd_cryptobox_hash_init (&st, key, rspamd_cryptobox_HASHKEYBYTES);
- for (i = 0; i < part->normalized_words->len; i ++) {
- word = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ for (i = 0; i < part->utf_words->len; i ++) {
+ word = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
rspamd_cryptobox_hash_update (&st, word->begin, word->len);
}
sizeof (hexdigest));
lua_pushlstring (L, hexdigest, sizeof (hexdigest) - 1);
- sgl = rspamd_shingles_from_text (part->normalized_words, key,
+ sgl = rspamd_shingles_from_text (part->utf_words, key,
pool, lua_shingles_filter, part, RSPAMD_SHINGLES_MUMHASH);
if (sgl == NULL) {
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->content != NULL) {
- text = part->content->data;
- len = part->content->len;
+ if (!IS_PART_EMPTY (part) && part->utf_content != NULL) {
+ text = part->utf_content->data;
+ len = part->utf_content->len;
if (lua_trie_search_str (L, trie, text, len) != 0) {
found = TRUE;
guint i, ncap = 0;
gdouble cur_score = 0.0;
- if (part == NULL || part->normalized_words == NULL ||
- part->normalized_words->len == 0) {
+ if (part == NULL || part->utf_words == NULL ||
+ part->utf_words->len == 0) {
return;
}
- for (i = 0; i < part->normalized_words->len; i++) {
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ for (i = 0; i < part->utf_words->len; i++) {
+ w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
*/
part->capital_letters += ncap;
- cur_score /= (gdouble)part->normalized_words->len;
+ cur_score /= (gdouble)part->utf_words->len;
if (cur_score > 2.0) {
cur_score = 2.0;
static GArray *
fuzzy_preprocess_words (struct rspamd_mime_text_part *part, rspamd_mempool_t *pool)
{
- return part->normalized_words;
+ return part->utf_words;
}
static void
rspamd_cryptobox_hash_init (&st, rule->hash_key->str,
rule->hash_key->len);
- rspamd_cryptobox_hash_update (&st, part->stripped_content->data,
- part->stripped_content->len);
+ rspamd_cryptobox_hash_update (&st, part->utf_stripped_content->data,
+ part->utf_stripped_content->len);
if (task->subject) {
/* We also include subject */
}
/* Check length of part */
- fac = rule->ctx->text_multiplier * part->content->len;
+ fac = rule->ctx->text_multiplier * part->utf_content->len;
if ((double)min_bytes > fac) {
if (!rule->short_text_direct_hash) {
msg_info_task (
"skip fuzzy check",
task->message_id, min_bytes,
fac,
- part->content->len,
+ part->utf_content->len,
rule->ctx->text_multiplier);
continue;
}
"use direct hash",
task->message_id, min_bytes,
fac,
- part->content->len,
+ part->utf_content->len,
rule->ctx->text_multiplier);
short_text = TRUE;
}
}
- if (part->normalized_words == NULL ||
- part->normalized_words->len == 0) {
+ if (part->utf_words == NULL ||
+ part->utf_words->len == 0) {
msg_info_task ("<%s>, part hash empty, skip fuzzy check",
task->message_id);
continue;
}
if (rule->ctx->min_hash_len != 0 &&
- part->normalized_words->len <
+ part->utf_words->len <
rule->ctx->min_hash_len) {
if (!rule->short_text_direct_hash) {
msg_info_task (