|
|
@@ -0,0 +1,275 @@ |
|
|
|
/*- |
|
|
|
* Copyright 2016 Vsevolod Stakhov |
|
|
|
* |
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
|
* you may not use this file except in compliance with the License. |
|
|
|
* You may obtain a copy of the License at |
|
|
|
* |
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0 |
|
|
|
* |
|
|
|
* Unless required by applicable law or agreed to in writing, software |
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS, |
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
|
|
* See the License for the specific language governing permissions and |
|
|
|
* limitations under the License. |
|
|
|
*/ |
|
|
|
|
|
|
|
#include "config.h" |
|
|
|
#include "libutil/mem_pool.h" |
|
|
|
#include "libutil/regexp.h" |
|
|
|
#include "libserver/task.h" |
|
|
|
#include "message.h" |
|
|
|
#include <iconv.h> |
|
|
|
|
|
|
|
#define UTF8_CHARSET "UTF-8" |
|
|
|
|
|
|
|
#define RSPAMD_CHARSET_FLAG_UTF (1 << 0) |
|
|
|
#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1) |
|
|
|
|
|
|
|
#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF) |
|
|
|
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF) |
|
|
|
|
|
|
|
static rspamd_regexp_t *utf_compatible_re = NULL; |
|
|
|
|
|
|
|
struct rspamd_charset_substitution { |
|
|
|
const gchar *input; |
|
|
|
const gchar *canon; |
|
|
|
gint flags; |
|
|
|
}; |
|
|
|
|
|
|
|
#include "mime_encoding_list.h" |
|
|
|
|
|
|
|
/*
 * Lookup table from charset alias to its substitution entry; built lazily by
 * rspamd_mime_encoding_substitute_init () on the first charset detection.
 */
static GHashTable *sub_hash = NULL;
|
|
|
|
|
|
|
|
|
|
|
static GQuark |
|
|
|
rspamd_iconv_error_quark (void) |
|
|
|
{ |
|
|
|
return g_quark_from_static_string ("iconv error"); |
|
|
|
} |
|
|
|
|
|
|
|
static void |
|
|
|
rspamd_mime_encoding_substitute_init (void) |
|
|
|
{ |
|
|
|
guint i; |
|
|
|
|
|
|
|
sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal); |
|
|
|
|
|
|
|
for (i = 0; i < G_N_ELEMENTS (sub); i ++) { |
|
|
|
g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
static void |
|
|
|
rspamd_charset_normalize (gchar *in) |
|
|
|
{ |
|
|
|
/* |
|
|
|
* This is a simple routine to validate input charset |
|
|
|
* we just check that charset starts with alphanumeric and ends |
|
|
|
* with alphanumeric |
|
|
|
*/ |
|
|
|
gchar *begin, *end; |
|
|
|
gboolean changed = FALSE; |
|
|
|
|
|
|
|
begin = in; |
|
|
|
|
|
|
|
while (*begin && !g_ascii_isalnum (*begin)) { |
|
|
|
begin ++; |
|
|
|
changed = TRUE; |
|
|
|
} |
|
|
|
|
|
|
|
end = begin + strlen (begin) - 1; |
|
|
|
|
|
|
|
while (end > begin && !g_ascii_isalnum (*end)) { |
|
|
|
end --; |
|
|
|
changed = TRUE; |
|
|
|
} |
|
|
|
|
|
|
|
if (changed) { |
|
|
|
memmove (in, begin, end - begin + 2); |
|
|
|
*(end + 1) = '\0'; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
const gchar * |
|
|
|
rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool) |
|
|
|
{ |
|
|
|
gchar *ret = NULL, *h, *t; |
|
|
|
struct rspamd_charset_substitution *s; |
|
|
|
|
|
|
|
if (sub_hash == NULL) { |
|
|
|
rspamd_mime_encoding_substitute_init (); |
|
|
|
} |
|
|
|
|
|
|
|
ret = rspamd_mempool_ftokdup (pool, in); |
|
|
|
rspamd_charset_normalize (ret); |
|
|
|
|
|
|
|
if (memchr (in->begin, '-', in->len) != NULL) { |
|
|
|
/* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */ |
|
|
|
h = ret; |
|
|
|
t = ret; |
|
|
|
|
|
|
|
while (*h != '\0') { |
|
|
|
if (*h != '-') { |
|
|
|
*t++ = *h; |
|
|
|
} |
|
|
|
|
|
|
|
h ++; |
|
|
|
} |
|
|
|
|
|
|
|
*t = '\0'; |
|
|
|
} |
|
|
|
|
|
|
|
s = g_hash_table_lookup (sub_hash, ret); |
|
|
|
|
|
|
|
if (s) { |
|
|
|
return s->canon; |
|
|
|
} |
|
|
|
|
|
|
|
return ret; |
|
|
|
} |
|
|
|
|
|
|
|
gchar * |
|
|
|
rspamd_text_to_utf8 (rspamd_mempool_t *pool, |
|
|
|
gchar *input, gsize len, const gchar *in_enc, |
|
|
|
gsize *olen, GError **err) |
|
|
|
{ |
|
|
|
gchar *s, *d; |
|
|
|
gsize outlen; |
|
|
|
iconv_t ic; |
|
|
|
rspamd_fstring_t *dst; |
|
|
|
gsize remain, ret, inremain = len; |
|
|
|
|
|
|
|
ic = iconv_open (UTF8_CHARSET, in_enc); |
|
|
|
|
|
|
|
if (ic == (iconv_t)-1) { |
|
|
|
g_set_error (err, rspamd_iconv_error_quark (), EINVAL, |
|
|
|
"cannot open iconv for: %s", in_enc); |
|
|
|
|
|
|
|
return NULL; |
|
|
|
} |
|
|
|
|
|
|
|
/* Preallocate for half of characters to be converted */ |
|
|
|
outlen = len + len / 2 + 1; |
|
|
|
dst = rspamd_fstring_sized_new (outlen); |
|
|
|
s = input; |
|
|
|
d = dst->str; |
|
|
|
remain = outlen - 1; |
|
|
|
|
|
|
|
while (inremain > 0 && remain > 0) { |
|
|
|
ret = iconv (ic, &s, &inremain, &d, &remain); |
|
|
|
dst->len = d - dst->str; |
|
|
|
|
|
|
|
if (ret == (gsize)-1) { |
|
|
|
switch (errno) { |
|
|
|
case E2BIG: |
|
|
|
/* Enlarge string */ |
|
|
|
if (inremain > 0) { |
|
|
|
dst = rspamd_fstring_grow (dst, inremain * 2); |
|
|
|
d = dst->str + dst->len; |
|
|
|
remain = dst->allocated - dst->len - 1; |
|
|
|
} |
|
|
|
break; |
|
|
|
case EILSEQ: |
|
|
|
case EINVAL: |
|
|
|
/* Ignore bad characters */ |
|
|
|
if (remain > 0 && inremain > 0) { |
|
|
|
*d++ = '?'; |
|
|
|
s++; |
|
|
|
inremain --; |
|
|
|
remain --; |
|
|
|
} |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
else if (ret == 0) { |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
*d = '\0'; |
|
|
|
*olen = dst->len; |
|
|
|
iconv_close (ic); |
|
|
|
rspamd_mempool_add_destructor (pool, |
|
|
|
(rspamd_mempool_destruct_t)rspamd_fstring_free, dst); |
|
|
|
msg_info_pool ("converted from %s to UTF-8 inlen: %z, outlen: %z", |
|
|
|
in_enc, len, dst->len); |
|
|
|
|
|
|
|
return dst->str; |
|
|
|
} |
|
|
|
|
|
|
|
GByteArray * |
|
|
|
rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, |
|
|
|
struct rspamd_mime_text_part *text_part) |
|
|
|
{ |
|
|
|
GError *err = NULL; |
|
|
|
gsize write_bytes; |
|
|
|
const gchar *charset; |
|
|
|
gchar *res_str; |
|
|
|
GByteArray *result_array, *part_content = text_part->orig; |
|
|
|
struct rspamd_mime_part *part = text_part->mime_part; |
|
|
|
|
|
|
|
if (task->cfg && task->cfg->raw_mode) { |
|
|
|
SET_PART_RAW (text_part); |
|
|
|
return part_content; |
|
|
|
} |
|
|
|
|
|
|
|
if (utf_compatible_re == NULL) { |
|
|
|
utf_compatible_re = rspamd_regexp_new ( |
|
|
|
"^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:us)|(?:ISO-8859-1)|" |
|
|
|
"(?:latin.*)|(?:CSASCII)$", |
|
|
|
"i", NULL); |
|
|
|
} |
|
|
|
|
|
|
|
if (part->ct->charset.len == 0) { |
|
|
|
SET_PART_RAW (text_part); |
|
|
|
return part_content; |
|
|
|
} |
|
|
|
|
|
|
|
charset = rspamd_mime_detect_charset (&part->ct->charset, task->task_pool); |
|
|
|
|
|
|
|
if (charset == NULL) { |
|
|
|
msg_info_task ("<%s>: has invalid charset", task->message_id); |
|
|
|
SET_PART_RAW (text_part); |
|
|
|
|
|
|
|
return part_content; |
|
|
|
} |
|
|
|
|
|
|
|
if (rspamd_regexp_match (utf_compatible_re, charset, strlen (charset), TRUE)) { |
|
|
|
if (g_utf8_validate (part_content->data, part_content->len, NULL)) { |
|
|
|
SET_PART_UTF (text_part); |
|
|
|
return part_content; |
|
|
|
} |
|
|
|
else { |
|
|
|
msg_info_task ("<%s>: contains invalid utf8 characters, assume it as raw", |
|
|
|
task->message_id); |
|
|
|
SET_PART_RAW (text_part); |
|
|
|
return part_content; |
|
|
|
} |
|
|
|
} |
|
|
|
else { |
|
|
|
res_str = rspamd_text_to_utf8 (task->task_pool, part_content->data, |
|
|
|
part_content->len, |
|
|
|
charset, |
|
|
|
&write_bytes, |
|
|
|
&err); |
|
|
|
|
|
|
|
if (res_str == NULL) { |
|
|
|
msg_warn_task ("<%s>: cannot convert from %s to utf8: %s", |
|
|
|
task->message_id, |
|
|
|
charset, |
|
|
|
err ? err->message : "unknown problem"); |
|
|
|
SET_PART_RAW (text_part); |
|
|
|
g_error_free (err); |
|
|
|
|
|
|
|
return part_content; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray)); |
|
|
|
result_array->data = res_str; |
|
|
|
result_array->len = write_bytes; |
|
|
|
SET_PART_UTF (text_part); |
|
|
|
|
|
|
|
return result_array; |
|
|
|
} |