Browse Source

[Feature] Add mime encoding manipulation routines

tags/1.5.0
Vsevolod Stakhov 7 years ago
parent
commit
1e672eedb5
4 changed files with 1916 additions and 1 deletions
  1. 2
    1
      src/libmime/CMakeLists.txt
  2. 275
    0
      src/libmime/mime_encoding.c
  3. 62
    0
      src/libmime/mime_encoding.h
  4. 1577
    0
      src/libmime/mime_encoding_list.h

+ 2
- 1
src/libmime/CMakeLists.txt View File

@@ -8,6 +8,7 @@ SET(LIBRSPAMDMIMESRC
${CMAKE_CURRENT_SOURCE_DIR}/archives.c
${CMAKE_CURRENT_SOURCE_DIR}/content_type.c
${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c
${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c)
${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c)

SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)

+ 275
- 0
src/libmime/mime_encoding.c View File

@@ -0,0 +1,275 @@
/*-
* Copyright 2016 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "config.h"
#include "libutil/mem_pool.h"
#include "libutil/regexp.h"
#include "libserver/task.h"
#include "message.h"
#include <iconv.h>

/* Name of the target encoding handed to iconv_open() */
#define UTF8_CHARSET "UTF-8"

/*
 * Bit values for the `flags` field of struct rspamd_charset_substitution
 * (presumably set by the table in mime_encoding_list.h; not referenced by
 * the code visible in this file -- TODO confirm)
 */
#define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)

/* Clear / set the RSPAMD_MIME_TEXT_PART_FLAG_UTF bit in a text part's flags */
#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)

/*
 * Regexp for charset names whose content is not passed through iconv;
 * compiled on first use by rspamd_mime_text_part_maybe_convert()
 */
static rspamd_regexp_t *utf_compatible_re = NULL;

/*
 * One entry of the charset substitution table (the `sub` array used by
 * rspamd_mime_encoding_substitute_init()).
 */
struct rspamd_charset_substitution {
	/* Charset name as it may appear in a message; used as the hash key */
	const gchar *input;
	/* Name returned by rspamd_mime_detect_charset() in place of `input` */
	const gchar *canon;
	/* Presumably a mask of RSPAMD_CHARSET_FLAG_* values -- TODO confirm */
	gint flags;
};

#include "mime_encoding_list.h"

/*
 * Maps `input` names to their struct rspamd_charset_substitution entries,
 * keyed with rspamd_strcase_hash/rspamd_strcase_equal; created on the first
 * call to rspamd_mime_detect_charset()
 */
static GHashTable *sub_hash = NULL;


static GQuark
rspamd_iconv_error_quark (void)
{
return g_quark_from_static_string ("iconv error");
}

static void
rspamd_mime_encoding_substitute_init (void)
{
guint i;

sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);

for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
}
}

/**
 * Strip, in place, every leading and trailing character of a charset name
 * that is not an ASCII letter or digit. Characters in the middle of the
 * name are left untouched. A name without any alphanumeric character
 * becomes the empty string.
 * @param in writable NUL-terminated charset name
 */
static void
rspamd_charset_normalize (gchar *in)
{
	gchar *begin, *end;
	gsize len;

	begin = in;

	while (*begin != '\0' && !g_ascii_isalnum (*begin)) {
		begin ++;
	}

	/*
	 * `end` points one past the last character to keep, so it never has
	 * to step before `begin`, even for an empty string
	 */
	end = begin + strlen (begin);

	while (end > begin && !g_ascii_isalnum (*(end - 1))) {
		end --;
	}

	len = end - begin;

	if (begin != in) {
		/* Regions overlap whenever leading characters were skipped */
		memmove (in, begin, len);
	}

	/* Terminate relative to the new start, not the pre-move position */
	in[len] = '\0';
}

/**
 * Turn a charset name taken from a message into a name suitable for iconv.
 *
 * The name is copied into `pool`, stripped of leading/trailing
 * non-alphanumeric characters and of all '-' characters, and then looked up
 * in the substitution table (created on the first call).
 *
 * @param in charset name token
 * @param pool pool used for the working copy of the name
 * @return the canonical name from the substitution table if the cleaned
 * name is known there, otherwise the cleaned copy allocated from `pool`;
 * NULL if nothing is left of the name after cleaning
 */
const gchar *
rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
{
	gchar *ret = NULL, *h, *t;
	struct rspamd_charset_substitution *s;

	if (sub_hash == NULL) {
		rspamd_mime_encoding_substitute_init ();
	}

	ret = rspamd_mempool_ftokdup (pool, in);
	rspamd_charset_normalize (ret);

	if (memchr (in->begin, '-', in->len) != NULL) {
		/* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
		h = ret;
		t = ret;

		while (*h != '\0') {
			if (*h != '-') {
				*t++ = *h;
			}

			h ++;
		}

		*t = '\0';
	}

	if (*ret == '\0') {
		/*
		 * An empty name is not a charset; report it as invalid instead of
		 * handing "" to iconv_open (), which some implementations accept
		 * as "the locale's charset"
		 */
		return NULL;
	}

	s = g_hash_table_lookup (sub_hash, ret);

	if (s) {
		return s->canon;
	}

	return ret;
}

/**
 * Convert a text chunk from `in_enc` to UTF-8 using iconv.
 *
 * Input bytes that iconv rejects (EILSEQ/EINVAL) are replaced by '?' one
 * byte at a time. The output buffer is an rspamd_fstring_t whose release is
 * registered as a destructor on `pool`.
 *
 * @param pool pool that takes ownership of the output buffer
 * @param input text to convert
 * @param len length of `input` in bytes
 * @param in_enc iconv name of the input encoding
 * @param olen receives the length of the converted text, without the
 * terminating NUL
 * @param err set when iconv cannot be opened for `in_enc`
 * @return NUL-terminated converted text, or NULL if iconv_open () failed
 */
gchar *
rspamd_text_to_utf8 (rspamd_mempool_t *pool,
		gchar *input, gsize len, const gchar *in_enc,
		gsize *olen, GError **err)
{
	gchar *s, *d;
	gsize outlen;
	iconv_t ic;
	rspamd_fstring_t *dst;
	gsize remain, ret, inremain = len;
	gboolean stop = FALSE;

	ic = iconv_open (UTF8_CHARSET, in_enc);

	if (ic == (iconv_t)-1) {
		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
				"cannot open iconv for: %s", in_enc);

		return NULL;
	}

	/* Initial guess: 1.5x the input plus one byte kept for the final NUL */
	outlen = len + len / 2 + 1;
	dst = rspamd_fstring_sized_new (outlen);
	s = input;
	d = dst->str;
	remain = outlen - 1;

	while (!stop && inremain > 0 && remain > 0) {
		ret = iconv (ic, &s, &inremain, &d, &remain);
		/* Must be current before a possible grow below */
		dst->len = d - dst->str;

		if (ret == (gsize)-1) {
			switch (errno) {
			case E2BIG:
				/* Enlarge string */
				if (inremain > 0) {
					dst = rspamd_fstring_grow (dst, inremain * 2);
					d = dst->str + dst->len;
					remain = dst->allocated - dst->len - 1;
				}
				break;
			case EILSEQ:
			case EINVAL:
				/* Ignore bad characters */
				if (remain > 0 && inremain > 0) {
					*d++ = '?';
					s++;
					inremain --;
					remain --;
				}
				break;
			default:
				/*
				 * Nothing was consumed and we do not know how to recover:
				 * stop here instead of retrying the same call forever
				 */
				stop = TRUE;
				break;
			}
		}
		else if (ret == 0) {
			break;
		}
	}

	/* Account for a '?' written after the last iconv () call */
	dst->len = d - dst->str;
	*d = '\0';
	*olen = dst->len;
	iconv_close (ic);
	rspamd_mempool_add_destructor (pool,
			(rspamd_mempool_destruct_t)rspamd_fstring_free, dst);
	msg_info_pool ("converted from %s to UTF-8 inlen: %z, outlen: %z",
			in_enc, len, dst->len);

	return dst->str;
}

/**
 * Return the content of a text part as UTF-8 where possible and record the
 * outcome in the part's RSPAMD_MIME_TEXT_PART_FLAG_UTF flag.
 *
 * The original content (`text_part->orig`) is returned unchanged, with the
 * part marked raw, when raw mode is configured, when the content type has
 * no charset, when the charset is invalid, when a utf-compatible charset
 * carries invalid UTF-8, or when the conversion cannot be started. It is
 * also returned unchanged, with the part marked UTF, when the charset is
 * utf-compatible and the data validates as UTF-8.
 *
 * Otherwise the content is converted with iconv and returned in a new
 * GByteArray. That array is a plain struct allocated from the task pool,
 * not one created by g_byte_array_new ().
 *
 * @param task task owning the part; supplies configuration, pool and
 * logging context
 * @param text_part part to examine
 * @return either `text_part->orig` or a pool-allocated array with the
 * converted text
 */
GByteArray *
rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part)
{
	GError *err = NULL;
	gsize write_bytes;
	const gchar *charset;
	gchar *res_str;
	GByteArray *result_array, *part_content = text_part->orig;
	struct rspamd_mime_part *part = text_part->mime_part;

	if (task->cfg && task->cfg->raw_mode) {
		SET_PART_RAW (text_part);
		return part_content;
	}

	if (utf_compatible_re == NULL) {
		utf_compatible_re = rspamd_regexp_new (
				"^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:us)|(?:ISO-8859-1)|"
				"(?:latin.*)|(?:CSASCII)$",
				"i", NULL);
	}

	if (part->ct->charset.len == 0) {
		SET_PART_RAW (text_part);
		return part_content;
	}

	charset = rspamd_mime_detect_charset (&part->ct->charset, task->task_pool);

	if (charset == NULL) {
		msg_info_task ("<%s>: has invalid charset", task->message_id);
		SET_PART_RAW (text_part);

		return part_content;
	}

	if (rspamd_regexp_match (utf_compatible_re, charset, strlen (charset), TRUE)) {
		/* No conversion for these charsets: only decide how to flag the part */
		if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
			SET_PART_UTF (text_part);
		}
		else {
			msg_info_task ("<%s>: contains invalid utf8 characters, assume it as raw",
					task->message_id);
			SET_PART_RAW (text_part);
		}

		return part_content;
	}

	res_str = rspamd_text_to_utf8 (task->task_pool, part_content->data,
			part_content->len,
			charset,
			&write_bytes,
			&err);

	if (res_str == NULL) {
		msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
				task->message_id,
				charset,
				err ? err->message : "unknown problem");
		SET_PART_RAW (text_part);
		/* Unlike g_error_free (), this is safe when no error was set */
		g_clear_error (&err);

		return part_content;
	}

	result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
	result_array->data = res_str;
	result_array->len = write_bytes;
	SET_PART_UTF (text_part);

	return result_array;
}

+ 62
- 0
src/libmime/mime_encoding.h View File

@@ -0,0 +1,62 @@
/*-
* Copyright 2016 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SRC_LIBMIME_MIME_ENCODING_H_
#define SRC_LIBMIME_MIME_ENCODING_H_

#include "config.h"
#include "mem_pool.h"

struct rspamd_task;
struct rspamd_mime_part;
struct rspamd_mime_text_part;

/**
 * Convert charset to a valid iconv charset.
 *
 * The parameter order matches the definition in mime_encoding.c.
 * @param in charset name as found in the message
 * @param pool pool to store temporary data
 * @return canonical charset name if the input is a known alias, otherwise
 * a cleaned-up copy of the input allocated from `pool`
 */
const gchar * rspamd_mime_detect_charset (const rspamd_ftok_t *in,
		rspamd_mempool_t *pool);

/**
 * Convert text chunk to utf-8 using iconv.
 *
 * The name matches the definition in mime_encoding.c. `in_enc` is passed
 * to iconv as is, so callers are expected to canonicalise it first (e.g.
 * with `rspamd_mime_detect_charset`). Input bytes that cannot be converted
 * are replaced by '?'. The output buffer is released together with `pool`.
 * @param pool pool that owns the output buffer
 * @param input text to convert
 * @param len length of `input` in bytes
 * @param in_enc iconv name of the input encoding
 * @param olen receives the length of the converted text
 * @param err set if iconv cannot be opened for `in_enc`
 * @return NUL-terminated converted text or NULL if iconv cannot be opened
 */
gchar * rspamd_text_to_utf8 (rspamd_mempool_t *pool,
		gchar *input, gsize len, const gchar *in_enc,
		gsize *olen, GError **err);

/**
 * Maybe convert part to utf-8 and set or clear the part's UTF flag
 * accordingly.
 * @param task task owning the part (configuration, memory pool, logging)
 * @param text_part part to examine
 * @return the part's original content if no conversion was performed,
 * otherwise an array allocated from the task pool holding the converted text
 */
GByteArray * rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part);


#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */

+ 1577
- 0
src/libmime/mime_encoding_list.h
File diff suppressed because it is too large
View File


Loading…
Cancel
Save