]> source.dussan.org Git - rspamd.git/commitdiff
[Fix] More fixes to rfc2047 encoding
author Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 18 Nov 2024 16:31:15 +0000 (16:31 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 18 Nov 2024 16:31:15 +0000 (16:31 +0000)
src/libmime/mime_headers.c
src/libmime/mime_headers.h
src/libserver/protocol.c
src/lua/lua_util.c
test/rspamd_cxx_unit_rfc2047.hxx

index e4d2ca45870f10dc2ec63de3534eeafea27d6772..a511f5e36b526e3c9853643d20219def50546e96 100644 (file)
@@ -816,7 +816,7 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
 }
 
 char *
-rspamd_mime_header_encode(const char *in, gsize len)
+rspamd_mime_header_encode(const char *in, gsize len, bool is_structured)
 {
        static const size_t max_token_size = 76 - 12; /* 12 is the length of "=?UTF-8?Q??="; */
        GString *outbuf = g_string_sized_new(len);
@@ -831,16 +831,17 @@ rspamd_mime_header_encode(const char *in, gsize len)
                        p++;
                }
                else {
-                       size_t remain = end - p;
-                       gsize next_offset = rspamd_memcspn(p, " \r\n()", MIN(max_token_size, remain));
-                       const char *q = p + next_offset;
+                       const char *q = end;
                        size_t piece_len = q - p, encoded_len = 0;
 
                        /* Check if the piece contains non-ASCII characters */
-                       gboolean has_non_ascii = FALSE;
+                       gboolean need_encoding = FALSE;
+                       size_t unencoded_prefix = 0, unencoded_suffix = 0;
                        for (size_t i = 0; i < piece_len; i++) {
-                               if ((unsigned char) p[i] >= 128) {
-                                       has_non_ascii = TRUE;
+                               unsigned char c = p[i];
+                               if (c >= 128 || (is_structured && !g_ascii_isalnum(c))) {
+                                       need_encoding = TRUE;
+                                       unencoded_suffix = 0;
                                        encoded_len += 3;
 
                                        if (encoded_len > max_token_size) {
@@ -853,21 +854,41 @@ rspamd_mime_header_encode(const char *in, gsize len)
                                else {
                                        encoded_len++;
 
+                                       if (!need_encoding) {
+                                               unencoded_prefix++;
+                                       }
+                                       else {
+                                               unencoded_suffix++;
+                                       }
+
                                        if (encoded_len > max_token_size) {
                                                piece_len = i;
                                                q = p + piece_len;
                                                /* No more space */
                                                break;
                                        }
+
+                                       if (need_encoding && (c == '(' || c == ')')) {
+                                               /* If we need to encode, we must stop on comments characters */
+                                               piece_len = i + 1;
+                                               q = p + piece_len;
+                                               /* No more space */
+                                               break;
+                                       }
                                }
                        }
 
-                       if (has_non_ascii) {
+                       if (need_encoding) {
+                               g_string_append_len(outbuf, p, unencoded_prefix);
+                               p += unencoded_prefix;
                                g_string_append(outbuf, "=?UTF-8?Q?");
                                /* Do encode */
-                               encoded_len = rspamd_encode_qp2047_buf(p, piece_len, encode_buf, max_token_size + 3);
+                               encoded_len = rspamd_encode_qp2047_buf(p, piece_len - unencoded_prefix - unencoded_suffix,
+                                                                                                          encode_buf, max_token_size + 3);
+                               p += piece_len - unencoded_prefix - unencoded_suffix;
                                g_string_append_len(outbuf, encode_buf, encoded_len);
                                g_string_append(outbuf, "?=");
+                               g_string_append_len(outbuf, p, unencoded_suffix);
                        }
                        else {
                                /* No transformation */
index 9f89daece258aa08cd651222e91d920844bb11c2..290f9479908a1031dfeda12fb2d8e550f4c8c823 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2024 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -100,9 +100,10 @@ char *rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
  * Encode mime header if needed
  * @param in
  * @param len
+ * @param is_structured if true, encode as a structured header (e.g. encode all non-alphanumeric characters)
  * @return newly allocated encoded header
  */
-char *rspamd_mime_header_encode(const char *in, gsize len);
+char *rspamd_mime_header_encode(const char *in, gsize len, bool is_structured);
 
 /**
  * Generate new unique message id
index 2dc641dfe0a513256e68df04580471001d74fcef..1196d2d14b080bc13e073b9de4cf6fc74afe9c98 100644 (file)
@@ -1046,7 +1046,7 @@ rspamd_protocol_rewrite_subject(struct rspamd_task *task)
                g_string_append_len(subj_buf, c, p - c);
        }
 
-       res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len);
+       res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len, false);
 
        rspamd_mempool_add_destructor(task->task_pool,
                                                                  (rspamd_mempool_destruct_t) g_free,
index 251d1e1e7ea0e72c1847e73775c04fe12dd55598..e92e4977af5f067f760fb89d20983eb7dd932bd8 100644 (file)
@@ -644,9 +644,10 @@ LUA_FUNCTION_DEF(util, get_hostname);
 LUA_FUNCTION_DEF(util, parse_content_type);
 
 /***
- *  @function util.mime_header_encode(hdr)
+ *  @function util.mime_header_encode(hdr[, is_structured])
  * Encodes header if needed
  * @param {string} hdr input header
+ * @param {boolean} is_structured if true, encode as a structured header (e.g. encode all non-alphanumeric characters)
  * @return encoded header
  */
 LUA_FUNCTION_DEF(util, mime_header_encode);
@@ -2406,15 +2407,19 @@ static int
 lua_util_mime_header_encode(lua_State *L)
 {
        LUA_TRACE_POINT;
-       gsize len;
-       const char *hdr = luaL_checklstring(L, 1, &len);
+       struct rspamd_lua_text *hdr = lua_check_text_or_string(L, 1);
        char *encoded;
+       bool is_structured = false;
 
        if (!hdr) {
                return luaL_error(L, "invalid arguments");
        }
 
-       encoded = rspamd_mime_header_encode(hdr, len);
+       if (lua_isboolean(L, 2)) {
+               is_structured = lua_toboolean(L, 2);
+       }
+
+       encoded = rspamd_mime_header_encode(hdr->start, hdr->len, is_structured);
        lua_pushstring(L, encoded);
        g_free(encoded);
 
index e66c7934024323e06d8f343d4e6ee60817cbfc93..25afd15d55847f6849220dca6217f6fa01693c93 100644 (file)
@@ -21,6 +21,7 @@
 #include "doctest/doctest.h"
 
 #include <string>
+#include "libutil/mem_pool.h"
 #include "libmime/mime_headers.h"
 
 TEST_SUITE("rfc2047 encode")
@@ -28,7 +29,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles ASCII-only input")
        {
                const char *input = "Hello World";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "Hello World";
                CHECK(output == expected_output);
@@ -38,7 +39,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles input with non-ASCII characters")
        {
                const char *input = "Hello Мир";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?=";
                CHECK(output == expected_output);
@@ -48,7 +49,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles mixed input with separators")
        {
                const char *input = "ололо (ололо test)    test";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= "
                                                                          "(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test)    test";
@@ -59,7 +60,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles multiple spaces and separators")
        {
                const char *input = "Привет    мир\nКак дела?";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82?=    "
                                                                          "=?UTF-8?Q?=D0=BC=D0=B8=D1=80?=\n"
@@ -72,7 +73,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles empty input")
        {
                const char *input = "";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr ? output_cstr : "");
                std::string expected_output = "";
                CHECK(output == expected_output);
@@ -82,7 +83,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles input with only separators")
        {
                const char *input = " \r\n()";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = " \r\n()";
                CHECK(output == expected_output);
@@ -92,7 +93,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles non-ASCII separators")
        {
                const char *input = "こんにちは(世界)";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?="
                                                                          "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)";
@@ -103,7 +104,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles input starting with separator")
        {
                const char *input = " (Hello)";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = " (Hello)";
                CHECK(output == expected_output);
@@ -113,7 +114,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles input ending with separator")
        {
                const char *input = "Hello) ";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "Hello) ";
                CHECK(output == expected_output);
@@ -123,7 +124,7 @@ TEST_SUITE("rfc2047 encode")
        TEST_CASE("rspamd_mime_header_encode handles consecutive non-ASCII pieces")
        {
                const char *input = "你好世界";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?=";
                CHECK(output == expected_output);
@@ -133,7 +134,7 @@ TEST_SUITE("rfc2047 encode")
        {
                // Input string consisting of repeated non-ASCII characters
                const char *input = "これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?="
                                                                          "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?="
@@ -151,7 +152,7 @@ TEST_SUITE("rfc2047 encode")
                // Input string consisting of repeated ASCII characters
                std::string input_str(100, 'A');// 100 'A's
                const char *input = input_str.c_str();
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
                std::string expected_output = input_str;
 
@@ -164,7 +165,7 @@ TEST_SUITE("rfc2047 encode")
                // Input string with mix of ASCII and non-ASCII characters forming long pieces
                const char *input = "ASCII_Text "
                                                        "これは非常に長い非ASCIIテキストで、エンコードが必要になります。";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
 
                // Expected output: ASCII text as-is, non-ASCII text encoded and split accordingly
@@ -184,7 +185,7 @@ TEST_SUITE("rfc2047 encode")
                const char *input =
                        "非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。"
                        "データが長すぎる場合、正しく分割されるべきです。";
-               char *output_cstr = rspamd_mime_header_encode(input, strlen(input));
+               char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
                std::string output(output_cstr);
 
                std::string expected_output =