diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2024-11-19 02:39:23 +0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-11-19 02:39:23 +0600 |
commit | 41eab5b874721f7abc23144e5c3386392f1820f7 (patch) | |
tree | 7dff5fcc31a0e97b8a612348889dcbf2e4a40724 | |
parent | c06830279ff9f98d3d5ee2888da7b2fa33a8c339 (diff) | |
parent | 9bd616ca7aca3ff3fcd3a051e110f2d6e9abc76b (diff) | |
download | rspamd-41eab5b874721f7abc23144e5c3386392f1820f7.tar.gz rspamd-41eab5b874721f7abc23144e5c3386392f1820f7.zip |
Merge pull request #5223 from rspamd/vstakhov-fix-2047-encode
Fix RFC 2047 encoding
-rw-r--r-- | src/libmime/mime_headers.c | 137 | ||||
-rw-r--r-- | src/libmime/mime_headers.h | 5 | ||||
-rw-r--r-- | src/libserver/protocol.c | 2 | ||||
-rw-r--r-- | src/libutil/fstring.c | 29 | ||||
-rw-r--r-- | src/lua/lua_util.c | 13 | ||||
-rw-r--r-- | test/rspamd_cxx_unit.cxx | 7 | ||||
-rw-r--r-- | test/rspamd_cxx_unit_rfc2047.hxx | 109 |
7 files changed, 213 insertions, 89 deletions
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index e4bf4db06..63419d6a3 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -754,7 +754,7 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, state = parse_normal; } } /* qmarks >= 3 */ - } /* p == '=' */ + } /* p == '=' */ else { state = got_encoded_start; } @@ -816,88 +816,93 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, } char * -rspamd_mime_header_encode(const char *in, gsize len) +rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) { - const char *p = in, *end = in + len; - char *out, encode_buf[80 * sizeof(uint32_t)]; - GString *res; - gboolean need_encoding = FALSE; + static const size_t max_token_size = 76 - 12; /* 12 is the length of "=?UTF-8?Q??="; */ + GString *outbuf = g_string_sized_new(len); + char *encode_buf = g_alloca(max_token_size + 3); + const char *p = in; + const char *end = in + len; - /* Check if we need to encode */ while (p < end) { - if ((((unsigned char) *p) & 0x80) != 0) { - need_encoding = TRUE; - break; + if (*p == ' ' || *p == '\r' || *p == '\n' || *p == '(' || *p == ')') { + /* Append the separator as is */ + g_string_append_c(outbuf, *p); + p++; } - p++; - } + else { + const char *q = end; + size_t piece_len = q - p, encoded_len = 0; + + /* Check if the piece contains non-ASCII characters */ + gboolean need_encoding = FALSE; + size_t unencoded_prefix = 0, unencoded_suffix = 0; + for (size_t i = 0; i < piece_len; i++) { + unsigned char c = p[i]; + if (c >= 128 || (is_structured && !g_ascii_isalnum(c))) { + need_encoding = TRUE; + unencoded_suffix = 0; + encoded_len += 3; + + if (encoded_len > max_token_size) { + piece_len = i; + q = p + piece_len; + /* No more space */ + break; + } + } + else { + encoded_len++; - if (!need_encoding) { - out = g_malloc(len + 1); - rspamd_strlcpy(out, in, len + 1); - } - else { - /* Need encode */ - gsize ulen, pos; - int r; - const char *prev; - /* Choose step: =?UTF-8?Q?<qp>?= should be less than 76 chars */ - unsigned int step = (76 - 12) / 3 + 1; - - ulen = g_utf8_strlen(in, len); - res = g_string_sized_new(len * 2 + 1); - pos = 0; - prev = in; - /* Adjust chunk size for unicode average length */ - step *= 1.0 * ulen / (double) len; - - while (pos < ulen) { - p = g_utf8_offset_to_pointer(in, pos); - - if (p > prev) { - /* Encode and print */ - r = rspamd_encode_qp2047_buf(prev, p - prev, - encode_buf, sizeof(encode_buf)); - - if (r != -1) { - if (res->len > 0) { - rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r, - encode_buf); + if (encoded_len > max_token_size) { + piece_len = i; + q = p + piece_len; + /* No more space */ + break; + } + + if (need_encoding && (c == '(' || c == ')')) { + /* If we need to encode, we must stop on comments characters */ + piece_len = i; + q = p + piece_len; + /* No more space */ + break; + } + + if (!need_encoding) { + unencoded_prefix++; } else { - rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r, - encode_buf); + unencoded_suffix++; } } } - pos += MIN(step, ulen - pos); - prev = p; - } - - /* Leftover */ - if (prev < end) { - r = rspamd_encode_qp2047_buf(prev, end - prev, - encode_buf, sizeof(encode_buf)); - - if (r != -1) { - if (res->len > 0) { - rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r, - encode_buf); - } - else { - rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r, - encode_buf); - } + if (need_encoding) { + g_string_append_len(outbuf, p, unencoded_prefix); + p += unencoded_prefix; + g_string_append(outbuf, "=?UTF-8?Q?"); + /* Do encode */ + encoded_len = rspamd_encode_qp2047_buf(p, piece_len - unencoded_prefix - unencoded_suffix, + encode_buf, max_token_size + 3); + p += piece_len - unencoded_prefix - unencoded_suffix; + g_string_append_len(outbuf, encode_buf, encoded_len); + g_string_append(outbuf, "?="); + g_string_append_len(outbuf, p, unencoded_suffix); + } + else { + /* No transformation */ + g_string_append_len(outbuf, p, piece_len); } + p = q; } - - out = g_string_free(res, FALSE); } - return out; + /* return the allocated string and free the GString struct */ + return g_string_free(outbuf, FALSE); } + char * rspamd_mime_message_id_generate(const char *fqdn) { diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h index 9f89daece..290f94799 100644 --- a/src/libmime/mime_headers.h +++ b/src/libmime/mime_headers.h @@ -1,5 +1,5 @@ /* - * Copyright 2023 Vsevolod Stakhov + * Copyright 2024 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -100,9 +100,10 @@ char *rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, * Encode mime header if needed * @param in * @param len + * @param is_structured if true, then we encode as structured header (e.g. encode all non alpha-numeric characters) * @return newly allocated encoded header */ -char *rspamd_mime_header_encode(const char *in, gsize len); +char *rspamd_mime_header_encode(const char *in, gsize len, bool is_structured); /** * Generate new unique message id diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index 2dc641dfe..1196d2d14 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -1046,7 +1046,7 @@ rspamd_protocol_rewrite_subject(struct rspamd_task *task) g_string_append_len(subj_buf, c, p - c); } - res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len); + res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len, false); rspamd_mempool_add_destructor(task->task_pool, (rspamd_mempool_destruct_t) g_free, diff --git a/src/libutil/fstring.c b/src/libutil/fstring.c index ffe130477..082620c27 100644 --- a/src/libutil/fstring.c +++ b/src/libutil/fstring.c @@ -1,11 +1,11 @@ -/*- - * Copyright 2016 Vsevolod Stakhov +/* + * Copyright 2024 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -242,10 +242,8 @@ fstrhash_c(uint64_t c, uint64_t hval) uint32_t rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf) { - gsize i; uint64_t hval; - const char *p, *end = NULL; - gunichar uc; + const char *p; if (str == NULL) { return 0; @@ -253,21 +251,26 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf) p = str->begin; hval = str->len; - end = p + str->len; if (is_utf) { if (rspamd_fast_utf8_validate(p, str->len) != 0) { return rspamd_fstrhash_lc(str, FALSE); } - while (p < end) { - uc = g_unichar_tolower(g_utf8_get_char(p)); - hval = fstrhash_c(uc, hval); - p = g_utf8_next_char(p); + + size_t i = 0, len = str->len; + UChar32 uc; + + while (i < len) { + U8_NEXT(p, i, len, uc); + + if (uc > 0) { + hval = fstrhash_c(u_tolower(uc), hval); + } } } else { gsize large_steps = str->len / sizeof(uint64_t); - for (i = 0; i < large_steps; i++, p += sizeof(uint64_t)) { + for (size_t i = 0; i < large_steps; i++, p += sizeof(uint64_t)) { /* Copy to the uint64 lowercasing each byte */ union { char c[sizeof(uint64_t)]; @@ -280,7 +283,7 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf) } gsize remain = str->len % sizeof(uint64_t); - for (i = 0; i < remain; i++, p++) { + for (size_t i = 0; i < remain; i++, p++) { hval = fstrhash_c(g_ascii_tolower(*p), hval); } } diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 251d1e1e7..e92e4977a 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -644,9 +644,10 @@ LUA_FUNCTION_DEF(util, get_hostname); LUA_FUNCTION_DEF(util, parse_content_type); /*** - * @function util.mime_header_encode(hdr) + * @function util.mime_header_encode(hdr[, is_structured]) * Encodes header if needed * @param {string} hdr input header + * @param {boolean} is_structured if true, then we encode as structured header (e.g. encode all non alpha-numeric characters) * @return encoded header */ LUA_FUNCTION_DEF(util, mime_header_encode); @@ -2406,15 +2407,19 @@ static int lua_util_mime_header_encode(lua_State *L) { LUA_TRACE_POINT; - gsize len; - const char *hdr = luaL_checklstring(L, 1, &len); + struct rspamd_lua_text *hdr = lua_check_text_or_string(L, 1); char *encoded; + bool is_structured = false; if (!hdr) { return luaL_error(L, "invalid arguments"); } - encoded = rspamd_mime_header_encode(hdr, len); + if (lua_isboolean(L, 2)) { + is_structured = lua_toboolean(L, 2); + } + + encoded = rspamd_mime_header_encode(hdr->start, hdr->len, is_structured); lua_pushstring(L, encoded); g_free(encoded); diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx index b7cb0c6bf..ff323fb85 100644 --- a/test/rspamd_cxx_unit.cxx +++ b/test/rspamd_cxx_unit.cxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2024 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -25,6 +25,7 @@ #include "rspamd_cxx_local_ptr.hxx" #include "rspamd_cxx_unit_dkim.hxx" #include "rspamd_cxx_unit_cryptobox.hxx" +#include "rspamd_cxx_unit_rfc2047.hxx" static gboolean verbose = false; static const GOptionEntry entries[] = diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx new file mode 100644 index 000000000..ebb11cdc1 --- /dev/null +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -0,0 +1,109 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX +#define RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +#include <string> +#include <vector> +#include "libutil/mem_pool.h" +#include "libmime/mime_headers.h" + +TEST_SUITE("rfc2047 encode") +{ + TEST_CASE("rspamd_mime_header_encode handles ASCII-only input") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::vector<std::pair<std::string, std::string>> cases = { + {"PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]", + "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]"}, + {"Hello World", "Hello World"}, + {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="}, + {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"}, + {"Привет мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?=" + "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"}, + {"", ""}, + {"こんにちは(世界)", "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=" + "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"}, + {"(Hello)", "(Hello)"}, + {"Hello)", "Hello)"}, + {"你好世界", "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="}, + {"これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。", + "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?=" + "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?=" + "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?=" + "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?=" + "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?=" + "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="}, + {"ASCII_Text " + "これは非常に長い非ASCIIテキストで、エンコードが必要になります。", + "ASCII_Text " + "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?=" + "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9?=" + "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?=" + "=?UTF-8?Q?=E3=83=89=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A?=" + "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="}, + {"非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。" + "データが長すぎる場合、正しく分割されるべきです。", + "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?=" + "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?=" + "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?=" + "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?=" + "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?=" + "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?=" + "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?=" + "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?=" + "=?UTF-8?Q?=81=99=E3=80=82?="}, + + }; + + for (const auto &c: cases) { + SUBCASE(c.first.c_str()) + { + gboolean invalid_utf = FALSE; + const char *input = c.first.c_str(); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); + std::string output(output_cstr); + std::string expected_output = c.second; + CHECK(output == expected_output); + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == c.first); + g_free(output_cstr); + } + } + + rspamd_mempool_delete(pool); + } + + TEST_CASE("rspamd_mime_header_encode handles long ASCII input without encoding") + { + // Input string consisting of repeated ASCII characters + std::string input_str(100, 'A');// 100 'A's + const char *input = input_str.c_str(); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); + std::string output(output_cstr); + std::string expected_output = input_str; + + CHECK(output == expected_output); + g_free(output_cstr); + } +} +#endif |