From e2cb33d09efcb989133e2862cea832a14a61dacd Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 16 Nov 2024 14:26:54 +0000 Subject: [Minor] Get rid of glib utf8 functions --- src/libutil/fstring.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/libutil/fstring.c b/src/libutil/fstring.c index ffe130477..082620c27 100644 --- a/src/libutil/fstring.c +++ b/src/libutil/fstring.c @@ -1,11 +1,11 @@ -/*- - * Copyright 2016 Vsevolod Stakhov +/* + * Copyright 2024 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -242,10 +242,8 @@ fstrhash_c(uint64_t c, uint64_t hval) uint32_t rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf) { - gsize i; uint64_t hval; - const char *p, *end = NULL; - gunichar uc; + const char *p; if (str == NULL) { return 0; @@ -253,21 +251,26 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf) p = str->begin; hval = str->len; - end = p + str->len; if (is_utf) { if (rspamd_fast_utf8_validate(p, str->len) != 0) { return rspamd_fstrhash_lc(str, FALSE); } - while (p < end) { - uc = g_unichar_tolower(g_utf8_get_char(p)); - hval = fstrhash_c(uc, hval); - p = g_utf8_next_char(p); + + size_t i = 0, len = str->len; + UChar32 uc; + + while (i < len) { + U8_NEXT(p, i, len, uc); + + if (uc > 0) { + hval = fstrhash_c(u_tolower(uc), hval); + } } } else { gsize large_steps = str->len / sizeof(uint64_t); - for (i = 0; i < large_steps; i++, p += sizeof(uint64_t)) { + for (size_t i = 0; i < large_steps; i++, p += sizeof(uint64_t)) { /* Copy to the uint64 lowercasing each byte */ union { char c[sizeof(uint64_t)]; 
@@ -280,7 +283,7 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf) } gsize remain = str->len % sizeof(uint64_t); - for (i = 0; i < remain; i++, p++) { + for (size_t i = 0; i < remain; i++, p++) { hval = fstrhash_c(g_ascii_tolower(*p), hval); } } -- cgit v1.2.3 From 7be5889e7033f019adb7310685121babf6f802d4 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 16 Nov 2024 15:45:56 +0000 Subject: [Rework] Rewrite rfc2047 encoding as it was totally broken --- src/libmime/mime_headers.c | 114 ++++++++++++++++++--------------------------- 1 file changed, 46 insertions(+), 68 deletions(-) diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index e4bf4db06..9d11210f3 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -754,7 +754,7 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, state = parse_normal; } } /* qmarks >= 3 */ - } /* p == '=' */ + } /* p == '=' */ else { state = got_encoded_start; } @@ -818,86 +818,64 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, char * rspamd_mime_header_encode(const char *in, gsize len) { - const char *p = in, *end = in + len; - char *out, encode_buf[80 * sizeof(uint32_t)]; - GString *res; - gboolean need_encoding = FALSE; + static const size_t max_token_size = 76; + GString *outbuf = g_string_sized_new(len); + size_t encode_buf_size = max_token_size; + char *encode_buf = g_alloca(encode_buf_size + 3); + const char *p = in; + const char *end = in + len; - /* Check if we need to encode */ while (p < end) { - if ((((unsigned char) *p) & 0x80) != 0) { - need_encoding = TRUE; - break; + if (*p == ' ' || *p == '\r' || *p == '\n' || *p == '(' || *p == ')') { + /* Append the separator as is */ + g_string_append_c(outbuf, *p); + p++; } - p++; - } - - if (!need_encoding) { - out = g_malloc(len + 1); - rspamd_strlcpy(out, in, len + 1); - } - else { - /* Need encode */ - gsize ulen, pos; - int r; - const char *prev; - /* Choose step: =?UTF-8?Q??= 
should be less than 76 chars */ - unsigned int step = (76 - 12) / 3 + 1; - - ulen = g_utf8_strlen(in, len); - res = g_string_sized_new(len * 2 + 1); - pos = 0; - prev = in; - /* Adjust chunk size for unicode average length */ - step *= 1.0 * ulen / (double) len; - - while (pos < ulen) { - p = g_utf8_offset_to_pointer(in, pos); - - if (p > prev) { - /* Encode and print */ - r = rspamd_encode_qp2047_buf(prev, p - prev, - encode_buf, sizeof(encode_buf)); - - if (r != -1) { - if (res->len > 0) { - rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r, - encode_buf); - } - else { - rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r, - encode_buf); + else { + size_t remain = end - p; + gsize next_offset = rspamd_memcspn(p, " \r\n()", MIN(max_token_size, remain)); + const char *q = p + next_offset; + size_t piece_len = q - p, encoded_len = 0; + + /* Check if the piece contains non-ASCII characters */ + gboolean has_non_ascii = FALSE; + for (size_t i = 0; i < piece_len; i++) { + if ((unsigned char) p[i] >= 128) { + has_non_ascii = TRUE; + encoded_len += 3; + + if (encoded_len > max_token_size) { + piece_len = i - 1; + q = p + piece_len; + /* No more space */ + break; } } - } - - pos += MIN(step, ulen - pos); - prev = p; - } - - /* Leftover */ - if (prev < end) { - r = rspamd_encode_qp2047_buf(prev, end - prev, - encode_buf, sizeof(encode_buf)); - - if (r != -1) { - if (res->len > 0) { - rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r, - encode_buf); - } else { - rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r, - encode_buf); + encoded_len++; } } - } - out = g_string_free(res, FALSE); + if (has_non_ascii) { + g_string_append(outbuf, "=?UTF-8?Q?"); + /* Do encode */ + gssize encoded_len = rspamd_encode_qp2047_buf(p, piece_len, encode_buf, encode_buf_size); + g_string_append_len(outbuf, encode_buf, encoded_len); + g_string_append(outbuf, "?="); + } + else { + /* No transformation */ + g_string_append_len(outbuf, p, piece_len); + } + p = q; + } } - return out; + /* return the 
allocated string and free the GString struct */ + return g_string_free(outbuf, FALSE); } + char * rspamd_mime_message_id_generate(const char *fqdn) { -- cgit v1.2.3 From 0806e4d11bcc08bdc3b8efbf55c372f844b0a722 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 16 Nov 2024 17:46:44 +0000 Subject: [Fix] Some more fixes --- src/libmime/mime_headers.c | 14 ++- test/rspamd_cxx_unit.cxx | 7 +- test/rspamd_cxx_unit_rfc2047.hxx | 212 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 226 insertions(+), 7 deletions(-) create mode 100644 test/rspamd_cxx_unit_rfc2047.hxx diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index 9d11210f3..3565eefba 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -818,10 +818,9 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, char * rspamd_mime_header_encode(const char *in, gsize len) { - static const size_t max_token_size = 76; + static const size_t max_token_size = 76 - (sizeof("=?UTF-8?Q? 
?=") - 3); GString *outbuf = g_string_sized_new(len); - size_t encode_buf_size = max_token_size; - char *encode_buf = g_alloca(encode_buf_size + 3); + char *encode_buf = g_alloca(max_token_size + 3); const char *p = in; const char *end = in + len; @@ -853,13 +852,20 @@ rspamd_mime_header_encode(const char *in, gsize len) } else { encoded_len++; + + if (encoded_len > max_token_size) { + piece_len = i - 1; + q = p + piece_len; + /* No more space */ + break; + } } } if (has_non_ascii) { g_string_append(outbuf, "=?UTF-8?Q?"); /* Do encode */ - gssize encoded_len = rspamd_encode_qp2047_buf(p, piece_len, encode_buf, encode_buf_size); + encoded_len = rspamd_encode_qp2047_buf(p, piece_len, encode_buf, max_token_size); g_string_append_len(outbuf, encode_buf, encoded_len); g_string_append(outbuf, "?="); } diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx index b7cb0c6bf..ff323fb85 100644 --- a/test/rspamd_cxx_unit.cxx +++ b/test/rspamd_cxx_unit.cxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2024 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -25,6 +25,7 @@ #include "rspamd_cxx_local_ptr.hxx" #include "rspamd_cxx_unit_dkim.hxx" #include "rspamd_cxx_unit_cryptobox.hxx" +#include "rspamd_cxx_unit_rfc2047.hxx" static gboolean verbose = false; static const GOptionEntry entries[] = diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx new file mode 100644 index 000000000..6f2a42414 --- /dev/null +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -0,0 +1,212 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX +#define RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +#include +#include "libmime/mime_headers.h" + +TEST_SUITE("rfc2047 encode") +{ + TEST_CASE("rspamd_mime_header_encode handles ASCII-only input") + { + const char *input = "Hello World"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = "Hello World"; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles input with non-ASCII characters") + { + const char *input = "Hello Мир"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles mixed input with separators") + { + const char *input = "ололо (ололо test) test"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= " + "(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles multiple spaces and separators") + { + const char *input = "Привет мир\nКак дела?"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82?= " + "=?UTF-8?Q?=D0=BC=D0=B8=D1=80?=\n" + "=?UTF-8?Q?=D0=9A=D0=B0=D0=BA?= " + "=?UTF-8?Q?=D0=B4=D0=B5=D0=BB=D0=B0=3F?="; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles empty input") + { + const char *input = ""; + char *output_cstr = 
rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr ? output_cstr : ""); + std::string expected_output = ""; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles input with only separators") + { + const char *input = " \r\n()"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = " \r\n()"; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles non-ASCII separators") + { + const char *input = "こんにちは(世界)"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=" + "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles input starting with separator") + { + const char *input = " (Hello)"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = " (Hello)"; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles input ending with separator") + { + const char *input = "Hello) "; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = "Hello) "; + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles consecutive non-ASCII pieces") + { + const char *input = "你好世界"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="; + CHECK(output == expected_output); + g_free(output_cstr); + } + 
TEST_CASE("rspamd_mime_header_encode handles long non-ASCII input requiring encoded-word splitting") + { + // Input string consisting of repeated non-ASCII characters + const char *input = "これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + + // Expected output with proper splitting into multiple encoded-words + // The actual encoding would produce a long string; we need to split it into parts + // Each encoded-word should be less than or equal to 76 characters (including the '=?UTF-8?Q?' prefix and '?=' suffix) + // For our mock, we'll simulate the splitting + + // For simplicity in this test, we assume that the encoded output, after encoding and wrapping with '=?UTF-8?Q?' and '?=', is split correctly. + + // Construct the expected output manually (in practice, you may want to write a helper to split it) + std::string expected_output = "=?UTF-8?Q?" + "=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7=E3=81=84=E3=83=86=E3=82=AD?= " + "=?UTF-8?Q?=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95?= " + "=?UTF-8?Q?=E3=82=8C=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD=97=E3=82=92=E8?= " + "=?UTF-8?Q?=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99?=."; + + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles long ASCII input without encoding") + { + // Input string consisting of repeated ASCII characters + std::string input_str(100, 'A');// 100 'A's + const char *input = input_str.c_str(); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + std::string expected_output = input_str; + + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("rspamd_mime_header_encode handles long mixed input requiring encoded-word splitting") + { + 
// Input string with mix of ASCII and non-ASCII characters forming long pieces + const char *input = "ASCII_Text " + "これは非常に長い非ASCIIテキストで、エンコードが必要になります。"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + + // Expected output: ASCII text as-is, non-ASCII text encoded and split accordingly + std::string expected_output = "ASCII_Text " + "=?UTF-8?Q?" + "=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86?= " + "=?UTF-8?Q?=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89?= " + "=?UTF-8?Q?=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="; + + CHECK(output == expected_output); + g_free(output_cstr); + } + + TEST_CASE("process_string handles very long non-ASCII word requiring multiple splits") + { + const char *input = + "非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。" + "データが長すぎる場合、正しく分割されるべきです。"; + char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + std::string output(output_cstr); + + std::string expected_output = + "=?UTF-8?Q?=E9=9D=9E=E5=B8=B6=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?=" + "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97?=" + "=?UTF-8?Q?=E3=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF?=" + "=?UTF-8?Q?=E3=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86?=" + "=?UTF-8?Q?=E3=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87?=" + "=?UTF-8?Q?=E3=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B?=" + "=?UTF-8?Q?=E5=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86?=" + "=?UTF-8?Q?=E5=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7?=" + "=?UTF-8?Q?=E3=81=99=E3=80=82?=";// ≤76 chars + + CHECK(output == expected_output); + g_free(output_cstr); + } +} +#endif -- cgit v1.2.3 From a282883e6f9d70a787970e92dc3d7644661cd8a3 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 16 
Nov 2024 18:10:12 +0000 Subject: [Minor] Some more fixes --- src/libmime/mime_headers.c | 8 +++---- test/rspamd_cxx_unit_rfc2047.hxx | 47 +++++++++++++++++----------------------- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index 3565eefba..e4d2ca458 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -818,7 +818,7 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, char * rspamd_mime_header_encode(const char *in, gsize len) { - static const size_t max_token_size = 76 - (sizeof("=?UTF-8?Q? ?=") - 3); + static const size_t max_token_size = 76 - 12; /* 12 is the length of "=?UTF-8?Q??="; */ GString *outbuf = g_string_sized_new(len); char *encode_buf = g_alloca(max_token_size + 3); const char *p = in; @@ -844,7 +844,7 @@ rspamd_mime_header_encode(const char *in, gsize len) encoded_len += 3; if (encoded_len > max_token_size) { - piece_len = i - 1; + piece_len = i; q = p + piece_len; /* No more space */ break; @@ -854,7 +854,7 @@ rspamd_mime_header_encode(const char *in, gsize len) encoded_len++; if (encoded_len > max_token_size) { - piece_len = i - 1; + piece_len = i; q = p + piece_len; /* No more space */ break; @@ -865,7 +865,7 @@ rspamd_mime_header_encode(const char *in, gsize len) if (has_non_ascii) { g_string_append(outbuf, "=?UTF-8?Q?"); /* Do encode */ - encoded_len = rspamd_encode_qp2047_buf(p, piece_len, encode_buf, max_token_size); + encoded_len = rspamd_encode_qp2047_buf(p, piece_len, encode_buf, max_token_size + 3); g_string_append_len(outbuf, encode_buf, encoded_len); g_string_append(outbuf, "?="); } diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index 6f2a42414..e66c79340 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -135,20 +135,12 @@ TEST_SUITE("rfc2047 encode") const char *input = "これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。"; char *output_cstr = 
rspamd_mime_header_encode(input, strlen(input)); std::string output(output_cstr); - - // Expected output with proper splitting into multiple encoded-words - // The actual encoding would produce a long string; we need to split it into parts - // Each encoded-word should be less than or equal to 76 characters (including the '=?UTF-8?Q?' prefix and '?=' suffix) - // For our mock, we'll simulate the splitting - - // For simplicity in this test, we assume that the encoded output, after encoding and wrapping with '=?UTF-8?Q?' and '?=', is split correctly. - - // Construct the expected output manually (in practice, you may want to write a helper to split it) - std::string expected_output = "=?UTF-8?Q?" - "=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7=E3=81=84=E3=83=86=E3=82=AD?= " - "=?UTF-8?Q?=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95?= " - "=?UTF-8?Q?=E3=82=8C=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD=97=E3=82=92=E8?= " - "=?UTF-8?Q?=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99?=."; + std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?=" + "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?=" + "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?=" + "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?=" + "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?=" + "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="; CHECK(output == expected_output); g_free(output_cstr); @@ -177,10 +169,11 @@ TEST_SUITE("rfc2047 encode") // Expected output: ASCII text as-is, non-ASCII text encoded and split accordingly std::string expected_output = "ASCII_Text " - "=?UTF-8?Q?" 
- "=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86?= " - "=?UTF-8?Q?=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89?= " - "=?UTF-8?Q?=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="; + "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7?=" + "=?UTF-8?Q?=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3?=" + "=?UTF-8?Q?=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3?=" + "=?UTF-8?Q?=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A=E3=81=BE=E3?=" + "=?UTF-8?Q?=81=99=E3=80=82?="; CHECK(output == expected_output); g_free(output_cstr); @@ -195,15 +188,15 @@ TEST_SUITE("rfc2047 encode") std::string output(output_cstr); std::string expected_output = - "=?UTF-8?Q?=E9=9D=9E=E5=B8=B6=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?=" - "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97?=" - "=?UTF-8?Q?=E3=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF?=" - "=?UTF-8?Q?=E3=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86?=" - "=?UTF-8?Q?=E3=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87?=" - "=?UTF-8?Q?=E3=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B?=" - "=?UTF-8?Q?=E5=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86?=" - "=?UTF-8?Q?=E5=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7?=" - "=?UTF-8?Q?=E3=81=99=E3=80=82?=";// ≤76 chars + "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?=" + "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?=" + "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?=" + "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?=" + "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?=" + "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?=" + 
"=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?=" + "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?=" + "=?UTF-8?Q?=81=99=E3=80=82?=";// ≤76 chars CHECK(output == expected_output); g_free(output_cstr); -- cgit v1.2.3 From f286abaac361de836a276172ce9e46e4c058b75d Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 18 Nov 2024 16:31:15 +0000 Subject: [Fix] More fixes to rfc2047 encoding --- src/libmime/mime_headers.c | 39 ++++++++++++++++++++++++++++++--------- src/libmime/mime_headers.h | 5 +++-- src/libserver/protocol.c | 2 +- src/lua/lua_util.c | 13 +++++++++---- test/rspamd_cxx_unit_rfc2047.hxx | 29 +++++++++++++++-------------- 5 files changed, 58 insertions(+), 30 deletions(-) diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index e4d2ca458..a511f5e36 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -816,7 +816,7 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, } char * -rspamd_mime_header_encode(const char *in, gsize len) +rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) { static const size_t max_token_size = 76 - 12; /* 12 is the length of "=?UTF-8?Q??="; */ GString *outbuf = g_string_sized_new(len); @@ -831,16 +831,17 @@ rspamd_mime_header_encode(const char *in, gsize len) p++; } else { - size_t remain = end - p; - gsize next_offset = rspamd_memcspn(p, " \r\n()", MIN(max_token_size, remain)); - const char *q = p + next_offset; + const char *q = end; size_t piece_len = q - p, encoded_len = 0; /* Check if the piece contains non-ASCII characters */ - gboolean has_non_ascii = FALSE; + gboolean need_encoding = FALSE; + size_t unencoded_prefix = 0, unencoded_suffix = 0; for (size_t i = 0; i < piece_len; i++) { - if ((unsigned char) p[i] >= 128) { - has_non_ascii = TRUE; + unsigned char c = p[i]; + if (c >= 128 || (is_structured && !g_ascii_isalnum(c))) { + need_encoding = TRUE; + unencoded_suffix = 0; encoded_len 
+= 3; if (encoded_len > max_token_size) { @@ -853,21 +854,41 @@ rspamd_mime_header_encode(const char *in, gsize len) else { encoded_len++; + if (!need_encoding) { + unencoded_prefix++; + } + else { + unencoded_suffix++; + } + if (encoded_len > max_token_size) { piece_len = i; q = p + piece_len; /* No more space */ break; } + + if (need_encoding && (c == '(' || c == ')')) { + /* If we need to encode, we must stop on comments characters */ + piece_len = i + 1; + q = p + piece_len; + /* No more space */ + break; + } } } - if (has_non_ascii) { + if (need_encoding) { + g_string_append_len(outbuf, p, unencoded_prefix); + p += unencoded_prefix; g_string_append(outbuf, "=?UTF-8?Q?"); /* Do encode */ - encoded_len = rspamd_encode_qp2047_buf(p, piece_len, encode_buf, max_token_size + 3); + encoded_len = rspamd_encode_qp2047_buf(p, piece_len - unencoded_prefix - unencoded_suffix, + encode_buf, max_token_size + 3); + p += piece_len - unencoded_prefix - unencoded_suffix; g_string_append_len(outbuf, encode_buf, encoded_len); g_string_append(outbuf, "?="); + g_string_append_len(outbuf, p, unencoded_suffix); } else { /* No transformation */ diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h index 9f89daece..290f94799 100644 --- a/src/libmime/mime_headers.h +++ b/src/libmime/mime_headers.h @@ -1,5 +1,5 @@ /* - * Copyright 2023 Vsevolod Stakhov + * Copyright 2024 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -100,9 +100,10 @@ char *rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, * Encode mime header if needed * @param in * @param len + * @param is_structured if true, then we encode as structured header (e.g. 
encode all non alpha-numeric characters) * @return newly allocated encoded header */ -char *rspamd_mime_header_encode(const char *in, gsize len); +char *rspamd_mime_header_encode(const char *in, gsize len, bool is_structured); /** * Generate new unique message id diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index 2dc641dfe..1196d2d14 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -1046,7 +1046,7 @@ rspamd_protocol_rewrite_subject(struct rspamd_task *task) g_string_append_len(subj_buf, c, p - c); } - res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len); + res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len, false); rspamd_mempool_add_destructor(task->task_pool, (rspamd_mempool_destruct_t) g_free, diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 251d1e1e7..e92e4977a 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -644,9 +644,10 @@ LUA_FUNCTION_DEF(util, get_hostname); LUA_FUNCTION_DEF(util, parse_content_type); /*** - * @function util.mime_header_encode(hdr) + * @function util.mime_header_encode(hdr[, is_structured]) * Encodes header if needed * @param {string} hdr input header + * @param {boolean} is_structured if true, then we encode as structured header (e.g. 
encode all non alpha-numeric characters) * @return encoded header */ LUA_FUNCTION_DEF(util, mime_header_encode); @@ -2406,15 +2407,19 @@ static int lua_util_mime_header_encode(lua_State *L) { LUA_TRACE_POINT; - gsize len; - const char *hdr = luaL_checklstring(L, 1, &len); + struct rspamd_lua_text *hdr = lua_check_text_or_string(L, 1); char *encoded; + bool is_structured = false; if (!hdr) { return luaL_error(L, "invalid arguments"); } - encoded = rspamd_mime_header_encode(hdr, len); + if (lua_isboolean(L, 2)) { + is_structured = lua_toboolean(L, 2); + } + + encoded = rspamd_mime_header_encode(hdr->start, hdr->len, is_structured); lua_pushstring(L, encoded); g_free(encoded); diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index e66c79340..25afd15d5 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -21,6 +21,7 @@ #include "doctest/doctest.h" #include +#include "libutil/mem_pool.h" #include "libmime/mime_headers.h" TEST_SUITE("rfc2047 encode") @@ -28,7 +29,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles ASCII-only input") { const char *input = "Hello World"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "Hello World"; CHECK(output == expected_output); @@ -38,7 +39,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles input with non-ASCII characters") { const char *input = "Hello Мир"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="; CHECK(output == expected_output); @@ -48,7 +49,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles mixed input with 
separators") { const char *input = "ололо (ололо test) test"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= " "(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"; @@ -59,7 +60,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles multiple spaces and separators") { const char *input = "Привет мир\nКак дела?"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82?= " "=?UTF-8?Q?=D0=BC=D0=B8=D1=80?=\n" @@ -72,7 +73,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles empty input") { const char *input = ""; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr ? 
output_cstr : ""); std::string expected_output = ""; CHECK(output == expected_output); @@ -82,7 +83,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles input with only separators") { const char *input = " \r\n()"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = " \r\n()"; CHECK(output == expected_output); @@ -92,7 +93,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles non-ASCII separators") { const char *input = "こんにちは(世界)"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=" "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"; @@ -103,7 +104,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles input starting with separator") { const char *input = " (Hello)"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = " (Hello)"; CHECK(output == expected_output); @@ -113,7 +114,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles input ending with separator") { const char *input = "Hello) "; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "Hello) "; CHECK(output == expected_output); @@ -123,7 +124,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("rspamd_mime_header_encode handles consecutive non-ASCII pieces") { const char *input = "你好世界"; - char *output_cstr = rspamd_mime_header_encode(input, 
strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="; CHECK(output == expected_output); @@ -133,7 +134,7 @@ TEST_SUITE("rfc2047 encode") { // Input string consisting of repeated non-ASCII characters const char *input = "これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?=" "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?=" @@ -151,7 +152,7 @@ TEST_SUITE("rfc2047 encode") // Input string consisting of repeated ASCII characters std::string input_str(100, 'A');// 100 'A's const char *input = input_str.c_str(); - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = input_str; @@ -164,7 +165,7 @@ TEST_SUITE("rfc2047 encode") // Input string with mix of ASCII and non-ASCII characters forming long pieces const char *input = "ASCII_Text " "これは非常に長い非ASCIIテキストで、エンコードが必要になります。"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); // Expected output: ASCII text as-is, non-ASCII text encoded and split accordingly @@ -184,7 +185,7 @@ TEST_SUITE("rfc2047 encode") const char *input = "非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。" "データが長すぎる場合、正しく分割されるべきです。"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input)); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string 
expected_output = -- cgit v1.2.3 From 16f5221d71b3a25412412fc353a9d0a660783c7e Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 18 Nov 2024 16:37:39 +0000 Subject: [Test] Rework tests structure --- test/rspamd_cxx_unit_rfc2047.hxx | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index 25afd15d5..cdd37d882 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -21,6 +21,7 @@ #include "doctest/doctest.h" #include <string> +#include <vector> #include "libutil/mem_pool.h" #include "libmime/mime_headers.h" @@ -28,12 +29,29 @@ TEST_SUITE("rfc2047 encode") { TEST_CASE("rspamd_mime_header_encode handles ASCII-only input") { - const char *input = "Hello World"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "Hello World"; - CHECK(output == expected_output); - g_free(output_cstr); + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::vector<std::pair<std::string, std::string>> cases = { + {"Hello World", "Hello World"}, + {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="}}; + + for (const auto &c: cases) { + SUBCASE(c.first.c_str()) + { + gboolean invalid_utf = FALSE; + const char *input = c.first.c_str(); + char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); + std::string output(output_cstr); + std::string expected_output = c.second; + CHECK(output == expected_output); + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + } + } + + rspamd_mempool_delete(pool); } TEST_CASE("rspamd_mime_header_encode handles input with non-ASCII characters") -- cgit v1.2.3 From 4c1bfe96ff449fb68b8b363f8679ada84a2f82de Mon Sep 17 00:00:00 2001 From:
Vsevolod Stakhov Date: Mon, 18 Nov 2024 16:51:44 +0000 Subject: [Test] Test encode with decode --- test/rspamd_cxx_unit_rfc2047.hxx | 195 ++++++++------------------------------- 1 file changed, 39 insertions(+), 156 deletions(-) diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index cdd37d882..e403f99ed 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -32,7 +32,44 @@ TEST_SUITE("rfc2047 encode") rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); std::vector<std::pair<std::string, std::string>> cases = { {"Hello World", "Hello World"}, - {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="}}; + {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="}, + {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= " + "(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"}, + {"Привет    мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?=" + "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"}, + {"", ""}, + {"こんにちは(世界)", "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=" + "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"}, + {"(Hello)", "(Hello)"}, + {"Hello)", "Hello)"}, + {"你好世界", "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="}, + {"これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。", + "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?=" + "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?=" + "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?=" + "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?=" + "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?=" + "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="}, + {"ASCII_Text " + "これは非常に長い非ASCIIテキストで、エンコードが必要になります。", + "ASCII_Text " + "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?=" +
"=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9?=" + "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?=" + "=?UTF-8?Q?=E3=83=89=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A?=" + "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="}, + {"非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。" + "データが長すぎる場合、正しく分割されるべきです。", + "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?=" + "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?=" + "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?=" + "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?=" + "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?=" + "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?=" + "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?=" + "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?=" + "=?UTF-8?Q?=81=99=E3=80=82?="}, + }; for (const auto &c: cases) { SUBCASE(c.first.c_str()) @@ -46,7 +83,7 @@ TEST_SUITE("rfc2047 encode") char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); std::string decoded(decoded_cstr); CHECK(invalid_utf == FALSE); - CHECK(decoded == input); + CHECK(decoded == c.first); g_free(output_cstr); } } @@ -54,117 +91,6 @@ TEST_SUITE("rfc2047 encode") rspamd_mempool_delete(pool); } - TEST_CASE("rspamd_mime_header_encode handles input with non-ASCII characters") - { - const char *input = "Hello Мир"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles mixed input with separators") - { - const char *input = "ололо (ололо test) test"; - char *output_cstr = 
rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= " - "(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles multiple spaces and separators") - { - const char *input = "Привет мир\nКак дела?"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82?= " - "=?UTF-8?Q?=D0=BC=D0=B8=D1=80?=\n" - "=?UTF-8?Q?=D0=9A=D0=B0=D0=BA?= " - "=?UTF-8?Q?=D0=B4=D0=B5=D0=BB=D0=B0=3F?="; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles empty input") - { - const char *input = ""; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr ? 
output_cstr : ""); - std::string expected_output = ""; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles input with only separators") - { - const char *input = " \r\n()"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = " \r\n()"; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles non-ASCII separators") - { - const char *input = "こんにちは(世界)"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=" - "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles input starting with separator") - { - const char *input = " (Hello)"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = " (Hello)"; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles input ending with separator") - { - const char *input = "Hello) "; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "Hello) "; - CHECK(output == expected_output); - g_free(output_cstr); - } - - TEST_CASE("rspamd_mime_header_encode handles consecutive non-ASCII pieces") - { - const char *input = "你好世界"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="; - CHECK(output == expected_output); - g_free(output_cstr); - } - TEST_CASE("rspamd_mime_header_encode handles long non-ASCII input requiring 
encoded-word splitting") - { - // Input string consisting of repeated non-ASCII characters - const char *input = "これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?=" - "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?=" - "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?=" - "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?=" - "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?=" - "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="; - - CHECK(output == expected_output); - g_free(output_cstr); - } - TEST_CASE("rspamd_mime_header_encode handles long ASCII input without encoding") { // Input string consisting of repeated ASCII characters @@ -177,48 +103,5 @@ TEST_SUITE("rfc2047 encode") CHECK(output == expected_output); g_free(output_cstr); } - - TEST_CASE("rspamd_mime_header_encode handles long mixed input requiring encoded-word splitting") - { - // Input string with mix of ASCII and non-ASCII characters forming long pieces - const char *input = "ASCII_Text " - "これは非常に長い非ASCIIテキストで、エンコードが必要になります。"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - - // Expected output: ASCII text as-is, non-ASCII text encoded and split accordingly - std::string expected_output = "ASCII_Text " - "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7?=" - "=?UTF-8?Q?=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3?=" - "=?UTF-8?Q?=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3?=" - "=?UTF-8?Q?=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A=E3=81=BE=E3?=" - "=?UTF-8?Q?=81=99=E3=80=82?="; - - CHECK(output == expected_output); - 
g_free(output_cstr); - } - - TEST_CASE("process_string handles very long non-ASCII word requiring multiple splits") - { - const char *input = - "非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。" - "データが長すぎる場合、正しく分割されるべきです。"; - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - - std::string expected_output = - "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?=" - "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?=" - "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?=" - "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?=" - "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?=" - "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?=" - "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?=" - "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?=" - "=?UTF-8?Q?=81=99=E3=80=82?=";// ≤76 chars - - CHECK(output == expected_output); - g_free(output_cstr); - } } #endif -- cgit v1.2.3 From 0f467cca8287f12aa890a4378c76946ccef55abf Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 18 Nov 2024 17:30:51 +0000 Subject: [Fix] More things to fix --- src/libmime/mime_headers.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index a511f5e36..f33311bbf 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -854,13 +854,6 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) else { encoded_len++; - if (!need_encoding) { - unencoded_prefix++; - } - else { - unencoded_suffix++; - } - if (encoded_len > max_token_size) { piece_len = i; q = p + piece_len; @@ -875,6 +868,13 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) /* No more space */ break; } + + if (!need_encoding) { + 
unencoded_prefix++; + } + else { + unencoded_suffix++; + } } } -- cgit v1.2.3 From 5298f6ce75da70ad4c906bc9180f0ac4b9d86ec0 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 18 Nov 2024 17:43:00 +0000 Subject: [Test] Fix some tests again --- test/rspamd_cxx_unit_rfc2047.hxx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index e403f99ed..0f6f2eabb 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -31,10 +31,11 @@ TEST_SUITE("rfc2047 encode") { rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); std::vector<std::pair<std::string, std::string>> cases = { + {"PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]", + "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]"}, {"Hello World", "Hello World"}, {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="}, - {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= " - "(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"}, + {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE_?=(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE_?=test) test"}, {"Привет    мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?=" "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"}, {"", ""}, @@ -69,6 +70,7 @@ TEST_SUITE("rfc2047 encode") "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?=" "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?=" "=?UTF-8?Q?=81=99=E3=80=82?="}, + }; for (const auto &c: cases) { -- cgit v1.2.3 From 9bd616ca7aca3ff3fcd3a051e110f2d6e9abc76b Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 18 Nov 2024 18:43:03 +0000 Subject: [Fix] Sigh, another fix --- src/libmime/mime_headers.c | 2 +- test/rspamd_cxx_unit_rfc2047.hxx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libmime/mime_headers.c
b/src/libmime/mime_headers.c index f33311bbf..63419d6a3 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -863,7 +863,7 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) if (need_encoding && (c == '(' || c == ')')) { /* If we need to encode, we must stop on comments characters */ - piece_len = i + 1; + piece_len = i; q = p + piece_len; /* No more space */ break; diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index 0f6f2eabb..ebb11cdc1 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -35,7 +35,7 @@ TEST_SUITE("rfc2047 encode") "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]"}, {"Hello World", "Hello World"}, {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="}, - {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE_?=(=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE_?=test) test"}, + {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"}, {"Привет    мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?=" "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"}, {"", ""}, -- cgit v1.2.3