7 files changed, 213 insertions, 89 deletions
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c
index e4bf4db06..63419d6a3 100644
--- a/src/libmime/mime_headers.c
+++ b/src/libmime/mime_headers.c
@@ -754,7 +754,7 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
 						state = parse_normal;
 					}
 				} /* qmarks >= 3 */
-			}     /* p == '=' */
+			} /* p == '=' */
 			else {
 				state = got_encoded_start;
 			}
@@ -816,88 +816,93 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
 }
 
 char *
-rspamd_mime_header_encode(const char *in, gsize len)
+rspamd_mime_header_encode(const char *in, gsize len, bool is_structured)
 {
-	const char *p = in, *end = in + len;
-	char *out, encode_buf[80 * sizeof(uint32_t)];
-	GString *res;
-	gboolean need_encoding = FALSE;
+	static const size_t max_token_size = 76 - 12; /* payload budget of one encoded word: 76 chars minus the 12-char "=?UTF-8?Q??=" wrapper */
+	GString *outbuf = g_string_sized_new(len); /* output accumulator, presized to the input length */
+	char *encode_buf = g_alloca(max_token_size + 3); /* scratch buffer for one encoded payload */
+	const char *p = in;
+	const char *end = in + len;
 
-	/* Check if we need to encode */
 	while (p < end) {
-		if ((((unsigned char) *p) & 0x80) != 0) {
-			need_encoding = TRUE;
-			break;
+		if (*p == ' ' || *p == '\r' || *p == '\n' || *p == '(' || *p == ')') {
+			/* Append the separator as is */
+			g_string_append_c(outbuf, *p);
+			p++;
 		}
-		p++;
-	}
+		else {
+			const char *q = end; /* end of the current piece; pulled back when the scan stops early */
+			size_t piece_len = q - p, encoded_len = 0;
+
+			/* Scan the piece: bytes >= 128 (and, for structured headers, any non-alphanumeric byte) require encoding */
+			gboolean need_encoding = FALSE;
+			size_t unencoded_prefix = 0, unencoded_suffix = 0; /* plain bytes before the first / after the last byte that requires encoding */
+			for (size_t i = 0; i < piece_len; i++) {
+				unsigned char c = p[i];
+				if (c >= 128 || (is_structured && !g_ascii_isalnum(c))) {
+					need_encoding = TRUE;
+					unencoded_suffix = 0;
+					encoded_len += 3; /* budget three output chars for this byte */
+
+					if (encoded_len > max_token_size) {
+						piece_len = i;
+						q = p + piece_len;
+						/* No more space. NOTE(review): need_encoding is already TRUE here, so if nothing before byte i needed encoding an empty encoded word seems to be emitted below - confirm */
+						break;
+					}
+				}
+				else {
+					encoded_len++; /* budget one output char for a plain byte */
 
-	if (!need_encoding) {
-		out = g_malloc(len + 1);
-		rspamd_strlcpy(out, in, len + 1);
-	}
-	else {
-		/* Need encode */
-		gsize ulen, pos;
-		int r;
-		const char *prev;
-		/* Choose step: =?UTF-8?Q?<qp>?= should be less than 76 chars */
-		unsigned int step = (76 - 12) / 3 + 1;
-
-		ulen = g_utf8_strlen(in, len);
-		res = g_string_sized_new(len * 2 + 1);
-		pos = 0;
-		prev = in;
-		/* Adjust chunk size for unicode average length */
-		step *= 1.0 * ulen / (double) len;
-
-		while (pos < ulen) {
-			p = g_utf8_offset_to_pointer(in, pos);
-
-			if (p > prev) {
-				/* Encode and print */
-				r = rspamd_encode_qp2047_buf(prev, p - prev,
-											 encode_buf, sizeof(encode_buf));
-
-				if (r != -1) {
-					if (res->len > 0) {
-						rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r,
-											  encode_buf);
+					if (encoded_len > max_token_size) {
+						piece_len = i;
+						q = p + piece_len;
+						/* No more space */
+						break;
+					}
+
+					if (need_encoding && (c == '(' || c == ')')) {
+						/* If we need to encode, we must stop on comments characters */
+						piece_len = i;
+						q = p + piece_len;
+						/* Stop before the comment delimiter */
+						break;
+					}
+
+					if (!need_encoding) {
+						unencoded_prefix++;
 					}
 					else {
-						rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r,
-											  encode_buf);
+						unencoded_suffix++;
 					}
 				}
 			}
 
-			pos += MIN(step, ulen - pos);
-			prev = p;
-		}
-
-		/* Leftover */
-		if (prev < end) {
-			r = rspamd_encode_qp2047_buf(prev, end - prev,
-										 encode_buf, sizeof(encode_buf));
-
-			if (r != -1) {
-				if (res->len > 0) {
-					rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r,
-										  encode_buf);
-				}
-				else {
-					rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r,
-										  encode_buf);
-				}
+			if (need_encoding) {
+				g_string_append_len(outbuf, p, unencoded_prefix);
+				p += unencoded_prefix;
+				g_string_append(outbuf, "=?UTF-8?Q?");
+				/* Encode the middle part. NOTE(review): the previous implementation checked this call's result against -1; here it is stored in a size_t and used unchecked, while plain bytes are budgeted as one char each - confirm the buffer can never be too small */
+				encoded_len = rspamd_encode_qp2047_buf(p, piece_len - unencoded_prefix - unencoded_suffix,
+													   encode_buf, max_token_size + 3);
+				p += piece_len - unencoded_prefix - unencoded_suffix;
+				g_string_append_len(outbuf, encode_buf, encoded_len);
+				g_string_append(outbuf, "?=");
+				g_string_append_len(outbuf, p, unencoded_suffix); /* trailing plain bytes stay outside the encoded word */
+			}
+			else {
+				/* No transformation */
+				g_string_append_len(outbuf, p, piece_len);
 			}
+			p = q; /* continue right after this piece */
 		}
-
-		out = g_string_free(res, FALSE);
 	}
 
-	return out;
+	/* return the allocated string and free the GString struct */
+	return g_string_free(outbuf, FALSE);
 }
 
+
 char *
 rspamd_mime_message_id_generate(const char *fqdn)
 {
diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h
index 9f89daece..290f94799 100644
--- a/src/libmime/mime_headers.h
+++ b/src/libmime/mime_headers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2024 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -100,9 +100,10 @@ char *rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
  * Encode mime header if needed
  * @param in
  * @param len
+ * @param is_structured if true, encode as a structured header (i.e. any non-alphanumeric character also forces encoding)
  * @return newly allocated encoded header
  */
-char *rspamd_mime_header_encode(const char *in, gsize len);
+char *rspamd_mime_header_encode(const char *in, gsize len, bool is_structured);
 
 /**
  * Generate new unique message id
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 2dc641dfe..1196d2d14 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -1046,7 +1046,7 @@ rspamd_protocol_rewrite_subject(struct rspamd_task *task)
 		g_string_append_len(subj_buf, c, p - c);
 	}
 
-	res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len);
+	res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len, false);
 
 	rspamd_mempool_add_destructor(task->task_pool,
 								  (rspamd_mempool_destruct_t) g_free,
diff --git a/src/libutil/fstring.c b/src/libutil/fstring.c
index ffe130477..082620c27 100644
--- a/src/libutil/fstring.c
+++ b/src/libutil/fstring.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2024 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -242,10 +242,8 @@ fstrhash_c(uint64_t c, uint64_t hval)
 uint32_t
 rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf)
 {
-	gsize i;
 	uint64_t hval;
-	const char *p, *end = NULL;
-	gunichar uc;
+	const char *p;
 
 	if (str == NULL) {
 		return 0;
@@ -253,21 +251,26 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf)
 
 	p = str->begin;
 	hval = str->len;
-	end = p + str->len;
 
 	if (is_utf) {
 		if (rspamd_fast_utf8_validate(p, str->len) != 0) {
 			return rspamd_fstrhash_lc(str, FALSE);
 		}
-		while (p < end) {
-			uc = g_unichar_tolower(g_utf8_get_char(p));
-			hval = fstrhash_c(uc, hval);
-			p = g_utf8_next_char(p);
+
+		size_t i = 0, len = str->len;
+		UChar32 uc;
+
+		while (i < len) {
+			U8_NEXT(p, i, len, uc);
+
+			if (uc > 0) {
+				hval = fstrhash_c(u_tolower(uc), hval);
+			}
 		}
 	}
 	else {
 		gsize large_steps = str->len / sizeof(uint64_t);
-		for (i = 0; i < large_steps; i++, p += sizeof(uint64_t)) {
+		for (size_t i = 0; i < large_steps; i++, p += sizeof(uint64_t)) {
 			/* Copy to the uint64 lowercasing each byte */
 			union {
 				char c[sizeof(uint64_t)];
@@ -280,7 +283,7 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf)
 		}
 
 		gsize remain = str->len % sizeof(uint64_t);
-		for (i = 0; i < remain; i++, p++) {
+		for (size_t i = 0; i < remain; i++, p++) {
 			hval = fstrhash_c(g_ascii_tolower(*p), hval);
 		}
 	}
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 251d1e1e7..e92e4977a 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -644,9 +644,10 @@ LUA_FUNCTION_DEF(util, get_hostname);
 LUA_FUNCTION_DEF(util, parse_content_type);
 
 /***
- *  @function util.mime_header_encode(hdr)
+ *  @function util.mime_header_encode(hdr[, is_structured])
  * Encodes header if needed
  * @param {string} hdr input header
+ * @param {boolean} is_structured optional (default false); if true, encode as a structured header (i.e. any non-alphanumeric character also forces encoding)
  * @return encoded header
  */
 LUA_FUNCTION_DEF(util, mime_header_encode);
@@ -2406,15 +2407,19 @@ static int
 lua_util_mime_header_encode(lua_State *L)
 {
 	LUA_TRACE_POINT;
-	gsize len;
-	const char *hdr = luaL_checklstring(L, 1, &len);
+	struct rspamd_lua_text *hdr = lua_check_text_or_string(L, 1);
 	char *encoded;
+	bool is_structured = false;
 
 	if (!hdr) {
 		return luaL_error(L, "invalid arguments");
 	}
 
-	encoded = rspamd_mime_header_encode(hdr, len);
+	if (lua_isboolean(L, 2)) {
+		is_structured = lua_toboolean(L, 2);
+	}
+
+	encoded = rspamd_mime_header_encode(hdr->start, hdr->len, is_structured);
 	lua_pushstring(L, encoded);
 	g_free(encoded);
 
diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx
index b7cb0c6bf..ff323fb85 100644
--- a/test/rspamd_cxx_unit.cxx
+++ b/test/rspamd_cxx_unit.cxx
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2024 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -25,6 +25,7 @@
 #include "rspamd_cxx_local_ptr.hxx"
 #include "rspamd_cxx_unit_dkim.hxx"
 #include "rspamd_cxx_unit_cryptobox.hxx"
+#include "rspamd_cxx_unit_rfc2047.hxx"
 
 static gboolean verbose = false;
 static const GOptionEntry entries[] =
diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx
new file mode 100644
index 000000000..ebb11cdc1
--- /dev/null
+++ b/test/rspamd_cxx_unit_rfc2047.hxx
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX
+#define RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+#include <string>
+#include <vector>
+#include "libutil/mem_pool.h"
+#include "libmime/mime_headers.h"
+
+TEST_SUITE("rfc2047 encode")
+{
+	TEST_CASE("rspamd_mime_header_encode handles ASCII-only input")
+	{ // NOTE(review): despite the test name, most vectors below contain non-ASCII text
+		rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); // passed to rspamd_mime_header_decode for the round-trip check
+		std::vector<std::pair<std::string, std::string>> cases = { // {input, expected encoder output}
+			{"PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]",
+			 "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]"},
+			{"Hello World", "Hello World"},
+			{"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="},
+			{"ололо (ололо test)    test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test)    test"},
+			{"Привет    мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?="
+										"=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"},
+			{"", ""},
+			{"こんにちは(世界)", "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?="
+								 "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"},
+			{"(Hello)", "(Hello)"},
+			{"Hello)", "Hello)"},
+			{"你好世界", "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="},
+			{"これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。",
+			 "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?="
+			 "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?="
+			 "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?="
+			 "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?="
+			 "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?="
+			 "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="},
+			{"ASCII_Text "
+			 "これは非常に長い非ASCIIテキストで、エンコードが必要になります。",
+			 "ASCII_Text "
+			 "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?="
+			 "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9?="
+			 "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?="
+			 "=?UTF-8?Q?=E3=83=89=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A?="
+			 "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="},
+			{"非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。"
+			 "データが長すぎる場合、正しく分割されるべきです。",
+			 "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?="
+			 "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?="
+			 "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?="
+			 "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?="
+			 "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?="
+			 "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?="
+			 "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?="
+			 "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?="
+			 "=?UTF-8?Q?=81=99=E3=80=82?="},
+
+		};
+
+		for (const auto &c: cases) {
+			SUBCASE(c.first.c_str()) // one subcase per vector, named after its input
+			{
+				gboolean invalid_utf = FALSE;
+				const char *input = c.first.c_str();
+				char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); // unstructured mode
+				std::string output(output_cstr);
+				std::string expected_output = c.second;
+				CHECK(output == expected_output); // exact match against the expected encoding
+				char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); // round trip through the decoder
+				std::string decoded(decoded_cstr);
+				CHECK(invalid_utf == FALSE);
+				CHECK(decoded == c.first); // decoding must give back the original input
+				g_free(output_cstr); // the encoder returns a newly allocated string
+			}
+		}
+
+		rspamd_mempool_delete(pool);
+	}
+
+	TEST_CASE("rspamd_mime_header_encode handles long ASCII input without encoding")
+	{
+		// Input made only of ASCII characters and longer than the 64-char budget of a single encoded word
+		std::string input_str(100, 'A');// 100 'A's
+		const char *input = input_str.c_str();
+		char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); // unstructured mode
+		std::string output(output_cstr);
+		std::string expected_output = input_str;
+
+		CHECK(output == expected_output); // pure ASCII must pass through unchanged
+		g_free(output_cstr);
+	}
+}
+#endif