aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rspamd.com> 2024-11-19 02:39:23 +0600
committer GitHub <noreply@github.com> 2024-11-19 02:39:23 +0600
commit 41eab5b874721f7abc23144e5c3386392f1820f7 (patch)
tree 7dff5fcc31a0e97b8a612348889dcbf2e4a40724
parent c06830279ff9f98d3d5ee2888da7b2fa33a8c339 (diff)
parent 9bd616ca7aca3ff3fcd3a051e110f2d6e9abc76b (diff)
download rspamd-41eab5b874721f7abc23144e5c3386392f1820f7.tar.gz
rspamd-41eab5b874721f7abc23144e5c3386392f1820f7.zip
Merge pull request #5223 from rspamd/vstakhov-fix-2047-encode
Fix RFC 2047 encoding
-rw-r--r-- src/libmime/mime_headers.c 137
-rw-r--r-- src/libmime/mime_headers.h 5
-rw-r--r-- src/libserver/protocol.c 2
-rw-r--r-- src/libutil/fstring.c 29
-rw-r--r-- src/lua/lua_util.c 13
-rw-r--r-- test/rspamd_cxx_unit.cxx 7
-rw-r--r-- test/rspamd_cxx_unit_rfc2047.hxx 109
7 files changed, 213 insertions, 89 deletions
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c
index e4bf4db06..63419d6a3 100644
--- a/src/libmime/mime_headers.c
+++ b/src/libmime/mime_headers.c
@@ -754,7 +754,7 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
state = parse_normal;
}
} /* qmarks >= 3 */
- } /* p == '=' */
+ } /* p == '=' */
else {
state = got_encoded_start;
}
@@ -816,88 +816,93 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
}
char *
-rspamd_mime_header_encode(const char *in, gsize len)
+rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) /* Q-encode (RFC 2047, UTF-8) the parts of `in` that need it; returns a newly allocated string */
{
- const char *p = in, *end = in + len;
- char *out, encode_buf[80 * sizeof(uint32_t)];
- GString *res;
- gboolean need_encoding = FALSE;
+ static const size_t max_token_size = 76 - 12; /* payload budget per encoded-word: 76 minus the 12 bytes of the "=?UTF-8?Q??=" wrapper */
+ GString *outbuf = g_string_sized_new(len);
+ char *encode_buf = g_alloca(max_token_size + 3); /* stack scratch buffer for one encoded payload */
+ const char *p = in;
+ const char *end = in + len;
- /* Check if we need to encode */
while (p < end) {
- if ((((unsigned char) *p) & 0x80) != 0) {
- need_encoding = TRUE;
- break;
+ if (*p == ' ' || *p == '\r' || *p == '\n' || *p == '(' || *p == ')') {
+ /* Append the separator as is */
+ g_string_append_c(outbuf, *p);
+ p++;
}
- p++;
- }
+ else {
+ const char *q = end; /* end of the current piece; starts at end of input and is pulled back when the scan below cuts the piece */
+ size_t piece_len = q - p, encoded_len = 0;
+
+ /* Scan the piece byte by byte: find bytes that must be encoded and estimate the encoded length */
+ gboolean need_encoding = FALSE;
+ size_t unencoded_prefix = 0, unencoded_suffix = 0;
+ for (size_t i = 0; i < piece_len; i++) {
+ unsigned char c = p[i];
+ if (c >= 128 || (is_structured && !g_ascii_isalnum(c))) { /* 8-bit byte, or any non-alphanumeric byte in structured mode */
+ need_encoding = TRUE;
+ unencoded_suffix = 0; /* this byte goes inside the encoded-word, so the trailing literal run restarts */
+ encoded_len += 3; /* budgeted as three output bytes ("=XX") */
+
+ if (encoded_len > max_token_size) {
+ piece_len = i;
+ q = p + piece_len;
+ /* No more space: cut the piece before this byte (a byte index, so a multi-byte UTF-8 sequence may be divided between two encoded-words) */
+ break;
+ }
+ }
+ else {
+ encoded_len++; /* NOTE(review): budgets one output byte, but the rfc2047 unit test expects '.' inside an encoded-word to become "=2E"; the estimate may undercount - confirm the encode buffer cannot overflow */
- if (!need_encoding) {
- out = g_malloc(len + 1);
- rspamd_strlcpy(out, in, len + 1);
- }
- else {
- /* Need encode */
- gsize ulen, pos;
- int r;
- const char *prev;
- /* Choose step: =?UTF-8?Q?<qp>?= should be less than 76 chars */
- unsigned int step = (76 - 12) / 3 + 1;
-
- ulen = g_utf8_strlen(in, len);
- res = g_string_sized_new(len * 2 + 1);
- pos = 0;
- prev = in;
- /* Adjust chunk size for unicode average length */
- step *= 1.0 * ulen / (double) len;
-
- while (pos < ulen) {
- p = g_utf8_offset_to_pointer(in, pos);
-
- if (p > prev) {
- /* Encode and print */
- r = rspamd_encode_qp2047_buf(prev, p - prev,
- encode_buf, sizeof(encode_buf));
-
- if (r != -1) {
- if (res->len > 0) {
- rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r,
- encode_buf);
+ if (encoded_len > max_token_size) {
+ piece_len = i;
+ q = p + piece_len;
+ /* No more space */
+ break;
+ }
+
+ if (need_encoding && (c == '(' || c == ')')) {
+ /* Once an encoded-word has started, a parenthesis (comment delimiter) ends the piece */
+ piece_len = i;
+ q = p + piece_len;
+ /* Cut the piece before the parenthesis; it is appended as a separator on the next outer iteration */
+ break;
+ }
+
+ if (!need_encoding) {
+ unencoded_prefix++;
}
else {
- rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r,
- encode_buf);
+ unencoded_suffix++;
}
}
}
- pos += MIN(step, ulen - pos);
- prev = p;
- }
-
- /* Leftover */
- if (prev < end) {
- r = rspamd_encode_qp2047_buf(prev, end - prev,
- encode_buf, sizeof(encode_buf));
-
- if (r != -1) {
- if (res->len > 0) {
- rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r,
- encode_buf);
- }
- else {
- rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r,
- encode_buf);
- }
+ if (need_encoding) {
+ g_string_append_len(outbuf, p, unencoded_prefix); /* leading bytes that need no encoding are copied literally */
+ p += unencoded_prefix;
+ g_string_append(outbuf, "=?UTF-8?Q?");
+ /* Encode only the middle part, without the literal prefix and suffix */
+ encoded_len = rspamd_encode_qp2047_buf(p, piece_len - unencoded_prefix - unencoded_suffix,
+ encode_buf, max_token_size + 3); /* NOTE(review): the result is used as a length without the r != -1 check the removed code had - confirm failure cannot occur here */
+ p += piece_len - unencoded_prefix - unencoded_suffix; /* p now points at the literal suffix */
+ g_string_append_len(outbuf, encode_buf, encoded_len);
+ g_string_append(outbuf, "?=");
+ g_string_append_len(outbuf, p, unencoded_suffix);
+ }
+ else {
+ /* No transformation */
+ g_string_append_len(outbuf, p, piece_len);
}
+ p = q; /* continue right after this piece */
}
-
- out = g_string_free(res, FALSE);
}
- return out;
+ /* return the allocated string and free the GString struct */
+ return g_string_free(outbuf, FALSE);
}
+
char *
rspamd_mime_message_id_generate(const char *fqdn)
{
diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h
index 9f89daece..290f94799 100644
--- a/src/libmime/mime_headers.h
+++ b/src/libmime/mime_headers.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2024 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -100,9 +100,10 @@ char *rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in,
* Encode mime header if needed
* @param in
* @param len
+ * @param is_structured if true, encode as a structured header (i.e. encode every non-alphanumeric character)
* @return newly allocated encoded header
*/
-char *rspamd_mime_header_encode(const char *in, gsize len);
+char *rspamd_mime_header_encode(const char *in, gsize len, bool is_structured);
/**
* Generate new unique message id
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 2dc641dfe..1196d2d14 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -1046,7 +1046,7 @@ rspamd_protocol_rewrite_subject(struct rspamd_task *task)
g_string_append_len(subj_buf, c, p - c);
}
- res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len);
+ res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len, false);
rspamd_mempool_add_destructor(task->task_pool,
(rspamd_mempool_destruct_t) g_free,
diff --git a/src/libutil/fstring.c b/src/libutil/fstring.c
index ffe130477..082620c27 100644
--- a/src/libutil/fstring.c
+++ b/src/libutil/fstring.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2024 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -242,10 +242,8 @@ fstrhash_c(uint64_t c, uint64_t hval)
uint32_t
rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf)
{
- gsize i;
uint64_t hval;
- const char *p, *end = NULL;
- gunichar uc;
+ const char *p;
if (str == NULL) {
return 0;
@@ -253,21 +251,26 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf)
p = str->begin;
hval = str->len;
- end = p + str->len;
if (is_utf) {
if (rspamd_fast_utf8_validate(p, str->len) != 0) {
return rspamd_fstrhash_lc(str, FALSE);
}
- while (p < end) {
- uc = g_unichar_tolower(g_utf8_get_char(p));
- hval = fstrhash_c(uc, hval);
- p = g_utf8_next_char(p);
+
+ size_t i = 0, len = str->len;
+ UChar32 uc;
+
+ while (i < len) {
+ U8_NEXT(p, i, len, uc);
+
+ if (uc > 0) {
+ hval = fstrhash_c(u_tolower(uc), hval);
+ }
}
}
else {
gsize large_steps = str->len / sizeof(uint64_t);
- for (i = 0; i < large_steps; i++, p += sizeof(uint64_t)) {
+ for (size_t i = 0; i < large_steps; i++, p += sizeof(uint64_t)) {
/* Copy to the uint64 lowercasing each byte */
union {
char c[sizeof(uint64_t)];
@@ -280,7 +283,7 @@ rspamd_fstrhash_lc(const rspamd_ftok_t *str, gboolean is_utf)
}
gsize remain = str->len % sizeof(uint64_t);
- for (i = 0; i < remain; i++, p++) {
+ for (size_t i = 0; i < remain; i++, p++) {
hval = fstrhash_c(g_ascii_tolower(*p), hval);
}
}
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 251d1e1e7..e92e4977a 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -644,9 +644,10 @@ LUA_FUNCTION_DEF(util, get_hostname);
LUA_FUNCTION_DEF(util, parse_content_type);
/***
- * @function util.mime_header_encode(hdr)
+ * @function util.mime_header_encode(hdr[, is_structured])
* Encodes header if needed
* @param {string} hdr input header
+ * @param {boolean} is_structured if true, encode as a structured header (i.e. encode every non-alphanumeric character)
* @return encoded header
*/
LUA_FUNCTION_DEF(util, mime_header_encode);
@@ -2406,15 +2407,19 @@ static int
lua_util_mime_header_encode(lua_State *L)
{
LUA_TRACE_POINT;
- gsize len;
- const char *hdr = luaL_checklstring(L, 1, &len);
+ struct rspamd_lua_text *hdr = lua_check_text_or_string(L, 1);
char *encoded;
+ bool is_structured = false;
if (!hdr) {
return luaL_error(L, "invalid arguments");
}
- encoded = rspamd_mime_header_encode(hdr, len);
+ if (lua_isboolean(L, 2)) {
+ is_structured = lua_toboolean(L, 2);
+ }
+
+ encoded = rspamd_mime_header_encode(hdr->start, hdr->len, is_structured);
lua_pushstring(L, encoded);
g_free(encoded);
diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx
index b7cb0c6bf..ff323fb85 100644
--- a/test/rspamd_cxx_unit.cxx
+++ b/test/rspamd_cxx_unit.cxx
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2024 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -25,6 +25,7 @@
#include "rspamd_cxx_local_ptr.hxx"
#include "rspamd_cxx_unit_dkim.hxx"
#include "rspamd_cxx_unit_cryptobox.hxx"
+#include "rspamd_cxx_unit_rfc2047.hxx"
static gboolean verbose = false;
static const GOptionEntry entries[] =
diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx
new file mode 100644
index 000000000..ebb11cdc1
--- /dev/null
+++ b/test/rspamd_cxx_unit_rfc2047.hxx
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX
+#define RSPAMD_RSPAMD_CXX_UNIT_RFC2047_HXX
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+#include <string>
+#include <vector>
+#include "libutil/mem_pool.h"
+#include "libmime/mime_headers.h"
+
+TEST_SUITE("rfc2047 encode")
+{
+ TEST_CASE("rspamd_mime_header_encode handles ASCII-only input") // NOTE(review): despite the name, most cases below contain non-ASCII text
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); // pool passed to the decoder for the round-trip check
+ std::vector<std::pair<std::string, std::string>> cases = { // {input, expected encoded output}
+ {"PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]",
+ "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]"},
+ {"Hello World", "Hello World"},
+ {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="},
+ {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"},
+ {"Привет мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?="
+ "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"},
+ {"", ""},
+ {"こんにちは(世界)", "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?="
+ "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"},
+ {"(Hello)", "(Hello)"},
+ {"Hello)", "Hello)"},
+ {"你好世界", "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="},
+ {"これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。",
+ "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?="
+ "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?="
+ "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?="
+ "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?="
+ "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?="
+ "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="},
+ {"ASCII_Text "
+ "これは非常に長い非ASCIIテキストで、エンコードが必要になります。",
+ "ASCII_Text "
+ "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?="
+ "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9?="
+ "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?="
+ "=?UTF-8?Q?=E3=83=89=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A?="
+ "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="},
+ {"非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。"
+ "データが長すぎる場合、正しく分割されるべきです。",
+ "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?="
+ "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?="
+ "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?="
+ "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?="
+ "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?="
+ "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?="
+ "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?="
+ "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?="
+ "=?UTF-8?Q?=81=99=E3=80=82?="},
+
+ };
+
+ for (const auto &c: cases) {
+ SUBCASE(c.first.c_str()) // each pair runs as its own subcase, named after the input
+ {
+ gboolean invalid_utf = FALSE;
+ const char *input = c.first.c_str();
+ char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); // unstructured mode; NOTE(review): strlen is used but <cstring> is not included in this header - presumably provided transitively
+ std::string output(output_cstr);
+ std::string expected_output = c.second;
+ CHECK(output == expected_output); // encoder output must match byte for byte
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); // round trip through the decoder
+ std::string decoded(decoded_cstr);
+ CHECK(invalid_utf == FALSE);
+ CHECK(decoded == c.first); // decoding must give back the original input
+ g_free(output_cstr); // decoded_cstr is not freed here - presumably owned by the pool; verify against the decoder
+ }
+ }
+
+ rspamd_mempool_delete(pool);
+ }
+
+ TEST_CASE("rspamd_mime_header_encode handles long ASCII input without encoding")
+ {
+ // Input of 100 identical ASCII letters: longer than one 76-character line, yet expected back unchanged
+ std::string input_str(100, 'A');// 100 'A's
+ const char *input = input_str.c_str();
+ char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
+ std::string output(output_cstr);
+ std::string expected_output = input_str;
+
+ CHECK(output == expected_output);
+ g_free(output_cstr);
+ }
+}
+#endif