aboutsummaryrefslogtreecommitdiffstats
path: root/src/libutil/cxx
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-05-14 20:58:28 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-05-14 20:58:28 +0100
commit a59e81ca90c986725107c8c013ccf33a91b07d45 (patch)
tree d936608b6caaa4f0263aecc401a1915851d414fe /src/libutil/cxx
parent a2eb042dcd36228b9e0a6d1417c54032489d91ff (diff)
download rspamd-a59e81ca90c986725107c8c013ccf33a91b07d45.tar.gz
rspamd-a59e81ca90c986725107c8c013ccf33a91b07d45.zip
[Rework] Use C++ utf8 library with unit tests to trim whitespaces
Diffstat (limited to 'src/libutil/cxx')
-rw-r--r-- src/libutil/cxx/utf8_util.cxx 100
-rw-r--r-- src/libutil/cxx/utf8_util.h 41
2 files changed, 141 insertions, 0 deletions
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
new file mode 100644
index 000000000..f44d02671
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -0,0 +1,100 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <utility>
+#include <string>
+
+#include "utf8_util.h"
+#include "str_util.h"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+/* Trims leading/trailing code points that are Unicode whitespace or match IS_ZERO_WIDTH_SPACE; *len gets the new byte length */
+char *
+rspamd_string_unicode_trim_inplace (char *str, size_t *len)
+{
+	char *const tail_limit = str + *len; /* original end; *len is rewritten below */
+	int head = 0;
+
+	/* Forward scan, one code point at a time, over the leading trimmable run */
+	while (head < *len) {
+		UChar32 cp;
+		const int cp_start = head;
+		U8_NEXT(str, head, *len, cp);
+
+		if (!(u_isUWhiteSpace(cp) || IS_ZERO_WIDTH_SPACE(cp))) {
+			head = cp_start; /* first code point to keep: rewind to its first byte */
+			break;
+		}
+	}
+
+	char *const trimmed = str + head;
+	(*len) -= head;
+	int tail = tail_limit - trimmed;
+
+	if (tail <= 0) {
+		return trimmed; /* nothing left to scan; *len was already adjusted above */
+	}
+
+	/* Backward scan from the end over the trailing trimmable run */
+	while (tail > 0) {
+		UChar32 cp;
+		const int cp_end = tail;
+		U8_PREV(trimmed, 0, tail, cp);
+
+		if (!(u_isUWhiteSpace(cp) || IS_ZERO_WIDTH_SPACE(cp))) {
+			tail = cp_end; /* last code point to keep: restore the offset just past it */
+			break;
+		}
+	}
+
+	*len = tail;
+	return trimmed;
+}
+
+TEST_SUITE("utf8 utils") {
+	TEST_CASE("utf8 trim") {
+		const std::pair<const char *, const char *> trim_cases[] = {
+			{" \u200B""abc ", "abc"},
+			{" ", ""},
+			{" a", "a"},
+			{"a ", "a"},
+			{"a a", "a a"},
+			{"abc", "abc"},
+			{"a ", "a"},
+			{" abc ", "abc"},
+			{" abc ", "abc"},
+			{" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+			{" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+			{" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"},
+		};
+		/* Trim a private copy of every input (first) and compare with the expectation (second) */
+		for (const auto &tc : trim_cases) {
+			std::string buf{tc.first};
+			auto remaining = buf.size();
+			const auto *kept = rspamd_string_unicode_trim_inplace(buf.data(), &remaining);
+			const std::string trimmed{kept, remaining};
+			CHECK(trimmed == std::string{tc.second});
+		}
+	}
+}
+
+
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
new file mode 100644
index 000000000..40bb53bf0
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef RSPAMD_UTF8_UTIL_H
+#define RSPAMD_UTF8_UTIL_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Trims leading and trailing unicode whitespace (and zero-width space code points) from a string; the buffer is not rewritten
+ * @param str start of the string
+ * @param len in: length of the string in bytes; out: length in bytes of the trimmed part
+ * @return pointer inside `str` to the start of the trimmed text
+ */
+char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //RSPAMD_UTF8_UTIL_H