#include "contrib/libucl/khash.h"
#include "libmime/images.h"
#include "css/css.h"
+#include "libutil/cxx/utf8_util.h"
#include <unicode/uversion.h>
#include <unicode/ucnv.h>
dlen = dest->len - href_offset;
/* Strip unicode spaces from the start and the end */
- gchar *p = url->visible_part, *end = url->visible_part + dlen;
- gint i = 0;
-
- while (i < dlen) {
- UChar32 uc;
- gint prev_i = i;
-
- U8_NEXT(p, i, dlen, uc);
-
- if (!u_isspace (uc)) {
- i = prev_i;
- break;
- }
- }
-
- p += i;
- dlen -= i;
- url->visible_part = p;
- i = end - url->visible_part - 1;
-
- if (i > 0) {
- gint32 dl = dlen;
-
- while (i > 0) {
- UChar32 uc;
-
- U8_PREV(p, i, dl, uc);
-
- if (!u_isspace (uc)) {
- break;
- }
- }
-
- dlen = i;
- }
-
-
+ url->visible_part = rspamd_string_unicode_trim_inplace (url->visible_part,
+ &dlen);
rspamd_html_url_is_phished (pool, url,
url->visible_part,
dlen,
${CMAKE_CURRENT_SOURCE_DIR}/upstream.c
${CMAKE_CURRENT_SOURCE_DIR}/util.c
${CMAKE_CURRENT_SOURCE_DIR}/heap.c
- ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c)
+ ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/cxx/utf8_util.cxx)
# Rspamdutil
SET(RSPAMD_UTIL ${LIBRSPAMDUTILSRC} PARENT_SCOPE)
\ No newline at end of file
--- /dev/null
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <utility>
+#include <string>
+
+#include "utf8_util.h"
+#include "str_util.h"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+/**
+ * Trims Unicode whitespace and zero-width spaces from both ends of a UTF-8
+ * buffer. Nothing is copied and the buffer itself is not written to: the
+ * result is described by the returned pointer and the updated length.
+ *
+ * A code point is trimmed when `u_isUWhiteSpace` or `IS_ZERO_WIDTH_SPACE`
+ * accepts it. Scanning from either end stops at the first code point that is
+ * not accepted (per the ICU documentation, ill-formed input is decoded as a
+ * negative value by U8_NEXT/U8_PREV).
+ *
+ * @param str start of the buffer (does not need to be NUL terminated)
+ * @param len in: number of bytes in `str`; out: number of bytes left after trimming
+ * @return pointer inside `str` to the first byte that was not trimmed
+ */
+char *
+rspamd_string_unicode_trim_inplace (char *str, size_t *len)
+{
+	const auto is_trimmable = [](UChar32 uc) -> bool {
+		return u_isUWhiteSpace(uc) || IS_ZERO_WIDTH_SPACE(uc);
+	};
+	/*
+	 * ICU's U8_* macros work with int32_t offsets, so never hand them a span
+	 * that does not fit: a narrowed size_t could wrap to a small or negative
+	 * offset and produce a bogus length.
+	 */
+	const auto clamp_span = [](size_t n) -> int32_t {
+		return n > static_cast<size_t>(INT32_MAX) ? INT32_MAX : static_cast<int32_t>(n);
+	};
+
+	auto *begin = str;
+	auto *const end = str + *len;
+
+	/* Skip trimmable code points at the start */
+	while (begin < end) {
+		const auto span = clamp_span(static_cast<size_t>(end - begin));
+		int32_t off = 0;
+		UChar32 uc;
+
+		U8_NEXT(begin, off, span, uc);
+
+		if (!is_trimmable(uc)) {
+			break;
+		}
+
+		begin += off;
+	}
+
+	/* Walk backwards from the end, never crossing the new start */
+	auto *tail = end;
+
+	while (tail > begin) {
+		const auto span = clamp_span(static_cast<size_t>(tail - begin));
+		/*
+		 * Keep the window start in a named variable: macro arguments are
+		 * expanded textually, so passing an expression here is fragile.
+		 */
+		auto *const base = tail - span;
+		int32_t off = span;
+		UChar32 uc;
+
+		U8_PREV(base, 0, off, uc);
+
+		if (!is_trimmable(uc)) {
+			break;
+		}
+
+		tail = base + off;
+	}
+
+	*len = static_cast<size_t>(tail - begin);
+
+	return begin;
+}
+
+TEST_SUITE("utf8 utils") {
+	/*
+	 * Checks rspamd_string_unicode_trim_inplace against a table of
+	 * {input, expected result after trimming both ends} pairs.
+	 */
+	TEST_CASE("utf8 trim") {
+		std::pair<const char *, const char *> cases[] = {
+				{" \u200B""abc ", "abc"},
+				{" ", ""},
+				{" a", "a"},
+				{"a ", "a"},
+				{"a a", "a a"},
+				{"abc", "abc"},
+				{"a ", "a"},
+				{" abc ", "abc"},
+				{" abc ", "abc"},
+				{" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+				{" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+				{" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"},
+				/* Empty input must stay empty */
+				{"", ""},
+				/* Input that consists only of a zero-width space */
+				{"\xE2\x80\x8B", ""},
+				/* ASCII control whitespace */
+				{"\t\r\n abc\n", "abc"},
+				/* Non-ASCII whitespace: U+00A0 in front, U+3000 at the end */
+				{"\xC2\xA0""abc\xE3\x80\x80", "abc"},
+				/* An ill-formed lead byte is not whitespace, so it stops the scan */
+				{"\xFF abc ", "\xFF abc"},
+		};
+
+		for (const auto &[input, expected] : cases) {
+			/* Work on a private copy: the API takes a mutable buffer */
+			std::string buf{input};
+			auto remaining = buf.size();
+			auto *start = rspamd_string_unicode_trim_inplace(buf.data(), &remaining);
+			CHECK(std::string(start, remaining) == std::string{expected});
+		}
+	}
+}
+
+
--- /dev/null
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef RSPAMD_UTF8_UTIL_H
+#define RSPAMD_UTF8_UTIL_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Trims unicode whitespace and zero-width spaces from the start and the end
+ * of a string. Spaces inside the string are kept. The input buffer is not
+ * copied: the result is the returned pointer together with the updated `*len`.
+ * @param str start of the string
+ * @param len in: length of `str` in bytes; out: length of the trimmed string in bytes
+ * @return pointer within `str` to the start of the trimmed string
+ */
+char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //RSPAMD_UTF8_UTIL_H