From a59e81ca90c986725107c8c013ccf33a91b07d45 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 14 May 2021 20:58:28 +0100 Subject: [PATCH] [Rework] Use C++ utf8 library with unit tests to trim whitespaces --- src/libserver/html.c | 40 +------------- src/libutil/CMakeLists.txt | 3 +- src/libutil/cxx/utf8_util.cxx | 100 ++++++++++++++++++++++++++++++++++ src/libutil/cxx/utf8_util.h | 41 ++++++++++++++ 4 files changed, 146 insertions(+), 38 deletions(-) create mode 100644 src/libutil/cxx/utf8_util.cxx create mode 100644 src/libutil/cxx/utf8_util.h diff --git a/src/libserver/html.c b/src/libserver/html.c index 30c2c022b..8d7b722a5 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -25,6 +25,7 @@ #include "contrib/libucl/khash.h" #include "libmime/images.h" #include "css/css.h" +#include "libutil/cxx/utf8_util.h" #include #include @@ -2619,43 +2620,8 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool, dlen = dest->len - href_offset; /* Strip unicode spaces from the start and the end */ - gchar *p = url->visible_part, *end = url->visible_part + dlen; - gint i = 0; - - while (i < dlen) { - UChar32 uc; - gint prev_i = i; - - U8_NEXT(p, i, dlen, uc); - - if (!u_isspace (uc)) { - i = prev_i; - break; - } - } - - p += i; - dlen -= i; - url->visible_part = p; - i = end - url->visible_part - 1; - - if (i > 0) { - gint32 dl = dlen; - - while (i > 0) { - UChar32 uc; - - U8_PREV(p, i, dl, uc); - - if (!u_isspace (uc)) { - break; - } - } - - dlen = i; - } - - + url->visible_part = rspamd_string_unicode_trim_inplace (url->visible_part, + &dlen); rspamd_html_url_is_phished (pool, url, url->visible_part, dlen, diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt index 64cc8ee1e..5160dfe7b 100644 --- a/src/libutil/CMakeLists.txt +++ b/src/libutil/CMakeLists.txt @@ -16,6 +16,7 @@ SET(LIBRSPAMDUTILSRC ${CMAKE_CURRENT_SOURCE_DIR}/upstream.c ${CMAKE_CURRENT_SOURCE_DIR}/util.c ${CMAKE_CURRENT_SOURCE_DIR}/heap.c - 
${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c) + ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c + ${CMAKE_CURRENT_SOURCE_DIR}/cxx/utf8_util.cxx) # Rspamdutil SET(RSPAMD_UTIL ${LIBRSPAMDUTILSRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx new file mode 100644 index 000000000..f44d02671 --- /dev/null +++ b/src/libutil/cxx/utf8_util.cxx @@ -0,0 +1,100 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define U_CHARSET_IS_UTF8 1 +#include +#include +#include +#include +#include + +#include "utf8_util.h" +#include "str_util.h" + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" +/* Trims leading and trailing code points for which u_isUWhiteSpace or IS_ZERO_WIDTH_SPACE is true. Nothing is written to str: the return value points at the first kept byte inside str and *len is updated to the trimmed length in bytes. */ +char * +rspamd_string_unicode_trim_inplace (char *str, size_t *len) +{ + auto *p = str, *end = str + *len; + auto i = 0; /* deduced as int while *len is size_t, so the comparison below mixes signed and unsigned; NOTE(review): confirm callers never pass lengths above INT_MAX */ + + while (i < *len) { + UChar32 uc; + auto prev_i = i; + + U8_NEXT(p, i, *len, uc); + + if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) { + i = prev_i; /* rewind to the start of the first code point that must be kept */ + break; + } + } + + p += i; + (*len) -= i; + i = end - p; /* from here i is the byte length of what remains after the leading trim; it is walked backwards below */ + auto *ret = p; + + if (i > 0) { /* when nothing remains i is 0 and *len is already 0 */ + + while (i > 0) { + UChar32 uc; + auto prev_i = i; + + U8_PREV(p, 0, i, uc); + + if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) { + i = prev_i; /* keep this code point: restore the offset just past it */ + break; + } + } + + *len = i; + } + + return ret; +} +/* Table driven doctest: each pair is the input and the expected trimmed result; \xE2\x80\x8B is U+200B (ZERO WIDTH SPACE) encoded as UTF-8. */ +TEST_SUITE("utf8 utils") { + TEST_CASE("utf8 trim") { + std::pair cases[] = { + {" \u200B""abc ", "abc"}, + {" ", ""}, + {" a", "a"}, + {"a ", "a"}, + {"a a", "a a"}, + {"abc", "abc"}, + {"a ", "a"}, + {" abc ", "abc"}, + {" abc ", "abc"}, + {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"}, + {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"}, + {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"}, + }; + + for (const auto &c : cases) { + std::string cpy{c.first}; + auto ns = cpy.size(); + auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns); + std::string res{nstart, ns}; + CHECK(res == std::string{c.second}); + } + } +} + + diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h new file mode 100644 index 000000000..40bb53bf0 --- /dev/null +++ b/src/libutil/cxx/utf8_util.h @@ -0,0 +1,41 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_UTF8_UTIL_H +#define RSPAMD_UTF8_UTIL_H + +#include "config.h" +#include "mem_pool.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Trims unicode whitespace and zero width spaces from both ends of a string; the buffer itself is not modified + * @param str start of the string + * @param len in: length of str in bytes, out: length of the trimmed string in bytes + * @return pointer inside str where the trimmed string starts + */ +char* rspamd_string_unicode_trim_inplace (char *str, size_t *len); + +#ifdef __cplusplus +} +#endif + +#endif //RSPAMD_UTF8_UTIL_H -- 2.39.5