[Rework] Use C++ utf8 library with unit tests to trim whitespaces

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Fri, 14 May 2021 19:58:28 +0000 (20:58 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Fri, 14 May 2021 19:58:28 +0000 (20:58 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 14 May 2021 19:58:28 +0000 (20:58 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 14 May 2021 19:58:28 +0000 (20:58 +0100)
diff --git a/src/libserver/html.c b/src/libserver/html.c

index 30c2c022bf27cb13418deceadf3f0bef529acf42..8d7b722a5695bccc553bd9eb7d29c6fafbb42d3e 100644 (file)
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -25,6 +25,7 @@
  #include "contrib/libucl/khash.h"
  #include "libmime/images.h"
  #include "css/css.h"
+#include "libutil/cxx/utf8_util.h"
  
  #include <unicode/uversion.h>
  #include <unicode/ucnv.h>
@@ -2619,43 +2620,8 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
         dlen = dest->len - href_offset;
  
         /* Strip unicode spaces from the start and the end */
-       gchar *p = url->visible_part, *end = url->visible_part + dlen;
-       gint i = 0;
-
-       while (i < dlen) {
-               UChar32 uc;
-               gint prev_i = i;
-
-               U8_NEXT(p, i, dlen, uc);
-
-               if (!u_isspace (uc)) {
-                       i = prev_i;
-                       break;
-               }
-       }
-
-       p += i;
-       dlen -= i;
-       url->visible_part = p;
-       i = end - url->visible_part - 1;
-
-       if (i > 0) {
-               gint32 dl = dlen;
-
-               while (i > 0) {
-                       UChar32 uc;
-
-                       U8_PREV(p, i, dl, uc);
-
-                       if (!u_isspace (uc)) {
-                               break;
-                       }
-               }
-
-               dlen = i;
-       }
-
-
+       url->visible_part = rspamd_string_unicode_trim_inplace (url->visible_part,
+                       &dlen);
         rspamd_html_url_is_phished (pool, url,
                         url->visible_part,
                         dlen,
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt

index 64cc8ee1e9deb8be25c3f19f4521bd7eb876e9fa..5160dfe7b67d4dc88d918636208baef43d0fc75a 100644 (file)
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -16,6 +16,7 @@ SET(LIBRSPAMDUTILSRC
                                 ${CMAKE_CURRENT_SOURCE_DIR}/upstream.c
                                 ${CMAKE_CURRENT_SOURCE_DIR}/util.c
                                 ${CMAKE_CURRENT_SOURCE_DIR}/heap.c
-                               ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c)
+                               ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c
+                               ${CMAKE_CURRENT_SOURCE_DIR}/cxx/utf8_util.cxx)
  # Rspamdutil
  SET(RSPAMD_UTIL ${LIBRSPAMDUTILSRC} PARENT_SCOPE)
 \ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx

new file mode 100644 (file)

index 0000000..f44d026
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -0,0 +1,100 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <utility>
+#include <string>
+
+#include "utf8_util.h"
+#include "str_util.h"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+char *
+rspamd_string_unicode_trim_inplace (char *str, size_t *len)
+{
+       auto *p = str, *end = str + *len;
+       auto i = 0;
+
+       while (i < *len) {
+               UChar32 uc;
+               auto prev_i = i;
+
+               U8_NEXT(p, i, *len, uc);
+
+               if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
+                       i = prev_i;
+                       break;
+               }
+       }
+
+       p += i;
+       (*len) -= i;
+       i = end - p;
+       auto *ret = p;
+
+       if (i > 0) {
+
+               while (i > 0) {
+                       UChar32 uc;
+                       auto prev_i = i;
+
+                       U8_PREV(p, 0, i, uc);
+
+                       if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
+                               i = prev_i;
+                               break;
+                       }
+               }
+
+               *len = i;
+       }
+
+       return ret;
+}
+
+TEST_SUITE("utf8 utils") {
+       TEST_CASE("utf8 trim") {
+               std::pair<const char *, const char *> cases[] = {
+                               {" \u200B""abc ", "abc"},
+                               {"   ",  ""},
+                               {"   a", "a"},
+                               {"a   ", "a"},
+                               {"a a",  "a a"},
+                               {"abc",  "abc"},
+                               {"a ", "a"},
+                               {"   abc      ", "abc"},
+                               {" abc ", "abc"},
+                               {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+                               {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+                               {" \xE2\x80\x8B""abc \xE2\x80\x8B  ", "abc"},
+               };
+
+               for (const auto &c : cases) {
+                       std::string cpy{c.first};
+                       auto ns = cpy.size();
+                       auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
+                       std::string res{nstart, ns};
+                       CHECK(res == std::string{c.second});
+               }
+       }
+}
+
+
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h

new file mode 100644 (file)

index 0000000..40bb53b
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef RSPAMD_UTF8_UTIL_H
+#define RSPAMD_UTF8_UTIL_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/**
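+ * Trims unicode whitespace and zero-width spaces from both ends of a string
+ * @param str start of the string
+ * @param len in: length of the string; out: length of the trimmed string
+ * @return pointer to the start of the trimmed string (points inside str)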
+ */
+char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif //RSPAMD_UTF8_UTIL_H
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Fri, 14 May 2021 19:58:28 +0000 (20:58 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Fri, 14 May 2021 19:58:28 +0000 (20:58 +0100)
src/libserver/html.c		patch \| blob \| history
src/libutil/CMakeLists.txt		patch \| blob \| history
src/libutil/cxx/utf8_util.cxx	[new file with mode: 0644]	patch \| blob
src/libutil/cxx/utf8_util.h	[new file with mode: 0644]	patch \| blob