From b39a9f52ed3f33082f13f51678d053ee80a2e1f4 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Fri, 29 Nov 2024 11:31:35 +0000
Subject: [Rework] Replace fastutf with simdutf

Simdutf is faster and has way better support of the architectures
(especially when it comes to non-x86 stuff). Hence, it is a good idea to
use it instead of the non-supported fastutf8 stuff.
---
 src/libutil/CMakeLists.txt         |  1 +
 src/libutil/cxx/rspamd-simdutf.cxx | 52 ++++++++++++++++++++++++++++++++++++++
 src/libutil/fstring.c              |  2 +-
 src/libutil/regexp.c               |  2 +-
 src/libutil/rspamd_simdutf.h       | 46 +++++++++++++++++++++++++++++++++
 src/libutil/str_util.c             |  2 +-
 6 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 src/libutil/cxx/rspamd-simdutf.cxx
 create mode 100644 src/libutil/rspamd_simdutf.h

(limited to 'src/libutil')

Review notes (text in this area is not part of the commit):
 * Line breaks were lost in this copy of the patch and have been restored.
   Indentation and blank context lines are reconstructed to match the hunk
   sizes; the exact whitespace of context lines should be checked against
   the tree.
 * The header name on the first context line of the str_util.c hunk was
   missing in this copy. It is assumed to be <immintrin.h> - TODO confirm.
 * Code changes relative to the original commit: both functions are
   documented, `flags` is explicitly marked as unused, the C-style cast and
   the implicit size_t -> off_t conversion are replaced with named casts,
   and rspamd-simdutf.cxx now ends with a newline. Hunk sizes and the
   diffstat are updated accordingly; the blob ids on the `index` lines of
   the two new files are unchanged and therefore stale.

diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index 67b7e948f..acf082708 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -18,6 +18,7 @@ SET(LIBRSPAMDUTILSRC
         ${CMAKE_CURRENT_SOURCE_DIR}/heap.c
         ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c
         ${CMAKE_CURRENT_SOURCE_DIR}/cxx/utf8_util.cxx
+        ${CMAKE_CURRENT_SOURCE_DIR}/cxx/rspamd-simdutf.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/cxx/util_tests.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/cxx/file_util.cxx)
 # Rspamdutil
diff --git a/src/libutil/cxx/rspamd-simdutf.cxx b/src/libutil/cxx/rspamd-simdutf.cxx
new file mode 100644
index 000000000..67b585812
--- /dev/null
+++ b/src/libutil/cxx/rspamd-simdutf.cxx
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * A simple interface for simdutf library to allow old functions to work properly
+ */
+
+#include "config.h"
+#include "simdutf.h"
+
+extern "C" {
+
+/*
+ * Part of the old fastutf8 interface. simdutf requires no initialisation,
+ * so `flags` is ignored.
+ */
+void rspamd_fast_utf8_library_init(unsigned flags)
+{
+	(void) flags;
+}
+
+/*
+ * Returns 0 if `data` (of `len` bytes) is valid UTF-8, otherwise the position
+ * of the first error plus one.
+ */
+off_t rspamd_fast_utf8_validate(const unsigned char *data, size_t len)
+{
+	const auto *chars = reinterpret_cast<const char *>(data);
+	const auto res = simdutf::validate_utf8_with_errors(chars, len);
+
+	if (res.error == simdutf::error_code::SUCCESS) {
+		return 0;
+	}
+
+	// On error `count` holds the position of the error; 0 already means
+	// "valid", so the offset for the first invalid character is shifted by one
+	return static_cast<off_t>(res.count + 1);
+}
+}
diff --git a/src/libutil/fstring.c b/src/libutil/fstring.c
index 082620c27..8da6b0068 100644
--- a/src/libutil/fstring.c
+++ b/src/libutil/fstring.c
@@ -15,7 +15,7 @@
  */
 #include "fstring.h"
 #include "str_util.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
 #include "contrib/mumhash/mum.h"
 
 
diff --git a/src/libutil/regexp.c b/src/libutil/regexp.c
index 9e98699fe..0646285ae 100644
--- a/src/libutil/regexp.c
+++ b/src/libutil/regexp.c
@@ -19,7 +19,7 @@
 #include "ref.h"
 #include "util.h"
 #include "rspamd.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
 
 #ifndef WITH_PCRE2
 /* Normal pcre path */
diff --git a/src/libutil/rspamd_simdutf.h b/src/libutil/rspamd_simdutf.h
new file mode 100644
index 000000000..c1fa07892
--- /dev/null
+++ b/src/libutil/rspamd_simdutf.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_SIMDUTF_H
+#define RSPAMD_RSPAMD_SIMDUTF_H
+
+#pragma once
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Retained from the fastutf8 interface: the simdutf backend needs no
+ * initialisation, so this call does nothing
+ * @param flags ignored
+ */
+void rspamd_fast_utf8_library_init(unsigned flags);
+
+/**
+ * Checks that a buffer contains valid UTF-8
+ * @param data bytes to validate
+ * @param len number of bytes in `data`
+ * @return 0 if the input is valid, otherwise the position of the first error plus one
+ */
+off_t rspamd_fast_utf8_validate(const unsigned char *data, size_t len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//RSPAMD_RSPAMD_SIMDUTF_H
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index f8fff0dca..b3e47b7d4 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -31,7 +31,7 @@
 #include <immintrin.h>
 #endif
 
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
 
 const unsigned char lc_map[256] = {
 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-- 
cgit v1.2.3