aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/fastutf8/fastutf8.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rspamd.com>2024-11-29 11:31:35 +0000
committerVsevolod Stakhov <vsevolod@rspamd.com>2024-11-29 11:31:35 +0000
commitb39a9f52ed3f33082f13f51678d053ee80a2e1f4 (patch)
tree2144a18d85681df09f83e255f2e5c6d04e61e878 /contrib/fastutf8/fastutf8.c
parent6c0223b32b8fcb6621fa64197214abb400a09f52 (diff)
downloadrspamd-b39a9f52ed3f33082f13f51678d053ee80a2e1f4.tar.gz
rspamd-b39a9f52ed3f33082f13f51678d053ee80a2e1f4.zip
[Rework] Replace fastutf with simdutf
Simdutf is faster and has way better support of the architectures (especially when it comes to non-x86 stuff). Hence, it is a good idea to use it instead of the non-supported fastutf8 stuff.
Diffstat (limited to 'contrib/fastutf8/fastutf8.c')
-rw-r--r--contrib/fastutf8/fastutf8.c160
1 files changed, 0 insertions, 160 deletions
diff --git a/contrib/fastutf8/fastutf8.c b/contrib/fastutf8/fastutf8.c
deleted file mode 100644
index 89becaf0a..000000000
--- a/contrib/fastutf8/fastutf8.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2019 Yibo Cai
- * Copyright (c) 2019 Vsevolod Stakhov
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "fastutf8.h"
-#include "libcryptobox/platform_config.h"
-
-
-/*
- * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
- *
- * Table 3-7. Well-Formed UTF-8 Byte Sequences
- *
- * +--------------------+------------+-------------+------------+-------------+
- * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+0000..U+007F | 00..7F | | | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+0080..U+07FF | C2..DF | 80..BF | | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
- * +--------------------+------------+-------------+------------+-------------+
- */
-
-/* Scalar UTF-8 validator: returns 0 when all `len` bytes at `data` are well-formed, otherwise the 1-based offset of the first byte of the first bad (or truncated) sequence */
-off_t
-rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len)
-{
- off_t err_pos = 1; /* 1-based offset of the sequence currently being checked */
-
- while (len) {
- int bytes; /* length of the well-formed sequence that starts at data[0] */
- const unsigned char byte1 = data[0];
-
- /* 00..7F */
- if (byte1 <= 0x7F) {
- bytes = 1;
- /* The following branch handles: C2..DF, 80..BF */
- }
- else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
- (signed char) data[1] <= (signed char) 0xBF) { /* viewed as signed char, 0x80..0xBF are the values <= (signed char) 0xBF (assumes two's-complement conversion) */
- bytes = 2;
- }
- else if (len >= 3) { /* 3- and 4-byte candidates; anything else that reaches here fails the lead-byte tests below */
- const unsigned char byte2 = data[1];
-
- /* Are byte2 and byte3 continuation bytes (0x80..0xBF)? */
- const int byte2_ok = (signed char) byte2 <= (signed char) 0xBF;
- const int byte3_ok = (signed char) data[2] <= (signed char) 0xBF;
-
- if (byte2_ok && byte3_ok &&
- /* E0, A0..BF, 80..BF */
- ((byte1 == 0xE0 && byte2 >= 0xA0) ||
- /* E1..EC, 80..BF, 80..BF */
- (byte1 >= 0xE1 && byte1 <= 0xEC) ||
- /* ED, 80..9F, 80..BF */
- (byte1 == 0xED && byte2 <= 0x9F) ||
- /* EE..EF, 80..BF, 80..BF */
- (byte1 >= 0xEE && byte1 <= 0xEF))) {
- bytes = 3;
- }
- else if (len >= 4) {
- /* Is byte4 a continuation byte (0x80..0xBF)? */
- const int byte4_ok = (signed char) data[3] <= (signed char) 0xBF;
-
- if (byte2_ok && byte3_ok && byte4_ok &&
- /* F0, 90..BF, 80..BF, 80..BF */
- ((byte1 == 0xF0 && byte2 >= 0x90) ||
- /* F1..F3, 80..BF, 80..BF, 80..BF */
- (byte1 >= 0xF1 && byte1 <= 0xF3) ||
- /* F4, 80..8F, 80..BF, 80..BF */
- (byte1 == 0xF4 && byte2 <= 0x8F))) {
- bytes = 4;
- }
- else {
- return err_pos; /* matches neither a 3-byte nor a 4-byte pattern */
- }
- }
- else {
- return err_pos; /* exactly 3 bytes left and they are not a valid 3-byte sequence */
- }
- }
- else {
- return err_pos; /* fewer than 3 bytes left, and not ASCII or a valid 2-byte sequence */
- }
-
- len -= bytes; /* consume the validated sequence */
- err_pos += bytes;
- data += bytes;
- }
-
- return 0;
-}
-
-/* Prototypes of the optional SSE4.1 / AVX2 validator variants: declared only when the matching HAVE_* macro is set on x86_64, not defined in this file */
-#if defined(HAVE_SSE41) && defined(__x86_64__)
-extern off_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len);
-#endif
-#if defined(HAVE_AVX2) && defined(__x86_64__)
-extern off_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len);
-#endif
-/* Validator that rspamd_fast_utf8_validate calls for inputs of 64 bytes or more; stays on the scalar reference unless rspamd_fast_utf8_library_init replaces it */
-static off_t (*validate_func) (const unsigned char *data, size_t len) =
- rspamd_fast_utf8_validate_ref;
-
-/* Pick validate_func from `flags`: flags for variants that are not compiled in are ignored, and when both are usable AVX2 wins because it is assigned last */
-void
-rspamd_fast_utf8_library_init (unsigned flags)
-{
-#if defined(HAVE_SSE41) && defined(__x86_64__)
- if (flags & RSPAMD_FAST_UTF8_FLAG_SSE41) {
- validate_func = rspamd_fast_utf8_validate_sse41;
- }
-#endif
-#if defined(HAVE_AVX2) && defined(__x86_64__)
- if (flags & RSPAMD_FAST_UTF8_FLAG_AVX2) { /* checked after SSE4.1 so it overrides that choice */
- validate_func = rspamd_fast_utf8_validate_avx2;
- }
-#endif
-}
-/* Validate `len` bytes at `data`: inputs shorter than 64 bytes always go to the scalar reference, longer ones to validate_func; the chosen validator's result is returned unchanged */
-off_t
-rspamd_fast_utf8_validate (const unsigned char *data, size_t len)
-{
- return len >= 64 ?
- validate_func (data, len) :
- rspamd_fast_utf8_validate_ref (data, len);
-} \ No newline at end of file