diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-03-29 14:54:48 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-03-29 14:54:48 +0100 |
commit | f305fb7394a3e80a2487889834b37e4f66fe43e6 (patch) | |
tree | d59197aac3041c41a922bcf63c3ff1e6f98741a1 | |
parent | 6ac7be499b330a26348253a80b082b744f346711 (diff) | |
download | rspamd-f305fb7394a3e80a2487889834b37e4f66fe43e6.tar.gz rspamd-f305fb7394a3e80a2487889834b37e4f66fe43e6.zip |
[Feature] Add avx2 codec for base64
-rw-r--r-- | src/libcryptobox/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/libcryptobox/base64/avx2.c | 280 | ||||
-rw-r--r-- | src/libcryptobox/base64/base64.c | 14 | ||||
-rw-r--r-- | src/libcryptobox/base64/sse42.c | 7 |
4 files changed, 298 insertions, 4 deletions
diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt index 6e0344bbf..9b1079ff2 100644 --- a/src/libcryptobox/CMakeLists.txt +++ b/src/libcryptobox/CMakeLists.txt @@ -79,6 +79,7 @@ IF(HAVE_AVX2) SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx2.S) SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/avx2.S) SET(SIPHASHSRC ${SIPHASHSRC} ${CMAKE_CURRENT_SOURCE_DIR}/siphash/avx2.S) + SET(BASE64SRC ${BASE64SRC} ${CMAKE_CURRENT_SOURCE_DIR}/base64/avx2.c) ENDIF(HAVE_AVX2) IF(HAVE_AVX) SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx.S) diff --git a/src/libcryptobox/base64/avx2.c b/src/libcryptobox/base64/avx2.c new file mode 100644 index 000000000..80f3b9972 --- /dev/null +++ b/src/libcryptobox/base64/avx2.c @@ -0,0 +1,280 @@ +/*- + * Copyright 2018 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- +Copyright (c) 2013-2015, Alfred Klomp +Copyright (c) 2018, Vsevolod Stakhov +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "cryptobox.h" + +extern const uint8_t base64_table_dec[256]; + +#ifdef RSPAMD_HAS_TARGET_ATTR +#pragma GCC push_options +#pragma GCC target("avx2") +#ifndef __SSE2__ +#define __SSE2__ +#endif +#ifndef __SSE__ +#define __SSE__ +#endif +#ifndef __SSE4_2__ +#define __SSE4_2__ +#endif +#ifndef __SSE4_1__ +#define __SSE4_1__ +#endif +#ifndef __SSSE3__ +#define __SSSE3__ +#endif +#ifndef __AVX__ +#define __AVX__ +#endif +#ifndef __AVX2__ +#define __AVX2__ +#endif + +#include <immintrin.h> + +#define CMPGT(s,n) _mm256_cmpgt_epi8((s), _mm256_set1_epi8(n)) +#define CMPEQ(s,n) _mm256_cmpeq_epi8((s), _mm256_set1_epi8(n)) +#define REPLACE(s,n) _mm256_and_si256((s), _mm256_set1_epi8(n)) +#define RANGE(s,a,b) _mm256_andnot_si256(CMPGT((s), (b)), CMPGT((s), (a) - 1)) + +static inline __m256i +dec_reshuffle (__m256i in) __attribute__((__target__("avx2"))); + +static inline __m256i +dec_reshuffle (__m256i in) +{ + // in, lower lane, bits, upper case are most significant bits, lower 
case are least significant bits: + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together in each lane: + out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa + + // Pack lanes + return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); +} + + +#define INNER_LOOP_AVX2 \ + while (inlen >= 45) { \ + __m256i str = _mm256_loadu_si256((__m256i *)c); \ + const __m256i lut_lo = _mm256_setr_epi8( \ + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, \ + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, \ + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, \ + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); \ + const __m256i lut_hi = _mm256_setr_epi8( \ + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, \ + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, \ + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, \ + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); \ + const __m256i lut_roll = _mm256_setr_epi8( \ + 0, 16, 19, 4, -65, -65, -71, -71, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 16, 19, 4, -65, -65, -71, -71, \ + 0, 0, 0, 0, 0, 0, 0, 0); \ + const __m256i 
mask_2F = _mm256_set1_epi8(0x2f); \ + const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); \ + const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); \ + const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); \ + const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); \ + const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); \ + const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); \ + if (!_mm256_testz_si256(lo, hi)) { \ + break; \ + } \ + str = _mm256_add_epi8(str, roll); \ + str = dec_reshuffle(str); \ + _mm256_storeu_si256((__m256i *)o, str); \ + c += 32; \ + o += 24; \ + outl += 24; \ + inlen -= 32; \ + } + +int +base64_decode_avx2 (const char *in, size_t inlen, + unsigned char *out, size_t *outlen) __attribute__((__target__("avx2"))); +int +base64_decode_avx2 (const char *in, size_t inlen, + unsigned char *out, size_t *outlen) +{ + ssize_t ret = 0; + const uint8_t *c = (const uint8_t *)in; + uint8_t *o = (uint8_t *)out; + uint8_t q, carry; + size_t outl = 0; + size_t leftover = 0; + +repeat: + switch (leftover) { + for (;;) { + case 0: + INNER_LOOP_AVX2 + + if (inlen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec[*c++]) >= 254) { + ret = 0; + break; + } + carry = q << 2; + leftover++; + + case 1: + if (inlen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec[*c++]) >= 254) { + ret = 0; + break; + } + *o++ = carry | (q >> 4); + carry = q << 4; + leftover++; + outl++; + + case 2: + if (inlen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec[*c++]) >= 254) { + leftover++; + + if (q == 254) { + if (inlen-- != 0) { + leftover = 0; + q = base64_table_dec[*c++]; + ret = ((q == 254) && (inlen == 0)) ? 
1 : 0; + break; + } + else { + ret = 1; + break; + } + } + else { + leftover --; + } + /* If we get here, there was an error: */ + break; + } + *o++ = carry | (q >> 2); + carry = q << 6; + leftover++; + outl++; + + case 3: + if (inlen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec[*c++]) >= 254) { + /* + * When q == 254, the input char is '='. Return 1 and EOF. + * When q == 255, the input char is invalid. Return 0 and EOF. + */ + if (q == 254 && inlen == 0) { + ret = 1; + leftover = 0; + } + else { + ret = 0; + } + + break; + } + + *o++ = carry | q; + carry = 0; + leftover = 0; + outl++; + } + } + + if (!ret && inlen > 0) { + /* Skip to the next valid character in input */ + while (inlen > 0 && base64_table_dec[*c] >= 254) { + c ++; + inlen --; + } + + if (inlen > 0) { + goto repeat; + } + } + + *outlen = outl; + + return ret; +} + +#pragma GCC pop_options +#endif diff --git a/src/libcryptobox/base64/base64.c b/src/libcryptobox/base64/base64.c index a0115cad3..a0a68d7d6 100644 --- a/src/libcryptobox/base64/base64.c +++ b/src/libcryptobox/base64/base64.c @@ -69,8 +69,21 @@ BASE64_DECLARE(sse42); # endif #endif +#ifdef RSPAMD_HAS_TARGET_ATTR +# if defined(HAVE_AVX2) +int base64_decode_avx2 (const char *in, size_t inlen, + unsigned char *out, size_t *outlen) __attribute__((__target__("avx2"))); + +BASE64_DECLARE(avx2); +# define BASE64_AVX2 BASE64_IMPL(CPUID_AVX2, "avx2", avx2) +# endif +#endif + static const base64_impl_t base64_list[] = { BASE64_REF, +#ifdef BASE64_AVX2 + BASE64_AVX2, +#endif #ifdef BASE64_SSE42 BASE64_SSE42, #endif @@ -118,6 +131,7 @@ base64_test (bool generic, size_t niters, size_t len) impl = generic ? 
&base64_list[0] : base64_opt; out = rspamd_encode_base64 (in, len, 0, &outlen); impl->decode (out, outlen, tmp, &len); diff --git a/src/libcryptobox/base64/sse42.c b/src/libcryptobox/base64/sse42.c index db585a637..1d1287ad2 100644 --- a/src/libcryptobox/base64/sse42.c +++ b/src/libcryptobox/base64/sse42.c @@ -99,7 +99,7 @@ static inline __m128i dec_reshuffle (__m128i in) #define CMPGT(s,n) _mm_cmpgt_epi8((s), _mm_set1_epi8(n)) -#define INNER_LOOP_SSE42 do { \ +#define INNER_LOOP_SSE42 \ while (inlen >= 24) { \ __m128i str = _mm_loadu_si128((__m128i *)c); \ const __m128i lut = _mm_setr_epi8( \ @@ -135,8 +135,7 @@ static inline __m128i dec_reshuffle (__m128i in) o += 12; \ outl += 12; \ inlen -= 16; \ - } \ -} while (0) + } int base64_decode_sse42 (const char *in, size_t inlen, @@ -156,7 +155,7 @@ repeat: switch (leftover) { for (;;) { case 0: - INNER_LOOP_SSE42; + INNER_LOOP_SSE42 if (inlen-- == 0) { ret = 1; |