From: Vsevolod Stakhov Date: Wed, 8 Apr 2015 12:05:55 +0000 (+0100) Subject: Add AVX implementation generated by clang. X-Git-Tag: 0.9.0~292 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=639c8e49fdf60966c5a75899efbb65084a099998;p=rspamd.git Add AVX implementation generated by clang. --- diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt index ecd729d6a..84b6db96e 100644 --- a/src/libcryptobox/CMakeLists.txt +++ b/src/libcryptobox/CMakeLists.txt @@ -49,6 +49,7 @@ ENDIF(HAVE_AVX2) IF(HAVE_AVX) SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx.S) SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/avx.S) + SET(SIPHASHSRC ${SIPHASHSRC} ${CMAKE_CURRENT_SOURCE_DIR}/siphash/avx.S) ENDIF(HAVE_AVX) IF(HAVE_SSE2) SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/sse2.S) diff --git a/src/libcryptobox/siphash/avx.S b/src/libcryptobox/siphash/avx.S new file mode 100644 index 000000000..72e18c7c1 --- /dev/null +++ b/src/libcryptobox/siphash/avx.S @@ -0,0 +1,332 @@ +#include "../macro.S" +#include "constants.S" + +/* + * Generated by clang-3.7 with -mavx -Ofast from reference implementation + */ + +SECTION_TEXT + +GLOBAL_HIDDEN_FN siphash_avx +siphash_avx_local: + .cfi_startproc + pushq %r15 +.Ltmp0: + .cfi_def_cfa_offset 16 + pushq %r14 +.Ltmp1: + .cfi_def_cfa_offset 24 + pushq %r12 +.Ltmp2: + .cfi_def_cfa_offset 32 + pushq %rbx +.Ltmp3: + .cfi_def_cfa_offset 40 +.Ltmp4: + .cfi_offset %rbx, -40 +.Ltmp5: + .cfi_offset %r12, -32 +.Ltmp6: + .cfi_offset %r14, -24 +.Ltmp7: + .cfi_offset %r15, -16 + movq (%rdi), %rcx + movq 8(%rdi), %rbx + movq %rdx, %r9 + shlq $56, %r9 + movq %r9, -8(%rsp) + movabsq $8317987319222330741, %r12 # imm = 0x736F6D6570736575 + xorq %rcx, %r12 + movabsq $7237128888997146477, %rax # imm = 0x646F72616E646F6D + xorq %rbx, %rax + movabsq $7816392313619706465, %r8 # imm = 0x6C7967656E657261 + xorq %rcx, %r8 + movabsq $8387220255154660723, %rdi # imm = 0x7465646279746573 + xorq %rbx, %rdi + cmpq $8, %rdx + jb .LBB0_4 +# BB#1: # %.lr.ph104 + leaq -8(%rdx), %r10 + movq %r10, %r11 + andq $-8, %r11 + leaq 8(%r11), %r14 + movq %rsi, %rbx + .align 16, 0x90 +.LBB0_2: # =>This Inner Loop Header: Depth=1 + movq (%rbx), %r15 + addq $8, %rbx + xorq %r15, %rdi + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + #APP + shldq $32, %r12, %r12 + #NO_APP + addq %rax, %r8 + addq %rdi, %r12 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + xorq %r12, %rdi + #APP + shldq $32, %r8, %r8 + #NO_APP + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + #APP + shldq $32, %r12, %r12 + #NO_APP + addq %rax, %r8 + addq %rdi, %r12 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + xorq %r12, %rdi + #APP + shldq $32, %r8, %r8 + #NO_APP + xorq %r15, %r12 + addq $-8, %rdx + cmpq $7, %rdx + ja .LBB0_2 +# BB#3: # %..preheader_crit_edge + subq %r11, %r10 + addq %r14, %rsi + movq %r10, %rdx +.LBB0_4: # %.preheader + testq %rdx, %rdx + je .LBB0_13 +# BB#5: # %overflow.checked + xorl %ebx, %ebx + movq %rdx, %r9 + andq $-128, %r9 + je .LBB0_9 +# BB#6: # %vector.body.preheader + leaq 88(%rsp), %rbx + leaq 96(%rsi), %rcx + movq %rdx, %r10 + andq $-128, %r10 + .align 16, 0x90 +.LBB0_7: # %vector.body + # =>This Inner Loop Header: Depth=1 + vmovups -96(%rcx), %ymm0 + vmovups -64(%rcx), %ymm1 + vmovups -32(%rcx), %ymm2 + vmovups (%rcx), %ymm3 + vmovups %ymm0, -96(%rbx) + vmovups %ymm1, -64(%rbx) + vmovups %ymm2, -32(%rbx) + vmovups %ymm3, (%rbx) + subq $-128, %rbx + subq $-128, %rcx + addq $-128, %r10 + jne .LBB0_7 +# BB#8: + movq %r9, %rbx +.LBB0_9: # %middle.block + subq %rbx, %rdx + je .LBB0_12 +# BB#10: # %.lr.ph.preheader + leaq -8(%rsp,%rbx), %rcx + addq %rbx, %rsi + .align 16, 0x90 +.LBB0_11: # %.lr.ph + # =>This Inner Loop Header: Depth=1 + movb (%rsi), %bl + movb %bl, (%rcx) + incq %rcx + incq %rsi + decq %rdx + jne .LBB0_11 +.LBB0_12: # %._crit_edge + movq -8(%rsp), %r9 +.LBB0_13: + xorq %r9, %rdi + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + #APP + shldq $32, %r12, %r12 + #NO_APP + addq %rax, %r8 + addq %rdi, %r12 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + xorq %r12, %rdi + #APP + shldq $32, %r8, %r8 + #NO_APP + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + #APP + shldq $32, %r12, %r12 + #NO_APP + addq %rax, %r8 + addq %rdi, %r12 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + xorq %r12, %rdi + #APP + shldq $32, %r8, %r8 + #NO_APP + xorq %r9, %r12 + xorq $255, %r8 + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + #APP + shldq $32, %r12, %r12 + #NO_APP + addq %rax, %r8 + addq %rdi, %r12 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + xorq %r12, %rdi + #APP + shldq $32, %r8, %r8 + #NO_APP + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + #APP + shldq $32, %r12, %r12 + #NO_APP + addq %rax, %r8 + addq %rdi, %r12 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + xorq %r12, %rdi + #APP + shldq $32, %r8, %r8 + #NO_APP + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + #APP + shldq $32, %r12, %r12 + #NO_APP + addq %rax, %r8 + addq %rdi, %r12 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + xorq %r12, %rdi + #APP + shldq $32, %r8, %r8 + #NO_APP + addq %rax, %r12 + addq %rdi, %r8 + #APP + shldq $13, %rax, %rax + #NO_APP + #APP + shldq $16, %rdi, %rdi + #NO_APP + xorq %r12, %rax + xorq %r8, %rdi + addq %rax, %r8 + #APP + shldq $17, %rax, %rax + #NO_APP + #APP + shldq $21, %rdi, %rdi + #NO_APP + xorq %r8, %rax + #APP + shldq $32, %r8, %r8 + #NO_APP + xorq %rdi, %rax + xorq %r8, %rax + popq %rbx + popq %r12 + popq %r14 + popq %r15 + vzeroupper + retq +.Lfunc_end0: + .size siphash_avx_local, .Lfunc_end0-siphash_avx_local + .cfi_endproc +FN_END siphash_avx diff --git a/src/libcryptobox/siphash/siphash.c b/src/libcryptobox/siphash/siphash.c index f42456b7f..498609fc5 100644 --- a/src/libcryptobox/siphash/siphash.c +++ b/src/libcryptobox/siphash/siphash.c @@ -49,10 +49,17 @@ SIPHASH_DECLARE(ref) SIPHASH_DECLARE(sse41) #define SIPHASH_SSE41 SIPHASH_IMPL(CPUID_SSE41, "sse41", sse41) #endif +#if defined(HAVE_AVX) +SIPHASH_DECLARE(avx) +#define SIPHASH_AVX SIPHASH_IMPL(CPUID_AVX, "avx", avx) +#endif /* list implemenations from most optimized to least, with generic as the last entry */ static const siphash_impl_t siphash_list[] = { SIPHASH_GENERIC, +#if defined(SIPHASH_AVX) + SIPHASH_AVX, +#endif #if defined(SIPHASH_SSE41) SIPHASH_SSE41, #endif @@ -73,7 +80,6 @@ siphash_load(void) } } } - fprintf(stderr, "selected %s\n", siphash_opt->desc); } void siphash24 (unsigned char *out, const unsigned char *in, diff --git a/src/libcryptobox/siphash/sse41.S b/src/libcryptobox/siphash/sse41.S index 58acfee8f..92c15671a 100644 --- a/src/libcryptobox/siphash/sse41.S +++ b/src/libcryptobox/siphash/sse41.S @@ -1,6 +1,11 @@ #include "../macro.S" #include "constants.S" +/* + * Generated by gcc-4.9 from siphash sse41 implementation written by + * Samuel Neves and submitted to supercop competition + */ + SECTION_TEXT GLOBAL_HIDDEN_FN siphash_sse41 diff --git a/test/lua/unit/siphash.lua b/test/lua/unit/siphash.lua index 62a30b01a..1c773b45e 100644 --- a/test/lua/unit/siphash.lua +++ b/test/lua/unit/siphash.lua @@ -3,12 +3,27 @@ context("Siphash check functions", function() local ffi = require("ffi") ffi.cdef[[ - size_t siphash24_test(void); + void rspamd_cryptobox_init (void); + size_t siphash24_test(bool generic); + double rspamd_get_ticks (void); ]] - - test("Siphash test vectors", function() - local res = ffi.C.siphash24_test() + + ffi.C.rspamd_cryptobox_init() + + test("Siphash test reference vectors", function() + local t1 = ffi.C.rspamd_get_ticks() + local res = ffi.C.siphash24_test(true) + local t2 = ffi.C.rspamd_get_ticks() + + print("Refrence siphash: " .. tostring(t2 - t1) .. " sec") + assert_not_equal(res, 0) + end) + test("Siphash test optimized vectors", function() + local t1 = ffi.C.rspamd_get_ticks() + local res = ffi.C.siphash24_test(false) + local t2 = ffi.C.rspamd_get_ticks() + print("Optimized siphash: " .. tostring(t2 - t1) .. " sec") assert_not_equal(res, 0) end) end) \ No newline at end of file