]> source.dussan.org Git - rspamd.git/commitdiff
Add AVX implementation generated by clang.
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 8 Apr 2015 12:05:55 +0000 (13:05 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 8 Apr 2015 12:05:55 +0000 (13:05 +0100)
src/libcryptobox/CMakeLists.txt
src/libcryptobox/siphash/avx.S [new file with mode: 0644]
src/libcryptobox/siphash/siphash.c
src/libcryptobox/siphash/sse41.S
test/lua/unit/siphash.lua

index ecd729d6a0c00f9fb1e7b6152f13e4a2a5dc88d8..84b6db96e8301238eac24ecabdc0b0a156df31c1 100644 (file)
@@ -49,6 +49,7 @@ ENDIF(HAVE_AVX2)
 IF(HAVE_AVX)
        SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx.S)
        SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/avx.S)
+       SET(SIPHASHSRC ${SIPHASHSRC} ${CMAKE_CURRENT_SOURCE_DIR}/siphash/avx.S)
 ENDIF(HAVE_AVX)
 IF(HAVE_SSE2)
        SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/sse2.S)
diff --git a/src/libcryptobox/siphash/avx.S b/src/libcryptobox/siphash/avx.S
new file mode 100644 (file)
index 0000000..72e18c7
--- /dev/null
@@ -0,0 +1,332 @@
+#include "../macro.S"
+#include "constants.S"
+
+/*
+ * Generated by clang-3.7 with -mavx -Ofast from reference implementation
+ */
+
+SECTION_TEXT
+
+/*
+ * siphash_avx
+ *
+ * Keyed 64-bit hash: two mixing rounds per 8-byte input word and four
+ * mixing rounds in the finalization.
+ *
+ * In (register usage matches the System V AMD64 convention):
+ *   %rdi  pointer to the key; two 64-bit words are read at offsets 0 and 8
+ *   %rsi  pointer to the input bytes
+ *   %rdx  input length in bytes
+ * Out:
+ *   %rax  64-bit hash value
+ *
+ * State registers (v0..v3 are the names used in the comments below):
+ *   %r12 = v0, %rax = v1, %r8 = v2, %rdi = v3
+ *
+ * "shldq $n, %reg, %reg" with identical source and destination rotates
+ * the register left by n bits.
+ *
+ * %r15, %r14, %r12 and %rbx are saved on entry and restored on exit.
+ * The final input word is assembled in the 8 bytes at -8(%rsp), i.e. below
+ * the stack pointer, without adjusting %rsp; this relies on a red zone
+ * being available and on this function making no calls.
+ */
+GLOBAL_HIDDEN_FN siphash_avx
+siphash_avx_local:
+       .cfi_startproc
+       pushq   %r15
+.Ltmp0:
+       .cfi_def_cfa_offset 16
+       pushq   %r14
+.Ltmp1:
+       .cfi_def_cfa_offset 24
+       pushq   %r12
+.Ltmp2:
+       .cfi_def_cfa_offset 32
+       pushq   %rbx
+.Ltmp3:
+       .cfi_def_cfa_offset 40
+.Ltmp4:
+       .cfi_offset %rbx, -40
+.Ltmp5:
+       .cfi_offset %r12, -32
+.Ltmp6:
+       .cfi_offset %r14, -24
+.Ltmp7:
+       .cfi_offset %r15, -16
+       /* %rcx = first key word, %rbx = second key word */
+       movq    (%rdi), %rcx
+       movq    8(%rdi), %rbx
+       /*
+        * Seed the last input word with (length << 56): the low byte of the
+        * length ends up in the top byte. Kept both in %r9 and at -8(%rsp).
+        */
+       movq    %rdx, %r9
+       shlq    $56, %r9
+       movq    %r9, -8(%rsp)
+       /* v0..v3 = fixed constants XORed with the key words (v0,v2: first; v1,v3: second) */
+       movabsq $8317987319222330741, %r12 # imm = 0x736F6D6570736575
+       xorq    %rcx, %r12
+       movabsq $7237128888997146477, %rax # imm = 0x646F72616E646F6D
+       xorq    %rbx, %rax
+       movabsq $7816392313619706465, %r8 # imm = 0x6C7967656E657261
+       xorq    %rcx, %r8
+       movabsq $8387220255154660723, %rdi # imm = 0x7465646279746573
+       xorq    %rbx, %rdi
+       /* Fewer than 8 input bytes: skip the word loop entirely */
+       cmpq    $8, %rdx
+       jb      .LBB0_4
+# BB#1:                                 # %.lr.ph104
+       /*
+        * %r10 = len - 8, %r11 = %r10 rounded down to a multiple of 8,
+        * %r14 = %r11 + 8 = number of bytes the word loop will consume,
+        * %rbx = read cursor over the input.
+        */
+       leaq    -8(%rdx), %r10
+       movq    %r10, %r11
+       andq    $-8, %r11
+       leaq    8(%r11), %r14
+       movq    %rsi, %rbx
+       .align  16, 0x90
+.LBB0_2:                                # =>This Inner Loop Header: Depth=1
+       /* Load the next 8-byte word m into %r15 and fold it in: v3 ^= m */
+       movq    (%rbx), %r15
+       addq    $8, %rbx
+       xorq    %r15, %rdi
+       /* Mixing round 1 of 2 */
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       /* Mixing round 2 of 2 */
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       /* v0 ^= m, then continue while more than 7 bytes remain */
+       xorq    %r15, %r12
+       addq    $-8, %rdx
+       cmpq    $7, %rdx
+       ja      .LBB0_2
+# BB#3:                                 # %..preheader_crit_edge
+       /* %rdx = leftover byte count (0..7); advance %rsi past the consumed words */
+       subq    %r11, %r10
+       addq    %r14, %rsi
+       movq    %r10, %rdx
+.LBB0_4:                                # %.preheader
+       /* No leftover bytes: %r9 still holds (length << 56), use it as is */
+       testq   %rdx, %rdx
+       je      .LBB0_13
+# BB#5:                                 # %overflow.checked
+       /*
+        * Copy the leftover bytes into the scratch word at -8(%rsp).
+        * %rdx is at most 7 on every path that reaches this point, so
+        * (%rdx & -128) is zero and the branch below always skips the
+        * 128-bytes-per-iteration vector copy (BB#6 .. BB#8).
+        */
+       xorl    %ebx, %ebx
+       movq    %rdx, %r9
+       andq    $-128, %r9
+       je      .LBB0_9
+# BB#6:                                 # %vector.body.preheader
+       leaq    88(%rsp), %rbx
+       leaq    96(%rsi), %rcx
+       movq    %rdx, %r10
+       andq    $-128, %r10
+       .align  16, 0x90
+.LBB0_7:                                # %vector.body
+                                        # =>This Inner Loop Header: Depth=1
+       vmovups -96(%rcx), %ymm0
+       vmovups -64(%rcx), %ymm1
+       vmovups -32(%rcx), %ymm2
+       vmovups (%rcx), %ymm3
+       vmovups %ymm0, -96(%rbx)
+       vmovups %ymm1, -64(%rbx)
+       vmovups %ymm2, -32(%rbx)
+       vmovups %ymm3, (%rbx)
+       subq    $-128, %rbx
+       subq    $-128, %rcx
+       addq    $-128, %r10
+       jne     .LBB0_7
+# BB#8:
+       movq    %r9, %rbx
+.LBB0_9:                                # %middle.block
+       /* %rbx = bytes already copied (0 when the vector copy was skipped) */
+       subq    %rbx, %rdx
+       je      .LBB0_12
+# BB#10:                                # %.lr.ph.preheader
+       leaq    -8(%rsp,%rbx), %rcx
+       addq    %rbx, %rsi
+       .align  16, 0x90
+.LBB0_11:                               # %.lr.ph
+                                        # =>This Inner Loop Header: Depth=1
+       /* Byte-by-byte copy of the remaining %rdx bytes into the scratch word */
+       movb    (%rsi), %bl
+       movb    %bl, (%rcx)
+       incq    %rcx
+       incq    %rsi
+       decq    %rdx
+       jne     .LBB0_11
+.LBB0_12:                               # %._crit_edge
+       /* Reload the last word: leftover bytes in the low bytes, length byte on top */
+       movq    -8(%rsp), %r9
+.LBB0_13:
+       /* Process the last word b = %r9: v3 ^= b, two mixing rounds, v0 ^= b */
+       xorq    %r9, %rdi
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       xorq    %r9, %r12
+       /* Finalization: v2 ^= 0xff, followed by four mixing rounds */
+       xorq    $255, %r8
+       /* Finalization round 1 of 4 */
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       /* Finalization round 2 of 4 */
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       /* Finalization round 3 of 4 */
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       /*
+        * Finalization round 4 of 4, shortened: the updates of v0 in the
+        * second half of the round and the "v3 ^= v0" step are not emitted.
+        * v0 would cancel out of v0 ^ v1 ^ v2 ^ v3, so the result is formed
+        * directly as %rax ^ %rdi ^ %r8 below.
+        */
+       addq    %rax, %r12
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       addq    %rax, %r8
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       /* Return value in %rax */
+       xorq    %rdi, %rax
+       xorq    %r8, %rax
+       /* Restore saved registers in reverse push order */
+       popq    %rbx
+       popq    %r12
+       popq    %r14
+       popq    %r15
+       /* Clear the upper halves of the ymm registers before returning */
+       vzeroupper
+       retq
+.Lfunc_end0:
+       .size   siphash_avx_local, .Lfunc_end0-siphash_avx_local
+       .cfi_endproc
+FN_END siphash_avx
index f42456b7f49dbe4640ac58b036e8abe626598d24..498609fc53f4874e6abdb114379b618f4cfd7634 100644 (file)
@@ -49,10 +49,17 @@ SIPHASH_DECLARE(ref)
 SIPHASH_DECLARE(sse41)
 #define SIPHASH_SSE41 SIPHASH_IMPL(CPUID_SSE41, "sse41", sse41)
 #endif
+#if defined(HAVE_AVX)
+SIPHASH_DECLARE(avx)
+#define SIPHASH_AVX SIPHASH_IMPL(CPUID_AVX, "avx", avx)
+#endif
 
 /* list implementations from most optimized to least, with generic as the last entry */
 static const siphash_impl_t siphash_list[] = {
                SIPHASH_GENERIC,
+#if defined(SIPHASH_AVX)
+               SIPHASH_AVX,
+#endif
 #if defined(SIPHASH_SSE41)
                SIPHASH_SSE41,
 #endif
@@ -73,7 +80,6 @@ siphash_load(void)
                        }
                }
        }
-       fprintf(stderr, "selected %s\n", siphash_opt->desc);
 }
 
 void siphash24 (unsigned char *out, const unsigned char *in,
index 58acfee8f6b9d46da9ded0fa8e3a89062adbd509..92c15671a1e4dee858f254a73d9e979da0d23f22 100644 (file)
@@ -1,6 +1,11 @@
 #include "../macro.S"
 #include "constants.S"
 
+/*
+ * Generated by gcc-4.9 from siphash sse41 implementation written by
+ * Samuel Neves and submitted to supercop competition
+ */
+
 SECTION_TEXT
 
 GLOBAL_HIDDEN_FN siphash_sse41
index 62a30b01a62f71492047705a34bdd170c25ab397..1c773b45e48b9f8734d29bd154f5468e88a6f1bb 100644 (file)
@@ -3,12 +3,27 @@
 context("Siphash check functions", function()
   local ffi = require("ffi")
   ffi.cdef[[
-    size_t siphash24_test(void);
+    void rspamd_cryptobox_init (void);
+    size_t siphash24_test(bool generic);
+    double rspamd_get_ticks (void);
   ]]
-
-  test("Siphash test vectors", function()
-    local res = ffi.C.siphash24_test()
+  
+  ffi.C.rspamd_cryptobox_init()
+  
+  -- Runs siphash24_test with generic=true, prints the difference between two
+  -- rspamd_get_ticks() readings taken around the call, and requires a
+  -- non-zero result.
+  test("Siphash test reference vectors", function()
+    local t1 = ffi.C.rspamd_get_ticks()
+    local res = ffi.C.siphash24_test(true)
+    local t2 = ffi.C.rspamd_get_ticks()
+    
+    print("Reference siphash: " .. tostring(t2 - t1) .. " sec")
+    assert_not_equal(res, 0)
+  end)
+  -- Same check as the reference test, but calls siphash24_test with
+  -- generic=false; presumably this exercises the implementation selected for
+  -- the current CPU -- confirm against siphash24_test.
+  test("Siphash test optimized vectors", function()
+    local t1 = ffi.C.rspamd_get_ticks()
+    local res = ffi.C.siphash24_test(false)
+    local t2 = ffi.C.rspamd_get_ticks()
     
+    -- Difference of the two tick readings taken around the call
+    print("Optimized siphash: " .. tostring(t2 - t1) .. " sec")
     assert_not_equal(res, 0)
   end)
 end)
\ No newline at end of file