]> source.dussan.org Git - rspamd.git/commitdiff
Add ssse3 version.
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 8 Apr 2015 12:53:22 +0000 (13:53 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 8 Apr 2015 12:53:22 +0000 (13:53 +0100)
src/libcryptobox/CMakeLists.txt
src/libcryptobox/siphash/siphash.c
src/libcryptobox/siphash/ssse3.S [new file with mode: 0644]

index 84b6db96e8301238eac24ecabdc0b0a156df31c1..f59ec8146cc79171d7f36eaed94cffca7f0fd68d 100644 (file)
@@ -55,6 +55,9 @@ IF(HAVE_SSE2)
        SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/sse2.S)
        SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/sse2.S)
 ENDIF(HAVE_SSE2)
+IF(HAVE_SSSE3) # when HAVE_SSSE3 is set, append the SSSE3 SipHash assembly file to SIPHASHSRC
+       SET(SIPHASHSRC ${SIPHASHSRC} ${CMAKE_CURRENT_SOURCE_DIR}/siphash/ssse3.S)
+ENDIF(HAVE_SSSE3)
 IF(HAVE_SSE41)
        SET(SIPHASHSRC ${SIPHASHSRC} ${CMAKE_CURRENT_SOURCE_DIR}/siphash/sse41.S)
 ENDIF(HAVE_SSE41)
index 498609fc53f4874e6abdb114379b618f4cfd7634..c2c61b3e96c0711e77a94ed903ea866faad68dad 100644 (file)
@@ -49,6 +49,10 @@ SIPHASH_DECLARE(ref)
 SIPHASH_DECLARE(sse41)
 #define SIPHASH_SSE41 SIPHASH_IMPL(CPUID_SSE41, "sse41", sse41)
 #endif
+#if defined(HAVE_SSSE3) /* SSSE3 variant is only declared when HAVE_SSSE3 is defined */
+SIPHASH_DECLARE(ssse3) /* presumably declares the siphash_ssse3 entry point; macro is defined outside this hunk */
+#define SIPHASH_SSSE3 SIPHASH_IMPL(CPUID_SSSE3, "ssse3", ssse3) /* NOTE(review): this entry is listed ahead of SIPHASH_SSE41 in siphash_list; confirm that ordering is intended */
+#endif /* HAVE_SSSE3 */
 #if defined(HAVE_AVX)
 SIPHASH_DECLARE(avx)
 #define SIPHASH_AVX SIPHASH_IMPL(CPUID_AVX, "avx", avx)
@@ -60,6 +64,9 @@ static const siphash_impl_t siphash_list[] = {
 #if defined(SIPHASH_AVX)
                SIPHASH_AVX,
 #endif
+#if defined(SIPHASH_SSSE3)
+               SIPHASH_SSSE3,
+#endif
 #if defined(SIPHASH_SSE41)
                SIPHASH_SSE41,
 #endif
diff --git a/src/libcryptobox/siphash/ssse3.S b/src/libcryptobox/siphash/ssse3.S
new file mode 100644 (file)
index 0000000..bed181c
--- /dev/null
@@ -0,0 +1,328 @@
+#include "../macro.S"
+#include "constants.S"
+
+/*
+ * Generated by clang-3.7 with -mssse3 -Ofast from reference implementation
+ */
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN siphash_ssse3
+siphash_ssse3_local: /* SipHash-2-4: rdi = 16-byte key, rsi = message, rdx = length in bytes; 64-bit hash is returned in rax */
+       .cfi_startproc
+# BB#0:
+       pushq   %r15 /* save the callee-saved registers used as scratch below */
+.Ltmp0:
+       .cfi_def_cfa_offset 16
+       pushq   %r14
+.Ltmp1:
+       .cfi_def_cfa_offset 24
+       pushq   %r12
+.Ltmp2:
+       .cfi_def_cfa_offset 32
+       pushq   %rbx
+.Ltmp3:
+       .cfi_def_cfa_offset 40
+.Ltmp4:
+       .cfi_offset %rbx, -40
+.Ltmp5:
+       .cfi_offset %r12, -32
+.Ltmp6:
+       .cfi_offset %r14, -24
+.Ltmp7:
+       .cfi_offset %r15, -16
+       movq    (%rdi), %rcx /* rcx = k0, first 8 key bytes */
+       movq    8(%rdi), %rbx /* rbx = k1, next 8 key bytes */
+       movq    %rdx, %r9
+       shlq    $56, %r9 /* r9 = len << 56: low length byte placed in the top byte of the final block */
+       movq    %r9, -8(%rsp) /* keep the final block below rsp (no stack adjustment); tail bytes are later copied over its low bytes */
+       movabsq $8317987319222330741, %r12 # imm = 0x736F6D6570736575
+       xorq    %rcx, %r12 /* r12 = v0 = k0 ^ constant */
+       movabsq $7237128888997146477, %rax # imm = 0x646F72616E646F6D
+       xorq    %rbx, %rax /* rax = v1 = k1 ^ constant */
+       movabsq $7816392313619706465, %r8 # imm = 0x6C7967656E657261
+       xorq    %rcx, %r8 /* r8 = v2 = k0 ^ constant */
+       movabsq $8387220255154660723, %rdi # imm = 0x7465646279746573
+       xorq    %rbx, %rdi /* rdi = v3 = k1 ^ constant (key pointer is no longer needed) */
+       cmpq    $8, %rdx
+       jb      .LBB0_4 /* fewer than 8 bytes: skip the 8-byte block loop */
+# BB#1:                                 # %.lr.ph104
+       leaq    -8(%rdx), %r10 /* r10 = len - 8 */
+       movq    %r10, %r11
+       andq    $-8, %r11 /* r11 = (len - 8) rounded down to a multiple of 8 */
+       leaq    8(%r11), %r14 /* r14 = total bytes the block loop consumes */
+       movq    %rsi, %rbx /* rbx = read cursor into the message */
+       .align  16, 0x90
+.LBB0_2:                                # =>This Inner Loop Header: Depth=1
+       movq    (%rbx), %r15 /* r15 = m, the next 8-byte message word */
+       addq    $8, %rbx /* advance the cursor */
+       xorq    %r15, %rdi /* v3 ^= m */
+       addq    %rax, %r12 /* compression round 1 of 2: v0 += v1 */
+       addq    %rdi, %r8 /* v2 += v3 */
+       #APP
+       shldq   $13, %rax, %rax /* shld with identical operands acts as a rotate left; same idiom for every shldq below */
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax /* v1 ^= v0 */
+       xorq    %r8, %rdi /* v3 ^= v2 */
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8 /* v2 += v1 */
+       addq    %rdi, %r12 /* v0 += v3 */
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax /* v1 ^= v2 */
+       xorq    %r12, %rdi /* v3 ^= v0 */
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       addq    %rax, %r12 /* compression round 2 of 2 (same step sequence as round 1) */
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       xorq    %r15, %r12 /* v0 ^= m */
+       addq    $-8, %rdx /* 8 bytes consumed */
+       cmpq    $7, %rdx
+       ja      .LBB0_2 /* loop while at least 8 bytes remain */
+# BB#3:                                 # %..preheader_crit_edge
+       subq    %r11, %r10 /* r10 = (len - 8) minus its multiple-of-8 part, i.e. the leftover tail byte count */
+       addq    %r14, %rsi /* rsi = first tail byte */
+       movq    %r10, %rdx /* rdx = tail length */
+.LBB0_4:                                # %.preheader
+       testq   %rdx, %rdx
+       je      .LBB0_13 /* no tail bytes: r9 still holds len << 56 */
+# BB#5:                                 # %overflow.checked
+       xorl    %ebx, %ebx /* rbx = number of tail bytes already copied = 0 */
+       movq    %rdx, %r9
+       andq    $-32, %r9 /* r9 = tail length rounded down to a multiple of 32 */
+       je      .LBB0_9 /* NOTE(review): rdx < 8 on both paths into .LBB0_4, so this looks always taken and the 32-byte loop below unreachable - confirm */
+# BB#6:                                 # %vector.body.preheader
+       leaq    8(%rsp), %rbx /* destination cursor; the stores below start at -16(%rbx), i.e. -8(%rsp) */
+       leaq    16(%rsi), %rcx /* source cursor, biased by 16 in the same way */
+       movq    %rdx, %r10
+       andq    $-32, %r10 /* r10 = bytes left for the 32-byte copy loop */
+       .align  16, 0x90
+.LBB0_7:                                # %vector.body
+                                        # =>This Inner Loop Header: Depth=1
+       movups  -16(%rcx), %xmm0 /* copy 32 bytes per iteration using two unaligned 16-byte moves */
+       movups  (%rcx), %xmm1
+       movups  %xmm0, -16(%rbx)
+       movups  %xmm1, (%rbx)
+       addq    $32, %rbx
+       addq    $32, %rcx
+       addq    $-32, %r10
+       jne     .LBB0_7
+# BB#8:
+       movq    %r9, %rbx /* rbx = bytes copied by the 32-byte loop */
+.LBB0_9:                                # %middle.block
+       subq    %rbx, %rdx /* rdx = bytes left for the byte-wise copy */
+       je      .LBB0_12
+# BB#10:                                # %.lr.ph.preheader
+       leaq    -8(%rsp,%rbx), %rcx /* rcx = destination inside the stashed final block */
+       addq    %rbx, %rsi /* rsi = next source byte */
+       .align  16, 0x90
+.LBB0_11:                               # %.lr.ph
+                                        # =>This Inner Loop Header: Depth=1
+       movb    (%rsi), %bl /* copy one tail byte into the final block */
+       movb    %bl, (%rcx)
+       incq    %rcx
+       incq    %rsi
+       decq    %rdx
+       jne     .LBB0_11
+.LBB0_12:                               # %._crit_edge
+       movq    -8(%rsp), %r9 /* r9 = final block: copied tail bytes in the low bytes, length byte on top */
+.LBB0_13:
+       xorq    %r9, %rdi /* v3 ^= final block */
+       addq    %rax, %r12 /* final block, compression round 1 of 2 */
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       addq    %rax, %r12 /* final block, compression round 2 of 2 */
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       xorq    %r9, %r12 /* v0 ^= final block */
+       xorq    $255, %r8 /* v2 ^= 0xff before the finalization rounds */
+       addq    %rax, %r12 /* finalization round 1 of 4 */
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       addq    %rax, %r12 /* finalization round 2 of 4 */
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       addq    %rax, %r12 /* finalization round 3 of 4 */
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       #APP
+       shldq   $32, %r12, %r12
+       #NO_APP
+       addq    %rax, %r8
+       addq    %rdi, %r12
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax
+       xorq    %r12, %rdi
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       addq    %rax, %r12 /* finalization round 4 of 4, only partially emitted: the later v0 updates are absent */
+       addq    %rdi, %r8
+       #APP
+       shldq   $13, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $16, %rdi, %rdi
+       #NO_APP
+       xorq    %r12, %rax
+       xorq    %r8, %rdi
+       addq    %rax, %r8 /* v2 += v1; no rotate of v0 and no v0 += v3 here - presumably v0 cancels out of the final xor, confirm against the reference */
+       #APP
+       shldq   $17, %rax, %rax
+       #NO_APP
+       #APP
+       shldq   $21, %rdi, %rdi
+       #NO_APP
+       xorq    %r8, %rax /* v1 ^= v2 */
+       #APP
+       shldq   $32, %r8, %r8
+       #NO_APP
+       xorq    %rdi, %rax /* fold the rotated v3 into the result */
+       xorq    %r8, %rax /* fold the rotated v2 in; rax now holds the return value */
+       popq    %rbx /* restore callee-saved registers in reverse push order */
+       popq    %r12
+       popq    %r14
+       popq    %r15
+       retq
+.Lfunc_end0:
+       .size   siphash_ssse3_local, .Lfunc_end0-siphash_ssse3_local
+       .cfi_endproc
+FN_END siphash_ssse3