
Add optimized versions of ChaCha20 by @floodyberry

tags/0.9.0
Vsevolod Stakhov, 9 years ago
parent commit 82f9e6dff5

CMakeLists.txt (+1 / -1)

@@ -20,7 +20,7 @@ IF(NOT RSPAMD_USER)
SET(RSPAMD_GROUP "nobody")
ENDIF(NOT RSPAMD_USER)

-CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0 FATAL_ERROR)
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0 FATAL_ERROR)
SET_PROPERTY(GLOBAL PROPERTY ALLOW_DUPLICATE_CUSTOM_TARGETS 1)

############################# OPTIONS SECTION #############################################

src/libcryptobox/AsmOp.cmake (+21 / -0)

@@ -0,0 +1,21 @@
# Check whether the assembler supports the specified instruction sequence

function(asm_op output_var op description)
SET(asm_code "
${op}
")
file(WRITE "${CMAKE_BINARY_DIR}/asm.S" "${asm_code}")
try_compile(HAVE_OP
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/asm.S"
CMAKE_FLAGS "-DCMAKE_ASM_LINK_EXECUTABLE='echo not linking now...'")
#file(REMOVE "${CMAKE_BINARY_DIR}/asm.S")
if(HAVE_OP)
MESSAGE(STATUS "Compilation of ${description} asm set is supported")
else()
MESSAGE(STATUS "Compilation of ${description} asm set is -NOT- supported")
endif()
set(${output_var} "${HAVE_OP}" PARENT_SCOPE)
endfunction()

src/libcryptobox/AsmOpt.cmake (+0 / -0)


src/libcryptobox/CMakeLists.txt (+38 / -1)

@@ -1,8 +1,45 @@
INCLUDE(FindArch.cmake)
INCLUDE(AsmOp.cmake)

TARGET_ARCHITECTURE(ARCH)

SET(CHACHASRC chacha20/ref.c)

# For now we only support optimized implementations on the x86_64 architecture
IF(${ARCH} STREQUAL "x86_64")
ENABLE_LANGUAGE(ASM)
ASM_OP(HAVE_AVX2 "vpaddq %ymm0, %ymm0, %ymm0" "avx2")
ASM_OP(HAVE_AVX "vpaddq %xmm0, %xmm0, %xmm0" "avx")
ASM_OP(HAVE_SSE2 "pmuludq %xmm0, %xmm0" "sse2")
ASM_OP(HAVE_SLASHMACRO "
.macro TEST1 op
\\op %eax, %eax
.endm
TEST1 xorl
" "slash macro convention")
ASM_OP(HAVE_DOLLARMACRO "
.macro TEST1 op
$0 %eax, %eax
.endm
TEST1 xorl
" "dollar macro convention")
CONFIGURE_FILE(platform_config.h.in platform_config.h)
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
ENDIF()

IF(HAVE_AVX2)
SET(CHACHASRC ${CHACHASRC} chacha20/avx2.S)
ENDIF(HAVE_AVX2)
IF(HAVE_AVX)
SET(CHACHASRC ${CHACHASRC} chacha20/avx.S)
ENDIF(HAVE_AVX)
IF(HAVE_SSE2)
SET(CHACHASRC ${CHACHASRC} chacha20/sse2.S)
ENDIF(HAVE_SSE2)

SET(LIBCRYPTOBOXSRC cryptobox.c)

-ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC})
+ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC} ${CHACHASRC})
IF(NOT DEBIAN_BUILD)
SET_TARGET_PROPERTIES(rspamd-cryptobox PROPERTIES VERSION ${RSPAMD_VERSION})
ENDIF(NOT DEBIAN_BUILD)

src/libcryptobox/FindArch.cmake (+127 / -0)

@@ -0,0 +1,127 @@
set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif

#error cmake_ARCH unknown
")

# Set ppc_support to TRUE before including this file, or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple

function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values

# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4

# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.

foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()

# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()

if(osx_arch_i386)
list(APPEND ARCH i386)
endif()

if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()

if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")

enable_language(C)

# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because
# the file is not a valid C program, and because of the #error preprocessor
# directives), but by exploiting the preprocessor in this way we can detect
# the correct target architecture even when cross-compiling, since the
# program itself never needs to be run (only the compiler/preprocessor)
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)

# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")

# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")

# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()

set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
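For illustration only (not output captured from this commit), the COMPILE_OUTPUT_VARIABLE text that the regex above scans contains the failing directive; with GCC it looks roughly like:

arch.c:27:2: error: #error cmake_ARCH x86_64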

src/libcryptobox/chacha20/avx.S (+613 / -0)

@@ -0,0 +1,613 @@
#include "macro.S"
SECTION_TEXT
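/* Calling convention for chacha_blocks_avx, inferred from the register use
   below (SysV AMD64): rdi = chacha state (32-byte key, 16 bytes of
   counter+nonce, then the round count), rsi = input (NULL to emit raw
   keystream), rdx = output, rcx = byte count; the updated counter is
   written back to 32(%rdi) on return. */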

GLOBAL_HIDDEN_FN chacha_blocks_avx
chacha_blocks_avx_local:
pushq %rbx
pushq %rbp
movq %rsp, %rbp
andq $~63, %rsp
subq $512, %rsp
LOAD_VAR_PIC chacha_constants, %rax
vmovdqa 0(%rax), %xmm8
vmovdqa 16(%rax), %xmm6
vmovdqa 32(%rax), %xmm7
vmovdqu 0(%rdi), %xmm9
vmovdqu 16(%rdi), %xmm10
vmovdqu 32(%rdi), %xmm11
movq 48(%rdi), %rax
movq $1, %r9
vmovdqa %xmm8, 0(%rsp)
vmovdqa %xmm9, 16(%rsp)
vmovdqa %xmm10, 32(%rsp)
vmovdqa %xmm11, 48(%rsp)
vmovdqa %xmm6, 80(%rsp)
vmovdqa %xmm7, 96(%rsp)
movq %rax, 64(%rsp)
cmpq $256, %rcx
jb chacha_blocks_avx_below256
vpshufd $0x00, %xmm8, %xmm0
vpshufd $0x55, %xmm8, %xmm1
vpshufd $0xaa, %xmm8, %xmm2
vpshufd $0xff, %xmm8, %xmm3
vmovdqa %xmm0, 128(%rsp)
vmovdqa %xmm1, 144(%rsp)
vmovdqa %xmm2, 160(%rsp)
vmovdqa %xmm3, 176(%rsp)
vpshufd $0x00, %xmm9, %xmm0
vpshufd $0x55, %xmm9, %xmm1
vpshufd $0xaa, %xmm9, %xmm2
vpshufd $0xff, %xmm9, %xmm3
vmovdqa %xmm0, 192(%rsp)
vmovdqa %xmm1, 208(%rsp)
vmovdqa %xmm2, 224(%rsp)
vmovdqa %xmm3, 240(%rsp)
vpshufd $0x00, %xmm10, %xmm0
vpshufd $0x55, %xmm10, %xmm1
vpshufd $0xaa, %xmm10, %xmm2
vpshufd $0xff, %xmm10, %xmm3
vmovdqa %xmm0, 256(%rsp)
vmovdqa %xmm1, 272(%rsp)
vmovdqa %xmm2, 288(%rsp)
vmovdqa %xmm3, 304(%rsp)
vpshufd $0xaa, %xmm11, %xmm0
vpshufd $0xff, %xmm11, %xmm1
vmovdqa %xmm0, 352(%rsp)
vmovdqa %xmm1, 368(%rsp)
jmp chacha_blocks_avx_atleast256
.p2align 6,,63
nop
nop
nop
nop
nop
chacha_blocks_avx_atleast256:
movq 48(%rsp), %rax
leaq 1(%rax), %r8
leaq 2(%rax), %r9
leaq 3(%rax), %r10
leaq 4(%rax), %rbx
movl %eax, 320(%rsp)
movl %r8d, 4+320(%rsp)
movl %r9d, 8+320(%rsp)
movl %r10d, 12+320(%rsp)
shrq $32, %rax
shrq $32, %r8
shrq $32, %r9
shrq $32, %r10
movl %eax, 336(%rsp)
movl %r8d, 4+336(%rsp)
movl %r9d, 8+336(%rsp)
movl %r10d, 12+336(%rsp)
movq %rbx, 48(%rsp)
movq 64(%rsp), %rax
vmovdqa 128(%rsp), %xmm0
vmovdqa 144(%rsp), %xmm1
vmovdqa 160(%rsp), %xmm2
vmovdqa 176(%rsp), %xmm3
vmovdqa 192(%rsp), %xmm4
vmovdqa 208(%rsp), %xmm5
vmovdqa 224(%rsp), %xmm6
vmovdqa 240(%rsp), %xmm7
vmovdqa 256(%rsp), %xmm8
vmovdqa 272(%rsp), %xmm9
vmovdqa 288(%rsp), %xmm10
vmovdqa 304(%rsp), %xmm11
vmovdqa 320(%rsp), %xmm12
vmovdqa 336(%rsp), %xmm13
vmovdqa 352(%rsp), %xmm14
vmovdqa 368(%rsp), %xmm15
chacha_blocks_avx_mainloop1:
vpaddd %xmm0, %xmm4, %xmm0
vpaddd %xmm1, %xmm5, %xmm1
vpxor %xmm12, %xmm0, %xmm12
vpxor %xmm13, %xmm1, %xmm13
vpaddd %xmm2, %xmm6, %xmm2
vpaddd %xmm3, %xmm7, %xmm3
vpxor %xmm14, %xmm2, %xmm14
vpxor %xmm15, %xmm3, %xmm15
vpshufb 80(%rsp), %xmm12, %xmm12
vpshufb 80(%rsp), %xmm13, %xmm13
vpaddd %xmm8, %xmm12, %xmm8
vpaddd %xmm9, %xmm13, %xmm9
vpshufb 80(%rsp), %xmm14, %xmm14
vpshufb 80(%rsp), %xmm15, %xmm15
vpaddd %xmm10, %xmm14, %xmm10
vpaddd %xmm11, %xmm15, %xmm11
vmovdqa %xmm12, 112(%rsp)
vpxor %xmm4, %xmm8, %xmm4
vpxor %xmm5, %xmm9, %xmm5
vpslld $ 12, %xmm4, %xmm12
vpsrld $20, %xmm4, %xmm4
vpxor %xmm4, %xmm12, %xmm4
vpslld $ 12, %xmm5, %xmm12
vpsrld $20, %xmm5, %xmm5
vpxor %xmm5, %xmm12, %xmm5
vpxor %xmm6, %xmm10, %xmm6
vpxor %xmm7, %xmm11, %xmm7
vpslld $ 12, %xmm6, %xmm12
vpsrld $20, %xmm6, %xmm6
vpxor %xmm6, %xmm12, %xmm6
vpslld $ 12, %xmm7, %xmm12
vpsrld $20, %xmm7, %xmm7
vpxor %xmm7, %xmm12, %xmm7
vpaddd %xmm0, %xmm4, %xmm0
vpaddd %xmm1, %xmm5, %xmm1
vpxor 112(%rsp), %xmm0, %xmm12
vpxor %xmm13, %xmm1, %xmm13
vpaddd %xmm2, %xmm6, %xmm2
vpaddd %xmm3, %xmm7, %xmm3
vpxor %xmm14, %xmm2, %xmm14
vpxor %xmm15, %xmm3, %xmm15
vpshufb 96(%rsp), %xmm12, %xmm12
vpshufb 96(%rsp), %xmm13, %xmm13
vpaddd %xmm8, %xmm12, %xmm8
vpaddd %xmm9, %xmm13, %xmm9
vpshufb 96(%rsp), %xmm14, %xmm14
vpshufb 96(%rsp), %xmm15, %xmm15
vpaddd %xmm10, %xmm14, %xmm10
vpaddd %xmm11, %xmm15, %xmm11
vmovdqa %xmm12, 112(%rsp)
vpxor %xmm4, %xmm8, %xmm4
vpxor %xmm5, %xmm9, %xmm5
vpslld $ 7, %xmm4, %xmm12
vpsrld $25, %xmm4, %xmm4
vpxor %xmm4, %xmm12, %xmm4
vpslld $ 7, %xmm5, %xmm12
vpsrld $25, %xmm5, %xmm5
vpxor %xmm5, %xmm12, %xmm5
vpxor %xmm6, %xmm10, %xmm6
vpxor %xmm7, %xmm11, %xmm7
vpslld $ 7, %xmm6, %xmm12
vpsrld $25, %xmm6, %xmm6
vpxor %xmm6, %xmm12, %xmm6
vpslld $ 7, %xmm7, %xmm12
vpsrld $25, %xmm7, %xmm7
vpxor %xmm7, %xmm12, %xmm7
vpaddd %xmm0, %xmm5, %xmm0
vpaddd %xmm1, %xmm6, %xmm1
vpxor %xmm15, %xmm0, %xmm15
vpxor 112(%rsp), %xmm1, %xmm12
vpaddd %xmm2, %xmm7, %xmm2
vpaddd %xmm3, %xmm4, %xmm3
vpxor %xmm13, %xmm2, %xmm13
vpxor %xmm14, %xmm3, %xmm14
vpshufb 80(%rsp), %xmm15, %xmm15
vpshufb 80(%rsp), %xmm12, %xmm12
vpaddd %xmm10, %xmm15, %xmm10
vpaddd %xmm11, %xmm12, %xmm11
vpshufb 80(%rsp), %xmm13, %xmm13
vpshufb 80(%rsp), %xmm14, %xmm14
vpaddd %xmm8, %xmm13, %xmm8
vpaddd %xmm9, %xmm14, %xmm9
vmovdqa %xmm15, 112(%rsp)
vpxor %xmm5, %xmm10, %xmm5
vpxor %xmm6, %xmm11, %xmm6
vpslld $ 12, %xmm5, %xmm15
vpsrld $20, %xmm5, %xmm5
vpxor %xmm5, %xmm15, %xmm5
vpslld $ 12, %xmm6, %xmm15
vpsrld $20, %xmm6, %xmm6
vpxor %xmm6, %xmm15, %xmm6
vpxor %xmm7, %xmm8, %xmm7
vpxor %xmm4, %xmm9, %xmm4
vpslld $ 12, %xmm7, %xmm15
vpsrld $20, %xmm7, %xmm7
vpxor %xmm7, %xmm15, %xmm7
vpslld $ 12, %xmm4, %xmm15
vpsrld $20, %xmm4, %xmm4
vpxor %xmm4, %xmm15, %xmm4
vpaddd %xmm0, %xmm5, %xmm0
vpaddd %xmm1, %xmm6, %xmm1
vpxor 112(%rsp), %xmm0, %xmm15
vpxor %xmm12, %xmm1, %xmm12
vpaddd %xmm2, %xmm7, %xmm2
vpaddd %xmm3, %xmm4, %xmm3
vpxor %xmm13, %xmm2, %xmm13
vpxor %xmm14, %xmm3, %xmm14
vpshufb 96(%rsp), %xmm15, %xmm15
vpshufb 96(%rsp), %xmm12, %xmm12
vpaddd %xmm10, %xmm15, %xmm10
vpaddd %xmm11, %xmm12, %xmm11
vpshufb 96(%rsp), %xmm13, %xmm13
vpshufb 96(%rsp), %xmm14, %xmm14
vpaddd %xmm8, %xmm13, %xmm8
vpaddd %xmm9, %xmm14, %xmm9
vmovdqa %xmm15, 112(%rsp)
vpxor %xmm5, %xmm10, %xmm5
vpxor %xmm6, %xmm11, %xmm6
vpslld $ 7, %xmm5, %xmm15
vpsrld $25, %xmm5, %xmm5
vpxor %xmm5, %xmm15, %xmm5
vpslld $ 7, %xmm6, %xmm15
vpsrld $25, %xmm6, %xmm6
vpxor %xmm6, %xmm15, %xmm6
vpxor %xmm7, %xmm8, %xmm7
vpxor %xmm4, %xmm9, %xmm4
vpslld $ 7, %xmm7, %xmm15
vpsrld $25, %xmm7, %xmm7
vpxor %xmm7, %xmm15, %xmm7
vpslld $ 7, %xmm4, %xmm15
vpsrld $25, %xmm4, %xmm4
vpxor %xmm4, %xmm15, %xmm4
vmovdqa 112(%rsp), %xmm15
subq $2, %rax
jnz chacha_blocks_avx_mainloop1
vpaddd 128(%rsp), %xmm0, %xmm0
vpaddd 144(%rsp), %xmm1, %xmm1
vpaddd 160(%rsp), %xmm2, %xmm2
vpaddd 176(%rsp), %xmm3, %xmm3
vpaddd 192(%rsp), %xmm4, %xmm4
vpaddd 208(%rsp), %xmm5, %xmm5
vpaddd 224(%rsp), %xmm6, %xmm6
vpaddd 240(%rsp), %xmm7, %xmm7
vpaddd 256(%rsp), %xmm8, %xmm8
vpaddd 272(%rsp), %xmm9, %xmm9
vpaddd 288(%rsp), %xmm10, %xmm10
vpaddd 304(%rsp), %xmm11, %xmm11
vpaddd 320(%rsp), %xmm12, %xmm12
vpaddd 336(%rsp), %xmm13, %xmm13
vpaddd 352(%rsp), %xmm14, %xmm14
vpaddd 368(%rsp), %xmm15, %xmm15
vmovdqa %xmm8, 384(%rsp)
vmovdqa %xmm9, 400(%rsp)
vmovdqa %xmm10, 416(%rsp)
vmovdqa %xmm11, 432(%rsp)
vmovdqa %xmm12, 448(%rsp)
vmovdqa %xmm13, 464(%rsp)
vmovdqa %xmm14, 480(%rsp)
vmovdqa %xmm15, 496(%rsp)
vpunpckldq %xmm1, %xmm0, %xmm8
vpunpckldq %xmm3, %xmm2, %xmm9
vpunpckhdq %xmm1, %xmm0, %xmm12
vpunpckhdq %xmm3, %xmm2, %xmm13
vpunpckldq %xmm5, %xmm4, %xmm10
vpunpckldq %xmm7, %xmm6, %xmm11
vpunpckhdq %xmm5, %xmm4, %xmm14
vpunpckhdq %xmm7, %xmm6, %xmm15
vpunpcklqdq %xmm9, %xmm8, %xmm0
vpunpcklqdq %xmm11, %xmm10, %xmm1
vpunpckhqdq %xmm9, %xmm8, %xmm2
vpunpckhqdq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm13, %xmm12, %xmm4
vpunpcklqdq %xmm15, %xmm14, %xmm5
vpunpckhqdq %xmm13, %xmm12, %xmm6
vpunpckhqdq %xmm15, %xmm14, %xmm7
andq %rsi, %rsi
jz chacha_blocks_avx_noinput1
vpxor 0(%rsi), %xmm0, %xmm0
vpxor 16(%rsi), %xmm1, %xmm1
vpxor 64(%rsi), %xmm2, %xmm2
vpxor 80(%rsi), %xmm3, %xmm3
vpxor 128(%rsi), %xmm4, %xmm4
vpxor 144(%rsi), %xmm5, %xmm5
vpxor 192(%rsi), %xmm6, %xmm6
vpxor 208(%rsi), %xmm7, %xmm7
vmovdqu %xmm0, 0(%rdx)
vmovdqu %xmm1, 16(%rdx)
vmovdqu %xmm2, 64(%rdx)
vmovdqu %xmm3, 80(%rdx)
vmovdqu %xmm4, 128(%rdx)
vmovdqu %xmm5, 144(%rdx)
vmovdqu %xmm6, 192(%rdx)
vmovdqu %xmm7, 208(%rdx)
vmovdqa 384(%rsp), %xmm0
vmovdqa 400(%rsp), %xmm1
vmovdqa 416(%rsp), %xmm2
vmovdqa 432(%rsp), %xmm3
vmovdqa 448(%rsp), %xmm4
vmovdqa 464(%rsp), %xmm5
vmovdqa 480(%rsp), %xmm6
vmovdqa 496(%rsp), %xmm7
vpunpckldq %xmm1, %xmm0, %xmm8
vpunpckldq %xmm3, %xmm2, %xmm9
vpunpckhdq %xmm1, %xmm0, %xmm12
vpunpckhdq %xmm3, %xmm2, %xmm13
vpunpckldq %xmm5, %xmm4, %xmm10
vpunpckldq %xmm7, %xmm6, %xmm11
vpunpckhdq %xmm5, %xmm4, %xmm14
vpunpckhdq %xmm7, %xmm6, %xmm15
vpunpcklqdq %xmm9, %xmm8, %xmm0
vpunpcklqdq %xmm11, %xmm10, %xmm1
vpunpckhqdq %xmm9, %xmm8, %xmm2
vpunpckhqdq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm13, %xmm12, %xmm4
vpunpcklqdq %xmm15, %xmm14, %xmm5
vpunpckhqdq %xmm13, %xmm12, %xmm6
vpunpckhqdq %xmm15, %xmm14, %xmm7
vpxor 32(%rsi), %xmm0, %xmm0
vpxor 48(%rsi), %xmm1, %xmm1
vpxor 96(%rsi), %xmm2, %xmm2
vpxor 112(%rsi), %xmm3, %xmm3
vpxor 160(%rsi), %xmm4, %xmm4
vpxor 176(%rsi), %xmm5, %xmm5
vpxor 224(%rsi), %xmm6, %xmm6
vpxor 240(%rsi), %xmm7, %xmm7
vmovdqu %xmm0, 32(%rdx)
vmovdqu %xmm1, 48(%rdx)
vmovdqu %xmm2, 96(%rdx)
vmovdqu %xmm3, 112(%rdx)
vmovdqu %xmm4, 160(%rdx)
vmovdqu %xmm5, 176(%rdx)
vmovdqu %xmm6, 224(%rdx)
vmovdqu %xmm7, 240(%rdx)
addq $256, %rsi
jmp chacha_blocks_avx_mainloop_cont
chacha_blocks_avx_noinput1:
vmovdqu %xmm0, 0(%rdx)
vmovdqu %xmm1, 16(%rdx)
vmovdqu %xmm2, 64(%rdx)
vmovdqu %xmm3, 80(%rdx)
vmovdqu %xmm4, 128(%rdx)
vmovdqu %xmm5, 144(%rdx)
vmovdqu %xmm6, 192(%rdx)
vmovdqu %xmm7, 208(%rdx)
vmovdqa 384(%rsp), %xmm0
vmovdqa 400(%rsp), %xmm1
vmovdqa 416(%rsp), %xmm2
vmovdqa 432(%rsp), %xmm3
vmovdqa 448(%rsp), %xmm4
vmovdqa 464(%rsp), %xmm5
vmovdqa 480(%rsp), %xmm6
vmovdqa 496(%rsp), %xmm7
vpunpckldq %xmm1, %xmm0, %xmm8
vpunpckldq %xmm3, %xmm2, %xmm9
vpunpckhdq %xmm1, %xmm0, %xmm12
vpunpckhdq %xmm3, %xmm2, %xmm13
vpunpckldq %xmm5, %xmm4, %xmm10
vpunpckldq %xmm7, %xmm6, %xmm11
vpunpckhdq %xmm5, %xmm4, %xmm14
vpunpckhdq %xmm7, %xmm6, %xmm15
vpunpcklqdq %xmm9, %xmm8, %xmm0
vpunpcklqdq %xmm11, %xmm10, %xmm1
vpunpckhqdq %xmm9, %xmm8, %xmm2
vpunpckhqdq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm13, %xmm12, %xmm4
vpunpcklqdq %xmm15, %xmm14, %xmm5
vpunpckhqdq %xmm13, %xmm12, %xmm6
vpunpckhqdq %xmm15, %xmm14, %xmm7
vmovdqu %xmm0, 32(%rdx)
vmovdqu %xmm1, 48(%rdx)
vmovdqu %xmm2, 96(%rdx)
vmovdqu %xmm3, 112(%rdx)
vmovdqu %xmm4, 160(%rdx)
vmovdqu %xmm5, 176(%rdx)
vmovdqu %xmm6, 224(%rdx)
vmovdqu %xmm7, 240(%rdx)
chacha_blocks_avx_mainloop_cont:
addq $256, %rdx
subq $256, %rcx
cmp $256, %rcx
jae chacha_blocks_avx_atleast256
vmovdqa 80(%rsp), %xmm6
vmovdqa 96(%rsp), %xmm7
vmovdqa 0(%rsp), %xmm8
vmovdqa 16(%rsp), %xmm9
vmovdqa 32(%rsp), %xmm10
vmovdqa 48(%rsp), %xmm11
movq $1, %r9
chacha_blocks_avx_below256:
vmovq %r9, %xmm5
andq %rcx, %rcx
jz chacha_blocks_avx_done
cmpq $64, %rcx
jae chacha_blocks_avx_above63
movq %rdx, %r9
andq %rsi, %rsi
jz chacha_blocks_avx_noinput2
movq %rcx, %r10
movq %rsp, %rdx
addq %r10, %rsi
addq %r10, %rdx
negq %r10
chacha_blocks_avx_copyinput:
movb (%rsi, %r10), %al
movb %al, (%rdx, %r10)
incq %r10
jnz chacha_blocks_avx_copyinput
movq %rsp, %rsi
chacha_blocks_avx_noinput2:
movq %rsp, %rdx
chacha_blocks_avx_above63:
vmovdqa %xmm8, %xmm0
vmovdqa %xmm9, %xmm1
vmovdqa %xmm10, %xmm2
vmovdqa %xmm11, %xmm3
movq 64(%rsp), %rax
chacha_blocks_avx_mainloop2:
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufd $0x93, %xmm0, %xmm0
vpaddd %xmm2, %xmm3, %xmm2
vpshufd $0x4e, %xmm3, %xmm3
vpxor %xmm1, %xmm2, %xmm1
vpshufd $0x39, %xmm2, %xmm2
vpslld $7, %xmm1, %xmm4
vpsrld $25, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufd $0x39, %xmm0, %xmm0
vpaddd %xmm2, %xmm3, %xmm2
vpshufd $0x4e, %xmm3, %xmm3
vpxor %xmm1, %xmm2, %xmm1
vpshufd $0x93, %xmm2, %xmm2
vpslld $7, %xmm1, %xmm4
vpsrld $25, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
subq $2, %rax
jnz chacha_blocks_avx_mainloop2
vpaddd %xmm0, %xmm8, %xmm0
vpaddd %xmm1, %xmm9, %xmm1
vpaddd %xmm2, %xmm10, %xmm2
vpaddd %xmm3, %xmm11, %xmm3
andq %rsi, %rsi
jz chacha_blocks_avx_noinput3
vpxor 0(%rsi), %xmm0, %xmm0
vpxor 16(%rsi), %xmm1, %xmm1
vpxor 32(%rsi), %xmm2, %xmm2
vpxor 48(%rsi), %xmm3, %xmm3
addq $64, %rsi
chacha_blocks_avx_noinput3:
vmovdqu %xmm0, 0(%rdx)
vmovdqu %xmm1, 16(%rdx)
vmovdqu %xmm2, 32(%rdx)
vmovdqu %xmm3, 48(%rdx)
vpaddq %xmm11, %xmm5, %xmm11
cmpq $64, %rcx
jbe chacha_blocks_avx_mainloop2_finishup
addq $64, %rdx
subq $64, %rcx
jmp chacha_blocks_avx_below256
chacha_blocks_avx_mainloop2_finishup:
cmpq $64, %rcx
je chacha_blocks_avx_done
addq %rcx, %r9
addq %rcx, %rdx
negq %rcx
chacha_blocks_avx_copyoutput:
movb (%rdx, %rcx), %al
movb %al, (%r9, %rcx)
incq %rcx
jnz chacha_blocks_avx_copyoutput
chacha_blocks_avx_done:
vmovdqu %xmm11, 32(%rdi)
movq %rbp, %rsp
popq %rbp
popq %rbx
ret
FN_END chacha_blocks_avx

GLOBAL_HIDDEN_FN hchacha_avx
hchacha_avx_local:
LOAD_VAR_PIC chacha_constants, %rax
vmovdqa 0(%rax), %xmm0
vmovdqa 16(%rax), %xmm6
vmovdqa 32(%rax), %xmm5
vmovdqu 0(%rdi), %xmm1
vmovdqu 16(%rdi), %xmm2
vmovdqu 0(%rsi), %xmm3
hchacha_mainloop_avx:
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm5, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $7, %xmm1, %xmm4
vpsrld $25, %xmm1, %xmm1
vpshufd $0x93, %xmm0, %xmm0
vpxor %xmm1, %xmm4, %xmm1
vpshufd $0x4e, %xmm3, %xmm3
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpshufd $0x39, %xmm2, %xmm2
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm5, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpshufd $0x39, %xmm0, %xmm0
vpslld $7, %xmm1, %xmm4
vpshufd $0x4e, %xmm3, %xmm3
vpsrld $25, %xmm1, %xmm1
vpshufd $0x93, %xmm2, %xmm2
vpxor %xmm1, %xmm4, %xmm1
subl $2, %ecx
jne hchacha_mainloop_avx
vmovdqu %xmm0, (%rdx)
vmovdqu %xmm3, 16(%rdx)
ret
FN_END hchacha_avx

GLOBAL_HIDDEN_FN_EXT chacha_avx, 6, 16
pushq %rbp
movq %rsp, %rbp
subq $64, %rsp
andq $~63, %rsp
vmovdqu 0(%rdi), %xmm0
vmovdqu 16(%rdi), %xmm1
vmovdqa %xmm0, 0(%rsp)
vmovdqa %xmm1, 16(%rsp)
xorq %rdi, %rdi
movq %rdi, 32(%rsp)
movq 0(%rsi), %rsi
movq %rsi, 40(%rsp)
movq %r9, 48(%rsp)
movq %rsp, %rdi
movq %rdx, %rsi
movq %rcx, %rdx
movq %r8, %rcx
call chacha_blocks_avx_local
vpxor %xmm0, %xmm0, %xmm0
vmovdqa %xmm0, 0(%rsp)
vmovdqa %xmm0, 16(%rsp)
vmovdqa %xmm0, 32(%rsp)
movq %rbp, %rsp
popq %rbp
ret
FN_END chacha_avx

GLOBAL_HIDDEN_FN_EXT xchacha_avx, 6, 16
pushq %rbp
pushq %rbx
movq %rsp, %rbp
subq $64, %rsp
andq $~63, %rsp
movq %rsp, %rbx
xorq %rax, %rax
movq %rax, 32(%rbx)
movq 16(%rsi), %rax
movq %rax, 40(%rbx)
movq %r9, 48(%rbx)
pushq %rdx
pushq %rcx
pushq %r8
movq %rbx, %rdx
movq %r9, %rcx
call hchacha_avx_local
movq %rbx, %rdi
popq %rcx
popq %rdx
popq %rsi
call chacha_blocks_avx_local
vpxor %xmm0, %xmm0, %xmm0
vmovdqa %xmm0, 0(%rbx)
vmovdqa %xmm0, 16(%rbx)
vmovdqa %xmm0, 32(%rbx)
movq %rbp, %rsp
popq %rbx
popq %rbp
ret
FN_END xchacha_avx
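
The register usage above is consistent with the following C prototypes, given here as a sketch in the style of floodyberry's chacha-opt; the typedef names are assumptions for illustration, not declarations taken from this commit:

#include <stddef.h>

typedef struct chacha_key_t { unsigned char b[32]; } chacha_key;
typedef struct chacha_iv_t { unsigned char b[8]; } chacha_iv;
typedef struct chacha_iv24_t { unsigned char b[24]; } chacha_iv24;

/* one-shot: in may be NULL to produce raw keystream */
void chacha_avx (const chacha_key *key, const chacha_iv *iv,
	const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds);

/* derives a 32-byte subkey from the key and a 16-byte input */
void hchacha_avx (const unsigned char key[32], const unsigned char iv[16],
	unsigned char out[32], size_t rounds);

/* hchacha on the first 16 iv bytes, then chacha keyed with the result and
 * the remaining 8 bytes as nonce */
void xchacha_avx (const chacha_key *key, const chacha_iv24 *iv,
	const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds);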

src/libcryptobox/chacha20/avx2.S (+1017 / -0)
File diff suppressed because it is too large


src/libcryptobox/chacha20/macro.S (+185 / -0)

@@ -0,0 +1,185 @@
#include "platform_config.h"

#define IS_MACH (defined(__MACH__))

#if !defined(HAVE_SLASHMACRO) && !defined(HAVE_DOLLARMACRO)
#error Unknown gnu as macro parameter convention! Run ./configure
#endif
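/* GNU as expands macro parameters spelled \name, while Apple's cctools
   assembler numbers them $0, $1, ...; the ASM_OP probes in the cryptobox
   CMakeLists detect which spelling the assembler accepts and define
   HAVE_SLASHMACRO or HAVE_DOLLARMACRO accordingly. */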

#if (IS_MACH)
.macro FN name
#if defined(HAVE_SLASHMACRO)
\name:
_\name:
#elif defined(HAVE_DOLLARMACRO)
$0:
_$0:
#endif
.endm

.macro FN_EXT name, args, xmmused
#if defined(HAVE_SLASHMACRO)
FN \name
#elif defined(HAVE_DOLLARMACRO)
FN $0
#endif
.endm

.macro FN_END name
.endm

.macro HIDDEN name
#if defined(HAVE_AS_PRIVATE_EXTERN)
#if defined(HAVE_SLASHMACRO)
.private_extern \name
.private_extern _\name
#elif defined(HAVE_DOLLARMACRO)
.private_extern $0
.private_extern _$0
#endif
#endif
.endm
#else
.macro FN name
\name:
_\name:
.endm

.macro FN_EXT name, args, xmmused
FN \name
.endm

.macro FN_END name
.size \name, .-\name
.size _\name, .-_\name
.type \name, @function
.type _\name, @function
.endm

.macro HIDDEN name
#if defined(HAVE_AS_HIDDEN)
.hidden \name
.hidden _\name
#endif
.endm

/* set NX for stack */
.section .note.GNU-stack,"",@progbits
#endif
#if (IS_MACH)
.macro SECTION_TEXT
.section __TEXT,__text,regular
.endm

.macro SECTION_RODATA
.section __TEXT,__text,regular
.endm
#else
/* put everything in the code segment to simplify things */
.macro SECTION_TEXT
.text
.endm

.macro SECTION_RODATA
.text
.endm
#endif

/* declare a global function */
.macro GLOBAL name
#if defined(HAVE_SLASHMACRO)
.globl \name
.globl _\name
#elif defined(HAVE_DOLLARMACRO)
.globl $0
.globl _$0
#endif
.endm

.macro FN_LOCAL_PREFIX name
#if defined(HAVE_SLASHMACRO)
FN LOCAL_PREFIX(\name)
#elif defined(HAVE_DOLLARMACRO)
FN LOCAL_PREFIX($0)
#endif
.endm

.macro FN_EXT_LOCAL_PREFIX name, args, xmmused
#if defined(HAVE_SLASHMACRO)
FN_EXT LOCAL_PREFIX(\name), \args, \xmmused
#elif defined(HAVE_DOLLARMACRO)
FN_EXT LOCAL_PREFIX($0), $1, $2
#endif
.endm

.macro FN_END_LOCAL_PREFIX name
#if defined(HAVE_SLASHMACRO)
FN_END LOCAL_PREFIX(\name)
#elif defined(HAVE_DOLLARMACRO)
FN_END LOCAL_PREFIX($0)
#endif
.endm

.macro GLOBAL_LOCAL_PREFIX name
#if defined(HAVE_SLASHMACRO)
GLOBAL LOCAL_PREFIX(\name)
HIDDEN LOCAL_PREFIX(\name)
#elif defined(HAVE_DOLLARMACRO)
GLOBAL LOCAL_PREFIX($0)
HIDDEN LOCAL_PREFIX($0)
#endif
.endm

.macro GLOBAL_HIDDEN_FN name
#if defined(HAVE_SLASHMACRO)
GLOBAL \name
HIDDEN \name
FN \name
#elif defined(HAVE_DOLLARMACRO)
GLOBAL $0
HIDDEN $0
FN $0
#endif
.endm

.macro GLOBAL_HIDDEN_FN_EXT name, args, xmmused
#if defined(HAVE_SLASHMACRO)
GLOBAL \name
HIDDEN \name
FN_EXT \name, \args, \xmmused
#elif defined(HAVE_DOLLARMACRO)
GLOBAL $0
HIDDEN $0
FN_EXT $0, $1, $2
#endif
.endm

/* pic support */
.macro LOAD_VAR_PIC var, reg
#if (IS_X86_32)
#if defined(HAVE_SLASHMACRO)
call 1f
1:
popl \reg
leal \var - 1b(\reg), \reg
#elif defined(HAVE_DOLLARMACRO)
call 1f
1:
popl $1
leal $0 - 1b($1), $1
#endif
#else
#if defined(HAVE_SLASHMACRO)
leaq \var(%rip), \reg
#elif defined(HAVE_DOLLARMACRO)
leaq $0(%rip), $1
#endif
#endif
.endm

SECTION_RODATA
.p2align 4,,15
chacha_constants:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */

src/libcryptobox/chacha20/sse2.S (+734 / -0)

@@ -0,0 +1,734 @@
#include "macro.S"

SECTION_TEXT
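/* Same state layout and calling convention as the AVX version above. SSE2
   lacks pshufb, so the rotate-by-16 uses pshuflw/pshufhw word swaps and the
   rotate-by-8 falls back to the pslld/psrld/pxor sequence. */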

GLOBAL_HIDDEN_FN chacha_blocks_sse2
chacha_blocks_sse2_local:
pushq %rbx
pushq %rbp
movq %rsp, %rbp
andq $~63, %rsp
subq $512, %rsp
movq $0x3320646e61707865, %rax
movq $0x6b20657479622d32, %r8
movd %rax, %xmm8
movd %r8, %xmm14
punpcklqdq %xmm14, %xmm8
movdqu 0(%rdi), %xmm9
movdqu 16(%rdi), %xmm10
movdqu 32(%rdi), %xmm11
movq 48(%rdi), %rax
movq $1, %r9
movdqa %xmm8, 0(%rsp)
movdqa %xmm9, 16(%rsp)
movdqa %xmm10, 32(%rsp)
movdqa %xmm11, 48(%rsp)
movq %rax, 64(%rsp)
cmpq $256, %rcx
jb chacha_blocks_sse2_below256
pshufd $0x00, %xmm8, %xmm0
pshufd $0x55, %xmm8, %xmm1
pshufd $0xaa, %xmm8, %xmm2
pshufd $0xff, %xmm8, %xmm3
movdqa %xmm0, 128(%rsp)
movdqa %xmm1, 144(%rsp)
movdqa %xmm2, 160(%rsp)
movdqa %xmm3, 176(%rsp)
pshufd $0x00, %xmm9, %xmm0
pshufd $0x55, %xmm9, %xmm1
pshufd $0xaa, %xmm9, %xmm2
pshufd $0xff, %xmm9, %xmm3
movdqa %xmm0, 192(%rsp)
movdqa %xmm1, 208(%rsp)
movdqa %xmm2, 224(%rsp)
movdqa %xmm3, 240(%rsp)
pshufd $0x00, %xmm10, %xmm0
pshufd $0x55, %xmm10, %xmm1
pshufd $0xaa, %xmm10, %xmm2
pshufd $0xff, %xmm10, %xmm3
movdqa %xmm0, 256(%rsp)
movdqa %xmm1, 272(%rsp)
movdqa %xmm2, 288(%rsp)
movdqa %xmm3, 304(%rsp)
pshufd $0xaa, %xmm11, %xmm0
pshufd $0xff, %xmm11, %xmm1
movdqa %xmm0, 352(%rsp)
movdqa %xmm1, 368(%rsp)
jmp chacha_blocks_sse2_atleast256
.p2align 6,,63
chacha_blocks_sse2_atleast256:
movq 48(%rsp), %rax
leaq 1(%rax), %r8
leaq 2(%rax), %r9
leaq 3(%rax), %r10
leaq 4(%rax), %rbx
movl %eax, 320(%rsp)
movl %r8d, 4+320(%rsp)
movl %r9d, 8+320(%rsp)
movl %r10d, 12+320(%rsp)
shrq $32, %rax
shrq $32, %r8
shrq $32, %r9
shrq $32, %r10
movl %eax, 336(%rsp)
movl %r8d, 4+336(%rsp)
movl %r9d, 8+336(%rsp)
movl %r10d, 12+336(%rsp)
movq %rbx, 48(%rsp)
movq 64(%rsp), %rax
movdqa 128(%rsp), %xmm0
movdqa 144(%rsp), %xmm1
movdqa 160(%rsp), %xmm2
movdqa 176(%rsp), %xmm3
movdqa 192(%rsp), %xmm4
movdqa 208(%rsp), %xmm5
movdqa 224(%rsp), %xmm6
movdqa 240(%rsp), %xmm7
movdqa 256(%rsp), %xmm8
movdqa 272(%rsp), %xmm9
movdqa 288(%rsp), %xmm10
movdqa 304(%rsp), %xmm11
movdqa 320(%rsp), %xmm12
movdqa 336(%rsp), %xmm13
movdqa 352(%rsp), %xmm14
movdqa 368(%rsp), %xmm15
chacha_blocks_sse2_mainloop1:
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
pxor %xmm0, %xmm12
pxor %xmm1, %xmm13
paddd %xmm6, %xmm2
paddd %xmm7, %xmm3
movdqa %xmm6, 96(%rsp)
pxor %xmm2, %xmm14
pxor %xmm3, %xmm15
pshuflw $0xb1,%xmm12,%xmm12
pshufhw $0xb1,%xmm12,%xmm12
pshuflw $0xb1,%xmm13,%xmm13
pshufhw $0xb1,%xmm13,%xmm13
pshuflw $0xb1,%xmm14,%xmm14
pshufhw $0xb1,%xmm14,%xmm14
pshuflw $0xb1,%xmm15,%xmm15
pshufhw $0xb1,%xmm15,%xmm15
paddd %xmm12, %xmm8
paddd %xmm13, %xmm9
paddd %xmm14, %xmm10
paddd %xmm15, %xmm11
movdqa %xmm12, 112(%rsp)
pxor %xmm8, %xmm4
pxor %xmm9, %xmm5
movdqa 96(%rsp), %xmm6
movdqa %xmm4, %xmm12
pslld $ 12, %xmm4
psrld $20, %xmm12
pxor %xmm12, %xmm4
movdqa %xmm5, %xmm12
pslld $ 12, %xmm5
psrld $20, %xmm12
pxor %xmm12, %xmm5
pxor %xmm10, %xmm6
pxor %xmm11, %xmm7
movdqa %xmm6, %xmm12
pslld $ 12, %xmm6
psrld $20, %xmm12
pxor %xmm12, %xmm6
movdqa %xmm7, %xmm12
pslld $ 12, %xmm7
psrld $20, %xmm12
pxor %xmm12, %xmm7
movdqa 112(%rsp), %xmm12
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
pxor %xmm0, %xmm12
pxor %xmm1, %xmm13
paddd %xmm6, %xmm2
paddd %xmm7, %xmm3
movdqa %xmm6, 96(%rsp)
pxor %xmm2, %xmm14
pxor %xmm3, %xmm15
movdqa %xmm12, %xmm6
pslld $ 8, %xmm12
psrld $24, %xmm6
pxor %xmm6, %xmm12
movdqa %xmm13, %xmm6
pslld $ 8, %xmm13
psrld $24, %xmm6
pxor %xmm6, %xmm13
paddd %xmm12, %xmm8
paddd %xmm13, %xmm9
movdqa %xmm14, %xmm6
pslld $ 8, %xmm14
psrld $24, %xmm6
pxor %xmm6, %xmm14
movdqa %xmm15, %xmm6
pslld $ 8, %xmm15
psrld $24, %xmm6
pxor %xmm6, %xmm15
paddd %xmm14, %xmm10
paddd %xmm15, %xmm11
movdqa %xmm12, 112(%rsp)
pxor %xmm8, %xmm4
pxor %xmm9, %xmm5
movdqa 96(%rsp), %xmm6
movdqa %xmm4, %xmm12
pslld $ 7, %xmm4
psrld $25, %xmm12
pxor %xmm12, %xmm4
movdqa %xmm5, %xmm12
pslld $ 7, %xmm5
psrld $25, %xmm12
pxor %xmm12, %xmm5
pxor %xmm10, %xmm6
pxor %xmm11, %xmm7
movdqa %xmm6, %xmm12
pslld $ 7, %xmm6
psrld $25, %xmm12
pxor %xmm12, %xmm6
movdqa %xmm7, %xmm12
pslld $ 7, %xmm7
psrld $25, %xmm12
pxor %xmm12, %xmm7
movdqa 112(%rsp), %xmm12
paddd %xmm5, %xmm0
paddd %xmm6, %xmm1
pxor %xmm0, %xmm15
pxor %xmm1, %xmm12
paddd %xmm7, %xmm2
paddd %xmm4, %xmm3
movdqa %xmm7, 96(%rsp)
pxor %xmm2, %xmm13
pxor %xmm3, %xmm14
pshuflw $0xb1,%xmm15,%xmm15
pshufhw $0xb1,%xmm15,%xmm15
pshuflw $0xb1,%xmm12,%xmm12
pshufhw $0xb1,%xmm12,%xmm12
pshuflw $0xb1,%xmm13,%xmm13
pshufhw $0xb1,%xmm13,%xmm13
pshuflw $0xb1,%xmm14,%xmm14
pshufhw $0xb1,%xmm14,%xmm14
paddd %xmm15, %xmm10
paddd %xmm12, %xmm11
paddd %xmm13, %xmm8
paddd %xmm14, %xmm9
movdqa %xmm15, 112(%rsp)
pxor %xmm10, %xmm5
pxor %xmm11, %xmm6
movdqa 96(%rsp), %xmm7
movdqa %xmm5, %xmm15
pslld $ 12, %xmm5
psrld $20, %xmm15
pxor %xmm15, %xmm5
movdqa %xmm6, %xmm15
pslld $ 12, %xmm6
psrld $20, %xmm15
pxor %xmm15, %xmm6
pxor %xmm8, %xmm7
pxor %xmm9, %xmm4
movdqa %xmm7, %xmm15
pslld $ 12, %xmm7
psrld $20, %xmm15
pxor %xmm15, %xmm7
movdqa %xmm4, %xmm15
pslld $ 12, %xmm4
psrld $20, %xmm15
pxor %xmm15, %xmm4
movdqa 112(%rsp), %xmm15
paddd %xmm5, %xmm0
paddd %xmm6, %xmm1
pxor %xmm0, %xmm15
pxor %xmm1, %xmm12
paddd %xmm7, %xmm2
paddd %xmm4, %xmm3
movdqa %xmm7, 96(%rsp)
pxor %xmm2, %xmm13
pxor %xmm3, %xmm14
movdqa %xmm15, %xmm7
pslld $ 8, %xmm15
psrld $24, %xmm7
pxor %xmm7, %xmm15
movdqa %xmm12, %xmm7
pslld $ 8, %xmm12
psrld $24, %xmm7
pxor %xmm7, %xmm12
paddd %xmm15, %xmm10
paddd %xmm12, %xmm11
movdqa %xmm13, %xmm7
pslld $ 8, %xmm13
psrld $24, %xmm7
pxor %xmm7, %xmm13
movdqa %xmm14, %xmm7
pslld $ 8, %xmm14
psrld $24, %xmm7
pxor %xmm7, %xmm14
paddd %xmm13, %xmm8
paddd %xmm14, %xmm9
movdqa %xmm15, 112(%rsp)
pxor %xmm10, %xmm5
pxor %xmm11, %xmm6
movdqa 96(%rsp), %xmm7
movdqa %xmm5, %xmm15
pslld $ 7, %xmm5
psrld $25, %xmm15
pxor %xmm15, %xmm5
movdqa %xmm6, %xmm15
pslld $ 7, %xmm6
psrld $25, %xmm15
pxor %xmm15, %xmm6
pxor %xmm8, %xmm7
pxor %xmm9, %xmm4
movdqa %xmm7, %xmm15
pslld $ 7, %xmm7
psrld $25, %xmm15
pxor %xmm15, %xmm7
movdqa %xmm4, %xmm15
pslld $ 7, %xmm4
psrld $25, %xmm15
pxor %xmm15, %xmm4
movdqa 112(%rsp), %xmm15
subq $2, %rax
jnz chacha_blocks_sse2_mainloop1
paddd 128(%rsp), %xmm0
paddd 144(%rsp), %xmm1
paddd 160(%rsp), %xmm2
paddd 176(%rsp), %xmm3
paddd 192(%rsp), %xmm4
paddd 208(%rsp), %xmm5
paddd 224(%rsp), %xmm6
paddd 240(%rsp), %xmm7
paddd 256(%rsp), %xmm8
paddd 272(%rsp), %xmm9
paddd 288(%rsp), %xmm10
paddd 304(%rsp), %xmm11
paddd 320(%rsp), %xmm12
paddd 336(%rsp), %xmm13
paddd 352(%rsp), %xmm14
paddd 368(%rsp), %xmm15
movdqa %xmm8, 384(%rsp)
movdqa %xmm9, 400(%rsp)
movdqa %xmm10, 416(%rsp)
movdqa %xmm11, 432(%rsp)
movdqa %xmm12, 448(%rsp)
movdqa %xmm13, 464(%rsp)
movdqa %xmm14, 480(%rsp)
movdqa %xmm15, 496(%rsp)
movdqa %xmm0, %xmm8
movdqa %xmm2, %xmm9
movdqa %xmm4, %xmm10
movdqa %xmm6, %xmm11
punpckhdq %xmm1, %xmm0
punpckhdq %xmm3, %xmm2
punpckhdq %xmm5, %xmm4
punpckhdq %xmm7, %xmm6
punpckldq %xmm1, %xmm8
punpckldq %xmm3, %xmm9
punpckldq %xmm5, %xmm10
punpckldq %xmm7, %xmm11
movdqa %xmm0, %xmm1
movdqa %xmm4, %xmm3
movdqa %xmm8, %xmm5
movdqa %xmm10, %xmm7
punpckhqdq %xmm2, %xmm0
punpckhqdq %xmm6, %xmm4
punpckhqdq %xmm9, %xmm8
punpckhqdq %xmm11, %xmm10
punpcklqdq %xmm2, %xmm1
punpcklqdq %xmm6, %xmm3
punpcklqdq %xmm9, %xmm5
punpcklqdq %xmm11, %xmm7
andq %rsi, %rsi
jz chacha_blocks_sse2_noinput1
movdqu 0(%rsi), %xmm2
movdqu 16(%rsi), %xmm6
movdqu 64(%rsi), %xmm9
movdqu 80(%rsi), %xmm11
movdqu 128(%rsi), %xmm12
movdqu 144(%rsi), %xmm13
movdqu 192(%rsi), %xmm14
movdqu 208(%rsi), %xmm15
pxor %xmm2, %xmm5
pxor %xmm6, %xmm7
pxor %xmm9, %xmm8
pxor %xmm11, %xmm10
pxor %xmm12, %xmm1
pxor %xmm13, %xmm3
pxor %xmm14, %xmm0
pxor %xmm15, %xmm4
movdqu %xmm5, 0(%rdx)
movdqu %xmm7, 16(%rdx)
movdqu %xmm8, 64(%rdx)
movdqu %xmm10, 80(%rdx)
movdqu %xmm1, 128(%rdx)
movdqu %xmm3, 144(%rdx)
movdqu %xmm0, 192(%rdx)
movdqu %xmm4, 208(%rdx)
movdqa 384(%rsp), %xmm0
movdqa 400(%rsp), %xmm1
movdqa 416(%rsp), %xmm2
movdqa 432(%rsp), %xmm3
movdqa 448(%rsp), %xmm4
movdqa 464(%rsp), %xmm5
movdqa 480(%rsp), %xmm6
movdqa 496(%rsp), %xmm7
movdqa %xmm0, %xmm8
movdqa %xmm2, %xmm9
movdqa %xmm4, %xmm10
movdqa %xmm6, %xmm11
punpckldq %xmm1, %xmm8
punpckldq %xmm3, %xmm9
punpckhdq %xmm1, %xmm0
punpckhdq %xmm3, %xmm2
punpckldq %xmm5, %xmm10
punpckldq %xmm7, %xmm11
punpckhdq %xmm5, %xmm4
punpckhdq %xmm7, %xmm6
movdqa %xmm8, %xmm1
movdqa %xmm0, %xmm3
movdqa %xmm10, %xmm5
movdqa %xmm4, %xmm7
punpcklqdq %xmm9, %xmm1
punpcklqdq %xmm11, %xmm5
punpckhqdq %xmm9, %xmm8
punpckhqdq %xmm11, %xmm10
punpcklqdq %xmm2, %xmm3
punpcklqdq %xmm6, %xmm7
punpckhqdq %xmm2, %xmm0
punpckhqdq %xmm6, %xmm4
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm6
movdqu 96(%rsi), %xmm9
movdqu 112(%rsi), %xmm11
movdqu 160(%rsi), %xmm12
movdqu 176(%rsi), %xmm13
movdqu 224(%rsi), %xmm14
movdqu 240(%rsi), %xmm15
pxor %xmm2, %xmm1
pxor %xmm6, %xmm5
pxor %xmm9, %xmm8
pxor %xmm11, %xmm10
pxor %xmm12, %xmm3
pxor %xmm13, %xmm7
pxor %xmm14, %xmm0
pxor %xmm15, %xmm4
movdqu %xmm1, 32(%rdx)
movdqu %xmm5, 48(%rdx)
movdqu %xmm8, 96(%rdx)
movdqu %xmm10, 112(%rdx)
movdqu %xmm3, 160(%rdx)
movdqu %xmm7, 176(%rdx)
movdqu %xmm0, 224(%rdx)
movdqu %xmm4, 240(%rdx)
addq $256, %rsi
jmp chacha_blocks_sse2_mainloop_cont
chacha_blocks_sse2_noinput1:
movdqu %xmm5, 0(%rdx)
movdqu %xmm7, 16(%rdx)
movdqu %xmm8, 64(%rdx)
movdqu %xmm10, 80(%rdx)
movdqu %xmm1, 128(%rdx)
movdqu %xmm3, 144(%rdx)
movdqu %xmm0, 192(%rdx)
movdqu %xmm4, 208(%rdx)
movdqa 384(%rsp), %xmm0
movdqa 400(%rsp), %xmm1
movdqa 416(%rsp), %xmm2
movdqa 432(%rsp), %xmm3
movdqa 448(%rsp), %xmm4
movdqa 464(%rsp), %xmm5
movdqa 480(%rsp), %xmm6
movdqa 496(%rsp), %xmm7
movdqa %xmm0, %xmm8
movdqa %xmm2, %xmm9
movdqa %xmm4, %xmm10
movdqa %xmm6, %xmm11
punpckldq %xmm1, %xmm8
punpckldq %xmm3, %xmm9
punpckhdq %xmm1, %xmm0
punpckhdq %xmm3, %xmm2
punpckldq %xmm5, %xmm10
punpckldq %xmm7, %xmm11
punpckhdq %xmm5, %xmm4
punpckhdq %xmm7, %xmm6
movdqa %xmm8, %xmm1
movdqa %xmm0, %xmm3
movdqa %xmm10, %xmm5
movdqa %xmm4, %xmm7
punpcklqdq %xmm9, %xmm1
punpcklqdq %xmm11, %xmm5
punpckhqdq %xmm9, %xmm8
punpckhqdq %xmm11, %xmm10
punpcklqdq %xmm2, %xmm3
punpcklqdq %xmm6, %xmm7
punpckhqdq %xmm2, %xmm0
punpckhqdq %xmm6, %xmm4
movdqu %xmm1, 32(%rdx)
movdqu %xmm5, 48(%rdx)
movdqu %xmm8, 96(%rdx)
movdqu %xmm10, 112(%rdx)
movdqu %xmm3, 160(%rdx)
movdqu %xmm7, 176(%rdx)
movdqu %xmm0, 224(%rdx)
movdqu %xmm4, 240(%rdx)
chacha_blocks_sse2_mainloop_cont:
addq $256, %rdx
subq $256, %rcx
cmp $256, %rcx
jae chacha_blocks_sse2_atleast256
movdqa 0(%rsp), %xmm8
movdqa 16(%rsp), %xmm9
movdqa 32(%rsp), %xmm10
movdqa 48(%rsp), %xmm11
movq $1, %r9
chacha_blocks_sse2_below256:
movq %r9, %xmm5
andq %rcx, %rcx
jz chacha_blocks_sse2_done
cmpq $64, %rcx
jae chacha_blocks_sse2_above63
movq %rdx, %r9
andq %rsi, %rsi
jz chacha_blocks_sse2_noinput2
movq %rcx, %r10
movq %rsp, %rdx
addq %r10, %rsi
addq %r10, %rdx
negq %r10
chacha_blocks_sse2_copyinput:
movb (%rsi, %r10), %al
movb %al, (%rdx, %r10)
incq %r10
jnz chacha_blocks_sse2_copyinput
movq %rsp, %rsi
chacha_blocks_sse2_noinput2:
movq %rsp, %rdx
chacha_blocks_sse2_above63:
movdqa %xmm8, %xmm0
movdqa %xmm9, %xmm1
movdqa %xmm10, %xmm2
movdqa %xmm11, %xmm3
movq 64(%rsp), %rax
chacha_blocks_sse2_mainloop2:
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
pshuflw $0xb1,%xmm3,%xmm3
pshufhw $0xb1,%xmm3,%xmm3
paddd %xmm3, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm1,%xmm4
pslld $12, %xmm1
psrld $20, %xmm4
pxor %xmm4, %xmm1
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
movdqa %xmm3,%xmm4
pslld $8, %xmm3
psrld $24, %xmm4
pshufd $0x93,%xmm0,%xmm0
pxor %xmm4, %xmm3
paddd %xmm3, %xmm2
pshufd $0x4e,%xmm3,%xmm3
pxor %xmm2, %xmm1
pshufd $0x39,%xmm2,%xmm2
movdqa %xmm1,%xmm4
pslld $7, %xmm1
psrld $25, %xmm4
pxor %xmm4, %xmm1
subq $2, %rax
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
pshuflw $0xb1,%xmm3,%xmm3
pshufhw $0xb1,%xmm3,%xmm3
paddd %xmm3, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm1,%xmm4
pslld $12, %xmm1
psrld $20, %xmm4
pxor %xmm4, %xmm1
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
movdqa %xmm3,%xmm4
pslld $8, %xmm3
psrld $24, %xmm4
pshufd $0x39,%xmm0,%xmm0
pxor %xmm4, %xmm3
paddd %xmm3, %xmm2
pshufd $0x4e,%xmm3,%xmm3
pxor %xmm2, %xmm1
pshufd $0x93,%xmm2,%xmm2
movdqa %xmm1,%xmm4
pslld $7, %xmm1
psrld $25, %xmm4
pxor %xmm4, %xmm1
jnz chacha_blocks_sse2_mainloop2
paddd %xmm8, %xmm0
paddd %xmm9, %xmm1
paddd %xmm10, %xmm2
paddd %xmm11, %xmm3
andq %rsi, %rsi
jz chacha_blocks_sse2_noinput3
movdqu 0(%rsi), %xmm12
movdqu 16(%rsi), %xmm13
movdqu 32(%rsi), %xmm14
movdqu 48(%rsi), %xmm15
pxor %xmm12, %xmm0
pxor %xmm13, %xmm1
pxor %xmm14, %xmm2
pxor %xmm15, %xmm3
addq $64, %rsi
chacha_blocks_sse2_noinput3:
movdqu %xmm0, 0(%rdx)
movdqu %xmm1, 16(%rdx)
movdqu %xmm2, 32(%rdx)
movdqu %xmm3, 48(%rdx)
paddq %xmm5, %xmm11
cmpq $64, %rcx
jbe chacha_blocks_sse2_mainloop2_finishup
addq $64, %rdx
subq $64, %rcx
jmp chacha_blocks_sse2_below256
chacha_blocks_sse2_mainloop2_finishup:
cmpq $64, %rcx
je chacha_blocks_sse2_done
addq %rcx, %r9
addq %rcx, %rdx
negq %rcx
chacha_blocks_sse2_copyoutput:
movb (%rdx, %rcx), %al
movb %al, (%r9, %rcx)
incq %rcx
jnz chacha_blocks_sse2_copyoutput
chacha_blocks_sse2_done:
movdqu %xmm11, 32(%rdi)
movq %rbp, %rsp
popq %rbp
popq %rbx
ret
FN_END chacha_blocks_sse2

GLOBAL_HIDDEN_FN hchacha_sse2
hchacha_sse2_local:
movq $0x3320646e61707865, %rax
movq $0x6b20657479622d32, %r8
movd %rax, %xmm0
movd %r8, %xmm4
punpcklqdq %xmm4, %xmm0
movdqu 0(%rdi), %xmm1
movdqu 16(%rdi), %xmm2
movdqu 0(%rsi), %xmm3
hchacha_sse2_mainloop:
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
pshuflw $0xb1,%xmm3,%xmm3
pshufhw $0xb1,%xmm3,%xmm3
paddd %xmm3, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm1,%xmm4
pslld $12, %xmm1
psrld $20, %xmm4
pxor %xmm4, %xmm1
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
movdqa %xmm3,%xmm4
pslld $8, %xmm3
psrld $24, %xmm4
pshufd $0x93,%xmm0,%xmm0
pxor %xmm4, %xmm3
paddd %xmm3, %xmm2
pshufd $0x4e,%xmm3,%xmm3
pxor %xmm2, %xmm1
pshufd $0x39,%xmm2,%xmm2
movdqa %xmm1,%xmm4
pslld $7, %xmm1
psrld $25, %xmm4
pxor %xmm4, %xmm1
subq $2, %rcx
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
pshuflw $0xb1,%xmm3,%xmm3
pshufhw $0xb1,%xmm3,%xmm3
paddd %xmm3, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm1,%xmm4
pslld $12, %xmm1
psrld $20, %xmm4
pxor %xmm4, %xmm1
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
movdqa %xmm3,%xmm4
pslld $8, %xmm3
psrld $24, %xmm4
pshufd $0x39,%xmm0,%xmm0
pxor %xmm4, %xmm3
paddd %xmm3, %xmm2
pshufd $0x4e,%xmm3,%xmm3
pxor %xmm2, %xmm1
pshufd $0x93,%xmm2,%xmm2
movdqa %xmm1,%xmm4
pslld $7, %xmm1
psrld $25, %xmm4
pxor %xmm4, %xmm1
ja hchacha_sse2_mainloop
movdqu %xmm0, 0(%rdx)
movdqu %xmm3, 16(%rdx)
ret
FN_END hchacha_sse2

GLOBAL_HIDDEN_FN_EXT chacha_sse2, 6, 16
pushq %rbp
movq %rsp, %rbp
subq $64, %rsp
andq $~63, %rsp
movdqu 0(%rdi), %xmm0
movdqu 16(%rdi), %xmm1
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
xorq %rdi, %rdi
movq %rdi, 32(%rsp)
movq 0(%rsi), %rsi
movq %rsi, 40(%rsp)
movq %r9, 48(%rsp)
movq %rsp, %rdi
movq %rdx, %rsi
movq %rcx, %rdx
movq %r8, %rcx
call chacha_blocks_sse2_local
pxor %xmm0, %xmm0
movdqa %xmm0, 0(%rsp)
movdqa %xmm0, 16(%rsp)
movdqa %xmm0, 32(%rsp)
movq %rbp, %rsp
popq %rbp
ret
FN_END chacha_sse2

GLOBAL_HIDDEN_FN_EXT xchacha_sse2, 6, 16
pushq %rbp
pushq %rbx
movq %rsp, %rbp
subq $64, %rsp
andq $~63, %rsp
movq %rsp, %rbx
xorq %rax, %rax
movq %rax, 32(%rbx)
movq 16(%rsi), %rax
movq %rax, 40(%rbx)
movq %r9, 48(%rbx)
pushq %rdx
pushq %rcx
pushq %r8
movq %rbx, %rdx
movq %r9, %rcx
call hchacha_sse2_local
movq %rbx, %rdi
popq %rcx
popq %rdx
popq %rsi
call chacha_blocks_sse2_local
pxor %xmm0, %xmm0
movdqa %xmm0, 0(%rbx)
movdqa %xmm0, 16(%rbx)
movdqa %xmm0, 32(%rbx)
movq %rbp, %rsp
popq %rbx
popq %rbp
ret
FN_END xchacha_sse2

src/libcryptobox/cryptobox.h (+1 / -1)

@@ -61,7 +61,7 @@ void rspamd_cryptobox_keypair (rspamd_pk_t pk, rspamd_sk_t sk);
* @param sk local secret key
* @param sig output signature
*/
-void rspamd_cryptobox_encrypt_inplace (struct rspamd_encrypt_segment *segments,
+void rspamd_cryptobox_encrypt_inplace (guchar *data, gsize len,
gsize cnt, const rspamd_pk_t pk, const rspamd_sk_t sk, rspamd_sig_t sig);



src/libcryptobox/platform_config.h.in (+11 / -0)

@@ -0,0 +1,11 @@
#ifndef PLATFORM_H_CONFIG
#define PLATFORM_H_CONFIG

#define ARCH ${ARCH}
#cmakedefine HAVE_AVX2 1
#cmakedefine HAVE_AVX 1
#cmakedefine HAVE_SSE2 1
#cmakedefine HAVE_SLASHMACRO 1
#cmakedefine HAVE_DOLLARMACRO 1

#endif
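
These defines record what the build machine's toolchain could assemble; picking an implementation at run time still needs a CPU feature probe, since a binary built with an AVX2-capable assembler may run on older hardware. A minimal dispatch sketch under that assumption (cpu_supports_avx2() and the chacha_blocks_* prototypes are illustrative names, not part of this commit):

#include <stddef.h>
#include "platform_config.h"

/* hypothetical runtime probe and block functions */
extern int cpu_supports_avx2 (void);
extern void chacha_blocks_ref (void *st, const unsigned char *in,
	unsigned char *out, size_t bytes);
extern void chacha_blocks_avx2 (void *st, const unsigned char *in,
	unsigned char *out, size_t bytes);

typedef void (*chacha_blocks_fn) (void *, const unsigned char *,
	unsigned char *, size_t);

static chacha_blocks_fn
select_chacha_blocks (void)
{
#ifdef HAVE_AVX2
	if (cpu_supports_avx2 ()) {
		return chacha_blocks_avx2;
	}
#endif
	return chacha_blocks_ref;
}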
