@@ -20,7 +20,7 @@ IF(NOT RSPAMD_USER) | |||
SET(RSPAMD_GROUP "nobody") | |||
ENDIF(NOT RSPAMD_USER) | |||
# A merge left both the old (2.6.0) and new (2.8.0) minimum-version
# declarations in place; only one cmake_minimum_required() call belongs here,
# so keep the newer requirement.
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0 FATAL_ERROR)

SET_PROPERTY(GLOBAL PROPERTY ALLOW_DUPLICATE_CUSTOM_TARGETS 1)
############################# OPTIONS SECTION ############################################# |
@@ -0,0 +1,21 @@ | |||
# Probe whether the system assembler accepts a given instruction sequence.
#
# The snippet is written to <build>/asm.S and assembled via try_compile();
# linking is skipped by overriding CMAKE_ASM_LINK_EXECUTABLE with a no-op
# echo.  The TRUE/FALSE result is exported to the caller.
#
#   output_var  - name of the variable to set in the caller's scope
#   op          - assembler source fragment to test
#   description - human-readable name used in the status messages
function(asm_op output_var op description)
	SET(asm_code "
	${op}
	")
	file(WRITE "${CMAKE_BINARY_DIR}/asm.S" "${asm_code}")
	try_compile(HAVE_OP
		"${CMAKE_BINARY_DIR}"
		"${CMAKE_BINARY_DIR}/asm.S"
		CMAKE_FLAGS "-DCMAKE_ASM_LINK_EXECUTABLE='echo not linking now...'")
	# Clean up the probe source.  The previous (commented-out) cleanup used
	# "asm.s" with the wrong case and could never match the "asm.S" written
	# above on case-sensitive filesystems; use the correct name.
	file(REMOVE "${CMAKE_BINARY_DIR}/asm.S")
	if(HAVE_OP)
		MESSAGE(STATUS "Compilation of ${description} asm set is supported")
	else()
		MESSAGE(STATUS "Compilation of ${description} asm set is -NOT- supported")
	endif()
	set(${output_var} "${HAVE_OP}" PARENT_SCOPE)
endfunction()
@@ -1,8 +1,45 @@ | |||
INCLUDE(FindArch.cmake)
INCLUDE(AsmOp.cmake)

TARGET_ARCHITECTURE(ARCH)

# The portable C implementation is always built; optimized variants are
# appended below when the toolchain supports them.
SET(CHACHASRC chacha20/ref.c)

# For now we support only x86_64 architecture with optimizations
IF(${ARCH} STREQUAL "x86_64")
	ENABLE_LANGUAGE(ASM)
	ASM_OP(HAVE_AVX2 "vpaddq %ymm0, %ymm0, %ymm0" "avx2")
	ASM_OP(HAVE_AVX "vpaddq %xmm0, %xmm0, %xmm0" "avx")
	ASM_OP(HAVE_SSE2 "pmuludq %xmm0, %xmm0" "sse2")
	# Detect which .macro argument convention the assembler understands:
	# GNU as uses \arg, the Apple assembler historically used $0/$1/...
	ASM_OP(HAVE_SLASHMACRO "
.macro TEST1 op
\\op %eax, %eax
.endm
TEST1 xorl
" "slash macro convention")
	ASM_OP(HAVE_DOLLARMACRO "
.macro TEST1 op
$0 %eax, %eax
.endm
TEST1 xorl
" "dollar macro convention")
	CONFIGURE_FILE(platform_config.h.in platform_config.h)
	INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
ENDIF()

# HAVE_* are unset (hence false) on non-x86_64 targets.
IF(HAVE_AVX2)
	SET(CHACHASRC ${CHACHASRC} chacha20/avx2.S)
ENDIF(HAVE_AVX2)
IF(HAVE_AVX)
	SET(CHACHASRC ${CHACHASRC} chacha20/avx.S)
ENDIF(HAVE_AVX)
IF(HAVE_SSE2)
	SET(CHACHASRC ${CHACHASRC} chacha20/sse2.S)
ENDIF(HAVE_SSE2)

SET(LIBCRYPTOBOXSRC cryptobox.c)

# The diff retained two ADD_LIBRARY calls for the same target; declaring a
# library target twice is a hard configure error.  Keep only the variant
# that links the chacha20 sources in.
ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC} ${CHACHASRC})

IF(NOT DEBIAN_BUILD)
	SET_TARGET_PROPERTIES(rspamd-cryptobox PROPERTIES VERSION ${RSPAMD_VERSION})
ENDIF(NOT DEBIAN_BUILD)
@@ -0,0 +1,127 @@ | |||
# C snippet used purely as a compile-time architecture probe.  Every branch
# of the #if ladder ends in an #error whose message embeds the architecture
# name ("cmake_ARCH <arch>"); target_architecture() scrapes that name out of
# the compiler's error output, which works even when cross-compiling since
# nothing is ever linked or run.  The doubled backslashes become single '\'
# line continuations once CMake writes the string out to arch.c.
set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
")
# Detect the target CPU architecture and return its name via ${output_var}.
#
# Set ppc_support to TRUE before including this file or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple.
#
# On OS X with CMAKE_OSX_ARCHITECTURES set, the result may be a ;-list of
# architectures (universal builds); otherwise it is a single name parsed out
# of the compiler's #error output for archdetect_c_code above.
function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values
# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()
# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()
if(osx_arch_i386)
list(APPEND ARCH i386)
endif()
if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()
if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
enable_language(C)
# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because
# file is not a valid C program, and obviously because of the presence of the
# #error preprocessor directives... but by exploiting the preprocessor in this
# way, we can detect the correct target architecture even when cross-compiling,
# since the program itself never needs to be run (only the compiler/preprocessor)
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)
# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()
# Export the result to the caller's scope.
set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
@@ -0,0 +1,613 @@ | |||
#include "macro.S" | |||
SECTION_TEXT | |||
/*
 * chacha_blocks_avx(state, in, out, bytes)
 *   rdi = chacha state: 48 bytes of state words are loaded from 0..47 and
 *         the qword at 48(%rdi) is used as the round count (consumed two
 *         rounds per loop iteration).  NOTE(review): exact C-side layout
 *         inferred from the loads below -- confirm against the caller.
 *   rsi = input stream, may be NULL (tested with andq); if NULL only the
 *         keystream is written.
 *   rdx = output buffer
 *   rcx = byte count
 * Fast path processes 4 blocks (256 bytes) per iteration with the state
 * transposed across xmm lanes; the tail path handles one 64-byte block at a
 * time, buffering partial blocks on the stack.  The updated counter/iv
 * words (xmm11) are stored back to 32(%rdi) on exit.
 */
GLOBAL_HIDDEN_FN chacha_blocks_avx
chacha_blocks_avx_local:
pushq %rbx
pushq %rbp
movq %rsp, %rbp
/* 64-byte align the stack and reserve 512 bytes of scratch */
andq $~63, %rsp
subq $512, %rsp
LOAD_VAR_PIC chacha_constants, %rax
/* xmm8 = "expand 32-byte k"; xmm6/xmm7 = pshufb masks (rotate 16 / rotate 8) */
vmovdqa 0(%rax), %xmm8
vmovdqa 16(%rax), %xmm6
vmovdqa 32(%rax), %xmm7
vmovdqu 0(%rdi), %xmm9
vmovdqu 16(%rdi), %xmm10
vmovdqu 32(%rdi), %xmm11
movq 48(%rdi), %rax
movq $1, %r9
/* stash state rows and shuffle masks on the stack for the tail path */
vmovdqa %xmm8, 0(%rsp)
vmovdqa %xmm9, 16(%rsp)
vmovdqa %xmm10, 32(%rsp)
vmovdqa %xmm11, 48(%rsp)
vmovdqa %xmm6, 80(%rsp)
vmovdqa %xmm7, 96(%rsp)
movq %rax, 64(%rsp)
cmpq $256, %rcx
jb chacha_blocks_avx_below256
/* >= 256 bytes: broadcast each state word across a full xmm register so
 * four blocks can be computed in parallel (one block per 32-bit lane) */
vpshufd $0x00, %xmm8, %xmm0
vpshufd $0x55, %xmm8, %xmm1
vpshufd $0xaa, %xmm8, %xmm2
vpshufd $0xff, %xmm8, %xmm3
vmovdqa %xmm0, 128(%rsp)
vmovdqa %xmm1, 144(%rsp)
vmovdqa %xmm2, 160(%rsp)
vmovdqa %xmm3, 176(%rsp)
vpshufd $0x00, %xmm9, %xmm0
vpshufd $0x55, %xmm9, %xmm1
vpshufd $0xaa, %xmm9, %xmm2
vpshufd $0xff, %xmm9, %xmm3
vmovdqa %xmm0, 192(%rsp)
vmovdqa %xmm1, 208(%rsp)
vmovdqa %xmm2, 224(%rsp)
vmovdqa %xmm3, 240(%rsp)
vpshufd $0x00, %xmm10, %xmm0
vpshufd $0x55, %xmm10, %xmm1
vpshufd $0xaa, %xmm10, %xmm2
vpshufd $0xff, %xmm10, %xmm3
vmovdqa %xmm0, 256(%rsp)
vmovdqa %xmm1, 272(%rsp)
vmovdqa %xmm2, 288(%rsp)
vmovdqa %xmm3, 304(%rsp)
vpshufd $0xaa, %xmm11, %xmm0
vpshufd $0xff, %xmm11, %xmm1
vmovdqa %xmm0, 352(%rsp)
vmovdqa %xmm1, 368(%rsp)
jmp chacha_blocks_avx_atleast256
.p2align 6,,63
nop
nop
nop
nop
nop
chacha_blocks_avx_atleast256:
/* materialize four consecutive 64-bit block counters (n, n+1, n+2, n+3)
 * split into low/high 32-bit halves at 320/336(%rsp); save n+4 back */
movq 48(%rsp), %rax
leaq 1(%rax), %r8
leaq 2(%rax), %r9
leaq 3(%rax), %r10
leaq 4(%rax), %rbx
movl %eax, 320(%rsp)
movl %r8d, 4+320(%rsp)
movl %r9d, 8+320(%rsp)
movl %r10d, 12+320(%rsp)
shrq $32, %rax
shrq $32, %r8
shrq $32, %r9
shrq $32, %r10
movl %eax, 336(%rsp)
movl %r8d, 4+336(%rsp)
movl %r9d, 8+336(%rsp)
movl %r10d, 12+336(%rsp)
movq %rbx, 48(%rsp)
movq 64(%rsp), %rax
/* load the 16 transposed state words into xmm0..xmm15 */
vmovdqa 128(%rsp), %xmm0
vmovdqa 144(%rsp), %xmm1
vmovdqa 160(%rsp), %xmm2
vmovdqa 176(%rsp), %xmm3
vmovdqa 192(%rsp), %xmm4
vmovdqa 208(%rsp), %xmm5
vmovdqa 224(%rsp), %xmm6
vmovdqa 240(%rsp), %xmm7
vmovdqa 256(%rsp), %xmm8
vmovdqa 272(%rsp), %xmm9
vmovdqa 288(%rsp), %xmm10
vmovdqa 304(%rsp), %xmm11
vmovdqa 320(%rsp), %xmm12
vmovdqa 336(%rsp), %xmm13
vmovdqa 352(%rsp), %xmm14
vmovdqa 368(%rsp), %xmm15
chacha_blocks_avx_mainloop1:
/* two chacha rounds (column + diagonal) per iteration; all 16 state words
 * live in registers, with 112(%rsp) used to spill one temp */
vpaddd %xmm0, %xmm4, %xmm0
vpaddd %xmm1, %xmm5, %xmm1
vpxor %xmm12, %xmm0, %xmm12
vpxor %xmm13, %xmm1, %xmm13
vpaddd %xmm2, %xmm6, %xmm2
vpaddd %xmm3, %xmm7, %xmm3
vpxor %xmm14, %xmm2, %xmm14
vpxor %xmm15, %xmm3, %xmm15
vpshufb 80(%rsp), %xmm12, %xmm12
vpshufb 80(%rsp), %xmm13, %xmm13
vpaddd %xmm8, %xmm12, %xmm8
vpaddd %xmm9, %xmm13, %xmm9
vpshufb 80(%rsp), %xmm14, %xmm14
vpshufb 80(%rsp), %xmm15, %xmm15
vpaddd %xmm10, %xmm14, %xmm10
vpaddd %xmm11, %xmm15, %xmm11
vmovdqa %xmm12, 112(%rsp)
vpxor %xmm4, %xmm8, %xmm4
vpxor %xmm5, %xmm9, %xmm5
/* rotate left by 12: shift-left 12 xor shift-right 20 */
vpslld $ 12, %xmm4, %xmm12
vpsrld $20, %xmm4, %xmm4
vpxor %xmm4, %xmm12, %xmm4
vpslld $ 12, %xmm5, %xmm12
vpsrld $20, %xmm5, %xmm5
vpxor %xmm5, %xmm12, %xmm5
vpxor %xmm6, %xmm10, %xmm6
vpxor %xmm7, %xmm11, %xmm7
vpslld $ 12, %xmm6, %xmm12
vpsrld $20, %xmm6, %xmm6
vpxor %xmm6, %xmm12, %xmm6
vpslld $ 12, %xmm7, %xmm12
vpsrld $20, %xmm7, %xmm7
vpxor %xmm7, %xmm12, %xmm7
vpaddd %xmm0, %xmm4, %xmm0
vpaddd %xmm1, %xmm5, %xmm1
vpxor 112(%rsp), %xmm0, %xmm12
vpxor %xmm13, %xmm1, %xmm13
vpaddd %xmm2, %xmm6, %xmm2
vpaddd %xmm3, %xmm7, %xmm3
vpxor %xmm14, %xmm2, %xmm14
vpxor %xmm15, %xmm3, %xmm15
vpshufb 96(%rsp), %xmm12, %xmm12
vpshufb 96(%rsp), %xmm13, %xmm13
vpaddd %xmm8, %xmm12, %xmm8
vpaddd %xmm9, %xmm13, %xmm9
vpshufb 96(%rsp), %xmm14, %xmm14
vpshufb 96(%rsp), %xmm15, %xmm15
vpaddd %xmm10, %xmm14, %xmm10
vpaddd %xmm11, %xmm15, %xmm11
vmovdqa %xmm12, 112(%rsp)
vpxor %xmm4, %xmm8, %xmm4
vpxor %xmm5, %xmm9, %xmm5
/* rotate left by 7 */
vpslld $ 7, %xmm4, %xmm12
vpsrld $25, %xmm4, %xmm4
vpxor %xmm4, %xmm12, %xmm4
vpslld $ 7, %xmm5, %xmm12
vpsrld $25, %xmm5, %xmm5
vpxor %xmm5, %xmm12, %xmm5
vpxor %xmm6, %xmm10, %xmm6
vpxor %xmm7, %xmm11, %xmm7
vpslld $ 7, %xmm6, %xmm12
vpsrld $25, %xmm6, %xmm6
vpxor %xmm6, %xmm12, %xmm6
vpslld $ 7, %xmm7, %xmm12
vpsrld $25, %xmm7, %xmm7
vpxor %xmm7, %xmm12, %xmm7
/* diagonal round */
vpaddd %xmm0, %xmm5, %xmm0
vpaddd %xmm1, %xmm6, %xmm1
vpxor %xmm15, %xmm0, %xmm15
vpxor 112(%rsp), %xmm1, %xmm12
vpaddd %xmm2, %xmm7, %xmm2
vpaddd %xmm3, %xmm4, %xmm3
vpxor %xmm13, %xmm2, %xmm13
vpxor %xmm14, %xmm3, %xmm14
vpshufb 80(%rsp), %xmm15, %xmm15
vpshufb 80(%rsp), %xmm12, %xmm12
vpaddd %xmm10, %xmm15, %xmm10
vpaddd %xmm11, %xmm12, %xmm11
vpshufb 80(%rsp), %xmm13, %xmm13
vpshufb 80(%rsp), %xmm14, %xmm14
vpaddd %xmm8, %xmm13, %xmm8
vpaddd %xmm9, %xmm14, %xmm9
vmovdqa %xmm15, 112(%rsp)
vpxor %xmm5, %xmm10, %xmm5
vpxor %xmm6, %xmm11, %xmm6
vpslld $ 12, %xmm5, %xmm15
vpsrld $20, %xmm5, %xmm5
vpxor %xmm5, %xmm15, %xmm5
vpslld $ 12, %xmm6, %xmm15
vpsrld $20, %xmm6, %xmm6
vpxor %xmm6, %xmm15, %xmm6
vpxor %xmm7, %xmm8, %xmm7
vpxor %xmm4, %xmm9, %xmm4
vpslld $ 12, %xmm7, %xmm15
vpsrld $20, %xmm7, %xmm7
vpxor %xmm7, %xmm15, %xmm7
vpslld $ 12, %xmm4, %xmm15
vpsrld $20, %xmm4, %xmm4
vpxor %xmm4, %xmm15, %xmm4
vpaddd %xmm0, %xmm5, %xmm0
vpaddd %xmm1, %xmm6, %xmm1
vpxor 112(%rsp), %xmm0, %xmm15
vpxor %xmm12, %xmm1, %xmm12
vpaddd %xmm2, %xmm7, %xmm2
vpaddd %xmm3, %xmm4, %xmm3
vpxor %xmm13, %xmm2, %xmm13
vpxor %xmm14, %xmm3, %xmm14
vpshufb 96(%rsp), %xmm15, %xmm15
vpshufb 96(%rsp), %xmm12, %xmm12
vpaddd %xmm10, %xmm15, %xmm10
vpaddd %xmm11, %xmm12, %xmm11
vpshufb 96(%rsp), %xmm13, %xmm13
vpshufb 96(%rsp), %xmm14, %xmm14
vpaddd %xmm8, %xmm13, %xmm8
vpaddd %xmm9, %xmm14, %xmm9
vmovdqa %xmm15, 112(%rsp)
vpxor %xmm5, %xmm10, %xmm5
vpxor %xmm6, %xmm11, %xmm6
vpslld $ 7, %xmm5, %xmm15
vpsrld $25, %xmm5, %xmm5
vpxor %xmm5, %xmm15, %xmm5
vpslld $ 7, %xmm6, %xmm15
vpsrld $25, %xmm6, %xmm6
vpxor %xmm6, %xmm15, %xmm6
vpxor %xmm7, %xmm8, %xmm7
vpxor %xmm4, %xmm9, %xmm4
vpslld $ 7, %xmm7, %xmm15
vpsrld $25, %xmm7, %xmm7
vpxor %xmm7, %xmm15, %xmm7
vpslld $ 7, %xmm4, %xmm15
vpsrld $25, %xmm4, %xmm4
vpxor %xmm4, %xmm15, %xmm4
vmovdqa 112(%rsp), %xmm15
subq $2, %rax
jnz chacha_blocks_avx_mainloop1
/* add the saved input state back in (final chacha feed-forward) */
vpaddd 128(%rsp), %xmm0, %xmm0
vpaddd 144(%rsp), %xmm1, %xmm1
vpaddd 160(%rsp), %xmm2, %xmm2
vpaddd 176(%rsp), %xmm3, %xmm3
vpaddd 192(%rsp), %xmm4, %xmm4
vpaddd 208(%rsp), %xmm5, %xmm5
vpaddd 224(%rsp), %xmm6, %xmm6
vpaddd 240(%rsp), %xmm7, %xmm7
vpaddd 256(%rsp), %xmm8, %xmm8
vpaddd 272(%rsp), %xmm9, %xmm9
vpaddd 288(%rsp), %xmm10, %xmm10
vpaddd 304(%rsp), %xmm11, %xmm11
vpaddd 320(%rsp), %xmm12, %xmm12
vpaddd 336(%rsp), %xmm13, %xmm13
vpaddd 352(%rsp), %xmm14, %xmm14
vpaddd 368(%rsp), %xmm15, %xmm15
/* spill the second half while the first half is transposed and written */
vmovdqa %xmm8, 384(%rsp)
vmovdqa %xmm9, 400(%rsp)
vmovdqa %xmm10, 416(%rsp)
vmovdqa %xmm11, 432(%rsp)
vmovdqa %xmm12, 448(%rsp)
vmovdqa %xmm13, 464(%rsp)
vmovdqa %xmm14, 480(%rsp)
vmovdqa %xmm15, 496(%rsp)
/* 4x4 dword transpose: lane-major back to block-major order */
vpunpckldq %xmm1, %xmm0, %xmm8
vpunpckldq %xmm3, %xmm2, %xmm9
vpunpckhdq %xmm1, %xmm0, %xmm12
vpunpckhdq %xmm3, %xmm2, %xmm13
vpunpckldq %xmm5, %xmm4, %xmm10
vpunpckldq %xmm7, %xmm6, %xmm11
vpunpckhdq %xmm5, %xmm4, %xmm14
vpunpckhdq %xmm7, %xmm6, %xmm15
vpunpcklqdq %xmm9, %xmm8, %xmm0
vpunpcklqdq %xmm11, %xmm10, %xmm1
vpunpckhqdq %xmm9, %xmm8, %xmm2
vpunpckhqdq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm13, %xmm12, %xmm4
vpunpcklqdq %xmm15, %xmm14, %xmm5
vpunpckhqdq %xmm13, %xmm12, %xmm6
vpunpckhqdq %xmm15, %xmm14, %xmm7
/* rsi == NULL means "generate keystream only" */
andq %rsi, %rsi
jz chacha_blocks_avx_noinput1
vpxor 0(%rsi), %xmm0, %xmm0
vpxor 16(%rsi), %xmm1, %xmm1
vpxor 64(%rsi), %xmm2, %xmm2
vpxor 80(%rsi), %xmm3, %xmm3
vpxor 128(%rsi), %xmm4, %xmm4
vpxor 144(%rsi), %xmm5, %xmm5
vpxor 192(%rsi), %xmm6, %xmm6
vpxor 208(%rsi), %xmm7, %xmm7
vmovdqu %xmm0, 0(%rdx)
vmovdqu %xmm1, 16(%rdx)
vmovdqu %xmm2, 64(%rdx)
vmovdqu %xmm3, 80(%rdx)
vmovdqu %xmm4, 128(%rdx)
vmovdqu %xmm5, 144(%rdx)
vmovdqu %xmm6, 192(%rdx)
vmovdqu %xmm7, 208(%rdx)
/* second half: reload, transpose, xor, store */
vmovdqa 384(%rsp), %xmm0
vmovdqa 400(%rsp), %xmm1
vmovdqa 416(%rsp), %xmm2
vmovdqa 432(%rsp), %xmm3
vmovdqa 448(%rsp), %xmm4
vmovdqa 464(%rsp), %xmm5
vmovdqa 480(%rsp), %xmm6
vmovdqa 496(%rsp), %xmm7
vpunpckldq %xmm1, %xmm0, %xmm8
vpunpckldq %xmm3, %xmm2, %xmm9
vpunpckhdq %xmm1, %xmm0, %xmm12
vpunpckhdq %xmm3, %xmm2, %xmm13
vpunpckldq %xmm5, %xmm4, %xmm10
vpunpckldq %xmm7, %xmm6, %xmm11
vpunpckhdq %xmm5, %xmm4, %xmm14
vpunpckhdq %xmm7, %xmm6, %xmm15
vpunpcklqdq %xmm9, %xmm8, %xmm0
vpunpcklqdq %xmm11, %xmm10, %xmm1
vpunpckhqdq %xmm9, %xmm8, %xmm2
vpunpckhqdq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm13, %xmm12, %xmm4
vpunpcklqdq %xmm15, %xmm14, %xmm5
vpunpckhqdq %xmm13, %xmm12, %xmm6
vpunpckhqdq %xmm15, %xmm14, %xmm7
vpxor 32(%rsi), %xmm0, %xmm0
vpxor 48(%rsi), %xmm1, %xmm1
vpxor 96(%rsi), %xmm2, %xmm2
vpxor 112(%rsi), %xmm3, %xmm3
vpxor 160(%rsi), %xmm4, %xmm4
vpxor 176(%rsi), %xmm5, %xmm5
vpxor 224(%rsi), %xmm6, %xmm6
vpxor 240(%rsi), %xmm7, %xmm7
vmovdqu %xmm0, 32(%rdx)
vmovdqu %xmm1, 48(%rdx)
vmovdqu %xmm2, 96(%rdx)
vmovdqu %xmm3, 112(%rdx)
vmovdqu %xmm4, 160(%rdx)
vmovdqu %xmm5, 176(%rdx)
vmovdqu %xmm6, 224(%rdx)
vmovdqu %xmm7, 240(%rdx)
addq $256, %rsi
jmp chacha_blocks_avx_mainloop_cont
chacha_blocks_avx_noinput1:
/* keystream-only variant of the store path above */
vmovdqu %xmm0, 0(%rdx)
vmovdqu %xmm1, 16(%rdx)
vmovdqu %xmm2, 64(%rdx)
vmovdqu %xmm3, 80(%rdx)
vmovdqu %xmm4, 128(%rdx)
vmovdqu %xmm5, 144(%rdx)
vmovdqu %xmm6, 192(%rdx)
vmovdqu %xmm7, 208(%rdx)
vmovdqa 384(%rsp), %xmm0
vmovdqa 400(%rsp), %xmm1
vmovdqa 416(%rsp), %xmm2
vmovdqa 432(%rsp), %xmm3
vmovdqa 448(%rsp), %xmm4
vmovdqa 464(%rsp), %xmm5
vmovdqa 480(%rsp), %xmm6
vmovdqa 496(%rsp), %xmm7
vpunpckldq %xmm1, %xmm0, %xmm8
vpunpckldq %xmm3, %xmm2, %xmm9
vpunpckhdq %xmm1, %xmm0, %xmm12
vpunpckhdq %xmm3, %xmm2, %xmm13
vpunpckldq %xmm5, %xmm4, %xmm10
vpunpckldq %xmm7, %xmm6, %xmm11
vpunpckhdq %xmm5, %xmm4, %xmm14
vpunpckhdq %xmm7, %xmm6, %xmm15
vpunpcklqdq %xmm9, %xmm8, %xmm0
vpunpcklqdq %xmm11, %xmm10, %xmm1
vpunpckhqdq %xmm9, %xmm8, %xmm2
vpunpckhqdq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm13, %xmm12, %xmm4
vpunpcklqdq %xmm15, %xmm14, %xmm5
vpunpckhqdq %xmm13, %xmm12, %xmm6
vpunpckhqdq %xmm15, %xmm14, %xmm7
vmovdqu %xmm0, 32(%rdx)
vmovdqu %xmm1, 48(%rdx)
vmovdqu %xmm2, 96(%rdx)
vmovdqu %xmm3, 112(%rdx)
vmovdqu %xmm4, 160(%rdx)
vmovdqu %xmm5, 176(%rdx)
vmovdqu %xmm6, 224(%rdx)
vmovdqu %xmm7, 240(%rdx)
chacha_blocks_avx_mainloop_cont:
addq $256, %rdx
subq $256, %rcx
cmp $256, %rcx
jae chacha_blocks_avx_atleast256
/* restore single-block state rows and shuffle masks for the tail */
vmovdqa 80(%rsp), %xmm6
vmovdqa 96(%rsp), %xmm7
vmovdqa 0(%rsp), %xmm8
vmovdqa 16(%rsp), %xmm9
vmovdqa 32(%rsp), %xmm10
vmovdqa 48(%rsp), %xmm11
movq $1, %r9
chacha_blocks_avx_below256:
/* xmm5 = 64-bit counter increment (1) */
vmovq %r9, %xmm5
andq %rcx, %rcx
jz chacha_blocks_avx_done
cmpq $64, %rcx
jae chacha_blocks_avx_above63
/* partial block: copy the remaining input bytes into the stack buffer so
 * the block loop can always read/write a full 64 bytes */
movq %rdx, %r9
andq %rsi, %rsi
jz chacha_blocks_avx_noinput2
movq %rcx, %r10
movq %rsp, %rdx
addq %r10, %rsi
addq %r10, %rdx
negq %r10
chacha_blocks_avx_copyinput:
movb (%rsi, %r10), %al
movb %al, (%rdx, %r10)
incq %r10
jnz chacha_blocks_avx_copyinput
movq %rsp, %rsi
chacha_blocks_avx_noinput2:
movq %rsp, %rdx
chacha_blocks_avx_above63:
/* one 64-byte block with row-per-register layout */
vmovdqa %xmm8, %xmm0
vmovdqa %xmm9, %xmm1
vmovdqa %xmm10, %xmm2
vmovdqa %xmm11, %xmm3
movq 64(%rsp), %rax
chacha_blocks_avx_mainloop2:
/* two rounds per iteration; vpshufd rotates rows into diagonal form and back */
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufd $0x93, %xmm0, %xmm0
vpaddd %xmm2, %xmm3, %xmm2
vpshufd $0x4e, %xmm3, %xmm3
vpxor %xmm1, %xmm2, %xmm1
vpshufd $0x39, %xmm2, %xmm2
vpslld $7, %xmm1, %xmm4
vpsrld $25, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm7, %xmm3, %xmm3
vpshufd $0x39, %xmm0, %xmm0
vpaddd %xmm2, %xmm3, %xmm2
vpshufd $0x4e, %xmm3, %xmm3
vpxor %xmm1, %xmm2, %xmm1
vpshufd $0x93, %xmm2, %xmm2
vpslld $7, %xmm1, %xmm4
vpsrld $25, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
subq $2, %rax
jnz chacha_blocks_avx_mainloop2
/* feed-forward, optional xor with input, store */
vpaddd %xmm0, %xmm8, %xmm0
vpaddd %xmm1, %xmm9, %xmm1
vpaddd %xmm2, %xmm10, %xmm2
vpaddd %xmm3, %xmm11, %xmm3
andq %rsi, %rsi
jz chacha_blocks_avx_noinput3
vpxor 0(%rsi), %xmm0, %xmm0
vpxor 16(%rsi), %xmm1, %xmm1
vpxor 32(%rsi), %xmm2, %xmm2
vpxor 48(%rsi), %xmm3, %xmm3
addq $64, %rsi
chacha_blocks_avx_noinput3:
vmovdqu %xmm0, 0(%rdx)
vmovdqu %xmm1, 16(%rdx)
vmovdqu %xmm2, 32(%rdx)
vmovdqu %xmm3, 48(%rdx)
/* bump the 64-bit block counter in the counter/iv row */
vpaddq %xmm11, %xmm5, %xmm11
cmpq $64, %rcx
jbe chacha_blocks_avx_mainloop2_finishup
addq $64, %rdx
subq $64, %rcx
jmp chacha_blocks_avx_below256
chacha_blocks_avx_mainloop2_finishup:
cmpq $64, %rcx
je chacha_blocks_avx_done
/* partial block: copy only rcx bytes from the stack buffer to the real
 * destination saved in r9 */
addq %rcx, %r9
addq %rcx, %rdx
negq %rcx
chacha_blocks_avx_copyoutput:
movb (%rdx, %rcx), %al
movb %al, (%r9, %rcx)
incq %rcx
jnz chacha_blocks_avx_copyoutput
chacha_blocks_avx_done:
/* persist the updated counter/iv words back into the caller's state */
vmovdqu %xmm11, 32(%rdi)
movq %rbp, %rsp
popq %rbp
popq %rbx
ret
FN_END chacha_blocks_avx
/*
 * hchacha_avx(key, in, out, rounds)
 *   rdi = 32 bytes of key material (loaded into xmm1/xmm2)
 *   rsi = 16-byte input block (loaded into xmm3)
 *   rdx = 32-byte output (xmm0 and xmm3 after the rounds)
 *   ecx = round count, consumed two rounds per loop iteration
 * HChaCha core: runs the chacha rounds without the final feed-forward and
 * emits only the first and last state rows.
 */
GLOBAL_HIDDEN_FN hchacha_avx
hchacha_avx_local:
LOAD_VAR_PIC chacha_constants, %rax
/* xmm0 = "expand 32-byte k", xmm6/xmm5 = rotate-16 / rotate-8 pshufb masks */
vmovdqa 0(%rax), %xmm0
vmovdqa 16(%rax), %xmm6
vmovdqa 32(%rax), %xmm5
vmovdqu 0(%rdi), %xmm1
vmovdqu 16(%rdi), %xmm2
vmovdqu 0(%rsi), %xmm3
hhacha_mainloop_avx:
/* column round */
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm5, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $7, %xmm1, %xmm4
vpsrld $25, %xmm1, %xmm1
/* rotate rows into diagonal form for the second round */
vpshufd $0x93, %xmm0, %xmm0
vpxor %xmm1, %xmm4, %xmm1
vpshufd $0x4e, %xmm3, %xmm3
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm6, %xmm3, %xmm3
vpshufd $0x39, %xmm2, %xmm2
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
vpslld $12, %xmm1, %xmm4
vpsrld $20, %xmm1, %xmm1
vpxor %xmm1, %xmm4, %xmm1
vpaddd %xmm0, %xmm1, %xmm0
vpxor %xmm3, %xmm0, %xmm3
vpshufb %xmm5, %xmm3, %xmm3
vpaddd %xmm2, %xmm3, %xmm2
vpxor %xmm1, %xmm2, %xmm1
/* undo the diagonal rotation */
vpshufd $0x39, %xmm0, %xmm0
vpslld $7, %xmm1, %xmm4
vpshufd $0x4e, %xmm3, %xmm3
vpsrld $25, %xmm1, %xmm1
vpshufd $0x93, %xmm2, %xmm2
vpxor %xmm1, %xmm4, %xmm1
subl $2, %ecx
jne hhacha_mainloop_avx
/* only rows 0 and 3 form the hchacha output */
vmovdqu %xmm0, (%rdx)
vmovdqu %xmm3, 16(%rdx)
ret
FN_END hchacha_avx
/*
 * chacha_avx(key, iv, in, out, bytes, rounds) -- public entry point.
 * Builds a 64-byte chacha state on a 64-byte-aligned stack frame:
 *   0..31  key material from rdi
 *   32..39 block counter, zeroed
 *   40..47 iv qword loaded from 0(%rsi)
 *   48     round count from r9
 * then rotates the remaining arguments into place and calls the block
 * worker.  The stack copy of the key/state is zeroed before returning so
 * no key material is left behind.
 */
GLOBAL_HIDDEN_FN_EXT chacha_avx, 6, 16
pushq %rbp
movq %rsp, %rbp
subq $64, %rsp
andq $~63, %rsp
vmovdqu 0(%rdi), %xmm0
vmovdqu 16(%rdi), %xmm1
vmovdqa %xmm0, 0(%rsp)
vmovdqa %xmm1, 16(%rsp)
xorq %rdi, %rdi
movq %rdi, 32(%rsp)
movq 0(%rsi), %rsi
movq %rsi, 40(%rsp)
movq %r9, 48(%rsp)
/* chacha_blocks_avx(state=rsp, in=old rdx, out=old rcx, bytes=old r8) */
movq %rsp, %rdi
movq %rdx, %rsi
movq %rcx, %rdx
movq %r8, %rcx
call chacha_blocks_avx_local
/* wipe the on-stack state (key material) before returning */
vpxor %xmm0, %xmm0, %xmm0
vmovdqa %xmm0, 0(%rsp)
vmovdqa %xmm0, 16(%rsp)
vmovdqa %xmm0, 32(%rsp)
movq %rbp, %rsp
popq %rbp
ret
FN_END chacha_avx
/*
 * xchacha_avx(key, iv, in, out, bytes, rounds) -- XChaCha entry point.
 * First derives a subkey with hchacha_avx into the stack state (rbx), with
 * counter zeroed and the iv qword at 16(%rsi) placed at offset 40 and the
 * round count (r9) at offset 48, then runs the block worker on the derived
 * state.  As in chacha_avx, the stack state is zeroed before returning.
 */
GLOBAL_HIDDEN_FN_EXT xchacha_avx, 6, 16
pushq %rbp
pushq %rbx
movq %rsp, %rbp
subq $64, %rsp
andq $~63, %rsp
movq %rsp, %rbx
xorq %rax, %rax
movq %rax, 32(%rbx)
movq 16(%rsi), %rax
movq %rax, 40(%rbx)
movq %r9, 48(%rbx)
/* save in/out/bytes across the hchacha call */
pushq %rdx
pushq %rcx
pushq %r8
/* hchacha_avx(key=rdi, in=iv (rsi), out=state (rbx), rounds=r9) */
movq %rbx, %rdx
movq %r9, %rcx
call hchacha_avx_local
/* chacha_blocks_avx(state=rbx, in, out, bytes) -- note the pops rotate the
 * saved r8/rcx/rdx into rcx/rdx/rsi respectively */
movq %rbx, %rdi
popq %rcx
popq %rdx
popq %rsi
call chacha_blocks_avx_local
/* wipe the derived subkey state */
vpxor %xmm0, %xmm0, %xmm0
vmovdqa %xmm0, 0(%rbx)
vmovdqa %xmm0, 16(%rbx)
vmovdqa %xmm0, 32(%rbx)
movq %rbp, %rsp
popq %rbx
popq %rbp
ret
FN_END xchacha_avx
@@ -0,0 +1,185 @@ | |||
/*
 * Shared assembler glue used by the chacha20 .S implementations.  Papers
 * over the differences between Mach-O (OS X) and ELF toolchains, and the
 * two GNU as macro-argument conventions (\arg vs $0/$1/...), which the
 * build detects at configure time into HAVE_SLASHMACRO / HAVE_DOLLARMACRO
 * (see AsmOp.cmake / platform_config.h).  This file is run through the C
 * preprocessor, hence the #if/#include directives between .macro bodies.
 */
#include "platform_config.h"
/* NOTE(review): using defined() inside a #define expansion is formally
 * undefined behavior in C, though common preprocessors accept it. */
#define IS_MACH (defined(__MACH__))
#if !defined(HAVE_SLASHMACRO) && !defined(HAVE_DOLLARMACRO)
#error Unknown gnu as macro parameter convention! Run ./configure
#endif
#if (IS_MACH)
/* Mach-O: emit both the plain and the underscore-prefixed label so the
 * symbol is reachable under either naming convention. */
.macro FN name
#if defined(HAVE_SLASHMACRO)
\name:
_\name:
#elif defined(HAVE_DOLLARMACRO)
$0:
_$0:
#endif
.endm
/* args/xmmused are accepted for signature parity with the ELF variant but
 * are unused on Mach-O. */
.macro FN_EXT name, args, xmmused
#if defined(HAVE_SLASHMACRO)
FN \name
#elif defined(HAVE_DOLLARMACRO)
FN $0
#endif
.endm
/* Mach-O has no .size/.type directives, so FN_END is a no-op here. */
.macro FN_END name
.endm
.macro HIDDEN name
#if defined(HAVE_AS_PRIVATE_EXTERN)
#if defined(HAVE_SLASHMACRO)
.private_extern \name
.private_extern _\name
#elif defined(HAVE_DOLLARMACRO)
.private_extern $0
.private_extern _$0
#endif
#endif
.endm
#else
/* ELF variants of the same macros. */
.macro FN name
\name:
_\name:
.endm
.macro FN_EXT name, args, xmmused
FN \name
.endm
/* Record symbol size/type for both label spellings. */
.macro FN_END name
.size \name, .-\name
.size _\name, .-_\name
.type \name, @function
.type _\name, @function
.endm
.macro HIDDEN name
#if defined(HAVE_AS_HIDDEN)
.hidden \name
.hidden _\name
#endif
.endm
/* set NX for stack */
.section .note.GNU-stack,"",@progbits
#endif
#if (IS_MACH)
.macro SECTION_TEXT
.section __TEXT,__text,regular
.endm
/* NOTE(review): rodata is deliberately placed in __text on Mach-O, see the
 * ELF comment below. */
.macro SECTION_RODATA
.section __TEXT,__text,regular
.endm
#else
/* put everything in the code segment to simplify things */
.macro SECTION_TEXT
.text
.endm
.macro SECTION_RODATA
.text
.endm
#endif
/* declare a global function */
.macro GLOBAL name
#if defined(HAVE_SLASHMACRO)
.globl \name
.globl _\name
#elif defined(HAVE_DOLLARMACRO)
.globl $0
.globl _$0
#endif
.endm
/* LOCAL_PREFIX variants wrap the name before delegating to the base macro */
.macro FN_LOCAL_PREFIX name
#if defined(HAVE_SLASHMACRO)
FN LOCAL_PREFIX(\name)
#elif defined(HAVE_DOLLARMACRO)
FN LOCAL_PREFIX($0)
#endif
.endm
.macro FN_EXT_LOCAL_PREFIX name, args, xmmused
#if defined(HAVE_SLASHMACRO)
FN_EXT LOCAL_PREFIX(\name), \args, \xmmused
#elif defined(HAVE_DOLLARMACRO)
FN_EXT LOCAL_PREFIX($0), $1, $2
#endif
.endm
.macro FN_END_LOCAL_PREFIX name
#if defined(HAVE_SLASHMACRO)
FN_END LOCAL_PREFIX(\name)
#elif defined(HAVE_DOLLARMACRO)
FN_END LOCAL_PREFIX($0)
#endif
.endm
.macro GLOBAL_LOCAL_PREFIX name
#if defined(HAVE_SLASHMACRO)
GLOBAL LOCAL_PREFIX(\name)
HIDDEN LOCAL_PREFIX(\name)
#elif defined(HAVE_DOLLARMACRO)
GLOBAL LOCAL_PREFIX($0)
HIDDEN LOCAL_PREFIX($0)
#endif
.endm
/* global + hidden visibility + entry label in one step */
.macro GLOBAL_HIDDEN_FN name
#if defined(HAVE_SLASHMACRO)
GLOBAL \name
HIDDEN \name
FN \name
#elif defined(HAVE_DOLLARMACRO)
GLOBAL $0
HIDDEN $0
FN $0
#endif
.endm
.macro GLOBAL_HIDDEN_FN_EXT name, args, xmmused
#if defined(HAVE_SLASHMACRO)
GLOBAL \name
HIDDEN \name
FN_EXT \name, \args, \xmmused
#elif defined(HAVE_DOLLARMACRO)
GLOBAL $0
HIDDEN $0
FN_EXT $0, $1, $2
#endif
.endm
/* pic support */
/* Load the address of `var` into `reg`: call/pop trick on x86-32,
 * RIP-relative lea on x86-64. */
.macro LOAD_VAR_PIC var, reg
#if (IS_X86_32)
#if defined(HAVE_SLASHMACRO)
call 1f
1:
popl \reg
leal \var - 1b(\reg), \reg
#elif defined(HAVE_DOLLARMACRO)
call 1f
1:
popl $1
leal $0 - 1b($1), $1
#endif
#else
#if defined(HAVE_SLASHMACRO)
leaq \var(%rip), \reg
#elif defined(HAVE_DOLLARMACRO)
leaq $0(%rip), $1
#endif
#endif
.endm
SECTION_RODATA
.p2align 4,,15
/* sigma constant plus the two byte-rotation masks used by vpshufb */
chacha_constants:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */
@@ -0,0 +1,734 @@ | |||
#include "macro.S" | |||
SECTION_TEXT | |||
/* chacha_blocks_sse2(state, in, out, bytes) — System V AMD64:
 *   rdi = state: 0..31 key, 32..47 counter+iv (written back at exit),
 *         48 = round count (read into 64(%rsp) and used as loop counter)
 *   rsi = input (may be NULL: tested with andq/jz -> keystream-only path)
 *   rdx = output
 *   rcx = byte count
 * Stack frame: rsp is aligned down to 64 and 512 bytes are reserved.
 *   0..63(%rsp)    copy of the 4 state rows + round count
 *   128..367(%rsp) per-word 4-lane broadcasts for the 4-block path
 *   384..511(%rsp) scratch for the output transpose */
GLOBAL_HIDDEN_FN chacha_blocks_sse2
chacha_blocks_sse2_local:
	pushq %rbx
	pushq %rbp
	movq %rsp, %rbp
	andq $~63, %rsp
	subq $512, %rsp
	/* build sigma = "expand 32-byte k" from two 64-bit immediates */
	movq $0x3320646e61707865, %rax
	movq $0x6b20657479622d32, %r8
	movd %rax, %xmm8
	movd %r8, %xmm14
	punpcklqdq %xmm14, %xmm8
	/* load key (xmm9/xmm10) and counter+iv (xmm11) rows from state */
	movdqu 0(%rdi), %xmm9
	movdqu 16(%rdi), %xmm10
	movdqu 32(%rdi), %xmm11
	movq 48(%rdi), %rax
	movq $1, %r9
	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)
	movq %rax, 64(%rsp)
	/* fewer than 256 bytes: skip the 4-block SIMD path */
	cmpq $256, %rcx
	jb chacha_blocks_sse2_below256
	/* broadcast each 32-bit state word across a full xmm register so the
	 * main loop can process 4 blocks at once (one word per lane).
	 * Counter lanes (320/336) are refilled each iteration below. */
	pshufd $0x00, %xmm8, %xmm0
	pshufd $0x55, %xmm8, %xmm1
	pshufd $0xaa, %xmm8, %xmm2
	pshufd $0xff, %xmm8, %xmm3
	movdqa %xmm0, 128(%rsp)
	movdqa %xmm1, 144(%rsp)
	movdqa %xmm2, 160(%rsp)
	movdqa %xmm3, 176(%rsp)
	pshufd $0x00, %xmm9, %xmm0
	pshufd $0x55, %xmm9, %xmm1
	pshufd $0xaa, %xmm9, %xmm2
	pshufd $0xff, %xmm9, %xmm3
	movdqa %xmm0, 192(%rsp)
	movdqa %xmm1, 208(%rsp)
	movdqa %xmm2, 224(%rsp)
	movdqa %xmm3, 240(%rsp)
	pshufd $0x00, %xmm10, %xmm0
	pshufd $0x55, %xmm10, %xmm1
	pshufd $0xaa, %xmm10, %xmm2
	pshufd $0xff, %xmm10, %xmm3
	movdqa %xmm0, 256(%rsp)
	movdqa %xmm1, 272(%rsp)
	movdqa %xmm2, 288(%rsp)
	movdqa %xmm3, 304(%rsp)
	/* iv words (upper half of row 3) are constant across blocks */
	pshufd $0xaa, %xmm11, %xmm0
	pshufd $0xff, %xmm11, %xmm1
	movdqa %xmm0, 352(%rsp)
	movdqa %xmm1, 368(%rsp)
	jmp chacha_blocks_sse2_atleast256
.p2align 6,,63
chacha_blocks_sse2_atleast256:
	/* fill the 4 counter lanes with n, n+1, n+2, n+3 (64-bit counter
	 * split into low dwords at 320(%rsp) and high dwords at 336(%rsp)),
	 * then advance the saved counter by 4 */
	movq 48(%rsp), %rax
	leaq 1(%rax), %r8
	leaq 2(%rax), %r9
	leaq 3(%rax), %r10
	leaq 4(%rax), %rbx
	movl %eax, 320(%rsp)
	movl %r8d, 4+320(%rsp)
	movl %r9d, 8+320(%rsp)
	movl %r10d, 12+320(%rsp)
	shrq $32, %rax
	shrq $32, %r8
	shrq $32, %r9
	shrq $32, %r10
	movl %eax, 336(%rsp)
	movl %r8d, 4+336(%rsp)
	movl %r9d, 8+336(%rsp)
	movl %r10d, 12+336(%rsp)
	movq %rbx, 48(%rsp)
	/* rax = rounds; load all 16 broadcast words into xmm0..xmm15 */
	movq 64(%rsp), %rax
	movdqa 128(%rsp), %xmm0
	movdqa 144(%rsp), %xmm1
	movdqa 160(%rsp), %xmm2
	movdqa 176(%rsp), %xmm3
	movdqa 192(%rsp), %xmm4
	movdqa 208(%rsp), %xmm5
	movdqa 224(%rsp), %xmm6
	movdqa 240(%rsp), %xmm7
	movdqa 256(%rsp), %xmm8
	movdqa 272(%rsp), %xmm9
	movdqa 288(%rsp), %xmm10
	movdqa 304(%rsp), %xmm11
	movdqa 320(%rsp), %xmm12
	movdqa 336(%rsp), %xmm13
	movdqa 352(%rsp), %xmm14
	movdqa 368(%rsp), %xmm15
/* 4-block round loop: each iteration performs one column round and one
 * diagonal round (2 ChaCha rounds) on all 4 blocks in parallel.
 * Rotate-by-16 is done with pshuflw+pshufhw (swap 16-bit halves of each
 * lane); rotates by 12, 8 and 7 use pslld/psrld pairs combined by pxor.
 * All 16 state words live in xmm0..xmm15, so one register at a time is
 * spilled to 96(%rsp)/112(%rsp) to free a scratch register for shifts. */
chacha_blocks_sse2_mainloop1:
	paddd %xmm4, %xmm0
	paddd %xmm5, %xmm1
	pxor %xmm0, %xmm12
	pxor %xmm1, %xmm13
	paddd %xmm6, %xmm2
	paddd %xmm7, %xmm3
	movdqa %xmm6, 96(%rsp)
	pxor %xmm2, %xmm14
	pxor %xmm3, %xmm15
	pshuflw $0xb1,%xmm12,%xmm12
	pshufhw $0xb1,%xmm12,%xmm12
	pshuflw $0xb1,%xmm13,%xmm13
	pshufhw $0xb1,%xmm13,%xmm13
	pshuflw $0xb1,%xmm14,%xmm14
	pshufhw $0xb1,%xmm14,%xmm14
	pshuflw $0xb1,%xmm15,%xmm15
	pshufhw $0xb1,%xmm15,%xmm15
	paddd %xmm12, %xmm8
	paddd %xmm13, %xmm9
	paddd %xmm14, %xmm10
	paddd %xmm15, %xmm11
	movdqa %xmm12, 112(%rsp)
	pxor %xmm8, %xmm4
	pxor %xmm9, %xmm5
	movdqa 96(%rsp), %xmm6
	movdqa %xmm4, %xmm12
	pslld $ 12, %xmm4
	psrld $20, %xmm12
	pxor %xmm12, %xmm4
	movdqa %xmm5, %xmm12
	pslld $ 12, %xmm5
	psrld $20, %xmm12
	pxor %xmm12, %xmm5
	pxor %xmm10, %xmm6
	pxor %xmm11, %xmm7
	movdqa %xmm6, %xmm12
	pslld $ 12, %xmm6
	psrld $20, %xmm12
	pxor %xmm12, %xmm6
	movdqa %xmm7, %xmm12
	pslld $ 12, %xmm7
	psrld $20, %xmm12
	pxor %xmm12, %xmm7
	movdqa 112(%rsp), %xmm12
	paddd %xmm4, %xmm0
	paddd %xmm5, %xmm1
	pxor %xmm0, %xmm12
	pxor %xmm1, %xmm13
	paddd %xmm6, %xmm2
	paddd %xmm7, %xmm3
	movdqa %xmm6, 96(%rsp)
	pxor %xmm2, %xmm14
	pxor %xmm3, %xmm15
	movdqa %xmm12, %xmm6
	pslld $ 8, %xmm12
	psrld $24, %xmm6
	pxor %xmm6, %xmm12
	movdqa %xmm13, %xmm6
	pslld $ 8, %xmm13
	psrld $24, %xmm6
	pxor %xmm6, %xmm13
	paddd %xmm12, %xmm8
	paddd %xmm13, %xmm9
	movdqa %xmm14, %xmm6
	pslld $ 8, %xmm14
	psrld $24, %xmm6
	pxor %xmm6, %xmm14
	movdqa %xmm15, %xmm6
	pslld $ 8, %xmm15
	psrld $24, %xmm6
	pxor %xmm6, %xmm15
	paddd %xmm14, %xmm10
	paddd %xmm15, %xmm11
	movdqa %xmm12, 112(%rsp)
	pxor %xmm8, %xmm4
	pxor %xmm9, %xmm5
	movdqa 96(%rsp), %xmm6
	movdqa %xmm4, %xmm12
	pslld $ 7, %xmm4
	psrld $25, %xmm12
	pxor %xmm12, %xmm4
	movdqa %xmm5, %xmm12
	pslld $ 7, %xmm5
	psrld $25, %xmm12
	pxor %xmm12, %xmm5
	pxor %xmm10, %xmm6
	pxor %xmm11, %xmm7
	movdqa %xmm6, %xmm12
	pslld $ 7, %xmm6
	psrld $25, %xmm12
	pxor %xmm12, %xmm6
	movdqa %xmm7, %xmm12
	pslld $ 7, %xmm7
	psrld $25, %xmm12
	pxor %xmm12, %xmm7
	movdqa 112(%rsp), %xmm12
	/* diagonal round: same quarterround structure with the rotated
	 * operand pairing (a,b,c,d picked across diagonals) */
	paddd %xmm5, %xmm0
	paddd %xmm6, %xmm1
	pxor %xmm0, %xmm15
	pxor %xmm1, %xmm12
	paddd %xmm7, %xmm2
	paddd %xmm4, %xmm3
	movdqa %xmm7, 96(%rsp)
	pxor %xmm2, %xmm13
	pxor %xmm3, %xmm14
	pshuflw $0xb1,%xmm15,%xmm15
	pshufhw $0xb1,%xmm15,%xmm15
	pshuflw $0xb1,%xmm12,%xmm12
	pshufhw $0xb1,%xmm12,%xmm12
	pshuflw $0xb1,%xmm13,%xmm13
	pshufhw $0xb1,%xmm13,%xmm13
	pshuflw $0xb1,%xmm14,%xmm14
	pshufhw $0xb1,%xmm14,%xmm14
	paddd %xmm15, %xmm10
	paddd %xmm12, %xmm11
	paddd %xmm13, %xmm8
	paddd %xmm14, %xmm9
	movdqa %xmm15, 112(%rsp)
	pxor %xmm10, %xmm5
	pxor %xmm11, %xmm6
	movdqa 96(%rsp), %xmm7
	movdqa %xmm5, %xmm15
	pslld $ 12, %xmm5
	psrld $20, %xmm15
	pxor %xmm15, %xmm5
	movdqa %xmm6, %xmm15
	pslld $ 12, %xmm6
	psrld $20, %xmm15
	pxor %xmm15, %xmm6
	pxor %xmm8, %xmm7
	pxor %xmm9, %xmm4
	movdqa %xmm7, %xmm15
	pslld $ 12, %xmm7
	psrld $20, %xmm15
	pxor %xmm15, %xmm7
	movdqa %xmm4, %xmm15
	pslld $ 12, %xmm4
	psrld $20, %xmm15
	pxor %xmm15, %xmm4
	movdqa 112(%rsp), %xmm15
	paddd %xmm5, %xmm0
	paddd %xmm6, %xmm1
	pxor %xmm0, %xmm15
	pxor %xmm1, %xmm12
	paddd %xmm7, %xmm2
	paddd %xmm4, %xmm3
	movdqa %xmm7, 96(%rsp)
	pxor %xmm2, %xmm13
	pxor %xmm3, %xmm14
	movdqa %xmm15, %xmm7
	pslld $ 8, %xmm15
	psrld $24, %xmm7
	pxor %xmm7, %xmm15
	movdqa %xmm12, %xmm7
	pslld $ 8, %xmm12
	psrld $24, %xmm7
	pxor %xmm7, %xmm12
	paddd %xmm15, %xmm10
	paddd %xmm12, %xmm11
	movdqa %xmm13, %xmm7
	pslld $ 8, %xmm13
	psrld $24, %xmm7
	pxor %xmm7, %xmm13
	movdqa %xmm14, %xmm7
	pslld $ 8, %xmm14
	psrld $24, %xmm7
	pxor %xmm7, %xmm14
	paddd %xmm13, %xmm8
	paddd %xmm14, %xmm9
	movdqa %xmm15, 112(%rsp)
	pxor %xmm10, %xmm5
	pxor %xmm11, %xmm6
	movdqa 96(%rsp), %xmm7
	movdqa %xmm5, %xmm15
	pslld $ 7, %xmm5
	psrld $25, %xmm15
	pxor %xmm15, %xmm5
	movdqa %xmm6, %xmm15
	pslld $ 7, %xmm6
	psrld $25, %xmm15
	pxor %xmm15, %xmm6
	pxor %xmm8, %xmm7
	pxor %xmm9, %xmm4
	movdqa %xmm7, %xmm15
	pslld $ 7, %xmm7
	psrld $25, %xmm15
	pxor %xmm15, %xmm7
	movdqa %xmm4, %xmm15
	pslld $ 7, %xmm4
	psrld $25, %xmm15
	pxor %xmm15, %xmm4
	movdqa 112(%rsp), %xmm15
	/* two rounds done per pass; loop until the round count is used up */
	subq $2, %rax
	jnz chacha_blocks_sse2_mainloop1
	/* feed-forward: add the saved initial state back into the working
	 * state (standard ChaCha finalization) */
	paddd 128(%rsp), %xmm0
	paddd 144(%rsp), %xmm1
	paddd 160(%rsp), %xmm2
	paddd 176(%rsp), %xmm3
	paddd 192(%rsp), %xmm4
	paddd 208(%rsp), %xmm5
	paddd 224(%rsp), %xmm6
	paddd 240(%rsp), %xmm7
	paddd 256(%rsp), %xmm8
	paddd 272(%rsp), %xmm9
	paddd 288(%rsp), %xmm10
	paddd 304(%rsp), %xmm11
	paddd 320(%rsp), %xmm12
	paddd 336(%rsp), %xmm13
	paddd 352(%rsp), %xmm14
	paddd 368(%rsp), %xmm15
	/* stash the second half of the state; transpose the first half from
	 * word-sliced (one word x 4 blocks per register) back to per-block
	 * layout with punpck{l,h}dq / punpck{l,h}qdq */
	movdqa %xmm8, 384(%rsp)
	movdqa %xmm9, 400(%rsp)
	movdqa %xmm10, 416(%rsp)
	movdqa %xmm11, 432(%rsp)
	movdqa %xmm12, 448(%rsp)
	movdqa %xmm13, 464(%rsp)
	movdqa %xmm14, 480(%rsp)
	movdqa %xmm15, 496(%rsp)
	movdqa %xmm0, %xmm8
	movdqa %xmm2, %xmm9
	movdqa %xmm4, %xmm10
	movdqa %xmm6, %xmm11
	punpckhdq %xmm1, %xmm0
	punpckhdq %xmm3, %xmm2
	punpckhdq %xmm5, %xmm4
	punpckhdq %xmm7, %xmm6
	punpckldq %xmm1, %xmm8
	punpckldq %xmm3, %xmm9
	punpckldq %xmm5, %xmm10
	punpckldq %xmm7, %xmm11
	movdqa %xmm0, %xmm1
	movdqa %xmm4, %xmm3
	movdqa %xmm8, %xmm5
	movdqa %xmm10, %xmm7
	punpckhqdq %xmm2, %xmm0
	punpckhqdq %xmm6, %xmm4
	punpckhqdq %xmm9, %xmm8
	punpckhqdq %xmm11, %xmm10
	punpcklqdq %xmm2, %xmm1
	punpcklqdq %xmm6, %xmm3
	punpcklqdq %xmm9, %xmm5
	punpcklqdq %xmm11, %xmm7
	/* NULL input -> emit raw keystream instead of xoring plaintext */
	andq %rsi, %rsi
	jz chacha_blocks_sse2_noinput1
	movdqu 0(%rsi), %xmm2
	movdqu 16(%rsi), %xmm6
	movdqu 64(%rsi), %xmm9
	movdqu 80(%rsi), %xmm11
	movdqu 128(%rsi), %xmm12
	movdqu 144(%rsi), %xmm13
	movdqu 192(%rsi), %xmm14
	movdqu 208(%rsi), %xmm15
	pxor %xmm2, %xmm5
	pxor %xmm6, %xmm7
	pxor %xmm9, %xmm8
	pxor %xmm11, %xmm10
	pxor %xmm12, %xmm1
	pxor %xmm13, %xmm3
	pxor %xmm14, %xmm0
	pxor %xmm15, %xmm4
	movdqu %xmm5, 0(%rdx)
	movdqu %xmm7, 16(%rdx)
	movdqu %xmm8, 64(%rdx)
	movdqu %xmm10, 80(%rdx)
	movdqu %xmm1, 128(%rdx)
	movdqu %xmm3, 144(%rdx)
	movdqu %xmm0, 192(%rdx)
	movdqu %xmm4, 208(%rdx)
	/* second half: reload from scratch, transpose, xor, store */
	movdqa 384(%rsp), %xmm0
	movdqa 400(%rsp), %xmm1
	movdqa 416(%rsp), %xmm2
	movdqa 432(%rsp), %xmm3
	movdqa 448(%rsp), %xmm4
	movdqa 464(%rsp), %xmm5
	movdqa 480(%rsp), %xmm6
	movdqa 496(%rsp), %xmm7
	movdqa %xmm0, %xmm8
	movdqa %xmm2, %xmm9
	movdqa %xmm4, %xmm10
	movdqa %xmm6, %xmm11
	punpckldq %xmm1, %xmm8
	punpckldq %xmm3, %xmm9
	punpckhdq %xmm1, %xmm0
	punpckhdq %xmm3, %xmm2
	punpckldq %xmm5, %xmm10
	punpckldq %xmm7, %xmm11
	punpckhdq %xmm5, %xmm4
	punpckhdq %xmm7, %xmm6
	movdqa %xmm8, %xmm1
	movdqa %xmm0, %xmm3
	movdqa %xmm10, %xmm5
	movdqa %xmm4, %xmm7
	punpcklqdq %xmm9, %xmm1
	punpcklqdq %xmm11, %xmm5
	punpckhqdq %xmm9, %xmm8
	punpckhqdq %xmm11, %xmm10
	punpcklqdq %xmm2, %xmm3
	punpcklqdq %xmm6, %xmm7
	punpckhqdq %xmm2, %xmm0
	punpckhqdq %xmm6, %xmm4
	movdqu 32(%rsi), %xmm2
	movdqu 48(%rsi), %xmm6
	movdqu 96(%rsi), %xmm9
	movdqu 112(%rsi), %xmm11
	movdqu 160(%rsi), %xmm12
	movdqu 176(%rsi), %xmm13
	movdqu 224(%rsi), %xmm14
	movdqu 240(%rsi), %xmm15
	pxor %xmm2, %xmm1
	pxor %xmm6, %xmm5
	pxor %xmm9, %xmm8
	pxor %xmm11, %xmm10
	pxor %xmm12, %xmm3
	pxor %xmm13, %xmm7
	pxor %xmm14, %xmm0
	pxor %xmm15, %xmm4
	movdqu %xmm1, 32(%rdx)
	movdqu %xmm5, 48(%rdx)
	movdqu %xmm8, 96(%rdx)
	movdqu %xmm10, 112(%rdx)
	movdqu %xmm3, 160(%rdx)
	movdqu %xmm7, 176(%rdx)
	movdqu %xmm0, 224(%rdx)
	movdqu %xmm4, 240(%rdx)
	addq $256, %rsi
	jmp chacha_blocks_sse2_mainloop_cont
chacha_blocks_sse2_noinput1:
	/* keystream-only variant of the store path above (no input xor) */
	movdqu %xmm5, 0(%rdx)
	movdqu %xmm7, 16(%rdx)
	movdqu %xmm8, 64(%rdx)
	movdqu %xmm10, 80(%rdx)
	movdqu %xmm1, 128(%rdx)
	movdqu %xmm3, 144(%rdx)
	movdqu %xmm0, 192(%rdx)
	movdqu %xmm4, 208(%rdx)
	movdqa 384(%rsp), %xmm0
	movdqa 400(%rsp), %xmm1
	movdqa 416(%rsp), %xmm2
	movdqa 432(%rsp), %xmm3
	movdqa 448(%rsp), %xmm4
	movdqa 464(%rsp), %xmm5
	movdqa 480(%rsp), %xmm6
	movdqa 496(%rsp), %xmm7
	movdqa %xmm0, %xmm8
	movdqa %xmm2, %xmm9
	movdqa %xmm4, %xmm10
	movdqa %xmm6, %xmm11
	punpckldq %xmm1, %xmm8
	punpckldq %xmm3, %xmm9
	punpckhdq %xmm1, %xmm0
	punpckhdq %xmm3, %xmm2
	punpckldq %xmm5, %xmm10
	punpckldq %xmm7, %xmm11
	punpckhdq %xmm5, %xmm4
	punpckhdq %xmm7, %xmm6
	movdqa %xmm8, %xmm1
	movdqa %xmm0, %xmm3
	movdqa %xmm10, %xmm5
	movdqa %xmm4, %xmm7
	punpcklqdq %xmm9, %xmm1
	punpcklqdq %xmm11, %xmm5
	punpckhqdq %xmm9, %xmm8
	punpckhqdq %xmm11, %xmm10
	punpcklqdq %xmm2, %xmm3
	punpcklqdq %xmm6, %xmm7
	punpckhqdq %xmm2, %xmm0
	punpckhqdq %xmm6, %xmm4
	movdqu %xmm1, 32(%rdx)
	movdqu %xmm5, 48(%rdx)
	movdqu %xmm8, 96(%rdx)
	movdqu %xmm10, 112(%rdx)
	movdqu %xmm3, 160(%rdx)
	movdqu %xmm7, 176(%rdx)
	movdqu %xmm0, 224(%rdx)
	movdqu %xmm4, 240(%rdx)
chacha_blocks_sse2_mainloop_cont:
	/* consumed 256 bytes; stay in the 4-block path while >= 256 remain */
	addq $256, %rdx
	subq $256, %rcx
	cmp $256, %rcx
	jae chacha_blocks_sse2_atleast256
	/* reload the scalar (1-block) state rows for the tail path */
	movdqa 0(%rsp), %xmm8
	movdqa 16(%rsp), %xmm9
	movdqa 32(%rsp), %xmm10
	movdqa 48(%rsp), %xmm11
	movq $1, %r9
chacha_blocks_sse2_below256:
	/* xmm5 = 64-bit constant 1, used to bump the block counter (paddq
	 * into xmm11) after each single block */
	movq %r9, %xmm5
	andq %rcx, %rcx
	jz chacha_blocks_sse2_done
	cmpq $64, %rcx
	jae chacha_blocks_sse2_above63
	/* partial final block: save the real output pointer in r9 and copy
	 * the remaining input bytes into the stack buffer so the block xor
	 * below never reads past the caller's buffer */
	movq %rdx, %r9
	andq %rsi, %rsi
	jz chacha_blocks_sse2_noinput2
	movq %rcx, %r10
	movq %rsp, %rdx
	addq %r10, %rsi
	addq %r10, %rdx
	negq %r10
chacha_blocks_sse2_copyinput:
	movb (%rsi, %r10), %al
	movb %al, (%rdx, %r10)
	incq %r10
	jnz chacha_blocks_sse2_copyinput
	movq %rsp, %rsi
chacha_blocks_sse2_noinput2:
	movq %rsp, %rdx
chacha_blocks_sse2_above63:
	/* working copy of the 4 state rows for one block */
	movdqa %xmm8, %xmm0
	movdqa %xmm9, %xmm1
	movdqa %xmm10, %xmm2
	movdqa %xmm11, %xmm3
	movq 64(%rsp), %rax
chacha_blocks_sse2_mainloop2:
	/* single-block round pair: quarterrounds run on whole rows, with
	 * pshufd row rotations ($0x93/$0x4e/$0x39) converting between the
	 * column and diagonal arrangements; rotate-by-16 again uses
	 * pshuflw/pshufhw, the rest use shift pairs */
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	pshuflw $0xb1,%xmm3,%xmm3
	pshufhw $0xb1,%xmm3,%xmm3
	paddd %xmm3, %xmm2
	pxor %xmm2, %xmm1
	movdqa %xmm1,%xmm4
	pslld $12, %xmm1
	psrld $20, %xmm4
	pxor %xmm4, %xmm1
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	movdqa %xmm3,%xmm4
	pslld $8, %xmm3
	psrld $24, %xmm4
	pshufd $0x93,%xmm0,%xmm0
	pxor %xmm4, %xmm3
	paddd %xmm3, %xmm2
	pshufd $0x4e,%xmm3,%xmm3
	pxor %xmm2, %xmm1
	pshufd $0x39,%xmm2,%xmm2
	movdqa %xmm1,%xmm4
	pslld $7, %xmm1
	psrld $25, %xmm4
	pxor %xmm4, %xmm1
	subq $2, %rax
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	pshuflw $0xb1,%xmm3,%xmm3
	pshufhw $0xb1,%xmm3,%xmm3
	paddd %xmm3, %xmm2
	pxor %xmm2, %xmm1
	movdqa %xmm1,%xmm4
	pslld $12, %xmm1
	psrld $20, %xmm4
	pxor %xmm4, %xmm1
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	movdqa %xmm3,%xmm4
	pslld $8, %xmm3
	psrld $24, %xmm4
	pshufd $0x39,%xmm0,%xmm0
	pxor %xmm4, %xmm3
	paddd %xmm3, %xmm2
	pshufd $0x4e,%xmm3,%xmm3
	pxor %xmm2, %xmm1
	pshufd $0x93,%xmm2,%xmm2
	movdqa %xmm1,%xmm4
	pslld $7, %xmm1
	psrld $25, %xmm4
	pxor %xmm4, %xmm1
	jnz chacha_blocks_sse2_mainloop2
	/* feed-forward, optional xor with input, store one 64-byte block */
	paddd %xmm8, %xmm0
	paddd %xmm9, %xmm1
	paddd %xmm10, %xmm2
	paddd %xmm11, %xmm3
	andq %rsi, %rsi
	jz chacha_blocks_sse2_noinput3
	movdqu 0(%rsi), %xmm12
	movdqu 16(%rsi), %xmm13
	movdqu 32(%rsi), %xmm14
	movdqu 48(%rsi), %xmm15
	pxor %xmm12, %xmm0
	pxor %xmm13, %xmm1
	pxor %xmm14, %xmm2
	pxor %xmm15, %xmm3
	addq $64, %rsi
chacha_blocks_sse2_noinput3:
	movdqu %xmm0, 0(%rdx)
	movdqu %xmm1, 16(%rdx)
	movdqu %xmm2, 32(%rdx)
	movdqu %xmm3, 48(%rdx)
	/* increment the 64-bit block counter in the state row */
	paddq %xmm5, %xmm11
	cmpq $64, %rcx
	jbe chacha_blocks_sse2_mainloop2_finishup
	addq $64, %rdx
	subq $64, %rcx
	jmp chacha_blocks_sse2_below256
chacha_blocks_sse2_mainloop2_finishup:
	/* partial block was rendered into the stack buffer; copy only the
	 * requested rcx bytes out to the saved destination (r9) */
	cmpq $64, %rcx
	je chacha_blocks_sse2_done
	addq %rcx, %r9
	addq %rcx, %rdx
	negq %rcx
chacha_blocks_sse2_copyoutput:
	movb (%rdx, %rcx), %al
	movb %al, (%r9, %rcx)
	incq %rcx
	jnz chacha_blocks_sse2_copyoutput
chacha_blocks_sse2_done:
	/* persist the advanced counter/iv row back into the caller's state */
	movdqu %xmm11, 32(%rdi)
	movq %rbp, %rsp
	popq %rbp
	popq %rbx
	ret
FN_END chacha_blocks_sse2
/* hchacha_sse2(key rdi, input rsi, output rdx, rounds rcx)
 * HChaCha core: runs rcx rounds over sigma/key/input and writes the
 * first and last state rows (32 bytes, no feed-forward) to rdx.
 * Round structure matches mainloop2 above. */
GLOBAL_HIDDEN_FN hchacha_sse2
hchacha_sse2_local:
	movq $0x3320646e61707865, %rax
	movq $0x6b20657479622d32, %r8
	movd %rax, %xmm0
	movd %r8, %xmm4
	punpcklqdq %xmm4, %xmm0
	movdqu 0(%rdi), %xmm1
	movdqu 16(%rdi), %xmm2
	movdqu 0(%rsi), %xmm3
hchacha_sse2_mainloop:
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	pshuflw $0xb1,%xmm3,%xmm3
	pshufhw $0xb1,%xmm3,%xmm3
	paddd %xmm3, %xmm2
	pxor %xmm2, %xmm1
	movdqa %xmm1,%xmm4
	pslld $12, %xmm1
	psrld $20, %xmm4
	pxor %xmm4, %xmm1
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	movdqa %xmm3,%xmm4
	pslld $8, %xmm3
	psrld $24, %xmm4
	pshufd $0x93,%xmm0,%xmm0
	pxor %xmm4, %xmm3
	paddd %xmm3, %xmm2
	pshufd $0x4e,%xmm3,%xmm3
	pxor %xmm2, %xmm1
	pshufd $0x39,%xmm2,%xmm2
	movdqa %xmm1,%xmm4
	pslld $7, %xmm1
	psrld $25, %xmm4
	pxor %xmm4, %xmm1
	subq $2, %rcx
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	pshuflw $0xb1,%xmm3,%xmm3
	pshufhw $0xb1,%xmm3,%xmm3
	paddd %xmm3, %xmm2
	pxor %xmm2, %xmm1
	movdqa %xmm1,%xmm4
	pslld $12, %xmm1
	psrld $20, %xmm4
	pxor %xmm4, %xmm1
	paddd %xmm1, %xmm0
	pxor %xmm0, %xmm3
	movdqa %xmm3,%xmm4
	pslld $8, %xmm3
	psrld $24, %xmm4
	pshufd $0x39,%xmm0,%xmm0
	pxor %xmm4, %xmm3
	paddd %xmm3, %xmm2
	pshufd $0x4e,%xmm3,%xmm3
	pxor %xmm2, %xmm1
	pshufd $0x93,%xmm2,%xmm2
	movdqa %xmm1,%xmm4
	pslld $7, %xmm1
	psrld $25, %xmm4
	pxor %xmm4, %xmm1
	ja hchacha_sse2_mainloop
	/* output = row0 (constants) and row3 (derived), per HChaCha */
	movdqu %xmm0, 0(%rdx)
	movdqu %xmm3, 16(%rdx)
	ret
FN_END hchacha_sse2
/* chacha_sse2(key rdi, iv rsi, in rdx, out rcx, bytes r8, rounds r9)
 * Builds a 64-byte-aligned chacha state on the stack:
 *   0..31  key, 32..39 counter = 0, 40..47 iv (first 8 bytes of rsi),
 *   48..   round count, then tail-calls into chacha_blocks_sse2_local
 * with the arguments rotated into its (state, in, out, len) order.
 * The stack-resident key material is wiped with pxor stores on return. */
GLOBAL_HIDDEN_FN_EXT chacha_sse2, 6, 16
	pushq %rbp
	movq %rsp, %rbp
	subq $64, %rsp
	andq $~63, %rsp
	movdqu 0(%rdi), %xmm0
	movdqu 16(%rdi), %xmm1
	movdqa %xmm0, 0(%rsp)
	movdqa %xmm1, 16(%rsp)
	xorq %rdi, %rdi
	movq %rdi, 32(%rsp)
	movq 0(%rsi), %rsi
	movq %rsi, 40(%rsp)
	movq %r9, 48(%rsp)
	movq %rsp, %rdi
	movq %rdx, %rsi
	movq %rcx, %rdx
	movq %r8, %rcx
	call chacha_blocks_sse2_local
	/* zero the on-stack state before returning */
	pxor %xmm0, %xmm0
	movdqa %xmm0, 0(%rsp)
	movdqa %xmm0, 16(%rsp)
	movdqa %xmm0, 32(%rsp)
	movq %rbp, %rsp
	popq %rbp
	ret
FN_END chacha_sse2
/* xchacha_sse2(key rdi, iv rsi, in rdx, out rcx, bytes r8, rounds r9)
 * XChaCha: first derives a subkey into the stack state via
 * hchacha_sse2_local (key + first 16 iv bytes), then runs
 * chacha_blocks_sse2_local with counter = 0 and the remaining 8 iv
 * bytes (16(%rsi)). Caller-args in/out/len are preserved across the
 * hchacha call on the stack (pushed rdx/rcx/r8, popped as rcx/rdx/rsi
 * to match chacha_blocks' in/out/len register order).
 * The on-stack state is wiped before returning. */
GLOBAL_HIDDEN_FN_EXT xchacha_sse2, 6, 16
	pushq %rbp
	pushq %rbx
	movq %rsp, %rbp
	subq $64, %rsp
	andq $~63, %rsp
	movq %rsp, %rbx
	xorq %rax, %rax
	movq %rax, 32(%rbx)
	movq 16(%rsi), %rax
	movq %rax, 40(%rbx)
	movq %r9, 48(%rbx)
	pushq %rdx
	pushq %rcx
	pushq %r8
	movq %rbx, %rdx
	movq %r9, %rcx
	call hchacha_sse2_local
	movq %rbx, %rdi
	popq %rcx
	popq %rdx
	popq %rsi
	call chacha_blocks_sse2_local
	pxor %xmm0, %xmm0
	movdqa %xmm0, 0(%rbx)
	movdqa %xmm0, 16(%rbx)
	movdqa %xmm0, 32(%rbx)
	movq %rbp, %rsp
	popq %rbx
	popq %rbp
	ret
FN_END xchacha_sse2
@@ -61,7 +61,7 @@ void rspamd_cryptobox_keypair (rspamd_pk_t pk, rspamd_sk_t sk); | |||
* @param sk local secret key | |||
* @param sig output signature | |||
*/ | |||
void rspamd_cryptobox_encrypt_inplace (struct rspamd_encrypt_segment *segments, | |||
void rspamd_cryptobox_encrypt_inplace (guchar *data, gsize len, | |||
gsize cnt, const rspamd_pk_t pk, const rspamd_sk_t sk, rspamd_sig_t sig); | |||
@@ -0,0 +1,11 @@ | |||
/* platform_config.h.in — template expanded by CMake's configure_file().
 * Records the target architecture and which assembler features were
 * detected at configure time (see AsmOp.cmake / the chacha CMakeLists):
 *   HAVE_AVX2 / HAVE_AVX / HAVE_SSE2  — instruction-set support
 *   HAVE_SLASHMACRO / HAVE_DOLLARMACRO — .macro argument convention */
#ifndef PLATFORM_H_CONFIG
#define PLATFORM_H_CONFIG
#define ARCH ${ARCH}
#cmakedefine HAVE_AVX2 1
#cmakedefine HAVE_AVX 1
#cmakedefine HAVE_SSE2 1
#cmakedefine HAVE_SLASHMACRO 1
#cmakedefine HAVE_DOLLARMACRO 1
#endif