aboutsummaryrefslogtreecommitdiffstats
path: root/src/libcryptobox/base64
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2016-12-13 17:15:58 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2016-12-13 17:16:18 +0000
commitfe3062454ca9f40448d2b8fb1b08268441b229dd (patch)
tree2bb4985b86aea0f635fb5cf8d1e7481b55a66e42 /src/libcryptobox/base64
parent845926b715857267b3ccbaeccb2f4cd0d53f5a95 (diff)
downloadrspamd-fe3062454ca9f40448d2b8fb1b08268441b229dd.tar.gz
rspamd-fe3062454ca9f40448d2b8fb1b08268441b229dd.zip
[Feature] Add ssse3 and avx2 base64 decoders
Diffstat (limited to 'src/libcryptobox/base64')
-rw-r--r--src/libcryptobox/base64/avx2.S558
-rw-r--r--src/libcryptobox/base64/base64.c27
-rw-r--r--src/libcryptobox/base64/ref.c21
-rw-r--r--src/libcryptobox/base64/ssse3.S404
4 files changed, 987 insertions, 23 deletions
diff --git a/src/libcryptobox/base64/avx2.S b/src/libcryptobox/base64/avx2.S
new file mode 100644
index 000000000..ccc0e0d6c
--- /dev/null
+++ b/src/libcryptobox/base64/avx2.S
@@ -0,0 +1,558 @@
+/*
+ * Generated from https://github.com/aklomp/base64/blob/master/lib/arch/avx2/
+ * using gcc6 -march=core-avx2 -mtune=core-avx2 -O3 -S
+ */
+
+#include "../macro.S"
+
+#ifdef LINUX
+#define PROGBITS @progbits
+#else
+#define PROGBITS
+#endif
+.comm _base64_table_dec, 256, 5
+ .file "avx2.c"
+ .text
+ .p2align 4,,15
+/*
+ * int base64_decode_avx2(const char *in, size_t inlen, char *out, size_t *outlen)
+ * SysV AMD64: rdi = in, rsi = remaining length, rdx = out, rcx = outlen ptr.
+ * gcc6-generated from aklomp/base64 (see file header); do not hand-edit.
+ * Register roles: r10 = output bytes produced so far, r11 = decode table
+ * base, ymm6-ymm15 = constants preloaded from .LC0-.LC9.
+ * Exits: eax=1 success, eax=0 invalid input (both store *outlen via .L5);
+ * eax=-1 invalid second char of a group (.L15, *outlen NOT stored).
+ * NOTE(review): return-code semantics inferred from the exit blocks below —
+ * confirm against the reference C implementation in ref.c.
+ */
+GLOBAL_HIDDEN_FN_EXT base64_decode_avx2,4,1
+base64_decode_avx2_local:
+.LFB4659:
+	.cfi_startproc
+	/* Preload classification constants: range bounds '9' '/' 'Z' '@' 'z'
+	 * '`' '+' and the per-range additive decode offsets. */
+	vmovdqa	.LC0(%rip), %ymm15	/* '9' (57) x32 */
+	leaq	8(%rsp), %r10		/* realign stack to 32 bytes */
+	.cfi_def_cfa 10, 0
+	andq	$-32, %rsp
+	vmovdqa	.LC1(%rip), %ymm6	/* '/' (47) x32 */
+	vmovdqa	.LC2(%rip), %ymm14	/* 'Z' (90) x32 */
+	pushq	-8(%r10)
+	vmovdqa	.LC3(%rip), %ymm13	/* '@' (64) x32 */
+	pushq	%rbp
+	vmovdqa	.LC4(%rip), %ymm12	/* 'z' (122) x32 */
+	vmovdqa	.LC5(%rip), %ymm11	/* '`' (96) x32 */
+	vmovdqa	.LC6(%rip), %ymm10	/* +16 delta applied to '/' bytes */
+	vmovdqa	.LC7(%rip), %ymm9	/* '+' (43) x32 */
+	vmovdqa	.LC8(%rip), %ymm8	/* +19 delta applied to '+' bytes */
+	.cfi_escape 0x10,0x6,0x2,0x76,0
+	movq	%rsp, %rbp
+	pushq	%r12
+	vmovdqa	.LC9(%rip), %ymm7	/* +4 delta for '0'-'9' range */
+	pushq	%r10
+	.cfi_escape 0xf,0x3,0x76,0x70,0x6
+	.cfi_escape 0x10,0xc,0x2,0x76,0x78
+	xorl	%r10d, %r10d		/* r10 = bytes written so far */
+	pushq	%rbx
+	.cfi_escape 0x10,0x3,0x2,0x76,0x68
+.L2:	/* dispatch: vector path while more than 44 input bytes remain */
+	cmpq	$44, %rsi
+	ja	.L4
+.L26:	/* scalar tail: decode one 4-char group per iteration */
+	leaq	-1(%rsi), %rbx
+	testq	%rsi, %rsi
+	je	.L22			/* no input left: success */
+	movzbl	(%rdi), %eax
+	leaq	_base64_table_dec(%rip), %r11
+	movzbl	(%r11,%rax), %eax	/* table: 0-63 data, 254 '=', 255 bad */
+	cmpb	$-3, %al		/* value above 253 => pad/invalid */
+	ja	.L11
+	leal	0(,%rax,4), %r8d	/* out byte 0 = c0 << 2 | c1 >> 4 */
+	testq	%rbx, %rbx
+	je	.L22
+	movzbl	1(%rdi), %eax
+	movzbl	(%r11,%rax), %eax
+	cmpb	$-3, %al
+	ja	.L15
+.L28:	/* chars 0-1 valid: emit first output byte */
+	movl	%eax, %r9d
+	leaq	-3(%rsi), %r12
+	sall	$4, %eax
+	shrb	$4, %r9b
+	orl	%r9d, %r8d
+	leaq	1(%r10), %r9
+	movb	%r8b, (%rdx)
+	cmpq	$1, %rbx
+	je	.L16			/* only 2 chars: 1 byte out, success */
+	movzbl	2(%rdi), %r8d
+	movzbl	(%r11,%r8), %r8d
+	cmpb	$-3, %r8b
+	ja	.L24			/* char 2 is pad/invalid */
+	movl	%r8d, %r9d
+	leaq	2(%r10), %rbx
+	sall	$6, %r8d
+	subq	$4, %rsi
+	shrb	$2, %r9b
+	orl	%r9d, %eax
+	movb	%al, 1(%rdx)		/* second output byte */
+	testq	%r12, %r12
+	je	.L19
+	movzbl	3(%rdi), %eax
+	leaq	4(%rdi), %r9
+	movzbl	(%r11,%rax), %eax
+	cmpb	$-3, %al
+	ja	.L25			/* char 3 is pad/invalid */
+	orl	%eax, %r8d
+	addq	$3, %r10		/* full group: 3 bytes out, advance */
+	addq	$3, %rdx
+	movq	%r9, %rdi
+	movb	%r8b, -1(%rdx)		/* third output byte */
+	cmpq	$44, %rsi
+	jbe	.L26
+.L4:	/* vector path: decode 32 chars -> 24 bytes per iteration */
+	vmovdqu	(%rdi), %ymm0
+	/* Range-classify every byte and build a per-byte additive delta in
+	 * ymm1; a zero delta byte marks a char outside all base64 ranges. */
+	vpcmpgtb	%ymm6, %ymm0, %ymm1
+	vpcmpgtb	%ymm15, %ymm0, %ymm2
+	vpcmpgtb	%ymm14, %ymm0, %ymm4
+	vpcmpgtb	%ymm12, %ymm0, %ymm3
+	vpandn	%ymm1, %ymm2, %ymm2
+	vpcmpgtb	%ymm13, %ymm0, %ymm1
+	vpand	%ymm7, %ymm2, %ymm2
+	vpandn	%ymm1, %ymm4, %ymm4
+	vpcmpgtb	%ymm11, %ymm0, %ymm1
+	vpand	.LC10(%rip), %ymm4, %ymm4	/* 'A'-'Z' delta (-65) */
+	vpor	%ymm4, %ymm2, %ymm2
+	vpandn	%ymm1, %ymm3, %ymm3
+	vpcmpeqb	%ymm6, %ymm0, %ymm1
+	vpand	.LC11(%rip), %ymm3, %ymm3	/* 'a'-'z' delta (-71) */
+	vpand	%ymm10, %ymm1, %ymm5
+	vpcmpeqb	%ymm9, %ymm0, %ymm1
+	vpand	%ymm8, %ymm1, %ymm1
+	vpor	%ymm1, %ymm5, %ymm1
+	vpor	%ymm2, %ymm1, %ymm1
+	vpxor	%xmm2, %xmm2, %xmm2
+	vpor	%ymm3, %ymm1, %ymm1
+	vpcmpeqb	%ymm2, %ymm1, %ymm2	/* zero delta => invalid char */
+	vpmovmskb	%ymm2, %eax
+	testl	%eax, %eax
+	je	.L27			/* all 32 valid: pack and store */
+	/* Invalid byte somewhere in the block: re-check the leading pair via
+	 * the scalar table so pad/error cases get exact handling. */
+	movzbl	(%rdi), %eax
+	leaq	_base64_table_dec(%rip), %r11
+	leaq	-1(%rsi), %rbx
+	movzbl	(%r11,%rax), %eax
+	cmpb	$-3, %al
+	ja	.L11
+	leal	0(,%rax,4), %r8d
+	movzbl	1(%rdi), %eax
+	movzbl	(%r11,%rax), %eax
+	cmpb	$-3, %al
+	jbe	.L28
+.L15:	/* invalid second char of a group: eax=-1, *outlen not stored */
+	movl	$-1, %eax
+	vzeroupper			/* leave AVX upper state clean for SSE callers */
+	popq	%rbx
+	popq	%r10
+	.cfi_remember_state
+	.cfi_def_cfa 10, 0
+	popq	%r12
+	popq	%rbp
+	leaq	-8(%r10), %rsp
+	.cfi_def_cfa 7, 8
+	ret
+	.p2align 4,,10
+	.p2align 3
+.L27:	/* fast block: delta-adjust, then pack 32 x 6 bits -> 24 bytes */
+	.cfi_restore_state
+	vpaddb	%ymm1, %ymm0, %ymm0
+	addq	$32, %rdi
+	addq	$24, %rdx
+	addq	$24, %r10
+	vpand	.LC12(%rip), %ymm0, %ymm3
+	vpsrld	$16, %ymm0, %ymm2
+	subq	$32, %rsi
+	vpand	.LC13(%rip), %ymm0, %ymm1
+	vpslld	$26, %ymm0, %ymm0
+	vpsrld	$2, %ymm3, %ymm3
+	vpslld	$12, %ymm1, %ymm1
+	vpor	%ymm3, %ymm2, %ymm2
+	vpor	%ymm0, %ymm1, %ymm0
+	vmovdqa	.LC15(%rip), %ymm1
+	vpor	%ymm2, %ymm0, %ymm0
+	vpshufb	.LC14(%rip), %ymm0, %ymm0	/* drop every 4th byte (.LC14) */
+	vpermd	%ymm0, %ymm1, %ymm0		/* compact across lanes (.LC15) */
+	vmovdqu	%ymm0, -24(%rdx)
+	jmp	.L2
+	.p2align 4,,10
+	.p2align 3
+.L19:
+	movq	%rbx, %r10
+.L22:	/* clean end of input: success */
+	movl	$1, %eax
+.L5:	/* common exit: store produced length, restore regs, return eax */
+	movq	%r10, (%rcx)
+	vzeroupper
+	popq	%rbx
+	popq	%r10
+	.cfi_remember_state
+	.cfi_def_cfa 10, 0
+	popq	%r12
+	popq	%rbp
+	leaq	-8(%r10), %rsp
+	.cfi_def_cfa 7, 8
+	ret
+	.p2align 4,,10
+	.p2align 3
+.L11:	/* pad/invalid where a data char is required: eax=0 */
+	.cfi_restore_state
+	xorl	%eax, %eax
+	jmp	.L5
+	.p2align 4,,10
+	.p2align 3
+.L16:	/* group ended after 2 chars: one byte emitted, success */
+	movq	%r9, %r10
+	movl	$1, %eax
+	jmp	.L5
+	.p2align 4,,10
+	.p2align 3
+.L24:	/* char 2 not data: only valid as "xx==" ending the input */
+	movq	%r9, %r10
+	xorl	%eax, %eax
+	cmpb	$-2, %r8b
+	jne	.L5
+	movl	$1, %eax
+	testq	%r12, %r12
+	je	.L5
+	movzbl	3(%rdi), %eax
+	cmpb	$-2, (%r11,%rax)
+	sete	%dl
+	xorl	%eax, %eax
+	cmpq	$4, %rsi
+	sete	%al
+	andl	%edx, %eax
+	jmp	.L5
+	.p2align 4,,10
+	.p2align 3
+.L25:	/* char 3 not data: only valid as "xxx=" ending the input */
+	cmpb	$-2, %al
+	movq	%rbx, %r10
+	sete	%dl
+	xorl	%eax, %eax
+	testq	%rsi, %rsi
+	sete	%al
+	andl	%edx, %eax
+	jmp	.L5
+	.cfi_endproc
+.LFE4659:
+ .section .rodata.cst32,"aM",PROGBITS,32
+ .align 16
+.LC0:
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .align 16
+.LC1:
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .align 16
+.LC2:
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .align 16
+.LC3:
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .align 16
+.LC4:
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .align 16
+.LC5:
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .align 16
+.LC6:
+ .quad 1157442765409226768
+ .quad 1157442765409226768
+ .quad 1157442765409226768
+ .quad 1157442765409226768
+ .align 16
+.LC7:
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .align 16
+.LC8:
+ .quad 1374463283923456787
+ .quad 1374463283923456787
+ .quad 1374463283923456787
+ .quad 1374463283923456787
+ .align 16
+.LC9:
+ .quad 289360691352306692
+ .quad 289360691352306692
+ .quad 289360691352306692
+ .quad 289360691352306692
+ .align 16
+.LC10:
+ .quad -4629771061636907073
+ .quad -4629771061636907073
+ .quad -4629771061636907073
+ .quad -4629771061636907073
+ .align 16
+.LC11:
+ .quad -5063812098665367111
+ .quad -5063812098665367111
+ .quad -5063812098665367111
+ .quad -5063812098665367111
+ .align 16
+.LC12:
+ .quad 17732923536900096
+ .quad 17732923536900096
+ .quad 17732923536900096
+ .quad 17732923536900096
+ .align 16
+.LC13:
+ .quad 69269232566016
+ .quad 69269232566016
+ .quad 69269232566016
+ .quad 69269232566016
+ .align 16
+.LC14:
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte -1
+ .byte -1
+ .byte -1
+ .byte -1
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte -1
+ .byte -1
+ .byte -1
+ .byte -1
+ .align 16
+.LC15:
+ .long 0
+ .long 1
+ .long 2
+ .long 4
+ .long 5
+ .long 6
+ .long -1
+ .long -1
+ .ident "GCC: (Debian 6.2.1-5) 6.2.1 20161124"
diff --git a/src/libcryptobox/base64/base64.c b/src/libcryptobox/base64/base64.c
index c280b59fa..3a1e74ac7 100644
--- a/src/libcryptobox/base64/base64.c
+++ b/src/libcryptobox/base64/base64.c
@@ -20,6 +20,26 @@
#include "platform_config.h"
extern unsigned long cpu_config;
+/*
+ * Shared base64 decode table: maps an ASCII byte to its 6-bit value.
+ * 0..63 = decoded value, 254 = padding ('='), 255 = invalid character.
+ * Defined once here; the ref/ssse3/avx2 decoders all reference it.
+ */
+const uint8_t
+base64_table_dec[256] =
+{
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 254, 255, 255,
+ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255,
+ 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
typedef struct base64_impl {
unsigned long cpu_flags;
@@ -36,15 +56,16 @@ typedef struct base64_impl {
BASE64_DECLARE(ref);
#define BASE64_REF BASE64_IMPL(0, "ref", ref)
+BASE64_DECLARE(avx2);
+#define BASE64_AVX2 BASE64_IMPL(CPUID_AVX2, "avx2", avx2)
+BASE64_DECLARE(ssse3);
+/* Fix: this implementation's name string was "avx2" (copy-paste slip from
+ * the line above); it must match the actual implementation, "ssse3". */
+#define BASE64_SSSE3 BASE64_IMPL(CPUID_SSSE3, "ssse3", ssse3)
static const base64_impl_t base64_list[] = {
BASE64_REF,
#if defined(BASE64_AVX2)
BASE64_AVX2,
#endif
-#if defined(BASE64_AVX)
- BASE64_AVX,
-#endif
#if defined(BASE64_SSSE3)
BASE64_SSSE3,
#endif
diff --git a/src/libcryptobox/base64/ref.c b/src/libcryptobox/base64/ref.c
index b262fa5e5..22cedee6a 100644
--- a/src/libcryptobox/base64/ref.c
+++ b/src/libcryptobox/base64/ref.c
@@ -29,26 +29,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "config.h"
-const uint8_t
-base64_table_dec[] =
-{
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63,
- 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 254, 255, 255,
- 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255,
- 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
- 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-};
+extern const uint8_t base64_table_dec[256];
#define INNER_LOOP_64 do { \
while (inlen >= 13) { \
diff --git a/src/libcryptobox/base64/ssse3.S b/src/libcryptobox/base64/ssse3.S
new file mode 100644
index 000000000..5cf6da52f
--- /dev/null
+++ b/src/libcryptobox/base64/ssse3.S
@@ -0,0 +1,404 @@
+/*
+ * Generated from https://github.com/aklomp/base64/blob/master/lib/arch/ssse3/
+ * using gcc6 -march=core2 -mtune=core2 -O3 -S
+ */
+
+
+#include "../macro.S"
+
+
+#ifdef LINUX
+#define PROGBITS @progbits
+#else
+#define PROGBITS
+#endif
+
+.comm _base64_table_dec, 256, 5
+
+ .file "ssse3.c"
+ .text
+ .p2align 4,,15
+/*
+ * int base64_decode_ssse3(const char *in, size_t inlen, char *out, size_t *outlen)
+ * Same contract as base64_decode_avx2: rdi = in, rsi = remaining length,
+ * rdx = out, rcx = outlen ptr; processes 16 chars -> 12 bytes per vector
+ * iteration. gcc6-generated from aklomp/base64; do not hand-edit.
+ * Register roles: r10 = output bytes produced so far, r9 = decode table
+ * base, xmm2-xmm11 = constants preloaded from .LC0-.LC9.
+ * Exits: eax=1 success, eax=0 invalid input (both store *outlen via .L5);
+ * eax=-1 invalid second char of a group (.L15, *outlen NOT stored).
+ * NOTE(review): return-code semantics inferred from the exit blocks below.
+ */
+GLOBAL_HIDDEN_FN_EXT base64_decode_ssse3,4,1
+base64_decode_ssse3_local:
+.LFB4658:
+	.cfi_startproc
+	movdqa	.LC0(%rip), %xmm11	/* '9' (57) x16 */
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset 6, -16
+	xorl	%r10d, %r10d		/* r10 = bytes written so far */
+	movdqa	.LC1(%rip), %xmm2	/* '/' (47) x16 */
+	pushq	%rbx
+	.cfi_def_cfa_offset 24
+	.cfi_offset 3, -24
+	movdqa	.LC2(%rip), %xmm10	/* 'Z' (90) x16 */
+	movdqa	.LC3(%rip), %xmm9	/* '@' (64) x16 */
+	movdqa	.LC4(%rip), %xmm8	/* 'z' (122) x16 */
+	movdqa	.LC5(%rip), %xmm7	/* '`' (96) x16 */
+	movdqa	.LC6(%rip), %xmm6	/* +4 delta for '0'-'9' range */
+	movdqa	.LC7(%rip), %xmm5	/* 'A'-'Z' delta (-65) */
+	movdqa	.LC8(%rip), %xmm4	/* 'a'-'z' delta (-71) */
+	movdqa	.LC9(%rip), %xmm3	/* '+' (43) x16 */
+.L2:	/* dispatch: vector path while more than 23 input bytes remain */
+	cmpq	$23, %rsi
+	ja	.L4
+.L25:	/* scalar tail: decode one 4-char group per iteration */
+	leaq	-1(%rsi), %rbx
+	testq	%rsi, %rsi
+	je	.L21			/* no input left: success */
+	movzbl	(%rdi), %eax
+	leaq	_base64_table_dec(%rip), %r9
+	movzbl	(%r9,%rax), %eax	/* table: 0-63 data, 254 '=', 255 bad */
+	cmpb	$-3, %al		/* value above 253 => pad/invalid */
+	ja	.L11
+	leal	0(,%rax,4), %r8d	/* out byte 0 = c0 << 2 | c1 >> 4 */
+	testq	%rbx, %rbx
+	je	.L21
+	movzbl	1(%rdi), %eax
+	movzbl	(%r9,%rax), %eax
+	cmpb	$-3, %al
+	ja	.L15
+.L27:	/* chars 0-1 valid: emit first output byte */
+	leaq	-3(%rsi), %rbp
+	movl	%eax, %r11d
+	sall	$4, %eax
+	shrb	$4, %r11b
+	orl	%r11d, %r8d
+	cmpq	$1, %rbx
+	movb	%r8b, (%rdx)
+	leaq	1(%r10), %r11
+	je	.L16			/* only 2 chars: 1 byte out, success */
+	movzbl	2(%rdi), %r8d
+	movzbl	(%r9,%r8), %r8d
+	cmpb	$-3, %r8b
+	ja	.L23			/* char 2 is pad/invalid */
+	leaq	2(%r10), %rbx
+	movl	%r8d, %r11d
+	subq	$4, %rsi
+	shrb	$2, %r11b
+	sall	$6, %r8d
+	orl	%r11d, %eax
+	testq	%rbp, %rbp
+	movb	%al, 1(%rdx)		/* second output byte */
+	je	.L19
+	movzbl	3(%rdi), %eax
+	leaq	4(%rdi), %r11
+	movzbl	(%r9,%rax), %eax
+	cmpb	$-3, %al
+	ja	.L24			/* char 3 is pad/invalid */
+	orl	%eax, %r8d
+	addq	$3, %r10		/* full group: 3 bytes out, advance */
+	addq	$3, %rdx
+	movb	%r8b, -1(%rdx)		/* third output byte */
+	cmpq	$23, %rsi
+	movq	%r11, %rdi
+	jbe	.L25
+.L4:	/* vector path: decode 16 chars -> 12 bytes per iteration */
+	movdqu	(%rdi), %xmm1
+	movdqa	%xmm1, %xmm0
+	movdqa	%xmm1, %xmm12
+	movdqa	%xmm1, %xmm13
+	/* Range-classify each byte and accumulate a per-byte additive delta
+	 * in xmm0; a zero delta marks a char outside every base64 range. */
+	pcmpgtb	%xmm2, %xmm12
+	movdqa	%xmm1, %xmm14
+	pcmpgtb	%xmm11, %xmm0
+	pcmpgtb	%xmm10, %xmm13
+	pcmpgtb	%xmm7, %xmm14
+	pandn	%xmm12, %xmm0
+	movdqa	%xmm1, %xmm12
+	pand	%xmm6, %xmm0
+	pcmpgtb	%xmm9, %xmm12
+	pandn	%xmm12, %xmm13
+	movdqa	%xmm1, %xmm12
+	pand	%xmm5, %xmm13
+	por	%xmm13, %xmm0
+	pcmpgtb	%xmm8, %xmm12
+	pandn	%xmm14, %xmm12
+	pand	%xmm4, %xmm12
+	por	%xmm12, %xmm0
+	movdqa	%xmm1, %xmm12
+	pcmpeqb	%xmm3, %xmm12
+	pand	.LC10(%rip), %xmm12	/* '+' delta (+19) */
+	por	%xmm12, %xmm0
+	movdqa	%xmm1, %xmm12
+	pcmpeqb	%xmm2, %xmm12
+	pand	.LC11(%rip), %xmm12	/* '/' delta (+16) */
+	por	%xmm12, %xmm0
+	pxor	%xmm12, %xmm12
+	pcmpeqb	%xmm0, %xmm12		/* zero delta => invalid char */
+	pmovmskb	%xmm12, %eax
+	testl	%eax, %eax
+	je	.L26			/* all 16 valid: pack and store */
+	/* Invalid byte in the block: re-check the leading pair via the
+	 * scalar table so pad/error cases get exact handling. */
+	movzbl	(%rdi), %eax
+	leaq	_base64_table_dec(%rip), %r9
+	leaq	-1(%rsi), %rbx
+	movzbl	(%r9,%rax), %eax
+	cmpb	$-3, %al
+	ja	.L11
+	leal	0(,%rax,4), %r8d
+	movzbl	1(%rdi), %eax
+	movzbl	(%r9,%rax), %eax
+	cmpb	$-3, %al
+	jbe	.L27
+.L15:	/* invalid second char of a group: eax=-1, *outlen not stored */
+	movl	$-1, %eax
+	popq	%rbx
+	.cfi_remember_state
+	.cfi_def_cfa_offset 16
+	popq	%rbp
+	.cfi_def_cfa_offset 8
+	ret
+	.p2align 4,,10
+	.p2align 3
+.L26:	/* fast block: delta-adjust, then pack 16 x 6 bits -> 12 bytes */
+	.cfi_restore_state
+	paddb	%xmm1, %xmm0
+	addq	$16, %rdi
+	addq	$12, %rdx
+	movdqa	.LC12(%rip), %xmm13
+	movdqa	%xmm0, %xmm1
+	addq	$12, %r10
+	subq	$16, %rsi
+	movdqa	.LC13(%rip), %xmm12
+	pand	%xmm0, %xmm13
+	psrld	$16, %xmm1
+	pand	%xmm0, %xmm12
+	psrld	$2, %xmm13
+	pslld	$12, %xmm12
+	por	%xmm13, %xmm1
+	pslld	$26, %xmm0
+	por	%xmm12, %xmm1
+	por	%xmm1, %xmm0
+	pshufb	.LC14(%rip), %xmm0	/* drop every 4th byte (.LC14) */
+	movups	%xmm0, -12(%rdx)	/* 16B store; last 4 bytes are scratch */
+	jmp	.L2
+	.p2align 4,,10
+	.p2align 3
+.L19:
+	movq	%rbx, %r10
+.L21:	/* clean end of input: success */
+	movl	$1, %eax
+.L5:	/* common exit: store produced length, restore regs, return eax */
+	popq	%rbx
+	.cfi_remember_state
+	.cfi_def_cfa_offset 16
+	movq	%r10, (%rcx)
+	popq	%rbp
+	.cfi_def_cfa_offset 8
+	ret
+	.p2align 4,,10
+	.p2align 3
+.L11:	/* pad/invalid where a data char is required: eax=0 */
+	.cfi_restore_state
+	xorl	%eax, %eax
+	jmp	.L5
+	.p2align 4,,10
+	.p2align 3
+.L16:	/* group ended after 2 chars: one byte emitted, success */
+	movq	%r11, %r10
+	movl	$1, %eax
+	jmp	.L5
+	.p2align 4,,10
+	.p2align 3
+.L23:	/* char 2 not data: only valid as "xx==" ending the input */
+	xorl	%eax, %eax
+	cmpb	$-2, %r8b
+	movq	%r11, %r10
+	jne	.L5
+	testq	%rbp, %rbp
+	movl	$1, %eax
+	je	.L5
+	movzbl	3(%rdi), %eax
+	cmpb	$-2, (%r9,%rax)
+	sete	%dl
+	xorl	%eax, %eax
+	cmpq	$4, %rsi
+	sete	%al
+	andl	%edx, %eax
+	jmp	.L5
+	.p2align 4,,10
+	.p2align 3
+.L24:	/* char 3 not data: only valid as "xxx=" ending the input */
+	cmpb	$-2, %al
+	movq	%rbx, %r10
+	sete	%dl
+	xorl	%eax, %eax
+	testq	%rsi, %rsi
+	sete	%al
+	andl	%edx, %eax
+	jmp	.L5
+	.cfi_endproc
+.LFE4658:
+ .section .rodata.cst16,"aM",PROGBITS,16
+ .align 16
+.LC0:
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .byte 57
+ .align 16
+.LC1:
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .byte 47
+ .align 16
+.LC2:
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .byte 90
+ .align 16
+.LC3:
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .byte 64
+ .align 16
+.LC4:
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .byte 122
+ .align 16
+.LC5:
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .byte 96
+ .align 16
+.LC6:
+ .quad 289360691352306692
+ .quad 289360691352306692
+ .align 16
+.LC7:
+ .quad -4629771061636907073
+ .quad -4629771061636907073
+ .align 16
+.LC8:
+ .quad -5063812098665367111
+ .quad -5063812098665367111
+ .align 16
+.LC9:
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .byte 43
+ .align 16
+.LC10:
+ .quad 1374463283923456787
+ .quad 1374463283923456787
+ .align 16
+.LC11:
+ .quad 1157442765409226768
+ .quad 1157442765409226768
+ .align 16
+.LC12:
+ .quad 17732923536900096
+ .quad 17732923536900096
+ .align 16
+.LC13:
+ .quad 69269232566016
+ .quad 69269232566016
+ .align 16
+.LC14:
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte -1
+ .byte -1
+ .byte -1
+ .byte -1
+ .ident "GCC: (Debian 6.2.1-5) 6.2.1 20161124"