; Listing metadata: NASM source, 311 lines, 5.3 KiB
; AesOpt.asm -- Intel's AES.
|
|
; 2009-12-12 : Igor Pavlov : Public domain
|
|
|
|
%include "7zAsm.asm"
|
|
|
|
MY_ASM_START

%ifndef x64
  ; FIXME .xmm
%endif

; Register aliases used by every routine in this file.
; (r0 / r2 are not defined here; presumably they come from 7zAsm.asm.)
%define rD r2           ; data pointer: blocks are read and written through [rD]
%define rN r0           ; number of 16-byte blocks still to be processed
|
|
|
|
; MY_PROLOG reg
;   Common entry sequence of the three AES routines.
;   reg : xmm register that receives the 16 bytes stored at [ivAes].
;
;   On exit:
;     r1 = ivAes + 32
;     rD = data
;     rN = numBlocks
;     x6 = (dword at [ivAes + 16]) * 32
;   r3, r5 (and r6 where the ABI treats it as callee-saved) are pushed and
;   must be released with MY_EPILOG.
;
;   Under CYGWIN64 the arguments arrive in rcx / rdx / r8 (Microsoft x64
;   convention).  That convention also makes xmm6 and xmm7 callee-saved, and
;   the routines in this file overwrite both, so they are stored in the
;   32-byte shadow space the caller reserves just above the return address
;   and reloaded by MY_EPILOG.  Unaligned moves are used so the save does not
;   depend on the exact stack alignment at entry.
%macro MY_PROLOG 1
  %ifdef x64
    %ifdef CYGWIN64
      ; ivAes     : %rcx
      ; data      : %rdx
      ; numBlocks : %r8
      movdqu  [r4 + 8], xmm6
      movdqu  [r4 + 8 + 16], xmm7
    %else
      ; Arguments arrive in rdi / rsi / rdx; move them to the registers the
      ; rest of the code expects.  RDX must be copied to R8 before it is
      ; overwritten with RSI.
      mov     RCX, RDI
      mov     R8, RDX
      mov     RDX, RSI
    %endif
  %endif

  push    r3
  push    r5

  %ifdef x64
    %ifdef CYGWIN64
      push    r6
    %endif
    mov     rN, r8
  %else
    push    r6
    ; Three pushes plus the return address sit below the stack arguments,
    ; hence the first argument is at REG_SIZE * 4.
    mov     ecx, [r4 + REG_SIZE * 4]
    mov     edx, [r4 + REG_SIZE * 5]
    mov     rN, [r4 + REG_SIZE * 6]
  %endif

  mov     x6, [r1 + 16]
  shl     x6, 5                   ; * 32

  movdqa  %1, [r1]                ; reg = 16 bytes at ivAes
  add     r1, 32
%endmacro

; MY_EPILOG
;   Undoes MY_PROLOG: pops the saved general registers in reverse order,
;   reloads xmm6 / xmm7 from the shadow space under CYGWIN64, and ends the
;   procedure with MY_ENDP.
%macro MY_EPILOG 0
  %ifdef x64
    %ifdef CYGWIN64
      pop     r6
    %endif
  %else
    pop     r6
  %endif
  pop     r5
  pop     r3

  %ifdef x64
    %ifdef CYGWIN64
      ; r4 is back at its entry value here, so these are the slots that
      ; MY_PROLOG wrote.
      movdqu  xmm6, [r4 + 8]
      movdqu  xmm7, [r4 + 8 + 16]
    %endif
  %endif

  MY_ENDP
%endmacro
|
|
|
|
; Number of blocks handled per pass of the unrolled loops, and the size in
; bytes of such a group (16 bytes per block).
ways    equ 4
ways16  equ (ways * 16)
|
|
|
|
; OP_W op, op2
;   Emits "op xmmN, op2" for N = 0, 1, 2, 3.
;   The single-line macro `i` is redefined to N in front of each emitted
;   line, so op2 may mention `i` (as in "OP_W movdqa, [rD + i * 16]").
%macro OP_W 2
  %define i 0
  %1      xmm0, %2

  %define i 1
  %1      xmm1, %2

  %define i 2
  %1      xmm2, %2

  %define i 3
  %1      xmm3, %2
%endmacro
|
|
|
|
; LOAD_OP op, offs
;   Single-block form: applies op to xmm0 with the memory operand
;   [r1 + r3 offs].  offs is pasted in verbatim and therefore carries its own
;   sign (+0, +16, -16, ...).
%macro LOAD_OP 2
  %1      xmm0, [r1 + r3 %2]
%endmacro
|
|
|
|
; LOAD_OP_W op, offs
;   Four-block form of LOAD_OP: loads the 16 bytes at [r1 + r3 offs] into
;   xmm7 once, then applies op to each of xmm0..xmm3 with xmm7 as the source.
;   Clobbers xmm7.
%macro LOAD_OP_W 2
  movdqa  xmm7, [r1 + r3 %2]
  ; same as: OP_W %1, xmm7
  %1      xmm0, xmm7
  %1      xmm1, xmm7
  %1      xmm2, xmm7
  %1      xmm3, xmm7
%endmacro
|
|
|
|
|
|
; ---------- AES-CBC Decode ----------
|
|
|
|
; CBC_DEC_UPDATE reg, offs
;   Finishes one decoded block held in reg:
;     reg ^= xmm6;  xmm6 = 16 bytes at [rD + offs];  [rD + offs] = reg
;   The input block is read into xmm6 before it is overwritten, so xmm6 then
;   holds the chaining value for the following block.
%macro CBC_DEC_UPDATE 2
  pxor    %1, xmm6
  movdqa  xmm6, [rD + %2]
  movdqa  [rD + %2], %1
%endmacro
|
|
|
|
; DECODE op
;   op is LOAD_OP (one block) or LOAD_OP_W (four blocks).
;   Emits aesdec with the key at offset +16, then a loop that applies aesdec
;   at +0 and -16 and lowers x3 by 32 until x3 becomes zero, and finally
;   aesdeclast at +0.
%macro DECODE 1
  %1      aesdec, +16
%%next:
  %1      aesdec, +0
  %1      aesdec, -16
  sub     x3, 32
  jnz     %%next
  %1      aesdeclast, +0
%endmacro
|
|
|
|
; void AesCbc_Decode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks)
;
;   Decodes numBlocks 16-byte blocks in place at data.  Blocks are taken four
;   at a time (`ways`) while at least four remain, then one at a time.
;   xmm6 carries the chaining value: loaded from [ivAes] on entry and written
;   back there on exit.
MY_PROC AesCbc_Decode_Intel, 3
  MY_PROLOG xmm6

  sub     x6, 32                          ; x6 = initial key offset for DECODE
  jmp     cbcDec_check4

  align 16
cbcDec_loop4:
  mov     x3, x6
  OP_W    movdqa, [rD + i * 16]           ; xmm0..xmm3 = four input blocks

  LOAD_OP_W pxor, +32
  DECODE  LOAD_OP_W

  ; same as: OP_W CBC_DEC_UPDATE, i * 16
  CBC_DEC_UPDATE xmm0, 0 * 16
  CBC_DEC_UPDATE xmm1, 1 * 16
  CBC_DEC_UPDATE xmm2, 2 * 16
  CBC_DEC_UPDATE xmm3, 3 * 16

  add     rD, ways16
cbcDec_check4:
  sub     rN, ways
  jnc     cbcDec_loop4                    ; no borrow: at least `ways` blocks were left

  add     rN, ways                        ; undo the last subtraction
  jmp     cbcDec_check1

cbcDec_loop1:
  mov     x3, x6
  movdqa  xmm1, [rD]                      ; keep the input block for chaining
  LOAD_OP movdqa, +32
  pxor    xmm0, xmm1

  DECODE  LOAD_OP

  pxor    xmm0, xmm6
  movdqa  [rD], xmm0
  movdqa  xmm6, xmm1                      ; input block becomes the next chaining value
  add     rD, 16
cbcDec_check1:
  sub     rN, 1
  jnc     cbcDec_loop1

  movdqa  [r1 - 32], xmm6                 ; r1 = ivAes + 32, so this updates [ivAes]

  MY_EPILOG
|
|
|
|
|
|
; ---------- AES-CBC Encode ----------
|
|
|
|
; ENCODE op
;   op is LOAD_OP (one block) or LOAD_OP_W (four blocks).
;   Emits aesenc with the key at offset -16, then a loop that applies aesenc
;   at +0 and +16 and raises r3 by 32 until r3 becomes zero, and finally
;   aesenclast at +0.
%macro ENCODE 1
  %1      aesenc, -16
%%next:
  %1      aesenc, +0
  %1      aesenc, +16
  add     r3, 32
  jnz     %%next
  %1      aesenclast, +0
%endmacro
|
|
|
|
; AesCbc_Encode_Intel: takes the same three arguments, fetched by MY_PROLOG,
; as AesCbc_Decode_Intel (ivAes, data, numBlocks).
;
;   Encodes the blocks in place at data, one block per iteration.
;   xmm0 carries the chaining value: loaded from [ivAes] on entry, it is the
;   most recent output block inside the loop, and is written back to [ivAes]
;   on exit.
MY_PROC AesCbc_Encode_Intel, 3
  MY_PROLOG xmm0

  ; Point r1 past the key area and make r6 the negative start offset that
  ; ENCODE counts up to zero:  r1 = ivAes + 32 + x6,  r6 = 32 - x6.
  add     r1, r6
  neg     r6
  add     r6, 32

  jmp     cbcEnc_check

  align 16
cbcEnc_loop:
  mov     r3, r6
  pxor    xmm0, [rD]                      ; chain with the input block
  pxor    xmm0, [r1 + r3 - 32]

  ENCODE  LOAD_OP

  movdqa  [rD], xmm0
  add     rD, 16
cbcEnc_check:
  sub     rN, 1
  jnc     cbcEnc_loop

  movdqa  [r1 + r6 - 64], xmm0            ; r1 + r6 - 64 = ivAes
  MY_EPILOG
|
|
|
|
|
|
; ---------- AES-CTR ----------
|
|
|
|
; XOR_UPD_1 reg, offs
;   reg ^= 16 bytes at [rD + offs]
%macro XOR_UPD_1 2
  pxor    %1, [rD + %2]
%endmacro
|
|
|
|
; XOR_UPD_2 reg, offs
;   Stores reg to the 16 bytes at [rD + offs].
%macro XOR_UPD_2 2
  movdqa  [rD + %2], %1
%endmacro
|
|
|
|
; AesCtr_Code_Intel: takes the same three arguments, fetched by MY_PROLOG,
; as AesCbc_Decode_Intel (ivAes, data, numBlocks).
;
;   XORs the blocks at data, in place, with the encryption of a running
;   counter.  Blocks are taken four at a time (`ways`) while at least four
;   remain, then one at a time.
;
;   xmm6 : counter, loaded from [ivAes] on entry and written back on exit.
;          It is advanced with paddq before each block is produced.
;   xmm5 : increment constant, low qword = 1, high qword = 0.
;
;   The increment is kept in a register rather than in a scratch slot below
;   the stack pointer: that memory is not reserved for the function, and on
;   targets without a red zone it may be overwritten asynchronously.
;   xmm5 is not touched by LOAD_OP, LOAD_OP_W, ENCODE or XOR_UPD_*.
MY_PROC AesCtr_Code_Intel, 3
  MY_PROLOG xmm6

  mov     x3, 1                           ; r3 is saved by MY_PROLOG and reloaded below
  movd    xmm5, x3                        ; movd clears bits 32..127

  ; r1 = ivAes + 32 + x6,  r6 = 32 - x6  (see ENCODE)
  add     r1, r6
  neg     r6
  add     r6, 32

  jmp     check2_c

  align 16
nextBlocks2_c:
  ; xmm0..xmm3 = the next four counter values
  paddq   xmm6, xmm5
  movdqa  xmm0, xmm6

  paddq   xmm6, xmm5
  movdqa  xmm1, xmm6

  paddq   xmm6, xmm5
  movdqa  xmm2, xmm6

  paddq   xmm6, xmm5
  movdqa  xmm3, xmm6

  mov     r3, r6
  LOAD_OP_W pxor, -32

  ENCODE  LOAD_OP_W

  ; same as: OP_W XOR_UPD_1, i * 16
  XOR_UPD_1 xmm0, 0 * 16
  XOR_UPD_1 xmm1, 1 * 16
  XOR_UPD_1 xmm2, 2 * 16
  XOR_UPD_1 xmm3, 3 * 16

  ; same as: OP_W XOR_UPD_2, i * 16
  XOR_UPD_2 xmm0, 0 * 16
  XOR_UPD_2 xmm1, 1 * 16
  XOR_UPD_2 xmm2, 2 * 16
  XOR_UPD_2 xmm3, 3 * 16

  add     rD, ways16
check2_c:
  sub     rN, ways
  jnc     nextBlocks2_c                   ; no borrow: at least `ways` blocks were left

  add     rN, ways                        ; undo the last subtraction
  jmp     check_c

nextBlock_c:
  paddq   xmm6, xmm5
  mov     r3, r6
  movdqa  xmm0, [r1 + r3 - 32]
  pxor    xmm0, xmm6
  ENCODE  LOAD_OP
  XOR_UPD_1 xmm0, 0
  XOR_UPD_2 xmm0, 0
  add     rD, 16
check_c:
  sub     rN, 1
  jnc     nextBlock_c

  movdqa  [r1 + r6 - 64], xmm6            ; r1 + r6 - 64 = ivAes
  MY_EPILOG
|
|
|
|
; end
|
|
|
|
; Mark the object as not needing an executable stack on ELF targets.
; __OUTPUT_FORMAT__ holds the output format name, so each ELF spelling has to
; be matched separately; testing only for "elf" leaves elf32 / elf64 / elfx32
; objects without the note.  At most one of these blocks is assembled.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elfx32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
|
|
|