openssl1.0/crypto/modes/asm/ghash-alpha.pl

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated by
# the vendor compiler.

# Symbolic register names that get interpolated into the assembly
# templates below (the names themselves are resolved by the regdef.h
# header the generated file includes).  A trailing "# $n" note gives the
# raw register number of the first name assigned on that line.

# Loop counter and general temporaries.
($cnt,$t0,$t1,$t2)=("v0","t0","t1","t2");		# $0
# Two alternating pairs of Htbl entries (hi/lo quadwords).
($Thi0,$Tlo0,$Thi1,$Tlo1)=("t3","t4","t5","t6");	# $4
# Value fetched from the rem_4bit table.
$rem="t7";						# $8
#################
($Xi,$Htbl,$inp,$len)=("a0","a1","a2","a3");		# $16, input argument block
# Htbl offsets/addresses for the low and high nibble of the current byte.
($nlo,$nhi)=("a4","a5");				# $20
# 128-bit accumulator Z and the 128-bit input X, as hi/lo quadwords.
($Zhi,$Zlo)=("t8","t9");
($Xhi,$Xlo)=("t10","t11");				# $24
# Pointer into, and base address of, the rem_4bit table.
$remp="t12";
$rem_4bit="AT";						# $28

# loop() appends to $code the assembly for one complete pass of the 4-bit
# table-driven multiplication.  It takes no arguments and its return value
# is not used by the callers in this file.
#
# The emitted code expects $Xhi:$Xlo to hold the 128-bit input, $Htbl to
# point at the per-key table and $rem_4bit to hold the address of the
# rem_4bit table; it leaves the product in $Zhi:$Zlo.  It walks the bytes
# of $Xlo from index 7 down to 0 and then the bytes of $Xhi from 7 down to
# 0.  Each byte is split into two nibbles, pre-scaled by 16 to form Htbl
# offsets ($nlo, $nhi).  Every nibble step does the same three things:
#   - save the low 4 bits of Z and shift the 128-bit Z right by 4
#     (the "sll ...,60" carries bits from Z.hi into Z.lo);
#   - xor the rem_4bit entry selected by the saved bits into Z.hi;
#   - xor the Htbl entry selected by the nibble into Z.
# These steps are software-pipelined, so the loads for the next step are
# issued in the middle of the current one; the instruction order inside
# the template is therefore significant.  $cnt, $t0, $nlo, $nhi, $remp,
# $rem and $Thi0/$Tlo0/$Thi1/$Tlo1 are all overwritten.
#
# $N counts the calls so that the .Looplo$N/.Loophi$N labels are unique
# in each expansion of the template.
{ my $N;
  sub loop() {

	# New label suffix for this expansion.
	$N++;
# Everything up to the closing ___ is emitted assembly text, not Perl.
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

# Start of the generated file.  Note the plain assignment (not .=): this
# (re)initialises $code, so it has to run before the first loop() call.
#
# The template below emits the regdef.h includes that define the symbolic
# register names, and the entry of gcm_gmult_4bit.  Going by the register
# map, the routine receives Xi in a0 and Htbl in a1 (presumably
# gcm_gmult_4bit(Xi, Htbl) on the C side -- confirm against the caller).
# It loads the 16-byte Xi as two quadwords and calls picmeup, which
# returns through t0 and leaves the address of rem_4bit in AT.  AT is used
# explicitly as $rem_4bit, hence the ".set noat".
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

	# Multiplication proper: consumes $Xhi:$Xlo, produces $Zhi:$Zlo.
	&loop();

# Epilogue: reverse the byte order of each quadword of Z.  The shift and
# zapnot steps swap bytes within each 32-bit half, then the srl/sll by 32
# exchanges the two halves.  The result is stored back over Xi and the
# routine returns.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

# Registers holding the current 16-byte input block in gcm_ghash_4bit.
# s0/s1 are saved on the stack by the prologue below and restored before
# returning.
$inhi="s0";
$inlo="s1";

# gcm_ghash_4bit entry.  Going by the register map it receives Xi in a0,
# Htbl in a1, inp in a2 and len in a3.
# NOTE(review): the first block is processed unconditionally and the loop
# ends only when len hits exactly zero after subtracting 16, so len is
# assumed to be a non-zero multiple of 16 -- confirm against callers.
#
# The prologue makes a 32-byte frame and saves ra, s0 and s1.  The input
# is fetched with ldq_u pairs and merged with extql/extqh, i.e. inp is not
# required to be 8-byte aligned.  The second extql/extqh pair runs after
# inp has been advanced by 16; as 16 is a multiple of 8, the byte offset
# within a quadword is the same as before the increment.  At .Louter each
# block is xored into X before the multiplication.
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	# Multiplication proper: consumes $Xhi:$Xlo, produces $Zhi:$Zlo.
	&loop();

# Rest of the generated file:
#  - the byte swap of Z into X.  On the looping path it is interleaved
#    with the ldq_u loads for the next block; the .Ldone path finishes the
#    same swap without the loads, stores X back to Xi, restores s0/s1 and
#    returns.  The reload of ra is commented out in the template; the only
#    call made is "bsr $t0,picmeup", which links through t0, not ra.
#  - picmeup, a position-independent helper: "br" captures the address of
#    .Lpic in AT, and the +12 steps over the three instruction words
#    between .Lpic and the rem_4bit label (lda, ret and the nop after
#    .end), so AT ends up pointing at the table.  It returns through t0.
#  - rem_4bit, 16 entries of two .long values each (zero, then a 16-bit
#    constant shifted left by 16), read by loop() as 8-byte quadwords via
#    s8addq/ldq.
# The "\@" in the .ascii string stops Perl from interpolating an array
# inside this heredoc.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
# Write the generated assembly.  An optional first command-line argument
# names the output file; without it (or with an empty/"0" argument, as
# before) the text goes to the inherited STDOUT.
#
# Every step is checked: a failed open, or a short write that only shows
# up when the buffer is flushed at close time, would otherwise leave a
# missing or truncated .s file while the script still exits successfully.
# The three-argument open keeps the file name from being parsed for mode
# characters.
$output=shift;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";
Imported Upstream version 1.0.2s 2019-08-09 10:00:55 +02:00			`#!/usr/bin/env perl`
			`#`
			`# ====================================================================`
			`# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL`
			`# project. The module is, however, dual licensed under OpenSSL and`
			`# CRYPTOGAMS licenses depending on where you obtain it. For further`
			`# details see http://www.openssl.org/~appro/cryptogams/.`
			`# ====================================================================`
			`#`
			`# March 2010`
			`#`
			`# The module implements "4-bit" GCM GHASH function and underlying`
			`# single multiplication operation in GF(2^128). "4-bit" means that it`
			`# uses 256 bytes per-key table [+128 bytes shared table]. Even though`
			`# loops are aggressively modulo-scheduled in respect to references to`
			`# Htbl and Z.hi updates for 8 cycles per byte, measured performance is`
			`# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic`
			`# scheduling "glitch," because uprofile(1) indicates uniform sample`
			`# distribution, as if all instruction bundles execute in 1.5 cycles.`
			`# Meaning that it could have been even faster, yet 12 cycles is ~60%`
			`# better than gcc-generated code and ~80% than code generated by vendor`
			`# compiler.`

			`$cnt="v0"; # $0`
			`$t0="t0";`
			`$t1="t1";`
			`$t2="t2";`
			`$Thi0="t3"; # $4`
			`$Tlo0="t4";`
			`$Thi1="t5";`
			`$Tlo1="t6";`
			`$rem="t7"; # $8`
			`#################`
			`$Xi="a0"; # $16, input argument block`
			`$Htbl="a1";`
			`$inp="a2";`
			`$len="a3";`
			`$nlo="a4"; # $20`
			`$nhi="a5";`
			`$Zhi="t8";`
			`$Zlo="t9";`
			`$Xhi="t10"; # $24`
			`$Xlo="t11";`
			`$remp="t12";`
			`$rem_4bit="AT"; # $28`

			`{ my $N;`
			`sub loop() {`

			`$N++;`
			`$code.=<<___;`
			`.align 4`
			`extbl $Xlo,7,$nlo`
			`and $nlo,0xf0,$nhi`
			`sll $nlo,4,$nlo`
			`and $nlo,0xf0,$nlo`

			`addq $nlo,$Htbl,$nlo`
			`ldq $Zlo,8($nlo)`
			`addq $nhi,$Htbl,$nhi`
			`ldq $Zhi,0($nlo)`

			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`lda $cnt,6(zero)`
			`extbl $Xlo,6,$nlo`

			`ldq $Tlo1,8($nhi)`
			`s8addq $remp,$rem_4bit,$remp`
			`ldq $Thi1,0($nhi)`
			`srl $Zlo,4,$Zlo`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`xor $t0,$Zlo,$Zlo`
			`and $nlo,0xf0,$nhi`

			`xor $Tlo1,$Zlo,$Zlo`
			`sll $nlo,4,$nlo`
			`xor $Thi1,$Zhi,$Zhi`
			`and $nlo,0xf0,$nlo`

			`addq $nlo,$Htbl,$nlo`
			`ldq $Tlo0,8($nlo)`
			`addq $nhi,$Htbl,$nhi`
			`ldq $Thi0,0($nlo)`

			`.Looplo$N:`
			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`subq $cnt,1,$cnt`
			`srl $Zlo,4,$Zlo`

			`ldq $Tlo1,8($nhi)`
			`xor $rem,$Zhi,$Zhi`
			`ldq $Thi1,0($nhi)`
			`s8addq $remp,$rem_4bit,$remp`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`xor $t0,$Zlo,$Zlo`
			`extbl $Xlo,$cnt,$nlo`

			`and $nlo,0xf0,$nhi`
			`xor $Thi0,$Zhi,$Zhi`
			`xor $Tlo0,$Zlo,$Zlo`
			`sll $nlo,4,$nlo`


			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`and $nlo,0xf0,$nlo`
			`srl $Zlo,4,$Zlo`

			`s8addq $remp,$rem_4bit,$remp`
			`xor $rem,$Zhi,$Zhi`
			`addq $nlo,$Htbl,$nlo`
			`addq $nhi,$Htbl,$nhi`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`ldq $Tlo0,8($nlo)`
			`xor $t0,$Zlo,$Zlo`

			`xor $Tlo1,$Zlo,$Zlo`
			`xor $Thi1,$Zhi,$Zhi`
			`ldq $Thi0,0($nlo)`
			`bne $cnt,.Looplo$N`


			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`lda $cnt,7(zero)`
			`srl $Zlo,4,$Zlo`

			`ldq $Tlo1,8($nhi)`
			`xor $rem,$Zhi,$Zhi`
			`ldq $Thi1,0($nhi)`
			`s8addq $remp,$rem_4bit,$remp`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`xor $t0,$Zlo,$Zlo`
			`extbl $Xhi,$cnt,$nlo`

			`and $nlo,0xf0,$nhi`
			`xor $Thi0,$Zhi,$Zhi`
			`xor $Tlo0,$Zlo,$Zlo`
			`sll $nlo,4,$nlo`

			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`and $nlo,0xf0,$nlo`
			`srl $Zlo,4,$Zlo`

			`s8addq $remp,$rem_4bit,$remp`
			`xor $rem,$Zhi,$Zhi`
			`addq $nlo,$Htbl,$nlo`
			`addq $nhi,$Htbl,$nhi`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`ldq $Tlo0,8($nlo)`
			`xor $t0,$Zlo,$Zlo`

			`xor $Tlo1,$Zlo,$Zlo`
			`xor $Thi1,$Zhi,$Zhi`
			`ldq $Thi0,0($nlo)`
			`unop`


			`.Loophi$N:`
			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`subq $cnt,1,$cnt`
			`srl $Zlo,4,$Zlo`

			`ldq $Tlo1,8($nhi)`
			`xor $rem,$Zhi,$Zhi`
			`ldq $Thi1,0($nhi)`
			`s8addq $remp,$rem_4bit,$remp`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`xor $t0,$Zlo,$Zlo`
			`extbl $Xhi,$cnt,$nlo`

			`and $nlo,0xf0,$nhi`
			`xor $Thi0,$Zhi,$Zhi`
			`xor $Tlo0,$Zlo,$Zlo`
			`sll $nlo,4,$nlo`


			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`and $nlo,0xf0,$nlo`
			`srl $Zlo,4,$Zlo`

			`s8addq $remp,$rem_4bit,$remp`
			`xor $rem,$Zhi,$Zhi`
			`addq $nlo,$Htbl,$nlo`
			`addq $nhi,$Htbl,$nhi`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`ldq $Tlo0,8($nlo)`
			`xor $t0,$Zlo,$Zlo`

			`xor $Tlo1,$Zlo,$Zlo`
			`xor $Thi1,$Zhi,$Zhi`
			`ldq $Thi0,0($nlo)`
			`bne $cnt,.Loophi$N`


			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`srl $Zlo,4,$Zlo`

			`ldq $Tlo1,8($nhi)`
			`xor $rem,$Zhi,$Zhi`
			`ldq $Thi1,0($nhi)`
			`s8addq $remp,$rem_4bit,$remp`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`xor $t0,$Zlo,$Zlo`

			`xor $Tlo0,$Zlo,$Zlo`
			`xor $Thi0,$Zhi,$Zhi`

			`and $Zlo,0x0f,$remp`
			`sll $Zhi,60,$t0`
			`srl $Zlo,4,$Zlo`

			`s8addq $remp,$rem_4bit,$remp`
			`xor $rem,$Zhi,$Zhi`

			`ldq $rem,0($remp)`
			`srl $Zhi,4,$Zhi`
			`xor $Tlo1,$Zlo,$Zlo`
			`xor $Thi1,$Zhi,$Zhi`
			`xor $t0,$Zlo,$Zlo`
			`xor $rem,$Zhi,$Zhi`
			`___`
			`}}`

			`$code=<<___;`
			`#ifdef __linux__`
			`#include <asm/regdef.h>`
			`#else`
			`#include <asm.h>`
			`#include <regdef.h>`
			`#endif`

			`.text`

			`.set noat`
			`.set noreorder`
			`.globl gcm_gmult_4bit`
			`.align 4`
			`.ent gcm_gmult_4bit`
			`gcm_gmult_4bit:`
			`.frame sp,0,ra`
			`.prologue 0`

			`ldq $Xlo,8($Xi)`
			`ldq $Xhi,0($Xi)`

			`bsr $t0,picmeup`
			`nop`
			`___`

			`&loop();`

			`$code.=<<___;`
			`srl $Zlo,24,$t0 # byte swap`
			`srl $Zlo,8,$t1`

			`sll $Zlo,8,$t2`
			`sll $Zlo,24,$Zlo`
			`zapnot $t0,0x11,$t0`
			`zapnot $t1,0x22,$t1`

			`zapnot $Zlo,0x88,$Zlo`
			`or $t0,$t1,$t0`
			`zapnot $t2,0x44,$t2`

			`or $Zlo,$t0,$Zlo`
			`srl $Zhi,24,$t0`
			`srl $Zhi,8,$t1`

			`or $Zlo,$t2,$Zlo`
			`sll $Zhi,8,$t2`
			`sll $Zhi,24,$Zhi`

			`srl $Zlo,32,$Xlo`
			`sll $Zlo,32,$Zlo`

			`zapnot $t0,0x11,$t0`
			`zapnot $t1,0x22,$t1`
			`or $Zlo,$Xlo,$Xlo`

			`zapnot $Zhi,0x88,$Zhi`
			`or $t0,$t1,$t0`
			`zapnot $t2,0x44,$t2`

			`or $Zhi,$t0,$Zhi`
			`or $Zhi,$t2,$Zhi`

			`srl $Zhi,32,$Xhi`
			`sll $Zhi,32,$Zhi`

			`or $Zhi,$Xhi,$Xhi`
			`stq $Xlo,8($Xi)`
			`stq $Xhi,0($Xi)`

			`ret (ra)`
			`.end gcm_gmult_4bit`
			`___`

			`$inhi="s0";`
			`$inlo="s1";`

			`$code.=<<___;`
			`.globl gcm_ghash_4bit`
			`.align 4`
			`.ent gcm_ghash_4bit`
			`gcm_ghash_4bit:`
			`lda sp,-32(sp)`
			`stq ra,0(sp)`
			`stq s0,8(sp)`
			`stq s1,16(sp)`
			`.mask 0x04000600,-32`
			`.frame sp,32,ra`
			`.prologue 0`

			`ldq_u $inhi,0($inp)`
			`ldq_u $Thi0,7($inp)`
			`ldq_u $inlo,8($inp)`
			`ldq_u $Tlo0,15($inp)`
			`ldq $Xhi,0($Xi)`
			`ldq $Xlo,8($Xi)`

			`bsr $t0,picmeup`
			`nop`

			`.Louter:`
			`extql $inhi,$inp,$inhi`
			`extqh $Thi0,$inp,$Thi0`
			`or $inhi,$Thi0,$inhi`
			`lda $inp,16($inp)`

			`extql $inlo,$inp,$inlo`
			`extqh $Tlo0,$inp,$Tlo0`
			`or $inlo,$Tlo0,$inlo`
			`subq $len,16,$len`

			`xor $Xlo,$inlo,$Xlo`
			`xor $Xhi,$inhi,$Xhi`
			`___`

			`&loop();`

			`$code.=<<___;`
			`srl $Zlo,24,$t0 # byte swap`
			`srl $Zlo,8,$t1`

			`sll $Zlo,8,$t2`
			`sll $Zlo,24,$Zlo`
			`zapnot $t0,0x11,$t0`
			`zapnot $t1,0x22,$t1`

			`zapnot $Zlo,0x88,$Zlo`
			`or $t0,$t1,$t0`
			`zapnot $t2,0x44,$t2`

			`or $Zlo,$t0,$Zlo`
			`srl $Zhi,24,$t0`
			`srl $Zhi,8,$t1`

			`or $Zlo,$t2,$Zlo`
			`sll $Zhi,8,$t2`
			`sll $Zhi,24,$Zhi`

			`srl $Zlo,32,$Xlo`
			`sll $Zlo,32,$Zlo`
			`beq $len,.Ldone`

			`zapnot $t0,0x11,$t0`
			`zapnot $t1,0x22,$t1`
			`or $Zlo,$Xlo,$Xlo`
			`ldq_u $inhi,0($inp)`

			`zapnot $Zhi,0x88,$Zhi`
			`or $t0,$t1,$t0`
			`zapnot $t2,0x44,$t2`
			`ldq_u $Thi0,7($inp)`

			`or $Zhi,$t0,$Zhi`
			`or $Zhi,$t2,$Zhi`
			`ldq_u $inlo,8($inp)`
			`ldq_u $Tlo0,15($inp)`

			`srl $Zhi,32,$Xhi`
			`sll $Zhi,32,$Zhi`

			`or $Zhi,$Xhi,$Xhi`
			`br zero,.Louter`

			`.Ldone:`
			`zapnot $t0,0x11,$t0`
			`zapnot $t1,0x22,$t1`
			`or $Zlo,$Xlo,$Xlo`

			`zapnot $Zhi,0x88,$Zhi`
			`or $t0,$t1,$t0`
			`zapnot $t2,0x44,$t2`

			`or $Zhi,$t0,$Zhi`
			`or $Zhi,$t2,$Zhi`

			`srl $Zhi,32,$Xhi`
			`sll $Zhi,32,$Zhi`

			`or $Zhi,$Xhi,$Xhi`

			`stq $Xlo,8($Xi)`
			`stq $Xhi,0($Xi)`

			`.set noreorder`
			`/ldq ra,0(sp)/`
			`ldq s0,8(sp)`
			`ldq s1,16(sp)`
			`lda sp,32(sp)`
			`ret (ra)`
			`.end gcm_ghash_4bit`

			`.align 4`
			`.ent picmeup`
			`picmeup:`
			`.frame sp,0,$t0`
			`.prologue 0`
			`br $rem_4bit,.Lpic`
			`.Lpic: lda $rem_4bit,12($rem_4bit)`
			`ret ($t0)`
			`.end picmeup`
			`nop`
			`rem_4bit:`
			`.long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16`
			`.long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16`
			`.long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16`
			`.long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16`
			`.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"`
			`.align 4`

			`___`
			`$output=shift and open STDOUT,">$output";`
			`print $code;`
			`close STDOUT;`