Skip to content

Commit

Permalink
internal/poly1305: Port sum_amd64.s to Avo
Browse files Browse the repository at this point in the history
This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
ASM_PATH="internal/poly1305/sum_amd64.s"
REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE:$ASM_PATH") \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  "$ASM_PATH" \
  > /tmp/avo.s

normalize(){
  awk '{
    $1=$2=$3="";
    print substr($0,4)
  }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

Change-Id: I80212c95d1b05335d7f6b73a3030b6f812f6105b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600035
Reviewed-by: Roland Shoemaker <roland@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
  • Loading branch information
Garrett-Bodley authored and rolandshoemaker committed Sep 4, 2024
1 parent 7eace71 commit bcb0f91
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 74 deletions.
15 changes: 15 additions & 0 deletions internal/poly1305/_asm/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module internal/poly1305/_asm

go 1.23

require (
github.com/mmcloughlin/avo v0.6.0
golang.org/x/crypto v0.26.0
)

require (
golang.org/x/mod v0.20.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.24.0 // indirect
golang.org/x/tools v0.24.0 // indirect
)
12 changes: 12 additions & 0 deletions internal/poly1305/_asm/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
126 changes: 126 additions & 0 deletions internal/poly1305/_asm/sum_amd64_asm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
_ "golang.org/x/crypto/sha3"
)

//go:generate go run . -out ../sum_amd64.s -pkg poly1305

// main is the generator entry point. It selects the target package and build
// constraint, emits the update function, and then writes out the assembly
// (the destination is chosen by the flags on the go:generate line above).
func main() {
	const (
		targetPkg       = "golang.org/x/crypto/internal/poly1305"
		buildConstraint = "gc,!purego"
	)

	Package(targetPkg)
	ConstraintExpr(buildConstraint)
	update()
	Generate()
}

// update emits the amd64 implementation of update(state, msg).
//
// The generated routine keeps the accumulator limbs h0..h2 in R8..R10 and the
// key limbs r0, r1 in R11, R12 (loaded from offsets 0..32 of state). It
// consumes msg 16 bytes at a time, handles a trailing block of 1..15 bytes
// separately, and finally writes h0..h2 back to state. Registers are fixed
// physical registers so the emitted instruction stream has a predictable shape.
func update() {
Implement("update")

// Arguments: DI = state pointer, SI = msg base pointer, R15 = msg length.
Load(Param("state"), RDI)
MOVQ(NewParamAddr("msg_base", 8), RSI)
MOVQ(NewParamAddr("msg_len", 16), R15)

MOVQ(Mem{Base: DI}.Offset(0), R8) // h0
MOVQ(Mem{Base: DI}.Offset(8), R9) // h1
MOVQ(Mem{Base: DI}.Offset(16), R10) // h2
MOVQ(Mem{Base: DI}.Offset(24), R11) // r0
MOVQ(Mem{Base: DI}.Offset(32), R12) // r1

// Fewer than 16 bytes in total: skip straight to the partial-block path.
CMPQ(R15, Imm(16))
JB(LabelRef("bytes_between_0_and_15"))

// Full-block path: add the next 16 message bytes into h (advancing SI).
Label("loop")
POLY1305_ADD(RSI, R8, R9, R10)

// Shared tail of both paths: h *= r (with reduction), then account for the
// 16 bytes just absorbed and loop while at least one full block remains.
Label("multiply")
POLY1305_MUL(R8, R9, R10, R11, R12, RBX, RCX, R13, R14)
SUBQ(Imm(16), R15)
CMPQ(R15, Imm(16))
JAE(LabelRef("loop"))

// Partial-block path: R15 holds the 0..15 bytes left over.
Label("bytes_between_0_and_15")
TESTQ(R15, R15)
JZ(LabelRef("done"))
// Build the final block as a 128-bit value in CX:BX (CX is the high half).
// BX starts at 1 so that, after the shifts below, a single 1 byte sits
// immediately above the last message byte.
MOVQ(U32(1), RBX)
XORQ(RCX, RCX)
XORQ(R13, R13)
// Point SI one past the last message byte; bytes are consumed back to front.
ADDQ(R15, RSI)

// Each iteration shifts CX:BX left by one byte and inserts the next message
// byte (walking backwards) into the low byte of BX. R13 was zeroed above, so
// only its low byte is ever non-zero when XORed in.
Label("flush_buffer")
SHLQ(Imm(8), RBX, RCX)
SHLQ(Imm(8), RBX)
MOVB(Mem{Base: SI}.Offset(-1), R13B)
XORQ(R13, RBX)
DECQ(RSI)
DECQ(R15)
JNZ(LabelRef("flush_buffer"))

// Add the assembled block into h, then reuse the multiply path. Setting
// R15 to 16 makes the SUBQ there leave 0, so control falls through to
// bytes_between_0_and_15 again and exits via done.
ADDQ(RBX, R8)
ADCQ(RCX, R9)
ADCQ(Imm(0), R10)
MOVQ(U32(16), R15)
JMP(LabelRef("multiply"))

// Store the updated accumulator back into state; r0 and r1 are not written.
Label("done")
MOVQ(R8, Mem{Base: DI}.Offset(0))
MOVQ(R9, Mem{Base: DI}.Offset(8))
MOVQ(R10, Mem{Base: DI}.Offset(16))
RET()
}

// POLY1305_ADD emits the instructions that absorb one 16-byte block into the
// accumulator. The two 64-bit words at the address held in msg are added into
// h0 and h1 with carry propagation, h2 receives the carry plus a constant 1
// (one unit of h2 is 2^128, i.e. the bit just above the 16-byte block), and msg
// is advanced by 16 bytes.
func POLY1305_ADD(msg, h0, h1, h2 GPPhysical) {
	block := Mem{Base: msg}

	ADDQ(block.Offset(0), h0)
	ADCQ(block.Offset(8), h1)
	ADCQ(Imm(1), h2)

	// LEAQ is used for the pointer bump so the flags are left untouched.
	LEAQ(block.Offset(16), msg)
}

// POLY1305_MUL emits the instructions that multiply the accumulator
// (h0, h1, h2) by the key limbs (r0, r1) and fold the result back into
// h0..h2.
//
// t0..t3 are scratch registers that receive the four 64-bit limbs of the
// product. RAX and RDX are clobbered: every MULQ takes one factor from RAX
// and leaves its 128-bit result in RDX:RAX. The instruction order matters
// because each ADCQ consumes the carry flag of the instruction before it.
func POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3 GPPhysical) {
// Row 1: r0 * (h0, h1, h2).
// t1:t0 = r0*h0.
MOVQ(r0, RAX)
MULQ(h0)
MOVQ(RAX, t0)
MOVQ(RDX, t1)
// t1 += low(r0*h1); the carry is folded into the high half in RDX.
MOVQ(r0, RAX)
MULQ(h1)
ADDQ(RAX, t1)
ADCQ(Imm(0), RDX)
// t2 = r0*h2 + high(r0*h1). IMULQ keeps only the low 64 bits of r0*h2.
MOVQ(r0, t2)
IMULQ(h2, t2)
ADDQ(RDX, t2)

// Row 2: r1 * (h0, h1, h2), accumulated one limb higher.
// t1 += low(r1*h0); the high half (plus carry) is parked in h0, which is
// free to reuse because the original h0 is not read again after this MULQ.
MOVQ(r1, RAX)
MULQ(h0)
ADDQ(RAX, t1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, h0)
// t3 = r1*h2 (low 64 bits).
MOVQ(r1, t3)
IMULQ(h2, t3)
// t3:t2 += r1*h1, then add the parked high half of r1*h0 into t2.
MOVQ(r1, RAX)
MULQ(h1)
ADDQ(RAX, t2)
ADCQ(RDX, t3)
ADDQ(h0, t2)
ADCQ(Imm(0), t3)

// Reduction. The product is now t0 + t1*2^64 + t2*2^128 + t3*2^192.
// Keep the low 130 bits in h (h2 = t2 & 3) and let c be everything above
// bit 130. c is added back twice: first as 4*c (t3:t2 with the low two bits
// of t2 cleared), then as c (t3:t2 shifted right by two). The total, 5*c,
// is the fold-back for arithmetic modulo 2^130 - 5.
MOVQ(t0, h0)
MOVQ(t1, h1)
MOVQ(t2, h2)
ANDQ(Imm(3), h2)
MOVQ(t2, t0)
ANDQ(I32(-4), t0)
ADDQ(t0, h0)
ADCQ(t3, h1)
ADCQ(Imm(0), h2)
SHRQ(Imm(2), t3, t2)
SHRQ(Imm(2), t3)
ADDQ(t2, h0)
ADCQ(t3, h1)
ADCQ(Imm(0), h2)
}
133 changes: 59 additions & 74 deletions internal/poly1305/sum_amd64.s
Original file line number Diff line number Diff line change
@@ -1,108 +1,93 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.

//go:build gc && !purego

#include "textflag.h"

#define POLY1305_ADD(msg, h0, h1, h2) \
ADDQ 0(msg), h0; \
ADCQ 8(msg), h1; \
ADCQ $1, h2; \
LEAQ 16(msg), msg

#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
MOVQ r0, AX; \
MULQ h0; \
MOVQ AX, t0; \
MOVQ DX, t1; \
MOVQ r0, AX; \
MULQ h1; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ r0, t2; \
IMULQ h2, t2; \
ADDQ DX, t2; \
\
MOVQ r1, AX; \
MULQ h0; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ DX, h0; \
MOVQ r1, t3; \
IMULQ h2, t3; \
MOVQ r1, AX; \
MULQ h1; \
ADDQ AX, t2; \
ADCQ DX, t3; \
ADDQ h0, t2; \
ADCQ $0, t3; \
\
MOVQ t0, h0; \
MOVQ t1, h1; \
MOVQ t2, h2; \
ANDQ $3, h2; \
MOVQ t2, t0; \
ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
ADDQ t0, h0; \
ADCQ t3, h1; \
ADCQ $0, h2; \
SHRQ $2, t3, t2; \
SHRQ $2, t3; \
ADDQ t2, h0; \
ADCQ t3, h1; \
ADCQ $0, h2

// func update(state *[7]uint64, msg []byte)
// func update(state *macState, msg []byte)
TEXT ·update(SB), $0-32
MOVQ state+0(FP), DI
MOVQ msg_base+8(FP), SI
MOVQ msg_len+16(FP), R15

MOVQ 0(DI), R8 // h0
MOVQ 8(DI), R9 // h1
MOVQ 16(DI), R10 // h2
MOVQ 24(DI), R11 // r0
MOVQ 32(DI), R12 // r1

CMPQ R15, $16
MOVQ (DI), R8
MOVQ 8(DI), R9
MOVQ 16(DI), R10
MOVQ 24(DI), R11
MOVQ 32(DI), R12
CMPQ R15, $0x10
JB bytes_between_0_and_15

loop:
POLY1305_ADD(SI, R8, R9, R10)
ADDQ (SI), R8
ADCQ 8(SI), R9
ADCQ $0x01, R10
LEAQ 16(SI), SI

multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
SUBQ $16, R15
CMPQ R15, $16
JAE loop
MOVQ R11, AX
MULQ R8
MOVQ AX, BX
MOVQ DX, CX
MOVQ R11, AX
MULQ R9
ADDQ AX, CX
ADCQ $0x00, DX
MOVQ R11, R13
IMULQ R10, R13
ADDQ DX, R13
MOVQ R12, AX
MULQ R8
ADDQ AX, CX
ADCQ $0x00, DX
MOVQ DX, R8
MOVQ R12, R14
IMULQ R10, R14
MOVQ R12, AX
MULQ R9
ADDQ AX, R13
ADCQ DX, R14
ADDQ R8, R13
ADCQ $0x00, R14
MOVQ BX, R8
MOVQ CX, R9
MOVQ R13, R10
ANDQ $0x03, R10
MOVQ R13, BX
ANDQ $-4, BX
ADDQ BX, R8
ADCQ R14, R9
ADCQ $0x00, R10
SHRQ $0x02, R14, R13
SHRQ $0x02, R14
ADDQ R13, R8
ADCQ R14, R9
ADCQ $0x00, R10
SUBQ $0x10, R15
CMPQ R15, $0x10
JAE loop

bytes_between_0_and_15:
TESTQ R15, R15
JZ done
MOVQ $1, BX
MOVQ $0x00000001, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI

flush_buffer:
SHLQ $8, BX, CX
SHLQ $8, BX
SHLQ $0x08, BX, CX
SHLQ $0x08, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer

ADDQ BX, R8
ADCQ CX, R9
ADCQ $0, R10
MOVQ $16, R15
ADCQ $0x00, R10
MOVQ $0x00000010, R15
JMP multiply

done:
MOVQ R8, 0(DI)
MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
RET

0 comments on commit bcb0f91

Please sign in to comment.