// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"

// TODO(mzh): use Zvkb if possible
10
11 #define QR(A, B, C, D) \
12 VADDVV A, B, A \
13 VXORVV D, A, D \
14 VSLLVI $16, D, V28 \
15 VSRLVI $16, D, D \
16 VXORVV V28, D, D \
17 VADDVV D, C, C \
18 VXORVV C, B, B \
19 VSLLVI $12, B, V29 \
20 VSRLVI $20, B, B \
21 VXORVV V29, B, B \
22 VADDVV B, A, A \
23 VXORVV A, D, D \
24 VSLLVI $8, D, V30 \
25 VSRLVI $24, D, D \
26 VXORVV V30, D, D \
27 VADDVV D, C, C \
28 VXORVV C, B, B \
29 VSLLVI $7, B, V31 \
30 VSRLVI $25, B, B \
31 VXORVV V31, B, B
32
33 // block runs four ChaCha8 block transformations using four elements in each V register.
34 // func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
35 TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
36 // seed in X10
37 // blocks in X11
38 // counter in X12
39
40 #ifndef hasV
41 MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X13
42 BNEZ X13, vector_chacha8
43 JMP ·block_generic<ABIInternal>(SB)
44 #endif
45
46 vector_chacha8:
47 // At least VLEN >= 128
48 VSETIVLI $4, E32, M1, TA, MA, X0
49 // Load initial constants into top row.
50 MOV $·chachaConst(SB), X14
51 VLSSEG4E32V (X14), X0, V0 // V0, V1, V2, V3 = const row
52 VLSSEG8E32V (X10), X0, V4 // V4 ... V11, seed
53 VIDV V12
54 VADDVX X12, V12, V12 // counter
55
56 // Clear all nonces.
57 VXORVV V13, V13, V13
58 VXORVV V14, V14, V14
59 VXORVV V15, V15, V15
60
61 // Copy initial state.
62 VMV4RV V4, V20
63 VMV4RV V8, V24
64
65 MOV $4, X15
66 PCALIGN $16
67 loop:
68 QR(V0, V4, V8, V12)
69 QR(V1, V5, V9, V13)
70 QR(V2, V6, V10, V14)
71 QR(V3, V7, V11, V15)
72
73 QR(V0, V5, V10, V15)
74 QR(V1, V6, V11, V12)
75 QR(V2, V7, V8, V13)
76 QR(V3, V4, V9, V14)
77
78 SUB $1, X15
79 BNEZ X15, loop
80
81 VADDVV V20, V4, V4
82 VADDVV V21, V5, V5
83 VADDVV V22, V6, V6
84 VADDVV V23, V7, V7
85 VADDVV V24, V8, V8
86 VADDVV V25, V9, V9
87 VADDVV V26, V10, V10
88 VADDVV V27, V11, V11
89
90 VSE32V V0, (X11); ADD $16, X11;
91 VSE32V V1, (X11); ADD $16, X11;
92 VSE32V V2, (X11); ADD $16, X11;
93 VSE32V V3, (X11); ADD $16, X11;
94 VSE32V V4, (X11); ADD $16, X11;
95 VSE32V V5, (X11); ADD $16, X11;
96 VSE32V V6, (X11); ADD $16, X11;
97 VSE32V V7, (X11); ADD $16, X11;
98 VSE32V V8, (X11); ADD $16, X11;
99 VSE32V V9, (X11); ADD $16, X11;
100 VSE32V V10, (X11); ADD $16, X11;
101 VSE32V V11, (X11); ADD $16, X11;
102 VSE32V V12, (X11); ADD $16, X11;
103 VSE32V V13, (X11); ADD $16, X11;
104 VSE32V V14, (X11); ADD $16, X11;
105 VSE32V V15, (X11); ADD $16, X11;
106
107 RET
108
109 GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
110 DATA ·chachaConst+0x00(SB)/4, $0x61707865
111 DATA ·chachaConst+0x04(SB)/4, $0x3320646e
112 DATA ·chachaConst+0x08(SB)/4, $0x79622d32
113 DATA ·chachaConst+0x0c(SB)/4, $0x6b206574
114