Text file
src/math/big/arith_arm64.s
1 // Copyright 2025 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
6
7 //go:build !math_big_pure_go
8
9 #include "textflag.h"
10
11 // func addVV(z, x, y []Word) (c Word)
//
// addVV computes z[i] = x[i] + y[i] + carry for i = 0 .. len(z)-1, word by
// word from the lowest address upward, and returns the final carry-out
// (0 or 1) in c. Words are 8 bytes wide (all pointers step by 8).
// Only len(z) is read; this code does not check that x and y are at least
// that long — presumably guaranteed by callers (TODO confirm).
//
// The carry is kept in the C flag across both loops. The loop counters are
// therefore updated with SUB and tested with CBZ/CBNZ, which leave the
// flags untouched; do not replace them with flag-setting forms.
12 TEXT ·addVV(SB), NOSPLIT, $0
13 MOVD z_len+8(FP), R0 // R0 = len(z)
14 MOVD x_base+24(FP), R1 // R1 = pointer to x[0]
15 MOVD y_base+48(FP), R2 // R2 = pointer to y[0]
16 MOVD z_base+0(FP), R3 // R3 = pointer to z[0]
17 // compute unrolled loop lengths
18 AND $3, R0, R4 // R4 = len(z) % 4: words handled one at a time first
19 LSR $2, R0 // R0 = len(z) / 4: iterations of the 4X loop
20 ADDS ZR, R0 // clear carry (adding zero to R0 cannot carry out, so C=0)
21 loop1:
22 CBZ R4, loop1done // no leftover words: skip the 1X loop
23 loop1cont:
24 // unroll 1X
25 MOVD.P 8(R1), R5 // R5 = next x word; R1 += 8
26 MOVD.P 8(R2), R6 // R6 = next y word; R2 += 8
27 ADCS R6, R5 // R5 = R5 + R6 + C; C = carry-out for the next word
28 MOVD.P R5, 8(R3) // store z word; R3 += 8
29 SUB $1, R4 // plain SUB: must not disturb C
30 CBNZ R4, loop1cont
31 loop1done:
32 loop4:
33 CBZ R0, loop4done // fewer than 4 words remained: nothing more to do
34 loop4cont:
35 // unroll 4X
36 LDP.P 32(R1), (R4, R5) // x words 0,1 of this group; R1 += 32 (R4 is free again here)
37 LDP -16(R1), (R6, R7) // x words 2,3 (offset is relative to the already advanced R1)
38 LDP.P 32(R2), (R8, R9) // y words 0,1; R2 += 32
39 LDP -16(R2), (R10, R11) // y words 2,3
40 ADCS R8, R4 // four-word carry chain, lowest word first
41 ADCS R9, R5
42 ADCS R10, R6
43 ADCS R11, R7
44 STP.P (R4, R5), 32(R3) // z words 0,1; R3 += 32
45 STP (R6, R7), -16(R3) // z words 2,3
46 SUB $1, R0 // plain SUB: C must survive into the next iteration
47 CBNZ R0, loop4cont
48 loop4done:
49 ADC ZR, ZR, R1 // save & convert add carry: R1 = 0 + 0 + C
50 MOVD R1, c+72(FP) // return value c
51 RET
52
53 // func subVV(z, x, y []Word) (c Word)
//
// subVV computes z[i] = x[i] - y[i] - borrow for i = 0 .. len(z)-1, word by
// word from the lowest address upward, and returns the final borrow
// (0 or 1) in c. Words are 8 bytes wide (all pointers step by 8).
// Only len(z) is read; this code does not check that x and y are at least
// that long — presumably guaranteed by callers (TODO confirm).
//
// The borrow is kept in the C flag across both loops, using the arm64
// convention that C=1 means "no borrow". The loop counters are updated
// with SUB and tested with CBZ/CBNZ, which leave the flags untouched.
54 TEXT ·subVV(SB), NOSPLIT, $0
55 MOVD z_len+8(FP), R0 // R0 = len(z)
56 MOVD x_base+24(FP), R1 // R1 = pointer to x[0]
57 MOVD y_base+48(FP), R2 // R2 = pointer to y[0]
58 MOVD z_base+0(FP), R3 // R3 = pointer to z[0]
59 // compute unrolled loop lengths
60 AND $3, R0, R4 // R4 = len(z) % 4: words handled one at a time first
61 LSR $2, R0 // R0 = len(z) / 4: iterations of the 4X loop
62 SUBS ZR, R0 // clear carry (borrow): R0-0 sets C=1, which on arm64 means no borrow pending
63 loop1:
64 CBZ R4, loop1done // no leftover words: skip the 1X loop
65 loop1cont:
66 // unroll 1X
67 MOVD.P 8(R1), R5 // R5 = next x word; R1 += 8
68 MOVD.P 8(R2), R6 // R6 = next y word; R2 += 8
69 SBCS R6, R5 // R5 = R5 - R6 - borrow; C updated for the next word
70 MOVD.P R5, 8(R3) // store z word; R3 += 8
71 SUB $1, R4 // plain SUB: must not disturb C
72 CBNZ R4, loop1cont
73 loop1done:
74 loop4:
75 CBZ R0, loop4done // fewer than 4 words remained: nothing more to do
76 loop4cont:
77 // unroll 4X
78 LDP.P 32(R1), (R4, R5) // x words 0,1 of this group; R1 += 32 (R4 is free again here)
79 LDP -16(R1), (R6, R7) // x words 2,3 (offset is relative to the already advanced R1)
80 LDP.P 32(R2), (R8, R9) // y words 0,1; R2 += 32
81 LDP -16(R2), (R10, R11) // y words 2,3
82 SBCS R8, R4 // four-word borrow chain, lowest word first
83 SBCS R9, R5
84 SBCS R10, R6
85 SBCS R11, R7
86 STP.P (R4, R5), 32(R3) // z words 0,1; R3 += 32
87 STP (R6, R7), -16(R3) // z words 2,3
88 SUB $1, R0 // plain SUB: C must survive into the next iteration
89 CBNZ R0, loop4cont
90 loop4done:
91 SBC R1, R1 // save carry: R1 = R1 - R1 - borrow, i.e. 0 if no borrow, -1 if borrow
92 SUB R1, ZR, R1 // convert sub carry: R1 = 0 - R1, i.e. 0 or 1
93 MOVD R1, c+72(FP) // return value c
94 RET
95
96 // func lshVU(z, x []Word, s uint) (c Word)
//
// lshVU sets z = x << s over len(z) 8-byte words and returns in c the bits
// shifted out of the most significant word, i.e. x[len(z)-1] >> (64-s).
// If len(z) == 0 it returns c = 0 without touching memory.
//
// Words are processed from the highest address down to the lowest. R4
// always holds the "pending" bits (previous word << s) that still have to
// be OR-ed with the top bits of the next lower word before being stored.
//
// NOTE(review): the 64-s shift count assumes 0 < s < 64; that range is not
// checked here — confirm against callers. Only len(z) is read; x is
// presumably at least that long (TODO confirm).
97 TEXT ·lshVU(SB), NOSPLIT, $0
98 MOVD z_len+8(FP), R0 // R0 = len(z)
99 CBZ R0, ret0 // empty vector: just return c = 0
100 MOVD s+48(FP), R1 // R1 = s
101 MOVD x_base+24(FP), R2 // R2 = pointer to x[0]
102 MOVD z_base+0(FP), R3 // R3 = pointer to z[0]
103 // run loop backward
104 ADD R0<<3, R2, R2 // R2 = one past the last word of x (len*8 bytes)
105 ADD R0<<3, R3, R3 // R3 = one past the last word of z
106 // shift first word into carry
107 MOVD.W -8(R2), R4 // pre-decrement: R2 -= 8, R4 = top word of x
108 MOVD $64, R5
109 SUB R1, R5 // R5 = 64 - s, the complementary shift count
110 LSR R5, R4, R6 // R6 = bits shifted out of the top word
111 LSL R1, R4 // R4 = top word << s (pending bits)
112 MOVD R6, c+56(FP) // return value is final already; R6 is reused below
113 // shift remaining words
114 SUB $1, R0 // one word consumed; len-1 words left to read
115 // compute unrolled loop lengths
116 AND $3, R0, R6 // R6 = remaining % 4: words handled one at a time first
117 LSR $2, R0 // R0 = remaining / 4: iterations of the 4X loop
118 loop1:
119 CBZ R6, loop1done
120 loop1cont:
121 // unroll 1X
122 MOVD.W -8(R2), R7 // R2 -= 8, R7 = next lower x word
123 LSR R5, R7, R8 // R8 = its top s bits, moved to the bottom
124 ORR R4, R8 // combine with pending bits of the word above
125 LSL R1, R7, R4 // new pending bits = this word << s
126 MOVD.W R8, -8(R3) // R3 -= 8, store completed z word
127 SUB $1, R6
128 CBNZ R6, loop1cont
129 loop1done:
130 loop4:
131 CBZ R0, loop4done
132 loop4cont:
133 // unroll 4X
134 LDP.W -32(R2), (R9, R8) // R2 -= 32; R9, R8 = words at R2+0, R2+8
135 LDP 16(R2), (R7, R6) // R7, R6 = words at R2+16, R2+24 (R6 is the highest, handled first)
136 LSR R5, R6, R10 // word at +24: result goes to R10
137 ORR R4, R10
138 LSL R1, R6, R4
139 LSR R5, R7, R6 // word at +16: result recycles R6, whose input is consumed
140 ORR R4, R6
141 LSL R1, R7, R4
142 LSR R5, R8, R7 // word at +8: result recycles R7
143 ORR R4, R7
144 LSL R1, R8, R4
145 LSR R5, R9, R8 // word at +0: result recycles R8
146 ORR R4, R8
147 LSL R1, R9, R4 // pending bits carried into the next iteration
148 STP.W (R8, R7), -32(R3) // R3 -= 32; results for the two lower z words of this group
149 STP (R6, R10), 16(R3) // results for the two higher z words of this group
150 SUB $1, R0
151 CBNZ R0, loop4cont
152 loop4done:
153 // store final shifted bits
154 MOVD.W R4, -8(R3) // lowest z word = x[0] << s (pending bits only)
155 RET
156 ret0:
157 MOVD ZR, c+56(FP) // len(z) == 0: c = 0
158 RET
159
160 // func rshVU(z, x []Word, s uint) (c Word)
//
// rshVU sets z = x >> s over len(z) 8-byte words and returns in c the bits
// shifted out of the least significant word, left-aligned:
// c = x[0] << (64-s). If len(z) == 0 it returns c = 0 without touching
// memory.
//
// Words are processed from the lowest address upward. R4 always holds the
// "pending" bits (previous word >> s) that still have to be OR-ed with the
// low bits of the next higher word before being stored.
//
// NOTE(review): the 64-s shift count assumes 0 < s < 64; that range is not
// checked here — confirm against callers. Only len(z) is read; x is
// presumably at least that long (TODO confirm).
161 TEXT ·rshVU(SB), NOSPLIT, $0
162 MOVD z_len+8(FP), R0 // R0 = len(z)
163 CBZ R0, ret0 // empty vector: just return c = 0
164 MOVD s+48(FP), R1 // R1 = s
165 MOVD x_base+24(FP), R2 // R2 = pointer to x[0]
166 MOVD z_base+0(FP), R3 // R3 = pointer to z[0]
167 // shift first word into carry
168 MOVD.P 8(R2), R4 // R4 = x[0]; R2 += 8
169 MOVD $64, R5
170 SUB R1, R5 // R5 = 64 - s, the complementary shift count
171 LSL R5, R4, R6 // R6 = low s bits of x[0], moved to the top
172 LSR R1, R4 // R4 = x[0] >> s (pending bits)
173 MOVD R6, c+56(FP) // return value is final already; R6 is reused below
174 // shift remaining words
175 SUB $1, R0 // one word consumed; len-1 words left to read
176 // compute unrolled loop lengths
177 AND $3, R0, R6 // R6 = remaining % 4: words handled one at a time first
178 LSR $2, R0 // R0 = remaining / 4: iterations of the 4X loop
179 loop1:
180 CBZ R6, loop1done
181 loop1cont:
182 // unroll 1X
183 MOVD.P 8(R2), R7 // R7 = next higher x word; R2 += 8
184 LSL R5, R7, R8 // R8 = its low s bits, moved to the top
185 ORR R4, R8 // combine with pending bits of the word below
186 LSR R1, R7, R4 // new pending bits = this word >> s
187 MOVD.P R8, 8(R3) // store completed z word; R3 += 8
188 SUB $1, R6
189 CBNZ R6, loop1cont
190 loop1done:
191 loop4:
192 CBZ R0, loop4done
193 loop4cont:
194 // unroll 4X
195 LDP.P 32(R2), (R6, R7) // R6, R7 = x words 0,1 of this group; R2 += 32
196 LDP -16(R2), (R8, R9) // R8, R9 = x words 2,3 (relative to the advanced R2)
197 LSL R5, R6, R10 // word 0: result goes to R10
198 ORR R4, R10
199 LSR R1, R6, R4
200 LSL R5, R7, R6 // word 1: result recycles R6, whose input is consumed
201 ORR R4, R6
202 LSR R1, R7, R4
203 LSL R5, R8, R7 // word 2: result recycles R7
204 ORR R4, R7
205 LSR R1, R8, R4
206 LSL R5, R9, R8 // word 3: result recycles R8
207 ORR R4, R8
208 LSR R1, R9, R4 // pending bits carried into the next iteration
209 STP.P (R10, R6), 32(R3) // results for z words 0,1 of this group; R3 += 32
210 STP (R7, R8), -16(R3) // results for z words 2,3
211 SUB $1, R0
212 CBNZ R0, loop4cont
213 loop4done:
214 // store final shifted bits
215 MOVD.P R4, 8(R3) // highest z word = top x word >> s (pending bits only)
216 RET
217 ret0:
218 MOVD ZR, c+56(FP) // len(z) == 0: c = 0
219 RET
220
221 // func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// mulAddVWW computes the multi-word value z = x*m + a over len(z) 8-byte
// words, lowest word first, and returns the final carry word in c.
// For each word: the 128-bit product x[i]*m is formed with UMULH (high
// half) and MUL (low half); the low half plus the incoming carry word is
// stored to z[i], and the high half plus the carry bit becomes the carry
// word for the next position. R1 holds that carry word, starting as a.
// With len(z) == 0 both loops are skipped and c = a.
//
// Only len(z) is read; x is presumably at least that long (TODO confirm).
222 TEXT ·mulAddVWW(SB), NOSPLIT, $0
223 MOVD m+48(FP), R0 // R0 = multiplier m
224 MOVD a+56(FP), R1 // R1 = running carry word, initially a
225 MOVD z_len+8(FP), R2 // R2 = len(z)
226 MOVD x_base+24(FP), R3 // R3 = pointer to x[0]
227 MOVD z_base+0(FP), R4 // R4 = pointer to z[0]
228 // compute unrolled loop lengths
229 AND $7, R2, R5 // R5 = len(z) % 8: words handled one at a time first
230 LSR $3, R2 // R2 = len(z) / 8: iterations of the 8X loop
231 loop1:
232 CBZ R5, loop1done
233 loop1cont:
234 // unroll 1X
235 MOVD.P 8(R3), R6 // R6 = next x word; R3 += 8
236 // multiply
237 UMULH R0, R6, R7 // R7 = high 64 bits of x*m
238 MUL R0, R6 // R6 = low 64 bits of x*m
239 ADDS R1, R6 // add incoming carry word; C = carry-out
240 ADC ZR, R7, R1 // next carry word = high half + C
241 MOVD.P R6, 8(R4) // store z word; R4 += 8
242 SUB $1, R5
243 CBNZ R5, loop1cont
244 loop1done:
245 loop8:
246 CBZ R2, loop8done
247 loop8cont:
248 // unroll 8X
249 LDP.P 64(R3), (R5, R6) // x words 0,1 of this group; R3 += 64
250 LDP -48(R3), (R7, R8) // x words 2,3 (offsets relative to the advanced R3)
251 LDP -32(R3), (R9, R10) // x words 4,5
252 LDP -16(R3), (R11, R12) // x words 6,7
253 // multiply
// The high halves alternate between R13 and R14; each one is added, with
// the carry bit, into the low half of the following word. The chain relies
// on UMULH/MUL leaving the flags unchanged between the ADDS/ADCS steps.
254 UMULH R0, R5, R13
255 MUL R0, R5
256 ADDS R1, R5 // word 0: low half + incoming carry word; starts the flag chain
257 UMULH R0, R6, R14
258 MUL R0, R6
259 ADCS R13, R6 // word 1: low half + high half of word 0 + C
260 UMULH R0, R7, R13
261 MUL R0, R7
262 ADCS R14, R7
263 UMULH R0, R8, R14
264 MUL R0, R8
265 ADCS R13, R8
266 UMULH R0, R9, R13
267 MUL R0, R9
268 ADCS R14, R9
269 UMULH R0, R10, R14
270 MUL R0, R10
271 ADCS R13, R10
272 UMULH R0, R11, R13
273 MUL R0, R11
274 ADCS R14, R11
275 UMULH R0, R12, R14
276 MUL R0, R12
277 ADCS R13, R12
278 ADC ZR, R14, R1 // carry word for the next group = high half of word 7 + C
279 STP.P (R5, R6), 64(R4) // z words 0,1; R4 += 64
280 STP (R7, R8), -48(R4) // z words 2,3
281 STP (R9, R10), -32(R4) // z words 4,5
282 STP (R11, R12), -16(R4) // z words 6,7
283 SUB $1, R2
284 CBNZ R2, loop8cont
285 loop8done:
286 MOVD R1, c+64(FP) // return the final carry word
287 RET
288
289 // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// addMulVVWW computes the multi-word value z = x + y*m + a over len(z)
// 8-byte words, lowest word first, and returns the final carry word in c.
// For each word: the 128-bit product y[i]*m is formed with UMULH/MUL, the
// incoming carry word is added to its low half, then x[i] is added; the
// high half plus both carry bits becomes the carry word for the next
// position. R1 holds that carry word, starting as a.
// With len(z) == 0 both loops are skipped and c = a.
//
// Only len(z) is read; x and y are presumably at least that long
// (TODO confirm).
290 TEXT ·addMulVVWW(SB), NOSPLIT, $0
291 MOVD m+72(FP), R0 // R0 = multiplier m
292 MOVD a+80(FP), R1 // R1 = running carry word, initially a
293 MOVD z_len+8(FP), R2 // R2 = len(z)
294 MOVD x_base+24(FP), R3 // R3 = pointer to x[0]
295 MOVD y_base+48(FP), R4 // R4 = pointer to y[0]
296 MOVD z_base+0(FP), R5 // R5 = pointer to z[0]
297 // compute unrolled loop lengths
298 AND $7, R2, R6 // R6 = len(z) % 8: words handled one at a time first
299 LSR $3, R2 // R2 = len(z) / 8: iterations of the 8X loop
300 loop1:
301 CBZ R6, loop1done
302 loop1cont:
303 // unroll 1X
304 MOVD.P 8(R3), R7 // R7 = next x word; R3 += 8
305 MOVD.P 8(R4), R8 // R8 = next y word; R4 += 8
306 // multiply
307 UMULH R0, R8, R9 // R9 = high 64 bits of y*m
308 MUL R0, R8 // R8 = low 64 bits of y*m
309 ADDS R1, R8 // add incoming carry word; C = carry-out
310 ADC ZR, R9, R1 // carry word = high half + C
311 // add
312 ADDS R7, R8 // add the x word; C = carry-out
313 ADC ZR, R1 // fold that carry bit into the carry word
314 MOVD.P R8, 8(R5) // store z word; R5 += 8
315 SUB $1, R6
316 CBNZ R6, loop1cont
317 loop1done:
318 loop8:
319 CBZ R2, loop8done
320 loop8cont:
321 // unroll 8X
// NOTE(review): the register list skips R18 (R17 is followed by R19) —
// presumably because R18 is the reserved platform register; confirm
// before renumbering registers.
322 LDP.P 64(R3), (R6, R7) // x words 0,1 of this group; R3 += 64
323 LDP -48(R3), (R8, R9) // x words 2,3 (offsets relative to the advanced R3)
324 LDP -32(R3), (R10, R11) // x words 4,5
325 LDP -16(R3), (R12, R13) // x words 6,7
326 LDP.P 64(R4), (R14, R15) // y words 0,1; R4 += 64
327 LDP -48(R4), (R16, R17) // y words 2,3
328 LDP -32(R4), (R19, R20) // y words 4,5
329 LDP -16(R4), (R21, R22) // y words 6,7
330 // multiply
// First carry chain: y*m + incoming carry word. The high halves alternate
// between R23 and R24; each one is added, with the carry bit, into the low
// half of the following word. The chain relies on UMULH/MUL leaving the
// flags unchanged between the ADDS/ADCS steps.
331 UMULH R0, R14, R23
332 MUL R0, R14
333 ADDS R1, R14 // word 0: low half + incoming carry word; starts the flag chain
334 UMULH R0, R15, R24
335 MUL R0, R15
336 ADCS R23, R15 // word 1: low half + high half of word 0 + C
337 UMULH R0, R16, R23
338 MUL R0, R16
339 ADCS R24, R16
340 UMULH R0, R17, R24
341 MUL R0, R17
342 ADCS R23, R17
343 UMULH R0, R19, R23
344 MUL R0, R19
345 ADCS R24, R19
346 UMULH R0, R20, R24
347 MUL R0, R20
348 ADCS R23, R20
349 UMULH R0, R21, R23
350 MUL R0, R21
351 ADCS R24, R21
352 UMULH R0, R22, R24
353 MUL R0, R22
354 ADCS R23, R22
355 ADC ZR, R24, R1 // carry word so far = high half of word 7 + C
356 // add
// Second carry chain: add the eight x words to the products, lowest first.
357 ADDS R6, R14
358 ADCS R7, R15
359 ADCS R8, R16
360 ADCS R9, R17
361 ADCS R10, R19
362 ADCS R11, R20
363 ADCS R12, R21
364 ADCS R13, R22
365 ADC ZR, R1 // fold the final carry bit of the add chain into the carry word
366 STP.P (R14, R15), 64(R5) // z words 0,1; R5 += 64
367 STP (R16, R17), -48(R5) // z words 2,3
368 STP (R19, R20), -32(R5) // z words 4,5
369 STP (R21, R22), -16(R5) // z words 6,7
370 SUB $1, R2
371 CBNZ R2, loop8cont
372 loop8done:
373 MOVD R1, c+88(FP) // return the final carry word
374 RET
375
View as plain text