// src/math/big/arith_amd64.s

// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.

//go:build !math_big_pure_go

#include "textflag.h"

11 // func addVV(z, x, y []Word) (c Word)
12 TEXT ·addVV(SB), NOSPLIT, $0
13 MOVQ z_len+8(FP), BX
14 MOVQ x_base+24(FP), SI
15 MOVQ y_base+48(FP), DI
16 MOVQ z_base+0(FP), R8
17 // compute unrolled loop lengths
18 MOVQ BX, R9
19 ANDQ $3, R9
20 SHRQ $2, BX
21 MOVQ $0, R10 // clear saved carry
22 loop1:
23 TESTQ R9, R9; JZ loop1done
24 loop1cont:
25 // unroll 1X
26 ADDQ R10, R10 // restore carry
27 MOVQ 0(SI), R10
28 ADCQ 0(DI), R10
29 MOVQ R10, 0(R8)
30 SBBQ R10, R10 // save carry
31 LEAQ 8(SI), SI // ADD $8, SI
32 LEAQ 8(DI), DI // ADD $8, DI
33 LEAQ 8(R8), R8 // ADD $8, R8
34 SUBQ $1, R9; JNZ loop1cont
35 loop1done:
36 loop4:
37 TESTQ BX, BX; JZ loop4done
38 loop4cont:
39 // unroll 4X
40 ADDQ R10, R10 // restore carry
41 MOVQ 0(SI), R9
42 MOVQ 8(SI), R10
43 MOVQ 16(SI), R11
44 MOVQ 24(SI), R12
45 ADCQ 0(DI), R9
46 ADCQ 8(DI), R10
47 ADCQ 16(DI), R11
48 ADCQ 24(DI), R12
49 MOVQ R9, 0(R8)
50 MOVQ R10, 8(R8)
51 MOVQ R11, 16(R8)
52 MOVQ R12, 24(R8)
53 SBBQ R10, R10 // save carry
54 LEAQ 32(SI), SI // ADD $32, SI
55 LEAQ 32(DI), DI // ADD $32, DI
56 LEAQ 32(R8), R8 // ADD $32, R8
57 SUBQ $1, BX; JNZ loop4cont
58 loop4done:
59 NEGQ R10 // convert add carry
60 MOVQ R10, c+72(FP)
61 RET
62
63 // func subVV(z, x, y []Word) (c Word)
64 TEXT ·subVV(SB), NOSPLIT, $0
65 MOVQ z_len+8(FP), BX
66 MOVQ x_base+24(FP), SI
67 MOVQ y_base+48(FP), DI
68 MOVQ z_base+0(FP), R8
69 // compute unrolled loop lengths
70 MOVQ BX, R9
71 ANDQ $3, R9
72 SHRQ $2, BX
73 MOVQ $0, R10 // clear saved carry
74 loop1:
75 TESTQ R9, R9; JZ loop1done
76 loop1cont:
77 // unroll 1X
78 ADDQ R10, R10 // restore carry
79 MOVQ 0(SI), R10
80 SBBQ 0(DI), R10
81 MOVQ R10, 0(R8)
82 SBBQ R10, R10 // save carry
83 LEAQ 8(SI), SI // ADD $8, SI
84 LEAQ 8(DI), DI // ADD $8, DI
85 LEAQ 8(R8), R8 // ADD $8, R8
86 SUBQ $1, R9; JNZ loop1cont
87 loop1done:
88 loop4:
89 TESTQ BX, BX; JZ loop4done
90 loop4cont:
91 // unroll 4X
92 ADDQ R10, R10 // restore carry
93 MOVQ 0(SI), R9
94 MOVQ 8(SI), R10
95 MOVQ 16(SI), R11
96 MOVQ 24(SI), R12
97 SBBQ 0(DI), R9
98 SBBQ 8(DI), R10
99 SBBQ 16(DI), R11
100 SBBQ 24(DI), R12
101 MOVQ R9, 0(R8)
102 MOVQ R10, 8(R8)
103 MOVQ R11, 16(R8)
104 MOVQ R12, 24(R8)
105 SBBQ R10, R10 // save carry
106 LEAQ 32(SI), SI // ADD $32, SI
107 LEAQ 32(DI), DI // ADD $32, DI
108 LEAQ 32(R8), R8 // ADD $32, R8
109 SUBQ $1, BX; JNZ loop4cont
110 loop4done:
111 NEGQ R10 // convert sub carry
112 MOVQ R10, c+72(FP)
113 RET
114
115 // func lshVU(z, x []Word, s uint) (c Word)
116 TEXT ·lshVU(SB), NOSPLIT, $0
117 MOVQ z_len+8(FP), BX
118 TESTQ BX, BX; JZ ret0
119 MOVQ s+48(FP), CX
120 MOVQ x_base+24(FP), SI
121 MOVQ z_base+0(FP), DI
122 // run loop backward
123 LEAQ (SI)(BX*8), SI
124 LEAQ (DI)(BX*8), DI
125 // shift first word into carry
126 MOVQ -8(SI), R8
127 MOVQ $0, R9
128 SHLQ CX, R8, R9
129 MOVQ R9, c+56(FP)
130 // shift remaining words
131 SUBQ $1, BX
132 // compute unrolled loop lengths
133 MOVQ BX, R9
134 ANDQ $3, R9
135 SHRQ $2, BX
136 loop1:
137 TESTQ R9, R9; JZ loop1done
138 loop1cont:
139 // unroll 1X
140 MOVQ -16(SI), R10
141 SHLQ CX, R10, R8
142 MOVQ R8, -8(DI)
143 MOVQ R10, R8
144 LEAQ -8(SI), SI // ADD $-8, SI
145 LEAQ -8(DI), DI // ADD $-8, DI
146 SUBQ $1, R9; JNZ loop1cont
147 loop1done:
148 loop4:
149 TESTQ BX, BX; JZ loop4done
150 loop4cont:
151 // unroll 4X
152 MOVQ -16(SI), R9
153 MOVQ -24(SI), R10
154 MOVQ -32(SI), R11
155 MOVQ -40(SI), R12
156 SHLQ CX, R9, R8
157 SHLQ CX, R10, R9
158 SHLQ CX, R11, R10
159 SHLQ CX, R12, R11
160 MOVQ R8, -8(DI)
161 MOVQ R9, -16(DI)
162 MOVQ R10, -24(DI)
163 MOVQ R11, -32(DI)
164 MOVQ R12, R8
165 LEAQ -32(SI), SI // ADD $-32, SI
166 LEAQ -32(DI), DI // ADD $-32, DI
167 SUBQ $1, BX; JNZ loop4cont
168 loop4done:
169 // store final shifted bits
170 SHLQ CX, R8
171 MOVQ R8, -8(DI)
172 RET
173 ret0:
174 MOVQ $0, c+56(FP)
175 RET
176
177 // func rshVU(z, x []Word, s uint) (c Word)
178 TEXT ·rshVU(SB), NOSPLIT, $0
179 MOVQ z_len+8(FP), BX
180 TESTQ BX, BX; JZ ret0
181 MOVQ s+48(FP), CX
182 MOVQ x_base+24(FP), SI
183 MOVQ z_base+0(FP), DI
184 // shift first word into carry
185 MOVQ 0(SI), R8
186 MOVQ $0, R9
187 SHRQ CX, R8, R9
188 MOVQ R9, c+56(FP)
189 // shift remaining words
190 SUBQ $1, BX
191 // compute unrolled loop lengths
192 MOVQ BX, R9
193 ANDQ $3, R9
194 SHRQ $2, BX
195 loop1:
196 TESTQ R9, R9; JZ loop1done
197 loop1cont:
198 // unroll 1X
199 MOVQ 8(SI), R10
200 SHRQ CX, R10, R8
201 MOVQ R8, 0(DI)
202 MOVQ R10, R8
203 LEAQ 8(SI), SI // ADD $8, SI
204 LEAQ 8(DI), DI // ADD $8, DI
205 SUBQ $1, R9; JNZ loop1cont
206 loop1done:
207 loop4:
208 TESTQ BX, BX; JZ loop4done
209 loop4cont:
210 // unroll 4X
211 MOVQ 8(SI), R9
212 MOVQ 16(SI), R10
213 MOVQ 24(SI), R11
214 MOVQ 32(SI), R12
215 SHRQ CX, R9, R8
216 SHRQ CX, R10, R9
217 SHRQ CX, R11, R10
218 SHRQ CX, R12, R11
219 MOVQ R8, 0(DI)
220 MOVQ R9, 8(DI)
221 MOVQ R10, 16(DI)
222 MOVQ R11, 24(DI)
223 MOVQ R12, R8
224 LEAQ 32(SI), SI // ADD $32, SI
225 LEAQ 32(DI), DI // ADD $32, DI
226 SUBQ $1, BX; JNZ loop4cont
227 loop4done:
228 // store final shifted bits
229 SHRQ CX, R8
230 MOVQ R8, 0(DI)
231 RET
232 ret0:
233 MOVQ $0, c+56(FP)
234 RET
235
236 // func mulAddVWW(z, x []Word, m, a Word) (c Word)
237 TEXT ·mulAddVWW(SB), NOSPLIT, $0
238 MOVQ m+48(FP), BX
239 MOVQ a+56(FP), SI
240 MOVQ z_len+8(FP), DI
241 MOVQ x_base+24(FP), R8
242 MOVQ z_base+0(FP), R9
243 // compute unrolled loop lengths
244 MOVQ DI, R10
245 ANDQ $3, R10
246 SHRQ $2, DI
247 loop1:
248 TESTQ R10, R10; JZ loop1done
249 loop1cont:
250 // unroll 1X in batches of 1
251 MOVQ 0(R8), AX
252 // multiply
253 MULQ BX
254 ADDQ SI, AX
255 MOVQ DX, SI
256 ADCQ $0, SI
257 MOVQ AX, 0(R9)
258 LEAQ 8(R8), R8 // ADD $8, R8
259 LEAQ 8(R9), R9 // ADD $8, R9
260 SUBQ $1, R10; JNZ loop1cont
261 loop1done:
262 loop4:
263 TESTQ DI, DI; JZ loop4done
264 loop4cont:
265 // unroll 4X in batches of 1
266 MOVQ 0(R8), AX
267 // multiply
268 MULQ BX
269 ADDQ SI, AX
270 MOVQ DX, SI
271 ADCQ $0, SI
272 MOVQ AX, 0(R9)
273 MOVQ 8(R8), AX
274 // multiply
275 MULQ BX
276 ADDQ SI, AX
277 MOVQ DX, SI
278 ADCQ $0, SI
279 MOVQ AX, 8(R9)
280 MOVQ 16(R8), AX
281 // multiply
282 MULQ BX
283 ADDQ SI, AX
284 MOVQ DX, SI
285 ADCQ $0, SI
286 MOVQ AX, 16(R9)
287 MOVQ 24(R8), AX
288 // multiply
289 MULQ BX
290 ADDQ SI, AX
291 MOVQ DX, SI
292 ADCQ $0, SI
293 MOVQ AX, 24(R9)
294 LEAQ 32(R8), R8 // ADD $32, R8
295 LEAQ 32(R9), R9 // ADD $32, R9
296 SUBQ $1, DI; JNZ loop4cont
297 loop4done:
298 MOVQ SI, c+64(FP)
299 RET
300
301 // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
302 TEXT ·addMulVVWW(SB), NOSPLIT, $0
303 CMPB ·hasADX(SB), $0; JNZ altcarry
304 MOVQ m+72(FP), BX
305 MOVQ a+80(FP), SI
306 MOVQ z_len+8(FP), DI
307 MOVQ x_base+24(FP), R8
308 MOVQ y_base+48(FP), R9
309 MOVQ z_base+0(FP), R10
310 // compute unrolled loop lengths
311 MOVQ DI, R11
312 ANDQ $3, R11
313 SHRQ $2, DI
314 loop1:
315 TESTQ R11, R11; JZ loop1done
316 loop1cont:
317 // unroll 1X in batches of 1
318 MOVQ 0(R9), AX
319 // multiply
320 MULQ BX
321 ADDQ SI, AX
322 MOVQ DX, SI
323 ADCQ $0, SI
324 // add
325 ADDQ 0(R8), AX
326 ADCQ $0, SI
327 MOVQ AX, 0(R10)
328 LEAQ 8(R8), R8 // ADD $8, R8
329 LEAQ 8(R9), R9 // ADD $8, R9
330 LEAQ 8(R10), R10 // ADD $8, R10
331 SUBQ $1, R11; JNZ loop1cont
332 loop1done:
333 loop4:
334 TESTQ DI, DI; JZ loop4done
335 loop4cont:
336 // unroll 4X in batches of 1
337 MOVQ 0(R9), AX
338 // multiply
339 MULQ BX
340 ADDQ SI, AX
341 MOVQ DX, SI
342 ADCQ $0, SI
343 // add
344 ADDQ 0(R8), AX
345 ADCQ $0, SI
346 MOVQ AX, 0(R10)
347 MOVQ 8(R9), AX
348 // multiply
349 MULQ BX
350 ADDQ SI, AX
351 MOVQ DX, SI
352 ADCQ $0, SI
353 // add
354 ADDQ 8(R8), AX
355 ADCQ $0, SI
356 MOVQ AX, 8(R10)
357 MOVQ 16(R9), AX
358 // multiply
359 MULQ BX
360 ADDQ SI, AX
361 MOVQ DX, SI
362 ADCQ $0, SI
363 // add
364 ADDQ 16(R8), AX
365 ADCQ $0, SI
366 MOVQ AX, 16(R10)
367 MOVQ 24(R9), AX
368 // multiply
369 MULQ BX
370 ADDQ SI, AX
371 MOVQ DX, SI
372 ADCQ $0, SI
373 // add
374 ADDQ 24(R8), AX
375 ADCQ $0, SI
376 MOVQ AX, 24(R10)
377 LEAQ 32(R8), R8 // ADD $32, R8
378 LEAQ 32(R9), R9 // ADD $32, R9
379 LEAQ 32(R10), R10 // ADD $32, R10
380 SUBQ $1, DI; JNZ loop4cont
381 loop4done:
382 MOVQ SI, c+88(FP)
383 RET
384 altcarry:
385 MOVQ m+72(FP), DX
386 MOVQ a+80(FP), BX
387 MOVQ z_len+8(FP), SI
388 MOVQ $0, DI
389 MOVQ x_base+24(FP), R8
390 MOVQ y_base+48(FP), R9
391 MOVQ z_base+0(FP), R10
392 // compute unrolled loop lengths
393 MOVQ SI, R11
394 ANDQ $7, R11
395 SHRQ $3, SI
396 alt1:
397 TESTQ R11, R11; JZ alt1done
398 alt1cont:
399 // unroll 1X
400 // multiply and add
401 TESTQ AX, AX // clear carry
402 TESTQ AX, AX // clear carry
403 MULXQ 0(R9), R13, R12
404 ADCXQ BX, R13
405 ADOXQ 0(R8), R13
406 MOVQ R13, 0(R10)
407 MOVQ R12, BX
408 ADCXQ DI, BX
409 ADOXQ DI, BX
410 LEAQ 8(R8), R8 // ADD $8, R8
411 LEAQ 8(R9), R9 // ADD $8, R9
412 LEAQ 8(R10), R10 // ADD $8, R10
413 SUBQ $1, R11; JNZ alt1cont
414 alt1done:
415 alt8:
416 TESTQ SI, SI; JZ alt8done
417 alt8cont:
418 // unroll 8X in batches of 2
419 // multiply and add
420 TESTQ AX, AX // clear carry
421 TESTQ AX, AX // clear carry
422 MULXQ 0(R9), R13, R11
423 ADCXQ BX, R13
424 ADOXQ 0(R8), R13
425 MULXQ 8(R9), R14, BX
426 ADCXQ R11, R14
427 ADOXQ 8(R8), R14
428 MOVQ R13, 0(R10)
429 MOVQ R14, 8(R10)
430 MULXQ 16(R9), R13, R11
431 ADCXQ BX, R13
432 ADOXQ 16(R8), R13
433 MULXQ 24(R9), R14, BX
434 ADCXQ R11, R14
435 ADOXQ 24(R8), R14
436 MOVQ R13, 16(R10)
437 MOVQ R14, 24(R10)
438 MULXQ 32(R9), R13, R11
439 ADCXQ BX, R13
440 ADOXQ 32(R8), R13
441 MULXQ 40(R9), R14, BX
442 ADCXQ R11, R14
443 ADOXQ 40(R8), R14
444 MOVQ R13, 32(R10)
445 MOVQ R14, 40(R10)
446 MULXQ 48(R9), R13, R11
447 ADCXQ BX, R13
448 ADOXQ 48(R8), R13
449 MULXQ 56(R9), R14, BX
450 ADCXQ R11, R14
451 ADOXQ 56(R8), R14
452 MOVQ R13, 48(R10)
453 MOVQ R14, 56(R10)
454 ADCXQ DI, BX
455 ADOXQ DI, BX
456 LEAQ 64(R8), R8 // ADD $64, R8
457 LEAQ 64(R9), R9 // ADD $64, R9
458 LEAQ 64(R10), R10 // ADD $64, R10
459 SUBQ $1, SI; JNZ alt8cont
460 alt8done:
461 MOVQ BX, c+88(FP)
462 RET
463