src/math/big/arith_riscv64.s
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.

//go:build !math_big_pure_go

#include "textflag.h"

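// Note: RISC-V has no flags register, so there are no add-with-carry or
// subtract-with-borrow instructions. The ARM-style ADCS/SBCS/ADC mnemonics in
// the comments below describe the intent only; the carry or borrow is
// recomputed with SLTU and kept in X28, with X31 used as a scratch carry.
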
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0
	MOV z_len+8(FP), X5
	MOV x_base+24(FP), X6
	MOV y_base+48(FP), X7
	MOV z_base+0(FP), X8
	// compute unrolled loop lengths
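	// (X9 = len&3 leftover words, handled one per pass of loop1;
	// X5 = len>>2 passes of the 4-word loop4)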
	AND $3, X5, X9
	SRL $2, X5
	XOR X28, X28 // clear carry
loop1:
	BEQZ X9, loop1done
loop1cont:
	// unroll 1X
	MOV 0(X6), X10
	MOV 0(X7), X11
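	// z[i] = x[i] + y[i] + carry. Each ADCS named in the comments is
	// synthesized: after an ADD, SLTU compares the truncated sum with one
	// addend and yields 1 exactly when the addition wrapped. The carry from
	// adding y[i] and the carry from adding the incoming carry can never both
	// be 1, so their sum in X28 is the next carry.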
	ADD X11, X10 // ADCS X11, X10, X10 (cr=X28)
	SLTU X11, X10, X31 // ...
	ADD X28, X10 // ...
	SLTU X28, X10, X28 // ...
	ADD X31, X28 // ...
	MOV X10, 0(X8)
	ADD $8, X6
	ADD $8, X7
	ADD $8, X8
	SUB $1, X9
	BNEZ X9, loop1cont
loop1done:
loop4:
	BEQZ X5, loop4done
loop4cont:
	// unroll 4X
	MOV 0(X6), X9
	MOV 8(X6), X10
	MOV 16(X6), X11
	MOV 24(X6), X12
	MOV 0(X7), X13
	MOV 8(X7), X14
	MOV 16(X7), X15
	MOV 24(X7), X16
	ADD X13, X9 // ADCS X13, X9, X9 (cr=X28)
	SLTU X13, X9, X31 // ...
	ADD X28, X9 // ...
	SLTU X28, X9, X28 // ...
	ADD X31, X28 // ...
	ADD X14, X10 // ADCS X14, X10, X10 (cr=X28)
	SLTU X14, X10, X31 // ...
	ADD X28, X10 // ...
	SLTU X28, X10, X28 // ...
	ADD X31, X28 // ...
	ADD X15, X11 // ADCS X15, X11, X11 (cr=X28)
	SLTU X15, X11, X31 // ...
	ADD X28, X11 // ...
	SLTU X28, X11, X28 // ...
	ADD X31, X28 // ...
	ADD X16, X12 // ADCS X16, X12, X12 (cr=X28)
	SLTU X16, X12, X31 // ...
	ADD X28, X12 // ...
	SLTU X28, X12, X28 // ...
	ADD X31, X28 // ...
	MOV X9, 0(X8)
	MOV X10, 8(X8)
	MOV X11, 16(X8)
	MOV X12, 24(X8)
	ADD $32, X6
	ADD $32, X7
	ADD $32, X8
	SUB $1, X5
	BNEZ X5, loop4cont
loop4done:
	MOV X28, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0
	MOV z_len+8(FP), X5
	MOV x_base+24(FP), X6
	MOV y_base+48(FP), X7
	MOV z_base+0(FP), X8
	// compute unrolled loop lengths
	AND $3, X5, X9
	SRL $2, X5
	XOR X28, X28 // clear carry
loop1:
	BEQZ X9, loop1done
loop1cont:
	// unroll 1X
	MOV 0(X6), X10
	MOV 0(X7), X11
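	// z[i] = x[i] - y[i] - borrow. Each SBCS named in the comments is
	// synthesized: SLTU, taken before the matching SUB, yields 1 exactly when
	// that subtraction would underflow. The borrow from subtracting the
	// incoming borrow and the borrow from subtracting y[i] can never both be
	// 1, so their sum in X28 is the next borrow.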
	SLTU X28, X10, X31 // SBCS X11, X10, X10
	SUB X28, X10 // ...
	SLTU X11, X10, X28 // ...
	SUB X11, X10 // ...
	ADD X31, X28 // ...
	MOV X10, 0(X8)
	ADD $8, X6
	ADD $8, X7
	ADD $8, X8
	SUB $1, X9
	BNEZ X9, loop1cont
loop1done:
loop4:
	BEQZ X5, loop4done
loop4cont:
	// unroll 4X
	MOV 0(X6), X9
	MOV 8(X6), X10
	MOV 16(X6), X11
	MOV 24(X6), X12
	MOV 0(X7), X13
	MOV 8(X7), X14
	MOV 16(X7), X15
	MOV 24(X7), X16
	SLTU X28, X9, X31 // SBCS X13, X9, X9
	SUB X28, X9 // ...
	SLTU X13, X9, X28 // ...
	SUB X13, X9 // ...
	ADD X31, X28 // ...
	SLTU X28, X10, X31 // SBCS X14, X10, X10
	SUB X28, X10 // ...
	SLTU X14, X10, X28 // ...
	SUB X14, X10 // ...
	ADD X31, X28 // ...
	SLTU X28, X11, X31 // SBCS X15, X11, X11
	SUB X28, X11 // ...
	SLTU X15, X11, X28 // ...
	SUB X15, X11 // ...
	ADD X31, X28 // ...
	SLTU X28, X12, X31 // SBCS X16, X12, X12
	SUB X28, X12 // ...
	SLTU X16, X12, X28 // ...
	SUB X16, X12 // ...
	ADD X31, X28 // ...
	MOV X9, 0(X8)
	MOV X10, 8(X8)
	MOV X11, 16(X8)
	MOV X12, 24(X8)
	ADD $32, X6
	ADD $32, X7
	ADD $32, X8
	SUB $1, X5
	BNEZ X5, loop4cont
loop4done:
	MOV X28, c+72(FP)
	RET

// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
	MOV z_len+8(FP), X5
	BEQZ X5, ret0
	MOV s+48(FP), X6
	MOV x_base+24(FP), X7
	MOV z_base+0(FP), X8
	// run loop backward
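	// (the loop walks from the most significant word down: each output word
	// combines x[i]<<s with the bits shifted out of x[i-1], so reads stay
	// ahead of writes if z and x overlap; the bits shifted out of the top
	// word are returned as c)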
	SLL $3, X5, X9
	ADD X9, X7
	SLL $3, X5, X9
	ADD X9, X8
	// shift first word into carry
	MOV -8(X7), X9
	MOV $64, X10
	SUB X6, X10
	SRL X10, X9, X11
	SLL X6, X9
	MOV X11, c+56(FP)
	// shift remaining words
	SUB $1, X5
	// compute unrolled loop lengths
	AND $3, X5, X11
	SRL $2, X5
loop1:
	BEQZ X11, loop1done
loop1cont:
	// unroll 1X
	MOV -16(X7), X12
	SRL X10, X12, X13
	OR X9, X13
	SLL X6, X12, X9
	MOV X13, -8(X8)
	ADD $-8, X7
	ADD $-8, X8
	SUB $1, X11
	BNEZ X11, loop1cont
loop1done:
loop4:
	BEQZ X5, loop4done
loop4cont:
	// unroll 4X
	MOV -16(X7), X11
	MOV -24(X7), X12
	MOV -32(X7), X13
	MOV -40(X7), X14
	SRL X10, X11, X15
	OR X9, X15
	SLL X6, X11, X9
	SRL X10, X12, X11
	OR X9, X11
	SLL X6, X12, X9
	SRL X10, X13, X12
	OR X9, X12
	SLL X6, X13, X9
	SRL X10, X14, X13
	OR X9, X13
	SLL X6, X14, X9
	MOV X15, -8(X8)
	MOV X11, -16(X8)
	MOV X12, -24(X8)
	MOV X13, -32(X8)
	ADD $-32, X7
	ADD $-32, X8
	SUB $1, X5
	BNEZ X5, loop4cont
loop4done:
	// store final shifted bits
	MOV X9, -8(X8)
	RET
ret0:
	MOV X0, c+56(FP)
	RET

// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0
	MOV z_len+8(FP), X5
	BEQZ X5, ret0
	MOV s+48(FP), X6
	MOV x_base+24(FP), X7
	MOV z_base+0(FP), X8
	// shift first word into carry
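	// (the loop walks from the least significant word up: each output word
	// combines x[i]>>s with the bits shifted in from x[i+1]; the low s bits
	// of x[0], left-aligned in the word, are returned as c)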
	MOV 0(X7), X9
	MOV $64, X10
	SUB X6, X10
	SLL X10, X9, X11
	SRL X6, X9
	MOV X11, c+56(FP)
	// shift remaining words
	SUB $1, X5
	// compute unrolled loop lengths
	AND $3, X5, X11
	SRL $2, X5
loop1:
	BEQZ X11, loop1done
loop1cont:
	// unroll 1X
	MOV 8(X7), X12
	SLL X10, X12, X13
	OR X9, X13
	SRL X6, X12, X9
	MOV X13, 0(X8)
	ADD $8, X7
	ADD $8, X8
	SUB $1, X11
	BNEZ X11, loop1cont
loop1done:
loop4:
	BEQZ X5, loop4done
loop4cont:
	// unroll 4X
	MOV 8(X7), X11
	MOV 16(X7), X12
	MOV 24(X7), X13
	MOV 32(X7), X14
	SLL X10, X11, X15
	OR X9, X15
	SRL X6, X11, X9
	SLL X10, X12, X11
	OR X9, X11
	SRL X6, X12, X9
	SLL X10, X13, X12
	OR X9, X12
	SRL X6, X13, X9
	SLL X10, X14, X13
	OR X9, X13
	SRL X6, X14, X9
	MOV X15, 0(X8)
	MOV X11, 8(X8)
	MOV X12, 16(X8)
	MOV X13, 24(X8)
	ADD $32, X7
	ADD $32, X8
	SUB $1, X5
	BNEZ X5, loop4cont
loop4done:
	// store final shifted bits
	MOV X9, 0(X8)
	RET
ret0:
	MOV X0, c+56(FP)
	RET

// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOV m+48(FP), X5
	MOV a+56(FP), X6
	MOV z_len+8(FP), X7
	MOV x_base+24(FP), X8
	MOV z_base+0(FP), X9
	// compute unrolled loop lengths
	AND $3, X7, X10
	SRL $2, X7
loop1:
	BEQZ X10, loop1done
loop1cont:
	// unroll 1X
	MOV 0(X8), X11
	// synthetic carry, one column at a time
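	// Each column computes m*x[i] + carry, with the running carry kept in X6
	// (initially a). MUL/MULHU form the 128-bit product; the low half plus the
	// carry becomes z[i], SLTU recovers the carry out of that add, and that
	// carry plus the high half (which cannot overflow) becomes the next carry.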
	MUL X5, X11, X12
	MULHU X5, X11, X13
	ADD X6, X12, X11 // ADDS X6, X12, X11 (cr=X28)
	SLTU X6, X11, X28 // ...
	ADD X28, X13, X6 // ADC $0, X13, X6
	MOV X11, 0(X9)
	ADD $8, X8
	ADD $8, X9
	SUB $1, X10
	BNEZ X10, loop1cont
loop1done:
loop4:
	BEQZ X7, loop4done
loop4cont:
	// unroll 4X
	MOV 0(X8), X10
	MOV 8(X8), X11
	MOV 16(X8), X12
	MOV 24(X8), X13
	// synthetic carry, one column at a time
	MUL X5, X10, X14
	MULHU X5, X10, X15
	ADD X6, X14, X10 // ADDS X6, X14, X10 (cr=X28)
	SLTU X6, X10, X28 // ...
	ADD X28, X15, X6 // ADC $0, X15, X6
	MUL X5, X11, X14
	MULHU X5, X11, X15
	ADD X6, X14, X11 // ADDS X6, X14, X11 (cr=X28)
	SLTU X6, X11, X28 // ...
	ADD X28, X15, X6 // ADC $0, X15, X6
	MUL X5, X12, X14
	MULHU X5, X12, X15
	ADD X6, X14, X12 // ADDS X6, X14, X12 (cr=X28)
	SLTU X6, X12, X28 // ...
	ADD X28, X15, X6 // ADC $0, X15, X6
	MUL X5, X13, X14
	MULHU X5, X13, X15
	ADD X6, X14, X13 // ADDS X6, X14, X13 (cr=X28)
	SLTU X6, X13, X28 // ...
	ADD X28, X15, X6 // ADC $0, X15, X6
	MOV X10, 0(X9)
	MOV X11, 8(X9)
	MOV X12, 16(X9)
	MOV X13, 24(X9)
	ADD $32, X8
	ADD $32, X9
	SUB $1, X7
	BNEZ X7, loop4cont
loop4done:
	MOV X6, c+64(FP)
	RET

// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0
	MOV m+72(FP), X5
	MOV a+80(FP), X6
	MOV z_len+8(FP), X7
	MOV x_base+24(FP), X8
	MOV y_base+48(FP), X9
	MOV z_base+0(FP), X10
	// compute unrolled loop lengths
	AND $3, X7, X11
	SRL $2, X7
loop1:
	BEQZ X11, loop1done
loop1cont:
	// unroll 1X
	MOV 0(X8), X12
	MOV 0(X9), X13
	// synthetic carry, one column at a time
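	// Each column computes x[i] + m*y[i] + carry: MUL/MULHU form the 128-bit
	// product, x[i] and then the running carry (X6, initially a) are added to
	// the low half, and each carry out (via SLTU) is folded into the high
	// half, which then becomes the next carry. The full column value fits in
	// two words, so the high half never overflows.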
	MUL X5, X13, X14
	MULHU X5, X13, X15
	ADD X12, X14 // ADDS X12, X14, X14 (cr=X28)
	SLTU X12, X14, X28 // ...
	ADD X28, X15 // ADC $0, X15, X15
	ADD X6, X14, X13 // ADDS X6, X14, X13 (cr=X28)
	SLTU X6, X13, X28 // ...
	ADD X28, X15, X6 // ADC $0, X15, X6
	MOV X13, 0(X10)
	ADD $8, X8
	ADD $8, X9
	ADD $8, X10
	SUB $1, X11
	BNEZ X11, loop1cont
loop1done:
loop4:
	BEQZ X7, loop4done
loop4cont:
	// unroll 4X
	MOV 0(X8), X11
	MOV 8(X8), X12
	MOV 16(X8), X13
	MOV 24(X8), X14
	MOV 0(X9), X15
	MOV 8(X9), X16
	MOV 16(X9), X17
	MOV 24(X9), X18
	// synthetic carry, one column at a time
	MUL X5, X15, X19
	MULHU X5, X15, X20
	ADD X11, X19 // ADDS X11, X19, X19 (cr=X28)
	SLTU X11, X19, X28 // ...
	ADD X28, X20 // ADC $0, X20, X20
	ADD X6, X19, X15 // ADDS X6, X19, X15 (cr=X28)
	SLTU X6, X15, X28 // ...
	ADD X28, X20, X6 // ADC $0, X20, X6
	MUL X5, X16, X19
	MULHU X5, X16, X20
	ADD X12, X19 // ADDS X12, X19, X19 (cr=X28)
	SLTU X12, X19, X28 // ...
	ADD X28, X20 // ADC $0, X20, X20
	ADD X6, X19, X16 // ADDS X6, X19, X16 (cr=X28)
	SLTU X6, X16, X28 // ...
	ADD X28, X20, X6 // ADC $0, X20, X6
	MUL X5, X17, X19
	MULHU X5, X17, X20
	ADD X13, X19 // ADDS X13, X19, X19 (cr=X28)
	SLTU X13, X19, X28 // ...
	ADD X28, X20 // ADC $0, X20, X20
	ADD X6, X19, X17 // ADDS X6, X19, X17 (cr=X28)
	SLTU X6, X17, X28 // ...
	ADD X28, X20, X6 // ADC $0, X20, X6
	MUL X5, X18, X19
	MULHU X5, X18, X20
	ADD X14, X19 // ADDS X14, X19, X19 (cr=X28)
	SLTU X14, X19, X28 // ...
	ADD X28, X20 // ADC $0, X20, X20
	ADD X6, X19, X18 // ADDS X6, X19, X18 (cr=X28)
	SLTU X6, X18, X28 // ...
	ADD X28, X20, X6 // ADC $0, X20, X6
	MOV X15, 0(X10)
	MOV X16, 8(X10)
	MOV X17, 16(X10)
	MOV X18, 24(X10)
	ADD $32, X8
	ADD $32, X9
	ADD $32, X10
	SUB $1, X7
	BNEZ X7, loop4cont
loop4done:
	MOV X6, c+88(FP)
	RET