// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.

//go:build !math_big_pure_go

#include "textflag.h"

// Conventions shared by the routines in this file:
//   - Only z's length is read from the frame; x and y are walked for the same
//     number of words. NOTE(review): presumably the callers guarantee that x
//     and y hold at least len(z) words - confirm against the Go declarations.
//   - Each routine splits the word count into a remainder loop (1 word per
//     iteration) followed by an unrolled loop (4 or 8 words per iteration).
//   - Pointers are advanced with LEAQ rather than ADDQ (see the "// ADD ..."
//     hints) so that the pointer updates leave the flags alone.

// func addVV(z, x, y []Word) (c Word)
//
// addVV stores x[i] + y[i] plus the carry from the previous word into z[i]
// for every i in [0, len(z)) and returns the final carry (0 or 1) in c.
// Between iterations the carry is parked in R10 as 0 or -1 because the loop
// counter's SUBQ overwrites the flags.
TEXT ·addVV(SB), NOSPLIT, $0
	MOVQ z_len+8(FP), BX
	MOVQ x_base+24(FP), SI
	MOVQ y_base+48(FP), DI
	MOVQ z_base+0(FP), R8
	// compute unrolled loop lengths
	// R9 = len(z) % 4 (1X iterations), BX = len(z) / 4 (4X iterations)
	MOVQ BX, R9
	ANDQ $3, R9
	SHRQ $2, BX
	MOVQ $0, R10	// clear saved carry
loop1:
	TESTQ R9, R9; JZ loop1done
loop1cont:
	// unroll 1X
	// R10 is 0 or -1; doubling it sets the carry flag exactly when it is -1.
	ADDQ R10, R10	// restore carry
	MOVQ 0(SI), R10
	ADCQ 0(DI), R10
	MOVQ R10, 0(R8)
	// R10 = R10 - R10 - carry, i.e. 0 or -1.
	SBBQ R10, R10	// save carry
	LEAQ 8(SI), SI	// ADD $8, SI
	LEAQ 8(DI), DI	// ADD $8, DI
	LEAQ 8(R8), R8	// ADD $8, R8
	SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
	TESTQ BX, BX; JZ loop4done
loop4cont:
	// unroll 4X
	ADDQ R10, R10	// restore carry
	// R10 doubles as a data register here; the carry now lives in the flags.
	MOVQ 0(SI), R9
	MOVQ 8(SI), R10
	MOVQ 16(SI), R11
	MOVQ 24(SI), R12
	ADCQ 0(DI), R9
	ADCQ 8(DI), R10
	ADCQ 16(DI), R11
	ADCQ 24(DI), R12
	MOVQ R9, 0(R8)
	MOVQ R10, 8(R8)
	MOVQ R11, 16(R8)
	MOVQ R12, 24(R8)
	SBBQ R10, R10	// save carry
	LEAQ 32(SI), SI	// ADD $32, SI
	LEAQ 32(DI), DI	// ADD $32, DI
	LEAQ 32(R8), R8	// ADD $32, R8
	SUBQ $1, BX; JNZ loop4cont
loop4done:
	// Saved carry is 0 or -1; negate so the caller sees 0 or 1.
	NEGQ R10	// convert add carry
	MOVQ R10, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
//
// subVV stores x[i] - y[i] minus the borrow from the previous word into z[i]
// for every i in [0, len(z)) and returns the final borrow (0 or 1) in c.
// The structure mirrors addVV with SBBQ in place of ADCQ.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVQ z_len+8(FP), BX
	MOVQ x_base+24(FP), SI
	MOVQ y_base+48(FP), DI
	MOVQ z_base+0(FP), R8
	// compute unrolled loop lengths
	// R9 = len(z) % 4 (1X iterations), BX = len(z) / 4 (4X iterations)
	MOVQ BX, R9
	ANDQ $3, R9
	SHRQ $2, BX
	MOVQ $0, R10	// clear saved carry
loop1:
	TESTQ R9, R9; JZ loop1done
loop1cont:
	// unroll 1X
	// R10 is 0 or -1; doubling it sets the carry (borrow) flag when it is -1.
	ADDQ R10, R10	// restore carry
	MOVQ 0(SI), R10
	SBBQ 0(DI), R10
	MOVQ R10, 0(R8)
	SBBQ R10, R10	// save carry
	LEAQ 8(SI), SI	// ADD $8, SI
	LEAQ 8(DI), DI	// ADD $8, DI
	LEAQ 8(R8), R8	// ADD $8, R8
	SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
	TESTQ BX, BX; JZ loop4done
loop4cont:
	// unroll 4X
	ADDQ R10, R10	// restore carry
	MOVQ 0(SI), R9
	MOVQ 8(SI), R10
	MOVQ 16(SI), R11
	MOVQ 24(SI), R12
	SBBQ 0(DI), R9
	SBBQ 8(DI), R10
	SBBQ 16(DI), R11
	SBBQ 24(DI), R12
	MOVQ R9, 0(R8)
	MOVQ R10, 8(R8)
	MOVQ R11, 16(R8)
	MOVQ R12, 24(R8)
	SBBQ R10, R10	// save carry
	LEAQ 32(SI), SI	// ADD $32, SI
	LEAQ 32(DI), DI	// ADD $32, DI
	LEAQ 32(R8), R8	// ADD $32, R8
	SUBQ $1, BX; JNZ loop4cont
loop4done:
	// Saved borrow is 0 or -1; negate so the caller sees 0 or 1.
	NEGQ R10	// convert sub carry
	MOVQ R10, c+72(FP)
	RET

// func lshVU(z, x []Word, s uint) (c Word)
//
// lshVU shifts the multi-word value x left by s bits into z, working from the
// highest word down to word 0, and returns in c the bits shifted out of the
// highest word. When len(z) is 0 it writes nothing and returns 0.
// NOTE(review): the shift count is used as-is from CX; presumably callers
// keep s below the word size - confirm.
TEXT ·lshVU(SB), NOSPLIT, $0
	MOVQ z_len+8(FP), BX
	TESTQ BX, BX; JZ ret0
	MOVQ s+48(FP), CX
	MOVQ x_base+24(FP), SI
	MOVQ z_base+0(FP), DI
	// run loop backward
	// SI and DI now point one word past the end of x and z.
	LEAQ (SI)(BX*8), SI
	LEAQ (DI)(BX*8), DI
	// shift first word into carry
	// Double shift: R9 (zero) is shifted left by CX, filled from the top of R8.
	MOVQ -8(SI), R8
	MOVQ $0, R9
	SHLQ CX, R8, R9
	MOVQ R9, c+56(FP)
	// shift remaining words
	SUBQ $1, BX
	// compute unrolled loop lengths
	MOVQ BX, R9
	ANDQ $3, R9
	SHRQ $2, BX
loop1:
	TESTQ R9, R9; JZ loop1done
loop1cont:
	// unroll 1X
	// R8 holds the current (higher) word, R10 the next lower word that
	// supplies the bits shifted in.
	MOVQ -16(SI), R10
	SHLQ CX, R10, R8
	MOVQ R8, -8(DI)
	MOVQ R10, R8
	LEAQ -8(SI), SI	// ADD $-8, SI
	LEAQ -8(DI), DI	// ADD $-8, DI
	SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
	TESTQ BX, BX; JZ loop4done
loop4cont:
	// unroll 4X
	MOVQ -16(SI), R9
	MOVQ -24(SI), R10
	MOVQ -32(SI), R11
	MOVQ -40(SI), R12
	SHLQ CX, R9, R8
	SHLQ CX, R10, R9
	SHLQ CX, R11, R10
	SHLQ CX, R12, R11
	MOVQ R8, -8(DI)
	MOVQ R9, -16(DI)
	MOVQ R10, -24(DI)
	MOVQ R11, -32(DI)
	// Carry the still-unshifted lowest word into the next iteration.
	MOVQ R12, R8
	LEAQ -32(SI), SI	// ADD $-32, SI
	LEAQ -32(DI), DI	// ADD $-32, DI
	SUBQ $1, BX; JNZ loop4cont
loop4done:
	// store final shifted bits
	// Word 0 has no lower neighbour, so a plain shift fills with zeros.
	SHLQ CX, R8
	MOVQ R8, -8(DI)
	RET
ret0:
	MOVQ $0, c+56(FP)
	RET

// func rshVU(z, x []Word, s uint) (c Word)
//
// rshVU shifts the multi-word value x right by s bits into z, working from
// word 0 upward, and returns in c the bits shifted out of word 0, placed at
// the high end of the result word. When len(z) is 0 it writes nothing and
// returns 0.
// NOTE(review): the shift count is used as-is from CX; presumably callers
// keep s below the word size - confirm.
TEXT ·rshVU(SB), NOSPLIT, $0
	MOVQ z_len+8(FP), BX
	TESTQ BX, BX; JZ ret0
	MOVQ s+48(FP), CX
	MOVQ x_base+24(FP), SI
	MOVQ z_base+0(FP), DI
	// shift first word into carry
	// Double shift: R9 (zero) is shifted right by CX, filled from the bottom of R8.
	MOVQ 0(SI), R8
	MOVQ $0, R9
	SHRQ CX, R8, R9
	MOVQ R9, c+56(FP)
	// shift remaining words
	SUBQ $1, BX
	// compute unrolled loop lengths
	MOVQ BX, R9
	ANDQ $3, R9
	SHRQ $2, BX
loop1:
	TESTQ R9, R9; JZ loop1done
loop1cont:
	// unroll 1X
	// R8 holds the current (lower) word, R10 the next higher word that
	// supplies the bits shifted in.
	MOVQ 8(SI), R10
	SHRQ CX, R10, R8
	MOVQ R8, 0(DI)
	MOVQ R10, R8
	LEAQ 8(SI), SI	// ADD $8, SI
	LEAQ 8(DI), DI	// ADD $8, DI
	SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
	TESTQ BX, BX; JZ loop4done
loop4cont:
	// unroll 4X
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	MOVQ 32(SI), R12
	SHRQ CX, R9, R8
	SHRQ CX, R10, R9
	SHRQ CX, R11, R10
	SHRQ CX, R12, R11
	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	// Carry the still-unshifted highest word into the next iteration.
	MOVQ R12, R8
	LEAQ 32(SI), SI	// ADD $32, SI
	LEAQ 32(DI), DI	// ADD $32, DI
	SUBQ $1, BX; JNZ loop4cont
loop4done:
	// store final shifted bits
	// The top word has no higher neighbour, so a plain shift fills with zeros.
	SHRQ CX, R8
	MOVQ R8, 0(DI)
	RET
ret0:
	MOVQ $0, c+56(FP)
	RET

// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// mulAddVWW computes z = x*m + a word by word: for each i in [0, len(z)) it
// stores the low word of x[i]*m + carry into z[i] and keeps the high word
// (plus the addition's carry-out) as the next carry. The carry starts as a
// and its final value is returned in c.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVQ m+48(FP), BX
	MOVQ a+56(FP), SI
	MOVQ z_len+8(FP), DI
	MOVQ x_base+24(FP), R8
	MOVQ z_base+0(FP), R9
	// compute unrolled loop lengths
	// R10 = len(z) % 4 (1X iterations), DI = len(z) / 4 (4X iterations)
	MOVQ DI, R10
	ANDQ $3, R10
	SHRQ $2, DI
loop1:
	TESTQ R10, R10; JZ loop1done
loop1cont:
	// unroll 1X in batches of 1
	MOVQ 0(R8), AX
	// multiply
	// MULQ leaves the 128-bit product in DX:AX; SI is the running carry word.
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	MOVQ AX, 0(R9)
	LEAQ 8(R8), R8	// ADD $8, R8
	LEAQ 8(R9), R9	// ADD $8, R9
	SUBQ $1, R10; JNZ loop1cont
loop1done:
loop4:
	TESTQ DI, DI; JZ loop4done
loop4cont:
	// unroll 4X in batches of 1
	MOVQ 0(R8), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	MOVQ AX, 0(R9)
	MOVQ 8(R8), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	MOVQ AX, 8(R9)
	MOVQ 16(R8), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	MOVQ AX, 16(R9)
	MOVQ 24(R8), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	MOVQ AX, 24(R9)
	LEAQ 32(R8), R8	// ADD $32, R8
	LEAQ 32(R9), R9	// ADD $32, R9
	SUBQ $1, DI; JNZ loop4cont
loop4done:
	MOVQ SI, c+64(FP)
	RET

// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// addMulVVWW computes z = x + y*m + a word by word: for each i in
// [0, len(z)) it stores the low word of y[i]*m + carry + x[i] into z[i] and
// keeps the overflow as the next carry. The carry starts as a and its final
// value is returned in c.
//
// Two implementations follow. When the byte ·hasADX is non-zero the routine
// jumps to altcarry, which uses MULXQ/ADCXQ/ADOXQ and unrolls 8X; otherwise
// it falls through to a MULQ/ADCQ version that unrolls 4X.
TEXT ·addMulVVWW(SB), NOSPLIT, $0
	CMPB ·hasADX(SB), $0; JNZ altcarry
	MOVQ m+72(FP), BX
	MOVQ a+80(FP), SI
	MOVQ z_len+8(FP), DI
	MOVQ x_base+24(FP), R8
	MOVQ y_base+48(FP), R9
	MOVQ z_base+0(FP), R10
	// compute unrolled loop lengths
	// R11 = len(z) % 4 (1X iterations), DI = len(z) / 4 (4X iterations)
	MOVQ DI, R11
	ANDQ $3, R11
	SHRQ $2, DI
loop1:
	TESTQ R11, R11; JZ loop1done
loop1cont:
	// unroll 1X in batches of 1
	MOVQ 0(R9), AX
	// multiply
	// MULQ leaves the 128-bit product in DX:AX; SI is the running carry word.
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	// add
	ADDQ 0(R8), AX
	ADCQ $0, SI
	MOVQ AX, 0(R10)
	LEAQ 8(R8), R8	// ADD $8, R8
	LEAQ 8(R9), R9	// ADD $8, R9
	LEAQ 8(R10), R10	// ADD $8, R10
	SUBQ $1, R11; JNZ loop1cont
loop1done:
loop4:
	TESTQ DI, DI; JZ loop4done
loop4cont:
	// unroll 4X in batches of 1
	MOVQ 0(R9), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	// add
	ADDQ 0(R8), AX
	ADCQ $0, SI
	MOVQ AX, 0(R10)
	MOVQ 8(R9), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	// add
	ADDQ 8(R8), AX
	ADCQ $0, SI
	MOVQ AX, 8(R10)
	MOVQ 16(R9), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	// add
	ADDQ 16(R8), AX
	ADCQ $0, SI
	MOVQ AX, 16(R10)
	MOVQ 24(R9), AX
	// multiply
	MULQ BX
	ADDQ SI, AX
	MOVQ DX, SI
	ADCQ $0, SI
	// add
	ADDQ 24(R8), AX
	ADCQ $0, SI
	MOVQ AX, 24(R10)
	LEAQ 32(R8), R8	// ADD $32, R8
	LEAQ 32(R9), R9	// ADD $32, R9
	LEAQ 32(R10), R10	// ADD $32, R10
	SUBQ $1, DI; JNZ loop4cont
loop4done:
	MOVQ SI, c+88(FP)
	RET
altcarry:
	// ADX/BMI2 variant. m lives in DX, the implicit multiplicand of MULXQ;
	// BX is the running carry word; DI is a constant zero used to fold the
	// two flag chains back into BX.
	MOVQ m+72(FP), DX
	MOVQ a+80(FP), BX
	MOVQ z_len+8(FP), SI
	MOVQ $0, DI
	MOVQ x_base+24(FP), R8
	MOVQ y_base+48(FP), R9
	MOVQ z_base+0(FP), R10
	// compute unrolled loop lengths
	// R11 = len(z) % 8 (1X iterations), SI = len(z) / 8 (8X iterations)
	MOVQ SI, R11
	ANDQ $7, R11
	SHRQ $3, SI
alt1:
	TESTQ R11, R11; JZ alt1done
alt1cont:
	// unroll 1X
	// multiply and add
	// NOTE(review): the two TESTQ lines are identical; presumably the
	// generator emits one per carry chain (ADCXQ and ADOXQ) - confirm
	// before treating the second as removable.
	TESTQ AX, AX	// clear carry
	TESTQ AX, AX	// clear carry
	// R13 = low word, R12 = high word of y[i]*m. ADCXQ adds the incoming
	// carry word on one flag chain; ADOXQ adds x[i] on the other.
	MULXQ 0(R9), R13, R12
	ADCXQ BX, R13
	ADOXQ 0(R8), R13
	MOVQ R13, 0(R10)
	// New carry word = high word + both pending flag carries (DI is 0).
	MOVQ R12, BX
	ADCXQ DI, BX
	ADOXQ DI, BX
	LEAQ 8(R8), R8	// ADD $8, R8
	LEAQ 8(R9), R9	// ADD $8, R9
	LEAQ 8(R10), R10	// ADD $8, R10
	SUBQ $1, R11; JNZ alt1cont
alt1done:
alt8:
	TESTQ SI, SI; JZ alt8done
alt8cont:
	// unroll 8X in batches of 2
	// multiply and add
	// R11 (the finished 1X counter) is reused as scratch: high words
	// alternate between R11 and BX so each product's high word feeds the
	// next word's ADCXQ.
	TESTQ AX, AX	// clear carry
	TESTQ AX, AX	// clear carry
	MULXQ 0(R9), R13, R11
	ADCXQ BX, R13
	ADOXQ 0(R8), R13
	MULXQ 8(R9), R14, BX
	ADCXQ R11, R14
	ADOXQ 8(R8), R14
	MOVQ R13, 0(R10)
	MOVQ R14, 8(R10)
	MULXQ 16(R9), R13, R11
	ADCXQ BX, R13
	ADOXQ 16(R8), R13
	MULXQ 24(R9), R14, BX
	ADCXQ R11, R14
	ADOXQ 24(R8), R14
	MOVQ R13, 16(R10)
	MOVQ R14, 24(R10)
	MULXQ 32(R9), R13, R11
	ADCXQ BX, R13
	ADOXQ 32(R8), R13
	MULXQ 40(R9), R14, BX
	ADCXQ R11, R14
	ADOXQ 40(R8), R14
	MOVQ R13, 32(R10)
	MOVQ R14, 40(R10)
	MULXQ 48(R9), R13, R11
	ADCXQ BX, R13
	ADOXQ 48(R8), R13
	MULXQ 56(R9), R14, BX
	ADCXQ R11, R14
	ADOXQ 56(R8), R14
	MOVQ R13, 48(R10)
	MOVQ R14, 56(R10)
	// Fold both pending flag carries into the carry word (DI is 0) before
	// SUBQ overwrites the flags.
	ADCXQ DI, BX
	ADOXQ DI, BX
	LEAQ 64(R8), R8	// ADD $64, R8
	LEAQ 64(R9), R9	// ADD $64, R9
	LEAQ 64(R10), R10	// ADD $64, R10
	SUBQ $1, SI; JNZ alt8cont
alt8done:
	MOVQ BX, c+88(FP)
	RET