// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Register map
//
// R4: ptr
// R5: n
// R6: ptrend
// R7: tmp

// Algorithm:
//
// 1. if lasx is enabled:
//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
//    else if lsx is enabled:
//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
//    else
//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
//
// 2. when 'count <= THRESHOLD' bytes, the memory alignment check is omitted.
//    The handling is divided into distinct cases based on the size of count:
//    a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
//       clr_9through16, clr_17through32, clr_33through64,
//    b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
//    c. lasx_clr_17through32, lasx_clr_33through64, lasx_clr_65through128,
//       lasx_clr_129through256
//
// 3. when 'count > THRESHOLD' bytes, a memory alignment check is performed.
//    The unaligned head bytes are processed first (that is,
//    ALIGNMENTS - (ptr & (ALIGNMENTS-1)) bytes), and then a LOOPBLOCKS-byte
//    loop zeroes the aligned body. When the number of remaining bytes n is
//    less than LOOPBLOCKS, tail processing invokes the corresponding case
//    based on the size of n.
//
//    example:
//    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
//
//      ptr            newptr                        ptrend
//       |               |<----count after correction---->|
//       |<-------------count before correction---------->|
//       |<--8-(ptr&7)-->|               |<---64 bytes--->|
//       +------------------------------------------------+
//       |      Head     |      Body     |      Tail      |
//       +---------------+---------------+----------------+
//
//    newptr = ptr - (ptr & 7) + 8
//    count = count - 8 + (ptr & 7)
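// For reference, the head/body/tail split described above can be sketched in Go
// for the generic (non-SIMD) path (THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64).
// This is an illustrative, hypothetical helper (memclrGenericSketch), not the code
// the assembly below is generated from; it assumes `import "unsafe"` and clears the
// tail byte by byte instead of with the overlapping-store cases used below.
//
//	func memclrGenericSketch(ptr unsafe.Pointer, n uintptr) {
//		p := uintptr(ptr)
//		end := p + n
//		if n > 64 {
//			// Head: one 8-byte store at ptr, then round p up to 8-byte alignment.
//			*(*uint64)(unsafe.Pointer(p)) = 0
//			adj := 8 - (p & 7) // 1..8 bytes covered by the head store
//			p += adj
//			n -= adj
//			// Body: LOOPBLOCKS-byte (64-byte) blocks of 8-byte stores.
//			for n >= 64 {
//				for i := uintptr(0); i < 64; i += 8 {
//					*(*uint64)(unsafe.Pointer(p + i)) = 0
//				}
//				p += 64
//				n -= 64
//			}
//		}
//		// Tail (and the whole n <= 64 case).
//		for ; p < end; p++ {
//			*(*byte)(unsafe.Pointer(p)) = 0
//		}
//	}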
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
	BEQ	R5, clr_0
	ADDV	R4, R5, R6
tail:
	// <= 64 bytes, clear directly, no alignment check
	SGTU	$2, R5, R7
	BNE	R7, clr_1
	SGTU	$3, R5, R7
	BNE	R7, clr_2
	SGTU	$4, R5, R7
	BNE	R7, clr_3
	SGTU	$5, R5, R7
	BNE	R7, clr_4
	SGTU	$8, R5, R7
	BNE	R7, clr_5through7
	SGTU	$9, R5, R7
	BNE	R7, clr_8
	SGTU	$17, R5, R7
	BNE	R7, clr_9through16

	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
	BNE	R7, lasx_tail
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
	BNE	R7, lsx_tail

	SGTU	$33, R5, R7
	BNE	R7, clr_17through32
	SGTU	$65, R5, R7
	BNE	R7, clr_33through64
	JMP	clr_large

lasx_tail:
	// X0 = 0
	XVXORV	X0, X0, X0
	SGTU	$33, R5, R7
	BNE	R7, lasx_clr_17through32
	SGTU	$65, R5, R7
	BNE	R7, lasx_clr_33through64
	SGTU	$129, R5, R7
	BNE	R7, lasx_clr_65through128
	SGTU	$257, R5, R7
	BNE	R7, lasx_clr_129through256
	JMP	lasx_clr_large

lsx_tail:
	// V0 = 0
	VXORV	V0, V0, V0
	SGTU	$33, R5, R7
	BNE	R7, lsx_clr_17through32
	SGTU	$65, R5, R7
	BNE	R7, lsx_clr_33through64
	SGTU	$129, R5, R7
	BNE	R7, lsx_clr_65through128
	JMP	lsx_clr_large

// use simd 256 instructions to implement memclr
// n > 256 bytes, check 32-byte alignment
lasx_clr_large:
	AND	$31, R4, R7
	BEQ	R7, lasx_clr_256loop
	XVMOVQ	X0, (R4)
	SUBV	R7, R4
	ADDV	R7, R5
	SUBV	$32, R5	// newn = n - (32 - (ptr & 31))
	ADDV	$32, R4	// newptr = ptr + (32 - (ptr & 31))
	SGTU	$257, R5, R7
	BNE	R7, lasx_clr_129through256
lasx_clr_256loop:
	SUBV	$256, R5
	SGTU	$256, R5, R7
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, 32(R4)
	XVMOVQ	X0, 64(R4)
	XVMOVQ	X0, 96(R4)
	XVMOVQ	X0, 128(R4)
	XVMOVQ	X0, 160(R4)
	XVMOVQ	X0, 192(R4)
	XVMOVQ	X0, 224(R4)
	ADDV	$256, R4
	BEQ	R7, lasx_clr_256loop

	// remaining_length is 0
	BEQ	R5, clr_0

	// 128 < remaining_length < 256
	SGTU	$129, R5, R7
	BEQ	R7, lasx_clr_129through256

	// 64 < remaining_length <= 128
	SGTU	$65, R5, R7
	BEQ	R7, lasx_clr_65through128

	// 32 < remaining_length <= 64
	SGTU	$33, R5, R7
	BEQ	R7, lasx_clr_33through64

	// 16 < remaining_length <= 32
	SGTU	$17, R5, R7
	BEQ	R7, lasx_clr_17through32

	// 0 < remaining_length <= 16
	JMP	tail

// use simd 128 instructions to implement memclr
// n > 128 bytes, check 16-byte alignment
lsx_clr_large:
	AND	$15, R4, R7
	BEQ	R7, lsx_clr_128loop
	VMOVQ	V0, (R4)
	SUBV	R7, R4
	ADDV	R7, R5
	SUBV	$16, R5	// newn = n - (16 - (ptr & 15))
	ADDV	$16, R4	// newptr = ptr + (16 - (ptr & 15))
	SGTU	$129, R5, R7
	BNE	R7, lsx_clr_65through128
lsx_clr_128loop:
	SUBV	$128, R5
	SGTU	$128, R5, R7
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, 16(R4)
	VMOVQ	V0, 32(R4)
	VMOVQ	V0, 48(R4)
	VMOVQ	V0, 64(R4)
	VMOVQ	V0, 80(R4)
	VMOVQ	V0, 96(R4)
	VMOVQ	V0, 112(R4)
	ADDV	$128, R4
	BEQ	R7, lsx_clr_128loop

	// remaining_length is 0
	BEQ	R5, clr_0

	// 64 < remaining_length <= 128
	SGTU	$65, R5, R7
	BEQ	R7, lsx_clr_65through128

	// 32 < remaining_length <= 64
	SGTU	$33, R5, R7
	BEQ	R7, lsx_clr_33through64

	// 16 < remaining_length <= 32
	SGTU	$17, R5, R7
	BEQ	R7, lsx_clr_17through32

	// 0 < remaining_length <= 16
	JMP	tail

// use general instructions to implement memclr
// n > 64 bytes, check 8-byte alignment
clr_large:
	AND	$7, R4, R7
	BEQ	R7, clr_64loop
	MOVV	R0, (R4)
	SUBV	R7, R4
	ADDV	R7, R5
	ADDV	$8, R4	// newptr = ptr + (8 - (ptr & 7))
	SUBV	$8, R5	// newn = n - (8 - (ptr & 7))
	MOVV	$64, R7
	BLT	R5, R7, clr_33through64
clr_64loop:
	SUBV	$64, R5
	SGTU	$64, R5, R7
	MOVV	R0, (R4)
	MOVV	R0, 8(R4)
	MOVV	R0, 16(R4)
	MOVV	R0, 24(R4)
	MOVV	R0, 32(R4)
	MOVV	R0, 40(R4)
	MOVV	R0, 48(R4)
	MOVV	R0, 56(R4)
	ADDV	$64, R4
	BEQ	R7, clr_64loop

	// remaining_length is 0
	BEQ	R5, clr_0

	// 32 < remaining_length < 64
	SGTU	$33, R5, R7
	BEQ	R7, clr_33through64

	// 16 < remaining_length <= 32
	SGTU	$17, R5, R7
	BEQ	R7, clr_17through32

	// 0 < remaining_length <= 16
	JMP	tail

clr_0:
	RET
clr_1:
	MOVB	R0, (R4)
	RET
clr_2:
	MOVH	R0, (R4)
	RET
clr_3:
	MOVH	R0, (R4)
	MOVB	R0, 2(R4)
	RET
clr_4:
	MOVW	R0, (R4)
	RET
clr_5through7:
	MOVW	R0, (R4)
	MOVW	R0, -4(R6)
	RET
clr_8:
	MOVV	R0, (R4)
	RET
clr_9through16:
	MOVV	R0, (R4)
	MOVV	R0, -8(R6)
	RET
clr_17through32:
	MOVV	R0, (R4)
	MOVV	R0, 8(R4)
	MOVV	R0, -16(R6)
	MOVV	R0, -8(R6)
	RET
clr_33through64:
	MOVV	R0, (R4)
	MOVV	R0, 8(R4)
	MOVV	R0, 16(R4)
	MOVV	R0, 24(R4)
	MOVV	R0, -32(R6)
	MOVV	R0, -24(R6)
	MOVV	R0, -16(R6)
	MOVV	R0, -8(R6)
	RET

lasx_clr_17through32:
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, -16(R6)
	RET
lasx_clr_33through64:
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, -32(R6)
	RET
lasx_clr_65through128:
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, 32(R4)
	XVMOVQ	X0, -64(R6)
	XVMOVQ	X0, -32(R6)
	RET
lasx_clr_129through256:
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, 32(R4)
	XVMOVQ	X0, 64(R4)
	XVMOVQ	X0, 96(R4)
	XVMOVQ	X0, -128(R6)
	XVMOVQ	X0, -96(R6)
	XVMOVQ	X0, -64(R6)
	XVMOVQ	X0, -32(R6)
	RET

lsx_clr_17through32:
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, -16(R6)
	RET
lsx_clr_33through64:
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, 16(R4)
	VMOVQ	V0, -32(R6)
	VMOVQ	V0, -16(R6)
	RET
lsx_clr_65through128:
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, 16(R4)
	VMOVQ	V0, 32(R4)
	VMOVQ	V0, 48(R4)
	VMOVQ	V0, -64(R6)
	VMOVQ	V0, -48(R6)
	VMOVQ	V0, -32(R6)
	VMOVQ	V0, -16(R6)
	RET
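// The fixed-size cases above (clr_9through16 through lasx_clr_129through256) avoid
// branching on the exact length by letting stores overlap: for k < n <= 2k, one block
// of stores starting at ptr and one block ending at ptrend cover every byte. A minimal,
// hypothetical Go sketch of the idea for the 9..16-byte case (clr_9through16), assuming
// `import "unsafe"`; it is an illustration, not the runtime's implementation:
//
//	// clr9through16Sketch zeroes n bytes, 9 <= n <= 16, with two possibly
//	// overlapping 8-byte stores: one at ptr and one ending at ptr+n.
//	func clr9through16Sketch(ptr unsafe.Pointer, n uintptr) {
//		p := uintptr(ptr)
//		*(*uint64)(unsafe.Pointer(p)) = 0         // bytes [0, 8)
//		*(*uint64)(unsafe.Pointer(p + n - 8)) = 0 // bytes [n-8, n), overlaps when n < 16
//	}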