src/runtime/memmove_loong64.s
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// to            R4
// from          R5
// n (aka count) R6
// to-end        R7
// from-end      R8
// data          R11-R18
// tmp           R9

// Algorithm:
//
// The memory alignment check is only performed for copy sizes greater
// than 64 bytes, to minimize overhead.
//
// When the copy size is <= 64 bytes, jump to label tail, select the
// appropriate case according to the copy size, and copy directly.
// Based on the common memory access instructions of loong64, the
// currently implemented cases are:
// move_0, move_1, move_2, move_3, move_4, move_5through7, move_8,
// move_9through16, move_17through32, move_33through64
//
// When the copy size is > 64 bytes, copy with the destination aligned,
// adopting the following strategy to copy in 3 parts:
// 1. Head: do the memory alignment
// 2. Body: a 64-byte loop structure
// 3. Tail: processing of the remaining part (<= 64 bytes)
//
// forward:
//
//	Dst             NewDst                           Dstend
//	|               |<----count after correction---->|
//	|<-------------count before correction---------->|
//	|<--8-(Dst&7)-->|               |<---64 bytes--->|
//	+------------------------------------------------+
//	|     Head      |     Body      |      Tail      |
//	+---------------+---------------+----------------+
// NewDst = Dst - (Dst & 7) + 8
// count = count - 8 + (Dst & 7)
// Src = Src - (Dst & 7) + 8
//
// backward:
//
//	Dst                                 NewDstend        Dstend
//	|<-----count after correction------>|                |
//	|<------------count before correction--------------->|
//	|<---64 bytes--->|                  |<---Dstend&7--->|
//	+----------------------------------------------------+
//	|      Tail      |       Body       |      Head      |
//	+----------------+------------------+----------------+
// NewDstend = Dstend - (Dstend & 7)
// count = count - (Dstend & 7)
// Srcend = Srcend - (Dstend & 7)
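//
// Illustrative sketch only (not part of the build): in Go terms, the
// forward head correction above amounts to the following hypothetical
// helper, where dst, src are raw addresses and n is the count:
//
//	func forwardHead(dst, src, n uintptr) (uintptr, uintptr, uintptr) {
//		head := 8 - (dst & 7) // bytes handled by the unaligned head store
//		return dst + head, src + head, n - head
//	}
//
// which yields NewDst, Src and count as given; the backward case instead
// trims Dstend&7 bytes off the end (NewDstend, Srcend, count above).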

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	BEQ R4, R5, move_0
	BEQ R6, move_0

	ADDV R4, R6, R7 // to-end pointer
	ADDV R5, R6, R8 // from-end pointer

	// copy size <= 64 bytes: copy directly, no alignment check needed
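	// Note: SGTU $k, R6, R9 sets R9 = 1 if k > n (unsigned), so each
	// BNE below branches to the case for the matching size range.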
tail:
	// < 2 bytes
	SGTU $2, R6, R9
	BNE R9, move_1

	// < 3 bytes
	SGTU $3, R6, R9
	BNE R9, move_2

	// < 4 bytes
	SGTU $4, R6, R9
	BNE R9, move_3

	// < 5 bytes
	SGTU $5, R6, R9
	BNE R9, move_4

	// >= 5 bytes and < 8 bytes
	SGTU $8, R6, R9
	BNE R9, move_5through7

	// < 9 bytes
	SGTU $9, R6, R9
	BNE R9, move_8

	// >= 9 bytes and < 17 bytes
	SGTU $17, R6, R9
	BNE R9, move_9through16

	// >= 17 bytes and < 33 bytes
	SGTU $33, R6, R9
	BNE R9, move_17through32

	// >= 33 bytes and < 65 bytes
	SGTU $65, R6, R9
	BNE R9, move_33through64

	// >= 65 bytes and < 256 bytes
	SGTU $256, R6, R9
	BNE R9, move_large

	// >= 256 bytes
	JMP lasx_move_large

move_0:
	RET

move_1:
	MOVB (R5), R11
	MOVB R11, (R4)
	RET
move_2:
	MOVH (R5), R11
	MOVH R11, (R4)
	RET
move_3:
	MOVH (R5), R11
	MOVB -1(R8), R12
	MOVH R11, (R4)
	MOVB R12, -1(R7)
	RET
move_4:
	MOVW (R5), R11
	MOVW R11, (R4)
	RET
move_5through7:
	MOVW (R5), R11
	MOVW -4(R8), R12
	MOVW R11, (R4)
	MOVW R12, -4(R7)
	RET
move_8:
	MOVV (R5), R11
	MOVV R11, (R4)
	RET
move_9through16:
	MOVV (R5), R11
	MOVV -8(R8), R12
	MOVV R11, (R4)
	MOVV R12, -8(R7)
	RET
move_17through32:
	MOVV (R5), R11
	MOVV 8(R5), R12
	MOVV -16(R8), R13
	MOVV -8(R8), R14
	MOVV R11, (R4)
	MOVV R12, 8(R4)
	MOVV R13, -16(R7)
	MOVV R14, -8(R7)
	RET
move_33through64:
	MOVV (R5), R11
	MOVV 8(R5), R12
	MOVV 16(R5), R13
	MOVV 24(R5), R14
	MOVV -32(R8), R15
	MOVV -24(R8), R16
	MOVV -16(R8), R17
	MOVV -8(R8), R18
	MOVV R11, (R4)
	MOVV R12, 8(R4)
	MOVV R13, 16(R4)
	MOVV R14, 24(R4)
	MOVV R15, -32(R7)
	MOVV R16, -24(R7)
	MOVV R17, -16(R7)
	MOVV R18, -8(R7)
	RET

move_large:
	// if (dst > src) && (dst < (src + count)),
	//     the regions are regarded as overlapping:
	//     jump to backward
	// else
	//     jump to forward
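	// When dst lies inside [src, src+count), a forward copy would
	// overwrite source bytes before they are read, so that case is
	// copied backward, from the end toward the start.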
	BGEU R5, R4, forward
	ADDV R5, R6, R10
	BLTU R4, R10, backward
forward:
	AND $7, R4, R9 // dst & 7
	BEQ R9, forward_move_64loop
forward_unaligned:
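	// Load 8 bytes at src and 8 bytes at the realigned src, then store
	// them at dst and at the realigned dst; the two stores overlap by
	// dst&7 bytes but write identical values there.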
	MOVV $8, R10
	SUBV R9, R10 // head = 8 - (dst & 7)
	MOVV (R5), R11
	SUBV R10, R6 // newcount = count - (8 - (dst & 7))
	ADDV R10, R5 // newsrc = src + (8 - (dst & 7))
	MOVV (R5), R12
	MOVV R11, (R4)
	ADDV R10, R4 // newdst = dst + (8 - (dst & 7))
	MOVV R12, (R4)
	SUBV $8, R6
	ADDV $8, R4
	ADDV $8, R5
	SGTU $65, R6, R9
	BNE R9, move_33through64
forward_move_64loop:
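	// Copy 64 bytes per iteration; R9 becomes 1 once fewer than 64
	// bytes remain after this iteration, which ends the loop.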
	SUBV $64, R6
	SGTU $64, R6, R9
	MOVV (R5), R11
	MOVV 8(R5), R12
	MOVV 16(R5), R13
	MOVV 24(R5), R14
	MOVV 32(R5), R15
	MOVV 40(R5), R16
	MOVV 48(R5), R17
	MOVV 56(R5), R18
	MOVV R11, (R4)
	MOVV R12, 8(R4)
	MOVV R13, 16(R4)
	MOVV R14, 24(R4)
	MOVV R15, 32(R4)
	MOVV R16, 40(R4)
	MOVV R17, 48(R4)
	MOVV R18, 56(R4)
	ADDV $64, R5
	ADDV $64, R4
	BEQ R9, forward_move_64loop
	// 0 < remaining_length < 64
	BNE R6, tail
	RET

	// The backward copy algorithm is the same as the forward
	// copy, except for the direction.
backward:
	AND $7, R7, R9 // dstend & 7
	BEQ R9, backward_move_64loop
backward_unaligned:
	MOVV -8(R8), R11
	SUBV R9, R6 // newcount = count - (dstend & 7)
	SUBV R9, R8 // newsrcend = srcend - (dstend & 7)
	MOVV -8(R8), R12
	MOVV R11, -8(R7)
	SUBV R9, R7 // newdstend = dstend - (dstend & 7)
	MOVV R12, -8(R7)
	SUBV $8, R6
	SUBV $8, R7
	SUBV $8, R8
	SGTU $65, R6, R9
	BNE R9, move_33through64
backward_move_64loop:
	SUBV $64, R6
	SGTU $64, R6, R9
	MOVV -8(R8), R11
	MOVV -16(R8), R12
	MOVV -24(R8), R13
	MOVV -32(R8), R14
	MOVV -40(R8), R15
	MOVV -48(R8), R16
	MOVV -56(R8), R17
	MOVV -64(R8), R18
	MOVV R11, -8(R7)
	MOVV R12, -16(R7)
	MOVV R13, -24(R7)
	MOVV R14, -32(R7)
	MOVV R15, -40(R7)
	MOVV R16, -48(R7)
	MOVV R17, -56(R7)
	MOVV R18, -64(R7)
	SUBV $64, R7
	SUBV $64, R8
	BEQ R9, backward_move_64loop
	// 0 < remaining_length < 64
	BNE R6, tail
	RET

// Use LSX (128-bit SIMD) instructions to implement memmove.
// n >= 256 bytes, check 16-byte alignment.
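// If the CPU does not implement LSX, fall back to the scalar move_large path.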
lsx_move_large:
	MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
	BEQ R9, move_large

	// if (dst > src) && (dst < (src + count)),
	//     the regions are regarded as overlapping:
	//     jump to lsx_backward
	// else
	//     jump to lsx_forward
	BGEU R5, R4, lsx_forward
	ADDV R5, R6, R10
	BLTU R4, R10, lsx_backward
lsx_forward:
	AND $15, R4, R9 // dst & 15
	BEQ R9, lsx_forward_move_128
lsx_forward_unaligned:
	MOVV $16, R10
	SUBV R9, R10 // head = 16 - (dst & 15)
	VMOVQ (R5), V0
	SUBV R10, R6 // newcount = count - (16 - (dst & 15))
	ADDV R10, R5 // newsrc = src + (16 - (dst & 15))
	VMOVQ (R5), V1
	VMOVQ V0, (R4)
	ADDV R10, R4 // newdst = dst + (16 - (dst & 15))
	VMOVQ V1, (R4)
	SUBV $16, R6
	ADDV $16, R4
	ADDV $16, R5
lsx_forward_move_128:
	SGTU $128, R6, R9
	BNE R9, lsx_forward_move_32
lsx_forward_move_128loop:
	SUBV $128, R6
	SGTU $128, R6, R9
	VMOVQ 0(R5), V0
	VMOVQ 16(R5), V1
	VMOVQ 32(R5), V2
	VMOVQ 48(R5), V3
	VMOVQ 64(R5), V4
	VMOVQ 80(R5), V5
	VMOVQ 96(R5), V6
	VMOVQ 112(R5), V7
	VMOVQ V0, 0(R4)
	VMOVQ V1, 16(R4)
	VMOVQ V2, 32(R4)
	VMOVQ V3, 48(R4)
	VMOVQ V4, 64(R4)
	VMOVQ V5, 80(R4)
	VMOVQ V6, 96(R4)
	VMOVQ V7, 112(R4)
	ADDV $128, R5
	ADDV $128, R4
	BEQ R9, lsx_forward_move_128loop
lsx_forward_move_32:
	SGTU $32, R6, R9
	BNE R9, lsx_forward_move_tail
lsx_forward_move_32loop:
	SUBV $32, R6
	SGTU $32, R6, R9
	VMOVQ 0(R5), V0
	VMOVQ 16(R5), V1
	VMOVQ V0, 0(R4)
	VMOVQ V1, 16(R4)
	ADDV $32, R5
	ADDV $32, R4
	BEQ R9, lsx_forward_move_32loop
lsx_forward_move_tail:
	// 0 < remaining_length < 64
	BNE R6, tail
	RET

lsx_backward:
	AND $15, R7, R9 // dstend & 15
	BEQ R9, lsx_backward_move_128
lsx_backward_unaligned:
	VMOVQ -16(R8), V0
	SUBV R9, R6 // newcount = count - (dstend & 15)
	SUBV R9, R8 // newsrcend = srcend - (dstend & 15)
	VMOVQ -16(R8), V1
	VMOVQ V0, -16(R7)
	SUBV R9, R7 // newdstend = dstend - (dstend & 15)
	VMOVQ V1, -16(R7)
	SUBV $16, R6
	SUBV $16, R7
	SUBV $16, R8
lsx_backward_move_128:
	SGTU $128, R6, R9
	BNE R9, lsx_backward_move_32
lsx_backward_move_128loop:
	SUBV $128, R6
	SGTU $128, R6, R9
	VMOVQ -16(R8), V0
	VMOVQ -32(R8), V1
	VMOVQ -48(R8), V2
	VMOVQ -64(R8), V3
	VMOVQ -80(R8), V4
	VMOVQ -96(R8), V5
	VMOVQ -112(R8), V6
	VMOVQ -128(R8), V7
	VMOVQ V0, -16(R7)
	VMOVQ V1, -32(R7)
	VMOVQ V2, -48(R7)
	VMOVQ V3, -64(R7)
	VMOVQ V4, -80(R7)
	VMOVQ V5, -96(R7)
	VMOVQ V6, -112(R7)
	VMOVQ V7, -128(R7)
	SUBV $128, R8
	SUBV $128, R7
	BEQ R9, lsx_backward_move_128loop
lsx_backward_move_32:
	SGTU $32, R6, R9
	BNE R9, lsx_backward_move_tail
lsx_backward_move_32loop:
	SUBV $32, R6
	SGTU $32, R6, R9
	VMOVQ -16(R8), V0
	VMOVQ -32(R8), V1
	VMOVQ V0, -16(R7)
	VMOVQ V1, -32(R7)
	SUBV $32, R8
	SUBV $32, R7
	BEQ R9, lsx_backward_move_32loop
lsx_backward_move_tail:
	// 0 < remaining_length < 64
	BNE R6, tail
	RET

// Use LASX (256-bit SIMD) instructions to implement memmove.
// n >= 256 bytes, check 32-byte alignment.
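// If the CPU does not implement LASX, fall back to the 128-bit LSX path.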
lasx_move_large:
	MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
	BEQ R9, lsx_move_large

	// if (dst > src) && (dst < (src + count)),
	//     the regions are regarded as overlapping:
	//     jump to lasx_backward
	// else
	//     jump to lasx_forward
	BGEU R5, R4, lasx_forward
	ADDV R5, R6, R10
	BLTU R4, R10, lasx_backward
lasx_forward:
	AND $31, R4, R9 // dst & 31
	BEQ R9, lasx_forward_move_256
lasx_forward_unaligned:
	MOVV $32, R10
	SUBV R9, R10 // head = 32 - (dst & 31)
	XVMOVQ (R5), X0
	SUBV R10, R6 // newcount = count - (32 - (dst & 31))
	ADDV R10, R5 // newsrc = src + (32 - (dst & 31))
	XVMOVQ (R5), X1
	XVMOVQ X0, (R4)
	ADDV R10, R4 // newdst = dst + (32 - (dst & 31))
	XVMOVQ X1, (R4)
	SUBV $32, R6
	ADDV $32, R4
	ADDV $32, R5
lasx_forward_move_256:
	SGTU $256, R6, R9
	BNE R9, lasx_forward_move_64
lasx_forward_move_256loop:
	SUBV $256, R6
	SGTU $256, R6, R9
	XVMOVQ 0(R5), X0
	XVMOVQ 32(R5), X1
	XVMOVQ 64(R5), X2
	XVMOVQ 96(R5), X3
	XVMOVQ 128(R5), X4
	XVMOVQ 160(R5), X5
	XVMOVQ 192(R5), X6
	XVMOVQ 224(R5), X7
	XVMOVQ X0, 0(R4)
	XVMOVQ X1, 32(R4)
	XVMOVQ X2, 64(R4)
	XVMOVQ X3, 96(R4)
	XVMOVQ X4, 128(R4)
	XVMOVQ X5, 160(R4)
	XVMOVQ X6, 192(R4)
	XVMOVQ X7, 224(R4)
	ADDV $256, R5
	ADDV $256, R4
	BEQ R9, lasx_forward_move_256loop
lasx_forward_move_64:
	SGTU $64, R6, R9
	BNE R9, lasx_forward_move_tail
lasx_forward_move_64loop:
	SUBV $64, R6
	SGTU $64, R6, R9
	XVMOVQ (R5), X0
	XVMOVQ 32(R5), X1
	XVMOVQ X0, (R4)
	XVMOVQ X1, 32(R4)
	ADDV $64, R5
	ADDV $64, R4
	BEQ R9, lasx_forward_move_64loop
lasx_forward_move_tail:
	// 0 < remaining_length < 64
	BNE R6, tail
	RET

lasx_backward:
	AND $31, R7, R9 // dstend & 31
	BEQ R9, lasx_backward_move_256
lasx_backward_unaligned:
	XVMOVQ -32(R8), X0
	SUBV R9, R6 // newcount = count - (dstend & 31)
	SUBV R9, R8 // newsrcend = srcend - (dstend & 31)
	XVMOVQ -32(R8), X1
	XVMOVQ X0, -32(R7)
	SUBV R9, R7 // newdstend = dstend - (dstend & 31)
	XVMOVQ X1, -32(R7)
	SUBV $32, R6
	SUBV $32, R7
	SUBV $32, R8
lasx_backward_move_256:
	SGTU $256, R6, R9
	BNE R9, lasx_backward_move_64
lasx_backward_move_256loop:
	SUBV $256, R6
	SGTU $256, R6, R9
	XVMOVQ -32(R8), X0
	XVMOVQ -64(R8), X1
	XVMOVQ -96(R8), X2
	XVMOVQ -128(R8), X3
	XVMOVQ -160(R8), X4
	XVMOVQ -192(R8), X5
	XVMOVQ -224(R8), X6
	XVMOVQ -256(R8), X7
	XVMOVQ X0, -32(R7)
	XVMOVQ X1, -64(R7)
	XVMOVQ X2, -96(R7)
	XVMOVQ X3, -128(R7)
	XVMOVQ X4, -160(R7)
	XVMOVQ X5, -192(R7)
	XVMOVQ X6, -224(R7)
	XVMOVQ X7, -256(R7)
	SUBV $256, R8
	SUBV $256, R7
	BEQ R9, lasx_backward_move_256loop
lasx_backward_move_64:
	SGTU $64, R6, R9
	BNE R9, lasx_backward_move_tail
lasx_backward_move_64loop:
	SUBV $64, R6
	SGTU $64, R6, R9
	XVMOVQ -32(R8), X0
	XVMOVQ -64(R8), X1
	XVMOVQ X0, -32(R7)
	XVMOVQ X1, -64(R7)
	SUBV $64, R8
	SUBV $64, R7
	BEQ R9, lasx_backward_move_64loop
lasx_backward_move_tail:
	// 0 < remaining_length < 64
	BNE R6, tail
	RET
