Text file
src/math/big/arithvec_s390x.s
1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !math_big_pure_go
6
7 #include "textflag.h"
8
// addVVvec stores x[i] + y[i] + carry into z[i] for each of the z_len words,
// propagating the carry from lower to higher addresses, and returns the
// final carry (0 or 1) in c.
// NOTE(review): the frame offsets (z+0, z_len+8, x+24, y+48, c+72) suggest a
// Go declaration of the form func addVVvec(z, x, y []Word) (c Word); that
// declaration is not visible in this file - confirm against the Go side.
//
// Three tiers are used: UU1 adds 16 words per pass with vector instructions,
// U1 adds 4 words per pass, and L1 adds the remaining words one at a time.
//
// Register use:
//   R2, R8, R9  base addresses of z, x and y
//   R3          word counter n, biased by -4 or -16 while an unrolled loop runs
//   R10         byte offset i, advanced by all three loops
//   R5, R6, R7  running x, y, z pointers in UU1; data registers in U1 and L1
//   R4          carry saved between iterations as 0 or -1, so that
//               ADDC R4, R4 recreates the hardware carry; negated at E1
//   R0          constant zero
//   V0          carry between the 128-bit additions in UU1
//
// In UU1 every loaded register has its two doublewords swapped (VPDI $0x4)
// before the 128-bit add, and every result is swapped back before the store,
// so that the lower-addressed word of each pair is the less significant one.
9 TEXT ·addVVvec(SB), NOSPLIT, $0
10 MOVD z_len+8(FP), R3 // R3 = n = z_len (number of words)
11 MOVD x+24(FP), R8 // R8 = base of x
12 MOVD y+48(FP), R9 // R9 = base of y
13 MOVD z+0(FP), R2 // R2 = base of z
14
15 MOVD $0, R4 // c = 0
16 MOVD $0, R0 // R0 serves as the zero source when the carry is saved
17 MOVD $0, R10 // i = 0 (byte offset)
18
19 // replace the BLT v1 below with an unconditional branch to disable the unrolled loops
20 SUB $4, R3 // n -= 4
21 BLT v1 // if n < 0 (fewer than 4 words) goto v1
22 SUB $12, R3 // n -= 12 (n is now z_len - 16)
23 BLT A1 // if n < 0 (fewer than 16 words) goto A1
24
25 MOVD R8, R5 // R5 = running pointer into x
26 MOVD R9, R6 // R6 = running pointer into y
27 MOVD R2, R7 // R7 = running pointer into z
28
29 // n >= 0
30 // regular loop body unrolled 16x
31 VZERO V0 // c = 0
32
33 UU1:
34 VLM 0(R5), V1, V4 // load 64 bytes of x into V1..V4
35 ADD $64, R5
36 VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
37 VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
38
39 VLM 0(R6), V9, V12 // load 64 bytes of y into V9..V12
40 ADD $64, R6
41 VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
42 VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
43
44 VACCCQ V1, V9, V0, V25 // V25 = carry out of V1 + V9 + carry in V0
45 VACQ V1, V9, V0, V17 // V17 = V1 + V9 + carry in V0
46 VACCCQ V2, V10, V25, V26 // V26 = carry out of V2 + V10 + carry in V25
47 VACQ V2, V10, V25, V18 // V18 = V2 + V10 + carry in V25
48
49 VLM 0(R5), V5, V6 // load the next 32 bytes of x into V5..V6
50 VLM 0(R6), V13, V14 // load the next 32 bytes of y into V13..V14
51 ADD $32, R5
52 ADD $32, R6
53
54 VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
55 VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
56 VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
57 VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
58
59 VACCCQ V3, V11, V26, V27 // V27 = carry out of V3 + V11 + carry in V26
60 VACQ V3, V11, V26, V19 // V19 = V3 + V11 + carry in V26
61 VACCCQ V4, V12, V27, V28 // V28 = carry out of V4 + V12 + carry in V27
62 VACQ V4, V12, V27, V20 // V20 = V4 + V12 + carry in V27
63
64 VLM 0(R5), V7, V8 // load the last 32 bytes of x into V7..V8
65 VLM 0(R6), V15, V16 // load the last 32 bytes of y into V15..V16
66 ADD $32, R5
67 ADD $32, R6
68
69 VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
70 VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
71 VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
72 VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
73
74 VACCCQ V5, V13, V28, V29 // V29 = carry out of V5 + V13 + carry in V28
75 VACQ V5, V13, V28, V21 // V21 = V5 + V13 + carry in V28
76 VACCCQ V6, V14, V29, V30 // V30 = carry out of V6 + V14 + carry in V29
77 VACQ V6, V14, V29, V22 // V22 = V6 + V14 + carry in V29
78
79 VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
80 VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
81 VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
82 VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
83
84 VACCCQ V7, V15, V30, V31 // V31 = carry out of V7 + V15 + carry in V30
85 VACQ V7, V15, V30, V23 // V23 = V7 + V15 + carry in V30
86 VACCCQ V8, V16, V31, V0 // V0 has carry-over for the next pass (or for R4 on exit)
87 VACQ V8, V16, V31, V24 // V24 = V8 + V16 + carry in V31
88
89 VPDI $0x4, V17, V17, V17 // flip the doublewords back to memory order
90 VPDI $0x4, V18, V18, V18 // flip the doublewords back to memory order
91 VPDI $0x4, V19, V19, V19 // flip the doublewords back to memory order
92 VPDI $0x4, V20, V20, V20 // flip the doublewords back to memory order
93 VPDI $0x4, V21, V21, V21 // flip the doublewords back to memory order
94 VPDI $0x4, V22, V22, V22 // flip the doublewords back to memory order
95 VPDI $0x4, V23, V23, V23 // flip the doublewords back to memory order
96 VPDI $0x4, V24, V24, V24 // flip the doublewords back to memory order
97 VSTM V17, V24, 0(R7) // store 128 bytes (V17..V24) into z
98 ADD $128, R7
99 ADD $128, R10 // i += 16 words (128 bytes), keeps i valid for U1 and L1
100 SUB $16, R3 // n -= 16
101 BGE UU1 // if n >= 0 goto UU1
102 VLGVG $1, V0, R4 // R4 = doubleword 1 of V0, which holds the carry
103 NEG R4, R4 // save cf as 0 or -1, the form the scalar loops expect
104
105 A1:
106 ADD $12, R3 // n += 12 (n is now the number of words left minus 4)
107
108 // replace the BLT below with an unconditional branch to disable the 4x unrolled loop
109 BLT v1 // if n < 0 goto v1
110
111 U1: // n >= 0
112 // regular loop body unrolled 4x
113 MOVD 0(R8)(R10*1), R5 // R5 = x[i]
114 MOVD 8(R8)(R10*1), R6 // R6 = x[i+1]
115 MOVD 16(R8)(R10*1), R7 // R7 = x[i+2]
116 MOVD 24(R8)(R10*1), R1 // R1 = x[i+3]
117 ADDC R4, R4 // restore CF: -1 + -1 carries out, 0 + 0 does not
118 MOVD 0(R9)(R10*1), R11
119 ADDE R11, R5 // R5 = x[i] + y[i] + CF
120 MOVD 8(R9)(R10*1), R11
121 ADDE R11, R6 // R6 = x[i+1] + y[i+1] + CF
122 MOVD 16(R9)(R10*1), R11
123 ADDE R11, R7 // R7 = x[i+2] + y[i+2] + CF
124 MOVD 24(R9)(R10*1), R11
125 ADDE R11, R1 // R1 = x[i+3] + y[i+3] + CF
126 MOVD R0, R4 // R4 = 0
127 ADDE R4, R4 // save CF: R4 = 0 + 0 + CF
128 NEG R4, R4 // keep it as 0 or -1 for the next ADDC
129 MOVD R5, 0(R2)(R10*1) // z[i] = R5
130 MOVD R6, 8(R2)(R10*1) // z[i+1] = R6
131 MOVD R7, 16(R2)(R10*1) // z[i+2] = R7
132 MOVD R1, 24(R2)(R10*1) // z[i+3] = R1
133
134 ADD $32, R10 // i += 4 words (32 bytes); CF was already saved in R4
135 SUB $4, R3 // n -= 4
136 BGE U1 // if n >= 0 goto U1
137
138 v1:
139 ADD $4, R3 // n += 4 (n is now the number of words left)
140 BLE E1 // if n <= 0 goto E1
141
142 L1: // n > 0
143 ADDC R4, R4 // restore CF
144 MOVD 0(R8)(R10*1), R5 // R5 = x[i]
145 MOVD 0(R9)(R10*1), R11 // R11 = y[i]
146 ADDE R11, R5 // R5 = x[i] + y[i] + CF
147 MOVD R5, 0(R2)(R10*1) // z[i] = R5
148 MOVD R0, R4 // R4 = 0
149 ADDE R4, R4 // save CF
150 NEG R4, R4 // keep it as 0 or -1 for the next ADDC
151
152 ADD $8, R10 // i++
153 SUB $1, R3 // n--
154 BGT L1 // if n > 0 goto L1
155
156 E1:
157 NEG R4, R4 // convert the saved 0 / -1 back to 0 / 1
158 MOVD R4, c+72(FP) // return c
159 RET
160
// subVVvec stores x[i] - y[i] - borrow into z[i] for each of the z_len words,
// propagating the borrow from lower to higher addresses, and returns the
// final borrow (0 or 1) in c.
// NOTE(review): the frame offsets (z+0, z_len+8, x+24, y+48, c+72) suggest a
// Go declaration of the form func subVVvec(z, x, y []Word) (c Word); that
// declaration is not visible in this file - confirm against the Go side.
//
// Three tiers are used: UU1 subtracts 16 words per pass with vector
// instructions, U1 subtracts 4 words per pass, and L1 handles the remaining
// words one at a time.
//
// Register use:
//   R2, R8, R9  base addresses of z, x and y
//   R3          word counter n, biased by -4 or -16 while an unrolled loop runs
//   R10         byte offset i, advanced by all three loops
//   R5, R6, R7  running x, y, z pointers in UU1; data registers in U1 and L1
//   R4          borrow saved between iterations as 0 (no borrow) or -1
//               (borrow), so that SUBC R4, R11 with R11 = 0 recreates the
//               hardware borrow state; negated at E1
//   R0          constant zero
//   V0          borrow indication between the 128-bit subtractions in UU1,
//               where 1 means "no borrow"
//
// In UU1 every loaded register has its two doublewords swapped (VPDI $0x4)
// before the 128-bit subtract, and every result is swapped back before the
// store, so that the lower-addressed word of each pair is the less
// significant one.
161 TEXT ·subVVvec(SB), NOSPLIT, $0
162 MOVD z_len+8(FP), R3 // R3 = n = z_len (number of words)
163 MOVD x+24(FP), R8 // R8 = base of x
164 MOVD y+48(FP), R9 // R9 = base of y
165 MOVD z+0(FP), R2 // R2 = base of z
166 MOVD $0, R4 // c = 0
167 MOVD $0, R0 // R0 serves as the zero source when the borrow is saved or restored
168 MOVD $0, R10 // i = 0 (byte offset)
169
170 // replace the BLT v1 below with an unconditional branch to disable the unrolled loops
171 SUB $4, R3 // n -= 4
172 BLT v1 // if n < 0 (fewer than 4 words) goto v1
173 SUB $12, R3 // n -= 12 (n is now z_len - 16)
174 BLT A1 // if n < 0 (fewer than 16 words) goto A1
175
176 MOVD R8, R5 // R5 = running pointer into x
177 MOVD R9, R6 // R6 = running pointer into y
178 MOVD R2, R7 // R7 = running pointer into z
179
180 // n >= 0
181 // regular loop body unrolled 16x
182 VZERO V0 // cf = 0
183 MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow)
184 VLVGG $1, R4, V0 // put carry into doubleword 1 of V0
185
186 UU1:
187 VLM 0(R5), V1, V4 // load 64 bytes of x into V1..V4
188 ADD $64, R5
189 VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
190 VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
191
192 VLM 0(R6), V9, V12 // load 64 bytes of y into V9..V12
193 ADD $64, R6
194 VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
195 VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
196
197 VSBCBIQ V1, V9, V0, V25 // V25 = borrow indication out of V1 - V9 with borrow in V0
198 VSBIQ V1, V9, V0, V17 // V17 = V1 - V9 with borrow in V0
199 VSBCBIQ V2, V10, V25, V26 // V26 = borrow indication out of V2 - V10 with borrow in V25
200 VSBIQ V2, V10, V25, V18 // V18 = V2 - V10 with borrow in V25
201
202 VLM 0(R5), V5, V6 // load the next 32 bytes of x into V5..V6
203 VLM 0(R6), V13, V14 // load the next 32 bytes of y into V13..V14
204 ADD $32, R5
205 ADD $32, R6
206
207 VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
208 VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
209 VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
210 VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
211
212 VSBCBIQ V3, V11, V26, V27 // V27 = borrow indication out of V3 - V11 with borrow in V26
213 VSBIQ V3, V11, V26, V19 // V19 = V3 - V11 with borrow in V26
214 VSBCBIQ V4, V12, V27, V28 // V28 = borrow indication out of V4 - V12 with borrow in V27
215 VSBIQ V4, V12, V27, V20 // V20 = V4 - V12 with borrow in V27
216
217 VLM 0(R5), V7, V8 // load the last 32 bytes of x into V7..V8
218 VLM 0(R6), V15, V16 // load the last 32 bytes of y into V15..V16
219 ADD $32, R5
220 ADD $32, R6
221
222 VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
223 VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
224 VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
225 VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
226
227 VSBCBIQ V5, V13, V28, V29 // V29 = borrow indication out of V5 - V13 with borrow in V28
228 VSBIQ V5, V13, V28, V21 // V21 = V5 - V13 with borrow in V28
229 VSBCBIQ V6, V14, V29, V30 // V30 = borrow indication out of V6 - V14 with borrow in V29
230 VSBIQ V6, V14, V29, V22 // V22 = V6 - V14 with borrow in V29
231
232 VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
233 VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
234 VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
235 VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
236
237 VSBCBIQ V7, V15, V30, V31 // V31 = borrow indication out of V7 - V15 with borrow in V30
238 VSBIQ V7, V15, V30, V23 // V23 = V7 - V15 with borrow in V30
239 VSBCBIQ V8, V16, V31, V0 // V0 has carry-over for the next pass (or for R4 on exit)
240 VSBIQ V8, V16, V31, V24 // V24 = V8 - V16 with borrow in V31
241
242 VPDI $0x4, V17, V17, V17 // flip the doublewords back to memory order
243 VPDI $0x4, V18, V18, V18 // flip the doublewords back to memory order
244 VPDI $0x4, V19, V19, V19 // flip the doublewords back to memory order
245 VPDI $0x4, V20, V20, V20 // flip the doublewords back to memory order
246 VPDI $0x4, V21, V21, V21 // flip the doublewords back to memory order
247 VPDI $0x4, V22, V22, V22 // flip the doublewords back to memory order
248 VPDI $0x4, V23, V23, V23 // flip the doublewords back to memory order
249 VPDI $0x4, V24, V24, V24 // flip the doublewords back to memory order
250 VSTM V17, V24, 0(R7) // store 128 bytes (V17..V24) into z
251 ADD $128, R7
252 ADD $128, R10 // i += 16 words (128 bytes), keeps i valid for U1 and L1
253 SUB $16, R3 // n -= 16
254 BGE UU1 // if n >= 0 goto UU1
255 VLGVG $1, V0, R4 // R4 = doubleword 1 of V0: 1 for no borrow, 0 for borrow
256 SUB $1, R4 // save cf as 0 (no borrow) or -1 (borrow)
257
258 A1:
259 ADD $12, R3 // n += 12 (n is now the number of words left minus 4)
260 BLT v1 // if n < 0 goto v1
261
262 U1: // n >= 0
263 // regular loop body unrolled 4x
264 MOVD 0(R8)(R10*1), R5 // R5 = x[i]
265 MOVD 8(R8)(R10*1), R6 // R6 = x[i+1]
266 MOVD 16(R8)(R10*1), R7 // R7 = x[i+2]
267 MOVD 24(R8)(R10*1), R1 // R1 = x[i+3]
268 MOVD R0, R11 // R11 = 0
269 SUBC R4, R11 // restore CF: 0 - 0 does not borrow, 0 - (-1) does
270 MOVD 0(R9)(R10*1), R11
271 SUBE R11, R5 // R5 = x[i] - y[i] - borrow
272 MOVD 8(R9)(R10*1), R11
273 SUBE R11, R6 // R6 = x[i+1] - y[i+1] - borrow
274 MOVD 16(R9)(R10*1), R11
275 SUBE R11, R7 // R7 = x[i+2] - y[i+2] - borrow
276 MOVD 24(R9)(R10*1), R11
277 SUBE R11, R1 // R1 = x[i+3] - y[i+3] - borrow
278 MOVD R0, R4 // R4 = 0
279 SUBE R4, R4 // save CF: R4 = 0 - 0 - borrow, i.e. 0 or -1
280 MOVD R5, 0(R2)(R10*1) // z[i] = R5
281 MOVD R6, 8(R2)(R10*1) // z[i+1] = R6
282 MOVD R7, 16(R2)(R10*1) // z[i+2] = R7
283 MOVD R1, 24(R2)(R10*1) // z[i+3] = R1
284
285 ADD $32, R10 // i += 4 words (32 bytes); CF was already saved in R4
286 SUB $4, R3 // n -= 4
287 BGE U1 // if n >= 0 goto U1
288
289 v1:
290 ADD $4, R3 // n += 4 (n is now the number of words left)
291 BLE E1 // if n <= 0 goto E1
292
293 L1: // n > 0
294 MOVD R0, R11 // R11 = 0
295 SUBC R4, R11 // restore CF
296 MOVD 0(R8)(R10*1), R5 // R5 = x[i]
297 MOVD 0(R9)(R10*1), R11 // R11 = y[i]
298 SUBE R11, R5 // R5 = x[i] - y[i] - borrow
299 MOVD R5, 0(R2)(R10*1) // z[i] = R5
300 MOVD R0, R4 // R4 = 0
301 SUBE R4, R4 // save CF: R4 = 0 or -1
302
303 ADD $8, R10 // i++
304 SUB $1, R3 // n--
305 BGT L1 // if n > 0 goto L1
306
307 E1:
308 NEG R4, R4 // convert the saved 0 / -1 back to 0 / 1
309 MOVD R4, c+72(FP) // return c
310 RET
311
View as plain text