Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/rs6000/si2vmx.h @ 0:a06113de4d67
first commit
author | kent <kent@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 17 Jul 2009 14:47:48 +0900 |
parents | |
children | 04ced10e8804 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a06113de4d67 |
---|---|
1 /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics | |
2 Copyright (C) 2007, 2009 Free Software Foundation, Inc. | |
3 | |
4 This file is free software; you can redistribute it and/or modify it under | |
5 the terms of the GNU General Public License as published by the Free | |
6 Software Foundation; either version 3 of the License, or (at your option) | |
7 any later version. | |
8 | |
9 This file is distributed in the hope that it will be useful, but WITHOUT | |
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
12 for more details. | |
13 | |
14 Under Section 7 of GPL version 3, you are granted additional | |
15 permissions described in the GCC Runtime Library Exception, version | |
16 3.1, as published by the Free Software Foundation. | |
17 | |
18 You should have received a copy of the GNU General Public License and | |
19 a copy of the GCC Runtime Library Exception along with this program; | |
20 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
21 <http://www.gnu.org/licenses/>. */ | |
22 | |
#ifndef _SI2VMX_H_
#define _SI2VMX_H_ 1

/* This entire header is a no-op when compiling for the SPU itself;
 * the mappings below are only needed on the PPU/VMX side.
 */
#ifndef __SPU__

#include <stdlib.h>
#include <vec_types.h>


/* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_HALT_ACTION
#define SPU_HALT_ACTION abort()
#endif

/* Specify a default stop action for the spu_stop intrinsic.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_STOP_ACTION
#define SPU_STOP_ACTION abort()
#endif


/* Specify a default action for unsupported intrinsic.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_UNSUPPORTED_ACTION
#define SPU_UNSUPPORTED_ACTION abort()
#endif
58 /* Casting intrinsics - from scalar to quadword | |
59 */ | |
60 | |
61 static __inline qword si_from_uchar(unsigned char c) { | |
62 union { | |
63 qword q; | |
64 unsigned char c[16]; | |
65 } x; | |
66 x.c[3] = c; | |
67 return (x.q); | |
68 } | |
69 | |
70 static __inline qword si_from_char(signed char c) { | |
71 union { | |
72 qword q; | |
73 signed char c[16]; | |
74 } x; | |
75 x.c[3] = c; | |
76 return (x.q); | |
77 } | |
78 | |
79 static __inline qword si_from_ushort(unsigned short s) { | |
80 union { | |
81 qword q; | |
82 unsigned short s[8]; | |
83 } x; | |
84 x.s[1] = s; | |
85 return (x.q); | |
86 } | |
87 | |
88 static __inline qword si_from_short(short s) { | |
89 union { | |
90 qword q; | |
91 short s[8]; | |
92 } x; | |
93 x.s[1] = s; | |
94 return (x.q); | |
95 } | |
96 | |
97 | |
98 static __inline qword si_from_uint(unsigned int i) { | |
99 union { | |
100 qword q; | |
101 unsigned int i[4]; | |
102 } x; | |
103 x.i[0] = i; | |
104 return (x.q); | |
105 } | |
106 | |
107 static __inline qword si_from_int(int i) { | |
108 union { | |
109 qword q; | |
110 int i[4]; | |
111 } x; | |
112 x.i[0] = i; | |
113 return (x.q); | |
114 } | |
115 | |
116 static __inline qword si_from_ullong(unsigned long long l) { | |
117 union { | |
118 qword q; | |
119 unsigned long long l[2]; | |
120 } x; | |
121 x.l[0] = l; | |
122 return (x.q); | |
123 } | |
124 | |
125 static __inline qword si_from_llong(long long l) { | |
126 union { | |
127 qword q; | |
128 long long l[2]; | |
129 } x; | |
130 x.l[0] = l; | |
131 return (x.q); | |
132 } | |
133 | |
134 static __inline qword si_from_float(float f) { | |
135 union { | |
136 qword q; | |
137 float f[4]; | |
138 } x; | |
139 x.f[0] = f; | |
140 return (x.q); | |
141 } | |
142 | |
143 static __inline qword si_from_double(double d) { | |
144 union { | |
145 qword q; | |
146 double d[2]; | |
147 } x; | |
148 x.d[0] = d; | |
149 return (x.q); | |
150 } | |
151 | |
152 static __inline qword si_from_ptr(void *ptr) { | |
153 union { | |
154 qword q; | |
155 void *p; | |
156 } x; | |
157 x.p = ptr; | |
158 return (x.q); | |
159 } | |
160 | |
161 | |
162 /* Casting intrinsics - from quadword to scalar | |
163 */ | |
164 static __inline unsigned char si_to_uchar(qword q) { | |
165 union { | |
166 qword q; | |
167 unsigned char c[16]; | |
168 } x; | |
169 x.q = q; | |
170 return (x.c[3]); | |
171 } | |
172 | |
173 static __inline signed char si_to_char(qword q) { | |
174 union { | |
175 qword q; | |
176 signed char c[16]; | |
177 } x; | |
178 x.q = q; | |
179 return (x.c[3]); | |
180 } | |
181 | |
182 static __inline unsigned short si_to_ushort(qword q) { | |
183 union { | |
184 qword q; | |
185 unsigned short s[8]; | |
186 } x; | |
187 x.q = q; | |
188 return (x.s[1]); | |
189 } | |
190 | |
191 static __inline short si_to_short(qword q) { | |
192 union { | |
193 qword q; | |
194 short s[8]; | |
195 } x; | |
196 x.q = q; | |
197 return (x.s[1]); | |
198 } | |
199 | |
200 static __inline unsigned int si_to_uint(qword q) { | |
201 union { | |
202 qword q; | |
203 unsigned int i[4]; | |
204 } x; | |
205 x.q = q; | |
206 return (x.i[0]); | |
207 } | |
208 | |
209 static __inline int si_to_int(qword q) { | |
210 union { | |
211 qword q; | |
212 int i[4]; | |
213 } x; | |
214 x.q = q; | |
215 return (x.i[0]); | |
216 } | |
217 | |
218 static __inline unsigned long long si_to_ullong(qword q) { | |
219 union { | |
220 qword q; | |
221 unsigned long long l[2]; | |
222 } x; | |
223 x.q = q; | |
224 return (x.l[0]); | |
225 } | |
226 | |
227 static __inline long long si_to_llong(qword q) { | |
228 union { | |
229 qword q; | |
230 long long l[2]; | |
231 } x; | |
232 x.q = q; | |
233 return (x.l[0]); | |
234 } | |
235 | |
236 static __inline float si_to_float(qword q) { | |
237 union { | |
238 qword q; | |
239 float f[4]; | |
240 } x; | |
241 x.q = q; | |
242 return (x.f[0]); | |
243 } | |
244 | |
245 static __inline double si_to_double(qword q) { | |
246 union { | |
247 qword q; | |
248 double d[2]; | |
249 } x; | |
250 x.q = q; | |
251 return (x.d[0]); | |
252 } | |
253 | |
254 static __inline void * si_to_ptr(qword q) { | |
255 union { | |
256 qword q; | |
257 void *p; | |
258 } x; | |
259 x.q = q; | |
260 return (x.p); | |
261 } | |
262 | |
263 | |
264 /* Absolute difference | |
265 */ | |
266 static __inline qword si_absdb(qword a, qword b) | |
267 { | |
268 vec_uchar16 ac, bc, dc; | |
269 | |
270 ac = (vec_uchar16)(a); | |
271 bc = (vec_uchar16)(b); | |
272 dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc)); | |
273 | |
274 return ((qword)(dc)); | |
275 } | |
276 | |
277 /* Add intrinsics | |
278 */ | |
279 #define si_a(_a, _b) ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b)))) | |
280 | |
281 #define si_ah(_a, _b) ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b)))) | |
282 | |
283 static __inline qword si_ai(qword a, int b) | |
284 { | |
285 return ((qword)(vec_add((vec_int4)(a), | |
286 vec_splat((vec_int4)(si_from_int(b)), 0)))); | |
287 } | |
288 | |
289 | |
290 static __inline qword si_ahi(qword a, short b) | |
291 { | |
292 return ((qword)(vec_add((vec_short8)(a), | |
293 vec_splat((vec_short8)(si_from_short(b)), 1)))); | |
294 } | |
295 | |
296 | |
297 #define si_fa(_a, _b) ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b)))) | |
298 | |
299 | |
300 static __inline qword si_dfa(qword a, qword b) | |
301 { | |
302 union { | |
303 vec_double2 v; | |
304 double d[2]; | |
305 } ad, bd, dd; | |
306 | |
307 ad.v = (vec_double2)(a); | |
308 bd.v = (vec_double2)(b); | |
309 dd.d[0] = ad.d[0] + bd.d[0]; | |
310 dd.d[1] = ad.d[1] + bd.d[1]; | |
311 | |
312 return ((qword)(dd.v)); | |
313 } | |
314 | |
315 /* Add word extended | |
316 */ | |
317 #define si_addx(_a, _b, _c) ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \ | |
318 vec_and((vec_uint4)(_c), vec_splat_u32(1))))) | |
319 | |
320 | |
321 /* Bit-wise AND | |
322 */ | |
323 #define si_and(_a, _b) ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b)))) | |
324 | |
325 | |
326 static __inline qword si_andbi(qword a, signed char b) | |
327 { | |
328 return ((qword)(vec_and((vec_char16)(a), | |
329 vec_splat((vec_char16)(si_from_char(b)), 3)))); | |
330 } | |
331 | |
332 static __inline qword si_andhi(qword a, signed short b) | |
333 { | |
334 return ((qword)(vec_and((vec_short8)(a), | |
335 vec_splat((vec_short8)(si_from_short(b)), 1)))); | |
336 } | |
337 | |
338 | |
339 static __inline qword si_andi(qword a, signed int b) | |
340 { | |
341 return ((qword)(vec_and((vec_int4)(a), | |
342 vec_splat((vec_int4)(si_from_int(b)), 0)))); | |
343 } | |
344 | |
345 | |
346 /* Bit-wise AND with complement | |
347 */ | |
348 #define si_andc(_a, _b) ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b)))) | |
349 | |
350 | |
351 /* Average byte vectors | |
352 */ | |
353 #define si_avgb(_a, _b) ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b)))) | |
354 | |
355 | |
/* Branch indirect and set link on external data.
 * These SPU control-flow intrinsics have no PPU/VMX counterpart, so
 * they deliberately expand to nothing.
 */
#define si_bisled(_func) /* not mappable */
#define si_bisledd(_func) /* not mappable */
#define si_bislede(_func) /* not mappable */


/* Borrow generate: per-word, 1 where no borrow occurs computing a - b,
 * 0 where a borrow occurs.  si_bgx additionally folds in the incoming
 * borrow held in the low bit of each word of _c.
 */
#define si_bg(_a, _b) ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))

#define si_bgx(_a, _b, _c) ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)), \
						    vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), \
							    (vec_uint4)(_c))), vec_splat_u32(1))))
370 | |
371 /* Compare absolute equal | |
372 */ | |
373 static __inline qword si_fcmeq(qword a, qword b) | |
374 { | |
375 vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000}); | |
376 | |
377 return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb), | |
378 vec_andc((vec_float4)(b), msb)))); | |
379 } | |
380 | |
/* Double-precision compare absolute equal.  VMX lacks 64-bit compares,
 * so each double is handled as a (high word, low word) pair of 32-bit
 * words: the magnitudes are compared bit-wise, and a NaN operand forces
 * a false result.  Each 64-bit result lane is all ones or all zeros.
 */
static __inline qword si_dfcmeq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 result;

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (vec_slo takes its octet count from the last vector byte) */
  x.i[3] = 4 << 3;

  /* Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /* A) Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /*
  B) Check if a is NaN, store in high word

  B1) If the high word is greater than max_exp (indicates a NaN)
  B2) If the low word is greater than 0
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /* B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /* anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /* result = A and not B */
  result = vec_andc(biteq, anan);

  /* Promote high words to 64 bits and return */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
431 | |
432 | |
433 /* Compare absolute greater than | |
434 */ | |
435 static __inline qword si_fcmgt(qword a, qword b) | |
436 { | |
437 vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000}); | |
438 | |
439 return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb), | |
440 vec_andc((vec_float4)(b), msb)))); | |
441 } | |
442 | |
/* Double-precision compare absolute greater than.  Each double is
 * treated as a pair of 32-bit words; magnitudes (sign masked off) are
 * compared high-word first, with a low-word tie-break, and any NaN
 * operand forces a false result for that lane.
 */
static __inline qword si_dfcmgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (vec_slo octet count lives in the last vector byte) */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // A) Check if the exponents are different
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  // If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
490 | |
491 | |
492 /* Compare equal | |
493 */ | |
494 static __inline qword si_ceqb(qword a, qword b) | |
495 { | |
496 return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b)))); | |
497 } | |
498 | |
499 static __inline qword si_ceqh(qword a, qword b) | |
500 { | |
501 return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b)))); | |
502 } | |
503 | |
504 static __inline qword si_ceq(qword a, qword b) | |
505 { | |
506 return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b)))); | |
507 } | |
508 | |
509 static __inline qword si_fceq(qword a, qword b) | |
510 { | |
511 return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b)))); | |
512 } | |
513 | |
514 static __inline qword si_ceqbi(qword a, signed char b) | |
515 { | |
516 return ((qword)(vec_cmpeq((vec_char16)(a), | |
517 vec_splat((vec_char16)(si_from_char(b)), 3)))); | |
518 } | |
519 | |
520 static __inline qword si_ceqhi(qword a, signed short b) | |
521 { | |
522 return ((qword)(vec_cmpeq((vec_short8)(a), | |
523 vec_splat((vec_short8)(si_from_short(b)), 1)))); | |
524 } | |
525 | |
526 static __inline qword si_ceqi(qword a, signed int b) | |
527 { | |
528 return ((qword)(vec_cmpeq((vec_int4)(a), | |
529 vec_splat((vec_int4)(si_from_int(b)), 0)))); | |
530 } | |
531 | |
/* Double-precision compare equal.  Like si_dfcmeq but on the signed
 * values: bit equality per 64-bit lane, plus the 0 == -0 special case,
 * with NaN operands forcing a false result.
 */
static __inline qword si_dfceq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 iszero;
  vec_uint4 result;

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (vec_slo octet count lives in the last vector byte) */
  x.i[3] = 4 << 3;

  /* A) Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /* Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /*
  B) Check if a is NaN, store in high word

  B1) If the high word is greater than max_exp (indicates a NaN)
  B2) If the low word is greater than 0
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /* B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /* anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /* C) Check for 0 = -0 special case */
  iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
  iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));

  /* result = (A or C) and not B */
  result = vec_or(biteq,iszero);
  result = vec_andc(result, anan);

  /* Promote high words to 64 bits and return */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
588 | |
589 | |
590 /* Compare greater than | |
591 */ | |
592 static __inline qword si_cgtb(qword a, qword b) | |
593 { | |
594 return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b)))); | |
595 } | |
596 | |
597 static __inline qword si_cgth(qword a, qword b) | |
598 { | |
599 return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b)))); | |
600 } | |
601 | |
602 static __inline qword si_cgt(qword a, qword b) | |
603 { | |
604 return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b)))); | |
605 } | |
606 | |
607 static __inline qword si_clgtb(qword a, qword b) | |
608 { | |
609 return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b)))); | |
610 } | |
611 | |
612 static __inline qword si_clgth(qword a, qword b) | |
613 { | |
614 return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b)))); | |
615 } | |
616 | |
617 static __inline qword si_clgt(qword a, qword b) | |
618 { | |
619 return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b)))); | |
620 } | |
621 | |
622 static __inline qword si_fcgt(qword a, qword b) | |
623 { | |
624 return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b)))); | |
625 } | |
626 | |
/* Double-precision compare greater than.  Each double is a pair of
 * 32-bit words.  Negative operands are converted to two's-complement
 * form (64-bit negation built from 32-bit pieces via a borrow shuffle)
 * so one ordered integer compare works for both signs; NaN operands
 * force a false result for that lane.
 */
static __inline qword si_dfcgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (vec_slo octet count lives in the last vector byte) */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // sign of a
  vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);

  // sign of b
  vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);

  // negative a
  vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
  vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
  abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);

  // negative b
  vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
  bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);

  // A) Check if the exponents are different
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  // If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
700 | |
701 static __inline qword si_cgtbi(qword a, signed char b) | |
702 { | |
703 return ((qword)(vec_cmpgt((vec_char16)(a), | |
704 vec_splat((vec_char16)(si_from_char(b)), 3)))); | |
705 } | |
706 | |
707 static __inline qword si_cgthi(qword a, signed short b) | |
708 { | |
709 return ((qword)(vec_cmpgt((vec_short8)(a), | |
710 vec_splat((vec_short8)(si_from_short(b)), 1)))); | |
711 } | |
712 | |
713 static __inline qword si_cgti(qword a, signed int b) | |
714 { | |
715 return ((qword)(vec_cmpgt((vec_int4)(a), | |
716 vec_splat((vec_int4)(si_from_int(b)), 0)))); | |
717 } | |
718 | |
719 static __inline qword si_clgtbi(qword a, unsigned char b) | |
720 { | |
721 return ((qword)(vec_cmpgt((vec_uchar16)(a), | |
722 vec_splat((vec_uchar16)(si_from_uchar(b)), 3)))); | |
723 } | |
724 | |
725 static __inline qword si_clgthi(qword a, unsigned short b) | |
726 { | |
727 return ((qword)(vec_cmpgt((vec_ushort8)(a), | |
728 vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); | |
729 } | |
730 | |
731 static __inline qword si_clgti(qword a, unsigned int b) | |
732 { | |
733 return ((qword)(vec_cmpgt((vec_uint4)(a), | |
734 vec_splat((vec_uint4)(si_from_uint(b)), 0)))); | |
735 } | |
736 | |
/* Double-precision test special value.  b is a bit mask selecting which
 * classes to test each double for:
 *   0x40 NaN, 0x20 +inf, 0x10 -inf,
 *   0x08 +0, 0x04 -0, 0x02 +denorm, 0x01 -denorm.
 * Each 64-bit lane of the result is all ones if its double matches any
 * selected class, all zeros otherwise.
 */
static __inline qword si_dftsv(qword a, char b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 result = (vec_uint4){0};
  /* Per-lane sign, splatted across both words of each double.  */
  vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
  vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (vec_slo octet count lives in the last vector byte) */
  x.i[3] = 4 << 3;

  /* Nan or +inf or -inf */
  if (b & 0x70)
  {
    vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
    vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
    /* NaN */
    if (b & 0x40)
    {
      vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
      a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
      a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
      result = vec_or(result, a_nan);
    }
    /* inf */
    if (b & 0x30)
    {
      a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
      a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
      /* +inf */
      if (b & 0x20)
        result = vec_or(vec_andc(a_inf, sign), result);
      /* -inf */
      if (b & 0x10)
        result = vec_or(vec_and(a_inf, sign), result);
    }
  }
  /* 0 or denorm */
  if (b & 0xF)
  {
    vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
    iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
    /* denorm */
    if (b & 0x3)
    {
      vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
      /* denorm: magnitude fits in the mantissa bits but is not zero.  */
      vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
      isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
      /* +denorm */
      if (b & 0x2)
        result = vec_or(vec_andc(isdenorm, sign), result);
      /* -denorm */
      if (b & 0x1)
        result = vec_or(vec_and(isdenorm, sign), result);
    }
    /* 0 */
    if (b & 0xC)
    {
      iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
      /* +0 */
      if (b & 0x8)
        result = vec_or(vec_andc(iszero, sign), result);
      /* -0 */
      if (b & 0x4)
        result = vec_or(vec_and(iszero, sign), result);
    }
  }
  return ((qword)result);
}
812 | |
813 | |
/* Carry generate: per-word carry out of a + b.  si_cgx additionally
 * folds in the incoming carry held in the low bit of each word of _c.
 */
#define si_cg(_a, _b) ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))

#define si_cgx(_a, _b, _c) ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), \
					   vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
						    vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
821 | |
822 | |
823 /* Count ones for bytes | |
824 */ | |
825 static __inline qword si_cntb(qword a) | |
826 { | |
827 vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; | |
828 vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; | |
829 vec_uchar16 av; | |
830 | |
831 av = (vec_uchar16)(a); | |
832 | |
833 return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av), | |
834 vec_perm(nib_cnt, nib_cnt, vec_sr (av, four))))); | |
835 } | |
836 | |
/* Count leading zeros for each word.
 * (NOTE: the original comment said "Count ones for bytes" — a
 * copy-paste from si_cntb above; the code below actually computes the
 * leading-zero count of each 32-bit word: per-nibble counts are looked
 * up, merged per byte, then propagated across the word, and the final
 * count lands in the low byte of each word.)
 */
static __inline qword si_clz(qword a)
{
  vec_uchar16 av;
  vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
  vec_uchar16 four = vec_splat_u8(4);
  /* Leading-zero count of a nibble value 0..15 (indexed by nibble).  */
  vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  vec_uchar16 eight = vec_splat_u8(8);
  vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
  vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};

  av = (vec_uchar16)(a);

  cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
  cnt_lo = vec_perm(nib_cnt, nib_cnt, av);

  /* Per-byte count: add the low-nibble count only where the high
     nibble was all zeros (its count saturated at 4).  */
  cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));

  tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
  tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
  tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));

  /* Cascade byte counts within each word: keep adding the next byte's
     count while the running count says all bits so far were zero.  */
  cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
  cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
  cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));

  /* Move each word's total into its low byte.  */
  return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
}
866 | |
/* Convert to float: word elements to single precision, scaled by
 * 2**(-_b) (unsigned and signed variants).
 */
#define si_cuflt(_a, _b) ((qword)(vec_ctf((vec_uint4)(_a), _b)))
#define si_csflt(_a, _b) ((qword)(vec_ctf((vec_int4)(_a), _b)))

/* Convert to signed int, scaled by 2**_b.
 */
#define si_cflts(_a, _b) ((qword)(vec_cts((vec_float4)(_a), _b)))

/* Convert to unsigned int, scaled by 2**_b.
 */
#define si_cfltu(_a, _b) ((qword)(vec_ctu((vec_float4)(_a), _b)))

/* Synchronize: the SPU channel/memory synchronization instructions have
 * no work to do in this PPU mapping, so they expand to nothing.
 */
#define si_dsync() /* do nothing */
#define si_sync() /* do nothing */
#define si_syncc() /* do nothing */
885 | |
886 | |
887 /* Equivalence | |
888 */ | |
889 static __inline qword si_eqv(qword a, qword b) | |
890 { | |
891 vec_uchar16 d; | |
892 | |
893 d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b)); | |
894 return ((qword)(vec_nor(d, d))); | |
895 } | |
896 | |
897 /* Extend | |
898 */ | |
899 static __inline qword si_xsbh(qword a) | |
900 { | |
901 vec_char16 av; | |
902 | |
903 av = (vec_char16)(a); | |
904 return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15, | |
905 0, 0, 0, 0, 0, 0, 0, 0}))))); | |
906 } | |
907 | |
908 static __inline qword si_xshw(qword a) | |
909 { | |
910 vec_short8 av; | |
911 | |
912 av = (vec_short8)(a); | |
913 return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7, | |
914 10,11,14,15, | |
915 0, 0, 0, 0, | |
916 0, 0, 0, 0}))))); | |
917 } | |
918 | |
919 static __inline qword si_xswd(qword a) | |
920 { | |
921 vec_int4 av; | |
922 | |
923 av = (vec_int4)(a); | |
924 return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})), | |
925 ((vec_uchar16){20, 21, 22, 23, | |
926 4, 5, 6, 7, | |
927 28, 29, 30, 31, | |
928 12, 13, 14, 15})))); | |
929 } | |
930 | |
931 static __inline qword si_fesd(qword a) | |
932 { | |
933 union { | |
934 double d[2]; | |
935 vec_double2 vd; | |
936 } out; | |
937 union { | |
938 float f[4]; | |
939 vec_float4 vf; | |
940 } in; | |
941 | |
942 in.vf = (vec_float4)(a); | |
943 out.d[0] = (double)(in.f[0]); | |
944 out.d[1] = (double)(in.f[2]); | |
945 return ((qword)(out.vd)); | |
946 } | |
947 | |
/* Gather.
 * si_gbb gathers the least-significant bit of each of the 16 bytes into
 * a 16-bit mask: each LSB is shifted to its position weight, the bits
 * are summed across bytes, and the two partial sums are placed in
 * bytes 2-3 of the result.
 */
static __inline qword si_gbb(qword a)
{
  vec_uchar16 bits;
  vec_uint4 bytes;

  bits = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
									   7, 6, 5, 4, 3, 2, 1, 0}));
  bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));

  return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
							 0, 0, 0, 0, 0, 0, 0, 0}))));
}
962 | |
963 | |
/* si_gbh gathers the least-significant bit of each of the 8 halfwords
 * into an 8-bit mask in the first word of the result: each LSB is
 * shifted to its weight, summed across all elements, and the total is
 * rotated into the leading word.
 */
static __inline qword si_gbh(qword a)
{
  vec_ushort8 bits;
  vec_uint4 bytes;

  bits = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));

  bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});

  return ((qword)(vec_sld(bytes, bytes, 12)));
}
975 | |
976 static __inline qword si_gb(qword a) | |
977 { | |
978 vec_uint4 bits; | |
979 vec_uint4 bytes; | |
980 | |
981 bits = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0})); | |
982 bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0})); | |
983 return ((qword)(vec_sld(bytes, bytes, 12))); | |
984 } | |
985 | |
986 | |
/* Compare and halt
 */
/* Each halt-on-compare intrinsic compares word element 0 of its
   operands and invokes the user-overridable SPU_HALT_ACTION when the
   condition holds.  */

/* si_heq: halt if word 0 of A equals word 0 of B.  */
static __inline void si_heq(qword a, qword b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa, bb;

  aa.v = (vector unsigned int)(a);
  bb.v = (vector unsigned int)(b);

  if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
}

/* si_heqi: halt if word 0 of A equals immediate B.  */
static __inline void si_heqi(qword a, unsigned int b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa;

  aa.v = (vector unsigned int)(a);

  if (aa.i[0] == b) { SPU_HALT_ACTION; };
}

/* si_hgt: halt if word 0 of A is greater than word 0 of B (signed).  */
static __inline void si_hgt(qword a, qword b)
{
  union {
    vector signed int v;
    signed int i[4];
  } aa, bb;

  aa.v = (vector signed int)(a);
  bb.v = (vector signed int)(b);

  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
}

/* si_hgti: halt if word 0 of A is greater than immediate B (signed).  */
static __inline void si_hgti(qword a, signed int b)
{
  union {
    vector signed int v;
    signed int i[4];
  } aa;

  aa.v = (vector signed int)(a);

  if (aa.i[0] > b) { SPU_HALT_ACTION; };
}

/* si_hlgt: halt if word 0 of A is logically (unsigned) greater than
   word 0 of B.  */
static __inline void si_hlgt(qword a, qword b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa, bb;

  aa.v = (vector unsigned int)(a);
  bb.v = (vector unsigned int)(b);

  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
}

/* si_hlgti: halt if word 0 of A is logically greater than immediate B.  */
static __inline void si_hlgti(qword a, unsigned int b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa;

  aa.v = (vector unsigned int)(a);

  if (aa.i[0] > b) { SPU_HALT_ACTION; };
}
1063 | |
1064 | |
/* Multiply and Add
 */
/* si_mpya: 16-bit multiply-accumulate.  The AND with the 0,-1 pattern
   keeps only the odd halfwords of A so that vec_msum adds exactly one
   16x16 product per word into C.  */
static __inline qword si_mpya(qword a, qword b, qword c)
{
  return ((qword)(vec_msum(vec_and((vec_short8)(a),
                                   ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
                           (vec_short8)(b), (vec_int4)(c))));
}

/* si_fma: single-precision fused multiply-add, a*b + c.  */
static __inline qword si_fma(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}

/* si_dfma: double-precision multiply-add computed per element with
   scalar arithmetic through unions (no VMX double vector ops).  */
static __inline qword si_dfma(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
  dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
  return ((qword)(dd.v));
}
1093 | |
/* Form Mask
 */
#define si_fsmbi(_a) si_fsmb(si_from_int(_a))

/* si_fsmb: expand the 16 bits found in halfword slot 1 (bytes 2-3)
   into a 16-byte mask: each result byte is 0x00 or 0xFF.  The splat
   + variable shift left/arithmetic shift right replicates each bit
   across its byte.  */
static __inline qword si_fsmb(qword a)
{
  vec_char16 mask;
  vec_ushort8 in;

  in = (vec_ushort8)(a);
  mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
                                                      3, 3, 3, 3, 3, 3, 3, 3})));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
                                                      0, 1, 2, 3, 4, 5, 6, 7})),
                          vec_splat_u8(7))));
}


/* si_fsmh: expand the low 8 bits (byte slot 3) into 8 halfword
   masks.  */
static __inline qword si_fsmh(qword a)
{
  vec_uchar16 in;
  vec_short8 mask;

  in = (vec_uchar16)(a);
  mask = (vec_short8)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
                          vec_splat_u16(15))));
}

/* si_fsm: expand the low 4 bits (byte slot 3) into 4 word masks.  */
static __inline qword si_fsm(qword a)
{
  vec_uchar16 in;
  vec_int4 mask;

  in = (vec_uchar16)(a);
  mask = (vec_int4)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
                          ((vec_uint4){31,31,31,31}))));
}

/* Move from/to registers
 * The SPU FP status/control register and SPRs have no PPU/VMX
 * equivalent: reads return zero and writes are discarded.
 */
#define si_fscrrd() ((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg) ((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)
1141 | |
/* Multiply High High Add
 */
/* si_mpyhha: multiply the even (high) halfwords of A and B and add
   the word products to C (signed).  */
static __inline qword si_mpyhha(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
}

/* si_mpyhhau: unsigned variant of si_mpyhha.  */
static __inline qword si_mpyhhau(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
}

/* Multiply Subtract
 */
/* si_fms: single-precision a*b - c, expressed as madd with negated C
   (VMX provides vec_madd/vec_nmsub but no direct msub).  */
static __inline qword si_fms(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
                           vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
}

/* si_dfms: double-precision a*b - c, scalar per element.  */
static __inline qword si_dfms(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
  dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
  return ((qword)(dd.v));
}

/* Multiply
 */
/* si_fm: single-precision multiply, via madd with a zero addend.  */
static __inline qword si_fm(qword a, qword b)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
}

/* si_dfm: double-precision multiply, scalar per element.  */
static __inline qword si_dfm(qword a, qword b)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  dd.d[0] = aa.d[0] * bb.d[0];
  dd.d[1] = aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}

/* Multiply High
 */
/* si_mpyh: multiply the high halfword of each word of A by the low
   halfword of each word of B, shifted left 16 — B's words are
   pre-shifted so vec_mule picks up their low halves, then the
   products are shifted back into position.  */
static __inline qword si_mpyh(qword a, qword b)
{
  vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};

  return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
}


/* Multiply High High
 */
/* si_mpyhh: product of the even (high) halfwords, signed.  */
static __inline qword si_mpyhh(qword a, qword b)
{
  return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
}

/* si_mpyhhu: product of the even halfwords, unsigned.  */
static __inline qword si_mpyhhu(qword a, qword b)
{
  return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Multiply Odd
 */
/* si_mpy: product of the odd (low) halfwords, signed.  */
static __inline qword si_mpy(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
}

/* si_mpyu: product of the odd halfwords, unsigned.  */
static __inline qword si_mpyu(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* si_mpyi: multiply odd halfwords of A by the immediate B, splatted
   into every odd halfword slot.  */
static __inline qword si_mpyi(qword a, short b)
{
  return ((qword)(vec_mulo((vec_short8)(a),
                           vec_splat((vec_short8)(si_from_short(b)), 1))));
}

/* si_mpyui: unsigned-immediate variant of si_mpyi.  */
static __inline qword si_mpyui(qword a, unsigned short b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a),
                           vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* Multiply and Shift Right
 */
/* si_mpys: odd-halfword product arithmetically shifted right 16.  */
static __inline qword si_mpys(qword a, qword b)
{
  return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
}
1250 | |
/* Nand
 */
/* si_nand: ~(a & b), built from AND followed by NOR with itself.  */
static __inline qword si_nand(qword a, qword b)
{
  vec_uchar16 d;

  d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
  return ((qword)(vec_nor(d, d)));
}

/* Negative Multiply Add
 */
/* si_dfnma: -(a*b + c) per double element, scalar.  */
static __inline qword si_dfnma(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}

/* Negative Multiply and Subtract
 */
/* si_fnms: single-precision c - a*b, mapped directly to vec_nmsub.  */
static __inline qword si_fnms(qword a, qword b, qword c)
{
  return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}

/* si_dfnms: double-precision c - a*b, scalar per element.  */
static __inline qword si_dfnms(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
1299 | |
/* Nor
 */
static __inline qword si_nor(qword a, qword b)
{
  return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* Or
 */
static __inline qword si_or(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* si_orbi: OR each byte of A with the immediate byte (splatted from
   byte slot 3 of the converted scalar).  */
static __inline qword si_orbi(qword a, unsigned char b)
{
  return ((qword)(vec_or((vec_uchar16)(a),
                         vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* si_orhi: OR each halfword of A with the immediate halfword.  */
static __inline qword si_orhi(qword a, unsigned short b)
{
  return ((qword)(vec_or((vec_ushort8)(a),
                         vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* si_ori: OR each word of A with the immediate word.  */
static __inline qword si_ori(qword a, unsigned int b)
{
  return ((qword)(vec_or((vec_uint4)(a),
                         vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}

/* Or Complement
 */
/* si_orc: a | ~b, with the complement built from NOR(b, b).  */
static __inline qword si_orc(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
}


/* Or Across
 */
/* si_orx: OR the four words of A together, result in word slot 0,
   remaining words cleared.  Uses a logarithmic fold (8-byte then
   4-byte shifts).  */
static __inline qword si_orx(qword a)
{
  vec_uchar16 tmp;
  tmp = (vec_uchar16)(a);
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
  return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
                                              0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
}
1351 | |
1352 | |
/* Estimates
 */
/* si_frest: reciprocal estimate, mapped to vec_re.  */
static __inline qword si_frest(qword a)
{
  return ((qword)(vec_re((vec_float4)(a))));
}

/* si_frsqest: reciprocal square-root estimate, mapped to vec_rsqrte.  */
static __inline qword si_frsqest(qword a)
{
  return ((qword)(vec_rsqrte((vec_float4)(a))));
}

/* si_fi (interpolate): the refinement step is the identity here —
   presumably the VMX estimates are treated as already refined; the
   estimate operand is returned unchanged.  */
#define si_fi(_a, _d) (_d)

/* Channel Read and Write
 * SPU channels do not exist on the PPU; reads yield zero and writes
 * are discarded.
 */
#define si_rdch(_channel) ((qword)(vec_splat_u8(0))) /* not mappable */
#define si_rchcnt(_channel) ((qword)(vec_splat_u8(0))) /* not mappable */
#define si_wrch(_channel, _a) /* not mappable */
1372 | |
/* Rotate Left
 */
/* si_roth: per-halfword rotate left by the corresponding element of B.  */
static __inline qword si_roth(qword a, qword b)
{
  return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* si_rot: per-word rotate left by the corresponding element of B.  */
static __inline qword si_rot(qword a, qword b)
{
  return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
}

/* si_rothi: per-halfword rotate left by immediate B.  */
static __inline qword si_rothi(qword a, int b)
{
  return ((qword)(vec_rl((vec_ushort8)(a),
                         vec_splat((vec_ushort8)(si_from_int(b)), 1))));
}

/* si_roti: per-word rotate left by immediate B.  */
static __inline qword si_roti(qword a, int b)
{
  return ((qword)(vec_rl((vec_uint4)(a),
                         vec_splat((vec_uint4)(si_from_int(b)), 0))));
}
1396 | |
/* Rotate Left with Mask
 * These implement the SPU "rotate and mask" (logical shift right by
 * the negated count) forms.  The mask is all-ones when the negated
 * count's relevant sign/overflow bit is set, forcing the result to
 * zero via andc for out-of-range counts.
 */
static __inline qword si_rothm(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  /* neg_b = -b; mask = 0xFFFF where bit 4 of neg_b is set (count >= 16).  */
  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

static __inline qword si_rotm(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  /* Word variant: bit 5 of -b (count >= 32) selects the zero mask.  */
  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}

/* Immediate halfword form of si_rothm.  */
static __inline qword si_rothmi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

/* Immediate word form of si_rotm.  */
static __inline qword si_rotmi(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}


/* Rotate Left Algebraic with Mask
 * Arithmetic (sign-propagating) shift right by the negated count.
 * ORing the overflow mask into the shift count saturates the count so
 * vec_sra fills with sign bits for out-of-range shifts.
 */
static __inline qword si_rotmah(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

static __inline qword si_rotma(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}


/* Immediate halfword form of si_rotmah.  */
static __inline qword si_rotmahi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

/* Immediate word form of si_rotma.  */
static __inline qword si_rotmai(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}
1482 | |
1483 | |
1484 /* Rotate Left Quadword by Bytes with Mask | |
1485 */ | |
1486 static __inline qword si_rotqmbyi(qword a, int count) | |
1487 { | |
1488 union { | |
1489 vec_uchar16 v; | |
1490 int i[4]; | |
1491 } x; | |
1492 vec_uchar16 mask; | |
1493 | |
1494 count = 0 - count; | |
1495 x.i[3] = count << 3; | |
1496 mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1); | |
1497 | |
1498 return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask))); | |
1499 } | |
1500 | |
1501 | |
/* si_rotqmby: register-count variant — shift A right by the negated
   byte count taken from word 0 of COUNT.  The count is converted to a
   bit count, splatted across the vector (byte 3 holds its low 8
   bits), and bit 7 of the converted count selects the zero mask for
   out-of-range shifts.  */
static __inline qword si_rotqmby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  x.i[0] = cnt = (0 - x.i[0]) << 3;

  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1519 | |
1520 | |
1521 /* Rotate Left Quadword by Bytes | |
1522 */ | |
1523 static __inline qword si_rotqbyi(qword a, int count) | |
1524 { | |
1525 union { | |
1526 vec_uchar16 v; | |
1527 int i[4]; | |
1528 } left, right; | |
1529 | |
1530 count <<= 3; | |
1531 left.i[3] = count; | |
1532 right.i[3] = 0 - count; | |
1533 return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v)))); | |
1534 } | |
1535 | |
/* si_rotqby: register-count byte rotate — byte slot 3 of COUNT holds
   the byte count, converted to bits (<< 3) and complemented for the
   right-shift half of the rotate.  */
static __inline qword si_rotqby(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  right = vec_sub(vec_splat_u8(0), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}

/* Rotate Left Quadword by Bytes Bit Count
 */
/* si_rotqbybi: like si_rotqby but COUNT is already a bit count; only
   its byte-granular part (bits 121:124 seen by vec_slo/vec_sro) takes
   effect.  */
static __inline qword si_rotqbybi(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_splat((vec_uchar16)(count), 3);
  right = vec_sub(vec_splat_u8(7), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}


/* Rotate Left Quadword by Bytes Bit Count
 */
/* si_rotqbii: rotate the whole quadword left by COUNT bits (0-7).
   The wrapped-out bits are recovered by shifting A right by 120 bits
   (octet shift of 15) and then right by (8 - count) bits.  */
static __inline qword si_rotqbii(qword a, int count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
                           (vec_uint4)vec_sub(vec_splat_u8(8), x)));
  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}

/* si_rotqbi: register form of si_rotqbii; the bit count comes from
   byte slot 3 of COUNT, masked to 0-7.  */
static __inline qword si_rotqbi(qword a, qword count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
                           (vec_uint4)vec_sub(vec_splat_u8(8), x)));

  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}
1583 | |
1584 | |
/* Rotate Left Quadword and Mask by Bits
 */
/* si_rotqmbii: shift the quadword right by -COUNT bits (0-7).  */
static __inline qword si_rotqmbii(qword a, int count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
}

/* si_rotqmbi: register form — right shift by the negation of byte
   slot 3 of COUNT.  */
static __inline qword si_rotqmbi(qword a, qword count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
}


/* Rotate Left Quadword and Mask by Bytes with Bit Count
 */
/* si_rotqmbybi: right octet shift by the negated byte-granular part
   of the bit count in word 0 of COUNT (bit bits are dropped by the
   ~7 mask); bit 7 of the negated count selects the zero mask for
   shifts of 16 bytes or more.  */
static __inline qword si_rotqmbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  x.i[0] = cnt = 0 - (x.i[0] & ~7);
  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1616 | |
1617 | |
1618 | |
1619 | |
/* Round Double to Float
 */
/* si_frds: convert the two doubles of A to floats placed in word
   slots 0 and 2 (slots 1 and 3 are zeroed), using scalar conversions
   through unions.  */
static __inline qword si_frds(qword a)
{
  union {
    vec_float4 v;
    float f[4];
  } d;
  union {
    vec_double2 v;
    double d[2];
  } in;

  in.v = (vec_double2)(a);
  d.v = (vec_float4){0.0f};
  d.f[0] = (float)in.d[0];
  d.f[2] = (float)in.d[1];

  return ((qword)(d.v));
}
1640 | |
/* Select Bits
 */
/* si_selb: bitwise select — bits of B where C is 1, bits of A where
   C is 0; direct map to vec_sel.  */
static __inline qword si_selb(qword a, qword b, qword c)
{
  return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
}


/* Shuffle Bytes
 */
/* si_shufb: SPU shuffle, which unlike vec_perm gives special results
   for control bytes with the top bit set (0x80->0x00, 0xC0->0xFF,
   0xE0->0x80).  A first vec_perm does the ordinary byte selection; a
   second vec_perm, driven by PAT, patches in the special-case
   constants from the literal vector for control bytes whose sign bit
   was set.  */
static __inline qword si_shufb(qword a, qword b, qword pattern)
{
  vec_uchar16 pat;

  pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
                vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
                vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
  return ((qword)(vec_perm(vec_perm(a, b, pattern),
                           ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
                                          0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
                           pat)));
}
1663 | |
1664 | |
/* Shift Left
 * SPU shifts produce zero for out-of-range counts, whereas VMX
 * vec_sl uses the count modulo the element size; the computed MASK
 * (sign-extended overflow bit of the count) zeroes those elements
 * via andc.
 */
static __inline qword si_shlh(qword a, qword b)
{
  vec_ushort8 mask;

  mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
}

static __inline qword si_shl(qword a, qword b)
{
  vec_uint4 mask;

  mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
}


/* Immediate halfword form of si_shlh.  */
static __inline qword si_shlhi(qword a, unsigned int b)
{
  vec_ushort8 mask;
  vec_ushort8 bv;

  bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
  mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
}

/* Immediate word form of si_shl.  */
static __inline qword si_shli(qword a, unsigned int b)
{
  vec_uint4 bv;
  vec_uint4 mask;

  bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
  mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
}


/* Shift Left Quadword
 */
/* si_shlqbii: shift the whole quadword left by COUNT bits (0-7).  */
static __inline qword si_shlqbii(qword a, unsigned int count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}

/* si_shlqbi: register form — bit count from byte slot 3 of COUNT.  */
static __inline qword si_shlqbi(qword a, qword count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(count), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}
1722 | |
1723 | |
1724 /* Shift Left Quadword by Bytes | |
1725 */ | |
1726 static __inline qword si_shlqbyi(qword a, unsigned int count) | |
1727 { | |
1728 union { | |
1729 vec_uchar16 v; | |
1730 int i[4]; | |
1731 } x; | |
1732 vec_uchar16 mask; | |
1733 | |
1734 x.i[3] = count << 3; | |
1735 mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1); | |
1736 return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask))); | |
1737 } | |
1738 | |
/* si_shlqby: register-count byte shift left — byte slot 3 of COUNT
   holds the byte count, converted to bits; bit 7 of the converted
   count selects the zero mask for counts of 16 or more.  */
static __inline qword si_shlqby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  cnt = x.i[0];
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}

/* Shift Left Quadword by Bytes with Bit Count
 */
/* si_shlqbybi: like si_shlqby but COUNT is already a bit count; only
   its byte-granular part (bits 121:124 seen by vec_slo) takes
   effect.  */
static __inline qword si_shlqbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_splat((vec_uchar16)(count), 3);
  cnt = x.i[0];
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
1770 | |
1771 | |
/* Stop and Signal
 * Both map to the user-overridable SPU_STOP_ACTION; the stop type and
 * stopd operands cannot be conveyed to the PPU and are ignored.
 */
#define si_stop(_type) SPU_STOP_ACTION
#define si_stopd(a, b, c) SPU_STOP_ACTION
1776 | |
1777 | |
/* Subtract
 * Note the SPU "subtract from" operand order: sfh/sf compute B - A.
 */
static __inline qword si_sfh(qword a, qword b)
{
  return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
}

static __inline qword si_sf(qword a, qword b)
{
  return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
}

/* si_fs: single-precision A - B.  */
static __inline qword si_fs(qword a, qword b)
{
  return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
}

/* si_dfs: double-precision A - B, scalar per element.  */
static __inline qword si_dfs(qword a, qword b)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  dd.d[0] = aa.d[0] - bb.d[0];
  dd.d[1] = aa.d[1] - bb.d[1];
  return ((qword)(dd.v));
}

/* si_sfhi: immediate halfword "subtract from": b - a per halfword.  */
static __inline qword si_sfhi(qword a, short b)
{
  return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
                          (vec_short8)(a))));
}

/* si_sfi: immediate word "subtract from": b - a per word.  */
static __inline qword si_sfi(qword a, int b)
{
  return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
                          (vec_int4)(a))));
}

/* Subtract word extended
 * b + ~a + (c & 1): subtract with borrow-in taken from bit 0 of C.
 */
#define si_sfx(_a, _b, _c) ((qword)(vec_add(vec_add((vec_uint4)(_b), \
                                                    vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), \
                                            vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
1826 | |
1827 | |
/* Sum Bytes into Shorts
 */
/* si_sumb: sum each group of 4 bytes of A and B into halfwords; the
   permute interleaves the per-word sums of B and A into the SPU
   result layout.  */
static __inline qword si_sumb(qword a, qword b)
{
  vec_uint4 zero = (vec_uint4){0};
  vec_ushort8 sum_a, sum_b;

  sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
  sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);

  return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19, 2, 3, 22, 23, 6, 7,
                                                        26, 27, 10, 11, 30, 31, 14, 15}))));
}
1841 | |
/* Exclusive OR
 */
static __inline qword si_xor(qword a, qword b)
{
  return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* si_xorbi: XOR each byte of A with the immediate byte.  */
static __inline qword si_xorbi(qword a, unsigned char b)
{
  return ((qword)(vec_xor((vec_uchar16)(a),
                          vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* si_xorhi: XOR each halfword of A with the immediate halfword.  */
static __inline qword si_xorhi(qword a, unsigned short b)
{
  return ((qword)(vec_xor((vec_ushort8)(a),
                          vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* si_xori: XOR each word of A with the immediate word.  */
static __inline qword si_xori(qword a, unsigned int b)
{
  return ((qword)(vec_xor((vec_uint4)(a),
                          vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
1866 | |
1867 | |
/* Generate Controls for Sub-Quadword Insertion
 * Each generator starts from the identity-for-operand-B shuffle
 * pattern 0x10..0x1F and overwrites the element addressed by
 * (address + imm) with the marker selecting the preferred-slot data,
 * producing a control word for si_shufb-style insertion.
 */

/* si_cbd: byte insertion control; address from word 0 of A plus IMM.  */
static __inline qword si_cbd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned char c[16];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}

/* si_cdd: doubleword insertion control.  */
static __inline qword si_cdd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned long long ll[2];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}

/* si_chd: halfword insertion control.  */
static __inline qword si_chd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned short s[8];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}

/* si_cwd: word insertion control.  */
static __inline qword si_cwd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned int i[4];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}

/* si_cbx: byte insertion control, indexed form (address = A + B).  */
static __inline qword si_cbx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned char c[16];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}


/* si_cdx: doubleword insertion control, indexed form.  */
static __inline qword si_cdx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned long long ll[2];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}

/* si_chx: halfword insertion control, indexed form.  */
static __inline qword si_chx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned short s[8];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}

/* si_cwx: word insertion control, indexed form.  */
static __inline qword si_cwx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned int i[4];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}
1966 | |
1967 | |
/* Constant Formation
 */
/* si_il: splat the sign-extended 16-bit immediate into all words.  */
static __inline qword si_il(signed short imm)
{
  return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
}


/* si_ila: splat the (18-bit on SPU) immediate into all words.  */
static __inline qword si_ila(unsigned int imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
}

/* si_ilh: splat the immediate into all halfwords.  */
static __inline qword si_ilh(signed short imm)
{
  return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
}

/* si_ilhu: splat the immediate into the upper halfword of each word
   (lower halfwords zero).  */
static __inline qword si_ilhu(signed short imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
}

/* si_iohl: OR the immediate into the lower halfword of each word.  */
static __inline qword si_iohl(qword a, unsigned short imm)
{
  return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
}

/* No Operation
 */
#define si_lnop() /* do nothing */
#define si_nop() /* do nothing */
2000 | |
2001 | |
/* Memory Load and Store
 * The immediate/register operands are combined into an effective
 * address and passed to vec_ld/vec_st as (offset, base) pairs.
 */

/* si_lqa: load the quadword at absolute address IMM.  */
static __inline qword si_lqa(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

/* si_lqd: load from A + IMM.  NOTE(review): only the register operand
   is masked to a 16-byte boundary, not the sum — this assumes IMM is
   already a multiple of 16; confirm against callers.  */
static __inline qword si_lqd(qword a, unsigned int imm)
{
  return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
}

/* si_lqr: PC-relative load on SPU; here treated as absolute IMM.  */
static __inline qword si_lqr(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

/* si_lqx: load from A + B (word 0 of each).  */
static __inline qword si_lqx(qword a, qword b)
{
  return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
}

/* si_stqa: store A at absolute address IMM.  */
static __inline void si_stqa(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

/* si_stqd: store A at B + IMM (same alignment caveat as si_lqd).  */
static __inline void si_stqd(qword a, qword b, unsigned int imm)
{
  vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
}

/* si_stqr: PC-relative store on SPU; here absolute IMM.  */
static __inline void si_stqr(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

/* si_stqx: store A at B + C (word 0 of each).  */
static __inline void si_stqx(qword a, qword b, qword c)
{
  vec_st((vec_uchar16)(a),
         si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
         (vector unsigned char *)(0));
}
2045 | |
2046 #endif /* !__SPU__ */ | |
2047 #endif /* !_SI2VMX_H_ */ | |
2048 |