comparison gcc/config/rs6000/xmmintrin.h @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents
children 84e7813d76e9
1 /* Copyright (C) 2002-2017 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
38 However, scalar float operations in vector (XMM) registers require
39 the POWER8 VSX ISA (2.07) level. Also, there are important
40 differences in the data format and placement of float scalars in the
41 vector register. For PowerISA, scalar floats in FPRs (the leftmost
42 64 bits of the low 32 VSRs) are kept in double format, while X86_64
43 SSE uses the rightmost 32 bits of the XMM register. These differences
44 require extra steps on POWER to match the SSE scalar float semantics.
45
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
49
50 Another difference is the format and details of the X86_64 MXCSR vs
51 the PowerISA FPSCR / VSCR registers. We recommend applications
52 replace direct access to the MXCSR with the more portable <fenv.h>
53 POSIX APIs. */
54 #warning "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
55 #endif
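/* As a minimal sketch of the recommendation above (using only intrinsics
   defined in this header), a scalar SSE sequence such as

     __m128 va = _mm_load_ss (&a);
     __m128 vb = _mm_load_ss (&b);
     float r = _mm_cvtss_f32 (_mm_add_ss (va, vb));

   can usually be written as plain C (r = a + b;), letting the compiler
   choose the best scalar or vector code for the target.  Similarly,
   MXCSR rounding-mode manipulation can generally be replaced with the
   standard <fenv.h> interface, e.g. fesetround (FE_TOWARDZERO).  */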
56
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
59
60 #include <altivec.h>
61 #include <assert.h>
62
63 /* We need type definitions from the MMX header file. */
64 #include <mmintrin.h>
65
66 /* Get _mm_malloc () and _mm_free (). */
67 #include <mm_malloc.h>
68
69 /* The Intel API is flexible enough that we must allow aliasing with other
70 vector types, and their scalar components. */
71 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
72
73 /* Internal data types for implementing the intrinsics. */
74 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
75
76 /* Create an undefined vector. */
77 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _mm_undefined_ps (void)
79 {
80 __m128 __Y = __Y;
81 return __Y;
82 }
83
84 /* Create a vector of zeros. */
85 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 _mm_setzero_ps (void)
87 {
88 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
89 }
90
91 /* Load four SPFP values from P. The address must be 16-byte aligned. */
92 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_load_ps (float const *__P)
94 {
95 assert(((unsigned long)__P & 0xfUL) == 0UL);
96 return ((__m128)vec_ld(0, (__v4sf*)__P));
97 }
98
99 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101 _mm_loadu_ps (float const *__P)
102 {
103 return (vec_vsx_ld(0, __P));
104 }
105
106 /* Load four SPFP values in reverse order. The address must be aligned. */
107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
108 _mm_loadr_ps (float const *__P)
109 {
110 __v4sf __tmp;
111 __m128 result;
112 static const __vector unsigned char permute_vector =
113 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
114 0x17, 0x10, 0x11, 0x12, 0x13 };
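  /* Both vec_perm source operands below are the same vector, so indices
     0x10-0x1F select the same bytes as 0x00-0x0F; taken four at a time,
     the control above picks the four SPFP elements in reverse order.  */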
115
116 __tmp = vec_ld (0, (__v4sf *) __P);
117 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
118 return result;
119 }
120
121 /* Create a vector with all four elements equal to F. */
122 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 _mm_set1_ps (float __F)
124 {
125 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
126 }
127
128 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
129 _mm_set_ps1 (float __F)
130 {
131 return _mm_set1_ps (__F);
132 }
133
134 /* Create the vector [Z Y X W]. */
135 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
136 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
137 {
138 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
139 }
140
141 /* Create the vector [W X Y Z]. */
142 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
143 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
144 {
145 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
146 }
147
148 /* Store four SPFP values. The address must be 16-byte aligned. */
149 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
150 _mm_store_ps (float *__P, __m128 __A)
151 {
152 assert(((unsigned long)__P & 0xfUL) == 0UL);
153 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
154 }
155
156 /* Store four SPFP values. The address need not be 16-byte aligned. */
157 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
158 _mm_storeu_ps (float *__P, __m128 __A)
159 {
160 *(__m128 *)__P = __A;
161 }
162
163 /* Store four SPFP values in reverse order. The address must be aligned. */
164 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
165 _mm_storer_ps (float *__P, __m128 __A)
166 {
167 __v4sf __tmp;
168 static const __vector unsigned char permute_vector =
169 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
170 0x17, 0x10, 0x11, 0x12, 0x13 };
171
172 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
173
174 _mm_store_ps (__P, __tmp);
175 }
176
177 /* Store the lower SPFP value across four words. */
178 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 _mm_store1_ps (float *__P, __m128 __A)
180 {
181 __v4sf __va = vec_splat((__v4sf)__A, 0);
182 _mm_store_ps (__P, __va);
183 }
184
185 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 _mm_store_ps1 (float *__P, __m128 __A)
187 {
188 _mm_store1_ps (__P, __A);
189 }
190
191 /* Create a vector with element 0 as F and the rest zero. */
192 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
193 _mm_set_ss (float __F)
194 {
195 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
196 }
197
198 /* Sets the low SPFP value of A from the low value of B. */
199 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
200 _mm_move_ss (__m128 __A, __m128 __B)
201 {
202 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
203
204 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
205 }
206
207 /* Create a vector with element 0 as *P and the rest zero. */
208 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_load_ss (float const *__P)
210 {
211 return _mm_set_ss (*__P);
212 }
213
214 /* Stores the lower SPFP value. */
215 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm_store_ss (float *__P, __m128 __A)
217 {
218 *__P = ((__v4sf)__A)[0];
219 }
220
221 /* Perform the respective operation on the lower SPFP (single-precision
222 floating-point) values of A and B; the upper three SPFP values are
223 passed through from A. */
224
225 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
226 _mm_add_ss (__m128 __A, __m128 __B)
227 {
228 #ifdef _ARCH_PWR7
229 __m128 a, b, c;
230 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
231 /* PowerISA VSX does not allow partial (for just the lower float)
232 results. So to ensure we don't generate spurious exceptions
233 (from the upper float values) we splat the lower float
234 before we do the operation. */
235 a = vec_splat (__A, 0);
236 b = vec_splat (__B, 0);
237 c = a + b;
238 /* Then we merge the lower float result with the original upper
239 float elements from __A. */
240 return (vec_sel (__A, c, mask));
241 #else
242 __A[0] = __A[0] + __B[0];
243 return (__A);
244 #endif
245 }
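/* Example: with __A = { 1.0, 2.0, 3.0, 4.0 } and __B = { 10.0, 20.0, 30.0,
   40.0 }, the splats give { 1.0, 1.0, 1.0, 1.0 } + { 10.0, 10.0, 10.0, 10.0 }
   = { 11.0, 11.0, 11.0, 11.0 }, and the select keeps only element 0 of that
   sum, so the result is { 11.0, 2.0, 3.0, 4.0 }.  */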
246
247 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm_sub_ss (__m128 __A, __m128 __B)
249 {
250 #ifdef _ARCH_PWR7
251 __m128 a, b, c;
252 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
253 /* PowerISA VSX does not allow partial (for just the lower float)
254 results. So to ensure we don't generate spurious exceptions
255 (from the upper float values) we splat the lower float
256 before we do the operation. */
257 a = vec_splat (__A, 0);
258 b = vec_splat (__B, 0);
259 c = a - b;
260 /* Then we merge the lower float result with the original upper
261 float elements from __A. */
262 return (vec_sel (__A, c, mask));
263 #else
264 __A[0] = __A[0] - __B[0];
265 return (__A);
266 #endif
267 }
268
269 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270 _mm_mul_ss (__m128 __A, __m128 __B)
271 {
272 #ifdef _ARCH_PWR7
273 __m128 a, b, c;
274 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
275 /* PowerISA VSX does not allow partial (for just the lower float)
276 results. So to ensure we don't generate spurious exceptions
277 (from the upper float values) we splat the lower float
278 before we do the operation. */
279 a = vec_splat (__A, 0);
280 b = vec_splat (__B, 0);
281 c = a * b;
282 /* Then we merge the lower float result with the original upper
283 float elements from __A. */
284 return (vec_sel (__A, c, mask));
285 #else
286 __A[0] = __A[0] * __B[0];
287 return (__A);
288 #endif
289 }
290
291 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
292 _mm_div_ss (__m128 __A, __m128 __B)
293 {
294 #ifdef _ARCH_PWR7
295 __m128 a, b, c;
296 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
297 /* PowerISA VSX does not allow partial (for just the lower float)
298 results. So to ensure we don't generate spurious exceptions
299 (from the upper float values) we splat the lower float
300 before we do the operation. */
301 a = vec_splat (__A, 0);
302 b = vec_splat (__B, 0);
303 c = a / b;
304 /* Then we merge the lower float result with the original upper
305 float elements from __A. */
306 return (vec_sel (__A, c, mask));
307 #else
308 __A[0] = __A[0] / __B[0];
309 return (__A);
310 #endif
311 }
312
313 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
314 _mm_sqrt_ss (__m128 __A)
315 {
316 __m128 a, c;
317 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
318 /* PowerISA VSX does not allow partial (for just the lower float)
319 * results. So to ensure we don't generate spurious exceptions
320 * (from the upper float values) we splat the lower float
321 * before we do the operation. */
322 a = vec_splat (__A, 0);
323 c = vec_sqrt (a);
324 /* Then we merge the lower float result with the original upper
325 * float elements from __A. */
326 return (vec_sel (__A, c, mask));
327 }
328
329 /* Perform the respective operation on the four SPFP values in A and B. */
330 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
331 _mm_add_ps (__m128 __A, __m128 __B)
332 {
333 return (__m128) ((__v4sf)__A + (__v4sf)__B);
334 }
335
336 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
337 _mm_sub_ps (__m128 __A, __m128 __B)
338 {
339 return (__m128) ((__v4sf)__A - (__v4sf)__B);
340 }
341
342 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_mul_ps (__m128 __A, __m128 __B)
344 {
345 return (__m128) ((__v4sf)__A * (__v4sf)__B);
346 }
347
348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_div_ps (__m128 __A, __m128 __B)
350 {
351 return (__m128) ((__v4sf)__A / (__v4sf)__B);
352 }
353
354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_sqrt_ps (__m128 __A)
356 {
357 return (vec_sqrt ((__v4sf)__A));
358 }
359
360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_rcp_ps (__m128 __A)
362 {
363 return (vec_re ((__v4sf)__A));
364 }
365
366 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_rsqrt_ps (__m128 __A)
368 {
369 return (vec_rsqrte (__A));
370 }
371
372 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_rcp_ss (__m128 __A)
374 {
375 __m128 a, c;
376 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
377 /* PowerISA VSX does not allow partial (for just the lower float)
378 * results. So to ensure we don't generate spurious exceptions
379 * (from the upper float values) we splat the lower float
380 * before we do the operation. */
381 a = vec_splat (__A, 0);
382 c = _mm_rcp_ps (a);
383 /* Then we merge the lower float result with the original upper
384 * float elements from __A. */
385 return (vec_sel (__A, c, mask));
386 }
387
388 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
389 _mm_rsqrt_ss (__m128 __A)
390 {
391 __m128 a, c;
392 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
393 /* PowerISA VSX does not allow partial (for just the lower float)
394 * results. So to ensure we don't generate spurious exceptions
395 * (from the upper float values) we splat the lower float
396 * before we do the operation. */
397 a = vec_splat (__A, 0);
398 c = vec_rsqrte (a);
399 /* Then we merge the lower float result with the original upper
400 * float elements from __A. */
401 return (vec_sel (__A, c, mask));
402 }
403
404 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
405 _mm_min_ss (__m128 __A, __m128 __B)
406 {
407 __v4sf a, b, c;
408 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
409 /* PowerISA VSX does not allow partial (for just lower float)
410 * results. So to ensure we don't generate spurious exceptions
411 * (from the upper float values) we splat the lower float
412 * before we do the operation. */
413 a = vec_splat ((__v4sf)__A, 0);
414 b = vec_splat ((__v4sf)__B, 0);
415 c = vec_min (a, b);
416 /* Then we merge the lower float result with the original upper
417 * float elements from __A. */
418 return (vec_sel ((__v4sf)__A, c, mask));
419 }
420
421 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 _mm_max_ss (__m128 __A, __m128 __B)
423 {
424 __v4sf a, b, c;
425 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
426 /* PowerISA VSX does not allow partial (for just lower float)
427 * results. So to ensure we don't generate spurious exceptions
428 * (from the upper float values) we splat the lower float
429 * before we do the operation. */
430 a = vec_splat (__A, 0);
431 b = vec_splat (__B, 0);
432 c = vec_max (a, b);
433 /* Then we merge the lower float result with the original upper
434 * float elements from __A. */
435 return (vec_sel ((__v4sf)__A, c, mask));
436 }
437
438 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
439 _mm_min_ps (__m128 __A, __m128 __B)
440 {
441 return ((__m128)vec_min ((__v4sf)__A,(__v4sf) __B));
442 }
443
444 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
445 _mm_max_ps (__m128 __A, __m128 __B)
446 {
447 return ((__m128)vec_max ((__v4sf)__A, (__v4sf)__B));
448 }
449
450 /* Perform logical bit-wise operations on 128-bit values. */
451 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452 _mm_and_ps (__m128 __A, __m128 __B)
453 {
454 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
455 // return __builtin_ia32_andps (__A, __B);
456 }
457
458 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459 _mm_andnot_ps (__m128 __A, __m128 __B)
460 {
461 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
462 }
463
464 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_or_ps (__m128 __A, __m128 __B)
466 {
467 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
468 }
469
470 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471 _mm_xor_ps (__m128 __A, __m128 __B)
472 {
473 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
474 }
475
476 /* Perform a comparison on the four SPFP values of A and B. For each
477 element, if the comparison is true, place a mask of all ones in the
478 result, otherwise a mask of zeros. */
479 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_cmpeq_ps (__m128 __A, __m128 __B)
481 {
482 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
483 }
484
485 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_cmplt_ps (__m128 __A, __m128 __B)
487 {
488 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
489 }
490
491 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_cmple_ps (__m128 __A, __m128 __B)
493 {
494 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
495 }
496
497 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498 _mm_cmpgt_ps (__m128 __A, __m128 __B)
499 {
500 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
501 }
502
503 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504 _mm_cmpge_ps (__m128 __A, __m128 __B)
505 {
506 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
507 }
508
509 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510 _mm_cmpneq_ps (__m128 __A, __m128 __B)
511 {
512 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
513 return ((__m128)vec_nor (temp, temp));
514 }
515
516 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
518 {
519 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
520 }
521
522 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523 _mm_cmpnle_ps (__m128 __A, __m128 __B)
524 {
525 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
526 }
527
528 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpngt_ps (__m128 __A, __m128 __B)
530 {
531 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
532 }
533
534 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535 _mm_cmpnge_ps (__m128 __A, __m128 __B)
536 {
537 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
538 }
539
540 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541 _mm_cmpord_ps (__m128 __A, __m128 __B)
542 {
543 __vector unsigned int a, b;
544 __vector unsigned int c, d;
545 static const __vector unsigned int float_exp_mask =
546 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
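  /* 0x7f800000 is the single-precision all-ones exponent (the +Inf bit
     pattern); the unsigned compares below treat an operand as ordered
     only when its absolute-value bits stay below that pattern.  */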
547
548 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
549 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
550 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
551 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
552 return ((__m128 ) vec_and (c, d));
553 }
554
555 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_cmpunord_ps (__m128 __A, __m128 __B)
557 {
558 __vector unsigned int a, b;
559 __vector unsigned int c, d;
560 static const __vector unsigned int float_exp_mask =
561 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
562
563 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
564 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
565 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
566 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
567 return ((__m128 ) vec_or (c, d));
568 }
569
570 /* Perform a comparison on the lower SPFP values of A and B. If the
571 comparison is true, place a mask of all ones in the result, otherwise a
572 mask of zeros. The upper three SPFP values are passed through from A. */
573 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574 _mm_cmpeq_ss (__m128 __A, __m128 __B)
575 {
576 static const __vector unsigned int mask =
577 { 0xffffffff, 0, 0, 0 };
578 __v4sf a, b, c;
579 /* PowerISA VMX does not allow partial (for just element 0)
580 * results. So to ensure we don't generate spurious exceptions
581 * (from the upper elements) we splat the lower float
582 * before we do the operation. */
583 a = vec_splat ((__v4sf) __A, 0);
584 b = vec_splat ((__v4sf) __B, 0);
585 c = (__v4sf) vec_cmpeq(a, b);
586 /* Then we merge the lower float result with the original upper
587 * float elements from __A. */
588 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
589 }
590
591 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 _mm_cmplt_ss (__m128 __A, __m128 __B)
593 {
594 static const __vector unsigned int mask =
595 { 0xffffffff, 0, 0, 0 };
596 __v4sf a, b, c;
597 /* PowerISA VMX does not allow partial (for just element 0)
598 * results. So to ensure we don't generate spurious exceptions
599 * (from the upper elements) we splat the lower float
600 * before we do the operation. */
601 a = vec_splat ((__v4sf) __A, 0);
602 b = vec_splat ((__v4sf) __B, 0);
603 c = (__v4sf) vec_cmplt(a, b);
604 /* Then we merge the lower float result with the original upper
605 * float elements from __A. */
606 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
607 }
608
609 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610 _mm_cmple_ss (__m128 __A, __m128 __B)
611 {
612 static const __vector unsigned int mask =
613 { 0xffffffff, 0, 0, 0 };
614 __v4sf a, b, c;
615 /* PowerISA VMX does not allow partial (for just element 0)
616 * results. So to ensure we don't generate spurious exceptions
617 * (from the upper elements) we splat the lower float
618 * before we do the operation. */
619 a = vec_splat ((__v4sf) __A, 0);
620 b = vec_splat ((__v4sf) __B, 0);
621 c = (__v4sf) vec_cmple(a, b);
622 /* Then we merge the lower float result with the original upper
623 * float elements from __A. */
624 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
625 }
626
627 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628 _mm_cmpgt_ss (__m128 __A, __m128 __B)
629 {
630 static const __vector unsigned int mask =
631 { 0xffffffff, 0, 0, 0 };
632 __v4sf a, b, c;
633 /* PowerISA VMX does not allow partial (for just element 0)
634 * results. So to ensure we don't generate spurious exceptions
635 * (from the upper elements) we splat the lower float
636 * before we do the operation. */
637 a = vec_splat ((__v4sf) __A, 0);
638 b = vec_splat ((__v4sf) __B, 0);
639 c = (__v4sf) vec_cmpgt(a, b);
640 /* Then we merge the lower float result with the original upper
641 * float elements from __A. */
642 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
643 }
644
645 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646 _mm_cmpge_ss (__m128 __A, __m128 __B)
647 {
648 static const __vector unsigned int mask =
649 { 0xffffffff, 0, 0, 0 };
650 __v4sf a, b, c;
651 /* PowerISA VMX does not allow partial (for just element 0)
652 * results. So to ensure we don't generate spurious exceptions
653 * (from the upper elements) we splat the lower float
654 * before we do the operation. */
655 a = vec_splat ((__v4sf) __A, 0);
656 b = vec_splat ((__v4sf) __B, 0);
657 c = (__v4sf) vec_cmpge(a, b);
658 /* Then we merge the lower float result with the original upper
659 * float elements from __A. */
660 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
661 }
662
663 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664 _mm_cmpneq_ss (__m128 __A, __m128 __B)
665 {
666 static const __vector unsigned int mask =
667 { 0xffffffff, 0, 0, 0 };
668 __v4sf a, b, c;
669 /* PowerISA VMX does not allow partial (for just element 0)
670 * results. So to ensure we don't generate spurious exceptions
671 * (from the upper elements) we splat the lower float
672 * before we do the operation. */
673 a = vec_splat ((__v4sf) __A, 0);
674 b = vec_splat ((__v4sf) __B, 0);
675 c = (__v4sf) vec_cmpeq(a, b);
676 c = vec_nor (c, c);
677 /* Then we merge the lower float result with the original upper
678 * float elements from __A. */
679 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
680 }
681
682 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
683 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
684 {
685 static const __vector unsigned int mask =
686 { 0xffffffff, 0, 0, 0 };
687 __v4sf a, b, c;
688 /* PowerISA VMX does not allow partial (for just element 0)
689 * results. So to ensure we don't generate spurious exceptions
690 * (from the upper elements) we splat the lower float
691 * before we do the operation. */
692 a = vec_splat ((__v4sf) __A, 0);
693 b = vec_splat ((__v4sf) __B, 0);
694 c = (__v4sf) vec_cmpge(a, b);
695 /* Then we merge the lower float result with the original upper
696 * float elements from __A. */
697 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
698 }
699
700 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_cmpnle_ss (__m128 __A, __m128 __B)
702 {
703 static const __vector unsigned int mask =
704 { 0xffffffff, 0, 0, 0 };
705 __v4sf a, b, c;
706 /* PowerISA VMX does not allow partial (for just element 0)
707 * results. So to ensure we don't generate spurious exceptions
708 * (from the upper elements) we splat the lower float
709 * before we do the operation. */
710 a = vec_splat ((__v4sf) __A, 0);
711 b = vec_splat ((__v4sf) __B, 0);
712 c = (__v4sf) vec_cmpgt(a, b);
713 /* Then we merge the lower float result with the original upper
714 * float elements from __A. */
715 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
716 }
717
718 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_cmpngt_ss (__m128 __A, __m128 __B)
720 {
721 static const __vector unsigned int mask =
722 { 0xffffffff, 0, 0, 0 };
723 __v4sf a, b, c;
724 /* PowerISA VMX does not allow partial (for just element 0)
725 * results. So to ensure we don't generate spurious exceptions
726 * (from the upper elements) we splat the lower float
727 * before we do the operation. */
728 a = vec_splat ((__v4sf) __A, 0);
729 b = vec_splat ((__v4sf) __B, 0);
730 c = (__v4sf) vec_cmple(a, b);
731 /* Then we merge the lower float result with the original upper
732 * float elements from __A. */
733 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
734 }
735
736 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_cmpnge_ss (__m128 __A, __m128 __B)
738 {
739 static const __vector unsigned int mask =
740 { 0xffffffff, 0, 0, 0 };
741 __v4sf a, b, c;
742 /* PowerISA VMX does not allow partial (for just element 0)
743 * results. So to ensure we don't generate spurious exceptions
744 * (from the upper elements) we splat the lower float
745 * before we do the operation. */
746 a = vec_splat ((__v4sf) __A, 0);
747 b = vec_splat ((__v4sf) __B, 0);
748 c = (__v4sf) vec_cmplt(a, b);
749 /* Then we merge the lower float result with the original upper
750 * float elements from __A. */
751 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
752 }
753
754 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755 _mm_cmpord_ss (__m128 __A, __m128 __B)
756 {
757 __vector unsigned int a, b;
758 __vector unsigned int c, d;
759 static const __vector unsigned int float_exp_mask =
760 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
761 static const __vector unsigned int mask =
762 { 0xffffffff, 0, 0, 0 };
763
764 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
765 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
766 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
767 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
768 c = vec_and (c, d);
769 /* Then we merge the lower float result with the original upper
770 * float elements from __A. */
771 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
772 }
773
774 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
775 _mm_cmpunord_ss (__m128 __A, __m128 __B)
776 {
777 __vector unsigned int a, b;
778 __vector unsigned int c, d;
779 static const __vector unsigned int float_exp_mask =
780 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
781 static const __vector unsigned int mask =
782 { 0xffffffff, 0, 0, 0 };
783
784 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
785 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
786 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
787 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
788 c = vec_or (c, d);
789 /* Then we merge the lower float result with the original upper
790 * float elements from __A. */
791 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
792 }
793
794 /* Compare the lower SPFP values of A and B and return 1 if true
795 and 0 if false. */
796 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_comieq_ss (__m128 __A, __m128 __B)
798 {
799 return (__A[0] == __B[0]);
800 }
801
802 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_comilt_ss (__m128 __A, __m128 __B)
804 {
805 return (__A[0] < __B[0]);
806 }
807
808 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809 _mm_comile_ss (__m128 __A, __m128 __B)
810 {
811 return (__A[0] <= __B[0]);
812 }
813
814 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815 _mm_comigt_ss (__m128 __A, __m128 __B)
816 {
817 return (__A[0] > __B[0]);
818 }
819
820 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821 _mm_comige_ss (__m128 __A, __m128 __B)
822 {
823 return (__A[0] >= __B[0]);
824 }
825
826 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
827 _mm_comineq_ss (__m128 __A, __m128 __B)
828 {
829 return (__A[0] != __B[0]);
830 }
831
832 /* FIXME
833 * The _mm_ucomi??_ss implementations below are exactly the same as
834 * the _mm_comi??_ss ones because GCC for PowerPC only generates
835 * unordered compares (scalar and vector).
836 * Technically _mm_comieq_ss et al. should be using the ordered
837 * compare and signal for QNaNs.
838 * The _mm_ucomieq_ss et al. should be OK as is.
839 */
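/* For example, with a quiet NaN operand:

     __m128 q = _mm_set_ss (__builtin_nanf (""));
     _mm_comieq_ss (q, _mm_set_ss (1.0f));    ordered: should raise invalid
     _mm_ucomieq_ss (q, _mm_set_ss (1.0f));   unordered: should stay quiet

   With the implementations below, both calls reduce to the same quiet
   (unordered) compare.  */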
840 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 _mm_ucomieq_ss (__m128 __A, __m128 __B)
842 {
843 return (__A[0] == __B[0]);
844 }
845
846 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847 _mm_ucomilt_ss (__m128 __A, __m128 __B)
848 {
849 return (__A[0] < __B[0]);
850 }
851
852 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853 _mm_ucomile_ss (__m128 __A, __m128 __B)
854 {
855 return (__A[0] <= __B[0]);
856 }
857
858 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859 _mm_ucomigt_ss (__m128 __A, __m128 __B)
860 {
861 return (__A[0] > __B[0]);
862 }
863
864 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _mm_ucomige_ss (__m128 __A, __m128 __B)
866 {
867 return (__A[0] >= __B[0]);
868 }
869
870 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_ucomineq_ss (__m128 __A, __m128 __B)
872 {
873 return (__A[0] != __B[0]);
874 }
875
876 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
877 _mm_cvtss_f32 (__m128 __A)
878 {
879 return ((__v4sf)__A)[0];
880 }
881
882 /* Convert the lower SPFP value to a 32-bit integer according to the current
883 rounding mode. */
884 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
885 _mm_cvtss_si32 (__m128 __A)
886 {
887 __m64 res = 0;
888 #ifdef _ARCH_PWR8
889 __m128 vtmp;
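  /* A sketch of the asm below: xxsldwi rotates the vector so element [0]
     reaches the word read by the scalar unit, xscvspdp widens that single
     to double, fctiw converts it to a 32-bit integer using the current
     rounding mode, and mfvsrd moves the result into a GPR.  */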
890 __asm__(
891 "xxsldwi %x1,%x2,%x2,3;\n"
892 "xscvspdp %x1,%x1;\n"
893 "fctiw %1,%1;\n"
894 "mfvsrd %0,%x1;\n"
895 : "=r" (res),
896 "=&wi" (vtmp)
897 : "wa" (__A)
898 : );
899 #else
900 res = __builtin_rint(__A[0]);
901 #endif
902 return (res);
903 }
904
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvt_ss2si (__m128 __A)
907 {
908 return _mm_cvtss_si32 (__A);
909 }
910
911 /* Convert the lower SPFP value to a 64-bit integer according to the
912 current rounding mode. */
913
914 /* Intel intrinsic. */
915 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916 _mm_cvtss_si64 (__m128 __A)
917 {
918 __m64 res = 0;
919 #ifdef _ARCH_PWR8
920 __m128 vtmp;
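  /* Same sequence as _mm_cvtss_si32 above, except that fctid produces a
     64-bit integer (again honoring the current rounding mode).  */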
921 __asm__(
922 "xxsldwi %x1,%x2,%x2,3;\n"
923 "xscvspdp %x1,%x1;\n"
924 "fctid %1,%1;\n"
925 "mfvsrd %0,%x1;\n"
926 : "=r" (res),
927 "=&wi" (vtmp)
928 : "wa" (__A)
929 : );
930 #else
931 res = __builtin_llrint(__A[0]);
932 #endif
933 return (res);
934 }
935
936 /* Microsoft intrinsic. */
937 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
938 _mm_cvtss_si64x (__m128 __A)
939 {
940 return _mm_cvtss_si64 ((__v4sf) __A);
941 }
942
943 /* Constants for use with _mm_prefetch. */
944 enum _mm_hint
945 {
946 /* _MM_HINT_ET is _MM_HINT_T with the third bit set. */
947 _MM_HINT_ET0 = 7,
948 _MM_HINT_ET1 = 6,
949 _MM_HINT_T0 = 3,
950 _MM_HINT_T1 = 2,
951 _MM_HINT_T2 = 1,
952 _MM_HINT_NTA = 0
953 };
954
955 /* Loads one cache line from address P to a location "closer" to the
956 processor. The selector I specifies the type of prefetch operation. */
957 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_prefetch (const void *__P, enum _mm_hint __I)
959 {
960 /* Current PowerPC implementations ignore the hint parameter. */
961 __builtin_prefetch (__P);
962 }
963
964 /* Convert the two lower SPFP values to 32-bit integers according to the
965 current rounding mode. Return the integers in packed form. */
966 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
967 _mm_cvtps_pi32 (__m128 __A)
968 {
970 __v4sf temp, rounded;
971 __vector __m64 result;
972
973 /* Splat two lower SPFP values to both halves. */
974 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
975 rounded = vec_rint(temp);
976 result = (__vector __m64) vec_cts (rounded, 0);
977
978 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
979 }
980
981 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
982 _mm_cvt_ps2pi (__m128 __A)
983 {
984 return _mm_cvtps_pi32 (__A);
985 }
986
987 /* Truncate the lower SPFP value to a 32-bit integer. */
988 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
989 _mm_cvttss_si32 (__m128 __A)
990 {
991 /* Extract the lower float element. */
992 float temp = __A[0];
993 /* truncate to 32-bit integer and return. */
994 return temp;
995 }
996
997 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998 _mm_cvtt_ss2si (__m128 __A)
999 {
1000 return _mm_cvttss_si32 (__A);
1001 }
1002
1003 /* Intel intrinsic. */
1004 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005 _mm_cvttss_si64 (__m128 __A)
1006 {
1007 /* Extract the lower float element. */
1008 float temp = __A[0];
1009 /* truncate to 64-bit integer and return. */
1010 return temp;
1011 }
1012
1013 /* Microsoft intrinsic. */
1014 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_cvttss_si64x (__m128 __A)
1016 {
1017 /* Extract the lower float element. */
1018 float temp = __A[0];
1019 /* truncate to 64-bit integer and return. */
1020 return temp;
1021 }
1022
1023 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1024 integers in packed form. */
1025 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvttps_pi32 (__m128 __A)
1027 {
1028 __v4sf temp;
1029 __vector __m64 result;
1030
1031 /* Splat two lower SPFP values to both halves. */
1032 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1033 result = (__vector __m64) vec_cts (temp, 0);
1034
1035 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1036 }
1037
1038 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1039 _mm_cvtt_ps2pi (__m128 __A)
1040 {
1041 return _mm_cvttps_pi32 (__A);
1042 }
1043
1044 /* Convert B to a SPFP value and insert it as element zero in A. */
1045 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046 _mm_cvtsi32_ss (__m128 __A, int __B)
1047 {
1048 float temp = __B;
1049 __A[0] = temp;
1050
1051 return __A;
1052 }
1053
1054 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1055 _mm_cvt_si2ss (__m128 __A, int __B)
1056 {
1057 return _mm_cvtsi32_ss (__A, __B);
1058 }
1059
1060 /* Convert B to a SPFP value and insert it as element zero in A. */
1061 /* Intel intrinsic. */
1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1063 _mm_cvtsi64_ss (__m128 __A, long long __B)
1064 {
1065 float temp = __B;
1066 __A[0] = temp;
1067
1068 return __A;
1069 }
1070
1071 /* Microsoft intrinsic. */
1072 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1073 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1074 {
1075 return _mm_cvtsi64_ss (__A, __B);
1076 }
1077
1078 /* Convert the two 32-bit values in B to SPFP form and insert them
1079 as the two lower elements in A. */
1080 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1082 {
1083 __vector signed int vm1;
1084 __vector float vf1;
1085
1086 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
1087 vf1 = (__vector float) vec_ctf (vm1, 0);
1088
1089 return ((__m128) (__vector __m64)
1090 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
1091 }
1092
1093 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1094 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1095 {
1096 return _mm_cvtpi32_ps (__A, __B);
1097 }
1098
1099 /* Convert the four signed 16-bit values in A to SPFP form. */
1100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1101 _mm_cvtpi16_ps (__m64 __A)
1102 {
1103 __vector signed short vs8;
1104 __vector signed int vi4;
1105 __vector float vf1;
1106
1107 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
1108 vi4 = vec_vupklsh (vs8);
1109 vf1 = (__vector float) vec_ctf (vi4, 0);
1110
1111 return (__m128) vf1;
1112 }
1113
1114 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1115 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1116 _mm_cvtpu16_ps (__m64 __A)
1117 {
1118 const __vector unsigned short zero =
1119 { 0, 0, 0, 0, 0, 0, 0, 0 };
1120 __vector unsigned short vs8;
1121 __vector unsigned int vi4;
1122 __vector float vf1;
1123
1124 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
1125 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1126 vf1 = (__vector float) vec_ctf (vi4, 0);
1127
1128 return (__m128) vf1;
1129 }
1130
1131 /* Convert the low four signed 8-bit values in A to SPFP form. */
1132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm_cvtpi8_ps (__m64 __A)
1134 {
1135 __vector signed char vc16;
1136 __vector signed short vs8;
1137 __vector signed int vi4;
1138 __vector float vf1;
1139
1140 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
1141 vs8 = vec_vupkhsb (vc16);
1142 vi4 = vec_vupkhsh (vs8);
1143 vf1 = (__vector float) vec_ctf (vi4, 0);
1144
1145 return (__m128) vf1;
1146 }
1147
1148 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1149 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1150
1151 _mm_cvtpu8_ps (__m64 __A)
1152 {
1153 const __vector unsigned char zero =
1154 { 0, 0, 0, 0, 0, 0, 0, 0 };
1155 __vector unsigned char vc16;
1156 __vector unsigned short vs8;
1157 __vector unsigned int vi4;
1158 __vector float vf1;
1159
1160 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
1161 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1162 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1163 (__vector unsigned short) zero);
1164 vf1 = (__vector float) vec_ctf (vi4, 0);
1165
1166 return (__m128) vf1;
1167 }
1168
1169 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1170 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1171 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
1172 {
1173 __vector signed int vi4;
1174 __vector float vf4;
1175
1176 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
1177 vf4 = (__vector float) vec_ctf (vi4, 0);
1178 return (__m128) vf4;
1179 }
1180
1181 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1182 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1183 _mm_cvtps_pi16(__m128 __A)
1184 {
1185 __v4sf rounded;
1186 __vector signed int temp;
1187 __vector __m64 result;
1188
1189 rounded = vec_rint(__A);
1190 temp = vec_cts (rounded, 0);
1191 result = (__vector __m64) vec_pack (temp, temp);
1192
1193 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1194 }
1195
1196 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1197 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1198 _mm_cvtps_pi8(__m128 __A)
1199 {
1200 __v4sf rounded;
1201 __vector signed int tmp_i;
1202 static const __vector signed int zero = {0, 0, 0, 0};
1203 __vector signed short tmp_s;
1204 __vector signed char res_v;
1205 __m64 result;
1206
1207 rounded = vec_rint(__A);
1208 tmp_i = vec_cts (rounded, 0);
1209 tmp_s = vec_pack (tmp_i, zero);
1210 res_v = vec_pack (tmp_s, tmp_s);
1211 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
1212
1213 return (result);
1214 }
1215
1216 /* Selects four specific SPFP values from A and B based on MASK. */
1217 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1218
1219 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1220 {
1221 unsigned long element_selector_10 = __mask & 0x03;
1222 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1223 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1224 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1225 static const unsigned int permute_selectors[4] =
1226 {
1227 #ifdef __LITTLE_ENDIAN__
1228 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1229 #elif __BIG_ENDIAN__
1230 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1231 #endif
1232 };
1233 __vector unsigned int t;
1234
1235 #ifdef __LITTLE_ENDIAN__
1236 t[0] = permute_selectors[element_selector_10];
1237 t[1] = permute_selectors[element_selector_32];
1238 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1239 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1240 #elif __BIG_ENDIAN__
1241 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1242 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1243 t[1] = permute_selectors[element_selector_54];
1244 t[0] = permute_selectors[element_selector_76];
1245 #endif
1246 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1247 }
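/* Example: result[0] = __A[__mask & 3], result[1] = __A[(__mask >> 2) & 3],
   result[2] = __B[(__mask >> 4) & 3], result[3] = __B[(__mask >> 6) & 3];
   so _mm_shuffle_ps (a, b, 0x1B) yields { a[3], a[2], b[1], b[0] }.  */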
1248
1249 /* Selects and interleaves the upper two SPFP values from A and B. */
1250 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1252 {
1253 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1254 }
1255
1256 /* Selects and interleaves the lower two SPFP values from A and B. */
1257 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1258 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1259 {
1260 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1261 }
1262
1263 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1264 the lower two values are passed through from A. */
1265 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1266 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1267 {
1268 __vector __m64 __a = (__vector __m64)__A;
1269 __vector __m64 __p = vec_splats(*__P);
1270 __a [1] = __p [1];
1271
1272 return (__m128)__a;
1273 }
1274
1275 /* Stores the upper two SPFP values of A into P. */
1276 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1277 _mm_storeh_pi (__m64 *__P, __m128 __A)
1278 {
1279 __vector __m64 __a = (__vector __m64) __A;
1280
1281 *__P = __a[1];
1282 }
1283
1284 /* Moves the upper two values of B into the lower two values of A. */
1285 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286 _mm_movehl_ps (__m128 __A, __m128 __B)
1287 {
1288 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
1289 }
1290
1291 /* Moves the lower two values of B into the upper two values of A. */
1292 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_movelh_ps (__m128 __A, __m128 __B)
1294 {
1295 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
1296 }
1297
1298 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1299 the upper two values are passed through from A. */
1300 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1301 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1302 {
1303 __vector __m64 __a = (__vector __m64)__A;
1304 __vector __m64 __p = vec_splats(*__P);
1305 __a [0] = __p [0];
1306
1307 return (__m128)__a;
1308 }
1309
1310 /* Stores the lower two SPFP values of A into P. */
1311 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_storel_pi (__m64 *__P, __m128 __A)
1313 {
1314 __vector __m64 __a = (__vector __m64) __A;
1315
1316 *__P = __a[0];
1317 }
1318
1319 #ifdef _ARCH_PWR8
1320 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1321
1322 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1323 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1324 _mm_movemask_ps (__m128 __A)
1325 {
1326 __vector __m64 result;
1327 static const __vector unsigned int perm_mask =
1328 {
1329 #ifdef __LITTLE_ENDIAN__
1330 0x00204060, 0x80808080, 0x80808080, 0x80808080
1331 #elif __BIG_ENDIAN__
1332 0x80808080, 0x80808080, 0x80808080, 0x00204060
1333 #endif
1334 };
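  /* vec_vbpermq treats each byte of the control as a bit index into the
     128-bit source; indices 0x00, 0x20, 0x40 and 0x60 select the sign bit
     of each 32-bit element, while the 0x80 bytes produce 0 bits.  */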
1335
1336 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1337 (__vector unsigned char) perm_mask);
1338
1339 #ifdef __LITTLE_ENDIAN__
1340 return result[1];
1341 #elif __BIG_ENDIAN__
1342 return result[0];
1343 #endif
1344 }
1345 #endif /* _ARCH_PWR8 */
1346
1347 /* Create a vector with all four elements equal to *P. */
1348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 _mm_load1_ps (float const *__P)
1350 {
1351 return _mm_set1_ps (*__P);
1352 }
1353
1354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm_load_ps1 (float const *__P)
1356 {
1357 return _mm_load1_ps (__P);
1358 }
1359
1360 /* Extracts one of the four words of A. The selector N must be immediate. */
1361 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362 _mm_extract_pi16 (__m64 const __A, int const __N)
1363 {
1364 const int shiftr = (__N & 3) * 16;
1365
1366 return ((__A >> shiftr) & 0xffff);
1367 }
1368
1369 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _m_pextrw (__m64 const __A, int const __N)
1371 {
1372 return _mm_extract_pi16 (__A, __N);
1373 }
1374
1375 /* Inserts word D into one of four words of A. The selector N must be
1376 immediate. */
1377 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1378 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1379 {
1380 const int shiftl = (__N & 3) * 16;
1381 const __m64 shiftD = (const __m64) __D << shiftl;
1382 const __m64 mask = 0xffffUL << shiftl;
1383 __m64 result = (__A & (~mask)) | (shiftD & mask);
1384
1385 return (result);
1386 }
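/* Example: _mm_insert_pi16 (a, 0x1234, 2) shifts both the 0xffff mask and
   the new value left by 32 bits, so bits 32-47 of A are replaced with
   0x1234 while the other three 16-bit fields pass through unchanged.  */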
1387
1388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1389 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1390 {
1391 return _mm_insert_pi16 (__A, __D, __N);
1392 }
1393
1394 /* Compute the element-wise maximum of signed 16-bit values. */
1395 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1396
1397 _mm_max_pi16 (__m64 __A, __m64 __B)
1398 {
1399 #if _ARCH_PWR8
1400 __vector signed short a, b, r;
1401 __vector bool short c;
1402
1403 a = (__vector signed short)vec_splats (__A);
1404 b = (__vector signed short)vec_splats (__B);
1405 c = (__vector bool short)vec_cmpgt (a, b);
1406 r = vec_sel (b, a, c);
1407 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1408 #else
1409 __m64_union m1, m2, res;
1410
1411 m1.as_m64 = __A;
1412 m2.as_m64 = __B;
1413
1414 res.as_short[0] =
1415 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1416 res.as_short[1] =
1417 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1418 res.as_short[2] =
1419 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1420 res.as_short[3] =
1421 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1422
1423 return (__m64) res.as_m64;
1424 #endif
1425 }
1426
1427 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1428 _m_pmaxsw (__m64 __A, __m64 __B)
1429 {
1430 return _mm_max_pi16 (__A, __B);
1431 }
1432
1433 /* Compute the element-wise maximum of unsigned 8-bit values. */
1434 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1435 _mm_max_pu8 (__m64 __A, __m64 __B)
1436 {
1437 #if _ARCH_PWR8
1438 __vector unsigned char a, b, r;
1439 __vector bool char c;
1440
1441 a = (__vector unsigned char)vec_splats (__A);
1442 b = (__vector unsigned char)vec_splats (__B);
1443 c = (__vector bool char)vec_cmpgt (a, b);
1444 r = vec_sel (b, a, c);
1445 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1446 #else
1447 __m64_union m1, m2, res;
1448 long i;
1449
1450 m1.as_m64 = __A;
1451 m2.as_m64 = __B;
1452
1453
1454 for (i = 0; i < 8; i++)
1455 res.as_char[i] =
1456 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1457 m1.as_char[i] : m2.as_char[i];
1458
1459 return (__m64) res.as_m64;
1460 #endif
1461 }
1462
1463 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1464 _m_pmaxub (__m64 __A, __m64 __B)
1465 {
1466 return _mm_max_pu8 (__A, __B);
1467 }
1468
1469 /* Compute the element-wise minimum of signed 16-bit values. */
1470 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1471 _mm_min_pi16 (__m64 __A, __m64 __B)
1472 {
1473 #if _ARCH_PWR8
1474 __vector signed short a, b, r;
1475 __vector bool short c;
1476
1477 a = (__vector signed short)vec_splats (__A);
1478 b = (__vector signed short)vec_splats (__B);
1479 c = (__vector bool short)vec_cmplt (a, b);
1480 r = vec_sel (b, a, c);
1481 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1482 #else
1483 __m64_union m1, m2, res;
1484
1485 m1.as_m64 = __A;
1486 m2.as_m64 = __B;
1487
1488 res.as_short[0] =
1489 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1490 res.as_short[1] =
1491 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1492 res.as_short[2] =
1493 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1494 res.as_short[3] =
1495 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1496
1497 return (__m64) res.as_m64;
1498 #endif
1499 }
1500
1501 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502 _m_pminsw (__m64 __A, __m64 __B)
1503 {
1504 return _mm_min_pi16 (__A, __B);
1505 }
1506
1507 /* Compute the element-wise minimum of unsigned 8-bit values. */
1508 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1509 _mm_min_pu8 (__m64 __A, __m64 __B)
1510 {
1511 #if _ARCH_PWR8
1512 __vector unsigned char a, b, r;
1513 __vector bool char c;
1514
1515 a = (__vector unsigned char)vec_splats (__A);
1516 b = (__vector unsigned char)vec_splats (__B);
1517 c = (__vector bool char)vec_cmplt (a, b);
1518 r = vec_sel (b, a, c);
1519 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1520 #else
1521 __m64_union m1, m2, res;
1522 long i;
1523
1524 m1.as_m64 = __A;
1525 m2.as_m64 = __B;
1526
1527
1528 for (i = 0; i < 8; i++)
1529 res.as_char[i] =
1530 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1531 m1.as_char[i] : m2.as_char[i];
1532
1533 return (__m64) res.as_m64;
1534 #endif
1535 }
1536
1537 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1538 _m_pminub (__m64 __A, __m64 __B)
1539 {
1540 return _mm_min_pu8 (__A, __B);
1541 }
1542
1543 /* Create an 8-bit mask of the signs of 8-bit values. */
1544 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1545 _mm_movemask_pi8 (__m64 __A)
1546 {
1547 unsigned long p = 0x0008101820283038UL; // permute control for sign bits
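  /* Each byte of p is a bpermd bit index (bit 0 being the most significant
     bit); indices 0, 8, 16, ... 56 pick the sign bit of every byte of __A,
     gathering the eight sign bits into the low byte of the result.  */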
1548
1549 return __builtin_bpermd (p, __A);
1550 }
1551
1552 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1553 _m_pmovmskb (__m64 __A)
1554 {
1555 return _mm_movemask_pi8 (__A);
1556 }
1557
1558 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1559 in B and produce the high 16 bits of the 32-bit results. */
1560 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1561 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1562 {
1563 __vector unsigned short a, b;
1564 __vector unsigned short c;
1565 __vector unsigned int w0, w1;
1566 __vector unsigned char xform1 = {
1567 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1568 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1569 };
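  /* vec_vmuleuh and vec_vmulouh produce the full 32-bit products of the
     even and odd halfword pairs; the permute control above then keeps only
     the high 16 bits of each product, back in the original element order.  */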
1570
1571 a = (__vector unsigned short)vec_splats (__A);
1572 b = (__vector unsigned short)vec_splats (__B);
1573
1574 w0 = vec_vmuleuh (a, b);
1575 w1 = vec_vmulouh (a, b);
1576 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1577
1578 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1579 }
1580
1581 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582 _m_pmulhuw (__m64 __A, __m64 __B)
1583 {
1584 return _mm_mulhi_pu16 (__A, __B);
1585 }
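
/* Usage sketch (not part of the original header): each 16-bit product is
   formed in 32 bits and only the high half is kept.  With illustrative
   values
     __m64 a = _mm_set_pi16 (0xFFFF, 0x8000, 0x0002, 0x0001);
     __m64 b = _mm_set_pi16 (0xFFFF, 0x0004, 0x0003, 0x0001);
     __m64 r = _mm_mulhi_pu16 (a, b);
   the 32-bit products are 0xFFFE0001, 0x00020000, 0x00000006 and
   0x00000001, so r holds 0xFFFE, 0x0002, 0x0000, 0x0000 in the same
   element order.  */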
1586
1587 /* Return a combination of the four 16-bit values in A. The selector
1588 must be an immediate. */
1589 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1590 _mm_shuffle_pi16 (__m64 __A, int const __N)
1591 {
1592 unsigned long element_selector_10 = __N & 0x03;
1593 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1594 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1595 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1596 static const unsigned short permute_selectors[4] =
1597 {
1598 #ifdef __LITTLE_ENDIAN__
1599 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1600 #elif defined (__BIG_ENDIAN__)
1601 0x0607, 0x0405, 0x0203, 0x0001
1602 #endif
1603 };
1604 __m64_union t;
1605 __vector __m64 a, p, r;
1606
1607 #ifdef __LITTLE_ENDIAN__
1608 t.as_short[0] = permute_selectors[element_selector_10];
1609 t.as_short[1] = permute_selectors[element_selector_32];
1610 t.as_short[2] = permute_selectors[element_selector_54];
1611 t.as_short[3] = permute_selectors[element_selector_76];
1612 #elif defined (__BIG_ENDIAN__)
1613 t.as_short[3] = permute_selectors[element_selector_10];
1614 t.as_short[2] = permute_selectors[element_selector_32];
1615 t.as_short[1] = permute_selectors[element_selector_54];
1616 t.as_short[0] = permute_selectors[element_selector_76];
1617 #endif
1618 p = vec_splats (t.as_m64);
1619 a = vec_splats (__A);
1620 r = vec_perm (a, a, (__vector unsigned char)p);
1621 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1622 }
1623
1624 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1625 _m_pshufw (__m64 __A, int const __N)
1626 {
1627 return _mm_shuffle_pi16 (__A, __N);
1628 }
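
/* Usage sketch (not part of the original header): the selector is normally
   built with the _MM_SHUFFLE macro, which packs four 2-bit element
   indices.  For example, reversing the four 16-bit elements of a:
     __m64 r = _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));
   The permute_selectors table above translates each 2-bit index into the
   vec_perm byte pair that picks that element out of the doubleword
   holding __A.  */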
1629
1630 /* Conditionally store byte elements of A into P. The high bit of each
1631 byte in the selector N determines whether the corresponding byte from
1632 A is stored. */
1633 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1634 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1635 {
1636 __m64 hibit = 0x8080808080808080UL;
1637 __m64 mask, tmp;
1638 __m64 *p = (__m64*)__P;
1639
1640 tmp = *p;
1641 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1642 tmp = (tmp & (~mask)) | (__A & mask);
1643 *p = tmp;
1644 }
1645
1646 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1647 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1648 {
1649 _mm_maskmove_si64 (__A, __N, __P);
1650 }
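
/* Usage sketch (not part of the original header): only bytes of A whose
   corresponding mask byte has its high bit set are written; the other
   bytes at P are left untouched.  With illustrative values:
     char buf[8] __attribute__ ((__aligned__ (8))) = { 0 };
     __m64 data = 0x1122334455667788UL;
     __m64 mask = 0x80000000000000FFUL;
     _mm_maskmove_si64 (data, mask, buf);
   Only the lowest and highest bytes of buf are updated (0x88 and 0x11 on
   a little-endian layout); the middle six bytes remain zero.  */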
1651
1652 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1653 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1654 _mm_avg_pu8 (__m64 __A, __m64 __B)
1655 {
1656 __vector unsigned char a, b, c;
1657
1658 a = (__vector unsigned char)vec_splats (__A);
1659 b = (__vector unsigned char)vec_splats (__B);
1660 c = vec_avg (a, b);
1661 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1662 }
1663
1664 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1665 _m_pavgb (__m64 __A, __m64 __B)
1666 {
1667 return _mm_avg_pu8 (__A, __B);
1668 }
1669
1670 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1671 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672 _mm_avg_pu16 (__m64 __A, __m64 __B)
1673 {
1674 __vector unsigned short a, b, c;
1675
1676 a = (__vector unsigned short)vec_splats (__A);
1677 b = (__vector unsigned short)vec_splats (__B);
1678 c = vec_avg (a, b);
1679 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1680 }
1681
1682 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683 _m_pavgw (__m64 __A, __m64 __B)
1684 {
1685 return _mm_avg_pu16 (__A, __B);
1686 }
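
/* Illustrative note (not part of the original header): vec_avg computes
   the rounded average (a + b + 1) >> 1 in a wider intermediate, matching
   the x86 pavgb/pavgw definition.  For example, _mm_avg_pu8 on bytes 1
   and 2 yields 2 (not 1), and _mm_avg_pu16 on 0xFFFF and 0xFFFE yields
   0xFFFF without overflow.  */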
1687
1688 /* Compute the sum of the absolute differences of the unsigned 8-bit
1689 values in A and B. Return the value in the lower 16-bit word; the
1690 upper words are cleared. */
1691 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1692 _mm_sad_pu8 (__m64 __A, __m64 __B)
1693 {
1694 __vector unsigned char a, b;
1695 __vector unsigned char vmin, vmax, vabsdiff;
1696 __vector signed int vsum;
1697 const __vector unsigned int zero =
1698 { 0, 0, 0, 0 };
1699 unsigned short result;
1700
1701 a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
1702 b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
1703 vmin = vec_min (a, b);
1704 vmax = vec_max (a, b);
1705 vabsdiff = vec_sub (vmax, vmin);
1706 /* Sum four groups of bytes into integers. */
1707 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1708 /* Sum across four integers with integer result. */
1709 vsum = vec_sums (vsum, (__vector signed int) zero);
1710 /* The sum is in the rightmost 32 bits of the vector result.
1711 Transfer to a GPR and truncate to 16 bits. */
1712 result = vsum[3];
1713 return (result);
1714 }
1715
1716 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1717 _m_psadbw (__m64 __A, __m64 __B)
1718 {
1719 return _mm_sad_pu8 (__A, __B);
1720 }
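
/* Usage sketch (not part of the original header): the eight absolute byte
   differences are summed into a single scalar.  With illustrative values
     __m64 a = _mm_set_pi8 (10, 0, 0, 0, 0, 0, 0, 0);
     __m64 b = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 7);
     __m64 r = _mm_sad_pu8 (a, b);
   the byte differences are 10, 0, ..., 0, 7, so r is 17 in the low 16
   bits and zero elsewhere.  */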
1721
1722 /* Stores the data in A to the address P without polluting the caches. */
1723 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1724 _mm_stream_pi (__m64 *__P, __m64 __A)
1725 {
1726 /* Use the data cache block touch for store transient. */
1727 __asm__ (
1728 " dcbtstt 0,%0"
1729 :
1730 : "b" (__P)
1731 : "memory"
1732 );
1733 *__P = __A;
1734 }
1735
1736 /* Likewise. The address must be 16-byte aligned. */
1737 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_stream_ps (float *__P, __m128 __A)
1739 {
1740 /* Use the data cache block touch for store transient. */
1741 __asm__ (
1742 " dcbtstt 0,%0"
1743 :
1744 : "b" (__P)
1745 : "memory"
1746 );
1747 _mm_store_ps (__P, __A);
1748 }
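
/* Illustrative note (not part of the original header): dcbtstt is only a
   cache hint marking the target block as transient before the ordinary
   store, so these intrinsics remain correct even if the hint is ignored.
   A hedged usage sketch, assuming a 16-byte aligned destination:
     float out[4] __attribute__ ((__aligned__ (16)));
     _mm_stream_ps (out, _mm_set1_ps (1.0f));
   */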
1749
1750 /* Guarantees that every preceding store is globally visible before
1751 any subsequent store. */
1752 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1753 _mm_sfence (void)
1754 {
1755 /* Generate a lightweight sync (lwsync). */
1756 __atomic_thread_fence (__ATOMIC_RELEASE);
1757 }
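
/* Usage sketch (not part of the original header): a typical producer
   pattern publishes data before setting a flag, and the release fence
   (a lwsync on POWER) keeps the two stores ordered as seen by other
   processors.  With hypothetical shared variables buffer and ready_flag:
     buffer[0] = value;
     _mm_sfence ();
     ready_flag = 1;
   */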
1758
1759 /* The execution of the next instruction is delayed by an implementation-
1760 specific amount of time. The instruction does not modify the
1761 architectural state. (In the x86 header this intrinsic follows the
1762 pop_options pragma because it does not require SSE support; the
1763 encoding is a nop on processors that do not support it.) */
1764 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1765 _mm_pause (void)
1766 {
1767 /* There is no exact match with this construct, but the following is
1768 close to the desired effect. */
1769 #if _ARCH_PWR8
1770 /* On power8 and later processors we can depend on Program Priority
1771 (PRI) and the associated "very low" PRI setting. Since we don't know
1772 what PRI this thread is running at, we: 1) save the current PRI
1773 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1774 via the special or 31,31,31 encoding, and 3) issue an "isync" to
1775 ensure the PRI change takes effect before we execute any more
1776 instructions.
1777 Now we can execute a lwsync (release barrier) while we execute
1778 this thread at "very low" PRI. Finally we restore the original
1779 PRI and continue execution. */
1780 unsigned long __PPR;
1781
1782 __asm__ volatile (
1783 " mfppr %0;"
1784 " or 31,31,31;"
1785 " isync;"
1786 " lwsync;"
1787 " isync;"
1788 " mtppr %0;"
1789 : "=r" (__PPR)
1790 :
1791 : "memory"
1792 );
1793 #else
1794 /* For older processors, where we may not even have Program Priority
1795 controls, we can only depend on a heavyweight sync. */
1796 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1797 #endif
1798 }
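
/* Usage sketch (not part of the original header): _mm_pause is normally
   placed in the body of a spin-wait loop to lower the priority (and
   power) of the waiting thread.  With a hypothetical flag variable:
     while (__atomic_load_n (&lock_flag, __ATOMIC_ACQUIRE) != 0)
       _mm_pause ();
   */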
1799
1800 /* Transpose the 4x4 matrix composed of row[0-3]. */
1801 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1802 do { \
1803 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1804 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1805 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1806 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1807 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1808 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1809 (__vector long long)__t1); \
1810 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1811 (__vector long long)__t1); \
1812 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1813 (__vector long long)__t3); \
1814 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1815 (__vector long long)__t3); \
1816 } while (0)
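
/* Usage sketch (not part of the original header): the macro transposes
   four rows in place.  With hypothetical row values:
     __m128 r0 = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);
     __m128 r1 = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f, 9.0f, 8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
   Afterwards each rN holds what was column N of the original rows; for
   example r0 holds the elements 0, 4, 8, 12.  */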
1817
1818 /* For backward source compatibility. */
1819 //# include <emmintrin.h>
1820
1821 #endif /* _XMMINTRIN_H_INCLUDED */