comparison: gcc/config/rs6000/xmmintrin.h @ 111:04ced10e8804 (branch gcc 7)
author: kono
date: Fri, 27 Oct 2017 22:46:09 +0900
parents:
children: 84e7813d76e9
compared revisions: 68:561a7518be6b and 111:04ced10e8804
1 /* Copyright (C) 2002-2017 Free Software Foundation, Inc. | |
2 | |
3 This file is part of GCC. | |
4 | |
5 GCC is free software; you can redistribute it and/or modify | |
6 it under the terms of the GNU General Public License as published by | |
7 the Free Software Foundation; either version 3, or (at your option) | |
8 any later version. | |
9 | |
10 GCC is distributed in the hope that it will be useful, | |
11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 GNU General Public License for more details. | |
14 | |
15 Under Section 7 of GPL version 3, you are granted additional | |
16 permissions described in the GCC Runtime Library Exception, version | |
17 3.1, as published by the Free Software Foundation. | |
18 | |
19 You should have received a copy of the GNU General Public License and | |
20 a copy of the GCC Runtime Library Exception along with this program; | |
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
22 <http://www.gnu.org/licenses/>. */ | |
23 | |
24 /* Implemented from the specification included in the Intel C++ Compiler | |
25 User Guide and Reference, version 9.0. */ | |
26 | |
27 #ifndef NO_WARN_X86_INTRINSICS | |
28 /* This header is distributed to simplify porting x86_64 code that | |
29 makes explicit use of Intel intrinsics to powerpc64le. | |
30 It is the user's responsibility to determine if the results are | |
31 acceptable and make additional changes as necessary. | |
32 Note that much code that uses Intel intrinsics can be rewritten in | |
33 standard C or GNU C extensions, which are more portable and better | |
34 optimized across multiple targets. | |
35 | |
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC | |
37 VMX/VSX ISA is a good match for vector float SIMD operations. | |
38 However scalar float operations in vector (XMM) registers require | |
39 the POWER8 VSX ISA (2.07) level. There are also important | |
40 differences in the data format and placement of float scalars in the | |
41 vector register. For PowerISA, scalar floats in FPRs (the leftmost | |
42 64 bits of the low 32 VSRs) are in double format, while X86_64 SSE | |
43 uses the rightmost 32 bits of the XMM register. These differences require | |
44 extra steps on POWER to match the SSE scalar float semantics. | |
45 | |
46 Most SSE scalar float intrinsic operations can be performed more | |
47 efficiently as C language float scalar operations or optimized to | |
48 use vector SIMD operations. We recommend this for new applications. | |
49 | |
50 Another difference is the format and details of the X86_64 MXCSR vs | |
51 the PowerISA FPSCR / VSCR registers. We recommend that applications | |
52 replace direct access to the MXCSR with the more portable <fenv.h> | |
53 POSIX APIs. */ | |
54 #warning "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning." | |
55 #endif | |
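/* Editor's note -- an illustrative sketch, not part of the original
   header: the guidance above recommends plain C scalar code in place
   of SSE scalar intrinsics and the <fenv.h> interfaces in place of
   direct MXCSR access.  The helper names below are hypothetical and
   assume a C99 <fenv.h> environment:

     #include <fenv.h>

     // Instead of _mm_add_ss on __m128 values, operate on plain floats;
     // the compiler may keep this scalar or vectorize it as it sees fit.
     static inline float add_lowest (const float *a, const float *b)
     {
       return a[0] + b[0];
     }

     // Instead of reading MXCSR status bits directly, query the
     // portable floating-point environment.
     static inline int saw_inexact (void)
     {
       return fetestexcept (FE_INEXACT) != 0;
     }
*/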
56 | |
57 #ifndef _XMMINTRIN_H_INCLUDED | |
58 #define _XMMINTRIN_H_INCLUDED | |
59 | |
60 #include <altivec.h> | |
61 #include <assert.h> | |
62 | |
63 /* We need type definitions from the MMX header file. */ | |
64 #include <mmintrin.h> | |
65 | |
66 /* Get _mm_malloc () and _mm_free (). */ | |
67 #include <mm_malloc.h> | |
68 | |
69 /* The Intel API is flexible enough that we must allow aliasing with other | |
70 vector types, and their scalar components. */ | |
71 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); | |
72 | |
73 /* Internal data types for implementing the intrinsics. */ | |
74 typedef float __v4sf __attribute__ ((__vector_size__ (16))); | |
75 | |
76 /* Create an undefined vector. */ | |
77 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
78 _mm_undefined_ps (void) | |
79 { | |
80 __m128 __Y = __Y; | |
81 return __Y; | |
82 } | |
83 | |
84 /* Create a vector of zeros. */ | |
85 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
86 _mm_setzero_ps (void) | |
87 { | |
88 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; | |
89 } | |
90 | |
91 /* Load four SPFP values from P. The address must be 16-byte aligned. */ | |
92 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
93 _mm_load_ps (float const *__P) | |
94 { | |
95 assert(((unsigned long)__P & 0xfUL) == 0UL); | |
96 return ((__m128)vec_ld(0, (__v4sf*)__P)); | |
97 } | |
98 | |
99 /* Load four SPFP values from P. The address need not be 16-byte aligned. */ | |
100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
101 _mm_loadu_ps (float const *__P) | |
102 { | |
103 return (vec_vsx_ld(0, __P)); | |
104 } | |
105 | |
106 /* Load four SPFP values in reverse order. The address must be aligned. */ | |
107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
108 _mm_loadr_ps (float const *__P) | |
109 { | |
110 __v4sf __tmp; | |
111 __m128 result; | |
112 static const __vector unsigned char permute_vector = | |
113 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, | |
114 0x17, 0x10, 0x11, 0x12, 0x13 }; | |
115 | |
116 __tmp = vec_ld (0, (__v4sf *) __P); | |
117 result = (__m128) vec_perm (__tmp, __tmp, permute_vector); | |
118 return result; | |
119 } | |
120 | |
121 /* Create a vector with all four elements equal to F. */ | |
122 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
123 _mm_set1_ps (float __F) | |
124 { | |
125 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; | |
126 } | |
127 | |
128 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
129 _mm_set_ps1 (float __F) | |
130 { | |
131 return _mm_set1_ps (__F); | |
132 } | |
133 | |
134 /* Create the vector [Z Y X W]. */ | |
135 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
136 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) | |
137 { | |
138 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; | |
139 } | |
140 | |
141 /* Create the vector [W X Y Z]. */ | |
142 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
143 _mm_setr_ps (float __Z, float __Y, float __X, float __W) | |
144 { | |
145 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; | |
146 } | |
147 | |
148 /* Store four SPFP values. The address must be 16-byte aligned. */ | |
149 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
150 _mm_store_ps (float *__P, __m128 __A) | |
151 { | |
152 assert(((unsigned long)__P & 0xfUL) == 0UL); | |
153 vec_st((__v4sf)__A, 0, (__v4sf*)__P); | |
154 } | |
155 | |
156 /* Store four SPFP values. The address need not be 16-byte aligned. */ | |
157 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
158 _mm_storeu_ps (float *__P, __m128 __A) | |
159 { | |
160 *(__m128 *)__P = __A; | |
161 } | |
162 | |
163 /* Store four SPFP values in reverse order. The address must be aligned. */ | |
164 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
165 _mm_storer_ps (float *__P, __m128 __A) | |
166 { | |
167 __v4sf __tmp; | |
168 static const __vector unsigned char permute_vector = | |
169 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, | |
170 0x17, 0x10, 0x11, 0x12, 0x13 }; | |
171 | |
172 __tmp = (__m128) vec_perm (__A, __A, permute_vector); | |
173 | |
174 _mm_store_ps (__P, __tmp); | |
175 } | |
176 | |
177 /* Store the lower SPFP value across four words. */ | |
178 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
179 _mm_store1_ps (float *__P, __m128 __A) | |
180 { | |
181 __v4sf __va = vec_splat((__v4sf)__A, 0); | |
182 _mm_store_ps (__P, __va); | |
183 } | |
184 | |
185 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
186 _mm_store_ps1 (float *__P, __m128 __A) | |
187 { | |
188 _mm_store1_ps (__P, __A); | |
189 } | |
190 | |
191 /* Create a vector with element 0 as F and the rest zero. */ | |
192 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
193 _mm_set_ss (float __F) | |
194 { | |
195 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f }; | |
196 } | |
197 | |
198 /* Sets the low SPFP value of A from the low value of B. */ | |
199 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
200 _mm_move_ss (__m128 __A, __m128 __B) | |
201 { | |
202 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
203 | |
204 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask)); | |
205 } | |
206 | |
207 /* Create a vector with element 0 as *P and the rest zero. */ | |
208 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
209 _mm_load_ss (float const *__P) | |
210 { | |
211 return _mm_set_ss (*__P); | |
212 } | |
213 | |
214 /* Stores the lower SPFP value. */ | |
215 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
216 _mm_store_ss (float *__P, __m128 __A) | |
217 { | |
218 *__P = ((__v4sf)__A)[0]; | |
219 } | |
220 | |
221 /* Perform the respective operation on the lower SPFP (single-precision | |
222 floating-point) values of A and B; the upper three SPFP values are | |
223 passed through from A. */ | |
224 | |
225 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
226 _mm_add_ss (__m128 __A, __m128 __B) | |
227 { | |
228 #ifdef _ARCH_PWR7 | |
229 __m128 a, b, c; | |
230 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
231 /* PowerISA VSX does not allow partial (for just lower float) | |
232 results. So to ensure we don't generate spurious exceptions | |
233 (from the upper float values) we splat the lower float | |
234 before we do the operation. */ | |
235 a = vec_splat (__A, 0); | |
236 b = vec_splat (__B, 0); | |
237 c = a + b; | |
238 /* Then we merge the lower float result with the original upper | |
239 float elements from __A. */ | |
240 return (vec_sel (__A, c, mask)); | |
241 #else | |
242 __A[0] = __A[0] + __B[0]; | |
243 return (__A); | |
244 #endif | |
245 } | |
246 | |
247 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
248 _mm_sub_ss (__m128 __A, __m128 __B) | |
249 { | |
250 #ifdef _ARCH_PWR7 | |
251 __m128 a, b, c; | |
252 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
253 /* PowerISA VSX does not allow partial (for just lower float) | |
254 results. So to ensure we don't generate spurious exceptions | |
255 (from the upper float values) we splat the lower float | |
256 before we do the operation. */ | |
257 a = vec_splat (__A, 0); | |
258 b = vec_splat (__B, 0); | |
259 c = a - b; | |
260 /* Then we merge the lower float result with the original upper | |
261 float elements from __A. */ | |
262 return (vec_sel (__A, c, mask)); | |
263 #else | |
264 __A[0] = __A[0] - __B[0]; | |
265 return (__A); | |
266 #endif | |
267 } | |
268 | |
269 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
270 _mm_mul_ss (__m128 __A, __m128 __B) | |
271 { | |
272 #ifdef _ARCH_PWR7 | |
273 __m128 a, b, c; | |
274 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
275 /* PowerISA VSX does not allow partial (for just lower float) | |
276 results. So to ensure we don't generate spurious exceptions | |
277 (from the upper float values) we splat the lower float | |
278 before we do the operation. */ | |
279 a = vec_splat (__A, 0); | |
280 b = vec_splat (__B, 0); | |
281 c = a * b; | |
282 /* Then we merge the lower float result with the original upper | |
283 float elements from __A. */ | |
284 return (vec_sel (__A, c, mask)); | |
285 #else | |
286 __A[0] = __A[0] * __B[0]; | |
287 return (__A); | |
288 #endif | |
289 } | |
290 | |
291 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
292 _mm_div_ss (__m128 __A, __m128 __B) | |
293 { | |
294 #ifdef _ARCH_PWR7 | |
295 __m128 a, b, c; | |
296 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
297 /* PowerISA VSX does not allow partial (for just lower float) | |
298 results. So to ensure we don't generate spurious exceptions | |
299 (from the upper float values) we splat the lower float | |
300 before we do the operation. */ | |
301 a = vec_splat (__A, 0); | |
302 b = vec_splat (__B, 0); | |
303 c = a / b; | |
304 /* Then we merge the lower float result with the original upper | |
305 float elements from __A. */ | |
306 return (vec_sel (__A, c, mask)); | |
307 #else | |
308 __A[0] = __A[0] / __B[0]; | |
309 return (__A); | |
310 #endif | |
311 } | |
312 | |
313 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
314 _mm_sqrt_ss (__m128 __A) | |
315 { | |
316 __m128 a, c; | |
317 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
318 /* PowerISA VSX does not allow partial (for just lower float) | |
319 * results. So to ensure we don't generate spurious exceptions | |
320 * (from the upper float values) we splat the lower float | |
321 * before we do the operation. */ | |
322 a = vec_splat (__A, 0); | |
323 c = vec_sqrt (a); | |
324 /* Then we merge the lower float result with the original upper | |
325 * float elements from __A. */ | |
326 return (vec_sel (__A, c, mask)); | |
327 } | |
328 | |
329 /* Perform the respective operation on the four SPFP values in A and B. */ | |
330 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
331 _mm_add_ps (__m128 __A, __m128 __B) | |
332 { | |
333 return (__m128) ((__v4sf)__A + (__v4sf)__B); | |
334 } | |
335 | |
336 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
337 _mm_sub_ps (__m128 __A, __m128 __B) | |
338 { | |
339 return (__m128) ((__v4sf)__A - (__v4sf)__B); | |
340 } | |
341 | |
342 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
343 _mm_mul_ps (__m128 __A, __m128 __B) | |
344 { | |
345 return (__m128) ((__v4sf)__A * (__v4sf)__B); | |
346 } | |
347 | |
348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
349 _mm_div_ps (__m128 __A, __m128 __B) | |
350 { | |
351 return (__m128) ((__v4sf)__A / (__v4sf)__B); | |
352 } | |
353 | |
354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
355 _mm_sqrt_ps (__m128 __A) | |
356 { | |
357 return (vec_sqrt ((__v4sf)__A)); | |
358 } | |
359 | |
360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
361 _mm_rcp_ps (__m128 __A) | |
362 { | |
363 return (vec_re ((__v4sf)__A)); | |
364 } | |
365 | |
366 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
367 _mm_rsqrt_ps (__m128 __A) | |
368 { | |
369 return (vec_rsqrte (__A)); | |
370 } | |
371 | |
372 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
373 _mm_rcp_ss (__m128 __A) | |
374 { | |
375 __m128 a, c; | |
376 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
377 /* PowerISA VSX does not allow partial (for just lower float) | |
378 * results. So to ensure we don't generate spurious exceptions | |
379 * (from the upper float values) we splat the lower float | |
380 * before we do the operation. */ | |
381 a = vec_splat (__A, 0); | |
382 c = _mm_rcp_ps (a); | |
383 /* Then we merge the lower float result with the original upper | |
384 * float elements from __A. */ | |
385 return (vec_sel (__A, c, mask)); | |
386 } | |
387 | |
388 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
389 _mm_rsqrt_ss (__m128 __A) | |
390 { | |
391 __m128 a, c; | |
392 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
393 /* PowerISA VSX does not allow partial (for just lower float) | |
394 * results. So to ensure we don't generate spurious exceptions | |
395 * (from the upper float values) we splat the lower float | |
396 * before we do the operation. */ | |
397 a = vec_splat (__A, 0); | |
398 c = vec_rsqrte (a); | |
399 /* Then we merge the lower float result with the original upper | |
400 * float elements from __A. */ | |
401 return (vec_sel (__A, c, mask)); | |
402 } | |
403 | |
404 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
405 _mm_min_ss (__m128 __A, __m128 __B) | |
406 { | |
407 __v4sf a, b, c; | |
408 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
409 /* PowerISA VSX does not allow partial (for just lower float) | |
410 * results. So to ensure we don't generate spurious exceptions | |
411 * (from the upper float values) we splat the lower float | |
412 * before we do the operation. */ | |
413 a = vec_splat ((__v4sf)__A, 0); | |
414 b = vec_splat ((__v4sf)__B, 0); | |
415 c = vec_min (a, b); | |
416 /* Then we merge the lower float result with the original upper | |
417 * float elements from __A. */ | |
418 return (vec_sel ((__v4sf)__A, c, mask)); | |
419 } | |
420 | |
421 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
422 _mm_max_ss (__m128 __A, __m128 __B) | |
423 { | |
424 __v4sf a, b, c; | |
425 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; | |
426 /* PowerISA VSX does not allow partial (for just lower float) | |
427 * results. So to ensure we don't generate spurious exceptions | |
428 * (from the upper float values) we splat the lower float | |
429 * before we do the operation. */ | |
430 a = vec_splat (__A, 0); | |
431 b = vec_splat (__B, 0); | |
432 c = vec_max (a, b); | |
433 /* Then we merge the lower float result with the original upper | |
434 * float elements from __A. */ | |
435 return (vec_sel ((__v4sf)__A, c, mask)); | |
436 } | |
437 | |
438 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
439 _mm_min_ps (__m128 __A, __m128 __B) | |
440 { | |
441 return ((__m128)vec_min ((__v4sf)__A,(__v4sf) __B)); | |
442 } | |
443 | |
444 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
445 _mm_max_ps (__m128 __A, __m128 __B) | |
446 { | |
447 return ((__m128)vec_max ((__v4sf)__A, (__v4sf)__B)); | |
448 } | |
449 | |
450 /* Perform logical bit-wise operations on 128-bit values. */ | |
451 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
452 _mm_and_ps (__m128 __A, __m128 __B) | |
453 { | |
454 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B)); | |
455 // return __builtin_ia32_andps (__A, __B); | |
456 } | |
457 | |
458 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
459 _mm_andnot_ps (__m128 __A, __m128 __B) | |
460 { | |
461 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A)); | |
462 } | |
463 | |
464 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
465 _mm_or_ps (__m128 __A, __m128 __B) | |
466 { | |
467 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B)); | |
468 } | |
469 | |
470 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
471 _mm_xor_ps (__m128 __A, __m128 __B) | |
472 { | |
473 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B)); | |
474 } | |
475 | |
476 /* Perform a comparison on the four SPFP values of A and B. For each | |
477 element, if the comparison is true, place a mask of all ones in the | |
478 result, otherwise a mask of zeros. */ | |
479 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
480 _mm_cmpeq_ps (__m128 __A, __m128 __B) | |
481 { | |
482 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B)); | |
483 } | |
484 | |
485 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
486 _mm_cmplt_ps (__m128 __A, __m128 __B) | |
487 { | |
488 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); | |
489 } | |
490 | |
491 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
492 _mm_cmple_ps (__m128 __A, __m128 __B) | |
493 { | |
494 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); | |
495 } | |
496 | |
497 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
498 _mm_cmpgt_ps (__m128 __A, __m128 __B) | |
499 { | |
500 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); | |
501 } | |
502 | |
503 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
504 _mm_cmpge_ps (__m128 __A, __m128 __B) | |
505 { | |
506 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); | |
507 } | |
508 | |
509 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
510 _mm_cmpneq_ps (__m128 __A, __m128 __B) | |
511 { | |
512 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); | |
513 return ((__m128)vec_nor (temp, temp)); | |
514 } | |
515 | |
516 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
517 _mm_cmpnlt_ps (__m128 __A, __m128 __B) | |
518 { | |
519 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); | |
520 } | |
521 | |
522 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
523 _mm_cmpnle_ps (__m128 __A, __m128 __B) | |
524 { | |
525 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); | |
526 } | |
527 | |
528 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
529 _mm_cmpngt_ps (__m128 __A, __m128 __B) | |
530 { | |
531 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); | |
532 } | |
533 | |
534 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
535 _mm_cmpnge_ps (__m128 __A, __m128 __B) | |
536 { | |
537 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); | |
538 } | |
539 | |
540 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
541 _mm_cmpord_ps (__m128 __A, __m128 __B) | |
542 { | |
543 __vector unsigned int a, b; | |
544 __vector unsigned int c, d; | |
545 static const __vector unsigned int float_exp_mask = | |
546 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; | |
547 | |
548 a = (__vector unsigned int) vec_abs ((__v4sf)__A); | |
549 b = (__vector unsigned int) vec_abs ((__v4sf)__B); | |
550 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); | |
551 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); | |
552 return ((__m128 ) vec_and (c, d)); | |
553 } | |
554 | |
555 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
556 _mm_cmpunord_ps (__m128 __A, __m128 __B) | |
557 { | |
558 __vector unsigned int a, b; | |
559 __vector unsigned int c, d; | |
560 static const __vector unsigned int float_exp_mask = | |
561 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; | |
562 | |
563 a = (__vector unsigned int) vec_abs ((__v4sf)__A); | |
564 b = (__vector unsigned int) vec_abs ((__v4sf)__B); | |
565 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); | |
566 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); | |
567 return ((__m128 ) vec_or (c, d)); | |
568 } | |
569 | |
570 /* Perform a comparison on the lower SPFP values of A and B. If the | |
571 comparison is true, place a mask of all ones in the result, otherwise a | |
572 mask of zeros. The upper three SPFP values are passed through from A. */ | |
573 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
574 _mm_cmpeq_ss (__m128 __A, __m128 __B) | |
575 { | |
576 static const __vector unsigned int mask = | |
577 { 0xffffffff, 0, 0, 0 }; | |
578 __v4sf a, b, c; | |
579 /* PowerISA VMX does not allow partial (for just element 0) | |
580 * results. So to ensure we don't generate spurious exceptions | |
581 * (from the upper elements) we splat the lower float | |
582 * before we do the operation. */ | |
583 a = vec_splat ((__v4sf) __A, 0); | |
584 b = vec_splat ((__v4sf) __B, 0); | |
585 c = (__v4sf) vec_cmpeq(a, b); | |
586 /* Then we merge the lower float result with the original upper | |
587 * float elements from __A. */ | |
588 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
589 } | |
590 | |
591 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
592 _mm_cmplt_ss (__m128 __A, __m128 __B) | |
593 { | |
594 static const __vector unsigned int mask = | |
595 { 0xffffffff, 0, 0, 0 }; | |
596 __v4sf a, b, c; | |
597 /* PowerISA VMX does not allow partial (for just element 0) | |
598 * results. So to ensure we don't generate spurious exceptions | |
599 * (from the upper elements) we splat the lower float | |
600 * before we do the operation. */ | |
601 a = vec_splat ((__v4sf) __A, 0); | |
602 b = vec_splat ((__v4sf) __B, 0); | |
603 c = (__v4sf) vec_cmplt(a, b); | |
604 /* Then we merge the lower float result with the original upper | |
605 * float elements from __A. */ | |
606 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
607 } | |
608 | |
609 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
610 _mm_cmple_ss (__m128 __A, __m128 __B) | |
611 { | |
612 static const __vector unsigned int mask = | |
613 { 0xffffffff, 0, 0, 0 }; | |
614 __v4sf a, b, c; | |
615 /* PowerISA VMX does not allow partial (for just element 0) | |
616 * results. So to ensure we don't generate spurious exceptions | |
617 * (from the upper elements) we splat the lower float | |
618 * before we do the operation. */ | |
619 a = vec_splat ((__v4sf) __A, 0); | |
620 b = vec_splat ((__v4sf) __B, 0); | |
621 c = (__v4sf) vec_cmple(a, b); | |
622 /* Then we merge the lower float result with the original upper | |
623 * float elements from __A. */ | |
624 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
625 } | |
626 | |
627 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
628 _mm_cmpgt_ss (__m128 __A, __m128 __B) | |
629 { | |
630 static const __vector unsigned int mask = | |
631 { 0xffffffff, 0, 0, 0 }; | |
632 __v4sf a, b, c; | |
633 /* PowerISA VMX does not allow partial (for just element 0) | |
634 * results. So to ensure we don't generate spurious exceptions | |
635 * (from the upper elements) we splat the lower float | |
636 * before we do the operation. */ | |
637 a = vec_splat ((__v4sf) __A, 0); | |
638 b = vec_splat ((__v4sf) __B, 0); | |
639 c = (__v4sf) vec_cmpgt(a, b); | |
640 /* Then we merge the lower float result with the original upper | |
641 * float elements from __A. */ | |
642 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
643 } | |
644 | |
645 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
646 _mm_cmpge_ss (__m128 __A, __m128 __B) | |
647 { | |
648 static const __vector unsigned int mask = | |
649 { 0xffffffff, 0, 0, 0 }; | |
650 __v4sf a, b, c; | |
651 /* PowerISA VMX does not allow partial (for just element 0) | |
652 * results. So to ensure we don't generate spurious exceptions | |
653 * (from the upper elements) we splat the lower float | |
654 * before we do the operation. */ | |
655 a = vec_splat ((__v4sf) __A, 0); | |
656 b = vec_splat ((__v4sf) __B, 0); | |
657 c = (__v4sf) vec_cmpge(a, b); | |
658 /* Then we merge the lower float result with the original upper | |
659 * float elements from __A. */ | |
660 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
661 } | |
662 | |
663 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
664 _mm_cmpneq_ss (__m128 __A, __m128 __B) | |
665 { | |
666 static const __vector unsigned int mask = | |
667 { 0xffffffff, 0, 0, 0 }; | |
668 __v4sf a, b, c; | |
669 /* PowerISA VMX does not allow partial (for just element 0) | |
670 * results. So to ensure we don't generate spurious exceptions | |
671 * (from the upper elements) we splat the lower float | |
672 * before we do the operation. */ | |
673 a = vec_splat ((__v4sf) __A, 0); | |
674 b = vec_splat ((__v4sf) __B, 0); | |
675 c = (__v4sf) vec_cmpeq(a, b); | |
676 c = vec_nor (c, c); | |
677 /* Then we merge the lower float result with the original upper | |
678 * float elements from __A. */ | |
679 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
680 } | |
681 | |
682 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
683 _mm_cmpnlt_ss (__m128 __A, __m128 __B) | |
684 { | |
685 static const __vector unsigned int mask = | |
686 { 0xffffffff, 0, 0, 0 }; | |
687 __v4sf a, b, c; | |
688 /* PowerISA VMX does not allow partial (for just element 0) | |
689 * results. So to ensure we don't generate spurious exceptions | |
690 * (from the upper elements) we splat the lower float | |
691 * before we do the operation. */ | |
692 a = vec_splat ((__v4sf) __A, 0); | |
693 b = vec_splat ((__v4sf) __B, 0); | |
694 c = (__v4sf) vec_cmpge(a, b); | |
695 /* Then we merge the lower float result with the original upper | |
696 * float elements from __A. */ | |
697 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
698 } | |
699 | |
700 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
701 _mm_cmpnle_ss (__m128 __A, __m128 __B) | |
702 { | |
703 static const __vector unsigned int mask = | |
704 { 0xffffffff, 0, 0, 0 }; | |
705 __v4sf a, b, c; | |
706 /* PowerISA VMX does not allow partial (for just element 0) | |
707 * results. So to ensure we don't generate spurious exceptions | |
708 * (from the upper elements) we splat the lower float | |
709 * before we do the operation. */ | |
710 a = vec_splat ((__v4sf) __A, 0); | |
711 b = vec_splat ((__v4sf) __B, 0); | |
712 c = (__v4sf) vec_cmpgt(a, b); | |
713 /* Then we merge the lower float result with the original upper | |
714 * float elements from __A. */ | |
715 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
716 } | |
717 | |
718 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
719 _mm_cmpngt_ss (__m128 __A, __m128 __B) | |
720 { | |
721 static const __vector unsigned int mask = | |
722 { 0xffffffff, 0, 0, 0 }; | |
723 __v4sf a, b, c; | |
724 /* PowerISA VMX does not allow partial (for just element 0) | |
725 * results. So to ensure we don't generate spurious exceptions | |
726 * (from the upper elements) we splat the lower float | |
727 * before we do the operation. */ | |
728 a = vec_splat ((__v4sf) __A, 0); | |
729 b = vec_splat ((__v4sf) __B, 0); | |
730 c = (__v4sf) vec_cmple(a, b); | |
731 /* Then we merge the lower float result with the original upper | |
732 * float elements from __A. */ | |
733 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
734 } | |
735 | |
736 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
737 _mm_cmpnge_ss (__m128 __A, __m128 __B) | |
738 { | |
739 static const __vector unsigned int mask = | |
740 { 0xffffffff, 0, 0, 0 }; | |
741 __v4sf a, b, c; | |
742 /* PowerISA VMX does not allow partial (for just element 0) | |
743 * results. So to ensure we don't generate spurious exceptions | |
744 * (from the upper elements) we splat the lower float | |
745 * before we do the operation. */ | |
746 a = vec_splat ((__v4sf) __A, 0); | |
747 b = vec_splat ((__v4sf) __B, 0); | |
748 c = (__v4sf) vec_cmplt(a, b); | |
749 /* Then we merge the lower float result with the original upper | |
750 * float elements from __A. */ | |
751 return ((__m128)vec_sel ((__v4sf)__A, c, mask)); | |
752 } | |
753 | |
754 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
755 _mm_cmpord_ss (__m128 __A, __m128 __B) | |
756 { | |
757 __vector unsigned int a, b; | |
758 __vector unsigned int c, d; | |
759 static const __vector unsigned int float_exp_mask = | |
760 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; | |
761 static const __vector unsigned int mask = | |
762 { 0xffffffff, 0, 0, 0 }; | |
763 | |
764 a = (__vector unsigned int) vec_abs ((__v4sf)__A); | |
765 b = (__vector unsigned int) vec_abs ((__v4sf)__B); | |
766 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); | |
767 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); | |
768 c = vec_and (c, d); | |
769 /* Then we merge the lower float result with the original upper | |
770 * float elements from __A. */ | |
771 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); | |
772 } | |
773 | |
774 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
775 _mm_cmpunord_ss (__m128 __A, __m128 __B) | |
776 { | |
777 __vector unsigned int a, b; | |
778 __vector unsigned int c, d; | |
779 static const __vector unsigned int float_exp_mask = | |
780 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; | |
781 static const __vector unsigned int mask = | |
782 { 0xffffffff, 0, 0, 0 }; | |
783 | |
784 a = (__vector unsigned int) vec_abs ((__v4sf)__A); | |
785 b = (__vector unsigned int) vec_abs ((__v4sf)__B); | |
786 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); | |
787 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); | |
788 c = vec_or (c, d); | |
789 /* Then we merge the lower float result with the original upper | |
790 * float elements from __A. */ | |
791 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); | |
792 } | |
793 | |
794 /* Compare the lower SPFP values of A and B and return 1 if true | |
795 and 0 if false. */ | |
796 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
797 _mm_comieq_ss (__m128 __A, __m128 __B) | |
798 { | |
799 return (__A[0] == __B[0]); | |
800 } | |
801 | |
802 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
803 _mm_comilt_ss (__m128 __A, __m128 __B) | |
804 { | |
805 return (__A[0] < __B[0]); | |
806 } | |
807 | |
808 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
809 _mm_comile_ss (__m128 __A, __m128 __B) | |
810 { | |
811 return (__A[0] <= __B[0]); | |
812 } | |
813 | |
814 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
815 _mm_comigt_ss (__m128 __A, __m128 __B) | |
816 { | |
817 return (__A[0] > __B[0]); | |
818 } | |
819 | |
820 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
821 _mm_comige_ss (__m128 __A, __m128 __B) | |
822 { | |
823 return (__A[0] >= __B[0]); | |
824 } | |
825 | |
826 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
827 _mm_comineq_ss (__m128 __A, __m128 __B) | |
828 { | |
829 return (__A[0] != __B[0]); | |
830 } | |
831 | |
832 /* FIXME | |
833 * The _mm_ucomi??_ss implementations below are exactly the same as | |
834 * _mm_comi??_ss because GCC for PowerPC only generates unordered | |
835 * compares (scalar and vector). | |
836 * Technically _mm_comieq_ss et al. should be using the ordered | |
837 * compare and signal for QNaNs. | |
838 * The _mm_ucomieq_ss et al. should be OK as is. | |
839 */ | |
840 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
841 _mm_ucomieq_ss (__m128 __A, __m128 __B) | |
842 { | |
843 return (__A[0] == __B[0]); | |
844 } | |
845 | |
846 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
847 _mm_ucomilt_ss (__m128 __A, __m128 __B) | |
848 { | |
849 return (__A[0] < __B[0]); | |
850 } | |
851 | |
852 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
853 _mm_ucomile_ss (__m128 __A, __m128 __B) | |
854 { | |
855 return (__A[0] <= __B[0]); | |
856 } | |
857 | |
858 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
859 _mm_ucomigt_ss (__m128 __A, __m128 __B) | |
860 { | |
861 return (__A[0] > __B[0]); | |
862 } | |
863 | |
864 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
865 _mm_ucomige_ss (__m128 __A, __m128 __B) | |
866 { | |
867 return (__A[0] >= __B[0]); | |
868 } | |
869 | |
870 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
871 _mm_ucomineq_ss (__m128 __A, __m128 __B) | |
872 { | |
873 return (__A[0] != __B[0]); | |
874 } | |
875 | |
876 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
877 _mm_cvtss_f32 (__m128 __A) | |
878 { | |
879 return ((__v4sf)__A)[0]; | |
880 } | |
881 | |
882 /* Convert the lower SPFP value to a 32-bit integer according to the current | |
883 rounding mode. */ | |
884 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
885 _mm_cvtss_si32 (__m128 __A) | |
886 { | |
887 __m64 res = 0; | |
888 #ifdef _ARCH_PWR8 | |
889 __m128 vtmp; | |
890 __asm__( | |
891 "xxsldwi %x1,%x2,%x2,3;\n" | |
892 "xscvspdp %x1,%x1;\n" | |
893 "fctiw %1,%1;\n" | |
894 "mfvsrd %0,%x1;\n" | |
895 : "=r" (res), | |
896 "=&wi" (vtmp) | |
897 : "wa" (__A) | |
898 : ); | |
899 #else | |
900 res = __builtin_rint(__A[0]); | |
901 #endif | |
902 return (res); | |
903 } | |
904 | |
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
906 _mm_cvt_ss2si (__m128 __A) | |
907 { | |
908 return _mm_cvtss_si32 (__A); | |
909 } | |
910 | |
911 /* Convert the lower SPFP value to a 32-bit integer according to the | |
912 current rounding mode. */ | |
913 | |
914 /* Intel intrinsic. */ | |
915 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
916 _mm_cvtss_si64 (__m128 __A) | |
917 { | |
918 __m64 res = 0; | |
919 #ifdef _ARCH_PWR8 | |
920 __m128 vtmp; | |
921 __asm__( | |
922 "xxsldwi %x1,%x2,%x2,3;\n" | |
923 "xscvspdp %x1,%x1;\n" | |
924 "fctid %1,%1;\n" | |
925 "mfvsrd %0,%x1;\n" | |
926 : "=r" (res), | |
927 "=&wi" (vtmp) | |
928 : "wa" (__A) | |
929 : ); | |
930 #else | |
931 res = __builtin_llrint(__A[0]); | |
932 #endif | |
933 return (res); | |
934 } | |
935 | |
936 /* Microsoft intrinsic. */ | |
937 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
938 _mm_cvtss_si64x (__m128 __A) | |
939 { | |
940 return _mm_cvtss_si64 ((__v4sf) __A); | |
941 } | |
942 | |
943 /* Constants for use with _mm_prefetch. */ | |
944 enum _mm_hint | |
945 { | |
946 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */ | |
947 _MM_HINT_ET0 = 7, | |
948 _MM_HINT_ET1 = 6, | |
949 _MM_HINT_T0 = 3, | |
950 _MM_HINT_T1 = 2, | |
951 _MM_HINT_T2 = 1, | |
952 _MM_HINT_NTA = 0 | |
953 }; | |
954 | |
955 /* Loads one cache line from address P to a location "closer" to the | |
956 processor. The selector I specifies the type of prefetch operation. */ | |
957 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
958 _mm_prefetch (const void *__P, enum _mm_hint __I) | |
959 { | |
960 /* Current PowerPC implementations ignore the hint parameters. */ | |
961 __builtin_prefetch (__P); | |
962 } | |
963 | |
964 /* Convert the two lower SPFP values to 32-bit integers according to the | |
965 current rounding mode. Return the integers in packed form. */ | |
966 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
967 _mm_cvtps_pi32 (__m128 __A) | |
968 { | |
969 /* Splat two lower SPFP values to both halves. */ | |
970 __v4sf temp, rounded; | |
971 __vector __m64 result; | |
972 | |
973 /* Splat two lower SPFP values to both halves. */ | |
974 temp = (__v4sf) vec_splat ((__vector long long)__A, 0); | |
975 rounded = vec_rint(temp); | |
976 result = (__vector __m64) vec_cts (rounded, 0); | |
977 | |
978 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0)); | |
979 } | |
980 | |
981 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
982 _mm_cvt_ps2pi (__m128 __A) | |
983 { | |
984 return _mm_cvtps_pi32 (__A); | |
985 } | |
986 | |
987 /* Truncate the lower SPFP value to a 32-bit integer. */ | |
988 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
989 _mm_cvttss_si32 (__m128 __A) | |
990 { | |
991 /* Extract the lower float element. */ | |
992 float temp = __A[0]; | |
993 /* truncate to 32-bit integer and return. */ | |
994 return temp; | |
995 } | |
996 | |
997 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
998 _mm_cvtt_ss2si (__m128 __A) | |
999 { | |
1000 return _mm_cvttss_si32 (__A); | |
1001 } | |
1002 | |
1003 /* Intel intrinsic. */ | |
1004 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1005 _mm_cvttss_si64 (__m128 __A) | |
1006 { | |
1007 /* Extract the lower float element. */ | |
1008 float temp = __A[0]; | |
1009 /* truncate to 64-bit integer and return. */ | |
1010 return temp; | |
1011 } | |
1012 | |
1013 /* Microsoft intrinsic. */ | |
1014 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1015 _mm_cvttss_si64x (__m128 __A) | |
1016 { | |
1017 /* Extract the lower float element. */ | |
1018 float temp = __A[0]; | |
1019 /* truncate to 64-bit integer and return. */ | |
1020 return temp; | |
1021 } | |
1022 | |
1023 /* Truncate the two lower SPFP values to 32-bit integers. Return the | |
1024 integers in packed form. */ | |
1025 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1026 _mm_cvttps_pi32 (__m128 __A) | |
1027 { | |
1028 __v4sf temp; | |
1029 __vector __m64 result; | |
1030 | |
1031 /* Splat two lower SPFP values to both halves. */ | |
1032 temp = (__v4sf) vec_splat ((__vector long long)__A, 0); | |
1033 result = (__vector __m64) vec_cts (temp, 0); | |
1034 | |
1035 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0)); | |
1036 } | |
1037 | |
1038 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1039 _mm_cvtt_ps2pi (__m128 __A) | |
1040 { | |
1041 return _mm_cvttps_pi32 (__A); | |
1042 } | |
1043 | |
1044 /* Convert B to a SPFP value and insert it as element zero in A. */ | |
1045 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1046 _mm_cvtsi32_ss (__m128 __A, int __B) | |
1047 { | |
1048 float temp = __B; | |
1049 __A[0] = temp; | |
1050 | |
1051 return __A; | |
1052 } | |
1053 | |
1054 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1055 _mm_cvt_si2ss (__m128 __A, int __B) | |
1056 { | |
1057 return _mm_cvtsi32_ss (__A, __B); | |
1058 } | |
1059 | |
1060 /* Convert B to a SPFP value and insert it as element zero in A. */ | |
1061 /* Intel intrinsic. */ | |
1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1063 _mm_cvtsi64_ss (__m128 __A, long long __B) | |
1064 { | |
1065 float temp = __B; | |
1066 __A[0] = temp; | |
1067 | |
1068 return __A; | |
1069 } | |
1070 | |
1071 /* Microsoft intrinsic. */ | |
1072 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1073 _mm_cvtsi64x_ss (__m128 __A, long long __B) | |
1074 { | |
1075 return _mm_cvtsi64_ss (__A, __B); | |
1076 } | |
1077 | |
1078 /* Convert the two 32-bit values in B to SPFP form and insert them | |
1079 as the two lower elements in A. */ | |
1080 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1081 _mm_cvtpi32_ps (__m128 __A, __m64 __B) | |
1082 { | |
1083 __vector signed int vm1; | |
1084 __vector float vf1; | |
1085 | |
1086 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B); | |
1087 vf1 = (__vector float) vec_ctf (vm1, 0); | |
1088 | |
1089 return ((__m128) (__vector __m64) | |
1090 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]}); | |
1091 } | |
1092 | |
1093 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1094 _mm_cvt_pi2ps (__m128 __A, __m64 __B) | |
1095 { | |
1096 return _mm_cvtpi32_ps (__A, __B); | |
1097 } | |
1098 | |
1099 /* Convert the four signed 16-bit values in A to SPFP form. */ | |
1100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1101 _mm_cvtpi16_ps (__m64 __A) | |
1102 { | |
1103 __vector signed short vs8; | |
1104 __vector signed int vi4; | |
1105 __vector float vf1; | |
1106 | |
1107 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A); | |
1108 vi4 = vec_vupklsh (vs8); | |
1109 vf1 = (__vector float) vec_ctf (vi4, 0); | |
1110 | |
1111 return (__m128) vf1; | |
1112 } | |
1113 | |
1114 /* Convert the four unsigned 16-bit values in A to SPFP form. */ | |
1115 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1116 _mm_cvtpu16_ps (__m64 __A) | |
1117 { | |
1118 const __vector unsigned short zero = | |
1119 { 0, 0, 0, 0, 0, 0, 0, 0 }; | |
1120 __vector unsigned short vs8; | |
1121 __vector unsigned int vi4; | |
1122 __vector float vf1; | |
1123 | |
1124 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A); | |
1125 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero); | |
1126 vf1 = (__vector float) vec_ctf (vi4, 0); | |
1127 | |
1128 return (__m128) vf1; | |
1129 } | |
1130 | |
1131 /* Convert the low four signed 8-bit values in A to SPFP form. */ | |
1132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1133 _mm_cvtpi8_ps (__m64 __A) | |
1134 { | |
1135 __vector signed char vc16; | |
1136 __vector signed short vs8; | |
1137 __vector signed int vi4; | |
1138 __vector float vf1; | |
1139 | |
1140 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A); | |
1141 vs8 = vec_vupkhsb (vc16); | |
1142 vi4 = vec_vupkhsh (vs8); | |
1143 vf1 = (__vector float) vec_ctf (vi4, 0); | |
1144 | |
1145 return (__m128) vf1; | |
1146 } | |
1147 | |
1148 /* Convert the low four unsigned 8-bit values in A to SPFP form. */ | |
1149 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1150 | |
1151 _mm_cvtpu8_ps (__m64 __A) | |
1152 { | |
1153 const __vector unsigned char zero = | |
1154 { 0, 0, 0, 0, 0, 0, 0, 0 }; | |
1155 __vector unsigned char vc16; | |
1156 __vector unsigned short vs8; | |
1157 __vector unsigned int vi4; | |
1158 __vector float vf1; | |
1159 | |
1160 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A); | |
1161 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero); | |
1162 vi4 = (__vector unsigned int) vec_vmrghh (vs8, | |
1163 (__vector unsigned short) zero); | |
1164 vf1 = (__vector float) vec_ctf (vi4, 0); | |
1165 | |
1166 return (__m128) vf1; | |
1167 } | |
1168 | |
1169 /* Convert the four signed 32-bit values in A and B to SPFP form. */ | |
1170 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1171 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) | |
1172 { | |
1173 __vector signed int vi4; | |
1174 __vector float vf4; | |
1175 | |
1176 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A); | |
1177 vf4 = (__vector float) vec_ctf (vi4, 0); | |
1178 return (__m128) vf4; | |
1179 } | |
1180 | |
1181 /* Convert the four SPFP values in A to four signed 16-bit integers. */ | |
1182 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1183 _mm_cvtps_pi16(__m128 __A) | |
1184 { | |
1185 __v4sf rounded; | |
1186 __vector signed int temp; | |
1187 __vector __m64 result; | |
1188 | |
1189 rounded = vec_rint(__A); | |
1190 temp = vec_cts (rounded, 0); | |
1191 result = (__vector __m64) vec_pack (temp, temp); | |
1192 | |
1193 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0)); | |
1194 } | |
1195 | |
1196 /* Convert the four SPFP values in A to four signed 8-bit integers. */ | |
1197 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1198 _mm_cvtps_pi8(__m128 __A) | |
1199 { | |
1200 __v4sf rounded; | |
1201 __vector signed int tmp_i; | |
1202 static const __vector signed int zero = {0, 0, 0, 0}; | |
1203 __vector signed short tmp_s; | |
1204 __vector signed char res_v; | |
1205 __m64 result; | |
1206 | |
1207 rounded = vec_rint(__A); | |
1208 tmp_i = vec_cts (rounded, 0); | |
1209 tmp_s = vec_pack (tmp_i, zero); | |
1210 res_v = vec_pack (tmp_s, tmp_s); | |
1211 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0); | |
1212 | |
1213 return (result); | |
1214 } | |
1215 | |
1216 /* Selects four specific SPFP values from A and B based on MASK. */ | |
1217 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1218 | |
1219 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) | |
1220 { | |
1221 unsigned long element_selector_10 = __mask & 0x03; | |
1222 unsigned long element_selector_32 = (__mask >> 2) & 0x03; | |
1223 unsigned long element_selector_54 = (__mask >> 4) & 0x03; | |
1224 unsigned long element_selector_76 = (__mask >> 6) & 0x03; | |
1225 static const unsigned int permute_selectors[4] = | |
1226 { | |
1227 #ifdef __LITTLE_ENDIAN__ | |
1228 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C | |
1229 #elif __BIG_ENDIAN__ | |
1230 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203 | |
1231 #endif | |
1232 }; | |
1233 __vector unsigned int t; | |
1234 | |
1235 #ifdef __LITTLE_ENDIAN__ | |
1236 t[0] = permute_selectors[element_selector_10]; | |
1237 t[1] = permute_selectors[element_selector_32]; | |
1238 t[2] = permute_selectors[element_selector_54] + 0x10101010; | |
1239 t[3] = permute_selectors[element_selector_76] + 0x10101010; | |
1240 #elif __BIG_ENDIAN__ | |
1241 t[3] = permute_selectors[element_selector_10] + 0x10101010; | |
1242 t[2] = permute_selectors[element_selector_32] + 0x10101010; | |
1243 t[1] = permute_selectors[element_selector_54]; | |
1244 t[0] = permute_selectors[element_selector_76]; | |
1245 #endif | |
1246 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); | |
1247 } | |
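/* Editor's note -- illustrative only, not part of the original header:
   the mask consumes two bits per result element; selector fields 1:0
   and 3:2 pick the two low result elements from __A, and fields 5:4
   and 7:6 pick the two high result elements from __B.  For example,
   a hypothetical call

     __m128 lo_pairs = _mm_shuffle_ps (a, b, 0x44);

   uses selectors 0, 1, 0, 1 (low to high) and yields
   { a[0], a[1], b[0], b[1] }, the same result as _mm_movelh_ps (a, b). */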
1248 | |
1249 /* Selects and interleaves the upper two SPFP values from A and B. */ | |
1250 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1251 _mm_unpackhi_ps (__m128 __A, __m128 __B) | |
1252 { | |
1253 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); | |
1254 } | |
1255 | |
1256 /* Selects and interleaves the lower two SPFP values from A and B. */ | |
1257 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1258 _mm_unpacklo_ps (__m128 __A, __m128 __B) | |
1259 { | |
1260 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); | |
1261 } | |
1262 | |
1263 /* Sets the upper two SPFP values with 64-bits of data loaded from P; | |
1264 the lower two values are passed through from A. */ | |
1265 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1266 _mm_loadh_pi (__m128 __A, __m64 const *__P) | |
1267 { | |
1268 __vector __m64 __a = (__vector __m64)__A; | |
1269 __vector __m64 __p = vec_splats(*__P); | |
1270 __a [1] = __p [1]; | |
1271 | |
1272 return (__m128)__a; | |
1273 } | |
1274 | |
1275 /* Stores the upper two SPFP values of A into P. */ | |
1276 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1277 _mm_storeh_pi (__m64 *__P, __m128 __A) | |
1278 { | |
1279 __vector __m64 __a = (__vector __m64) __A; | |
1280 | |
1281 *__P = __a[1]; | |
1282 } | |
1283 | |
1284 /* Moves the upper two values of B into the lower two values of A. */ | |
1285 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1286 _mm_movehl_ps (__m128 __A, __m128 __B) | |
1287 { | |
1288 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A); | |
1289 } | |
1290 | |
1291 /* Moves the lower two values of B into the upper two values of A. */ | |
1292 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1293 _mm_movelh_ps (__m128 __A, __m128 __B) | |
1294 { | |
1295 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B); | |
1296 } | |
1297 | |
1298 /* Sets the lower two SPFP values with 64-bits of data loaded from P; | |
1299 the upper two values are passed through from A. */ | |
1300 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1301 _mm_loadl_pi (__m128 __A, __m64 const *__P) | |
1302 { | |
1303 __vector __m64 __a = (__vector __m64)__A; | |
1304 __vector __m64 __p = vec_splats(*__P); | |
1305 __a [0] = __p [0]; | |
1306 | |
1307 return (__m128)__a; | |
1308 } | |
1309 | |
1310 /* Stores the lower two SPFP values of A into P. */ | |
1311 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1312 _mm_storel_pi (__m64 *__P, __m128 __A) | |
1313 { | |
1314 __vector __m64 __a = (__vector __m64) __A; | |
1315 | |
1316 *__P = __a[0]; | |
1317 } | |
1318 | |
1319 #ifdef _ARCH_PWR8 | |
1320 /* Intrinsic functions that require PowerISA 2.07 minimum. */ | |
1321 | |
1322 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ | |
1323 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1324 _mm_movemask_ps (__m128 __A) | |
1325 { | |
1326 __vector __m64 result; | |
1327 static const __vector unsigned int perm_mask = | |
1328 { | |
1329 #ifdef __LITTLE_ENDIAN__ | |
1330 0x00204060, 0x80808080, 0x80808080, 0x80808080 | |
1331 #elif __BIG_ENDIAN__ | |
1332 0x80808080, 0x80808080, 0x80808080, 0x00204060 | |
1333 #endif | |
1334 }; | |
1335 | |
1336 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A, | |
1337 (__vector unsigned char) perm_mask); | |
1338 | |
1339 #ifdef __LITTLE_ENDIAN__ | |
1340 return result[1]; | |
1341 #elif __BIG_ENDIAN__ | |
1342 return result[0]; | |
1343 #endif | |
1344 } | |
1345 #endif /* _ARCH_PWR8 */ | |
1346 | |
1347 /* Create a vector with all four elements equal to *P. */ | |
1348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1349 _mm_load1_ps (float const *__P) | |
1350 { | |
1351 return _mm_set1_ps (*__P); | |
1352 } | |
1353 | |
1354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1355 _mm_load_ps1 (float const *__P) | |
1356 { | |
1357 return _mm_load1_ps (__P); | |
1358 } | |
1359 | |
1360 /* Extracts one of the four words of A. The selector N must be immediate. */ | |
1361 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1362 _mm_extract_pi16 (__m64 const __A, int const __N) | |
1363 { | |
1364 const int shiftr = (__N & 3) * 16; | |
1365 | |
1366 return ((__A >> shiftr) & 0xffff); | |
1367 } | |
1368 | |
1369 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1370 _m_pextrw (__m64 const __A, int const __N) | |
1371 { | |
1372 return _mm_extract_pi16 (__A, __N); | |
1373 } | |
1374 | |
1375 /* Inserts word D into one of four words of A. The selector N must be | |
1376 immediate. */ | |
1377 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1378 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N) | |
1379 { | |
1380 const int shiftl = (__N & 3) * 16; | |
1381 const __m64 shiftD = (const __m64) __D << shiftl; | |
1382 const __m64 mask = 0xffffUL << shiftl; | |
1383 __m64 result = (__A & (~mask)) | (shiftD & mask); | |
1384 | |
1385 return (result); | |
1386 } | |
1387 | |
1388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
1389 _m_pinsrw (__m64 const __A, int const __D, int const __N) | |
1390 { | |
1391 return _mm_insert_pi16 (__A, __D, __N); | |
1392 } | |

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector bool short c;

  a = (__vector signed short)vec_splats (__A);
  b = (__vector signed short)vec_splats (__B);
  c = (__vector bool short)vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector bool char c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = (__vector bool char)vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
	((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
	    m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector bool short c;

  a = (__vector signed short)vec_splats (__A);
  b = (__vector signed short)vec_splats (__B);
  c = (__vector bool short)vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector bool char c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = (__vector bool char)vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
	((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
	    m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  unsigned long p = 0x0008101820283038UL;  /* Permute control for sign bits.  */

  return __builtin_bpermd (p, __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}
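
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0).  Bit I of the result is the most significant bit of byte I,
   counting from the least significant byte.  The function name and value
   are hypothetical.  */
#if 0
static inline int
__example_movemask_pi8 (void)
{
  /* Bytes 1, 3, 6 and 7 have their high bit set, so the result is
     0xCA (binary 11001010).  */
  __m64 __a = 0x80FF007F80008001UL;
  return _mm_movemask_pi8 (__a);
}
#endif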

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b;
  __vector unsigned short c;
  __vector unsigned int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);

  w0 = vec_vmuleuh (a, b);
  w1 = vec_vmulouh (a, b);
  c = (__vector unsigned short)vec_perm (w0, w1, xform1);

  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
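
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0).  Each 16-bit lane receives the upper half of the unsigned
   32-bit product, e.g. 0x8000 * 4 = 0x00020000, whose high 16 bits are
   0x0002.  The function name and values are hypothetical.  */
#if 0
static inline __m64
__example_mulhi_pu16 (void)
{
  __m64 __a = 0x8000400020001000UL;	/* Words 3..0 = 0x8000, 0x4000, 0x2000, 0x1000.  */
  __m64 __b = 0x0004000400040004UL;	/* Every word = 4.  */
  /* Result words 3..0 = 0x0002, 0x0001, 0x0000, 0x0000.  */
  return _mm_mulhi_pu16 (__a, __b);
}
#endif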

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long element_selector_10 = __N & 0x03;
  unsigned long element_selector_32 = (__N >> 2) & 0x03;
  unsigned long element_selector_54 = (__N >> 4) & 0x03;
  unsigned long element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union t;
  __vector __m64 a, p, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
  p = vec_splats (t.as_m64);
  a = vec_splats (__A);
  r = vec_perm (a, a, (__vector unsigned char)p);
  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
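
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0).  Each 2-bit field of the selector picks the source word for
   the corresponding result word.  The function name and values are
   hypothetical.  */
#if 0
static inline __m64
__example_shuffle_pi16 (void)
{
  __m64 __a = 0x0004000300020001UL;	/* Words 3..0 = 4, 3, 2, 1.  */
  /* Selector 0x1B == 0b00011011 reverses the word order, giving
     0x0001000200030004 (words 3..0 = 1, 2, 3, 4).  */
  return _mm_shuffle_pi16 (__a, 0x1B);
}
#endif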

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 hibit = 0x8080808080808080UL;
  __m64 mask, tmp;
  __m64 *p = (__m64*)__P;

  tmp = *p;
  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
  tmp = (tmp & (~mask)) | (__A & mask);
  *p = tmp;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
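
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0).  Only the bytes whose selector byte has its high bit set are
   written; the remaining destination bytes are left untouched.  The
   function name and values are hypothetical.  */
#if 0
static inline void
__example_maskmove_si64 (char *__buf)	/* Points to 8 writable, 8-byte-aligned bytes.  */
{
  __m64 __a = 0x1111111111111111UL;
  __m64 __n = 0xFF00FF00FF00FF00UL;	/* Select bytes 1, 3, 5 and 7.  */
  _mm_maskmove_si64 (__a, __n, __buf);	/* Those four bytes become 0x11.  */
}
#endif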

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = vec_avg (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);
  c = vec_avg (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
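
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0).  vec_avg matches the SSE rounding rule (a + b + 1) >> 1, i.e.
   ties round up.  The function name and values are hypothetical.  */
#if 0
static inline __m64
__example_avg_pu8 (void)
{
  __m64 __a = 0x0000000000000001UL;	/* Byte 0 = 1, all other bytes 0.  */
  __m64 __b = 0x0000000000000002UL;	/* Byte 0 = 2, all other bytes 0.  */
  /* (1 + 2 + 1) >> 1 == 2, so byte 0 of the result is 2.  */
  return _mm_avg_pu8 (__a, __b);
}
#endif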

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b;
  __vector unsigned char vmin, vmax, vabsdiff;
  __vector signed int vsum;
  const __vector unsigned int zero =
    { 0, 0, 0, 0 };
  unsigned short result;

  a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
  b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with integer result.  */
  vsum = vec_sums (vsum, (__vector signed int) zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  result = vsum[3];
  return (result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
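
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0): |1-4| + |2-2| + |3-1| plus five zero differences = 5, returned
   in the low 16-bit word of the result.  The function name and values
   are hypothetical.  */
#if 0
static inline __m64
__example_sad_pu8 (void)
{
  __m64 __a = 0x0000000000030201UL;	/* Bytes 2..0 = 3, 2, 1.  */
  __m64 __b = 0x0000000000010204UL;	/* Bytes 2..0 = 1, 2, 4.  */
  return _mm_sad_pu8 (__a, __b);	/* 5.  */
}
#endif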

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  _mm_store_ps (__P, __A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a light weight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
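
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0): a typical non-temporal store followed by a store fence, as
   ported x86 code usually pairs them.  The function name is hypothetical;
   __dst must be 16-byte aligned for _mm_stream_ps.  */
#if 0
static inline void
__example_stream_then_fence (float *__dst, __m128 __v)
{
  _mm_stream_ps (__dst, __v);	/* Hint that the cache line is transient.  */
  _mm_sfence ();		/* Order this store before any later stores.  */
}
#endif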

/* The execution of the next instruction is delayed by an
   implementation-specific amount of time.  The instruction does not
   modify the architectural state.  In the x86 header this intrinsic is
   placed after the pop_options pragma because it does not require SSE
   support in the processor; the encoding is a nop on processors that do
   not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PPR setting.  Since we don't
     know what PRI this thread is running at, we: 1) save the current
     PRI from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, and 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile (
    " mfppr %0;"
    " or 31,31,31;"
    " isync;"
    " lwsync;"
    " isync;"
    " mtppr %0;"
    : "=r" (__PPR)
    :
    : "memory"
    );
#else
  /* For older processors where we may not even have Program Priority
     controls we can only depend on Heavy Weight Sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}
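
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0): the spin-wait idiom that ported x86 code typically applies
   _mm_pause to, yielding priority while polling a flag.  The function
   name and parameter are hypothetical.  */
#if 0
static inline void
__example_spin_wait (volatile int *__flag)
{
  while (*__flag == 0)
    _mm_pause ();	/* Be polite to the other hardware threads.  */
}
#endif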

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
  __v4sf __t0 = vec_vmrghw (__r0, __r1); \
  __v4sf __t1 = vec_vmrghw (__r2, __r3); \
  __v4sf __t2 = vec_vmrglw (__r0, __r1); \
  __v4sf __t3 = vec_vmrglw (__r2, __r3); \
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
			       (__vector long long)__t1); \
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
			       (__vector long long)__t1); \
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
			       (__vector long long)__t3); \
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
			       (__vector long long)__t3); \
} while (0)
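
/* Illustrative usage sketch (not part of the original header; guarded by
   #if 0): after the macro runs, row I holds what was column I of the four
   input rows.  The function name and parameter are hypothetical.  */
#if 0
static inline void
__example_transpose (__m128 *__m)	/* __m points to four rows.  */
{
  __m128 __row0 = __m[0], __row1 = __m[1], __row2 = __m[2], __row3 = __m[3];
  _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);
  __m[0] = __row0; __m[1] = __row1; __m[2] = __row2; __m[3] = __row3;
}
#endif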

/* For backward source compatibility.  */
//# include <emmintrin.h>

#endif /* _XMMINTRIN_H_INCLUDED */