Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/i386/xmmintrin.h @ 0:a06113de4d67
first commit
author | kent <kent@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 17 Jul 2009 14:47:48 +0900 |
parents | |
children | f6334be47118 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a06113de4d67 |
---|---|
1 /* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 | |
2 Free Software Foundation, Inc. | |
3 | |
4 This file is part of GCC. | |
5 | |
6 GCC is free software; you can redistribute it and/or modify | |
7 it under the terms of the GNU General Public License as published by | |
8 the Free Software Foundation; either version 3, or (at your option) | |
9 any later version. | |
10 | |
11 GCC is distributed in the hope that it will be useful, | |
12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 GNU General Public License for more details. | |
15 | |
16 Under Section 7 of GPL version 3, you are granted additional | |
17 permissions described in the GCC Runtime Library Exception, version | |
18 3.1, as published by the Free Software Foundation. | |
19 | |
20 You should have received a copy of the GNU General Public License and | |
21 a copy of the GCC Runtime Library Exception along with this program; | |
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 <http://www.gnu.org/licenses/>. */ | |
24 | |
25 /* Implemented from the specification included in the Intel C++ Compiler | |
26 User Guide and Reference, version 9.0. */ | |
27 | |
28 #ifndef _XMMINTRIN_H_INCLUDED | |
29 #define _XMMINTRIN_H_INCLUDED | |
30 | |
31 #ifndef __SSE__ | |
32 # error "SSE instruction set not enabled" | |
33 #else | |
34 | |
35 /* We need type definitions from the MMX header file. */ | |
36 #include <mmintrin.h> | |
37 | |
38 /* Get _mm_malloc () and _mm_free (). */ | |
39 #include <mm_malloc.h> | |
40 | |
41 /* The Intel API is flexible enough that we must allow aliasing with other | |
42 vector types, and their scalar components. */ | |
43 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); | |
44 | |
45 /* Internal data types for implementing the intrinsics. */ | |
46 typedef float __v4sf __attribute__ ((__vector_size__ (16))); | |
47 | |
/* Create a selector for use with the SHUFPS instruction.  Each of the
   four two-bit fields selects a source element (0..3); FP3 is the most
   significant field.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,	/* Prefetch into all cache levels.  */
  _MM_HINT_T1 = 2,	/* Prefetch into L2 and higher.  */
  _MM_HINT_T2 = 1,	/* Prefetch into L3 and higher.  */
  _MM_HINT_NTA = 0	/* Non-temporal: minimize cache pollution.  */
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000
87 | |
88 /* Create a vector of zeros. */ | |
89 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
90 _mm_setzero_ps (void) | |
91 { | |
92 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; | |
93 } | |
94 | |
95 /* Perform the respective operation on the lower SPFP (single-precision | |
96 floating-point) values of A and B; the upper three SPFP values are | |
97 passed through from A. */ | |
98 | |
99 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
100 _mm_add_ss (__m128 __A, __m128 __B) | |
101 { | |
102 return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); | |
103 } | |
104 | |
105 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
106 _mm_sub_ss (__m128 __A, __m128 __B) | |
107 { | |
108 return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); | |
109 } | |
110 | |
111 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
112 _mm_mul_ss (__m128 __A, __m128 __B) | |
113 { | |
114 return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); | |
115 } | |
116 | |
117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
118 _mm_div_ss (__m128 __A, __m128 __B) | |
119 { | |
120 return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); | |
121 } | |
122 | |
123 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
124 _mm_sqrt_ss (__m128 __A) | |
125 { | |
126 return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); | |
127 } | |
128 | |
129 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
130 _mm_rcp_ss (__m128 __A) | |
131 { | |
132 return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); | |
133 } | |
134 | |
135 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
136 _mm_rsqrt_ss (__m128 __A) | |
137 { | |
138 return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); | |
139 } | |
140 | |
141 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
142 _mm_min_ss (__m128 __A, __m128 __B) | |
143 { | |
144 return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); | |
145 } | |
146 | |
147 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
148 _mm_max_ss (__m128 __A, __m128 __B) | |
149 { | |
150 return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); | |
151 } | |
152 | |
153 /* Perform the respective operation on the four SPFP values in A and B. */ | |
154 | |
155 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
156 _mm_add_ps (__m128 __A, __m128 __B) | |
157 { | |
158 return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); | |
159 } | |
160 | |
161 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
162 _mm_sub_ps (__m128 __A, __m128 __B) | |
163 { | |
164 return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); | |
165 } | |
166 | |
167 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
168 _mm_mul_ps (__m128 __A, __m128 __B) | |
169 { | |
170 return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); | |
171 } | |
172 | |
173 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
174 _mm_div_ps (__m128 __A, __m128 __B) | |
175 { | |
176 return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); | |
177 } | |
178 | |
179 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
180 _mm_sqrt_ps (__m128 __A) | |
181 { | |
182 return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); | |
183 } | |
184 | |
185 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
186 _mm_rcp_ps (__m128 __A) | |
187 { | |
188 return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); | |
189 } | |
190 | |
191 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
192 _mm_rsqrt_ps (__m128 __A) | |
193 { | |
194 return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); | |
195 } | |
196 | |
197 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
198 _mm_min_ps (__m128 __A, __m128 __B) | |
199 { | |
200 return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); | |
201 } | |
202 | |
203 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
204 _mm_max_ps (__m128 __A, __m128 __B) | |
205 { | |
206 return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); | |
207 } | |
208 | |
209 /* Perform logical bit-wise operations on 128-bit values. */ | |
210 | |
211 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
212 _mm_and_ps (__m128 __A, __m128 __B) | |
213 { | |
214 return __builtin_ia32_andps (__A, __B); | |
215 } | |
216 | |
217 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
218 _mm_andnot_ps (__m128 __A, __m128 __B) | |
219 { | |
220 return __builtin_ia32_andnps (__A, __B); | |
221 } | |
222 | |
223 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
224 _mm_or_ps (__m128 __A, __m128 __B) | |
225 { | |
226 return __builtin_ia32_orps (__A, __B); | |
227 } | |
228 | |
229 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
230 _mm_xor_ps (__m128 __A, __m128 __B) | |
231 { | |
232 return __builtin_ia32_xorps (__A, __B); | |
233 } | |
234 | |
235 /* Perform a comparison on the lower SPFP values of A and B. If the | |
236 comparison is true, place a mask of all ones in the result, otherwise a | |
237 mask of zeros. The upper three SPFP values are passed through from A. */ | |
238 | |
239 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
240 _mm_cmpeq_ss (__m128 __A, __m128 __B) | |
241 { | |
242 return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); | |
243 } | |
244 | |
245 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
246 _mm_cmplt_ss (__m128 __A, __m128 __B) | |
247 { | |
248 return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); | |
249 } | |
250 | |
251 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
252 _mm_cmple_ss (__m128 __A, __m128 __B) | |
253 { | |
254 return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); | |
255 } | |
256 | |
257 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
258 _mm_cmpgt_ss (__m128 __A, __m128 __B) | |
259 { | |
260 return (__m128) __builtin_ia32_movss ((__v4sf) __A, | |
261 (__v4sf) | |
262 __builtin_ia32_cmpltss ((__v4sf) __B, | |
263 (__v4sf) | |
264 __A)); | |
265 } | |
266 | |
267 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
268 _mm_cmpge_ss (__m128 __A, __m128 __B) | |
269 { | |
270 return (__m128) __builtin_ia32_movss ((__v4sf) __A, | |
271 (__v4sf) | |
272 __builtin_ia32_cmpless ((__v4sf) __B, | |
273 (__v4sf) | |
274 __A)); | |
275 } | |
276 | |
277 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
278 _mm_cmpneq_ss (__m128 __A, __m128 __B) | |
279 { | |
280 return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); | |
281 } | |
282 | |
283 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
284 _mm_cmpnlt_ss (__m128 __A, __m128 __B) | |
285 { | |
286 return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); | |
287 } | |
288 | |
289 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
290 _mm_cmpnle_ss (__m128 __A, __m128 __B) | |
291 { | |
292 return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); | |
293 } | |
294 | |
295 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
296 _mm_cmpngt_ss (__m128 __A, __m128 __B) | |
297 { | |
298 return (__m128) __builtin_ia32_movss ((__v4sf) __A, | |
299 (__v4sf) | |
300 __builtin_ia32_cmpnltss ((__v4sf) __B, | |
301 (__v4sf) | |
302 __A)); | |
303 } | |
304 | |
305 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
306 _mm_cmpnge_ss (__m128 __A, __m128 __B) | |
307 { | |
308 return (__m128) __builtin_ia32_movss ((__v4sf) __A, | |
309 (__v4sf) | |
310 __builtin_ia32_cmpnless ((__v4sf) __B, | |
311 (__v4sf) | |
312 __A)); | |
313 } | |
314 | |
315 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
316 _mm_cmpord_ss (__m128 __A, __m128 __B) | |
317 { | |
318 return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); | |
319 } | |
320 | |
321 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
322 _mm_cmpunord_ss (__m128 __A, __m128 __B) | |
323 { | |
324 return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); | |
325 } | |
326 | |
327 /* Perform a comparison on the four SPFP values of A and B. For each | |
328 element, if the comparison is true, place a mask of all ones in the | |
329 result, otherwise a mask of zeros. */ | |
330 | |
331 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
332 _mm_cmpeq_ps (__m128 __A, __m128 __B) | |
333 { | |
334 return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); | |
335 } | |
336 | |
337 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
338 _mm_cmplt_ps (__m128 __A, __m128 __B) | |
339 { | |
340 return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); | |
341 } | |
342 | |
343 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
344 _mm_cmple_ps (__m128 __A, __m128 __B) | |
345 { | |
346 return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); | |
347 } | |
348 | |
349 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
350 _mm_cmpgt_ps (__m128 __A, __m128 __B) | |
351 { | |
352 return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); | |
353 } | |
354 | |
355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
356 _mm_cmpge_ps (__m128 __A, __m128 __B) | |
357 { | |
358 return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); | |
359 } | |
360 | |
361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
362 _mm_cmpneq_ps (__m128 __A, __m128 __B) | |
363 { | |
364 return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); | |
365 } | |
366 | |
367 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
368 _mm_cmpnlt_ps (__m128 __A, __m128 __B) | |
369 { | |
370 return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); | |
371 } | |
372 | |
373 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
374 _mm_cmpnle_ps (__m128 __A, __m128 __B) | |
375 { | |
376 return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); | |
377 } | |
378 | |
379 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
380 _mm_cmpngt_ps (__m128 __A, __m128 __B) | |
381 { | |
382 return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); | |
383 } | |
384 | |
385 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
386 _mm_cmpnge_ps (__m128 __A, __m128 __B) | |
387 { | |
388 return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); | |
389 } | |
390 | |
391 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
392 _mm_cmpord_ps (__m128 __A, __m128 __B) | |
393 { | |
394 return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); | |
395 } | |
396 | |
397 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
398 _mm_cmpunord_ps (__m128 __A, __m128 __B) | |
399 { | |
400 return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); | |
401 } | |
402 | |
403 /* Compare the lower SPFP values of A and B and return 1 if true | |
404 and 0 if false. */ | |
405 | |
406 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
407 _mm_comieq_ss (__m128 __A, __m128 __B) | |
408 { | |
409 return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); | |
410 } | |
411 | |
412 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
413 _mm_comilt_ss (__m128 __A, __m128 __B) | |
414 { | |
415 return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); | |
416 } | |
417 | |
418 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
419 _mm_comile_ss (__m128 __A, __m128 __B) | |
420 { | |
421 return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); | |
422 } | |
423 | |
424 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
425 _mm_comigt_ss (__m128 __A, __m128 __B) | |
426 { | |
427 return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); | |
428 } | |
429 | |
430 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
431 _mm_comige_ss (__m128 __A, __m128 __B) | |
432 { | |
433 return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); | |
434 } | |
435 | |
436 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
437 _mm_comineq_ss (__m128 __A, __m128 __B) | |
438 { | |
439 return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); | |
440 } | |
441 | |
442 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
443 _mm_ucomieq_ss (__m128 __A, __m128 __B) | |
444 { | |
445 return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); | |
446 } | |
447 | |
448 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
449 _mm_ucomilt_ss (__m128 __A, __m128 __B) | |
450 { | |
451 return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); | |
452 } | |
453 | |
454 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
455 _mm_ucomile_ss (__m128 __A, __m128 __B) | |
456 { | |
457 return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); | |
458 } | |
459 | |
460 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
461 _mm_ucomigt_ss (__m128 __A, __m128 __B) | |
462 { | |
463 return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); | |
464 } | |
465 | |
466 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
467 _mm_ucomige_ss (__m128 __A, __m128 __B) | |
468 { | |
469 return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); | |
470 } | |
471 | |
472 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
473 _mm_ucomineq_ss (__m128 __A, __m128 __B) | |
474 { | |
475 return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); | |
476 } | |
477 | |
478 /* Convert the lower SPFP value to a 32-bit integer according to the current | |
479 rounding mode. */ | |
480 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
481 _mm_cvtss_si32 (__m128 __A) | |
482 { | |
483 return __builtin_ia32_cvtss2si ((__v4sf) __A); | |
484 } | |
485 | |
486 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
487 _mm_cvt_ss2si (__m128 __A) | |
488 { | |
489 return _mm_cvtss_si32 (__A); | |
490 } | |
491 | |
492 #ifdef __x86_64__ | |
493 /* Convert the lower SPFP value to a 32-bit integer according to the | |
494 current rounding mode. */ | |
495 | |
496 /* Intel intrinsic. */ | |
497 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
498 _mm_cvtss_si64 (__m128 __A) | |
499 { | |
500 return __builtin_ia32_cvtss2si64 ((__v4sf) __A); | |
501 } | |
502 | |
503 /* Microsoft intrinsic. */ | |
504 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
505 _mm_cvtss_si64x (__m128 __A) | |
506 { | |
507 return __builtin_ia32_cvtss2si64 ((__v4sf) __A); | |
508 } | |
509 #endif | |
510 | |
511 /* Convert the two lower SPFP values to 32-bit integers according to the | |
512 current rounding mode. Return the integers in packed form. */ | |
513 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
514 _mm_cvtps_pi32 (__m128 __A) | |
515 { | |
516 return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); | |
517 } | |
518 | |
519 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
520 _mm_cvt_ps2pi (__m128 __A) | |
521 { | |
522 return _mm_cvtps_pi32 (__A); | |
523 } | |
524 | |
525 /* Truncate the lower SPFP value to a 32-bit integer. */ | |
526 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
527 _mm_cvttss_si32 (__m128 __A) | |
528 { | |
529 return __builtin_ia32_cvttss2si ((__v4sf) __A); | |
530 } | |
531 | |
532 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
533 _mm_cvtt_ss2si (__m128 __A) | |
534 { | |
535 return _mm_cvttss_si32 (__A); | |
536 } | |
537 | |
538 #ifdef __x86_64__ | |
539 /* Truncate the lower SPFP value to a 32-bit integer. */ | |
540 | |
541 /* Intel intrinsic. */ | |
542 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
543 _mm_cvttss_si64 (__m128 __A) | |
544 { | |
545 return __builtin_ia32_cvttss2si64 ((__v4sf) __A); | |
546 } | |
547 | |
548 /* Microsoft intrinsic. */ | |
549 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
550 _mm_cvttss_si64x (__m128 __A) | |
551 { | |
552 return __builtin_ia32_cvttss2si64 ((__v4sf) __A); | |
553 } | |
554 #endif | |
555 | |
556 /* Truncate the two lower SPFP values to 32-bit integers. Return the | |
557 integers in packed form. */ | |
558 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
559 _mm_cvttps_pi32 (__m128 __A) | |
560 { | |
561 return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); | |
562 } | |
563 | |
564 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
565 _mm_cvtt_ps2pi (__m128 __A) | |
566 { | |
567 return _mm_cvttps_pi32 (__A); | |
568 } | |
569 | |
570 /* Convert B to a SPFP value and insert it as element zero in A. */ | |
571 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
572 _mm_cvtsi32_ss (__m128 __A, int __B) | |
573 { | |
574 return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); | |
575 } | |
576 | |
577 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
578 _mm_cvt_si2ss (__m128 __A, int __B) | |
579 { | |
580 return _mm_cvtsi32_ss (__A, __B); | |
581 } | |
582 | |
583 #ifdef __x86_64__ | |
584 /* Convert B to a SPFP value and insert it as element zero in A. */ | |
585 | |
586 /* Intel intrinsic. */ | |
587 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
588 _mm_cvtsi64_ss (__m128 __A, long long __B) | |
589 { | |
590 return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); | |
591 } | |
592 | |
593 /* Microsoft intrinsic. */ | |
594 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
595 _mm_cvtsi64x_ss (__m128 __A, long long __B) | |
596 { | |
597 return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); | |
598 } | |
599 #endif | |
600 | |
601 /* Convert the two 32-bit values in B to SPFP form and insert them | |
602 as the two lower elements in A. */ | |
603 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
604 _mm_cvtpi32_ps (__m128 __A, __m64 __B) | |
605 { | |
606 return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); | |
607 } | |
608 | |
609 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
610 _mm_cvt_pi2ps (__m128 __A, __m64 __B) | |
611 { | |
612 return _mm_cvtpi32_ps (__A, __B); | |
613 } | |
614 | |
615 /* Convert the four signed 16-bit values in A to SPFP form. */ | |
616 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
617 _mm_cvtpi16_ps (__m64 __A) | |
618 { | |
619 __v4hi __sign; | |
620 __v2si __hisi, __losi; | |
621 __v4sf __zero, __ra, __rb; | |
622 | |
623 /* This comparison against zero gives us a mask that can be used to | |
624 fill in the missing sign bits in the unpack operations below, so | |
625 that we get signed values after unpacking. */ | |
626 __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A); | |
627 | |
628 /* Convert the four words to doublewords. */ | |
629 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); | |
630 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); | |
631 | |
632 /* Convert the doublewords to floating point two at a time. */ | |
633 __zero = (__v4sf) _mm_setzero_ps (); | |
634 __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi); | |
635 __rb = __builtin_ia32_cvtpi2ps (__ra, __losi); | |
636 | |
637 return (__m128) __builtin_ia32_movlhps (__ra, __rb); | |
638 } | |
639 | |
640 /* Convert the four unsigned 16-bit values in A to SPFP form. */ | |
641 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
642 _mm_cvtpu16_ps (__m64 __A) | |
643 { | |
644 __v2si __hisi, __losi; | |
645 __v4sf __zero, __ra, __rb; | |
646 | |
647 /* Convert the four words to doublewords. */ | |
648 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL); | |
649 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL); | |
650 | |
651 /* Convert the doublewords to floating point two at a time. */ | |
652 __zero = (__v4sf) _mm_setzero_ps (); | |
653 __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi); | |
654 __rb = __builtin_ia32_cvtpi2ps (__ra, __losi); | |
655 | |
656 return (__m128) __builtin_ia32_movlhps (__ra, __rb); | |
657 } | |
658 | |
659 /* Convert the low four signed 8-bit values in A to SPFP form. */ | |
660 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
661 _mm_cvtpi8_ps (__m64 __A) | |
662 { | |
663 __v8qi __sign; | |
664 | |
665 /* This comparison against zero gives us a mask that can be used to | |
666 fill in the missing sign bits in the unpack operations below, so | |
667 that we get signed values after unpacking. */ | |
668 __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A); | |
669 | |
670 /* Convert the four low bytes to words. */ | |
671 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); | |
672 | |
673 return _mm_cvtpi16_ps(__A); | |
674 } | |
675 | |
676 /* Convert the low four unsigned 8-bit values in A to SPFP form. */ | |
677 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
678 _mm_cvtpu8_ps(__m64 __A) | |
679 { | |
680 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL); | |
681 return _mm_cvtpu16_ps(__A); | |
682 } | |
683 | |
684 /* Convert the four signed 32-bit values in A and B to SPFP form. */ | |
685 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
686 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) | |
687 { | |
688 __v4sf __zero = (__v4sf) _mm_setzero_ps (); | |
689 __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); | |
690 __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B); | |
691 return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); | |
692 } | |
693 | |
694 /* Convert the four SPFP values in A to four signed 16-bit integers. */ | |
695 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
696 _mm_cvtps_pi16(__m128 __A) | |
697 { | |
698 __v4sf __hisf = (__v4sf)__A; | |
699 __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); | |
700 __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); | |
701 __v2si __losi = __builtin_ia32_cvtps2pi (__losf); | |
702 return (__m64) __builtin_ia32_packssdw (__hisi, __losi); | |
703 } | |
704 | |
705 /* Convert the four SPFP values in A to four signed 8-bit integers. */ | |
706 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
707 _mm_cvtps_pi8(__m128 __A) | |
708 { | |
709 __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); | |
710 return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL); | |
711 } | |
712 | |
/* Selects four specific SPFP values from A and B based on MASK.  The
   macro form is used when not optimizing because the builtin requires
   an immediate (compile-time constant) mask.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)					\
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),			\
				   (__v4sf)(__m128)(B), (int)(MASK)))
#endif
725 | |
726 /* Selects and interleaves the upper two SPFP values from A and B. */ | |
727 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
728 _mm_unpackhi_ps (__m128 __A, __m128 __B) | |
729 { | |
730 return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); | |
731 } | |
732 | |
733 /* Selects and interleaves the lower two SPFP values from A and B. */ | |
734 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
735 _mm_unpacklo_ps (__m128 __A, __m128 __B) | |
736 { | |
737 return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); | |
738 } | |
739 | |
740 /* Sets the upper two SPFP values with 64-bits of data loaded from P; | |
741 the lower two values are passed through from A. */ | |
742 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
743 _mm_loadh_pi (__m128 __A, __m64 const *__P) | |
744 { | |
745 return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P); | |
746 } | |
747 | |
748 /* Stores the upper two SPFP values of A into P. */ | |
749 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
750 _mm_storeh_pi (__m64 *__P, __m128 __A) | |
751 { | |
752 __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A); | |
753 } | |
754 | |
755 /* Moves the upper two values of B into the lower two values of A. */ | |
756 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
757 _mm_movehl_ps (__m128 __A, __m128 __B) | |
758 { | |
759 return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); | |
760 } | |
761 | |
762 /* Moves the lower two values of B into the upper two values of A. */ | |
763 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
764 _mm_movelh_ps (__m128 __A, __m128 __B) | |
765 { | |
766 return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); | |
767 } | |
768 | |
769 /* Sets the lower two SPFP values with 64-bits of data loaded from P; | |
770 the upper two values are passed through from A. */ | |
771 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
772 _mm_loadl_pi (__m128 __A, __m64 const *__P) | |
773 { | |
774 return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P); | |
775 } | |
776 | |
777 /* Stores the lower two SPFP values of A into P. */ | |
778 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
779 _mm_storel_pi (__m64 *__P, __m128 __A) | |
780 { | |
781 __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A); | |
782 } | |
783 | |
/* Create a 4-bit mask from the sign (most significant) bits of the four
   SPFP values in A (MOVMSKPS); bit i of the result comes from element i.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
790 | |
/* Return the contents of the MXCSR control/status register (STMXCSR).  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}
797 | |
/* Read the sticky exception status flags from the MXCSR register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

/* Read the exception mask bits from the MXCSR register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

/* Read the rounding-mode field from the MXCSR register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

/* Read the flush-to-zero mode bit from the MXCSR register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}
822 | |
/* Set the MXCSR control/status register to I (LDMXCSR).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}
829 | |
/* Replace the exception status flags in the MXCSR register with MASK;
   all other MXCSR fields are preserved.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

/* Replace the exception mask bits in the MXCSR register with MASK.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

/* Replace the rounding-mode field in the MXCSR register with MODE.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

/* Replace the flush-to-zero mode bit in the MXCSR register with MODE.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
854 | |
/* Create a vector with element 0 set to F and the other three elements
   zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}
861 | |
/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

/* Alternate (legacy) name for _mm_set1_ps.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}
874 | |
/* Create a vector with element 0 set to *P and the other three elements
   zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

/* Alternate (legacy) name for _mm_load1_ps.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}
894 | |
/* Load four SPFP values from P.  The address must be 16-byte aligned;
   a misaligned address faults at runtime.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}
901 | |
/* Load four SPFP values from P.  The address need not be aligned
   (MOVUPS).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}
908 | |
909 /* Load four SPFP values in reverse order. The address must be aligned. */ | |
910 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
911 _mm_loadr_ps (float const *__P) | |
912 { | |
913 __v4sf __tmp = *(__v4sf *)__P; | |
914 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); | |
915 } | |
916 | |
/* Create the vector [Z Y X W], i.e. W is element 0 (lowest) and Z is
   element 3 (highest).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z]: like _mm_set_ps but with the arguments
   taken in memory (reversed) order, so Z lands in element 0.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
930 | |
/* Store the lowest SPFP value of A to *P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Return the lowest SPFP value of A as a scalar float.  */
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}
943 | |
/* Store four SPFP values to P.  The address must be 16-byte aligned;
   a misaligned address faults at runtime.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values to P.  The address need not be aligned
   (MOVUPS).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}
957 | |
958 /* Store the lower SPFP value across four words. */ | |
959 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
960 _mm_store1_ps (float *__P, __m128 __A) | |
961 { | |
962 __v4sf __va = (__v4sf)__A; | |
963 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); | |
964 _mm_storeu_ps (__P, __tmp); | |
965 } | |
966 | |
967 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
968 _mm_store_ps1 (float *__P, __m128 __A) | |
969 { | |
970 _mm_store1_ps (__P, __A); | |
971 } | |
972 | |
973 /* Store four SPFP values in reverse order. The address must be aligned. */ | |
974 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
975 _mm_storer_ps (float *__P, __m128 __A) | |
976 { | |
977 __v4sf __va = (__v4sf)__A; | |
978 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); | |
979 _mm_store_ps (__P, __tmp); | |
980 } | |
981 | |
/* Set the lowest SPFP value of the result from the lowest value of B;
   the upper three values come from A (MOVSS).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}
988 | |
/* Extract one of the four 16-bit words of A.  The selector N must be an
   immediate (compile-time constant).  Without optimization the inline
   function is not folded, so a macro form is used instead.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

/* Alternate (legacy PEXTRW) name for _mm_extract_pi16.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)	\
  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif
1008 | |
/* Insert word D into one of the four 16-bit words of A.  The selector N
   must be an immediate (compile-time constant); a macro form is used when
   not optimizing, as for _mm_extract_pi16.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

/* Alternate (legacy PINSRW) name for _mm_insert_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)				\
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),	\
					(int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif
1030 | |
/* Compute the element-wise maximum of signed 16-bit values (PMAXSW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

/* Alternate (legacy) name for _mm_max_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}
1043 | |
/* Compute the element-wise maximum of unsigned 8-bit values (PMAXUB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

/* Alternate (legacy) name for _mm_max_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}
1056 | |
/* Compute the element-wise minimum of signed 16-bit values (PMINSW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

/* Alternate (legacy) name for _mm_min_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}
1069 | |
/* Compute the element-wise minimum of unsigned 8-bit values (PMINUB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

/* Alternate (legacy) name for _mm_min_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}
1082 | |
/* Create an 8-bit mask from the sign bits of the eight 8-bit values in A
   (PMOVMSKB).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

/* Alternate (legacy) name for _mm_movemask_pi8.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}
1095 | |
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of each 32-bit product (PMULHUW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

/* Alternate (legacy) name for _mm_mulhi_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
1109 | |
/* Return a combination of the four 16-bit values in A selected by N
   (PSHUFW).  The selector N must be an immediate; a macro form is used
   when not optimizing so the constant reaches the builtin.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

/* Alternate (legacy) name for _mm_shuffle_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif
1130 | |
/* Conditionally store byte elements of A to P (MASKMOVQ).  The high bit
   of each byte in the selector N determines whether the corresponding
   byte from A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

/* Alternate (legacy) name for _mm_maskmove_si64.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
1145 | |
/* Compute the rounded averages of the unsigned 8-bit values in A and B
   (PAVGB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

/* Alternate (legacy) name for _mm_avg_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}
1158 | |
/* Compute the rounded averages of the unsigned 16-bit values in A and B
   (PAVGW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

/* Alternate (legacy) name for _mm_avg_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
1171 | |
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B (PSADBW).  The sum is returned in the lower 16-bit
   word; the upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

/* Alternate (legacy) name for _mm_sad_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
1186 | |
/* Load one cache line from address P to a location "closer" to the
   processor.  The selector I (an _mm_hint value) specifies the type of
   prefetch operation; it must be a compile-time constant, hence the
   macro fallback when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
1199 | |
/* Store the data in A to the address P without polluting the caches
   (non-temporal MOVNTQ).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}
1206 | |
/* Likewise, for four SPFP values (non-temporal MOVNTPS).  The address
   must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}
1213 | |
/* Guarantee that every preceding store is globally visible before any
   subsequent store (SFENCE).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
1221 | |
/* Delay the execution of the next instruction by an implementation-
   specific amount of time (PAUSE, encoded as "rep; nop" so it is a
   plain NOP on pre-SSE2 processors).  Does not modify the architectural
   state; typically used in spin-wait loops.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}
1230 | |
/* Transpose, in place, the 4x4 matrix whose rows are ROW0..ROW3.
   Uses two unpack-low/unpack-high passes followed by movlhps/movhlps
   merges; each argument is evaluated once into a local first, so the
   macro is safe for simple lvalue arguments.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);			\
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);			\
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);			\
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);			\
  (row0) = __builtin_ia32_movlhps (__t0, __t1);				\
  (row1) = __builtin_ia32_movhlps (__t1, __t0);				\
  (row2) = __builtin_ia32_movlhps (__t2, __t3);				\
  (row3) = __builtin_ia32_movhlps (__t3, __t2);				\
} while (0)
1244 | |
1245 /* For backward source compatibility. */ | |
1246 #ifdef __SSE2__ | |
1247 # include <emmintrin.h> | |
1248 #endif | |
1249 | |
1250 #endif /* __SSE__ */ | |
1251 #endif /* _XMMINTRIN_H_INCLUDED */ |