/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
   Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#ifndef __SSE__
# error "SSE instruction set not enabled"
#else

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
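
/* Usage sketch (illustrative, not part of the API): each argument is a
   two-bit element index packed into one byte, so for example

     _MM_SHUFFLE (3, 2, 1, 0) == 0xE4   -- the identity permutation
     _MM_SHUFFLE (0, 0, 0, 0) == 0x00   -- broadcast element 0

   and the result is suitable as the immediate selector of
   _mm_shuffle_ps and _mm_shuffle_pi16 below.  */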

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000
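
/* Layout sketch (illustrative): the groups above are disjoint bit-fields
   of the MXCSR -- exception flags in bits 0-5, exception masks in bits
   7-12, rounding control in bits 13-14, flush-to-zero in bit 15 -- so a
   complete control word can be composed by OR-ing one value from each
   group, e.g.

     _mm_setcsr (_MM_MASK_MASK | _MM_ROUND_TOWARD_ZERO | _MM_FLUSH_ZERO_ON);

   masks all exceptions, rounds toward zero and flushes denormal results
   to zero.  */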

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}
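
/* Lower-lane semantics sketch (illustrative; __a, __b and __r are
   hypothetical):

     __m128 __a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
     __m128 __b = _mm_set1_ps (10.0f);
     __m128 __r = _mm_add_ss (__a, __b);

   yields __r == { 11.0f, 2.0f, 3.0f, 4.0f } (element 0 first): only
   element 0 is added; elements 1-3 pass through from __a.  */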

/* Perform the respective operation on the four SPFP values in A and B.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpltss ((__v4sf) __B,
                                                                (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpless ((__v4sf) __B,
                                                                (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnltss ((__v4sf) __B,
                                                                 (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnless ((__v4sf) __B,
                                                                 (__v4sf) __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}
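
/* Selection idiom sketch (illustrative; __a, __b and __mask are
   hypothetical): the all-ones/all-zeros masks produced by the packed
   comparisons combine with the bit-wise operations above into a
   branchless per-element select:

     __m128 __mask = _mm_cmpgt_ps (__a, __b);
     __m128 __r = _mm_or_ps (_mm_and_ps (__mask, __a),
                             _mm_andnot_ps (__mask, __b));

   chooses each element from __a where __a > __b and from __b
   elsewhere.  */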

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), \
                                   (__v4sf)(__m128)(B), (int)(MASK)))
#endif
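
/* Usage sketch (illustrative; __a and __b are hypothetical): the two
   low result elements are selected from A and the two high ones from B,
   each by one two-bit field of MASK, e.g.

     __m128 __r = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (1, 0, 3, 2));

   yields __r == { __a[2], __a[3], __b[0], __b[1] } (element 0 first).  */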

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64 bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64 bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
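
/* Usage sketch (illustrative; __a and __b are hypothetical): bit N of
   the result is the sign bit of element N, so combined with a packed
   comparison the mask feeds ordinary integer control flow, e.g.

     int __all_less = _mm_movemask_ps (_mm_cmplt_ps (__a, __b)) == 0xf;

   sets __all_less when every element of __a is below the corresponding
   element of __b.  */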

/* Return the contents of the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
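
/* Usage sketch (illustrative; __saved is hypothetical): each _MM_SET_*
   helper read-modify-writes only its own MXCSR field, so

     unsigned int __saved = _MM_GET_ROUNDING_MODE ();
     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
     _MM_SET_ROUNDING_MODE (__saved);

   temporarily truncates without disturbing the exception or
   flush-to-zero settings.  */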

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), \
                                        (int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
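
/* Usage sketch (illustrative; __buf and __i are hypothetical): issued
   well before the data is needed, e.g.

     _mm_prefetch (&__buf[__i + 16], _MM_HINT_T0);

   requests the line holding __buf[__i + 16] into all cache levels;
   data used only once is better fetched with _MM_HINT_NTA to limit
   cache pollution.  */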

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
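
/* Usage sketch (illustrative; __dst, __v and __n are hypothetical):
   the non-temporal stores above are weakly ordered, so a streaming
   sequence should end with a fence before its results are published:

     for (unsigned int __i = 0; __i < __n; __i += 4)
       _mm_stream_ps (&__dst[__i], __v);
     _mm_sfence ();

   __dst must be 16-byte aligned for _mm_stream_ps.  */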

/* The execution of the next instruction is delayed by an implementation-
   specific amount of time.  The instruction does not modify the
   architectural state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \
  (row0) = __builtin_ia32_movlhps (__t0, __t1); \
  (row1) = __builtin_ia32_movhlps (__t1, __t0); \
  (row2) = __builtin_ia32_movlhps (__t2, __t3); \
  (row3) = __builtin_ia32_movhlps (__t3, __t2); \
} while (0)
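
/* Usage sketch (illustrative; __row0..__row3 are hypothetical __m128
   variables holding the four matrix rows): the macro transposes them
   in place,

     _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);

   after which element j of row i holds what was element i of row j.  */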

/* For backward source compatibility.  */
#ifdef __SSE2__
# include <emmintrin.h>
#endif

#endif /* __SSE__ */
#endif /* _XMMINTRIN_H_INCLUDED */