Mercurial > hg > CbC > CbC_gcc

comparison gcc/config/i386/avxintrin.h @ 0:a06113de4d67

changeset message: first commit

author    kent <kent@cr.ie.u-ryukyu.ac.jp>
date      Fri, 17 Jul 2009 14:47:48 +0900
parents   (none)
children  f6334be47118

compared revisions: -1:000000000000 -> 0:a06113de4d67
/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

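/* The __may_alias__ attribute above is what makes it valid to inspect these
   vectors through pointers to their element types.  A minimal sketch,
   assuming an AVX-enabled build (-mavx):

     __m256d v = _mm256_set1_pd (3.0);
     double *p = (double *) &v;    // allowed: __m256d may alias its scalars
     double first = p[0];          // 3.0
*/
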
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling) */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling) */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling) */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling) */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling) */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling) */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling) */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling) */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling) */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling) */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling) */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling) */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling) */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling) */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling) */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling) */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling) */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling) */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling) */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling) */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling) */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling) */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling) */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling) */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling) */
#define _CMP_TRUE_US 0x1f
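
/* A minimal usage sketch for the predicates above, assuming an AVX-enabled
   build (-mavx); the predicate is the immediate argument of the compare
   intrinsics defined further down in this file:

     __m256d a  = _mm256_set1_pd (1.0);
     __m256d b  = _mm256_set1_pd (2.0);
     __m256d lt = _mm256_cmp_pd (a, b, _CMP_LT_OS);  // all-ones lane where a < b
     int bits   = _mm256_movemask_pd (lt);           // 0xf here: every lane is true
*/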

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}


extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

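/* Intrinsics that take an immediate operand are provided in two forms below:
   inline functions when __OPTIMIZE__ is defined, where the constant argument
   is folded during optimization, and equivalent macros otherwise, so the
   operand still reaches the builtin as a literal constant in unoptimized
   builds.  */
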
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

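/* A minimal usage sketch for the constant-mask blend, assuming -mavx: bit i
   of the immediate selects element i of the second operand, a clear bit
   keeps the first operand.

     __m256d x = _mm256_set1_pd (1.0);
     __m256d y = _mm256_set1_pd (2.0);
     __m256d r = _mm256_blend_pd (x, y, 0x5);  // lanes 0 and 2 from y: {2,1,2,1}
*/
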
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif

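/* A minimal usage sketch for _mm256_dp_ps, assuming -mavx: the operation is
   applied to each 128-bit lane separately, the high nibble of the mask
   selects which products enter the sum and the low nibble selects which
   result elements receive it (the others are zeroed).

     __m256 a = _mm256_set1_ps (1.0f);
     __m256 b = _mm256_set1_ps (2.0f);
     __m256 d = _mm256_dp_ps (a, b, 0xf1);  // element 0 of each lane is 8.0f
*/
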
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif

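/* A minimal usage sketch for permute2f128, assuming -mavx: the control byte
   selects whole 128-bit lanes, the low nibble for the low result lane and
   the high nibble for the high one (values 0/1 pick lanes of the first
   operand, 2/3 lanes of the second, bit 3 of a nibble zeroes that lane).
   Given __m256d values x and y:

     __m256d lo = _mm256_permute2f128_pd (x, y, 0x20);  // { low lane of x, low lane of y }
*/
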
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif

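/* Loads and stores.  The aligned forms below (_mm256_load_pd and friends)
   require 32-byte aligned addresses; the unaligned forms (_mm256_loadu_pd
   and friends) accept any address.  */
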
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

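/* Masked loads and stores: element i is transferred only when the sign
   (most significant) bit of mask element i is set; masked-off elements are
   zeroed on a load and left untouched in memory on a store.  */
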
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128d __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256d __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128 __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256 __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
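
/* A minimal usage sketch, assuming -mavx: _mm256_set_pd takes its arguments
   from the highest element down, so the last argument ends up in element 0,
   the lowest memory address when the vector is stored.

     __m256d v = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     double out[4];
     _mm256_storeu_pd (out, v);   // out == { 1.0, 2.0, 3.0, 4.0 }
*/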

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

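/* A minimal sketch, assuming -mavx: when the upper half must be defined,
   widen with a cast and then fill the high lane explicitly.

     __m128 lo   = _mm_set1_ps (1.0f);
     __m256 wide = _mm256_insertf128_ps (_mm256_castps128_ps256 (lo),
                                         _mm_setzero_ps (), 1);  // high lane zeroed
*/
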
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}