0
|
1 /* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
|
|
2
|
|
3 This file is part of GCC.
|
|
4
|
|
5 GCC is free software; you can redistribute it and/or modify
|
|
6 it under the terms of the GNU General Public License as published by
|
|
7 the Free Software Foundation; either version 3, or (at your option)
|
|
8 any later version.
|
|
9
|
|
10 GCC is distributed in the hope that it will be useful,
|
|
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 GNU General Public License for more details.
|
|
14
|
|
15 Under Section 7 of GPL version 3, you are granted additional
|
|
16 permissions described in the GCC Runtime Library Exception, version
|
|
17 3.1, as published by the Free Software Foundation.
|
|
18
|
|
19 You should have received a copy of the GNU General Public License and
|
|
20 a copy of the GCC Runtime Library Exception along with this program;
|
|
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
22 <http://www.gnu.org/licenses/>. */
|
|
23
|
|
24 /* Implemented from the specification included in the Intel C++ Compiler
|
|
25 User Guide and Reference, version 11.0. */
|
|
26
|
|
27 #ifndef _IMMINTRIN_H_INCLUDED
|
|
28 # error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
|
|
29 #endif
|
|
30
|
|
31 /* Internal data types for implementing the intrinsics. */
|
|
32 typedef double __v4df __attribute__ ((__vector_size__ (32)));
|
|
33 typedef float __v8sf __attribute__ ((__vector_size__ (32)));
|
|
34 typedef long long __v4di __attribute__ ((__vector_size__ (32)));
|
|
35 typedef int __v8si __attribute__ ((__vector_size__ (32)));
|
|
36 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
|
|
37 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
|
|
38
|
|
39 /* The Intel API is flexible enough that we must allow aliasing with other
|
|
40 vector types, and their scalar components. */
|
|
41 typedef float __m256 __attribute__ ((__vector_size__ (32),
|
|
42 __may_alias__));
|
|
43 typedef long long __m256i __attribute__ ((__vector_size__ (32),
|
|
44 __may_alias__));
|
|
45 typedef double __m256d __attribute__ ((__vector_size__ (32),
|
|
46 __may_alias__));
|
|
47
|
|
48 /* Compare predicates for scalar and packed compare intrinsics. */
|
|
49
|
|
50 /* Equal (ordered, non-signaling) */
|
|
51 #define _CMP_EQ_OQ 0x00
|
|
52 /* Less-than (ordered, signaling) */
|
|
53 #define _CMP_LT_OS 0x01
|
|
54 /* Less-than-or-equal (ordered, signaling) */
|
|
55 #define _CMP_LE_OS 0x02
|
|
56 /* Unordered (non-signaling) */
|
|
57 #define _CMP_UNORD_Q 0x03
|
|
58 /* Not-equal (unordered, non-signaling) */
|
|
59 #define _CMP_NEQ_UQ 0x04
|
|
60 /* Not-less-than (unordered, signaling) */
|
|
61 #define _CMP_NLT_US 0x05
|
|
62 /* Not-less-than-or-equal (unordered, signaling) */
|
|
63 #define _CMP_NLE_US 0x06
|
|
64 /* Ordered (nonsignaling) */
|
|
65 #define _CMP_ORD_Q 0x07
|
|
66 /* Equal (unordered, non-signaling) */
|
|
67 #define _CMP_EQ_UQ 0x08
|
|
68 /* Not-greater-than-or-equal (unordered, signaling) */
|
|
69 #define _CMP_NGE_US 0x09
|
|
70 /* Not-greater-than (unordered, signaling) */
|
|
71 #define _CMP_NGT_US 0x0a
|
|
72 /* False (ordered, non-signaling) */
|
|
73 #define _CMP_FALSE_OQ 0x0b
|
|
74 /* Not-equal (ordered, non-signaling) */
|
|
75 #define _CMP_NEQ_OQ 0x0c
|
|
76 /* Greater-than-or-equal (ordered, signaling) */
|
|
77 #define _CMP_GE_OS 0x0d
|
|
78 /* Greater-than (ordered, signaling) */
|
|
79 #define _CMP_GT_OS 0x0e
|
|
80 /* True (unordered, non-signaling) */
|
|
81 #define _CMP_TRUE_UQ 0x0f
|
|
82 /* Equal (ordered, signaling) */
|
|
83 #define _CMP_EQ_OS 0x10
|
|
84 /* Less-than (ordered, non-signaling) */
|
|
85 #define _CMP_LT_OQ 0x11
|
|
86 /* Less-than-or-equal (ordered, non-signaling) */
|
|
87 #define _CMP_LE_OQ 0x12
|
|
88 /* Unordered (signaling) */
|
|
89 #define _CMP_UNORD_S 0x13
|
|
90 /* Not-equal (unordered, signaling) */
|
|
91 #define _CMP_NEQ_US 0x14
|
|
92 /* Not-less-than (unordered, non-signaling) */
|
|
93 #define _CMP_NLT_UQ 0x15
|
|
94 /* Not-less-than-or-equal (unordered, non-signaling) */
|
|
95 #define _CMP_NLE_UQ 0x16
|
|
96 /* Ordered (signaling) */
|
|
97 #define _CMP_ORD_S 0x17
|
|
98 /* Equal (unordered, signaling) */
|
|
99 #define _CMP_EQ_US 0x18
|
|
100 /* Not-greater-than-or-equal (unordered, non-signaling) */
|
|
101 #define _CMP_NGE_UQ 0x19
|
|
102 /* Not-greater-than (unordered, non-signaling) */
|
|
103 #define _CMP_NGT_UQ 0x1a
|
|
104 /* False (ordered, signaling) */
|
|
105 #define _CMP_FALSE_OS 0x1b
|
|
106 /* Not-equal (ordered, signaling) */
|
|
107 #define _CMP_NEQ_OS 0x1c
|
|
108 /* Greater-than-or-equal (ordered, non-signaling) */
|
|
109 #define _CMP_GE_OQ 0x1d
|
|
110 /* Greater-than (ordered, non-signaling) */
|
|
111 #define _CMP_GT_OQ 0x1e
|
|
112 /* True (unordered, signaling) */
|
|
113 #define _CMP_TRUE_US 0x1f
|
|
114
|
|
115 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
116 _mm256_add_pd (__m256d __A, __m256d __B)
|
|
117 {
|
|
118 return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
|
|
119 }
|
|
120
|
|
121 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
122 _mm256_add_ps (__m256 __A, __m256 __B)
|
|
123 {
|
|
124 return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
|
|
125 }
|
|
126
|
|
127 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
128 _mm256_addsub_pd (__m256d __A, __m256d __B)
|
|
129 {
|
|
130 return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
|
|
131 }
|
|
132
|
|
133 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
134 _mm256_addsub_ps (__m256 __A, __m256 __B)
|
|
135 {
|
|
136 return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
|
|
137 }
|
|
138
|
|
139
|
|
140 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
141 _mm256_and_pd (__m256d __A, __m256d __B)
|
|
142 {
|
|
143 return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
|
|
144 }
|
|
145
|
|
146 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
147 _mm256_and_ps (__m256 __A, __m256 __B)
|
|
148 {
|
|
149 return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
|
|
150 }
|
|
151
|
|
152 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
153 _mm256_andnot_pd (__m256d __A, __m256d __B)
|
|
154 {
|
|
155 return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
|
|
156 }
|
|
157
|
|
158 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
159 _mm256_andnot_ps (__m256 __A, __m256 __B)
|
|
160 {
|
|
161 return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
|
|
162 }
|
|
163
|
|
164 /* Double/single precision floating point blend instructions - select
|
|
165 data from 2 sources using constant/variable mask. */
|
|
166
|
|
167 #ifdef __OPTIMIZE__
|
|
168 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
169 _mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
|
|
170 {
|
|
171 return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
|
|
172 (__v4df)__Y,
|
|
173 __M);
|
|
174 }
|
|
175
|
|
176 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
177 _mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
|
|
178 {
|
|
179 return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
|
|
180 (__v8sf)__Y,
|
|
181 __M);
|
|
182 }
|
|
183 #else
|
|
184 #define _mm256_blend_pd(X, Y, M) \
|
|
185 ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
|
|
186 (__v4df)(__m256d)(Y), (int)(M)))
|
|
187
|
|
188 #define _mm256_blend_ps(X, Y, M) \
|
|
189 ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
|
|
190 (__v8sf)(__m256)(Y), (int)(M)))
|
|
191 #endif
|
|
192
|
|
193 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
194 _mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
|
|
195 {
|
|
196 return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
|
|
197 (__v4df)__Y,
|
|
198 (__v4df)__M);
|
|
199 }
|
|
200
|
|
201 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
202 _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
|
|
203 {
|
|
204 return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
|
|
205 (__v8sf)__Y,
|
|
206 (__v8sf)__M);
|
|
207 }
|
|
208
|
|
209 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
210 _mm256_div_pd (__m256d __A, __m256d __B)
|
|
211 {
|
|
212 return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
|
|
213 }
|
|
214
|
|
215 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
216 _mm256_div_ps (__m256 __A, __m256 __B)
|
|
217 {
|
|
218 return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
|
|
219 }
|
|
220
|
|
221 /* Dot product instructions with mask-defined summing and zeroing parts
|
|
222 of result. */
|
|
223
|
|
224 #ifdef __OPTIMIZE__
|
|
225 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
226 _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
|
|
227 {
|
|
228 return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
|
|
229 (__v8sf)__Y,
|
|
230 __M);
|
|
231 }
|
|
232 #else
|
|
233 #define _mm256_dp_ps(X, Y, M) \
|
|
234 ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
|
|
235 (__v8sf)(__m256)(Y), (int)(M)))
|
|
236 #endif
|
|
237
|
|
238 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
239 _mm256_hadd_pd (__m256d __X, __m256d __Y)
|
|
240 {
|
|
241 return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
|
|
242 }
|
|
243
|
|
244 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
245 _mm256_hadd_ps (__m256 __X, __m256 __Y)
|
|
246 {
|
|
247 return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
|
|
248 }
|
|
249
|
|
250 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
251 _mm256_hsub_pd (__m256d __X, __m256d __Y)
|
|
252 {
|
|
253 return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
|
|
254 }
|
|
255
|
|
256 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
257 _mm256_hsub_ps (__m256 __X, __m256 __Y)
|
|
258 {
|
|
259 return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
|
|
260 }
|
|
261
|
|
262 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
263 _mm256_max_pd (__m256d __A, __m256d __B)
|
|
264 {
|
|
265 return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
|
|
266 }
|
|
267
|
|
268 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
269 _mm256_max_ps (__m256 __A, __m256 __B)
|
|
270 {
|
|
271 return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
|
|
272 }
|
|
273
|
|
274 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
275 _mm256_min_pd (__m256d __A, __m256d __B)
|
|
276 {
|
|
277 return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
|
|
278 }
|
|
279
|
|
280 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
281 _mm256_min_ps (__m256 __A, __m256 __B)
|
|
282 {
|
|
283 return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
|
|
284 }
|
|
285
|
|
286 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
287 _mm256_mul_pd (__m256d __A, __m256d __B)
|
|
288 {
|
|
289 return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
|
|
290 }
|
|
291
|
|
292 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
293 _mm256_mul_ps (__m256 __A, __m256 __B)
|
|
294 {
|
|
295 return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
|
|
296 }
|
|
297
|
|
298 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
299 _mm256_or_pd (__m256d __A, __m256d __B)
|
|
300 {
|
|
301 return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
|
|
302 }
|
|
303
|
|
304 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
305 _mm256_or_ps (__m256 __A, __m256 __B)
|
|
306 {
|
|
307 return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
|
|
308 }
|
|
309
|
|
310 #ifdef __OPTIMIZE__
|
|
311 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
312 _mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
|
|
313 {
|
|
314 return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
|
|
315 __mask);
|
|
316 }
|
|
317
|
|
318 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
319 _mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
|
|
320 {
|
|
321 return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
|
|
322 __mask);
|
|
323 }
|
|
324 #else
|
|
325 #define _mm256_shuffle_pd(A, B, N) \
|
|
326 ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
|
|
327 (__v4df)(__m256d)(B), (int)(N)))
|
|
328
|
|
329 #define _mm256_shuffle_ps(A, B, N) \
|
|
330 ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
|
|
331 (__v8sf)(__m256)(B), (int)(N)))
|
|
332 #endif
|
|
333
|
|
334 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
335 _mm256_sub_pd (__m256d __A, __m256d __B)
|
|
336 {
|
|
337 return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
|
|
338 }
|
|
339
|
|
340 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
341 _mm256_sub_ps (__m256 __A, __m256 __B)
|
|
342 {
|
|
343 return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
|
|
344 }
|
|
345
|
|
346 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
347 _mm256_xor_pd (__m256d __A, __m256d __B)
|
|
348 {
|
|
349 return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
|
|
350 }
|
|
351
|
|
352 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
353 _mm256_xor_ps (__m256 __A, __m256 __B)
|
|
354 {
|
|
355 return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
|
|
356 }
|
|
357
|
|
358 #ifdef __OPTIMIZE__
|
|
359 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
360 _mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
|
|
361 {
|
|
362 return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
|
|
363 }
|
|
364
|
|
365 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
366 _mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
|
|
367 {
|
|
368 return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
|
|
369 }
|
|
370
|
|
371 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
372 _mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
|
|
373 {
|
|
374 return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
|
|
375 __P);
|
|
376 }
|
|
377
|
|
378 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
379 _mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
|
|
380 {
|
|
381 return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
|
|
382 __P);
|
|
383 }
|
|
384
|
|
385 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
386 _mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
|
|
387 {
|
|
388 return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
|
|
389 }
|
|
390
|
|
391 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
392 _mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
|
|
393 {
|
|
394 return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
|
|
395 }
|
|
396 #else
|
|
397 #define _mm_cmp_pd(X, Y, P) \
|
|
398 ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
|
|
399 (__v2df)(__m128d)(Y), (int)(P)))
|
|
400
|
|
401 #define _mm_cmp_ps(X, Y, P) \
|
|
402 ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
|
|
403 (__v4sf)(__m128)(Y), (int)(P)))
|
|
404
|
|
405 #define _mm256_cmp_pd(X, Y, P) \
|
|
406 ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
|
|
407 (__v4df)(__m256d)(Y), (int)(P)))
|
|
408
|
|
409 #define _mm256_cmp_ps(X, Y, P) \
|
|
410 ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
|
|
411 (__v8sf)(__m256)(Y), (int)(P)))
|
|
412
|
|
413 #define _mm_cmp_sd(X, Y, P) \
|
|
414 ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
|
|
415 (__v2df)(__m128d)(Y), (int)(P)))
|
|
416
|
|
417 #define _mm_cmp_ss(X, Y, P) \
|
|
418 ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
|
|
419 (__v4sf)(__m128)(Y), (int)(P)))
|
|
420 #endif
|
|
421
|
|
422 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
423 _mm256_cvtepi32_pd (__m128i __A)
|
|
424 {
|
|
425 return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
|
|
426 }
|
|
427
|
|
428 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
429 _mm256_cvtepi32_ps (__m256i __A)
|
|
430 {
|
|
431 return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
|
|
432 }
|
|
433
|
|
434 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
435 _mm256_cvtpd_ps (__m256d __A)
|
|
436 {
|
|
437 return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
|
|
438 }
|
|
439
|
|
440 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
441 _mm256_cvtps_epi32 (__m256 __A)
|
|
442 {
|
|
443 return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
|
|
444 }
|
|
445
|
|
446 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
447 _mm256_cvtps_pd (__m128 __A)
|
|
448 {
|
|
449 return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
|
|
450 }
|
|
451
|
|
452 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
453 _mm256_cvttpd_epi32 (__m256d __A)
|
|
454 {
|
|
455 return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
|
|
456 }
|
|
457
|
|
458 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
459 _mm256_cvtpd_epi32 (__m256d __A)
|
|
460 {
|
|
461 return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
|
|
462 }
|
|
463
|
|
464 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
465 _mm256_cvttps_epi32 (__m256 __A)
|
|
466 {
|
|
467 return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
|
|
468 }
|
|
469
|
|
470 #ifdef __OPTIMIZE__
|
|
471 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
472 _mm256_extractf128_pd (__m256d __X, const int __N)
|
|
473 {
|
|
474 return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
|
|
475 }
|
|
476
|
|
477 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
478 _mm256_extractf128_ps (__m256 __X, const int __N)
|
|
479 {
|
|
480 return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
|
|
481 }
|
|
482
|
|
483 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
484 _mm256_extractf128_si256 (__m256i __X, const int __N)
|
|
485 {
|
|
486 return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
|
|
487 }
|
|
488
|
|
489 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
490 _mm256_extract_epi32 (__m256i __X, int const __N)
|
|
491 {
|
|
492 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
|
|
493 return _mm_extract_epi32 (__Y, __N % 4);
|
|
494 }
|
|
495
|
|
496 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
497 _mm256_extract_epi16 (__m256i __X, int const __N)
|
|
498 {
|
|
499 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
|
|
500 return _mm_extract_epi16 (__Y, __N % 8);
|
|
501 }
|
|
502
|
|
503 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
504 _mm256_extract_epi8 (__m256i __X, int const __N)
|
|
505 {
|
|
506 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
|
|
507 return _mm_extract_epi8 (__Y, __N % 16);
|
|
508 }
|
|
509
|
|
510 #ifdef __x86_64__
|
|
511 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
512 _mm256_extract_epi64 (__m256i __X, const int __N)
|
|
513 {
|
|
514 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
|
|
515 return _mm_extract_epi64 (__Y, __N % 2);
|
|
516 }
|
|
517 #endif
|
|
518 #else
|
|
519 #define _mm256_extractf128_pd(X, N) \
|
|
520 ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
|
|
521 (int)(N)))
|
|
522
|
|
523 #define _mm256_extractf128_ps(X, N) \
|
|
524 ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
|
|
525 (int)(N)))
|
|
526
|
|
527 #define _mm256_extractf128_si256(X, N) \
|
|
528 ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
|
|
529 (int)(N)))
|
|
530
|
|
531 #define _mm256_extract_epi32(X, N) \
|
|
532 (__extension__ \
|
|
533 ({ \
|
|
534 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
|
|
535 _mm_extract_epi32 (__Y, (N) % 4); \
|
|
536 }))
|
|
537
|
|
538 #define _mm256_extract_epi16(X, N) \
|
|
539 (__extension__ \
|
|
540 ({ \
|
|
541 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
|
|
542 _mm_extract_epi16 (__Y, (N) % 8); \
|
|
543 }))
|
|
544
|
|
545 #define _mm256_extract_epi8(X, N) \
|
|
546 (__extension__ \
|
|
547 ({ \
|
|
548 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
|
|
549 _mm_extract_epi8 (__Y, (N) % 16); \
|
|
550 }))
|
|
551
|
|
552 #ifdef __x86_64__
|
|
553 #define _mm256_extract_epi64(X, N) \
|
|
554 (__extension__ \
|
|
555 ({ \
|
|
556 __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
|
|
557 _mm_extract_epi64 (__Y, (N) % 2); \
|
|
558 }))
|
|
559 #endif
|
|
560 #endif
|
|
561
|
|
562 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
563 _mm256_zeroall (void)
|
|
564 {
|
|
565 __builtin_ia32_vzeroall ();
|
|
566 }
|
|
567
|
|
568 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
569 _mm256_zeroupper (void)
|
|
570 {
|
|
571 __builtin_ia32_vzeroupper ();
|
|
572 }
|
|
573
|
|
574 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
575 _mm_permutevar_pd (__m128d __A, __m128i __C)
|
|
576 {
|
|
577 return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
|
|
578 (__v2di)__C);
|
|
579 }
|
|
580
|
|
581 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
582 _mm256_permutevar_pd (__m256d __A, __m256i __C)
|
|
583 {
|
|
584 return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
|
|
585 (__v4di)__C);
|
|
586 }
|
|
587
|
|
588 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
589 _mm_permutevar_ps (__m128 __A, __m128i __C)
|
|
590 {
|
|
591 return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
|
|
592 (__v4si)__C);
|
|
593 }
|
|
594
|
|
595 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
596 _mm256_permutevar_ps (__m256 __A, __m256i __C)
|
|
597 {
|
|
598 return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
|
|
599 (__v8si)__C);
|
|
600 }
|
|
601
|
|
602 #ifdef __OPTIMIZE__
|
|
603 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
604 _mm_permute_pd (__m128d __X, const int __C)
|
|
605 {
|
|
606 return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
|
|
607 }
|
|
608
|
|
609 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
610 _mm256_permute_pd (__m256d __X, const int __C)
|
|
611 {
|
|
612 return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
|
|
613 }
|
|
614
|
|
615 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
616 _mm_permute_ps (__m128 __X, const int __C)
|
|
617 {
|
|
618 return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
|
|
619 }
|
|
620
|
|
621 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
622 _mm256_permute_ps (__m256 __X, const int __C)
|
|
623 {
|
|
624 return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
|
|
625 }
|
|
626 #else
|
|
627 #define _mm_permute_pd(X, C) \
|
|
628 ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
|
|
629
|
|
630 #define _mm256_permute_pd(X, C) \
|
|
631 ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))
|
|
632
|
|
633 #define _mm_permute_ps(X, C) \
|
|
634 ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
|
|
635
|
|
636 #define _mm256_permute_ps(X, C) \
|
|
637 ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
|
|
638 #endif
|
|
639
|
|
640 #ifdef __OPTIMIZE__
|
|
641 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
642 _mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
|
|
643 {
|
|
644 return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
|
|
645 (__v4df)__Y,
|
|
646 __C);
|
|
647 }
|
|
648
|
|
649 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
650 _mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
|
|
651 {
|
|
652 return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
|
|
653 (__v8sf)__Y,
|
|
654 __C);
|
|
655 }
|
|
656
|
|
657 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
658 _mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
|
|
659 {
|
|
660 return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
|
|
661 (__v8si)__Y,
|
|
662 __C);
|
|
663 }
|
|
664 #else
|
|
665 #define _mm256_permute2f128_pd(X, Y, C) \
|
|
666 ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
|
|
667 (__v4df)(__m256d)(Y), \
|
|
668 (int)(C)))
|
|
669
|
|
670 #define _mm256_permute2f128_ps(X, Y, C) \
|
|
671 ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
|
|
672 (__v8sf)(__m256)(Y), \
|
|
673 (int)(C)))
|
|
674
|
|
675 #define _mm256_permute2f128_si256(X, Y, C) \
|
|
676 ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
|
|
677 (__v8si)(__m256i)(Y), \
|
|
678 (int)(C)))
|
|
679 #endif
|
|
680
|
|
681 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
682 _mm_broadcast_ss (float const *__X)
|
|
683 {
|
|
684 return (__m128) __builtin_ia32_vbroadcastss (__X);
|
|
685 }
|
|
686
|
|
687 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
688 _mm256_broadcast_sd (double const *__X)
|
|
689 {
|
|
690 return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
|
|
691 }
|
|
692
|
|
693 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
694 _mm256_broadcast_ss (float const *__X)
|
|
695 {
|
|
696 return (__m256) __builtin_ia32_vbroadcastss256 (__X);
|
|
697 }
|
|
698
|
|
699 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
700 _mm256_broadcast_pd (__m128d const *__X)
|
|
701 {
|
|
702 return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
|
|
703 }
|
|
704
|
|
705 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
706 _mm256_broadcast_ps (__m128 const *__X)
|
|
707 {
|
|
708 return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
|
|
709 }
|
|
710
|
|
711 #ifdef __OPTIMIZE__
|
|
712 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
713 _mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
|
|
714 {
|
|
715 return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
|
|
716 (__v2df)__Y,
|
|
717 __O);
|
|
718 }
|
|
719
|
|
720 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
721 _mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
|
|
722 {
|
|
723 return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
|
|
724 (__v4sf)__Y,
|
|
725 __O);
|
|
726 }
|
|
727
|
|
728 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
729 _mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
|
|
730 {
|
|
731 return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
|
|
732 (__v4si)__Y,
|
|
733 __O);
|
|
734 }
|
|
735
|
|
736 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
737 _mm256_insert_epi32 (__m256i __X, int __D, int const __N)
|
|
738 {
|
|
739 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
|
|
740 __Y = _mm_insert_epi16 (__Y, __D, __N % 4);
|
|
741 return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
|
|
742 }
|
|
743
|
|
744 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
745 _mm256_insert_epi16 (__m256i __X, int __D, int const __N)
|
|
746 {
|
|
747 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
|
|
748 __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
|
|
749 return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
|
|
750 }
|
|
751
|
|
752 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
753 _mm256_insert_epi8 (__m256i __X, int __D, int const __N)
|
|
754 {
|
|
755 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
|
|
756 __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
|
|
757 return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
|
|
758 }
|
|
759
|
|
760 #ifdef __x86_64__
|
|
761 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
762 _mm256_insert_epi64 (__m256i __X, int __D, int const __N)
|
|
763 {
|
|
764 __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
|
|
765 __Y = _mm_insert_epi16 (__Y, __D, __N % 2);
|
|
766 return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
|
|
767 }
|
|
768 #endif
|
|
769 #else
|
|
/* Macro forms used when __OPTIMIZE__ is not defined, so that the
   lane-selector argument reaches the builtin as a constant expression
   even when the inline wrappers above would not be folded.  */
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
					       (__v2df)(__m128d)(Y), \
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
					      (__v4sf)(__m128)(Y), \
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
					       (__v4si)(__m128i)(Y), \
					       (int)(O)))

/* NOTE: the statement-expression macros below expand (X) twice, so an
   argument with side effects is evaluated twice.  */
#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
/* 64-bit element insert; only available in 64-bit mode (it relies on
   _mm_insert_epi64).  */
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif  /* __x86_64__ */
|
|
818 #endif
|
|
819
|
|
/* Load four doubles from __P; the plain vector dereference requires __P
   to satisfy the 32-byte alignment of __m256d.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

/* Store four doubles to the 32-byte-aligned address __P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

/* Load eight floats from the 32-byte-aligned address __P.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

/* Store eight floats to the 32-byte-aligned address __P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

/* Unaligned load of four doubles (no alignment requirement on __P).  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

/* Unaligned store of four doubles.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

/* Unaligned load of eight floats.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

/* Unaligned store of eight floats.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

/* Aligned 256-bit integer load (plain dereference; __P must satisfy
   __m256i alignment).  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

/* Aligned 256-bit integer store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

/* Unaligned 256-bit integer load.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

/* Unaligned 256-bit integer store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
|
|
891
|
|
/* Masked load of two doubles: elements whose mask lane is not selected
   are zeroed rather than read.  Note this AVX revision passes the mask
   as a floating-point vector (__m128d).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128d __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2df)__M);
}

/* Masked store of two doubles: only elements selected by __M are
   written to memory.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
}

/* Masked load of four doubles under mask __M.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256d __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4df)__M);
}

/* Masked store of four doubles under mask __M.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
}

/* Masked load of four floats under mask __M.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128 __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4sf)__M);
}

/* Masked store of four floats under mask __M.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
}

/* Masked load of eight floats under mask __M.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256 __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8sf)__M);
}

/* Masked store of eight floats under mask __M.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
}
|
|
943
|
|
/* Duplicate the odd-indexed (high) float of each pair across both
   elements of the pair (movshdup semantics, widened to 256 bits).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

/* Duplicate the even-indexed (low) float of each pair across both
   elements of the pair.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

/* Duplicate the even-indexed double of each 128-bit lane into both
   elements of that lane.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

/* Unaligned 256-bit load using the lddqu form, which may perform better
   for data that crosses cache-line boundaries.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

/* Non-temporal (streaming) 256-bit integer store to __A.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

/* Non-temporal store of four doubles to __A.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

/* Non-temporal store of eight floats to __P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
|
|
985
|
|
/* Approximate reciprocal of each float element.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

/* Approximate reciprocal square root of each float element.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

/* Square root of each double element.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

/* Square root of each float element.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}
|
|
1009
|
|
#ifdef __OPTIMIZE__
/* Round each double element according to the rounding-control immediate
   __M (one of the _MM_FROUND_* values).  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

/* Round each float element according to __M.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
/* Macro forms so the rounding-control argument stays a constant
   expression without relying on inlining.  */
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

/* Convenience wrappers: round toward +inf / -inf.  */
#define _mm256_ceil_pd(V)	   _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)
|
|
1034
|
|
/* Interleave the high double of each 128-bit lane of __A and __B.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

/* Interleave the low double of each 128-bit lane of __A and __B.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

/* Interleave the high floats of each 128-bit lane of __A and __B.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Interleave the low floats of each 128-bit lane of __A and __B.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
|
|
1058
|
|
/* VTESTPD/VTESTPS predicates on 128-bit operands.  testz returns the
   "all zero" (ZF) condition, testc the "all ones under mask" (CF)
   condition, and testnzc "neither Z nor C".  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

/* Single-precision variants of the same predicates.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}
|
|
1094
|
|
/* 256-bit VTESTPD/VTESTPS predicates; same Z/C/"neither" conditions as
   the 128-bit forms above.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

/* Integer (VPTEST) predicates over all 256 bits.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
|
|
1148
|
|
/* Gather the sign bit of each double element into the low 4 bits of the
   result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

/* Gather the sign bit of each float element into the low 8 bits of the
   result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}
|
|
1160
|
|
/* Return an all-zero vector of four doubles.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

/* Return an all-zero vector of eight floats.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

/* Return an all-zero 256-bit integer vector.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}
|
|
1179
|
|
/* Create the vector [A B C D].  Arguments are given from the highest
   element down to element 0, hence the reversed initializer.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H] (A is the highest element).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H] of 32-bit integers.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

/* Create a vector of 16-bit elements; __qNN is element NN.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Create a vector of byte elements; __qNN is element NN.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

/* Create the vector [A B C D] of 64-bit integers (A is element 3).  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}
|
|
1241
|
|
/* Create a vector with all four double elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all eight float elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all eight 32-bit elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

/* Create a vector with all sixteen 16-bit elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector with all thirty-two byte elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector with all four 64-bit elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}
|
|
1286
|
|
/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions: the first argument becomes element 0.
   Each simply forwards to the corresponding set function with the
   argument list reversed.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
		   char __q27, char __q26, char __q25, char __q24,
		   char __q23, char __q22, char __q21, char __q20,
		   char __q19, char __q18, char __q17, char __q16,
		   char __q15, char __q14, char __q13, char __q12,
		   char __q11, char __q10, char __q09, char __q08,
		   char __q07, char __q06, char __q05, char __q04,
		   char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
|
|
1348
|
|
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type (pure bit
   reinterpretation; no instructions are generated).  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

/* Narrowing casts: return the low 128 bits of the 256-bit source.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}
|
|
1404
|
|
/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
|