/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */


/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 10.0.  */

#ifndef _SMMINTRIN_H_INCLUDED
#define _SMMINTRIN_H_INCLUDED

#ifndef __SSE4_1__
# error "SSE4.1 instruction set not enabled"
#else

/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
   files.  */
#include <tmmintrin.h>
#include <mmintrin-common.h>

/* SSE4.1 */

/* Integer blend instructions - select data from 2 sources using
   constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
                                              (__v8hi)__Y,
                                              __M);
}
#else
#define _mm_blend_epi16(X, Y, M)                                        \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X),           \
                                        (__v8hi)(__m128i)(Y), (int)(M)))
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
{
  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
                                               (__v16qi)__Y,
                                               (__v16qi)__M);
}
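
/* Usage sketch (illustrative, not part of the original header): the
   immediate mask of _mm_blend_epi16 selects per 16-bit lane, bit i = 1
   taking lane i from the second operand, while _mm_blendv_epi8 uses the
   top bit of each byte of a variable mask.  For example, assuming SSE4.1
   is enabled:

     __m128i a = _mm_set1_epi16 (1);
     __m128i b = _mm_set1_epi16 (2);
     __m128i lo_from_b = _mm_blend_epi16 (a, b, 0x0F);  // lanes 0-3 from b
     __m128i m = _mm_cmpgt_epi16 (b, a);                // all-ones mask
     __m128i all_b = _mm_blendv_epi8 (a, b, m);         // every byte from b
*/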

/* Single precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
{
  return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
                                          (__v4sf)__Y,
                                          __M);
}
#else
#define _mm_blend_ps(X, Y, M)                                           \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X),                \
                                    (__v4sf)(__m128)(Y), (int)(M)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
{
  return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
                                           (__v4sf)__Y,
                                           (__v4sf)__M);
}

/* Double precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
{
  return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
                                           (__v2df)__Y,
                                           __M);
}
#else
#define _mm_blend_pd(X, Y, M)                                           \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X),              \
                                     (__v2df)(__m128d)(Y), (int)(M)))
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
{
  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
                                            (__v2df)__Y,
                                            (__v2df)__M);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
{
  return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
                                       (__v4sf)__Y,
                                       __M);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
{
  return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
                                        (__v2df)__Y,
                                        __M);
}
#else
#define _mm_dp_ps(X, Y, M)                                              \
  ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X),                   \
                                 (__v4sf)(__m128)(Y), (int)(M)))

#define _mm_dp_pd(X, Y, M)                                              \
  ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X),                 \
                                  (__v2df)(__m128d)(Y), (int)(M)))
#endif
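
/* Usage sketch (illustrative, not part of the original header): in the
   _mm_dp_ps mask, bits [7:4] select which input lanes participate in the
   products and bits [3:0] select which result lanes receive the sum (the
   others are zeroed).  For example, a 4-element dot product broadcast to
   lane 0 only:

     __m128 x = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
     __m128 y = _mm_set_ps (8.0f, 7.0f, 6.0f, 5.0f);
     __m128 d = _mm_dp_ps (x, y, 0xF1);  // lane 0 = 1*5 + 2*6 + 3*7 + 4*8 = 70
*/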

/* Packed integer 64-bit comparison, zeroing or filling with ones
   corresponding parts of result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y);
}

/* Min/max packed integer instructions.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
}

/* Packed integer 32-bit multiplication with truncation of upper
   halves of results.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
}

/* Packed integer 32-bit multiplication of 2 pairs of operands
   with two 64-bit results.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
}

/* Insert single precision float into packed single precision array
   element selected by index N.  The bits [7-6] of N define S
   index, the bits [5-4] define D index, and bits [3-0] define
   zeroing mask for D.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
{
  return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
                                              (__v4sf)__S,
                                              __N);
}
#else
#define _mm_insert_ps(D, S, N)                                          \
  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D),            \
                                        (__v4sf)(__m128)(S), (int)(N)))
#endif

/* Helper macro to create the N value for _mm_insert_ps.  */
#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
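
/* Usage sketch (illustrative, not part of the original header): copy
   element 2 of the source into element 0 of the destination and zero
   element 3 of the result:

     __m128 r = _mm_insert_ps (d, s, _MM_MK_INSERTPS_NDX (2, 0, 0x8));
*/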

/* Extract binary representation of single precision float from packed
   single precision array element of X selected by index N.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
  union { int i; float f; } __tmp;
  __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
  return __tmp.i;
}
#else
#define _mm_extract_ps(X, N)                                            \
  (__extension__                                                        \
   ({                                                                   \
     union { int i; float f; } __tmp;                                   \
     __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); \
     __tmp.i;                                                           \
   }))
#endif

/* Extract binary representation of single precision float into
   D from packed single precision array element of S selected
   by index N.  */
#define _MM_EXTRACT_FLOAT(D, S, N)                                      \
  { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }

/* Extract specified single precision float element into the lower
   part of __m128.  */
#define _MM_PICK_OUT_PS(X, N)                                           \
  _mm_insert_ps (_mm_setzero_ps (), (X),                                \
                 _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))

/* Insert integer, S, into packed integer array element of D
   selected by index N.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i __D, int __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
                                                 __S, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i __D, int __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
                                                __S, __N);
}

#ifdef __x86_64__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
                                                __S, __N);
}
#endif
#else
#define _mm_insert_epi8(D, S, N)                                        \
  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D),       \
                                           (int)(S), (int)(N)))

#define _mm_insert_epi32(D, S, N)                                       \
  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D),         \
                                          (int)(S), (int)(N)))

#ifdef __x86_64__
#define _mm_insert_epi64(D, S, N)                                       \
  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D),         \
                                          (long long)(S), (int)(N)))
#endif
#endif

/* Extract integer from packed integer array element of X selected by
   index N.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
}
#endif
#else
#define _mm_extract_epi8(X, N)                                          \
  ((int) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
#define _mm_extract_epi32(X, N)                                         \
  ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))

#ifdef __x86_64__
#define _mm_extract_epi64(X, N)                                         \
  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
#endif
#endif
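
/* Usage sketch (illustrative, not part of the original header): the
   index arguments must be integer constants; they select the lane to
   overwrite or read back:

     __m128i v = _mm_setzero_si128 ();
     v = _mm_insert_epi32 (v, 42, 3);    // lane 3 = 42
     int x = _mm_extract_epi32 (v, 3);   // x == 42
*/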

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
}
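
/* Usage sketch (illustrative, not part of the original header):

     __m128i r = _mm_minpos_epu16 (v);
     int min   = _mm_extract_epi16 (r, 0);        // smallest of the 8 words
     int index = _mm_extract_epi16 (r, 1) & 0x7;  // its position
*/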

/* Packed integer sign-extension.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
}

/* Packed integer zero-extension.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
}

/* Pack 8 double words from 2 operands into 8 words of result with
   unsigned saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
}

/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
                                              (__v16qi)__Y, __M);
}
#else
#define _mm_mpsadbw_epu8(X, Y, M)                                       \
  ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X),          \
                                        (__v16qi)(__m128i)(Y), (int)(M)))
#endif
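
/* Usage sketch (illustrative, not part of the original header): bits
   [1:0] of the mask pick the 4-byte block of the second operand and
   bit 2 picks the starting offset (0 or 4) in the first; the result
   holds eight 16-bit sums of absolute differences over consecutive
   4-byte windows of the first operand:

     __m128i sad = _mm_mpsadbw_epu8 (haystack, needle, 0);
*/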

/* Load double quadword using non-temporal aligned hint.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_load_si128 (__m128i *__X)
{
  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
}

#ifdef __SSE4_2__

/* These macros specify the source data format.  */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These macros specify the comparison operation.  */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity.  */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros specify the output selection in _mm_cmpXstri ().  */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros specify the output selection in _mm_cmpXstrm ().  */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40
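
/* Usage sketch (illustrative, not part of the original header): the mode
   argument of the string-compare intrinsics is built by OR-ing one flag
   from each group, e.g. find the index of the first byte of B that also
   occurs in A:

     int idx = _mm_cmpistri (a, b,
                             _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY
                             | _SIDD_POSITIVE_POLARITY
                             | _SIDD_LEAST_SIGNIFICANT);
*/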

/* Intrinsics for text/string processing.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
                                                (__v16qi)__Y,
                                                __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
                                      (__v16qi)__Y,
                                      __M);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
                                                (__v16qi)__Y, __LY,
                                                __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
                                      (__v16qi)__Y, __LY,
                                      __M);
}
#else
#define _mm_cmpistrm(X, Y, M)                                           \
  ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X),        \
                                          (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistri(X, Y, M)                                           \
  ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X),            \
                                      (__v16qi)(__m128i)(Y), (int)(M)))

#define _mm_cmpestrm(X, LX, Y, LY, M)                                   \
  ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X),        \
                                          (int)(LX), (__v16qi)(__m128i)(Y), \
                                          (int)(LY), (int)(M)))
#define _mm_cmpestri(X, LX, Y, LY, M)                                   \
  ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \
                                      (__v16qi)(__m128i)(Y), (int)(LY), \
                                      (int)(M)))
#endif

/* Intrinsics for text/string processing and reading values of
   EFlags.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
                                       (__v16qi)__Y,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
                                       (__v16qi)__Y,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
                                       (__v16qi)__Y,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
                                       (__v16qi)__Y,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
                                       (__v16qi)__Y,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
                                       (__v16qi)__Y, __LY,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
                                       (__v16qi)__Y, __LY,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
                                       (__v16qi)__Y, __LY,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
                                       (__v16qi)__Y, __LY,
                                       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
                                       (__v16qi)__Y, __LY,
                                       __M);
}
#else
#define _mm_cmpistra(X, Y, M)                                           \
  ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X),           \
                                       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrc(X, Y, M)                                           \
  ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X),           \
                                       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistro(X, Y, M)                                           \
  ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X),           \
                                       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrs(X, Y, M)                                           \
  ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X),           \
                                       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrz(X, Y, M)                                           \
  ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X),           \
                                       (__v16qi)(__m128i)(Y), (int)(M)))

#define _mm_cmpestra(X, LX, Y, LY, M)                                   \
  ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
                                       (__v16qi)(__m128i)(Y), (int)(LY), \
                                       (int)(M)))
#define _mm_cmpestrc(X, LX, Y, LY, M)                                   \
  ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
                                       (__v16qi)(__m128i)(Y), (int)(LY), \
                                       (int)(M)))
#define _mm_cmpestro(X, LX, Y, LY, M)                                   \
  ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
                                       (__v16qi)(__m128i)(Y), (int)(LY), \
                                       (int)(M)))
#define _mm_cmpestrs(X, LX, Y, LY, M)                                   \
  ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
                                       (__v16qi)(__m128i)(Y), (int)(LY), \
                                       (int)(M)))
#define _mm_cmpestrz(X, LX, Y, LY, M)                                   \
  ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
                                       (__v16qi)(__m128i)(Y), (int)(LY), \
                                       (int)(M)))
#endif

/* Packed integer 64-bit comparison, zeroing or filling with ones
   corresponding parts of result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y);
}

/* Calculate the number of bits set to 1.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_u32 (unsigned int __X)
{
  return __builtin_popcount (__X);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_u64 (unsigned long long __X)
{
  return __builtin_popcountll (__X);
}
#endif

/* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u8 (unsigned int __C, unsigned char __V)
{
  return __builtin_ia32_crc32qi (__C, __V);
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u16 (unsigned int __C, unsigned short __V)
{
  return __builtin_ia32_crc32hi (__C, __V);
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u32 (unsigned int __C, unsigned int __V)
{
  return __builtin_ia32_crc32si (__C, __V);
}

#ifdef __x86_64__
extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
{
  return __builtin_ia32_crc32di (__C, __V);
}
#endif
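
/* Usage sketch (illustrative, not part of the original header): CRC32C
   of a byte buffer, one byte at a time (a common convention starts from
   an all-ones seed and inverts the final value):

     unsigned int crc = 0xffffffffU;
     for (size_t i = 0; i < len; i++)
       crc = _mm_crc32_u8 (crc, buf[i]);
     crc = ~crc;
*/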

#endif /* __SSE4_2__ */

#endif /* __SSE4_1__ */

#endif /* _SMMINTRIN_H_INCLUDED */