Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/i386/smmintrin.h @ 0:a06113de4d67
first commit
author | kent <kent@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 17 Jul 2009 14:47:48 +0900 |
parents | |
children | 77e2b8dfacca |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a06113de4d67 |
---|---|
1 /* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. | |
2 | |
3 This file is part of GCC. | |
4 | |
5 GCC is free software; you can redistribute it and/or modify | |
6 it under the terms of the GNU General Public License as published by | |
7 the Free Software Foundation; either version 3, or (at your option) | |
8 any later version. | |
9 | |
10 GCC is distributed in the hope that it will be useful, | |
11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 GNU General Public License for more details. | |
14 | |
15 Under Section 7 of GPL version 3, you are granted additional | |
16 permissions described in the GCC Runtime Library Exception, version | |
17 3.1, as published by the Free Software Foundation. | |
18 | |
19 You should have received a copy of the GNU General Public License and | |
20 a copy of the GCC Runtime Library Exception along with this program; | |
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
22 <http://www.gnu.org/licenses/>. */ | |
23 | |
24 | |
25 /* Implemented from the specification included in the Intel C++ Compiler | |
26 User Guide and Reference, version 10.0. */ | |
27 | |
28 #ifndef _SMMINTRIN_H_INCLUDED | |
29 #define _SMMINTRIN_H_INCLUDED | |
30 | |
31 #ifndef __SSE4_1__ | |
32 # error "SSE4.1 instruction set not enabled" | |
33 #else | |
34 | |
35 /* We need definitions from the SSSE3, SSE3, SSE2 and SSE header | |
36 files. */ | |
37 #include <tmmintrin.h> | |
38 #include <mmintrin-common.h> | |
39 | |
40 /* SSE4.1 */ | |
41 | |
42 /* Integer blend instructions - select data from 2 sources using | |
43 constant/variable mask. */ | |
44 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get a macro form: X and Y are viewed as
   __v8hi, M is converted to int, and all three go straight to the
   builtin.  */
#define _mm_blend_epi16(X, Y, M)                                      \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X),         \
                                        (__v8hi)(__m128i)(Y),         \
                                        (int)(M)))
#else
/* Thin wrapper around __builtin_ia32_pblendw128: both vector operands
   are reinterpreted as __v8hi and the mask __M is forwarded
   unchanged.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
{
  __v8hi __lhs = (__v8hi) __X;
  __v8hi __rhs = (__v8hi) __Y;

  return (__m128i) __builtin_ia32_pblendw128 (__lhs, __rhs, __M);
}
#endif
58 | |
59 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
60 _mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M) | |
61 { | |
62 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X, | |
63 (__v16qi)__Y, | |
64 (__v16qi)__M); | |
65 } | |
66 | |
67 /* Single precision floating point blend instructions - select data | |
68 from 2 sources using constant/variable mask. */ | |
69 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get a macro form that forwards X, Y (as
   __v4sf) and M (as int) to the builtin.  */
#define _mm_blend_ps(X, Y, M)                                         \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X),              \
                                    (__v4sf)(__m128)(Y),              \
                                    (int)(M)))
#else
/* Thin wrapper around __builtin_ia32_blendps with the constant mask
   __M forwarded unchanged.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
{
  __v4sf __lhs = (__v4sf) __X;
  __v4sf __rhs = (__v4sf) __Y;

  return (__m128) __builtin_ia32_blendps (__lhs, __rhs, __M);
}
#endif
83 | |
84 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
85 _mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M) | |
86 { | |
87 return (__m128) __builtin_ia32_blendvps ((__v4sf)__X, | |
88 (__v4sf)__Y, | |
89 (__v4sf)__M); | |
90 } | |
91 | |
92 /* Double precision floating point blend instructions - select data | |
93 from 2 sources using constant/variable mask. */ | |
94 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get a macro form that forwards X, Y (as
   __v2df) and M (as int) to the builtin.  */
#define _mm_blend_pd(X, Y, M)                                         \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X),            \
                                     (__v2df)(__m128d)(Y),            \
                                     (int)(M)))
#else
/* Thin wrapper around __builtin_ia32_blendpd with the constant mask
   __M forwarded unchanged.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
{
  __v2df __lhs = (__v2df) __X;
  __v2df __rhs = (__v2df) __Y;

  return (__m128d) __builtin_ia32_blendpd (__lhs, __rhs, __M);
}
#endif
108 | |
109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
110 _mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M) | |
111 { | |
112 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X, | |
113 (__v2df)__Y, | |
114 (__v2df)__M); | |
115 } | |
116 | |
117 /* Dot product instructions with mask-defined summing and zeroing parts | |
118 of result. */ | |
119 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get macro forms of both dot-product
   intrinsics; operands are cast and handed to the builtins as-is.  */
#define _mm_dp_ps(X, Y, M)                                            \
  ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X),                 \
                                 (__v4sf)(__m128)(Y),                 \
                                 (int)(M)))

#define _mm_dp_pd(X, Y, M)                                            \
  ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X),               \
                                  (__v2df)(__m128d)(Y),               \
                                  (int)(M)))
#else
/* Single precision: forwards to __builtin_ia32_dpps with the constant
   mask __M.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
{
  __v4sf __lhs = (__v4sf) __X;
  __v4sf __rhs = (__v4sf) __Y;

  return (__m128) __builtin_ia32_dpps (__lhs, __rhs, __M);
}

/* Double precision: forwards to __builtin_ia32_dppd with the constant
   mask __M.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
{
  __v2df __lhs = (__v2df) __X;
  __v2df __rhs = (__v2df) __Y;

  return (__m128d) __builtin_ia32_dppd (__lhs, __rhs, __M);
}
#endif
145 | |
146 /* Packed integer 64-bit comparison, zeroing or filling with ones | |
147 corresponding parts of result. */ | |
148 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
149 _mm_cmpeq_epi64 (__m128i __X, __m128i __Y) | |
150 { | |
151 return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y); | |
152 } | |
153 | |
154 /* Min/max packed integer instructions. */ | |
155 | |
156 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
157 _mm_min_epi8 (__m128i __X, __m128i __Y) | |
158 { | |
159 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y); | |
160 } | |
161 | |
162 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
163 _mm_max_epi8 (__m128i __X, __m128i __Y) | |
164 { | |
165 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y); | |
166 } | |
167 | |
168 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
169 _mm_min_epu16 (__m128i __X, __m128i __Y) | |
170 { | |
171 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y); | |
172 } | |
173 | |
174 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
175 _mm_max_epu16 (__m128i __X, __m128i __Y) | |
176 { | |
177 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y); | |
178 } | |
179 | |
180 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
181 _mm_min_epi32 (__m128i __X, __m128i __Y) | |
182 { | |
183 return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y); | |
184 } | |
185 | |
186 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
187 _mm_max_epi32 (__m128i __X, __m128i __Y) | |
188 { | |
189 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y); | |
190 } | |
191 | |
192 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
193 _mm_min_epu32 (__m128i __X, __m128i __Y) | |
194 { | |
195 return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y); | |
196 } | |
197 | |
198 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
199 _mm_max_epu32 (__m128i __X, __m128i __Y) | |
200 { | |
201 return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y); | |
202 } | |
203 | |
204 /* Packed integer 32-bit multiplication with truncation of upper | |
205 halves of results. */ | |
206 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
207 _mm_mullo_epi32 (__m128i __X, __m128i __Y) | |
208 { | |
209 return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y); | |
210 } | |
211 | |
212 /* Packed integer 32-bit multiplication of 2 pairs of operands | |
213 with two 64-bit results. */ | |
214 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
215 _mm_mul_epi32 (__m128i __X, __m128i __Y) | |
216 { | |
217 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y); | |
218 } | |
219 | |
220 /* Insert single precision float into packed single precision array | |
221 element selected by index N. The bits [7-6] of N define S | |
222 index, the bits [5-4] define D index, and bits [3-0] define | |
223 zeroing mask for D. */ | |
224 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get a macro form that forwards D, S (as
   __v4sf) and N (as int) to the builtin.  */
#define _mm_insert_ps(D, S, N)                                        \
  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D),          \
                                        (__v4sf)(__m128)(S),          \
                                        (int)(N)))
#else
/* Thin wrapper around __builtin_ia32_insertps128; the control value
   __N is forwarded unchanged.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
{
  __v4sf __dst = (__v4sf) __D;
  __v4sf __src = (__v4sf) __S;

  return (__m128) __builtin_ia32_insertps128 (__dst, __src, __N);
}
#endif
238 | |
/* Build the N argument of _mm_insert_ps: S is shifted left by 6, D by
   4, and M is OR-ed into the low bits.  */
#define _MM_MK_INSERTPS_NDX(S, D, M)                                  \
  ((((S) << 6) | ((D) << 4)) | (M))
241 | |
242 /* Extract binary representation of single precision float from packed | |
243 single precision array element of X selected by index N. */ | |
244 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get a statement-expression macro doing the
   same float-to-int reinterpretation through a union.  */
#define _mm_extract_ps(X, N)                                          \
  (__extension__                                                      \
   ({                                                                 \
     union { int i; float f; } __tmp;                                 \
     __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X),      \
                                            (int)(N));                \
     __tmp.i;                                                         \
   }))
#else
/* Fetch element __N of __X as a float via
   __builtin_ia32_vec_ext_v4sf, then return its bit pattern as an int
   by reading the other member of a union.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
  union { int i; float f; } __bits;

  __bits.f = __builtin_ia32_vec_ext_v4sf ((__v4sf) __X, __N);
  return __bits.i;
}
#endif
262 | |
/* Store into D the single precision element of S selected by index
   N.  Expands to a brace-enclosed statement, not an expression.  */
#define _MM_EXTRACT_FLOAT(D, S, N)                                    \
  {                                                                   \
    (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N));             \
  }
268 | |
/* Move the single precision element N of X into the lower part of an
   __m128: inserts from X into _mm_setzero_ps () with source index N,
   destination index 0 and mask 0x0e.  */
#define _MM_PICK_OUT_PS(X, N)                                         \
  _mm_insert_ps (_mm_setzero_ps (),                                   \
                 (X),                                                 \
                 _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
274 | |
275 /* Insert integer, S, into packed integer array element of D | |
276 selected by index N. */ | |
277 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get macro forms; D is viewed as the matching
   vector type, S and N are converted and forwarded to the
   builtins.  */
#define _mm_insert_epi8(D, S, N)                                      \
  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D),     \
                                           (int)(S), (int)(N)))

#define _mm_insert_epi32(D, S, N)                                     \
  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D),       \
                                          (int)(S), (int)(N)))

#ifdef __x86_64__
#define _mm_insert_epi64(D, S, N)                                     \
  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D),       \
                                          (long long)(S), (int)(N)))
#endif
#else
/* Insert __S into __D (viewed as __v16qi) at index __N via
   __builtin_ia32_vec_set_v16qi.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i __D, int __S, const int __N)
{
  __v16qi __dst = (__v16qi) __D;

  return (__m128i) __builtin_ia32_vec_set_v16qi (__dst, __S, __N);
}

/* Insert __S into __D (viewed as __v4si) at index __N via
   __builtin_ia32_vec_set_v4si.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i __D, int __S, const int __N)
{
  __v4si __dst = (__v4si) __D;

  return (__m128i) __builtin_ia32_vec_set_v4si (__dst, __S, __N);
}

#ifdef __x86_64__
/* Insert __S into __D (viewed as __v2di) at index __N via
   __builtin_ia32_vec_set_v2di.  Only provided when __x86_64__ is
   defined.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
{
  __v2di __dst = (__v2di) __D;

  return (__m128i) __builtin_ia32_vec_set_v2di (__dst, __S, __N);
}
#endif
#endif
316 | |
317 /* Extract integer from packed integer array element of X selected by | |
318 index N. */ | |
319 | |
#ifdef __OPTIMIZE__
/* Extract the element of __X (viewed as __v16qi) selected by __N.
   The builtin's byte result is converted to unsigned char before being
   widened to int, so the returned value is always in [0, 255] instead
   of depending on the signedness of the builtin's result type.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
  return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
}

/* Extract the element of __X (viewed as __v4si) selected by __N.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
}

#ifdef __x86_64__
/* Extract the element of __X (viewed as __v2di) selected by __N.
   Only provided when __x86_64__ is defined.  */
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
}
#endif
#else
/* Macro forms for non-optimizing builds.  The byte variant applies the
   same unsigned char conversion as the inline function so both forms
   return a value in [0, 255].  */
#define _mm_extract_epi8(X, N) \
  ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
#define _mm_extract_epi32(X, N) \
  ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))

#ifdef __x86_64__
#define _mm_extract_epi64(X, N) \
  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
#endif
#endif
351 | |
352 /* Return horizontal packed word minimum and its index in bits [15:0] | |
353 and bits [18:16] respectively. */ | |
354 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
355 _mm_minpos_epu16 (__m128i __X) | |
356 { | |
357 return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X); | |
358 } | |
359 | |
360 /* Packed integer sign-extension. */ | |
361 | |
362 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
363 _mm_cvtepi8_epi32 (__m128i __X) | |
364 { | |
365 return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X); | |
366 } | |
367 | |
368 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
369 _mm_cvtepi16_epi32 (__m128i __X) | |
370 { | |
371 return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X); | |
372 } | |
373 | |
374 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
375 _mm_cvtepi8_epi64 (__m128i __X) | |
376 { | |
377 return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X); | |
378 } | |
379 | |
380 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
381 _mm_cvtepi32_epi64 (__m128i __X) | |
382 { | |
383 return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X); | |
384 } | |
385 | |
386 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
387 _mm_cvtepi16_epi64 (__m128i __X) | |
388 { | |
389 return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X); | |
390 } | |
391 | |
392 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
393 _mm_cvtepi8_epi16 (__m128i __X) | |
394 { | |
395 return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X); | |
396 } | |
397 | |
398 /* Packed integer zero-extension. */ | |
399 | |
400 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
401 _mm_cvtepu8_epi32 (__m128i __X) | |
402 { | |
403 return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X); | |
404 } | |
405 | |
406 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
407 _mm_cvtepu16_epi32 (__m128i __X) | |
408 { | |
409 return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X); | |
410 } | |
411 | |
412 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
413 _mm_cvtepu8_epi64 (__m128i __X) | |
414 { | |
415 return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X); | |
416 } | |
417 | |
418 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
419 _mm_cvtepu32_epi64 (__m128i __X) | |
420 { | |
421 return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X); | |
422 } | |
423 | |
424 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
425 _mm_cvtepu16_epi64 (__m128i __X) | |
426 { | |
427 return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X); | |
428 } | |
429 | |
430 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
431 _mm_cvtepu8_epi16 (__m128i __X) | |
432 { | |
433 return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X); | |
434 } | |
435 | |
436 /* Pack 8 double words from 2 operands into 8 words of result with | |
437 unsigned saturation. */ | |
438 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
439 _mm_packus_epi32 (__m128i __X, __m128i __Y) | |
440 { | |
441 return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y); | |
442 } | |
443 | |
444 /* Sum absolute 8-bit integer difference of adjacent groups of 4 | |
445 byte integers in the first 2 operands. Starting offsets within | |
446 operands are determined by the 3rd mask operand. */ | |
447 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get a macro form that forwards X, Y (as
   __v16qi) and M (as int) to the builtin.  */
#define _mm_mpsadbw_epu8(X, Y, M)                                     \
  ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X),        \
                                        (__v16qi)(__m128i)(Y),        \
                                        (int)(M)))
#else
/* Thin wrapper around __builtin_ia32_mpsadbw128; the mask operand __M
   is forwarded unchanged.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return (__m128i) __builtin_ia32_mpsadbw128 (__lhs, __rhs, __M);
}
#endif
460 | |
461 /* Load double quadword using non-temporal aligned hint. */ | |
462 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
463 _mm_stream_load_si128 (__m128i *__X) | |
464 { | |
465 return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X); | |
466 } | |
467 | |
468 #ifdef __SSE4_2__ | |
469 | |
470 /* These macros specify the source data format. */ | |
471 #define _SIDD_UBYTE_OPS 0x00 | |
472 #define _SIDD_UWORD_OPS 0x01 | |
473 #define _SIDD_SBYTE_OPS 0x02 | |
474 #define _SIDD_SWORD_OPS 0x03 | |
475 | |
476 /* These macros specify the comparison operation. */ | |
477 #define _SIDD_CMP_EQUAL_ANY 0x00 | |
478 #define _SIDD_CMP_RANGES 0x04 | |
479 #define _SIDD_CMP_EQUAL_EACH 0x08 | |
480 #define _SIDD_CMP_EQUAL_ORDERED 0x0c | |
481 | |
482 /* These macros specify the polarity. */ | |
483 #define _SIDD_POSITIVE_POLARITY 0x00 | |
484 #define _SIDD_NEGATIVE_POLARITY 0x10 | |
485 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 | |
486 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 | |
487 | |
488 /* These macros specify the output selection in _mm_cmpXstri (). */ | |
489 #define _SIDD_LEAST_SIGNIFICANT 0x00 | |
490 #define _SIDD_MOST_SIGNIFICANT 0x40 | |
491 | |
492 /* These macros specify the output selection in _mm_cmpXstrm (). */ | |
493 #define _SIDD_BIT_MASK 0x00 | |
494 #define _SIDD_UNIT_MASK 0x40 | |
495 | |
496 /* Intrinsics for text/string processing. */ | |
497 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get macro forms; vector operands are viewed as
   __v16qi, lengths and the mode are converted to int, and everything
   is forwarded to the matching builtin.  */
#define _mm_cmpistrm(X, Y, M)                                         \
  ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X),      \
                                          (__v16qi)(__m128i)(Y),      \
                                          (int)(M)))
#define _mm_cmpistri(X, Y, M)                                         \
  ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X),          \
                                      (__v16qi)(__m128i)(Y),          \
                                      (int)(M)))

#define _mm_cmpestrm(X, LX, Y, LY, M)                                 \
  ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X),      \
                                          (int)(LX),                  \
                                          (__v16qi)(__m128i)(Y),      \
                                          (int)(LY),                  \
                                          (int)(M)))
#define _mm_cmpestri(X, LX, Y, LY, M)                                 \
  ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X),          \
                                      (int)(LX),                      \
                                      (__v16qi)(__m128i)(Y),          \
                                      (int)(LY),                      \
                                      (int)(M)))
#else
/* Forwards to __builtin_ia32_pcmpistrm128 and returns its vector
   result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return (__m128i) __builtin_ia32_pcmpistrm128 (__lhs, __rhs, __M);
}

/* Forwards to __builtin_ia32_pcmpistri128 and returns its int
   result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpistri128 (__lhs, __rhs, __M);
}

/* Explicit-length form: __LX and __LY accompany __X and __Y.
   Forwards to __builtin_ia32_pcmpestrm128.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return (__m128i) __builtin_ia32_pcmpestrm128 (__lhs, __LX,
                                                __rhs, __LY, __M);
}

/* Explicit-length form: __LX and __LY accompany __X and __Y.
   Forwards to __builtin_ia32_pcmpestri128.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpestri128 (__lhs, __LX, __rhs, __LY, __M);
}
#endif
547 | |
548 /* Intrinsics for text/string processing and reading values of | |
549 EFlags. */ | |
550 | |
#ifndef __OPTIMIZE__
/* Non-optimizing builds get macro forms.  Each one views the vector
   operands as __v16qi, converts lengths and mode to int, and forwards
   to the builtin whose name ends in the same a/c/o/s/z letter.  */
#define _mm_cmpistra(X, Y, M)                                         \
  ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X),         \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(M)))
#define _mm_cmpistrc(X, Y, M)                                         \
  ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X),         \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(M)))
#define _mm_cmpistro(X, Y, M)                                         \
  ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X),         \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(M)))
#define _mm_cmpistrs(X, Y, M)                                         \
  ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X),         \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(M)))
#define _mm_cmpistrz(X, Y, M)                                         \
  ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X),         \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(M)))

#define _mm_cmpestra(X, LX, Y, LY, M)                                 \
  ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X),         \
                                       (int)(LX),                     \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(LY),                     \
                                       (int)(M)))
#define _mm_cmpestrc(X, LX, Y, LY, M)                                 \
  ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X),         \
                                       (int)(LX),                     \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(LY),                     \
                                       (int)(M)))
#define _mm_cmpestro(X, LX, Y, LY, M)                                 \
  ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X),         \
                                       (int)(LX),                     \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(LY),                     \
                                       (int)(M)))
#define _mm_cmpestrs(X, LX, Y, LY, M)                                 \
  ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X),         \
                                       (int)(LX),                     \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(LY),                     \
                                       (int)(M)))
#define _mm_cmpestrz(X, LX, Y, LY, M)                                 \
  ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X),         \
                                       (int)(LX),                     \
                                       (__v16qi)(__m128i)(Y),         \
                                       (int)(LY),                     \
                                       (int)(M)))
#else
/* Inline forms.  Every wrapper below views its vector operands as
   __v16qi, forwards the mode __M unchanged and returns the int result
   of the builtin whose name ends in the same a/c/o/s/z letter.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpistria128 (__lhs, __rhs, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpistric128 (__lhs, __rhs, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpistrio128 (__lhs, __rhs, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpistris128 (__lhs, __rhs, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpistriz128 (__lhs, __rhs, __M);
}

/* Explicit-length variants: __LX and __LY accompany __X and __Y.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpestria128 (__lhs, __LX, __rhs, __LY, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpestric128 (__lhs, __LX, __rhs, __LY, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpestrio128 (__lhs, __LX, __rhs, __LY, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpestris128 (__lhs, __LX, __rhs, __LY, __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  __v16qi __lhs = (__v16qi) __X;
  __v16qi __rhs = (__v16qi) __Y;

  return __builtin_ia32_pcmpestriz128 (__lhs, __LX, __rhs, __LY, __M);
}
#endif
669 | |
670 /* Packed integer 64-bit comparison, zeroing or filling with ones | |
671 corresponding parts of result. */ | |
672 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
673 _mm_cmpgt_epi64 (__m128i __X, __m128i __Y) | |
674 { | |
675 return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y); | |
676 } | |
677 | |
/* Count the bits set to 1 in __X, using __builtin_popcount.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_u32 (unsigned int __X)
{
  const int __bits = __builtin_popcount (__X);

  return __bits;
}
684 | |
#ifdef __x86_64__
/* 64-bit counterpart of _mm_popcnt_u32, using __builtin_popcountll.
   Only provided when __x86_64__ is defined.  */
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_u64 (unsigned long long __X)
{
  const long long __bits = __builtin_popcountll (__X);

  return __bits;
}
#endif
692 | |
/* Accumulate a CRC32 (polynomial 0x11EDC6F41) value: folds the byte
   __V into the running value __C via __builtin_ia32_crc32qi.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u8 (unsigned int __C, unsigned char __V)
{
  const unsigned int __crc = __builtin_ia32_crc32qi (__C, __V);

  return __crc;
}
699 | |
/* Folds the 16-bit value __V into the running value __C via
   __builtin_ia32_crc32hi.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u16 (unsigned int __C, unsigned short __V)
{
  const unsigned int __crc = __builtin_ia32_crc32hi (__C, __V);

  return __crc;
}
705 | |
/* Folds the 32-bit value __V into the running value __C via
   __builtin_ia32_crc32si.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u32 (unsigned int __C, unsigned int __V)
{
  const unsigned int __crc = __builtin_ia32_crc32si (__C, __V);

  return __crc;
}
711 | |
#ifdef __x86_64__
/* Folds the 64-bit value __V into the running value __C via
   __builtin_ia32_crc32di.  Only provided when __x86_64__ is
   defined.  */
extern __inline unsigned long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
{
  const unsigned long long __crc = __builtin_ia32_crc32di (__C, __V);

  return __crc;
}
#endif
719 | |
720 #endif /* __SSE4_2__ */ | |
721 | |
722 #endif /* __SSE4_1__ */ | |
723 | |
724 #endif /* _SMMINTRIN_H_INCLUDED */ |