Mercurial > hg > CbC > CbC_gcc
annotate gcc/config/i386/smmintrin.h @ 55:77e2b8dfacca gcc-4.4.5
update it from 4.4.3 to 4.5.0
author | ryoma <e075725@ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 12 Feb 2010 23:39:51 +0900 |
parents | a06113de4d67 |
children | f6334be47118 |
rev | line source |
---|---|
0 | 1 /* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. |
2 | |
3 This file is part of GCC. | |
4 | |
5 GCC is free software; you can redistribute it and/or modify | |
6 it under the terms of the GNU General Public License as published by | |
7 the Free Software Foundation; either version 3, or (at your option) | |
8 any later version. | |
9 | |
10 GCC is distributed in the hope that it will be useful, | |
11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 GNU General Public License for more details. | |
14 | |
15 Under Section 7 of GPL version 3, you are granted additional | |
16 permissions described in the GCC Runtime Library Exception, version | |
17 3.1, as published by the Free Software Foundation. | |
18 | |
19 You should have received a copy of the GNU General Public License and | |
20 a copy of the GCC Runtime Library Exception along with this program; | |
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
22 <http://www.gnu.org/licenses/>. */ | |
23 | |
24 | |
25 /* Implemented from the specification included in the Intel C++ Compiler | |
26 User Guide and Reference, version 10.0. */ | |
27 | |
28 #ifndef _SMMINTRIN_H_INCLUDED | |
29 #define _SMMINTRIN_H_INCLUDED | |
30 | |
31 #ifndef __SSE4_1__ | |
32 # error "SSE4.1 instruction set not enabled" | |
33 #else | |
34 | |
35 /* We need definitions from the SSSE3, SSE3, SSE2 and SSE header | |
36 files. */ | |
37 #include <tmmintrin.h> | |
55
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
38 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
/* Rounding mode macros.  */

/* Rounding direction selectors.  */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Exception control flags, OR-ed with a direction selector below.  */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Predefined direction/exception combinations.  */
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
61 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
62 /* Test Instruction */ |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
63 /* Packed integer 128-bit bitwise comparison. Return 1 if |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
64 (__V & __M) == 0. */ |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
65 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
66 _mm_testz_si128 (__m128i __M, __m128i __V) |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
67 { |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
68 return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V); |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
69 } |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
70 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
71 /* Packed integer 128-bit bitwise comparison. Return 1 if |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
72 (__V & ~__M) == 0. */ |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
73 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
74 _mm_testc_si128 (__m128i __M, __m128i __V) |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
75 { |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
76 return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V); |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
77 } |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
78 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
79 /* Packed integer 128-bit bitwise comparison. Return 1 if |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
80 (__V & __M) != 0 && (__V & ~__M) != 0. */ |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
81 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
82 _mm_testnzc_si128 (__m128i __M, __m128i __V) |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
83 { |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
84 return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V); |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
85 } |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
86 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
/* Macros for packed integer 128-bit comparison intrinsics.  */

/* 1 if (V & M) == 0; thin alias for _mm_testz_si128.  */
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

/* 1 if every bit of V is set.  V is compared against an all-ones
   vector built with _mm_set1_epi32 (-1), so the argument is expanded
   (and therefore evaluated) exactly once.  */
#define _mm_test_all_ones(V) \
  _mm_testc_si128 ((V), _mm_set1_epi32 (-1))

/* 1 if V has bits both inside and outside M; thin alias for
   _mm_testnzc_si128.  */
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
94 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
/* Packed/scalar double precision floating point rounding.  */

#ifdef __OPTIMIZE__
/* Apply the ROUNDPD builtin to __V with rounding control __M.  */
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd (__m128d __V, const int __M)
{
  __v2df __src = (__v2df) __V;

  return (__m128d) __builtin_ia32_roundpd (__src, __M);
}

/* Apply the ROUNDSD builtin to __D and __V with rounding control
   __M.  */
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd(__m128d __D, __m128d __V, const int __M)
{
  __v2df __dst = (__v2df) __D;
  __v2df __src = (__v2df) __V;

  return (__m128d) __builtin_ia32_roundsd (__dst, __src, __M);
}
#else
/* Macro forms used when not optimizing (presumably so that M reaches
   the builtin as a literal constant -- confirm against the builtin's
   requirements).  */
#define _mm_round_pd(V, M) \
  ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))

#define _mm_round_sd(D, V, M) \
  ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \
                                     (__v2df)(__m128d)(V), (int)(M)))
#endif
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
119 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
/* Packed/scalar single precision floating point rounding.  */

#ifdef __OPTIMIZE__
/* Apply the ROUNDPS builtin to __V with rounding control __M.  */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps (__m128 __V, const int __M)
{
  __v4sf __src = (__v4sf) __V;

  return (__m128) __builtin_ia32_roundps (__src, __M);
}

/* Apply the ROUNDSS builtin to __D and __V with rounding control
   __M.  */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss (__m128 __D, __m128 __V, const int __M)
{
  __v4sf __dst = (__v4sf) __D;
  __v4sf __src = (__v4sf) __V;

  return (__m128) __builtin_ia32_roundss (__dst, __src, __M);
}
#else
/* Macro forms used when not optimizing (presumably so that M reaches
   the builtin as a literal constant -- confirm against the builtin's
   requirements).  */
#define _mm_round_ps(V, M) \
  ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))

#define _mm_round_ss(D, V, M) \
  ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \
                                    (__v4sf)(__m128)(V), (int)(M)))
#endif
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
144 |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
/* Macros for ceil/floor intrinsics.  Each one forwards to the matching
   _mm_round_* intrinsic with a fixed rounding control:
   _MM_FROUND_CEIL for the ceil variants and _MM_FROUND_FLOOR for the
   floor variants.  */

/* Double precision, packed (pd) and scalar (sd).  */
#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)

/* Single precision, packed (ps) and scalar (ss).  */
#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
0 | 157 |
158 /* SSE4.1 */ | |
159 | |
/* Integer blend instructions - select data from 2 sources using
   constant/variable mask.  */

#ifdef __OPTIMIZE__
/* Blend __X and __Y, viewed as eight 16-bit elements, under the
   constant mask __M (PBLENDW builtin).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
{
  __v8hi __a = (__v8hi) __X;
  __v8hi __b = (__v8hi) __Y;

  return (__m128i) __builtin_ia32_pblendw128 (__a, __b, __M);
}
#else
#define _mm_blend_epi16(X, Y, M) \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), \
                                        (__v8hi)(__m128i)(Y), (int)(M)))
#endif
176 | |
177 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
178 _mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M) | |
179 { | |
180 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X, | |
181 (__v16qi)__Y, | |
182 (__v16qi)__M); | |
183 } | |
184 | |
/* Single precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
/* Blend __X and __Y under the constant mask __M (BLENDPS builtin).  */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
{
  __v4sf __a = (__v4sf) __X;
  __v4sf __b = (__v4sf) __Y;

  return (__m128) __builtin_ia32_blendps (__a, __b, __M);
}
#else
#define _mm_blend_ps(X, Y, M) \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), \
                                    (__v4sf)(__m128)(Y), (int)(M)))
#endif
201 | |
202 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
203 _mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M) | |
204 { | |
205 return (__m128) __builtin_ia32_blendvps ((__v4sf)__X, | |
206 (__v4sf)__Y, | |
207 (__v4sf)__M); | |
208 } | |
209 | |
/* Double precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
/* Blend __X and __Y under the constant mask __M (BLENDPD builtin).  */
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
{
  __v2df __a = (__v2df) __X;
  __v2df __b = (__v2df) __Y;

  return (__m128d) __builtin_ia32_blendpd (__a, __b, __M);
}
#else
#define _mm_blend_pd(X, Y, M) \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), \
                                     (__v2df)(__m128d)(Y), (int)(M)))
#endif
226 | |
227 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
228 _mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M) | |
229 { | |
230 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X, | |
231 (__v2df)__Y, | |
232 (__v2df)__M); | |
233 } | |
234 | |
/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
/* Single precision dot product of __X and __Y controlled by the
   constant mask __M (DPPS builtin).  */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
{
  __v4sf __a = (__v4sf) __X;
  __v4sf __b = (__v4sf) __Y;

  return (__m128) __builtin_ia32_dpps (__a, __b, __M);
}

/* Double precision dot product of __X and __Y controlled by the
   constant mask __M (DPPD builtin).  */
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
{
  __v2df __a = (__v2df) __X;
  __v2df __b = (__v2df) __Y;

  return (__m128d) __builtin_ia32_dppd (__a, __b, __M);
}
#else
#define _mm_dp_ps(X, Y, M) \
  ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), \
                                 (__v4sf)(__m128)(Y), (int)(M)))

#define _mm_dp_pd(X, Y, M) \
  ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \
                                  (__v2df)(__m128d)(Y), (int)(M)))
#endif
263 | |
264 /* Packed integer 64-bit comparison, zeroing or filling with ones | |
265 corresponding parts of result. */ | |
266 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
267 _mm_cmpeq_epi64 (__m128i __X, __m128i __Y) | |
268 { | |
269 return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y); | |
270 } | |
271 | |
272 /* Min/max packed integer instructions. */ | |
273 | |
274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
275 _mm_min_epi8 (__m128i __X, __m128i __Y) | |
276 { | |
277 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y); | |
278 } | |
279 | |
280 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
281 _mm_max_epi8 (__m128i __X, __m128i __Y) | |
282 { | |
283 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y); | |
284 } | |
285 | |
286 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
287 _mm_min_epu16 (__m128i __X, __m128i __Y) | |
288 { | |
289 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y); | |
290 } | |
291 | |
292 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
293 _mm_max_epu16 (__m128i __X, __m128i __Y) | |
294 { | |
295 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y); | |
296 } | |
297 | |
298 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
299 _mm_min_epi32 (__m128i __X, __m128i __Y) | |
300 { | |
301 return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y); | |
302 } | |
303 | |
304 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
305 _mm_max_epi32 (__m128i __X, __m128i __Y) | |
306 { | |
307 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y); | |
308 } | |
309 | |
310 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
311 _mm_min_epu32 (__m128i __X, __m128i __Y) | |
312 { | |
313 return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y); | |
314 } | |
315 | |
316 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
317 _mm_max_epu32 (__m128i __X, __m128i __Y) | |
318 { | |
319 return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y); | |
320 } | |
321 | |
322 /* Packed integer 32-bit multiplication with truncation of upper | |
323 halves of results. */ | |
324 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
325 _mm_mullo_epi32 (__m128i __X, __m128i __Y) | |
326 { | |
327 return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y); | |
328 } | |
329 | |
330 /* Packed integer 32-bit multiplication of 2 pairs of operands | |
331 with two 64-bit results. */ | |
332 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
333 _mm_mul_epi32 (__m128i __X, __m128i __Y) | |
334 { | |
335 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y); | |
336 } | |
337 | |
/* Insert single precision float into packed single precision array
   element selected by index N.  The bits [7-6] of N define S
   index, the bits [5-4] define D index, and bits [3-0] define
   zeroing mask for D.  */

#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
{
  __v4sf __dst = (__v4sf) __D;
  __v4sf __src = (__v4sf) __S;

  return (__m128) __builtin_ia32_insertps128 (__dst, __src, __N);
}
#else
#define _mm_insert_ps(D, S, N) \
  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), \
                                        (__v4sf)(__m128)(S), (int)(N)))
#endif
356 | |
/* Helper macro to create the N value for _mm_insert_ps.  S is shifted
   into bits [7-6], D into bits [5-4], and M is OR-ed in unshifted.
   The arguments are not range-checked or masked.  */
#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
359 | |
/* Extract binary representation of single precision float from packed
   single precision array element of X selected by index N.  The float
   element is reinterpreted as an int through a union.  */

#ifdef __OPTIMIZE__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
  union { int i; float f; } __bits;

  __bits.f = __builtin_ia32_vec_ext_v4sf ((__v4sf) __X, __N);
  return __bits.i;
}
#else
#define _mm_extract_ps(X, N) \
  (__extension__ \
   ({ \
     union { int i; float f; } __tmp; \
     __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); \
     __tmp.i; \
   }))
#endif
380 | |
/* Store into D the single precision element of S selected by
   index N.

   NOTE: the expansion is a brace-enclosed block, not an expression.
   It can only appear where a statement is allowed, and a semicolon
   written after it forms an extra empty statement, which matters when
   the macro is the unbraced body of an `if' that has an `else'.  */
#define _MM_EXTRACT_FLOAT(D, S, N) \
  { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
386 | |
/* Extract specified single precision float element into the lower
   part of __m128.  Expands to _mm_insert_ps on a zeroed destination
   with an index built from source element N, destination element 0
   and zeroing mask 0x0e.  */
#define _MM_PICK_OUT_PS(X, N) \
  _mm_insert_ps (_mm_setzero_ps (), (X), \
                 _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
392 | |
/* Insert integer, S, into packed integer array element of D
   selected by index N.  */

#ifdef __OPTIMIZE__
/* 8-bit element variant.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i __D, int __S, const int __N)
{
  __v16qi __dst = (__v16qi) __D;

  return (__m128i) __builtin_ia32_vec_set_v16qi (__dst, __S, __N);
}

/* 32-bit element variant.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i __D, int __S, const int __N)
{
  __v4si __dst = (__v4si) __D;

  return (__m128i) __builtin_ia32_vec_set_v4si (__dst, __S, __N);
}

#ifdef __x86_64__
/* 64-bit element variant; only defined when __x86_64__ is set.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
{
  __v2di __dst = (__v2di) __D;

  return (__m128i) __builtin_ia32_vec_set_v2di (__dst, __S, __N);
}
#endif
#else
#define _mm_insert_epi8(D, S, N) \
  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), \
                                           (int)(S), (int)(N)))

#define _mm_insert_epi32(D, S, N) \
  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), \
                                          (int)(S), (int)(N)))

#ifdef __x86_64__
#define _mm_insert_epi64(D, S, N) \
  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), \
                                          (long long)(S), (int)(N)))
#endif
#endif
434 | |
/* Extract integer from packed integer array element of X selected by
   index N.  */

#ifdef __OPTIMIZE__
/* Return 8-bit element __N of __X zero-extended to int.  The builtin
   yields a char, which may be signed; converting through unsigned char
   keeps elements >= 0x80 from being sign-extended, matching PEXTRB,
   which zero-extends.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
  return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
}

/* Return 32-bit element __N of __X.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
}

#ifdef __x86_64__
/* Return 64-bit element __N of __X; only defined when __x86_64__ is
   set.  */
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
}
#endif
#else
/* Macro forms; _mm_extract_epi8 zero-extends exactly like the inline
   function above.  */
#define _mm_extract_epi8(X, N) \
  ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
#define _mm_extract_epi32(X, N) \
  ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))

#ifdef __x86_64__
#define _mm_extract_epi64(X, N) \
  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
#endif
#endif
469 | |
470 /* Return horizontal packed word minimum and its index in bits [15:0] | |
471 and bits [18:16] respectively. */ | |
472 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
473 _mm_minpos_epu16 (__m128i __X) | |
474 { | |
475 return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X); | |
476 } | |
477 | |
478 /* Packed integer sign-extension. */ | |
479 | |
480 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
481 _mm_cvtepi8_epi32 (__m128i __X) | |
482 { | |
483 return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X); | |
484 } | |
485 | |
486 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
487 _mm_cvtepi16_epi32 (__m128i __X) | |
488 { | |
489 return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X); | |
490 } | |
491 | |
/* Sign-extension from 8-bit to 64-bit elements: __X is viewed as __v16qi
   and handed to __builtin_ia32_pmovsxbq128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low two bytes of __X are consumed --
   confirm.  */
492 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
493 _mm_cvtepi8_epi64 (__m128i __X) | |
494 { | |
495 return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X); | |
496 } | |
497 | |
/* Sign-extension from 32-bit to 64-bit elements: __X is viewed as __v4si
   and handed to __builtin_ia32_pmovsxdq128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low two doublewords of __X are
   consumed -- confirm.  */
498 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
499 _mm_cvtepi32_epi64 (__m128i __X) | |
500 { | |
501 return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X); | |
502 } | |
503 | |
/* Sign-extension from 16-bit to 64-bit elements: __X is viewed as __v8hi
   and handed to __builtin_ia32_pmovsxwq128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low two words of __X are consumed --
   confirm.  */
504 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
505 _mm_cvtepi16_epi64 (__m128i __X) | |
506 { | |
507 return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X); | |
508 } | |
509 | |
/* Sign-extension from 8-bit to 16-bit elements: __X is viewed as __v16qi
   and handed to __builtin_ia32_pmovsxbw128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low eight bytes of __X are consumed --
   confirm.  */
510 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
511 _mm_cvtepi8_epi16 (__m128i __X) | |
512 { | |
513 return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X); | |
514 } | |
515 | |
516 /* Packed integer zero-extension. */ | |
517 | |
/* Zero-extension from 8-bit to 32-bit elements: __X is viewed as __v16qi
   and handed to __builtin_ia32_pmovzxbd128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low four bytes of __X are consumed --
   confirm.  */
518 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
519 _mm_cvtepu8_epi32 (__m128i __X) | |
520 { | |
521 return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X); | |
522 } | |
523 | |
/* Zero-extension from 16-bit to 32-bit elements: __X is viewed as __v8hi
   and handed to __builtin_ia32_pmovzxwd128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low four words of __X are consumed --
   confirm.  */
524 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
525 _mm_cvtepu16_epi32 (__m128i __X) | |
526 { | |
527 return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X); | |
528 } | |
529 | |
/* Zero-extension from 8-bit to 64-bit elements: __X is viewed as __v16qi
   and handed to __builtin_ia32_pmovzxbq128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low two bytes of __X are consumed --
   confirm.  */
530 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
531 _mm_cvtepu8_epi64 (__m128i __X) | |
532 { | |
533 return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X); | |
534 } | |
535 | |
/* Zero-extension from 32-bit to 64-bit elements: __X is viewed as __v4si
   and handed to __builtin_ia32_pmovzxdq128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low two doublewords of __X are
   consumed -- confirm.  */
536 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
537 _mm_cvtepu32_epi64 (__m128i __X) | |
538 { | |
539 return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X); | |
540 } | |
541 | |
/* Zero-extension from 16-bit to 64-bit elements: __X is viewed as __v8hi
   and handed to __builtin_ia32_pmovzxwq128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low two words of __X are consumed --
   confirm.  */
542 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
543 _mm_cvtepu16_epi64 (__m128i __X) | |
544 { | |
545 return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X); | |
546 } | |
547 | |
/* Zero-extension from 8-bit to 16-bit elements: __X is viewed as __v16qi
   and handed to __builtin_ia32_pmovzxbw128; the result is returned as
   __m128i.
   NOTE(review): presumably only the low eight bytes of __X are consumed --
   confirm.  */
548 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
549 _mm_cvtepu8_epi16 (__m128i __X) | |
550 { | |
551 return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X); | |
552 } | |
553 | |
554 /* Pack 8 double words from 2 operands into 8 words of result with | |
555 unsigned saturation. */ | |
/* Both operands are viewed as __v4si and passed to
   __builtin_ia32_packusdw128; the builtin's result is cast back to
   __m128i.
   NOTE(review): __X presumably supplies the low four result words and __Y
   the high four -- confirm against the PACKUSDW instruction reference.  */
556 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
557 _mm_packus_epi32 (__m128i __X, __m128i __Y) | |
558 { | |
559 return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y); | |
560 } | |
561 | |
562 /* Sum absolute 8-bit integer difference of adjacent groups of 4 | |
563 byte integers in the first 2 operands. Starting offsets within | |
564 operands are determined by the 3rd mask operand. */ | |
565 | |
566 #ifdef __OPTIMIZE__ | |
/* Function form, used when __OPTIMIZE__ is defined.  __X and __Y are
   viewed as __v16qi and forwarded together with the mask __M to
   __builtin_ia32_mpsadbw128; the result is cast back to __m128i.
   NOTE(review): __M presumably has to fold to a compile-time constant
   once this function is inlined -- confirm.  */
567 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
568 _mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M) | |
569 { | |
570 return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X, | |
571 (__v16qi)__Y, __M); | |
572 } | |
573 #else | |
/* Macro form, used when __OPTIMIZE__ is not defined.  Performs the same
   casts and builtin call as the function form, with M cast to int.
   NOTE(review): presumably a macro so that M reaches the builtin as a
   literal constant without relying on inlining -- confirm.  */
574 #define _mm_mpsadbw_epu8(X, Y, M) \ | |
575 ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), \ | |
576 (__v16qi)(__m128i)(Y), (int)(M))) | |
577 #endif | |
578 | |
579 /* Load double quadword using non-temporal aligned hint. */ | |
/* __X is cast to __v2di * and passed to __builtin_ia32_movntdqa; the
   loaded value is returned as __m128i.
   NOTE(review): the "aligned" wording above presumably means __X must be
   16-byte aligned -- confirm against the MOVNTDQA instruction
   reference.  */
580 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
581 _mm_stream_load_si128 (__m128i *__X) | |
582 { | |
583 return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X); | |
584 } | |
585 | |
586 #ifdef __SSE4_2__ | |
587 | |
/* The groups below use disjoint bit ranges (bits 1:0, 3:2, 5:4 and 6), so
   one value from each group can be OR-ed into a single mode word.
   NOTE(review): presumably that word is the M argument of the
   _mm_cmpXstrY intrinsics defined below -- confirm.  */
588 /* These macros specify the source data format. */ | |
589 #define _SIDD_UBYTE_OPS 0x00 | |
590 #define _SIDD_UWORD_OPS 0x01 | |
591 #define _SIDD_SBYTE_OPS 0x02 | |
592 #define _SIDD_SWORD_OPS 0x03 | |
593 | |
594 /* These macros specify the comparison operation. */ | |
595 #define _SIDD_CMP_EQUAL_ANY 0x00 | |
596 #define _SIDD_CMP_RANGES 0x04 | |
597 #define _SIDD_CMP_EQUAL_EACH 0x08 | |
598 #define _SIDD_CMP_EQUAL_ORDERED 0x0c | |
599 | |
600 /* These macros specify the polarity. */ | |
601 #define _SIDD_POSITIVE_POLARITY 0x00 | |
602 #define _SIDD_NEGATIVE_POLARITY 0x10 | |
603 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 | |
604 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 | |
605 | |
606 /* These macros specify the output selection in _mm_cmpXstri (). */ | |
607 #define _SIDD_LEAST_SIGNIFICANT 0x00 | |
608 #define _SIDD_MOST_SIGNIFICANT 0x40 | |
609 | |
610 /* These macros specify the output selection in _mm_cmpXstrm (). */ | |
611 #define _SIDD_BIT_MASK 0x00 | |
612 #define _SIDD_UNIT_MASK 0x40 | |
613 | |
614 /* Intrinsics for text/string processing. */ | |
615 | |
616 #ifdef __OPTIMIZE__ | |
/* String compare returning a vector result: __X and __Y are viewed as
   __v16qi and forwarded with the mode __M to __builtin_ia32_pcmpistrm128;
   the result is cast to __m128i.
   NOTE(review): the "i" in the name presumably means implicit
   (terminator-delimited) string lengths, and "m" a mask result --
   confirm.  */
617 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
618 _mm_cmpistrm (__m128i __X, __m128i __Y, const int __M) | |
619 { | |
620 return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X, | |
621 (__v16qi)__Y, | |
622 __M); | |
623 } | |
624 | |
/* String compare returning an int result: __X and __Y are viewed as
   __v16qi and forwarded with the mode __M to
   __builtin_ia32_pcmpistri128.
   NOTE(review): the returned int is presumably an element index selected
   by the _SIDD_LEAST/MOST_SIGNIFICANT bit of __M -- confirm.  */
625 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
626 _mm_cmpistri (__m128i __X, __m128i __Y, const int __M) | |
627 { | |
628 return __builtin_ia32_pcmpistri128 ((__v16qi)__X, | |
629 (__v16qi)__Y, | |
630 __M); | |
631 } | |
632 | |
/* Variant of the vector-result string compare that also takes __LX and
   __LY, passed to __builtin_ia32_pcmpestrm128 right after __X and __Y
   respectively; the result is cast to __m128i.
   NOTE(review): __LX/__LY are presumably the explicit lengths of the two
   strings -- confirm.  */
633 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
634 _mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) | |
635 { | |
636 return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX, | |
637 (__v16qi)__Y, __LY, | |
638 __M); | |
639 } | |
640 | |
/* Variant of the int-result string compare that also takes __LX and
   __LY, passed to __builtin_ia32_pcmpestri128 right after __X and __Y
   respectively.
   NOTE(review): __LX/__LY are presumably the explicit lengths of the two
   strings and the result an element index -- confirm.  */
641 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
642 _mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) | |
643 { | |
644 return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX, | |
645 (__v16qi)__Y, __LY, | |
646 __M); | |
647 } | |
648 #else | |
/* Macro forms of the four string-compare intrinsics, used when
   __OPTIMIZE__ is not defined.  Vector arguments are cast through __m128i
   to __v16qi; LX, LY and M are cast to int; the result is cast to __m128i
   (..strm) or int (..stri).
   NOTE(review): presumably macros so that M reaches the builtin as a
   literal constant without relying on inlining -- confirm.  */
649 #define _mm_cmpistrm(X, Y, M) \ | |
650 ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), \ | |
651 (__v16qi)(__m128i)(Y), (int)(M))) | |
652 #define _mm_cmpistri(X, Y, M) \ | |
653 ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), \ | |
654 (__v16qi)(__m128i)(Y), (int)(M))) | |
655 | |
656 #define _mm_cmpestrm(X, LX, Y, LY, M) \ | |
657 ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), \ | |
658 (int)(LX), (__v16qi)(__m128i)(Y), \ | |
659 (int)(LY), (int)(M))) | |
660 #define _mm_cmpestri(X, LX, Y, LY, M) \ | |
661 ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \ | |
662 (__v16qi)(__m128i)(Y), (int)(LY), \ | |
663 (int)(M))) | |
664 #endif | |
665 | |
666 /* Intrinsics for text/string processing and reading values of | |
667 EFlags. */ | |
668 | |
669 #ifdef __OPTIMIZE__ | |
/* Returns the int produced by __builtin_ia32_pcmpistria128 for __X and
   __Y (viewed as __v16qi) under mode __M.
   NOTE(review): the trailing "a" presumably selects the "above" condition
   (carry and zero flags both clear) -- confirm against the Intel
   reference.  */
670 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
671 _mm_cmpistra (__m128i __X, __m128i __Y, const int __M) | |
672 { | |
673 return __builtin_ia32_pcmpistria128 ((__v16qi)__X, | |
674 (__v16qi)__Y, | |
675 __M); | |
676 } | |
677 | |
/* Returns the int produced by __builtin_ia32_pcmpistric128 for __X and
   __Y (viewed as __v16qi) under mode __M.
   NOTE(review): the trailing "c" presumably selects the carry flag --
   confirm against the Intel reference.  */
678 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
679 _mm_cmpistrc (__m128i __X, __m128i __Y, const int __M) | |
680 { | |
681 return __builtin_ia32_pcmpistric128 ((__v16qi)__X, | |
682 (__v16qi)__Y, | |
683 __M); | |
684 } | |
685 | |
/* Returns the int produced by __builtin_ia32_pcmpistrio128 for __X and
   __Y (viewed as __v16qi) under mode __M.
   NOTE(review): the trailing "o" presumably selects the overflow flag --
   confirm against the Intel reference.  */
686 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
687 _mm_cmpistro (__m128i __X, __m128i __Y, const int __M) | |
688 { | |
689 return __builtin_ia32_pcmpistrio128 ((__v16qi)__X, | |
690 (__v16qi)__Y, | |
691 __M); | |
692 } | |
693 | |
/* Returns the int produced by __builtin_ia32_pcmpistris128 for __X and
   __Y (viewed as __v16qi) under mode __M.
   NOTE(review): the trailing "s" presumably selects the sign flag --
   confirm against the Intel reference.  */
694 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
695 _mm_cmpistrs (__m128i __X, __m128i __Y, const int __M) | |
696 { | |
697 return __builtin_ia32_pcmpistris128 ((__v16qi)__X, | |
698 (__v16qi)__Y, | |
699 __M); | |
700 } | |
701 | |
/* Returns the int produced by __builtin_ia32_pcmpistriz128 for __X and
   __Y (viewed as __v16qi) under mode __M.
   NOTE(review): the trailing "z" presumably selects the zero flag --
   confirm against the Intel reference.  */
702 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
703 _mm_cmpistrz (__m128i __X, __m128i __Y, const int __M) | |
704 { | |
705 return __builtin_ia32_pcmpistriz128 ((__v16qi)__X, | |
706 (__v16qi)__Y, | |
707 __M); | |
708 } | |
709 | |
/* Returns the int produced by __builtin_ia32_pcmpestria128; __LX and __LY
   are passed right after __X and __Y (both viewed as __v16qi), followed
   by the mode __M.
   NOTE(review): the trailing "a" presumably selects the "above" condition
   (carry and zero flags both clear) -- confirm against the Intel
   reference.  */
710 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
711 _mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) | |
712 { | |
713 return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX, | |
714 (__v16qi)__Y, __LY, | |
715 __M); | |
716 } | |
717 | |
/* Returns the int produced by __builtin_ia32_pcmpestric128; __LX and __LY
   are passed right after __X and __Y (both viewed as __v16qi), followed
   by the mode __M.
   NOTE(review): the trailing "c" presumably selects the carry flag --
   confirm against the Intel reference.  */
718 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
719 _mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) | |
720 { | |
721 return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX, | |
722 (__v16qi)__Y, __LY, | |
723 __M); | |
724 } | |
725 | |
/* Returns the int produced by __builtin_ia32_pcmpestrio128; __LX and __LY
   are passed right after __X and __Y (both viewed as __v16qi), followed
   by the mode __M.
   NOTE(review): the trailing "o" presumably selects the overflow flag --
   confirm against the Intel reference.  */
726 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
727 _mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) | |
728 { | |
729 return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX, | |
730 (__v16qi)__Y, __LY, | |
731 __M); | |
732 } | |
733 | |
/* Returns the int produced by __builtin_ia32_pcmpestris128; __LX and __LY
   are passed right after __X and __Y (both viewed as __v16qi), followed
   by the mode __M.
   NOTE(review): the trailing "s" presumably selects the sign flag --
   confirm against the Intel reference.  */
734 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
735 _mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) | |
736 { | |
737 return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX, | |
738 (__v16qi)__Y, __LY, | |
739 __M); | |
740 } | |
741 | |
/* Returns the int produced by __builtin_ia32_pcmpestriz128; __LX and __LY
   are passed right after __X and __Y (both viewed as __v16qi), followed
   by the mode __M.
   NOTE(review): the trailing "z" presumably selects the zero flag --
   confirm against the Intel reference.  */
742 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
743 _mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) | |
744 { | |
745 return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX, | |
746 (__v16qi)__Y, __LY, | |
747 __M); | |
748 } | |
749 #else | |
/* Macro forms of the ten EFlags-reading string intrinsics, used when
   __OPTIMIZE__ is not defined.  Each casts its vector arguments through
   __m128i to __v16qi, casts LX, LY and M to int, calls the builtin whose
   name carries the same a/c/o/s/z suffix, and casts the result to int.
   NOTE(review): presumably macros so that M reaches the builtin as a
   literal constant without relying on inlining -- confirm.  */
750 #define _mm_cmpistra(X, Y, M) \ | |
751 ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), \ | |
752 (__v16qi)(__m128i)(Y), (int)(M))) | |
753 #define _mm_cmpistrc(X, Y, M) \ | |
754 ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), \ | |
755 (__v16qi)(__m128i)(Y), (int)(M))) | |
756 #define _mm_cmpistro(X, Y, M) \ | |
757 ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), \ | |
758 (__v16qi)(__m128i)(Y), (int)(M))) | |
759 #define _mm_cmpistrs(X, Y, M) \ | |
760 ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), \ | |
761 (__v16qi)(__m128i)(Y), (int)(M))) | |
762 #define _mm_cmpistrz(X, Y, M) \ | |
763 ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), \ | |
764 (__v16qi)(__m128i)(Y), (int)(M))) | |
765 | |
766 #define _mm_cmpestra(X, LX, Y, LY, M) \ | |
767 ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \ | |
768 (__v16qi)(__m128i)(Y), (int)(LY), \ | |
769 (int)(M))) | |
770 #define _mm_cmpestrc(X, LX, Y, LY, M) \ | |
771 ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \ | |
772 (__v16qi)(__m128i)(Y), (int)(LY), \ | |
773 (int)(M))) | |
774 #define _mm_cmpestro(X, LX, Y, LY, M) \ | |
775 ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \ | |
776 (__v16qi)(__m128i)(Y), (int)(LY), \ | |
777 (int)(M))) | |
778 #define _mm_cmpestrs(X, LX, Y, LY, M) \ | |
779 ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \ | |
780 (__v16qi)(__m128i)(Y), (int)(LY), \ | |
781 (int)(M))) | |
782 #define _mm_cmpestrz(X, LX, Y, LY, M) \ | |
783 ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \ | |
784 (__v16qi)(__m128i)(Y), (int)(LY), \ | |
785 (int)(M))) | |
786 #endif | |
787 | |
788 /* Packed integer 64-bit comparison, zeroing or filling with ones | |
789 corresponding parts of result. */ | |
/* __X and __Y are viewed as __v2di and passed to __builtin_ia32_pcmpgtq;
   the builtin's result is cast back to __m128i.
   NOTE(review): presumably a signed greater-than test that sets a 64-bit
   lane to all ones where __X > __Y and to zero otherwise -- confirm.  */
790 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
791 _mm_cmpgt_epi64 (__m128i __X, __m128i __Y) | |
792 { | |
793 return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y); | |
794 } | |
795 | |
55
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
796 #ifdef __POPCNT__ |
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
797 #include <popcntintrin.h> |
0 | 798 #endif |
799 | |
800 /* Accumulate CRC32 (polynomial 0x11EDC6F41) value. */ | |
/* Returns the result of __builtin_ia32_crc32qi applied to the running
   value __C and the unsigned char __V.
   NOTE(review): presumably __C is the CRC accumulated so far and the
   return value is the CRC updated with one more byte -- confirm.  */
801 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
802 _mm_crc32_u8 (unsigned int __C, unsigned char __V) | |
803 { | |
804 return __builtin_ia32_crc32qi (__C, __V); | |
805 } | |
806 | |
/* Returns the result of __builtin_ia32_crc32hi applied to the running
   value __C and the unsigned short __V.
   NOTE(review): presumably __C is the CRC accumulated so far and the
   return value is the CRC updated with 16 more bits -- confirm.  */
807 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
808 _mm_crc32_u16 (unsigned int __C, unsigned short __V) | |
809 { | |
810 return __builtin_ia32_crc32hi (__C, __V); | |
811 } | |
812 | |
/* Returns the result of __builtin_ia32_crc32si applied to the running
   value __C and the unsigned int __V.
   NOTE(review): presumably __C is the CRC accumulated so far and the
   return value is the CRC updated with 32 more bits -- confirm.  */
813 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
814 _mm_crc32_u32 (unsigned int __C, unsigned int __V) | |
815 { | |
816 return __builtin_ia32_crc32si (__C, __V); | |
817 } | |
818 | |
819 #ifdef __x86_64__ | |
/* 64-bit variant, only compiled under __x86_64__: returns the result of
   __builtin_ia32_crc32di applied to __C and __V, both unsigned long long.
   NOTE(review): presumably only the low 32 bits of __C and of the result
   carry CRC state -- confirm against the CRC32 instruction reference.  */
820 extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
821 _mm_crc32_u64 (unsigned long long __C, unsigned long long __V) | |
822 { | |
823 return __builtin_ia32_crc32di (__C, __V); | |
824 } | |
825 #endif | |
826 | |
827 #endif /* __SSE4_2__ */ | |
828 | |
829 #endif /* __SSE4_1__ */ | |
830 | |
831 #endif /* _SMMINTRIN_H_INCLUDED */ |