/* Copyright (C) 2003-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
   is a good match for most SIMD operations.  However, the horizontal
   add/sub operations require the data pairs to be permuted into
   separate registers with vertical even/odd alignment before the
   operation, and the addsub operations require that the sign of only
   the even-numbered elements be flipped (XORed with -0.0).
   For larger blocks of code using these intrinsic implementations,
   the compiler should be able to schedule instructions to avoid
   additional latency.

   In the specific case of the monitor and mwait instructions, there is
   no direct equivalent in the PowerISA at this time.  So those
   intrinsics are not implemented.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif

#ifndef _PMMINTRIN_H_INCLUDED
#define _PMMINTRIN_H_INCLUDED

/* We need definitions from the SSE2 and SSE header files.  */
#include <emmintrin.h>

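/* Alternately subtract and add packed single-precision elements:
   result = {__X[0] - __Y[0], __X[1] + __Y[1],
             __X[2] - __Y[2], __X[3] + __Y[3]}.
   Flipping the sign of the even-numbered elements of __Y (XOR with
   -0.0) lets a single vector add produce the subtract/add pattern.  */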
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf even_neg_Y = vec_xor (__Y, even_n0);
  return (__m128) vec_add (__X, even_neg_Y);
}

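/* Alternately subtract and add packed double-precision elements:
   result = {__X[0] - __Y[0], __X[1] + __Y[1]}.  */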
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
  const __v2df even_n0 = {-0.0, 0.0};
  __v2df even_neg_Y = vec_xor (__Y, even_n0);
  return (__m128d) vec_add (__X, even_neg_Y);
}

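/* Horizontally add adjacent pairs:
   result = {__X[0] + __X[1], __X[2] + __X[3],
             __Y[0] + __Y[1], __Y[2] + __Y[3]}.
   The two permutes gather the even-numbered and odd-numbered elements
   of {__X, __Y} into vertically aligned vectors so that one vector add
   forms all four sums.  */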
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
#ifdef __LITTLE_ENDIAN__
      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
#elif __BIG_ENDIAN__
      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F,
      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
#endif
  };
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B,
      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
#endif
  };
  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

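/* Horizontally subtract adjacent pairs:
   result = {__X[0] - __X[1], __X[2] - __X[3],
             __Y[0] - __Y[1], __Y[2] - __Y[3]}.  */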
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
#ifdef __LITTLE_ENDIAN__
      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
#elif __BIG_ENDIAN__
      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F,
      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
#endif
  };
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B,
      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
#endif
  };
  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

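/* Horizontally add adjacent double-precision pairs:
   result = {__X[0] + __X[1], __Y[0] + __Y[1]}.  */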
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df) __Y),
                            vec_mergel ((__v2df) __X, (__v2df) __Y));
}

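/* Horizontally subtract adjacent double-precision pairs:
   result = {__X[0] - __X[1], __Y[0] - __Y[1]}.  */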
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df) __Y),
                            vec_mergel ((__v2df) __X, (__v2df) __Y));
}

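/* Duplicate the odd-numbered elements of __X:
   result = {__X[1], __X[1], __X[3], __X[3]}.  */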
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
  return (__m128) vec_mergeo ((__v4su) __X, (__v4su) __X);
}

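/* Duplicate the even-numbered elements of __X:
   result = {__X[0], __X[0], __X[2], __X[2]}.  */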
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
  return (__m128) vec_mergee ((__v4su) __X, (__v4su) __X);
}

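/* Load a double from *__P and splat it into both elements.  */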
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
  return (__m128d) vec_splats (*__P);
}

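/* Duplicate the low element of __X: result = {__X[0], __X[0]}.  */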
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
}

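/* Load 128 bits from the possibly unaligned address __P.  VSX loads
   tolerate arbitrary alignment, so no special lddqu handling is
   needed.  */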
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
  return (__m128i) (vec_vsx_ld (0, (signed int const *) __P));
}

/* POWER8 / POWER9 have no equivalent for _mm_monitor or _mm_mwait.  */

#endif /* _PMMINTRIN_H_INCLUDED */