gcc/config/rs6000/tmmintrin.h (gcc-9.2.0, changeset 145:1830386684a0)

/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

/* _mm_abs_*: element-wise absolute value (SSSE3 PABSB/PABSW/PABSD),
   implemented directly with vec_abs.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

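/* Editorial note (not part of the original header): every __m64 variant
   in this file uses the same idiom as the _mm_abs_pi* functions above:
   splat the 64-bit operand into both doublewords of a 128-bit vector,
   apply the VMX/VSX operation to all lanes, and take back doubleword 0.
   A minimal standalone sketch of the idiom; the __example_* name is
   hypothetical:  */
static __inline__ __m64
__example_m64_via_vmx (__m64 __A)
{
  __v8hi __tmp = (__v8hi) (__v2du) { __A, __A }; /* both halves hold __A */
  __tmp = vec_abs (__tmp);                       /* operate on all 8 lanes */
  return (__m64) ((__v2du) __tmp)[0];            /* low doubleword back */
}
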
/* Concatenate __A:__B as a 32-byte value and return bytes
   [__count, __count + 15] (SSSE3 PALIGNR).  When __count is a
   compile-time constant below 16, a single vec_sld suffices;
   otherwise the result is built from octet shifts.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu zero = { 0 };
          return (__m128i) zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

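/* Editorial sketch (not part of the original header): a worked example
   of the concatenate-and-shift semantics above.  For __count == 4 the
   result is { __b[4], ..., __b[15], __a[0], ..., __a[3] } in element
   order.  The __example_* name is hypothetical:  */
static __inline__ __m128i
__example_alignr_4 (__m128i __a, __m128i __b)
{
  /* Bytes 4..19 of the 32-byte concatenation __a:__b.  */
  return _mm_alignr_epi8 (__a, __b, 4);
}
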
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

/* _mm_hadd_*: horizontal add of adjacent element pairs (SSSE3
   PHADDW/PHADDD).  One permute gathers the first element of each
   adjacent pair of __A|__B and the other gathers the second, so a
   single vector add forms every pairwise sum.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

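/* Editorial sketch (not part of the original header): a worked example
   for the horizontal add above.  The __example_* name is hypothetical: */
static __inline__ __m128i
__example_hadd_epi32 (void)
{
  __m128i __a = (__m128i) (__v4si) { 1, 2, 3, 4 };
  __m128i __b = (__m128i) (__v4si) { 5, 6, 7, 8 };
  /* Expected result: { 1+2, 3+4, 5+6, 7+8 } == { 3, 7, 11, 15 }.  */
  return _mm_hadd_epi32 (__a, __b);
}
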
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* Horizontal add with signed saturation (SSSE3 PHADDSW).  vec_sum4s
   forms each pair sum in a full 32-bit lane, so the addition itself
   cannot overflow; vec_packs then saturates while packing back to
   16 bits.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

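/* Editorial sketch (not part of the original header): the saturation
   behavior of the function above.  The __example_* name is
   hypothetical:  */
static __inline__ __m128i
__example_hadds_saturate (void)
{
  __m128i __a = (__m128i) vec_splats ((signed short) 0x7fff);
  /* Each pair sum is 0xfffe, which exceeds the 16-bit signed maximum,
     so every result lane saturates to 0x7fff.  */
  return _mm_hadds_epi16 (__a, __a);
}
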
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

/* _mm_hsub_*: horizontal subtract of adjacent element pairs, using the
   same pairwise permute scheme as the horizontal adds above.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

/* Byte shuffle (SSSE3 PSHUFB): result byte __i is __A[__B[__i] & 15],
   or zero when __B[__i] has its high bit set.  vec_perm handles the
   index selection; vec_sel zeroes the negative-control lanes.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

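/* Editorial sketch (not part of the original header): with a control
   vector whose high bits are all clear, the shuffle above is a pure
   permutation; this one reverses the byte order.  The __example_* name
   is hypothetical:  */
static __inline__ __m128i
__example_shuffle_reverse (__m128i __a)
{
  const __m128i __ctl = (__m128i) (__v16qi)
    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  return _mm_shuffle_epi8 (__a, __ctl);
}
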
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

/* _mm_sign_*: multiply each element of __A by the sign of the
   corresponding element of __B (SSSE3 PSIGNB/PSIGNW/PSIGND).  The
   compare results are 0 or -1 masks; negating the greater-than mask
   gives +1, so __conv holds -1, 0, or +1 per element.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

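/* Editorial sketch (not part of the original header): a worked example
   for the sign operation above.  The __example_* name is hypothetical: */
static __inline__ __m128i
__example_sign_epi32 (void)
{
  __m128i __a = (__m128i) (__v4si) { 10, 20, 30, 40 };
  __m128i __b = (__m128i) (__v4si) { -1, 0, 5, -7 };
  /* Expected result: { -10, 0, 30, -40 }.  */
  return _mm_sign_epi32 (__a, __b);
}
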
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* Multiply unsigned bytes of __A by the corresponding signed bytes of
   __B, then add adjacent 16-bit products with signed saturation
   (SSSE3 PMADDUBSW).  vec_unpack* sign-extends, so the 0x00ff mask
   re-creates the unsigned interpretation of __A's bytes.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

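/* Editorial sketch (not part of the original header): worked arithmetic
   for the function above.  The __example_* name is hypothetical:  */
static __inline__ __m128i
__example_maddubs (void)
{
  __m128i __a = (__m128i) vec_splats ((unsigned char) 200);
  __m128i __b = (__m128i) vec_splats ((signed char) -3);
  /* Each result lane: 200 * -3 + 200 * -3 == -1200 (no saturation).  */
  return _mm_maddubs_epi16 (__a, __b);
}
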
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

/* Rounded scaled multiply (SSSE3 PMULHRSW): each lane computes
   (((__A * __B) >> 14) + 1) >> 1, which equals
   (__A * __B + 0x4000) >> 15, using 32-bit intermediates so the
   product cannot overflow.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

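/* Editorial sketch (not part of the original header): a worked example
   in Q15 fixed point.  The __example_* name is hypothetical:  */
static __inline__ __m128i
__example_mulhrs (void)
{
  __m128i __a = (__m128i) vec_splats ((signed short) 0x4000); /* 0.5  */
  __m128i __b = (__m128i) vec_splats ((signed short) 0x2000); /* 0.25 */
  /* Each lane: (0x4000 * 0x2000 + 0x4000) >> 15 == 0x1000 (0.125).  */
  return _mm_mulhrs_epi16 (__a, __b);
}
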
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

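/* Editorial usage sketch (not part of the original header): x86_64
   source like the function below can be recompiled unchanged for
   powerpc64le against this header, e.g. with
   "gcc -DNO_WARN_X86_INTRINSICS -c file.c" (assumed invocation).
   The __example_* name is hypothetical:  */
static __inline__ __m128i
__example_ported_kernel (__m128i __v)
{
  /* Absolute value, then horizontal pair sums with saturation.  */
  return _mm_hadds_epi16 (_mm_abs_epi16 (__v), _mm_abs_epi16 (__v));
}
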
#endif