comparison gcc/config/rs6000/mmintrin.h @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents
children 84e7813d76e9
1 /* Copyright (C) 2002-2017 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
37 target does not support a native __vector_size__ (8) type. Instead
38 we typedef __m64 to a 64-bit unsigned long long, which is natively
39 supported in 64-bit mode. This works well for the _si64 and some
40 _pi32 operations, but starts to generate long sequences for _pi16
41 and _pi8 operations. For those cases it is better (faster and
42 smaller code) to transfer __m64 data to the PowerPC vector 128-bit
43 unit, perform the operation, and then transfer the result back to
44 the __m64 type. This implies that the direct register move
45 instructions, introduced with power8, are available for efficient
46 implementation of these transfers.
47
48 Most MMX intrinsic operations can be performed efficiently as
49 C language 64-bit scalar operations or optimized to use the newer
50 128-bit SSE/Altivec operations. We recommend this for new
51 applications. */
52 #warning "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
53 #endif
54
55 #ifndef _MMINTRIN_H_INCLUDED
56 #define _MMINTRIN_H_INCLUDED
57
58 #include <altivec.h>
59 /* The Intel API is flexible enough that we must allow aliasing with other
60 vector types, and their scalar components. */
61 typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;
62
63 typedef __attribute__ ((__aligned__ (8)))
64 union
65 {
66 __m64 as_m64;
67 char as_char[8];
68 signed char as_signed_char [8];
69 short as_short[4];
70 int as_int[2];
71 long long as_long_long;
72 float as_float[2];
73 double as_double;
74 } __m64_union;
75
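/* Illustrative sketch (not part of the API): the two implementation
   styles this header uses, as described in the comment at the top of
   the file.  Scalar lanes of an __m64 can be read and written through
   __m64_union, while the wider _pi16/_pi8 operations splat the value
   into a 128-bit Altivec register, operate there, and extract the low
   doubleword again.  The function name below is hypothetical:

     static __inline __m64
     __example_add_pi16_via_vector (__m64 __m1, __m64 __m2)
     {
       __vector signed short a, b, c;
       a = (__vector signed short) vec_splats (__m1);   // GPR -> VR splat
       b = (__vector signed short) vec_splats (__m2);
       c = vec_add (a, b);                              // eight 16-bit adds
       return (__builtin_unpack_vector_int128 ((__vector __int128_t) c, 0));
     }

   This mirrors the _ARCH_PWR8 paths used for _mm_add_pi16 and friends
   below.  */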
76 /* Empty the multimedia state. */
77 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _mm_empty (void)
79 {
80 /* nothing to do on PowerPC. */
81 }
82
83 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
84 _m_empty (void)
85 {
86 /* nothing to do on PowerPC. */
87 }
88
89 /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
90 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91 _mm_cvtsi32_si64 (int __i)
92 {
93 return (__m64) (unsigned int) __i;
94 }
95
96 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _m_from_int (int __i)
98 {
99 return _mm_cvtsi32_si64 (__i);
100 }
101
102 /* Convert the lower 32 bits of the __m64 object into an integer. */
103 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104 _mm_cvtsi64_si32 (__m64 __i)
105 {
106 return ((int) __i);
107 }
108
109 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _m_to_int (__m64 __i)
111 {
112 return _mm_cvtsi64_si32 (__i);
113 }
114
115 #ifdef __powerpc64__
116 /* Convert I to a __m64 object. */
117
118 /* Intel intrinsic. */
119 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _m_from_int64 (long long __i)
121 {
122 return (__m64) __i;
123 }
124
125 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126 _mm_cvtsi64_m64 (long long __i)
127 {
128 return (__m64) __i;
129 }
130
131 /* Microsoft intrinsic. */
132 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133 _mm_cvtsi64x_si64 (long long __i)
134 {
135 return (__m64) __i;
136 }
137
138 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_set_pi64x (long long __i)
140 {
141 return (__m64) __i;
142 }
143
144 /* Convert the __m64 object to a 64-bit integer. */
145
146 /* Intel intrinsic. */
147 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _m_to_int64 (__m64 __i)
149 {
150 return (long long)__i;
151 }
152
153 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154 _mm_cvtm64_si64 (__m64 __i)
155 {
156 return (long long) __i;
157 }
158
159 /* Microsoft intrinsic. */
160 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_cvtsi64_si64x (__m64 __i)
162 {
163 return (long long) __i;
164 }
165
166 #ifdef _ARCH_PWR8
167 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
168 the result, and the four 16-bit values from M2 into the upper four 8-bit
169 values of the result, all with signed saturation. */
170 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
171 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
172 {
173 __vector signed short vm1;
174 __vector signed char vresult;
175
176 vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
177 vresult = vec_vpkshss (vm1, vm1);
178 return (__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0);
179 }
180
181 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182 _m_packsswb (__m64 __m1, __m64 __m2)
183 {
184 return _mm_packs_pi16 (__m1, __m2);
185 }
186
187 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
188 the result, and the two 32-bit values from M2 into the upper two 16-bit
189 values of the result, all with signed saturation. */
190 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
192 {
193 __vector signed int vm1;
194 __vector signed short vresult;
195
196 vm1 = (__vector signed int)__builtin_pack_vector_int128 (__m2, __m1);
197 vresult = vec_vpkswss (vm1, vm1);
198 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
199 }
200
201 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _m_packssdw (__m64 __m1, __m64 __m2)
203 {
204 return _mm_packs_pi32 (__m1, __m2);
205 }
206
207 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
208 the result, and the four 16-bit values from M2 into the upper four 8-bit
209 values of the result, all with unsigned saturation. */
210 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
211 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
212 {
213 __vector signed short vm1;
214 __vector unsigned char vresult;
215
216 vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
217 vresult = vec_vpkshus (vm1, vm1);
218 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
219 }
220
221 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222 _m_packuswb (__m64 __m1, __m64 __m2)
223 {
224 return _mm_packs_pu16 (__m1, __m2);
225 }
226 #endif /* end ARCH_PWR8 */
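/* A worked example of the saturating pack semantics above; lane values
   are written least-significant first and the numbers are illustrative:

     __m64 lo = _mm_set_pi16 (3, 2, 1, 300);      // lanes {300, 1, 2, 3}
     __m64 hi = _mm_set_pi16 (-300, 6, 5, 4);     // lanes {4, 5, 6, -300}
     __m64 p  = _mm_packs_pi16 (lo, hi);
     // p's bytes, least-significant first: {127, 1, 2, 3, 4, 5, 6, -128}

   Each 16-bit element is clamped to the signed 8-bit range (or to the
   unsigned range 0..255 for _mm_packs_pu16) before being stored.  */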
227
228 /* Interleave the four 8-bit values from the high half of M1 with the four
229 8-bit values from the high half of M2. */
230 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
231 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
232 {
233 #if _ARCH_PWR8
234 __vector unsigned char a, b, c;
235
236 a = (__vector unsigned char)vec_splats (__m1);
237 b = (__vector unsigned char)vec_splats (__m2);
238 c = vec_mergel (a, b);
239 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
240 #else
241 __m64_union m1, m2, res;
242
243 m1.as_m64 = __m1;
244 m2.as_m64 = __m2;
245
246 res.as_char[0] = m1.as_char[4];
247 res.as_char[1] = m2.as_char[4];
248 res.as_char[2] = m1.as_char[5];
249 res.as_char[3] = m2.as_char[5];
250 res.as_char[4] = m1.as_char[6];
251 res.as_char[5] = m2.as_char[6];
252 res.as_char[6] = m1.as_char[7];
253 res.as_char[7] = m2.as_char[7];
254
255 return (__m64) res.as_m64;
256 #endif
257 }
258
259 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
260 _m_punpckhbw (__m64 __m1, __m64 __m2)
261 {
262 return _mm_unpackhi_pi8 (__m1, __m2);
263 }
264
265 /* Interleave the two 16-bit values from the high half of M1 with the two
266 16-bit values from the high half of M2. */
267 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
269 {
270 __m64_union m1, m2, res;
271
272 m1.as_m64 = __m1;
273 m2.as_m64 = __m2;
274
275 res.as_short[0] = m1.as_short[2];
276 res.as_short[1] = m2.as_short[2];
277 res.as_short[2] = m1.as_short[3];
278 res.as_short[3] = m2.as_short[3];
279
280 return (__m64) res.as_m64;
281 }
282
283 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284 _m_punpckhwd (__m64 __m1, __m64 __m2)
285 {
286 return _mm_unpackhi_pi16 (__m1, __m2);
287 }
288 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
289 value from the high half of M2. */
290 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
292 {
293 __m64_union m1, m2, res;
294
295 m1.as_m64 = __m1;
296 m2.as_m64 = __m2;
297
298 res.as_int[0] = m1.as_int[1];
299 res.as_int[1] = m2.as_int[1];
300
301 return (__m64) res.as_m64;
302 }
303
304 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
305 _m_punpckhdq (__m64 __m1, __m64 __m2)
306 {
307 return _mm_unpackhi_pi32 (__m1, __m2);
308 }
309 /* Interleave the four 8-bit values from the low half of M1 with the four
310 8-bit values from the low half of M2. */
311 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
313 {
314 #if _ARCH_PWR8
315 __vector unsigned char a, b, c;
316
317 a = (__vector unsigned char)vec_splats (__m1);
318 b = (__vector unsigned char)vec_splats (__m2);
319 c = vec_mergel (a, b);
320 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 1));
321 #else
322 __m64_union m1, m2, res;
323
324 m1.as_m64 = __m1;
325 m2.as_m64 = __m2;
326
327 res.as_char[0] = m1.as_char[0];
328 res.as_char[1] = m2.as_char[0];
329 res.as_char[2] = m1.as_char[1];
330 res.as_char[3] = m2.as_char[1];
331 res.as_char[4] = m1.as_char[2];
332 res.as_char[5] = m2.as_char[2];
333 res.as_char[6] = m1.as_char[3];
334 res.as_char[7] = m2.as_char[3];
335
336 return (__m64) res.as_m64;
337 #endif
338 }
339
340 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _m_punpcklbw (__m64 __m1, __m64 __m2)
342 {
343 return _mm_unpacklo_pi8 (__m1, __m2);
344 }
345 /* Interleave the two 16-bit values from the low half of M1 with the two
346 16-bit values from the low half of M2. */
347 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
348 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
349 {
350 __m64_union m1, m2, res;
351
352 m1.as_m64 = __m1;
353 m2.as_m64 = __m2;
354
355 res.as_short[0] = m1.as_short[0];
356 res.as_short[1] = m2.as_short[0];
357 res.as_short[2] = m1.as_short[1];
358 res.as_short[3] = m2.as_short[1];
359
360 return (__m64) res.as_m64;
361 }
362
363 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
364 _m_punpcklwd (__m64 __m1, __m64 __m2)
365 {
366 return _mm_unpacklo_pi16 (__m1, __m2);
367 }
368
369 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
370 value from the low half of M2. */
371 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
372 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
373 {
374 __m64_union m1, m2, res;
375
376 m1.as_m64 = __m1;
377 m2.as_m64 = __m2;
378
379 res.as_int[0] = m1.as_int[0];
380 res.as_int[1] = m2.as_int[0];
381
382 return (__m64) res.as_m64;
383 }
384
385 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 _m_punpckldq (__m64 __m1, __m64 __m2)
387 {
388 return _mm_unpacklo_pi32 (__m1, __m2);
389 }
390
391 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
392 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
393 _mm_add_pi8 (__m64 __m1, __m64 __m2)
394 {
395 #if _ARCH_PWR8
396 __vector signed char a, b, c;
397
398 a = (__vector signed char)vec_splats (__m1);
399 b = (__vector signed char)vec_splats (__m2);
400 c = vec_add (a, b);
401 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
402 #else
403 __m64_union m1, m2, res;
404
405 m1.as_m64 = __m1;
406 m2.as_m64 = __m2;
407
408 res.as_char[0] = m1.as_char[0] + m2.as_char[0];
409 res.as_char[1] = m1.as_char[1] + m2.as_char[1];
410 res.as_char[2] = m1.as_char[2] + m2.as_char[2];
411 res.as_char[3] = m1.as_char[3] + m2.as_char[3];
412 res.as_char[4] = m1.as_char[4] + m2.as_char[4];
413 res.as_char[5] = m1.as_char[5] + m2.as_char[5];
414 res.as_char[6] = m1.as_char[6] + m2.as_char[6];
415 res.as_char[7] = m1.as_char[7] + m2.as_char[7];
416
417 return (__m64) res.as_m64;
418 #endif
419 }
420
421 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 _m_paddb (__m64 __m1, __m64 __m2)
423 {
424 return _mm_add_pi8 (__m1, __m2);
425 }
426
427 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
428 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
429 _mm_add_pi16 (__m64 __m1, __m64 __m2)
430 {
431 #if _ARCH_PWR8
432 __vector signed short a, b, c;
433
434 a = (__vector signed short)vec_splats (__m1);
435 b = (__vector signed short)vec_splats (__m2);
436 c = vec_add (a, b);
437 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
438 #else
439 __m64_union m1, m2, res;
440
441 m1.as_m64 = __m1;
442 m2.as_m64 = __m2;
443
444 res.as_short[0] = m1.as_short[0] + m2.as_short[0];
445 res.as_short[1] = m1.as_short[1] + m2.as_short[1];
446 res.as_short[2] = m1.as_short[2] + m2.as_short[2];
447 res.as_short[3] = m1.as_short[3] + m2.as_short[3];
448
449 return (__m64) res.as_m64;
450 #endif
451 }
452
453 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
454 _m_paddw (__m64 __m1, __m64 __m2)
455 {
456 return _mm_add_pi16 (__m1, __m2);
457 }
458
459 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
460 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
461 _mm_add_pi32 (__m64 __m1, __m64 __m2)
462 {
463 #if _ARCH_PWR9
464 __vector signed int a, b, c;
465
466 a = (__vector signed int)vec_splats (__m1);
467 b = (__vector signed int)vec_splats (__m2);
468 c = vec_add (a, b);
469 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
470 #else
471 __m64_union m1, m2, res;
472
473 m1.as_m64 = __m1;
474 m2.as_m64 = __m2;
475
476 res.as_int[0] = m1.as_int[0] + m2.as_int[0];
477 res.as_int[1] = m1.as_int[1] + m2.as_int[1];
478
479 return (__m64) res.as_m64;
480 #endif
481 }
482
483 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
484 _m_paddd (__m64 __m1, __m64 __m2)
485 {
486 return _mm_add_pi32 (__m1, __m2);
487 }
488
489 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
490 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
491 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
492 {
493 #if _ARCH_PWR8
494 __vector signed char a, b, c;
495
496 a = (__vector signed char)vec_splats (__m1);
497 b = (__vector signed char)vec_splats (__m2);
498 c = vec_sub (a, b);
499 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
500 #else
501 __m64_union m1, m2, res;
502
503 m1.as_m64 = __m1;
504 m2.as_m64 = __m2;
505
506 res.as_char[0] = m1.as_char[0] - m2.as_char[0];
507 res.as_char[1] = m1.as_char[1] - m2.as_char[1];
508 res.as_char[2] = m1.as_char[2] - m2.as_char[2];
509 res.as_char[3] = m1.as_char[3] - m2.as_char[3];
510 res.as_char[4] = m1.as_char[4] - m2.as_char[4];
511 res.as_char[5] = m1.as_char[5] - m2.as_char[5];
512 res.as_char[6] = m1.as_char[6] - m2.as_char[6];
513 res.as_char[7] = m1.as_char[7] - m2.as_char[7];
514
515 return (__m64) res.as_m64;
516 #endif
517 }
518
519 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
520 _m_psubb (__m64 __m1, __m64 __m2)
521 {
522 return _mm_sub_pi8 (__m1, __m2);
523 }
524
525 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
526 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
528 {
529 #if _ARCH_PWR8
530 __vector signed short a, b, c;
531
532 a = (__vector signed short)vec_splats (__m1);
533 b = (__vector signed short)vec_splats (__m2);
534 c = vec_sub (a, b);
535 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
536 #else
537 __m64_union m1, m2, res;
538
539 m1.as_m64 = __m1;
540 m2.as_m64 = __m2;
541
542 res.as_short[0] = m1.as_short[0] - m2.as_short[0];
543 res.as_short[1] = m1.as_short[1] - m2.as_short[1];
544 res.as_short[2] = m1.as_short[2] - m2.as_short[2];
545 res.as_short[3] = m1.as_short[3] - m2.as_short[3];
546
547 return (__m64) res.as_m64;
548 #endif
549 }
550
551 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
552 _m_psubw (__m64 __m1, __m64 __m2)
553 {
554 return _mm_sub_pi16 (__m1, __m2);
555 }
556
557 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
558 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
559 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
560 {
561 #if _ARCH_PWR9
562 __vector signed int a, b, c;
563
564 a = (__vector signed int)vec_splats (__m1);
565 b = (__vector signed int)vec_splats (__m2);
566 c = vec_sub (a, b);
567 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
568 #else
569 __m64_union m1, m2, res;
570
571 m1.as_m64 = __m1;
572 m2.as_m64 = __m2;
573
574 res.as_int[0] = m1.as_int[0] - m2.as_int[0];
575 res.as_int[1] = m1.as_int[1] - m2.as_int[1];
576
577 return (__m64) res.as_m64;
578 #endif
579 }
580
581 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
582 _m_psubd (__m64 __m1, __m64 __m2)
583 {
585 return _mm_sub_pi32 (__m1, __m2);
585 }
586
587 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
588 _mm_add_si64 (__m64 __m1, __m64 __m2)
589 {
590 return (__m1 + __m2);
591 }
592
593 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
594 _mm_sub_si64 (__m64 __m1, __m64 __m2)
595 {
596 return (__m1 - __m2);
597 }
598
599 /* Shift the 64-bit value in M left by COUNT. */
600 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
601 _mm_sll_si64 (__m64 __m, __m64 __count)
602 {
603 return (__m << __count);
604 }
605
606 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _m_psllq (__m64 __m, __m64 __count)
608 {
609 return _mm_sll_si64 (__m, __count);
610 }
611
612 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_slli_si64 (__m64 __m, const int __count)
614 {
615 return (__m << __count);
616 }
617
618 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619 _m_psllqi (__m64 __m, const int __count)
620 {
621 return _mm_slli_si64 (__m, __count);
622 }
623
624 /* Shift the 64-bit value in M right by COUNT; shift in zeros. */
625 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
626 _mm_srl_si64 (__m64 __m, __m64 __count)
627 {
628 return (__m >> __count);
629 }
630
631 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
632 _m_psrlq (__m64 __m, __m64 __count)
633 {
634 return _mm_srl_si64 (__m, __count);
635 }
636
637 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
638 _mm_srli_si64 (__m64 __m, const int __count)
639 {
640 return (__m >> __count);
641 }
642
643 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
644 _m_psrlqi (__m64 __m, const int __count)
645 {
646 return _mm_srli_si64 (__m, __count);
647 }
648
649 /* Bit-wise AND the 64-bit values in M1 and M2. */
650 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
651 _mm_and_si64 (__m64 __m1, __m64 __m2)
652 {
653 return (__m1 & __m2);
654 }
655
656 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
657 _m_pand (__m64 __m1, __m64 __m2)
658 {
659 return _mm_and_si64 (__m1, __m2);
660 }
661
662 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
663 64-bit value in M2. */
664 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
665 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
666 {
667 return (~__m1 & __m2);
668 }
669
670 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
671 _m_pandn (__m64 __m1, __m64 __m2)
672 {
673 return _mm_andnot_si64 (__m1, __m2);
674 }
675
676 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
677 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
678 _mm_or_si64 (__m64 __m1, __m64 __m2)
679 {
680 return (__m1 | __m2);
681 }
682
683 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
684 _m_por (__m64 __m1, __m64 __m2)
685 {
686 return _mm_or_si64 (__m1, __m2);
687 }
688
689 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
690 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
691 _mm_xor_si64 (__m64 __m1, __m64 __m2)
692 {
693 return (__m1 ^ __m2);
694 }
695
696 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697 _m_pxor (__m64 __m1, __m64 __m2)
698 {
699 return _mm_xor_si64 (__m1, __m2);
700 }
701
702 /* Creates a 64-bit zero. */
703 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704 _mm_setzero_si64 (void)
705 {
706 return (__m64) 0;
707 }
708
709 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
710 test is true and zero if false. */
711 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
712 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
713 {
714 #ifdef _ARCH_PWR6
715 __m64 res;
716 __asm__(
717 "cmpb %0,%1,%2;\n"
718 : "=r" (res)
719 : "r" (__m1),
720 "r" (__m2)
721 : );
722 return (res);
723 #else
724 __m64_union m1, m2, res;
725
726 m1.as_m64 = __m1;
727 m2.as_m64 = __m2;
728
729 res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
730 res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
731 res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
732 res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
733 res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
734 res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
735 res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
736 res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;
737
738 return (__m64) res.as_m64;
739 #endif
740 }
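/* Note on the _ARCH_PWR6 path above: the cmpb instruction (Power ISA
   2.05) compares its two source registers byte by byte and writes 0xFF
   into each result byte that matched and 0x00 otherwise, which is
   exactly the pcmpeqb semantics, so the eight-lane comparison is a
   single fixed-point instruction with no transfer to the vector unit.  */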
741
742 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743 _m_pcmpeqb (__m64 __m1, __m64 __m2)
744 {
745 return _mm_cmpeq_pi8 (__m1, __m2);
746 }
747
748 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
749 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
750 {
751 #if _ARCH_PWR8
752 __vector signed char a, b, c;
753
754 a = (__vector signed char)vec_splats (__m1);
755 b = (__vector signed char)vec_splats (__m2);
756 c = (__vector signed char)vec_cmpgt (a, b);
757 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
758 #else
759 __m64_union m1, m2, res;
760
761 m1.as_m64 = __m1;
762 m2.as_m64 = __m2;
763
764 res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
765 res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
766 res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
767 res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
768 res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
769 res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
770 res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
771 res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;
772
773 return (__m64) res.as_m64;
774 #endif
775 }
776
777 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778 _m_pcmpgtb (__m64 __m1, __m64 __m2)
779 {
780 return _mm_cmpgt_pi8 (__m1, __m2);
781 }
782
783 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
784 the test is true and zero if false. */
785 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
786 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
787 {
788 #if _ARCH_PWR8
789 __vector signed short a, b, c;
790
791 a = (__vector signed short)vec_splats (__m1);
792 b = (__vector signed short)vec_splats (__m2);
793 c = (__vector signed short)vec_cmpeq (a, b);
794 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
795 #else
796 __m64_union m1, m2, res;
797
798 m1.as_m64 = __m1;
799 m2.as_m64 = __m2;
800
801 res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
802 res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
803 res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
804 res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;
805
806 return (__m64) res.as_m64;
807 #endif
808 }
809
810 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811 _m_pcmpeqw (__m64 __m1, __m64 __m2)
812 {
813 return _mm_cmpeq_pi16 (__m1, __m2);
814 }
815
816 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
818 {
819 #if _ARCH_PWR8
820 __vector signed short a, b, c;
821
822 a = (__vector signed short)vec_splats (__m1);
823 b = (__vector signed short)vec_splats (__m2);
824 c = (__vector signed short)vec_cmpgt (a, b);
825 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
826 #else
827 __m64_union m1, m2, res;
828
829 m1.as_m64 = __m1;
830 m2.as_m64 = __m2;
831
832 res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
833 res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
834 res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
835 res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;
836
837 return (__m64) res.as_m64;
838 #endif
839 }
840
841 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _m_pcmpgtw (__m64 __m1, __m64 __m2)
843 {
844 return _mm_cmpgt_pi16 (__m1, __m2);
845 }
846
847 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
848 the test is true and zero if false. */
849 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
850 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
851 {
852 #if _ARCH_PWR9
853 __vector signed int a, b, c;
854
855 a = (__vector signed int)vec_splats (__m1);
856 b = (__vector signed int)vec_splats (__m2);
857 c = (__vector signed int)vec_cmpeq (a, b);
858 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
859 #else
860 __m64_union m1, m2, res;
861
862 m1.as_m64 = __m1;
863 m2.as_m64 = __m2;
864
865 res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
866 res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;
867
868 return (__m64) res.as_m64;
869 #endif
870 }
871
872 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873 _m_pcmpeqd (__m64 __m1, __m64 __m2)
874 {
875 return _mm_cmpeq_pi32 (__m1, __m2);
876 }
877
878 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
879 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
880 {
881 #if _ARCH_PWR9
882 __vector signed int a, b, c;
883
884 a = (__vector signed int)vec_splats (__m1);
885 b = (__vector signed int)vec_splats (__m2);
886 c = (__vector signed int)vec_cmpgt (a, b);
887 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
888 #else
889 __m64_union m1, m2, res;
890
891 m1.as_m64 = __m1;
892 m2.as_m64 = __m2;
893
894 res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
895 res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;
896
897 return (__m64) res.as_m64;
898 #endif
899 }
900
901 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
902 _m_pcmpgtd (__m64 __m1, __m64 __m2)
903 {
904 return _mm_cmpgt_pi32 (__m1, __m2);
905 }
906
907 #if _ARCH_PWR8
908 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
909 saturated arithmetic. */
910 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
911 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
912 {
913 __vector signed char a, b, c;
914
915 a = (__vector signed char)vec_splats (__m1);
916 b = (__vector signed char)vec_splats (__m2);
917 c = vec_adds (a, b);
918 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
919 }
920
921 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
922 _m_paddsb (__m64 __m1, __m64 __m2)
923 {
924 return _mm_adds_pi8 (__m1, __m2);
925 }
926 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
927 saturated arithmetic. */
928 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
929 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
930 {
931 __vector signed short a, b, c;
932
933 a = (__vector signed short)vec_splats (__m1);
934 b = (__vector signed short)vec_splats (__m2);
935 c = vec_adds (a, b);
936 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
937 }
938
939 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
940 _m_paddsw (__m64 __m1, __m64 __m2)
941 {
942 return _mm_adds_pi16 (__m1, __m2);
943 }
944 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
945 saturated arithmetic. */
946 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
947 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
948 {
949 __vector unsigned char a, b, c;
950
951 a = (__vector unsigned char)vec_splats (__m1);
952 b = (__vector unsigned char)vec_splats (__m2);
953 c = vec_adds (a, b);
954 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
955 }
956
957 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _m_paddusb (__m64 __m1, __m64 __m2)
959 {
960 return _mm_adds_pu8 (__m1, __m2);
961 }
962
963 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
964 saturated arithmetic. */
965 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
967 {
968 __vector unsigned short a, b, c;
969
970 a = (__vector unsigned short)vec_splats (__m1);
971 b = (__vector unsigned short)vec_splats (__m2);
972 c = vec_adds (a, b);
973 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
974 }
975
976 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _m_paddusw (__m64 __m1, __m64 __m2)
978 {
979 return _mm_adds_pu16 (__m1, __m2);
980 }
981
982 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
983 saturating arithmetic. */
984 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
986 {
987 __vector signed char a, b, c;
988
989 a = (__vector signed char)vec_splats (__m1);
990 b = (__vector signed char)vec_splats (__m2);
991 c = vec_subs (a, b);
992 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
993 }
994
995 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996 _m_psubsb (__m64 __m1, __m64 __m2)
997 {
998 return _mm_subs_pi8 (__m1, __m2);
999 }
1000
1001 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1002 signed saturating arithmetic. */
1003 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1004 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
1005 {
1006 __vector signed short a, b, c;
1007
1008 a = (__vector signed short)vec_splats (__m1);
1009 b = (__vector signed short)vec_splats (__m2);
1010 c = vec_subs (a, b);
1011 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
1012 }
1013
1014 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _m_psubsw (__m64 __m1, __m64 __m2)
1016 {
1017 return _mm_subs_pi16 (__m1, __m2);
1018 }
1019
1020 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1021 unsigned saturating arithmetic. */
1022 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1023 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
1024 {
1025 __vector unsigned char a, b, c;
1026
1027 a = (__vector unsigned char)vec_splats (__m1);
1028 b = (__vector unsigned char)vec_splats (__m2);
1029 c = vec_subs (a, b);
1030 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
1031 }
1032
1033 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034 _m_psubusb (__m64 __m1, __m64 __m2)
1035 {
1036 return _mm_subs_pu8 (__m1, __m2);
1037 }
1038
1039 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1040 unsigned saturating arithmetic. */
1041 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
1043 {
1044 __vector unsigned short a, b, c;
1045
1046 a = (__vector unsigned short)vec_splats (__m1);
1047 b = (__vector unsigned short)vec_splats (__m2);
1048 c = vec_subs (a, b);
1049 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
1050 }
1051
1052 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1053 _m_psubusw (__m64 __m1, __m64 __m2)
1054 {
1055 return _mm_subs_pu16 (__m1, __m2);
1056 }
1057
1058 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1059 four 32-bit intermediate results, which are then summed by pairs to
1060 produce two 32-bit results. */
1061 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1062 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
1063 {
1064 __vector signed short a, b;
1065 __vector signed int c;
1066 __vector signed int zero = {0, 0, 0, 0};
1067
1068 a = (__vector signed short)vec_splats (__m1);
1069 b = (__vector signed short)vec_splats (__m2);
1070 c = vec_vmsumshm (a, b, zero);
1071 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
1072 }
1073
1074 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075 _m_pmaddwd (__m64 __m1, __m64 __m2)
1076 {
1077 return _mm_madd_pi16 (__m1, __m2);
1078 }
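/* A small usage sketch of the pairwise multiply-add above; lane values
   are written least-significant first and the numbers are illustrative:

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);     // lanes {1, 2, 3, 4}
     __m64 b = _mm_set_pi16 (8, 7, 6, 5);     // lanes {5, 6, 7, 8}
     __m64 r = _mm_madd_pi16 (a, b);
     // r's two 32-bit lanes: {1*5 + 2*6, 3*7 + 4*8} = {17, 53}
 */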
1079 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1080 M2 and produce the high 16 bits of the 32-bit results. */
1081 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
1083 {
1084 __vector signed short a, b;
1085 __vector signed short c;
1086 __vector signed int w0, w1;
1087 __vector unsigned char xform1 = {
1088 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1089 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1090 };
1091
1092 a = (__vector signed short)vec_splats (__m1);
1093 b = (__vector signed short)vec_splats (__m2);
1094
1095 w0 = vec_vmulesh (a, b);
1096 w1 = vec_vmulosh (a, b);
1097 c = (__vector signed short)vec_perm (w0, w1, xform1);
1098
1099 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
1100 }
1101
1102 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1103 _m_pmulhw (__m64 __m1, __m64 __m2)
1104 {
1105 return _mm_mulhi_pi16 (__m1, __m2);
1106 }
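/* The vector path in _mm_mulhi_pi16 above forms the full 32-bit products
   with the even/odd halfword multiplies and then uses the xform1 permute
   to gather the upper 16 bits of each product back into halfword lane
   order.  A scalar equivalent of the per-lane operation (illustrative
   only) is:

     static __inline short
     __example_mulhi16 (short a, short b)
     {
       return (short) (((int) a * (int) b) >> 16);
     }
 */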
1107
1108 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1109 the low 16 bits of the results. */
1110 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1111 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
1112 {
1113 __vector signed short a, b, c;
1114
1115 a = (__vector signed short)vec_splats (__m1);
1116 b = (__vector signed short)vec_splats (__m2);
1117 c = a * b;
1118 return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
1119 }
1120
1121 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122 _m_pmullw (__m64 __m1, __m64 __m2)
1123 {
1124 return _mm_mullo_pi16 (__m1, __m2);
1125 }
1126
1127 /* Shift four 16-bit values in M left by COUNT. */
1128 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1129 _mm_sll_pi16 (__m64 __m, __m64 __count)
1130 {
1131 __vector signed short m, r;
1132 __vector unsigned short c;
1133
1134 if (__count <= 15)
1135 {
1136 m = (__vector signed short)vec_splats (__m);
1137 c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1138 r = vec_sl (m, (__vector unsigned short)c);
1139 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1140 }
1141 else
1142 return (0);
1143 }
1144
1145 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1146 _m_psllw (__m64 __m, __m64 __count)
1147 {
1148 return _mm_sll_pi16 (__m, __count);
1149 }
1150
1151 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1152 _mm_slli_pi16 (__m64 __m, int __count)
1153 {
1154 /* Promote int to long then invoke mm_sll_pi16. */
1155 return _mm_sll_pi16 (__m, __count);
1156 }
1157
1158 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159 _m_psllwi (__m64 __m, int __count)
1160 {
1161 return _mm_slli_pi16 (__m, __count);
1162 }
1163
1164 /* Shift two 32-bit values in M left by COUNT. */
1165 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1166 _mm_sll_pi32 (__m64 __m, __m64 __count)
1167 {
1168 __m64_union m, res;
1169
1170 m.as_m64 = __m;
1171
1172 res.as_int[0] = m.as_int[0] << __count;
1173 res.as_int[1] = m.as_int[1] << __count;
1174 return (res.as_m64);
1175 }
1176
1177 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1178 _m_pslld (__m64 __m, __m64 __count)
1179 {
1180 return _mm_sll_pi32 (__m, __count);
1181 }
1182
1183 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184 _mm_slli_pi32 (__m64 __m, int __count)
1185 {
1186 /* Promote int to long then invoke mm_sll_pi32. */
1187 return _mm_sll_pi32 (__m, __count);
1188 }
1189
1190 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1191 _m_pslldi (__m64 __m, int __count)
1192 {
1193 return _mm_slli_pi32 (__m, __count);
1194 }
1195
1196 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1197 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1198 _mm_sra_pi16 (__m64 __m, __m64 __count)
1199 {
1200 __vector signed short m, r;
1201 __vector unsigned short c;
1202
1203 if (__count <= 15)
1204 {
1205 m = (__vector signed short)vec_splats (__m);
1206 c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1207 r = vec_sra (m, (__vector unsigned short)c);
1208 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1209 }
1210 else
1211 return (0);
1212 }
1213
1214 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215 _m_psraw (__m64 __m, __m64 __count)
1216 {
1217 return _mm_sra_pi16 (__m, __count);
1218 }
1219
1220 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _mm_srai_pi16 (__m64 __m, int __count)
1222 {
1223 /* Promote int to long then invoke mm_sra_pi16. */
1224 return _mm_sra_pi16 (__m, __count);
1225 }
1226
1227 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228 _m_psrawi (__m64 __m, int __count)
1229 {
1230 return _mm_srai_pi16 (__m, __count);
1231 }
1232
1233 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1234 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235 _mm_sra_pi32 (__m64 __m, __m64 __count)
1236 {
1237 __m64_union m, res;
1238
1239 m.as_m64 = __m;
1240
1241 res.as_int[0] = m.as_int[0] >> __count;
1242 res.as_int[1] = m.as_int[1] >> __count;
1243 return (res.as_m64);
1244 }
1245
1246 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1247 _m_psrad (__m64 __m, __m64 __count)
1248 {
1249 return _mm_sra_pi32 (__m, __count);
1250 }
1251
1252 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _mm_srai_pi32 (__m64 __m, int __count)
1254 {
1255 /* Promote int to long then invoke mm_sra_pi32. */
1256 return _mm_sra_pi32 (__m, __count);
1257 }
1258
1259 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1260 _m_psradi (__m64 __m, int __count)
1261 {
1262 return _mm_srai_pi32 (__m, __count);
1263 }
1264
1265 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1266 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1267 _mm_srl_pi16 (__m64 __m, __m64 __count)
1268 {
1269 __vector unsigned short m, r;
1270 __vector unsigned short c;
1271
1272 if (__count <= 15)
1273 {
1274 m = (__vector unsigned short)vec_splats (__m);
1275 c = (__vector unsigned short)vec_splats ((unsigned short)__count);
1276 r = vec_sr (m, (__vector unsigned short)c);
1277 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1278 }
1279 else
1280 return (0);
1281 }
1282
1283 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1284 _m_psrlw (__m64 __m, __m64 __count)
1285 {
1286 return _mm_srl_pi16 (__m, __count);
1287 }
1288
1289 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 _mm_srli_pi16 (__m64 __m, int __count)
1291 {
1292 /* Promote int to long then invoke mm_srl_pi16. */
1293 return _mm_srl_pi16 (__m, __count);
1294 }
1295
1296 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _m_psrlwi (__m64 __m, int __count)
1298 {
1299 return _mm_srli_pi16 (__m, __count);
1300 }
1301
1302 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1303 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm_srl_pi32 (__m64 __m, __m64 __count)
1305 {
1306 __m64_union m, res;
1307
1308 m.as_m64 = __m;
1309
1310 res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1311 res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1312 return (res.as_m64);
1313 }
1314
1315 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316 _m_psrld (__m64 __m, __m64 __count)
1317 {
1318 return _mm_srl_pi32 (__m, __count);
1319 }
1320
1321 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322 _mm_srli_pi32 (__m64 __m, int __count)
1323 {
1324 /* Promote int to long then invoke mm_srl_pi32. */
1325 return _mm_srl_pi32 (__m, __count);
1326 }
1327
1328 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _m_psrldi (__m64 __m, int __count)
1330 {
1331 return _mm_srli_pi32 (__m, __count);
1332 }
1333 #endif /* _ARCH_PWR8 */
1334
1335 /* Creates a vector of two 32-bit values; I0 is least significant. */
1336 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337 _mm_set_pi32 (int __i1, int __i0)
1338 {
1339 __m64_union res;
1340
1341 res.as_int[0] = __i0;
1342 res.as_int[1] = __i1;
1343 return (res.as_m64);
1344 }
1345
1346 /* Creates a vector of four 16-bit values; W0 is least significant. */
1347 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1348 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
1349 {
1350 __m64_union res;
1351
1352 res.as_short[0] = __w0;
1353 res.as_short[1] = __w1;
1354 res.as_short[2] = __w2;
1355 res.as_short[3] = __w3;
1356 return (res.as_m64);
1357 }
1358
1359 /* Creates a vector of eight 8-bit values; B0 is least significant. */
1360 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1361 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
1362 char __b3, char __b2, char __b1, char __b0)
1363 {
1364 __m64_union res;
1365
1366 res.as_char[0] = __b0;
1367 res.as_char[1] = __b1;
1368 res.as_char[2] = __b2;
1369 res.as_char[3] = __b3;
1370 res.as_char[4] = __b4;
1371 res.as_char[5] = __b5;
1372 res.as_char[6] = __b6;
1373 res.as_char[7] = __b7;
1374 return (res.as_m64);
1375 }
1376
1377 /* Similar, but with the arguments in reverse order. */
1378 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _mm_setr_pi32 (int __i0, int __i1)
1380 {
1381 __m64_union res;
1382
1383 res.as_int[0] = __i0;
1384 res.as_int[1] = __i1;
1385 return (res.as_m64);
1386 }
1387
1388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1389 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
1390 {
1391 return _mm_set_pi16 (__w3, __w2, __w1, __w0);
1392 }
1393
1394 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1395 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
1396 char __b4, char __b5, char __b6, char __b7)
1397 {
1398 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1399 }
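/* Usage note: the _mm_set_* forms take their arguments most-significant
   element first, while the _mm_setr_* forms take them least-significant
   first, so the two calls below build the same __m64 value:

     __m64 x = _mm_set_pi16  (4, 3, 2, 1);
     __m64 y = _mm_setr_pi16 (1, 2, 3, 4);
 */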
1400
1401 /* Creates a vector of two 32-bit values, both elements containing I. */
1402 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403 _mm_set1_pi32 (int __i)
1404 {
1405 __m64_union res;
1406
1407 res.as_int[0] = __i;
1408 res.as_int[1] = __i;
1409 return (res.as_m64);
1410 }
1411
1412 /* Creates a vector of four 16-bit values, all elements containing W. */
1413 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1414 _mm_set1_pi16 (short __w)
1415 {
1416 #if _ARCH_PWR9
1417 __vector signed short w;
1418
1419 w = (__vector signed short)vec_splats (__w);
1420 return (__builtin_unpack_vector_int128 ((__vector __int128_t)w, 0));
1421 #else
1422 __m64_union res;
1423
1424 res.as_short[0] = __w;
1425 res.as_short[1] = __w;
1426 res.as_short[2] = __w;
1427 res.as_short[3] = __w;
1428 return (res.as_m64);
1429 #endif
1430 }
1431
1432 /* Creates a vector of eight 8-bit values, all elements containing B. */
1433 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1434 _mm_set1_pi8 (signed char __b)
1435 {
1436 #if _ARCH_PWR8
1437 __vector signed char b;
1438
1439 b = (__vector signed char)vec_splats (__b);
1440 return (__builtin_unpack_vector_int128 ((__vector __int128_t)b, 0));
1441 #else
1442 __m64_union res;
1443
1444 res.as_char[0] = __b;
1445 res.as_char[1] = __b;
1446 res.as_char[2] = __b;
1447 res.as_char[3] = __b;
1448 res.as_char[4] = __b;
1449 res.as_char[5] = __b;
1450 res.as_char[6] = __b;
1451 res.as_char[7] = __b;
1452 return (res.as_m64);
1453 #endif
1454 }
1455 #endif /* __powerpc64__ */
1456 #endif /* _MMINTRIN_H_INCLUDED */