gcc/config/rs6000/mmintrin.h @ changeset 111:04ced10e8804 (CbC_gcc, gcc 7)
author:   kono
date:     Fri, 27 Oct 2017 22:46:09 +0900
children: 84e7813d76e9
/* Copyright (C) 2002-2017 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#warning "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif
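
/* Illustration (added by the editor, not part of the original header):
   the rewrite recommended above can use GNU C vector extensions, which
   GCC compiles efficiently for any target.  A minimal sketch, with
   hypothetical names:

     typedef signed char v16qi __attribute__ ((__vector_size__ (16)));

     v16qi
     add_bytes (v16qi a, v16qi b)
     {
       return a + b;  // element-wise add on each byte lane
     }
*/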

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
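
/* Illustrative example (added by the editor, not part of the original
   header): the zero-extension above means a negative int does not
   sign-extend into the upper 32 bits:

     _mm_cvtsi32_si64 (-1)                     yields 0x00000000FFFFFFFFULL
     _mm_cvtsi64_si32 (_mm_cvtsi32_si64 (-1))  yields -1 again
*/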

#ifdef __powerpc64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshss (vm1, vm1);
  return (__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
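
/* Worked example (added by the editor, not part of the original
   source): each 16-bit lane is clamped to the signed 8-bit range
   [-128, 127]:

     _mm_packs_pi16 (_mm_set_pi16 (300, -300, 5, -5), _mm_setzero_si64 ())

   packs the low four result bytes, from least significant, to
   -5, 5, -128, 127, and the high four bytes to 0.  */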

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkswss (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector unsigned char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshus (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
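
/* Worked example (added by the editor, not part of the original
   source), following the scalar fallback above and assuming a
   little-endian target where as_char[0] is the least significant byte:

     m1 = 0x0706050403020100, m2 = 0x0F0E0D0C0B0A0908
     _mm_unpackhi_pi8 (m1, m2) interleaves the high halves, giving
     bytes (low to high) 04 0C 05 0D 06 0E 07 0F.  */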

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 1));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}
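
/* Note (added by the editor, not part of the original source): like the
   x86 paddb instruction, these adds wrap modulo 256 per lane, e.g.
   adding 1 to a lane holding 127 yields -128.  Use _mm_adds_pi8 /
   _mm_adds_pu8 below for saturating behavior.  */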

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}
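
/* Usage sketch (added by the editor, not part of the original source):
   andnot combines with and/or to build the classic branchless select,
   (mask & a) | (~mask & b), where mask lanes are all-ones or all-zeros:

     __m64 sel = _mm_or_si64 (_mm_and_si64 (mask, a),
                              _mm_andnot_si64 (mask, b));
*/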

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#ifdef _ARCH_PWR6
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}
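
/* Worked example (added by the editor, not part of the original
   source): the -1/0 lane masks produced here feed the select idiom
   shown after _m_pandn above, e.g. a per-byte signed max:

     __m64 gt  = _mm_cmpgt_pi8 (a, b);
     __m64 max = _mm_or_si64 (_mm_and_si64 (gt, a),
                              _mm_andnot_si64 (gt, b));
*/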

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
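
/* Worked example (added by the editor, not part of the original
   source): with 16-bit lanes a = (1, 2, 3, 4) and b = (10, 20, 30, 40),
   _mm_madd_pi16 (a, b) computes the two 32-bit sums
   1*10 + 2*20 = 50 and 3*30 + 4*40 = 250.  */
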
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}
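
/* Worked example (added by the editor, not part of the original
   source): 0x4000 * 0x4000 = 0x10000000, so _mm_mulhi_pi16 returns
   0x1000 for a lane holding 0x4000 in both operands; the low 16 bits
   (here 0x0000) are what _mm_mullo_pi16 below would return.  */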

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}
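
/* Note (added by the editor, not part of the original source): as in
   the x86 psllw semantics, a shift count greater than 15 zeroes every
   lane, which the guard above implements:

     _mm_sll_pi16 (_mm_set1_pi16 (1), _mm_cvtsi32_si64 (16))  yields 0
*/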

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}
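
/* Worked example (added by the editor, not part of the original
   source): arguments are given most significant first, so on a
   little-endian target

     _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0)

   produces the 64-bit value 0x0706050403020100, with byte 0 (the least
   significant) holding 0.  */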

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)w, 0));
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)b, 0));
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* __powerpc64__ */
#endif /* _MMINTRIN_H_INCLUDED */