/* Copyright (C) 2002-2017 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#warning "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif
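
/* Illustrative sketch (not part of this API): the rewrite recommended
   above is often a plain C loop that GCC can auto-vectorize for any
   target.  The function below is a hypothetical example, not a
   declaration made by this header:

     static inline void
     add_shorts (short *dst, const short *a, const short *b, int n)
     {
       for (int i = 0; i < n; i++)
         dst[i] = a[i] + b[i];
     }

   Such code replaces sequences of _mm_add_pi16 calls and stays portable
   across compilers and targets.  */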

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
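
/* Illustrative example (not part of this API): __m64_union provides
   lane access to an __m64 scalar without pointer casts, e.g.:

     __m64_union u;
     u.as_m64 = m;              // some __m64 value
     int lo = u.as_int[0];      // least significant 32 bits on LE
     int hi = u.as_int[1];      // most significant 32 bits on LE

   The implementations below use exactly this pattern for the scalar
   (non-vector) fallback paths.  */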

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

#ifdef __powerpc64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshss (vm1, vm1);
  return (__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkswss (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector unsigned char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshus (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
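
/* Worked example (illustrative values): for __m1 holding bytes
   {0,1,2,3,4,5,6,7} and __m2 holding bytes {10,11,12,13,14,15,16,17}
   (element 0 least significant), _mm_unpackhi_pi8 interleaves the high
   halves and returns {4,14,5,15,6,16,7,17}, matching x86 punpckhbw.  */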

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 1));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#ifdef _ARCH_PWR6
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}
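
/* The power6 cmpb instruction compares the two source registers byte by
   byte, writing 0xFF into each result byte where the bytes are equal and
   0x00 where they differ, which is exactly the per-byte mask
   _mm_cmpeq_pi8 needs.  Worked example (illustrative values):

     _mm_cmpeq_pi8 (0x1122334455667788, 0x1100330055007700)
       returns 0xFF00FF00FF00FF00
 */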

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}
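
/* Worked example (illustrative values): with __m1 holding halfwords
   {1, 2, 3, 4} and __m2 holding halfwords {10, 20, 30, 40},
   _mm_madd_pi16 computes {1*10 + 2*20, 3*30 + 4*40} = {50, 250} as two
   32-bit results.  */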

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)w, 0));
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)b, 0));
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* __powerpc64__ */
#endif /* _MMINTRIN_H_INCLUDED */