/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
   <http://www.gnu.org/licenses/>. */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type. Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode. This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations. For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type. This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations. We recommend this for new
   applications. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
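
/* Illustrative usage note (not part of this header): a port might
   acknowledge the warning above and target power8 or newer so the
   direct register moves described above are available, e.g.:

     gcc -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS demo.c

   The file name and the -O2/-mcpu choices here are example
   assumptions. */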

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
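
/* Illustrative example (documentation only, not used by this header):
   the union gives lane-wise access to an __m64 without breaking
   aliasing rules.  On a little-endian target as_short[0] is the least
   significant halfword:

     __m64_union u;
     u.as_m64 = _mm_set_pi16 (4, 3, 2, 1);
     short lane0 = u.as_short[0];   // 1 on powerpc64le  */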

/* Empty the multimedia state. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC. */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}
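
/* Worked example (illustrative only): lanes outside [-128, 127] clamp,
   and _mm_set_pi16 lists the most significant lane first:

     __m64 r = _mm_packs_pi16 (_mm_set_pi16 (300, -200, 42, -7),
                               _mm_setzero_si64 ());
     // byte lanes 0..3 of r: -7, 42, -128, 127; lanes 4..7: 0  */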

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (vm1, __zero);
  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
  __vector __bool char packsel = vec_pack (__select, __select);
  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
  return (__m64) ((__vector long long) r)[0];
}
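
/* Worked example (illustrative only): unsigned saturation clamps the
   signed 16-bit inputs to [0, 255], so negative lanes become 0:

     __m64 r = _mm_packs_pu16 (_mm_set_pi16 (-5, 300, 128, 7),
                               _mm_setzero_si64 ());
     // byte lanes 0..3 of r: 7, 128, 255, 0; lanes 4..7: 0  */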

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}
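
/* Worked example (illustrative only): the upper four byte lanes of the
   two operands interleave, M1 lanes first:

     __m64 a = _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0);
     __m64 b = _mm_set_pi8 (17, 16, 15, 14, 13, 12, 11, 10);
     __m64 r = _mm_unpackhi_pi8 (a, b);
     // byte lanes of r: 4, 14, 5, 15, 6, 16, 7, 17  */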

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}
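
/* Worked example (illustrative only): lanes wrap modulo 256; use the
   saturating _mm_adds_pi8/_mm_adds_pu8 below when clamping is wanted:

     __m64 r = _mm_add_pi8 (_mm_set1_pi8 (100), _mm_set1_pi8 (100));
     // every signed byte lane of r is -56 (100 + 100 wraps)  */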

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}
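
/* Worked example (illustrative only): on POWER6 and later 64-bit
   targets the cmpb path above produces the whole byte mask with a
   single instruction:

     __m64 r = _mm_cmpeq_pi8 (_mm_set1_pi8 (3),
                              _mm_set_pi8 (3, 0, 3, 0, 3, 0, 3, 0));
     // byte lanes of r alternate 0x00, 0xFF, starting with 0x00  */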

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}
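
/* Worked example (illustrative only): products are summed in pairs,
   lanes (0,1) into result int 0 and lanes (2,3) into result int 1:

     __m64 r = _mm_madd_pi16 (_mm_set_pi16 (4, 3, 2, 1),
                              _mm_set_pi16 (8, 7, 6, 5));
     // int lane 0: 1*5 + 2*6 = 17; int lane 1: 3*7 + 4*8 = 53  */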

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}
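
/* Worked example (illustrative only): each lane keeps bits 16..31 of
   the signed 32-bit product:

     __m64 r = _mm_mulhi_pi16 (_mm_set1_pi16 (0x4000),
                               _mm_set1_pi16 (0x4000));
     // every lane of r: (0x4000 * 0x4000) >> 16 == 0x1000  */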

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}
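
/* Usage note (illustrative): the count operand is itself an __m64;
   counts greater than 15 clear every lane, as in MMX:

     __m64 r = _mm_sll_pi16 (_mm_set1_pi16 (3), (__m64) 2);
     // every lane of r is 12  */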

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sll_pi16. */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sll_pi32. */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sra_pi16. */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sra_pi32. */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_srl_pi16. */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_srl_pi32. */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}
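
/* Worked example (illustrative only): arguments are listed most
   significant lane first, so B0 lands in byte lane 0; _mm_setr_pi8
   below takes the same lanes in the reverse order:

     __m64 a = _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0);
     // on a little-endian target a == 0x0706050403020100  */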

/* Similar, but with the arguments in reverse order. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */