/* gcc/config/rs6000/tmmintrin.h, gcc-9.2.0 import
   (CbC_gcc repository, changeset 145:1830386684a0, anatofuz, 2020-02-13).  */

/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

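/* Absolute value of each signed element; each of these maps directly
   onto a single AltiVec vec_abs.  */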
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

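/* 64-bit (__m64) variants: the argument is replicated into both halves
   of a 128-bit vector, vec_abs is applied, and doubleword element 0 is
   returned.  */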
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

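/* Concatenate __A:__B as a 32-byte value, shift right by __count bytes,
   and return the low 16 bytes.  A compile-time-constant __count below 16
   compiles to a single vec_sld (with vec_reve fixups on little-endian);
   other counts fall back to octet shifts (vec_slo/vec_sro) combined with
   vec_or.  */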
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu zero = { 0 };
          return (__m128i) zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

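/* 64-bit variant: shift the 16-byte concatenation __A:__B right by
   __count bytes and return the low doubleword; counts of 16 or more
   produce zero.  */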
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

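/* Horizontal add: vec_perm gathers the even- and odd-position elements
   of the concatenated inputs into two vectors, which are then added
   elementwise.  */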
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

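/* 64-bit variants: both arguments are packed into one 128-bit vector,
   permuted into even/odd element vectors, added, and doubleword
   element 1 of the result returned.  */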
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

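/* Horizontal add of halfwords with signed saturation: vec_sum4s adds
   adjacent halfword pairs into word sums, and vec_packs repacks them
   to halfwords with saturation.  */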
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

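/* 64-bit variant of the saturating horizontal add.  */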
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

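/* Horizontal subtract: even-position elements minus odd-position
   elements, gathered with vec_perm and subtracted elementwise.  */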
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

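/* 64-bit variants of the horizontal subtract.  */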
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

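/* Horizontal subtract of halfwords with signed saturation (vec_subs).  */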
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

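/* 64-bit variant of the saturating horizontal subtract.  */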
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

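/* Byte shuffle.  vec_perm with __A as both inputs reproduces the x86
   index-mod-16 selection; control bytes with the sign bit set are then
   zeroed through the vec_cmplt/vec_sel mask.  */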
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

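/* 64-bit variant of the byte shuffle.  */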
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

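/* Negate, keep, or zero each element of __A according to the sign of
   the corresponding element of __B: the two comparison masks combine
   into -1/0/+1 multipliers for vec_mul.  */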
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

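/* 64-bit variants, implemented by widening to 128 bits and reusing the
   _mm_sign_epiN path above.  */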
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

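/* Multiply unsigned bytes of __A by the corresponding signed bytes of
   __B and add adjacent product pairs with signed saturation.  The bytes
   are sign-extended to halfwords by vec_unpackh/vec_unpackl; masking
   with 0x00ff restores the unsigned interpretation of __A.  */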
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

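/* 64-bit variant of the multiply-and-add of unsigned by signed bytes.  */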
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

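/* Rounded scaled multiply of signed halfwords: products are computed
   in 32 bits, then reduced as (((__A * __B) >> 14) + 1) >> 1 and packed
   back to halfwords.  */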
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

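/* 64-bit variant of the rounded scaled multiply.  */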
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif