comparison gcc/config/arm/mmintrin.h @ 0:a06113de4d67
first commit

author    kent <kent@cr.ie.u-ryukyu.ac.jp>
date      Fri, 17 Jul 2009 14:47:48 +0900
parents
children  04ced10e8804
/* Copyright (C) 2002, 2003, 2004, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
   License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
   <http://www.gnu.org/licenses/>. */

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

/* The data type intended for user use. */
typedef unsigned long long __m64, __int64;

/* Internal data types for implementing the intrinsics. */
typedef int __v2si __attribute__ ((vector_size (8)));
typedef short __v4hi __attribute__ ((vector_size (8)));
typedef char __v8qi __attribute__ ((vector_size (8)));

/* "Convert" __m64 and __int64 into each other. */
static __inline __m64
_mm_cvtsi64_m64 (__int64 __i)
{
  return __i;
}

static __inline __int64
_mm_cvtm64_si64 (__m64 __i)
{
  return __i;
}

static __inline int
_mm_cvtsi64_si32 (__int64 __i)
{
  return __i;
}

static __inline __int64
_mm_cvtsi32_si64 (int __i)
{
  return __i;
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
static __inline __m64
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
static __inline __m64
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2);
}

/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
   the 64-bit value from M2 into the upper 32-bits of the result, all with
   signed saturation for values that do not fit exactly into 32-bits. */
static __inline __m64
_mm_packs_pi64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
static __inline __m64
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with unsigned saturation. */
static __inline __m64
_mm_packs_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2);
}

/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
   the 64-bit value from M2 into the upper 32-bits of the result, all with
   unsigned saturation for values that do not fit exactly into 32-bits. */
static __inline __m64
_mm_packs_pu64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2);
}

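/* Illustrative sketch (a hypothetical helper, not one of the intrinsics
   above): narrowing four signed 16-bit lanes from each of two vectors into
   eight saturated 8-bit lanes, e.g. 300 clamps to 127 and -300 clamps to
   -128.  Assumes an iWMMXt-enabled ARM target. */
static __inline __m64
__example_narrow_s16_to_s8 (__m64 __lo, __m64 __hi)
{
  /* Lanes of __lo land in the low four bytes of the result, lanes of
     __hi in the high four bytes. */
  return _mm_packs_pi16 (__lo, __hi);
}
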
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
static __inline __m64
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
static __inline __m64
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
static __inline __m64
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
static __inline __m64
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
static __inline __m64
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
static __inline __m64
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2);
}

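/* Illustrative sketch (a hypothetical helper): widening the low four
   unsigned bytes of a pixel vector to 16-bit lanes by interleaving with a
   zero vector, a common first step before doing 16-bit arithmetic.  Assumes
   the usual little-endian lane layout, so the zero operand supplies the
   high byte of every 16-bit lane; _mm_unpackel_pu8 further down performs
   the same widening in a single step. */
static __inline __m64
__example_widen_low_u8_to_u16 (__m64 __pixels, __m64 __zero)
{
  return _mm_unpacklo_pi8 (__pixels, __zero);
}
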
/* Take the four 8-bit values from the low half of M1, sign extend them,
   and return the result as a vector of four 16-bit quantities. */
static __inline __m64
_mm_unpackel_pi8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1);
}

/* Take the two 16-bit values from the low half of M1, sign extend them,
   and return the result as a vector of two 32-bit quantities. */
static __inline __m64
_mm_unpackel_pi16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1);
}

/* Take the 32-bit value from the low half of M1, and return it sign extended
   to 64 bits. */
static __inline __m64
_mm_unpackel_pi32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1);
}

/* Take the four 8-bit values from the high half of M1, sign extend them,
   and return the result as a vector of four 16-bit quantities. */
static __inline __m64
_mm_unpackeh_pi8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1);
}

/* Take the two 16-bit values from the high half of M1, sign extend them,
   and return the result as a vector of two 32-bit quantities. */
static __inline __m64
_mm_unpackeh_pi16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1);
}

/* Take the 32-bit value from the high half of M1, and return it sign extended
   to 64 bits. */
static __inline __m64
_mm_unpackeh_pi32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1);
}

/* Take the four 8-bit values from the low half of M1, zero extend them,
   and return the result as a vector of four 16-bit quantities. */
static __inline __m64
_mm_unpackel_pu8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1);
}

/* Take the two 16-bit values from the low half of M1, zero extend them,
   and return the result as a vector of two 32-bit quantities. */
static __inline __m64
_mm_unpackel_pu16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1);
}

/* Take the 32-bit value from the low half of M1, and return it zero extended
   to 64 bits. */
static __inline __m64
_mm_unpackel_pu32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1);
}

/* Take the four 8-bit values from the high half of M1, zero extend them,
   and return the result as a vector of four 16-bit quantities. */
static __inline __m64
_mm_unpackeh_pu8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1);
}

/* Take the two 16-bit values from the high half of M1, zero extend them,
   and return the result as a vector of two 32-bit quantities. */
static __inline __m64
_mm_unpackeh_pu16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1);
}

/* Take the 32-bit value from the high half of M1, and return it zero extended
   to 64 bits. */
static __inline __m64
_mm_unpackeh_pu32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
static __inline __m64
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
static __inline __m64
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
static __inline __m64
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
static __inline __m64
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
static __inline __m64
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2 using signed
   saturated arithmetic. */
static __inline __m64
_mm_adds_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
static __inline __m64
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
static __inline __m64
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned
   saturated arithmetic. */
static __inline __m64
_mm_adds_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2);
}

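/* Illustrative sketch (a hypothetical helper): brightening eight unsigned
   8-bit pixels at once.  _mm_adds_pu8 clamps each lane at 255, whereas the
   plain _mm_add_pi8 would wrap around modulo 256. */
static __inline __m64
__example_brighten_u8 (__m64 __pixels, __m64 __delta)
{
  return _mm_adds_pu8 (__pixels, __delta);
}
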
/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
static __inline __m64
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
static __inline __m64
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
static __inline __m64
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
static __inline __m64
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
static __inline __m64
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
   signed saturating arithmetic. */
static __inline __m64
_mm_subs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
static __inline __m64
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
static __inline __m64
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
   unsigned saturating arithmetic. */
static __inline __m64
_mm_subs_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 producing four 32-bit intermediate results, which are then summed by
   pairs to produce two 32-bit results. */
static __inline __m64
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
   in M2 producing four 32-bit intermediate results, which are then summed
   by pairs to produce two 32-bit results. */
static __inline __m64
_mm_madd_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
static __inline __m64
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
   in M2 and produce the high 16 bits of the 32-bit results. */
static __inline __m64
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
static __inline __m64
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2);
}

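/* Illustrative sketch (a hypothetical helper): a four-term dot product.
   _mm_madd_pi16 multiplies corresponding signed 16-bit lanes and sums
   adjacent products, so one further 32-bit add folds the two partial sums
   into a0*b0 + a1*b1 + a2*b2 + a3*b3. */
static __inline __m64
__example_dot4_pi16 (__m64 __a, __m64 __b)
{
  __m64 __pairs = _mm_madd_pi16 (__a, __b);  /* {a0*b0 + a1*b1, a2*b2 + a3*b3} */
  /* Fold the upper 32-bit partial sum onto the lower one; the total dot
     product then sits in the low 32 bits of the result. */
  return _mm_add_pi32 (__pairs, _mm_unpackhi_pi32 (__pairs, __pairs));
}
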
/* Shift four 16-bit values in M left by COUNT. */
static __inline __m64
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count);
}

static __inline __m64
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
static __inline __m64
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count);
}

static __inline __m64
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M left by COUNT. */
static __inline __m64
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wslld (__m, __count);
}

static __inline __m64
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wslldi (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
static __inline __m64
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count);
}

static __inline __m64
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
static __inline __m64
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count);
}

static __inline __m64
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in the sign bit. */
static __inline __m64
_mm_sra_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrad (__m, __count);
}

static __inline __m64
_mm_srai_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsradi (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
static __inline __m64
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count);
}

static __inline __m64
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
static __inline __m64
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count);
}

static __inline __m64
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
static __inline __m64
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrld (__m, __count);
}

static __inline __m64
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrldi (__m, __count);
}

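/* Illustrative sketch (a hypothetical helper): halving four signed 16-bit
   lanes.  The _mm_srai_pi16 form takes an immediate count; _mm_sra_pi16
   takes the count in an __m64, which is useful when the shift amount is
   only known at run time.  An arithmetic right shift by N divides by 2**N
   rounding towards minus infinity rather than towards zero. */
static __inline __m64
__example_halve_pi16 (__m64 __v)
{
  return _mm_srai_pi16 (__v, 1);
}
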
/* Rotate four 16-bit values in M right by COUNT. */
static __inline __m64
_mm_ror_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count);
}

static __inline __m64
_mm_rori_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count);
}

/* Rotate two 32-bit values in M right by COUNT. */
static __inline __m64
_mm_ror_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count);
}

static __inline __m64
_mm_rori_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count);
}

/* Rotate the 64-bit value in M right by COUNT. */
static __inline __m64
_mm_ror_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrord (__m, __count);
}

static __inline __m64
_mm_rori_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrordi (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
static __inline __m64
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wand (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
static __inline __m64
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wandn (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
static __inline __m64
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wor (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
static __inline __m64
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wxor (__m1, __m2);
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
static __inline __m64
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

static __inline __m64
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2);
}

static __inline __m64
_mm_cmpgt_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
static __inline __m64
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2);
}

static __inline __m64
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2);
}

static __inline __m64
_mm_cmpgt_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
static __inline __m64
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2);
}

static __inline __m64
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2);
}

static __inline __m64
_mm_cmpgt_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2);
}

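/* Illustrative sketch (a hypothetical helper): a branch-free per-lane
   maximum of signed 16-bit values built from a compare mask and the
   bit-wise operations above.  Each mask lane is all-ones where __a is
   greater, so (__a AND mask) OR (__b AND NOT mask) picks the larger lane.
   The wmax/wmin intrinsics further down do this in one instruction; this
   only shows how the comparison masks compose. */
static __inline __m64
__example_select_max_pi16 (__m64 __a, __m64 __b)
{
  __m64 __mask = _mm_cmpgt_pi16 (__a, __b);
  return _mm_or_si64 (_mm_and_si64 (__mask, __a),
                      _mm_andnot_si64 (__mask, __b));
}
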
/* Element-wise multiplication of the unsigned 16-bit values in __B and __C;
   the products are summed across all elements and added to __A. */
static __inline __m64
_mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C)
{
  return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C);
}

/* Element-wise multiplication of the signed 16-bit values in __B and __C;
   the products are summed across all elements and added to __A. */
static __inline __m64
_mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C)
{
  return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C);
}

/* Element-wise multiplication of the unsigned 16-bit values in __A and __B;
   the products are summed across all elements. */
static __inline __m64
_mm_macz_pu16 (__m64 __A, __m64 __B)
{
  return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B);
}

/* Element-wise multiplication of the signed 16-bit values in __A and __B;
   the products are summed across all elements. */
static __inline __m64
_mm_macz_pi16 (__m64 __A, __m64 __B)
{
  return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B);
}

/* Accumulate across all unsigned 8-bit values in __A. */
static __inline __m64
_mm_acc_pu8 (__m64 __A)
{
  return __builtin_arm_waccb ((__v8qi)__A);
}

/* Accumulate across all unsigned 16-bit values in __A. */
static __inline __m64
_mm_acc_pu16 (__m64 __A)
{
  return __builtin_arm_wacch ((__v4hi)__A);
}

/* Accumulate across all unsigned 32-bit values in __A. */
static __inline __m64
_mm_acc_pu32 (__m64 __A)
{
  return __builtin_arm_waccw ((__v2si)__A);
}

static __inline __m64
_mm_mia_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmia (__A, __B, __C);
}

static __inline __m64
_mm_miaph_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiaph (__A, __B, __C);
}

static __inline __m64
_mm_miabb_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiabb (__A, __B, __C);
}

static __inline __m64
_mm_miabt_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiabt (__A, __B, __C);
}

static __inline __m64
_mm_miatb_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiatb (__A, __B, __C);
}

static __inline __m64
_mm_miatt_si64 (__m64 __A, int __B, int __C)
{
  return __builtin_arm_tmiatt (__A, __B, __C);
}

/* Extract one of the elements of A and sign extend. The selector N must
   be immediate. */
#define _mm_extract_pi8(A, N) __builtin_arm_textrmsb ((__v8qi)(A), (N))
#define _mm_extract_pi16(A, N) __builtin_arm_textrmsh ((__v4hi)(A), (N))
#define _mm_extract_pi32(A, N) __builtin_arm_textrmsw ((__v2si)(A), (N))

/* Extract one of the elements of A and zero extend. The selector N must
   be immediate. */
#define _mm_extract_pu8(A, N) __builtin_arm_textrmub ((__v8qi)(A), (N))
#define _mm_extract_pu16(A, N) __builtin_arm_textrmuh ((__v4hi)(A), (N))
#define _mm_extract_pu32(A, N) __builtin_arm_textrmuw ((__v2si)(A), (N))

/* Inserts word D into one of the elements of A. The selector N must be
   immediate. */
#define _mm_insert_pi8(A, D, N) \
  ((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N)))
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N)))
#define _mm_insert_pi32(A, D, N) \
  ((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N)))

/* Compute the element-wise maximum of signed 8-bit values. */
static __inline __m64
_mm_max_pi8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise maximum of signed 16-bit values. */
static __inline __m64
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise maximum of signed 32-bit values. */
static __inline __m64
_mm_max_pi32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B);
}

/* Compute the element-wise maximum of unsigned 8-bit values. */
static __inline __m64
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise maximum of unsigned 16-bit values. */
static __inline __m64
_mm_max_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise maximum of unsigned 32-bit values. */
static __inline __m64
_mm_max_pu32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B);
}

/* Compute the element-wise minimum of signed 8-bit values. */
static __inline __m64
_mm_min_pi8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise minimum of signed 16-bit values. */
static __inline __m64
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise minimum of signed 32-bit values. */
static __inline __m64
_mm_min_pi32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B);
}

/* Compute the element-wise minimum of unsigned 8-bit values. */
static __inline __m64
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise minimum of unsigned 16-bit values. */
static __inline __m64
_mm_min_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise minimum of unsigned 32-bit values. */
static __inline __m64
_mm_min_pu32 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B);
}

/* Create an 8-bit mask of the signs of 8-bit values. */
static __inline int
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_arm_tmovmskb ((__v8qi)__A);
}

/* Create an 8-bit mask of the signs of 16-bit values. */
static __inline int
_mm_movemask_pi16 (__m64 __A)
{
  return __builtin_arm_tmovmskh ((__v4hi)__A);
}

/* Create an 8-bit mask of the signs of 32-bit values. */
static __inline int
_mm_movemask_pi32 (__m64 __A)
{
  return __builtin_arm_tmovmskw ((__v2si)__A);
}

/* Return a combination of the four 16-bit values in A. The selector
   must be an immediate. */
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_arm_wshufh ((__v4hi)(A), (N)))

/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
static __inline __m64
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
static __inline __m64
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the averages of the unsigned 8-bit values in A and B. */
static __inline __m64
_mm_avg2_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the averages of the unsigned 16-bit values in A and B. */
static __inline __m64
_mm_avg2_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B. Return the value in the lower 16-bit word; the
   upper words are cleared. */
static __inline __m64
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the sum of the absolute differences of the unsigned 16-bit
   values in A and B. Return the value in the lower 32-bit word; the
   upper words are cleared. */
static __inline __m64
_mm_sad_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B. Return the value in the lower 16-bit word; the
   upper words are cleared. */
static __inline __m64
_mm_sadz_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the sum of the absolute differences of the unsigned 16-bit
   values in A and B. Return the value in the lower 32-bit word; the
   upper words are cleared. */
static __inline __m64
_mm_sadz_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
}

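/* Illustrative sketch (a hypothetical helper): the sum of absolute
   differences is the building block of motion-estimation and other
   block-matching loops; a single call compares eight unsigned 8-bit
   samples of a reference row against a candidate row and, as documented
   above, leaves the scalar sum in the low word of the result. */
static __inline __m64
__example_block_cost (__m64 __ref, __m64 __cur)
{
  return _mm_sad_pu8 (__ref, __cur);
}
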
static __inline __m64
_mm_align_si64 (__m64 __A, __m64 __B, int __C)
{
  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
}

/* Creates a 64-bit zero. */
static __inline __m64
_mm_setzero_si64 (void)
{
  return __builtin_arm_wzero ();
}

/* Set and Get arbitrary iWMMXt Control registers.
   Note only registers 0-3 and 8-11 are currently defined,
   the rest are reserved. */

static __inline void
_mm_setwcx (const int __value, const int __regno)
{
  switch (__regno)
    {
    case 0: __builtin_arm_setwcx (__value, 0); break;
    case 1: __builtin_arm_setwcx (__value, 1); break;
    case 2: __builtin_arm_setwcx (__value, 2); break;
    case 3: __builtin_arm_setwcx (__value, 3); break;
    case 8: __builtin_arm_setwcx (__value, 8); break;
    case 9: __builtin_arm_setwcx (__value, 9); break;
    case 10: __builtin_arm_setwcx (__value, 10); break;
    case 11: __builtin_arm_setwcx (__value, 11); break;
    default: break;
    }
}

static __inline int
_mm_getwcx (const int __regno)
{
  switch (__regno)
    {
    case 0: return __builtin_arm_getwcx (0);
    case 1: return __builtin_arm_getwcx (1);
    case 2: return __builtin_arm_getwcx (2);
    case 3: return __builtin_arm_getwcx (3);
    case 8: return __builtin_arm_getwcx (8);
    case 9: return __builtin_arm_getwcx (9);
    case 10: return __builtin_arm_getwcx (10);
    case 11: return __builtin_arm_getwcx (11);
    default: return 0;
    }
}

/* Creates a vector of two 32-bit values; I0 is least significant. */
static __inline __m64
_mm_set_pi32 (int __i1, int __i0)
{
  union {
    __m64 __q;
    struct {
      unsigned int __i0;
      unsigned int __i1;
    } __s;
  } __u;

  __u.__s.__i0 = __i0;
  __u.__s.__i1 = __i1;

  return __u.__q;
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
static __inline __m64
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
  unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
  return _mm_set_pi32 (__i1, __i0);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
static __inline __m64
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
  unsigned int __i1, __i0;

  __i1 = (unsigned char)__b7;
  __i1 = __i1 << 8 | (unsigned char)__b6;
  __i1 = __i1 << 8 | (unsigned char)__b5;
  __i1 = __i1 << 8 | (unsigned char)__b4;

  __i0 = (unsigned char)__b3;
  __i0 = __i0 << 8 | (unsigned char)__b2;
  __i0 = __i0 << 8 | (unsigned char)__b1;
  __i0 = __i0 << 8 | (unsigned char)__b0;

  return _mm_set_pi32 (__i1, __i0);
}

/* Similar, but with the arguments in reverse order. */
static __inline __m64
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

static __inline __m64
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

static __inline __m64
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
static __inline __m64
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
static __inline __m64
_mm_set1_pi16 (short __w)
{
  unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
  return _mm_set1_pi32 (__i);
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
static __inline __m64
_mm_set1_pi8 (char __b)
{
  unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
  unsigned int __i = __w << 16 | __w;
  return _mm_set1_pi32 (__i);
}

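/* Illustrative sketch (a hypothetical helper): building constants with the
   initialisers above.  _mm_set_pi16 lists lanes most-significant first,
   _mm_setr_pi16 least-significant first, and _mm_set1_pi16 broadcasts one
   value; the two calls below therefore produce the same vector. */
static __inline __m64
__example_make_ramp (void)
{
  __m64 __a = _mm_set_pi16 (3, 2, 1, 0);   /* lanes 3..0 */
  __m64 __b = _mm_setr_pi16 (0, 1, 2, 3);  /* same vector, reversed argument order */
  return _mm_add_pi16 (__a, __b);          /* {6, 4, 2, 0}, most-significant lane first */
}
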
/* Convert an integer to a __m64 object. */
static __inline __m64
_m_from_int (int __a)
{
  return (__m64)__a;
}

#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
#define _m_packusdw _mm_packs_pu32
#define _m_packssqd _mm_packs_pi64
#define _m_packusqd _mm_packs_pu64
#define _mm_packs_si64 _mm_packs_pi64
#define _mm_packs_su64 _mm_packs_pu64
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32
#define _m_punpckehsbw _mm_unpackeh_pi8
#define _m_punpckehswd _mm_unpackeh_pi16
#define _m_punpckehsdq _mm_unpackeh_pi32
#define _m_punpckehubw _mm_unpackeh_pu8
#define _m_punpckehuwd _mm_unpackeh_pu16
#define _m_punpckehudq _mm_unpackeh_pu32
#define _m_punpckelsbw _mm_unpackel_pi8
#define _m_punpckelswd _mm_unpackel_pi16
#define _m_punpckelsdq _mm_unpackel_pi32
#define _m_punpckelubw _mm_unpackel_pu8
#define _m_punpckeluwd _mm_unpackel_pu16
#define _m_punpckeludq _mm_unpackel_pu32
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddsd _mm_adds_pi32
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16
#define _m_paddusd _mm_adds_pu32
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
#define _m_psubuw _mm_subs_pi32
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
#define _m_psubusd _mm_subs_pu32
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmadduwd _mm_madd_pu16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pmullw _mm_mullo_pi16
#define _m_pmacsw _mm_mac_pi16
#define _m_pmacuw _mm_mac_pu16
#define _m_pmacszw _mm_macz_pi16
#define _m_pmacuzw _mm_macz_pu16
#define _m_paccb _mm_acc_pu8
#define _m_paccw _mm_acc_pu16
#define _m_paccd _mm_acc_pu32
#define _m_pmia _mm_mia_si64
#define _m_pmiaph _mm_miaph_si64
#define _m_pmiabb _mm_miabb_si64
#define _m_pmiabt _mm_miabt_si64
#define _m_pmiatb _mm_miatb_si64
#define _m_pmiatt _mm_miatt_si64
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32
#define _m_psraq _mm_sra_si64
#define _m_psraqi _mm_srai_si64
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64
#define _m_prorw _mm_ror_pi16
#define _m_prorwi _mm_rori_pi16
#define _m_prord _mm_ror_pi32
#define _m_prordi _mm_rori_pi32
#define _m_prorq _mm_ror_si64
#define _m_prorqi _mm_rori_si64
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtub _mm_cmpgt_pu8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtuw _mm_cmpgt_pu16
#define _m_pcmpgtd _mm_cmpgt_pi32
#define _m_pcmpgtud _mm_cmpgt_pu32
#define _m_pextrb _mm_extract_pi8
#define _m_pextrw _mm_extract_pi16
#define _m_pextrd _mm_extract_pi32
#define _m_pextrub _mm_extract_pu8
#define _m_pextruw _mm_extract_pu16
#define _m_pextrud _mm_extract_pu32
#define _m_pinsrb _mm_insert_pi8
#define _m_pinsrw _mm_insert_pi16
#define _m_pinsrd _mm_insert_pi32
#define _m_pmaxsb _mm_max_pi8
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxsd _mm_max_pi32
#define _m_pmaxub _mm_max_pu8
#define _m_pmaxuw _mm_max_pu16
#define _m_pmaxud _mm_max_pu32
#define _m_pminsb _mm_min_pi8
#define _m_pminsw _mm_min_pi16
#define _m_pminsd _mm_min_pi32
#define _m_pminub _mm_min_pu8
#define _m_pminuw _mm_min_pu16
#define _m_pminud _mm_min_pu32
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmovmskw _mm_movemask_pi16
#define _m_pmovmskd _mm_movemask_pi32
#define _m_pshufw _mm_shuffle_pi16
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_pavg2b _mm_avg2_pu8
#define _m_pavg2w _mm_avg2_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_psadwd _mm_sad_pu16
#define _m_psadzbw _mm_sadz_pu8
#define _m_psadzwd _mm_sadz_pu16
#define _m_paligniq _mm_align_si64
#define _m_cvt_si2pi _mm_cvtsi64_m64
#define _m_cvt_pi2si _mm_cvtm64_si64

#endif /* _MMINTRIN_H_INCLUDED */