111
|
1 /* Copyright (C) 2002-2017 Free Software Foundation, Inc.
|
0
|
2
|
|
3 This file is part of GCC.
|
|
4
|
|
5 GCC is free software; you can redistribute it and/or modify it
|
|
6 under the terms of the GNU General Public License as published
|
|
7 by the Free Software Foundation; either version 3, or (at your
|
|
8 option) any later version.
|
|
9
|
|
10 GCC is distributed in the hope that it will be useful, but WITHOUT
|
|
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
12 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
|
|
13 License for more details.
|
|
14
|
|
15 Under Section 7 of GPL version 3, you are granted additional
|
|
16 permissions described in the GCC Runtime Library Exception, version
|
|
17 3.1, as published by the Free Software Foundation.
|
|
18
|
|
19 You should have received a copy of the GNU General Public License and
|
|
20 a copy of the GCC Runtime Library Exception along with this program;
|
|
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
22 <http://www.gnu.org/licenses/>. */
|
|
23
|
|
24 #ifndef _MMINTRIN_H_INCLUDED
|
|
25 #define _MMINTRIN_H_INCLUDED
|
|
26
|
111
|
27 #ifndef __IWMMXT__
|
|
28 #error mmintrin.h included without enabling WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2)
|
|
29 #endif
|
|
30
|
|
31
|
|
32 #if defined __cplusplus
|
|
33 extern "C" {
|
|
34 /* Intrinsics use C name-mangling. */
|
|
35 #endif /* __cplusplus */
|
|
36
|
0
|
37 /* The data type intended for user use. */
|
|
/* __m64 is the type intended for user code; __int64 is a second name for
   the same unsigned 64-bit integer type.  */
typedef unsigned long long __m64;
typedef unsigned long long __int64;
|
|
39
|
|
40 /* Internal data types for implementing the intrinsics. */
|
|
/* Internal 8-byte vector types for implementing the intrinsics.  The
   attribute is spelled __vector_size__ (reserved form) so that a user
   macro named vector_size cannot break these declarations.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));
typedef short __v4hi __attribute__ ((__vector_size__ (8)));
typedef signed char __v8qi __attribute__ ((__vector_size__ (8)));
|
|
44
|
|
45 /* Provided for source compatibility with MMX. */
|
|
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* Intentionally empty: this function takes no arguments and does no work.  */
}
|
0
|
50
|
|
51 /* "Convert" __m64 and __int64 into each other. */
|
111
|
52 static __inline __m64
|
0
|
53 _mm_cvtsi64_m64 (__int64 __i)
|
|
54 {
|
|
55 return __i;
|
|
56 }
|
|
57
|
|
58 static __inline __int64
|
|
59 _mm_cvtm64_si64 (__m64 __i)
|
|
60 {
|
|
61 return __i;
|
|
62 }
|
|
63
|
|
64 static __inline int
|
|
65 _mm_cvtsi64_si32 (__int64 __i)
|
|
66 {
|
|
67 return __i;
|
|
68 }
|
|
69
|
|
70 static __inline __int64
|
|
71 _mm_cvtsi32_si64 (int __i)
|
|
72 {
|
111
|
73 return (__i & 0xffffffff);
|
0
|
74 }
|
|
75
|
|
76 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
|
|
77 the result, and the four 16-bit values from M2 into the upper four 8-bit
|
|
78 values of the result, all with signed saturation. */
|
|
79 static __inline __m64
|
|
80 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
|
|
81 {
|
|
82 return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2);
|
|
83 }
|
|
84
|
|
85 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
|
|
86 the result, and the two 32-bit values from M2 into the upper two 16-bit
|
|
87 values of the result, all with signed saturation. */
|
|
88 static __inline __m64
|
|
89 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
|
|
90 {
|
|
91 return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2);
|
|
92 }
|
|
93
|
|
94 /* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
|
|
95 the 64-bit value from M2 into the upper 32-bits of the result, all with
|
|
96 signed saturation for values that do not fit exactly into 32-bits. */
|
|
97 static __inline __m64
|
|
98 _mm_packs_pi64 (__m64 __m1, __m64 __m2)
|
|
99 {
|
|
100 return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2);
|
|
101 }
|
|
102
|
|
103 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
|
|
104 the result, and the four 16-bit values from M2 into the upper four 8-bit
|
|
105 values of the result, all with unsigned saturation. */
|
|
106 static __inline __m64
|
|
107 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
|
|
108 {
|
|
109 return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2);
|
|
110 }
|
|
111
|
|
112 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
|
|
113 the result, and the two 32-bit values from M2 into the upper two 16-bit
|
|
114 values of the result, all with unsigned saturation. */
|
|
115 static __inline __m64
|
|
116 _mm_packs_pu32 (__m64 __m1, __m64 __m2)
|
|
117 {
|
|
118 return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2);
|
|
119 }
|
|
120
|
|
121 /* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
|
|
122 the 64-bit value from M2 into the upper 32-bits of the result, all with
|
|
123 unsigned saturation for values that do not fit exactly into 32-bits. */
|
|
124 static __inline __m64
|
|
125 _mm_packs_pu64 (__m64 __m1, __m64 __m2)
|
|
126 {
|
|
127 return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2);
|
|
128 }
|
|
129
|
|
130 /* Interleave the four 8-bit values from the high half of M1 with the four
|
|
131 8-bit values from the high half of M2. */
|
|
132 static __inline __m64
|
|
133 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
|
|
134 {
|
|
135 return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2);
|
|
136 }
|
|
137
|
|
138 /* Interleave the two 16-bit values from the high half of M1 with the two
|
|
139 16-bit values from the high half of M2. */
|
|
140 static __inline __m64
|
|
141 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
|
|
142 {
|
|
143 return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2);
|
|
144 }
|
|
145
|
|
146 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
|
|
147 value from the high half of M2. */
|
|
148 static __inline __m64
|
|
149 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
|
|
150 {
|
|
151 return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2);
|
|
152 }
|
|
153
|
|
154 /* Interleave the four 8-bit values from the low half of M1 with the four
|
|
155 8-bit values from the low half of M2. */
|
|
156 static __inline __m64
|
|
157 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
|
|
158 {
|
|
159 return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2);
|
|
160 }
|
|
161
|
|
162 /* Interleave the two 16-bit values from the low half of M1 with the two
|
|
163 16-bit values from the low half of M2. */
|
|
164 static __inline __m64
|
|
165 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
|
|
166 {
|
|
167 return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2);
|
|
168 }
|
|
169
|
|
170 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
|
|
171 value from the low half of M2. */
|
|
172 static __inline __m64
|
|
173 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
|
|
174 {
|
|
175 return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2);
|
|
176 }
|
|
177
|
|
178 /* Take the four 8-bit values from the low half of M1, sign extend them,
|
|
179 and return the result as a vector of four 16-bit quantities. */
|
|
180 static __inline __m64
|
|
181 _mm_unpackel_pi8 (__m64 __m1)
|
|
182 {
|
|
183 return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1);
|
|
184 }
|
|
185
|
|
186 /* Take the two 16-bit values from the low half of M1, sign extend them,
|
|
187 and return the result as a vector of two 32-bit quantities. */
|
|
188 static __inline __m64
|
|
189 _mm_unpackel_pi16 (__m64 __m1)
|
|
190 {
|
|
191 return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1);
|
|
192 }
|
|
193
|
|
194 /* Take the 32-bit value from the low half of M1, and return it sign extended
|
|
195 to 64 bits. */
|
|
196 static __inline __m64
|
|
197 _mm_unpackel_pi32 (__m64 __m1)
|
|
198 {
|
|
199 return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1);
|
|
200 }
|
|
201
|
|
202 /* Take the four 8-bit values from the high half of M1, sign extend them,
|
|
203 and return the result as a vector of four 16-bit quantities. */
|
|
204 static __inline __m64
|
|
205 _mm_unpackeh_pi8 (__m64 __m1)
|
|
206 {
|
|
207 return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1);
|
|
208 }
|
|
209
|
|
210 /* Take the two 16-bit values from the high half of M1, sign extend them,
|
|
211 and return the result as a vector of two 32-bit quantities. */
|
|
212 static __inline __m64
|
|
213 _mm_unpackeh_pi16 (__m64 __m1)
|
|
214 {
|
|
215 return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1);
|
|
216 }
|
|
217
|
|
218 /* Take the 32-bit value from the high half of M1, and return it sign extended
|
|
219 to 64 bits. */
|
|
220 static __inline __m64
|
|
221 _mm_unpackeh_pi32 (__m64 __m1)
|
|
222 {
|
|
223 return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1);
|
|
224 }
|
|
225
|
|
226 /* Take the four 8-bit values from the low half of M1, zero extend them,
|
|
227 and return the result as a vector of four 16-bit quantities. */
|
|
228 static __inline __m64
|
|
229 _mm_unpackel_pu8 (__m64 __m1)
|
|
230 {
|
|
231 return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1);
|
|
232 }
|
|
233
|
|
234 /* Take the two 16-bit values from the low half of M1, zero extend them,
|
|
235 and return the result as a vector of two 32-bit quantities. */
|
|
236 static __inline __m64
|
|
237 _mm_unpackel_pu16 (__m64 __m1)
|
|
238 {
|
|
239 return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1);
|
|
240 }
|
|
241
|
|
242 /* Take the 32-bit value from the low half of M1, and return it zero extended
|
|
243 to 64 bits. */
|
|
244 static __inline __m64
|
|
245 _mm_unpackel_pu32 (__m64 __m1)
|
|
246 {
|
|
247 return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1);
|
|
248 }
|
|
249
|
|
250 /* Take the four 8-bit values from the high half of M1, zero extend them,
|
|
251 and return the result as a vector of four 16-bit quantities. */
|
|
252 static __inline __m64
|
|
253 _mm_unpackeh_pu8 (__m64 __m1)
|
|
254 {
|
|
255 return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1);
|
|
256 }
|
|
257
|
|
258 /* Take the two 16-bit values from the high half of M1, zero extend them,
|
|
259 and return the result as a vector of two 32-bit quantities. */
|
|
260 static __inline __m64
|
|
261 _mm_unpackeh_pu16 (__m64 __m1)
|
|
262 {
|
|
263 return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1);
|
|
264 }
|
|
265
|
|
266 /* Take the 32-bit value from the high half of M1, and return it zero extended
|
|
267 to 64 bits. */
|
|
268 static __inline __m64
|
|
269 _mm_unpackeh_pu32 (__m64 __m1)
|
|
270 {
|
|
271 return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1);
|
|
272 }
|
|
273
|
|
274 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
|
|
275 static __inline __m64
|
|
276 _mm_add_pi8 (__m64 __m1, __m64 __m2)
|
|
277 {
|
|
278 return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2);
|
|
279 }
|
|
280
|
|
281 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
|
|
282 static __inline __m64
|
|
283 _mm_add_pi16 (__m64 __m1, __m64 __m2)
|
|
284 {
|
|
285 return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2);
|
|
286 }
|
|
287
|
|
288 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
|
|
289 static __inline __m64
|
|
290 _mm_add_pi32 (__m64 __m1, __m64 __m2)
|
|
291 {
|
|
292 return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2);
|
|
293 }
|
|
294
|
|
295 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
|
|
296 saturated arithmetic. */
|
|
297 static __inline __m64
|
|
298 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
|
|
299 {
|
|
300 return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2);
|
|
301 }
|
|
302
|
|
303 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
|
|
304 saturated arithmetic. */
|
|
305 static __inline __m64
|
|
306 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
|
|
307 {
|
|
308 return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2);
|
|
309 }
|
|
310
|
|
311 /* Add the 32-bit values in M1 to the 32-bit values in M2 using signed
|
|
312 saturated arithmetic. */
|
|
313 static __inline __m64
|
|
314 _mm_adds_pi32 (__m64 __m1, __m64 __m2)
|
|
315 {
|
|
316 return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2);
|
|
317 }
|
|
318
|
|
319 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
|
|
320 saturated arithmetic. */
|
|
321 static __inline __m64
|
|
322 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
|
|
323 {
|
|
324 return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2);
|
|
325 }
|
|
326
|
|
327 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
|
|
328 saturated arithmetic. */
|
|
329 static __inline __m64
|
|
330 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
|
|
331 {
|
|
332 return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2);
|
|
333 }
|
|
334
|
|
335 /* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned
|
|
336 saturated arithmetic. */
|
|
337 static __inline __m64
|
|
338 _mm_adds_pu32 (__m64 __m1, __m64 __m2)
|
|
339 {
|
|
340 return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2);
|
|
341 }
|
|
342
|
|
343 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
|
|
344 static __inline __m64
|
|
345 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
|
|
346 {
|
|
347 return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2);
|
|
348 }
|
|
349
|
|
350 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
|
|
351 static __inline __m64
|
|
352 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
|
|
353 {
|
|
354 return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2);
|
|
355 }
|
|
356
|
|
357 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
|
|
358 static __inline __m64
|
|
359 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
|
|
360 {
|
|
361 return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2);
|
|
362 }
|
|
363
|
|
364 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
|
|
365 saturating arithmetic. */
|
|
366 static __inline __m64
|
|
367 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
|
|
368 {
|
|
369 return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2);
|
|
370 }
|
|
371
|
|
372 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
|
|
373 signed saturating arithmetic. */
|
|
374 static __inline __m64
|
|
375 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
|
|
376 {
|
|
377 return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2);
|
|
378 }
|
|
379
|
|
380 /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
|
|
381 signed saturating arithmetic. */
|
|
382 static __inline __m64
|
|
383 _mm_subs_pi32 (__m64 __m1, __m64 __m2)
|
|
384 {
|
|
385 return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2);
|
|
386 }
|
|
387
|
|
388 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
|
|
389 unsigned saturating arithmetic. */
|
|
390 static __inline __m64
|
|
391 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
|
|
392 {
|
|
393 return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2);
|
|
394 }
|
|
395
|
|
396 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
|
|
397 unsigned saturating arithmetic. */
|
|
398 static __inline __m64
|
|
399 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
|
|
400 {
|
|
401 return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2);
|
|
402 }
|
|
403
|
|
404 /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
|
|
405 unsigned saturating arithmetic. */
|
|
406 static __inline __m64
|
|
407 _mm_subs_pu32 (__m64 __m1, __m64 __m2)
|
|
408 {
|
|
409 return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2);
|
|
410 }
|
|
411
|
|
412 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
|
|
413 four 32-bit intermediate results, which are then summed by pairs to
|
|
414 produce two 32-bit results. */
|
|
415 static __inline __m64
|
|
416 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
|
|
417 {
|
|
418 return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2);
|
|
419 }
|
|
420
|
|
421 /* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values in M2 producing
|
|
422 four 32-bit intermediate results, which are then summed by pairs to
|
|
423 produce two 32-bit results. */
|
|
424 static __inline __m64
|
|
425 _mm_madd_pu16 (__m64 __m1, __m64 __m2)
|
|
426 {
|
|
427 return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2);
|
|
428 }
|
|
429
|
|
430 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
|
|
431 M2 and produce the high 16 bits of the 32-bit results. */
|
|
432 static __inline __m64
|
|
433 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
|
|
434 {
|
|
435 return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2);
|
|
436 }
|
|
437
|
|
438 /* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values in
|
|
439 M2 and produce the high 16 bits of the 32-bit results. */
|
|
440 static __inline __m64
|
|
441 _mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
|
|
442 {
|
|
443 return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2);
|
|
444 }
|
|
445
|
|
446 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
|
|
447 the low 16 bits of the results. */
|
|
448 static __inline __m64
|
|
449 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
|
|
450 {
|
|
451 return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2);
|
|
452 }
|
|
453
|
|
454 /* Shift four 16-bit values in M left by COUNT. */
|
|
455 static __inline __m64
|
|
456 _mm_sll_pi16 (__m64 __m, __m64 __count)
|
|
457 {
|
|
458 return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count);
|
|
459 }
|
|
460
|
|
461 static __inline __m64
|
|
462 _mm_slli_pi16 (__m64 __m, int __count)
|
|
463 {
|
|
464 return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count);
|
|
465 }
|
|
466
|
|
467 /* Shift two 32-bit values in M left by COUNT. */
|
|
468 static __inline __m64
|
|
469 _mm_sll_pi32 (__m64 __m, __m64 __count)
|
|
470 {
|
|
471 return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count);
|
|
472 }
|
|
473
|
|
474 static __inline __m64
|
|
475 _mm_slli_pi32 (__m64 __m, int __count)
|
|
476 {
|
|
477 return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count);
|
|
478 }
|
|
479
|
|
480 /* Shift the 64-bit value in M left by COUNT. */
|
|
481 static __inline __m64
|
|
482 _mm_sll_si64 (__m64 __m, __m64 __count)
|
|
483 {
|
|
484 return (__m64) __builtin_arm_wslld (__m, __count);
|
|
485 }
|
|
486
|
|
487 static __inline __m64
|
|
488 _mm_slli_si64 (__m64 __m, int __count)
|
|
489 {
|
|
490 return (__m64) __builtin_arm_wslldi (__m, __count);
|
|
491 }
|
|
492
|
|
493 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
|
|
494 static __inline __m64
|
|
495 _mm_sra_pi16 (__m64 __m, __m64 __count)
|
|
496 {
|
|
497 return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count);
|
|
498 }
|
|
499
|
|
500 static __inline __m64
|
|
501 _mm_srai_pi16 (__m64 __m, int __count)
|
|
502 {
|
|
503 return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count);
|
|
504 }
|
|
505
|
|
506 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
|
|
507 static __inline __m64
|
|
508 _mm_sra_pi32 (__m64 __m, __m64 __count)
|
|
509 {
|
|
510 return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count);
|
|
511 }
|
|
512
|
|
513 static __inline __m64
|
|
514 _mm_srai_pi32 (__m64 __m, int __count)
|
|
515 {
|
|
516 return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count);
|
|
517 }
|
|
518
|
|
519 /* Shift the 64-bit value in M right by COUNT; shift in the sign bit. */
|
|
520 static __inline __m64
|
|
521 _mm_sra_si64 (__m64 __m, __m64 __count)
|
|
522 {
|
|
523 return (__m64) __builtin_arm_wsrad (__m, __count);
|
|
524 }
|
|
525
|
|
526 static __inline __m64
|
|
527 _mm_srai_si64 (__m64 __m, int __count)
|
|
528 {
|
|
529 return (__m64) __builtin_arm_wsradi (__m, __count);
|
|
530 }
|
|
531
|
|
532 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
|
|
533 static __inline __m64
|
|
534 _mm_srl_pi16 (__m64 __m, __m64 __count)
|
|
535 {
|
|
536 return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count);
|
|
537 }
|
|
538
|
|
539 static __inline __m64
|
|
540 _mm_srli_pi16 (__m64 __m, int __count)
|
|
541 {
|
|
542 return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count);
|
|
543 }
|
|
544
|
|
545 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
|
|
546 static __inline __m64
|
|
547 _mm_srl_pi32 (__m64 __m, __m64 __count)
|
|
548 {
|
|
549 return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count);
|
|
550 }
|
|
551
|
|
552 static __inline __m64
|
|
553 _mm_srli_pi32 (__m64 __m, int __count)
|
|
554 {
|
|
555 return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count);
|
|
556 }
|
|
557
|
|
558 /* Shift the 64-bit value in M right by COUNT; shift in zeros. */
|
|
559 static __inline __m64
|
|
560 _mm_srl_si64 (__m64 __m, __m64 __count)
|
|
561 {
|
|
562 return (__m64) __builtin_arm_wsrld (__m, __count);
|
|
563 }
|
|
564
|
|
565 static __inline __m64
|
|
566 _mm_srli_si64 (__m64 __m, int __count)
|
|
567 {
|
|
568 return (__m64) __builtin_arm_wsrldi (__m, __count);
|
|
569 }
|
|
570
|
|
571 /* Rotate four 16-bit values in M right by COUNT. */
|
|
572 static __inline __m64
|
|
573 _mm_ror_pi16 (__m64 __m, __m64 __count)
|
|
574 {
|
|
575 return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count);
|
|
576 }
|
|
577
|
|
578 static __inline __m64
|
|
579 _mm_rori_pi16 (__m64 __m, int __count)
|
|
580 {
|
|
581 return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count);
|
|
582 }
|
|
583
|
|
584 /* Rotate two 32-bit values in M right by COUNT. */
|
|
585 static __inline __m64
|
|
586 _mm_ror_pi32 (__m64 __m, __m64 __count)
|
|
587 {
|
|
588 return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count);
|
|
589 }
|
|
590
|
|
591 static __inline __m64
|
|
592 _mm_rori_pi32 (__m64 __m, int __count)
|
|
593 {
|
|
594 return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count);
|
|
595 }
|
|
596
|
|
597 /* Rotate the 64-bit value in M right by COUNT. */
|
|
598 static __inline __m64
|
|
599 _mm_ror_si64 (__m64 __m, __m64 __count)
|
|
600 {
|
|
601 return (__m64) __builtin_arm_wrord (__m, __count);
|
|
602 }
|
|
603
|
|
604 static __inline __m64
|
|
605 _mm_rori_si64 (__m64 __m, int __count)
|
|
606 {
|
|
607 return (__m64) __builtin_arm_wrordi (__m, __count);
|
|
608 }
|
|
609
|
|
610 /* Bit-wise AND the 64-bit values in M1 and M2. */
|
|
611 static __inline __m64
|
|
612 _mm_and_si64 (__m64 __m1, __m64 __m2)
|
|
613 {
|
|
614 return __builtin_arm_wand (__m1, __m2);
|
|
615 }
|
|
616
|
|
617 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
|
|
618 64-bit value in M2. */
|
|
619 static __inline __m64
|
|
620 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
|
|
621 {
|
111
|
622 return __builtin_arm_wandn (__m2, __m1);
|
0
|
623 }
|
|
624
|
|
625 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
|
|
626 static __inline __m64
|
|
627 _mm_or_si64 (__m64 __m1, __m64 __m2)
|
|
628 {
|
|
629 return __builtin_arm_wor (__m1, __m2);
|
|
630 }
|
|
631
|
|
632 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
|
|
633 static __inline __m64
|
|
634 _mm_xor_si64 (__m64 __m1, __m64 __m2)
|
|
635 {
|
|
636 return __builtin_arm_wxor (__m1, __m2);
|
|
637 }
|
|
638
|
|
639 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
|
|
640 test is true and zero if false. */
|
|
641 static __inline __m64
|
|
642 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
|
|
643 {
|
|
644 return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
|
|
645 }
|
|
646
|
|
647 static __inline __m64
|
|
648 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
|
|
649 {
|
|
650 return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2);
|
|
651 }
|
|
652
|
|
653 static __inline __m64
|
|
654 _mm_cmpgt_pu8 (__m64 __m1, __m64 __m2)
|
|
655 {
|
|
656 return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2);
|
|
657 }
|
|
658
|
|
659 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
|
|
660 the test is true and zero if false. */
|
|
661 static __inline __m64
|
|
662 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
|
|
663 {
|
|
664 return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2);
|
|
665 }
|
|
666
|
|
667 static __inline __m64
|
|
668 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
|
|
669 {
|
|
670 return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2);
|
|
671 }
|
|
672
|
|
673 static __inline __m64
|
|
674 _mm_cmpgt_pu16 (__m64 __m1, __m64 __m2)
|
|
675 {
|
|
676 return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2);
|
|
677 }
|
|
678
|
|
679 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
|
|
680 the test is true and zero if false. */
|
|
681 static __inline __m64
|
|
682 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
|
|
683 {
|
|
684 return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2);
|
|
685 }
|
|
686
|
|
687 static __inline __m64
|
|
688 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
|
|
689 {
|
|
690 return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2);
|
|
691 }
|
|
692
|
|
693 static __inline __m64
|
|
694 _mm_cmpgt_pu32 (__m64 __m1, __m64 __m2)
|
|
695 {
|
|
696 return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2);
|
|
697 }
|
|
698
|
|
699 /* Element-wise multiplication of unsigned 16-bit values __B and __C, followed
|
|
700 by accumulate across all elements and __A. */
|
|
701 static __inline __m64
|
|
702 _mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C)
|
|
703 {
|
|
704 return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C);
|
|
705 }
|
|
706
|
|
707 /* Element-wise multiplication of signed 16-bit values __B and __C, followed
|
|
708 by accumulate across all elements and __A. */
|
|
709 static __inline __m64
|
|
710 _mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C)
|
|
711 {
|
|
712 return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C);
|
|
713 }
|
|
714
|
|
715 /* Element-wise multiplication of unsigned 16-bit values __A and __B, followed
|
|
716 by accumulate across all elements. */
|
|
717 static __inline __m64
|
|
718 _mm_macz_pu16 (__m64 __A, __m64 __B)
|
|
719 {
|
|
720 return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B);
|
|
721 }
|
|
722
|
|
723 /* Element-wise multiplication of signed 16-bit values __A and __B, followed
|
|
724 by accumulate across all elements. */
|
|
725 static __inline __m64
|
|
726 _mm_macz_pi16 (__m64 __A, __m64 __B)
|
|
727 {
|
|
728 return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B);
|
|
729 }
|
|
730
|
|
731 /* Accumulate across all unsigned 8-bit values in __A. */
|
|
732 static __inline __m64
|
|
733 _mm_acc_pu8 (__m64 __A)
|
|
734 {
|
|
735 return __builtin_arm_waccb ((__v8qi)__A);
|
|
736 }
|
|
737
|
|
738 /* Accumulate across all unsigned 16-bit values in __A. */
|
|
739 static __inline __m64
|
|
740 _mm_acc_pu16 (__m64 __A)
|
|
741 {
|
|
742 return __builtin_arm_wacch ((__v4hi)__A);
|
|
743 }
|
|
744
|
|
745 /* Accumulate across all unsigned 32-bit values in __A. */
|
|
746 static __inline __m64
|
|
747 _mm_acc_pu32 (__m64 __A)
|
|
748 {
|
|
749 return __builtin_arm_waccw ((__v2si)__A);
|
|
750 }
|
|
751
|
|
752 static __inline __m64
|
|
753 _mm_mia_si64 (__m64 __A, int __B, int __C)
|
|
754 {
|
|
755 return __builtin_arm_tmia (__A, __B, __C);
|
|
756 }
|
|
757
|
|
758 static __inline __m64
|
|
759 _mm_miaph_si64 (__m64 __A, int __B, int __C)
|
|
760 {
|
|
761 return __builtin_arm_tmiaph (__A, __B, __C);
|
|
762 }
|
|
763
|
|
764 static __inline __m64
|
|
765 _mm_miabb_si64 (__m64 __A, int __B, int __C)
|
|
766 {
|
|
767 return __builtin_arm_tmiabb (__A, __B, __C);
|
|
768 }
|
|
769
|
|
770 static __inline __m64
|
|
771 _mm_miabt_si64 (__m64 __A, int __B, int __C)
|
|
772 {
|
|
773 return __builtin_arm_tmiabt (__A, __B, __C);
|
|
774 }
|
|
775
|
|
776 static __inline __m64
|
|
777 _mm_miatb_si64 (__m64 __A, int __B, int __C)
|
|
778 {
|
|
779 return __builtin_arm_tmiatb (__A, __B, __C);
|
|
780 }
|
|
781
|
|
782 static __inline __m64
|
|
783 _mm_miatt_si64 (__m64 __A, int __B, int __C)
|
|
784 {
|
|
785 return __builtin_arm_tmiatt (__A, __B, __C);
|
|
786 }
|
|
787
|
|
788 /* Extract one of the elements of A and sign extend. The selector N must
|
|
789 be immediate. */
|
|
/* Sign-extending extract from the __v8qi view of A; N must be immediate.  */
#define _mm_extract_pi8(A, N) \
  (__builtin_arm_textrmsb ((__v8qi) (A), (N)))
|
|
/* Sign-extending extract from the __v4hi view of A; N must be immediate.  */
#define _mm_extract_pi16(A, N) \
  (__builtin_arm_textrmsh ((__v4hi) (A), (N)))
|
|
/* Sign-extending extract from the __v2si view of A; N must be immediate.  */
#define _mm_extract_pi32(A, N) \
  (__builtin_arm_textrmsw ((__v2si) (A), (N)))
|
|
793
|
|
794 /* Extract one of the elements of A and zero extend. The selector N must
|
|
795 be immediate. */
|
|
/* Zero-extending extract from the __v8qi view of A; N must be immediate.  */
#define _mm_extract_pu8(A, N) \
  (__builtin_arm_textrmub ((__v8qi) (A), (N)))
|
|
/* Zero-extending extract from the __v4hi view of A; N must be immediate.  */
#define _mm_extract_pu16(A, N) \
  (__builtin_arm_textrmuh ((__v4hi) (A), (N)))
|
|
/* Zero-extending extract from the __v2si view of A; N must be immediate.  */
#define _mm_extract_pu32(A, N) \
  (__builtin_arm_textrmuw ((__v2si) (A), (N)))
|
|
799
|
|
800 /* Inserts word D into one of the elements of A. The selector N must be
|
|
801 immediate. */
|
|
/* Insert D into the element of the __v8qi view of A selected by the
   immediate N.  */
#define _mm_insert_pi8(A, D, N) \
  ((__m64) __builtin_arm_tinsrb ((__v8qi) (A), (D), (N)))
|
|
/* Insert D into the element of the __v4hi view of A selected by the
   immediate N.  */
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_arm_tinsrh ((__v4hi) (A), (D), (N)))
|
|
/* Insert D into the element of the __v2si view of A selected by the
   immediate N.  */
#define _mm_insert_pi32(A, D, N) \
  ((__m64) __builtin_arm_tinsrw ((__v2si) (A), (D), (N)))
|
|
808
|
|
809 /* Compute the element-wise maximum of signed 8-bit values. */
|
|
810 static __inline __m64
|
|
811 _mm_max_pi8 (__m64 __A, __m64 __B)
|
|
812 {
|
|
813 return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B);
|
|
814 }
|
|
815
|
|
816 /* Compute the element-wise maximum of signed 16-bit values. */
|
|
817 static __inline __m64
|
|
818 _mm_max_pi16 (__m64 __A, __m64 __B)
|
|
819 {
|
|
820 return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B);
|
|
821 }
|
|
822
|
|
823 /* Compute the element-wise maximum of signed 32-bit values. */
|
|
824 static __inline __m64
|
|
825 _mm_max_pi32 (__m64 __A, __m64 __B)
|
|
826 {
|
|
827 return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B);
|
|
828 }
|
|
829
|
|
830 /* Compute the element-wise maximum of unsigned 8-bit values. */
|
|
831 static __inline __m64
|
|
832 _mm_max_pu8 (__m64 __A, __m64 __B)
|
|
833 {
|
|
834 return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B);
|
|
835 }
|
|
836
|
|
837 /* Compute the element-wise maximum of unsigned 16-bit values. */
|
|
838 static __inline __m64
|
|
839 _mm_max_pu16 (__m64 __A, __m64 __B)
|
|
840 {
|
|
841 return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B);
|
|
842 }
|
|
843
|
|
844 /* Compute the element-wise maximum of unsigned 32-bit values. */
|
|
845 static __inline __m64
|
|
846 _mm_max_pu32 (__m64 __A, __m64 __B)
|
|
847 {
|
|
848 return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B);
|
|
849 }
|
|
850
|
|
851 /* Compute the element-wise minimum of signed 8-bit values. */
|
|
852 static __inline __m64
|
|
853 _mm_min_pi8 (__m64 __A, __m64 __B)
|
|
854 {
|
|
855 return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B);
|
|
856 }
|
|
857
|
|
858 /* Compute the element-wise minimum of signed 16-bit values. */
|
|
859 static __inline __m64
|
|
860 _mm_min_pi16 (__m64 __A, __m64 __B)
|
|
861 {
|
|
862 return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B);
|
|
863 }
|
|
864
|
|
865 /* Compute the element-wise minimum of signed 32-bit values. */
|
|
866 static __inline __m64
|
|
867 _mm_min_pi32 (__m64 __A, __m64 __B)
|
|
868 {
|
|
869 return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B);
|
|
870 }
|
|
871
|
|
872 /* Compute the element-wise minimum of unsigned 8-bit values. */
|
|
873 static __inline __m64
|
|
874 _mm_min_pu8 (__m64 __A, __m64 __B)
|
|
875 {
|
|
876 return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B);
|
|
877 }
|
|
878
|
|
879 /* Compute the element-wise minimum of unsigned 16-bit values. */
|
|
880 static __inline __m64
|
|
881 _mm_min_pu16 (__m64 __A, __m64 __B)
|
|
882 {
|
|
883 return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B);
|
|
884 }
|
|
885
|
|
886 /* Compute the element-wise minimum of unsigned 32-bit values. */
|
|
887 static __inline __m64
|
|
888 _mm_min_pu32 (__m64 __A, __m64 __B)
|
|
889 {
|
|
890 return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B);
|
|
891 }
|
|
892
|
|
893 /* Create an 8-bit mask of the signs of 8-bit values. */
|
|
894 static __inline int
|
|
895 _mm_movemask_pi8 (__m64 __A)
|
|
896 {
|
|
897 return __builtin_arm_tmovmskb ((__v8qi)__A);
|
|
898 }
|
|
899
|
|
900 /* Create an 8-bit mask of the signs of 16-bit values. */
|
|
901 static __inline int
|
|
902 _mm_movemask_pi16 (__m64 __A)
|
|
903 {
|
|
904 return __builtin_arm_tmovmskh ((__v4hi)__A);
|
|
905 }
|
|
906
|
|
907 /* Create an 8-bit mask of the signs of 32-bit values. */
|
|
908 static __inline int
|
|
909 _mm_movemask_pi32 (__m64 __A)
|
|
910 {
|
|
911 return __builtin_arm_tmovmskw ((__v2si)__A);
|
|
912 }
|
|
913
|
|
/* Produce a rearrangement of the four 16-bit values of A chosen by the
   selector N.  N has to be a compile-time immediate.  */
#define _mm_shuffle_pi16(A, N)						\
  ((__m64) __builtin_arm_wshufh ((__v4hi) (A), (N)))
|
|
918
|
|
919
|
|
920 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
|
|
921 static __inline __m64
|
|
922 _mm_avg_pu8 (__m64 __A, __m64 __B)
|
|
923 {
|
|
924 return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B);
|
|
925 }
|
|
926
|
|
927 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
|
|
928 static __inline __m64
|
|
929 _mm_avg_pu16 (__m64 __A, __m64 __B)
|
|
930 {
|
|
931 return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B);
|
|
932 }
|
|
933
|
|
934 /* Compute the averages of the unsigned 8-bit values in A and B. */
|
|
935 static __inline __m64
|
|
936 _mm_avg2_pu8 (__m64 __A, __m64 __B)
|
|
937 {
|
|
938 return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B);
|
|
939 }
|
|
940
|
|
941 /* Compute the averages of the unsigned 16-bit values in A and B. */
|
|
942 static __inline __m64
|
|
943 _mm_avg2_pu16 (__m64 __A, __m64 __B)
|
|
944 {
|
|
945 return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B);
|
|
946 }
|
|
947
|
|
948 /* Compute the sum of the absolute differences of the unsigned 8-bit
|
|
949 values in A and B. Return the value in the lower 16-bit word; the
|
|
950 upper words are cleared. */
|
|
951 static __inline __m64
|
|
952 _mm_sad_pu8 (__m64 __A, __m64 __B)
|
|
953 {
|
111
|
954 return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
|
|
955 }
|
|
956
|
|
957 static __inline __m64
|
|
958 _mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
|
|
959 {
|
|
960 return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
|
0
|
961 }
|
|
962
|
|
963 /* Compute the sum of the absolute differences of the unsigned 16-bit
|
|
964 values in A and B. Return the value in the lower 32-bit word; the
|
|
965 upper words are cleared. */
|
|
966 static __inline __m64
|
|
967 _mm_sad_pu16 (__m64 __A, __m64 __B)
|
|
968 {
|
111
|
969 return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
|
0
|
970 }
|
|
971
|
111
|
972 static __inline __m64
|
|
973 _mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
|
|
974 {
|
|
975 return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
|
|
976 }
|
|
977
|
|
978
|
0
|
979 /* Compute the sum of the absolute differences of the unsigned 8-bit
|
|
980 values in A and B. Return the value in the lower 16-bit word; the
|
|
981 upper words are cleared. */
|
|
982 static __inline __m64
|
|
983 _mm_sadz_pu8 (__m64 __A, __m64 __B)
|
|
984 {
|
|
985 return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
|
|
986 }
|
|
987
|
|
988 /* Compute the sum of the absolute differences of the unsigned 16-bit
|
|
989 values in A and B. Return the value in the lower 32-bit word; the
|
|
990 upper words are cleared. */
|
|
991 static __inline __m64
|
|
992 _mm_sadz_pu16 (__m64 __A, __m64 __B)
|
|
993 {
|
|
994 return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
|
|
995 }
|
|
996
|
111
|
/* Forward A, B (as 8-bit vectors) and N to __builtin_arm_walign.
   NOTE(review): N is presumably required to be an immediate byte
   offset -- confirm.  The whole expansion is parenthesized so the macro
   behaves as a single primary expression at the use site.  */
#define _mm_align_si64(__A,__B, N) \
  ((__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N)))
|
0
|
999
|
|
1000 /* Creates a 64-bit zero. */
|
|
1001 static __inline __m64
|
|
1002 _mm_setzero_si64 (void)
|
|
1003 {
|
|
1004 return __builtin_arm_wzero ();
|
|
1005 }
|
|
1006
|
|
/* Set and Get arbitrary iWMMXt Control registers.
   Note only registers 0-3 and 8-11 are currently defined,
   the rest are reserved.  */

/* Write __value to the control register numbered __regno.  Registers
   0-3 are written with a TMCR instruction (wcid, wcon, wcssf, wcasf);
   registers 8-11 go through the __builtin_arm_setwcgr0..3 builtins.
   Any other register number is silently ignored.  */
static __inline void
_mm_setwcx (const int __value, const int __regno)
{
  if (__regno == 0)
    __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
  else if (__regno == 1)
    __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
  else if (__regno == 2)
    __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
  else if (__regno == 3)
    __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
  else if (__regno == 8)
    __builtin_arm_setwcgr0 (__value);
  else if (__regno == 9)
    __builtin_arm_setwcgr1 (__value);
  else if (__regno == 10)
    __builtin_arm_setwcgr2 (__value);
  else if (__regno == 11)
    __builtin_arm_setwcgr3 (__value);
}
|
|
1044
|
|
/* Read the iWMMXt control register numbered __regno.  Registers 0-3
   are read with a TMRC instruction (wcid, wcon, wcssf, wcasf);
   registers 8-11 come from the __builtin_arm_getwcgr0..3 builtins.
   For any other (reserved) register number the function returns 0;
   previously it returned an uninitialized local, which is undefined
   behavior.  */
static __inline int
_mm_getwcx (const int __regno)
{
  /* Defined fallback for reserved register numbers.  */
  int __value = 0;

  switch (__regno)
    {
    case 0:
      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
      break;
    case 1:
      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
      break;
    case 2:
      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
      break;
    case 3:
      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
      break;
    case 8:
      return __builtin_arm_getwcgr0 ();
    case 9:
      return __builtin_arm_getwcgr1 ();
    case 10:
      return __builtin_arm_getwcgr2 ();
    case 11:
      return __builtin_arm_getwcgr3 ();
    default:
      break;
    }
  return __value;
}
|
|
1076
|
|
1077 /* Creates a vector of two 32-bit values; I0 is least significant. */
|
|
1078 static __inline __m64
|
|
1079 _mm_set_pi32 (int __i1, int __i0)
|
|
1080 {
|
111
|
1081 union
|
|
1082 {
|
0
|
1083 __m64 __q;
|
111
|
1084 struct
|
|
1085 {
|
0
|
1086 unsigned int __i0;
|
|
1087 unsigned int __i1;
|
|
1088 } __s;
|
|
1089 } __u;
|
|
1090
|
|
1091 __u.__s.__i0 = __i0;
|
|
1092 __u.__s.__i1 = __i1;
|
|
1093
|
|
1094 return __u.__q;
|
|
1095 }
|
|
1096
|
|
1097 /* Creates a vector of four 16-bit values; W0 is least significant. */
|
|
1098 static __inline __m64
|
|
1099 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
|
|
1100 {
|
111
|
1101 unsigned int __i1 = (unsigned short) __w3 << 16 | (unsigned short) __w2;
|
|
1102 unsigned int __i0 = (unsigned short) __w1 << 16 | (unsigned short) __w0;
|
|
1103
|
0
|
1104 return _mm_set_pi32 (__i1, __i0);
|
|
1105 }
|
|
1106
|
|
1107 /* Creates a vector of eight 8-bit values; B0 is least significant. */
|
|
1108 static __inline __m64
|
|
1109 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
|
|
1110 char __b3, char __b2, char __b1, char __b0)
|
|
1111 {
|
|
1112 unsigned int __i1, __i0;
|
|
1113
|
|
1114 __i1 = (unsigned char)__b7;
|
|
1115 __i1 = __i1 << 8 | (unsigned char)__b6;
|
|
1116 __i1 = __i1 << 8 | (unsigned char)__b5;
|
|
1117 __i1 = __i1 << 8 | (unsigned char)__b4;
|
|
1118
|
|
1119 __i0 = (unsigned char)__b3;
|
|
1120 __i0 = __i0 << 8 | (unsigned char)__b2;
|
|
1121 __i0 = __i0 << 8 | (unsigned char)__b1;
|
|
1122 __i0 = __i0 << 8 | (unsigned char)__b0;
|
|
1123
|
|
1124 return _mm_set_pi32 (__i1, __i0);
|
|
1125 }
|
|
1126
|
|
1127 /* Similar, but with the arguments in reverse order. */
|
|
1128 static __inline __m64
|
|
1129 _mm_setr_pi32 (int __i0, int __i1)
|
|
1130 {
|
|
1131 return _mm_set_pi32 (__i1, __i0);
|
|
1132 }
|
|
1133
|
|
1134 static __inline __m64
|
|
1135 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
|
|
1136 {
|
|
1137 return _mm_set_pi16 (__w3, __w2, __w1, __w0);
|
|
1138 }
|
|
1139
|
|
1140 static __inline __m64
|
|
1141 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
|
|
1142 char __b4, char __b5, char __b6, char __b7)
|
|
1143 {
|
|
1144 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
|
|
1145 }
|
|
1146
|
|
1147 /* Creates a vector of two 32-bit values, both elements containing I. */
|
|
1148 static __inline __m64
|
|
1149 _mm_set1_pi32 (int __i)
|
|
1150 {
|
|
1151 return _mm_set_pi32 (__i, __i);
|
|
1152 }
|
|
1153
|
|
1154 /* Creates a vector of four 16-bit values, all elements containing W. */
|
|
1155 static __inline __m64
|
|
1156 _mm_set1_pi16 (short __w)
|
|
1157 {
|
|
1158 unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
|
|
1159 return _mm_set1_pi32 (__i);
|
|
1160 }
|
|
1161
|
|
1162 /* Creates a vector of four 16-bit values, all elements containing B. */
|
|
1163 static __inline __m64
|
|
1164 _mm_set1_pi8 (char __b)
|
|
1165 {
|
|
1166 unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
|
|
1167 unsigned int __i = __w << 16 | __w;
|
|
1168 return _mm_set1_pi32 (__i);
|
|
1169 }
|
|
1170
|
111
|
1171 #ifdef __IWMMXT2__
|
|
1172 static __inline __m64
|
|
1173 _mm_abs_pi8 (__m64 m1)
|
|
1174 {
|
|
1175 return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
|
|
1176 }
|
|
1177
|
|
1178 static __inline __m64
|
|
1179 _mm_abs_pi16 (__m64 m1)
|
|
1180 {
|
|
1181 return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
|
|
1182
|
|
1183 }
|
|
1184
|
|
1185 static __inline __m64
|
|
1186 _mm_abs_pi32 (__m64 m1)
|
|
1187 {
|
|
1188 return (__m64) __builtin_arm_wabsw ((__v2si)m1);
|
|
1189
|
|
1190 }
|
|
1191
|
|
1192 static __inline __m64
|
|
1193 _mm_addsubhx_pi16 (__m64 a, __m64 b)
|
|
1194 {
|
|
1195 return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
|
|
1196 }
|
|
1197
|
|
1198 static __inline __m64
|
|
1199 _mm_absdiff_pu8 (__m64 a, __m64 b)
|
|
1200 {
|
|
1201 return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
|
|
1202 }
|
|
1203
|
|
1204 static __inline __m64
|
|
1205 _mm_absdiff_pu16 (__m64 a, __m64 b)
|
|
1206 {
|
|
1207 return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
|
|
1208 }
|
|
1209
|
|
1210 static __inline __m64
|
|
1211 _mm_absdiff_pu32 (__m64 a, __m64 b)
|
|
1212 {
|
|
1213 return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
|
|
1214 }
|
|
1215
|
|
1216 static __inline __m64
|
|
1217 _mm_addc_pu16 (__m64 a, __m64 b)
|
|
1218 {
|
|
1219 __m64 result;
|
|
1220 __asm__ __volatile__ ("waddhc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b));
|
|
1221 return result;
|
|
1222 }
|
|
1223
|
|
1224 static __inline __m64
|
|
1225 _mm_addc_pu32 (__m64 a, __m64 b)
|
|
1226 {
|
|
1227 __m64 result;
|
|
1228 __asm__ __volatile__ ("waddwc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b));
|
|
1229 return result;
|
|
1230 }
|
|
1231
|
|
1232 static __inline __m64
|
|
1233 _mm_avg4_pu8 (__m64 a, __m64 b)
|
|
1234 {
|
|
1235 return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
|
|
1236 }
|
|
1237
|
|
1238 static __inline __m64
|
|
1239 _mm_avg4r_pu8 (__m64 a, __m64 b)
|
|
1240 {
|
|
1241 return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
|
|
1242 }
|
|
1243
|
|
1244 static __inline __m64
|
|
1245 _mm_maddx_pi16 (__m64 a, __m64 b)
|
|
1246 {
|
|
1247 return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
|
|
1248 }
|
|
1249
|
|
1250 static __inline __m64
|
|
1251 _mm_maddx_pu16 (__m64 a, __m64 b)
|
|
1252 {
|
|
1253 return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
|
|
1254 }
|
|
1255
|
0
|
1256 static __inline __m64
|
111
|
1257 _mm_msub_pi16 (__m64 a, __m64 b)
|
|
1258 {
|
|
1259 return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
|
|
1260 }
|
|
1261
|
|
1262 static __inline __m64
|
|
1263 _mm_msub_pu16 (__m64 a, __m64 b)
|
|
1264 {
|
|
1265 return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
|
|
1266 }
|
|
1267
|
|
1268 static __inline __m64
|
|
1269 _mm_mulhi_pi32 (__m64 a, __m64 b)
|
|
1270 {
|
|
1271 return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
|
|
1272 }
|
|
1273
|
|
1274 static __inline __m64
|
|
1275 _mm_mulhi_pu32 (__m64 a, __m64 b)
|
|
1276 {
|
|
1277 return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
|
|
1278 }
|
|
1279
|
|
1280 static __inline __m64
|
|
1281 _mm_mulhir_pi16 (__m64 a, __m64 b)
|
|
1282 {
|
|
1283 return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
|
|
1284 }
|
|
1285
|
|
1286 static __inline __m64
|
|
1287 _mm_mulhir_pi32 (__m64 a, __m64 b)
|
|
1288 {
|
|
1289 return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
|
|
1290 }
|
|
1291
|
|
1292 static __inline __m64
|
|
1293 _mm_mulhir_pu16 (__m64 a, __m64 b)
|
|
1294 {
|
|
1295 return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
|
|
1296 }
|
|
1297
|
|
1298 static __inline __m64
|
|
1299 _mm_mulhir_pu32 (__m64 a, __m64 b)
|
0
|
1300 {
|
111
|
1301 return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
|
|
1302 }
|
|
1303
|
|
1304 static __inline __m64
|
|
1305 _mm_mullo_pi32 (__m64 a, __m64 b)
|
|
1306 {
|
|
1307 return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
|
|
1308 }
|
|
1309
|
|
1310 static __inline __m64
|
|
1311 _mm_qmulm_pi16 (__m64 a, __m64 b)
|
|
1312 {
|
|
1313 return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
|
|
1314 }
|
|
1315
|
|
1316 static __inline __m64
|
|
1317 _mm_qmulm_pi32 (__m64 a, __m64 b)
|
|
1318 {
|
|
1319 return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
|
|
1320 }
|
|
1321
|
|
1322 static __inline __m64
|
|
1323 _mm_qmulmr_pi16 (__m64 a, __m64 b)
|
|
1324 {
|
|
1325 return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
|
|
1326 }
|
|
1327
|
|
1328 static __inline __m64
|
|
1329 _mm_qmulmr_pi32 (__m64 a, __m64 b)
|
|
1330 {
|
|
1331 return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
|
|
1332 }
|
|
1333
|
|
1334 static __inline __m64
|
|
1335 _mm_subaddhx_pi16 (__m64 a, __m64 b)
|
|
1336 {
|
|
1337 return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
|
|
1338 }
|
|
1339
|
|
1340 static __inline __m64
|
|
1341 _mm_addbhusl_pu8 (__m64 a, __m64 b)
|
|
1342 {
|
|
1343 return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
|
|
1344 }
|
|
1345
|
|
1346 static __inline __m64
|
|
1347 _mm_addbhusm_pu8 (__m64 a, __m64 b)
|
|
1348 {
|
|
1349 return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
|
0
|
1350 }
|
|
1351
|
111
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiabb and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiabb_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabb ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1360
|
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiabbn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiabbn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1369
|
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiabt and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiabt_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabt ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1378
|
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiabtn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiabtn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1387
|
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiatb and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiatb_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiatb ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1396
|
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiatbn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiatbn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1405
|
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiatt and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiatt_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiatt ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1414
|
|
/* Pass ACC (as two 32-bit elements) and M1, M2 (as 16-bit elements) to
   __builtin_arm_wqmiattn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_qmiattn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiattn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1423
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiabb and yield the result as an __m64.  Each argument
   is evaluated once; the temporaries use reserved names so they cannot
   capture a caller variable.  */
#define _mm_wmiabb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabb (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1432
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiabbn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiabbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabbn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1441
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiabt and yield the result as an __m64.  Each argument
   is evaluated once; the temporaries use reserved names so they cannot
   capture a caller variable.  */
#define _mm_wmiabt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabt (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1450
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiabtn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiabtn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabtn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1459
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiatb and yield the result as an __m64.  Each argument
   is evaluated once; the temporaries use reserved names so they cannot
   capture a caller variable.  */
#define _mm_wmiatb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiatb (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1468
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiatbn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiatbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiatbn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1477
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiatt and yield the result as an __m64.  Each argument
   is evaluated once; the temporaries use reserved names so they cannot
   capture a caller variable.  */
#define _mm_wmiatt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiatt (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1486
|
|
/* Pass the 64-bit ACC and M1, M2 (as 16-bit elements) to
   __builtin_arm_wmiattn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiattn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiattn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
|
|
1495
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawbb and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawbb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbb (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1504
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawbbn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawbbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbbn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1513
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawbt and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawbt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbt (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1522
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawbtn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawbtn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbtn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1531
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawtb and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawtb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawtb (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1540
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawtbn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawtbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawtbn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1549
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawtt and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawtt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawtt (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1558
|
|
/* Pass the 64-bit ACC and M1, M2 (as 32-bit elements) to
   __builtin_arm_wmiawttn and yield the result as an __m64.  Each
   argument is evaluated once; the temporaries use reserved names so
   they cannot capture a caller variable.  */
#define _mm_wmiawttn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawttn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
|
|
1567
|
|
/* Pass A, B and N to __builtin_arm_wmerge and yield the result as an
   __m64.  The third argument should be an immediate.  The temporary
   uses a reserved name so that an argument spelled `result' is not
   captured by the macro's own local.  */
#define _mm_merge_si64(a, b, n) \
  ({\
   __m64 __result;\
   __result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
   __result;\
   })
|
|
1575 #endif /* __IWMMXT2__ */
|
|
1576
|
|
1577 static __inline __m64
|
|
1578 _mm_alignr0_si64 (__m64 a, __m64 b)
|
|
1579 {
|
|
1580 return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
|
|
1581 }
|
|
1582
|
|
1583 static __inline __m64
|
|
1584 _mm_alignr1_si64 (__m64 a, __m64 b)
|
|
1585 {
|
|
1586 return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
|
|
1587 }
|
|
1588
|
|
1589 static __inline __m64
|
|
1590 _mm_alignr2_si64 (__m64 a, __m64 b)
|
|
1591 {
|
|
1592 return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
|
|
1593 }
|
|
1594
|
|
1595 static __inline __m64
|
|
1596 _mm_alignr3_si64 (__m64 a, __m64 b)
|
|
1597 {
|
|
1598 return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
|
|
1599 }
|
|
1600
|
|
/* Emit a TANDCB instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_tandcb (void)
{
  __asm __volatile ("tandcb r15" : : : "cc");
}
|
|
1606
|
|
/* Emit a TANDCH instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_tandch (void)
{
  __asm __volatile ("tandch r15" : : : "cc");
}
|
|
1612
|
|
/* Emit a TANDCW instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_tandcw (void)
{
  __asm __volatile ("tandcw r15" : : : "cc");
}
|
|
1618
|
|
/* Emit a TEXTRCB instruction with r15 as its destination and N as an
   immediate operand ("i" constraint, so N must be a compile-time
   constant).  With r15 the instruction is understood to write the ARM
   condition flags (TODO confirm), hence the "cc" clobber.  */
#define _mm_textrcb(n) \
  ({\
   __asm__ __volatile__ (\
     "textrcb r15, %0" : : "i" (n) : "cc");\
   })
|
|
1624
|
|
/* Emit a TEXTRCH instruction with r15 as its destination and N as an
   immediate operand ("i" constraint, so N must be a compile-time
   constant).  With r15 the instruction is understood to write the ARM
   condition flags (TODO confirm), hence the "cc" clobber.  */
#define _mm_textrch(n) \
  ({\
   __asm__ __volatile__ (\
     "textrch r15, %0" : : "i" (n) : "cc");\
   })
|
|
1630
|
|
/* Emit a TEXTRCW instruction with r15 as its destination and N as an
   immediate operand ("i" constraint, so N must be a compile-time
   constant).  With r15 the instruction is understood to write the ARM
   condition flags (TODO confirm), hence the "cc" clobber.  */
#define _mm_textrcw(n) \
  ({\
   __asm__ __volatile__ (\
     "textrcw r15, %0" : : "i" (n) : "cc");\
   })
|
|
1636
|
|
/* Emit a TORCB instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_torcb (void)
{
  __asm __volatile ("torcb r15" : : : "cc");
}
|
|
1642
|
|
/* Emit a TORCH instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_torch (void)
{
  __asm __volatile ("torch r15" : : : "cc");
}
|
|
1648
|
|
/* Emit a TORCW instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_torcw (void)
{
  __asm __volatile ("torcw r15" : : : "cc");
}
|
|
1654
|
|
1655 #ifdef __IWMMXT2__
|
|
/* Emit a TORVSCB instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt2 manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_torvscb (void)
{
  __asm __volatile ("torvscb r15" : : : "cc");
}
|
|
1661
|
|
/* Emit a TORVSCH instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt2 manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_torvsch (void)
{
  __asm __volatile ("torvsch r15" : : : "cc");
}
|
|
1667
|
|
/* Emit a TORVSCW instruction with r15 as its destination.  With r15 the
   instruction is understood to write the ARM condition flags (TODO
   confirm against the iWMMXt2 manual), so the asm declares a "cc"
   clobber.  Takes no arguments.  */
static __inline void
_mm_torvscw (void)
{
  __asm __volatile ("torvscw r15" : : : "cc");
}
|
|
1673 #endif /* __IWMMXT2__ */
|
|
1674
|
|
1675 static __inline __m64
|
|
1676 _mm_tbcst_pi8 (int value)
|
|
1677 {
|
|
1678 return (__m64) __builtin_arm_tbcstb ((signed char) value);
|
|
1679 }
|
|
1680
|
|
1681 static __inline __m64
|
|
1682 _mm_tbcst_pi16 (int value)
|
|
1683 {
|
|
1684 return (__m64) __builtin_arm_tbcsth ((short) value);
|
|
1685 }
|
|
1686
|
|
1687 static __inline __m64
|
|
1688 _mm_tbcst_pi32 (int value)
|
|
1689 {
|
|
1690 return (__m64) __builtin_arm_tbcstw (value);
|
|
1691 }
|
|
1692
|
|
1693 #define _m_empty _mm_empty
|
0
|
1694 #define _m_packsswb _mm_packs_pi16
|
|
1695 #define _m_packssdw _mm_packs_pi32
|
|
1696 #define _m_packuswb _mm_packs_pu16
|
|
1697 #define _m_packusdw _mm_packs_pu32
|
|
1698 #define _m_packssqd _mm_packs_pi64
|
|
1699 #define _m_packusqd _mm_packs_pu64
|
|
1700 #define _mm_packs_si64 _mm_packs_pi64
|
|
1701 #define _mm_packs_su64 _mm_packs_pu64
|
|
1702 #define _m_punpckhbw _mm_unpackhi_pi8
|
|
1703 #define _m_punpckhwd _mm_unpackhi_pi16
|
|
1704 #define _m_punpckhdq _mm_unpackhi_pi32
|
|
1705 #define _m_punpcklbw _mm_unpacklo_pi8
|
|
1706 #define _m_punpcklwd _mm_unpacklo_pi16
|
|
1707 #define _m_punpckldq _mm_unpacklo_pi32
|
|
1708 #define _m_punpckehsbw _mm_unpackeh_pi8
|
|
1709 #define _m_punpckehswd _mm_unpackeh_pi16
|
|
1710 #define _m_punpckehsdq _mm_unpackeh_pi32
|
|
1711 #define _m_punpckehubw _mm_unpackeh_pu8
|
|
1712 #define _m_punpckehuwd _mm_unpackeh_pu16
|
|
1713 #define _m_punpckehudq _mm_unpackeh_pu32
|
|
1714 #define _m_punpckelsbw _mm_unpackel_pi8
|
|
1715 #define _m_punpckelswd _mm_unpackel_pi16
|
|
1716 #define _m_punpckelsdq _mm_unpackel_pi32
|
|
1717 #define _m_punpckelubw _mm_unpackel_pu8
|
|
1718 #define _m_punpckeluwd _mm_unpackel_pu16
|
|
1719 #define _m_punpckeludq _mm_unpackel_pu32
|
|
1720 #define _m_paddb _mm_add_pi8
|
|
1721 #define _m_paddw _mm_add_pi16
|
|
1722 #define _m_paddd _mm_add_pi32
|
|
1723 #define _m_paddsb _mm_adds_pi8
|
|
1724 #define _m_paddsw _mm_adds_pi16
|
|
1725 #define _m_paddsd _mm_adds_pi32
|
|
1726 #define _m_paddusb _mm_adds_pu8
|
|
1727 #define _m_paddusw _mm_adds_pu16
|
|
1728 #define _m_paddusd _mm_adds_pu32
|
|
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
#define _m_psubsd _mm_subs_pi32
/* NOTE(review): _m_psubuw looks like a misspelling of _m_psubsd
   (compare _m_paddsd for _mm_adds_pi32); it is kept so existing users
   keep compiling.  */
#define _m_psubuw _mm_subs_pi32
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
#define _m_psubusd _mm_subs_pu32
|
|
1738 #define _m_pmaddwd _mm_madd_pi16
|
|
1739 #define _m_pmadduwd _mm_madd_pu16
|
|
1740 #define _m_pmulhw _mm_mulhi_pi16
|
|
1741 #define _m_pmulhuw _mm_mulhi_pu16
|
|
1742 #define _m_pmullw _mm_mullo_pi16
|
|
1743 #define _m_pmacsw _mm_mac_pi16
|
|
1744 #define _m_pmacuw _mm_mac_pu16
|
|
1745 #define _m_pmacszw _mm_macz_pi16
|
|
1746 #define _m_pmacuzw _mm_macz_pu16
|
|
1747 #define _m_paccb _mm_acc_pu8
|
|
1748 #define _m_paccw _mm_acc_pu16
|
|
1749 #define _m_paccd _mm_acc_pu32
|
|
1750 #define _m_pmia _mm_mia_si64
|
|
1751 #define _m_pmiaph _mm_miaph_si64
|
|
1752 #define _m_pmiabb _mm_miabb_si64
|
|
1753 #define _m_pmiabt _mm_miabt_si64
|
|
1754 #define _m_pmiatb _mm_miatb_si64
|
|
1755 #define _m_pmiatt _mm_miatt_si64
|
|
1756 #define _m_psllw _mm_sll_pi16
|
|
1757 #define _m_psllwi _mm_slli_pi16
|
|
1758 #define _m_pslld _mm_sll_pi32
|
|
1759 #define _m_pslldi _mm_slli_pi32
|
|
1760 #define _m_psllq _mm_sll_si64
|
|
1761 #define _m_psllqi _mm_slli_si64
|
|
1762 #define _m_psraw _mm_sra_pi16
|
|
1763 #define _m_psrawi _mm_srai_pi16
|
|
1764 #define _m_psrad _mm_sra_pi32
|
|
1765 #define _m_psradi _mm_srai_pi32
|
|
1766 #define _m_psraq _mm_sra_si64
|
|
1767 #define _m_psraqi _mm_srai_si64
|
|
1768 #define _m_psrlw _mm_srl_pi16
|
|
1769 #define _m_psrlwi _mm_srli_pi16
|
|
1770 #define _m_psrld _mm_srl_pi32
|
|
1771 #define _m_psrldi _mm_srli_pi32
|
|
1772 #define _m_psrlq _mm_srl_si64
|
|
1773 #define _m_psrlqi _mm_srli_si64
|
|
1774 #define _m_prorw _mm_ror_pi16
|
|
1775 #define _m_prorwi _mm_rori_pi16
|
|
1776 #define _m_prord _mm_ror_pi32
|
|
1777 #define _m_prordi _mm_rori_pi32
|
|
1778 #define _m_prorq _mm_ror_si64
|
|
1779 #define _m_prorqi _mm_rori_si64
|
|
1780 #define _m_pand _mm_and_si64
|
|
1781 #define _m_pandn _mm_andnot_si64
|
|
1782 #define _m_por _mm_or_si64
|
|
1783 #define _m_pxor _mm_xor_si64
|
|
1784 #define _m_pcmpeqb _mm_cmpeq_pi8
|
|
1785 #define _m_pcmpeqw _mm_cmpeq_pi16
|
|
1786 #define _m_pcmpeqd _mm_cmpeq_pi32
|
|
1787 #define _m_pcmpgtb _mm_cmpgt_pi8
|
|
1788 #define _m_pcmpgtub _mm_cmpgt_pu8
|
|
1789 #define _m_pcmpgtw _mm_cmpgt_pi16
|
|
1790 #define _m_pcmpgtuw _mm_cmpgt_pu16
|
|
1791 #define _m_pcmpgtd _mm_cmpgt_pi32
|
|
1792 #define _m_pcmpgtud _mm_cmpgt_pu32
|
|
1793 #define _m_pextrb _mm_extract_pi8
|
|
1794 #define _m_pextrw _mm_extract_pi16
|
|
1795 #define _m_pextrd _mm_extract_pi32
|
|
1796 #define _m_pextrub _mm_extract_pu8
|
|
1797 #define _m_pextruw _mm_extract_pu16
|
|
1798 #define _m_pextrud _mm_extract_pu32
|
|
1799 #define _m_pinsrb _mm_insert_pi8
|
|
1800 #define _m_pinsrw _mm_insert_pi16
|
|
1801 #define _m_pinsrd _mm_insert_pi32
|
|
1802 #define _m_pmaxsb _mm_max_pi8
|
|
1803 #define _m_pmaxsw _mm_max_pi16
|
|
1804 #define _m_pmaxsd _mm_max_pi32
|
|
1805 #define _m_pmaxub _mm_max_pu8
|
|
1806 #define _m_pmaxuw _mm_max_pu16
|
|
1807 #define _m_pmaxud _mm_max_pu32
|
|
1808 #define _m_pminsb _mm_min_pi8
|
|
1809 #define _m_pminsw _mm_min_pi16
|
|
1810 #define _m_pminsd _mm_min_pi32
|
|
1811 #define _m_pminub _mm_min_pu8
|
|
1812 #define _m_pminuw _mm_min_pu16
|
|
1813 #define _m_pminud _mm_min_pu32
|
|
1814 #define _m_pmovmskb _mm_movemask_pi8
|
|
1815 #define _m_pmovmskw _mm_movemask_pi16
|
|
1816 #define _m_pmovmskd _mm_movemask_pi32
|
|
1817 #define _m_pshufw _mm_shuffle_pi16
|
|
1818 #define _m_pavgb _mm_avg_pu8
|
|
1819 #define _m_pavgw _mm_avg_pu16
|
|
1820 #define _m_pavg2b _mm_avg2_pu8
|
|
1821 #define _m_pavg2w _mm_avg2_pu16
|
|
1822 #define _m_psadbw _mm_sad_pu8
|
|
1823 #define _m_psadwd _mm_sad_pu16
|
|
1824 #define _m_psadzbw _mm_sadz_pu8
|
|
1825 #define _m_psadzwd _mm_sadz_pu16
|
|
1826 #define _m_paligniq _mm_align_si64
|
|
1827 #define _m_cvt_si2pi _mm_cvtsi64_m64
|
|
1828 #define _m_cvt_pi2si _mm_cvtm64_si64
|
111
|
1829 #define _m_from_int _mm_cvtsi32_si64
|
|
1830 #define _m_to_int _mm_cvtsi64_si32
|
|
1831
|
|
/* Close the extern "C" block opened near the top of the header.  No
   semicolon follows the brace: a linkage specification is not a
   declaration that takes one, and the stray `;' is an empty declaration
   that pedantic C++ modes warn about.  */
#if defined __cplusplus
} /* End "C" */
#endif /* __cplusplus */
|
0
|
1835
|
|
1836 #endif /* _MMINTRIN_H_INCLUDED */
|