152
|
1 /* { dg-do assemble } */
|
|
2 /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
|
|
3 /* { dg-options "-save-temps -O2" } */
|
|
4 /* { dg-add-options arm_v8_2a_bf16_neon } */
|
|
5
|
|
6 #include "arm_neon.h"
|
|
7
|
|
/* BF16 DOT without lane (vector-by-vector forms).  */
|
|
9 float32x2_t
|
|
10 test_vbfdot_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
|
|
11 {
|
|
12 /* vdot.bf16 d, d, d */
|
|
13 return vbfdot_f32 (r, a, b);
|
|
14 }
|
|
15
|
|
16 float32x4_t
|
|
17 test_vbfdotq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
|
18 {
|
|
19 /* vdot.bf16 q, q, q */
|
|
20 return vbfdotq_f32 (r, a, b);
|
|
21 }
|
|
22
|
|
/* BF16 DOT by lane, 64-bit (float32x2_t) accumulator and result.  */
|
|
24 float32x2_t
|
|
25 test_vbfdot_lane_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
|
|
26 {
|
|
27 /* vdot.bf16 d, d, d[0] */
|
|
28 return vbfdot_lane_f32 (r, a, b, 0);
|
|
29 }
|
|
30
|
|
31 float32x2_t
|
|
32 test_vbfdot_lane_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
|
|
33 {
|
|
34 /* vdot.bf16 d, d, d[1] */
|
|
35 return vbfdot_lane_f32 (r, a, b, 1);
|
|
36 }
|
|
37
|
|
38 float32x2_t
|
|
39 test_vbfdot_laneq_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
|
|
40 {
|
|
41 /* vdot.bf16 d, d, d[0] */
|
|
42 return vbfdot_laneq_f32 (r, a, b, 0);
|
|
43 }
|
|
44
|
|
45 float32x2_t
|
|
46 test_vbfdot_laneq_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
|
|
47 {
|
|
48 /* vdot.bf16 d, d, d[1] */
|
|
49 return vbfdot_laneq_f32 (r, a, b, 1);
|
|
50 }
|
|
51
|
|
52 float32x2_t
|
|
53 test_vbfdot_laneq_f32_2 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
|
|
54 {
|
|
55 /* vdot.bf16 d, d, d[0] */
|
|
56 return vbfdot_laneq_f32 (r, a, b, 2);
|
|
57 }
|
|
58
|
|
59 float32x2_t
|
|
60 test_vbfdot_laneq_f32_3 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
|
|
61 {
|
|
62 /* vdot.bf16 d, d, d[1] */
|
|
63 return vbfdot_laneq_f32 (r, a, b, 3);
|
|
64 }
|
|
65
|
|
/* BF16 DOT by lane, 128-bit (float32x4_t) accumulator and result.  */
|
|
67 float32x4_t
|
|
68 test_vbfdotq_lane_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
|
|
69 {
|
|
70 /* vdot.bf16 q, q, d[0] */
|
|
71 return vbfdotq_lane_f32 (r, a, b, 0);
|
|
72 }
|
|
73
|
|
74 float32x4_t
|
|
75 test_vbfdotq_lane_f32_1 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
|
|
76 {
|
|
77 /* vdot.bf16 q, q, d[1] */
|
|
78 return vbfdotq_lane_f32 (r, a, b, 1);
|
|
79 }
|
|
80
|
|
81 float32x4_t
|
|
82 test_vbfdotq_laneq_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
|
83 {
|
|
84 /* vdot.bf16 q, q, d[0] */
|
|
85 return vbfdotq_laneq_f32 (r, a, b, 0);
|
|
86 }
|
|
87
|
|
88 float32x4_t
|
|
89 test_vbfdotq_laneq_f32_3 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
|
90 {
|
|
91 /* vdot.bf16 q, q, d[1] */
|
|
92 return vbfdotq_laneq_f32 (r, a, b, 3);
|
|
93 }
|
|
94
|
|
95 /* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 1 } } */
|
|
96 /* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 1 } } */
|
|
97 /* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]\n} 3 } } */
|
|
98 /* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[1\]\n} 3 } } */
|
|
99 /* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]\n} 2 } } */
|
|
100 /* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]\n} 2 } } */
|