/* { dg-require-effective-target arm_v8_1m_mve_ok } */
/* { dg-add-options arm_v8_1m_mve } */
/* { dg-additional-options "-O2" } */

#include "arm_mve.h"
6 void
|
|
7 foo (uint16_t row_len, const int32_t *bias, int8_t *out)
|
|
8 {
|
|
9 int i_out_ch;
|
|
10 for (;;)
|
|
11 {
|
|
12 int8_t *ip_c3;
|
|
13 int32_t acc_3;
|
|
14 int32_t row_loop_cnt = row_len;
|
|
15 int32x4_t res = {acc_3};
|
|
16 uint32x4_t scatter_offset;
|
|
17 int i_row_loop;
|
|
18 for (; i_row_loop < row_loop_cnt; i_row_loop++)
|
|
19 {
|
|
20 mve_pred16_t p;
|
|
21 int16x8_t r0;
|
|
22 int16x8_t c3 = vldrbq_z_s16(ip_c3, p);
|
|
23 acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p);
|
|
24 }
|
|
25 vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
|
|
26 }
|
|
27 }
|
|
28
|
|
29 void
|
|
30 foo1 (uint16_t row_len, const int32_t *bias, int8_t *out)
|
|
31 {
|
|
32 int i_out_ch;
|
|
33 for (;;)
|
|
34 {
|
|
35 int8_t *ip_c3;
|
|
36 int32_t acc_3;
|
|
37 int32_t row_loop_cnt = row_len;
|
|
38 int i_row_loop;
|
|
39 int32x4_t res = {acc_3};
|
|
40 uint32x4_t scatter_offset;
|
|
41 for (; i_row_loop < row_loop_cnt; i_row_loop++)
|
|
42 {
|
|
43 mve_pred16_t p;
|
|
44 int32x4_t r0;
|
|
45 int32x4_t c3 = vldrbq_z_s32(ip_c3, p);
|
|
46 acc_3 = vmladavaq_p_s32(acc_3, r0, c3, p);
|
|
47 }
|
|
48 vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
|
|
49 }
|
|
50 }
|
|
51
|
|
52 void
|
|
53 foo2 (uint16_t row_len, const int32_t *bias, int8_t *out)
|
|
54 {
|
|
55 int i_out_ch;
|
|
56 for (;;)
|
|
57 {
|
|
58 int16_t *ip_c3;
|
|
59 int32_t acc_3;
|
|
60 int32_t row_loop_cnt = row_len;
|
|
61 int i_row_loop;
|
|
62 int32x4_t res = {acc_3};
|
|
63 uint32x4_t scatter_offset;
|
|
64 for (; i_row_loop < row_loop_cnt; i_row_loop++)
|
|
65 {
|
|
66 mve_pred16_t p;
|
|
67 int32x4_t r0;
|
|
68 int32x4_t c3 = vldrhq_z_s32(ip_c3, p);
|
|
69 acc_3 = vmladavaq_p_s32(acc_3, r0, c3, p);
|
|
70 }
|
|
71 vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
|
|
72 }
|
|
73 }
|