module &module:1:0:$full:$large:$default; /* Test for different cases of packed instruction controls. */ /* { dg-do compile } */ /* { dg-options "-fdump-tree-gimple -fdump-tree-original" } */ prog kernel &Kernel(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr) { ld_kernarg_u64 $d0, [%input_ptr]; ld_global_b128 $q0, [$d0]; add_pp_u8x16 $q1, $q0, u8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); /* Broadcast the 15 as it's the lowest element (pos 0) in the resulting vector. */ add_ps_u8x16 $q2, $q1, u8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); /* Broadcast the lowest element of q1. */ add_sp_u8x16 $q3, $q1, $q2; /* Perform a scalar computation with the lowest element of both inputs and store it to the lowest element of dest. */ add_ss_u8x16 $q4, $q2, $q3; /* Saturating arithmetics variations. */ add_pp_sat_u8x16 $q5, $q4, u8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); /* Broadcast the 15 as it's the lowest element (pos 0) in the resulting vector. */ add_ps_sat_u8x16 $q6, $q5, u8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); /* Broadcast the lowest element of q1. */ add_sp_sat_u8x16 $q7, $q6, $q5; /* Perform a scalar computation with the lowest element of both inputs and store it to the lowest element of dest. */ add_ss_sat_u8x16 $q8, $q7, $q6; /* Single operand vector computation. */ neg_p_s16x8 $q9, $q8; ld_kernarg_u64 $d0, [%output_ptr]; st_global_b128 $q8, [$d0]; ret; }; /* The b128 load is done using uint128_t*. */ /* { dg-final { scan-tree-dump "q0 = VIEW_CONVERT_EXPR\\\(mem_read.\[0-9\]+\\\);" "original"} } */ /* Before arithmetics, the uint128_t is casted to a vector datatype. */ /* { dg-final { scan-tree-dump "\\\(q0\\\) \\\+ \\\{" "original"} } */ /* The u8x16 constant is generated to an array with elements in reverse order */ /* in comparison to the HSAIL syntax. */ /* { dg-final { scan-tree-dump "\\\+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }" "original"} } */ /* Broadcasted the constant vector's lowest element and summed it up in the next line. */ /* { dg-final { scan-tree-dump "= { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 };\[\n \]+\[a-z0-9_\]+ = \[a-z0-9_\]+ \\\+ \[a-z0-9_\]+;" "gimple"} } */ /* Broadcasted the registers lowest element via a VEC_PERM_EXPR that has an all-zeros mask. */ /* { dg-final { scan-tree-dump "VEC_PERM_EXPR <\[a-z0-9_\]+, \[a-z0-9_\]+, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }>;" "gimple" } } */ /* For the add_ss we assume performing the computation over the whole vector is cheaper than */ /* extracting the scalar and performing a scalar operation. This aims to stay in the vector /* datapath as long as possible. */ /* { dg-final { scan-tree-dump "_\[0-9\]+ = q2 \\\+ q3;" "gimple" } } */ /* Insert the lowest element of the result to the lowest element of the result register. */ /* { dg-final { scan-tree-dump "= VEC_PERM_EXPR <\[a-z0-9_\]+, new_output.\[0-9\]+_\[0-9\]+, { 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }>;" "gimple" } } */ /* FIXME */ /* { dg-final { scan-tree-dump "q4 = \(VIEW_CONVERT_EXPR\\\()?s_output.\[0-9\]+\(_\[0-9\]+\)*\\\)?;" "gimple" } } */ /* The saturating arithmetics are (curently) implemented using scalar builtin calls. */ /* { dg-final { scan-tree-dump-times "= __builtin___hsail_sat_add_u8" 64 "gimple" } } */ /* A single operand vector instr (neg.) */ /* { dg-final { scan-tree-dump "= VIEW_CONVERT_EXPR\\\(\(s_output.\[0-9\]+_\[0-9\]+|q8\)\\\);\[\n \]+q9 = -_\[0-9\]+;\[\n \]+" "gimple" } } */