comparison: gcc/tree-vect-slp.c @ 16:04ced10e8804 (gcc 7)

author:   kono
date:     Fri, 27 Oct 2017 22:46:09 +0900
parents:  f6334be47118
children: 84e7813d76e9
compared: 15:561a7518be6b -> 16:04ced10e8804
 /* SLP - Basic Block Vectorization
-   Copyright (C) 2007, 2008, 2009, 2010
-   Free Software Foundation, Inc.
+   Copyright (C) 2007-2017 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
 
 This file is part of GCC.
 
[...]
 <http://www.gnu.org/licenses/>.  */
 
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
-#include "tm.h"
-#include "ggc.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
 #include "tree.h"
-#include "target.h"
-#include "basic-block.h"
-#include "tree-pretty-print.h"
-#include "gimple-pretty-print.h"
-#include "tree-flow.h"
-#include "tree-dump.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "ssa.h"
+#include "optabs-tree.h"
+#include "insn-config.h"
+#include "recog.h"  /* FIXME: for insn_data */
+#include "params.h"
+#include "fold-const.h"
+#include "stor-layout.h"
+#include "gimple-iterator.h"
 #include "cfgloop.h"
-#include "cfglayout.h"
-#include "expr.h"
-#include "recog.h"
-#include "optabs.h"
 #include "tree-vectorizer.h"
+#include "langhooks.h"
+#include "gimple-walk.h"
+#include "dbgcnt.h"
 
-/* Extract the location of the basic block in the source code.
-   Return the basic block location if succeed and NULL if not.  */
-
-LOC
-find_bb_location (basic_block bb)
-{
-  gimple stmt = NULL;
-  gimple_stmt_iterator si;
-
-  if (!bb)
-    return UNKNOWN_LOC;
-
-  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
-    {
-      stmt = gsi_stmt (si);
-      if (gimple_location (stmt) != UNKNOWN_LOC)
-        return gimple_location (stmt);
-    }
-
-  return UNKNOWN_LOC;
-}
 
 
 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
 
 static void
 vect_free_slp_tree (slp_tree node)
 {
-  if (!node)
-    return;
-
-  if (SLP_TREE_LEFT (node))
-    vect_free_slp_tree (SLP_TREE_LEFT (node));
-
-  if (SLP_TREE_RIGHT (node))
-    vect_free_slp_tree (SLP_TREE_RIGHT (node));
-
-  VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
-
-  if (SLP_TREE_VEC_STMTS (node))
-    VEC_free (gimple, heap, SLP_TREE_VEC_STMTS (node));
+  int i;
+  slp_tree child;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    vect_free_slp_tree (child);
+
+  gimple *stmt;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
+    /* After transform some stmts are removed and thus their vinfo is gone.  */
+    if (vinfo_for_stmt (stmt))
+      {
+        gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
+        STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
+      }
+
+  SLP_TREE_CHILDREN (node).release ();
+  SLP_TREE_SCALAR_STMTS (node).release ();
+  SLP_TREE_VEC_STMTS (node).release ();
+  SLP_TREE_LOAD_PERMUTATION (node).release ();
 
   free (node);
 }
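The rewritten vect_free_slp_tree above walks an n-ary child vector instead of the old fixed LEFT/RIGHT pair, and it decrements a per-statement SLP use count as each owning node dies. A minimal standalone C++ sketch of the same discipline follows; the toy_* names are illustrative stand-ins, not GCC internals:

    #include <cassert>
    #include <vector>

    // Rough model of the new vect_free_slp_tree: children live in a vector
    // and are freed post-order; each scalar stmt carries a use count
    // (STMT_VINFO_NUM_SLP_USES in GCC) decremented on node destruction.
    struct toy_stmt { int num_slp_uses = 0; };

    struct toy_slp_node
    {
      std::vector<toy_slp_node *> children;
      std::vector<toy_stmt *> scalar_stmts;
    };

    static void
    toy_free_slp_tree (toy_slp_node *node)
    {
      for (toy_slp_node *child : node->children)
        toy_free_slp_tree (child);          // free subtrees first
      for (toy_stmt *s : node->scalar_stmts)
        {
          assert (s->num_slp_uses > 0);
          s->num_slp_uses--;                // this node no longer uses the stmt
        }
      delete node;
    }

    int main ()
    {
      toy_stmt a;
      toy_slp_node *root = new toy_slp_node;
      root->scalar_stmts.push_back (&a);
      a.num_slp_uses = 1;                   // set when the node is created
      toy_free_slp_tree (root);
      assert (a.num_slp_uses == 0);
    }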
 
[...]
 
 void
 vect_free_slp_instance (slp_instance instance)
 {
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
-  VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance));
-  VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
+  SLP_INSTANCE_LOADS (instance).release ();
+  free (instance);
 }
 
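Note the ownership split in the new version: SLP_INSTANCE_LOADS only references nodes that the tree already owns, so the vector itself is released while deep destruction happens exactly once, through the tree root. A standalone sketch of that split, under the same toy assumptions as above:

    #include <vector>

    struct toy_node { std::vector<toy_node *> children; };

    static void
    toy_free_tree (toy_node *n)
    {
      for (toy_node *c : n->children)
        toy_free_tree (c);
      delete n;
    }

    struct toy_instance
    {
      toy_node *tree;                   // owning root
      std::vector<toy_node *> loads;    // non-owning references into the tree
    };

    static void
    toy_free_instance (toy_instance *inst)
    {
      toy_free_tree (inst->tree);       // deep free once, via the root
      inst->loads.clear ();             // drop only the references
      delete inst;
    }

    int main ()
    {
      toy_instance *inst = new toy_instance;
      inst->tree = new toy_node;
      inst->loads.push_back (inst->tree);
      toy_free_instance (inst);
    }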
 
-/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that
-   they are of a legal type and that they match the defs of the first stmt of
-   the SLP group (stored in FIRST_STMT_...).  */
-
-static bool
-vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                             slp_tree slp_node, gimple stmt,
-                             VEC (gimple, heap) **def_stmts0,
-                             VEC (gimple, heap) **def_stmts1,
-                             enum vect_def_type *first_stmt_dt0,
-                             enum vect_def_type *first_stmt_dt1,
-                             tree *first_stmt_def0_type,
-                             tree *first_stmt_def1_type,
-                             tree *first_stmt_const_oprnd,
-                             int ncopies_for_cost,
-                             bool *pattern0, bool *pattern1)
+/* Create an SLP node for SCALAR_STMTS.  */
+
+static slp_tree
+vect_create_new_slp_node (vec<gimple *> scalar_stmts)
+{
+  slp_tree node;
+  gimple *stmt = scalar_stmts[0];
+  unsigned int nops;
+
+  if (is_gimple_call (stmt))
+    nops = gimple_call_num_args (stmt);
+  else if (is_gimple_assign (stmt))
+    {
+      nops = gimple_num_ops (stmt) - 1;
+      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
+        nops++;
+    }
+  else if (gimple_code (stmt) == GIMPLE_PHI)
+    nops = 0;
+  else
+    return NULL;
+
+  node = XNEW (struct _slp_tree);
+  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
+  SLP_TREE_VEC_STMTS (node).create (0);
+  SLP_TREE_CHILDREN (node).create (nops);
+  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
+  SLP_TREE_TWO_OPERATORS (node) = false;
+  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
+
+  unsigned i;
+  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
+    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;
+
+  return node;
+}
+
+
+/* This structure is used in creation of an SLP tree.  Each instance
+   corresponds to the same operand in a group of scalar stmts in an SLP
+   node.  */
+typedef struct _slp_oprnd_info
+{
+  /* Def-stmts for the operands.  */
+  vec<gimple *> def_stmts;
+  /* Information about the first statement, its vector def-type, type, the
+     operand itself in case it's constant, and an indication if it's a pattern
+     stmt.  */
+  tree first_op_type;
+  enum vect_def_type first_dt;
+  bool first_pattern;
+  bool second_pattern;
+} *slp_oprnd_info;
+
+
+/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
+   operand.  */
+static vec<slp_oprnd_info>
+vect_create_oprnd_info (int nops, int group_size)
+{
+  int i;
+  slp_oprnd_info oprnd_info;
+  vec<slp_oprnd_info> oprnds_info;
+
+  oprnds_info.create (nops);
+  for (i = 0; i < nops; i++)
+    {
+      oprnd_info = XNEW (struct _slp_oprnd_info);
+      oprnd_info->def_stmts.create (group_size);
+      oprnd_info->first_dt = vect_uninitialized_def;
+      oprnd_info->first_op_type = NULL_TREE;
+      oprnd_info->first_pattern = false;
+      oprnd_info->second_pattern = false;
+      oprnds_info.quick_push (oprnd_info);
+    }
+
+  return oprnds_info;
+}
+
+
+/* Free operands info.  */
+
+static void
+vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
+{
+  int i;
+  slp_oprnd_info oprnd_info;
+
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    {
+      oprnd_info->def_stmts.release ();
+      XDELETE (oprnd_info);
+    }
+
+  oprnds_info.release ();
+}
+
+
+/* Find the place of the data-ref in STMT in the interleaving chain that starts
+   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
+
+static int
+vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
+{
+  gimple *next_stmt = first_stmt;
+  int result = 0;
+
+  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
+    return -1;
+
+  do
+    {
+      if (next_stmt == stmt)
+        return result;
+      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
+      if (next_stmt)
+        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
+    }
+  while (next_stmt);
+
+  return -1;
+}
 
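To make the position computation above concrete: the place of a statement in an interleaving chain is the sum of the gaps skipped while walking from the chain head. A standalone C++ sketch with toy types (not GCC's GROUP_* accessors):

    #include <cassert>

    struct toy_elem
    {
      toy_elem *next;   // stands in for GROUP_NEXT_ELEMENT
      int gap;          // stands in for GROUP_GAP
    };

    static int
    toy_place_in_chain (toy_elem *stmt, toy_elem *first)
    {
      toy_elem *next = first;
      int result = 0;
      do
        {
          if (next == stmt)
            return result;
          next = next->next;
          if (next)
            result += next->gap;    // account for the hole before NEXT
        }
      while (next);
      return -1;                    // STMT is not part of the chain
    }

    int main ()
    {
      // Loads of a[0], a[1], a[3]: the element after the hole has gap 2,
      // so its place in the chain is 3, not 2.
      toy_elem c = { nullptr, 2 };
      toy_elem b = { &c, 1 };
      toy_elem a = { &b, 0 };
      assert (toy_place_in_chain (&a, &a) == 0);
      assert (toy_place_in_chain (&b, &a) == 1);
      assert (toy_place_in_chain (&c, &a) == 3);
    }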
+/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
+   they are of a valid type and that they match the defs of the first stmt of
+   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
+   by swapping operands of STMT when possible.  Non-zero *SWAP indicates swap
+   is required for cond_expr stmts.  Specifically, *SWAP is 1 if STMT is cond
+   and operands of comparison need to be swapped; *SWAP is 2 if STMT is cond
+   and code of comparison needs to be inverted.  If there is any operand swap
+   in this function, *SWAP is set to non-zero value.
+   If there was a fatal error return -1; if the error could be corrected by
+   swapping operands of father node of this one, return 1; if everything is
+   ok return 0.  */
+
+static int
+vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
+                             gimple *stmt, unsigned stmt_num,
+                             vec<slp_oprnd_info> *oprnds_info)
 {
   tree oprnd;
   unsigned int i, number_of_oprnds;
-  tree def;
-  gimple def_stmt;
-  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
-  stmt_vec_info stmt_info =
-    vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0));
-  enum gimple_rhs_class rhs_class;
-  struct loop *loop = NULL;
-
-  if (loop_vinfo)
-    loop = LOOP_VINFO_LOOP (loop_vinfo);
-
-  rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
-  number_of_oprnds = gimple_num_ops (stmt) - 1;  /* RHS only */
+  gimple *def_stmt;
+  enum vect_def_type dt = vect_uninitialized_def;
+  bool pattern = false;
+  slp_oprnd_info oprnd_info;
+  int first_op_idx = 1;
+  bool commutative = false;
+  bool first_op_cond = false;
+  bool first = stmt_num == 0;
+  bool second = stmt_num == 1;
+
+  if (is_gimple_call (stmt))
+    {
+      number_of_oprnds = gimple_call_num_args (stmt);
+      first_op_idx = 3;
+    }
+  else if (is_gimple_assign (stmt))
+    {
+      enum tree_code code = gimple_assign_rhs_code (stmt);
+      number_of_oprnds = gimple_num_ops (stmt) - 1;
+      /* Swap can only be done for cond_expr if asked to, otherwise we
+         could result in different comparison code to the first stmt.  */
+      if (gimple_assign_rhs_code (stmt) == COND_EXPR
+          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
+        {
+          first_op_cond = true;
+          number_of_oprnds++;
+        }
+      else
+        commutative = commutative_tree_code (code);
+    }
+  else
+    return -1;
+
+  bool swapped = (*swap != 0);
+  gcc_assert (!swapped || first_op_cond);
 
   for (i = 0; i < number_of_oprnds; i++)
     {
-      oprnd = gimple_op (stmt, i + 1);
-
+again:
+      if (first_op_cond)
+        {
+          /* Map indicating how operands of cond_expr should be swapped.  */
+          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
+          int *map = maps[*swap];
+
+          if (i < 2)
+            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
+          else
+            oprnd = gimple_op (stmt, map[i]);
+        }
+      else
+        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
+
+      oprnd_info = (*oprnds_info)[i];
+
-      if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def,
-                               &dt[i])
-          || (!def_stmt && dt[i] != vect_constant_def))
-        {
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump, "Build SLP failed: can't find def for ");
-              print_generic_expr (vect_dump, oprnd, TDF_SLIM);
-            }
-
-          return false;
-        }
+      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
+        {
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: can't analyze def for ");
+              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
+              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+            }
+
+          return -1;
+        }
 
       /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
          from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
-      if (loop && def_stmt && gimple_bb (def_stmt)
-          && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
+      if (def_stmt && gimple_bb (def_stmt)
+          && vect_stmt_in_region_p (vinfo, def_stmt)
           && vinfo_for_stmt (def_stmt)
-          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt)))
-        {
-          if (!*first_stmt_dt0)
-            *pattern0 = true;
-          else
-            {
-              if (i == 1 && !*first_stmt_dt1)
-                *pattern1 = true;
-              else if ((i == 0 && !*pattern0) || (i == 1 && !*pattern1))
-                {
-                  if (vect_print_dump_info (REPORT_DETAILS))
-                    {
-                      fprintf (vect_dump, "Build SLP failed: some of the stmts"
-                               " are in a pattern, and others are not ");
-                      print_generic_expr (vect_dump, oprnd, TDF_SLIM);
-                    }
-
-                  return false;
-                }
-            }
+          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
+          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
+          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
+        {
+          pattern = true;
+          if (!first && !oprnd_info->first_pattern
+              /* Allow different pattern state for the defs of the
+                 first stmt in reduction chains.  */
+              && (oprnd_info->first_dt != vect_reduction_def
+                  || (!second && !oprnd_info->second_pattern)))
+            {
+              if (i == 0
+                  && !swapped
+                  && commutative)
+                {
+                  swapped = true;
+                  goto again;
+                }
+
+              if (dump_enabled_p ())
+                {
+                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "Build SLP failed: some of the stmts"
+                                   " are in a pattern, and others are not ");
+                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+                }
+
+              return 1;
+            }
 
           def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
-          dt[i] = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
-
-          if (*dt == vect_unknown_def_type)
-            {
-              if (vect_print_dump_info (REPORT_DETAILS))
-                fprintf (vect_dump, "Unsupported pattern.");
-              return false;
+          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
+
+          if (dt == vect_unknown_def_type)
+            {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "Unsupported pattern.\n");
+              return -1;
             }
 
           switch (gimple_code (def_stmt))
             {
             case GIMPLE_PHI:
-              def = gimple_phi_result (def_stmt);
-              break;
-
             case GIMPLE_ASSIGN:
-              def = gimple_assign_lhs (def_stmt);
               break;
 
             default:
-              if (vect_print_dump_info (REPORT_DETAILS))
-                fprintf (vect_dump, "unsupported defining stmt: ");
-              return false;
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "unsupported defining stmt:\n");
+              return -1;
             }
         }
 
-      if (!*first_stmt_dt0)
-        {
-          /* op0 of the first stmt of the group - store its info.  */
-          *first_stmt_dt0 = dt[i];
-          if (def)
-            *first_stmt_def0_type = TREE_TYPE (def);
-          else
-            *first_stmt_const_oprnd = oprnd;
-
-          /* Analyze costs (for the first stmt of the group only).  */
-          if (rhs_class != GIMPLE_SINGLE_RHS)
-            /* Not memory operation (we don't call this functions for loads).  */
-            vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node);
-          else
-            /* Store.  */
-            vect_model_store_cost (stmt_info, ncopies_for_cost, dt[0], slp_node);
-        }
-
+      if (second)
+        oprnd_info->second_pattern = pattern;
+
+      if (first)
+        {
+          oprnd_info->first_dt = dt;
+          oprnd_info->first_pattern = pattern;
+          oprnd_info->first_op_type = TREE_TYPE (oprnd);
+        }
       else
         {
-          if (!*first_stmt_dt1 && i == 1)
-            {
-              /* op1 of the first stmt of the group - store its info.  */
-              *first_stmt_dt1 = dt[i];
-              if (def)
-                *first_stmt_def1_type = TREE_TYPE (def);
-              else
-                {
-                  /* We assume that the stmt contains only one constant
-                     operand. We fail otherwise, to be on the safe side.  */
-                  if (*first_stmt_const_oprnd)
-                    {
-                      if (vect_print_dump_info (REPORT_SLP))
-                        fprintf (vect_dump, "Build SLP failed: two constant "
-                                 "oprnds in stmt");
-                      return false;
-                    }
-                  *first_stmt_const_oprnd = oprnd;
-                }
-            }
-          else
-            {
-              /* Not first stmt of the group, check that the def-stmt/s match
-                 the def-stmt/s of the first stmt.  */
-              if ((i == 0
-                   && (*first_stmt_dt0 != dt[i]
-                       || (*first_stmt_def0_type && def
-                           && !types_compatible_p (*first_stmt_def0_type,
-                                                   TREE_TYPE (def)))))
-                  || (i == 1
-                      && (*first_stmt_dt1 != dt[i]
-                          || (*first_stmt_def1_type && def
-                              && !types_compatible_p (*first_stmt_def1_type,
-                                                      TREE_TYPE (def)))))
-                  || (!def
-                      && !types_compatible_p (TREE_TYPE (*first_stmt_const_oprnd),
-                                              TREE_TYPE (oprnd))))
-                {
-                  if (vect_print_dump_info (REPORT_SLP))
-                    fprintf (vect_dump, "Build SLP failed: different types ");
-
-                  return false;
-                }
-            }
+          /* Not first stmt of the group, check that the def-stmt/s match
+             the def-stmt/s of the first stmt.  Allow different definition
+             types for reduction chains: the first stmt must be a
+             vect_reduction_def (a phi node), and the rest
+             vect_internal_def.  */
+          if (((oprnd_info->first_dt != dt
+                && !(oprnd_info->first_dt == vect_reduction_def
+                     && dt == vect_internal_def)
+                && !((oprnd_info->first_dt == vect_external_def
+                      || oprnd_info->first_dt == vect_constant_def)
+                     && (dt == vect_external_def
+                         || dt == vect_constant_def)))
+               || !types_compatible_p (oprnd_info->first_op_type,
+                                       TREE_TYPE (oprnd))))
+            {
+              /* Try swapping operands if we got a mismatch.  */
+              if (i == 0
+                  && !swapped
+                  && commutative)
+                {
+                  swapped = true;
+                  goto again;
+                }
+
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "Build SLP failed: different types\n");
+
+              return 1;
+            }
         }
 
       /* Check the types of the definitions.  */
-      switch (dt[i])
+      switch (dt)
         {
         case vect_constant_def:
         case vect_external_def:
           break;
 
+        case vect_reduction_def:
+        case vect_induction_def:
         case vect_internal_def:
-        case vect_reduction_def:
-          if (i == 0)
-            VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
-          else
-            VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
+          oprnd_info->def_stmts.quick_push (def_stmt);
           break;
 
         default:
           /* FORNOW: Not supported.  */
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump, "Build SLP failed: illegal type of def ");
-              print_generic_expr (vect_dump, def, TDF_SLIM);
-            }
-
-          return false;
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: illegal type of def ");
+              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
+              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+            }
+
+          return -1;
         }
     }
-
-  return true;
-}
+
+  /* Swap operands.  */
+  if (swapped)
+    {
+      /* If there are already uses of this stmt in a SLP instance then
+         we've committed to the operand order and can't swap it.  */
+      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
+        {
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: cannot swap operands of "
+                               "shared stmt ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+            }
+          return -1;
+        }
+
+      if (first_op_cond)
+        {
+          tree cond = gimple_assign_rhs1 (stmt);
+          enum tree_code code = TREE_CODE (cond);
+
+          /* Swap.  */
+          if (*swap == 1)
+            {
+              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
+                                 &TREE_OPERAND (cond, 1));
+              TREE_SET_CODE (cond, swap_tree_comparison (code));
+            }
+          /* Invert.  */
+          else
+            {
+              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
+                                 gimple_assign_rhs3_ptr (stmt));
+              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
+              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
+              gcc_assert (code != ERROR_MARK);
+              TREE_SET_CODE (cond, code);
+            }
+        }
+      else
+        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
+                           gimple_assign_rhs2_ptr (stmt));
+      if (dump_enabled_p ())
+        {
+          dump_printf_loc (MSG_NOTE, vect_location,
+                           "swapped operands to match def types in ");
+          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+        }
+    }
+
+  *swap = swapped;
+  return 0;
+}
 
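The two COND_EXPR rewrites handled above (*swap == 1 swaps the comparison operands, *swap == 2 inverts the comparison code and swaps the arms) are ordinary identities on integers; for floating point, inversion is only valid when NaNs need not be honored, which is why the code consults HONOR_NANS. A standalone C++ check of both identities on integers:

    #include <cassert>

    //   swap:   (a >= b ? x : y)  ==  (b <= a ? x : y)
    //   invert: (a >= b ? x : y)  ==  (a <  b ? y : x)
    static int cond_orig   (int a, int b, int x, int y) { return a >= b ? x : y; }
    static int cond_swap   (int a, int b, int x, int y) { return b <= a ? x : y; }
    static int cond_invert (int a, int b, int x, int y) { return a < b ? y : x; }

    int main ()
    {
      for (int a = -2; a <= 2; ++a)
        for (int b = -2; b <= 2; ++b)
          {
            assert (cond_orig (a, b, 7, 9) == cond_swap (a, b, 7, 9));
            assert (cond_orig (a, b, 7, 9) == cond_invert (a, b, 7, 9));
          }
    }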
+/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
+   caller's attempt to find the vector type in STMT with the narrowest
+   element type.  Return true if VECTYPE is nonnull and if it is valid
+   for VINFO.  When returning true, update MAX_NUNITS to reflect the
+   number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
+   as for vect_build_slp_tree.  */
+
+static bool
+vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
+                        tree vectype, unsigned int *max_nunits)
+{
+  if (!vectype)
+    {
+      if (dump_enabled_p ())
+        {
+          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                           "Build SLP failed: unsupported data-type in ");
+          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+        }
+      /* Fatal mismatch.  */
+      return false;
+    }
+
+  /* If populating the vector type requires unrolling then fail
+     before adjusting *max_nunits for basic-block vectorization.  */
+  if (is_a <bb_vec_info> (vinfo)
+      && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
+    {
+      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                       "Build SLP failed: unrolling required "
+                       "in basic block SLP\n");
+      /* Fatal mismatch.  */
+      return false;
+    }
+
+  /* In case of multiple types we need to detect the smallest type.  */
+  if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
+    *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
+
+  return true;
+}
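In numbers: a basic-block SLP group of 8 scalars can use a 4-lane vector (two vectors, no unrolling), but a group of 2 scalars cannot use a 4-lane vector, since that would need unrolling a straight-line region. A toy version of the two checks, with illustrative names only:

    #include <cassert>

    static bool
    toy_record_max_nunits (bool bb_vectorization, unsigned group_size,
                           unsigned type_vector_subparts, unsigned *max_nunits)
    {
      if (bb_vectorization && type_vector_subparts > group_size)
        return false;                        // would require unrolling
      if (*max_nunits < type_vector_subparts)
        *max_nunits = type_vector_subparts;  // track the smallest element type
      return true;
    }

    int main ()
    {
      unsigned max_nunits = 1;
      assert (toy_record_max_nunits (true, 8, 4, &max_nunits));   // fits
      assert (max_nunits == 4);
      assert (!toy_record_max_nunits (true, 2, 4, &max_nunits));  // needs unroll
    }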
 
-/* Recursively build an SLP tree starting from NODE.
-   Fail (and return FALSE) if def-stmts are not isomorphic, require data
-   permutation or are of unsupported types of operation.  Otherwise, return
-   TRUE.  */
+/* Verify if the scalar stmts STMTS are isomorphic, require data
+   permutation or are of unsupported types of operation.  Return
+   true if they are, otherwise return false and indicate in *MATCHES
+   which stmts are not isomorphic to the first one.  If MATCHES[0]
+   is false then this indicates the comparison could not be
+   carried out or the stmts will never be vectorized by SLP.
+
+   Note COND_EXPR is possibly isomorphic to another one after swapping its
+   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
+   the first stmt by swapping the two operands of comparison; set SWAP[i]
+   to 2 if stmt I is isomorphic to the first stmt by inverting the code
+   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
+   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
 
 static bool
-vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                     slp_tree *node, unsigned int group_size,
-                     int *inside_cost, int *outside_cost,
-                     int ncopies_for_cost, unsigned int *max_nunits,
-                     VEC (int, heap) **load_permutation,
-                     VEC (slp_tree, heap) **loads,
-                     unsigned int vectorization_factor)
-{
-  VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
-  VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size);
+vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
+                       vec<gimple *> stmts, unsigned int group_size,
+                       unsigned nops, unsigned int *max_nunits,
+                       bool *matches, bool *two_operators)
+{
   unsigned int i;
-  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (*node);
-  gimple stmt = VEC_index (gimple, stmts, 0);
-  enum vect_def_type first_stmt_dt0 = vect_uninitialized_def;
-  enum vect_def_type first_stmt_dt1 = vect_uninitialized_def;
-  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
-  tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE;
+  gimple *first_stmt = stmts[0], *stmt = stmts[0];
+  enum tree_code first_stmt_code = ERROR_MARK;
+  enum tree_code alt_stmt_code = ERROR_MARK;
+  enum tree_code rhs_code = ERROR_MARK;
+  enum tree_code first_cond_code = ERROR_MARK;
   tree lhs;
-  bool stop_recursion = false, need_same_oprnds = false;
-  tree vectype, scalar_type, first_op1 = NULL_TREE;
-  unsigned int ncopies;
+  bool need_same_oprnds = false;
+  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
   optab optab;
   int icode;
-  enum machine_mode optab_op2_mode;
-  enum machine_mode vec_mode;
-  tree first_stmt_const_oprnd = NULL_TREE;
-  struct data_reference *first_dr;
-  bool pattern0 = false, pattern1 = false;
+  machine_mode optab_op2_mode;
+  machine_mode vec_mode;
   HOST_WIDE_INT dummy;
-  bool permutation = false;
-  unsigned int load_place;
-  gimple first_load, prev_first_load = NULL;
+  gimple *first_load = NULL, *prev_first_load = NULL;
 
   /* For every stmt in NODE find its def stmt/s.  */
-  FOR_EACH_VEC_ELT (gimple, stmts, i, stmt)
-    {
-      if (vect_print_dump_info (REPORT_SLP))
-        {
-          fprintf (vect_dump, "Build SLP for ");
-          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+  FOR_EACH_VEC_ELT (stmts, i, stmt)
+    {
+      swap[i] = 0;
+      matches[i] = false;
+
+      if (dump_enabled_p ())
+        {
+          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
+          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
         }
 
       /* Fail to vectorize statements marked as unvectorizable.  */
       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
         {
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump,
-                       "Build SLP failed: unvectorizable statement ");
-              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-            }
-
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: unvectorizable statement ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+            }
+          /* Fatal mismatch.  */
+          matches[0] = false;
           return false;
         }
 
       lhs = gimple_get_lhs (stmt);
       if (lhs == NULL_TREE)
         {
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump,
-                       "Build SLP failed: not GIMPLE_ASSIGN nor GIMPLE_CALL");
-              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-            }
-
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: not GIMPLE_ASSIGN nor "
+                               "GIMPLE_CALL ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+            }
+          /* Fatal mismatch.  */
+          matches[0] = false;
           return false;
         }
 
       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
       vectype = get_vectype_for_scalar_type (scalar_type);
-      if (!vectype)
-        {
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
-              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
-            }
-          return false;
-        }
-
-      ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
-      if (ncopies != 1)
-        {
-          if (vect_print_dump_info (REPORT_SLP))
-            fprintf (vect_dump, "SLP with multiple types ");
-
-          /* FORNOW: multiple types are unsupported in BB SLP.  */
-          if (bb_vinfo)
-            return false;
-        }
-
-      /* In case of multiple types we need to detect the smallest type.  */
-      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
-        *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
-
-      if (is_gimple_call (stmt))
-        rhs_code = CALL_EXPR;
+      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
+                                   max_nunits))
+        {
+          /* Fatal mismatch.  */
+          matches[0] = false;
+          return false;
+        }
+
+      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
+        {
+          rhs_code = CALL_EXPR;
+          if (gimple_call_internal_p (call_stmt)
+              || gimple_call_tail_p (call_stmt)
+              || gimple_call_noreturn_p (call_stmt)
+              || !gimple_call_nothrow_p (call_stmt)
+              || gimple_call_chain (call_stmt))
+            {
+              if (dump_enabled_p ())
+                {
+                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "Build SLP failed: unsupported call type ");
+                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                    call_stmt, 0);
+                }
+              /* Fatal mismatch.  */
+              matches[0] = false;
+              return false;
+            }
+        }
       else
         rhs_code = gimple_assign_rhs_code (stmt);
 
       /* Check the operation.  */
       if (i == 0)
[...]
               optab = optab_for_tree_code (rhs_code, vectype,
                                            optab_scalar);
 
               if (!optab)
                 {
-                  if (vect_print_dump_info (REPORT_SLP))
-                    fprintf (vect_dump, "Build SLP failed: no optab.");
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                     "Build SLP failed: no optab.\n");
+                  /* Fatal mismatch.  */
+                  matches[0] = false;
                   return false;
                 }
               icode = (int) optab_handler (optab, vec_mode);
               if (icode == CODE_FOR_nothing)
                 {
-                  if (vect_print_dump_info (REPORT_SLP))
-                    fprintf (vect_dump, "Build SLP failed: "
-                             "op not supported by target.");
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                     "Build SLP failed: "
+                                     "op not supported by target.\n");
+                  /* Fatal mismatch.  */
+                  matches[0] = false;
                   return false;
                 }
               optab_op2_mode = insn_data[icode].operand[2].mode;
               if (!VECTOR_MODE_P (optab_op2_mode))
                 {
                   need_same_oprnds = true;
                   first_op1 = gimple_assign_rhs2 (stmt);
                 }
             }
         }
+      else if (rhs_code == WIDEN_LSHIFT_EXPR)
+        {
+          need_same_oprnds = true;
+          first_op1 = gimple_assign_rhs2 (stmt);
+        }
     }
   else
     {
+      if (first_stmt_code != rhs_code
+          && alt_stmt_code == ERROR_MARK)
+        alt_stmt_code = rhs_code;
       if (first_stmt_code != rhs_code
           && (first_stmt_code != IMAGPART_EXPR
               || rhs_code != REALPART_EXPR)
           && (first_stmt_code != REALPART_EXPR
               || rhs_code != IMAGPART_EXPR)
-          && !(STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt))
+          /* Handle mismatches in plus/minus by computing both
+             and merging the results.  */
+          && !((first_stmt_code == PLUS_EXPR
+                || first_stmt_code == MINUS_EXPR)
+               && (alt_stmt_code == PLUS_EXPR
+                   || alt_stmt_code == MINUS_EXPR)
+               && rhs_code == alt_stmt_code)
+          && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                && (first_stmt_code == ARRAY_REF
+                   || first_stmt_code == BIT_FIELD_REF
                    || first_stmt_code == INDIRECT_REF
                    || first_stmt_code == COMPONENT_REF
                    || first_stmt_code == MEM_REF)))
         {
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump,
-                       "Build SLP failed: different operation in stmt ");
-              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-            }
-
-          return false;
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: different operation "
+                               "in stmt ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "original stmt ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                first_stmt, 0);
+            }
+          /* Mismatch.  */
+          continue;
         }
 
       if (need_same_oprnds
           && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
         {
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump,
-                       "Build SLP failed: different shift arguments in ");
-              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-            }
-
-          return false;
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: different shift "
+                               "arguments in ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+            }
+          /* Mismatch.  */
+          continue;
         }
+
+      if (rhs_code == CALL_EXPR)
+        {
+          gimple *first_stmt = stmts[0];
+          if (gimple_call_num_args (stmt) != nops
+              || !operand_equal_p (gimple_call_fn (first_stmt),
+                                   gimple_call_fn (stmt), 0)
+              || gimple_call_fntype (first_stmt)
+                 != gimple_call_fntype (stmt))
+            {
+              if (dump_enabled_p ())
+                {
+                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "Build SLP failed: different calls in ");
+                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                    stmt, 0);
+                }
+              /* Mismatch.  */
+              continue;
+            }
+        }
     }
 
-  /* Strided store or load.  */
-  if (STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt)))
+  /* Grouped store or load.  */
+  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
     {
       if (REFERENCE_CLASS_P (lhs))
         {
           /* Store.  */
-          if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
-                                            stmt, &def_stmts0, &def_stmts1,
-                                            &first_stmt_dt0,
-                                            &first_stmt_dt1,
-                                            &first_stmt_def0_type,
-                                            &first_stmt_def1_type,
-                                            &first_stmt_const_oprnd,
-                                            ncopies_for_cost,
-                                            &pattern0, &pattern1))
-            return false;
+          ;
         }
       else
         {
           /* Load.  */
-          /* FORNOW: Check that there is no gap between the loads.  */
-          if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
-               && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-              || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
-                  && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
-            {
-              if (vect_print_dump_info (REPORT_SLP))
-                {
-                  fprintf (vect_dump, "Build SLP failed: strided "
-                           "loads have gaps ");
-                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-                }
-
-              return false;
-            }
-
-          /* Check that the size of interleaved loads group is not
-             greater than the SLP group size.  */
-          if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
-            {
-              if (vect_print_dump_info (REPORT_SLP))
-                {
-                  fprintf (vect_dump, "Build SLP failed: the number of "
-                           "interleaved loads is greater than"
-                           " the SLP group size ");
-                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-                }
-
-              return false;
-            }
-
-          first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+          first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
           if (prev_first_load)
             {
               /* Check that there are no loads from different interleaving
-                 chains in the same node.  The only exception is complex
-                 numbers.  */
-              if (prev_first_load != first_load
-                  && rhs_code != REALPART_EXPR
-                  && rhs_code != IMAGPART_EXPR)
-                {
-                  if (vect_print_dump_info (REPORT_SLP))
-                    {
-                      fprintf (vect_dump, "Build SLP failed: different "
-                               "interleaving chains in one node ");
-                      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-                    }
-
-                  return false;
+                 chains in the same node.  */
+              if (prev_first_load != first_load)
+                {
+                  if (dump_enabled_p ())
+                    {
+                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                                       vect_location,
+                                       "Build SLP failed: different "
+                                       "interleaving chains in one node ");
+                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                        stmt, 0);
+                    }
+                  /* Mismatch.  */
+                  continue;
                 }
             }
           else
             prev_first_load = first_load;
-
-          if (first_load == stmt)
-            {
-              first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-              if (vect_supportable_dr_alignment (first_dr, false)
-                  == dr_unaligned_unsupported)
-                {
-                  if (vect_print_dump_info (REPORT_SLP))
-                    {
-                      fprintf (vect_dump, "Build SLP failed: unsupported "
-                               "unaligned load ");
-                      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-                    }
-
-                  return false;
-                }
-
-              /* Analyze costs (for the first stmt in the group).  */
-              vect_model_load_cost (vinfo_for_stmt (stmt),
-                                    ncopies_for_cost, *node);
-            }
-
-          /* Store the place of this load in the interleaving chain.  In
-             case that permutation is needed we later decide if a specific
-             permutation is supported.  */
-          load_place = vect_get_place_in_interleaving_chain (stmt,
-                                                             first_load);
-          if (load_place != i)
-            permutation = true;
-
-          VEC_safe_push (int, heap, *load_permutation, load_place);
-
-          /* We stop the tree when we reach a group of loads.  */
-          stop_recursion = true;
-          continue;
         }
-    } /* Strided access.  */
+    } /* Grouped access.  */
   else
     {
       if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
         {
-          /* Not strided load.  */
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump, "Build SLP failed: not strided load ");
-              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-            }
-
-          /* FORNOW: Not strided loads are not supported.  */
+          /* Not grouped load.  */
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: not grouped load ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+            }
+
+          /* FORNOW: Not grouped loads are not supported.  */
+          /* Fatal mismatch.  */
+          matches[0] = false;
           return false;
         }
 
       /* Not memory operation.  */
       if (TREE_CODE_CLASS (rhs_code) != tcc_binary
-          && TREE_CODE_CLASS (rhs_code) != tcc_unary)
-        {
-          if (vect_print_dump_info (REPORT_SLP))
-            {
-              fprintf (vect_dump, "Build SLP failed: operation");
-              fprintf (vect_dump, " unsupported ");
-              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-            }
-
-          return false;
-        }
-
-      /* Find the def-stmts.  */
-      if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
-                                        &def_stmts0, &def_stmts1,
-                                        &first_stmt_dt0, &first_stmt_dt1,
-                                        &first_stmt_def0_type,
-                                        &first_stmt_def1_type,
-                                        &first_stmt_const_oprnd,
-                                        ncopies_for_cost,
-                                        &pattern0, &pattern1))
-        return false;
-    }
-}
-
-  /* Add the costs of the node to the overall instance costs.  */
-  *inside_cost += SLP_TREE_INSIDE_OF_LOOP_COST (*node);
-  *outside_cost += SLP_TREE_OUTSIDE_OF_LOOP_COST (*node);
-
-  /* Strided loads were reached - stop the recursion.  */
-  if (stop_recursion)
-    {
-      if (permutation)
-        {
-          VEC_safe_push (slp_tree, heap, *loads, *node);
-          *inside_cost
-            += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0)
-               * group_size;
-        }
+          && TREE_CODE_CLASS (rhs_code) != tcc_unary
+          && TREE_CODE_CLASS (rhs_code) != tcc_expression
+          && TREE_CODE_CLASS (rhs_code) != tcc_comparison
+          && rhs_code != CALL_EXPR)
+        {
+          if (dump_enabled_p ())
+            {
+              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                               "Build SLP failed: operation");
+              dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
+              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+            }
+          /* Fatal mismatch.  */
+          matches[0] = false;
+          return false;
+        }
+
+      if (rhs_code == COND_EXPR)
+        {
+          tree cond_expr = gimple_assign_rhs1 (stmt);
+          enum tree_code cond_code = TREE_CODE (cond_expr);
+          enum tree_code swap_code = ERROR_MARK;
+          enum tree_code invert_code = ERROR_MARK;
+
+          if (i == 0)
+            first_cond_code = TREE_CODE (cond_expr);
+          else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
+            {
+              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
+              swap_code = swap_tree_comparison (cond_code);
+              invert_code = invert_tree_comparison (cond_code, honor_nans);
+            }
+
+          if (first_cond_code == cond_code)
+            ;
+          /* Isomorphic can be achieved by swapping.  */
+          else if (first_cond_code == swap_code)
+            swap[i] = 1;
+          /* Isomorphic can be achieved by inverting.  */
+          else if (first_cond_code == invert_code)
+            swap[i] = 2;
+          else
+            {
+              if (dump_enabled_p ())
+                {
+                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "Build SLP failed: different"
+                                   " operation");
+                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                    stmt, 0);
+                }
+              /* Mismatch.  */
+              continue;
+            }
+        }
+    }
+
+      matches[i] = true;
+    }
+
+  for (i = 0; i < group_size; ++i)
+    if (!matches[i])
+      return false;
+
+  /* If we allowed a two-operation SLP node verify the target can cope
+     with the permute we are going to use.  */
+  if (alt_stmt_code != ERROR_MARK
+      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
+    {
+      unsigned int count = TYPE_VECTOR_SUBPARTS (vectype);
+      auto_vec_perm_indices sel (count);
+      for (i = 0; i < count; ++i)
+        {
+          unsigned int elt = i;
+          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
+            elt += count;
+          sel.quick_push (elt);
+        }
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
+        {
+          for (i = 0; i < group_size; ++i)
+            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
+              {
+                matches[i] = false;
+                if (dump_enabled_p ())
+                  {
+                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                     "Build SLP failed: different operation "
+                                     "in stmt ");
+                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                      stmts[i], 0);
+                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                     "original stmt ");
+                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                      first_stmt, 0);
+                  }
+              }
+          return false;
+        }
+      *two_operators = true;
+    }
+
+  return true;
+}
 
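The two-operator check above builds a permute selector that picks lane i from a second result vector (index i + count) whenever statement i used the alternate opcode. A standalone sketch of that selector construction, using plain vectors and the addsub pattern as the example:

    #include <cassert>
    #include <vector>

    // E.g. { a0-b0, a1+b1, a2-b2, a3+b3 }: compute the full MINUS and PLUS
    // vectors, then blend with selector { 0, 5, 2, 7 } (count == 4).
    int main ()
    {
      const unsigned count = 4, group_size = 4;
      bool is_alt[group_size] = { false, true, false, true }; // -, +, -, +
      std::vector<unsigned> sel;
      for (unsigned i = 0; i < count; ++i)
        sel.push_back (is_alt[i % group_size] ? i + count : i);
      assert ((sel == std::vector<unsigned>{ 0, 5, 2, 7 }));
    }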
+/* Traits for the hash_set to record failed SLP builds for a stmt set.
+   Note we never remove apart from at destruction time so we do not
+   need a special value for deleted that differs from empty.  */
+struct bst_traits
+{
+  typedef vec <gimple *> value_type;
+  typedef vec <gimple *> compare_type;
+  static inline hashval_t hash (value_type);
+  static inline bool equal (value_type existing, value_type candidate);
+  static inline bool is_empty (value_type x) { return !x.exists (); }
+  static inline bool is_deleted (value_type x) { return !x.exists (); }
+  static inline void mark_empty (value_type &x) { x.release (); }
+  static inline void mark_deleted (value_type &x) { x.release (); }
+  static inline void remove (value_type &x) { x.release (); }
+};
+inline hashval_t
+bst_traits::hash (value_type x)
+{
+  inchash::hash h;
+  for (unsigned i = 0; i < x.length (); ++i)
+    h.add_int (gimple_uid (x[i]));
+  return h.end ();
+}
+inline bool
+bst_traits::equal (value_type existing, value_type candidate)
+{
+  if (existing.length () != candidate.length ())
+    return false;
+  for (unsigned i = 0; i < existing.length (); ++i)
+    if (existing[i] != candidate[i])
+      return false;
+  return true;
+}
+
+static hash_set <vec <gimple *>, bst_traits> *bst_fail;
+
+static slp_tree
+vect_build_slp_tree_2 (vec_info *vinfo,
+                       vec<gimple *> stmts, unsigned int group_size,
+                       unsigned int *max_nunits,
+                       vec<slp_tree> *loads,
+                       bool *matches, unsigned *npermutes, unsigned *tree_size,
+                       unsigned max_tree_size);
+
+static slp_tree
+vect_build_slp_tree (vec_info *vinfo,
+                     vec<gimple *> stmts, unsigned int group_size,
+                     unsigned int *max_nunits,
+                     vec<slp_tree> *loads,
+                     bool *matches, unsigned *npermutes, unsigned *tree_size,
+                     unsigned max_tree_size)
+{
+  if (bst_fail->contains (stmts))
+    return NULL;
+  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
+                                        loads, matches, npermutes, tree_size,
+                                        max_tree_size);
+  /* When SLP build fails for stmts record this, otherwise SLP build
+     can be exponential in time when we allow to construct parts from
+     scalars, see PR81723.  */
+  if (! res)
+    {
+      vec <gimple *> x;
+      x.create (stmts.length ());
+      x.splice (stmts);
+      bst_fail->add (x);
+    }
+  return res;
+}
+
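The wrapper above memoizes failed builds in a hash set keyed by the statement sequence, which is what caps the otherwise exponential recursion mentioned in the PR81723 comment. The idea in miniature, modeled with a std::set instead of GCC's hash_set/bst_traits:

    #include <cassert>
    #include <set>
    #include <vector>

    static std::set<std::vector<int>> failed;   // stands in for bst_fail
    static int build_calls;

    static bool
    toy_build (const std::vector<int> &stmts)
    {
      if (failed.count (stmts))
        return false;                 // cached failure: no recursion at all
      ++build_calls;
      bool ok = false;                // pretend the real build failed
      if (!ok)
        failed.insert (stmts);
      return ok;
    }

    int main ()
    {
      std::vector<int> stmts = { 1, 2, 3, 4 };
      toy_build (stmts);
      toy_build (stmts);              // second attempt hits the cache
      assert (build_calls == 1);
    }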
1000 /* Recursively build an SLP tree starting from NODE.
1001 Fail (and return a value not equal to zero) if def-stmts are not
1002 isomorphic, require data permutation or are of unsupported types of
1003 operation. Otherwise, return 0.
1004 The value returned is the depth in the SLP tree where a mismatch
1005 was found. */
1006
1007 static slp_tree
1008 vect_build_slp_tree_2 (vec_info *vinfo,
1009 vec<gimple *> stmts, unsigned int group_size,
1010 unsigned int *max_nunits,
1011 vec<slp_tree> *loads,
1012 bool *matches, unsigned *npermutes, unsigned *tree_size,
1013 unsigned max_tree_size)
1014 {
1015 unsigned nops, i, this_tree_size = 0, this_max_nunits = *max_nunits;
1016 gimple *stmt;
1017 slp_tree node;
1018
1019 matches[0] = false;
1020
1021 stmt = stmts[0];
1022 if (is_gimple_call (stmt))
1023 nops = gimple_call_num_args (stmt);
1024 else if (is_gimple_assign (stmt))
1025 {
1026 nops = gimple_num_ops (stmt) - 1;
1027 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1028 nops++;
1029 }
1030 else if (gimple_code (stmt) == GIMPLE_PHI)
1031 nops = 0;
1032 else
1033 return NULL;
1034
1035 /* If the SLP node is a PHI (induction or reduction), terminate
1036 the recursion. */
1037 if (gimple_code (stmt) == GIMPLE_PHI)
1038 {
1039 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1040 tree vectype = get_vectype_for_scalar_type (scalar_type);
1041 if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
1042 max_nunits))
1043 return NULL;
1044
1045 vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
1046 /* Induction from different IVs is not supported. */
1047 if (def_type == vect_induction_def)
1048 {
1049 FOR_EACH_VEC_ELT (stmts, i, stmt)
1050 if (stmt != stmts[0])
1051 return NULL;
1052 }
656 else 1053 else
657 { 1054 {
658 /* We don't check here complex numbers chains, so we keep them in 1055 /* Else def types have to match. */
659 LOADS for further check in vect_supported_load_permutation_p. */ 1056 FOR_EACH_VEC_ELT (stmts, i, stmt)
660 if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR) 1057 {
661 VEC_safe_push (slp_tree, heap, *loads, *node); 1058 /* But for reduction chains only check on the first stmt. */
662 } 1059 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
663 1060 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
664 return true; 1061 continue;
665 } 1062 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
1063 return NULL;
1064 }
1065 }
1066 node = vect_create_new_slp_node (stmts);
1067 return node;
1068 }
1069
1070
1071 bool two_operators = false;
1072 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1073 if (!vect_build_slp_tree_1 (vinfo, swap,
1074 stmts, group_size, nops,
1075 &this_max_nunits, matches, &two_operators))
1076 return NULL;
1077
1078 /* If the SLP node is a load, terminate the recursion. */
1079 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1080 && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1081 {
1082 *max_nunits = this_max_nunits;
1083 node = vect_create_new_slp_node (stmts);
1084 loads->safe_push (node);
1085 return node;
1086 }
1087
1088 /* Get at the operands, verifying they are compatible. */
1089 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1090 slp_oprnd_info oprnd_info;
1091 FOR_EACH_VEC_ELT (stmts, i, stmt)
1092 {
1093 int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1094 stmt, i, &oprnds_info);
1095 if (res != 0)
1096 matches[(res == -1) ? 0 : i] = false;
1097 if (!matches[0])
1098 break;
1099 }
1100 for (i = 0; i < group_size; ++i)
1101 if (!matches[i])
1102 {
1103 vect_free_oprnd_info (oprnds_info);
1104 return NULL;
1105 }
1106
1107 auto_vec<slp_tree, 4> children;
1108 auto_vec<slp_tree> this_loads;
1109
1110 stmt = stmts[0];
1111
1112 if (tree_size)
1113 max_tree_size -= *tree_size;
666 1114
667 /* Create SLP_TREE nodes for the definition node/s. */ 1115 /* Create SLP_TREE nodes for the definition node/s. */
668 if (first_stmt_dt0 == vect_internal_def) 1116 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
669 { 1117 {
670 slp_tree left_node = XNEW (struct _slp_tree); 1118 slp_tree child;
671 SLP_TREE_SCALAR_STMTS (left_node) = def_stmts0; 1119 unsigned old_nloads = this_loads.length ();
672 SLP_TREE_VEC_STMTS (left_node) = NULL; 1120 unsigned old_tree_size = this_tree_size;
673 SLP_TREE_LEFT (left_node) = NULL; 1121 unsigned int j;
674 SLP_TREE_RIGHT (left_node) = NULL; 1122
675 SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0; 1123 if (oprnd_info->first_dt != vect_internal_def
676 SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0; 1124 && oprnd_info->first_dt != vect_reduction_def
677 if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &left_node, group_size, 1125 && oprnd_info->first_dt != vect_induction_def)
678 inside_cost, outside_cost, ncopies_for_cost, 1126 continue;
679 max_nunits, load_permutation, loads, 1127
680 vectorization_factor)) 1128 if (++this_tree_size > max_tree_size)
681 return false; 1129 {
682 1130 FOR_EACH_VEC_ELT (children, j, child)
683 SLP_TREE_LEFT (*node) = left_node; 1131 vect_free_slp_tree (child);
684 } 1132 vect_free_oprnd_info (oprnds_info);
685 1133 return NULL;
686 if (first_stmt_dt1 == vect_internal_def) 1134 }
687 { 1135
688 slp_tree right_node = XNEW (struct _slp_tree); 1136 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
689 SLP_TREE_SCALAR_STMTS (right_node) = def_stmts1; 1137 group_size, &this_max_nunits,
690 SLP_TREE_VEC_STMTS (right_node) = NULL; 1138 &this_loads, matches, npermutes,
691 SLP_TREE_LEFT (right_node) = NULL; 1139 &this_tree_size,
692 SLP_TREE_RIGHT (right_node) = NULL; 1140 max_tree_size)) != NULL)
693 SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0; 1141 {
694 SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0; 1142 /* If we have all children of child built up from scalars then just
695 if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &right_node, group_size, 1143 throw that away and build it up this node from scalars. */
696 inside_cost, outside_cost, ncopies_for_cost, 1144 if (!SLP_TREE_CHILDREN (child).is_empty ()
697 max_nunits, load_permutation, loads, 1145 /* ??? Rejecting patterns this way doesn't work. We'd have to
698 vectorization_factor)) 1146 do extra work to cancel the pattern so the uses see the
699 return false; 1147 scalar version. */
700 1148 && !is_pattern_stmt_p
701 SLP_TREE_RIGHT (*node) = right_node; 1149 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
702 } 1150 {
703 1151 slp_tree grandchild;
704 return true; 1152
705 } 1153 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
706 1154 if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1155 break;
1156 if (!grandchild)
1157 {
1158 /* Roll back. */
1159 this_loads.truncate (old_nloads);
1160 this_tree_size = old_tree_size;
1161 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1162 vect_free_slp_tree (grandchild);
1163 SLP_TREE_CHILDREN (child).truncate (0);
1164
1165 dump_printf_loc (MSG_NOTE, vect_location,
1166 "Building parent vector operands from "
1167 "scalars instead\n");
1168 oprnd_info->def_stmts = vNULL;
1169 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1170 children.safe_push (child);
1171 continue;
1172 }
1173 }
1174
1175 oprnd_info->def_stmts = vNULL;
1176 children.safe_push (child);
1177 continue;
1178 }
1179
1180 /* If the SLP build failed fatally and we analyze a basic-block
1181 simply treat nodes we fail to build as externally defined
1182 (and thus build vectors from the scalar defs).
1183 The cost model will reject outright expensive cases.
1184 ??? This doesn't treat cases where permutation ultimatively
1185 fails (or we don't try permutation below). Ideally we'd
1186 even compute a permutation that will end up with the maximum
1187 SLP tree size... */
1188 if (is_a <bb_vec_info> (vinfo)
1189 && !matches[0]
1190 /* ??? Rejecting patterns this way doesn't work. We'd have to
1191 do extra work to cancel the pattern so the uses see the
1192 scalar version. */
1193 && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1194 {
1195 dump_printf_loc (MSG_NOTE, vect_location,
1196 "Building vector operands from scalars\n");
1197 child = vect_create_new_slp_node (oprnd_info->def_stmts);
1198 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1199 children.safe_push (child);
1200 oprnd_info->def_stmts = vNULL;
1201 continue;
1202 }
1203
1204 /* If the SLP build for operand zero failed and operand zero
1205 and one can be commutated try that for the scalar stmts
1206 that failed the match. */
1207 if (i == 0
1208 /* A first scalar stmt mismatch signals a fatal mismatch. */
1209 && matches[0]
1210 /* ??? For COND_EXPRs we can swap the comparison operands
1211 as well as the arms under some constraints. */
1212 && nops == 2
1213 && oprnds_info[1]->first_dt == vect_internal_def
1214 && is_gimple_assign (stmt)
1215 && commutative_tree_code (gimple_assign_rhs_code (stmt))
1216 && ! two_operators
1217 /* Do so only if the number of not successful permutes was nor more
1218 than a cut-ff as re-trying the recursive match on
1219 possibly each level of the tree would expose exponential
1220 behavior. */
1221 && *npermutes < 4)
1222 {
1223 /* Verify if we can safely swap or if we committed to a specific
1224 operand order already. */
1225 for (j = 0; j < group_size; ++j)
1226 if (!matches[j]
1227 && (swap[j] != 0
1228 || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmts[j]))))
1229 {
1230 if (dump_enabled_p ())
1231 {
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "Build SLP failed: cannot swap operands "
1234 "of shared stmt ");
1235 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1236 stmts[j], 0);
1237 }
1238 goto fail;
1239 }
1240
1241 /* Swap mismatched definition stmts. */
1242 dump_printf_loc (MSG_NOTE, vect_location,
1243 "Re-trying with swapped operands of stmts ");
1244 for (j = 0; j < group_size; ++j)
1245 if (!matches[j])
1246 {
1247 std::swap (oprnds_info[0]->def_stmts[j],
1248 oprnds_info[1]->def_stmts[j]);
1249 dump_printf (MSG_NOTE, "%d ", j);
1250 }
1251 dump_printf (MSG_NOTE, "\n");
1252 /* And try again with scratch 'matches' ... */
1253 bool *tem = XALLOCAVEC (bool, group_size);
1254 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1255 group_size, &this_max_nunits,
1256 &this_loads, tem, npermutes,
1257 &this_tree_size,
1258 max_tree_size)) != NULL)
1259 {
1260 /* ... so if successful we can apply the operand swapping
1261 to the GIMPLE IL. This is necessary because for example
1262 vect_get_slp_defs uses operand indexes and thus expects
1263 canonical operand order. This is also necessary even
1264 if we end up building the operand from scalars as
1265 we'll continue to process swapped operand two. */
1266 for (j = 0; j < group_size; ++j)
1267 {
1268 gimple *stmt = stmts[j];
1269 gimple_set_plf (stmt, GF_PLF_1, false);
1270 }
1271 for (j = 0; j < group_size; ++j)
1272 {
1273 gimple *stmt = stmts[j];
1274 if (!matches[j])
1275 {
1276 /* Avoid swapping operands twice. */
1277 if (gimple_plf (stmt, GF_PLF_1))
1278 continue;
1279 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1280 gimple_assign_rhs2_ptr (stmt));
1281 gimple_set_plf (stmt, GF_PLF_1, true);
1282 }
1283 }
1284 /* Verify we swap all duplicates or none. */
1285 if (flag_checking)
1286 for (j = 0; j < group_size; ++j)
1287 {
1288 gimple *stmt = stmts[j];
1289 gcc_assert (gimple_plf (stmt, GF_PLF_1) == ! matches[j]);
1290 }
1291
1292 /* If we have all children of child built up from scalars then
1293 just throw that away and build this node up from scalars. */
1294 if (!SLP_TREE_CHILDREN (child).is_empty ()
1295 /* ??? Rejecting patterns this way doesn't work. We'd have
1296 to do extra work to cancel the pattern so the uses see the
1297 scalar version. */
1298 && !is_pattern_stmt_p
1299 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1300 {
1301 unsigned int j;
1302 slp_tree grandchild;
1303
1304 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1305 if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1306 break;
1307 if (!grandchild)
1308 {
1309 /* Roll back. */
1310 this_loads.truncate (old_nloads);
1311 this_tree_size = old_tree_size;
1312 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1313 vect_free_slp_tree (grandchild);
1314 SLP_TREE_CHILDREN (child).truncate (0);
1315
1316 dump_printf_loc (MSG_NOTE, vect_location,
1317 "Building parent vector operands from "
1318 "scalars instead\n");
1319 oprnd_info->def_stmts = vNULL;
1320 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1321 children.safe_push (child);
1322 continue;
1323 }
1324 }
1325
1326 oprnd_info->def_stmts = vNULL;
1327 children.safe_push (child);
1328 continue;
1329 }
1330
1331 ++*npermutes;
1332 }
1333
1334 fail:
1335 gcc_assert (child == NULL);
1336 FOR_EACH_VEC_ELT (children, j, child)
1337 vect_free_slp_tree (child);
1338 vect_free_oprnd_info (oprnds_info);
1339 return NULL;
1340 }
1341
1342 vect_free_oprnd_info (oprnds_info);
1343
1344 if (tree_size)
1345 *tree_size += this_tree_size;
1346 *max_nunits = this_max_nunits;
1347 loads->safe_splice (this_loads);
1348
1349 node = vect_create_new_slp_node (stmts);
1350 SLP_TREE_TWO_OPERATORS (node) = two_operators;
1351 SLP_TREE_CHILDREN (node).splice (children);
1352 return node;
1353 }
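
The swap-and-retry step above works because the operation is commutative: when only some lanes mismatch, swapping rhs1/rhs2 in exactly those lanes can make the whole group uniform again. A minimal standalone sketch of that idea in plain C (struct lane and its fields are invented stand-ins, not GCC's stmt_vec_info machinery):

#include <stdbool.h>
#include <stdio.h>

struct lane { int op0, op1; };  /* stand-ins for the two operand def kinds */

/* Make every lane's op0 agree with lane 0 by swapping the commutative
   operands in mismatched lanes only, as the swap_ssa_operands loop does.  */
static bool
retry_with_swaps (struct lane *lanes, int n)
{
  for (int j = 1; j < n; ++j)
    if (lanes[j].op0 != lanes[0].op0)
      {
        int tmp = lanes[j].op0;
        lanes[j].op0 = lanes[j].op1;
        lanes[j].op1 = tmp;
        if (lanes[j].op0 != lanes[0].op0)
          return false;  /* the swap did not help -> fail the build */
      }
  return true;
}

int
main (void)
{
  /* Lane 1 has its operands in the opposite order; one swap fixes it.  */
  struct lane lanes[2] = { { 1, 2 }, { 2, 1 } };
  printf ("%s\n", retry_with_swaps (lanes, 2) ? "matched" : "failed");
  return 0;
}
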
1354
1355 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
707 1356
708 static void 1357 static void
709 vect_print_slp_tree (slp_tree node) 1358 vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
710 { 1359 {
711 int i; 1360 int i;
712 gimple stmt; 1361 gimple *stmt;
713 1362 slp_tree child;
714 if (!node) 1363
715 return; 1364 dump_printf_loc (dump_kind, loc, "node%s\n",
716 1365 SLP_TREE_DEF_TYPE (node) != vect_internal_def
717 fprintf (vect_dump, "node "); 1366 ? " (external)" : "");
718 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) 1367 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
719 { 1368 {
720 fprintf (vect_dump, "\n\tstmt %d ", i); 1369 dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
721 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); 1370 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
722 } 1371 }
723 fprintf (vect_dump, "\n"); 1372 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
724 1373 vect_print_slp_tree (dump_kind, loc, child);
725 vect_print_slp_tree (SLP_TREE_LEFT (node));
726 vect_print_slp_tree (SLP_TREE_RIGHT (node));
727 } 1374 }
728 1375
729 1376
730 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID). 1377 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
731 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index 1378 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
734 1381
735 static void 1382 static void
736 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j) 1383 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
737 { 1384 {
738 int i; 1385 int i;
739 gimple stmt; 1386 gimple *stmt;
740 1387 slp_tree child;
741 if (!node) 1388
1389 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
742 return; 1390 return;
743 1391
744 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) 1392 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
745 if (j < 0 || i == j) 1393 if (j < 0 || i == j)
746 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark; 1394 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
747 1395
748 vect_mark_slp_stmts (SLP_TREE_LEFT (node), mark, j); 1396 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
749 vect_mark_slp_stmts (SLP_TREE_RIGHT (node), mark, j); 1397 vect_mark_slp_stmts (child, mark, j);
750 } 1398 }
751 1399
752 1400
753 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ 1401 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
754 1402
755 static void 1403 static void
756 vect_mark_slp_stmts_relevant (slp_tree node) 1404 vect_mark_slp_stmts_relevant (slp_tree node)
757 { 1405 {
758 int i; 1406 int i;
759 gimple stmt; 1407 gimple *stmt;
760 stmt_vec_info stmt_info; 1408 stmt_vec_info stmt_info;
761 1409 slp_tree child;
762 if (!node) 1410
1411 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
763 return; 1412 return;
764 1413
765 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) 1414 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
766 { 1415 {
767 stmt_info = vinfo_for_stmt (stmt); 1416 stmt_info = vinfo_for_stmt (stmt);
768 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) 1417 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
769 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); 1418 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
770 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; 1419 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
771 } 1420 }
772 1421
773 vect_mark_slp_stmts_relevant (SLP_TREE_LEFT (node)); 1422 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
774 vect_mark_slp_stmts_relevant (SLP_TREE_RIGHT (node)); 1423 vect_mark_slp_stmts_relevant (child);
775 }
776
777
778 /* Check if the permutation required by the SLP INSTANCE is supported.
779 Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. */
780
781 static bool
782 vect_supported_slp_permutation_p (slp_instance instance)
783 {
784 slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0);
785 gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
786 gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
787 VEC (slp_tree, heap) *sorted_loads = NULL;
788 int index;
789 slp_tree *tmp_loads = NULL;
790 int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
791 slp_tree load;
792
793 /* FORNOW: The only supported loads permutation is loads from the same
794 location in all the loads in the node, when the data-refs in
795 nodes of LOADS constitute an interleaving chain.
796 Sort the nodes according to the order of accesses in the chain. */
797 tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
798 for (i = 0, j = 0;
799 VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index)
800 && VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load);
801 i += group_size, j++)
802 {
803 gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0);
804 /* Check that the loads are all in the same interleaving chain. */
805 if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load)
806 {
807 if (vect_print_dump_info (REPORT_DETAILS))
808 {
809 fprintf (vect_dump, "Build SLP failed: unsupported data "
810 "permutation ");
811 print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM);
812 }
813
814 free (tmp_loads);
815 return false;
816 }
817
818 tmp_loads[index] = load;
819 }
820
821 sorted_loads = VEC_alloc (slp_tree, heap, group_size);
822 for (i = 0; i < group_size; i++)
823 VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]);
824
825 VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
826 SLP_INSTANCE_LOADS (instance) = sorted_loads;
827 free (tmp_loads);
828
829 if (!vect_transform_slp_perm_load (stmt, NULL, NULL,
830 SLP_INSTANCE_UNROLLING_FACTOR (instance),
831 instance, true))
832 return false;
833
834 return true;
835 } 1424 }
836 1425
837 1426
838 /* Rearrange the statements of NODE according to PERMUTATION. */ 1427 /* Rearrange the statements of NODE according to PERMUTATION. */
839 1428
840 static void 1429 static void
841 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, 1430 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
842 VEC (int, heap) *permutation) 1431 vec<unsigned> permutation)
843 { 1432 {
844 gimple stmt; 1433 gimple *stmt;
845 VEC (gimple, heap) *tmp_stmts; 1434 vec<gimple *> tmp_stmts;
846 unsigned int index, i; 1435 unsigned int i;
847 1436 slp_tree child;
848 if (!node) 1437
849 return; 1438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
850 1439 vect_slp_rearrange_stmts (child, group_size, permutation);
851 vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation); 1440
852 vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation); 1441 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
853 1442 tmp_stmts.create (group_size);
854 gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))); 1443 tmp_stmts.quick_grow_cleared (group_size);
855 tmp_stmts = VEC_alloc (gimple, heap, group_size); 1444
856 1445 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1446 tmp_stmts[permutation[i]] = stmt;
1447
1448 SLP_TREE_SCALAR_STMTS (node).release ();
1449 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1450 }
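
The rearrangement above is a plain scatter: the statement at position i moves to slot permutation[i], exactly the "tmp_stmts[permutation[i]] = stmt" line. A standalone sketch with integer arrays in place of stmt vectors (names invented; the real code applies this to every node of the tree):

#include <stdio.h>
#include <string.h>

/* Scatter ELEMS according to PERM: element I ends up in slot PERM[I].
   The fixed-size TMP is a sketch limit only.  */
static void
rearrange (int *elems, const unsigned *perm, unsigned n)
{
  int tmp[8];
  for (unsigned i = 0; i < n; ++i)
    tmp[perm[i]] = elems[i];
  memcpy (elems, tmp, n * sizeof *elems);
}

int
main (void)
{
  int stmts[4] = { 10, 11, 12, 13 };
  unsigned perm[4] = { 2, 0, 3, 1 };
  rearrange (stmts, perm, 4);
  for (int i = 0; i < 4; ++i)
    printf ("%d ", stmts[i]);  /* prints: 11 13 10 12 */
  return 0;
}
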
1451
1452
1453 /* Attempt to reorder stmts in a reduction chain so that we don't
1454 require any load permutation. Return true if that was possible,
1455 otherwise return false. */
1456
1457 static bool
1458 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1459 {
1460 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1461 unsigned int i, j;
1462 unsigned int lidx;
1463 slp_tree node, load;
1464
1465 /* Compare all the permutation sequences to the first one. We know
1466 that at least one load is permuted. */
1467 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1468 if (!node->load_permutation.exists ())
1469 return false;
1470 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1471 {
1472 if (!load->load_permutation.exists ())
1473 return false;
1474 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1475 if (lidx != node->load_permutation[j])
1476 return false;
1477 }
1478
1479 /* Check that the loads in the first sequence are different and there
1480 are no gaps between them. */
1481 auto_sbitmap load_index (group_size);
1482 bitmap_clear (load_index);
1483 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1484 {
1485 if (lidx >= group_size)
1486 return false;
1487 if (bitmap_bit_p (load_index, lidx))
1488 return false;
1489
1490 bitmap_set_bit (load_index, lidx);
1491 }
857 for (i = 0; i < group_size; i++) 1492 for (i = 0; i < group_size; i++)
858 VEC_safe_push (gimple, heap, tmp_stmts, NULL); 1493 if (!bitmap_bit_p (load_index, i))
859 1494 return false;
860 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) 1495
861 { 1496 /* This permutation is valid for reduction. Since the order of the
862 index = VEC_index (int, permutation, i); 1497 statements in the nodes is not important unless they are memory
863 VEC_replace (gimple, tmp_stmts, index, stmt); 1498 accesses, we can rearrange the statements in all the nodes
864 } 1499 according to the order of the loads. */
865 1500 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
866 VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node)); 1501 node->load_permutation);
867 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; 1502
868 } 1503 /* We are done, no actual permutations need to be generated. */
869 1504 unsigned int unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
870 1505 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
871 /* Check if the required load permutation is supported. 1506 {
872 LOAD_PERMUTATION contains a list of indices of the loads. 1507 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
873 In SLP this permutation is relative to the order of strided stores that are 1508 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
874 the base of the SLP instance. */ 1509 /* But we have to keep those permutations that are required because
1510 of handling of gaps. */
1511 if (unrolling_factor == 1
1512 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1513 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
1514 SLP_TREE_LOAD_PERMUTATION (node).release ();
1515 else
1516 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1517 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1518 }
1519
1520 return true;
1521 }
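
The load_index bitmap test above amounts to checking that the shared permutation is a bijection on [0, group_size): every index in range and none repeated (for group_size distinct in-range values this already implies no gaps, so the final all-bits-set loop restates the same condition). A standalone sketch, with a bool array standing in for auto_sbitmap:

#include <stdbool.h>

/* Return true iff PERM of length N maps {0..N-1} onto itself,
   i.e. every index is in range and occurs exactly once.  */
static bool
valid_permutation_p (const unsigned *perm, unsigned n)
{
  bool seen[64] = { false };  /* sketch limit; the real code sizes a bitmap */
  if (n > 64)
    return false;
  for (unsigned i = 0; i < n; ++i)
    {
      if (perm[i] >= n || seen[perm[i]])
        return false;  /* out of range, or a duplicate */
      seen[perm[i]] = true;
    }
  return true;
}
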
1522
1523 /* Check if the required load permutations in the SLP instance
1524 SLP_INSTN are supported. */
875 1525
876 static bool 1526 static bool
877 vect_supported_load_permutation_p (slp_instance slp_instn, int group_size, 1527 vect_supported_load_permutation_p (slp_instance slp_instn)
878 VEC (int, heap) *load_permutation) 1528 {
879 { 1529 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
880 int i = 0, j, prev = -1, next, k, number_of_groups; 1530 unsigned int i, j, k, next;
881 bool supported, bad_permutation = false; 1531 slp_tree node;
882 sbitmap load_index; 1532 gimple *stmt, *load, *next_load;
883 slp_tree node, other_complex_node; 1533
884 gimple stmt, first = NULL, other_node_first; 1534 if (dump_enabled_p ())
885 unsigned complex_numbers = 0; 1535 {
886 1536 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
887 /* FORNOW: permutations are only supported in SLP. */ 1537 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
888 if (!slp_instn) 1538 if (node->load_permutation.exists ())
889 return false; 1539 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
890 1540 dump_printf (MSG_NOTE, "%d ", next);
891 if (vect_print_dump_info (REPORT_SLP)) 1541 else
892 { 1542 for (k = 0; k < group_size; ++k)
893 fprintf (vect_dump, "Load permutation "); 1543 dump_printf (MSG_NOTE, "%d ", k);
894 FOR_EACH_VEC_ELT (int, load_permutation, i, next) 1544 dump_printf (MSG_NOTE, "\n");
895 fprintf (vect_dump, "%d ", next);
896 } 1545 }
897 1546
898 /* In case of reduction every load permutation is allowed, since the order 1547 /* In case of reduction every load permutation is allowed, since the order
899 of the reduction statements is not important (as opposed to the case of 1548 of the reduction statements is not important (as opposed to the case of
900 strided stores). The only condition we need to check is that all the 1549 grouped stores). The only condition we need to check is that all the
901 load nodes are of the same size and have the same permutation (and then 1550 load nodes are of the same size and have the same permutation (and then
902 rearrange all the nodes of the SLP instance according to this 1551 rearrange all the nodes of the SLP instance according to this
903 permutation). */ 1552 permutation). */
904 1553
905 /* Check that all the load nodes are of the same size. */ 1554 /* Check that all the load nodes are of the same size. */
906 FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node) 1555 /* ??? Can't we assert this? */
907 { 1556 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
908 if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)) 1557 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
909 != (unsigned) group_size) 1558 return false;
910 return false; 1559
911 1560 node = SLP_INSTANCE_TREE (slp_instn);
912 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); 1561 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
913 if (is_gimple_assign (stmt) 1562
914 && (gimple_assign_rhs_code (stmt) == REALPART_EXPR 1563 /* Reduction (there are no data-refs in the root).
915 || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR)) 1564 In reduction chain the order of the loads is not important. */
916 complex_numbers++; 1565 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
917 } 1566 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
918 1567 vect_attempt_slp_rearrange_stmts (slp_instn);
919 /* Complex operands can be swapped as following: 1568
920 real_c = real_b + real_a; 1569 /* In basic block vectorization we allow any subchain of an interleaving
921 imag_c = imag_a + imag_b; 1570 chain.
922 i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of 1571 FORNOW: not supported in loop SLP because of realignment compications. */
923 {real_a, imag_a} and {real_b, imag_b}. We check here that if interleaving 1572 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
924 chains are mixed, they match the above pattern. */ 1573 {
925 if (complex_numbers) 1574 /* Check whether the loads in an instance form a subchain and thus
926 { 1575 no permutation is necessary. */
927 FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node) 1576 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
928 { 1577 {
929 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), j, stmt) 1578 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1579 continue;
1580 bool subchain_p = true;
1581 next_load = NULL;
1582 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
930 { 1583 {
931 if (j == 0) 1584 if (j != 0
932 first = stmt; 1585 && (next_load != load
933 else 1586 || GROUP_GAP (vinfo_for_stmt (load)) != 1))
934 { 1587 {
935 if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != first) 1588 subchain_p = false;
936 { 1589 break;
937 if (complex_numbers != 2) 1590 }
938 return false; 1591 next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
939
940 if (i == 0)
941 k = 1;
942 else
943 k = 0;
944
945 other_complex_node = VEC_index (slp_tree,
946 SLP_INSTANCE_LOADS (slp_instn), k);
947 other_node_first = VEC_index (gimple,
948 SLP_TREE_SCALAR_STMTS (other_complex_node), 0);
949
950 if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt))
951 != other_node_first)
952 return false;
953 }
954 }
955 } 1592 }
1593 if (subchain_p)
1594 SLP_TREE_LOAD_PERMUTATION (node).release ();
1595 else
1596 {
1597 stmt_vec_info group_info
1598 = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1599 group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
1600 unsigned nunits
1601 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (group_info));
1602 unsigned k, maxk = 0;
1603 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1604 if (k > maxk)
1605 maxk = k;
1606 /* In BB vectorization we may not actually use a loaded vector
1607 accessing elements in excess of GROUP_SIZE. */
1608 if (maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
1609 {
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "BB vectorization with gaps at the end of "
1612 "a load is not supported\n");
1613 return false;
1614 }
1615
1616 /* Verify the permutation can be generated. */
1617 vec<tree> tem;
1618 unsigned n_perms;
1619 if (!vect_transform_slp_perm_load (node, tem, NULL,
1620 1, slp_instn, true, &n_perms))
1621 {
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1623 vect_location,
1624 "unsupported load permutation\n");
1625 return false;
1626 }
1627 }
956 } 1628 }
957 }
958
959 /* We checked that this case ok, so there is no need to proceed with
960 permutation tests. */
961 if (complex_numbers == 2)
962 {
963 VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (slp_instn));
964 VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
965 return true; 1629 return true;
966 } 1630 }
967 1631
968 node = SLP_INSTANCE_TREE (slp_instn); 1632 /* For loop vectorization verify we can generate the permutation. Be
969 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); 1633 conservative about the vectorization factor, there are permutations
970 /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP 1634 that will use three vector inputs only starting from a specific factor
971 instance, not all the loads belong to the same node or interleaving 1635 and the vectorization factor is not yet final.
972 group. Hence, we need to divide them into groups according to 1636 ??? The SLP instance unrolling factor might not be the maximum one. */
973 GROUP_SIZE. */ 1637 unsigned n_perms;
974 number_of_groups = VEC_length (int, load_permutation) / group_size; 1638 unsigned test_vf
975 1639 = least_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
976 /* Reduction (there are no data-refs in the root). */ 1640 LOOP_VINFO_VECT_FACTOR
977 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) 1641 (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
978 { 1642 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
979 int first_group_load_index; 1643 if (node->load_permutation.exists ()
980 1644 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
981 /* Compare all the permutation sequences to the first one. */ 1645 slp_instn, true, &n_perms))
982 for (i = 1; i < number_of_groups; i++)
983 {
984 k = 0;
985 for (j = i * group_size; j < i * group_size + group_size; j++)
986 {
987 next = VEC_index (int, load_permutation, j);
988 first_group_load_index = VEC_index (int, load_permutation, k);
989
990 if (next != first_group_load_index)
991 {
992 bad_permutation = true;
993 break;
994 }
995
996 k++;
997 }
998
999 if (bad_permutation)
1000 break;
1001 }
1002
1003 if (!bad_permutation)
1004 {
1005 /* Check that the loads in the first sequence are different and there
1006 are no gaps between them. */
1007 load_index = sbitmap_alloc (group_size);
1008 sbitmap_zero (load_index);
1009 for (k = 0; k < group_size; k++)
1010 {
1011 first_group_load_index = VEC_index (int, load_permutation, k);
1012 if (TEST_BIT (load_index, first_group_load_index))
1013 {
1014 bad_permutation = true;
1015 break;
1016 }
1017
1018 SET_BIT (load_index, first_group_load_index);
1019 }
1020
1021 if (!bad_permutation)
1022 for (k = 0; k < group_size; k++)
1023 if (!TEST_BIT (load_index, k))
1024 {
1025 bad_permutation = true;
1026 break;
1027 }
1028
1029 sbitmap_free (load_index);
1030 }
1031
1032 if (!bad_permutation)
1033 {
1034 /* This permutation is valid for reduction. Since the order of the
1035 statements in the nodes is not important unless they are memory
1036 accesses, we can rearrange the statements in all the nodes
1037 according to the order of the loads. */
1038 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1039 load_permutation);
1040 VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
1041 return true;
1042 }
1043 }
1044
1045 /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
1046 GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
1047 well (unless it's reduction). */
1048 if (VEC_length (int, load_permutation)
1049 != (unsigned int) (group_size * group_size))
1050 return false;
1051
1052 supported = true;
1053 load_index = sbitmap_alloc (group_size);
1054 sbitmap_zero (load_index);
1055 for (j = 0; j < group_size; j++)
1056 {
1057 for (i = j * group_size, k = 0;
1058 VEC_iterate (int, load_permutation, i, next) && k < group_size;
1059 i++, k++)
1060 {
1061 if (i != j * group_size && next != prev)
1062 {
1063 supported = false;
1064 break;
1065 }
1066
1067 prev = next;
1068 }
1069
1070 if (TEST_BIT (load_index, prev))
1071 {
1072 supported = false;
1073 break;
1074 }
1075
1076 SET_BIT (load_index, prev);
1077 }
1078
1079 for (j = 0; j < group_size; j++)
1080 if (!TEST_BIT (load_index, j))
1081 return false; 1646 return false;
1082 1647
1083 sbitmap_free (load_index); 1648 return true;
1084
1085 if (supported && i == group_size * group_size
1086 && vect_supported_slp_permutation_p (slp_instn))
1087 return true;
1088
1089 return false;
1090 }
1091
1092
1093 /* Find the first load in the loop that belongs to INSTANCE.
1094 When loads are in several SLP nodes, there can be a case in which the first
1095 load does not appear in the first SLP node to be transformed, causing
1096 incorrect order of statements. Since we generate all the loads together,
1097 they must be inserted before the first load of the SLP instance and not
1098 before the first load of the first node of the instance. */
1099
1100 static gimple
1101 vect_find_first_load_in_slp_instance (slp_instance instance)
1102 {
1103 int i, j;
1104 slp_tree load_node;
1105 gimple first_load = NULL, load;
1106
1107 FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (instance), i, load_node)
1108 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (load_node), j, load)
1109 first_load = get_earlier_stmt (load, first_load);
1110
1111 return first_load;
1112 } 1649 }
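
For the basic-block case above, a load node needs no permutation when its scalar loads form a contiguous subchain of the interleaving chain: each load must be the chain successor of the previous one with a gap of 1. A standalone sketch over a toy chain (struct chain_elt is invented; GCC walks GROUP_NEXT_ELEMENT/GROUP_GAP):

#include <stdbool.h>
#include <stddef.h>

struct chain_elt { struct chain_elt *next; int gap; };

/* Return true iff LOADS[0..N-1] follow each other in chain order
   with no gaps, mirroring the next_load walk above.  */
static bool
subchain_p (struct chain_elt **loads, unsigned n)
{
  struct chain_elt *next_load = NULL;
  for (unsigned j = 0; j < n; ++j)
    {
      if (j != 0 && (next_load != loads[j] || loads[j]->gap != 1))
        return false;
      next_load = loads[j]->next;
    }
  return true;
}
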
1113 1650
1114 1651
1115 /* Find the last store in SLP INSTANCE. */ 1652 /* Find the last store in SLP INSTANCE. */
1116 1653
1117 static gimple 1654 gimple *
1118 vect_find_last_store_in_slp_instance (slp_instance instance) 1655 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1119 { 1656 {
1120 int i; 1657 gimple *last = NULL, *stmt;
1121 slp_tree node; 1658
1122 gimple last_store = NULL, store; 1659 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1123 1660 {
1124 node = SLP_INSTANCE_TREE (instance); 1661 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1125 for (i = 0; 1662 if (is_pattern_stmt_p (stmt_vinfo))
1126 VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, store); 1663 last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1127 i++) 1664 else
1128 last_store = get_later_stmt (store, last_store); 1665 last = get_later_stmt (stmt, last);
1129 1666 }
1130 return last_store; 1667
1131 } 1668 return last;
1132 1669 }
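
Note the pattern-stmt indirection in the new function: a pattern stmt has no position of its own in the block, so the search uses the original stmt it replaced. A standalone sketch with a uid standing in for statement order (struct toy_stmt and its fields are invented):

#include <stddef.h>

struct toy_stmt { unsigned uid; struct toy_stmt *related; };

static struct toy_stmt *
later (struct toy_stmt *a, struct toy_stmt *b)
{
  if (!a || (b && b->uid > a->uid))
    return b;
  return a;
}

/* Find the last stmt of the group, looking through pattern stmts
   to the original ("related") stmt for their position.  */
static struct toy_stmt *
find_last (struct toy_stmt **stmts, unsigned n)
{
  struct toy_stmt *last = NULL;
  for (unsigned i = 0; i < n; ++i)
    last = later (stmts[i]->related ? stmts[i]->related : stmts[i], last);
  return last;
}
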
1133 1670
1134 /* Analyze an SLP instance starting from a group of strided stores. Call 1671 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1672
1673 static void
1674 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1675 stmt_vector_for_cost *prologue_cost_vec,
1676 stmt_vector_for_cost *body_cost_vec,
1677 unsigned ncopies_for_cost)
1678 {
1679 unsigned i, j;
1680 slp_tree child;
1681 gimple *stmt;
1682 stmt_vec_info stmt_info;
1683 tree lhs;
1684
1685 /* Recurse down the SLP tree. */
1686 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1687 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1688 vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1689 body_cost_vec, ncopies_for_cost);
1690
1691 /* Look at the first scalar stmt to determine the cost. */
1692 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1693 stmt_info = vinfo_for_stmt (stmt);
1694 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1695 {
1696 vect_memory_access_type memory_access_type
1697 = (STMT_VINFO_STRIDED_P (stmt_info)
1698 ? VMAT_STRIDED_SLP
1699 : VMAT_CONTIGUOUS);
1700 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1701 vect_model_store_cost (stmt_info, ncopies_for_cost,
1702 memory_access_type, vect_uninitialized_def,
1703 node, prologue_cost_vec, body_cost_vec);
1704 else
1705 {
1706 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1707 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1708 {
1709 /* If the load is permuted then the alignment is determined by
1710 the first group element not by the first scalar stmt DR. */
1711 stmt = GROUP_FIRST_ELEMENT (stmt_info);
1712 stmt_info = vinfo_for_stmt (stmt);
1713 /* Record the cost for the permutation. */
1714 unsigned n_perms;
1715 vect_transform_slp_perm_load (node, vNULL, NULL,
1716 ncopies_for_cost, instance, true,
1717 &n_perms);
1718 record_stmt_cost (body_cost_vec, n_perms, vec_perm,
1719 stmt_info, 0, vect_body);
1720 unsigned nunits
1721 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1722 /* And adjust the number of loads performed. This handles
1723 redundancies as well as loads that are later dead. */
1724 auto_sbitmap perm (GROUP_SIZE (stmt_info));
1725 bitmap_clear (perm);
1726 for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
1727 bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
1728 ncopies_for_cost = 0;
1729 bool load_seen = false;
1730 for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
1731 {
1732 if (i % nunits == 0)
1733 {
1734 if (load_seen)
1735 ncopies_for_cost++;
1736 load_seen = false;
1737 }
1738 if (bitmap_bit_p (perm, i))
1739 load_seen = true;
1740 }
1741 if (load_seen)
1742 ncopies_for_cost++;
1743 gcc_assert (ncopies_for_cost
1744 <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
1745 + nunits - 1) / nunits);
1746 ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
1747 }
1748 /* Record the cost for the vector loads. */
1749 vect_model_load_cost (stmt_info, ncopies_for_cost,
1750 memory_access_type, node, prologue_cost_vec,
1751 body_cost_vec);
1752 return;
1753 }
1754 }
1755 else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1756 {
1757 /* ncopies_for_cost is the number of IVs we generate. */
1758 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1759 stmt_info, 0, vect_body);
1760
1761 /* Prologue cost for the initial values and step vector. */
1762 record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1763 CONSTANT_CLASS_P
1764 (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1765 (stmt_info))
1766 ? vector_load : vec_construct,
1767 stmt_info, 0, vect_prologue);
1768 record_stmt_cost (prologue_cost_vec, 1,
1769 CONSTANT_CLASS_P
1770 (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1771 ? vector_load : vec_construct,
1772 stmt_info, 0, vect_prologue);
1773
1774 /* ??? No easy way to get at the actual number of vector stmts
1775 to be geneated and thus the derived IVs. */
1776 }
1777 else
1778 {
1779 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1780 stmt_info, 0, vect_body);
1781 if (SLP_TREE_TWO_OPERATORS (node))
1782 {
1783 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1784 stmt_info, 0, vect_body);
1785 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1786 stmt_info, 0, vect_body);
1787 }
1788 }
1789
1790 /* Push SLP node def-type to stmts. */
1791 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1792 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1793 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1794 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1795
1796 /* Scan operands and account for prologue cost of constants/externals.
1797 ??? This over-estimates cost for multiple uses and should be
1798 re-engineered. */
1799 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1800 lhs = gimple_get_lhs (stmt);
1801 for (i = 0; i < gimple_num_ops (stmt); ++i)
1802 {
1803 tree op = gimple_op (stmt, i);
1804 gimple *def_stmt;
1805 enum vect_def_type dt;
1806 if (!op || op == lhs)
1807 continue;
1808 if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt))
1809 {
1810 /* Without looking at the actual initializer a vector of
1811 constants can be implemented as a load from the constant pool.
1812 ??? We need to pass down stmt_info for a vector type
1813 even if it points to the wrong stmt. */
1814 if (dt == vect_constant_def)
1815 record_stmt_cost (prologue_cost_vec, 1, vector_load,
1816 stmt_info, 0, vect_prologue);
1817 else if (dt == vect_external_def)
1818 record_stmt_cost (prologue_cost_vec, 1, vec_construct,
1819 stmt_info, 0, vect_prologue);
1820 }
1821 }
1822
1823 /* Restore stmt def-types. */
1824 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1825 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1826 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1827 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
1828 }
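
The ncopies adjustment above is worth seeing in isolation: a permuted load only needs a vector load for each nunits-wide chunk of the group that contains at least one index the permutation actually uses; chunks never referenced are dead loads and are not costed. A standalone sketch of that counting loop (plain C, invented names):

#include <stdbool.h>
#include <stdio.h>

/* One vector load per NUNITS-wide chunk of a GROUP_SIZE group that
   contains an element used by PERM.  */
static unsigned
count_vector_loads (const unsigned *perm, unsigned perm_len,
                    unsigned group_size, unsigned nunits)
{
  bool used[64] = { false };  /* sketch limit in place of a bitmap */
  for (unsigned i = 0; i < perm_len; ++i)
    used[perm[i]] = true;

  unsigned ncopies = 0;
  bool load_seen = false;
  for (unsigned i = 0; i < group_size; ++i)
    {
      if (i % nunits == 0)
        {
          if (load_seen)
            ncopies++;
          load_seen = false;
        }
      if (used[i])
        load_seen = true;
    }
  if (load_seen)
    ncopies++;
  return ncopies;
}

int
main (void)
{
  /* Group of 8, vectors of 4, only elements 0 and 2 used: the second
     vector (elements 4-7) is never needed, so one load is costed.  */
  unsigned perm[2] = { 0, 2 };
  printf ("%u\n", count_vector_loads (perm, 2, 8, 4));  /* prints 1 */
  return 0;
}
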
1829
1830 /* Compute the cost for the SLP instance INSTANCE. */
1831
1832 static void
1833 vect_analyze_slp_cost (slp_instance instance, void *data)
1834 {
1835 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1836 unsigned ncopies_for_cost;
1837 stmt_info_for_cost *si;
1838 unsigned i;
1839
1840 if (dump_enabled_p ())
1841 dump_printf_loc (MSG_NOTE, vect_location,
1842 "=== vect_analyze_slp_cost ===\n");
1843
1844 /* Calculate the number of vector stmts to create based on the unrolling
1845 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1846 GROUP_SIZE / NUNITS otherwise). */
1847 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1848 slp_tree node = SLP_INSTANCE_TREE (instance);
1849 stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1850 /* Adjust the group_size by the vectorization factor which is always one
1851 for basic-block vectorization. */
1852 if (STMT_VINFO_LOOP_VINFO (stmt_info))
1853 group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
1854 unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1855 /* For reductions look at a reduction operand in case the reduction
1856 operation is widening like DOT_PROD or SAD. */
1857 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1858 {
1859 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1860 switch (gimple_assign_rhs_code (stmt))
1861 {
1862 case DOT_PROD_EXPR:
1863 case SAD_EXPR:
1864 nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
1865 (TREE_TYPE (gimple_assign_rhs1 (stmt))));
1866 break;
1867 default:;
1868 }
1869 }
1870 ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
1871
1872 prologue_cost_vec.create (10);
1873 body_cost_vec.create (10);
1874 vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
1875 &prologue_cost_vec, &body_cost_vec,
1876 ncopies_for_cost);
1877
1878 /* Record the prologue costs, which were delayed until we were
1879 sure that SLP was successful. */
1880 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1881 {
1882 struct _stmt_vec_info *stmt_info
1883 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1884 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1885 si->misalign, vect_prologue);
1886 }
1887
1888 /* Record the instance's instructions in the target cost model. */
1889 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1890 {
1891 struct _stmt_vec_info *stmt_info
1892 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1893 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1894 si->misalign, vect_body);
1895 }
1896
1897 prologue_cost_vec.release ();
1898 body_cost_vec.release ();
1899 }
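
The ncopies_for_cost formula simplifies to group_size / gcd (nunits, group_size): 1 when the group fits in one vector, group_size / nunits for exact multiples, and for instance 3 when group_size is 6 and nunits is 4 (two unrolled copies of the group fill three vectors). A standalone arithmetic sketch:

#include <stdio.h>

static unsigned
gcd (unsigned a, unsigned b)
{
  while (b)
    {
      unsigned t = a % b;
      a = b;
      b = t;
    }
  return a;
}

/* Equivalent to least_common_multiple (nunits, group_size) / nunits.  */
static unsigned
ncopies_for_cost (unsigned nunits, unsigned group_size)
{
  return group_size / gcd (nunits, group_size);
}

int
main (void)
{
  printf ("%u %u %u\n",
          ncopies_for_cost (4, 2),   /* 1: group fits in one vector */
          ncopies_for_cost (4, 8),   /* 2: two full vectors */
          ncopies_for_cost (4, 6));  /* 3: unroll by 2, three vectors */
  return 0;
}
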
1900
1901 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
1902 one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
1903 the first GROUP1_SIZE stmts, since stores are consecutive), the second
1904 containing the remainder.
1905 Return the first stmt in the second group. */
1906
1907 static gimple *
1908 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
1909 {
1910 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
1911 gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
1912 gcc_assert (group1_size > 0);
1913 int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
1914 gcc_assert (group2_size > 0);
1915 GROUP_SIZE (first_vinfo) = group1_size;
1916
1917 gimple *stmt = first_stmt;
1918 for (unsigned i = group1_size; i > 1; i--)
1919 {
1920 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
1921 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
1922 }
1923 /* STMT is now the last element of the first group. */
1924 gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
1925 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
1926
1927 GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
1928 for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
1929 {
1930 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
1931 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
1932 }
1933
1934 /* For the second group, the GROUP_GAP is that before the original group,
1935 plus skipping over the first vector. */
1936 GROUP_GAP (vinfo_for_stmt (group2)) =
1937 GROUP_GAP (first_vinfo) + group1_size;
1938
1939 /* GROUP_GAP of the first group now has to skip over the second group too. */
1940 GROUP_GAP (first_vinfo) += group2_size;
1941
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1944 group1_size, group2_size);
1945
1946 return group2;
1947 }
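
The split walks group1_size - 1 links from the head, cuts the chain there, repoints every tail element at its new head, and then makes each subgroup's gap skip over the other's elements. A standalone sketch over a toy singly-linked group (struct g_elt is invented; GCC stores these links in stmt_vec_info):

#include <stddef.h>

struct g_elt { struct g_elt *next, *first; unsigned size, gap; };

/* Split the group headed by FIRST after GROUP1_SIZE elements and
   return the head of the second group.  */
static struct g_elt *
split_group (struct g_elt *first, unsigned group1_size)
{
  unsigned group2_size = first->size - group1_size;
  first->size = group1_size;

  struct g_elt *stmt = first;
  for (unsigned i = group1_size; i > 1; i--)
    stmt = stmt->next;           /* now the last element of group 1 */
  struct g_elt *group2 = stmt->next;
  stmt->next = NULL;

  group2->size = group2_size;
  for (stmt = group2; stmt; stmt = stmt->next)
    stmt->first = group2;

  /* Group 2 keeps the original leading gap plus group 1's elements;
     group 1's gap grows to step over group 2's elements.  */
  group2->gap = first->gap + group1_size;
  first->gap += group2_size;
  return group2;
}
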
1948
1949 /* Analyze an SLP instance starting from a group of grouped stores. Call
1135 vect_build_slp_tree to build a tree of packed stmts if possible. 1950 vect_build_slp_tree to build a tree of packed stmts if possible.
1136 Return FALSE if it's impossible to SLP any stmt in the loop. */ 1951 Return FALSE if it's impossible to SLP any stmt in the loop. */
1137 1952
1138 static bool 1953 static bool
1139 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, 1954 vect_analyze_slp_instance (vec_info *vinfo,
1140 gimple stmt) 1955 gimple *stmt, unsigned max_tree_size)
1141 { 1956 {
1142 slp_instance new_instance; 1957 slp_instance new_instance;
1143 slp_tree node = XNEW (struct _slp_tree); 1958 slp_tree node;
1144 unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt)); 1959 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1145 unsigned int unrolling_factor = 1, nunits; 1960 unsigned int unrolling_factor = 1, nunits;
1146 tree vectype, scalar_type = NULL_TREE; 1961 tree vectype, scalar_type = NULL_TREE;
1147 gimple next; 1962 gimple *next;
1148 unsigned int vectorization_factor = 0; 1963 unsigned int i;
1149 int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i;
1150 unsigned int max_nunits = 0; 1964 unsigned int max_nunits = 0;
1151 VEC (int, heap) *load_permutation; 1965 vec<slp_tree> loads;
1152 VEC (slp_tree, heap) *loads;
1153 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); 1966 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1154 1967 vec<gimple *> scalar_stmts;
1155 if (dr) 1968
1156 { 1969 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1157 scalar_type = TREE_TYPE (DR_REF (dr)); 1970 {
1158 vectype = get_vectype_for_scalar_type (scalar_type); 1971 if (dr)
1159 group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt)); 1972 {
1973 scalar_type = TREE_TYPE (DR_REF (dr));
1974 vectype = get_vectype_for_scalar_type (scalar_type);
1975 }
1976 else
1977 {
1978 gcc_assert (is_a <loop_vec_info> (vinfo));
1979 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1980 }
1981
1982 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1160 } 1983 }
1161 else 1984 else
1162 { 1985 {
1163 gcc_assert (loop_vinfo); 1986 gcc_assert (is_a <loop_vec_info> (vinfo));
1164 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 1987 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1165 group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo)); 1988 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1166 } 1989 }
1167 1990
1168 if (!vectype) 1991 if (!vectype)
1169 { 1992 {
1170 if (vect_print_dump_info (REPORT_SLP)) 1993 if (dump_enabled_p ())
1171 { 1994 {
1172 fprintf (vect_dump, "Build SLP failed: unsupported data-type "); 1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1173 print_generic_expr (vect_dump, scalar_type, TDF_SLIM); 1996 "Build SLP failed: unsupported data-type ");
1997 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1998 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1174 } 1999 }
1175 2000
1176 return false; 2001 return false;
1177 } 2002 }
1178
1179 nunits = TYPE_VECTOR_SUBPARTS (vectype); 2003 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1180 if (loop_vinfo) 2004
1181 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2005 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1182 else 2006 scalar_stmts.create (group_size);
1183 /* No multitypes in BB SLP. */
1184 vectorization_factor = nunits;
1185
1186 /* Calculate the unrolling factor. */
1187 unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1188 if (unrolling_factor != 1 && !loop_vinfo)
1189 {
1190 if (vect_print_dump_info (REPORT_SLP))
1191 fprintf (vect_dump, "Build SLP failed: unrolling required in basic"
1192 " block SLP");
1193
1194 return false;
1195 }
1196
1197 /* Create a node (a root of the SLP tree) for the packed strided stores. */
1198 SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
1199 next = stmt; 2007 next = stmt;
1200 if (dr) 2008 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1201 { 2009 {
1202 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ 2010 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1203 while (next) 2011 while (next)
1204 { 2012 {
1205 VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next); 2013 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1206 next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); 2014 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2015 scalar_stmts.safe_push (
2016 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2017 else
2018 scalar_stmts.safe_push (next);
2019 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1207 } 2020 }
2021 /* Mark the first element of the reduction chain as reduction to properly
2022 transform the node. In the reduction analysis phase only the last
2023 element of the chain is marked as reduction. */
2024 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2025 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
1208 } 2026 }
1209 else 2027 else
1210 { 2028 {
1211 /* Collect reduction statements. */ 2029 /* Collect reduction statements. */
1212 for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i, 2030 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1213 next); 2031 for (i = 0; reductions.iterate (i, &next); i++)
1214 i++) 2032 scalar_stmts.safe_push (next);
2033 }
2034
2035 loads.create (group_size);
2036
2037 /* Build the tree for the SLP instance. */
2038 bool *matches = XALLOCAVEC (bool, group_size);
2039 unsigned npermutes = 0;
2040 bst_fail = new hash_set <vec <gimple *>, bst_traits> ();
2041 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2042 &max_nunits, &loads, matches, &npermutes,
2043 NULL, max_tree_size);
2044 delete bst_fail;
2045 if (node != NULL)
2046 {
2047 /* Calculate the unrolling factor based on the smallest type. */
2048 unrolling_factor
2049 = least_common_multiple (max_nunits, group_size) / group_size;
2050
2051 if (unrolling_factor != 1
2052 && is_a <bb_vec_info> (vinfo))
2053 {
2054
2055 if (max_nunits > group_size)
1215 { 2056 {
1216 VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next); 2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1217 if (vect_print_dump_info (REPORT_DETAILS)) 2058 "Build SLP failed: store group "
1218 { 2059 "size not a multiple of the vector size "
1219 fprintf (vect_dump, "pushing reduction into node: "); 2060 "in basic block SLP\n");
1220 print_gimple_stmt (vect_dump, next, 0, TDF_SLIM); 2061 vect_free_slp_tree (node);
1221 } 2062 loads.release ();
2063 return false;
1222 } 2064 }
1223 } 2065 /* Fatal mismatch. */
1224 2066 matches[group_size/max_nunits * max_nunits] = false;
1225 SLP_TREE_VEC_STMTS (node) = NULL; 2067 vect_free_slp_tree (node);
1226 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; 2068 loads.release ();
1227 SLP_TREE_LEFT (node) = NULL; 2069 }
1228 SLP_TREE_RIGHT (node) = NULL; 2070 else
1229 SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0; 2071 {
1230 SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0;
1231
1232 /* Calculate the number of vector stmts to create based on the unrolling
1233 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1234 GROUP_SIZE / NUNITS otherwise. */
1235 ncopies_for_cost = unrolling_factor * group_size / nunits;
1236
1237 load_permutation = VEC_alloc (int, heap, group_size * group_size);
1238 loads = VEC_alloc (slp_tree, heap, group_size);
1239
1240 /* Build the tree for the SLP instance. */
1241 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1242 &inside_cost, &outside_cost, ncopies_for_cost,
1243 &max_nunits, &load_permutation, &loads,
1244 vectorization_factor))
1245 {
1246 /* Create a new SLP instance. */ 2072 /* Create a new SLP instance. */
1247 new_instance = XNEW (struct _slp_instance); 2073 new_instance = XNEW (struct _slp_instance);
1248 SLP_INSTANCE_TREE (new_instance) = node; 2074 SLP_INSTANCE_TREE (new_instance) = node;
1249 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; 2075 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1250 /* Calculate the unrolling factor based on the smallest type in the
1251 loop. */
1252 if (max_nunits > nunits)
1253 unrolling_factor = least_common_multiple (max_nunits, group_size)
1254 / group_size;
1255
1256 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; 2076 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1257 SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
1258 SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
1259 SLP_INSTANCE_LOADS (new_instance) = loads; 2077 SLP_INSTANCE_LOADS (new_instance) = loads;
1260 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL; 2078
1261 SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation; 2079 /* Compute the load permutation. */
1262 if (VEC_length (slp_tree, loads)) 2080 slp_tree load_node;
2081 bool loads_permuted = false;
2082 FOR_EACH_VEC_ELT (loads, i, load_node)
2083 {
2084 vec<unsigned> load_permutation;
2085 int j;
2086 gimple *load, *first_stmt;
2087 bool this_load_permuted = false;
2088 load_permutation.create (group_size);
2089 first_stmt = GROUP_FIRST_ELEMENT
2090 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2091 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2092 {
2093 int load_place = vect_get_place_in_interleaving_chain
2094 (load, first_stmt);
2095 gcc_assert (load_place != -1);
2096 if (load_place != j)
2097 this_load_permuted = true;
2098 load_permutation.safe_push (load_place);
2099 }
2100 if (!this_load_permuted
2101 /* The load requires permutation when unrolling exposes
2102 a gap either because the group is larger than the SLP
2103 group-size or because there is a gap between the groups. */
2104 && (unrolling_factor == 1
2105 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2106 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2107 {
2108 load_permutation.release ();
2109 continue;
2110 }
2111 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2112 loads_permuted = true;
2113 }
2114
2115 if (loads_permuted)
1263 { 2116 {
1264 if (!vect_supported_load_permutation_p (new_instance, group_size, 2117 if (!vect_supported_load_permutation_p (new_instance))
1265 load_permutation))
1266 { 2118 {
1267 if (vect_print_dump_info (REPORT_SLP)) 2119 if (dump_enabled_p ())
1268 { 2120 {
1269 fprintf (vect_dump, "Build SLP failed: unsupported load " 2121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 "permutation "); 2122 "Build SLP failed: unsupported load "
1271 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); 2123 "permutation ");
2124 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2125 TDF_SLIM, stmt, 0);
1272 } 2126 }
1273
1274 vect_free_slp_instance (new_instance); 2127 vect_free_slp_instance (new_instance);
1275 return false; 2128 return false;
1276 } 2129 }
1277
1278 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1279 = vect_find_first_load_in_slp_instance (new_instance);
1280 } 2130 }
1281 else 2131
1282 VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance)); 2132 /* If the loads and stores can be handled with load/store-lan
1283 2133 instructions do not generate this SLP instance. */
1284 if (loop_vinfo) 2134 if (is_a <loop_vec_info> (vinfo)
1285 VEC_safe_push (slp_instance, heap, 2135 && loads_permuted
1286 LOOP_VINFO_SLP_INSTANCES (loop_vinfo), 2136 && dr && vect_store_lanes_supported (vectype, group_size))
1287 new_instance); 2137 {
1288 else 2138 slp_tree load_node;
1289 VEC_safe_push (slp_instance, heap, BB_VINFO_SLP_INSTANCES (bb_vinfo), 2139 FOR_EACH_VEC_ELT (loads, i, load_node)
1290 new_instance); 2140 {
1291 2141 gimple *first_stmt = GROUP_FIRST_ELEMENT
1292 if (vect_print_dump_info (REPORT_SLP)) 2142 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1293 vect_print_slp_tree (node); 2143 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2144 /* Use SLP for strided accesses (or if we
2145 can't use load-lanes). */
2146 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2147 || ! vect_load_lanes_supported
2148 (STMT_VINFO_VECTYPE (stmt_vinfo),
2149 GROUP_SIZE (stmt_vinfo)))
2150 break;
2151 }
2152 if (i == loads.length ())
2153 {
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156 "Built SLP cancelled: can use "
2157 "load/store-lanes\n");
2158 vect_free_slp_instance (new_instance);
2159 return false;
2160 }
2161 }
2162
2163 vinfo->slp_instances.safe_push (new_instance);
2164
2165 if (dump_enabled_p ())
2166 {
2167 dump_printf_loc (MSG_NOTE, vect_location,
2168 "Final SLP tree for instance:\n");
2169 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2170 }
1294 2171
1295 return true; 2172 return true;
1296 } 2173 }
1297 2174 }
2175 else
2176 {
1298 /* Failed to SLP. */ 2177 /* Failed to SLP. */
1299 /* Free the allocated memory. */ 2178 /* Free the allocated memory. */
1300 vect_free_slp_tree (node); 2179 scalar_stmts.release ();
1301 VEC_free (int, heap, load_permutation); 2180 loads.release ();
1302 VEC_free (slp_tree, heap, loads); 2181 }
2182
2183 /* For basic block SLP, try to break the group up into multiples of the
2184 vector size. */
2185 if (is_a <bb_vec_info> (vinfo)
2186 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2187 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2188 {
2189 /* We consider breaking the group only on VF boundaries from the existing
2190 start. */
2191 for (i = 0; i < group_size; i++)
2192 if (!matches[i]) break;
2193
2194 if (i >= nunits && i < group_size)
2195 {
2196 /* Split into two groups at the first vector boundary before i. */
2197 gcc_assert ((nunits & (nunits - 1)) == 0);
2198 unsigned group1_size = i & ~(nunits - 1);
2199
2200 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2201 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2202 /* If the first non-match was in the middle of a vector,
2203 skip the rest of that vector. */
2204 if (group1_size < i)
2205 {
2206 i = group1_size + nunits;
2207 if (i < group_size)
2208 rest = vect_split_slp_store_group (rest, nunits);
2209 }
2210 if (i < group_size)
2211 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2212 return res;
2213 }
2214 /* Even though the first vector did not all match, we might be able to SLP
2215 (some) of the remainder. FORNOW ignore this possibility. */
2216 }
1303 2217
1304 return false; 2218 return false;
1305 } 2219 }
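
The boundary computation in the splitting code above depends on nunits being a power of two: i & ~(nunits - 1) rounds the index of the first mismatched stmt down to a whole-vector boundary. A tiny standalone demonstration:

#include <stdio.h>

int
main (void)
{
  unsigned nunits = 4;
  for (unsigned i = 4; i < 8; ++i)  /* index of first mismatch */
    printf ("i=%u -> group1_size=%u\n", i, i & ~(nunits - 1));
  /* All four print group1_size=4: a mismatch anywhere inside the
     second vector splits the group right after the first vector.  */
  return 0;
}
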
1306 2220
1307 2221
1308 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP 2222
1309 trees of packed scalar stmts if SLP is possible. */ 2223 trees of packed scalar stmts if SLP is possible. */
1310 2224
1311 bool 2225 bool
1312 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) 2226 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
1313 { 2227 {
1314 unsigned int i; 2228 unsigned int i;
1315 VEC (gimple, heap) *strided_stores, *reductions = NULL; 2229 gimple *first_element;
1316 gimple store; 2230
1317 bool ok = false; 2231 if (dump_enabled_p ())
1318 2232 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1319 if (vect_print_dump_info (REPORT_SLP)) 2233
1320 fprintf (vect_dump, "=== vect_analyze_slp ==="); 2234 /* Find SLP sequences starting from groups of grouped stores. */
1321 2235 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
1322 if (loop_vinfo) 2236 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
1323 { 2237
1324 strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo); 2238 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1325 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo); 2239 {
1326 } 2240 if (loop_vinfo->reduction_chains.length () > 0)
1327 else 2241 {
1328 strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo); 2242 /* Find SLP sequences starting from reduction chains. */
1329 2243 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
1330 /* Find SLP sequences starting from groups of strided stores. */ 2244 if (! vect_analyze_slp_instance (vinfo, first_element,
1331 FOR_EACH_VEC_ELT (gimple, strided_stores, i, store) 2245 max_tree_size))
1332 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store)) 2246 {
1333 ok = true; 2247 /* Dissolve reduction chain group. */
1334 2248 gimple *next, *stmt = first_element;
1335 if (bb_vinfo && !ok) 2249 while (stmt)
1336 { 2250 {
1337 if (vect_print_dump_info (REPORT_SLP)) 2251 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
1338 fprintf (vect_dump, "Failed to SLP the basic block."); 2252 next = GROUP_NEXT_ELEMENT (vinfo);
1339 2253 GROUP_FIRST_ELEMENT (vinfo) = NULL;
1340 return false; 2254 GROUP_NEXT_ELEMENT (vinfo) = NULL;
1341 } 2255 stmt = next;
1342 2256 }
1343 /* Find SLP sequences starting from groups of reductions. */ 2257 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
1344 if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo)) > 1 2258 = vect_internal_def;
1345 && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, 2259 }
1346 VEC_index (gimple, reductions, 0))) 2260 }
1347 ok = true; 2261
2262 /* Find SLP sequences starting from groups of reductions. */
2263 if (loop_vinfo->reductions.length () > 1)
2264 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2265 max_tree_size);
2266 }
1348 2267
1349 return true; 2268 return true;
1350 } 2269 }
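
When a reduction chain fails to build, the code above dissolves it back into independent stmts by clearing the group links before retrying the head as a plain reduction. A standalone sketch of that unlink loop (toy struct, invented fields):

#include <stddef.h>

struct chain_stmt { struct chain_stmt *next, *first; };

/* Clear the first/next links of every element so the stmts are
   ungrouped again, as the dissolve loop above does.  */
static void
dissolve_chain (struct chain_stmt *head)
{
  struct chain_stmt *stmt = head;
  while (stmt)
    {
      struct chain_stmt *next = stmt->next;
      stmt->first = NULL;
      stmt->next = NULL;
      stmt = next;
    }
}
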
1351 2270
1352 2271
1353 /* For each possible SLP instance decide whether to SLP it and calculate overall 2272 /* For each possible SLP instance decide whether to SLP it and calculate overall
1354 unrolling factor needed to SLP the loop. */ 2273 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1355 2274 least one instance. */
1356 void 2275
2276 bool
1357 vect_make_slp_decision (loop_vec_info loop_vinfo) 2277 vect_make_slp_decision (loop_vec_info loop_vinfo)
1358 { 2278 {
1359 unsigned int i, unrolling_factor = 1; 2279 unsigned int i, unrolling_factor = 1;
1360 VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2280 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1361 slp_instance instance; 2281 slp_instance instance;
1362 int decided_to_slp = 0; 2282 int decided_to_slp = 0;
1363 2283
1364 if (vect_print_dump_info (REPORT_SLP)) 2284 if (dump_enabled_p ())
1365 fprintf (vect_dump, "=== vect_make_slp_decision ==="); 2285 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1366 2286 "\n");
1367 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) 2287
2288 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1368 { 2289 {
1369 /* FORNOW: SLP if you can. */ 2290 /* FORNOW: SLP if you can. */
1370 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance)) 2291 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1371 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance); 2292 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1372 2293
1377 decided_to_slp++; 2298 decided_to_slp++;
1378 } 2299 }
1379 2300
1380 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; 2301 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1381 2302
1382 if (decided_to_slp && vect_print_dump_info (REPORT_SLP)) 2303 if (decided_to_slp && dump_enabled_p ())
1383 fprintf (vect_dump, "Decided to SLP %d instances. Unrolling factor %d", 2304 dump_printf_loc (MSG_NOTE, vect_location,
1384 decided_to_slp, unrolling_factor); 2305 "Decided to SLP %d instances. Unrolling factor %d\n",
2306 decided_to_slp, unrolling_factor);
2307
2308 return (decided_to_slp > 0);
1385 } 2309 }
1386 2310
1387 2311
1388 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that 2312 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1389 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */ 2313 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1390 2314
1391 static void 2315 static void
1392 vect_detect_hybrid_slp_stmts (slp_tree node) 2316 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1393 { 2317 {
1394 int i; 2318 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1395 gimple stmt;
1396 imm_use_iterator imm_iter; 2319 imm_use_iterator imm_iter;
1397 gimple use_stmt; 2320 gimple *use_stmt;
1398 stmt_vec_info stmt_vinfo; 2321 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1399 2322 slp_tree child;
1400 if (!node) 2323 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1401 return; 2324 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 2325 int j;
1403 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) 2326
1404 if (PURE_SLP_STMT (vinfo_for_stmt (stmt)) 2327 /* Propagate hybrid down the SLP tree. */
1405 && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME) 2328 if (stype == hybrid)
1406 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0)) 2329 ;
1407 if ((stmt_vinfo = vinfo_for_stmt (use_stmt)) 2330 else if (HYBRID_SLP_STMT (stmt_vinfo))
1408 && !STMT_SLP_TYPE (stmt_vinfo) 2331 stype = hybrid;
1409 && (STMT_VINFO_RELEVANT (stmt_vinfo) 2332 else
1410 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))) 2333 {
1411 && !(gimple_code (use_stmt) == GIMPLE_PHI 2334 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
1412 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt)) 2335 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1413 == vect_reduction_def)) 2336 /* If we get a pattern stmt here we have to use the LHS of the
1414 vect_mark_slp_stmts (node, hybrid, i); 2337 original stmt for immediate uses. */
1415 2338 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
1416 vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node)); 2339 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
1417 vect_detect_hybrid_slp_stmts (SLP_TREE_RIGHT (node)); 2340 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1418 } 2341 tree def;
1419 2342 if (gimple_code (stmt) == GIMPLE_PHI)
2343 def = gimple_phi_result (stmt);
2344 else
2345 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2346 if (def)
2347 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2348 {
2349 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2350 continue;
2351 use_vinfo = vinfo_for_stmt (use_stmt);
2352 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2353 && STMT_VINFO_RELATED_STMT (use_vinfo))
2354 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2355 if (!STMT_SLP_TYPE (use_vinfo)
2356 && (STMT_VINFO_RELEVANT (use_vinfo)
2357 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2358 && !(gimple_code (use_stmt) == GIMPLE_PHI
2359 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2360 {
2361 if (dump_enabled_p ())
2362 {
2363 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2364 "def in non-SLP stmt: ");
2365 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2366 }
2367 stype = hybrid;
2368 }
2369 }
2370 }
2371
2372 if (stype == hybrid
2373 && !HYBRID_SLP_STMT (stmt_vinfo))
2374 {
2375 if (dump_enabled_p ())
2376 {
2377 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2378 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2379 }
2380 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2381 }
2382
2383 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2384 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2385 vect_detect_hybrid_slp_stmts (child, i, stype);
2386 }
2387
2388 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2389
2390 static tree
2391 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2392 {
2393 walk_stmt_info *wi = (walk_stmt_info *)data;
2394 struct loop *loopp = (struct loop *)wi->info;
2395
2396 if (wi->is_lhs)
2397 return NULL_TREE;
2398
2399 if (TREE_CODE (*tp) == SSA_NAME
2400 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2401 {
2402 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2403 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2404 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2405 {
2406 if (dump_enabled_p ())
2407 {
2408 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2409 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2410 }
2411 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2412 }
2413 }
2414
2415 return NULL_TREE;
2416 }
2417
2418 static tree
2419 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2420 walk_stmt_info *)
2421 {
2422 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2423 /* If the stmt is in a SLP instance then this isn't a reason
2424 to mark use definitions in other SLP instances as hybrid. */
2425 if (! STMT_SLP_TYPE (use_vinfo)
2426 && (STMT_VINFO_RELEVANT (use_vinfo)
2427 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2428 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2429 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2430 ;
2431 else
2432 *handled = true;
2433 return NULL_TREE;
2434 }
1420 2435
1421 /* Find stmts that must be both vectorized and SLPed. */ 2436 /* Find stmts that must be both vectorized and SLPed. */
1422 2437
1423 void 2438 void
1424 vect_detect_hybrid_slp (loop_vec_info loop_vinfo) 2439 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1425 { 2440 {
1426 unsigned int i; 2441 unsigned int i;
1427 VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2442 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1428 slp_instance instance; 2443 slp_instance instance;
1429 2444
1430 if (vect_print_dump_info (REPORT_SLP)) 2445 if (dump_enabled_p ())
1431 fprintf (vect_dump, "=== vect_detect_hybrid_slp ==="); 2446 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1432 2447 "\n");
1433 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) 2448
1434 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance)); 2449 /* First walk all pattern stmts in the loop and mark defs of uses as
1435 } 2450 hybrid because immediate uses in them are not recorded. */
1436 2451 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1437 2452 {
1438 /* Create and initialize a new bb_vec_info struct for BB, as well as 2453 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1439 stmt_vec_info structs for all the stmts in it. */ 2454 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1440 2455 gsi_next (&gsi))
1441 static bb_vec_info 2456 {
1442 new_bb_vec_info (basic_block bb) 2457 gimple *stmt = gsi_stmt (gsi);
1443 { 2458 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1444 bb_vec_info res = NULL; 2459 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2460 {
2461 walk_stmt_info wi;
2462 memset (&wi, 0, sizeof (wi));
2463 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2464 gimple_stmt_iterator gsi2
2465 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2466 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2467 vect_detect_hybrid_slp_1, &wi);
2468 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2469 vect_detect_hybrid_slp_2,
2470 vect_detect_hybrid_slp_1, &wi);
2471 }
2472 }
2473 }
2474
2475 /* Then walk the SLP instance trees marking stmts with uses in
2476 non-SLP stmts as hybrid, also propagating hybrid down the
2477 SLP tree, collecting the above info on-the-fly. */
2478 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2479 {
2480 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2481 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2482 i, pure_slp);
2483 }
2484 }
2485
2486
2487 /* Initialize a bb_vec_info struct for the statements between
2488 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2489
2490 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2491 gimple_stmt_iterator region_end_in)
2492 : vec_info (vec_info::bb, init_cost (NULL)),
2493 bb (gsi_bb (region_begin_in)),
2494 region_begin (region_begin_in),
2495 region_end (region_end_in)
2496 {
1445 gimple_stmt_iterator gsi; 2497 gimple_stmt_iterator gsi;
1446 2498
1447 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info)); 2499 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
1448 BB_VINFO_BB (res) = bb; 2500 gsi_next (&gsi))
1449 2501 {
1450 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 2502 gimple *stmt = gsi_stmt (gsi);
1451 {
1452 gimple stmt = gsi_stmt (gsi);
1453 gimple_set_uid (stmt, 0); 2503 gimple_set_uid (stmt, 0);
1454 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res)); 2504 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1455 } 2505 }
1456 2506
1457 BB_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); 2507 bb->aux = this;
1458 BB_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 2);
1459
1460 bb->aux = res;
1461 return res;
1462 } 2508 }
1463 2509
1464 2510
1465 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the 2511 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1466 stmts in the basic block. */ 2512 stmts in the basic block. */
1467 2513
1468 static void 2514 _bb_vec_info::~_bb_vec_info ()
1469 destroy_bb_vec_info (bb_vec_info bb_vinfo) 2515 {
1470 { 2516 for (gimple_stmt_iterator si = region_begin;
1471 basic_block bb; 2517 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
1472 gimple_stmt_iterator si; 2518 {
1473 2519 gimple *stmt = gsi_stmt (si);
1474 if (!bb_vinfo)
1475 return;
1476
1477 bb = BB_VINFO_BB (bb_vinfo);
1478
1479 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1480 {
1481 gimple stmt = gsi_stmt (si);
1482 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2520 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1483 2521
1484 if (stmt_info) 2522 if (stmt_info)
1485 /* Free stmt_vec_info. */ 2523 /* Free stmt_vec_info. */
1486 free_stmt_vec_info (stmt); 2524 free_stmt_vec_info (stmt);
1487 } 2525
1488 2526 /* Reset region marker. */
1489 free_data_refs (BB_VINFO_DATAREFS (bb_vinfo)); 2527 gimple_set_uid (stmt, -1);
1490 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo)); 2528 }
1491 VEC_free (gimple, heap, BB_VINFO_STRIDED_STORES (bb_vinfo)); 2529
1492 VEC_free (slp_instance, heap, BB_VINFO_SLP_INSTANCES (bb_vinfo));
1493 free (bb_vinfo);
1494 bb->aux = NULL; 2530 bb->aux = NULL;
1495 } 2531 }
1496 2532
1497 2533
1498 /* Analyze statements contained in SLP tree node after recursively analyzing 2534 /* Analyze statements contained in SLP tree NODE after recursively analyzing
1499 the subtree. Return TRUE if the operations are supported. */ 2535 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2536
2537 Return true if the operations are supported. */
1500 2538
1501 static bool 2539 static bool
1502 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node) 2540 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2541 slp_instance node_instance)
1503 { 2542 {
1504 bool dummy; 2543 bool dummy;
1505 int i; 2544 int i, j;
1506 gimple stmt; 2545 gimple *stmt;
1507 2546 slp_tree child;
1508 if (!node) 2547
2548 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1509 return true; 2549 return true;
1510 2550
1511 if (!vect_slp_analyze_node_operations (bb_vinfo, SLP_TREE_LEFT (node)) 2551 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1512 || !vect_slp_analyze_node_operations (bb_vinfo, SLP_TREE_RIGHT (node))) 2552 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2553 return false;
2554
2555 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2556 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2557 gcc_assert (stmt_info);
2558 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2559
2560 /* For BB vectorization vector types are assigned here.
2561 Memory accesses already got their vector type assigned
2562 in vect_analyze_data_refs. */
2563 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2564 if (bb_vinfo
2565 && ! STMT_VINFO_DATA_REF (stmt_info))
2566 {
2567 gcc_assert (PURE_SLP_STMT (stmt_info));
2568
2569 tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2570 if (dump_enabled_p ())
2571 {
2572 dump_printf_loc (MSG_NOTE, vect_location,
2573 "get vectype for scalar type: ");
2574 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2575 dump_printf (MSG_NOTE, "\n");
2576 }
2577
2578 tree vectype = get_vectype_for_scalar_type (scalar_type);
2579 if (!vectype)
2580 {
2581 if (dump_enabled_p ())
2582 {
2583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2584 "not SLPed: unsupported data-type ");
2585 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2586 scalar_type);
2587 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2588 }
2589 return false;
2590 }
2591
2592 if (dump_enabled_p ())
2593 {
2594 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
2595 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2596 dump_printf (MSG_NOTE, "\n");
2597 }
2598
2599 gimple *sstmt;
2600 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2601 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2602 }
2603
2604 /* Calculate the number of vector statements to be created for the
2605 scalar stmts in this node. For SLP reductions it is equal to the
2606 number of vector statements in the children (which has already been
2607 calculated by the recursive call). Otherwise it is the number of
2608 scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2609 VF divided by the number of elements in a vector. */
2610 if (GROUP_FIRST_ELEMENT (stmt_info)
2611 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2612 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2613 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2614 else
2615 {
2616 int vf;
2617 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2618 vf = loop_vinfo->vectorization_factor;
2619 else
2620 vf = 1;
2621 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2622 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2623 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2624 = vf * group_size / TYPE_VECTOR_SUBPARTS (vectype);
2625 }
2626
2627 /* Push SLP node def-type to stmt operands. */
2628 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2629 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2630 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2631 = SLP_TREE_DEF_TYPE (child);
2632 bool res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2633 /* Restore def-types. */
2634 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2635 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2636 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2637 = vect_internal_def;
2638 if (! res)
1513 return false; 2639 return false;
1514 2640
1515 FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt)
1516 {
1517 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1518 gcc_assert (stmt_info);
1519 gcc_assert (PURE_SLP_STMT (stmt_info));
1520
1521 if (!vect_analyze_stmt (stmt, &dummy, node))
1522 return false;
1523 }
1524
1525 return true; 2641 return true;
1526 } 2642 }
1527 2643
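The SLP_TREE_NUMBER_OF_VEC_STMTS computation above is plain arithmetic; the following standalone sketch works one case through (the numbers are assumptions for illustration, and the GCC data structures are replaced by scalars):

#include <stdio.h>

/* Number of vector stmts for one SLP node: scalar elements produced
   per iteration (GROUP_SIZE) times the vectorization factor VF,
   divided by the number of lanes in one vector (NUNITS).  */
static int
slp_num_vec_stmts (int vf, int group_size, int nunits)
{
  return vf * group_size / nunits;
}

int
main (void)
{
  /* E.g. a group of 4 int stores, VF 4, and 8-lane vectors:
     4 * 4 / 8 = 2 vector statements; for BB vectorization VF is 1.  */
  printf ("%d\n", slp_num_vec_stmts (4, 4, 8));   /* prints 2 */
  return 0;
}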
1528 2644
1529 /* Analyze statements in SLP instances of the basic block. Return TRUE if the 2645 /* Analyze statements in SLP instances of VINFO. Return true if the
1530 operations are supported. */ 2646 operations are supported. */
1531 2647
1532 static bool 2648 bool
1533 vect_slp_analyze_operations (bb_vec_info bb_vinfo) 2649 vect_slp_analyze_operations (vec_info *vinfo)
1534 { 2650 {
1535 VEC (slp_instance, heap) *slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1536 slp_instance instance; 2651 slp_instance instance;
1537 int i; 2652 int i;
1538 2653
1539 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); ) 2654 if (dump_enabled_p ())
1540 { 2655 dump_printf_loc (MSG_NOTE, vect_location,
1541 if (!vect_slp_analyze_node_operations (bb_vinfo, 2656 "=== vect_slp_analyze_operations ===\n");
1542 SLP_INSTANCE_TREE (instance))) 2657
2658 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2659 {
2660 if (!vect_slp_analyze_node_operations (vinfo,
2661 SLP_INSTANCE_TREE (instance),
2662 instance))
1543 { 2663 {
1544 vect_free_slp_instance (instance); 2664 dump_printf_loc (MSG_NOTE, vect_location,
1545 VEC_ordered_remove (slp_instance, slp_instances, i); 2665 "removing SLP instance operations starting from: ");
2666 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2667 SLP_TREE_SCALAR_STMTS
2668 (SLP_INSTANCE_TREE (instance))[0], 0);
2669 vect_free_slp_instance (instance);
2670 vinfo->slp_instances.ordered_remove (i);
1546 } 2671 }
1547 else 2672 else
1548 i++; 2673 {
1549 } 2674 /* Compute the costs of the SLP instance. */
1550 2675 vect_analyze_slp_cost (instance, vinfo->target_cost_data);
1551 if (!VEC_length (slp_instance, slp_instances)) 2676 i++;
1552 return false; 2677 }
1553 2678 }
1554 return true; 2679
1555 } 2680 return !vinfo->slp_instances.is_empty ();
1556 2681 }
1557 /* Check if loads and stores are mixed in the basic block (in that 2682
1558 case if we are not sure that the accesses differ, we can't vectorize the 2683
1559 basic block). Also return FALSE in case there is a statement marked as 2684 /* Compute the scalar cost of the SLP node NODE and its children
1560 not vectorizable. */ 2685 and return it. Do not account defs that are marked in LIFE and
1561 2686 update LIFE according to uses of NODE. */
1562 static bool 2687
1563 vect_bb_vectorizable_with_dependencies (bb_vec_info bb_vinfo) 2688 static unsigned
1564 { 2689 vect_bb_slp_scalar_cost (basic_block bb,
1565 basic_block bb = BB_VINFO_BB (bb_vinfo); 2690 slp_tree node, vec<bool, va_heap> *life)
1566 gimple_stmt_iterator si; 2691 {
1567 bool detected_store = false; 2692 unsigned scalar_cost = 0;
1568 gimple stmt; 2693 unsigned i;
1569 struct data_reference *dr; 2694 gimple *stmt;
1570 2695 slp_tree child;
1571 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 2696
1572 { 2697 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1573 stmt = gsi_stmt (si); 2698 {
1574 2699 unsigned stmt_cost;
1575 /* We can't allow not analyzed statements, since they may contain data 2700 ssa_op_iter op_iter;
1576 accesses. */ 2701 def_operand_p def_p;
1577 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt))) 2702 stmt_vec_info stmt_info;
1578 return false; 2703
1579 2704 if ((*life)[i])
1580 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) 2705 continue;
1581 continue; 2706
1582 2707 /* If there is a non-vectorized use of the defs then the scalar
1583 dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); 2708 stmt is kept live in which case we do not account it or any
1584 if (DR_IS_READ (dr) && detected_store) 2709 required defs in the SLP children in the scalar cost. This
1585 return false; 2710 way we make the vectorization more costly when compared to
1586 2711 the scalar cost. */
1587 if (!DR_IS_READ (dr)) 2712 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
1588 detected_store = true; 2713 {
1589 } 2714 imm_use_iterator use_iter;
1590 2715 gimple *use_stmt;
1591 return true; 2716 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
1592 } 2717 if (!is_gimple_debug (use_stmt)
1593 2718 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
1594 /* Check if vectorization of the basic block is profitable. */ 2719 use_stmt)
1595 2720 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
1596 static bool 2721 {
1597 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) 2722 (*life)[i] = true;
1598 { 2723 BREAK_FROM_IMM_USE_STMT (use_iter);
1599 VEC (slp_instance, heap) *slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); 2724 }
1600 slp_instance instance; 2725 }
1601 int i; 2726 if ((*life)[i])
1602 unsigned int vec_outside_cost = 0, vec_inside_cost = 0, scalar_cost = 0; 2727 continue;
1603 unsigned int stmt_cost; 2728
1604 gimple stmt; 2729 /* Count scalar stmts only once. */
1605 gimple_stmt_iterator si; 2730 if (gimple_visited_p (stmt))
1606 basic_block bb = BB_VINFO_BB (bb_vinfo); 2731 continue;
1607 stmt_vec_info stmt_info = NULL; 2732 gimple_set_visited (stmt, true);
1608 tree dummy_type = NULL; 2733
1609 int dummy = 0;
1610
1611 /* Calculate vector costs. */
1612 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
1613 {
1614 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
1615 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
1616 }
1617
1618 /* Calculate scalar cost. */
1619 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1620 {
1621 stmt = gsi_stmt (si);
1622 stmt_info = vinfo_for_stmt (stmt); 2734 stmt_info = vinfo_for_stmt (stmt);
1623
1624 if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
1625 || !PURE_SLP_STMT (stmt_info))
1626 continue;
1627
1628 if (STMT_VINFO_DATA_REF (stmt_info)) 2735 if (STMT_VINFO_DATA_REF (stmt_info))
1629 { 2736 {
1630 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 2737 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1631 stmt_cost = targetm.vectorize.builtin_vectorization_cost 2738 stmt_cost = vect_get_stmt_cost (scalar_load);
1632 (scalar_load, dummy_type, dummy);
1633 else 2739 else
1634 stmt_cost = targetm.vectorize.builtin_vectorization_cost 2740 stmt_cost = vect_get_stmt_cost (scalar_store);
1635 (scalar_store, dummy_type, dummy);
1636 } 2741 }
1637 else 2742 else
1638 stmt_cost = targetm.vectorize.builtin_vectorization_cost 2743 stmt_cost = vect_get_stmt_cost (scalar_stmt);
1639 (scalar_stmt, dummy_type, dummy);
1640 2744
1641 scalar_cost += stmt_cost; 2745 scalar_cost += stmt_cost;
1642 } 2746 }
1643 2747
1644 if (vect_print_dump_info (REPORT_COST)) 2748 auto_vec<bool, 20> subtree_life;
1645 { 2749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1646 fprintf (vect_dump, "Cost model analysis: \n"); 2750 {
1647 fprintf (vect_dump, " Vector inside of basic block cost: %d\n", 2751 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1648 vec_inside_cost); 2752 {
1649 fprintf (vect_dump, " Vector outside of basic block cost: %d\n", 2753 /* Do not directly pass LIFE to the recursive call, copy it to
1650 vec_outside_cost); 2754 confine changes in the callee to the current child/subtree. */
1651 fprintf (vect_dump, " Scalar cost of basic block: %d", scalar_cost); 2755 subtree_life.safe_splice (*life);
1652 } 2756 scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
1653 2757 subtree_life.truncate (0);
1654 /* Vectorization is profitable if its cost is less than the cost of scalar 2758 }
1655 version. */ 2759 }
1656 if (vec_outside_cost + vec_inside_cost >= scalar_cost) 2760
2761 return scalar_cost;
2762 }
2763
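The LIFE vector above records lanes whose scalar statement must stay live even after vectorization (it has a use outside the vectorized region or in a non-SLP statement); those lanes contribute nothing to the scalar cost, which deliberately makes vectorization look relatively more expensive. A hedged standalone sketch of just that accounting (assumed unit costs, no target hooks):

#include <stdbool.h>
#include <stdio.h>

static unsigned
scalar_cost_of_group (const unsigned *stmt_cost, const bool *life, int n)
{
  unsigned cost = 0;
  for (int i = 0; i < n; i++)
    if (!life[i])               /* the lane dies when vectorized: count it */
      cost += stmt_cost[i];
  return cost;
}

int
main (void)
{
  unsigned cost[4] = { 1, 1, 1, 1 };
  bool life[4] = { false, false, true, false };   /* lane 2 stays live */
  printf ("%u\n", scalar_cost_of_group (cost, life, 4));   /* prints 3 */
  return 0;
}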
2764 /* Check if vectorization of the basic block is profitable. */
2765
2766 static bool
2767 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2768 {
2769 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2770 slp_instance instance;
2771 int i;
2772 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2773 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2774
2775 /* Calculate scalar cost. */
2776 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2777 {
2778 auto_vec<bool, 20> life;
2779 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2780 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2781 SLP_INSTANCE_TREE (instance),
2782 &life);
2783 }
2784
2785 /* Unset visited flag. */
2786 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2787 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2788 gimple_set_visited (gsi_stmt (gsi), false);
2789
2790 /* Complete the target-specific cost calculation. */
2791 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2792 &vec_inside_cost, &vec_epilogue_cost);
2793
2794 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2795
2796 if (dump_enabled_p ())
2797 {
2798 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2799 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2800 vec_inside_cost);
2801 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2802 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2803 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2804 }
2805
2806 /* Vectorization is profitable if its cost is no more than the cost of the scalar
2807 version. Note that we err on the vector side for equal cost because
2808 the cost estimate is otherwise quite pessimistic (constant uses are
2809 free on the scalar side but cost a load on the vector side for
2810 example). */
2811 if (vec_outside_cost + vec_inside_cost > scalar_cost)
1657 return false; 2812 return false;
1658 2813
1659 return true; 2814 return true;
1660 } 2815 }
1661 2816
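The final comparison in standalone form (illustrative numbers only): reject only when the vector cost strictly exceeds the scalar cost, so ties go to the vector side as the comment explains.

#include <stdbool.h>
#include <stdio.h>

static bool
bb_vectorization_profitable_p (unsigned vec_prologue, unsigned vec_inside,
                               unsigned vec_epilogue, unsigned scalar_cost)
{
  unsigned vec_outside = vec_prologue + vec_epilogue;
  return !(vec_outside + vec_inside > scalar_cost);
}

int
main (void)
{
  /* Prologue 2 + body 4 + epilogue 2 equals scalar cost 8: vectorize.  */
  printf ("%d\n", bb_vectorization_profitable_p (2, 4, 2, 8));  /* prints 1 */
  return 0;
}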
1662 /* Check if the basic block can be vectorized. */ 2817 /* Check if the basic block can be vectorized. Returns a bb_vec_info
1663 2818 if so and sets fatal to true if failure is independent of
1664 bb_vec_info 2819 current_vector_size. */
1665 vect_slp_analyze_bb (basic_block bb) 2820
2821 static bb_vec_info
2822 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2823 gimple_stmt_iterator region_end,
2824 vec<data_reference_p> datarefs, int n_stmts,
2825 bool &fatal)
1666 { 2826 {
1667 bb_vec_info bb_vinfo; 2827 bb_vec_info bb_vinfo;
1668 VEC (ddr_p, heap) *ddrs;
1669 VEC (slp_instance, heap) *slp_instances;
1670 slp_instance instance; 2828 slp_instance instance;
1671 int i, insns = 0; 2829 int i;
1672 gimple_stmt_iterator gsi;
1673 int min_vf = 2; 2830 int min_vf = 2;
1674 int max_vf = MAX_VECTORIZATION_FACTOR; 2831
1675 bool data_dependence_in_bb = false; 2832 /* The first group of checks is independent of the vector size. */
1676 2833 fatal = true;
1677 current_vector_size = 0; 2834
1678 2835 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
1679 if (vect_print_dump_info (REPORT_DETAILS)) 2836 {
1680 fprintf (vect_dump, "===vect_slp_analyze_bb===\n"); 2837 if (dump_enabled_p ())
1681 2838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1682 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 2839 "not vectorized: too many instructions in "
1683 { 2840 "basic block.\n");
1684 gimple stmt = gsi_stmt (gsi); 2841 free_data_refs (datarefs);
1685 if (!is_gimple_debug (stmt)
1686 && !gimple_nop_p (stmt)
1687 && gimple_code (stmt) != GIMPLE_LABEL)
1688 insns++;
1689 }
1690
1691 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
1692 {
1693 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1694 fprintf (vect_dump, "not vectorized: too many instructions in basic "
1695 "block.\n");
1696
1697 return NULL; 2842 return NULL;
1698 } 2843 }
1699 2844
1700 bb_vinfo = new_bb_vec_info (bb); 2845 bb_vinfo = new _bb_vec_info (region_begin, region_end);
1701 if (!bb_vinfo) 2846 if (!bb_vinfo)
1702 return NULL; 2847 return NULL;
1703 2848
1704 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf)) 2849 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
1705 { 2850
1706 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2851 /* Analyze the data references. */
1707 fprintf (vect_dump, "not vectorized: unhandled data-ref in basic " 2852
1708 "block.\n"); 2853 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
1709 2854 {
1710 destroy_bb_vec_info (bb_vinfo); 2855 if (dump_enabled_p ())
2856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2857 "not vectorized: unhandled data-ref in basic "
2858 "block.\n");
2859
2860 delete bb_vinfo;
1711 return NULL; 2861 return NULL;
1712 } 2862 }
1713 2863
1714 ddrs = BB_VINFO_DDRS (bb_vinfo); 2864 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
1715 if (!VEC_length (ddr_p, ddrs)) 2865 {
1716 { 2866 if (dump_enabled_p ())
1717 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1718 fprintf (vect_dump, "not vectorized: not enough data-refs in basic " 2868 "not vectorized: not enough data-refs in "
1719 "block.\n"); 2869 "basic block.\n");
1720 2870
1721 destroy_bb_vec_info (bb_vinfo); 2871 delete bb_vinfo;
1722 return NULL; 2872 return NULL;
1723 } 2873 }
1724 2874
1725 if (!vect_analyze_data_ref_dependences (NULL, bb_vinfo, &max_vf, 2875 if (!vect_analyze_data_ref_accesses (bb_vinfo))
1726 &data_dependence_in_bb) 2876 {
1727 || min_vf > max_vf 2877 if (dump_enabled_p ())
1728 || (data_dependence_in_bb 2878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 && !vect_bb_vectorizable_with_dependencies (bb_vinfo))) 2879 "not vectorized: unhandled data access in "
1730 { 2880 "basic block.\n");
1731 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2881
1732 fprintf (vect_dump, "not vectorized: unhandled data dependence " 2882 delete bb_vinfo;
1733 "in basic block.\n");
1734
1735 destroy_bb_vec_info (bb_vinfo);
1736 return NULL;
1737 }
1738
1739 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
1740 {
1741 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1742 fprintf (vect_dump, "not vectorized: bad data alignment in basic "
1743 "block.\n");
1744
1745 destroy_bb_vec_info (bb_vinfo);
1746 return NULL; 2883 return NULL;
1747 } 2884 }
1748 2885
1749 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo)) 2886 /* If there are no grouped stores in the region there is no need
1750 { 2887 to continue with pattern recog as vect_analyze_slp will fail
1751 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2888 anyway. */
1752 fprintf (vect_dump, "not vectorized: unhandled data access in basic " 2889 if (bb_vinfo->grouped_stores.is_empty ())
1753 "block.\n"); 2890 {
1754 2891 if (dump_enabled_p ())
1755 destroy_bb_vec_info (bb_vinfo); 2892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2893 "not vectorized: no grouped stores in "
2894 "basic block.\n");
2895
2896 delete bb_vinfo;
1756 return NULL; 2897 return NULL;
1757 } 2898 }
1758 2899
1759 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo)) 2900 /* While the rest of the analysis below depends on the vector size in some way. */
1760 { 2901 fatal = false;
1761 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2902
1762 fprintf (vect_dump, "not vectorized: unsupported alignment in basic " 2903 vect_pattern_recog (bb_vinfo);
1763 "block.\n");
1764
1765 destroy_bb_vec_info (bb_vinfo);
1766 return NULL;
1767 }
1768 2904
1769 /* Check the SLP opportunities in the basic block, analyze and build SLP 2905 /* Check the SLP opportunities in the basic block, analyze and build SLP
1770 trees. */ 2906 trees. */
1771 if (!vect_analyze_slp (NULL, bb_vinfo)) 2907 if (!vect_analyze_slp (bb_vinfo, n_stmts))
1772 { 2908 {
1773 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2909 if (dump_enabled_p ())
1774 fprintf (vect_dump, "not vectorized: failed to find SLP opportunities " 2910 {
1775 "in basic block.\n"); 2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776 2912 "Failed to SLP the basic block.\n");
1777 destroy_bb_vec_info (bb_vinfo); 2913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914 "not vectorized: failed to find SLP opportunities "
2915 "in basic block.\n");
2916 }
2917
2918 delete bb_vinfo;
1778 return NULL; 2919 return NULL;
1779 } 2920 }
1780 2921
1781 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); 2922 vect_record_base_alignments (bb_vinfo);
1782 2923
1783 /* Mark all the statements that we want to vectorize as pure SLP and 2924 /* Analyze and verify the alignment of data references and the
1784 relevant. */ 2925 dependence in the SLP instances. */
1785 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) 2926 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
1786 { 2927 {
2928 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2929 || ! vect_slp_analyze_instance_dependence (instance))
2930 {
2931 dump_printf_loc (MSG_NOTE, vect_location,
2932 "removing SLP instance operations starting from: ");
2933 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2934 SLP_TREE_SCALAR_STMTS
2935 (SLP_INSTANCE_TREE (instance))[0], 0);
2936 vect_free_slp_instance (instance);
2937 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2938 continue;
2939 }
2940
2941 /* Mark all the statements that we want to vectorize as pure SLP and
2942 relevant. */
1787 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1); 2943 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1788 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); 2944 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2945
2946 i++;
2947 }
2948 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2949 {
2950 delete bb_vinfo;
2951 return NULL;
1789 } 2952 }
1790 2953
1791 if (!vect_slp_analyze_operations (bb_vinfo)) 2954 if (!vect_slp_analyze_operations (bb_vinfo))
1792 { 2955 {
1793 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2956 if (dump_enabled_p ())
1794 fprintf (vect_dump, "not vectorized: bad operation in basic block.\n"); 2957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1795 2958 "not vectorized: bad operation in basic block.\n");
1796 destroy_bb_vec_info (bb_vinfo); 2959
2960 delete bb_vinfo;
1797 return NULL; 2961 return NULL;
1798 } 2962 }
1799 2963
1800 /* Cost model: check if the vectorization is worthwhile. */ 2964 /* Cost model: check if the vectorization is worthwhile. */
1801 if (flag_vect_cost_model 2965 if (!unlimited_cost_model (NULL)
1802 && !vect_bb_vectorization_profitable_p (bb_vinfo)) 2966 && !vect_bb_vectorization_profitable_p (bb_vinfo))
1803 { 2967 {
1804 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 2968 if (dump_enabled_p ())
1805 fprintf (vect_dump, "not vectorized: vectorization is not " 2969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "profitable.\n"); 2970 "not vectorized: vectorization is not "
1807 2971 "profitable.\n");
1808 destroy_bb_vec_info (bb_vinfo); 2972
2973 delete bb_vinfo;
1809 return NULL; 2974 return NULL;
1810 } 2975 }
1811 2976
1812 if (vect_print_dump_info (REPORT_DETAILS)) 2977 if (dump_enabled_p ())
1813 fprintf (vect_dump, "Basic block will be vectorized using SLP\n"); 2978 dump_printf_loc (MSG_NOTE, vect_location,
2979 "Basic block will be vectorized using SLP\n");
1814 2980
1815 return bb_vinfo; 2981 return bb_vinfo;
1816 } 2982 }
1817 2983
1818 2984
1819 /* SLP costs are calculated according to SLP instance unrolling factor (i.e., 2985 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
1820 the number of created vector stmts depends on the unrolling factor). 2986 true if anything in the basic block was vectorized. */
1821 However, the actual number of vector stmts for every SLP node depends on 2987
1822 VF which is set later in vect_analyze_operations (). Hence, SLP costs 2988 bool
1823 should be updated. In this function we assume that the inside costs 2989 vect_slp_bb (basic_block bb)
1824 calculated in vect_model_xxx_cost are linear in ncopies. */ 2990 {
1825 2991 bb_vec_info bb_vinfo;
1826 void 2992 gimple_stmt_iterator gsi;
1827 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo) 2993 unsigned int vector_sizes;
1828 { 2994 bool any_vectorized = false;
1829 unsigned int i, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2995
1830 VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2996 if (dump_enabled_p ())
1831 slp_instance instance; 2997 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
1832 2998
1833 if (vect_print_dump_info (REPORT_SLP)) 2999 /* Autodetect first vector size we try. */
1834 fprintf (vect_dump, "=== vect_update_slp_costs_according_to_vf ==="); 3000 current_vector_size = 0;
1835 3001 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1836 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) 3002
1837 /* We assume that costs are linear in ncopies. */ 3003 gsi = gsi_start_bb (bb);
1838 SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance) *= vf 3004
1839 / SLP_INSTANCE_UNROLLING_FACTOR (instance); 3005 while (1)
3006 {
3007 if (gsi_end_p (gsi))
3008 break;
3009
3010 gimple_stmt_iterator region_begin = gsi;
3011 vec<data_reference_p> datarefs = vNULL;
3012 int insns = 0;
3013
3014 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3015 {
3016 gimple *stmt = gsi_stmt (gsi);
3017 if (is_gimple_debug (stmt))
3018 continue;
3019 insns++;
3020
3021 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3022 vect_location = gimple_location (stmt);
3023
3024 if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3025 break;
3026 }
3027
3028 /* Skip leading unhandled stmts. */
3029 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3030 {
3031 gsi_next (&gsi);
3032 continue;
3033 }
3034
3035 gimple_stmt_iterator region_end = gsi;
3036
3037 bool vectorized = false;
3038 bool fatal = false;
3039 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3040 datarefs, insns, fatal);
3041 if (bb_vinfo
3042 && dbg_cnt (vect_slp))
3043 {
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3046
3047 vect_schedule_slp (bb_vinfo);
3048
3049 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_NOTE, vect_location,
3051 "basic block part vectorized\n");
3052
3053 vectorized = true;
3054 }
3055 delete bb_vinfo;
3056
3057 any_vectorized |= vectorized;
3058
3059 vector_sizes &= ~current_vector_size;
3060 if (vectorized
3061 || vector_sizes == 0
3062 || current_vector_size == 0
3063 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3064 vector sizes will fail, do not bother iterating. */
3065 || fatal)
3066 {
3067 if (gsi_end_p (region_end))
3068 break;
3069
3070 /* Skip the unhandled stmt. */
3071 gsi_next (&gsi);
3072
3073 /* And reset vector sizes. */
3074 current_vector_size = 0;
3075 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
3076 }
3077 else
3078 {
3079 /* Try the next biggest vector size. */
3080 current_vector_size = 1 << floor_log2 (vector_sizes);
3081 if (dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 "***** Re-trying analysis with "
3084 "vector size %d\n", current_vector_size);
3085
3086 /* Start over. */
3087 gsi = region_begin;
3088 }
3089 }
3090
3091 return any_vectorized;
3092 }
3093
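The retry loop drives its iteration off a bitmask of supported vector sizes, clearing the size it just tried and picking the largest remaining one with 1 << floor_log2 (vector_sizes). A sketch of that selection (the mask 32|16 is a hypothetical target, e.g. something AVX2-like; floor_log2 is open-coded with a GCC builtin):

#include <stdio.h>

int
main (void)
{
  unsigned vector_sizes = 32 | 16;    /* supported vector sizes in bytes */
  while (vector_sizes)
    {
      /* Next biggest vector size: isolate the highest set bit.  */
      unsigned current_vector_size
        = 1u << (31 - __builtin_clz (vector_sizes));
      printf ("trying vector size %u\n", current_vector_size);
      vector_sizes &= ~current_vector_size;   /* this size has been tried */
    }
  return 0;   /* prints 32, then 16 */
}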
3094
3095 /* Return true if the vector type of the boolean constant that is
3096 operand number OPNUM in statement STMT is a boolean vector. */
3097
3098 static bool
3099 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3100 {
3101 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3102 enum tree_code code = gimple_expr_code (stmt);
3103 tree op, vectype;
3104 gimple *def_stmt;
3105 enum vect_def_type dt;
3106
3107 /* For comparisons and COND_EXPR the type is chosen depending
3108 on the other comparison operand. */
3109 if (TREE_CODE_CLASS (code) == tcc_comparison)
3110 {
3111 if (opnum)
3112 op = gimple_assign_rhs1 (stmt);
3113 else
3114 op = gimple_assign_rhs2 (stmt);
3115
3116 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3117 &dt, &vectype))
3118 gcc_unreachable ();
3119
3120 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3121 }
3122
3123 if (code == COND_EXPR)
3124 {
3125 tree cond = gimple_assign_rhs1 (stmt);
3126
3127 if (TREE_CODE (cond) == SSA_NAME)
3128 op = cond;
3129 else if (opnum)
3130 op = TREE_OPERAND (cond, 1);
3131 else
3132 op = TREE_OPERAND (cond, 0);
3133
3134 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3135 &dt, &vectype))
3136 gcc_unreachable ();
3137
3138 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3139 }
3140
3141 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
1840 } 3142 }
1841 3143
1842 3144
1843 /* For constant and loop invariant defs of SLP_NODE this function returns 3145 /* For constant and loop invariant defs of SLP_NODE this function returns
1844 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. 3146 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1847 REDUC_INDEX is the index of the reduction operand in the statements, unless 3149 REDUC_INDEX is the index of the reduction operand in the statements, unless
1848 it is -1. */ 3150 it is -1. */
1849 3151
1850 static void 3152 static void
1851 vect_get_constant_vectors (tree op, slp_tree slp_node, 3153 vect_get_constant_vectors (tree op, slp_tree slp_node,
1852 VEC (tree, heap) **vec_oprnds, 3154 vec<tree> *vec_oprnds,
1853 unsigned int op_num, unsigned int number_of_vectors, 3155 unsigned int op_num, unsigned int number_of_vectors)
1854 int reduc_index) 3156 {
1855 { 3157 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1856 VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node); 3158 gimple *stmt = stmts[0];
1857 gimple stmt = VEC_index (gimple, stmts, 0);
1858 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 3159 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1859 int nunits; 3160 unsigned nunits;
1860 tree vec_cst; 3161 tree vec_cst;
1861 tree t = NULL_TREE; 3162 unsigned j, number_of_places_left_in_vector;
1862 int j, number_of_places_left_in_vector;
1863 tree vector_type; 3163 tree vector_type;
1864 tree vop; 3164 tree vop;
1865 int group_size = VEC_length (gimple, stmts); 3165 int group_size = stmts.length ();
1866 unsigned int vec_num, i; 3166 unsigned int vec_num, i;
1867 int number_of_copies = 1; 3167 unsigned number_of_copies = 1;
1868 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors); 3168 vec<tree> voprnds;
3169 voprnds.create (number_of_vectors);
1869 bool constant_p, is_store; 3170 bool constant_p, is_store;
1870 tree neutral_op = NULL; 3171 tree neutral_op = NULL;
1871 enum tree_code code = gimple_assign_rhs_code (stmt); 3172 enum tree_code code = gimple_expr_code (stmt);
1872 3173 gimple_seq ctor_seq = NULL;
1873 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) 3174
1874 { 3175 /* Check if vector type is a boolean vector. */
1875 if (reduc_index == -1) 3176 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
1876 { 3177 && vect_mask_constant_operand_p (stmt, op_num))
1877 VEC_free (tree, heap, *vec_oprnds); 3178 vector_type
1878 return; 3179 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
1879 } 3180 else
1880 3181 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1881 op_num = reduc_index - 1; 3182 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1882 op = gimple_op (stmt, reduc_index);
1883 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
1884 we need either neutral operands or the original operands. See
1885 get_initial_def_for_reduction() for details. */
1886 switch (code)
1887 {
1888 case WIDEN_SUM_EXPR:
1889 case DOT_PROD_EXPR:
1890 case PLUS_EXPR:
1891 case MINUS_EXPR:
1892 case BIT_IOR_EXPR:
1893 case BIT_XOR_EXPR:
1894 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
1895 neutral_op = build_real (TREE_TYPE (op), dconst0);
1896 else
1897 neutral_op = build_int_cst (TREE_TYPE (op), 0);
1898
1899 break;
1900
1901 case MULT_EXPR:
1902 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
1903 neutral_op = build_real (TREE_TYPE (op), dconst1);
1904 else
1905 neutral_op = build_int_cst (TREE_TYPE (op), 1);
1906
1907 break;
1908
1909 case BIT_AND_EXPR:
1910 neutral_op = build_int_cst (TREE_TYPE (op), -1);
1911 break;
1912
1913 default:
1914 neutral_op = NULL;
1915 }
1916 }
1917 3183
1918 if (STMT_VINFO_DATA_REF (stmt_vinfo)) 3184 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1919 { 3185 {
1920 is_store = true; 3186 is_store = true;
1921 op = gimple_assign_rhs1 (stmt); 3187 op = gimple_assign_rhs1 (stmt);
1923 else 3189 else
1924 is_store = false; 3190 is_store = false;
1925 3191
1926 gcc_assert (op); 3192 gcc_assert (op);
1927 3193
1928 if (CONSTANT_CLASS_P (op))
1929 constant_p = true;
1930 else
1931 constant_p = false;
1932
1933 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1934 gcc_assert (vector_type);
1935 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1936
1937 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 3194 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1938 created vectors. It is greater than 1 if unrolling is performed. 3195 created vectors. It is greater than 1 if unrolling is performed.
1939 3196
1940 For example, we have two scalar operands, s1 and s2 (e.g., group of 3197 For example, we have two scalar operands, s1 and s2 (e.g., group of
1941 strided accesses of size two), while NUNITS is four (i.e., four scalars 3198 strided accesses of size two), while NUNITS is four (i.e., four scalars
1942 of this type can be packed in a vector). The output vector will contain 3199 of this type can be packed in a vector). The output vector will contain
1943 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 3200 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1944 will be 2). 3201 will be 2).
1945 3202
1946 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors 3203 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1947 containing the operands. 3204 containing the operands.
1948 3205
1949 For example, NUNITS is four as before, and the group size is 8 3206 For example, NUNITS is four as before, and the group size is 8
1950 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 3207 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1951 {s5, s6, s7, s8}. */ 3208 {s5, s6, s7, s8}. */
1952 3209
1953 number_of_copies = least_common_multiple (nunits, group_size) / group_size; 3210 number_of_copies = nunits * number_of_vectors / group_size;
1954 3211
1955 number_of_places_left_in_vector = nunits; 3212 number_of_places_left_in_vector = nunits;
3213 constant_p = true;
3214 auto_vec<tree, 32> elts (nunits);
3215 elts.quick_grow (nunits);
3216 bool place_after_defs = false;
1956 for (j = 0; j < number_of_copies; j++) 3217 for (j = 0; j < number_of_copies; j++)
1957 { 3218 {
1958 for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--) 3219 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
1959 { 3220 {
1960 if (is_store) 3221 if (is_store)
1961 op = gimple_assign_rhs1 (stmt); 3222 op = gimple_assign_rhs1 (stmt);
1962 else 3223 else
1963 op = gimple_op (stmt, op_num + 1); 3224 {
1964 3225 switch (code)
1965 if (reduc_index != -1) 3226 {
1966 { 3227 case COND_EXPR:
1967 struct loop *loop = (gimple_bb (stmt))->loop_father; 3228 {
1968 gimple def_stmt = SSA_NAME_DEF_STMT (op); 3229 tree cond = gimple_assign_rhs1 (stmt);
1969 3230 if (TREE_CODE (cond) == SSA_NAME)
1970 gcc_assert (loop); 3231 op = gimple_op (stmt, op_num + 1);
1971 /* Get the def before the loop. */ 3232 else if (op_num == 0 || op_num == 1)
1972 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, 3233 op = TREE_OPERAND (cond, op_num);
1973 loop_preheader_edge (loop)); 3234 else
1974 if (j != (number_of_copies - 1) && neutral_op) 3235 {
1975 op = neutral_op; 3236 if (op_num == 2)
1976 } 3237 op = gimple_assign_rhs2 (stmt);
3238 else
3239 op = gimple_assign_rhs3 (stmt);
3240 }
3241 }
3242 break;
3243
3244 case CALL_EXPR:
3245 op = gimple_call_arg (stmt, op_num);
3246 break;
3247
3248 case LSHIFT_EXPR:
3249 case RSHIFT_EXPR:
3250 case LROTATE_EXPR:
3251 case RROTATE_EXPR:
3252 op = gimple_op (stmt, op_num + 1);
3253 /* Unlike the other binary operators, shifts/rotates have
3254 the shift count being int, instead of the same type as
3255 the lhs, so make sure the scalar is the right type if
3256 we are dealing with vectors of
3257 long long/long/short/char. */
3258 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3259 op = fold_convert (TREE_TYPE (vector_type), op);
3260 break;
3261
3262 default:
3263 op = gimple_op (stmt, op_num + 1);
3264 break;
3265 }
3266 }
1977 3267
1978 /* Create 'vect_ = {op0,op1,...,opn}'. */ 3268 /* Create 'vect_ = {op0,op1,...,opn}'. */
1979 t = tree_cons (NULL_TREE, op, t);
1980
1981 number_of_places_left_in_vector--; 3269 number_of_places_left_in_vector--;
3270 tree orig_op = op;
3271 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3272 {
3273 if (CONSTANT_CLASS_P (op))
3274 {
3275 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3276 {
3277 /* Can't use VIEW_CONVERT_EXPR for booleans because
3278 of possibly different sizes of scalar value and
3279 vector element. */
3280 if (integer_zerop (op))
3281 op = build_int_cst (TREE_TYPE (vector_type), 0);
3282 else if (integer_onep (op))
3283 op = build_all_ones_cst (TREE_TYPE (vector_type));
3284 else
3285 gcc_unreachable ();
3286 }
3287 else
3288 op = fold_unary (VIEW_CONVERT_EXPR,
3289 TREE_TYPE (vector_type), op);
3290 gcc_assert (op && CONSTANT_CLASS_P (op));
3291 }
3292 else
3293 {
3294 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3295 gimple *init_stmt;
3296 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3297 {
3298 tree true_val
3299 = build_all_ones_cst (TREE_TYPE (vector_type));
3300 tree false_val
3301 = build_zero_cst (TREE_TYPE (vector_type));
3302 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3303 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3304 op, true_val,
3305 false_val);
3306 }
3307 else
3308 {
3309 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3310 op);
3311 init_stmt
3312 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3313 op);
3314 }
3315 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3316 op = new_temp;
3317 }
3318 }
3319 elts[number_of_places_left_in_vector] = op;
3320 if (!CONSTANT_CLASS_P (op))
3321 constant_p = false;
3322 if (TREE_CODE (orig_op) == SSA_NAME
3323 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3324 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3325 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3326 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3327 place_after_defs = true;
1982 3328
1983 if (number_of_places_left_in_vector == 0) 3329 if (number_of_places_left_in_vector == 0)
1984 { 3330 {
3331 if (constant_p)
3332 vec_cst = build_vector (vector_type, elts);
3333 else
3334 {
3335 vec<constructor_elt, va_gc> *v;
3336 unsigned k;
3337 vec_alloc (v, nunits);
3338 for (k = 0; k < nunits; ++k)
3339 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
3340 vec_cst = build_constructor (vector_type, v);
3341 }
3342 tree init;
3343 gimple_stmt_iterator gsi;
3344 if (place_after_defs)
3345 {
3346 gsi = gsi_for_stmt
3347 (vect_find_last_scalar_stmt_in_slp (slp_node));
3348 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3349 }
3350 else
3351 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3352 if (ctor_seq != NULL)
3353 {
3354 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3355 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
3356 GSI_SAME_STMT);
3357 ctor_seq = NULL;
3358 }
3359 voprnds.quick_push (init);
3360 place_after_defs = false;
1985 number_of_places_left_in_vector = nunits; 3361 number_of_places_left_in_vector = nunits;
1986 3362 constant_p = true;
1987 if (constant_p)
1988 vec_cst = build_vector (vector_type, t);
1989 else
1990 vec_cst = build_constructor_from_list (vector_type, t);
1991 VEC_quick_push (tree, voprnds,
1992 vect_init_vector (stmt, vec_cst, vector_type, NULL));
1993 t = NULL_TREE;
1994 } 3363 }
1995 } 3364 }
1996 } 3365 }
1997 3366
1998 /* Since the vectors are created in the reverse order, we should invert 3367 /* Since the vectors are created in the reverse order, we should invert
1999 them. */ 3368 them. */
2000 vec_num = VEC_length (tree, voprnds); 3369 vec_num = voprnds.length ();
2001 for (j = vec_num - 1; j >= 0; j--) 3370 for (j = vec_num; j != 0; j--)
2002 { 3371 {
2003 vop = VEC_index (tree, voprnds, j); 3372 vop = voprnds[j - 1];
2004 VEC_quick_push (tree, *vec_oprnds, vop); 3373 vec_oprnds->quick_push (vop);
2005 } 3374 }
2006 3375
2007 VEC_free (tree, heap, voprnds); 3376 voprnds.release ();
2008 3377
2009 /* In case VF is greater than the unrolling factor needed for the SLP 3378
2010 group of stmts, NUMBER_OF_VECTORS to be created is greater than 3379 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2011 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have 3380 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2012 to replicate the vectors. */ 3381 to replicate the vectors. */
2013 while (number_of_vectors > VEC_length (tree, *vec_oprnds)) 3382 while (number_of_vectors > vec_oprnds->length ())
2014 { 3383 {
2015 tree neutral_vec = NULL; 3384 tree neutral_vec = NULL;
2016 3385
2017 if (neutral_op) 3386 if (neutral_op)
2018 { 3387 {
2019 if (!neutral_vec) 3388 if (!neutral_vec)
2020 neutral_vec = build_vector_from_val (vector_type, neutral_op); 3389 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2021 3390
2022 VEC_quick_push (tree, *vec_oprnds, neutral_vec); 3391 vec_oprnds->quick_push (neutral_vec);
2023 } 3392 }
2024 else 3393 else
2025 { 3394 {
2026 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++) 3395 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2027 VEC_quick_push (tree, *vec_oprnds, vop); 3396 vec_oprnds->quick_push (vop);
2028 } 3397 }
2029 } 3398 }
2030 } 3399 }
2031 3400
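The copy count and back-to-front lane filling above can be worked through with plain arrays (a sketch with assumed values, not GCC trees): for a group of two scalars, four-lane vectors and one vector to build, number_of_copies = 4 * 1 / 2 = 2 and the result is {s1, s2, s1, s2}.

#include <stdio.h>

int
main (void)
{
  int scalars[2] = { 10, 20 };          /* s1, s2 */
  const unsigned nunits = 4, group_size = 2, number_of_vectors = 1;
  unsigned number_of_copies = nunits * number_of_vectors / group_size;
  int vec[4];
  unsigned place = nunits;              /* lanes are filled back to front */
  for (unsigned j = 0; j < number_of_copies; j++)
    for (int i = group_size - 1; i >= 0; i--)
      vec[--place] = scalars[i];
  for (unsigned k = 0; k < nunits; k++)
    printf ("%d ", vec[k]);             /* prints 10 20 10 20 */
  printf ("\n");
  return 0;
}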
2032 3401
2033 /* Get vectorized definitions from SLP_NODE that contains corresponding 3402 /* Get vectorized definitions from SLP_NODE that contains corresponding
2034 vectorized def-stmts. */ 3403 vectorized def-stmts. */
2035 3404
2036 static void 3405 static void
2037 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds) 3406 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2038 { 3407 {
2039 tree vec_oprnd; 3408 tree vec_oprnd;
2040 gimple vec_def_stmt; 3409 gimple *vec_def_stmt;
2041 unsigned int i; 3410 unsigned int i;
2042 3411
2043 gcc_assert (SLP_TREE_VEC_STMTS (slp_node)); 3412 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2044 3413
2045 FOR_EACH_VEC_ELT (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt) 3414 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2046 { 3415 {
2047 gcc_assert (vec_def_stmt); 3416 gcc_assert (vec_def_stmt);
2048 vec_oprnd = gimple_get_lhs (vec_def_stmt); 3417 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
2049 VEC_quick_push (tree, *vec_oprnds, vec_oprnd); 3418 vec_oprnd = gimple_phi_result (vec_def_stmt);
3419 else
3420 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3421 vec_oprnds->quick_push (vec_oprnd);
2050 } 3422 }
2051 } 3423 }
2052 3424
2053 3425
2054 /* Get vectorized definitions for SLP_NODE. 3426 /* Get vectorized definitions for SLP_NODE.
2055 If the scalar definitions are loop invariants or constants, collect them and 3427 If the scalar definitions are loop invariants or constants, collect them and
2056 call vect_get_constant_vectors() to create vector stmts. 3428 call vect_get_constant_vectors() to create vector stmts.
2057 Otherwise, the def-stmts must be already vectorized and the vectorized stmts 3429 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2058 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call 3430 must be stored in the corresponding child of SLP_NODE, and we call
2059 vect_get_slp_vect_defs() to retrieve them. 3431 vect_get_slp_vect_defs () to retrieve them. */
2060 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
2061 the right node. This is used when the second operand must remain scalar. */
2062 3432
2063 void 3433 void
2064 vect_get_slp_defs (tree op0, tree op1, slp_tree slp_node, 3434 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2065 VEC (tree,heap) **vec_oprnds0, 3435 vec<vec<tree> > *vec_oprnds)
2066 VEC (tree,heap) **vec_oprnds1, int reduc_index) 3436 {
2067 { 3437 gimple *first_stmt;
2068 gimple first_stmt; 3438 int number_of_vects = 0, i;
2069 enum tree_code code; 3439 unsigned int child_index = 0;
2070 int number_of_vects;
2071 HOST_WIDE_INT lhs_size_unit, rhs_size_unit; 3440 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2072 3441 slp_tree child = NULL;
2073 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0); 3442 vec<tree> vec_defs;
2074 /* The number of vector defs is determined by the number of vector statements 3443 tree oprnd;
2075 in the node from which we get those statements. */ 3444 bool vectorized_defs;
2076 if (SLP_TREE_LEFT (slp_node)) 3445
2077 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node)); 3446 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2078 else 3447 FOR_EACH_VEC_ELT (ops, i, oprnd)
2079 { 3448 {
2080 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 3449 /* For each operand we check if it has vectorized definitions in a child
2081 /* Number of vector stmts was calculated according to LHS in 3450 node or we need to create them (for invariants and constants). We
2082 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if 3451 check if the LHS of the first stmt of the next child matches OPRND.
2083 necessary. See vect_get_smallest_scalar_type () for details. */ 3452 If it does, we found the correct child. Otherwise, we call
2084 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, 3453 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
2085 &rhs_size_unit); 3454 to check this child node for the next operand. */
2086 if (rhs_size_unit != lhs_size_unit) 3455 vectorized_defs = false;
3456 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2087 { 3457 {
2088 number_of_vects *= rhs_size_unit; 3458 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2089 number_of_vects /= lhs_size_unit; 3459
3460 /* We have to check both pattern and original def, if available. */
3461 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3462 {
3463 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3464 gimple *related
3465 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3466 tree first_def_op;
3467
3468 if (gimple_code (first_def) == GIMPLE_PHI)
3469 first_def_op = gimple_phi_result (first_def);
3470 else
3471 first_def_op = gimple_get_lhs (first_def);
3472 if (operand_equal_p (oprnd, first_def_op, 0)
3473 || (related
3474 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3475 {
3476 /* The number of vector defs is determined by the number of
3477 vector statements in the node from which we get those
3478 statements. */
3479 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3480 vectorized_defs = true;
3481 child_index++;
3482 }
3483 }
3484 else
3485 child_index++;
2090 } 3486 }
2091 } 3487
2092 3488 if (!vectorized_defs)
2093 /* Allocate memory for vectorized defs. */ 3489 {
2094 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects); 3490 if (i == 0)
2095 3491 {
2096 /* SLP_NODE corresponds either to a group of stores or to a group of 3492 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2097 unary/binary operations. We don't call this function for loads. 3493 /* Number of vector stmts was calculated according to LHS in
2098 For reduction defs we call vect_get_constant_vectors(), since we are 3494 vect_schedule_slp_instance (), fix it by replacing LHS with
2099 looking for initial loop invariant values. */ 3495 RHS, if necessary. See vect_get_smallest_scalar_type () for
2100 if (SLP_TREE_LEFT (slp_node) && reduc_index == -1) 3496 details. */
2101 /* The defs are already vectorized. */ 3497 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2102 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0); 3498 &rhs_size_unit);
2103 else 3499 if (rhs_size_unit != lhs_size_unit)
2104 /* Build vectors from scalar defs. */ 3500 {
2105 vect_get_constant_vectors (op0, slp_node, vec_oprnds0, 0, number_of_vects, 3501 number_of_vects *= rhs_size_unit;
2106 reduc_index); 3502 number_of_vects /= lhs_size_unit;
2107 3503 }
2108 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt))) 3504 }
2109 /* Since we don't call this function with loads, this is a group of 3505 }
2110 stores. */ 3506
2111 return; 3507 /* Allocate memory for vectorized defs. */
2112 3508 vec_defs = vNULL;
2113 /* For reductions, we only need initial values. */ 3509 vec_defs.create (number_of_vects);
2114 if (reduc_index != -1) 3510
2115 return; 3511 /* For reduction defs we call vect_get_constant_vectors (), since we are
2116 3512 looking for initial loop invariant values. */
2117 code = gimple_assign_rhs_code (first_stmt); 3513 if (vectorized_defs)
2118 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1) 3514 /* The defs are already vectorized. */
2119 return; 3515 vect_get_slp_vect_defs (child, &vec_defs);
2120 3516 else
2121 /* The number of vector defs is determined by the number of vector statements 3517 /* Build vectors from scalar defs. */
2122 in the node from which we get those statements. */ 3518 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2123 if (SLP_TREE_RIGHT (slp_node)) 3519 number_of_vects);
2124 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node)); 3520
2125 else 3521 vec_oprnds->quick_push (vec_defs);
2126 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 3522 }
2127 3523 }
2128 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects); 3524
2129 3525 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2130 if (SLP_TREE_RIGHT (slp_node)) 3526 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2131 /* The defs are already vectorized. */ 3527 permute statements for the SLP node NODE of the SLP instance
2132 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1); 3528 SLP_NODE_INSTANCE. */
2133 else 3529
2134 /* Build vectors from scalar defs. */ 3530 bool
2135 vect_get_constant_vectors (op1, slp_node, vec_oprnds1, 1, number_of_vects, 3531 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
2136 -1); 3532 gimple_stmt_iterator *gsi, int vf,
2137 } 3533 slp_instance slp_node_instance, bool analyze_only,
2138 3534 unsigned *n_perms)
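
The operand-count fix-up in vect_get_slp_defs above is easy to misread, so here is a minimal standalone sketch of it (a hypothetical helper, not GCC code): NUMBER_OF_VECTS is first derived from the LHS vector type and is then rescaled when the RHS scalar elements have a different size. For example, float results (4-byte LHS) computed from double operands (8-byte RHS) need twice as many operand vectors.

static int
adjusted_number_of_vects (int number_of_vects,
                          long lhs_size_unit, long rhs_size_unit)
{
  /* Mirrors the rhs_size_unit != lhs_size_unit branch in
     vect_get_slp_defs.  */
  if (rhs_size_unit != lhs_size_unit)
    {
      number_of_vects *= rhs_size_unit;
      number_of_vects /= lhs_size_unit;
    }
  return number_of_vects;
}

For instance, adjusted_number_of_vects (2, 4, 8) == 4.
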
2139 3535 {
2140 /* Create NCOPIES permutation statements using the mask MASK (by 3536 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2141 building a vector of type MASK_TYPE from it) and two input vectors placed in 3537 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2142 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and 3538 tree mask_element_type = NULL_TREE, mask_type;
2143 shifting by STRIDE elements of DR_CHAIN for every copy. 3539 int nunits, vec_index = 0;
2144 (STRIDE is the number of vectorized stmts for NODE divided by the number of 3540 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2145 copies). 3541 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2146 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where 3542 int mask_element;
2147 the created stmts must be inserted. */ 3543 machine_mode mode;
2148 3544
2149 static inline void 3545 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2150 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt, 3546 return false;
2151 tree mask, int first_vec_indx, int second_vec_indx, 3547
2152 gimple_stmt_iterator *gsi, slp_tree node, 3548 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
2153 tree builtin_decl, tree vectype, 3549
2154 VEC(tree,heap) *dr_chain, 3550 mode = TYPE_MODE (vectype);
2155 int ncopies, int vect_stmts_counter) 3551
2156 { 3552 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2157 tree perm_dest; 3553 same size as the vector element being permuted. */
2158 gimple perm_stmt = NULL; 3554 mask_element_type = lang_hooks.types.type_for_mode
2159 stmt_vec_info next_stmt_info; 3555 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
2160 int i, stride; 3556 mask_type = get_vectype_for_scalar_type (mask_element_type);
2161 tree first_vec, second_vec, data_ref; 3557 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2162 3558 auto_vec_perm_indices mask (nunits);
2163 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies; 3559 mask.quick_grow (nunits);
2164 3560
2165 /* Initialize the vect stmts of NODE to properly insert the generated 3561 /* Initialize the vect stmts of NODE to properly insert the generated
2166 stmts later. */ 3562 stmts later. */
2167 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node)); 3563 if (! analyze_only)
2168 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) 3564 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
2169 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL); 3565 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2170 3566 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2171 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2172 for (i = 0; i < ncopies; i++)
2173 {
2174 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
2175 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
2176
2177 /* Generate the permute statement. */
2178 perm_stmt = gimple_build_call (builtin_decl,
2179 3, first_vec, second_vec, mask);
2180 data_ref = make_ssa_name (perm_dest, perm_stmt);
2181 gimple_call_set_lhs (perm_stmt, data_ref);
2182 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2183
2184 /* Store the vector statement in NODE. */
2185 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
2186 stride * i + vect_stmts_counter, perm_stmt);
2187
2188 first_vec_indx += stride;
2189 second_vec_indx += stride;
2190 }
2191
2192 /* Mark the scalar stmt as vectorized. */
2193 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2194 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2195 }
2196
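
The slot arithmetic in vect_create_mask_and_perm above decides where each generated permute statement lands in SLP_TREE_VEC_STMTS: copy I of permutation C goes to slot STRIDE * I + C. A standalone sketch with hypothetical values (six vector stmts, two copies, so STRIDE == 3) prints the resulting layout:

#include <stdio.h>

int
main (void)
{
  int number_of_vec_stmts = 6, ncopies = 2;
  int stride = number_of_vec_stmts / ncopies;   /* == 3  */

  for (int c = 0; c < stride; c++)              /* vect_stmts_counter  */
    for (int i = 0; i < ncopies; i++)
      printf ("perm %d, copy %d -> slot %d\n", c, i, stride * i + c);
  return 0;
}

All first copies therefore precede all second copies, which is why each later copy can simply shift FIRST_VEC_INDX and SECOND_VEC_INDX by STRIDE.
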
2197
2198 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2199 return in CURRENT_MASK_ELEMENT its equivalent in target specific
2200 representation. Check that the mask is valid and return FALSE if not.
2201 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2202 the next vector, i.e., the current first vector is not needed. */
2203
2204 static bool
2205 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2206 int mask_nunits, bool only_one_vec, int index,
2207 int *mask, int *current_mask_element,
2208 bool *need_next_vector, int *number_of_mask_fixes,
2209 bool *mask_fixed, bool *needs_first_vector)
2210 {
2211 int i;
2212
2213 /* Convert to target specific representation. */
2214 *current_mask_element = first_mask_element + m;
2215 /* Adjust the value in case it's a mask for the second and third vectors. */
2216 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2217
2218 if (*current_mask_element < mask_nunits)
2219 *needs_first_vector = true;
2220
2221 /* We have only one input vector to permute but the mask accesses values in
2222 the next vector as well. */
2223 if (only_one_vec && *current_mask_element >= mask_nunits)
2224 {
2225 if (vect_print_dump_info (REPORT_DETAILS))
2226 {
2227 fprintf (vect_dump, "permutation requires at least two vectors ");
2228 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2229 }
2230
2231 return false;
2232 }
2233
2234 /* The mask requires the next vector. */
2235 if (*current_mask_element >= mask_nunits * 2)
2236 {
2237 if (*needs_first_vector || *mask_fixed)
2238 {
2239 /* We either need the first vector too or have already moved to the
2240 next vector. In both cases, this permutation needs three
2241 vectors. */
2242 if (vect_print_dump_info (REPORT_DETAILS))
2243 {
2244 fprintf (vect_dump, "permutation requires at "
2245 "least three vectors ");
2246 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2247 }
2248
2249 return false;
2250 }
2251
2252 /* We move to the next vector, dropping the first one and working with
2253 the second and the third - we need to adjust the values of the mask
2254 accordingly. */
2255 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2256
2257 for (i = 0; i < index; i++)
2258 mask[i] -= mask_nunits * *number_of_mask_fixes;
2259
2260 (*number_of_mask_fixes)++;
2261 *mask_fixed = true;
2262 }
2263
2264 *need_next_vector = *mask_fixed;
2265
2266 /* This was the last element of this mask. Start a new one. */
2267 if (index == mask_nunits - 1)
2268 {
2269 *number_of_mask_fixes = 1;
2270 *mask_fixed = false;
2271 *needs_first_vector = false;
2272 }
2273
2274 return true;
2275 }
2276
2277
2278 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2279 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2280 permute statements for SLP_NODE_INSTANCE. */
2281 bool
2282 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
2283 gimple_stmt_iterator *gsi, int vf,
2284 slp_instance slp_node_instance, bool analyze_only)
2285 {
2286 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2287 tree mask_element_type = NULL_TREE, mask_type;
2288 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
2289 slp_tree node;
2290 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
2291 gimple next_scalar_stmt;
2292 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2293 int first_mask_element;
2294 int index, unroll_factor, *mask, current_mask_element, ncopies;
2295 bool only_one_vec = false, need_next_vector = false;
2296 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2297 int number_of_mask_fixes = 1;
2298 bool mask_fixed = false;
2299 bool needs_first_vector = false;
2300
2301 if (!targetm.vectorize.builtin_vec_perm)
2302 {
2303 if (vect_print_dump_info (REPORT_DETAILS))
2304 {
2305 fprintf (vect_dump, "no builtin for vect permute for ");
2306 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2307 }
2308
2309 return false;
2310 }
2311
2312 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
2313 &mask_element_type);
2314 if (!builtin_decl || !mask_element_type)
2315 {
2316 if (vect_print_dump_info (REPORT_DETAILS))
2317 {
2318 fprintf (vect_dump, "no builtin for vect permute for ");
2319 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2320 }
2321
2322 return false;
2323 }
2324
2325 mask_type = get_vectype_for_scalar_type (mask_element_type);
2326 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
2327 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
2328 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2329 scale = mask_nunits / nunits;
2330 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2331
2332 /* The number of vector stmts to generate based only on the SLP_NODE_INSTANCE
2333 unrolling factor. */
2334 orig_vec_stmts_num = group_size *
2335 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2336 if (orig_vec_stmts_num == 1)
2337 only_one_vec = true;
2338
2339 /* Number of copies is determined by the final vectorization factor
2340 relative to the SLP_NODE_INSTANCE unrolling factor. */
2341 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2342 3567
2343 /* Generate permutation masks for every NODE. Number of masks for each NODE 3568 /* Generate permutation masks for every NODE. Number of masks for each NODE
2344 is equal to GROUP_SIZE. 3569 is equal to GROUP_SIZE.
2345 E.g., we have a group of three nodes with three loads from the same 3570 E.g., we have a group of three nodes with three loads from the same
2346 location in each node, and the vector size is 4. I.e., we have an 3571 location in each node, and the vector size is 4. I.e., we have an
2347 a0b0c0a1b1c1... sequence and we need to create the following vectors: 3572 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2348 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 3573 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2349 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 3574 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2350 ... 3575 ...
2351 3576
2352 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target 3577 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2353 specific type, e.g., in bytes for Altivec.
2354 The last mask is illegal since we assume two operands for the permute 3578 The last mask is illegal since we assume two operands for the permute
2355 operation, and the mask element values can't be outside that range. 3579 operation, and the mask element values can't be outside that range.
2356 Hence, the last mask must be converted into {2,5,5,5}. 3580 Hence, the last mask must be converted into {2,5,5,5}.
2357 For the first two permutations we need the first and the second input 3581 For the first two permutations we need the first and the second input
2358 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation 3582 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2359 we need the second and the third vectors: {b1,c1,a2,b2} and 3583 we need the second and the third vectors: {b1,c1,a2,b2} and
2360 {c2,a3,b3,c3}. */ 3584 {c2,a3,b3,c3}. */
2361 3585
2362 FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance), i, node) 3586 int vect_stmts_counter = 0;
2363 { 3587 int index = 0;
2364 scalar_index = 0; 3588 int first_vec_index = -1;
2365 index = 0; 3589 int second_vec_index = -1;
2366 vect_stmts_counter = 0; 3590 bool noop_p = true;
2367 vec_index = 0; 3591 *n_perms = 0;
2368 first_vec_index = vec_index++; 3592
2369 if (only_one_vec) 3593 for (int j = 0; j < vf; j++)
2370 second_vec_index = first_vec_index; 3594 {
2371 else 3595 for (int k = 0; k < group_size; k++)
2372 second_vec_index = vec_index++; 3596 {
2373 3597 int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
2374 for (j = 0; j < unroll_factor; j++) 3598 + j * STMT_VINFO_GROUP_SIZE (stmt_info));
2375 { 3599 vec_index = i / nunits;
2376 for (k = 0; k < group_size; k++) 3600 mask_element = i % nunits;
2377 { 3601 if (vec_index == first_vec_index
2378 first_mask_element = (i + j * group_size) * scale; 3602 || first_vec_index == -1)
2379 for (m = 0; m < scale; m++) 3603 {
2380 { 3604 first_vec_index = vec_index;
2381 if (!vect_get_mask_element (stmt, first_mask_element, m, 3605 }
2382 mask_nunits, only_one_vec, index, mask, 3606 else if (vec_index == second_vec_index
2383 &current_mask_element, &need_next_vector, 3607 || second_vec_index == -1)
2384 &number_of_mask_fixes, &mask_fixed, 3608 {
2385 &needs_first_vector)) 3609 second_vec_index = vec_index;
2386 return false; 3610 mask_element += nunits;
2387 3611 }
2388 mask[index++] = current_mask_element; 3612 else
2389 } 3613 {
2390 3614 if (dump_enabled_p ())
2391 if (index == mask_nunits) 3615 {
2392 { 3616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2393 tree mask_vec = NULL; 3617 "permutation requires at "
2394 3618 "least three vectors ");
2395 while (--index >= 0) 3619 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3620 stmt, 0);
3621 }
3622 gcc_assert (analyze_only);
3623 return false;
3624 }
3625
3626 gcc_assert (mask_element >= 0
3627 && mask_element < 2 * nunits);
3628 if (mask_element != index)
3629 noop_p = false;
3630 mask[index++] = mask_element;
3631
3632 if (index == nunits)
3633 {
3634 if (! noop_p
3635 && ! can_vec_perm_p (mode, false, &mask))
3636 {
3637 if (dump_enabled_p ())
2396 { 3638 {
2397 tree t = build_int_cst (mask_element_type, mask[index]); 3639 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2398 mask_vec = tree_cons (NULL, t, mask_vec); 3640 vect_location,
3641 "unsupported vect permute { ");
3642 for (i = 0; i < nunits; ++i)
3643 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ", mask[i]);
3644 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2399 } 3645 }
2400 mask_vec = build_vector (mask_type, mask_vec); 3646 gcc_assert (analyze_only);
2401 index = 0; 3647 return false;
2402 3648 }
2403 if (!targetm.vectorize.builtin_vec_perm_ok (vectype, 3649
2404 mask_vec)) 3650 if (! noop_p)
3651 ++*n_perms;
3652
3653 if (!analyze_only)
3654 {
3655 tree mask_vec = NULL_TREE;
3656
3657 if (! noop_p)
2405 { 3658 {
2406 if (vect_print_dump_info (REPORT_DETAILS)) 3659 auto_vec<tree, 32> mask_elts (nunits);
2407 { 3660 for (int l = 0; l < nunits; ++l)
2408 fprintf (vect_dump, "unsupported vect permute "); 3661 mask_elts.quick_push (build_int_cst (mask_element_type,
2409 print_generic_expr (vect_dump, mask_vec, 0); 3662 mask[l]));
2410 } 3663 mask_vec = build_vector (mask_type, mask_elts);
2411 free (mask);
2412 return false;
2413 } 3664 }
2414 3665
2415 if (!analyze_only) 3666 if (second_vec_index == -1)
2416 { 3667 second_vec_index = first_vec_index;
2417 if (need_next_vector) 3668
2418 { 3669 /* Generate the permute statement if necessary. */
2419 first_vec_index = second_vec_index; 3670 tree first_vec = dr_chain[first_vec_index];
2420 second_vec_index = vec_index; 3671 tree second_vec = dr_chain[second_vec_index];
2421 } 3672 gimple *perm_stmt;
2422 3673 if (! noop_p)
2423 next_scalar_stmt = VEC_index (gimple, 3674 {
2424 SLP_TREE_SCALAR_STMTS (node), scalar_index++); 3675 tree perm_dest
2425 3676 = vect_create_destination_var (gimple_assign_lhs (stmt),
2426 vect_create_mask_and_perm (stmt, next_scalar_stmt, 3677 vectype);
2427 mask_vec, first_vec_index, second_vec_index, 3678 perm_dest = make_ssa_name (perm_dest);
2428 gsi, node, builtin_decl, vectype, dr_chain, 3679 perm_stmt = gimple_build_assign (perm_dest,
2429 ncopies, vect_stmts_counter++); 3680 VEC_PERM_EXPR,
2430 } 3681 first_vec, second_vec,
2431 } 3682 mask_vec);
2432 } 3683 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2433 } 3684 }
2434 } 3685 else
2435 3686 /* If mask was NULL_TREE generate the requested
2436 free (mask); 3687 identity transform. */
3688 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
3689
3690 /* Store the vector statement in NODE. */
3691 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
3692 }
3693
3694 index = 0;
3695 first_vec_index = -1;
3696 second_vec_index = -1;
3697 noop_p = true;
3698 }
3699 }
3700 }
3701
2437 return true; 3702 return true;
2438 } 3703 }
2439 3704
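
Both the old and the new implementation of vect_transform_slp_perm_load compute the same masks for the a0b0c0... example above; the new code just derives them with plain index arithmetic. The following standalone sketch (hypothetical values: group size 3, nunits 4, vf 4, and a load permutation of {0,0,0} for the a's node) reproduces the documented result, including the rebasing of the last mask onto the second and third input vectors:

#include <stdio.h>

int
main (void)
{
  int perm[3] = { 0, 0, 0 };  /* SLP_TREE_LOAD_PERMUTATION of the a's node.  */
  int group_size = 3, nunits = 4, vf = 4;
  int mask[4], index = 0, first_vec = -1, second_vec = -1;

  for (int j = 0; j < vf; j++)
    for (int k = 0; k < group_size; k++)
      {
        int i = perm[k] + j * group_size;   /* Scalar element to fetch.  */
        int elt = i % nunits;
        if (i / nunits == first_vec || first_vec == -1)
          first_vec = i / nunits;
        else if (i / nunits == second_vec || second_vec == -1)
          {
            second_vec = i / nunits;
            elt += nunits;      /* Taken from the second input vector.  */
          }
        /* A third distinct vector is rejected in the real code.  */
        mask[index++] = elt;
        if (index == nunits)
          {
            printf ("{%d,%d,%d,%d}\n", mask[0], mask[1], mask[2], mask[3]);
            index = 0;
            first_vec = second_vec = -1;
          }
      }
  return 0;
}

This prints {0,0,0,3}, {3,3,6,6} and {2,5,5,5}, matching the comment in both versions.
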
2440 3705
2441 3706
2442 /* Vectorize SLP instance tree in postorder. */ 3707 /* Vectorize SLP instance tree in postorder. */
2443 3708
2444 static bool 3709 static bool
2445 vect_schedule_slp_instance (slp_tree node, slp_instance instance, 3710 vect_schedule_slp_instance (slp_tree node, slp_instance instance)
2446 unsigned int vectorization_factor) 3711 {
2447 { 3712 gimple *stmt;
2448 gimple stmt; 3713 bool grouped_store, is_store;
2449 bool strided_store, is_store;
2450 gimple_stmt_iterator si; 3714 gimple_stmt_iterator si;
2451 stmt_vec_info stmt_info; 3715 stmt_vec_info stmt_info;
2452 unsigned int vec_stmts_size, nunits, group_size; 3716 unsigned int group_size;
2453 tree vectype; 3717 tree vectype;
2454 int i; 3718 int i, j;
2455 slp_tree loads_node; 3719 slp_tree child;
2456 3720
2457 if (!node) 3721 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2458 return false; 3722 return false;
2459 3723
2460 vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance, 3724 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2461 vectorization_factor); 3725 vect_schedule_slp_instance (child, instance);
2462 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance, 3726
2463 vectorization_factor); 3727 /* Push SLP node def-type to stmts. */
2464 3728 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2465 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); 3729 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3730 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
3731 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
3732
3733 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2466 stmt_info = vinfo_for_stmt (stmt); 3734 stmt_info = vinfo_for_stmt (stmt);
2467 3735
2468 /* VECTYPE is the type of the destination. */ 3736 /* VECTYPE is the type of the destination. */
2469 vectype = STMT_VINFO_VECTYPE (stmt_info); 3737 vectype = STMT_VINFO_VECTYPE (stmt_info);
2470 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
2471 group_size = SLP_INSTANCE_GROUP_SIZE (instance); 3738 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
2472 3739
2473 /* For each SLP instance calculate number of vector stmts to be created 3740 if (!SLP_TREE_VEC_STMTS (node).exists ())
2474 for the scalar stmts in each node of the SLP tree. Number of vector 3741 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
2475 elements in one vector iteration is the number of scalar elements in 3742
2476 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector 3743 if (dump_enabled_p ())
2477 size. */ 3744 {
2478 vec_stmts_size = (vectorization_factor * group_size) / nunits; 3745 dump_printf_loc (MSG_NOTE, vect_location,
2479 3746 "------>vectorizing SLP node starting from: ");
2480 /* In case of load permutation we have to allocate vectorized statements for 3747 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2481 all the nodes that participate in that permutation. */ 3748 }
2482 if (SLP_INSTANCE_LOAD_PERMUTATION (instance)) 3749
2483 { 3750 /* Vectorized stmts go before the last scalar stmt which is where
2484 FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node) 3751 all uses are ready. */
2485 { 3752 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
2486 if (!SLP_TREE_VEC_STMTS (loads_node)) 3753
2487 { 3754 /* Mark the first element of the reduction chain as reduction to properly
2488 SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap, 3755 transform the node. In the analysis phase only the last element of the
2489 vec_stmts_size); 3756 chain is marked as reduction. */
2490 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size; 3757 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2491 } 3758 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
2492 } 3759 {
2493 } 3760 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
2494 3761 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2495 if (!SLP_TREE_VEC_STMTS (node)) 3762 }
2496 { 3763
2497 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size); 3764 /* Handle two-operation SLP nodes by vectorizing the group with
2498 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; 3765 both operations and then performing a merge. */
2499 } 3766 if (SLP_TREE_TWO_OPERATORS (node))
2500 3767 {
2501 if (vect_print_dump_info (REPORT_DETAILS)) 3768 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2502 { 3769 enum tree_code ocode = ERROR_MARK;
2503 fprintf (vect_dump, "------>vectorizing SLP node starting from: "); 3770 gimple *ostmt;
2504 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); 3771 auto_vec_perm_indices mask (group_size);
2505 } 3772 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
2506 3773 if (gimple_assign_rhs_code (ostmt) != code0)
2507 /* Loads should be inserted before the first load. */ 3774 {
2508 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance) 3775 mask.quick_push (1);
2509 && STMT_VINFO_STRIDED_ACCESS (stmt_info) 3776 ocode = gimple_assign_rhs_code (ostmt);
2510 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))) 3777 }
2511 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance)); 3778 else
2512 else 3779 mask.quick_push (0);
2513 si = gsi_for_stmt (stmt); 3780 if (ocode != ERROR_MARK)
2514 3781 {
2515 /* Stores should be inserted just before the last store. */ 3782 vec<gimple *> v0;
2516 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) 3783 vec<gimple *> v1;
2517 && REFERENCE_CLASS_P (gimple_get_lhs (stmt))) 3784 unsigned j;
2518 { 3785 tree tmask = NULL_TREE;
2519 gimple last_store = vect_find_last_store_in_slp_instance (instance); 3786 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
2520 si = gsi_for_stmt (last_store); 3787 v0 = SLP_TREE_VEC_STMTS (node).copy ();
2521 } 3788 SLP_TREE_VEC_STMTS (node).truncate (0);
2522 3789 gimple_assign_set_rhs_code (stmt, ocode);
2523 is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance); 3790 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3791 gimple_assign_set_rhs_code (stmt, code0);
3792 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3793 SLP_TREE_VEC_STMTS (node).truncate (0);
3794 tree meltype = build_nonstandard_integer_type
3795 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3796 tree mvectype = get_same_sized_vectype (meltype, vectype);
3797 unsigned k = 0, l;
3798 for (j = 0; j < v0.length (); ++j)
3799 {
3800 unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3801 auto_vec<tree, 32> melts (nunits);
3802 for (l = 0; l < nunits; ++l)
3803 {
3804 if (k >= group_size)
3805 k = 0;
3806 tree t = build_int_cst (meltype, mask[k++] * nunits + l);
3807 melts.quick_push (t);
3808 }
3809 tmask = build_vector (mvectype, melts);
3810
3811 /* ??? Not all targets support a VEC_PERM_EXPR with a
3812 constant mask that would translate to a vec_merge RTX
3813 (with their vec_perm_const_ok). We can either not
3814 vectorize in that case or let veclower do its job.
3815 Unfortunately that isn't too great and at least for
3816 plus/minus we'd eventually like to match targets
3817 vector addsub instructions. */
3818 gimple *vstmt;
3819 vstmt = gimple_build_assign (make_ssa_name (vectype),
3820 VEC_PERM_EXPR,
3821 gimple_assign_lhs (v0[j]),
3822 gimple_assign_lhs (v1[j]), tmask);
3823 vect_finish_stmt_generation (stmt, vstmt, &si);
3824 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3825 }
3826 v0.release ();
3827 v1.release ();
3828 return false;
3829 }
3830 }
3831 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3832
3833 /* Restore stmt def-types. */
3834 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3835 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3836 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
3837 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
3838
2524 return is_store; 3839 return is_store;
2525 } 3840 }
2526 3841
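
For SLP_TREE_TWO_OPERATORS above, the node is vectorized once per opcode and the two results are blended with a VEC_PERM_EXPR whose selector takes lane L from the CODE0 vector (element L) or from the OCODE vector (element NUNITS + L). A standalone sketch with hypothetical values, a { plus, minus, plus, minus } group and nunits == 4, prints the selector that the melts loop builds:

#include <stdio.h>

int
main (void)
{
  int mask[4] = { 0, 1, 0, 1 };  /* 0: stmt uses code0, 1: stmt uses ocode.  */
  int group_size = 4, nunits = 4, k = 0;

  for (int l = 0; l < nunits; ++l)
    {
      if (k >= group_size)
        k = 0;
      printf ("%d ", mask[k++] * nunits + l);   /* melts element.  */
    }
  printf ("\n");
  return 0;
}

This prints 0 5 2 7, i.e. the addsub-style merge of the two vectorized results.
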
3842 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3843 For loop vectorization this is done in vectorizable_call, but for SLP
3844 it needs to be deferred until end of vect_schedule_slp, because multiple
3845 SLP instances may refer to the same scalar stmt. */
3846
3847 static void
3848 vect_remove_slp_scalar_calls (slp_tree node)
3849 {
3850 gimple *stmt, *new_stmt;
3851 gimple_stmt_iterator gsi;
3852 int i;
3853 slp_tree child;
3854 tree lhs;
3855 stmt_vec_info stmt_info;
3856
3857 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3858 return;
3859
3860 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3861 vect_remove_slp_scalar_calls (child);
3862
3863 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3864 {
3865 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3866 continue;
3867 stmt_info = vinfo_for_stmt (stmt);
3868 if (stmt_info == NULL
3869 || is_pattern_stmt_p (stmt_info)
3870 || !PURE_SLP_STMT (stmt_info))
3871 continue;
3872 lhs = gimple_call_lhs (stmt);
3873 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3874 set_vinfo_for_stmt (new_stmt, stmt_info);
3875 set_vinfo_for_stmt (stmt, NULL);
3876 STMT_VINFO_STMT (stmt_info) = new_stmt;
3877 gsi = gsi_for_stmt (stmt);
3878 gsi_replace (&gsi, new_stmt, false);
3879 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3880 }
3881 }
2527 3882
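
To make the effect of vect_remove_slp_scalar_calls concrete, a hypothetical before/after (the statement names are illustrative only):

/* before (scalar call left behind by the SLP transform, PURE_SLP):
     x_3 = sqrtf (a_1);
   after (lhs zeroed, stmt_vec_info rewired to the new stmt):
     x_3 = 0.0;
   The dead assignment is then left for DCE once every SLP instance
   sharing the call has been transformed.  */
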
2528 /* Generate vector code for all SLP instances in the loop/basic block. */ 3883 /* Generate vector code for all SLP instances in the loop/basic block. */
2529 3884
2530 bool 3885 bool
2531 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) 3886 vect_schedule_slp (vec_info *vinfo)
2532 { 3887 {
2533 VEC (slp_instance, heap) *slp_instances; 3888 vec<slp_instance> slp_instances;
2534 slp_instance instance; 3889 slp_instance instance;
2535 unsigned int i, vf; 3890 unsigned int i;
2536 bool is_store = false; 3891 bool is_store = false;
2537 3892
2538 if (loop_vinfo) 3893 slp_instances = vinfo->slp_instances;
2539 { 3894 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2540 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2541 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2542 }
2543 else
2544 {
2545 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2546 vf = 1;
2547 }
2548
2549 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2550 { 3895 {
2551 /* Schedule the tree of INSTANCE. */ 3896 /* Schedule the tree of INSTANCE. */
2552 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), 3897 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
2553 instance, vf); 3898 instance);
2554 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS) 3899 if (dump_enabled_p ())
2555 || vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) 3900 dump_printf_loc (MSG_NOTE, vect_location,
2556 fprintf (vect_dump, "vectorizing stmts using SLP."); 3901 "vectorizing stmts using SLP.\n");
2557 } 3902 }
2558 3903
2559 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) 3904 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2560 { 3905 {
2561 slp_tree root = SLP_INSTANCE_TREE (instance); 3906 slp_tree root = SLP_INSTANCE_TREE (instance);
2562 gimple store; 3907 gimple *store;
2563 unsigned int j; 3908 unsigned int j;
2564 gimple_stmt_iterator gsi; 3909 gimple_stmt_iterator gsi;
2565 3910
2566 for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store) 3911 /* Remove scalar call stmts. Do not do this for basic-block
3912 vectorization as not all uses may be vectorized.
3913 ??? Why should this be necessary? DCE should be able to
3914 remove the stmts itself.
3915 ??? For BB vectorization we can as well remove scalar
3916 stmts starting from the SLP tree root if they have no
3917 uses. */
3918 if (is_a <loop_vec_info> (vinfo))
3919 vect_remove_slp_scalar_calls (root);
3920
3921 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
2567 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++) 3922 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
2568 { 3923 {
2569 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store))) 3924 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
2570 break; 3925 break;
2571 3926
3927 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3928 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
2572 /* Free the attached stmt_vec_info and remove the stmt. */ 3929 /* Free the attached stmt_vec_info and remove the stmt. */
2573 gsi = gsi_for_stmt (store); 3930 gsi = gsi_for_stmt (store);
3931 unlink_stmt_vdef (store);
2574 gsi_remove (&gsi, true); 3932 gsi_remove (&gsi, true);
3933 release_defs (store);
2575 free_stmt_vec_info (store); 3934 free_stmt_vec_info (store);
2576 } 3935 }
2577 } 3936 }
2578 3937
2579 return is_store; 3938 return is_store;
2580 } 3939 }
2581
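
Similarly, the store-removal loop at the end of vect_schedule_slp can be pictured with a hypothetical group of two scalar stores (the GIMPLE shown is illustrative only):

/* before:                        after:
     MEM[p_1] = x_2;                vect_x.0_7 = ...;
     MEM[p_1 + 4] = y_3;            MEM[vectp.1_8] = vect_x.0_7;

   Each scalar store is unlinked from the virtual def-use chain
   (unlink_stmt_vdef) before gsi_remove, so later uses of memory
   resolve to the vectorized store.  */
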
2582
2583 /* Vectorize the basic block. */
2584
2585 void
2586 vect_slp_transform_bb (basic_block bb)
2587 {
2588 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
2589 gimple_stmt_iterator si;
2590
2591 gcc_assert (bb_vinfo);
2592
2593 if (vect_print_dump_info (REPORT_DETAILS))
2594 fprintf (vect_dump, "SLPing BB\n");
2595
2596 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2597 {
2598 gimple stmt = gsi_stmt (si);
2599 stmt_vec_info stmt_info;
2600
2601 if (vect_print_dump_info (REPORT_DETAILS))
2602 {
2603 fprintf (vect_dump, "------>SLPing statement: ");
2604 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2605 }
2606
2607 stmt_info = vinfo_for_stmt (stmt);
2608 gcc_assert (stmt_info);
2609
2610 /* Schedule all the SLP instances when the first SLP stmt is reached. */
2611 if (STMT_SLP_TYPE (stmt_info))
2612 {
2613 vect_schedule_slp (NULL, bb_vinfo);
2614 break;
2615 }
2616 }
2617
2618 mark_sym_for_renaming (gimple_vop (cfun));
2619 /* The memory tags and pointers in vectorized statements need to
2620 have their SSA forms updated. FIXME, why can't this be delayed
2621 until all the loops have been transformed? */
2622 update_ssa (TODO_update_ssa);
2623
2624 if (vect_print_dump_info (REPORT_DETAILS))
2625 fprintf (vect_dump, "BASIC BLOCK VECTORIZED\n");
2626
2627 destroy_bb_vec_info (bb_vinfo);
2628 }
2629