111
|
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
|
|
2 and a lowering pass for OpenACC device directives.
|
|
3
|
131
|
4 Copyright (C) 2005-2018 Free Software Foundation, Inc.
|
111
|
5
|
|
6 This file is part of GCC.
|
|
7
|
|
8 GCC is free software; you can redistribute it and/or modify it under
|
|
9 the terms of the GNU General Public License as published by the Free
|
|
10 Software Foundation; either version 3, or (at your option) any later
|
|
11 version.
|
|
12
|
|
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
16 for more details.
|
|
17
|
|
18 You should have received a copy of the GNU General Public License
|
|
19 along with GCC; see the file COPYING3. If not see
|
|
20 <http://www.gnu.org/licenses/>. */
|
|
21
|
|
22 #include "config.h"
|
|
23 #include "system.h"
|
|
24 #include "coretypes.h"
|
|
25 #include "backend.h"
|
|
26 #include "target.h"
|
|
27 #include "tree.h"
|
|
28 #include "gimple.h"
|
|
29 #include "tree-pass.h"
|
|
30 #include "ssa.h"
|
|
31 #include "cgraph.h"
|
|
32 #include "pretty-print.h"
|
|
33 #include "diagnostic-core.h"
|
|
34 #include "fold-const.h"
|
|
35 #include "internal-fn.h"
|
|
36 #include "langhooks.h"
|
|
37 #include "gimplify.h"
|
|
38 #include "gimple-iterator.h"
|
|
39 #include "gimplify-me.h"
|
|
40 #include "gimple-walk.h"
|
|
41 #include "tree-cfg.h"
|
|
42 #include "tree-into-ssa.h"
|
|
43 #include "tree-nested.h"
|
|
44 #include "stor-layout.h"
|
|
45 #include "common/common-target.h"
|
|
46 #include "omp-general.h"
|
|
47 #include "omp-offload.h"
|
|
48 #include "lto-section-names.h"
|
|
49 #include "gomp-constants.h"
|
|
50 #include "gimple-pretty-print.h"
|
|
51 #include "intl.h"
|
|
52 #include "stringpool.h"
|
|
53 #include "attribs.h"
|
|
54 #include "cfgloop.h"
|
|
55
|
|
56 /* Describe the OpenACC looping structure of a function. The entire
|
|
57 function is held in a 'NULL' loop. */
|
|
58
|
|
59 struct oacc_loop
|
|
60 {
|
|
61 oacc_loop *parent; /* Containing loop. */
|
|
62
|
|
63 oacc_loop *child; /* First inner loop. */
|
|
64
|
|
65 oacc_loop *sibling; /* Next loop within same parent. */
|
|
66
|
|
67 location_t loc; /* Location of the loop start. */
|
|
68
|
|
69 gcall *marker; /* Initial head marker. */
|
|
70
|
|
71 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
|
|
72 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
|
|
73
|
|
74 tree routine; /* Pseudo-loop enclosing a routine. */
|
|
75
|
|
76 unsigned mask; /* Partitioning mask. */
|
|
77 unsigned e_mask; /* Partitioning of element loops (when tiling). */
|
|
78 unsigned inner; /* Partitioning of inner loops. */
|
|
79 unsigned flags; /* Partitioning flags. */
|
|
80 vec<gcall *> ifns; /* Contained loop abstraction functions. */
|
|
81 tree chunk_size; /* Chunk size. */
|
|
82 gcall *head_end; /* Final marker of head sequence. */
|
|
83 };
|
|
84
|
|
85 /* Holds offload tables with decls. */
|
|
86 vec<tree, va_gc> *offload_funcs, *offload_vars;
|
|
87
|
|
88 /* Return level at which oacc routine may spawn a partitioned loop, or
|
|
89 -1 if it is not a routine (i.e. is an offload fn). */
|
|
90
|
|
91 static int
|
|
92 oacc_fn_attrib_level (tree attr)
|
|
93 {
|
|
94 tree pos = TREE_VALUE (attr);
|
|
95
|
|
96 if (!TREE_PURPOSE (pos))
|
|
97 return -1;
|
|
98
|
|
99 int ix = 0;
|
|
100 for (ix = 0; ix != GOMP_DIM_MAX;
|
|
101 ix++, pos = TREE_CHAIN (pos))
|
|
102 if (!integer_zerop (TREE_PURPOSE (pos)))
|
|
103 break;
|
|
104
|
|
105 return ix;
|
|
106 }
|
|
107
|
|
108 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
|
|
109 adds their addresses and sizes to constructor-vector V_CTOR. */
|
|
110
|
|
111 static void
|
|
112 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
|
|
113 vec<constructor_elt, va_gc> *v_ctor)
|
|
114 {
|
|
115 unsigned len = vec_safe_length (v_decls);
|
|
116 for (unsigned i = 0; i < len; i++)
|
|
117 {
|
|
118 tree it = (*v_decls)[i];
|
|
119 bool is_var = VAR_P (it);
|
|
120 bool is_link_var
|
|
121 = is_var
|
|
122 #ifdef ACCEL_COMPILER
|
|
123 && DECL_HAS_VALUE_EXPR_P (it)
|
|
124 #endif
|
|
125 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
|
|
126
|
|
127 tree size = NULL_TREE;
|
|
128 if (is_var)
|
|
129 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
|
|
130
|
|
131 tree addr;
|
|
132 if (!is_link_var)
|
|
133 addr = build_fold_addr_expr (it);
|
|
134 else
|
|
135 {
|
|
136 #ifdef ACCEL_COMPILER
|
|
137 /* For "omp declare target link" vars add address of the pointer to
|
|
138 the target table, instead of address of the var. */
|
|
139 tree value_expr = DECL_VALUE_EXPR (it);
|
|
140 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
|
|
141 varpool_node::finalize_decl (link_ptr_decl);
|
|
142 addr = build_fold_addr_expr (link_ptr_decl);
|
|
143 #else
|
|
144 addr = build_fold_addr_expr (it);
|
|
145 #endif
|
|
146
|
|
147 /* Most significant bit of the size marks "omp declare target link"
|
|
148 vars in host and target tables. */
|
|
149 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
|
|
150 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
|
|
151 * BITS_PER_UNIT - 1);
|
|
152 size = wide_int_to_tree (const_ptr_type_node, isize);
|
|
153 }
|
|
154
|
|
155 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
|
|
156 if (is_var)
|
|
157 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
|
|
158 }
|
|
159 }
|
|
160
|
|
161 /* Create new symbols containing (address, size) pairs for global variables,
|
|
162 marked with "omp declare target" attribute, as well as addresses for the
|
|
163 functions, which are outlined offloading regions. */
|
|
164 void
|
|
165 omp_finish_file (void)
|
|
166 {
|
|
167 unsigned num_funcs = vec_safe_length (offload_funcs);
|
|
168 unsigned num_vars = vec_safe_length (offload_vars);
|
|
169
|
|
170 if (num_funcs == 0 && num_vars == 0)
|
|
171 return;
|
|
172
|
|
173 if (targetm_common.have_named_sections)
|
|
174 {
|
|
175 vec<constructor_elt, va_gc> *v_f, *v_v;
|
|
176 vec_alloc (v_f, num_funcs);
|
|
177 vec_alloc (v_v, num_vars * 2);
|
|
178
|
|
179 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
|
|
180 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
|
|
181
|
|
182 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
|
|
183 num_vars * 2);
|
|
184 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
|
|
185 num_funcs);
|
|
186 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
|
|
187 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
|
|
188 tree ctor_v = build_constructor (vars_decl_type, v_v);
|
|
189 tree ctor_f = build_constructor (funcs_decl_type, v_f);
|
|
190 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
|
|
191 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
|
|
192 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
|
|
193 get_identifier (".offload_func_table"),
|
|
194 funcs_decl_type);
|
|
195 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
|
|
196 get_identifier (".offload_var_table"),
|
|
197 vars_decl_type);
|
|
198 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
|
|
199 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
|
|
200 otherwise a joint table in a binary will contain padding between
|
|
201 tables from multiple object files. */
|
|
202 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
|
|
203 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
|
|
204 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
|
|
205 DECL_INITIAL (funcs_decl) = ctor_f;
|
|
206 DECL_INITIAL (vars_decl) = ctor_v;
|
|
207 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
|
|
208 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
|
|
209
|
|
210 varpool_node::finalize_decl (vars_decl);
|
|
211 varpool_node::finalize_decl (funcs_decl);
|
|
212 }
|
|
213 else
|
|
214 {
|
|
215 for (unsigned i = 0; i < num_funcs; i++)
|
|
216 {
|
|
217 tree it = (*offload_funcs)[i];
|
|
218 targetm.record_offload_symbol (it);
|
|
219 }
|
|
220 for (unsigned i = 0; i < num_vars; i++)
|
|
221 {
|
|
222 tree it = (*offload_vars)[i];
|
|
223 targetm.record_offload_symbol (it);
|
|
224 }
|
|
225 }
|
|
226 }
|
|
227
|
|
228 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
|
|
229 axis DIM. Return a tmp var holding the result. */
|
|
230
|
|
231 static tree
|
|
232 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
|
|
233 {
|
|
234 tree arg = build_int_cst (unsigned_type_node, dim);
|
|
235 tree size = create_tmp_var (integer_type_node);
|
|
236 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
|
|
237 gimple *call = gimple_build_call_internal (fn, 1, arg);
|
|
238
|
|
239 gimple_call_set_lhs (call, size);
|
|
240 gimple_seq_add_stmt (seq, call);
|
|
241
|
|
242 return size;
|
|
243 }
|
|
244
|
|
245 /* Find the number of threads (POS = false), or thread number (POS =
|
|
246 true) for an OpenACC region partitioned as MASK. Setup code
|
|
247 required for the calculation is added to SEQ. */
|
|
248
|
|
249 static tree
|
|
250 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
|
|
251 {
|
|
252 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
|
|
253 unsigned ix;
|
|
254
|
|
255 /* Start at gang level, and examine relevant dimension indices. */
|
|
256 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
|
|
257 if (GOMP_DIM_MASK (ix) & mask)
|
|
258 {
|
|
259 if (res)
|
|
260 {
|
|
261 /* We had an outer index, so scale that by the size of
|
|
262 this dimension. */
|
|
263 tree n = oacc_dim_call (false, ix, seq);
|
|
264 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
|
|
265 }
|
|
266 if (pos)
|
|
267 {
|
|
268 /* Determine index in this dimension. */
|
|
269 tree id = oacc_dim_call (true, ix, seq);
|
|
270 if (res)
|
|
271 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
|
|
272 else
|
|
273 res = id;
|
|
274 }
|
|
275 }
|
|
276
|
|
277 if (res == NULL_TREE)
|
|
278 res = integer_zero_node;
|
|
279
|
|
280 return res;
|
|
281 }
|
|
282
|
|
283 /* Transform IFN_GOACC_LOOP calls to actual code. See
|
|
284 expand_oacc_for for where these are generated. At the vector
|
|
285 level, we stride loops, such that each member of a warp will
|
|
286 operate on adjacent iterations. At the worker and gang level,
|
|
287 each gang/warp executes a set of contiguous iterations. Chunking
|
|
288 can override this such that each iteration engine executes a
|
|
289 contiguous chunk, and then moves on to stride to the next chunk. */
|
|
290
|
|
291 static void
|
|
292 oacc_xform_loop (gcall *call)
|
|
293 {
|
|
294 gimple_stmt_iterator gsi = gsi_for_stmt (call);
|
|
295 enum ifn_goacc_loop_kind code
|
|
296 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
|
|
297 tree dir = gimple_call_arg (call, 1);
|
|
298 tree range = gimple_call_arg (call, 2);
|
|
299 tree step = gimple_call_arg (call, 3);
|
|
300 tree chunk_size = NULL_TREE;
|
|
301 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
|
|
302 tree lhs = gimple_call_lhs (call);
|
|
303 tree type = TREE_TYPE (lhs);
|
|
304 tree diff_type = TREE_TYPE (range);
|
|
305 tree r = NULL_TREE;
|
|
306 gimple_seq seq = NULL;
|
|
307 bool chunking = false, striding = true;
|
|
308 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
|
|
309 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
|
|
310
|
|
311 #ifdef ACCEL_COMPILER
|
|
312 chunk_size = gimple_call_arg (call, 4);
|
|
313 if (integer_minus_onep (chunk_size) /* Force static allocation. */
|
|
314 || integer_zerop (chunk_size)) /* Default (also static). */
|
|
315 {
|
|
316 /* If we're at the gang level, we want each to execute a
|
|
317 contiguous run of iterations. Otherwise we want each element
|
|
318 to stride. */
|
|
319 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
|
|
320 chunking = false;
|
|
321 }
|
|
322 else
|
|
323 {
|
|
324 /* Chunk of size 1 is striding. */
|
|
325 striding = integer_onep (chunk_size);
|
|
326 chunking = !striding;
|
|
327 }
|
|
328 #endif
|
|
329
|
|
330 /* striding=true, chunking=true
|
|
331 -> invalid.
|
|
332 striding=true, chunking=false
|
|
333 -> chunks=1
|
|
334 striding=false,chunking=true
|
|
335 -> chunks=ceil (range/(chunksize*threads*step))
|
|
336 striding=false,chunking=false
|
|
337 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
|
|
338 push_gimplify_context (true);
|
|
339
|
|
340 switch (code)
|
|
341 {
|
|
342 default: gcc_unreachable ();
|
|
343
|
|
344 case IFN_GOACC_LOOP_CHUNKS:
|
|
345 if (!chunking)
|
|
346 r = build_int_cst (type, 1);
|
|
347 else
|
|
348 {
|
|
349 /* chunk_max
|
|
350 = (range - dir) / (chunks * step * num_threads) + dir */
|
|
351 tree per = oacc_thread_numbers (false, mask, &seq);
|
|
352 per = fold_convert (type, per);
|
|
353 chunk_size = fold_convert (type, chunk_size);
|
|
354 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
|
|
355 per = fold_build2 (MULT_EXPR, type, per, step);
|
|
356 r = build2 (MINUS_EXPR, type, range, dir);
|
|
357 r = build2 (PLUS_EXPR, type, r, per);
|
|
358 r = build2 (TRUNC_DIV_EXPR, type, r, per);
|
|
359 }
|
|
360 break;
|
|
361
|
|
362 case IFN_GOACC_LOOP_STEP:
|
|
363 {
|
|
364 /* If striding, step by the entire compute volume, otherwise
|
|
365 step by the inner volume. */
|
|
366 unsigned volume = striding ? mask : inner_mask;
|
|
367
|
|
368 r = oacc_thread_numbers (false, volume, &seq);
|
|
369 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
|
|
370 }
|
|
371 break;
|
|
372
|
|
373 case IFN_GOACC_LOOP_OFFSET:
|
|
374 /* Enable vectorization on non-SIMT targets. */
|
|
375 if (!targetm.simt.vf
|
|
376 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
|
|
377 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
|
|
378 the loop. */
|
|
379 && (flag_tree_loop_vectorize
|
|
380 || !global_options_set.x_flag_tree_loop_vectorize))
|
|
381 {
|
|
382 basic_block bb = gsi_bb (gsi);
|
|
383 struct loop *parent = bb->loop_father;
|
|
384 struct loop *body = parent->inner;
|
|
385
|
|
386 parent->force_vectorize = true;
|
|
387 parent->safelen = INT_MAX;
|
|
388
|
|
389 /* "Chunking loops" may have inner loops. */
|
|
390 if (parent->inner)
|
|
391 {
|
|
392 body->force_vectorize = true;
|
|
393 body->safelen = INT_MAX;
|
|
394 }
|
|
395
|
|
396 cfun->has_force_vectorize_loops = true;
|
|
397 }
|
|
398 if (striding)
|
|
399 {
|
|
400 r = oacc_thread_numbers (true, mask, &seq);
|
|
401 r = fold_convert (diff_type, r);
|
|
402 }
|
|
403 else
|
|
404 {
|
|
405 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
|
|
406 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
|
|
407 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
|
|
408 inner_size, outer_size);
|
|
409
|
|
410 volume = fold_convert (diff_type, volume);
|
|
411 if (chunking)
|
|
412 chunk_size = fold_convert (diff_type, chunk_size);
|
|
413 else
|
|
414 {
|
|
415 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
|
|
416
|
|
417 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
|
|
418 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
|
|
419 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
|
|
420 }
|
|
421
|
|
422 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
|
|
423 fold_convert (diff_type, inner_size));
|
|
424 r = oacc_thread_numbers (true, outer_mask, &seq);
|
|
425 r = fold_convert (diff_type, r);
|
|
426 r = build2 (MULT_EXPR, diff_type, r, span);
|
|
427
|
|
428 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
|
|
429 inner = fold_convert (diff_type, inner);
|
|
430 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
|
|
431
|
|
432 if (chunking)
|
|
433 {
|
|
434 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
|
|
435 tree per
|
|
436 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
|
|
437 per = build2 (MULT_EXPR, diff_type, per, chunk);
|
|
438
|
|
439 r = build2 (PLUS_EXPR, diff_type, r, per);
|
|
440 }
|
|
441 }
|
|
442 r = fold_build2 (MULT_EXPR, diff_type, r, step);
|
|
443 if (type != diff_type)
|
|
444 r = fold_convert (type, r);
|
|
445 break;
|
|
446
|
|
447 case IFN_GOACC_LOOP_BOUND:
|
|
448 if (striding)
|
|
449 r = range;
|
|
450 else
|
|
451 {
|
|
452 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
|
|
453 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
|
|
454 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
|
|
455 inner_size, outer_size);
|
|
456
|
|
457 volume = fold_convert (diff_type, volume);
|
|
458 if (chunking)
|
|
459 chunk_size = fold_convert (diff_type, chunk_size);
|
|
460 else
|
|
461 {
|
|
462 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
|
|
463
|
|
464 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
|
|
465 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
|
|
466 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
|
|
467 }
|
|
468
|
|
469 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
|
|
470 fold_convert (diff_type, inner_size));
|
|
471
|
|
472 r = fold_build2 (MULT_EXPR, diff_type, span, step);
|
|
473
|
|
474 tree offset = gimple_call_arg (call, 6);
|
|
475 r = build2 (PLUS_EXPR, diff_type, r,
|
|
476 fold_convert (diff_type, offset));
|
|
477 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
|
|
478 diff_type, r, range);
|
|
479 }
|
|
480 if (diff_type != type)
|
|
481 r = fold_convert (type, r);
|
|
482 break;
|
|
483 }
|
|
484
|
|
485 gimplify_assign (lhs, r, &seq);
|
|
486
|
|
487 pop_gimplify_context (NULL);
|
|
488
|
|
489 gsi_replace_with_seq (&gsi, seq, true);
|
|
490 }
|
|
491
|
|
492 /* Transform a GOACC_TILE call. Determines the element loop span for
|
|
493 the specified loop of the nest. This is 1 if we're not tiling.
|
|
494
|
|
495 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
|
|
496
|
|
497 static void
|
|
498 oacc_xform_tile (gcall *call)
|
|
499 {
|
|
500 gimple_stmt_iterator gsi = gsi_for_stmt (call);
|
|
501 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
|
|
502 /* Inner loops have higher loop_nos. */
|
|
503 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
|
|
504 tree tile_size = gimple_call_arg (call, 2);
|
|
505 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
|
|
506 tree lhs = gimple_call_lhs (call);
|
|
507 tree type = TREE_TYPE (lhs);
|
|
508 gimple_seq seq = NULL;
|
|
509 tree span = build_int_cst (type, 1);
|
|
510
|
|
511 gcc_assert (!(e_mask
|
|
512 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
|
|
513 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
|
|
514 push_gimplify_context (!seen_error ());
|
|
515
|
|
516 #ifndef ACCEL_COMPILER
|
|
517 /* Partitioning disabled on host compilers. */
|
|
518 e_mask = 0;
|
|
519 #endif
|
|
520 if (!e_mask)
|
|
521 /* Not paritioning. */
|
|
522 span = integer_one_node;
|
|
523 else if (!integer_zerop (tile_size))
|
|
524 /* User explicitly specified size. */
|
|
525 span = tile_size;
|
|
526 else
|
|
527 {
|
|
528 /* Pick a size based on the paritioning of the element loop and
|
|
529 the number of loop nests. */
|
|
530 tree first_size = NULL_TREE;
|
|
531 tree second_size = NULL_TREE;
|
|
532
|
|
533 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
|
|
534 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
|
|
535 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
|
|
536 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
|
|
537
|
|
538 if (!first_size)
|
|
539 {
|
|
540 first_size = second_size;
|
|
541 second_size = NULL_TREE;
|
|
542 }
|
|
543
|
|
544 if (loop_no + 1 == collapse)
|
|
545 {
|
|
546 span = first_size;
|
|
547 if (!loop_no && second_size)
|
|
548 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
|
|
549 span, second_size);
|
|
550 }
|
|
551 else if (loop_no + 2 == collapse)
|
|
552 span = second_size;
|
|
553 else
|
|
554 span = NULL_TREE;
|
|
555
|
|
556 if (!span)
|
|
557 /* There's no obvious element size for this loop. Options
|
|
558 are 1, first_size or some non-unity constant (32 is my
|
|
559 favourite). We should gather some statistics. */
|
|
560 span = first_size;
|
|
561 }
|
|
562
|
|
563 span = fold_convert (type, span);
|
|
564 gimplify_assign (lhs, span, &seq);
|
|
565
|
|
566 pop_gimplify_context (NULL);
|
|
567
|
|
568 gsi_replace_with_seq (&gsi, seq, true);
|
|
569 }
|
|
570
|
|
571 /* Default partitioned and minimum partitioned dimensions. */
|
|
572
|
|
573 static int oacc_default_dims[GOMP_DIM_MAX];
|
|
574 static int oacc_min_dims[GOMP_DIM_MAX];
|
|
575
|
|
576 /* Parse the default dimension parameter. This is a set of
|
|
577 :-separated optional compute dimensions. Each specified dimension
|
|
578 is a positive integer. When device type support is added, it is
|
|
579 planned to be a comma separated list of such compute dimensions,
|
|
580 with all but the first prefixed by the colon-terminated device
|
|
581 type. */
|
|
582
|
|
583 static void
|
|
584 oacc_parse_default_dims (const char *dims)
|
|
585 {
|
|
586 int ix;
|
|
587
|
|
588 for (ix = GOMP_DIM_MAX; ix--;)
|
|
589 {
|
|
590 oacc_default_dims[ix] = -1;
|
|
591 oacc_min_dims[ix] = 1;
|
|
592 }
|
|
593
|
|
594 #ifndef ACCEL_COMPILER
|
|
595 /* Cannot be overridden on the host. */
|
|
596 dims = NULL;
|
|
597 #endif
|
|
598 if (dims)
|
|
599 {
|
|
600 const char *pos = dims;
|
|
601
|
|
602 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
|
|
603 {
|
|
604 if (ix)
|
|
605 {
|
|
606 if (*pos != ':')
|
|
607 goto malformed;
|
|
608 pos++;
|
|
609 }
|
|
610
|
|
611 if (*pos != ':')
|
|
612 {
|
|
613 long val;
|
|
614 const char *eptr;
|
|
615
|
|
616 errno = 0;
|
|
617 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
|
|
618 if (errno || val <= 0 || (int) val != val)
|
|
619 goto malformed;
|
|
620 pos = eptr;
|
|
621 oacc_default_dims[ix] = (int) val;
|
|
622 }
|
|
623 }
|
|
624 if (*pos)
|
|
625 {
|
|
626 malformed:
|
|
627 error_at (UNKNOWN_LOCATION,
|
|
628 "-fopenacc-dim operand is malformed at '%s'", pos);
|
|
629 }
|
|
630 }
|
|
631
|
|
632 /* Allow the backend to validate the dimensions. */
|
|
633 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
|
|
634 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
|
|
635 }
|
|
636
|
|
637 /* Validate and update the dimensions for offloaded FN. ATTRS is the
|
|
638 raw attribute. DIMS is an array of dimensions, which is filled in.
|
|
639 LEVEL is the partitioning level of a routine, or -1 for an offload
|
|
640 region itself. USED is the mask of partitioned execution in the
|
|
641 function. */
|
|
642
|
|
643 static void
|
|
644 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
|
|
645 {
|
|
646 tree purpose[GOMP_DIM_MAX];
|
|
647 unsigned ix;
|
|
648 tree pos = TREE_VALUE (attrs);
|
|
649
|
|
650 /* Make sure the attribute creator attached the dimension
|
|
651 information. */
|
|
652 gcc_assert (pos);
|
|
653
|
|
654 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
|
|
655 {
|
|
656 purpose[ix] = TREE_PURPOSE (pos);
|
|
657 tree val = TREE_VALUE (pos);
|
|
658 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
|
|
659 pos = TREE_CHAIN (pos);
|
|
660 }
|
|
661
|
|
662 bool changed = targetm.goacc.validate_dims (fn, dims, level);
|
|
663
|
|
664 /* Default anything left to 1 or a partitioned default. */
|
|
665 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
|
|
666 if (dims[ix] < 0)
|
|
667 {
|
|
668 /* The OpenACC spec says 'If the [num_gangs] clause is not
|
|
669 specified, an implementation-defined default will be used;
|
|
670 the default may depend on the code within the construct.'
|
|
671 (2.5.6). Thus an implementation is free to choose
|
|
672 non-unity default for a parallel region that doesn't have
|
|
673 any gang-partitioned loops. However, it appears that there
|
|
674 is a sufficient body of user code that expects non-gang
|
|
675 partitioned regions to not execute in gang-redundant mode.
|
|
676 So we (a) don't warn about the non-portability and (b) pick
|
|
677 the minimum permissible dimension size when there is no
|
|
678 partitioned execution. Otherwise we pick the global
|
|
679 default for the dimension, which the user can control. The
|
|
680 same wording and logic applies to num_workers and
|
|
681 vector_length, however the worker- or vector- single
|
|
682 execution doesn't have the same impact as gang-redundant
|
|
683 execution. (If the minimum gang-level partioning is not 1,
|
|
684 the target is probably too confusing.) */
|
|
685 dims[ix] = (used & GOMP_DIM_MASK (ix)
|
|
686 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
|
|
687 changed = true;
|
|
688 }
|
|
689
|
|
690 if (changed)
|
|
691 {
|
|
692 /* Replace the attribute with new values. */
|
|
693 pos = NULL_TREE;
|
|
694 for (ix = GOMP_DIM_MAX; ix--;)
|
|
695 pos = tree_cons (purpose[ix],
|
|
696 build_int_cst (integer_type_node, dims[ix]), pos);
|
|
697 oacc_replace_fn_attrib (fn, pos);
|
|
698 }
|
|
699 }
|
|
700
|
|
701 /* Create an empty OpenACC loop structure at LOC. */
|
|
702
|
|
703 static oacc_loop *
|
|
704 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
|
|
705 {
|
|
706 oacc_loop *loop = XCNEW (oacc_loop);
|
|
707
|
|
708 loop->parent = parent;
|
|
709
|
|
710 if (parent)
|
|
711 {
|
|
712 loop->sibling = parent->child;
|
|
713 parent->child = loop;
|
|
714 }
|
|
715
|
|
716 loop->loc = loc;
|
|
717 return loop;
|
|
718 }
|
|
719
|
|
720 /* Create an outermost, dummy OpenACC loop for offloaded function
|
|
721 DECL. */
|
|
722
|
|
723 static oacc_loop *
|
|
724 new_oacc_loop_outer (tree decl)
|
|
725 {
|
|
726 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
|
|
727 }
|
|
728
|
|
729 /* Start a new OpenACC loop structure beginning at head marker HEAD.
|
|
730 Link into PARENT loop. Return the new loop. */
|
|
731
|
|
732 static oacc_loop *
|
|
733 new_oacc_loop (oacc_loop *parent, gcall *marker)
|
|
734 {
|
|
735 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
|
|
736
|
|
737 loop->marker = marker;
|
|
738
|
|
739 /* TODO: This is where device_type flattening would occur for the loop
|
|
740 flags. */
|
|
741
|
|
742 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
|
|
743
|
|
744 tree chunk_size = integer_zero_node;
|
|
745 if (loop->flags & OLF_GANG_STATIC)
|
|
746 chunk_size = gimple_call_arg (marker, 4);
|
|
747 loop->chunk_size = chunk_size;
|
|
748
|
|
749 return loop;
|
|
750 }
|
|
751
|
|
752 /* Create a dummy loop encompassing a call to a openACC routine.
|
|
753 Extract the routine's partitioning requirements. */
|
|
754
|
|
755 static void
|
|
756 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
|
|
757 {
|
|
758 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
|
|
759 int level = oacc_fn_attrib_level (attrs);
|
|
760
|
|
761 gcc_assert (level >= 0);
|
|
762
|
|
763 loop->marker = call;
|
|
764 loop->routine = decl;
|
|
765 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
|
|
766 ^ (GOMP_DIM_MASK (level) - 1));
|
|
767 }
|
|
768
|
|
769 /* Finish off the current OpenACC loop ending at tail marker TAIL.
|
|
770 Return the parent loop. */
|
|
771
|
|
772 static oacc_loop *
|
|
773 finish_oacc_loop (oacc_loop *loop)
|
|
774 {
|
|
775 /* If the loop has been collapsed, don't partition it. */
|
|
776 if (loop->ifns.is_empty ())
|
|
777 loop->mask = loop->flags = 0;
|
|
778 return loop->parent;
|
|
779 }
|
|
780
|
|
781 /* Free all OpenACC loop structures within LOOP (inclusive). */
|
|
782
|
|
783 static void
|
|
784 free_oacc_loop (oacc_loop *loop)
|
|
785 {
|
|
786 if (loop->sibling)
|
|
787 free_oacc_loop (loop->sibling);
|
|
788 if (loop->child)
|
|
789 free_oacc_loop (loop->child);
|
|
790
|
|
791 loop->ifns.release ();
|
|
792 free (loop);
|
|
793 }
|
|
794
|
|
795 /* Dump out the OpenACC loop head or tail beginning at FROM. */
|
|
796
|
|
797 static void
|
|
798 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
|
|
799 const char *title, int level)
|
|
800 {
|
|
801 enum ifn_unique_kind kind
|
|
802 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
|
|
803
|
|
804 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
|
|
805 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
|
|
806 {
|
|
807 gimple *stmt = gsi_stmt (gsi);
|
|
808
|
|
809 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
|
|
810 {
|
|
811 enum ifn_unique_kind k
|
|
812 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
|
|
813 (gimple_call_arg (stmt, 0)));
|
|
814
|
|
815 if (k == kind && stmt != from)
|
|
816 break;
|
|
817 }
|
|
818 print_gimple_stmt (file, stmt, depth * 2 + 2);
|
|
819
|
|
820 gsi_next (&gsi);
|
|
821 while (gsi_end_p (gsi))
|
|
822 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
|
|
823 }
|
|
824 }
|
|
825
|
|
826 /* Dump OpenACC loops LOOP, its siblings and its children. */
|
|
827
|
|
828 static void
|
|
829 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
|
|
830 {
|
|
831 int ix;
|
|
832
|
|
833 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
|
|
834 loop->flags, loop->mask,
|
|
835 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
|
|
836
|
|
837 if (loop->marker)
|
|
838 print_gimple_stmt (file, loop->marker, depth * 2);
|
|
839
|
|
840 if (loop->routine)
|
|
841 fprintf (file, "%*sRoutine %s:%u:%s\n",
|
|
842 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
|
|
843 DECL_SOURCE_LINE (loop->routine),
|
|
844 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
|
|
845
|
|
846 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
|
|
847 if (loop->heads[ix])
|
|
848 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
|
|
849 for (ix = GOMP_DIM_MAX; ix--;)
|
|
850 if (loop->tails[ix])
|
|
851 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
|
|
852
|
|
853 if (loop->child)
|
|
854 dump_oacc_loop (file, loop->child, depth + 1);
|
|
855 if (loop->sibling)
|
|
856 dump_oacc_loop (file, loop->sibling, depth);
|
|
857 }
|
|
858
|
|
859 void debug_oacc_loop (oacc_loop *);
|
|
860
|
|
861 /* Dump loops to stderr. */
|
|
862
|
|
863 DEBUG_FUNCTION void
|
|
864 debug_oacc_loop (oacc_loop *loop)
|
|
865 {
|
|
866 dump_oacc_loop (stderr, loop, 0);
|
|
867 }
|
|
868
|
|
869 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
|
|
870 structures as we go. By construction these loops are properly
|
|
871 nested. */
|
|
872
|
|
873 static void
|
|
874 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
|
|
875 {
|
|
876 int marker = 0;
|
|
877 int remaining = 0;
|
|
878
|
|
879 if (bb->flags & BB_VISITED)
|
|
880 return;
|
|
881
|
|
882 follow:
|
|
883 bb->flags |= BB_VISITED;
|
|
884
|
|
885 /* Scan for loop markers. */
|
|
886 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
|
|
887 gsi_next (&gsi))
|
|
888 {
|
|
889 gimple *stmt = gsi_stmt (gsi);
|
|
890
|
|
891 if (!is_gimple_call (stmt))
|
|
892 continue;
|
|
893
|
|
894 gcall *call = as_a <gcall *> (stmt);
|
|
895
|
|
896 /* If this is a routine, make a dummy loop for it. */
|
|
897 if (tree decl = gimple_call_fndecl (call))
|
|
898 if (tree attrs = oacc_get_fn_attrib (decl))
|
|
899 {
|
|
900 gcc_assert (!marker);
|
|
901 new_oacc_loop_routine (loop, call, decl, attrs);
|
|
902 }
|
|
903
|
|
904 if (!gimple_call_internal_p (call))
|
|
905 continue;
|
|
906
|
|
907 switch (gimple_call_internal_fn (call))
|
|
908 {
|
|
909 default:
|
|
910 break;
|
|
911
|
|
912 case IFN_GOACC_LOOP:
|
|
913 case IFN_GOACC_TILE:
|
|
914 /* Record the abstraction function, so we can manipulate it
|
|
915 later. */
|
|
916 loop->ifns.safe_push (call);
|
|
917 break;
|
|
918
|
|
919 case IFN_UNIQUE:
|
|
920 enum ifn_unique_kind kind
|
|
921 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
|
|
922 (gimple_call_arg (call, 0)));
|
|
923 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
|
|
924 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
|
|
925 {
|
|
926 if (gimple_call_num_args (call) == 2)
|
|
927 {
|
|
928 gcc_assert (marker && !remaining);
|
|
929 marker = 0;
|
|
930 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
|
|
931 loop = finish_oacc_loop (loop);
|
|
932 else
|
|
933 loop->head_end = call;
|
|
934 }
|
|
935 else
|
|
936 {
|
|
937 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
|
|
938
|
|
939 if (!marker)
|
|
940 {
|
|
941 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
|
|
942 loop = new_oacc_loop (loop, call);
|
|
943 remaining = count;
|
|
944 }
|
|
945 gcc_assert (count == remaining);
|
|
946 if (remaining)
|
|
947 {
|
|
948 remaining--;
|
|
949 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
|
|
950 loop->heads[marker] = call;
|
|
951 else
|
|
952 loop->tails[remaining] = call;
|
|
953 }
|
|
954 marker++;
|
|
955 }
|
|
956 }
|
|
957 }
|
|
958 }
|
|
959 if (remaining || marker)
|
|
960 {
|
|
961 bb = single_succ (bb);
|
|
962 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
|
|
963 goto follow;
|
|
964 }
|
|
965
|
|
966 /* Walk successor blocks. */
|
|
967 edge e;
|
|
968 edge_iterator ei;
|
|
969
|
|
970 FOR_EACH_EDGE (e, ei, bb->succs)
|
|
971 oacc_loop_discover_walk (loop, e->dest);
|
|
972 }
|
|
973
|
|
974 /* LOOP is the first sibling. Reverse the order in place and return
|
|
975 the new first sibling. Recurse to child loops. */
|
|
976
|
|
977 static oacc_loop *
|
|
978 oacc_loop_sibling_nreverse (oacc_loop *loop)
|
|
979 {
|
|
980 oacc_loop *last = NULL;
|
|
981 do
|
|
982 {
|
|
983 if (loop->child)
|
|
984 loop->child = oacc_loop_sibling_nreverse (loop->child);
|
|
985
|
|
986 oacc_loop *next = loop->sibling;
|
|
987 loop->sibling = last;
|
|
988 last = loop;
|
|
989 loop = next;
|
|
990 }
|
|
991 while (loop);
|
|
992
|
|
993 return last;
|
|
994 }
|
|
995
|
|
996 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
|
|
997 the current function. */
|
|
998
|
|
999 static oacc_loop *
|
|
1000 oacc_loop_discovery ()
|
|
1001 {
|
|
1002 /* Clear basic block flags, in particular BB_VISITED which we're going to use
|
|
1003 in the following. */
|
|
1004 clear_bb_flags ();
|
|
1005
|
|
1006 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
|
|
1007 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
|
|
1008
|
|
1009 /* The siblings were constructed in reverse order, reverse them so
|
|
1010 that diagnostics come out in an unsurprising order. */
|
|
1011 top = oacc_loop_sibling_nreverse (top);
|
|
1012
|
|
1013 return top;
|
|
1014 }
|
|
1015
|
|
1016 /* Transform the abstract internal function markers starting at FROM
|
|
1017 to be for partitioning level LEVEL. Stop when we meet another HEAD
|
|
1018 or TAIL marker. */
|
|
1019
|
|
1020 static void
|
|
1021 oacc_loop_xform_head_tail (gcall *from, int level)
|
|
1022 {
|
|
1023 enum ifn_unique_kind kind
|
|
1024 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
|
|
1025 tree replacement = build_int_cst (unsigned_type_node, level);
|
|
1026
|
|
1027 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
|
|
1028 {
|
|
1029 gimple *stmt = gsi_stmt (gsi);
|
|
1030
|
|
1031 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
|
|
1032 {
|
|
1033 enum ifn_unique_kind k
|
|
1034 = ((enum ifn_unique_kind)
|
|
1035 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
|
|
1036
|
|
1037 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
|
|
1038 *gimple_call_arg_ptr (stmt, 2) = replacement;
|
|
1039 else if (k == kind && stmt != from)
|
|
1040 break;
|
|
1041 }
|
|
1042 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
|
|
1043 *gimple_call_arg_ptr (stmt, 3) = replacement;
|
|
1044
|
|
1045 gsi_next (&gsi);
|
|
1046 while (gsi_end_p (gsi))
|
|
1047 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
|
|
1048 }
|
|
1049 }
|
|
1050
|
|
1051 /* Process the discovered OpenACC loops, setting the correct
|
|
1052 partitioning level etc. */
|
|
1053
|
|
1054 static void
|
|
1055 oacc_loop_process (oacc_loop *loop)
|
|
1056 {
|
|
1057 if (loop->child)
|
|
1058 oacc_loop_process (loop->child);
|
|
1059
|
|
1060 if (loop->mask && !loop->routine)
|
|
1061 {
|
|
1062 int ix;
|
|
1063 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
|
|
1064 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
|
|
1065 tree chunk_arg = loop->chunk_size;
|
|
1066 gcall *call;
|
|
1067
|
|
1068 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
|
|
1069 switch (gimple_call_internal_fn (call))
|
|
1070 {
|
|
1071 case IFN_GOACC_LOOP:
|
|
1072 {
|
|
1073 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
|
|
1074 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
|
|
1075 if (!is_e)
|
|
1076 gimple_call_set_arg (call, 4, chunk_arg);
|
|
1077 }
|
|
1078 break;
|
|
1079
|
|
1080 case IFN_GOACC_TILE:
|
|
1081 gimple_call_set_arg (call, 3, mask_arg);
|
|
1082 gimple_call_set_arg (call, 4, e_mask_arg);
|
|
1083 break;
|
|
1084
|
|
1085 default:
|
|
1086 gcc_unreachable ();
|
|
1087 }
|
|
1088
|
|
1089 unsigned dim = GOMP_DIM_GANG;
|
|
1090 unsigned mask = loop->mask | loop->e_mask;
|
|
1091 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
|
|
1092 {
|
|
1093 while (!(GOMP_DIM_MASK (dim) & mask))
|
|
1094 dim++;
|
|
1095
|
|
1096 oacc_loop_xform_head_tail (loop->heads[ix], dim);
|
|
1097 oacc_loop_xform_head_tail (loop->tails[ix], dim);
|
|
1098
|
|
1099 mask ^= GOMP_DIM_MASK (dim);
|
|
1100 }
|
|
1101 }
|
|
1102
|
|
1103 if (loop->sibling)
|
|
1104 oacc_loop_process (loop->sibling);
|
|
1105 }
|
|
1106
|
|
1107 /* Walk the OpenACC loop heirarchy checking and assigning the
|
|
1108 programmer-specified partitionings. OUTER_MASK is the partitioning
|
|
1109 this loop is contained within. Return mask of partitioning
|
|
1110 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
|
|
1111 bit. */
|
|
1112
|
|
1113 static unsigned
|
|
1114 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
|
|
1115 {
|
|
1116 unsigned this_mask = loop->mask;
|
|
1117 unsigned mask_all = 0;
|
|
1118 bool noisy = true;
|
|
1119
|
|
1120 #ifdef ACCEL_COMPILER
|
|
1121 /* When device_type is supported, we want the device compiler to be
|
|
1122 noisy, if the loop parameters are device_type-specific. */
|
|
1123 noisy = false;
|
|
1124 #endif
|
|
1125
|
|
1126 if (!loop->routine)
|
|
1127 {
|
|
1128 bool auto_par = (loop->flags & OLF_AUTO) != 0;
|
|
1129 bool seq_par = (loop->flags & OLF_SEQ) != 0;
|
|
1130 bool tiling = (loop->flags & OLF_TILE) != 0;
|
|
1131
|
|
1132 this_mask = ((loop->flags >> OLF_DIM_BASE)
|
|
1133 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
|
|
1134
|
|
1135 /* Apply auto partitioning if this is a non-partitioned regular
|
|
1136 loop, or (no more than) single axis tiled loop. */
|
|
1137 bool maybe_auto
|
|
1138 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
|
|
1139
|
|
1140 if ((this_mask != 0) + auto_par + seq_par > 1)
|
|
1141 {
|
|
1142 if (noisy)
|
|
1143 error_at (loop->loc,
|
|
1144 seq_par
|
|
1145 ? G_("%<seq%> overrides other OpenACC loop specifiers")
|
|
1146 : G_("%<auto%> conflicts with other OpenACC loop "
|
|
1147 "specifiers"));
|
|
1148 maybe_auto = false;
|
|
1149 loop->flags &= ~OLF_AUTO;
|
|
1150 if (seq_par)
|
|
1151 {
|
|
1152 loop->flags
|
|
1153 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
|
|
1154 this_mask = 0;
|
|
1155 }
|
|
1156 }
|
|
1157
|
|
1158 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
|
|
1159 {
|
|
1160 loop->flags |= OLF_AUTO;
|
|
1161 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
|
|
1162 }
|
|
1163 }
|
|
1164
|
|
1165 if (this_mask & outer_mask)
|
|
1166 {
|
|
1167 const oacc_loop *outer;
|
|
1168 for (outer = loop->parent; outer; outer = outer->parent)
|
|
1169 if ((outer->mask | outer->e_mask) & this_mask)
|
|
1170 break;
|
|
1171
|
|
1172 if (noisy)
|
|
1173 {
|
|
1174 if (outer)
|
|
1175 {
|
|
1176 error_at (loop->loc,
|
|
1177 loop->routine
|
|
1178 ? G_("routine call uses same OpenACC parallelism"
|
|
1179 " as containing loop")
|
|
1180 : G_("inner loop uses same OpenACC parallelism"
|
|
1181 " as containing loop"));
|
|
1182 inform (outer->loc, "containing loop here");
|
|
1183 }
|
|
1184 else
|
|
1185 error_at (loop->loc,
|
|
1186 loop->routine
|
|
1187 ? G_("routine call uses OpenACC parallelism disallowed"
|
|
1188 " by containing routine")
|
|
1189 : G_("loop uses OpenACC parallelism disallowed"
|
|
1190 " by containing routine"));
|
|
1191
|
|
1192 if (loop->routine)
|
|
1193 inform (DECL_SOURCE_LOCATION (loop->routine),
|
|
1194 "routine %qD declared here", loop->routine);
|
|
1195 }
|
|
1196 this_mask &= ~outer_mask;
|
|
1197 }
|
|
1198 else
|
|
1199 {
|
|
1200 unsigned outermost = least_bit_hwi (this_mask);
|
|
1201
|
|
1202 if (outermost && outermost <= outer_mask)
|
|
1203 {
|
|
1204 if (noisy)
|
|
1205 {
|
|
1206 error_at (loop->loc,
|
|
1207 "incorrectly nested OpenACC loop parallelism");
|
|
1208
|
|
1209 const oacc_loop *outer;
|
|
1210 for (outer = loop->parent;
|
|
1211 outer->flags && outer->flags < outermost;
|
|
1212 outer = outer->parent)
|
|
1213 continue;
|
|
1214 inform (outer->loc, "containing loop here");
|
|
1215 }
|
|
1216
|
|
1217 this_mask &= ~outermost;
|
|
1218 }
|
|
1219 }
|
|
1220
|
|
1221 mask_all |= this_mask;
|
|
1222
|
|
1223 if (loop->flags & OLF_TILE)
|
|
1224 {
|
|
1225 /* When tiling, vector goes to the element loop, and failing
|
|
1226 that we put worker there. The std doesn't contemplate
|
|
1227 specifying all three. We choose to put worker and vector on
|
|
1228 the element loops in that case. */
|
|
1229 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
|
|
1230 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
|
|
1231 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
|
|
1232
|
|
1233 loop->e_mask = this_e_mask;
|
|
1234 this_mask ^= this_e_mask;
|
|
1235 }
|
|
1236
|
|
1237 loop->mask = this_mask;
|
|
1238
|
|
1239 if (dump_file)
|
|
1240 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
|
|
1241 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
|
|
1242 loop->mask, loop->e_mask);
|
|
1243
|
|
1244 if (loop->child)
|
|
1245 {
|
|
1246 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
|
|
1247 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
|
|
1248 mask_all |= loop->inner;
|
|
1249 }
|
|
1250
|
|
1251 if (loop->sibling)
|
|
1252 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
|
|
1253
|
|
1254 return mask_all;
|
|
1255 }
|
|
1256
|
|
1257 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
|
|
1258 OUTER_MASK is the partitioning this loop is contained within.
|
|
1259 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
|
|
1260 Return the cumulative partitioning used by this loop, siblings and
|
|
1261 children. */
|
|
1262
|
|
1263 static unsigned
|
|
1264 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
|
|
1265 bool outer_assign)
|
|
1266 {
|
|
1267 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
|
|
1268 bool noisy = true;
|
|
1269 bool tiling = loop->flags & OLF_TILE;
|
|
1270
|
|
1271 #ifdef ACCEL_COMPILER
|
|
1272 /* When device_type is supported, we want the device compiler to be
|
|
1273 noisy, if the loop parameters are device_type-specific. */
|
|
1274 noisy = false;
|
|
1275 #endif
|
|
1276
|
|
1277 if (assign && (!outer_assign || loop->inner))
|
|
1278 {
|
|
1279 /* Allocate outermost and non-innermost loops at the outermost
|
|
1280 non-innermost available level. */
|
|
1281 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
|
|
1282
|
|
1283 /* Find the first outermost available partition. */
|
|
1284 while (this_mask <= outer_mask)
|
|
1285 this_mask <<= 1;
|
|
1286
|
|
1287 /* Grab two axes if tiling, and we've not assigned anything */
|
|
1288 if (tiling && !(loop->mask | loop->e_mask))
|
|
1289 this_mask |= this_mask << 1;
|
|
1290
|
|
1291 /* Prohibit the innermost partitioning at the moment. */
|
|
1292 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
|
|
1293
|
|
1294 /* Don't use any dimension explicitly claimed by an inner loop. */
|
|
1295 this_mask &= ~loop->inner;
|
|
1296
|
|
1297 if (tiling && !loop->e_mask)
|
|
1298 {
|
|
1299 /* If we got two axes, allocate the inner one to the element
|
|
1300 loop. */
|
|
1301 loop->e_mask = this_mask & (this_mask << 1);
|
|
1302 this_mask ^= loop->e_mask;
|
|
1303 }
|
|
1304
|
|
1305 loop->mask |= this_mask;
|
|
1306 }
|
|
1307
|
|
1308 if (loop->child)
|
|
1309 {
|
|
1310 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
|
|
1311 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
|
|
1312 outer_assign | assign);
|
|
1313 }
|
|
1314
|
|
1315 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
|
|
1316 {
|
|
1317 /* Allocate the loop at the innermost available level. Note
|
|
1318 that we do this even if we already assigned this loop the
|
|
1319 outermost available level above. That way we'll partition
|
|
1320 this along 2 axes, if they are available. */
|
|
1321 unsigned this_mask = 0;
|
|
1322
|
|
1323 /* Determine the outermost partitioning used within this loop. */
|
|
1324 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
|
|
1325 this_mask = least_bit_hwi (this_mask);
|
|
1326
|
|
1327 /* Pick the partitioning just inside that one. */
|
|
1328 this_mask >>= 1;
|
|
1329
|
|
1330 /* And avoid picking one use by an outer loop. */
|
|
1331 this_mask &= ~outer_mask;
|
|
1332
|
|
1333 /* If tiling and we failed completely above, grab the next one
|
|
1334 too. Making sure it doesn't hit an outer loop. */
|
|
1335 if (tiling)
|
|
1336 {
|
|
1337 this_mask &= ~(loop->e_mask | loop->mask);
|
|
1338 unsigned tile_mask = ((this_mask >> 1)
|
|
1339 & ~(outer_mask | loop->e_mask | loop->mask));
|
|
1340
|
|
1341 if (tile_mask || loop->mask)
|
|
1342 {
|
|
1343 loop->e_mask |= this_mask;
|
|
1344 this_mask = tile_mask;
|
|
1345 }
|
|
1346 if (!loop->e_mask && noisy)
|
|
1347 warning_at (loop->loc, 0,
|
|
1348 "insufficient partitioning available"
|
|
1349 " to parallelize element loop");
|
|
1350 }
|
|
1351
|
|
1352 loop->mask |= this_mask;
|
|
1353 if (!loop->mask && noisy)
|
|
1354 warning_at (loop->loc, 0,
|
|
1355 tiling
|
|
1356 ? G_("insufficient partitioning available"
|
|
1357 " to parallelize tile loop")
|
|
1358 : G_("insufficient partitioning available"
|
|
1359 " to parallelize loop"));
|
|
1360 }
|
|
1361
|
|
1362 if (assign && dump_file)
|
|
1363 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
|
|
1364 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
|
|
1365 loop->mask, loop->e_mask);
|
|
1366
|
|
1367 unsigned inner_mask = 0;
|
|
1368
|
|
1369 if (loop->sibling)
|
|
1370 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
|
|
1371 outer_mask, outer_assign);
|
|
1372
|
|
1373 inner_mask |= loop->inner | loop->mask | loop->e_mask;
|
|
1374
|
|
1375 return inner_mask;
|
|
1376 }
|
|
1377
|
|
1378 /* Walk the OpenACC loop heirarchy to check and assign partitioning
|
|
1379 axes. Return mask of partitioning. */
|
|
1380
|
|
1381 static unsigned
|
|
1382 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
|
|
1383 {
|
|
1384 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
|
|
1385
|
|
1386 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
|
|
1387 {
|
|
1388 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
|
|
1389 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
|
|
1390 }
|
|
1391 return mask_all;
|
|
1392 }
|
|
1393
|
|
1394 /* Default fork/join early expander. Delete the function calls if
|
|
1395 there is no RTL expander. */
|
|
1396
|
|
1397 bool
|
|
1398 default_goacc_fork_join (gcall *ARG_UNUSED (call),
|
|
1399 const int *ARG_UNUSED (dims), bool is_fork)
|
|
1400 {
|
|
1401 if (is_fork)
|
|
1402 return targetm.have_oacc_fork ();
|
|
1403 else
|
|
1404 return targetm.have_oacc_join ();
|
|
1405 }
|
|
1406
|
|
1407 /* Default goacc.reduction early expander.
|
|
1408
|
|
1409 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
|
|
1410 If RES_PTR is not integer-zerop:
|
|
1411 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
|
|
1412 TEARDOWN - emit '*RES_PTR = VAR'
|
|
1413 If LHS is not NULL
|
|
1414 emit 'LHS = VAR' */
|
|
1415
|
|
1416 void
|
|
1417 default_goacc_reduction (gcall *call)
|
|
1418 {
|
|
1419 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
|
|
1420 gimple_stmt_iterator gsi = gsi_for_stmt (call);
|
|
1421 tree lhs = gimple_call_lhs (call);
|
|
1422 tree var = gimple_call_arg (call, 2);
|
|
1423 gimple_seq seq = NULL;
|
|
1424
|
|
1425 if (code == IFN_GOACC_REDUCTION_SETUP
|
|
1426 || code == IFN_GOACC_REDUCTION_TEARDOWN)
|
|
1427 {
|
|
1428 /* Setup and Teardown need to copy from/to the receiver object,
|
|
1429 if there is one. */
|
|
1430 tree ref_to_res = gimple_call_arg (call, 1);
|
|
1431
|
|
1432 if (!integer_zerop (ref_to_res))
|
|
1433 {
|
|
1434 tree dst = build_simple_mem_ref (ref_to_res);
|
|
1435 tree src = var;
|
|
1436
|
|
1437 if (code == IFN_GOACC_REDUCTION_SETUP)
|
|
1438 {
|
|
1439 src = dst;
|
|
1440 dst = lhs;
|
|
1441 lhs = NULL;
|
|
1442 }
|
|
1443 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
|
|
1444 }
|
|
1445 }
|
|
1446
|
|
1447 /* Copy VAR to LHS, if there is an LHS. */
|
|
1448 if (lhs)
|
|
1449 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
|
|
1450
|
|
1451 gsi_replace_with_seq (&gsi, seq, true);
|
|
1452 }
|
|
1453
|
|
1454 /* Main entry point for oacc transformations which run on the device
|
|
1455 compiler after LTO, so we know what the target device is at this
|
|
1456 point (including the host fallback). */
|
|
1457
|
|
1458 static unsigned int
|
|
1459 execute_oacc_device_lower ()
|
|
1460 {
|
|
1461 tree attrs = oacc_get_fn_attrib (current_function_decl);
|
|
1462
|
|
1463 if (!attrs)
|
|
1464 /* Not an offloaded function. */
|
|
1465 return 0;
|
|
1466
|
|
1467 /* Parse the default dim argument exactly once. */
|
|
1468 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
|
|
1469 {
|
|
1470 oacc_parse_default_dims (flag_openacc_dims);
|
|
1471 flag_openacc_dims = (char *)&flag_openacc_dims;
|
|
1472 }
|
|
1473
|
|
1474 bool is_oacc_kernels
|
|
1475 = (lookup_attribute ("oacc kernels",
|
|
1476 DECL_ATTRIBUTES (current_function_decl)) != NULL);
|
|
1477 bool is_oacc_kernels_parallelized
|
|
1478 = (lookup_attribute ("oacc kernels parallelized",
|
|
1479 DECL_ATTRIBUTES (current_function_decl)) != NULL);
|
|
1480
|
|
1481 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
|
|
1482 kernels, so remove the parallelism dimensions function attributes
|
|
1483 potentially set earlier on. */
|
|
1484 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
|
|
1485 {
|
|
1486 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
|
|
1487 attrs = oacc_get_fn_attrib (current_function_decl);
|
|
1488 }
|
|
1489
|
|
1490 /* Discover, partition and process the loops. */
|
|
1491 oacc_loop *loops = oacc_loop_discovery ();
|
|
1492 int fn_level = oacc_fn_attrib_level (attrs);
|
|
1493
|
|
1494 if (dump_file)
|
|
1495 {
|
|
1496 if (fn_level >= 0)
|
|
1497 fprintf (dump_file, "Function is OpenACC routine level %d\n",
|
|
1498 fn_level);
|
|
1499 else if (is_oacc_kernels)
|
|
1500 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
|
|
1501 (is_oacc_kernels_parallelized
|
|
1502 ? "parallelized" : "unparallelized"));
|
|
1503 else
|
|
1504 fprintf (dump_file, "Function is OpenACC parallel offload\n");
|
|
1505 }
|
|
1506
|
|
1507 unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
|
|
1508 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
|
|
1509 /* OpenACC kernels constructs are special: they currently don't use the
|
|
1510 generic oacc_loop infrastructure and attribute/dimension processing. */
|
|
1511 if (is_oacc_kernels && is_oacc_kernels_parallelized)
|
|
1512 {
|
|
1513 /* Parallelized OpenACC kernels constructs use gang parallelism. See
|
|
1514 also tree-parloops.c:create_parallel_loop. */
|
|
1515 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
|
|
1516 }
|
|
1517
|
|
1518 int dims[GOMP_DIM_MAX];
|
|
1519 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
|
|
1520
|
|
1521 if (dump_file)
|
|
1522 {
|
|
1523 const char *comma = "Compute dimensions [";
|
|
1524 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
|
|
1525 fprintf (dump_file, "%s%d", comma, dims[ix]);
|
|
1526 fprintf (dump_file, "]\n");
|
|
1527 }
|
|
1528
|
|
1529 oacc_loop_process (loops);
|
|
1530 if (dump_file)
|
|
1531 {
|
|
1532 fprintf (dump_file, "OpenACC loops\n");
|
|
1533 dump_oacc_loop (dump_file, loops, 0);
|
|
1534 fprintf (dump_file, "\n");
|
|
1535 }
|
|
1536
|
|
1537 /* Offloaded targets may introduce new basic blocks, which require
|
|
1538 dominance information to update SSA. */
|
|
1539 calculate_dominance_info (CDI_DOMINATORS);
|
|
1540
|
|
1541 /* Now lower internal loop functions to target-specific code
|
|
1542 sequences. */
|
|
1543 basic_block bb;
|
|
1544 FOR_ALL_BB_FN (bb, cfun)
|
|
1545 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
|
|
1546 {
|
|
1547 gimple *stmt = gsi_stmt (gsi);
|
|
1548 if (!is_gimple_call (stmt))
|
|
1549 {
|
|
1550 gsi_next (&gsi);
|
|
1551 continue;
|
|
1552 }
|
|
1553
|
|
1554 gcall *call = as_a <gcall *> (stmt);
|
|
1555 if (!gimple_call_internal_p (call))
|
|
1556 {
|
|
1557 gsi_next (&gsi);
|
|
1558 continue;
|
|
1559 }
|
|
1560
|
|
1561 /* Rewind to allow rescan. */
|
|
1562 gsi_prev (&gsi);
|
|
1563 bool rescan = false, remove = false;
|
|
1564 enum internal_fn ifn_code = gimple_call_internal_fn (call);
|
|
1565
|
|
1566 switch (ifn_code)
|
|
1567 {
|
|
1568 default: break;
|
|
1569
|
|
1570 case IFN_GOACC_TILE:
|
|
1571 oacc_xform_tile (call);
|
|
1572 rescan = true;
|
|
1573 break;
|
|
1574
|
|
1575 case IFN_GOACC_LOOP:
|
|
1576 oacc_xform_loop (call);
|
|
1577 rescan = true;
|
|
1578 break;
|
|
1579
|
|
1580 case IFN_GOACC_REDUCTION:
|
|
1581 /* Mark the function for SSA renaming. */
|
|
1582 mark_virtual_operands_for_renaming (cfun);
|
|
1583
|
|
1584 /* If the level is -1, this ended up being an unused
|
|
1585 axis. Handle as a default. */
|
|
1586 if (integer_minus_onep (gimple_call_arg (call, 3)))
|
|
1587 default_goacc_reduction (call);
|
|
1588 else
|
|
1589 targetm.goacc.reduction (call);
|
|
1590 rescan = true;
|
|
1591 break;
|
|
1592
|
|
1593 case IFN_UNIQUE:
|
|
1594 {
|
|
1595 enum ifn_unique_kind kind
|
|
1596 = ((enum ifn_unique_kind)
|
|
1597 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
|
|
1598
|
|
1599 switch (kind)
|
|
1600 {
|
|
1601 default:
|
|
1602 break;
|
|
1603
|
|
1604 case IFN_UNIQUE_OACC_FORK:
|
|
1605 case IFN_UNIQUE_OACC_JOIN:
|
|
1606 if (integer_minus_onep (gimple_call_arg (call, 2)))
|
|
1607 remove = true;
|
|
1608 else if (!targetm.goacc.fork_join
|
|
1609 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
|
|
1610 remove = true;
|
|
1611 break;
|
|
1612
|
|
1613 case IFN_UNIQUE_OACC_HEAD_MARK:
|
|
1614 case IFN_UNIQUE_OACC_TAIL_MARK:
|
|
1615 remove = true;
|
|
1616 break;
|
|
1617 }
|
|
1618 break;
|
|
1619 }
|
|
1620 }
|
|
1621
|
|
1622 if (gsi_end_p (gsi))
|
|
1623 /* We rewound past the beginning of the BB. */
|
|
1624 gsi = gsi_start_bb (bb);
|
|
1625 else
|
|
1626 /* Undo the rewind. */
|
|
1627 gsi_next (&gsi);
|
|
1628
|
|
1629 if (remove)
|
|
1630 {
|
|
1631 if (gimple_vdef (call))
|
|
1632 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
|
|
1633 if (gimple_call_lhs (call))
|
|
1634 {
|
|
1635 /* Propagate the data dependency var. */
|
|
1636 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
|
|
1637 gimple_call_arg (call, 1));
|
|
1638 gsi_replace (&gsi, ass, false);
|
|
1639 }
|
|
1640 else
|
|
1641 gsi_remove (&gsi, true);
|
|
1642 }
|
|
1643 else if (!rescan)
|
|
1644 /* If not rescanning, advance over the call. */
|
|
1645 gsi_next (&gsi);
|
|
1646 }
|
|
1647
|
|
1648 free_oacc_loop (loops);
|
|
1649
|
|
1650 return 0;
|
|
1651 }
|
|
1652
|
|
1653 /* Default launch dimension validator. Force everything to 1. A
|
|
1654 backend that wants to provide larger dimensions must override this
|
|
1655 hook. */
|
|
1656
|
|
1657 bool
|
|
1658 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
|
|
1659 int ARG_UNUSED (fn_level))
|
|
1660 {
|
|
1661 bool changed = false;
|
|
1662
|
|
1663 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
|
|
1664 {
|
|
1665 if (dims[ix] != 1)
|
|
1666 {
|
|
1667 dims[ix] = 1;
|
|
1668 changed = true;
|
|
1669 }
|
|
1670 }
|
|
1671
|
|
1672 return changed;
|
|
1673 }
|
|
1674
|
|
1675 /* Default dimension bound is unknown on accelerator and 1 on host. */
|
|
1676
|
|
1677 int
|
|
1678 default_goacc_dim_limit (int ARG_UNUSED (axis))
|
|
1679 {
|
|
1680 #ifdef ACCEL_COMPILER
|
|
1681 return 0;
|
|
1682 #else
|
|
1683 return 1;
|
|
1684 #endif
|
|
1685 }
|
|
1686
|
|
1687 namespace {
|
|
1688
|
|
1689 const pass_data pass_data_oacc_device_lower =
|
|
1690 {
|
|
1691 GIMPLE_PASS, /* type */
|
|
1692 "oaccdevlow", /* name */
|
|
1693 OPTGROUP_OMP, /* optinfo_flags */
|
|
1694 TV_NONE, /* tv_id */
|
|
1695 PROP_cfg, /* properties_required */
|
|
1696 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
|
|
1697 0, /* properties_destroyed */
|
|
1698 0, /* todo_flags_start */
|
|
1699 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
|
|
1700 };
|
|
1701
|
|
1702 class pass_oacc_device_lower : public gimple_opt_pass
|
|
1703 {
|
|
1704 public:
|
|
1705 pass_oacc_device_lower (gcc::context *ctxt)
|
|
1706 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
|
|
1707 {}
|
|
1708
|
|
1709 /* opt_pass methods: */
|
|
1710 virtual bool gate (function *) { return flag_openacc; };
|
|
1711
|
|
1712 virtual unsigned int execute (function *)
|
|
1713 {
|
|
1714 return execute_oacc_device_lower ();
|
|
1715 }
|
|
1716
|
|
1717 }; // class pass_oacc_device_lower
|
|
1718
|
|
1719 } // anon namespace
|
|
1720
|
|
1721 gimple_opt_pass *
|
|
1722 make_pass_oacc_device_lower (gcc::context *ctxt)
|
|
1723 {
|
|
1724 return new pass_oacc_device_lower (ctxt);
|
|
1725 }
|
|
1726
|
|
1727
|
|
1728 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
|
|
1729 GOMP_SIMT_ENTER call identifying the privatized variables, which are
|
|
1730 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
|
|
1731 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
|
|
1732
|
|
1733 static void
|
|
1734 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
|
|
1735 {
|
|
1736 gimple *alloc_stmt = gsi_stmt (*gsi);
|
|
1737 tree simtrec = gimple_call_lhs (alloc_stmt);
|
|
1738 tree simduid = gimple_call_arg (alloc_stmt, 0);
|
|
1739 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
|
|
1740 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
|
|
1741 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
|
|
1742 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
|
|
1743 TREE_ADDRESSABLE (rectype) = 1;
|
|
1744 TREE_TYPE (simtrec) = build_pointer_type (rectype);
|
|
1745 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
|
|
1746 {
|
|
1747 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
|
|
1748 if (*argp == null_pointer_node)
|
|
1749 continue;
|
|
1750 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
|
|
1751 && VAR_P (TREE_OPERAND (*argp, 0)));
|
|
1752 tree var = TREE_OPERAND (*argp, 0);
|
|
1753
|
|
1754 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
|
|
1755 DECL_NAME (var), TREE_TYPE (var));
|
|
1756 SET_DECL_ALIGN (field, DECL_ALIGN (var));
|
|
1757 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
|
|
1758 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
|
|
1759
|
|
1760 insert_field_into_struct (rectype, field);
|
|
1761
|
|
1762 tree t = build_simple_mem_ref (simtrec);
|
|
1763 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
|
|
1764 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
|
|
1765 SET_DECL_VALUE_EXPR (var, t);
|
|
1766 DECL_HAS_VALUE_EXPR_P (var) = 1;
|
|
1767 *regimplify = true;
|
|
1768 }
|
|
1769 layout_type (rectype);
|
|
1770 tree size = TYPE_SIZE_UNIT (rectype);
|
|
1771 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
|
|
1772
|
|
1773 alloc_stmt
|
|
1774 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
|
|
1775 gimple_call_set_lhs (alloc_stmt, simtrec);
|
|
1776 gsi_replace (gsi, alloc_stmt, false);
|
|
1777 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
|
|
1778 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
|
|
1779 gsi_replace (&enter_gsi, enter_stmt, false);
|
|
1780
|
|
1781 use_operand_p use;
|
|
1782 gimple *exit_stmt;
|
|
1783 if (single_imm_use (simtrec, &use, &exit_stmt))
|
|
1784 {
|
|
1785 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
|
|
1786 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
|
|
1787 tree clobber = build_constructor (rectype, NULL);
|
|
1788 TREE_THIS_VOLATILE (clobber) = 1;
|
|
1789 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
|
|
1790 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
|
|
1791 }
|
|
1792 else
|
|
1793 gcc_checking_assert (has_zero_uses (simtrec));
|
|
1794 }
|
|
1795
|
|
1796 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
|
|
1797
|
|
1798 static tree
|
|
1799 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
|
|
1800 {
|
|
1801 tree t = *tp;
|
|
1802
|
|
1803 if (VAR_P (t)
|
|
1804 && DECL_HAS_VALUE_EXPR_P (t)
|
|
1805 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
|
|
1806 {
|
|
1807 *walk_subtrees = 0;
|
|
1808 return t;
|
|
1809 }
|
|
1810 return NULL_TREE;
|
|
1811 }
|
|
1812
|
|
1813 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
|
|
1814 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
|
|
1815 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
|
|
1816 internal functions on non-SIMT targets, and likewise some SIMD internal
|
|
1817 functions on SIMT targets. */
|
|
1818
|
|
1819 static unsigned int
|
|
1820 execute_omp_device_lower ()
|
|
1821 {
|
|
1822 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
|
|
1823 bool regimplify = false;
|
|
1824 basic_block bb;
|
|
1825 gimple_stmt_iterator gsi;
|
|
1826 FOR_EACH_BB_FN (bb, cfun)
|
|
1827 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
|
|
1828 {
|
|
1829 gimple *stmt = gsi_stmt (gsi);
|
|
1830 if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
|
|
1831 continue;
|
|
1832 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
|
|
1833 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
|
|
1834 switch (gimple_call_internal_fn (stmt))
|
|
1835 {
|
|
1836 case IFN_GOMP_USE_SIMT:
|
|
1837 rhs = vf == 1 ? integer_zero_node : integer_one_node;
|
|
1838 break;
|
|
1839 case IFN_GOMP_SIMT_ENTER:
|
|
1840 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
|
|
1841 goto simtreg_enter_exit;
|
|
1842 case IFN_GOMP_SIMT_ENTER_ALLOC:
|
|
1843 if (vf != 1)
|
|
1844 ompdevlow_adjust_simt_enter (&gsi, ®implify);
|
|
1845 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
|
|
1846 goto simtreg_enter_exit;
|
|
1847 case IFN_GOMP_SIMT_EXIT:
|
|
1848 simtreg_enter_exit:
|
|
1849 if (vf != 1)
|
|
1850 continue;
|
|
1851 unlink_stmt_vdef (stmt);
|
|
1852 break;
|
|
1853 case IFN_GOMP_SIMT_LANE:
|
|
1854 case IFN_GOMP_SIMT_LAST_LANE:
|
|
1855 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
|
|
1856 break;
|
|
1857 case IFN_GOMP_SIMT_VF:
|
|
1858 rhs = build_int_cst (type, vf);
|
|
1859 break;
|
|
1860 case IFN_GOMP_SIMT_ORDERED_PRED:
|
|
1861 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
|
|
1862 if (rhs || !lhs)
|
|
1863 unlink_stmt_vdef (stmt);
|
|
1864 break;
|
|
1865 case IFN_GOMP_SIMT_VOTE_ANY:
|
|
1866 case IFN_GOMP_SIMT_XCHG_BFLY:
|
|
1867 case IFN_GOMP_SIMT_XCHG_IDX:
|
|
1868 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
|
|
1869 break;
|
|
1870 case IFN_GOMP_SIMD_LANE:
|
|
1871 case IFN_GOMP_SIMD_LAST_LANE:
|
|
1872 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
|
|
1873 break;
|
|
1874 case IFN_GOMP_SIMD_VF:
|
|
1875 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
|
|
1876 break;
|
|
1877 default:
|
|
1878 continue;
|
|
1879 }
|
|
1880 if (lhs && !rhs)
|
|
1881 continue;
|
|
1882 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
|
|
1883 gsi_replace (&gsi, stmt, false);
|
|
1884 }
|
|
1885 if (regimplify)
|
|
1886 FOR_EACH_BB_REVERSE_FN (bb, cfun)
|
|
1887 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
|
|
1888 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
|
|
1889 {
|
|
1890 if (gimple_clobber_p (gsi_stmt (gsi)))
|
|
1891 gsi_remove (&gsi, true);
|
|
1892 else
|
|
1893 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
|
|
1894 }
|
|
1895 if (vf != 1)
|
|
1896 cfun->has_force_vectorize_loops = false;
|
|
1897 return 0;
|
|
1898 }
|
|
1899
|
|
1900 namespace {
|
|
1901
|
|
1902 const pass_data pass_data_omp_device_lower =
|
|
1903 {
|
|
1904 GIMPLE_PASS, /* type */
|
|
1905 "ompdevlow", /* name */
|
|
1906 OPTGROUP_OMP, /* optinfo_flags */
|
|
1907 TV_NONE, /* tv_id */
|
|
1908 PROP_cfg, /* properties_required */
|
|
1909 PROP_gimple_lomp_dev, /* properties_provided */
|
|
1910 0, /* properties_destroyed */
|
|
1911 0, /* todo_flags_start */
|
|
1912 TODO_update_ssa, /* todo_flags_finish */
|
|
1913 };
|
|
1914
|
|
1915 class pass_omp_device_lower : public gimple_opt_pass
|
|
1916 {
|
|
1917 public:
|
|
1918 pass_omp_device_lower (gcc::context *ctxt)
|
|
1919 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
|
|
1920 {}
|
|
1921
|
|
1922 /* opt_pass methods: */
|
|
1923 virtual bool gate (function *fun)
|
|
1924 {
|
|
1925 return !(fun->curr_properties & PROP_gimple_lomp_dev);
|
|
1926 }
|
|
1927 virtual unsigned int execute (function *)
|
|
1928 {
|
|
1929 return execute_omp_device_lower ();
|
|
1930 }
|
|
1931
|
|
1932 }; // class pass_expand_omp_ssa
|
|
1933
|
|
1934 } // anon namespace
|
|
1935
|
|
1936 gimple_opt_pass *
|
|
1937 make_pass_omp_device_lower (gcc::context *ctxt)
|
|
1938 {
|
|
1939 return new pass_omp_device_lower (ctxt);
|
|
1940 }
|
|
1941
|
|
1942 /* "omp declare target link" handling pass. */
|
|
1943
|
|
1944 namespace {
|
|
1945
|
|
1946 const pass_data pass_data_omp_target_link =
|
|
1947 {
|
|
1948 GIMPLE_PASS, /* type */
|
|
1949 "omptargetlink", /* name */
|
|
1950 OPTGROUP_OMP, /* optinfo_flags */
|
|
1951 TV_NONE, /* tv_id */
|
|
1952 PROP_ssa, /* properties_required */
|
|
1953 0, /* properties_provided */
|
|
1954 0, /* properties_destroyed */
|
|
1955 0, /* todo_flags_start */
|
|
1956 TODO_update_ssa, /* todo_flags_finish */
|
|
1957 };
|
|
1958
|
|
1959 class pass_omp_target_link : public gimple_opt_pass
|
|
1960 {
|
|
1961 public:
|
|
1962 pass_omp_target_link (gcc::context *ctxt)
|
|
1963 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
|
|
1964 {}
|
|
1965
|
|
1966 /* opt_pass methods: */
|
|
1967 virtual bool gate (function *fun)
|
|
1968 {
|
|
1969 #ifdef ACCEL_COMPILER
|
131
|
1970 return offloading_function_p (fun->decl);
|
111
|
1971 #else
|
|
1972 (void) fun;
|
|
1973 return false;
|
|
1974 #endif
|
|
1975 }
|
|
1976
|
|
1977 virtual unsigned execute (function *);
|
|
1978 };
|
|
1979
|
|
1980 /* Callback for walk_gimple_stmt used to scan for link var operands. */
|
|
1981
|
|
1982 static tree
|
|
1983 find_link_var_op (tree *tp, int *walk_subtrees, void *)
|
|
1984 {
|
|
1985 tree t = *tp;
|
|
1986
|
|
1987 if (VAR_P (t)
|
|
1988 && DECL_HAS_VALUE_EXPR_P (t)
|
|
1989 && is_global_var (t)
|
|
1990 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
|
|
1991 {
|
|
1992 *walk_subtrees = 0;
|
|
1993 return t;
|
|
1994 }
|
|
1995
|
|
1996 return NULL_TREE;
|
|
1997 }
|
|
1998
|
|
1999 unsigned
|
|
2000 pass_omp_target_link::execute (function *fun)
|
|
2001 {
|
|
2002 basic_block bb;
|
|
2003 FOR_EACH_BB_FN (bb, fun)
|
|
2004 {
|
|
2005 gimple_stmt_iterator gsi;
|
|
2006 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
|
|
2007 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
|
|
2008 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
|
|
2009 }
|
|
2010
|
|
2011 return 0;
|
|
2012 }
|
|
2013
|
|
2014 } // anon namespace
|
|
2015
|
|
2016 gimple_opt_pass *
|
|
2017 make_pass_omp_target_link (gcc::context *ctxt)
|
|
2018 {
|
|
2019 return new pass_omp_target_link (ctxt);
|
|
2020 }
|