Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/nvptx/nvptx.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | 84e7813d76e9 |
children |
comparison
equal
deleted
inserted
replaced
131:84e7813d76e9 | 145:1830386684a0 |
---|---|
1 /* Target code for NVPTX. | 1 /* Target code for NVPTX. |
2 Copyright (C) 2014-2018 Free Software Foundation, Inc. | 2 Copyright (C) 2014-2020 Free Software Foundation, Inc. |
3 Contributed by Bernd Schmidt <bernds@codesourcery.com> | 3 Contributed by Bernd Schmidt <bernds@codesourcery.com> |
4 | 4 |
5 This file is part of GCC. | 5 This file is part of GCC. |
6 | 6 |
7 GCC is free software; you can redistribute it and/or modify it | 7 GCC is free software; you can redistribute it and/or modify it |
57 #include "gimple.h" | 57 #include "gimple.h" |
58 #include "stor-layout.h" | 58 #include "stor-layout.h" |
59 #include "builtins.h" | 59 #include "builtins.h" |
60 #include "omp-general.h" | 60 #include "omp-general.h" |
61 #include "omp-low.h" | 61 #include "omp-low.h" |
62 #include "omp-offload.h" | |
62 #include "gomp-constants.h" | 63 #include "gomp-constants.h" |
63 #include "dumpfile.h" | 64 #include "dumpfile.h" |
64 #include "internal-fn.h" | 65 #include "internal-fn.h" |
65 #include "gimple-iterator.h" | 66 #include "gimple-iterator.h" |
66 #include "stringpool.h" | 67 #include "stringpool.h" |
79 | 80 |
80 #define WORKAROUND_PTXJIT_BUG 1 | 81 #define WORKAROUND_PTXJIT_BUG 1 |
81 #define WORKAROUND_PTXJIT_BUG_2 1 | 82 #define WORKAROUND_PTXJIT_BUG_2 1 |
82 #define WORKAROUND_PTXJIT_BUG_3 1 | 83 #define WORKAROUND_PTXJIT_BUG_3 1 |
83 | 84 |
85 /* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread | |
86 block, which has had a maximum number of threads of 1024 since CUDA version | |
87 2.x. */ | |
88 #define PTX_CTA_SIZE 1024 | |
89 | |
90 #define PTX_CTA_NUM_BARRIERS 16 | |
91 #define PTX_WARP_SIZE 32 | |
92 | |
93 #define PTX_PER_CTA_BARRIER 0 | |
94 #define PTX_NUM_PER_CTA_BARRIERS 1 | |
95 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS) | |
96 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS) | |
97 | |
98 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE | |
99 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE | |
100 #define PTX_WORKER_LENGTH 32 | |
101 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */ | |
102 | |
84 /* The various PTX memory areas an object might reside in. */ | 103 /* The various PTX memory areas an object might reside in. */ |
85 enum nvptx_data_area | 104 enum nvptx_data_area |
86 { | 105 { |
87 DATA_AREA_GENERIC, | 106 DATA_AREA_GENERIC, |
88 DATA_AREA_GLOBAL, | 107 DATA_AREA_GLOBAL, |
120 }; | 139 }; |
121 | 140 |
122 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab; | 141 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab; |
123 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab; | 142 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab; |
124 | 143 |
125 /* Buffer needed to broadcast across workers. This is used for both | 144 /* Buffer needed to broadcast across workers and vectors. This is |
126 worker-neutering and worker broadcasting. It is shared by all | 145 used for both worker-neutering and worker broadcasting, and |
127 functions emitted. The buffer is placed in shared memory. It'd be | 146 vector-neutering and broadcasting when vector_length > 32. It is |
128 nice if PTX supported common blocks, because then this could be | 147 shared by all functions emitted. The buffer is placed in shared |
129 shared across TUs (taking the largest size). */ | 148 memory. It'd be nice if PTX supported common blocks, because then |
130 static unsigned worker_bcast_size; | 149 this could be shared across TUs (taking the largest size). */ |
131 static unsigned worker_bcast_align; | 150 static unsigned oacc_bcast_size; |
132 static GTY(()) rtx worker_bcast_sym; | 151 static unsigned oacc_bcast_partition; |
152 static unsigned oacc_bcast_align; | |
153 static GTY(()) rtx oacc_bcast_sym; | |
133 | 154 |
134 /* Buffer needed for worker reductions. This has to be distinct from | 155 /* Buffer needed for worker reductions. This has to be distinct from |
135 the worker broadcast array, as both may be live concurrently. */ | 156 the worker broadcast array, as both may be live concurrently. */ |
136 static unsigned worker_red_size; | 157 static unsigned worker_red_size; |
137 static unsigned worker_red_align; | 158 static unsigned worker_red_align; |
138 static GTY(()) rtx worker_red_sym; | 159 static GTY(()) rtx worker_red_sym; |
139 | 160 |
161 /* Buffer needed for vector reductions, when vector_length > | |
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast | |
163 array, as both may be live concurrently. */ | |
164 static unsigned vector_red_size; | |
165 static unsigned vector_red_align; | |
166 static unsigned vector_red_partition; | |
167 static GTY(()) rtx vector_red_sym; | |
168 | |
140 /* Global lock variable, needed for 128bit worker & gang reductions. */ | 169 /* Global lock variable, needed for 128bit worker & gang reductions. */ |
141 static GTY(()) tree global_lock_var; | 170 static GTY(()) tree global_lock_var; |
142 | 171 |
143 /* True if any function references __nvptx_stacks. */ | 172 /* True if any function references __nvptx_stacks. */ |
144 static bool need_softstack_decl; | 173 static bool need_softstack_decl; |
145 | 174 |
146 /* True if any function references __nvptx_uni. */ | 175 /* True if any function references __nvptx_uni. */ |
147 static bool need_unisimt_decl; | 176 static bool need_unisimt_decl; |
177 | |
178 static int nvptx_mach_max_workers (); | |
148 | 179 |
149 /* Allocate a new, cleared machine_function structure. */ | 180 /* Allocate a new, cleared machine_function structure. */ |
150 | 181 |
151 static struct machine_function * | 182 static struct machine_function * |
152 nvptx_init_machine_status (void) | 183 nvptx_init_machine_status (void) |
161 | 192 |
162 static void | 193 static void |
163 diagnose_openacc_conflict (bool optval, const char *optname) | 194 diagnose_openacc_conflict (bool optval, const char *optname) |
164 { | 195 { |
165 if (flag_openacc && optval) | 196 if (flag_openacc && optval) |
166 error ("option %s is not supported together with -fopenacc", optname); | 197 error ("option %s is not supported together with %<-fopenacc%>", optname); |
167 } | 198 } |
168 | 199 |
169 /* Implement TARGET_OPTION_OVERRIDE. */ | 200 /* Implement TARGET_OPTION_OVERRIDE. */ |
170 | 201 |
171 static void | 202 static void |
200 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); | 231 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); |
201 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); | 232 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); |
202 declared_libfuncs_htab | 233 declared_libfuncs_htab |
203 = hash_table<declared_libfunc_hasher>::create_ggc (17); | 234 = hash_table<declared_libfunc_hasher>::create_ggc (17); |
204 | 235 |
205 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast"); | 236 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast"); |
206 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED); | 237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED); |
207 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; | 238 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; |
239 oacc_bcast_partition = 0; | |
208 | 240 |
209 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); | 241 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); |
210 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); | 242 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); |
211 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; | 243 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; |
244 | |
245 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red"); | |
246 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED); | |
247 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; | |
248 vector_red_partition = 0; | |
212 | 249 |
213 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); | 250 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); |
214 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); | 251 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); |
215 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); | 252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); |
216 | 253 |
481 } | 518 } |
482 | 519 |
483 /* Implement TARGET_FUNCTION_ARG. */ | 520 /* Implement TARGET_FUNCTION_ARG. */ |
484 | 521 |
485 static rtx | 522 static rtx |
486 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode, | 523 nvptx_function_arg (cumulative_args_t, const function_arg_info &arg) |
487 const_tree, bool named) | 524 { |
488 { | 525 if (arg.end_marker_p () || !arg.named) |
489 if (mode == VOIDmode || !named) | |
490 return NULL_RTX; | 526 return NULL_RTX; |
491 | 527 |
492 return gen_reg_rtx (mode); | 528 return gen_reg_rtx (arg.mode); |
493 } | 529 } |
494 | 530 |
495 /* Implement TARGET_FUNCTION_INCOMING_ARG. */ | 531 /* Implement TARGET_FUNCTION_INCOMING_ARG. */ |
496 | 532 |
497 static rtx | 533 static rtx |
498 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode, | 534 nvptx_function_incoming_arg (cumulative_args_t cum_v, |
499 const_tree, bool named) | 535 const function_arg_info &arg) |
500 { | 536 { |
501 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); | 537 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
502 | 538 |
503 if (mode == VOIDmode || !named) | 539 if (arg.end_marker_p () || !arg.named) |
504 return NULL_RTX; | 540 return NULL_RTX; |
505 | 541 |
506 /* No need to deal with split modes here, the only case that can | 542 /* No need to deal with split modes here, the only case that can |
507 happen is complex modes and those are dealt with by | 543 happen is complex modes and those are dealt with by |
508 TARGET_SPLIT_COMPLEX_ARG. */ | 544 TARGET_SPLIT_COMPLEX_ARG. */ |
509 return gen_rtx_UNSPEC (mode, | 545 return gen_rtx_UNSPEC (arg.mode, |
510 gen_rtvec (1, GEN_INT (cum->count)), | 546 gen_rtvec (1, GEN_INT (cum->count)), |
511 UNSPEC_ARG_REG); | 547 UNSPEC_ARG_REG); |
512 } | 548 } |
513 | 549 |
514 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */ | 550 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */ |
515 | 551 |
516 static void | 552 static void |
517 nvptx_function_arg_advance (cumulative_args_t cum_v, | 553 nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &) |
518 machine_mode ARG_UNUSED (mode), | |
519 const_tree ARG_UNUSED (type), | |
520 bool ARG_UNUSED (named)) | |
521 { | 554 { |
522 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); | 555 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
523 | 556 |
524 cum->count++; | 557 cum->count++; |
525 } | 558 } |
594 | 627 |
595 /* Types with a mode other than those supported by the machine are passed by | 628 /* Types with a mode other than those supported by the machine are passed by |
596 reference in memory. */ | 629 reference in memory. */ |
597 | 630 |
598 static bool | 631 static bool |
599 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum), | 632 nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg) |
600 machine_mode mode, const_tree type, | 633 { |
601 bool ARG_UNUSED (named)) | 634 return pass_in_memory (arg.mode, arg.type, false); |
602 { | |
603 return pass_in_memory (mode, type, false); | |
604 } | 635 } |
605 | 636 |
606 /* Implement TARGET_RETURN_IN_MEMORY. */ | 637 /* Implement TARGET_RETURN_IN_MEMORY. */ |
607 | 638 |
608 static bool | 639 static bool |
1086 static void | 1117 static void |
1087 nvptx_init_axis_predicate (FILE *file, int regno, const char *name) | 1118 nvptx_init_axis_predicate (FILE *file, int regno, const char *name) |
1088 { | 1119 { |
1089 fprintf (file, "\t{\n"); | 1120 fprintf (file, "\t{\n"); |
1090 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name); | 1121 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name); |
1122 if (strcmp (name, "x") == 0 && cfun->machine->red_partition) | |
1123 { | |
1124 fprintf (file, "\t\t.reg.u64\t%%t_red;\n"); | |
1125 fprintf (file, "\t\t.reg.u64\t%%y64;\n"); | |
1126 } | |
1091 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name); | 1127 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name); |
1092 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name); | 1128 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name); |
1129 if (strcmp (name, "x") == 0 && cfun->machine->red_partition) | |
1130 { | |
1131 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n"); | |
1132 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n"); | |
1133 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; " | |
1134 "// vector reduction buffer\n", | |
1135 REGNO (cfun->machine->red_partition), | |
1136 vector_red_partition); | |
1137 } | |
1138 /* Verify vector_red_size. */ | |
1139 gcc_assert (vector_red_partition * nvptx_mach_max_workers () | |
1140 <= vector_red_size); | |
1141 fprintf (file, "\t}\n"); | |
1142 } | |
1143 | |
1144 /* Emit code to initialize OpenACC worker broadcast and synchronization | |
1145 registers. */ | |
1146 | |
1147 static void | |
1148 nvptx_init_oacc_workers (FILE *file) | |
1149 { | |
1150 fprintf (file, "\t{\n"); | |
1151 fprintf (file, "\t\t.reg.u32\t%%tidy;\n"); | |
1152 if (cfun->machine->bcast_partition) | |
1153 { | |
1154 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n"); | |
1155 fprintf (file, "\t\t.reg.u64\t%%y64;\n"); | |
1156 } | |
1157 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n"); | |
1158 if (cfun->machine->bcast_partition) | |
1159 { | |
1160 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n"); | |
1161 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n"); | |
1162 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n"); | |
1163 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; " | |
1164 "// vector broadcast offset\n", | |
1165 REGNO (cfun->machine->bcast_partition), | |
1166 oacc_bcast_partition); | |
1167 } | |
1168 /* Verify oacc_bcast_size. */ | |
1169 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1) | |
1170 <= oacc_bcast_size); | |
1171 if (cfun->machine->sync_bar) | |
1172 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; " | |
1173 "// vector synchronization barrier\n", | |
1174 REGNO (cfun->machine->sync_bar)); | |
1093 fprintf (file, "\t}\n"); | 1175 fprintf (file, "\t}\n"); |
1094 } | 1176 } |
1095 | 1177 |
1096 /* Emit code to initialize predicate and master lane index registers for | 1178 /* Emit code to initialize predicate and master lane index registers for |
1097 -muniform-simt code generation variant. */ | 1179 -muniform-simt code generation variant. */ |
1290 simtsz += align - GET_MODE_SIZE (DImode); | 1372 simtsz += align - GET_MODE_SIZE (DImode); |
1291 if (simtsz) | 1373 if (simtsz) |
1292 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" | 1374 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" |
1293 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); | 1375 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); |
1294 } | 1376 } |
1377 | |
1378 /* Restore the vector reduction partition register, if necessary. | |
1379 FIXME: Find out when and why this is necessary, and fix it. */ | |
1380 if (cfun->machine->red_partition) | |
1381 regno_reg_rtx[REGNO (cfun->machine->red_partition)] | |
1382 = cfun->machine->red_partition; | |
1383 | |
1295 /* Declare the pseudos we have as ptx registers. */ | 1384 /* Declare the pseudos we have as ptx registers. */ |
1296 int maxregs = max_reg_num (); | 1385 int maxregs = max_reg_num (); |
1297 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) | 1386 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) |
1298 { | 1387 { |
1299 if (regno_reg_rtx[i] != const0_rtx) | 1388 if (regno_reg_rtx[i] != const0_rtx) |
1317 nvptx_init_axis_predicate (file, | 1406 nvptx_init_axis_predicate (file, |
1318 REGNO (cfun->machine->axis_predicate[1]), "x"); | 1407 REGNO (cfun->machine->axis_predicate[1]), "x"); |
1319 if (cfun->machine->unisimt_predicate | 1408 if (cfun->machine->unisimt_predicate |
1320 || (cfun->machine->has_simtreg && !crtl->is_leaf)) | 1409 || (cfun->machine->has_simtreg && !crtl->is_leaf)) |
1321 nvptx_init_unisimt_predicate (file); | 1410 nvptx_init_unisimt_predicate (file); |
1411 if (cfun->machine->bcast_partition || cfun->machine->sync_bar) | |
1412 nvptx_init_oacc_workers (file); | |
1322 } | 1413 } |
1323 | 1414 |
1324 /* Output code for switching uniform-simt state. ENTERING indicates whether | 1415 /* Output code for switching uniform-simt state. ENTERING indicates whether |
1325 we are entering or leaving non-uniform execution region. */ | 1416 we are entering or leaving non-uniform execution region. */ |
1326 | 1417 |
1376 else | 1467 else |
1377 output_reg (file, REGNO (size), VOIDmode); | 1468 output_reg (file, REGNO (size), VOIDmode); |
1378 fputs (";\n", file); | 1469 fputs (";\n", file); |
1379 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) | 1470 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) |
1380 fprintf (file, | 1471 fprintf (file, |
1381 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n", | 1472 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n", |
1382 bits, regno, regno, UINTVAL (align)); | 1473 bits, regno, regno, UINTVAL (align)); |
1383 } | 1474 } |
1384 if (cfun->machine->has_softstack) | 1475 if (cfun->machine->has_softstack) |
1385 { | 1476 { |
1386 const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; | 1477 const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; |
1740 | 1831 |
1741 /* Generate an instruction or sequence to broadcast register REG | 1832 /* Generate an instruction or sequence to broadcast register REG |
1742 across the vectors of a single warp. */ | 1833 across the vectors of a single warp. */ |
1743 | 1834 |
1744 static rtx | 1835 static rtx |
1745 nvptx_gen_vcast (rtx reg) | 1836 nvptx_gen_warp_bcast (rtx reg) |
1746 { | 1837 { |
1747 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX); | 1838 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX); |
1748 } | 1839 } |
1749 | 1840 |
1750 /* Structure used when generating a worker-level spill or fill. */ | 1841 /* Structure used when generating a worker-level spill or fill. */ |
1751 | 1842 |
1752 struct wcast_data_t | 1843 struct broadcast_data_t |
1753 { | 1844 { |
1754 rtx base; /* Register holding base addr of buffer. */ | 1845 rtx base; /* Register holding base addr of buffer. */ |
1755 rtx ptr; /* Iteration var, if needed. */ | 1846 rtx ptr; /* Iteration var, if needed. */ |
1756 unsigned offset; /* Offset into worker buffer. */ | 1847 unsigned offset; /* Offset into worker buffer. */ |
1757 }; | 1848 }; |
1771 /* Generate instruction(s) to spill or fill register REG to/from the | 1862 /* Generate instruction(s) to spill or fill register REG to/from the |
1772 worker broadcast array. PM indicates what is to be done, REP | 1863 worker broadcast array. PM indicates what is to be done, REP |
1773 how many loop iterations will be executed (0 for not a loop). */ | 1864 how many loop iterations will be executed (0 for not a loop). */ |
1774 | 1865 |
1775 static rtx | 1866 static rtx |
1776 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data) | 1867 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, |
1868 broadcast_data_t *data, bool vector) | |
1777 { | 1869 { |
1778 rtx res; | 1870 rtx res; |
1779 machine_mode mode = GET_MODE (reg); | 1871 machine_mode mode = GET_MODE (reg); |
1780 | 1872 |
1781 switch (mode) | 1873 switch (mode) |
1785 rtx tmp = gen_reg_rtx (SImode); | 1877 rtx tmp = gen_reg_rtx (SImode); |
1786 | 1878 |
1787 start_sequence (); | 1879 start_sequence (); |
1788 if (pm & PM_read) | 1880 if (pm & PM_read) |
1789 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx)); | 1881 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx)); |
1790 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data)); | 1882 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector)); |
1791 if (pm & PM_write) | 1883 if (pm & PM_write) |
1792 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx))); | 1884 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx))); |
1793 res = get_insns (); | 1885 res = get_insns (); |
1794 end_sequence (); | 1886 end_sequence (); |
1795 } | 1887 } |
1801 | 1893 |
1802 if (!addr) | 1894 if (!addr) |
1803 { | 1895 { |
1804 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; | 1896 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; |
1805 | 1897 |
1806 if (align > worker_bcast_align) | 1898 oacc_bcast_align = MAX (oacc_bcast_align, align); |
1807 worker_bcast_align = align; | 1899 data->offset = ROUND_UP (data->offset, align); |
1808 data->offset = (data->offset + align - 1) & ~(align - 1); | |
1809 addr = data->base; | 1900 addr = data->base; |
1901 gcc_assert (data->base != NULL); | |
1810 if (data->offset) | 1902 if (data->offset) |
1811 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset)); | 1903 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset)); |
1812 } | 1904 } |
1813 | 1905 |
1814 addr = gen_rtx_MEM (mode, addr); | 1906 addr = gen_rtx_MEM (mode, addr); |
1924 | 2016 |
1925 for (unsigned part = 0; size; size -= part) | 2017 for (unsigned part = 0; size; size -= part) |
1926 { | 2018 { |
1927 val >>= part * BITS_PER_UNIT; | 2019 val >>= part * BITS_PER_UNIT; |
1928 part = init_frag.size - init_frag.offset; | 2020 part = init_frag.size - init_frag.offset; |
1929 if (part > size) | 2021 part = MIN (part, size); |
1930 part = size; | |
1931 | 2022 |
1932 unsigned HOST_WIDE_INT partial | 2023 unsigned HOST_WIDE_INT partial |
1933 = val << (init_frag.offset * BITS_PER_UNIT); | 2024 = val << (init_frag.offset * BITS_PER_UNIT); |
1934 init_frag.val |= partial & init_frag.mask; | 2025 init_frag.val |= partial & init_frag.mask; |
1935 init_frag.offset += part; | 2026 init_frag.offset += part; |
1988 { | 2079 { |
1989 /* Finish the current fragment, if it's started. */ | 2080 /* Finish the current fragment, if it's started. */ |
1990 if (init_frag.offset) | 2081 if (init_frag.offset) |
1991 { | 2082 { |
1992 unsigned part = init_frag.size - init_frag.offset; | 2083 unsigned part = init_frag.size - init_frag.offset; |
1993 if (part > size) | 2084 part = MIN (part, (unsigned)size); |
1994 part = (unsigned) size; | |
1995 size -= part; | 2085 size -= part; |
1996 nvptx_assemble_value (0, part); | 2086 nvptx_assemble_value (0, part); |
1997 } | 2087 } |
1998 | 2088 |
1999 /* If this skip doesn't terminate the initializer, write as many | 2089 /* If this skip doesn't terminate the initializer, write as many |
2861 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); | 2951 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); |
2862 validate_change (insn, &PATTERN (insn), pat, false); | 2952 validate_change (insn, &PATTERN (insn), pat, false); |
2863 } | 2953 } |
2864 } | 2954 } |
2865 | 2955 |
2956 /* Offloading function attributes. */ | |
2957 | |
2958 struct offload_attrs | |
2959 { | |
2960 unsigned mask; | |
2961 int num_gangs; | |
2962 int num_workers; | |
2963 int vector_length; | |
2964 }; | |
2965 | |
2966 /* Define entries for cfun->machine->axis_dim. */ | |
2967 | |
2968 #define MACH_VECTOR_LENGTH 0 | |
2969 #define MACH_MAX_WORKERS 1 | |
2970 | |
2971 static void populate_offload_attrs (offload_attrs *oa); | |
2972 | |
2973 static void | |
2974 init_axis_dim (void) | |
2975 { | |
2976 offload_attrs oa; | |
2977 int max_workers; | |
2978 | |
2979 populate_offload_attrs (&oa); | |
2980 | |
2981 if (oa.num_workers == 0) | |
2982 max_workers = PTX_CTA_SIZE / oa.vector_length; | |
2983 else | |
2984 max_workers = oa.num_workers; | |
2985 | |
2986 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length; | |
2987 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers; | |
2988 cfun->machine->axis_dim_init_p = true; | |
2989 } | |
2990 | |
2991 static int ATTRIBUTE_UNUSED | |
2992 nvptx_mach_max_workers () | |
2993 { | |
2994 if (!cfun->machine->axis_dim_init_p) | |
2995 init_axis_dim (); | |
2996 return cfun->machine->axis_dim[MACH_MAX_WORKERS]; | |
2997 } | |
2998 | |
2999 static int ATTRIBUTE_UNUSED | |
3000 nvptx_mach_vector_length () | |
3001 { | |
3002 if (!cfun->machine->axis_dim_init_p) | |
3003 init_axis_dim (); | |
3004 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH]; | |
3005 } | |
3006 | |
2866 /* Loop structure of the function. The entire function is described as | 3007 /* Loop structure of the function. The entire function is described as |
2867 a NULL loop. */ | 3008 a NULL loop. */ |
2868 | 3009 |
2869 struct parallel | 3010 struct parallel |
2870 { | 3011 { |
3008 block = e->dest; | 3149 block = e->dest; |
3009 map->get_or_insert (block) = elt->first; | 3150 map->get_or_insert (block) = elt->first; |
3010 } | 3151 } |
3011 } | 3152 } |
3012 | 3153 |
3154 /* Return true if MASK contains parallelism that requires shared | |
3155 memory to broadcast. */ | |
3156 | |
3157 static bool | |
3158 nvptx_needs_shared_bcast (unsigned mask) | |
3159 { | |
3160 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER); | |
3161 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) | |
3162 && nvptx_mach_vector_length () != PTX_WARP_SIZE; | |
3163 | |
3164 return worker || large_vector; | |
3165 } | |
3166 | |
3013 /* BLOCK is a basic block containing a head or tail instruction. | 3167 /* BLOCK is a basic block containing a head or tail instruction. |
3014 Locate the associated prehead or pretail instruction, which must be | 3168 Locate the associated prehead or pretail instruction, which must be |
3015 in the single predecessor block. */ | 3169 in the single predecessor block. */ |
3016 | 3170 |
3017 static rtx_insn * | 3171 static rtx_insn * |
3083 | 3237 |
3084 gcc_assert (mask); | 3238 gcc_assert (mask); |
3085 par = new parallel (par, mask); | 3239 par = new parallel (par, mask); |
3086 par->forked_block = block; | 3240 par->forked_block = block; |
3087 par->forked_insn = end; | 3241 par->forked_insn = end; |
3088 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) | 3242 if (nvptx_needs_shared_bcast (mask)) |
3089 par->fork_insn | 3243 par->fork_insn |
3090 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); | 3244 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); |
3091 } | 3245 } |
3092 break; | 3246 break; |
3093 | 3247 |
3096 parent. */ | 3250 parent. */ |
3097 { | 3251 { |
3098 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); | 3252 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); |
3099 | 3253 |
3100 gcc_assert (par->mask == mask); | 3254 gcc_assert (par->mask == mask); |
3255 gcc_assert (par->join_block == NULL); | |
3101 par->join_block = block; | 3256 par->join_block = block; |
3102 par->join_insn = end; | 3257 par->join_insn = end; |
3103 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) | 3258 if (nvptx_needs_shared_bcast (mask)) |
3104 par->joining_insn | 3259 par->joining_insn |
3105 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); | 3260 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); |
3106 par = par->parent; | 3261 par = par->parent; |
3107 } | 3262 } |
3108 break; | 3263 break; |
3389 { | 3544 { |
3390 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds; | 3545 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds; |
3391 size_t offset = (dir > 0 ? offsetof (edge_def, dest) | 3546 size_t offset = (dir > 0 ? offsetof (edge_def, dest) |
3392 : offsetof (edge_def, src)); | 3547 : offsetof (edge_def, src)); |
3393 edge e; | 3548 edge e; |
3394 edge_iterator (ei); | 3549 edge_iterator ei; |
3395 | 3550 |
3396 FOR_EACH_EDGE (e, ei, edges) | 3551 FOR_EACH_EDGE (e, ei, edges) |
3397 { | 3552 { |
3398 basic_block target = *(basic_block *)((char *)e + offset); | 3553 basic_block target = *(basic_block *)((char *)e + offset); |
3399 | 3554 |
3412 static void | 3567 static void |
3413 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, | 3568 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, |
3414 vec<edge, va_gc> *edges, size_t offset) | 3569 vec<edge, va_gc> *edges, size_t offset) |
3415 { | 3570 { |
3416 edge e; | 3571 edge e; |
3417 edge_iterator (ei); | 3572 edge_iterator ei; |
3418 int hi_back = depth; | 3573 int hi_back = depth; |
3419 pseudo_node_t node_back (0, depth); | 3574 pseudo_node_t node_back (0, depth); |
3420 int hi_child = depth; | 3575 int hi_child = depth; |
3421 pseudo_node_t node_child (0, depth); | 3576 pseudo_node_t node_child (0, depth); |
3422 basic_block child = NULL; | 3577 basic_block child = NULL; |
3795 frame for calls and non-calls. We could do better by (a) | 3950 frame for calls and non-calls. We could do better by (a) |
3796 propagating just the live set that is used within the partitioned | 3951 propagating just the live set that is used within the partitioned |
3797 regions and (b) only propagating stack entries that are used. The | 3952 regions and (b) only propagating stack entries that are used. The |
3798 latter might be quite hard to determine. */ | 3953 latter might be quite hard to determine. */ |
3799 | 3954 |
3800 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *); | 3955 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool); |
3801 | 3956 |
3802 static bool | 3957 static bool |
3803 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, | 3958 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, |
3804 propagate_mask rw, propagator_fn fn, void *data) | 3959 propagate_mask rw, propagator_fn fn, void *data, bool vector) |
3805 { | 3960 { |
3806 bitmap live = DF_LIVE_IN (block); | 3961 bitmap live = DF_LIVE_IN (block); |
3807 bitmap_iterator iterator; | 3962 bitmap_iterator iterator; |
3808 unsigned ix; | 3963 unsigned ix; |
3809 bool empty = true; | 3964 bool empty = true; |
3834 pred = gen_reg_rtx (BImode); | 3989 pred = gen_reg_rtx (BImode); |
3835 label = gen_label_rtx (); | 3990 label = gen_label_rtx (); |
3836 | 3991 |
3837 emit_insn (gen_rtx_SET (idx, GEN_INT (fs))); | 3992 emit_insn (gen_rtx_SET (idx, GEN_INT (fs))); |
3838 /* Allow worker function to initialize anything needed. */ | 3993 /* Allow worker function to initialize anything needed. */ |
3839 rtx init = fn (tmp, PM_loop_begin, fs, data); | 3994 rtx init = fn (tmp, PM_loop_begin, fs, data, vector); |
3840 if (init) | 3995 if (init) |
3841 emit_insn (init); | 3996 emit_insn (init); |
3842 emit_label (label); | 3997 emit_label (label); |
3843 LABEL_NUSES (label)++; | 3998 LABEL_NUSES (label)++; |
3844 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1))); | 3999 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1))); |
3845 } | 4000 } |
3846 if (rw & PM_read) | 4001 if (rw & PM_read) |
3847 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr))); | 4002 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr))); |
3848 emit_insn (fn (tmp, rw, fs, data)); | 4003 emit_insn (fn (tmp, rw, fs, data, vector)); |
3849 if (rw & PM_write) | 4004 if (rw & PM_write) |
3850 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp)); | 4005 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp)); |
3851 if (fs) | 4006 if (fs) |
3852 { | 4007 { |
3853 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx))); | 4008 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx))); |
3854 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode)))); | 4009 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode)))); |
3855 emit_insn (gen_br_true_uni (pred, label)); | 4010 emit_insn (gen_br_true_uni (pred, label)); |
3856 rtx fini = fn (tmp, PM_loop_end, fs, data); | 4011 rtx fini = fn (tmp, PM_loop_end, fs, data, vector); |
3857 if (fini) | 4012 if (fini) |
3858 emit_insn (fini); | 4013 emit_insn (fini); |
3859 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx)); | 4014 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx)); |
3860 } | 4015 } |
3861 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp)); | 4016 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp)); |
3871 { | 4026 { |
3872 rtx reg = regno_reg_rtx[ix]; | 4027 rtx reg = regno_reg_rtx[ix]; |
3873 | 4028 |
3874 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) | 4029 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) |
3875 { | 4030 { |
3876 rtx bcast = fn (reg, rw, 0, data); | 4031 rtx bcast = fn (reg, rw, 0, data, vector); |
3877 | 4032 |
3878 insn = emit_insn_after (bcast, insn); | 4033 insn = emit_insn_after (bcast, insn); |
3879 empty = false; | 4034 empty = false; |
3880 } | 4035 } |
3881 } | 4036 } |
3882 return empty; | 4037 return empty; |
3883 } | 4038 } |
3884 | 4039 |
3885 /* Worker for nvptx_vpropagate. */ | 4040 /* Worker for nvptx_warp_propagate. */ |
3886 | 4041 |
3887 static rtx | 4042 static rtx |
3888 vprop_gen (rtx reg, propagate_mask pm, | 4043 warp_prop_gen (rtx reg, propagate_mask pm, |
3889 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data)) | 4044 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data), |
4045 bool ARG_UNUSED (vector)) | |
3890 { | 4046 { |
3891 if (!(pm & PM_read_write)) | 4047 if (!(pm & PM_read_write)) |
3892 return 0; | 4048 return 0; |
3893 | 4049 |
3894 return nvptx_gen_vcast (reg); | 4050 return nvptx_gen_warp_bcast (reg); |
3895 } | 4051 } |
3896 | 4052 |
3897 /* Propagate state that is live at start of BLOCK across the vectors | 4053 /* Propagate state that is live at start of BLOCK across the vectors |
3898 of a single warp. Propagation is inserted just after INSN. | 4054 of a single warp. Propagation is inserted just after INSN. |
3899 IS_CALL and return as for nvptx_propagate. */ | 4055 IS_CALL and return as for nvptx_propagate. */ |
3900 | 4056 |
3901 static bool | 4057 static bool |
3902 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn) | 4058 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn) |
3903 { | 4059 { |
3904 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0); | 4060 return nvptx_propagate (is_call, block, insn, PM_read_write, |
3905 } | 4061 warp_prop_gen, 0, false); |
3906 | 4062 } |
3907 /* Worker for nvptx_wpropagate. */ | 4063 |
4064 /* Worker for nvptx_shared_propagate. */ | |
3908 | 4065 |
3909 static rtx | 4066 static rtx |
3910 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_) | 4067 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_, |
3911 { | 4068 bool vector) |
3912 wcast_data_t *data = (wcast_data_t *)data_; | 4069 { |
4070 broadcast_data_t *data = (broadcast_data_t *)data_; | |
3913 | 4071 |
3914 if (pm & PM_loop_begin) | 4072 if (pm & PM_loop_begin) |
3915 { | 4073 { |
3916 /* Starting a loop, initialize pointer. */ | 4074 /* Starting a loop, initialize pointer. */ |
3917 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT; | 4075 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT; |
3918 | 4076 |
3919 if (align > worker_bcast_align) | 4077 oacc_bcast_align = MAX (oacc_bcast_align, align); |
3920 worker_bcast_align = align; | 4078 data->offset = ROUND_UP (data->offset, align); |
3921 data->offset = (data->offset + align - 1) & ~(align - 1); | |
3922 | 4079 |
3923 data->ptr = gen_reg_rtx (Pmode); | 4080 data->ptr = gen_reg_rtx (Pmode); |
3924 | 4081 |
3925 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset)); | 4082 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset)); |
3926 } | 4083 } |
3929 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr); | 4086 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr); |
3930 data->ptr = NULL_RTX; | 4087 data->ptr = NULL_RTX; |
3931 return clobber; | 4088 return clobber; |
3932 } | 4089 } |
3933 else | 4090 else |
3934 return nvptx_gen_wcast (reg, pm, rep, data); | 4091 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector); |
3935 } | 4092 } |
3936 | 4093 |
3937 /* Spill or fill live state that is live at start of BLOCK. PRE_P | 4094 /* Spill or fill live state that is live at start of BLOCK. PRE_P |
3938 indicates if this is just before partitioned mode (do spill), or | 4095 indicates if this is just before partitioned mode (do spill), or |
3939 just after it starts (do fill). Sequence is inserted just after | 4096 just after it starts (do fill). Sequence is inserted just after |
3940 INSN. IS_CALL and return as for nvptx_propagate. */ | 4097 INSN. IS_CALL and return as for nvptx_propagate. */ |
3941 | 4098 |
3942 static bool | 4099 static bool |
3943 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn) | 4100 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block, |
3944 { | 4101 rtx_insn *insn, bool vector) |
3945 wcast_data_t data; | 4102 { |
4103 broadcast_data_t data; | |
3946 | 4104 |
3947 data.base = gen_reg_rtx (Pmode); | 4105 data.base = gen_reg_rtx (Pmode); |
3948 data.offset = 0; | 4106 data.offset = 0; |
3949 data.ptr = NULL_RTX; | 4107 data.ptr = NULL_RTX; |
3950 | 4108 |
3951 bool empty = nvptx_propagate (is_call, block, insn, | 4109 bool empty = nvptx_propagate (is_call, block, insn, |
3952 pre_p ? PM_read : PM_write, wprop_gen, &data); | 4110 pre_p ? PM_read : PM_write, shared_prop_gen, |
4111 &data, vector); | |
3953 gcc_assert (empty == !data.offset); | 4112 gcc_assert (empty == !data.offset); |
3954 if (data.offset) | 4113 if (data.offset) |
3955 { | 4114 { |
4115 rtx bcast_sym = oacc_bcast_sym; | |
4116 | |
3956 /* Stuff was emitted, initialize the base pointer now. */ | 4117 /* Stuff was emitted, initialize the base pointer now. */ |
3957 rtx init = gen_rtx_SET (data.base, worker_bcast_sym); | 4118 if (vector && nvptx_mach_max_workers () > 1) |
4119 { | |
4120 if (!cfun->machine->bcast_partition) | |
4121 { | |
4122 /* It would be nice to place this register in | |
4123 DATA_AREA_SHARED. */ | |
4124 cfun->machine->bcast_partition = gen_reg_rtx (DImode); | |
4125 } | |
4126 if (!cfun->machine->sync_bar) | |
4127 cfun->machine->sync_bar = gen_reg_rtx (SImode); | |
4128 | |
4129 bcast_sym = cfun->machine->bcast_partition; | |
4130 } | |
4131 | |
4132 rtx init = gen_rtx_SET (data.base, bcast_sym); | |
3958 emit_insn_after (init, insn); | 4133 emit_insn_after (init, insn); |
3959 | 4134 |
3960 if (worker_bcast_size < data.offset) | 4135 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align); |
3961 worker_bcast_size = data.offset; | 4136 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE |
4137 ? nvptx_mach_max_workers () + 1 | |
4138 : 1); | |
4139 | |
4140 oacc_bcast_partition = MAX (oacc_bcast_partition, psize); | |
4141 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum); | |
3962 } | 4142 } |
3963 return empty; | 4143 return empty; |
3964 } | 4144 } |
3965 | 4145 |
3966 /* Emit a worker-level synchronization barrier. We use different | 4146 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number, |
3967 markers for before and after synchronizations. */ | 4147 which is an integer or a register. THREADS is the number of threads |
4148 controlled by the barrier. */ | |
3968 | 4149 |
3969 static rtx | 4150 static rtx |
3970 nvptx_wsync (bool after) | 4151 nvptx_cta_sync (rtx lock, int threads) |
3971 { | 4152 { |
3972 return gen_nvptx_barsync (GEN_INT (after)); | 4153 return gen_nvptx_barsync (lock, GEN_INT (threads)); |
3973 } | 4154 } |
3974 | 4155 |
3975 #if WORKAROUND_PTXJIT_BUG | 4156 #if WORKAROUND_PTXJIT_BUG |
3976 /* Return first real insn in BB, or return NULL_RTX if BB does not contain | 4157 /* Return first real insn in BB, or return NULL_RTX if BB does not contain |
3977 real insns. */ | 4158 real insns. */ |
4214 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) | 4395 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) |
4215 if (GOMP_DIM_MASK (mode) & skip_mask) | 4396 if (GOMP_DIM_MASK (mode) & skip_mask) |
4216 { | 4397 { |
4217 rtx_code_label *label = gen_label_rtx (); | 4398 rtx_code_label *label = gen_label_rtx (); |
4218 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; | 4399 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; |
4219 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump; | 4400 rtx_insn **mode_jump |
4220 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label; | 4401 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump; |
4402 rtx_insn **mode_label | |
4403 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label; | |
4221 | 4404 |
4222 if (!pred) | 4405 if (!pred) |
4223 { | 4406 { |
4224 pred = gen_reg_rtx (BImode); | 4407 pred = gen_reg_rtx (BImode); |
4225 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred; | 4408 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred; |
4226 } | 4409 } |
4227 | 4410 |
4228 rtx br; | 4411 rtx br; |
4229 if (mode == GOMP_DIM_VECTOR) | 4412 if (mode == GOMP_DIM_VECTOR) |
4230 br = gen_br_true (pred, label); | 4413 br = gen_br_true (pred, label); |
4231 else | 4414 else |
4232 br = gen_br_true_uni (pred, label); | 4415 br = gen_br_true_uni (pred, label); |
4249 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) | 4432 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) |
4250 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) | 4433 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) |
4251 emit_insn_after (gen_exit (), label_insn); | 4434 emit_insn_after (gen_exit (), label_insn); |
4252 } | 4435 } |
4253 | 4436 |
4254 if (mode == GOMP_DIM_VECTOR) | 4437 *mode_label = label_insn; |
4255 vector_label = label_insn; | |
4256 else | |
4257 worker_label = label_insn; | |
4258 } | 4438 } |
4259 | 4439 |
4260 /* Now deal with propagating the branch condition. */ | 4440 /* Now deal with propagating the branch condition. */ |
4261 if (cond_branch) | 4441 if (cond_branch) |
4262 { | 4442 { |
4263 rtx pvar = XEXP (XEXP (cond_branch, 0), 0); | 4443 rtx pvar = XEXP (XEXP (cond_branch, 0), 0); |
4264 | 4444 |
4265 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) | 4445 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask |
4446 && nvptx_mach_vector_length () == PTX_WARP_SIZE) | |
4266 { | 4447 { |
4267 /* Vector mode only, do a shuffle. */ | 4448 /* Vector mode only, do a shuffle. */ |
4268 #if WORKAROUND_PTXJIT_BUG | 4449 #if WORKAROUND_PTXJIT_BUG |
4269 /* The branch condition %rcond is propagated like this: | 4450 /* The branch condition %rcond is propagated like this: |
4270 | 4451 |
4320 emit_insn_before (gen_movbi (tmp, const0_rtx), | 4501 emit_insn_before (gen_movbi (tmp, const0_rtx), |
4321 bb_first_real_insn (from)); | 4502 bb_first_real_insn (from)); |
4322 emit_insn_before (gen_rtx_SET (tmp, pvar), label); | 4503 emit_insn_before (gen_rtx_SET (tmp, pvar), label); |
4323 emit_insn_before (gen_rtx_SET (pvar, tmp), tail); | 4504 emit_insn_before (gen_rtx_SET (pvar, tmp), tail); |
4324 #endif | 4505 #endif |
4325 emit_insn_before (nvptx_gen_vcast (pvar), tail); | 4506 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail); |
4326 } | 4507 } |
4327 else | 4508 else |
4328 { | 4509 { |
4329 /* Includes worker mode, do spill & fill. By construction | 4510 /* Includes worker mode, do spill & fill. By construction |
4330 we should never have worker mode only. */ | 4511 we should never have worker mode only. */ |
4331 wcast_data_t data; | 4512 broadcast_data_t data; |
4332 | 4513 unsigned size = GET_MODE_SIZE (SImode); |
4333 data.base = worker_bcast_sym; | 4514 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0; |
4515 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0; | |
4516 rtx barrier = GEN_INT (0); | |
4517 int threads = 0; | |
4518 | |
4519 data.base = oacc_bcast_sym; | |
4334 data.ptr = 0; | 4520 data.ptr = 0; |
4335 | 4521 |
4336 if (worker_bcast_size < GET_MODE_SIZE (SImode)) | 4522 bool use_partitioning_p = (vector && !worker |
4337 worker_bcast_size = GET_MODE_SIZE (SImode); | 4523 && nvptx_mach_max_workers () > 1 |
4524 && cfun->machine->bcast_partition); | |
4525 if (use_partitioning_p) | |
4526 { | |
4527 data.base = cfun->machine->bcast_partition; | |
4528 barrier = cfun->machine->sync_bar; | |
4529 threads = nvptx_mach_vector_length (); | |
4530 } | |
4531 gcc_assert (data.base != NULL); | |
4532 gcc_assert (barrier); | |
4533 | |
4534 unsigned int psize = ROUND_UP (size, oacc_bcast_align); | |
4535 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE | |
4536 ? nvptx_mach_max_workers () + 1 | |
4537 : 1); | |
4538 | |
4539 oacc_bcast_partition = MAX (oacc_bcast_partition, psize); | |
4540 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum); | |
4338 | 4541 |
4339 data.offset = 0; | 4542 data.offset = 0; |
4340 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data), | 4543 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data, |
4544 vector), | |
4341 before); | 4545 before); |
4546 | |
4342 /* Barrier so other workers can see the write. */ | 4547 /* Barrier so other workers can see the write. */ |
4343 emit_insn_before (nvptx_wsync (false), tail); | 4548 emit_insn_before (nvptx_cta_sync (barrier, threads), tail); |
4344 data.offset = 0; | 4549 data.offset = 0; |
4345 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail); | 4550 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data, |
4551 vector), | |
4552 tail); | |
4346 /* This barrier is needed to avoid worker zero clobbering | 4553 /* This barrier is needed to avoid worker zero clobbering |
4347 the broadcast buffer before all the other workers have | 4554 the broadcast buffer before all the other workers have |
4348 had a chance to read this instance of it. */ | 4555 had a chance to read this instance of it. */ |
4349 emit_insn_before (nvptx_wsync (true), tail); | 4556 emit_insn_before (nvptx_cta_sync (barrier, threads), tail); |
4350 } | 4557 } |
4351 | 4558 |
4352 extract_insn (tail); | 4559 extract_insn (tail); |
4353 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar), | 4560 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar), |
4354 UNSPEC_BR_UNIFIED); | 4561 UNSPEC_BR_UNIFIED); |
4458 par->inner_mask = nvptx_process_pars (par->inner); | 4665 par->inner_mask = nvptx_process_pars (par->inner); |
4459 inner_mask |= par->inner_mask; | 4666 inner_mask |= par->inner_mask; |
4460 } | 4667 } |
4461 | 4668 |
4462 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0; | 4669 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0; |
4463 | 4670 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)); |
4464 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) | 4671 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) |
4465 { | 4672 && nvptx_mach_vector_length () > PTX_WARP_SIZE); |
4466 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn); | 4673 |
4467 bool empty = nvptx_wpropagate (true, is_call, | 4674 if (worker || large_vector) |
4468 par->forked_block, par->fork_insn); | 4675 { |
4469 | 4676 nvptx_shared_propagate (false, is_call, par->forked_block, |
4470 if (!empty || !is_call) | 4677 par->forked_insn, !worker); |
4678 bool no_prop_p | |
4679 = nvptx_shared_propagate (true, is_call, par->forked_block, | |
4680 par->fork_insn, !worker); | |
4681 bool empty_loop_p | |
4682 = !is_call && (NEXT_INSN (par->forked_insn) | |
4683 && NEXT_INSN (par->forked_insn) == par->joining_insn); | |
4684 rtx barrier = GEN_INT (0); | |
4685 int threads = 0; | |
4686 | |
4687 if (!worker && cfun->machine->sync_bar) | |
4688 { | |
4689 barrier = cfun->machine->sync_bar; | |
4690 threads = nvptx_mach_vector_length (); | |
4691 } | |
4692 | |
4693 if (no_prop_p && empty_loop_p) | |
4694 ; | |
4695 else if (no_prop_p && is_call) | |
4696 ; | |
4697 else | |
4471 { | 4698 { |
4472 /* Insert begin and end synchronizations. */ | 4699 /* Insert begin and end synchronizations. */ |
4473 emit_insn_before (nvptx_wsync (false), par->forked_insn); | 4700 emit_insn_before (nvptx_cta_sync (barrier, threads), |
4474 emit_insn_before (nvptx_wsync (true), par->join_insn); | 4701 par->forked_insn); |
4702 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn); | |
4475 } | 4703 } |
4476 } | 4704 } |
4477 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) | 4705 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) |
4478 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn); | 4706 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn); |
4479 | 4707 |
4480 /* Now do siblings. */ | 4708 /* Now do siblings. */ |
4481 if (par->next) | 4709 if (par->next) |
4482 inner_mask |= nvptx_process_pars (par->next); | 4710 inner_mask |= nvptx_process_pars (par->next); |
4483 return inner_mask; | 4711 return inner_mask; |
4552 } | 4780 } |
4553 } | 4781 } |
4554 } | 4782 } |
4555 | 4783 |
4556 if (skip_mask) | 4784 if (skip_mask) |
4557 nvptx_skip_par (skip_mask, par); | 4785 nvptx_skip_par (skip_mask, par); |
4558 | 4786 |
4559 if (par->next) | 4787 if (par->next) |
4560 nvptx_neuter_pars (par->next, modes, outer); | 4788 nvptx_neuter_pars (par->next, modes, outer); |
4789 } | |
4790 | |
4791 static void | |
4792 populate_offload_attrs (offload_attrs *oa) | |
4793 { | |
4794 tree attr = oacc_get_fn_attrib (current_function_decl); | |
4795 tree dims = TREE_VALUE (attr); | |
4796 unsigned ix; | |
4797 | |
4798 oa->mask = 0; | |
4799 | |
4800 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) | |
4801 { | |
4802 tree t = TREE_VALUE (dims); | |
4803 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t); | |
4804 tree allowed = TREE_PURPOSE (dims); | |
4805 | |
4806 if (size != 1 && !(allowed && integer_zerop (allowed))) | |
4807 oa->mask |= GOMP_DIM_MASK (ix); | |
4808 | |
4809 switch (ix) | |
4810 { | |
4811 case GOMP_DIM_GANG: | |
4812 oa->num_gangs = size; | |
4813 break; | |
4814 | |
4815 case GOMP_DIM_WORKER: | |
4816 oa->num_workers = size; | |
4817 break; | |
4818 | |
4819 case GOMP_DIM_VECTOR: | |
4820 oa->vector_length = size; | |
4821 break; | |
4822 } | |
4823 } | |
4561 } | 4824 } |
4562 | 4825 |
4563 #if WORKAROUND_PTXJIT_BUG_2 | 4826 #if WORKAROUND_PTXJIT_BUG_2 |
4564 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant | 4827 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant |
4565 is needed in the nvptx target because the branches generated for | 4828 is needed in the nvptx target because the branches generated for |
4739 tree attr = oacc_get_fn_attrib (current_function_decl); | 5002 tree attr = oacc_get_fn_attrib (current_function_decl); |
4740 if (attr) | 5003 if (attr) |
4741 { | 5004 { |
4742 /* If we determined this mask before RTL expansion, we could | 5005 /* If we determined this mask before RTL expansion, we could |
4743 elide emission of some levels of forks and joins. */ | 5006 elide emission of some levels of forks and joins. */ |
4744 unsigned mask = 0; | 5007 offload_attrs oa; |
4745 tree dims = TREE_VALUE (attr); | 5008 |
4746 unsigned ix; | 5009 populate_offload_attrs (&oa); |
4747 | 5010 |
4748 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) | |
4749 { | |
4750 int size = TREE_INT_CST_LOW (TREE_VALUE (dims)); | |
4751 tree allowed = TREE_PURPOSE (dims); | |
4752 | |
4753 if (size != 1 && !(allowed && integer_zerop (allowed))) | |
4754 mask |= GOMP_DIM_MASK (ix); | |
4755 } | |
4756 /* If there is worker neutering, there must be vector | 5011 /* If there is worker neutering, there must be vector |
4757 neutering. Otherwise the hardware will fail. */ | 5012 neutering. Otherwise the hardware will fail. */ |
4758 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) | 5013 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
4759 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); | 5014 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); |
4760 | 5015 |
4761 /* Discover & process partitioned regions. */ | 5016 /* Discover & process partitioned regions. */ |
4762 parallel *pars = nvptx_discover_pars (&bb_insn_map); | 5017 parallel *pars = nvptx_discover_pars (&bb_insn_map); |
4763 nvptx_process_pars (pars); | 5018 nvptx_process_pars (pars); |
4764 nvptx_neuter_pars (pars, mask, 0); | 5019 nvptx_neuter_pars (pars, oa.mask, 0); |
4765 delete pars; | 5020 delete pars; |
4766 } | 5021 } |
4767 | 5022 |
4768 /* Replace subregs. */ | 5023 /* Replace subregs. */ |
4769 nvptx_reorg_subreg (); | 5024 nvptx_reorg_subreg (); |
4937 fputs ("\t.target\tsm_30\n", asm_out_file); | 5192 fputs ("\t.target\tsm_30\n", asm_out_file); |
4938 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); | 5193 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); |
4939 fputs ("// END PREAMBLE\n", asm_out_file); | 5194 fputs ("// END PREAMBLE\n", asm_out_file); |
4940 } | 5195 } |
4941 | 5196 |
4942 /* Emit a declaration for a worker-level buffer in .shared memory. */ | 5197 /* Emit a declaration for a worker and vector-level buffer in .shared |
5198 memory. */ | |
4943 | 5199 |
4944 static void | 5200 static void |
4945 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size) | 5201 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size) |
4946 { | 5202 { |
4947 const char *name = XSTR (sym, 0); | 5203 const char *name = XSTR (sym, 0); |
4948 | 5204 |
4949 write_var_marker (file, true, false, name); | 5205 write_var_marker (file, true, false, name); |
4950 fprintf (file, ".shared .align %d .u8 %s[%d];\n", | 5206 fprintf (file, ".shared .align %d .u8 %s[%d];\n", |
4961 tree decl; | 5217 tree decl; |
4962 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter) | 5218 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter) |
4963 nvptx_record_fndecl (decl); | 5219 nvptx_record_fndecl (decl); |
4964 fputs (func_decls.str().c_str(), asm_out_file); | 5220 fputs (func_decls.str().c_str(), asm_out_file); |
4965 | 5221 |
4966 if (worker_bcast_size) | 5222 if (oacc_bcast_size) |
4967 write_worker_buffer (asm_out_file, worker_bcast_sym, | 5223 write_shared_buffer (asm_out_file, oacc_bcast_sym, |
4968 worker_bcast_align, worker_bcast_size); | 5224 oacc_bcast_align, oacc_bcast_size); |
4969 | 5225 |
4970 if (worker_red_size) | 5226 if (worker_red_size) |
4971 write_worker_buffer (asm_out_file, worker_red_sym, | 5227 write_shared_buffer (asm_out_file, worker_red_sym, |
4972 worker_red_align, worker_red_size); | 5228 worker_red_align, worker_red_size); |
5229 | |
5230 if (vector_red_size) | |
5231 write_shared_buffer (asm_out_file, vector_red_sym, | |
5232 vector_red_align, vector_red_size); | |
4973 | 5233 |
4974 if (need_softstack_decl) | 5234 if (need_softstack_decl) |
4975 { | 5235 { |
4976 write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); | 5236 write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); |
4977 /* 32 is the maximum number of warps in a block. Even though it's an | 5237 /* 32 is the maximum number of warps in a block. Even though it's an |
5014 emit_insn (pat); | 5274 emit_insn (pat); |
5015 | 5275 |
5016 return target; | 5276 return target; |
5017 } | 5277 } |
5018 | 5278 |
5019 /* Worker reduction address expander. */ | 5279 const char * |
5280 nvptx_output_red_partition (rtx dst, rtx offset) | |
5281 { | |
5282 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n"; | |
5283 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n"; | |
5284 | |
5285 if (offset == const0_rtx) | |
5286 fprintf (asm_out_file, zero_offset, REGNO (dst), | |
5287 REGNO (cfun->machine->red_partition)); | |
5288 else | |
5289 fprintf (asm_out_file, with_offset, REGNO (dst), | |
5290 REGNO (cfun->machine->red_partition), UINTVAL (offset)); | |
5291 | |
5292 return ""; | |
5293 } | |
5294 | |
5295 /* Shared-memory reduction address expander. */ | |
5020 | 5296 |
5021 static rtx | 5297 static rtx |
5022 nvptx_expand_worker_addr (tree exp, rtx target, | 5298 nvptx_expand_shared_addr (tree exp, rtx target, |
5023 machine_mode ARG_UNUSED (mode), int ignore) | 5299 machine_mode ARG_UNUSED (mode), int ignore, |
5300 int vector) | |
5024 { | 5301 { |
5025 if (ignore) | 5302 if (ignore) |
5026 return target; | 5303 return target; |
5027 | 5304 |
5028 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2)); | 5305 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2)); |
5029 if (align > worker_red_align) | |
5030 worker_red_align = align; | |
5031 | |
5032 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0)); | 5306 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0)); |
5033 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1)); | 5307 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1)); |
5034 if (size + offset > worker_red_size) | |
5035 worker_red_size = size + offset; | |
5036 | |
5037 rtx addr = worker_red_sym; | 5308 rtx addr = worker_red_sym; |
5038 if (offset) | 5309 |
5039 { | 5310 if (vector) |
5040 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)); | 5311 { |
5041 addr = gen_rtx_CONST (Pmode, addr); | 5312 offload_attrs oa; |
5042 } | 5313 |
5314 populate_offload_attrs (&oa); | |
5315 | |
5316 unsigned int psize = ROUND_UP (size + offset, align); | |
5317 unsigned int pnum = nvptx_mach_max_workers (); | |
5318 vector_red_partition = MAX (vector_red_partition, psize); | |
5319 vector_red_size = MAX (vector_red_size, psize * pnum); | |
5320 vector_red_align = MAX (vector_red_align, align); | |
5321 | |
5322 if (cfun->machine->red_partition == NULL) | |
5323 cfun->machine->red_partition = gen_reg_rtx (Pmode); | |
5324 | |
5325 addr = gen_reg_rtx (Pmode); | |
5326 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset))); | |
5327 } | |
5328 else | |
5329 { | |
5330 worker_red_align = MAX (worker_red_align, align); | |
5331 worker_red_size = MAX (worker_red_size, size + offset); | |
5332 | |
5333 if (offset) | |
5334 { | |
5335 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)); | |
5336 addr = gen_rtx_CONST (Pmode, addr); | |
5337 } | |
5338 } | |
5043 | 5339 |
5044 emit_move_insn (target, addr); | 5340 emit_move_insn (target, addr); |
5045 | |
5046 return target; | 5341 return target; |
5047 } | 5342 } |
5048 | 5343 |
5049 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do | 5344 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do |
5050 not require taking the address of any object, other than the memory | 5345 not require taking the address of any object, other than the memory |
5088 enum nvptx_builtins | 5383 enum nvptx_builtins |
5089 { | 5384 { |
5090 NVPTX_BUILTIN_SHUFFLE, | 5385 NVPTX_BUILTIN_SHUFFLE, |
5091 NVPTX_BUILTIN_SHUFFLELL, | 5386 NVPTX_BUILTIN_SHUFFLELL, |
5092 NVPTX_BUILTIN_WORKER_ADDR, | 5387 NVPTX_BUILTIN_WORKER_ADDR, |
5388 NVPTX_BUILTIN_VECTOR_ADDR, | |
5093 NVPTX_BUILTIN_CMP_SWAP, | 5389 NVPTX_BUILTIN_CMP_SWAP, |
5094 NVPTX_BUILTIN_CMP_SWAPLL, | 5390 NVPTX_BUILTIN_CMP_SWAPLL, |
5095 NVPTX_BUILTIN_MAX | 5391 NVPTX_BUILTIN_MAX |
5096 }; | 5392 }; |
5097 | 5393 |
5125 | 5421 |
5126 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE)); | 5422 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE)); |
5127 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE)); | 5423 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE)); |
5128 DEF (WORKER_ADDR, "worker_addr", | 5424 DEF (WORKER_ADDR, "worker_addr", |
5129 (PTRVOID, ST, UINT, UINT, NULL_TREE)); | 5425 (PTRVOID, ST, UINT, UINT, NULL_TREE)); |
5426 DEF (VECTOR_ADDR, "vector_addr", | |
5427 (PTRVOID, ST, UINT, UINT, NULL_TREE)); | |
5130 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); | 5428 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); |
5131 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); | 5429 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); |
5132 | 5430 |
5133 #undef DEF | 5431 #undef DEF |
5134 #undef ST | 5432 #undef ST |
5146 static rtx | 5444 static rtx |
5147 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), | 5445 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), |
5148 machine_mode mode, int ignore) | 5446 machine_mode mode, int ignore) |
5149 { | 5447 { |
5150 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); | 5448 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); |
5151 switch (DECL_FUNCTION_CODE (fndecl)) | 5449 switch (DECL_MD_FUNCTION_CODE (fndecl)) |
5152 { | 5450 { |
5153 case NVPTX_BUILTIN_SHUFFLE: | 5451 case NVPTX_BUILTIN_SHUFFLE: |
5154 case NVPTX_BUILTIN_SHUFFLELL: | 5452 case NVPTX_BUILTIN_SHUFFLELL: |
5155 return nvptx_expand_shuffle (exp, target, mode, ignore); | 5453 return nvptx_expand_shuffle (exp, target, mode, ignore); |
5156 | 5454 |
5157 case NVPTX_BUILTIN_WORKER_ADDR: | 5455 case NVPTX_BUILTIN_WORKER_ADDR: |
5158 return nvptx_expand_worker_addr (exp, target, mode, ignore); | 5456 return nvptx_expand_shared_addr (exp, target, mode, ignore, false); |
5457 | |
5458 case NVPTX_BUILTIN_VECTOR_ADDR: | |
5459 return nvptx_expand_shared_addr (exp, target, mode, ignore, true); | |
5159 | 5460 |
5160 case NVPTX_BUILTIN_CMP_SWAP: | 5461 case NVPTX_BUILTIN_CMP_SWAP: |
5161 case NVPTX_BUILTIN_CMP_SWAPLL: | 5462 case NVPTX_BUILTIN_CMP_SWAPLL: |
5162 return nvptx_expand_cmp_swap (exp, target, mode, ignore); | 5463 return nvptx_expand_cmp_swap (exp, target, mode, ignore); |
5163 | 5464 |
5164 default: gcc_unreachable (); | 5465 default: gcc_unreachable (); |
5165 } | 5466 } |
5166 } | 5467 } |
5167 | |
5168 /* Define dimension sizes for known hardware. */ | |
5169 #define PTX_VECTOR_LENGTH 32 | |
5170 #define PTX_WORKER_LENGTH 32 | |
5171 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */ | |
5172 | 5468 |
5173 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ | 5469 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ |
5174 | 5470 |
5175 static int | 5471 static int |
5176 nvptx_simt_vf () | 5472 nvptx_simt_vf () |
5177 { | 5473 { |
5178 return PTX_VECTOR_LENGTH; | 5474 return PTX_WARP_SIZE; |
5475 } | |
5476 | |
5477 /* Return 1 if TRAIT NAME is present in the OpenMP context's | |
5478 device trait set, return 0 if not present in any OpenMP context in the | |
5479 whole translation unit, or -1 if not present in the current OpenMP context | |
5480 but might be present in another OpenMP context in the same TU. */ | |
5481 | |
5482 int | |
5483 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, | |
5484 const char *name) | |
5485 { | |
5486 switch (trait) | |
5487 { | |
5488 case omp_device_kind: | |
5489 return strcmp (name, "gpu") == 0; | |
5490 case omp_device_arch: | |
5491 return strcmp (name, "nvptx") == 0; | |
5492 case omp_device_isa: | |
5493 if (strcmp (name, "sm_30") == 0) | |
5494 return !TARGET_SM35; | |
5495 if (strcmp (name, "sm_35") == 0) | |
5496 return TARGET_SM35; | |
5497 return 0; | |
5498 default: | |
5499 gcc_unreachable (); | |
5500 } | |
5501 } | |
5502 | |
5503 static bool | |
5504 nvptx_welformed_vector_length_p (int l) | |
5505 { | |
5506 gcc_assert (l > 0); | |
5507 return l % PTX_WARP_SIZE == 0; | |
5508 } | |
5509 | |
5510 static void | |
5511 nvptx_apply_dim_limits (int dims[]) | |
5512 { | |
5513 /* Check that the vector_length is not too large. */ | |
5514 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH) | |
5515 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH; | |
5516 | |
5517 /* Check that the number of workers is not too large. */ | |
5518 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH) | |
5519 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; | |
5520 | |
5521 /* Ensure that num_worker * vector_length <= cta size. */ | |
5522 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 | |
5523 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE) | |
5524 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; | |
5525 | |
5526 /* If we need a per-worker barrier ... . */ | |
5527 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 | |
5528 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE) | |
5529 /* Don't use more barriers than available. */ | |
5530 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER], | |
5531 PTX_NUM_PER_WORKER_BARRIERS); | |
5532 } | |
5533 | |
5534 /* Return true if FNDECL contains calls to vector-partitionable routines. */ | |
5535 | |
5536 static bool | |
5537 has_vector_partitionable_routine_calls_p (tree fndecl) | |
5538 { | |
5539 if (!fndecl) | |
5540 return false; | |
5541 | |
5542 basic_block bb; | |
5543 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl)) | |
5544 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i); | |
5545 gsi_next_nondebug (&i)) | |
5546 { | |
5547 gimple *stmt = gsi_stmt (i); | |
5548 if (gimple_code (stmt) != GIMPLE_CALL) | |
5549 continue; | |
5550 | |
5551 tree callee = gimple_call_fndecl (stmt); | |
5552 if (!callee) | |
5553 continue; | |
5554 | |
5555 tree attrs = oacc_get_fn_attrib (callee); | |
5556 if (attrs == NULL_TREE) | |
5557 return false; | |
5558 | |
5559 int partition_level = oacc_fn_attrib_level (attrs); | |
5560 bool seq_routine_p = partition_level == GOMP_DIM_MAX; | |
5561 if (!seq_routine_p) | |
5562 return true; | |
5563 } | |
5564 | |
5565 return false; | |
5566 } | |
5567 | |
5568 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether | |
5569 DIMS has changed. */ | |
5570 | |
5571 static void | |
5572 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used) | |
5573 { | |
5574 bool oacc_default_dims_p = false; | |
5575 bool oacc_min_dims_p = false; | |
5576 bool offload_region_p = false; | |
5577 bool routine_p = false; | |
5578 bool routine_seq_p = false; | |
5579 int default_vector_length = -1; | |
5580 | |
5581 if (decl == NULL_TREE) | |
5582 { | |
5583 if (fn_level == -1) | |
5584 oacc_default_dims_p = true; | |
5585 else if (fn_level == -2) | |
5586 oacc_min_dims_p = true; | |
5587 else | |
5588 gcc_unreachable (); | |
5589 } | |
5590 else if (fn_level == -1) | |
5591 offload_region_p = true; | |
5592 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX) | |
5593 { | |
5594 routine_p = true; | |
5595 routine_seq_p = fn_level == GOMP_DIM_MAX; | |
5596 } | |
5597 else | |
5598 gcc_unreachable (); | |
5599 | |
5600 if (oacc_min_dims_p) | |
5601 { | |
5602 gcc_assert (dims[GOMP_DIM_VECTOR] == 1); | |
5603 gcc_assert (dims[GOMP_DIM_WORKER] == 1); | |
5604 gcc_assert (dims[GOMP_DIM_GANG] == 1); | |
5605 | |
5606 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; | |
5607 return; | |
5608 } | |
5609 | |
5610 if (routine_p) | |
5611 { | |
5612 if (!routine_seq_p) | |
5613 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; | |
5614 | |
5615 return; | |
5616 } | |
5617 | |
5618 if (oacc_default_dims_p) | |
5619 { | |
5620 /* -1 : not set | |
5621 0 : set at runtime, f.i. -fopenacc-dims=- | |
5622 >= 1: set at compile time, f.i. -fopenacc-dims=1. */ | |
5623 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1); | |
5624 gcc_assert (dims[GOMP_DIM_WORKER] >= -1); | |
5625 gcc_assert (dims[GOMP_DIM_GANG] >= -1); | |
5626 | |
5627 /* But -fopenacc-dims=- is not yet supported on trunk. */ | |
5628 gcc_assert (dims[GOMP_DIM_VECTOR] != 0); | |
5629 gcc_assert (dims[GOMP_DIM_WORKER] != 0); | |
5630 gcc_assert (dims[GOMP_DIM_GANG] != 0); | |
5631 } | |
5632 | |
5633 if (offload_region_p) | |
5634 { | |
5635 /* -1 : not set | |
5636 0 : set using variable, f.i. num_gangs (n) | |
5637 >= 1: set using constant, f.i. num_gangs (1). */ | |
5638 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1); | |
5639 gcc_assert (dims[GOMP_DIM_WORKER] >= -1); | |
5640 gcc_assert (dims[GOMP_DIM_GANG] >= -1); | |
5641 } | |
5642 | |
5643 if (offload_region_p) | |
5644 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR); | |
5645 else | |
5646 /* oacc_default_dims_p. */ | |
5647 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH; | |
5648 | |
5649 int old_dims[GOMP_DIM_MAX]; | |
5650 unsigned int i; | |
5651 for (i = 0; i < GOMP_DIM_MAX; ++i) | |
5652 old_dims[i] = dims[i]; | |
5653 | |
5654 const char *vector_reason = NULL; | |
5655 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl)) | |
5656 { | |
5657 default_vector_length = PTX_WARP_SIZE; | |
5658 | |
5659 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE) | |
5660 { | |
5661 vector_reason = G_("using vector_length (%d) due to call to" | |
5662 " vector-partitionable routine, ignoring %d"); | |
5663 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; | |
5664 } | |
5665 } | |
5666 | |
5667 if (dims[GOMP_DIM_VECTOR] == 0) | |
5668 { | |
5669 vector_reason = G_("using vector_length (%d), ignoring runtime setting"); | |
5670 dims[GOMP_DIM_VECTOR] = default_vector_length; | |
5671 } | |
5672 | |
5673 if (dims[GOMP_DIM_VECTOR] > 0 | |
5674 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR])) | |
5675 dims[GOMP_DIM_VECTOR] = default_vector_length; | |
5676 | |
5677 nvptx_apply_dim_limits (dims); | |
5678 | |
5679 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR]) | |
5680 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, | |
5681 vector_reason != NULL | |
5682 ? vector_reason | |
5683 : G_("using vector_length (%d), ignoring %d"), | |
5684 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]); | |
5685 | |
5686 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER]) | |
5687 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, | |
5688 G_("using num_workers (%d), ignoring %d"), | |
5689 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]); | |
5690 | |
5691 if (oacc_default_dims_p) | |
5692 { | |
5693 if (dims[GOMP_DIM_VECTOR] < 0) | |
5694 dims[GOMP_DIM_VECTOR] = default_vector_length; | |
5695 if (dims[GOMP_DIM_WORKER] < 0) | |
5696 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; | |
5697 if (dims[GOMP_DIM_GANG] < 0) | |
5698 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; | |
5699 nvptx_apply_dim_limits (dims); | |
5700 } | |
5701 | |
5702 if (offload_region_p) | |
5703 { | |
5704 for (i = 0; i < GOMP_DIM_MAX; i++) | |
5705 { | |
5706 if (!(dims[i] < 0)) | |
5707 continue; | |
5708 | |
5709 if ((used & GOMP_DIM_MASK (i)) == 0) | |
5710 /* Function oacc_validate_dims will apply the minimal dimension. */ | |
5711 continue; | |
5712 | |
5713 dims[i] = (i == GOMP_DIM_VECTOR | |
5714 ? default_vector_length | |
5715 : oacc_get_default_dim (i)); | |
5716 } | |
5717 | |
5718 nvptx_apply_dim_limits (dims); | |
5719 } | |
5179 } | 5720 } |
5180 | 5721 |
5181 /* Validate compute dimensions of an OpenACC offload or routine, fill | 5722 /* Validate compute dimensions of an OpenACC offload or routine, fill |
5182 in non-unity defaults. FN_LEVEL indicates the level at which a | 5723 in non-unity defaults. FN_LEVEL indicates the level at which a |
5183 routine might spawn a loop. It is negative for non-routines. If | 5724 routine might spawn a loop. It is negative for non-routines. If |
5184 DECL is null, we are validating the default dimensions. */ | 5725 DECL is null, we are validating the default dimensions. */ |
5185 | 5726 |
5186 static bool | 5727 static bool |
5187 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) | 5728 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used) |
5188 { | 5729 { |
5189 bool changed = false; | 5730 int old_dims[GOMP_DIM_MAX]; |
5190 | 5731 unsigned int i; |
5191 /* The vector size must be 32, unless this is a SEQ routine. */ | 5732 |
5192 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1 | 5733 for (i = 0; i < GOMP_DIM_MAX; ++i) |
5193 && dims[GOMP_DIM_VECTOR] >= 0 | 5734 old_dims[i] = dims[i]; |
5194 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH) | 5735 |
5195 { | 5736 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used); |
5196 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0) | 5737 |
5197 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, | 5738 gcc_assert (dims[GOMP_DIM_VECTOR] != 0); |
5198 dims[GOMP_DIM_VECTOR] | 5739 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0) |
5199 ? G_("using vector_length (%d), ignoring %d") | 5740 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE); |
5200 : G_("using vector_length (%d), ignoring runtime setting"), | 5741 |
5201 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]); | 5742 for (i = 0; i < GOMP_DIM_MAX; ++i) |
5202 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; | 5743 if (old_dims[i] != dims[i]) |
5203 changed = true; | 5744 return true; |
5204 } | 5745 |
5205 | 5746 return false; |
5206 /* Check the num workers is not too large. */ | |
5207 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH) | |
5208 { | |
5209 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, | |
5210 "using num_workers (%d), ignoring %d", | |
5211 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]); | |
5212 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; | |
5213 changed = true; | |
5214 } | |
5215 | |
5216 if (!decl) | |
5217 { | |
5218 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; | |
5219 if (dims[GOMP_DIM_WORKER] < 0) | |
5220 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; | |
5221 if (dims[GOMP_DIM_GANG] < 0) | |
5222 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; | |
5223 changed = true; | |
5224 } | |
5225 | |
5226 return changed; | |
5227 } | 5747 } |
5228 | 5748 |
5229 /* Return maximum dimension size, or zero for unbounded. */ | 5749 /* Return maximum dimension size, or zero for unbounded. */ |
5230 | 5750 |
5231 static int | 5751 static int |
5232 nvptx_dim_limit (int axis) | 5752 nvptx_dim_limit (int axis) |
5233 { | 5753 { |
5234 switch (axis) | 5754 switch (axis) |
5235 { | 5755 { |
5236 case GOMP_DIM_VECTOR: | 5756 case GOMP_DIM_VECTOR: |
5237 return PTX_VECTOR_LENGTH; | 5757 return PTX_MAX_VECTOR_LENGTH; |
5238 | 5758 |
5239 default: | 5759 default: |
5240 break; | 5760 break; |
5241 } | 5761 } |
5242 return 0; | 5762 return 0; |
5265 /* Generate a PTX builtin function call that returns the address in | 5785 /* Generate a PTX builtin function call that returns the address in |
5266 the worker reduction buffer at OFFSET. TYPE is the type of the | 5786 the worker reduction buffer at OFFSET. TYPE is the type of the |
5267 data at that location. */ | 5787 data at that location. */ |
5268 | 5788 |
5269 static tree | 5789 static tree |
5270 nvptx_get_worker_red_addr (tree type, tree offset) | 5790 nvptx_get_shared_red_addr (tree type, tree offset, bool vector) |
5271 { | 5791 { |
5792 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR; | |
5793 if (vector) | |
5794 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR; | |
5272 machine_mode mode = TYPE_MODE (type); | 5795 machine_mode mode = TYPE_MODE (type); |
5273 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true); | 5796 tree fndecl = nvptx_builtin_decl (addr_dim, true); |
5274 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode)); | 5797 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode)); |
5275 tree align = build_int_cst (unsigned_type_node, | 5798 tree align = build_int_cst (unsigned_type_node, |
5276 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT); | 5799 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT); |
5277 tree call = build_call_expr (fndecl, 3, offset, size, align); | 5800 tree call = build_call_expr (fndecl, 3, offset, size, align); |
5278 | 5801 |
5584 } | 6107 } |
5585 | 6108 |
5586 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ | 6109 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ |
5587 | 6110 |
5588 static void | 6111 static void |
5589 nvptx_goacc_reduction_setup (gcall *call) | 6112 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) |
5590 { | 6113 { |
5591 gimple_stmt_iterator gsi = gsi_for_stmt (call); | 6114 gimple_stmt_iterator gsi = gsi_for_stmt (call); |
5592 tree lhs = gimple_call_lhs (call); | 6115 tree lhs = gimple_call_lhs (call); |
5593 tree var = gimple_call_arg (call, 2); | 6116 tree var = gimple_call_arg (call, 2); |
5594 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); | 6117 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); |
5603 | 6126 |
5604 if (!integer_zerop (ref_to_res)) | 6127 if (!integer_zerop (ref_to_res)) |
5605 var = build_simple_mem_ref (ref_to_res); | 6128 var = build_simple_mem_ref (ref_to_res); |
5606 } | 6129 } |
5607 | 6130 |
5608 if (level == GOMP_DIM_WORKER) | 6131 if (level == GOMP_DIM_WORKER |
6132 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE)) | |
5609 { | 6133 { |
5610 /* Store incoming value to worker reduction buffer. */ | 6134 /* Store incoming value to worker reduction buffer. */ |
5611 tree offset = gimple_call_arg (call, 5); | 6135 tree offset = gimple_call_arg (call, 5); |
5612 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset); | 6136 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, |
6137 level == GOMP_DIM_VECTOR); | |
5613 tree ptr = make_ssa_name (TREE_TYPE (call)); | 6138 tree ptr = make_ssa_name (TREE_TYPE (call)); |
5614 | 6139 |
5615 gimplify_assign (ptr, call, &seq); | 6140 gimplify_assign (ptr, call, &seq); |
5616 tree ref = build_simple_mem_ref (ptr); | 6141 tree ref = build_simple_mem_ref (ptr); |
5617 TREE_THIS_VOLATILE (ref) = 1; | 6142 TREE_THIS_VOLATILE (ref) = 1; |
5626 } | 6151 } |
5627 | 6152 |
5628 /* NVPTX implementation of GOACC_REDUCTION_INIT. */ | 6153 /* NVPTX implementation of GOACC_REDUCTION_INIT. */ |
5629 | 6154 |
5630 static void | 6155 static void |
5631 nvptx_goacc_reduction_init (gcall *call) | 6156 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa) |
5632 { | 6157 { |
5633 gimple_stmt_iterator gsi = gsi_for_stmt (call); | 6158 gimple_stmt_iterator gsi = gsi_for_stmt (call); |
5634 tree lhs = gimple_call_lhs (call); | 6159 tree lhs = gimple_call_lhs (call); |
5635 tree var = gimple_call_arg (call, 2); | 6160 tree var = gimple_call_arg (call, 2); |
5636 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); | 6161 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); |
5640 TREE_TYPE (var)); | 6165 TREE_TYPE (var)); |
5641 gimple_seq seq = NULL; | 6166 gimple_seq seq = NULL; |
5642 | 6167 |
5643 push_gimplify_context (true); | 6168 push_gimplify_context (true); |
5644 | 6169 |
5645 if (level == GOMP_DIM_VECTOR) | 6170 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE) |
5646 { | 6171 { |
5647 /* Initialize vector-non-zeroes to INIT_VAL (OP). */ | 6172 /* Initialize vector-non-zeroes to INIT_VAL (OP). */ |
5648 tree tid = make_ssa_name (integer_type_node); | 6173 tree tid = make_ssa_name (integer_type_node); |
5649 tree dim_vector = gimple_call_arg (call, 3); | 6174 tree dim_vector = gimple_call_arg (call, 3); |
5650 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, | 6175 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, |
5700 tree ref_to_res = gimple_call_arg (call, 1); | 6225 tree ref_to_res = gimple_call_arg (call, 1); |
5701 if (integer_zerop (ref_to_res)) | 6226 if (integer_zerop (ref_to_res)) |
5702 init = var; | 6227 init = var; |
5703 } | 6228 } |
5704 | 6229 |
5705 gimplify_assign (lhs, init, &seq); | 6230 if (lhs != NULL_TREE) |
6231 gimplify_assign (lhs, init, &seq); | |
5706 } | 6232 } |
5707 | 6233 |
5708 pop_gimplify_context (NULL); | 6234 pop_gimplify_context (NULL); |
5709 gsi_replace_with_seq (&gsi, seq, true); | 6235 gsi_replace_with_seq (&gsi, seq, true); |
5710 } | 6236 } |
5711 | 6237 |
5712 /* NVPTX implementation of GOACC_REDUCTION_FINI. */ | 6238 /* NVPTX implementation of GOACC_REDUCTION_FINI. */ |
5713 | 6239 |
5714 static void | 6240 static void |
5715 nvptx_goacc_reduction_fini (gcall *call) | 6241 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) |
5716 { | 6242 { |
5717 gimple_stmt_iterator gsi = gsi_for_stmt (call); | 6243 gimple_stmt_iterator gsi = gsi_for_stmt (call); |
5718 tree lhs = gimple_call_lhs (call); | 6244 tree lhs = gimple_call_lhs (call); |
5719 tree ref_to_res = gimple_call_arg (call, 1); | 6245 tree ref_to_res = gimple_call_arg (call, 1); |
5720 tree var = gimple_call_arg (call, 2); | 6246 tree var = gimple_call_arg (call, 2); |
5724 gimple_seq seq = NULL; | 6250 gimple_seq seq = NULL; |
5725 tree r = NULL_TREE;; | 6251 tree r = NULL_TREE;; |
5726 | 6252 |
5727 push_gimplify_context (true); | 6253 push_gimplify_context (true); |
5728 | 6254 |
5729 if (level == GOMP_DIM_VECTOR) | 6255 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE) |
5730 { | 6256 { |
5731 /* Emit binary shuffle tree. TODO. Emit this as an actual loop, | 6257 /* Emit binary shuffle tree. TODO. Emit this as an actual loop, |
5732 but that requires a method of emitting a unified jump at the | 6258 but that requires a method of emitting a unified jump at the |
5733 gimple level. */ | 6259 gimple level. */ |
5734 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1) | 6260 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1) |
5735 { | 6261 { |
5736 tree other_var = make_ssa_name (TREE_TYPE (var)); | 6262 tree other_var = make_ssa_name (TREE_TYPE (var)); |
5737 nvptx_generate_vector_shuffle (gimple_location (call), | 6263 nvptx_generate_vector_shuffle (gimple_location (call), |
5738 other_var, var, shfl, &seq); | 6264 other_var, var, shfl, &seq); |
5739 | 6265 |
5745 } | 6271 } |
5746 else | 6272 else |
5747 { | 6273 { |
5748 tree accum = NULL_TREE; | 6274 tree accum = NULL_TREE; |
5749 | 6275 |
5750 if (level == GOMP_DIM_WORKER) | 6276 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR) |
5751 { | 6277 { |
5752 /* Get reduction buffer address. */ | 6278 /* Get reduction buffer address. */ |
5753 tree offset = gimple_call_arg (call, 5); | 6279 tree offset = gimple_call_arg (call, 5); |
5754 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset); | 6280 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, |
6281 level == GOMP_DIM_VECTOR); | |
5755 tree ptr = make_ssa_name (TREE_TYPE (call)); | 6282 tree ptr = make_ssa_name (TREE_TYPE (call)); |
5756 | 6283 |
5757 gimplify_assign (ptr, call, &seq); | 6284 gimplify_assign (ptr, call, &seq); |
5758 accum = ptr; | 6285 accum = ptr; |
5759 } | 6286 } |
5780 } | 6307 } |
5781 | 6308 |
5782 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */ | 6309 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */ |
5783 | 6310 |
5784 static void | 6311 static void |
5785 nvptx_goacc_reduction_teardown (gcall *call) | 6312 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa) |
5786 { | 6313 { |
5787 gimple_stmt_iterator gsi = gsi_for_stmt (call); | 6314 gimple_stmt_iterator gsi = gsi_for_stmt (call); |
5788 tree lhs = gimple_call_lhs (call); | 6315 tree lhs = gimple_call_lhs (call); |
5789 tree var = gimple_call_arg (call, 2); | 6316 tree var = gimple_call_arg (call, 2); |
5790 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); | 6317 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); |
5791 gimple_seq seq = NULL; | 6318 gimple_seq seq = NULL; |
5792 | 6319 |
5793 push_gimplify_context (true); | 6320 push_gimplify_context (true); |
5794 if (level == GOMP_DIM_WORKER) | 6321 if (level == GOMP_DIM_WORKER |
6322 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE)) | |
5795 { | 6323 { |
5796 /* Read the worker reduction buffer. */ | 6324 /* Read the worker reduction buffer. */ |
5797 tree offset = gimple_call_arg (call, 5); | 6325 tree offset = gimple_call_arg (call, 5); |
5798 tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset); | 6326 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, |
6327 level == GOMP_DIM_VECTOR); | |
5799 tree ptr = make_ssa_name (TREE_TYPE (call)); | 6328 tree ptr = make_ssa_name (TREE_TYPE (call)); |
5800 | 6329 |
5801 gimplify_assign (ptr, call, &seq); | 6330 gimplify_assign (ptr, call, &seq); |
5802 var = build_simple_mem_ref (ptr); | 6331 var = build_simple_mem_ref (ptr); |
5803 TREE_THIS_VOLATILE (var) = 1; | 6332 TREE_THIS_VOLATILE (var) = 1; |
5824 | 6353 |
5825 static void | 6354 static void |
5826 nvptx_goacc_reduction (gcall *call) | 6355 nvptx_goacc_reduction (gcall *call) |
5827 { | 6356 { |
5828 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0)); | 6357 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0)); |
6358 offload_attrs oa; | |
6359 | |
6360 populate_offload_attrs (&oa); | |
5829 | 6361 |
5830 switch (code) | 6362 switch (code) |
5831 { | 6363 { |
5832 case IFN_GOACC_REDUCTION_SETUP: | 6364 case IFN_GOACC_REDUCTION_SETUP: |
5833 nvptx_goacc_reduction_setup (call); | 6365 nvptx_goacc_reduction_setup (call, &oa); |
5834 break; | 6366 break; |
5835 | 6367 |
5836 case IFN_GOACC_REDUCTION_INIT: | 6368 case IFN_GOACC_REDUCTION_INIT: |
5837 nvptx_goacc_reduction_init (call); | 6369 nvptx_goacc_reduction_init (call, &oa); |
5838 break; | 6370 break; |
5839 | 6371 |
5840 case IFN_GOACC_REDUCTION_FINI: | 6372 case IFN_GOACC_REDUCTION_FINI: |
5841 nvptx_goacc_reduction_fini (call); | 6373 nvptx_goacc_reduction_fini (call, &oa); |
5842 break; | 6374 break; |
5843 | 6375 |
5844 case IFN_GOACC_REDUCTION_TEARDOWN: | 6376 case IFN_GOACC_REDUCTION_TEARDOWN: |
5845 nvptx_goacc_reduction_teardown (call); | 6377 nvptx_goacc_reduction_teardown (call, &oa); |
5846 break; | 6378 break; |
5847 | 6379 |
5848 default: | 6380 default: |
5849 gcc_unreachable (); | 6381 gcc_unreachable (); |
5850 } | 6382 } |
5914 | 6446 |
5915 static bool | 6447 static bool |
5916 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t) | 6448 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t) |
5917 { | 6449 { |
5918 return false; | 6450 return false; |
6451 } | |
6452 | |
6453 static GTY(()) tree nvptx_previous_fndecl; | |
6454 | |
6455 static void | |
6456 nvptx_set_current_function (tree fndecl) | |
6457 { | |
6458 if (!fndecl || fndecl == nvptx_previous_fndecl) | |
6459 return; | |
6460 | |
6461 nvptx_previous_fndecl = fndecl; | |
6462 vector_red_partition = 0; | |
6463 oacc_bcast_partition = 0; | |
5919 } | 6464 } |
5920 | 6465 |
5921 #undef TARGET_OPTION_OVERRIDE | 6466 #undef TARGET_OPTION_OVERRIDE |
5922 #define TARGET_OPTION_OVERRIDE nvptx_option_override | 6467 #define TARGET_OPTION_OVERRIDE nvptx_option_override |
5923 | 6468 |
6018 #define TARGET_BUILTIN_DECL nvptx_builtin_decl | 6563 #define TARGET_BUILTIN_DECL nvptx_builtin_decl |
6019 | 6564 |
6020 #undef TARGET_SIMT_VF | 6565 #undef TARGET_SIMT_VF |
6021 #define TARGET_SIMT_VF nvptx_simt_vf | 6566 #define TARGET_SIMT_VF nvptx_simt_vf |
6022 | 6567 |
6568 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA | |
6569 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa | |
6570 | |
6023 #undef TARGET_GOACC_VALIDATE_DIMS | 6571 #undef TARGET_GOACC_VALIDATE_DIMS |
6024 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims | 6572 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims |
6025 | 6573 |
6026 #undef TARGET_GOACC_DIM_LIMIT | 6574 #undef TARGET_GOACC_DIM_LIMIT |
6027 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit | 6575 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit |
6052 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class | 6600 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class |
6053 | 6601 |
6054 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE | 6602 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE |
6055 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed | 6603 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed |
6056 | 6604 |
6605 #undef TARGET_SET_CURRENT_FUNCTION | |
6606 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function | |
6607 | |
6057 struct gcc_target targetm = TARGET_INITIALIZER; | 6608 struct gcc_target targetm = TARGET_INITIALIZER; |
6058 | 6609 |
6059 #include "gt-nvptx.h" | 6610 #include "gt-nvptx.h" |