comparison gcc/config/nvptx/nvptx.c @ 145:1830386684a0

gcc-9.2.0
author anatofuz
date Thu, 13 Feb 2020 11:34:05 +0900
parents 84e7813d76e9
children
comparing 131:84e7813d76e9 with 145:1830386684a0
1 /* Target code for NVPTX. 1 /* Target code for NVPTX.
2 Copyright (C) 2014-2018 Free Software Foundation, Inc. 2 Copyright (C) 2014-2020 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com> 3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4 4
5 This file is part of GCC. 5 This file is part of GCC.
6 6
7 GCC is free software; you can redistribute it and/or modify it 7 GCC is free software; you can redistribute it and/or modify it
57 #include "gimple.h" 57 #include "gimple.h"
58 #include "stor-layout.h" 58 #include "stor-layout.h"
59 #include "builtins.h" 59 #include "builtins.h"
60 #include "omp-general.h" 60 #include "omp-general.h"
61 #include "omp-low.h" 61 #include "omp-low.h"
62 #include "omp-offload.h"
62 #include "gomp-constants.h" 63 #include "gomp-constants.h"
63 #include "dumpfile.h" 64 #include "dumpfile.h"
64 #include "internal-fn.h" 65 #include "internal-fn.h"
65 #include "gimple-iterator.h" 66 #include "gimple-iterator.h"
66 #include "stringpool.h" 67 #include "stringpool.h"
79 80
80 #define WORKAROUND_PTXJIT_BUG 1 81 #define WORKAROUND_PTXJIT_BUG 1
81 #define WORKAROUND_PTXJIT_BUG_2 1 82 #define WORKAROUND_PTXJIT_BUG_2 1
82 #define WORKAROUND_PTXJIT_BUG_3 1 83 #define WORKAROUND_PTXJIT_BUG_3 1
83 84
85 /* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
86 block, which has had a maximum number of threads of 1024 since CUDA version
87 2.x. */
88 #define PTX_CTA_SIZE 1024
89
90 #define PTX_CTA_NUM_BARRIERS 16
91 #define PTX_WARP_SIZE 32
92
93 #define PTX_PER_CTA_BARRIER 0
94 #define PTX_NUM_PER_CTA_BARRIERS 1
95 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
97
98 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
99 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
100 #define PTX_WORKER_LENGTH 32
101 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
102
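
Taken together, these macros pin down the geometry the rest of the file reasons about: a warp is 32 threads, a CTA holds at most 1024, and one of the 16 hardware barriers is reserved for CTA-wide synchronization. A small standalone sketch (editorial, not part of the patch) of the derived limits:

    #include <assert.h>

    int
    main (void)
    {
      int cta_size = 1024;      /* PTX_CTA_SIZE */
      int warp_size = 32;       /* PTX_WARP_SIZE, the default vector length */
      int num_barriers = 16;    /* PTX_CTA_NUM_BARRIERS */

      int max_workers = cta_size / warp_size;        /* 32 warps per CTA */
      int per_worker_barriers = num_barriers - 1;    /* 15, one kept per-CTA */

      assert (max_workers == 32);
      assert (per_worker_barriers == 15);
      return 0;
    }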
84 /* The various PTX memory areas an object might reside in. */ 103 /* The various PTX memory areas an object might reside in. */
85 enum nvptx_data_area 104 enum nvptx_data_area
86 { 105 {
87 DATA_AREA_GENERIC, 106 DATA_AREA_GENERIC,
88 DATA_AREA_GLOBAL, 107 DATA_AREA_GLOBAL,
120 }; 139 };
121 140
122 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab; 141 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
123 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab; 142 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
124 143
125 /* Buffer needed to broadcast across workers. This is used for both 144 /* Buffer needed to broadcast across workers and vectors. This is
126 worker-neutering and worker broadcasting. It is shared by all 145 used for both worker-neutering and worker broadcasting, and
127 functions emitted. The buffer is placed in shared memory. It'd be 146 vector-neutering and broadcasting when vector_length > 32. It is
128 nice if PTX supported common blocks, because then this could be 147 shared by all functions emitted. The buffer is placed in shared
129 shared across TUs (taking the largest size). */ 148 memory. It'd be nice if PTX supported common blocks, because then
130 static unsigned worker_bcast_size; 149 this could be shared across TUs (taking the largest size). */
131 static unsigned worker_bcast_align; 150 static unsigned oacc_bcast_size;
132 static GTY(()) rtx worker_bcast_sym; 151 static unsigned oacc_bcast_partition;
152 static unsigned oacc_bcast_align;
153 static GTY(()) rtx oacc_bcast_sym;
133 154
134 /* Buffer needed for worker reductions. This has to be distinct from 155 /* Buffer needed for worker reductions. This has to be distinct from
135 the worker broadcast array, as both may be live concurrently. */ 156 the worker broadcast array, as both may be live concurrently. */
136 static unsigned worker_red_size; 157 static unsigned worker_red_size;
137 static unsigned worker_red_align; 158 static unsigned worker_red_align;
138 static GTY(()) rtx worker_red_sym; 159 static GTY(()) rtx worker_red_sym;
139 160
161 /* Buffer needed for vector reductions, when vector_length >
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
163 array, as both may be live concurrently. */
164 static unsigned vector_red_size;
165 static unsigned vector_red_align;
166 static unsigned vector_red_partition;
167 static GTY(()) rtx vector_red_sym;
168
140 /* Global lock variable, needed for 128bit worker & gang reductions. */ 169 /* Global lock variable, needed for 128bit worker & gang reductions. */
141 static GTY(()) tree global_lock_var; 170 static GTY(()) tree global_lock_var;
142 171
143 /* True if any function references __nvptx_stacks. */ 172 /* True if any function references __nvptx_stacks. */
144 static bool need_softstack_decl; 173 static bool need_softstack_decl;
145 174
146 /* True if any function references __nvptx_uni. */ 175 /* True if any function references __nvptx_uni. */
147 static bool need_unisimt_decl; 176 static bool need_unisimt_decl;
177
178 static int nvptx_mach_max_workers ();
148 179
149 /* Allocate a new, cleared machine_function structure. */ 180 /* Allocate a new, cleared machine_function structure. */
150 181
151 static struct machine_function * 182 static struct machine_function *
152 nvptx_init_machine_status (void) 183 nvptx_init_machine_status (void)
161 192
162 static void 193 static void
163 diagnose_openacc_conflict (bool optval, const char *optname) 194 diagnose_openacc_conflict (bool optval, const char *optname)
164 { 195 {
165 if (flag_openacc && optval) 196 if (flag_openacc && optval)
166 error ("option %s is not supported together with -fopenacc", optname); 197 error ("option %s is not supported together with %<-fopenacc%>", optname);
167 } 198 }
168 199
169 /* Implement TARGET_OPTION_OVERRIDE. */ 200 /* Implement TARGET_OPTION_OVERRIDE. */
170 201
171 static void 202 static void
200 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); 231 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
201 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); 232 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
202 declared_libfuncs_htab 233 declared_libfuncs_htab
203 = hash_table<declared_libfunc_hasher>::create_ggc (17); 234 = hash_table<declared_libfunc_hasher>::create_ggc (17);
204 235
205 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast"); 236 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
206 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED); 237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
207 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; 238 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
239 oacc_bcast_partition = 0;
208 240
209 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); 241 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
210 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); 242 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
211 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; 243 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
244
245 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
246 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
247 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
248 vector_red_partition = 0;
212 249
213 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); 250 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
214 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); 251 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
215 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); 252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
216 253
481 } 518 }
482 519
483 /* Implement TARGET_FUNCTION_ARG. */ 520 /* Implement TARGET_FUNCTION_ARG. */
484 521
485 static rtx 522 static rtx
486 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode, 523 nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
487 const_tree, bool named) 524 {
488 { 525 if (arg.end_marker_p () || !arg.named)
489 if (mode == VOIDmode || !named)
490 return NULL_RTX; 526 return NULL_RTX;
491 527
492 return gen_reg_rtx (mode); 528 return gen_reg_rtx (arg.mode);
493 } 529 }
494 530
495 /* Implement TARGET_FUNCTION_INCOMING_ARG. */ 531 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
496 532
497 static rtx 533 static rtx
498 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode, 534 nvptx_function_incoming_arg (cumulative_args_t cum_v,
499 const_tree, bool named) 535 const function_arg_info &arg)
500 { 536 {
501 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 537 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
502 538
503 if (mode == VOIDmode || !named) 539 if (arg.end_marker_p () || !arg.named)
504 return NULL_RTX; 540 return NULL_RTX;
505 541
506 /* No need to deal with split modes here, the only case that can 542 /* No need to deal with split modes here, the only case that can
507 happen is complex modes and those are dealt with by 543 happen is complex modes and those are dealt with by
508 TARGET_SPLIT_COMPLEX_ARG. */ 544 TARGET_SPLIT_COMPLEX_ARG. */
509 return gen_rtx_UNSPEC (mode, 545 return gen_rtx_UNSPEC (arg.mode,
510 gen_rtvec (1, GEN_INT (cum->count)), 546 gen_rtvec (1, GEN_INT (cum->count)),
511 UNSPEC_ARG_REG); 547 UNSPEC_ARG_REG);
512 } 548 }
513 549
514 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */ 550 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
515 551
516 static void 552 static void
517 nvptx_function_arg_advance (cumulative_args_t cum_v, 553 nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
518 machine_mode ARG_UNUSED (mode),
519 const_tree ARG_UNUSED (type),
520 bool ARG_UNUSED (named))
521 { 554 {
522 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 555 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
523 556
524 cum->count++; 557 cum->count++;
525 } 558 }
594 627
595 /* Types with a mode other than those supported by the machine are passed by 628 /* Types with a mode other than those supported by the machine are passed by
596 reference in memory. */ 629 reference in memory. */
597 630
598 static bool 631 static bool
599 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum), 632 nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
600 machine_mode mode, const_tree type, 633 {
601 bool ARG_UNUSED (named)) 634 return pass_in_memory (arg.mode, arg.type, false);
602 {
603 return pass_in_memory (mode, type, false);
604 } 635 }
605 636
606 /* Implement TARGET_RETURN_IN_MEMORY. */ 637 /* Implement TARGET_RETURN_IN_MEMORY. */
607 638
608 static bool 639 static bool
1086 static void 1117 static void
1087 nvptx_init_axis_predicate (FILE *file, int regno, const char *name) 1118 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1088 { 1119 {
1089 fprintf (file, "\t{\n"); 1120 fprintf (file, "\t{\n");
1090 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name); 1121 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1122 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1123 {
1124 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1125 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1126 }
1091 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name); 1127 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1092 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name); 1128 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1129 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1130 {
1131 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1132 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1133 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1134 "// vector reduction buffer\n",
1135 REGNO (cfun->machine->red_partition),
1136 vector_red_partition);
1137 }
1138 /* Verify vector_red_size. */
1139 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1140 <= vector_red_size);
1141 fprintf (file, "\t}\n");
1142 }
1143
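
The fprintf calls above read more easily as the PTX they emit. A hedged reconstruction for the "x" axis when a vector reduction buffer is live (the register numbers %r22/%r23 and the 128-byte partition stride are invented for illustration):

    /* Approximate PTX emitted by nvptx_init_axis_predicate for "x"
       (editorial illustration; register numbers and stride assumed):

    	{
    		.reg.u32	%x;
    		.reg.u64	%t_red;
    		.reg.u64	%y64;
    		mov.u32	%x, %tid.x;
    		setp.ne.u32	%r22, %x, 0;
    		cvt.u64.u32	%y64, %tid.y;
    		cvta.shared.u64	%t_red, __vector_red;
    		mad.lo.u64	%r23, %y64, 128, %t_red; // vector reduction buffer
    	}
    */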
1144 /* Emit code to initialize OpenACC worker broadcast and synchronization
1145 registers. */
1146
1147 static void
1148 nvptx_init_oacc_workers (FILE *file)
1149 {
1150 fprintf (file, "\t{\n");
1151 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1152 if (cfun->machine->bcast_partition)
1153 {
1154 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1155 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1156 }
1157 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1158 if (cfun->machine->bcast_partition)
1159 {
1160 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1161 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1162 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1163 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1164 "// vector broadcast offset\n",
1165 REGNO (cfun->machine->bcast_partition),
1166 oacc_bcast_partition);
1167 }
1168 /* Verify oacc_bcast_size. */
1169 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1170 <= oacc_bcast_size);
1171 if (cfun->machine->sync_bar)
1172 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1173 "// vector synchronization barrier\n",
1174 REGNO (cfun->machine->sync_bar));
1093 fprintf (file, "\t}\n"); 1175 fprintf (file, "\t}\n");
1094 } 1176 }
1095 1177
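
The add.u64 of 1 to %y64 above is why the broadcast buffer needs max_workers + 1 partitions: vectors index slices starting at 1, leaving slice 0 for CTA-wide use. A small editorial model of that addressing, with assumed sizes:

    #include <assert.h>

    int
    main (void)
    {
      unsigned slice = 128;       /* assumed oacc_bcast_partition */
      unsigned max_workers = 8;   /* assumed worker count */
      unsigned size = slice * (max_workers + 1);   /* 9 slices in total */

      for (unsigned tidy = 0; tidy < max_workers; tidy++)
        {
          unsigned offset = (tidy + 1) * slice;    /* the mad.lo.u64 above */
          assert (offset + slice <= size);         /* stays inside buffer */
        }
      return 0;
    }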
1096 /* Emit code to initialize predicate and master lane index registers for 1178 /* Emit code to initialize predicate and master lane index registers for
1097 -muniform-simt code generation variant. */ 1179 -muniform-simt code generation variant. */
1290 simtsz += align - GET_MODE_SIZE (DImode); 1372 simtsz += align - GET_MODE_SIZE (DImode);
1291 if (simtsz) 1373 if (simtsz)
1292 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" 1374 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1293 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); 1375 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1294 } 1376 }
1377
1378 /* Restore the vector reduction partition register, if necessary.
1379 FIXME: Find out when and why this is necessary, and fix it. */
1380 if (cfun->machine->red_partition)
1381 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1382 = cfun->machine->red_partition;
1383
1295 /* Declare the pseudos we have as ptx registers. */ 1384 /* Declare the pseudos we have as ptx registers. */
1296 int maxregs = max_reg_num (); 1385 int maxregs = max_reg_num ();
1297 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) 1386 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1298 { 1387 {
1299 if (regno_reg_rtx[i] != const0_rtx) 1388 if (regno_reg_rtx[i] != const0_rtx)
1317 nvptx_init_axis_predicate (file, 1406 nvptx_init_axis_predicate (file,
1318 REGNO (cfun->machine->axis_predicate[1]), "x"); 1407 REGNO (cfun->machine->axis_predicate[1]), "x");
1319 if (cfun->machine->unisimt_predicate 1408 if (cfun->machine->unisimt_predicate
1320 || (cfun->machine->has_simtreg && !crtl->is_leaf)) 1409 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1321 nvptx_init_unisimt_predicate (file); 1410 nvptx_init_unisimt_predicate (file);
1411 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1412 nvptx_init_oacc_workers (file);
1322 } 1413 }
1323 1414
1324 /* Output code for switching uniform-simt state. ENTERING indicates whether 1415 /* Output code for switching uniform-simt state. ENTERING indicates whether
1325 we are entering or leaving non-uniform execution region. */ 1416 we are entering or leaving non-uniform execution region. */
1326 1417
1376 else 1467 else
1377 output_reg (file, REGNO (size), VOIDmode); 1468 output_reg (file, REGNO (size), VOIDmode);
1378 fputs (";\n", file); 1469 fputs (";\n", file);
1379 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) 1470 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1380 fprintf (file, 1471 fprintf (file,
1381 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n", 1472 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1382 bits, regno, regno, UINTVAL (align)); 1473 bits, regno, regno, UINTVAL (align));
1383 } 1474 }
1384 if (cfun->machine->has_softstack) 1475 if (cfun->machine->has_softstack)
1385 { 1476 {
1386 const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; 1477 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1740 1831
1741 /* Generate an instruction or sequence to broadcast register REG 1832 /* Generate an instruction or sequence to broadcast register REG
1742 across the vectors of a single warp. */ 1833 across the vectors of a single warp. */
1743 1834
1744 static rtx 1835 static rtx
1745 nvptx_gen_vcast (rtx reg) 1836 nvptx_gen_warp_bcast (rtx reg)
1746 { 1837 {
1747 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX); 1838 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1748 } 1839 }
1749 1840
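
SHUFFLE_IDX with index 0 makes every lane read lane 0's register, i.e. a warp-wide broadcast. A conceptual editorial model (not how the hardware does it, just the observable effect; the helper is hypothetical):

    #include <assert.h>

    /* Model of the lane-0 broadcast done by nvptx_gen_warp_bcast:
       after the shuffle, every lane of the 32-thread warp holds
       lane 0's value.  */
    static void
    warp_bcast_model (int val[32])
    {
      for (int lane = 1; lane < 32; lane++)
        val[lane] = val[0];
    }

    int
    main (void)
    {
      int v[32] = { 7 };          /* lane 0 holds 7, the rest 0 */
      warp_bcast_model (v);
      for (int lane = 0; lane < 32; lane++)
        assert (v[lane] == 7);
      return 0;
    }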
1750 /* Structure used when generating a worker-level spill or fill. */ 1841 /* Structure used when generating a worker-level spill or fill. */
1751 1842
1752 struct wcast_data_t 1843 struct broadcast_data_t
1753 { 1844 {
1754 rtx base; /* Register holding base addr of buffer. */ 1845 rtx base; /* Register holding base addr of buffer. */
1755 rtx ptr; /* Iteration var, if needed. */ 1846 rtx ptr; /* Iteration var, if needed. */
1756 unsigned offset; /* Offset into worker buffer. */ 1847 unsigned offset; /* Offset into worker buffer. */
1757 }; 1848 };
1771 /* Generate instruction(s) to spill or fill register REG to/from the 1862 /* Generate instruction(s) to spill or fill register REG to/from the
1772 worker broadcast array. PM indicates what is to be done, REP 1863 worker broadcast array. PM indicates what is to be done, REP
1773 how many loop iterations will be executed (0 for not a loop). */ 1864 how many loop iterations will be executed (0 for not a loop). */
1774 1865
1775 static rtx 1866 static rtx
1776 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data) 1867 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1868 broadcast_data_t *data, bool vector)
1777 { 1869 {
1778 rtx res; 1870 rtx res;
1779 machine_mode mode = GET_MODE (reg); 1871 machine_mode mode = GET_MODE (reg);
1780 1872
1781 switch (mode) 1873 switch (mode)
1785 rtx tmp = gen_reg_rtx (SImode); 1877 rtx tmp = gen_reg_rtx (SImode);
1786 1878
1787 start_sequence (); 1879 start_sequence ();
1788 if (pm & PM_read) 1880 if (pm & PM_read)
1789 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx)); 1881 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1790 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data)); 1882 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1791 if (pm & PM_write) 1883 if (pm & PM_write)
1792 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx))); 1884 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1793 res = get_insns (); 1885 res = get_insns ();
1794 end_sequence (); 1886 end_sequence ();
1795 } 1887 }
1801 1893
1802 if (!addr) 1894 if (!addr)
1803 { 1895 {
1804 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; 1896 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1805 1897
1806 if (align > worker_bcast_align) 1898 oacc_bcast_align = MAX (oacc_bcast_align, align);
1807 worker_bcast_align = align; 1899 data->offset = ROUND_UP (data->offset, align);
1808 data->offset = (data->offset + align - 1) & ~(align - 1);
1809 addr = data->base; 1900 addr = data->base;
1901 gcc_assert (data->base != NULL);
1810 if (data->offset) 1902 if (data->offset)
1811 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset)); 1903 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1812 } 1904 }
1813 1905
1814 addr = gen_rtx_MEM (mode, addr); 1906 addr = gen_rtx_MEM (mode, addr);
1924 2016
1925 for (unsigned part = 0; size; size -= part) 2017 for (unsigned part = 0; size; size -= part)
1926 { 2018 {
1927 val >>= part * BITS_PER_UNIT; 2019 val >>= part * BITS_PER_UNIT;
1928 part = init_frag.size - init_frag.offset; 2020 part = init_frag.size - init_frag.offset;
1929 if (part > size) 2021 part = MIN (part, size);
1930 part = size;
1931 2022
1932 unsigned HOST_WIDE_INT partial 2023 unsigned HOST_WIDE_INT partial
1933 = val << (init_frag.offset * BITS_PER_UNIT); 2024 = val << (init_frag.offset * BITS_PER_UNIT);
1934 init_frag.val |= partial & init_frag.mask; 2025 init_frag.val |= partial & init_frag.mask;
1935 init_frag.offset += part; 2026 init_frag.offset += part;
1988 { 2079 {
1989 /* Finish the current fragment, if it's started. */ 2080 /* Finish the current fragment, if it's started. */
1990 if (init_frag.offset) 2081 if (init_frag.offset)
1991 { 2082 {
1992 unsigned part = init_frag.size - init_frag.offset; 2083 unsigned part = init_frag.size - init_frag.offset;
1993 if (part > size) 2084 part = MIN (part, (unsigned)size);
1994 part = (unsigned) size;
1995 size -= part; 2085 size -= part;
1996 nvptx_assemble_value (0, part); 2086 nvptx_assemble_value (0, part);
1997 } 2087 }
1998 2088
1999 /* If this skip doesn't terminate the initializer, write as many 2089 /* If this skip doesn't terminate the initializer, write as many
2861 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); 2951 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2862 validate_change (insn, &PATTERN (insn), pat, false); 2952 validate_change (insn, &PATTERN (insn), pat, false);
2863 } 2953 }
2864 } 2954 }
2865 2955
2956 /* Offloading function attributes. */
2957
2958 struct offload_attrs
2959 {
2960 unsigned mask;
2961 int num_gangs;
2962 int num_workers;
2963 int vector_length;
2964 };
2965
2966 /* Define entries for cfun->machine->axis_dim. */
2967
2968 #define MACH_VECTOR_LENGTH 0
2969 #define MACH_MAX_WORKERS 1
2970
2971 static void populate_offload_attrs (offload_attrs *oa);
2972
2973 static void
2974 init_axis_dim (void)
2975 {
2976 offload_attrs oa;
2977 int max_workers;
2978
2979 populate_offload_attrs (&oa);
2980
2981 if (oa.num_workers == 0)
2982 max_workers = PTX_CTA_SIZE / oa.vector_length;
2983 else
2984 max_workers = oa.num_workers;
2985
2986 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
2987 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
2988 cfun->machine->axis_dim_init_p = true;
2989 }
2990
2991 static int ATTRIBUTE_UNUSED
2992 nvptx_mach_max_workers ()
2993 {
2994 if (!cfun->machine->axis_dim_init_p)
2995 init_axis_dim ();
2996 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
2997 }
2998
2999 static int ATTRIBUTE_UNUSED
3000 nvptx_mach_vector_length ()
3001 {
3002 if (!cfun->machine->axis_dim_init_p)
3003 init_axis_dim ();
3004 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3005 }
3006
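
So when the worker dimension is deferred to the runtime (size 0), the compile-time bound becomes however many vectors fit in one CTA. A worked editorial sketch with assumed values:

    #include <assert.h>

    int
    main (void)
    {
      int cta_size = 1024;     /* PTX_CTA_SIZE */
      int vector_length = 32;  /* assumed oa.vector_length */
      int num_workers = 0;     /* 0 means: defer to runtime */

      int max_workers = num_workers ? num_workers : cta_size / vector_length;
      assert (max_workers == 32);
      return 0;
    }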
2866 /* Loop structure of the function. The entire function is described as 3007 /* Loop structure of the function. The entire function is described as
2867 a NULL loop. */ 3008 a NULL loop. */
2868 3009
2869 struct parallel 3010 struct parallel
2870 { 3011 {
3008 block = e->dest; 3149 block = e->dest;
3009 map->get_or_insert (block) = elt->first; 3150 map->get_or_insert (block) = elt->first;
3010 } 3151 }
3011 } 3152 }
3012 3153
3154 /* Return true if MASK contains parallelism that requires shared
3155 memory to broadcast. */
3156
3157 static bool
3158 nvptx_needs_shared_bcast (unsigned mask)
3159 {
3160 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3161 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3162 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3163
3164 return worker || large_vector;
3165 }
3166
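
In other words: worker partitioning always goes through shared memory, and vector partitioning does too once a vector spans more than one warp. A tiny editorial model using stand-in mask values (1 << 1 and 1 << 2 stand in for the GOMP_DIM_MASK values of WORKER and VECTOR):

    #include <assert.h>

    #define DIM_WORKER (1u << 1)   /* stand-in for GOMP_DIM_MASK (GOMP_DIM_WORKER) */
    #define DIM_VECTOR (1u << 2)   /* stand-in for GOMP_DIM_MASK (GOMP_DIM_VECTOR) */

    static int
    needs_shared_bcast (unsigned mask, int vlen)
    {
      int worker = (mask & DIM_WORKER) != 0;
      int large_vector = (mask & DIM_VECTOR) != 0 && vlen != 32;
      return worker || large_vector;
    }

    int
    main (void)
    {
      assert (needs_shared_bcast (DIM_WORKER, 32));    /* workers: yes */
      assert (needs_shared_bcast (DIM_VECTOR, 64));    /* multi-warp vector: yes */
      assert (!needs_shared_bcast (DIM_VECTOR, 32));   /* single warp: no */
      return 0;
    }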
3013 /* BLOCK is a basic block containing a head or tail instruction. 3167 /* BLOCK is a basic block containing a head or tail instruction.
3014 Locate the associated prehead or pretail instruction, which must be 3168 Locate the associated prehead or pretail instruction, which must be
3015 in the single predecessor block. */ 3169 in the single predecessor block. */
3016 3170
3017 static rtx_insn * 3171 static rtx_insn *
3083 3237
3084 gcc_assert (mask); 3238 gcc_assert (mask);
3085 par = new parallel (par, mask); 3239 par = new parallel (par, mask);
3086 par->forked_block = block; 3240 par->forked_block = block;
3087 par->forked_insn = end; 3241 par->forked_insn = end;
3088 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) 3242 if (nvptx_needs_shared_bcast (mask))
3089 par->fork_insn 3243 par->fork_insn
3090 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); 3244 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3091 } 3245 }
3092 break; 3246 break;
3093 3247
3096 parent. */ 3250 parent. */
3097 { 3251 {
3098 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); 3252 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3099 3253
3100 gcc_assert (par->mask == mask); 3254 gcc_assert (par->mask == mask);
3255 gcc_assert (par->join_block == NULL);
3101 par->join_block = block; 3256 par->join_block = block;
3102 par->join_insn = end; 3257 par->join_insn = end;
3103 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) 3258 if (nvptx_needs_shared_bcast (mask))
3104 par->joining_insn 3259 par->joining_insn
3105 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); 3260 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3106 par = par->parent; 3261 par = par->parent;
3107 } 3262 }
3108 break; 3263 break;
3389 { 3544 {
3390 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds; 3545 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3391 size_t offset = (dir > 0 ? offsetof (edge_def, dest) 3546 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3392 : offsetof (edge_def, src)); 3547 : offsetof (edge_def, src));
3393 edge e; 3548 edge e;
3394 edge_iterator (ei); 3549 edge_iterator ei;
3395 3550
3396 FOR_EACH_EDGE (e, ei, edges) 3551 FOR_EACH_EDGE (e, ei, edges)
3397 { 3552 {
3398 basic_block target = *(basic_block *)((char *)e + offset); 3553 basic_block target = *(basic_block *)((char *)e + offset);
3399 3554
3412 static void 3567 static void
3413 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, 3568 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3414 vec<edge, va_gc> *edges, size_t offset) 3569 vec<edge, va_gc> *edges, size_t offset)
3415 { 3570 {
3416 edge e; 3571 edge e;
3417 edge_iterator (ei); 3572 edge_iterator ei;
3418 int hi_back = depth; 3573 int hi_back = depth;
3419 pseudo_node_t node_back (0, depth); 3574 pseudo_node_t node_back (0, depth);
3420 int hi_child = depth; 3575 int hi_child = depth;
3421 pseudo_node_t node_child (0, depth); 3576 pseudo_node_t node_child (0, depth);
3422 basic_block child = NULL; 3577 basic_block child = NULL;
3795 frame for calls and non-calls. We could do better by (a) 3950 frame for calls and non-calls. We could do better by (a)
3796 propagating just the live set that is used within the partitioned 3951 propagating just the live set that is used within the partitioned
3797 regions and (b) only propagating stack entries that are used. The 3952 regions and (b) only propagating stack entries that are used. The
3798 latter might be quite hard to determine. */ 3953 latter might be quite hard to determine. */
3799 3954
3800 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *); 3955 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
3801 3956
3802 static bool 3957 static bool
3803 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, 3958 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3804 propagate_mask rw, propagator_fn fn, void *data) 3959 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3805 { 3960 {
3806 bitmap live = DF_LIVE_IN (block); 3961 bitmap live = DF_LIVE_IN (block);
3807 bitmap_iterator iterator; 3962 bitmap_iterator iterator;
3808 unsigned ix; 3963 unsigned ix;
3809 bool empty = true; 3964 bool empty = true;
3834 pred = gen_reg_rtx (BImode); 3989 pred = gen_reg_rtx (BImode);
3835 label = gen_label_rtx (); 3990 label = gen_label_rtx ();
3836 3991
3837 emit_insn (gen_rtx_SET (idx, GEN_INT (fs))); 3992 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3838 /* Allow worker function to initialize anything needed. */ 3993 /* Allow worker function to initialize anything needed. */
3839 rtx init = fn (tmp, PM_loop_begin, fs, data); 3994 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
3840 if (init) 3995 if (init)
3841 emit_insn (init); 3996 emit_insn (init);
3842 emit_label (label); 3997 emit_label (label);
3843 LABEL_NUSES (label)++; 3998 LABEL_NUSES (label)++;
3844 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1))); 3999 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3845 } 4000 }
3846 if (rw & PM_read) 4001 if (rw & PM_read)
3847 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr))); 4002 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3848 emit_insn (fn (tmp, rw, fs, data)); 4003 emit_insn (fn (tmp, rw, fs, data, vector));
3849 if (rw & PM_write) 4004 if (rw & PM_write)
3850 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp)); 4005 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3851 if (fs) 4006 if (fs)
3852 { 4007 {
3853 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx))); 4008 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3854 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode)))); 4009 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3855 emit_insn (gen_br_true_uni (pred, label)); 4010 emit_insn (gen_br_true_uni (pred, label));
3856 rtx fini = fn (tmp, PM_loop_end, fs, data); 4011 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
3857 if (fini) 4012 if (fini)
3858 emit_insn (fini); 4013 emit_insn (fini);
3859 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx)); 4014 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3860 } 4015 }
3861 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp)); 4016 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3871 { 4026 {
3872 rtx reg = regno_reg_rtx[ix]; 4027 rtx reg = regno_reg_rtx[ix];
3873 4028
3874 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) 4029 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3875 { 4030 {
3876 rtx bcast = fn (reg, rw, 0, data); 4031 rtx bcast = fn (reg, rw, 0, data, vector);
3877 4032
3878 insn = emit_insn_after (bcast, insn); 4033 insn = emit_insn_after (bcast, insn);
3879 empty = false; 4034 empty = false;
3880 } 4035 }
3881 } 4036 }
3882 return empty; 4037 return empty;
3883 } 4038 }
3884 4039
3885 /* Worker for nvptx_vpropagate. */ 4040 /* Worker for nvptx_warp_propagate. */
3886 4041
3887 static rtx 4042 static rtx
3888 vprop_gen (rtx reg, propagate_mask pm, 4043 warp_prop_gen (rtx reg, propagate_mask pm,
3889 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data)) 4044 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4045 bool ARG_UNUSED (vector))
3890 { 4046 {
3891 if (!(pm & PM_read_write)) 4047 if (!(pm & PM_read_write))
3892 return 0; 4048 return 0;
3893 4049
3894 return nvptx_gen_vcast (reg); 4050 return nvptx_gen_warp_bcast (reg);
3895 } 4051 }
3896 4052
3897 /* Propagate state that is live at start of BLOCK across the vectors 4053 /* Propagate state that is live at start of BLOCK across the vectors
3898 of a single warp. Propagation is inserted just after INSN. 4054 of a single warp. Propagation is inserted just after INSN.
3899 IS_CALL and return as for nvptx_propagate. */ 4055 IS_CALL and return as for nvptx_propagate. */
3900 4056
3901 static bool 4057 static bool
3902 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn) 4058 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
3903 { 4059 {
3904 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0); 4060 return nvptx_propagate (is_call, block, insn, PM_read_write,
3905 } 4061 warp_prop_gen, 0, false);
3906 4062 }
3907 /* Worker for nvptx_wpropagate. */ 4063
4064 /* Worker for nvptx_shared_propagate. */
3908 4065
3909 static rtx 4066 static rtx
3910 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_) 4067 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
3911 { 4068 bool vector)
3912 wcast_data_t *data = (wcast_data_t *)data_; 4069 {
4070 broadcast_data_t *data = (broadcast_data_t *)data_;
3913 4071
3914 if (pm & PM_loop_begin) 4072 if (pm & PM_loop_begin)
3915 { 4073 {
3916 /* Starting a loop, initialize pointer. */ 4074 /* Starting a loop, initialize pointer. */
3917 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT; 4075 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3918 4076
3919 if (align > worker_bcast_align) 4077 oacc_bcast_align = MAX (oacc_bcast_align, align);
3920 worker_bcast_align = align; 4078 data->offset = ROUND_UP (data->offset, align);
3921 data->offset = (data->offset + align - 1) & ~(align - 1);
3922 4079
3923 data->ptr = gen_reg_rtx (Pmode); 4080 data->ptr = gen_reg_rtx (Pmode);
3924 4081
3925 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset)); 4082 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3926 } 4083 }
3929 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr); 4086 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3930 data->ptr = NULL_RTX; 4087 data->ptr = NULL_RTX;
3931 return clobber; 4088 return clobber;
3932 } 4089 }
3933 else 4090 else
3934 return nvptx_gen_wcast (reg, pm, rep, data); 4091 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
3935 } 4092 }
3936 4093
3937 /* Spill or fill live state that is live at start of BLOCK. PRE_P 4094 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3938 indicates if this is just before partitioned mode (do spill), or 4095 indicates if this is just before partitioned mode (do spill), or
3939 just after it starts (do fill). Sequence is inserted just after 4096 just after it starts (do fill). Sequence is inserted just after
3940 INSN. IS_CALL and return as for nvptx_propagate. */ 4097 INSN. IS_CALL and return as for nvptx_propagate. */
3941 4098
3942 static bool 4099 static bool
3943 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn) 4100 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
3944 { 4101 rtx_insn *insn, bool vector)
3945 wcast_data_t data; 4102 {
4103 broadcast_data_t data;
3946 4104
3947 data.base = gen_reg_rtx (Pmode); 4105 data.base = gen_reg_rtx (Pmode);
3948 data.offset = 0; 4106 data.offset = 0;
3949 data.ptr = NULL_RTX; 4107 data.ptr = NULL_RTX;
3950 4108
3951 bool empty = nvptx_propagate (is_call, block, insn, 4109 bool empty = nvptx_propagate (is_call, block, insn,
3952 pre_p ? PM_read : PM_write, wprop_gen, &data); 4110 pre_p ? PM_read : PM_write, shared_prop_gen,
4111 &data, vector);
3953 gcc_assert (empty == !data.offset); 4112 gcc_assert (empty == !data.offset);
3954 if (data.offset) 4113 if (data.offset)
3955 { 4114 {
4115 rtx bcast_sym = oacc_bcast_sym;
4116
3956 /* Stuff was emitted, initialize the base pointer now. */ 4117 /* Stuff was emitted, initialize the base pointer now. */
3957 rtx init = gen_rtx_SET (data.base, worker_bcast_sym); 4118 if (vector && nvptx_mach_max_workers () > 1)
4119 {
4120 if (!cfun->machine->bcast_partition)
4121 {
4122 /* It would be nice to place this register in
4123 DATA_AREA_SHARED. */
4124 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4125 }
4126 if (!cfun->machine->sync_bar)
4127 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4128
4129 bcast_sym = cfun->machine->bcast_partition;
4130 }
4131
4132 rtx init = gen_rtx_SET (data.base, bcast_sym);
3958 emit_insn_after (init, insn); 4133 emit_insn_after (init, insn);
3959 4134
3960 if (worker_bcast_size < data.offset) 4135 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
3961 worker_bcast_size = data.offset; 4136 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4137 ? nvptx_mach_max_workers () + 1
4138 : 1);
4139
4140 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4141 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
3962 } 4142 }
3963 return empty; 4143 return empty;
3964 } 4144 }
3965 4145
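
The sizing at the end of nvptx_shared_propagate rounds the spilled live state up to the buffer alignment and, for multi-warp vectors, provides max_workers + 1 partitions. A worked editorial sketch with assumed numbers:

    #include <assert.h>

    #define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    int
    main (void)
    {
      unsigned offset = 20, align = 8;   /* assumed live-state size/alignment */
      unsigned psize = ROUND_UP (offset, align);       /* 24 */

      unsigned vector_length = 64, warp_size = 32, max_workers = 8;
      unsigned pnum = vector_length > warp_size ? max_workers + 1 : 1;

      assert (psize == 24 && pnum == 9);               /* buffer: 24 * 9 bytes */
      return 0;
    }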
3966 /* Emit a worker-level synchronization barrier. We use different 4146 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
3967 markers for before and after synchronizations. */ 4147 which is an integer or a register. THREADS is the number of threads
4148 controlled by the barrier. */
3968 4149
3969 static rtx 4150 static rtx
3970 nvptx_wsync (bool after) 4151 nvptx_cta_sync (rtx lock, int threads)
3971 { 4152 {
3972 return gen_nvptx_barsync (GEN_INT (after)); 4153 return gen_nvptx_barsync (lock, GEN_INT (threads));
3973 } 4154 }
3974 4155
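
A hedged illustration of what the barsync pattern expands to (the exact output template lives in nvptx.md; register number and thread count are assumed):

    /* Editorial illustration: a CTA-wide barrier uses barrier 0 with
       no thread count, while a per-vector barrier names its own
       barrier register and thread count:

    	bar.sync	0;		// whole CTA
    	bar.sync	%r24, 64;	// barrier %r24, 64 threads
    */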
3975 #if WORKAROUND_PTXJIT_BUG 4156 #if WORKAROUND_PTXJIT_BUG
3976 /* Return first real insn in BB, or return NULL_RTX if BB does not contain 4157 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3977 real insns. */ 4158 real insns. */
4214 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) 4395 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4215 if (GOMP_DIM_MASK (mode) & skip_mask) 4396 if (GOMP_DIM_MASK (mode) & skip_mask)
4216 { 4397 {
4217 rtx_code_label *label = gen_label_rtx (); 4398 rtx_code_label *label = gen_label_rtx ();
4218 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; 4399 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4219 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump; 4400 rtx_insn **mode_jump
4220 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label; 4401 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4402 rtx_insn **mode_label
4403 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4221 4404
4222 if (!pred) 4405 if (!pred)
4223 { 4406 {
4224 pred = gen_reg_rtx (BImode); 4407 pred = gen_reg_rtx (BImode);
4225 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred; 4408 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4226 } 4409 }
4227 4410
4228 rtx br; 4411 rtx br;
4229 if (mode == GOMP_DIM_VECTOR) 4412 if (mode == GOMP_DIM_VECTOR)
4230 br = gen_br_true (pred, label); 4413 br = gen_br_true (pred, label);
4231 else 4414 else
4232 br = gen_br_true_uni (pred, label); 4415 br = gen_br_true_uni (pred, label);
4249 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) 4432 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4250 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) 4433 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4251 emit_insn_after (gen_exit (), label_insn); 4434 emit_insn_after (gen_exit (), label_insn);
4252 } 4435 }
4253 4436
4254 if (mode == GOMP_DIM_VECTOR) 4437 *mode_label = label_insn;
4255 vector_label = label_insn;
4256 else
4257 worker_label = label_insn;
4258 } 4438 }
4259 4439
4260 /* Now deal with propagating the branch condition. */ 4440 /* Now deal with propagating the branch condition. */
4261 if (cond_branch) 4441 if (cond_branch)
4262 { 4442 {
4263 rtx pvar = XEXP (XEXP (cond_branch, 0), 0); 4443 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4264 4444
4265 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) 4445 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4446 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4266 { 4447 {
4267 /* Vector mode only, do a shuffle. */ 4448 /* Vector mode only, do a shuffle. */
4268 #if WORKAROUND_PTXJIT_BUG 4449 #if WORKAROUND_PTXJIT_BUG
4269 /* The branch condition %rcond is propagated like this: 4450 /* The branch condition %rcond is propagated like this:
4270 4451
4320 emit_insn_before (gen_movbi (tmp, const0_rtx), 4501 emit_insn_before (gen_movbi (tmp, const0_rtx),
4321 bb_first_real_insn (from)); 4502 bb_first_real_insn (from));
4322 emit_insn_before (gen_rtx_SET (tmp, pvar), label); 4503 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4323 emit_insn_before (gen_rtx_SET (pvar, tmp), tail); 4504 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4324 #endif 4505 #endif
4325 emit_insn_before (nvptx_gen_vcast (pvar), tail); 4506 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4326 } 4507 }
4327 else 4508 else
4328 { 4509 {
4329 /* Includes worker mode, do spill & fill. By construction 4510 /* Includes worker mode, do spill & fill. By construction
4330 we should never have worker mode only. */ 4511 we should never have worker mode only. */
4331 wcast_data_t data; 4512 broadcast_data_t data;
4332 4513 unsigned size = GET_MODE_SIZE (SImode);
4333 data.base = worker_bcast_sym; 4514 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4515 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4516 rtx barrier = GEN_INT (0);
4517 int threads = 0;
4518
4519 data.base = oacc_bcast_sym;
4334 data.ptr = 0; 4520 data.ptr = 0;
4335 4521
4336 if (worker_bcast_size < GET_MODE_SIZE (SImode)) 4522 bool use_partitioning_p = (vector && !worker
4337 worker_bcast_size = GET_MODE_SIZE (SImode); 4523 && nvptx_mach_max_workers () > 1
4524 && cfun->machine->bcast_partition);
4525 if (use_partitioning_p)
4526 {
4527 data.base = cfun->machine->bcast_partition;
4528 barrier = cfun->machine->sync_bar;
4529 threads = nvptx_mach_vector_length ();
4530 }
4531 gcc_assert (data.base != NULL);
4532 gcc_assert (barrier);
4533
4534 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4535 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4536 ? nvptx_mach_max_workers () + 1
4537 : 1);
4538
4539 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4540 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4338 4541
4339 data.offset = 0; 4542 data.offset = 0;
4340 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data), 4543 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4544 vector),
4341 before); 4545 before);
4546
4342 /* Barrier so other workers can see the write. */ 4547 /* Barrier so other workers can see the write. */
4343 emit_insn_before (nvptx_wsync (false), tail); 4548 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4344 data.offset = 0; 4549 data.offset = 0;
4345 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail); 4550 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4551 vector),
4552 tail);
4346 /* This barrier is needed to avoid worker zero clobbering 4553 /* This barrier is needed to avoid worker zero clobbering
4347 the broadcast buffer before all the other workers have 4554 the broadcast buffer before all the other workers have
4348 had a chance to read this instance of it. */ 4555 had a chance to read this instance of it. */
4349 emit_insn_before (nvptx_wsync (true), tail); 4556 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4350 } 4557 }
4351 4558
4352 extract_insn (tail); 4559 extract_insn (tail);
4353 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar), 4560 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4354 UNSPEC_BR_UNIFIED); 4561 UNSPEC_BR_UNIFIED);
4458 par->inner_mask = nvptx_process_pars (par->inner); 4665 par->inner_mask = nvptx_process_pars (par->inner);
4459 inner_mask |= par->inner_mask; 4666 inner_mask |= par->inner_mask;
4460 } 4667 }
4461 4668
4462 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0; 4669 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4463 4670 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4464 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) 4671 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4465 { 4672 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4466 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn); 4673
4467 bool empty = nvptx_wpropagate (true, is_call, 4674 if (worker || large_vector)
4468 par->forked_block, par->fork_insn); 4675 {
4469 4676 nvptx_shared_propagate (false, is_call, par->forked_block,
4470 if (!empty || !is_call) 4677 par->forked_insn, !worker);
4678 bool no_prop_p
4679 = nvptx_shared_propagate (true, is_call, par->forked_block,
4680 par->fork_insn, !worker);
4681 bool empty_loop_p
4682 = !is_call && (NEXT_INSN (par->forked_insn)
4683 && NEXT_INSN (par->forked_insn) == par->joining_insn);
4684 rtx barrier = GEN_INT (0);
4685 int threads = 0;
4686
4687 if (!worker && cfun->machine->sync_bar)
4688 {
4689 barrier = cfun->machine->sync_bar;
4690 threads = nvptx_mach_vector_length ();
4691 }
4692
4693 if (no_prop_p && empty_loop_p)
4694 ;
4695 else if (no_prop_p && is_call)
4696 ;
4697 else
4471 { 4698 {
4472 /* Insert begin and end synchronizations. */ 4699 /* Insert begin and end synchronizations. */
4473 emit_insn_before (nvptx_wsync (false), par->forked_insn); 4700 emit_insn_before (nvptx_cta_sync (barrier, threads),
4474 emit_insn_before (nvptx_wsync (true), par->join_insn); 4701 par->forked_insn);
4702 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4475 } 4703 }
4476 } 4704 }
4477 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) 4705 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4478 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn); 4706 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4479 4707
4480 /* Now do siblings. */ 4708 /* Now do siblings. */
4481 if (par->next) 4709 if (par->next)
4482 inner_mask |= nvptx_process_pars (par->next); 4710 inner_mask |= nvptx_process_pars (par->next);
4483 return inner_mask; 4711 return inner_mask;
4552 } 4780 }
4553 } 4781 }
4554 } 4782 }
4555 4783
4556 if (skip_mask) 4784 if (skip_mask)
4557 nvptx_skip_par (skip_mask, par); 4785 nvptx_skip_par (skip_mask, par);
4558 4786
4559 if (par->next) 4787 if (par->next)
4560 nvptx_neuter_pars (par->next, modes, outer); 4788 nvptx_neuter_pars (par->next, modes, outer);
4789 }
4790
4791 static void
4792 populate_offload_attrs (offload_attrs *oa)
4793 {
4794 tree attr = oacc_get_fn_attrib (current_function_decl);
4795 tree dims = TREE_VALUE (attr);
4796 unsigned ix;
4797
4798 oa->mask = 0;
4799
4800 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4801 {
4802 tree t = TREE_VALUE (dims);
4803 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4804 tree allowed = TREE_PURPOSE (dims);
4805
4806 if (size != 1 && !(allowed && integer_zerop (allowed)))
4807 oa->mask |= GOMP_DIM_MASK (ix);
4808
4809 switch (ix)
4810 {
4811 case GOMP_DIM_GANG:
4812 oa->num_gangs = size;
4813 break;
4814
4815 case GOMP_DIM_WORKER:
4816 oa->num_workers = size;
4817 break;
4818
4819 case GOMP_DIM_VECTOR:
4820 oa->vector_length = size;
4821 break;
4822 }
4823 }
4561 } 4824 }
4562 4825
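
A worked editorial example of the mask computation above, with an assumed dims list of (gang: -1, worker: 1, vector: 32) and the `allowed` check omitted for brevity:

    #include <assert.h>

    int
    main (void)
    {
      int size[3] = { -1, 1, 32 };  /* assumed gang, worker, vector dims */
      unsigned mask = 0;

      for (int ix = 0; ix < 3; ix++)
        if (size[ix] != 1)          /* dim is (or may be) partitioned */
          mask |= 1u << ix;         /* GOMP_DIM_MASK (ix) */

      assert (mask == ((1u << 0) | (1u << 2)));   /* gang | vector */
      return 0;
    }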
4563 #if WORKAROUND_PTXJIT_BUG_2 4826 #if WORKAROUND_PTXJIT_BUG_2
4564 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant 4827 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4565 is needed in the nvptx target because the branches generated for 4828 is needed in the nvptx target because the branches generated for
4739 tree attr = oacc_get_fn_attrib (current_function_decl); 5002 tree attr = oacc_get_fn_attrib (current_function_decl);
4740 if (attr) 5003 if (attr)
4741 { 5004 {
4742 /* If we determined this mask before RTL expansion, we could 5005 /* If we determined this mask before RTL expansion, we could
4743 elide emission of some levels of forks and joins. */ 5006 elide emission of some levels of forks and joins. */
4744 unsigned mask = 0; 5007 offload_attrs oa;
4745 tree dims = TREE_VALUE (attr); 5008
4746 unsigned ix; 5009 populate_offload_attrs (&oa);
4747 5010
4748 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4749 {
4750 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4751 tree allowed = TREE_PURPOSE (dims);
4752
4753 if (size != 1 && !(allowed && integer_zerop (allowed)))
4754 mask |= GOMP_DIM_MASK (ix);
4755 }
4756 /* If there is worker neutering, there must be vector 5011 /* If there is worker neutering, there must be vector
4757 neutering. Otherwise the hardware will fail. */ 5012 neutering. Otherwise the hardware will fail. */
4758 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) 5013 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4759 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); 5014 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4760 5015
4761 /* Discover & process partitioned regions. */ 5016 /* Discover & process partitioned regions. */
4762 parallel *pars = nvptx_discover_pars (&bb_insn_map); 5017 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4763 nvptx_process_pars (pars); 5018 nvptx_process_pars (pars);
4764 nvptx_neuter_pars (pars, mask, 0); 5019 nvptx_neuter_pars (pars, oa.mask, 0);
4765 delete pars; 5020 delete pars;
4766 } 5021 }
4767 5022
4768 /* Replace subregs. */ 5023 /* Replace subregs. */
4769 nvptx_reorg_subreg (); 5024 nvptx_reorg_subreg ();
4937 fputs ("\t.target\tsm_30\n", asm_out_file); 5192 fputs ("\t.target\tsm_30\n", asm_out_file);
4938 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); 5193 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4939 fputs ("// END PREAMBLE\n", asm_out_file); 5194 fputs ("// END PREAMBLE\n", asm_out_file);
4940 } 5195 }
4941 5196
4942 /* Emit a declaration for a worker-level buffer in .shared memory. */ 5197 /* Emit a declaration for a worker and vector-level buffer in .shared
5198 memory. */
4943 5199
4944 static void 5200 static void
4945 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size) 5201 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4946 { 5202 {
4947 const char *name = XSTR (sym, 0); 5203 const char *name = XSTR (sym, 0);
4948 5204
4949 write_var_marker (file, true, false, name); 5205 write_var_marker (file, true, false, name);
4950 fprintf (file, ".shared .align %d .u8 %s[%d];\n", 5206 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4961 tree decl; 5217 tree decl;
4962 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter) 5218 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4963 nvptx_record_fndecl (decl); 5219 nvptx_record_fndecl (decl);
4964 fputs (func_decls.str().c_str(), asm_out_file); 5220 fputs (func_decls.str().c_str(), asm_out_file);
4965 5221
4966 if (worker_bcast_size) 5222 if (oacc_bcast_size)
4967 write_worker_buffer (asm_out_file, worker_bcast_sym, 5223 write_shared_buffer (asm_out_file, oacc_bcast_sym,
4968 worker_bcast_align, worker_bcast_size); 5224 oacc_bcast_align, oacc_bcast_size);
4969 5225
4970 if (worker_red_size) 5226 if (worker_red_size)
4971 write_worker_buffer (asm_out_file, worker_red_sym, 5227 write_shared_buffer (asm_out_file, worker_red_sym,
4972 worker_red_align, worker_red_size); 5228 worker_red_align, worker_red_size);
5229
5230 if (vector_red_size)
5231 write_shared_buffer (asm_out_file, vector_red_sym,
5232 vector_red_align, vector_red_size);
4973 5233
4974 if (need_softstack_decl) 5234 if (need_softstack_decl)
4975 { 5235 {
4976 write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); 5236 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4977 /* 32 is the maximum number of warps in a block. Even though it's an 5237 /* 32 is the maximum number of warps in a block. Even though it's an
5014 emit_insn (pat); 5274 emit_insn (pat);
5015 5275
5016 return target; 5276 return target;
5017 } 5277 }
5018 5278
5019 /* Worker reduction address expander. */ 5279 const char *
5280 nvptx_output_red_partition (rtx dst, rtx offset)
5281 {
5282 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5283 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5284
5285 if (offset == const0_rtx)
5286 fprintf (asm_out_file, zero_offset, REGNO (dst),
5287 REGNO (cfun->machine->red_partition));
5288 else
5289 fprintf (asm_out_file, with_offset, REGNO (dst),
5290 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5291
5292 return "";
5293 }
5294
5295 /* Shared-memory reduction address expander. */
5020 5296
5021 static rtx 5297 static rtx
5022 nvptx_expand_worker_addr (tree exp, rtx target, 5298 nvptx_expand_shared_addr (tree exp, rtx target,
5023 machine_mode ARG_UNUSED (mode), int ignore) 5299 machine_mode ARG_UNUSED (mode), int ignore,
5300 int vector)
5024 { 5301 {
5025 if (ignore) 5302 if (ignore)
5026 return target; 5303 return target;
5027 5304
5028 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2)); 5305 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5029 if (align > worker_red_align)
5030 worker_red_align = align;
5031
5032 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0)); 5306 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5033 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1)); 5307 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5034 if (size + offset > worker_red_size)
5035 worker_red_size = size + offset;
5036
5037 rtx addr = worker_red_sym; 5308 rtx addr = worker_red_sym;
5038 if (offset) 5309
5039 { 5310 if (vector)
5040 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)); 5311 {
5041 addr = gen_rtx_CONST (Pmode, addr); 5312 offload_attrs oa;
5042 } 5313
5314 populate_offload_attrs (&oa);
5315
5316 unsigned int psize = ROUND_UP (size + offset, align);
5317 unsigned int pnum = nvptx_mach_max_workers ();
5318 vector_red_partition = MAX (vector_red_partition, psize);
5319 vector_red_size = MAX (vector_red_size, psize * pnum);
5320 vector_red_align = MAX (vector_red_align, align);
5321
5322 if (cfun->machine->red_partition == NULL)
5323 cfun->machine->red_partition = gen_reg_rtx (Pmode);
5324
5325 addr = gen_reg_rtx (Pmode);
5326 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5327 }
5328 else
5329 {
5330 worker_red_align = MAX (worker_red_align, align);
5331 worker_red_size = MAX (worker_red_size, size + offset);
5332
5333 if (offset)
5334 {
5335 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5336 addr = gen_rtx_CONST (Pmode, addr);
5337 }
5338 }
5043 5339
5044 emit_move_insn (target, addr); 5340 emit_move_insn (target, addr);
5045
5046 return target; 5341 return target;
5047 } 5342 }
5048 5343
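
For the vector case above, each of the max_workers vectors gets its own aligned slice of __vector_red. A worked editorial sketch with assumed builtin arguments:

    #include <assert.h>

    #define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    int
    main (void)
    {
      unsigned offset = 0, size = 12, align = 8;  /* assumed builtin args */
      unsigned max_workers = 8;                   /* assumed worker count */

      unsigned psize = ROUND_UP (size + offset, align);  /* 16 */
      unsigned red_size = psize * max_workers;           /* 128 */

      assert (psize == 16 && red_size == 128);
      return 0;
    }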
5049 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do 5344 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5050 not require taking the address of any object, other than the memory 5345 not require taking the address of any object, other than the memory
5088 enum nvptx_builtins 5383 enum nvptx_builtins
5089 { 5384 {
5090 NVPTX_BUILTIN_SHUFFLE, 5385 NVPTX_BUILTIN_SHUFFLE,
5091 NVPTX_BUILTIN_SHUFFLELL, 5386 NVPTX_BUILTIN_SHUFFLELL,
5092 NVPTX_BUILTIN_WORKER_ADDR, 5387 NVPTX_BUILTIN_WORKER_ADDR,
5388 NVPTX_BUILTIN_VECTOR_ADDR,
5093 NVPTX_BUILTIN_CMP_SWAP, 5389 NVPTX_BUILTIN_CMP_SWAP,
5094 NVPTX_BUILTIN_CMP_SWAPLL, 5390 NVPTX_BUILTIN_CMP_SWAPLL,
5095 NVPTX_BUILTIN_MAX 5391 NVPTX_BUILTIN_MAX
5096 }; 5392 };
5097 5393
5125 5421
5126 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE)); 5422 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5127 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE)); 5423 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5128 DEF (WORKER_ADDR, "worker_addr", 5424 DEF (WORKER_ADDR, "worker_addr",
5129 (PTRVOID, ST, UINT, UINT, NULL_TREE)); 5425 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5426 DEF (VECTOR_ADDR, "vector_addr",
5427 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5130 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); 5428 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5131 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); 5429 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5132 5430
5133 #undef DEF 5431 #undef DEF
5134 #undef ST 5432 #undef ST
5146 static rtx 5444 static rtx
5147 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), 5445 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5148 machine_mode mode, int ignore) 5446 machine_mode mode, int ignore)
5149 { 5447 {
5150 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); 5448 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5151 switch (DECL_FUNCTION_CODE (fndecl)) 5449 switch (DECL_MD_FUNCTION_CODE (fndecl))
5152 { 5450 {
5153 case NVPTX_BUILTIN_SHUFFLE: 5451 case NVPTX_BUILTIN_SHUFFLE:
5154 case NVPTX_BUILTIN_SHUFFLELL: 5452 case NVPTX_BUILTIN_SHUFFLELL:
5155 return nvptx_expand_shuffle (exp, target, mode, ignore); 5453 return nvptx_expand_shuffle (exp, target, mode, ignore);
5156 5454
5157 case NVPTX_BUILTIN_WORKER_ADDR: 5455 case NVPTX_BUILTIN_WORKER_ADDR:
5158 return nvptx_expand_worker_addr (exp, target, mode, ignore); 5456 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5457
5458 case NVPTX_BUILTIN_VECTOR_ADDR:
5459 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5159 5460
5160 case NVPTX_BUILTIN_CMP_SWAP: 5461 case NVPTX_BUILTIN_CMP_SWAP:
5161 case NVPTX_BUILTIN_CMP_SWAPLL: 5462 case NVPTX_BUILTIN_CMP_SWAPLL:
5162 return nvptx_expand_cmp_swap (exp, target, mode, ignore); 5463 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5163 5464
5164 default: gcc_unreachable (); 5465 default: gcc_unreachable ();
5165 } 5466 }
5166 } 5467 }
5167
5168 /* Define dimension sizes for known hardware. */
5169 #define PTX_VECTOR_LENGTH 32
5170 #define PTX_WORKER_LENGTH 32
5171 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5172 5468
5173 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ 5469 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5174 5470
5175 static int 5471 static int
5176 nvptx_simt_vf () 5472 nvptx_simt_vf ()
5177 { 5473 {
5178 return PTX_VECTOR_LENGTH; 5474 return PTX_WARP_SIZE;
5475 }
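/* Illustrative sketch (hypothetical user code): with a VF of 32, each
   logical simd lane of an offloaded OpenMP simd region runs on one
   thread of a warp.  */
void
vadd (int n, float *a, const float *b, const float *c)
{
#pragma omp target parallel for simd
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + c[i];
}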
5476
5477 /* Return 1 if TRAIT NAME is present in the OpenMP context's
5478 device trait set, return 0 if not present in any OpenMP context in the
5479 whole translation unit, or -1 if not present in the current OpenMP context
5480 but might be present in another OpenMP context in the same TU. */
5481
5482 int
5483 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5484 const char *name)
5485 {
5486 switch (trait)
5487 {
5488 case omp_device_kind:
5489 return strcmp (name, "gpu") == 0;
5490 case omp_device_arch:
5491 return strcmp (name, "nvptx") == 0;
5492 case omp_device_isa:
5493 if (strcmp (name, "sm_30") == 0)
5494 return !TARGET_SM35;
5495 if (strcmp (name, "sm_35") == 0)
5496 return TARGET_SM35;
5497 return 0;
5498 default:
5499 gcc_unreachable ();
5500 }
5501 }
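/* Illustrative sketch (hypothetical user code) of a context selector
   this hook resolves; it matches when compiling for nvptx with sm_35
   enabled:  */
void saxpy_gpu (int n, float a, float *x, float *y);
#pragma omp declare variant (saxpy_gpu) \
  match (device={kind("gpu"), arch("nvptx"), isa("sm_35")})
void saxpy (int n, float a, float *x, float *y);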
5502
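/* Return true if vector length L is well-formed, i.e. a positive
   multiple of the warp size.  */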
5503 static bool
5504 nvptx_welformed_vector_length_p (int l)
5505 {
5506 gcc_assert (l > 0);
5507 return l % PTX_WARP_SIZE == 0;
5508 }
5509
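/* Clamp the launch dimensions DIMS[] to the target limits: the maximum
   vector length, the maximum number of workers, the CTA size, and the
   number of per-worker barriers available.  */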
5510 static void
5511 nvptx_apply_dim_limits (int dims[])
5512 {
5513 /* Check that the vector_length is not too large. */
5514 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5515 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5516
5517 /* Check that the number of workers is not too large. */
5518 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5519 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5520
5521 /* Ensure that num_worker * vector_length <= cta size. */
5522 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5523 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5524 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5525
5526 /* If we need a per-worker barrier, i.e. vector_length > warp size ... */
5527 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5528 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5529 /* Don't use more barriers than available. */
5530 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5531 PTX_NUM_PER_WORKER_BARRIERS);
5532 }
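/* Worked example of the limits above: vector_length (64) with
   num_workers (16) fits the CTA (16 * 64 == 1024 == PTX_CTA_SIZE), but
   since 64 > PTX_WARP_SIZE each worker needs its own barrier, and only
   PTX_NUM_PER_WORKER_BARRIERS == 15 exist, so num_workers is clamped
   to 15.  */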
5533
5534 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5535
5536 static bool
5537 has_vector_partitionable_routine_calls_p (tree fndecl)
5538 {
5539 if (!fndecl)
5540 return false;
5541
5542 basic_block bb;
5543 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5544 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5545 gsi_next_nondebug (&i))
5546 {
5547 gimple *stmt = gsi_stmt (i);
5548 if (gimple_code (stmt) != GIMPLE_CALL)
5549 continue;
5550
5551 tree callee = gimple_call_fndecl (stmt);
5552 if (!callee)
5553 continue;
5554
5555 tree attrs = oacc_get_fn_attrib (callee);
5556 if (attrs == NULL_TREE)
5557 return false;
5558
5559 int partition_level = oacc_fn_attrib_level (attrs);
5560 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5561 if (!seq_routine_p)
5562 return true;
5563 }
5564
5565 return false;
5566 }
5567
5568 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5569 DIMS has changed. */
5570
5571 static void
5572 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
5573 {
5574 bool oacc_default_dims_p = false;
5575 bool oacc_min_dims_p = false;
5576 bool offload_region_p = false;
5577 bool routine_p = false;
5578 bool routine_seq_p = false;
5579 int default_vector_length = -1;
5580
5581 if (decl == NULL_TREE)
5582 {
5583 if (fn_level == -1)
5584 oacc_default_dims_p = true;
5585 else if (fn_level == -2)
5586 oacc_min_dims_p = true;
5587 else
5588 gcc_unreachable ();
5589 }
5590 else if (fn_level == -1)
5591 offload_region_p = true;
5592 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5593 {
5594 routine_p = true;
5595 routine_seq_p = fn_level == GOMP_DIM_MAX;
5596 }
5597 else
5598 gcc_unreachable ();
5599
5600 if (oacc_min_dims_p)
5601 {
5602 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5603 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5604 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5605
5606 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5607 return;
5608 }
5609
5610 if (routine_p)
5611 {
5612 if (!routine_seq_p)
5613 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5614
5615 return;
5616 }
5617
5618 if (oacc_default_dims_p)
5619 {
5620 /* -1 : not set
5621 0 : set at runtime, e.g. -fopenacc-dims=-
5622 >= 1: set at compile time, e.g. -fopenacc-dims=1. */
5623 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5624 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5625 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5626
5627 /* But -fopenacc-dims=- is not yet supported on trunk. */
5628 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5629 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5630 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5631 }
5632
5633 if (offload_region_p)
5634 {
5635 /* -1 : not set
5636 0 : set using a variable, e.g. num_gangs (n)
5637 >= 1: set using a constant, e.g. num_gangs (1). */
5638 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5639 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5640 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5641 }
5642
5643 if (offload_region_p)
5644 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5645 else
5646 /* oacc_default_dims_p. */
5647 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5648
5649 int old_dims[GOMP_DIM_MAX];
5650 unsigned int i;
5651 for (i = 0; i < GOMP_DIM_MAX; ++i)
5652 old_dims[i] = dims[i];
5653
5654 const char *vector_reason = NULL;
5655 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5656 {
5657 default_vector_length = PTX_WARP_SIZE;
5658
5659 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5660 {
5661 vector_reason = G_("using vector_length (%d) due to call to"
5662 " vector-partitionable routine, ignoring %d");
5663 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5664 }
5665 }
5666
5667 if (dims[GOMP_DIM_VECTOR] == 0)
5668 {
5669 vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5670 dims[GOMP_DIM_VECTOR] = default_vector_length;
5671 }
5672
5673 if (dims[GOMP_DIM_VECTOR] > 0
5674 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5675 dims[GOMP_DIM_VECTOR] = default_vector_length;
5676
5677 nvptx_apply_dim_limits (dims);
5678
5679 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5680 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5681 vector_reason != NULL
5682 ? vector_reason
5683 : G_("using vector_length (%d), ignoring %d"),
5684 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5685
5686 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5687 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5688 G_("using num_workers (%d), ignoring %d"),
5689 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5690
5691 if (oacc_default_dims_p)
5692 {
5693 if (dims[GOMP_DIM_VECTOR] < 0)
5694 dims[GOMP_DIM_VECTOR] = default_vector_length;
5695 if (dims[GOMP_DIM_WORKER] < 0)
5696 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5697 if (dims[GOMP_DIM_GANG] < 0)
5698 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5699 nvptx_apply_dim_limits (dims);
5700 }
5701
5702 if (offload_region_p)
5703 {
5704 for (i = 0; i < GOMP_DIM_MAX; i++)
5705 {
5706 if (!(dims[i] < 0))
5707 continue;
5708
5709 if ((used & GOMP_DIM_MASK (i)) == 0)
5710 /* Function oacc_validate_dims will apply the minimal dimension. */
5711 continue;
5712
5713 dims[i] = (i == GOMP_DIM_VECTOR
5714 ? default_vector_length
5715 : oacc_get_default_dim (i));
5716 }
5717
5718 nvptx_apply_dim_limits (dims);
5719 }
5179 } 5720 }
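/* Concrete illustration (hypothetical user code): an offload region
   such as
     #pragma acc parallel loop vector_length (64)
   that calls a vector-partitionable routine falls back to the warp
   size, emitting: "using vector_length (32) due to call to
   vector-partitionable routine, ignoring 64".  */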
5180 5721
5181 /* Validate compute dimensions of an OpenACC offload or routine, fill 5722 /* Validate compute dimensions of an OpenACC offload or routine, fill
5182 in non-unity defaults. FN_LEVEL indicates the level at which a 5723 in non-unity defaults. FN_LEVEL indicates the level at which a
5183 routine might spawn a loop. It is negative for non-routines. If 5724 routine might spawn a loop. It is negative for non-routines. If
5184 DECL is null, we are validating the default dimensions. */ 5725 DECL is null, we are validating the default dimensions. USED masks the axes referenced in the region. */
5185 5726
5186 static bool 5727 static bool
5187 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) 5728 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5188 { 5729 {
5189 bool changed = false; 5730 int old_dims[GOMP_DIM_MAX];
5190 5731 unsigned int i;
5191 /* The vector size must be 32, unless this is a SEQ routine. */ 5732
5192 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1 5733 for (i = 0; i < GOMP_DIM_MAX; ++i)
5193 && dims[GOMP_DIM_VECTOR] >= 0 5734 old_dims[i] = dims[i];
5194 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH) 5735
5195 { 5736 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5196 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0) 5737
5197 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, 5738 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5198 dims[GOMP_DIM_VECTOR] 5739 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5199 ? G_("using vector_length (%d), ignoring %d") 5740 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5200 : G_("using vector_length (%d), ignoring runtime setting"), 5741
5201 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]); 5742 for (i = 0; i < GOMP_DIM_MAX; ++i)
5202 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; 5743 if (old_dims[i] != dims[i])
5203 changed = true; 5744 return true;
5204 } 5745
5205 5746 return false;
5206 /* Check the num workers is not too large. */
5207 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5208 {
5209 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5210 "using num_workers (%d), ignoring %d",
5211 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
5212 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5213 changed = true;
5214 }
5215
5216 if (!decl)
5217 {
5218 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5219 if (dims[GOMP_DIM_WORKER] < 0)
5220 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5221 if (dims[GOMP_DIM_GANG] < 0)
5222 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5223 changed = true;
5224 }
5225
5226 return changed;
5227 } 5747 }
5228 5748
5229 /* Return maximum dimension size, or zero for unbounded. */ 5749 /* Return maximum dimension size, or zero for unbounded. */
5230 5750
5231 static int 5751 static int
5232 nvptx_dim_limit (int axis) 5752 nvptx_dim_limit (int axis)
5233 { 5753 {
5234 switch (axis) 5754 switch (axis)
5235 { 5755 {
5236 case GOMP_DIM_VECTOR: 5756 case GOMP_DIM_VECTOR:
5237 return PTX_VECTOR_LENGTH; 5757 return PTX_MAX_VECTOR_LENGTH;
5238 5758
5239 default: 5759 default:
5240 break; 5760 break;
5241 } 5761 }
5242 return 0; 5762 return 0;
5265 /* Generate a PTX builtin function call that returns the address in 5785 /* Generate a PTX builtin function call that returns the address in
5266 the worker reduction buffer at OFFSET. TYPE is the type of the 5786 the worker (or, if VECTOR, the vector) reduction buffer at OFFSET.
5267 data at that location. */ 5787 TYPE is the type of the data at that location. */
5268 5788
5269 static tree 5789 static tree
5270 nvptx_get_worker_red_addr (tree type, tree offset) 5790 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5271 { 5791 {
5792 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5793 if (vector)
5794 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5272 machine_mode mode = TYPE_MODE (type); 5795 machine_mode mode = TYPE_MODE (type);
5273 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true); 5796 tree fndecl = nvptx_builtin_decl (addr_dim, true);
5274 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode)); 5797 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5275 tree align = build_int_cst (unsigned_type_node, 5798 tree align = build_int_cst (unsigned_type_node,
5276 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT); 5799 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5277 tree call = build_call_expr (fndecl, 3, offset, size, align); 5800 tree call = build_call_expr (fndecl, 3, offset, size, align);
5278 5801
5584 } 6107 }
5585 6108
5586 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ 6109 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5587 6110
5588 static void 6111 static void
5589 nvptx_goacc_reduction_setup (gcall *call) 6112 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
5590 { 6113 {
5591 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6114 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5592 tree lhs = gimple_call_lhs (call); 6115 tree lhs = gimple_call_lhs (call);
5593 tree var = gimple_call_arg (call, 2); 6116 tree var = gimple_call_arg (call, 2);
5594 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 6117 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5603 6126
5604 if (!integer_zerop (ref_to_res)) 6127 if (!integer_zerop (ref_to_res))
5605 var = build_simple_mem_ref (ref_to_res); 6128 var = build_simple_mem_ref (ref_to_res);
5606 } 6129 }
5607 6130
5608 if (level == GOMP_DIM_WORKER) 6131 if (level == GOMP_DIM_WORKER
6132 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
5609 { 6133 {
5610 /* Store incoming value to worker reduction buffer. */ 6134 /* Store incoming value to the shared reduction buffer. */
5611 tree offset = gimple_call_arg (call, 5); 6135 tree offset = gimple_call_arg (call, 5);
5612 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset); 6136 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6137 level == GOMP_DIM_VECTOR);
5613 tree ptr = make_ssa_name (TREE_TYPE (call)); 6138 tree ptr = make_ssa_name (TREE_TYPE (call));
5614 6139
5615 gimplify_assign (ptr, call, &seq); 6140 gimplify_assign (ptr, call, &seq);
5616 tree ref = build_simple_mem_ref (ptr); 6141 tree ref = build_simple_mem_ref (ptr);
5617 TREE_THIS_VOLATILE (ref) = 1; 6142 TREE_THIS_VOLATILE (ref) = 1;
5626 } 6151 }
5627 6152
5628 /* NVPTX implementation of GOACC_REDUCTION_INIT. */ 6153 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5629 6154
5630 static void 6155 static void
5631 nvptx_goacc_reduction_init (gcall *call) 6156 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
5632 { 6157 {
5633 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6158 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5634 tree lhs = gimple_call_lhs (call); 6159 tree lhs = gimple_call_lhs (call);
5635 tree var = gimple_call_arg (call, 2); 6160 tree var = gimple_call_arg (call, 2);
5636 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 6161 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5640 TREE_TYPE (var)); 6165 TREE_TYPE (var));
5641 gimple_seq seq = NULL; 6166 gimple_seq seq = NULL;
5642 6167
5643 push_gimplify_context (true); 6168 push_gimplify_context (true);
5644 6169
5645 if (level == GOMP_DIM_VECTOR) 6170 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
5646 { 6171 {
5647 /* Initialize lanes other than lane zero to INIT_VAL (OP). */ 6172 /* Initialize lanes other than lane zero to INIT_VAL (OP). */
5648 tree tid = make_ssa_name (integer_type_node); 6173 tree tid = make_ssa_name (integer_type_node);
5649 tree dim_vector = gimple_call_arg (call, 3); 6174 tree dim_vector = gimple_call_arg (call, 3);
5650 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, 6175 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5700 tree ref_to_res = gimple_call_arg (call, 1); 6225 tree ref_to_res = gimple_call_arg (call, 1);
5701 if (integer_zerop (ref_to_res)) 6226 if (integer_zerop (ref_to_res))
5702 init = var; 6227 init = var;
5703 } 6228 }
5704 6229
5705 gimplify_assign (lhs, init, &seq); 6230 if (lhs != NULL_TREE)
6231 gimplify_assign (lhs, init, &seq);
5706 } 6232 }
5707 6233
5708 pop_gimplify_context (NULL); 6234 pop_gimplify_context (NULL);
5709 gsi_replace_with_seq (&gsi, seq, true); 6235 gsi_replace_with_seq (&gsi, seq, true);
5710 } 6236 }
5711 6237
5712 /* NVPTX implementation of GOACC_REDUCTION_FINI. */ 6238 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5713 6239
5714 static void 6240 static void
5715 nvptx_goacc_reduction_fini (gcall *call) 6241 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
5716 { 6242 {
5717 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6243 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5718 tree lhs = gimple_call_lhs (call); 6244 tree lhs = gimple_call_lhs (call);
5719 tree ref_to_res = gimple_call_arg (call, 1); 6245 tree ref_to_res = gimple_call_arg (call, 1);
5720 tree var = gimple_call_arg (call, 2); 6246 tree var = gimple_call_arg (call, 2);
5724 gimple_seq seq = NULL; 6250 gimple_seq seq = NULL;
5725 tree r = NULL_TREE; 6251 tree r = NULL_TREE;
5726 6252
5727 push_gimplify_context (true); 6253 push_gimplify_context (true);
5728 6254
5729 if (level == GOMP_DIM_VECTOR) 6255 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
5730 { 6256 {
5731 /* Emit binary shuffle tree. TODO: emit this as an actual loop, 6257 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
5732 but that requires a method of emitting a unified jump at the 6258 but that requires a method of emitting a unified jump at the
5733 gimple level. */ 6259 gimple level. */
5734 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1) 6260 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
5735 { 6261 {
5736 tree other_var = make_ssa_name (TREE_TYPE (var)); 6262 tree other_var = make_ssa_name (TREE_TYPE (var));
5737 nvptx_generate_vector_shuffle (gimple_location (call), 6263 nvptx_generate_vector_shuffle (gimple_location (call),
5738 other_var, var, shfl, &seq); 6264 other_var, var, shfl, &seq);
5739 6265
5745 } 6271 }
5746 else 6272 else
5747 { 6273 {
5748 tree accum = NULL_TREE; 6274 tree accum = NULL_TREE;
5749 6275
5750 if (level == GOMP_DIM_WORKER) 6276 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
5751 { 6277 {
5752 /* Get reduction buffer address. */ 6278 /* Get reduction buffer address. */
5753 tree offset = gimple_call_arg (call, 5); 6279 tree offset = gimple_call_arg (call, 5);
5754 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset); 6280 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6281 level == GOMP_DIM_VECTOR);
5755 tree ptr = make_ssa_name (TREE_TYPE (call)); 6282 tree ptr = make_ssa_name (TREE_TYPE (call));
5756 6283
5757 gimplify_assign (ptr, call, &seq); 6284 gimplify_assign (ptr, call, &seq);
5758 accum = ptr; 6285 accum = ptr;
5759 } 6286 }
5780 } 6307 }
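/* Illustrative CPU-side model (not part of the source) of the shuffle
   reduction tree emitted for the warp-sized vector case above: each
   round folds in the value SHFL lanes away, halving SHFL, so after
   log2(32) == 5 rounds lane 0 holds the full reduction.  */
unsigned
warp_reduce_model (unsigned lanes[32])
{
  for (int shfl = 32 / 2; shfl > 0; shfl >>= 1)
    for (int l = 0; l + shfl < 32; ++l)
      lanes[l] += lanes[l + shfl];  /* stands in for shuffle-down + op */
  return lanes[0];
}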
5781 6308
5782 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */ 6309 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5783 6310
5784 static void 6311 static void
5785 nvptx_goacc_reduction_teardown (gcall *call) 6312 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
5786 { 6313 {
5787 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6314 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5788 tree lhs = gimple_call_lhs (call); 6315 tree lhs = gimple_call_lhs (call);
5789 tree var = gimple_call_arg (call, 2); 6316 tree var = gimple_call_arg (call, 2);
5790 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 6317 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5791 gimple_seq seq = NULL; 6318 gimple_seq seq = NULL;
5792 6319
5793 push_gimplify_context (true); 6320 push_gimplify_context (true);
5794 if (level == GOMP_DIM_WORKER) 6321 if (level == GOMP_DIM_WORKER
6322 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
5795 { 6323 {
5796 /* Read the worker reduction buffer. */ 6324 /* Read back the shared reduction buffer. */
5797 tree offset = gimple_call_arg (call, 5); 6325 tree offset = gimple_call_arg (call, 5);
5798 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset); 6326 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6327 level == GOMP_DIM_VECTOR);
5799 tree ptr = make_ssa_name (TREE_TYPE (call)); 6328 tree ptr = make_ssa_name (TREE_TYPE (call));
5800 6329
5801 gimplify_assign (ptr, call, &seq); 6330 gimplify_assign (ptr, call, &seq);
5802 var = build_simple_mem_ref (ptr); 6331 var = build_simple_mem_ref (ptr);
5803 TREE_THIS_VOLATILE (var) = 1; 6332 TREE_THIS_VOLATILE (var) = 1;
5824 6353
5825 static void 6354 static void
5826 nvptx_goacc_reduction (gcall *call) 6355 nvptx_goacc_reduction (gcall *call)
5827 { 6356 {
5828 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0)); 6357 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6358 offload_attrs oa;
6359
6360 populate_offload_attrs (&oa);
5829 6361
5830 switch (code) 6362 switch (code)
5831 { 6363 {
5832 case IFN_GOACC_REDUCTION_SETUP: 6364 case IFN_GOACC_REDUCTION_SETUP:
5833 nvptx_goacc_reduction_setup (call); 6365 nvptx_goacc_reduction_setup (call, &oa);
5834 break; 6366 break;
5835 6367
5836 case IFN_GOACC_REDUCTION_INIT: 6368 case IFN_GOACC_REDUCTION_INIT:
5837 nvptx_goacc_reduction_init (call); 6369 nvptx_goacc_reduction_init (call, &oa);
5838 break; 6370 break;
5839 6371
5840 case IFN_GOACC_REDUCTION_FINI: 6372 case IFN_GOACC_REDUCTION_FINI:
5841 nvptx_goacc_reduction_fini (call); 6373 nvptx_goacc_reduction_fini (call, &oa);
5842 break; 6374 break;
5843 6375
5844 case IFN_GOACC_REDUCTION_TEARDOWN: 6376 case IFN_GOACC_REDUCTION_TEARDOWN:
5845 nvptx_goacc_reduction_teardown (call); 6377 nvptx_goacc_reduction_teardown (call, &oa);
5846 break; 6378 break;
5847 6379
5848 default: 6380 default:
5849 gcc_unreachable (); 6381 gcc_unreachable ();
5850 } 6382 }
5914 6446
5915 static bool 6447 static bool
5916 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t) 6448 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5917 { 6449 {
5918 return false; 6450 return false;
6451 }
6452
6453 static GTY(()) tree nvptx_previous_fndecl;
6454
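/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset the per-function
   bookkeeping of the shared-memory partitions used for vector
   reductions and broadcasts.  */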
6455 static void
6456 nvptx_set_current_function (tree fndecl)
6457 {
6458 if (!fndecl || fndecl == nvptx_previous_fndecl)
6459 return;
6460
6461 nvptx_previous_fndecl = fndecl;
6462 vector_red_partition = 0;
6463 oacc_bcast_partition = 0;
5919 } 6464 }
5920 6465
5921 #undef TARGET_OPTION_OVERRIDE 6466 #undef TARGET_OPTION_OVERRIDE
5922 #define TARGET_OPTION_OVERRIDE nvptx_option_override 6467 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5923 6468
6018 #define TARGET_BUILTIN_DECL nvptx_builtin_decl 6563 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6019 6564
6020 #undef TARGET_SIMT_VF 6565 #undef TARGET_SIMT_VF
6021 #define TARGET_SIMT_VF nvptx_simt_vf 6566 #define TARGET_SIMT_VF nvptx_simt_vf
6022 6567
6568 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6569 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
6570
6023 #undef TARGET_GOACC_VALIDATE_DIMS 6571 #undef TARGET_GOACC_VALIDATE_DIMS
6024 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims 6572 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6025 6573
6026 #undef TARGET_GOACC_DIM_LIMIT 6574 #undef TARGET_GOACC_DIM_LIMIT
6027 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit 6575 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6052 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class 6600 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6053 6601
6054 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE 6602 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6055 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed 6603 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6056 6604
6605 #undef TARGET_SET_CURRENT_FUNCTION
6606 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6607
6057 struct gcc_target targetm = TARGET_INITIALIZER; 6608 struct gcc_target targetm = TARGET_INITIALIZER;
6058 6609
6059 #include "gt-nvptx.h" 6610 #include "gt-nvptx.h"