comparison gcc/config/nvptx/nvptx.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
comparison
equal deleted inserted replaced
111:04ced10e8804 131:84e7813d76e9
1 /* Target code for NVPTX. 1 /* Target code for NVPTX.
2 Copyright (C) 2014-2017 Free Software Foundation, Inc. 2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com> 3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4 4
5 This file is part of GCC. 5 This file is part of GCC.
6 6
7 GCC is free software; you can redistribute it and/or modify it 7 GCC is free software; you can redistribute it and/or modify it
15 License for more details. 15 License for more details.
16 16
17 You should have received a copy of the GNU General Public License 17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see 18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */ 19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
20 22
21 #include "config.h" 23 #include "config.h"
22 #include <sstream> 24 #include <sstream>
23 #include "system.h" 25 #include "system.h"
24 #include "coretypes.h" 26 #include "coretypes.h"
74 76
75 /* This file should be included last. */ 77 /* This file should be included last. */
76 #include "target-def.h" 78 #include "target-def.h"
77 79
78 #define WORKAROUND_PTXJIT_BUG 1 80 #define WORKAROUND_PTXJIT_BUG 1
81 #define WORKAROUND_PTXJIT_BUG_2 1
82 #define WORKAROUND_PTXJIT_BUG_3 1
79 83
80 /* The various PTX memory areas an object might reside in. */ 84 /* The various PTX memory areas an object might reside in. */
81 enum nvptx_data_area 85 enum nvptx_data_area
82 { 86 {
83 DATA_AREA_GENERIC, 87 DATA_AREA_GENERIC,
172 /* Set toplevel_reorder, unless explicitly disabled. We need 176 /* Set toplevel_reorder, unless explicitly disabled. We need
173 reordering so that we emit necessary assembler decls of 177 reordering so that we emit necessary assembler decls of
174 undeclared variables. */ 178 undeclared variables. */
175 if (!global_options_set.x_flag_toplevel_reorder) 179 if (!global_options_set.x_flag_toplevel_reorder)
176 flag_toplevel_reorder = 1; 180 flag_toplevel_reorder = 1;
181
182 debug_nonbind_markers_p = 0;
177 183
178 /* Set flag_no_common, unless explicitly disabled. We fake common 184 /* Set flag_no_common, unless explicitly disabled. We fake common
179 using .weak, and that's not entirely accurate, so avoid it 185 using .weak, and that's not entirely accurate, so avoid it
180 unless forced. */ 186 unless forced. */
181 if (!global_options_set.x_flag_no_common) 187 if (!global_options_set.x_flag_no_common)
392 398
393 /* Emit fork at all levels. This helps form SESE regions, as 399 /* Emit fork at all levels. This helps form SESE regions, as
394 it creates a block with a single successor before entering a 400 it creates a block with a single successor before entering a
395 partitioned region. That is a good candidate for the end of 401 partitioned region. That is a good candidate for the end of
396 an SESE region. */ 402 an SESE region. */
397 if (!is_call) 403 emit_insn (gen_nvptx_fork (op));
398 emit_insn (gen_nvptx_fork (op));
399 emit_insn (gen_nvptx_forked (op)); 404 emit_insn (gen_nvptx_forked (op));
400 } 405 }
401 } 406 }
402 407
403 /* Emit joining instructions for MASK. */ 408 /* Emit joining instructions for MASK. */
412 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); 417 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
413 418
414 /* Emit joining for all non-call pars to ensure there's a single 419 /* Emit joining for all non-call pars to ensure there's a single
415 predecessor for the block the join insn ends up in. This is 420 predecessor for the block the join insn ends up in. This is
416 needed for skipping entire loops. */ 421 needed for skipping entire loops. */
417 if (!is_call) 422 emit_insn (gen_nvptx_joining (op));
418 emit_insn (gen_nvptx_joining (op));
419 emit_insn (gen_nvptx_join (op)); 423 emit_insn (gen_nvptx_join (op));
420 } 424 }
421 } 425 }
422 426
423 427
1893 init_frag.offset = 0; 1897 init_frag.offset = 0;
1894 init_frag.remaining--; 1898 init_frag.remaining--;
1895 1899
1896 if (sym) 1900 if (sym)
1897 { 1901 {
1898 fprintf (asm_out_file, "generic("); 1902 bool function = (SYMBOL_REF_DECL (sym)
1903 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1904 if (!function)
1905 fprintf (asm_out_file, "generic(");
1899 output_address (VOIDmode, sym); 1906 output_address (VOIDmode, sym);
1900 fprintf (asm_out_file, val ? ") + " : ")"); 1907 if (!function)
1908 fprintf (asm_out_file, ")");
1909 if (val)
1910 fprintf (asm_out_file, " + ");
1901 } 1911 }
1902 1912
1903 if (!sym || val) 1913 if (!sym || val)
1904 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val); 1914 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1905 } 1915 }
2008 { 2018 {
2009 for (unsigned HOST_WIDE_INT i = 0; i < size; i++) 2019 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2010 nvptx_assemble_value (str[i], 1); 2020 nvptx_assemble_value (str[i], 1);
2011 } 2021 }
2012 2022
2023 /* Return true if TYPE is a record type where the last field is an array without
2024 given dimension. */
2025
2026 static bool
2027 flexible_array_member_type_p (const_tree type)
2028 {
2029 if (TREE_CODE (type) != RECORD_TYPE)
2030 return false;
2031
2032 const_tree last_field = NULL_TREE;
2033 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2034 last_field = f;
2035
2036 if (!last_field)
2037 return false;
2038
2039 const_tree last_field_type = TREE_TYPE (last_field);
2040 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2041 return false;
2042
2043 return (! TYPE_DOMAIN (last_field_type)
2044 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2045 }
2046
2013 /* Emit a PTX variable decl and prepare for emission of its 2047 /* Emit a PTX variable decl and prepare for emission of its
2014 initializer. NAME is the symbol name and SECTION the PTX data 2048 initializer. NAME is the symbol name and SECTION the PTX data
2015 area. The type is TYPE, object size SIZE and alignment is ALIGN. 2049 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2016 The caller has already emitted any indentation and linkage 2050 The caller has already emitted any indentation and linkage
2017 specifier. It is responsible for any initializer, terminating ; 2051 specifier. It is responsible for any initializer, terminating ;
2018 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly 2052 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2019 this is the opposite way round that PTX wants them! */ 2053 this is the opposite way round that PTX wants them! */
2020 2054
2021 static void 2055 static void
2022 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, 2056 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2023 const_tree type, HOST_WIDE_INT size, unsigned align) 2057 const_tree type, HOST_WIDE_INT size, unsigned align,
2024 { 2058 bool undefined = false)
2059 {
2060 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2061 && (TYPE_DOMAIN (type) == NULL_TREE);
2062
2063 if (undefined && flexible_array_member_type_p (type))
2064 {
2065 size = 0;
2066 atype = true;
2067 }
2068
2025 while (TREE_CODE (type) == ARRAY_TYPE) 2069 while (TREE_CODE (type) == ARRAY_TYPE)
2026 type = TREE_TYPE (type); 2070 type = TREE_TYPE (type);
2027 2071
2028 if (TREE_CODE (type) == VECTOR_TYPE 2072 if (TREE_CODE (type) == VECTOR_TYPE
2029 || TREE_CODE (type) == COMPLEX_TYPE) 2073 || TREE_CODE (type) == COMPLEX_TYPE)
2059 2103
2060 if (size) 2104 if (size)
2061 /* We make everything an array, to simplify any initialization 2105 /* We make everything an array, to simplify any initialization
2062 emission. */ 2106 emission. */
2063 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining); 2107 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2108 else if (atype)
2109 fprintf (file, "[]");
2064 } 2110 }
2065 2111
2066 /* Called when the initializer for a decl has been completely output through 2112 /* Called when the initializer for a decl has been completely output through
2067 combinations of the three functions above. */ 2113 combinations of the three functions above. */
2068 2114
2154 2200
2155 fprintf (file, "\t.extern "); 2201 fprintf (file, "\t.extern ");
2156 tree size = DECL_SIZE_UNIT (decl); 2202 tree size = DECL_SIZE_UNIT (decl);
2157 nvptx_assemble_decl_begin (file, name, section_for_decl (decl), 2203 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2158 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0, 2204 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2159 DECL_ALIGN (decl)); 2205 DECL_ALIGN (decl), true);
2160 nvptx_assemble_decl_end (); 2206 nvptx_assemble_decl_end ();
2161 } 2207 }
2162 2208
2163 /* Output a pattern for a move instruction. */ 2209 /* Output a pattern for a move instruction. */
2164 2210
3037 3083
3038 gcc_assert (mask); 3084 gcc_assert (mask);
3039 par = new parallel (par, mask); 3085 par = new parallel (par, mask);
3040 par->forked_block = block; 3086 par->forked_block = block;
3041 par->forked_insn = end; 3087 par->forked_insn = end;
3042 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) 3088 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3043 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3044 par->fork_insn 3089 par->fork_insn
3045 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); 3090 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3046 } 3091 }
3047 break; 3092 break;
3048 3093
3053 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); 3098 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3054 3099
3055 gcc_assert (par->mask == mask); 3100 gcc_assert (par->mask == mask);
3056 par->join_block = block; 3101 par->join_block = block;
3057 par->join_insn = end; 3102 par->join_insn = end;
3058 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) 3103 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3059 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3060 par->joining_insn 3104 par->joining_insn
3061 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); 3105 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3062 par = par->parent; 3106 par = par->parent;
3063 } 3107 }
3064 break; 3108 break;
3733 } 3777 }
3734 3778
3735 #undef BB_SET_SESE 3779 #undef BB_SET_SESE
3736 #undef BB_GET_SESE 3780 #undef BB_GET_SESE
3737 3781
3738 /* Propagate live state at the start of a partitioned region. BLOCK 3782 /* Propagate live state at the start of a partitioned region. IS_CALL
3739 provides the live register information, and might not contain 3783 indicates whether the propagation is for a (partitioned) call
3740 INSN. Propagation is inserted just after INSN. RW indicates whether 3784 instruction. BLOCK provides the live register information, and
3741 we are reading and/or writing state. This 3785 might not contain INSN. Propagation is inserted just after INSN. RW
3786 indicates whether we are reading and/or writing state. This
3742 separation is needed for worker-level propagation where we 3787 separation is needed for worker-level propagation where we
3743 essentially do a spill & fill. FN is the underlying worker 3788 essentially do a spill & fill. FN is the underlying worker
3744 function to generate the propagation instructions for single 3789 function to generate the propagation instructions for single
3745 register. DATA is user data. 3790 register. DATA is user data.
3746 3791
3747 We propagate the live register set and the entire frame. We could 3792 Returns true if we didn't emit any instructions.
3748 do better by (a) propagating just the live set that is used within 3793
3749 the partitioned regions and (b) only propagating stack entries that 3794 We propagate the live register set for non-calls and the entire
3750 are used. The latter might be quite hard to determine. */ 3795 frame for calls and non-calls. We could do better by (a)
3796 propagating just the live set that is used within the partitioned
3797 regions and (b) only propagating stack entries that are used. The
3798 latter might be quite hard to determine. */
3751 3799
3752 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *); 3800 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3753 3801
3754 static void 3802 static bool
3755 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, 3803 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3756 propagator_fn fn, void *data) 3804 propagate_mask rw, propagator_fn fn, void *data)
3757 { 3805 {
3758 bitmap live = DF_LIVE_IN (block); 3806 bitmap live = DF_LIVE_IN (block);
3759 bitmap_iterator iterator; 3807 bitmap_iterator iterator;
3760 unsigned ix; 3808 unsigned ix;
3809 bool empty = true;
3761 3810
3762 /* Copy the frame array. */ 3811 /* Copy the frame array. */
3763 HOST_WIDE_INT fs = get_frame_size (); 3812 HOST_WIDE_INT fs = get_frame_size ();
3764 if (fs) 3813 if (fs)
3765 { 3814 {
3767 rtx idx = NULL_RTX; 3816 rtx idx = NULL_RTX;
3768 rtx ptr = gen_reg_rtx (Pmode); 3817 rtx ptr = gen_reg_rtx (Pmode);
3769 rtx pred = NULL_RTX; 3818 rtx pred = NULL_RTX;
3770 rtx_code_label *label = NULL; 3819 rtx_code_label *label = NULL;
3771 3820
3821 empty = false;
3772 /* The frame size might not be DImode compatible, but the frame 3822 /* The frame size might not be DImode compatible, but the frame
3773 array's declaration will be. So it's ok to round up here. */ 3823 array's declaration will be. So it's ok to round up here. */
3774 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode); 3824 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3775 /* Detect single iteration loop. */ 3825 /* Detect single iteration loop. */
3776 if (fs == 1) 3826 if (fs == 1)
3813 rtx cpy = get_insns (); 3863 rtx cpy = get_insns ();
3814 end_sequence (); 3864 end_sequence ();
3815 insn = emit_insn_after (cpy, insn); 3865 insn = emit_insn_after (cpy, insn);
3816 } 3866 }
3817 3867
3818 /* Copy live registers. */ 3868 if (!is_call)
3819 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator) 3869 /* Copy live registers. */
3820 { 3870 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3821 rtx reg = regno_reg_rtx[ix]; 3871 {
3822 3872 rtx reg = regno_reg_rtx[ix];
3823 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) 3873
3824 { 3874 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3825 rtx bcast = fn (reg, rw, 0, data); 3875 {
3826 3876 rtx bcast = fn (reg, rw, 0, data);
3827 insn = emit_insn_after (bcast, insn); 3877
3828 } 3878 insn = emit_insn_after (bcast, insn);
3829 } 3879 empty = false;
3880 }
3881 }
3882 return empty;
3830 } 3883 }
3831 3884
3832 /* Worker for nvptx_vpropagate. */ 3885 /* Worker for nvptx_vpropagate. */
3833 3886
3834 static rtx 3887 static rtx
3840 3893
3841 return nvptx_gen_vcast (reg); 3894 return nvptx_gen_vcast (reg);
3842 } 3895 }
3843 3896
3844 /* Propagate state that is live at start of BLOCK across the vectors 3897 /* Propagate state that is live at start of BLOCK across the vectors
3845 of a single warp. Propagation is inserted just after INSN. */ 3898 of a single warp. Propagation is inserted just after INSN.
3846 3899 IS_CALL and return as for nvptx_propagate. */
3847 static void 3900
3848 nvptx_vpropagate (basic_block block, rtx_insn *insn) 3901 static bool
3849 { 3902 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
3850 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0); 3903 {
3904 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
3851 } 3905 }
3852 3906
3853 /* Worker for nvptx_wpropagate. */ 3907 /* Worker for nvptx_wpropagate. */
3854 3908
3855 static rtx 3909 static rtx
3881 } 3935 }
3882 3936
3883 /* Spill or fill live state that is live at start of BLOCK. PRE_P 3937 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3884 indicates if this is just before partitioned mode (do spill), or 3938 indicates if this is just before partitioned mode (do spill), or
3885 just after it starts (do fill). Sequence is inserted just after 3939 just after it starts (do fill). Sequence is inserted just after
3886 INSN. */ 3940 INSN. IS_CALL and return as for nvptx_propagate. */
3887 3941
3888 static void 3942 static bool
3889 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) 3943 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
3890 { 3944 {
3891 wcast_data_t data; 3945 wcast_data_t data;
3892 3946
3893 data.base = gen_reg_rtx (Pmode); 3947 data.base = gen_reg_rtx (Pmode);
3894 data.offset = 0; 3948 data.offset = 0;
3895 data.ptr = NULL_RTX; 3949 data.ptr = NULL_RTX;
3896 3950
3897 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data); 3951 bool empty = nvptx_propagate (is_call, block, insn,
3952 pre_p ? PM_read : PM_write, wprop_gen, &data);
3953 gcc_assert (empty == !data.offset);
3898 if (data.offset) 3954 if (data.offset)
3899 { 3955 {
3900 /* Stuff was emitted, initialize the base pointer now. */ 3956 /* Stuff was emitted, initialize the base pointer now. */
3901 rtx init = gen_rtx_SET (data.base, worker_bcast_sym); 3957 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3902 emit_insn_after (init, insn); 3958 emit_insn_after (init, insn);
3903 3959
3904 if (worker_bcast_size < data.offset) 3960 if (worker_bcast_size < data.offset)
3905 worker_bcast_size = data.offset; 3961 worker_bcast_size = data.offset;
3906 } 3962 }
3963 return empty;
3907 } 3964 }
3908 3965
3909 /* Emit a worker-level synchronization barrier. We use different 3966 /* Emit a worker-level synchronization barrier. We use different
3910 markers for before and after synchronizations. */ 3967 markers for before and after synchronizations. */
3911 3968
3930 return insn; 3987 return insn;
3931 3988
3932 return 0; 3989 return 0;
3933 } 3990 }
3934 #endif 3991 #endif
3992
3993 /* Return true if INSN needs neutering. */
3994
3995 static bool
3996 needs_neutering_p (rtx_insn *insn)
3997 {
3998 if (!INSN_P (insn))
3999 return false;
4000
4001 switch (recog_memoized (insn))
4002 {
4003 case CODE_FOR_nvptx_fork:
4004 case CODE_FOR_nvptx_forked:
4005 case CODE_FOR_nvptx_joining:
4006 case CODE_FOR_nvptx_join:
4007 case CODE_FOR_nvptx_barsync:
4008 return false;
4009 default:
4010 return true;
4011 }
4012 }
4013
4014 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4015
4016 static bool
4017 verify_neutering_jumps (basic_block from,
4018 rtx_insn *vector_jump, rtx_insn *worker_jump,
4019 rtx_insn *vector_label, rtx_insn *worker_label)
4020 {
4021 basic_block bb = from;
4022 rtx_insn *insn = BB_HEAD (bb);
4023 bool seen_worker_jump = false;
4024 bool seen_vector_jump = false;
4025 bool seen_worker_label = false;
4026 bool seen_vector_label = false;
4027 bool worker_neutered = false;
4028 bool vector_neutered = false;
4029 while (true)
4030 {
4031 if (insn == worker_jump)
4032 {
4033 seen_worker_jump = true;
4034 worker_neutered = true;
4035 gcc_assert (!vector_neutered);
4036 }
4037 else if (insn == vector_jump)
4038 {
4039 seen_vector_jump = true;
4040 vector_neutered = true;
4041 }
4042 else if (insn == worker_label)
4043 {
4044 seen_worker_label = true;
4045 gcc_assert (worker_neutered);
4046 worker_neutered = false;
4047 }
4048 else if (insn == vector_label)
4049 {
4050 seen_vector_label = true;
4051 gcc_assert (vector_neutered);
4052 vector_neutered = false;
4053 }
4054 else if (INSN_P (insn))
4055 switch (recog_memoized (insn))
4056 {
4057 case CODE_FOR_nvptx_barsync:
4058 gcc_assert (!vector_neutered && !worker_neutered);
4059 break;
4060 default:
4061 break;
4062 }
4063
4064 if (insn != BB_END (bb))
4065 insn = NEXT_INSN (insn);
4066 else if (JUMP_P (insn) && single_succ_p (bb)
4067 && !seen_vector_jump && !seen_worker_jump)
4068 {
4069 bb = single_succ (bb);
4070 insn = BB_HEAD (bb);
4071 }
4072 else
4073 break;
4074 }
4075
4076 gcc_assert (!(vector_jump && !seen_vector_jump));
4077 gcc_assert (!(worker_jump && !seen_worker_jump));
4078
4079 if (seen_vector_label || seen_worker_label)
4080 {
4081 gcc_assert (!(vector_label && !seen_vector_label));
4082 gcc_assert (!(worker_label && !seen_worker_label));
4083
4084 return true;
4085 }
4086
4087 return false;
4088 }
4089
4090 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4091
4092 static void
4093 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4094 rtx_insn *worker_label)
4095 {
4096 basic_block bb = to;
4097 rtx_insn *insn = BB_END (bb);
4098 bool seen_worker_label = false;
4099 bool seen_vector_label = false;
4100 while (true)
4101 {
4102 if (insn == worker_label)
4103 {
4104 seen_worker_label = true;
4105 gcc_assert (!seen_vector_label);
4106 }
4107 else if (insn == vector_label)
4108 seen_vector_label = true;
4109 else if (INSN_P (insn))
4110 switch (recog_memoized (insn))
4111 {
4112 case CODE_FOR_nvptx_barsync:
4113 gcc_assert (!seen_vector_label && !seen_worker_label);
4114 break;
4115 }
4116
4117 if (insn != BB_HEAD (bb))
4118 insn = PREV_INSN (insn);
4119 else
4120 break;
4121 }
4122
4123 gcc_assert (!(vector_label && !seen_vector_label));
4124 gcc_assert (!(worker_label && !seen_worker_label));
4125 }
3935 4126
3936 /* Single neutering according to MASK. FROM is the incoming block and 4127 /* Single neutering according to MASK. FROM is the incoming block and
3937 TO is the outgoing block. These may be the same block. Insert at 4128 TO is the outgoing block. These may be the same block. Insert at
3938 start of FROM: 4129 start of FROM:
3939 4130
3956 unsigned skip_mask = mask; 4147 unsigned skip_mask = mask;
3957 4148
3958 while (true) 4149 while (true)
3959 { 4150 {
3960 /* Find first insn of from block. */ 4151 /* Find first insn of from block. */
3961 while (head != BB_END (from) && !INSN_P (head)) 4152 while (head != BB_END (from) && !needs_neutering_p (head))
3962 head = NEXT_INSN (head); 4153 head = NEXT_INSN (head);
3963 4154
3964 if (from == to) 4155 if (from == to)
3965 break; 4156 break;
3966 4157
3997 } 4188 }
3998 4189
3999 if (tail == head) 4190 if (tail == head)
4000 { 4191 {
4001 /* If this is empty, do nothing. */ 4192 /* If this is empty, do nothing. */
4002 if (!head || !INSN_P (head)) 4193 if (!head || !needs_neutering_p (head))
4003 return; 4194 return;
4004
4005 /* If this is a dummy insn, do nothing. */
4006 switch (recog_memoized (head))
4007 {
4008 default:
4009 break;
4010 case CODE_FOR_nvptx_fork:
4011 case CODE_FOR_nvptx_forked:
4012 case CODE_FOR_nvptx_joining:
4013 case CODE_FOR_nvptx_join:
4014 return;
4015 }
4016 4195
4017 if (cond_branch) 4196 if (cond_branch)
4018 { 4197 {
4019 /* If we're only doing vector single, there's no need to 4198 /* If we're only doing vector single, there's no need to
4020 emit skip code because we'll not insert anything. */ 4199 emit skip code because we'll not insert anything. */
4027 } 4206 }
4028 4207
4029 /* Insert the vector test inside the worker test. */ 4208 /* Insert the vector test inside the worker test. */
4030 unsigned mode; 4209 unsigned mode;
4031 rtx_insn *before = tail; 4210 rtx_insn *before = tail;
4211 rtx_insn *neuter_start = NULL;
4212 rtx_insn *worker_label = NULL, *vector_label = NULL;
4213 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4032 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) 4214 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4033 if (GOMP_DIM_MASK (mode) & skip_mask) 4215 if (GOMP_DIM_MASK (mode) & skip_mask)
4034 { 4216 {
4035 rtx_code_label *label = gen_label_rtx (); 4217 rtx_code_label *label = gen_label_rtx ();
4036 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; 4218 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4219 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4220 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4037 4221
4038 if (!pred) 4222 if (!pred)
4039 { 4223 {
4040 pred = gen_reg_rtx (BImode); 4224 pred = gen_reg_rtx (BImode);
4041 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred; 4225 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4044 rtx br; 4228 rtx br;
4045 if (mode == GOMP_DIM_VECTOR) 4229 if (mode == GOMP_DIM_VECTOR)
4046 br = gen_br_true (pred, label); 4230 br = gen_br_true (pred, label);
4047 else 4231 else
4048 br = gen_br_true_uni (pred, label); 4232 br = gen_br_true_uni (pred, label);
4049 emit_insn_before (br, head); 4233 if (neuter_start)
4234 neuter_start = emit_insn_after (br, neuter_start);
4235 else
4236 neuter_start = emit_insn_before (br, head);
4237 *mode_jump = neuter_start;
4050 4238
4051 LABEL_NUSES (label)++; 4239 LABEL_NUSES (label)++;
4240 rtx_insn *label_insn;
4052 if (tail_branch) 4241 if (tail_branch)
4053 before = emit_label_before (label, before); 4242 {
4243 label_insn = emit_label_before (label, before);
4244 before = label_insn;
4245 }
4054 else 4246 else
4055 emit_label_after (label, tail); 4247 {
4248 label_insn = emit_label_after (label, tail);
4249 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4250 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4251 emit_insn_after (gen_exit (), label_insn);
4252 }
4253
4254 if (mode == GOMP_DIM_VECTOR)
4255 vector_label = label_insn;
4256 else
4257 worker_label = label_insn;
4056 } 4258 }
4057 4259
4058 /* Now deal with propagating the branch condition. */ 4260 /* Now deal with propagating the branch condition. */
4059 if (cond_branch) 4261 if (cond_branch)
4060 { 4262 {
4090 to threads 1-31, so after the shfl %rcondu32 is defined in threads 4292 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4091 0-31, and after the setp.ne %rcond is defined in threads 0-31. 4293 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4092 4294
4093 There is nothing in the PTX spec to suggest that this is wrong, or 4295 There is nothing in the PTX spec to suggest that this is wrong, or
4094 to explain why the extra initialization is needed. So, we classify 4296 to explain why the extra initialization is needed. So, we classify
4095 it as a JIT bug, and the extra initialization as workaround. */ 4297 it as a JIT bug, and the extra initialization as workaround:
4096 emit_insn_before (gen_movbi (pvar, const0_rtx), 4298
4299 {
4300 .reg .u32 %x;
4301 mov.u32 %x,%tid.x;
4302 setp.ne.u32 %rnotvzero,%x,0;
4303 }
4304
4305 +.reg .pred %rcond2;
4306 +setp.eq.u32 %rcond2, 1, 0;
4307
4308 @%rnotvzero bra Lskip;
4309 setp.<op>.<type> %rcond,op1,op2;
4310 +mov.pred %rcond2, %rcond;
4311 Lskip:
4312 +mov.pred %rcond, %rcond2;
4313 selp.u32 %rcondu32,1,0,%rcond;
4314 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4315 setp.ne.u32 %rcond,%rcondu32,0;
4316 */
4317 rtx_insn *label = PREV_INSN (tail);
4318 gcc_assert (label && LABEL_P (label));
4319 rtx tmp = gen_reg_rtx (BImode);
4320 emit_insn_before (gen_movbi (tmp, const0_rtx),
4097 bb_first_real_insn (from)); 4321 bb_first_real_insn (from));
4322 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4323 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4098 #endif 4324 #endif
4099 emit_insn_before (nvptx_gen_vcast (pvar), tail); 4325 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4100 } 4326 }
4101 else 4327 else
4102 { 4328 {
4126 extract_insn (tail); 4352 extract_insn (tail);
4127 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar), 4353 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4128 UNSPEC_BR_UNIFIED); 4354 UNSPEC_BR_UNIFIED);
4129 validate_change (tail, recog_data.operand_loc[0], unsp, false); 4355 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4130 } 4356 }
4357
4358 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4359 vector_label, worker_label);
4360 if (!seen_label)
4361 verify_neutering_labels (to, vector_label, worker_label);
4131 } 4362 }
4132 4363
4133 /* PAR is a parallel that is being skipped in its entirety according to 4364 /* PAR is a parallel that is being skipped in its entirety according to
4134 MASK. Treat this as skipping a superblock starting at forked 4365 MASK. Treat this as skipping a superblock starting at forked
4135 and ending at joining. */ 4366 and ending at joining. */
4226 { 4457 {
4227 par->inner_mask = nvptx_process_pars (par->inner); 4458 par->inner_mask = nvptx_process_pars (par->inner);
4228 inner_mask |= par->inner_mask; 4459 inner_mask |= par->inner_mask;
4229 } 4460 }
4230 4461
4231 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) 4462 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4232 /* No propagation needed for a call. */; 4463
4233 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) 4464 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4234 { 4465 {
4235 nvptx_wpropagate (false, par->forked_block, par->forked_insn); 4466 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
4236 nvptx_wpropagate (true, par->forked_block, par->fork_insn); 4467 bool empty = nvptx_wpropagate (true, is_call,
4237 /* Insert begin and end synchronizations. */ 4468 par->forked_block, par->fork_insn);
4238 emit_insn_after (nvptx_wsync (false), par->forked_insn); 4469
4239 emit_insn_before (nvptx_wsync (true), par->joining_insn); 4470 if (!empty || !is_call)
4471 {
4472 /* Insert begin and end synchronizations. */
4473 emit_insn_before (nvptx_wsync (false), par->forked_insn);
4474 emit_insn_before (nvptx_wsync (true), par->join_insn);
4475 }
4240 } 4476 }
4241 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) 4477 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4242 nvptx_vpropagate (par->forked_block, par->forked_insn); 4478 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
4243 4479
4244 /* Now do siblings. */ 4480 /* Now do siblings. */
4245 if (par->next) 4481 if (par->next)
4246 inner_mask |= nvptx_process_pars (par->next); 4482 inner_mask |= nvptx_process_pars (par->next);
4247 return inner_mask; 4483 return inner_mask;
4322 4558
4323 if (par->next) 4559 if (par->next)
4324 nvptx_neuter_pars (par->next, modes, outer); 4560 nvptx_neuter_pars (par->next, modes, outer);
4325 } 4561 }
4326 4562
4563 #if WORKAROUND_PTXJIT_BUG_2
4564 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4565 is needed in the nvptx target because the branches generated for
4566 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4567
4568 static rtx
4569 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4570 {
4571 rtx pat;
4572 if ((strict && !JUMP_P (insn))
4573 || (!strict && !INSN_P (insn)))
4574 return NULL_RTX;
4575 pat = PATTERN (insn);
4576
4577 /* The set is allowed to appear either as the insn pattern or
4578 the first set in a PARALLEL. */
4579 if (GET_CODE (pat) == PARALLEL)
4580 pat = XVECEXP (pat, 0, 0);
4581 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4582 return pat;
4583
4584 return NULL_RTX;
4585 }
4586
4587 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4588
4589 static rtx
4590 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4591 {
4592 rtx x = nvptx_pc_set (insn, strict);
4593
4594 if (!x)
4595 return NULL_RTX;
4596 x = SET_SRC (x);
4597 if (GET_CODE (x) == LABEL_REF)
4598 return x;
4599 if (GET_CODE (x) != IF_THEN_ELSE)
4600 return NULL_RTX;
4601 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4602 return XEXP (x, 1);
4603 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4604 return XEXP (x, 2);
4605 return NULL_RTX;
4606 }
4607
4608 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4609 insn in between the branch and the label. This works around a JIT bug
4610 observed at driver version 384.111, at -O0 for sm_50. */
4611
4612 static void
4613 prevent_branch_around_nothing (void)
4614 {
4615 rtx_insn *seen_label = NULL;
4616 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4617 {
4618 if (INSN_P (insn) && condjump_p (insn))
4619 {
4620 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4621 continue;
4622 }
4623
4624 if (seen_label == NULL)
4625 continue;
4626
4627 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4628 continue;
4629
4630 if (INSN_P (insn))
4631 switch (recog_memoized (insn))
4632 {
4633 case CODE_FOR_nvptx_fork:
4634 case CODE_FOR_nvptx_forked:
4635 case CODE_FOR_nvptx_joining:
4636 case CODE_FOR_nvptx_join:
4637 continue;
4638 default:
4639 seen_label = NULL;
4640 continue;
4641 }
4642
4643 if (LABEL_P (insn) && insn == seen_label)
4644 emit_insn_before (gen_fake_nop (), insn);
4645
4646 seen_label = NULL;
4647 }
4648 }
4649 #endif
4650
4651 #ifdef WORKAROUND_PTXJIT_BUG_3
4652 /* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This
4653 works around a hang observed at driver version 390.48 for sm_50. */
4654
4655 static void
4656 workaround_barsyncs (void)
4657 {
4658 bool seen_barsync = false;
4659 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4660 {
4661 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4662 {
4663 if (seen_barsync)
4664 {
4665 emit_insn_before (gen_nvptx_membar_cta (), insn);
4666 emit_insn_before (gen_nvptx_membar_cta (), insn);
4667 }
4668
4669 seen_barsync = true;
4670 continue;
4671 }
4672
4673 if (!seen_barsync)
4674 continue;
4675
4676 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4677 continue;
4678 else if (INSN_P (insn))
4679 switch (recog_memoized (insn))
4680 {
4681 case CODE_FOR_nvptx_fork:
4682 case CODE_FOR_nvptx_forked:
4683 case CODE_FOR_nvptx_joining:
4684 case CODE_FOR_nvptx_join:
4685 continue;
4686 default:
4687 break;
4688 }
4689
4690 seen_barsync = false;
4691 }
4692 }
4693 #endif
4694
4327 /* PTX-specific reorganization 4695 /* PTX-specific reorganization
4328 - Split blocks at fork and join instructions 4696 - Split blocks at fork and join instructions
4329 - Compute live registers 4697 - Compute live registers
4330 - Mark now-unused registers, so function begin doesn't declare 4698 - Mark now-unused registers, so function begin doesn't declare
4331 unused registers. 4699 unused registers.
4401 nvptx_reorg_subreg (); 4769 nvptx_reorg_subreg ();
4402 4770
4403 if (TARGET_UNIFORM_SIMT) 4771 if (TARGET_UNIFORM_SIMT)
4404 nvptx_reorg_uniform_simt (); 4772 nvptx_reorg_uniform_simt ();
4405 4773
4774 #if WORKAROUND_PTXJIT_BUG_2
4775 prevent_branch_around_nothing ();
4776 #endif
4777
4778 #ifdef WORKAROUND_PTXJIT_BUG_3
4779 workaround_barsyncs ();
4780 #endif
4781
4406 regstat_free_n_sets_and_refs (); 4782 regstat_free_n_sets_and_refs ();
4407 4783
4408 df_finish_pass (true); 4784 df_finish_pass (true);
4409 } 4785 }
4410 4786
4455 } 4831 }
4456 4832
4457 /* Table of valid machine attributes. */ 4833 /* Table of valid machine attributes. */
4458 static const struct attribute_spec nvptx_attribute_table[] = 4834 static const struct attribute_spec nvptx_attribute_table[] =
4459 { 4835 {
4460 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler, 4836 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4461 affects_type_identity } */ 4837 affects_type_identity, handler, exclude } */
4462 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false }, 4838 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4463 { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false }, 4839 NULL },
4464 { NULL, 0, 0, false, false, false, NULL, false } 4840 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4841 NULL },
4842 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4465 }; 4843 };
4466 4844
4467 /* Limit vector alignments to BIGGEST_ALIGNMENT. */ 4845 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4468 4846
4469 static HOST_WIDE_INT 4847 static HOST_WIDE_INT
4551 static void 4929 static void
4552 nvptx_file_start (void) 4930 nvptx_file_start (void)
4553 { 4931 {
4554 fputs ("// BEGIN PREAMBLE\n", asm_out_file); 4932 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4555 fputs ("\t.version\t3.1\n", asm_out_file); 4933 fputs ("\t.version\t3.1\n", asm_out_file);
4556 fputs ("\t.target\tsm_30\n", asm_out_file); 4934 if (TARGET_SM35)
4935 fputs ("\t.target\tsm_35\n", asm_out_file);
4936 else
4937 fputs ("\t.target\tsm_30\n", asm_out_file);
4557 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); 4938 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4558 fputs ("// END PREAMBLE\n", asm_out_file); 4939 fputs ("// END PREAMBLE\n", asm_out_file);
4559 } 4940 }
4560 4941
4561 /* Emit a declaration for a worker-level buffer in .shared memory. */ 4942 /* Emit a declaration for a worker-level buffer in .shared memory. */
4785 } 5166 }
4786 5167
4787 /* Define dimension sizes for known hardware. */ 5168 /* Define dimension sizes for known hardware. */
4788 #define PTX_VECTOR_LENGTH 32 5169 #define PTX_VECTOR_LENGTH 32
4789 #define PTX_WORKER_LENGTH 32 5170 #define PTX_WORKER_LENGTH 32
4790 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */ 5171 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
4791 5172
4792 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ 5173 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4793 5174
4794 static int 5175 static int
4795 nvptx_simt_vf () 5176 nvptx_simt_vf ()
4834 5215
4835 if (!decl) 5216 if (!decl)
4836 { 5217 {
4837 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; 5218 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4838 if (dims[GOMP_DIM_WORKER] < 0) 5219 if (dims[GOMP_DIM_WORKER] < 0)
4839 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; 5220 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
4840 if (dims[GOMP_DIM_GANG] < 0) 5221 if (dims[GOMP_DIM_GANG] < 0)
4841 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT; 5222 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
4842 changed = true; 5223 changed = true;
4843 } 5224 }
4844 5225
4845 return changed; 5226 return changed;
4846 } 5227 }
4850 static int 5231 static int
4851 nvptx_dim_limit (int axis) 5232 nvptx_dim_limit (int axis)
4852 { 5233 {
4853 switch (axis) 5234 switch (axis)
4854 { 5235 {
4855 case GOMP_DIM_WORKER:
4856 return PTX_WORKER_LENGTH;
4857
4858 case GOMP_DIM_VECTOR: 5236 case GOMP_DIM_VECTOR:
4859 return PTX_VECTOR_LENGTH; 5237 return PTX_VECTOR_LENGTH;
4860 5238
4861 default: 5239 default:
4862 break; 5240 break;
5671 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs 6049 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5672 6050
5673 #undef TARGET_CAN_CHANGE_MODE_CLASS 6051 #undef TARGET_CAN_CHANGE_MODE_CLASS
5674 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class 6052 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5675 6053
6054 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6055 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6056
5676 struct gcc_target targetm = TARGET_INITIALIZER; 6057 struct gcc_target targetm = TARGET_INITIALIZER;
5677 6058
5678 #include "gt-nvptx.h" 6059 #include "gt-nvptx.h"