Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/nvptx/nvptx.c @ 131:84e7813d76e9
gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 07:37:49 +0900 |
parents | 04ced10e8804 |
children | 1830386684a0 |
comparison
equal
deleted
inserted
replaced
111:04ced10e8804 | 131:84e7813d76e9 |
---|---|
1 /* Target code for NVPTX. | 1 /* Target code for NVPTX. |
2 Copyright (C) 2014-2017 Free Software Foundation, Inc. | 2 Copyright (C) 2014-2018 Free Software Foundation, Inc. |
3 Contributed by Bernd Schmidt <bernds@codesourcery.com> | 3 Contributed by Bernd Schmidt <bernds@codesourcery.com> |
4 | 4 |
5 This file is part of GCC. | 5 This file is part of GCC. |
6 | 6 |
7 GCC is free software; you can redistribute it and/or modify it | 7 GCC is free software; you can redistribute it and/or modify it |
15 License for more details. | 15 License for more details. |
16 | 16 |
17 You should have received a copy of the GNU General Public License | 17 You should have received a copy of the GNU General Public License |
18 along with GCC; see the file COPYING3. If not see | 18 along with GCC; see the file COPYING3. If not see |
19 <http://www.gnu.org/licenses/>. */ | 19 <http://www.gnu.org/licenses/>. */ |
20 | |
21 #define IN_TARGET_CODE 1 | |
20 | 22 |
21 #include "config.h" | 23 #include "config.h" |
22 #include <sstream> | 24 #include <sstream> |
23 #include "system.h" | 25 #include "system.h" |
24 #include "coretypes.h" | 26 #include "coretypes.h" |
74 | 76 |
75 /* This file should be included last. */ | 77 /* This file should be included last. */ |
76 #include "target-def.h" | 78 #include "target-def.h" |
77 | 79 |
78 #define WORKAROUND_PTXJIT_BUG 1 | 80 #define WORKAROUND_PTXJIT_BUG 1 |
81 #define WORKAROUND_PTXJIT_BUG_2 1 | |
82 #define WORKAROUND_PTXJIT_BUG_3 1 | |
79 | 83 |
80 /* The various PTX memory areas an object might reside in. */ | 84 /* The various PTX memory areas an object might reside in. */ |
81 enum nvptx_data_area | 85 enum nvptx_data_area |
82 { | 86 { |
83 DATA_AREA_GENERIC, | 87 DATA_AREA_GENERIC, |
172 /* Set toplevel_reorder, unless explicitly disabled. We need | 176 /* Set toplevel_reorder, unless explicitly disabled. We need |
173 reordering so that we emit necessary assembler decls of | 177 reordering so that we emit necessary assembler decls of |
174 undeclared variables. */ | 178 undeclared variables. */ |
175 if (!global_options_set.x_flag_toplevel_reorder) | 179 if (!global_options_set.x_flag_toplevel_reorder) |
176 flag_toplevel_reorder = 1; | 180 flag_toplevel_reorder = 1; |
181 | |
182 debug_nonbind_markers_p = 0; | |
177 | 183 |
178 /* Set flag_no_common, unless explicitly disabled. We fake common | 184 /* Set flag_no_common, unless explicitly disabled. We fake common |
179 using .weak, and that's not entirely accurate, so avoid it | 185 using .weak, and that's not entirely accurate, so avoid it |
180 unless forced. */ | 186 unless forced. */ |
181 if (!global_options_set.x_flag_no_common) | 187 if (!global_options_set.x_flag_no_common) |
392 | 398 |
393 /* Emit fork at all levels. This helps form SESE regions, as | 399 /* Emit fork at all levels. This helps form SESE regions, as |
394 it creates a block with a single successor before entering a | 400 it creates a block with a single successor before entering a |
395 partitioned region. That is a good candidate for the end of | 401 partitioned region. That is a good candidate for the end of |
396 an SESE region. */ | 402 an SESE region. */ |
397 if (!is_call) | 403 emit_insn (gen_nvptx_fork (op)); |
398 emit_insn (gen_nvptx_fork (op)); | |
399 emit_insn (gen_nvptx_forked (op)); | 404 emit_insn (gen_nvptx_forked (op)); |
400 } | 405 } |
401 } | 406 } |
402 | 407 |
403 /* Emit joining instructions for MASK. */ | 408 /* Emit joining instructions for MASK. */ |
412 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); | 417 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); |
413 | 418 |
414 /* Emit joining for all non-call pars to ensure there's a single | 419 /* Emit joining for all non-call pars to ensure there's a single |
415 predecessor for the block the join insn ends up in. This is | 420 predecessor for the block the join insn ends up in. This is |
416 needed for skipping entire loops. */ | 421 needed for skipping entire loops. */ |
417 if (!is_call) | 422 emit_insn (gen_nvptx_joining (op)); |
418 emit_insn (gen_nvptx_joining (op)); | |
419 emit_insn (gen_nvptx_join (op)); | 423 emit_insn (gen_nvptx_join (op)); |
420 } | 424 } |
421 } | 425 } |
422 | 426 |
423 | 427 |
1893 init_frag.offset = 0; | 1897 init_frag.offset = 0; |
1894 init_frag.remaining--; | 1898 init_frag.remaining--; |
1895 | 1899 |
1896 if (sym) | 1900 if (sym) |
1897 { | 1901 { |
1898 fprintf (asm_out_file, "generic("); | 1902 bool function = (SYMBOL_REF_DECL (sym) |
1903 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL)); | |
1904 if (!function) | |
1905 fprintf (asm_out_file, "generic("); | |
1899 output_address (VOIDmode, sym); | 1906 output_address (VOIDmode, sym); |
1900 fprintf (asm_out_file, val ? ") + " : ")"); | 1907 if (!function) |
1908 fprintf (asm_out_file, ")"); | |
1909 if (val) | |
1910 fprintf (asm_out_file, " + "); | |
1901 } | 1911 } |
1902 | 1912 |
1903 if (!sym || val) | 1913 if (!sym || val) |
1904 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val); | 1914 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val); |
1905 } | 1915 } |
2008 { | 2018 { |
2009 for (unsigned HOST_WIDE_INT i = 0; i < size; i++) | 2019 for (unsigned HOST_WIDE_INT i = 0; i < size; i++) |
2010 nvptx_assemble_value (str[i], 1); | 2020 nvptx_assemble_value (str[i], 1); |
2011 } | 2021 } |
2012 | 2022 |
2023 /* Return true if TYPE is a record type where the last field is an array without | |
2024 given dimension. */ | |
2025 | |
2026 static bool | |
2027 flexible_array_member_type_p (const_tree type) | |
2028 { | |
2029 if (TREE_CODE (type) != RECORD_TYPE) | |
2030 return false; | |
2031 | |
2032 const_tree last_field = NULL_TREE; | |
2033 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f)) | |
2034 last_field = f; | |
2035 | |
2036 if (!last_field) | |
2037 return false; | |
2038 | |
2039 const_tree last_field_type = TREE_TYPE (last_field); | |
2040 if (TREE_CODE (last_field_type) != ARRAY_TYPE) | |
2041 return false; | |
2042 | |
2043 return (! TYPE_DOMAIN (last_field_type) | |
2044 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type))); | |
2045 } | |
2046 | |
2013 /* Emit a PTX variable decl and prepare for emission of its | 2047 /* Emit a PTX variable decl and prepare for emission of its |
2014 initializer. NAME is the symbol name and SECTION the PTX data | 2048 initializer. NAME is the symbol name and SECTION the PTX data |
2015 area. The type is TYPE, object size SIZE and alignment is ALIGN. | 2049 area. The type is TYPE, object size SIZE and alignment is ALIGN. |
2016 The caller has already emitted any indentation and linkage | 2050 The caller has already emitted any indentation and linkage |
2017 specifier. It is responsible for any initializer, terminating ; | 2051 specifier. It is responsible for any initializer, terminating ; |
2018 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly | 2052 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly |
2019 this is the opposite way round that PTX wants them! */ | 2053 this is the opposite way round that PTX wants them! */ |
2020 | 2054 |
2021 static void | 2055 static void |
2022 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, | 2056 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, |
2023 const_tree type, HOST_WIDE_INT size, unsigned align) | 2057 const_tree type, HOST_WIDE_INT size, unsigned align, |
2024 { | 2058 bool undefined = false) |
2059 { | |
2060 bool atype = (TREE_CODE (type) == ARRAY_TYPE) | |
2061 && (TYPE_DOMAIN (type) == NULL_TREE); | |
2062 | |
2063 if (undefined && flexible_array_member_type_p (type)) | |
2064 { | |
2065 size = 0; | |
2066 atype = true; | |
2067 } | |
2068 | |
2025 while (TREE_CODE (type) == ARRAY_TYPE) | 2069 while (TREE_CODE (type) == ARRAY_TYPE) |
2026 type = TREE_TYPE (type); | 2070 type = TREE_TYPE (type); |
2027 | 2071 |
2028 if (TREE_CODE (type) == VECTOR_TYPE | 2072 if (TREE_CODE (type) == VECTOR_TYPE |
2029 || TREE_CODE (type) == COMPLEX_TYPE) | 2073 || TREE_CODE (type) == COMPLEX_TYPE) |
2059 | 2103 |
2060 if (size) | 2104 if (size) |
2061 /* We make everything an array, to simplify any initialization | 2105 /* We make everything an array, to simplify any initialization |
2062 emission. */ | 2106 emission. */ |
2063 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining); | 2107 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining); |
2108 else if (atype) | |
2109 fprintf (file, "[]"); | |
2064 } | 2110 } |
2065 | 2111 |
2066 /* Called when the initializer for a decl has been completely output through | 2112 /* Called when the initializer for a decl has been completely output through |
2067 combinations of the three functions above. */ | 2113 combinations of the three functions above. */ |
2068 | 2114 |
2154 | 2200 |
2155 fprintf (file, "\t.extern "); | 2201 fprintf (file, "\t.extern "); |
2156 tree size = DECL_SIZE_UNIT (decl); | 2202 tree size = DECL_SIZE_UNIT (decl); |
2157 nvptx_assemble_decl_begin (file, name, section_for_decl (decl), | 2203 nvptx_assemble_decl_begin (file, name, section_for_decl (decl), |
2158 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0, | 2204 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0, |
2159 DECL_ALIGN (decl)); | 2205 DECL_ALIGN (decl), true); |
2160 nvptx_assemble_decl_end (); | 2206 nvptx_assemble_decl_end (); |
2161 } | 2207 } |
2162 | 2208 |
2163 /* Output a pattern for a move instruction. */ | 2209 /* Output a pattern for a move instruction. */ |
2164 | 2210 |
3037 | 3083 |
3038 gcc_assert (mask); | 3084 gcc_assert (mask); |
3039 par = new parallel (par, mask); | 3085 par = new parallel (par, mask); |
3040 par->forked_block = block; | 3086 par->forked_block = block; |
3041 par->forked_insn = end; | 3087 par->forked_insn = end; |
3042 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) | 3088 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
3043 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) | |
3044 par->fork_insn | 3089 par->fork_insn |
3045 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); | 3090 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); |
3046 } | 3091 } |
3047 break; | 3092 break; |
3048 | 3093 |
3053 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); | 3098 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); |
3054 | 3099 |
3055 gcc_assert (par->mask == mask); | 3100 gcc_assert (par->mask == mask); |
3056 par->join_block = block; | 3101 par->join_block = block; |
3057 par->join_insn = end; | 3102 par->join_insn = end; |
3058 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) | 3103 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
3059 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) | |
3060 par->joining_insn | 3104 par->joining_insn |
3061 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); | 3105 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); |
3062 par = par->parent; | 3106 par = par->parent; |
3063 } | 3107 } |
3064 break; | 3108 break; |
3733 } | 3777 } |
3734 | 3778 |
3735 #undef BB_SET_SESE | 3779 #undef BB_SET_SESE |
3736 #undef BB_GET_SESE | 3780 #undef BB_GET_SESE |
3737 | 3781 |
3738 /* Propagate live state at the start of a partitioned region. BLOCK | 3782 /* Propagate live state at the start of a partitioned region. IS_CALL |
3739 provides the live register information, and might not contain | 3783 indicates whether the propagation is for a (partitioned) call |
3740 INSN. Propagation is inserted just after INSN. RW indicates whether | 3784 instruction. BLOCK provides the live register information, and |
3741 we are reading and/or writing state. This | 3785 might not contain INSN. Propagation is inserted just after INSN. RW |
3786 indicates whether we are reading and/or writing state. This | |
3742 separation is needed for worker-level propagation where we | 3787 separation is needed for worker-level propagation where we |
3743 essentially do a spill & fill. FN is the underlying worker | 3788 essentially do a spill & fill. FN is the underlying worker |
3744 function to generate the propagation instructions for single | 3789 function to generate the propagation instructions for single |
3745 register. DATA is user data. | 3790 register. DATA is user data. |
3746 | 3791 |
3747 We propagate the live register set and the entire frame. We could | 3792 Returns true if we didn't emit any instructions. |
3748 do better by (a) propagating just the live set that is used within | 3793 |
3749 the partitioned regions and (b) only propagating stack entries that | 3794 We propagate the live register set for non-calls and the entire |
3750 are used. The latter might be quite hard to determine. */ | 3795 frame for calls and non-calls. We could do better by (a) |
3796 propagating just the live set that is used within the partitioned | |
3797 regions and (b) only propagating stack entries that are used. The | |
3798 latter might be quite hard to determine. */ | |
3751 | 3799 |
3752 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *); | 3800 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *); |
3753 | 3801 |
3754 static void | 3802 static bool |
3755 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, | 3803 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, |
3756 propagator_fn fn, void *data) | 3804 propagate_mask rw, propagator_fn fn, void *data) |
3757 { | 3805 { |
3758 bitmap live = DF_LIVE_IN (block); | 3806 bitmap live = DF_LIVE_IN (block); |
3759 bitmap_iterator iterator; | 3807 bitmap_iterator iterator; |
3760 unsigned ix; | 3808 unsigned ix; |
3809 bool empty = true; | |
3761 | 3810 |
3762 /* Copy the frame array. */ | 3811 /* Copy the frame array. */ |
3763 HOST_WIDE_INT fs = get_frame_size (); | 3812 HOST_WIDE_INT fs = get_frame_size (); |
3764 if (fs) | 3813 if (fs) |
3765 { | 3814 { |
3767 rtx idx = NULL_RTX; | 3816 rtx idx = NULL_RTX; |
3768 rtx ptr = gen_reg_rtx (Pmode); | 3817 rtx ptr = gen_reg_rtx (Pmode); |
3769 rtx pred = NULL_RTX; | 3818 rtx pred = NULL_RTX; |
3770 rtx_code_label *label = NULL; | 3819 rtx_code_label *label = NULL; |
3771 | 3820 |
3821 empty = false; | |
3772 /* The frame size might not be DImode compatible, but the frame | 3822 /* The frame size might not be DImode compatible, but the frame |
3773 array's declaration will be. So it's ok to round up here. */ | 3823 array's declaration will be. So it's ok to round up here. */ |
3774 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode); | 3824 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode); |
3775 /* Detect single iteration loop. */ | 3825 /* Detect single iteration loop. */ |
3776 if (fs == 1) | 3826 if (fs == 1) |
3813 rtx cpy = get_insns (); | 3863 rtx cpy = get_insns (); |
3814 end_sequence (); | 3864 end_sequence (); |
3815 insn = emit_insn_after (cpy, insn); | 3865 insn = emit_insn_after (cpy, insn); |
3816 } | 3866 } |
3817 | 3867 |
3818 /* Copy live registers. */ | 3868 if (!is_call) |
3819 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator) | 3869 /* Copy live registers. */ |
3820 { | 3870 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator) |
3821 rtx reg = regno_reg_rtx[ix]; | 3871 { |
3822 | 3872 rtx reg = regno_reg_rtx[ix]; |
3823 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) | 3873 |
3824 { | 3874 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) |
3825 rtx bcast = fn (reg, rw, 0, data); | 3875 { |
3826 | 3876 rtx bcast = fn (reg, rw, 0, data); |
3827 insn = emit_insn_after (bcast, insn); | 3877 |
3828 } | 3878 insn = emit_insn_after (bcast, insn); |
3829 } | 3879 empty = false; |
3880 } | |
3881 } | |
3882 return empty; | |
3830 } | 3883 } |
3831 | 3884 |
3832 /* Worker for nvptx_vpropagate. */ | 3885 /* Worker for nvptx_vpropagate. */ |
3833 | 3886 |
3834 static rtx | 3887 static rtx |
3840 | 3893 |
3841 return nvptx_gen_vcast (reg); | 3894 return nvptx_gen_vcast (reg); |
3842 } | 3895 } |
3843 | 3896 |
3844 /* Propagate state that is live at start of BLOCK across the vectors | 3897 /* Propagate state that is live at start of BLOCK across the vectors |
3845 of a single warp. Propagation is inserted just after INSN. */ | 3898 of a single warp. Propagation is inserted just after INSN. |
3846 | 3899 IS_CALL and return as for nvptx_propagate. */ |
3847 static void | 3900 |
3848 nvptx_vpropagate (basic_block block, rtx_insn *insn) | 3901 static bool |
3849 { | 3902 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn) |
3850 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0); | 3903 { |
3904 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0); | |
3851 } | 3905 } |
3852 | 3906 |
3853 /* Worker for nvptx_wpropagate. */ | 3907 /* Worker for nvptx_wpropagate. */ |
3854 | 3908 |
3855 static rtx | 3909 static rtx |
3881 } | 3935 } |
3882 | 3936 |
3883 /* Spill or fill live state that is live at start of BLOCK. PRE_P | 3937 /* Spill or fill live state that is live at start of BLOCK. PRE_P |
3884 indicates if this is just before partitioned mode (do spill), or | 3938 indicates if this is just before partitioned mode (do spill), or |
3885 just after it starts (do fill). Sequence is inserted just after | 3939 just after it starts (do fill). Sequence is inserted just after |
3886 INSN. */ | 3940 INSN. IS_CALL and return as for nvptx_propagate. */ |
3887 | 3941 |
3888 static void | 3942 static bool |
3889 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) | 3943 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn) |
3890 { | 3944 { |
3891 wcast_data_t data; | 3945 wcast_data_t data; |
3892 | 3946 |
3893 data.base = gen_reg_rtx (Pmode); | 3947 data.base = gen_reg_rtx (Pmode); |
3894 data.offset = 0; | 3948 data.offset = 0; |
3895 data.ptr = NULL_RTX; | 3949 data.ptr = NULL_RTX; |
3896 | 3950 |
3897 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data); | 3951 bool empty = nvptx_propagate (is_call, block, insn, |
3952 pre_p ? PM_read : PM_write, wprop_gen, &data); | |
3953 gcc_assert (empty == !data.offset); | |
3898 if (data.offset) | 3954 if (data.offset) |
3899 { | 3955 { |
3900 /* Stuff was emitted, initialize the base pointer now. */ | 3956 /* Stuff was emitted, initialize the base pointer now. */ |
3901 rtx init = gen_rtx_SET (data.base, worker_bcast_sym); | 3957 rtx init = gen_rtx_SET (data.base, worker_bcast_sym); |
3902 emit_insn_after (init, insn); | 3958 emit_insn_after (init, insn); |
3903 | 3959 |
3904 if (worker_bcast_size < data.offset) | 3960 if (worker_bcast_size < data.offset) |
3905 worker_bcast_size = data.offset; | 3961 worker_bcast_size = data.offset; |
3906 } | 3962 } |
3963 return empty; | |
3907 } | 3964 } |
3908 | 3965 |
3909 /* Emit a worker-level synchronization barrier. We use different | 3966 /* Emit a worker-level synchronization barrier. We use different |
3910 markers for before and after synchronizations. */ | 3967 markers for before and after synchronizations. */ |
3911 | 3968 |
3930 return insn; | 3987 return insn; |
3931 | 3988 |
3932 return 0; | 3989 return 0; |
3933 } | 3990 } |
3934 #endif | 3991 #endif |
3992 | |
3993 /* Return true if INSN needs neutering. */ | |
3994 | |
3995 static bool | |
3996 needs_neutering_p (rtx_insn *insn) | |
3997 { | |
3998 if (!INSN_P (insn)) | |
3999 return false; | |
4000 | |
4001 switch (recog_memoized (insn)) | |
4002 { | |
4003 case CODE_FOR_nvptx_fork: | |
4004 case CODE_FOR_nvptx_forked: | |
4005 case CODE_FOR_nvptx_joining: | |
4006 case CODE_FOR_nvptx_join: | |
4007 case CODE_FOR_nvptx_barsync: | |
4008 return false; | |
4009 default: | |
4010 return true; | |
4011 } | |
4012 } | |
4013 | |
4014 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */ | |
4015 | |
4016 static bool | |
4017 verify_neutering_jumps (basic_block from, | |
4018 rtx_insn *vector_jump, rtx_insn *worker_jump, | |
4019 rtx_insn *vector_label, rtx_insn *worker_label) | |
4020 { | |
4021 basic_block bb = from; | |
4022 rtx_insn *insn = BB_HEAD (bb); | |
4023 bool seen_worker_jump = false; | |
4024 bool seen_vector_jump = false; | |
4025 bool seen_worker_label = false; | |
4026 bool seen_vector_label = false; | |
4027 bool worker_neutered = false; | |
4028 bool vector_neutered = false; | |
4029 while (true) | |
4030 { | |
4031 if (insn == worker_jump) | |
4032 { | |
4033 seen_worker_jump = true; | |
4034 worker_neutered = true; | |
4035 gcc_assert (!vector_neutered); | |
4036 } | |
4037 else if (insn == vector_jump) | |
4038 { | |
4039 seen_vector_jump = true; | |
4040 vector_neutered = true; | |
4041 } | |
4042 else if (insn == worker_label) | |
4043 { | |
4044 seen_worker_label = true; | |
4045 gcc_assert (worker_neutered); | |
4046 worker_neutered = false; | |
4047 } | |
4048 else if (insn == vector_label) | |
4049 { | |
4050 seen_vector_label = true; | |
4051 gcc_assert (vector_neutered); | |
4052 vector_neutered = false; | |
4053 } | |
4054 else if (INSN_P (insn)) | |
4055 switch (recog_memoized (insn)) | |
4056 { | |
4057 case CODE_FOR_nvptx_barsync: | |
4058 gcc_assert (!vector_neutered && !worker_neutered); | |
4059 break; | |
4060 default: | |
4061 break; | |
4062 } | |
4063 | |
4064 if (insn != BB_END (bb)) | |
4065 insn = NEXT_INSN (insn); | |
4066 else if (JUMP_P (insn) && single_succ_p (bb) | |
4067 && !seen_vector_jump && !seen_worker_jump) | |
4068 { | |
4069 bb = single_succ (bb); | |
4070 insn = BB_HEAD (bb); | |
4071 } | |
4072 else | |
4073 break; | |
4074 } | |
4075 | |
4076 gcc_assert (!(vector_jump && !seen_vector_jump)); | |
4077 gcc_assert (!(worker_jump && !seen_worker_jump)); | |
4078 | |
4079 if (seen_vector_label || seen_worker_label) | |
4080 { | |
4081 gcc_assert (!(vector_label && !seen_vector_label)); | |
4082 gcc_assert (!(worker_label && !seen_worker_label)); | |
4083 | |
4084 return true; | |
4085 } | |
4086 | |
4087 return false; | |
4088 } | |
4089 | |
4090 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */ | |
4091 | |
4092 static void | |
4093 verify_neutering_labels (basic_block to, rtx_insn *vector_label, | |
4094 rtx_insn *worker_label) | |
4095 { | |
4096 basic_block bb = to; | |
4097 rtx_insn *insn = BB_END (bb); | |
4098 bool seen_worker_label = false; | |
4099 bool seen_vector_label = false; | |
4100 while (true) | |
4101 { | |
4102 if (insn == worker_label) | |
4103 { | |
4104 seen_worker_label = true; | |
4105 gcc_assert (!seen_vector_label); | |
4106 } | |
4107 else if (insn == vector_label) | |
4108 seen_vector_label = true; | |
4109 else if (INSN_P (insn)) | |
4110 switch (recog_memoized (insn)) | |
4111 { | |
4112 case CODE_FOR_nvptx_barsync: | |
4113 gcc_assert (!seen_vector_label && !seen_worker_label); | |
4114 break; | |
4115 } | |
4116 | |
4117 if (insn != BB_HEAD (bb)) | |
4118 insn = PREV_INSN (insn); | |
4119 else | |
4120 break; | |
4121 } | |
4122 | |
4123 gcc_assert (!(vector_label && !seen_vector_label)); | |
4124 gcc_assert (!(worker_label && !seen_worker_label)); | |
4125 } | |
3935 | 4126 |
3936 /* Single neutering according to MASK. FROM is the incoming block and | 4127 /* Single neutering according to MASK. FROM is the incoming block and |
3937 TO is the outgoing block. These may be the same block. Insert at | 4128 TO is the outgoing block. These may be the same block. Insert at |
3938 start of FROM: | 4129 start of FROM: |
3939 | 4130 |
3956 unsigned skip_mask = mask; | 4147 unsigned skip_mask = mask; |
3957 | 4148 |
3958 while (true) | 4149 while (true) |
3959 { | 4150 { |
3960 /* Find first insn of from block. */ | 4151 /* Find first insn of from block. */ |
3961 while (head != BB_END (from) && !INSN_P (head)) | 4152 while (head != BB_END (from) && !needs_neutering_p (head)) |
3962 head = NEXT_INSN (head); | 4153 head = NEXT_INSN (head); |
3963 | 4154 |
3964 if (from == to) | 4155 if (from == to) |
3965 break; | 4156 break; |
3966 | 4157 |
3997 } | 4188 } |
3998 | 4189 |
3999 if (tail == head) | 4190 if (tail == head) |
4000 { | 4191 { |
4001 /* If this is empty, do nothing. */ | 4192 /* If this is empty, do nothing. */ |
4002 if (!head || !INSN_P (head)) | 4193 if (!head || !needs_neutering_p (head)) |
4003 return; | 4194 return; |
4004 | |
4005 /* If this is a dummy insn, do nothing. */ | |
4006 switch (recog_memoized (head)) | |
4007 { | |
4008 default: | |
4009 break; | |
4010 case CODE_FOR_nvptx_fork: | |
4011 case CODE_FOR_nvptx_forked: | |
4012 case CODE_FOR_nvptx_joining: | |
4013 case CODE_FOR_nvptx_join: | |
4014 return; | |
4015 } | |
4016 | 4195 |
4017 if (cond_branch) | 4196 if (cond_branch) |
4018 { | 4197 { |
4019 /* If we're only doing vector single, there's no need to | 4198 /* If we're only doing vector single, there's no need to |
4020 emit skip code because we'll not insert anything. */ | 4199 emit skip code because we'll not insert anything. */ |
4027 } | 4206 } |
4028 | 4207 |
4029 /* Insert the vector test inside the worker test. */ | 4208 /* Insert the vector test inside the worker test. */ |
4030 unsigned mode; | 4209 unsigned mode; |
4031 rtx_insn *before = tail; | 4210 rtx_insn *before = tail; |
4211 rtx_insn *neuter_start = NULL; | |
4212 rtx_insn *worker_label = NULL, *vector_label = NULL; | |
4213 rtx_insn *worker_jump = NULL, *vector_jump = NULL; | |
4032 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) | 4214 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) |
4033 if (GOMP_DIM_MASK (mode) & skip_mask) | 4215 if (GOMP_DIM_MASK (mode) & skip_mask) |
4034 { | 4216 { |
4035 rtx_code_label *label = gen_label_rtx (); | 4217 rtx_code_label *label = gen_label_rtx (); |
4036 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; | 4218 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; |
4219 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump; | |
4220 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label; | |
4037 | 4221 |
4038 if (!pred) | 4222 if (!pred) |
4039 { | 4223 { |
4040 pred = gen_reg_rtx (BImode); | 4224 pred = gen_reg_rtx (BImode); |
4041 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred; | 4225 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred; |
4044 rtx br; | 4228 rtx br; |
4045 if (mode == GOMP_DIM_VECTOR) | 4229 if (mode == GOMP_DIM_VECTOR) |
4046 br = gen_br_true (pred, label); | 4230 br = gen_br_true (pred, label); |
4047 else | 4231 else |
4048 br = gen_br_true_uni (pred, label); | 4232 br = gen_br_true_uni (pred, label); |
4049 emit_insn_before (br, head); | 4233 if (neuter_start) |
4234 neuter_start = emit_insn_after (br, neuter_start); | |
4235 else | |
4236 neuter_start = emit_insn_before (br, head); | |
4237 *mode_jump = neuter_start; | |
4050 | 4238 |
4051 LABEL_NUSES (label)++; | 4239 LABEL_NUSES (label)++; |
4240 rtx_insn *label_insn; | |
4052 if (tail_branch) | 4241 if (tail_branch) |
4053 before = emit_label_before (label, before); | 4242 { |
4243 label_insn = emit_label_before (label, before); | |
4244 before = label_insn; | |
4245 } | |
4054 else | 4246 else |
4055 emit_label_after (label, tail); | 4247 { |
4248 label_insn = emit_label_after (label, tail); | |
4249 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) | |
4250 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) | |
4251 emit_insn_after (gen_exit (), label_insn); | |
4252 } | |
4253 | |
4254 if (mode == GOMP_DIM_VECTOR) | |
4255 vector_label = label_insn; | |
4256 else | |
4257 worker_label = label_insn; | |
4056 } | 4258 } |
4057 | 4259 |
4058 /* Now deal with propagating the branch condition. */ | 4260 /* Now deal with propagating the branch condition. */ |
4059 if (cond_branch) | 4261 if (cond_branch) |
4060 { | 4262 { |
4090 to threads 1-31, so after the shfl %rcondu32 is defined in threads | 4292 to threads 1-31, so after the shfl %rcondu32 is defined in threads |
4091 0-31, and after the setp.ne %rcond is defined in threads 0-31. | 4293 0-31, and after the setp.ne %rcond is defined in threads 0-31. |
4092 | 4294 |
4093 There is nothing in the PTX spec to suggest that this is wrong, or | 4295 There is nothing in the PTX spec to suggest that this is wrong, or |
4094 to explain why the extra initialization is needed. So, we classify | 4296 to explain why the extra initialization is needed. So, we classify |
4095 it as a JIT bug, and the extra initialization as workaround. */ | 4297 it as a JIT bug, and the extra initialization as workaround: |
4096 emit_insn_before (gen_movbi (pvar, const0_rtx), | 4298 |
4299 { | |
4300 .reg .u32 %x; | |
4301 mov.u32 %x,%tid.x; | |
4302 setp.ne.u32 %rnotvzero,%x,0; | |
4303 } | |
4304 | |
4305 +.reg .pred %rcond2; | |
4306 +setp.eq.u32 %rcond2, 1, 0; | |
4307 | |
4308 @%rnotvzero bra Lskip; | |
4309 setp.<op>.<type> %rcond,op1,op2; | |
4310 +mov.pred %rcond2, %rcond; | |
4311 Lskip: | |
4312 +mov.pred %rcond, %rcond2; | |
4313 selp.u32 %rcondu32,1,0,%rcond; | |
4314 shfl.idx.b32 %rcondu32,%rcondu32,0,31; | |
4315 setp.ne.u32 %rcond,%rcondu32,0; | |
4316 */ | |
4317 rtx_insn *label = PREV_INSN (tail); | |
4318 gcc_assert (label && LABEL_P (label)); | |
4319 rtx tmp = gen_reg_rtx (BImode); | |
4320 emit_insn_before (gen_movbi (tmp, const0_rtx), | |
4097 bb_first_real_insn (from)); | 4321 bb_first_real_insn (from)); |
4322 emit_insn_before (gen_rtx_SET (tmp, pvar), label); | |
4323 emit_insn_before (gen_rtx_SET (pvar, tmp), tail); | |
4098 #endif | 4324 #endif |
4099 emit_insn_before (nvptx_gen_vcast (pvar), tail); | 4325 emit_insn_before (nvptx_gen_vcast (pvar), tail); |
4100 } | 4326 } |
4101 else | 4327 else |
4102 { | 4328 { |
4126 extract_insn (tail); | 4352 extract_insn (tail); |
4127 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar), | 4353 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar), |
4128 UNSPEC_BR_UNIFIED); | 4354 UNSPEC_BR_UNIFIED); |
4129 validate_change (tail, recog_data.operand_loc[0], unsp, false); | 4355 validate_change (tail, recog_data.operand_loc[0], unsp, false); |
4130 } | 4356 } |
4357 | |
4358 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump, | |
4359 vector_label, worker_label); | |
4360 if (!seen_label) | |
4361 verify_neutering_labels (to, vector_label, worker_label); | |
4131 } | 4362 } |
4132 | 4363 |
4133 /* PAR is a parallel that is being skipped in its entirety according to | 4364 /* PAR is a parallel that is being skipped in its entirety according to |
4134 MASK. Treat this as skipping a superblock starting at forked | 4365 MASK. Treat this as skipping a superblock starting at forked |
4135 and ending at joining. */ | 4366 and ending at joining. */ |
4226 { | 4457 { |
4227 par->inner_mask = nvptx_process_pars (par->inner); | 4458 par->inner_mask = nvptx_process_pars (par->inner); |
4228 inner_mask |= par->inner_mask; | 4459 inner_mask |= par->inner_mask; |
4229 } | 4460 } |
4230 | 4461 |
4231 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) | 4462 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0; |
4232 /* No propagation needed for a call. */; | 4463 |
4233 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) | 4464 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
4234 { | 4465 { |
4235 nvptx_wpropagate (false, par->forked_block, par->forked_insn); | 4466 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn); |
4236 nvptx_wpropagate (true, par->forked_block, par->fork_insn); | 4467 bool empty = nvptx_wpropagate (true, is_call, |
4237 /* Insert begin and end synchronizations. */ | 4468 par->forked_block, par->fork_insn); |
4238 emit_insn_after (nvptx_wsync (false), par->forked_insn); | 4469 |
4239 emit_insn_before (nvptx_wsync (true), par->joining_insn); | 4470 if (!empty || !is_call) |
4471 { | |
4472 /* Insert begin and end synchronizations. */ | |
4473 emit_insn_before (nvptx_wsync (false), par->forked_insn); | |
4474 emit_insn_before (nvptx_wsync (true), par->join_insn); | |
4475 } | |
4240 } | 4476 } |
4241 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) | 4477 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) |
4242 nvptx_vpropagate (par->forked_block, par->forked_insn); | 4478 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn); |
4243 | 4479 |
4244 /* Now do siblings. */ | 4480 /* Now do siblings. */ |
4245 if (par->next) | 4481 if (par->next) |
4246 inner_mask |= nvptx_process_pars (par->next); | 4482 inner_mask |= nvptx_process_pars (par->next); |
4247 return inner_mask; | 4483 return inner_mask; |
4322 | 4558 |
4323 if (par->next) | 4559 if (par->next) |
4324 nvptx_neuter_pars (par->next, modes, outer); | 4560 nvptx_neuter_pars (par->next, modes, outer); |
4325 } | 4561 } |
4326 | 4562 |
4563 #if WORKAROUND_PTXJIT_BUG_2 | |
4564 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant | |
4565 is needed in the nvptx target because the branches generated for | |
4566 partitioning are NONJUMP_INSN_P, not JUMP_P. */ | |
4567 | |
4568 static rtx | |
4569 nvptx_pc_set (const rtx_insn *insn, bool strict = true) | |
4570 { | |
4571 rtx pat; | |
4572 if ((strict && !JUMP_P (insn)) | |
4573 || (!strict && !INSN_P (insn))) | |
4574 return NULL_RTX; | |
4575 pat = PATTERN (insn); | |
4576 | |
4577 /* The set is allowed to appear either as the insn pattern or | |
4578 the first set in a PARALLEL. */ | |
4579 if (GET_CODE (pat) == PARALLEL) | |
4580 pat = XVECEXP (pat, 0, 0); | |
4581 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC) | |
4582 return pat; | |
4583 | |
4584 return NULL_RTX; | |
4585 } | |
4586 | |
4587 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */ | |
4588 | |
4589 static rtx | |
4590 nvptx_condjump_label (const rtx_insn *insn, bool strict = true) | |
4591 { | |
4592 rtx x = nvptx_pc_set (insn, strict); | |
4593 | |
4594 if (!x) | |
4595 return NULL_RTX; | |
4596 x = SET_SRC (x); | |
4597 if (GET_CODE (x) == LABEL_REF) | |
4598 return x; | |
4599 if (GET_CODE (x) != IF_THEN_ELSE) | |
4600 return NULL_RTX; | |
4601 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF) | |
4602 return XEXP (x, 1); | |
4603 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF) | |
4604 return XEXP (x, 2); | |
4605 return NULL_RTX; | |
4606 } | |
4607 | |
4608 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx | |
4609 insn in between the branch and the label. This works around a JIT bug | |
4610 observed at driver version 384.111, at -O0 for sm_50. */ | |
4611 | |
4612 static void | |
4613 prevent_branch_around_nothing (void) | |
4614 { | |
4615 rtx_insn *seen_label = NULL; | |
4616 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) | |
4617 { | |
4618 if (INSN_P (insn) && condjump_p (insn)) | |
4619 { | |
4620 seen_label = label_ref_label (nvptx_condjump_label (insn, false)); | |
4621 continue; | |
4622 } | |
4623 | |
4624 if (seen_label == NULL) | |
4625 continue; | |
4626 | |
4627 if (NOTE_P (insn) || DEBUG_INSN_P (insn)) | |
4628 continue; | |
4629 | |
4630 if (INSN_P (insn)) | |
4631 switch (recog_memoized (insn)) | |
4632 { | |
4633 case CODE_FOR_nvptx_fork: | |
4634 case CODE_FOR_nvptx_forked: | |
4635 case CODE_FOR_nvptx_joining: | |
4636 case CODE_FOR_nvptx_join: | |
4637 continue; | |
4638 default: | |
4639 seen_label = NULL; | |
4640 continue; | |
4641 } | |
4642 | |
4643 if (LABEL_P (insn) && insn == seen_label) | |
4644 emit_insn_before (gen_fake_nop (), insn); | |
4645 | |
4646 seen_label = NULL; | |
4647 } | |
4648 } | |
4649 #endif | |
4650 | |
4651 #ifdef WORKAROUND_PTXJIT_BUG_3 | |
4652 /* Insert two membar.cta insns in between two subsequent bar.sync insns. This | |
4653 works around a hang observed at driver version 390.48 for sm_50. */ | |
4654 | |
4655 static void | |
4656 workaround_barsyncs (void) | |
4657 { | |
4658 bool seen_barsync = false; | |
4659 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) | |
4660 { | |
4661 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync) | |
4662 { | |
4663 if (seen_barsync) | |
4664 { | |
4665 emit_insn_before (gen_nvptx_membar_cta (), insn); | |
4666 emit_insn_before (gen_nvptx_membar_cta (), insn); | |
4667 } | |
4668 | |
4669 seen_barsync = true; | |
4670 continue; | |
4671 } | |
4672 | |
4673 if (!seen_barsync) | |
4674 continue; | |
4675 | |
4676 if (NOTE_P (insn) || DEBUG_INSN_P (insn)) | |
4677 continue; | |
4678 else if (INSN_P (insn)) | |
4679 switch (recog_memoized (insn)) | |
4680 { | |
4681 case CODE_FOR_nvptx_fork: | |
4682 case CODE_FOR_nvptx_forked: | |
4683 case CODE_FOR_nvptx_joining: | |
4684 case CODE_FOR_nvptx_join: | |
4685 continue; | |
4686 default: | |
4687 break; | |
4688 } | |
4689 | |
4690 seen_barsync = false; | |
4691 } | |
4692 } | |
4693 #endif | |
4694 | |
4327 /* PTX-specific reorganization | 4695 /* PTX-specific reorganization |
4328 - Split blocks at fork and join instructions | 4696 - Split blocks at fork and join instructions |
4329 - Compute live registers | 4697 - Compute live registers |
4330 - Mark now-unused registers, so function begin doesn't declare | 4698 - Mark now-unused registers, so function begin doesn't declare |
4331 unused registers. | 4699 unused registers. |
4401 nvptx_reorg_subreg (); | 4769 nvptx_reorg_subreg (); |
4402 | 4770 |
4403 if (TARGET_UNIFORM_SIMT) | 4771 if (TARGET_UNIFORM_SIMT) |
4404 nvptx_reorg_uniform_simt (); | 4772 nvptx_reorg_uniform_simt (); |
4405 | 4773 |
4774 #if WORKAROUND_PTXJIT_BUG_2 | |
4775 prevent_branch_around_nothing (); | |
4776 #endif | |
4777 | |
4778 #ifdef WORKAROUND_PTXJIT_BUG_3 | |
4779 workaround_barsyncs (); | |
4780 #endif | |
4781 | |
4406 regstat_free_n_sets_and_refs (); | 4782 regstat_free_n_sets_and_refs (); |
4407 | 4783 |
4408 df_finish_pass (true); | 4784 df_finish_pass (true); |
4409 } | 4785 } |
4410 | 4786 |
4455 } | 4831 } |
4456 | 4832 |
4457 /* Table of valid machine attributes. */ | 4833 /* Table of valid machine attributes. */ |
4458 static const struct attribute_spec nvptx_attribute_table[] = | 4834 static const struct attribute_spec nvptx_attribute_table[] = |
4459 { | 4835 { |
4460 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler, | 4836 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, |
4461 affects_type_identity } */ | 4837 affects_type_identity, handler, exclude } */ |
4462 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false }, | 4838 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute, |
4463 { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false }, | 4839 NULL }, |
4464 { NULL, 0, 0, false, false, false, NULL, false } | 4840 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute, |
4841 NULL }, | |
4842 { NULL, 0, 0, false, false, false, false, NULL, NULL } | |
4465 }; | 4843 }; |
4466 | 4844 |
4467 /* Limit vector alignments to BIGGEST_ALIGNMENT. */ | 4845 /* Limit vector alignments to BIGGEST_ALIGNMENT. */ |
4468 | 4846 |
4469 static HOST_WIDE_INT | 4847 static HOST_WIDE_INT |
4551 static void | 4929 static void |
4552 nvptx_file_start (void) | 4930 nvptx_file_start (void) |
4553 { | 4931 { |
4554 fputs ("// BEGIN PREAMBLE\n", asm_out_file); | 4932 fputs ("// BEGIN PREAMBLE\n", asm_out_file); |
4555 fputs ("\t.version\t3.1\n", asm_out_file); | 4933 fputs ("\t.version\t3.1\n", asm_out_file); |
4556 fputs ("\t.target\tsm_30\n", asm_out_file); | 4934 if (TARGET_SM35) |
4935 fputs ("\t.target\tsm_35\n", asm_out_file); | |
4936 else | |
4937 fputs ("\t.target\tsm_30\n", asm_out_file); | |
4557 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); | 4938 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); |
4558 fputs ("// END PREAMBLE\n", asm_out_file); | 4939 fputs ("// END PREAMBLE\n", asm_out_file); |
4559 } | 4940 } |
4560 | 4941 |
4561 /* Emit a declaration for a worker-level buffer in .shared memory. */ | 4942 /* Emit a declaration for a worker-level buffer in .shared memory. */ |
4785 } | 5166 } |
4786 | 5167 |
4787 /* Define dimension sizes for known hardware. */ | 5168 /* Define dimension sizes for known hardware. */ |
4788 #define PTX_VECTOR_LENGTH 32 | 5169 #define PTX_VECTOR_LENGTH 32 |
4789 #define PTX_WORKER_LENGTH 32 | 5170 #define PTX_WORKER_LENGTH 32 |
4790 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */ | 5171 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */ |
4791 | 5172 |
4792 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ | 5173 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ |
4793 | 5174 |
4794 static int | 5175 static int |
4795 nvptx_simt_vf () | 5176 nvptx_simt_vf () |
4834 | 5215 |
4835 if (!decl) | 5216 if (!decl) |
4836 { | 5217 { |
4837 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; | 5218 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; |
4838 if (dims[GOMP_DIM_WORKER] < 0) | 5219 if (dims[GOMP_DIM_WORKER] < 0) |
4839 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; | 5220 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; |
4840 if (dims[GOMP_DIM_GANG] < 0) | 5221 if (dims[GOMP_DIM_GANG] < 0) |
4841 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT; | 5222 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; |
4842 changed = true; | 5223 changed = true; |
4843 } | 5224 } |
4844 | 5225 |
4845 return changed; | 5226 return changed; |
4846 } | 5227 } |
4850 static int | 5231 static int |
4851 nvptx_dim_limit (int axis) | 5232 nvptx_dim_limit (int axis) |
4852 { | 5233 { |
4853 switch (axis) | 5234 switch (axis) |
4854 { | 5235 { |
4855 case GOMP_DIM_WORKER: | |
4856 return PTX_WORKER_LENGTH; | |
4857 | |
4858 case GOMP_DIM_VECTOR: | 5236 case GOMP_DIM_VECTOR: |
4859 return PTX_VECTOR_LENGTH; | 5237 return PTX_VECTOR_LENGTH; |
4860 | 5238 |
4861 default: | 5239 default: |
4862 break; | 5240 break; |
5671 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs | 6049 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs |
5672 | 6050 |
5673 #undef TARGET_CAN_CHANGE_MODE_CLASS | 6051 #undef TARGET_CAN_CHANGE_MODE_CLASS |
5674 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class | 6052 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class |
5675 | 6053 |
6054 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE | |
6055 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed | |
6056 | |
5676 struct gcc_target targetm = TARGET_INITIALIZER; | 6057 struct gcc_target targetm = TARGET_INITIALIZER; |
5677 | 6058 |
5678 #include "gt-nvptx.h" | 6059 #include "gt-nvptx.h" |