diff gcc/config/nvptx/nvptx.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
line wrap: on
line diff
--- a/gcc/config/nvptx/nvptx.c	Fri Oct 27 22:46:09 2017 +0900
+++ b/gcc/config/nvptx/nvptx.c	Thu Oct 25 07:37:49 2018 +0900
@@ -1,5 +1,5 @@
 /* Target code for NVPTX.
-   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   Copyright (C) 2014-2018 Free Software Foundation, Inc.
    Contributed by Bernd Schmidt <bernds@codesourcery.com>
 
    This file is part of GCC.
@@ -18,6 +18,8 @@
    along with GCC; see the file COPYING3.  If not see
    <http://www.gnu.org/licenses/>.  */
 
+#define IN_TARGET_CODE 1
+
 #include "config.h"
 #include <sstream>
 #include "system.h"
@@ -76,6 +78,8 @@
 #include "target-def.h"
 
 #define WORKAROUND_PTXJIT_BUG 1
+#define WORKAROUND_PTXJIT_BUG_2 1
+#define WORKAROUND_PTXJIT_BUG_3 1
 
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
@@ -175,6 +179,8 @@
   if (!global_options_set.x_flag_toplevel_reorder)
     flag_toplevel_reorder = 1;
 
+  debug_nonbind_markers_p = 0;
+
   /* Set flag_no_common, unless explicitly disabled.  We fake common
      using .weak, and that's not entirely accurate, so avoid it
      unless forced.  */
@@ -394,8 +400,7 @@
 	 it creates a block with a single successor before entering a
 	 partitooned region.  That is a good candidate for the end of
 	 an SESE region.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_fork (op));
       emit_insn (gen_nvptx_forked (op));
     }
 }
@@ -414,8 +419,7 @@
       /* Emit joining for all non-call pars to ensure there's a single
 	 predecessor for the block the join insn ends up in.  This is
 	 needed for skipping entire loops.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_joining (op));
       emit_insn (gen_nvptx_join (op));
     }
 }
@@ -1895,9 +1899,15 @@
   
   if (sym)
     {
-      fprintf (asm_out_file, "generic(");
+      bool function = (SYMBOL_REF_DECL (sym)
+		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
+      if (!function)
+	fprintf (asm_out_file, "generic(");
       output_address (VOIDmode, sym);
-      fprintf (asm_out_file, val ? ") + " : ")");
+      if (!function)
+	fprintf (asm_out_file, ")");
+      if (val)
+	fprintf (asm_out_file, " + ");
     }
 
   if (!sym || val)
@@ -2010,6 +2020,30 @@
     nvptx_assemble_value (str[i], 1);
 }
 
+/* Return true if TYPE is a record type where the last field is an array without
+   given dimension.  */
+
+static bool
+flexible_array_member_type_p (const_tree type)
+{
+  if (TREE_CODE (type) != RECORD_TYPE)
+    return false;
+
+  const_tree last_field = NULL_TREE;
+  for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
+    last_field = f;
+
+  if (!last_field)
+    return false;
+
+  const_tree last_field_type = TREE_TYPE (last_field);
+  if (TREE_CODE (last_field_type) != ARRAY_TYPE)
+    return false;
+
+  return (! TYPE_DOMAIN (last_field_type)
+	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
+}
+
 /* Emit a PTX variable decl and prepare for emission of its
    initializer.  NAME is the symbol name and SETION the PTX data
    area. The type is TYPE, object size SIZE and alignment is ALIGN.
@@ -2020,8 +2054,18 @@
 
 static void
 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
-			   const_tree type, HOST_WIDE_INT size, unsigned align)
-{
+			   const_tree type, HOST_WIDE_INT size, unsigned align,
+			   bool undefined = false)
+{
+  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
+    && (TYPE_DOMAIN (type) == NULL_TREE);
+
+  if (undefined && flexible_array_member_type_p (type))
+    {
+      size = 0;
+      atype = true;
+    }
+
   while (TREE_CODE (type) == ARRAY_TYPE)
     type = TREE_TYPE (type);
 
@@ -2061,6 +2105,8 @@
     /* We make everything an array, to simplify any initialization
        emission.  */
     fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
+  else if (atype)
+    fprintf (file, "[]");
 }
 
 /* Called when the initializer for a decl has been completely output through
@@ -2156,7 +2202,7 @@
   tree size = DECL_SIZE_UNIT (decl);
   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
 			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
-			     DECL_ALIGN (decl));
+			     DECL_ALIGN (decl), true);
   nvptx_assemble_decl_end ();
 }
 
@@ -3039,8 +3085,7 @@
 	    par = new parallel (par, mask);
 	    par->forked_block = block;
 	    par->forked_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->fork_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 	  }
@@ -3055,8 +3100,7 @@
 	    gcc_assert (par->mask == mask);
 	    par->join_block = block;
 	    par->join_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->joining_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 	    par = par->parent;
@@ -3735,29 +3779,34 @@
 #undef BB_SET_SESE
 #undef BB_GET_SESE
 
-/* Propagate live state at the start of a partitioned region.  BLOCK
-   provides the live register information, and might not contain
-   INSN. Propagation is inserted just after INSN. RW indicates whether
-   we are reading and/or writing state.  This
+/* Propagate live state at the start of a partitioned region.  IS_CALL
+   indicates whether the propagation is for a (partitioned) call
+   instruction.  BLOCK provides the live register information, and
+   might not contain INSN. Propagation is inserted just after INSN. RW
+   indicates whether we are reading and/or writing state.  This
    separation is needed for worker-level proppagation where we
    essentially do a spill & fill.  FN is the underlying worker
    function to generate the propagation instructions for single
    register.  DATA is user data.
 
-   We propagate the live register set and the entire frame.  We could
-   do better by (a) propagating just the live set that is used within
-   the partitioned regions and (b) only propagating stack entries that
-   are used.  The latter might be quite hard to determine.  */
+   Returns true if we didn't emit any instructions.
+
+   We propagate the live register set for non-calls and the entire
+   frame for calls and non-calls.  We could do better by (a)
+   propagating just the live set that is used within the partitioned
+   regions and (b) only propagating stack entries that are used.  The
+   latter might be quite hard to determine.  */
 
 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
 
-static void
-nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
-		 propagator_fn fn, void *data)
+static bool
+nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
+		 propagate_mask rw, propagator_fn fn, void *data)
 {
   bitmap live = DF_LIVE_IN (block);
   bitmap_iterator iterator;
   unsigned ix;
+  bool empty = true;
 
   /* Copy the frame array.  */
   HOST_WIDE_INT fs = get_frame_size ();
@@ -3769,6 +3818,7 @@
       rtx pred = NULL_RTX;
       rtx_code_label *label = NULL;
 
+      empty = false;
       /* The frame size might not be DImode compatible, but the frame
 	 array's declaration will be.  So it's ok to round up here.  */
       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
@@ -3815,18 +3865,21 @@
       insn = emit_insn_after (cpy, insn);
     }
 
-  /* Copy live registers.  */
-  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
-    {
-      rtx reg = regno_reg_rtx[ix];
-
-      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
-	{
-	  rtx bcast = fn (reg, rw, 0, data);
-
-	  insn = emit_insn_after (bcast, insn);
-	}
-    }
+  if (!is_call)
+    /* Copy live registers.  */
+    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+      {
+	rtx reg = regno_reg_rtx[ix];
+
+	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+	  {
+	    rtx bcast = fn (reg, rw, 0, data);
+
+	    insn = emit_insn_after (bcast, insn);
+	    empty = false;
+	  }
+      }
+  return empty;
 }
 
 /* Worker for nvptx_vpropagate.  */
@@ -3842,12 +3895,13 @@
 }
 
 /* Propagate state that is live at start of BLOCK across the vectors
-   of a single warp.  Propagation is inserted just after INSN.   */
-
-static void
-nvptx_vpropagate (basic_block block, rtx_insn *insn)
-{
-  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+   of a single warp.  Propagation is inserted just after INSN.
+   IS_CALL and return as for nvptx_propagate.  */
+
+static bool
+nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
+{
+  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
 }
 
 /* Worker for nvptx_wpropagate.  */
@@ -3883,10 +3937,10 @@
 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
    indicates if this is just before partitioned mode (do spill), or
    just after it starts (do fill). Sequence is inserted just after
-   INSN.  */
-
-static void
-nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+   INSN.  IS_CALL and return as for nvptx_propagate.  */
+
+static bool
+nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
 {
   wcast_data_t data;
 
@@ -3894,7 +3948,9 @@
   data.offset = 0;
   data.ptr = NULL_RTX;
 
-  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+  bool empty = nvptx_propagate (is_call, block, insn,
+				pre_p ? PM_read : PM_write, wprop_gen, &data);
+  gcc_assert (empty == !data.offset);
   if (data.offset)
     {
       /* Stuff was emitted, initialize the base pointer now.  */
@@ -3904,6 +3960,7 @@
       if (worker_bcast_size < data.offset)
 	worker_bcast_size = data.offset;
     }
+  return empty;
 }
 
 /* Emit a worker-level synchronization barrier.  We use different
@@ -3933,6 +3990,140 @@
 }
 #endif
 
+/* Return true if INSN needs neutering.  */
+
+static bool
+needs_neutering_p (rtx_insn *insn)
+{
+  if (!INSN_P (insn))
+    return false;
+
+  switch (recog_memoized (insn))
+    {
+    case CODE_FOR_nvptx_fork:
+    case CODE_FOR_nvptx_forked:
+    case CODE_FOR_nvptx_joining:
+    case CODE_FOR_nvptx_join:
+    case CODE_FOR_nvptx_barsync:
+      return false;
+    default:
+      return true;
+    }
+}
+
+/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */
+
+static bool
+verify_neutering_jumps (basic_block from,
+			rtx_insn *vector_jump, rtx_insn *worker_jump,
+			rtx_insn *vector_label, rtx_insn *worker_label)
+{
+  basic_block bb = from;
+  rtx_insn *insn = BB_HEAD (bb);
+  bool seen_worker_jump = false;
+  bool seen_vector_jump = false;
+  bool seen_worker_label = false;
+  bool seen_vector_label = false;
+  bool worker_neutered = false;
+  bool vector_neutered = false;
+  while (true)
+    {
+      if (insn == worker_jump)
+	{
+	  seen_worker_jump = true;
+	  worker_neutered = true;
+	  gcc_assert (!vector_neutered);
+	}
+      else if (insn == vector_jump)
+	{
+	  seen_vector_jump = true;
+	  vector_neutered = true;
+	}
+      else if (insn == worker_label)
+	{
+	  seen_worker_label = true;
+	  gcc_assert (worker_neutered);
+	  worker_neutered = false;
+	}
+      else if (insn == vector_label)
+	{
+	  seen_vector_label = true;
+	  gcc_assert (vector_neutered);
+	  vector_neutered = false;
+	}
+      else if (INSN_P (insn))
+	switch (recog_memoized (insn))
+	  {
+	  case CODE_FOR_nvptx_barsync:
+	    gcc_assert (!vector_neutered && !worker_neutered);
+	    break;
+	  default:
+	    break;
+	  }
+
+      if (insn != BB_END (bb))
+	insn = NEXT_INSN (insn);
+      else if (JUMP_P (insn) && single_succ_p (bb)
+	       && !seen_vector_jump && !seen_worker_jump)
+	{
+	  bb = single_succ (bb);
+	  insn = BB_HEAD (bb);
+	}
+      else
+	break;
+    }
+
+  gcc_assert (!(vector_jump && !seen_vector_jump));
+  gcc_assert (!(worker_jump && !seen_worker_jump));
+
+  if (seen_vector_label || seen_worker_label)
+    {
+      gcc_assert (!(vector_label && !seen_vector_label));
+      gcc_assert (!(worker_label && !seen_worker_label));
+
+      return true;
+    }
+
+  return false;
+}
+
+/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */
+
+static void
+verify_neutering_labels (basic_block to, rtx_insn *vector_label,
+			 rtx_insn *worker_label)
+{
+  basic_block bb = to;
+  rtx_insn *insn = BB_END (bb);
+  bool seen_worker_label = false;
+  bool seen_vector_label = false;
+  while (true)
+    {
+      if (insn == worker_label)
+	{
+	  seen_worker_label = true;
+	  gcc_assert (!seen_vector_label);
+	}
+      else if (insn == vector_label)
+	seen_vector_label = true;
+      else if (INSN_P (insn))
+	switch (recog_memoized (insn))
+	  {
+	  case CODE_FOR_nvptx_barsync:
+	    gcc_assert (!seen_vector_label && !seen_worker_label);
+	    break;
+	  }
+
+      if (insn != BB_HEAD (bb))
+	insn = PREV_INSN (insn);
+      else
+	break;
+    }
+
+  gcc_assert (!(vector_label && !seen_vector_label));
+  gcc_assert (!(worker_label && !seen_worker_label));
+}
+
 /* Single neutering according to MASK.  FROM is the incoming block and
    TO is the outgoing block.  These may be the same block. Insert at
    start of FROM:
@@ -3958,7 +4149,7 @@
   while (true)
     {
       /* Find first insn of from block.  */
-      while (head != BB_END (from) && !INSN_P (head))
+      while (head != BB_END (from) && !needs_neutering_p (head))
 	head = NEXT_INSN (head);
 
       if (from == to)
@@ -3999,21 +4190,9 @@
   if (tail == head)
     {
       /* If this is empty, do nothing.  */
-      if (!head || !INSN_P (head))
+      if (!head || !needs_neutering_p (head))
 	return;
 
-      /* If this is a dummy insn, do nothing.  */
-      switch (recog_memoized (head))
-	{
-	default:
-	  break;
-	case CODE_FOR_nvptx_fork:
-	case CODE_FOR_nvptx_forked:
-	case CODE_FOR_nvptx_joining:
-	case CODE_FOR_nvptx_join:
-	  return;
-	}
-
       if (cond_branch)
 	{
 	  /* If we're only doing vector single, there's no need to
@@ -4029,11 +4208,16 @@
   /* Insert the vector test inside the worker test.  */
   unsigned mode;
   rtx_insn *before = tail;
+  rtx_insn *neuter_start = NULL;
+  rtx_insn *worker_label = NULL, *vector_label = NULL;
+  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
     if (GOMP_DIM_MASK (mode) & skip_mask)
       {
 	rtx_code_label *label = gen_label_rtx ();
 	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
+	rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
+	rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
 
 	if (!pred)
 	  {
@@ -4046,13 +4230,31 @@
 	  br = gen_br_true (pred, label);
 	else
 	  br = gen_br_true_uni (pred, label);
-	emit_insn_before (br, head);
+	if (neuter_start)
+	  neuter_start = emit_insn_after (br, neuter_start);
+	else
+	  neuter_start = emit_insn_before (br, head);
+	*mode_jump = neuter_start;
 
 	LABEL_NUSES (label)++;
+	rtx_insn *label_insn;
 	if (tail_branch)
-	  before = emit_label_before (label, before);
+	  {
+	    label_insn = emit_label_before (label, before);
+	    before = label_insn;
+	  }
 	else
-	  emit_label_after (label, tail);
+	  {
+	    label_insn = emit_label_after (label, tail);
+	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
+		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
+	      emit_insn_after (gen_exit (), label_insn);
+	  }
+
+	if (mode == GOMP_DIM_VECTOR)
+	  vector_label = label_insn;
+	else
+	  worker_label = label_insn;
       }
 
   /* Now deal with propagating the branch condition.  */
@@ -4092,9 +4294,33 @@
 
 	     There is nothing in the PTX spec to suggest that this is wrong, or
 	     to explain why the extra initialization is needed.  So, we classify
-	     it as a JIT bug, and the extra initialization as workaround.  */
-	  emit_insn_before (gen_movbi (pvar, const0_rtx),
+	     it as a JIT bug, and the extra initialization as workaround:
+
+		{
+		    .reg .u32 %x;
+		    mov.u32 %x,%tid.x;
+		    setp.ne.u32 %rnotvzero,%x,0;
+		}
+
+		+.reg .pred %rcond2;
+		+setp.eq.u32 %rcond2, 1, 0;
+
+		 @%rnotvzero bra Lskip;
+		 setp.<op>.<type> %rcond,op1,op2;
+		+mov.pred %rcond2, %rcond;
+		 Lskip:
+		+mov.pred %rcond, %rcond2;
+		 selp.u32 %rcondu32,1,0,%rcond;
+		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
+		 setp.ne.u32 %rcond,%rcondu32,0;
+	  */
+	  rtx_insn *label = PREV_INSN (tail);
+	  gcc_assert (label && LABEL_P (label));
+	  rtx tmp = gen_reg_rtx (BImode);
+	  emit_insn_before (gen_movbi (tmp, const0_rtx),
 			    bb_first_real_insn (from));
+	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
+	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
 #endif
 	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
 	}
@@ -4128,6 +4354,11 @@
 				 UNSPEC_BR_UNIFIED);
       validate_change (tail, recog_data.operand_loc[0], unsp, false);
     }
+
+  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
+					    vector_label, worker_label);
+  if (!seen_label)
+    verify_neutering_labels (to, vector_label, worker_label);
 }
 
 /* PAR is a parallel that is being skipped in its entirety according to
@@ -4228,18 +4459,23 @@
       inner_mask |= par->inner_mask;
     }
 
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-    /* No propagation needed for a call.  */;
-  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
     {
-      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
-      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
-      /* Insert begin and end synchronizations.  */
-      emit_insn_after (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->joining_insn);
+      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
+      bool empty = nvptx_wpropagate (true, is_call,
+				     par->forked_block, par->fork_insn);
+
+      if (!empty || !is_call)
+	{
+	  /* Insert begin and end synchronizations.  */
+	  emit_insn_before (nvptx_wsync (false), par->forked_insn);
+	  emit_insn_before (nvptx_wsync (true), par->join_insn);
+	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (par->forked_block, par->forked_insn);
+    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
 
   /* Now do siblings.  */
   if (par->next)
@@ -4324,6 +4560,138 @@
     nvptx_neuter_pars (par->next, modes, outer);
 }
 
+#if WORKAROUND_PTXJIT_BUG_2
+/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
+   is needed in the nvptx target because the branches generated for
+   parititioning are NONJUMP_INSN_P, not JUMP_P.  */
+
+static rtx
+nvptx_pc_set (const rtx_insn *insn, bool strict = true)
+{
+  rtx pat;
+  if ((strict && !JUMP_P (insn))
+      || (!strict && !INSN_P (insn)))
+    return NULL_RTX;
+  pat = PATTERN (insn);
+
+  /* The set is allowed to appear either as the insn pattern or
+     the first set in a PARALLEL.  */
+  if (GET_CODE (pat) == PARALLEL)
+    pat = XVECEXP (pat, 0, 0);
+  if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
+    return pat;
+
+  return NULL_RTX;
+}
+
+/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
+
+static rtx
+nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
+{
+  rtx x = nvptx_pc_set (insn, strict);
+
+  if (!x)
+    return NULL_RTX;
+  x = SET_SRC (x);
+  if (GET_CODE (x) == LABEL_REF)
+    return x;
+  if (GET_CODE (x) != IF_THEN_ELSE)
+    return NULL_RTX;
+  if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
+    return XEXP (x, 1);
+  if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
+    return XEXP (x, 2);
+  return NULL_RTX;
+}
+
+/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
+   insn inbetween the branch and the label.  This works around a JIT bug
+   observed at driver version 384.111, at -O0 for sm_50.  */
+
+static void
+prevent_branch_around_nothing (void)
+{
+  rtx_insn *seen_label = NULL;
+    for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+      {
+	if (INSN_P (insn) && condjump_p (insn))
+	  {
+	    seen_label = label_ref_label (nvptx_condjump_label (insn, false));
+	    continue;
+	  }
+
+	if (seen_label == NULL)
+	  continue;
+
+	if (NOTE_P (insn) || DEBUG_INSN_P (insn))
+	  continue;
+
+	if (INSN_P (insn))
+	  switch (recog_memoized (insn))
+	    {
+	    case CODE_FOR_nvptx_fork:
+	    case CODE_FOR_nvptx_forked:
+	    case CODE_FOR_nvptx_joining:
+	    case CODE_FOR_nvptx_join:
+	      continue;
+	    default:
+	      seen_label = NULL;
+	      continue;
+	    }
+
+	if (LABEL_P (insn) && insn == seen_label)
+	  emit_insn_before (gen_fake_nop (), insn);
+
+	seen_label = NULL;
+      }
+  }
+#endif
+
+#ifdef WORKAROUND_PTXJIT_BUG_3
+/* Insert two membar.cta insns inbetween two subsequent bar.sync insns.  This
+   works around a hang observed at driver version 390.48 for sm_50.  */
+
+static void
+workaround_barsyncs (void)
+{
+  bool seen_barsync = false;
+  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
+	{
+	  if (seen_barsync)
+	    {
+	      emit_insn_before (gen_nvptx_membar_cta (), insn);
+	      emit_insn_before (gen_nvptx_membar_cta (), insn);
+	    }
+
+	  seen_barsync = true;
+	  continue;
+	}
+
+      if (!seen_barsync)
+	continue;
+
+      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
+	continue;
+      else if (INSN_P (insn))
+	switch (recog_memoized (insn))
+	  {
+	  case CODE_FOR_nvptx_fork:
+	  case CODE_FOR_nvptx_forked:
+	  case CODE_FOR_nvptx_joining:
+	  case CODE_FOR_nvptx_join:
+	    continue;
+	  default:
+	    break;
+	  }
+
+      seen_barsync = false;
+    }
+}
+#endif
+
 /* PTX-specific reorganization
    - Split blocks at fork and join instructions
    - Compute live registers
@@ -4403,6 +4771,14 @@
   if (TARGET_UNIFORM_SIMT)
     nvptx_reorg_uniform_simt ();
 
+#if WORKAROUND_PTXJIT_BUG_2
+  prevent_branch_around_nothing ();
+#endif
+
+#ifdef WORKAROUND_PTXJIT_BUG_3
+  workaround_barsyncs ();
+#endif
+
   regstat_free_n_sets_and_refs ();
 
   df_finish_pass (true);
@@ -4457,11 +4833,13 @@
 /* Table of valid machine attributes.  */
 static const struct attribute_spec nvptx_attribute_table[] =
 {
-  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
-       affects_type_identity } */
-  { "kernel", 0, 0, true, false,  false, nvptx_handle_kernel_attribute, false },
-  { "shared", 0, 0, true, false,  false, nvptx_handle_shared_attribute, false },
-  { NULL, 0, 0, false, false, false, NULL, false }
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+       affects_type_identity, handler, exclude } */
+  { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
+    NULL },
+  { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
+    NULL },
+  { NULL, 0, 0, false, false, false, false, NULL, NULL }
 };
 
 /* Limit vector alignments to BIGGEST_ALIGNMENT.  */
@@ -4553,7 +4931,10 @@
 {
   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
   fputs ("\t.version\t3.1\n", asm_out_file);
-  fputs ("\t.target\tsm_30\n", asm_out_file);
+  if (TARGET_SM35)
+    fputs ("\t.target\tsm_35\n", asm_out_file);
+  else
+    fputs ("\t.target\tsm_30\n", asm_out_file);
   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
   fputs ("// END PREAMBLE\n", asm_out_file);
 }
@@ -4787,7 +5168,7 @@
 /* Define dimension sizes for known hardware.  */
 #define PTX_VECTOR_LENGTH 32
 #define PTX_WORKER_LENGTH 32
-#define PTX_GANG_DEFAULT  0 /* Defer to runtime.  */
+#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
 
 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
 
@@ -4836,9 +5217,9 @@
     {
       dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
       if (dims[GOMP_DIM_WORKER] < 0)
-	dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
+	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
       if (dims[GOMP_DIM_GANG] < 0)
-	dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
+	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
       changed = true;
     }
 
@@ -4852,9 +5233,6 @@
 {
   switch (axis)
     {
-    case GOMP_DIM_WORKER:
-      return PTX_WORKER_LENGTH;
-
     case GOMP_DIM_VECTOR:
       return PTX_VECTOR_LENGTH;
 
@@ -5673,6 +6051,9 @@
 #undef TARGET_CAN_CHANGE_MODE_CLASS
 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
 
+#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
+#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"