diff gcc/config/aarch64/falkor-tag-collision-avoidance.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gcc/config/aarch64/falkor-tag-collision-avoidance.c	Thu Oct 25 07:37:49 2018 +0900
@@ -0,0 +1,881 @@
+/* Tag Collision Avoidance pass for Falkor.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#define INCLUDE_LIST
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "aarch64-protos.h"
+#include "hash-map.h"
+#include "cfgloop.h"
+#include "cfgrtl.h"
+#include "rtl-iter.h"
+#include "df.h"
+#include "memmodel.h"
+#include "optabs.h"
+#include "regs.h"
+#include "recog.h"
+#include "regrename.h"
+#include "print-rtl.h"
+
+/* The Falkor hardware prefetching system uses the encoding of the registers
+   and offsets of loads to decide which of the multiple hardware prefetchers to
+   assign the load to.  This has the positive effect of accelerating prefetches
+   when all related loads with uniform strides are assigned to the same
+   prefetcher unit.  The downside is that because of the way the assignment
+   works, multiple unrelated loads may end up on the same prefetch unit, thus
+   causing the unit to bounce between different sets of addresses and never
+   train correctly.  The point of this pass is to avoid such collisions so that
+   unrelated loads are spread out to different prefetchers.  It also makes a
+   rudimentary attempt to ensure that related loads with the same tags don't
+   get moved out unnecessarily.
+
+   Perhaps a future enhancement would be to make a more concerted attempt to
+   get related loads under the same tag.  See the memcpy/memset implementation
+   for falkor in glibc to understand the kind of impact this can have on
+   falkor.
+
+   The assignment of loads is based on a tag that is computed from the encoding
+   of the first destination register (the only destination in case of LDR), the
+   base register and the offset (either the register or the immediate value, as
+   encoded in the instruction).  This is what the 14 bit tag looks like:
+
+   |<- 6 bits ->|<- 4b ->|<- 4b ->|
+   --------------------------------
+   |  OFFSET    |  SRC   |  DST   |
+   --------------------------------
+
+   For all cases, the SRC and DST are the 4 LSB of the encoding of the register
+   in the instruction.  Offset computation is more involved and is as follows:
+
+   - For register offset addressing: 4 LSB of the offset register with the MSB
+     of the 6 bits set to 1.
+
+   - For immediate offset: 4 LSB of the encoded immediate offset.  The encoding
+     depends on the width of the load and is expressed as multiples of the
+     width.
+
+   - For loads with update: 4 LSB of the offset.  The encoding here is the
+     exact number by which the base is offset and incremented.
+
+   Based on the above it is clear that registers 0 and 16 will result in
+   collisions, 1 and 17 and so on.  This pass detects such collisions within a
+   def/use chain of the source register in a loop and tries to resolve the
+   collision by renaming one of the destination registers.  */
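+
+/* Worked example (illustrative): in a loop containing the two unrelated
+   loads
+
+     ldr x1,  [x10, #16]
+     ldr x17, [x26, #16]
+
+   the first gets DST = 1, SRC = 10 and OFFSET = (16 / 8) >> 2 = 0, while the
+   second gets DST = 17 & 0xf = 1, SRC = 26 & 0xf = 10 and OFFSET = 0.  Both
+   loads thus compute the tag (0 << 8) | (10 << 4) | 1 = 0xa1 and train the
+   same prefetcher unit.  Renaming x17 to a register whose 4 LSB differ, say
+   x20, changes the second tag to 0xa4 and avoids the collision.  */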
+
+/* Get the destination part of the tag.  */
+#define TAG_GET_DEST(__tag) ((__tag) & 0xf)
+
+/* Get the tag with the destination part updated.  */
+#define TAG_UPDATE_DEST(__tag, __dest) (((__tag) & ~0xf) | (__dest & 0xf))
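+
+/* For instance, TAG_UPDATE_DEST (0xa1, 20) == 0xa4, since only the 4 LSB of
+   the new destination register number are encoded in the tag.  */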
+
+#define MAX_PREFETCH_STRIDE 2048
+
+/* The instruction information structure.  This is used to cache information
+   about the INSN that we derive when traversing through all of the insns in
+   loops.  */
+class tag_insn_info
+{
+public:
+  rtx_insn *insn;
+  rtx dest;
+  rtx base;
+  rtx offset;
+  bool writeback;
+  bool ldp;
+
+  tag_insn_info (rtx_insn *i, rtx d, rtx b, rtx o, bool w, bool p)
+    : insn (i), dest (d), base (b), offset (o), writeback (w), ldp (p)
+  {}
+
+  /* Compute the tag based on BASE, DEST and OFFSET of the load.  */
+  unsigned tag ()
+    {
+      unsigned int_offset = 0;
+      rtx offset = this->offset;
+      unsigned dest = REGNO (this->dest);
+      unsigned base = REGNO (this->base);
+      machine_mode dest_mode = GET_MODE (this->dest);
+
+      /* Falkor does not support SVE; GET_LOAD_INFO ensures that the
+	 destination mode is constant here.  */
+      unsigned dest_mode_size = GET_MODE_SIZE (dest_mode).to_constant ();
+
+      /* For loads larger than 16 bytes, the DEST part of the tag is 0.  */
+      if ((dest_mode_size << this->ldp) > 16)
+	dest = 0;
+
+      if (offset && REG_P (offset))
+	int_offset = (1 << 5) | REGNO (offset);
+      else if (offset && CONST_INT_P (offset))
+	{
+	  int_offset = INTVAL (offset);
+	  int_offset /= dest_mode_size;
+	  if (!this->writeback)
+	    int_offset >>= 2;
+	}
+      return ((dest & 0xf)
+	      | ((base & 0xf) << 4)
+	      | ((int_offset & 0x3f) << 8));
+    }
+};
+
+/* Hash map to traverse and process instructions with colliding tags.  */
+typedef hash_map <rtx, auto_vec <tag_insn_info *> > tag_map_t;
+
+/* Vector of instructions with colliding tags.  */
+typedef auto_vec <tag_insn_info *> insn_info_list_t;
+
+/* Pair of instruction information and unavailable register set to pass to
+   CHECK_COLLIDING_TAGS.  */
+typedef std::pair <tag_insn_info *, HARD_REG_SET *> arg_pair_t;
+
+
+/* Callback to free all tag_insn_info objects.  */
+bool
+free_insn_info (const rtx &t ATTRIBUTE_UNUSED, insn_info_list_t *v,
+		void *arg ATTRIBUTE_UNUSED)
+{
+  while (v->length () > 0)
+    delete v->pop ();
+
+  return true;
+}
+
+
+/* Add all aliases of the register to the unavailable register set.  REG is the
+   smallest register number that can then be used to reference its aliases.
+   UNAVAILABLE is the hard register set to add the ignored register numbers to
+   and MODE is the mode in which the registers would have been used.  */
+static void
+ignore_all_aliases (HARD_REG_SET *unavailable, machine_mode mode, unsigned reg)
+{
+  add_to_hard_reg_set (unavailable, mode, reg);
+  add_to_hard_reg_set (unavailable, mode, reg + 16);
+  add_to_hard_reg_set (unavailable, mode, reg + 32);
+  add_to_hard_reg_set (unavailable, mode, reg + 48);
+}
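+
+/* (For example, with REG = 2 this marks hard registers 2, 18, 34 and 50,
+   i.e. x2, x18, v2 and v18, whose instruction encodings all share the same
+   4 LSB and hence the same DST/SRC tag field.)  */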
+
+
+/* Callback to check which destination registers are unavailable to us for
+   renaming because of the base and offset colliding.  This is a callback that
+   gets called for every name-value pair (T, V) in the TAG_MAP.  The ARG is an
+   std::pair of the tag_insn_info of the original insn and the hard register
+   set UNAVAILABLE that is used to record hard register numbers that cannot be
+   used for the renaming.  This always returns true since we want to traverse
+   through the entire TAG_MAP.  */
+bool
+check_colliding_tags (const rtx &t, const insn_info_list_t &v, arg_pair_t *arg)
+{
+  HARD_REG_SET *unavailable = arg->second;
+  unsigned orig_tag = arg->first->tag ();
+  unsigned tag = INTVAL (t);
+  machine_mode mode = GET_MODE (arg->first->dest);
+
+  /* Can't collide with emptiness.  */
+  if (v.length () == 0)
+    return true;
+
+  /* Drop all aliased destination registers that result in the same
+     tag.  It is not necessary to drop all of them but we do anyway
+     because it is quicker than checking ranges.  */
+  if (TAG_UPDATE_DEST (tag, 0) == TAG_UPDATE_DEST (orig_tag, 0))
+    ignore_all_aliases (unavailable, mode, TAG_GET_DEST (tag));
+
+  return true;
+}
+
+
+/* Initialize and build a set of hard register numbers UNAVAILABLE to avoid for
+   renaming.  INSN_INFO is the original insn, TAG_MAP is the map of the list of
+   insns indexed by their tags, HEAD is the def/use chain head of the
+   destination register of the original insn.  The routine returns the super
+   class of register classes that may be used during the renaming.  */
+static enum reg_class
+init_unavailable (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head,
+		  HARD_REG_SET *unavailable)
+{
+  unsigned dest = head->regno;
+  enum reg_class super_class = NO_REGS;
+  machine_mode mode = GET_MODE (insn_info->dest);
+
+  CLEAR_HARD_REG_SET (*unavailable);
+
+  for (struct du_chain *tmp = head->first; tmp; tmp = tmp->next_use)
+    {
+      if (DEBUG_INSN_P (tmp->insn))
+	continue;
+
+      IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]);
+      super_class = reg_class_superunion[(int) super_class][(int) tmp->cl];
+    }
+
+  for (unsigned i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+    if (fixed_regs[i] || global_regs[i])
+      add_to_hard_reg_set (unavailable, mode, i);
+
+  arg_pair_t arg = arg_pair_t (insn_info, unavailable);
+
+  /* Exclude all registers that would lead to collisions with other loads.  */
+  tag_map.traverse <arg_pair_t *, check_colliding_tags> (&arg);
+
+  /* Finally, also ignore all aliases of the current reg.  */
+  ignore_all_aliases (unavailable, mode, dest & 0xf);
+
+  return super_class;
+}
+
+
+/* Find a suitable and available register and rename the chain of occurrences
+   of the register defined in the def/use chain headed by HEAD in which the
+   insn of INSN_INFO exists.  INSN_INFO and TAG_MAP are used to determine
+   which registers are unavailable due to a potential collision caused by the
+   rename.  The routine returns the new register number in case of a
+   successful rename or -1 to indicate failure.  */
+static int
+rename_chain (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head)
+{
+  unsigned dest_regno = head->regno;
+
+  if (head->cannot_rename || head->renamed)
+    return -1;
+
+  HARD_REG_SET unavailable;
+
+  enum reg_class super_class = init_unavailable (insn_info, tag_map, head,
+						 &unavailable);
+
+  unsigned new_regno = find_rename_reg (head, super_class, &unavailable,
+					dest_regno, false);
+
+  /* Attempt to rename as long as regrename doesn't just throw the same
+     register at us.  */
+  if (new_regno != dest_regno && regrename_do_replace (head, new_regno))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	  fprintf (dump_file, "\tInsn %d: Renamed %d to %d\n",
+		   INSN_UID (insn_info->insn), dest_regno, new_regno);
+
+      return new_regno;
+    }
+
+  return -1;
+}
+
+
+/* Return true if REGNO is not safe to rename.  */
+static bool
+unsafe_rename_p (unsigned regno)
+{
+  /* Avoid renaming registers used for argument passing and return value.  In
+     future we could be a little less conservative and walk through the basic
+     blocks to see if there are any call or syscall sites.  */
+  if (regno <= R8_REGNUM
+      || (regno >= V0_REGNUM && regno < V8_REGNUM))
+    return true;
+
+  /* Don't attempt to rename registers that may have specific meanings.  */
+  switch (regno)
+    {
+    case LR_REGNUM:
+    case HARD_FRAME_POINTER_REGNUM:
+    case FRAME_POINTER_REGNUM:
+    case STACK_POINTER_REGNUM:
+      return true;
+    }
+
+  return false;
+}
+
+
+/* Go through the def/use chains for the register and find the chain for this
+   insn to rename.  The function returns the hard register number in case of a
+   successful rename and -1 otherwise.  */
+static int
+rename_dest (tag_insn_info *insn_info, tag_map_t &tag_map)
+{
+  struct du_chain *chain = NULL;
+  du_head_p head = NULL;
+  int i;
+
+  unsigned dest_regno = REGNO (insn_info->dest);
+
+  if (unsafe_rename_p (dest_regno))
+    return -1;
+
+  /* Search the chain where this instruction is (one of) the root.  */
+  rtx_insn *insn = insn_info->insn;
+  operand_rr_info *dest_op_info = insn_rr[INSN_UID (insn)].op_info;
+
+  for (i = 0; i < dest_op_info->n_chains; i++)
+    {
+      /* The register tracked by this chain does not match the
+	 destination register of insn.  */
+      if (dest_op_info->heads[i]->regno != dest_regno)
+	continue;
+
+      head = dest_op_info->heads[i];
+      /* The chain was merged in another, find the new head.  */
+      if (!head->first)
+	head = regrename_chain_from_id (head->id);
+
+      for (chain = head->first; chain; chain = chain->next_use)
+	/* Found the insn in the chain, so try renaming the register in this
+	   chain.  */
+	if (chain->insn == insn)
+	  return rename_chain (insn_info, tag_map, head);
+    }
+
+  return -1;
+}
+
+
+/* Flag to track if the map has changed.  */
+static bool map_changed = false;
+
+/* The actual reallocation logic.  For each vector of collisions V, try to
+   resolve the collision by attempting to rename the destination register of
+   all but one of the loads.  This is a callback that is invoked for each
+   name-value pair (T, V) in TAG_MAP.  The function returns true whenever it
+   leaves TAG_MAP unchanged and false otherwise, so that traversal halts and
+   can be restarted on the modified map.  */
+bool
+avoid_collisions_1 (const rtx &t, insn_info_list_t *v, tag_map_t *tag_map)
+{
+  /* We need at least two loads to cause a tag collision; return unchanged.  */
+  if (v->length () < 2)
+    return true;
+
+  tag_insn_info *vec_start = v->pop ();
+  tag_insn_info *insn_info = vec_start;
+
+  /* Try to rename at least one register to reduce the collision.  If we
+     iterate all the way through without a successful rename, we end up
+     dropping one of the loads from the list.  This is fine because we want at
+     most one element to ensure that a subsequent rename attempt does not end
+     up worsening the collision.  */
+  do
+    {
+      int new_regno;
+
+      if ((new_regno = rename_dest (insn_info, *tag_map)) != -1)
+	{
+	  rtx new_tag = GEN_INT (TAG_UPDATE_DEST (INTVAL (t), new_regno));
+
+	  tag_map->get_or_insert (new_tag).safe_push (insn_info);
+	  df_set_regs_ever_live (new_regno, true);
+	  map_changed = true;
+	  return false;
+	}
+
+      v->safe_insert (0, insn_info);
+      insn_info = v->pop ();
+    }
+  while (insn_info != vec_start);
+
+  if (dump_file)
+    fprintf (dump_file, "\t>> Failed to rename destination in insn %d\n\t>>",
+	     INSN_UID (insn_info->insn));
+
+  /* Drop the last element and move on to the next tag.  */
+  delete insn_info;
+  return true;
+}
+
+
+/* For each set of collisions, attempt to rename the registers or insert a move
+   to avoid the collision.  We repeatedly traverse through TAG_MAP using
+   AVOID_COLLISIONS_1 trying to rename registers to avoid collisions until a
+   full traversal results in no change in the map.  */
+static void
+avoid_collisions (tag_map_t &tag_map)
+{
+  do
+    {
+      map_changed = false;
+      tag_map.traverse <tag_map_t *, avoid_collisions_1> (&tag_map);
+    }
+  while (map_changed);
+}
+
+
+/* Find the use-def chain in which INSN exists and then see if there is a
+   definition inside the loop and outside it.  We use this as a simple
+   approximation to determine whether the base register is an IV.  The basic
+   idea is to find INSN in the use-def chains for its base register and find
+   all definitions that reach it.  Of all these definitions, there should be at
+   least one definition that is a simple addition of a constant value, either
+   as a binary operation or a pre or post update.
+
+   The function returns true if the base register is estimated to be an IV.  */
+static bool
+iv_p (rtx_insn *insn, rtx reg, struct loop *loop)
+{
+  df_ref ause;
+  unsigned regno = REGNO (reg);
+
+  /* Ignore loads from the stack.  */
+  if (regno == SP_REGNUM)
+    return false;
+
+  for (ause = DF_REG_USE_CHAIN (regno); ause; ause = DF_REF_NEXT_REG (ause))
+    {
+      if (!DF_REF_INSN_INFO (ause)
+	  || !NONDEBUG_INSN_P (DF_REF_INSN (ause)))
+	continue;
+
+      if (insn != DF_REF_INSN (ause))
+	continue;
+
+      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+      df_ref def_rec;
+
+      FOR_EACH_INSN_INFO_DEF (def_rec, insn_info)
+	{
+	  rtx_insn *insn = DF_REF_INSN (def_rec);
+	  basic_block bb = BLOCK_FOR_INSN (insn);
+
+	  if (dominated_by_p (CDI_DOMINATORS, bb, loop->header)
+	      && bb->loop_father == loop)
+	    {
+	      if (recog_memoized (insn) < 0)
+		continue;
+
+	      rtx pat = PATTERN (insn);
+
+	      /* Prefetch or clobber; unlikely to be a constant stride.  The
+		 falkor software prefetcher tuning is pretty conservative, so
+		 its presence indicates that the access pattern is probably
+		 strided but most likely with an unknown stride size or a
+		 stride size that is quite large.  */
+	      if (GET_CODE (pat) != SET)
+		continue;
+
+	      rtx x = SET_SRC (pat);
+	      if (GET_CODE (x) == ZERO_EXTRACT
+		  || GET_CODE (x) == ZERO_EXTEND
+		  || GET_CODE (x) == SIGN_EXTEND)
+		x = XEXP (x, 0);
+
+	      /* Loading the value from memory; unlikely to be a constant
+		 stride.  */
+	      if (MEM_P (x))
+		continue;
+
+	      /* An increment or decrement by a constant MODE_SIZE amount or
+		 a binary expression with a constant operand is likely to be
+		 an IV.  */
+	      if (GET_CODE (x) == POST_INC
+		  || GET_CODE (x) == POST_DEC
+		  || GET_CODE (x) == PRE_INC
+		  || GET_CODE (x) == PRE_DEC)
+		return true;
+	      else if (BINARY_P (x)
+		       && (CONST_INT_P (XEXP (x, 0))
+			   || CONST_INT_P (XEXP (x, 1))))
+		{
+		  rtx stride = (CONST_INT_P (XEXP (x, 0))
+				? XEXP (x, 0) : XEXP (x, 1));
+
+		  /* Don't bother with very long strides because the prefetcher
+		     is unable to train on them anyway.  */
+		  if (INTVAL (stride) < MAX_PREFETCH_STRIDE)
+		    return true;
+		}
+	    }
+	}
+      return false;
+    }
+  return false;
+}
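+
+/* (Illustrative: a reaching definition of the base register such as, in
+   simplified RTL,
+
+     (set (reg:DI x1) (plus:DI (reg:DI x1) (const_int 64)))
+
+   matches the BINARY_P test above with a stride of 64, which is below
+   MAX_PREFETCH_STRIDE, so the base register is treated as an IV.)  */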
+
+
+/* Return true if SRC is a strided load in the LOOP, false otherwise.
+   If it is a strided load, set the BASE and OFFSET.  Also, if this is
+   a pre/post increment load, set PRE_POST to true.  */
+static bool
+valid_src_p (rtx src, rtx_insn *insn, struct loop *loop, bool *pre_post,
+	     rtx *base, rtx *offset, bool load_pair)
+{
+  subrtx_var_iterator::array_type array;
+  rtx x = NULL_RTX;
+
+  FOR_EACH_SUBRTX_VAR (iter, array, src, NONCONST)
+    if (MEM_P (*iter))
+      {
+	x = *iter;
+	break;
+      }
+
+  if (!x)
+    return false;
+
+  struct aarch64_address_info addr;
+  machine_mode mode = GET_MODE (x);
+
+  if (!aarch64_classify_address (&addr, XEXP (x, 0), mode, true))
+    return false;
+
+  unsigned regno = REGNO (addr.base);
+  if (global_regs[regno] || fixed_regs[regno])
+    return false;
+
+  if (addr.type == ADDRESS_REG_WB)
+    {
+      unsigned code = GET_CODE (XEXP (x, 0));
+
+      *pre_post = true;
+      *base = addr.base;
+
+      if (code == PRE_MODIFY || code == POST_MODIFY)
+	*offset = addr.offset;
+      else
+	{
+	  /* Writeback is only supported for fixed-width modes.  */
+	  unsigned int_offset = GET_MODE_SIZE (mode).to_constant ();
+
+	  /* For post-incremented load pairs we would increment the base twice
+	     over, so make that adjustment.  */
+	  if (load_pair && (code == POST_INC || code == POST_DEC))
+	    int_offset *= 2;
+
+	  *offset = GEN_INT (int_offset);
+	}
+      return true;
+    }
+  else if (addr.type == ADDRESS_REG_IMM || addr.type == ADDRESS_REG_REG)
+    {
+      /* Check if the load is strided.  */
+      if (!iv_p (insn, addr.base, loop))
+	return false;
+
+      *base = addr.base;
+      *offset = addr.offset;
+      return true;
+    }
+
+  return false;
+}
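+
+/* (Illustrative: for a post-incremented load pair such as
+
+     ldp x2, x3, [x5], #16
+
+   whose address is a POST_INC of x5 in DImode, the offset recorded above is
+   2 * 8 = 16 bytes, i.e. the total increment of the base rather than the
+   size of a single transfer.)  */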
+
+
+/* Return true if INSN is a strided load in LOOP.  If it is a strided load, set
+   the DEST, BASE and OFFSET.  Also, if this is a pre/post increment load, set
+   PRE_POST to true.
+
+   The routine does checks on the destination of the insn and depends on
+   VALID_SRC_P to check the source and fill in the BASE and OFFSET.  */
+static bool
+get_load_info (rtx_insn *insn, struct loop *loop, rtx *dest, rtx *base,
+	       rtx *offset, bool *pre_post, bool *ldp)
+{
+  if (!INSN_P (insn) || recog_memoized (insn) < 0)
+    return false;
+
+  rtx pat = PATTERN (insn);
+  unsigned code = GET_CODE (pat);
+  bool load_pair = (code == PARALLEL);
+
+  /* For a load pair we need only the first base and destination
+     registers.  We however need to ensure that our pre/post increment
+     offset is doubled; we do that in VALID_SRC_P.  */
+  if (load_pair)
+    {
+      pat = XVECEXP (pat, 0, 0);
+      code = GET_CODE (pat);
+    }
+
+  if (code != SET)
+    return false;
+
+  rtx dest_rtx = SET_DEST (pat);
+
+  if (!REG_P (dest_rtx))
+    return false;
+
+  unsigned regno = REGNO (dest_rtx);
+  machine_mode mode = GET_MODE (dest_rtx);
+  machine_mode inner_mode = GET_MODE_INNER (mode);
+
+  /* Falkor does not support SVE vectors.  */
+  if (!GET_MODE_SIZE (mode).is_constant ())
+    return false;
+
+  /* Ignore vector struct or lane loads.  */
+  if (GET_MODE_SIZE (mode).to_constant ()
+      != GET_MODE_SIZE (inner_mode).to_constant ())
+    return false;
+
+  /* The largest width we want to bother with is a load of a pair of
+     quad-words.  */
+  if ((GET_MODE_SIZE (mode).to_constant () << load_pair)
+      > GET_MODE_SIZE (OImode))
+    return false;
+
+  /* Ignore loads into the stack pointer because it is unlikely to be a
+     stream.  */
+  if (regno == SP_REGNUM)
+    return false;
+
+  if (valid_src_p (SET_SRC (pat), insn, loop, pre_post, base, offset,
+		   load_pair))
+    {
+      *dest = dest_rtx;
+      *ldp = load_pair;
+
+      return true;
+    }
+
+  return false;
+}
+
+
+/* Return whether INSN and CAND are in the same def/use chain.  */
+static bool
+in_same_chain (rtx_insn *insn, rtx_insn *cand, unsigned regno)
+{
+  struct du_chain *chain = NULL;
+  du_head_p head = NULL;
+  int i;
+
+  /* Search the chain where this instruction is (one of) the root.  */
+  operand_rr_info *op_info = insn_rr[INSN_UID (insn)].op_info;
+
+  for (i = 0; i < op_info->n_chains; i++)
+    {
+      /* The register tracked by this chain does not match the
+	 dest register of insn.  */
+      if (op_info->heads[i]->regno != regno)
+	continue;
+
+      head = op_info->heads[i];
+      /* The chain was merged in another, find the new head.  */
+      if (!head->first)
+	head = regrename_chain_from_id (head->id);
+
+      bool found_insn = false, found_cand = false;
+
+      for (chain = head->first; chain; chain = chain->next_use)
+	{
+	  rtx *loc = &SET_DEST (PATTERN (chain->insn));
+
+	  if (chain->loc != loc)
+	    continue;
+
+	  if (chain->insn == insn)
+	    found_insn = true;
+
+	  if (chain->insn == cand)
+	    found_cand = true;
+
+	  if (found_insn && found_cand)
+	    return true;
+	}
+    }
+
+  return false;
+}
+
+
+/* Callback function to traverse the tag map and drop loads that have the same
+   destination and are in the same chain of occurrence.  The routine always
+   returns true to allow traversal through all of TAG_MAP.  */
+bool
+single_dest_per_chain (const rtx &t ATTRIBUTE_UNUSED, insn_info_list_t *v,
+		       void *arg ATTRIBUTE_UNUSED)
+{
+  for (int i = v->length () - 1; i >= 1; i--)
+    {
+      tag_insn_info *insn_info = (*v)[i];
+
+      for (int j = v->length () - 2; j >= 0; j--)
+	{
+	  /* Filter out destinations in the same chain.  */
+	  if (in_same_chain (insn_info->insn, (*v)[j]->insn,
+			     REGNO (insn_info->dest)))
+	    {
+	      v->ordered_remove (j);
+	      i = v->length ();
+	      break;
+	    }
+	}
+    }
+
+  return true;
+}
+
+
+/* Callback invoked for each name-value pair (T, INSN_INFO) to dump the insn
+   list INSN_INFO for tag T.  */
+bool
+dump_insn_list (const rtx &t, const insn_info_list_t &insn_info,
+		void *unused ATTRIBUTE_UNUSED)
+{
+  gcc_assert (dump_file);
+  fprintf (dump_file, "Tag 0x%lx ::\n", INTVAL (t));
+
+  for (unsigned i = 0; i < insn_info.length (); i++)
+    dump_insn_slim (dump_file, insn_info[i]->insn);
+
+  fprintf (dump_file, "\n");
+
+  return true;
+}
+
+
+/* Record all loads in LOOP into TAG_MAP indexed by the falkor hardware
+   prefetcher memory tags.  */
+static void
+record_loads (tag_map_t &tag_map, struct loop *loop)
+{
+  rtx_insn *insn;
+  basic_block *body, bb;
+
+  body = get_loop_body (loop);
+
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+      FOR_BB_INSNS (bb, insn)
+	{
+	  rtx base = NULL_RTX;
+	  rtx dest = NULL_RTX;
+	  rtx offset = NULL_RTX;
+	  bool writeback = false;
+	  bool ldp = false;
+
+	  if (!INSN_P (insn) || DEBUG_INSN_P (insn))
+	    continue;
+
+	  if (get_load_info (insn, loop, &dest, &base, &offset, &writeback,
+			     &ldp))
+	    {
+	      tag_insn_info *i = new tag_insn_info (insn, dest, base, offset,
+						    writeback, ldp);
+	      rtx tag = GEN_INT (i->tag ());
+	      tag_map.get_or_insert (tag).safe_push (i);
+	    }
+	}
+    }
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Loop %d: Tag map generated.\n", loop->num);
+      tag_map.traverse <void *, dump_insn_list> (NULL);
+    }
+
+  /* Try to reduce the dataset before launching into the rename attempt.  Drop
+     destinations in the same collision chain that appear in the same def/use
+     chain, all as defs.  These chains will move together in a rename so
+     there's no point in keeping both in there.  */
+  tag_map.traverse <void *, single_dest_per_chain> (NULL);
+}
+
+
+/* Tag collision avoidance pass for Falkor.  The pass runs in two phases for
+   each loop; the first phase collects all loads that we consider as
+   interesting for renaming into a tag-indexed map of lists.  The second phase
+   renames the destination register of the loads in an attempt to spread out
+   the loads into different tags.  */
+void
+execute_tag_collision_avoidance ()
+{
+  struct loop *loop;
+
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_UD_CHAIN);
+  df_compute_regs_ever_live (true);
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  regrename_init (true);
+  regrename_analyze (NULL);
+
+  compute_bb_for_insn ();
+  calculate_dominance_info (CDI_DOMINATORS);
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+    {
+      tag_map_t tag_map (512);
+
+      record_loads (tag_map, loop);
+      avoid_collisions (tag_map);
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Loop %d: Completed rename.\n", loop->num);
+	  tag_map.traverse <void *, dump_insn_list> (NULL);
+	}
+      tag_map.traverse <void *, free_insn_info> (NULL);
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+  regrename_finish ();
+}
+
+
+const pass_data pass_data_tag_collision_avoidance =
+{
+  RTL_PASS, /* type */
+  "tag_collision_avoidance", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+
+class pass_tag_collision_avoidance : public rtl_opt_pass
+{
+public:
+  pass_tag_collision_avoidance (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_tag_collision_avoidance, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return ((aarch64_tune_params.extra_tuning_flags
+	       & AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS)
+	      && optimize >= 2);
+    }
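+
+  /* In practice the gate above fires only for CPUs whose tuning sets
+     AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, which at the time of this patch is
+     expected to be the falkor/qdf24xx tunings, i.e. -mcpu=falkor at -O2 or
+     higher.  */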
+
+  virtual unsigned int execute (function *)
+    {
+      execute_tag_collision_avoidance ();
+      return 0;
+    }
+
+}; // class pass_tag_collision_avoidance
+
+
+/* Create a new pass instance.  */
+rtl_opt_pass *
+make_pass_tag_collision_avoidance (gcc::context *ctxt)
+{
+  return new pass_tag_collision_avoidance (ctxt);
+}
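+
+/* Note that this only creates the pass object; the pass is expected to be
+   registered with the pass manager separately, presumably through an
+   INSERT_PASS_* entry in aarch64-passes.def.  */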