/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include <spu_mfcio.h>
#include <spu_internals.h>
#include <spu_intrinsics.h>
#include <spu_cache.h>

extern unsigned long long __ea_local_store;
extern char __cache_tag_array_size;

#define LINE_SIZE 128
#define TAG_MASK (LINE_SIZE - 1)

#define WAYS 4
#define SET_MASK ((int) &__cache_tag_array_size - LINE_SIZE)

#define CACHE_LINES ((int) &__cache_tag_array_size /		\
		     sizeof (struct __cache_tag_array) * WAYS)
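
/* Worked example (illustrative sizes): for a 64 KB, 4-way cache the
   linker defines __cache_tag_array_size so that its *address* equals
   the tag array size, 16384 bytes, i.e. one 128-byte tag entry per
   set for 128 sets.  Then SET_MASK = 16384 - 128 = 0x3F80, and
   CACHE_LINES = 16384 / 128 * 4 = 512 lines of 128 bytes (64 KB).  */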

struct __cache_tag_array
{
  unsigned int tag_lo[WAYS];
  unsigned int tag_hi[WAYS];
  void *base[WAYS];
  int reserved[WAYS];
  vector unsigned short dirty_bits[WAYS];
};
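
/* Layout note: tag_lo lives at byte offset 0 of each entry, tag_hi at
   16, base at 32, reserved at 48, and dirty_bits at 64-127, for 128
   bytes total (== LINE_SIZE).  These are the literal displacements
   (0, 16, 32, 48) and way offsets (0x40-0x70) used by the si_lqd and
   si_stqd accesses in __cache_fetch_dirty below.  */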

extern struct __cache_tag_array __cache_tag_array[];
extern char __cache[];

/* To keep the code a little cleaner and to avoid 64/32-bit ifdefs
   all over the place, we use macros.  */

#ifdef __EA64__
typedef unsigned long long addr;

#define CHECK_TAG(_entry, _way, _tag)			\
  ((_entry)->tag_lo[(_way)] == ((_tag) & 0xFFFFFFFF)	\
   && (_entry)->tag_hi[(_way)] == ((_tag) >> 32))

#define GET_TAG(_entry, _way) \
  ((unsigned long long)(_entry)->tag_hi[(_way)] << 32	\
   | (unsigned long long)(_entry)->tag_lo[(_way)])

#define SET_TAG(_entry, _way, _tag)			\
  (_entry)->tag_lo[(_way)] = (_tag) & 0xFFFFFFFF;	\
  (_entry)->tag_hi[(_way)] = (_tag) >> 32

#else /*__EA32__*/
typedef unsigned long addr;

#define CHECK_TAG(_entry, _way, _tag)			\
  ((_entry)->tag_lo[(_way)] == (_tag))

#define GET_TAG(_entry, _way)				\
  ((_entry)->tag_lo[(_way)])

#define SET_TAG(_entry, _way, _tag)			\
  (_entry)->tag_lo[(_way)] = (_tag)

#endif

/* In GET_ENTRY, we cast away the high 32 bits, as the set is
   selected entirely by the low 32.  */

#define GET_ENTRY(_addr)						   \
  ((struct __cache_tag_array *)						   \
   si_to_uint (si_a (si_and (si_from_uint ((unsigned int) (addr) (_addr)), \
			     si_from_uint (SET_MASK)),			   \
	       si_from_uint ((unsigned int) __cache_tag_array))))
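
/* A scalar sketch of what the si_* sequence above computes:

     entry = (struct __cache_tag_array *)
	     (((unsigned int) (addr) (_addr) & SET_MASK)
	      + (unsigned int) __cache_tag_array);

   The masked address is a multiple of 128 and each tag entry is
   exactly 128 bytes, so the result points at the address' set.  */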

#define GET_CACHE_LINE(_addr, _way) \
  ((void *) (__cache + ((_addr) & SET_MASK) * WAYS) + ((_way) * LINE_SIZE))
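
/* ((_addr) & SET_MASK) is already set_index * LINE_SIZE, so scaling by
   WAYS gives set_index * (WAYS * LINE_SIZE), the start of the set's
   four data lines in __cache; (_way) * LINE_SIZE selects one line.  */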

#define CHECK_DIRTY(_vec) (si_to_uint (si_orx ((qword) (_vec))))
#define SET_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] = 1)
#define CHECK_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] == 1)

#define LS_FLAG 0x80000000
#define SET_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] |= LS_FLAG)
#define CHECK_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] & LS_FLAG)
#define GET_LRU(_entry, _way) ((_entry)->reserved[(_way)] & ~LS_FLAG)
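
/* Example: reserved == 0x80000005 describes a line that aliases the
   local store (bit 31, LS_FLAG) with an LRU age of 5 (bits 30..0).  */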

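/* The MFC DMA tag used for fills and non-atomic writebacks; 32 is a
   sentinel meaning "not yet reserved", since valid tags are 0-31.  */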
static int dma_tag = 32;

static void
__cache_evict_entry (struct __cache_tag_array *entry, int way)
{
  addr tag = GET_TAG (entry, way);

  if (CHECK_DIRTY (entry->dirty_bits[way]) && !CHECK_IS_LS (entry, way))
    {
#ifdef NONATOMIC
      /* Non-atomic writes.  */
      unsigned int oldmask, mach_stat;
      char *line = ((void *) 0);

      /* Enter critical section.  */
      mach_stat = spu_readch (SPU_RdMachStat);
      spu_idisable ();

      /* Issue DMA request.  */
      line = GET_CACHE_LINE (entry->tag_lo[way], way);
      mfc_put (line, tag, LINE_SIZE, dma_tag, 0, 0);

      /* Wait for DMA completion.  */
      oldmask = mfc_read_tag_mask ();
      mfc_write_tag_mask (1 << dma_tag);
      mfc_read_tag_status_all ();
      mfc_write_tag_mask (oldmask);

      /* Leave critical section.  */
      if (__builtin_expect (mach_stat & 1, 0))
	spu_ienable ();
#else
      /* Allocate a buffer large enough that we know it contains 128
         bytes that are 128-byte aligned (for DMA).  */

      char buffer[LINE_SIZE + 127];
      qword *buf_ptr = (qword *) (((unsigned int) (buffer) + 127) & ~127);
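      /* Round-up idiom: e.g. if buffer starts at 0x1008, then
         (0x1008 + 127) & ~127 == 0x1080, the first 128-byte boundary
         inside the buffer, with LINE_SIZE bytes still in bounds.  */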
      qword *line = GET_CACHE_LINE (entry->tag_lo[way], way);
      qword bits;
      unsigned int mach_stat;

      /* Enter critical section.  */
      mach_stat = spu_readch (SPU_RdMachStat);
      spu_idisable ();

      do
	{
	  /* We atomically read the current memory into a buffer,
	     modify the dirty bytes in the buffer, and write it
	     back.  If the writeback fails, loop and try again.  */

	  mfc_getllar (buf_ptr, tag, 0, 0);
	  mfc_read_atomic_status ();

	  /* The method we're using writes 16 dirty bytes into the
	     buffer at a time using fsmb, which in turn uses the least
	     significant 16 bits of word 0, so we load the bits and
	     rotate so that the first bit of the bitmap lands in the
	     first bit fsmb will use.  */

	  bits = (qword) entry->dirty_bits[way];
	  bits = si_rotqbyi (bits, -2);

	  /* si_fsmb creates the mask of dirty bytes; si_selb uses it
	     to pick the dirty bytes out of the cache line.  */
	  buf_ptr[0] = si_selb (buf_ptr[0], line[0], si_fsmb (bits));

	  /* Rotate to the next 16-byte section of the cache line.  */
	  bits = si_rotqbyi (bits, 2);

	  buf_ptr[1] = si_selb (buf_ptr[1], line[1], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[2] = si_selb (buf_ptr[2], line[2], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[3] = si_selb (buf_ptr[3], line[3], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[4] = si_selb (buf_ptr[4], line[4], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[5] = si_selb (buf_ptr[5], line[5], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[6] = si_selb (buf_ptr[6], line[6], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[7] = si_selb (buf_ptr[7], line[7], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);

	  mfc_putllc (buf_ptr, tag, 0, 0);
	}
      while (mfc_read_atomic_status ());

      /* Leave critical section.  */
      if (__builtin_expect (mach_stat & 1, 0))
	spu_ienable ();
#endif
    }

  /* In any case, mark the entry empty by setting the lo tag to 1.  */
  SET_EMPTY (entry, way);
  entry->dirty_bits[way] = (vector unsigned short) si_from_uint (0);
}

void
__cache_evict (__ea void *ea)
{
  addr tag = (addr) ea & ~TAG_MASK;
  struct __cache_tag_array *entry = GET_ENTRY (ea);
  int i = 0;

  /* Cycle through all the ways the address could occupy and evict
     the way whose tag matches, if any.  */

  for (i = 0; i < WAYS; i++)
    if (CHECK_TAG (entry, i, tag))
      __cache_evict_entry (entry, i);
}

static void *
__cache_fill (int way, addr tag)
{
  unsigned int oldmask, mach_stat;
  char *line = ((void *) 0);

  /* Reserve our DMA tag.  */
  if (dma_tag == 32)
    dma_tag = mfc_tag_reserve ();

  /* Enter critical section.  */
  mach_stat = spu_readch (SPU_RdMachStat);
  spu_idisable ();

  /* Issue DMA request.  */
  line = GET_CACHE_LINE (tag, way);
  mfc_get (line, tag, LINE_SIZE, dma_tag, 0, 0);

  /* Wait for DMA completion.  */
  oldmask = mfc_read_tag_mask ();
  mfc_write_tag_mask (1 << dma_tag);
  mfc_read_tag_status_all ();
  mfc_write_tag_mask (oldmask);

  /* Leave critical section.  */
  if (__builtin_expect (mach_stat & 1, 0))
    spu_ienable ();

  return (void *) line;
}

static void
__cache_miss (__ea void *ea, struct __cache_tag_array *entry, int way)
{

  addr tag = (addr) ea & ~TAG_MASK;
  unsigned int lru = 0;
  int i = 0;
  int idx = 0;

  /* If way >= WAYS, there are no empty slots, so we must evict
     the least recently used entry.  */
  if (way >= WAYS)
    {
      for (i = 0; i < WAYS; i++)
	{
	  if (GET_LRU (entry, i) > lru)
	    {
	      lru = GET_LRU (entry, i);
	      idx = i;
	    }
	}
      __cache_evict_entry (entry, idx);
      way = idx;
    }

  /* Set the empty entry's tag and fill its cache line.  */

  SET_TAG (entry, way, tag);
  entry->reserved[way] = 0;

  /* Check whether the address is just an effective address within the
     SPU's local store.  Because the LS is not 256 KB aligned, we
     cannot simply mask and compare; we must check the whole range.  */

  if ((addr) ea >= (addr) __ea_local_store
      && (addr) ea < (addr) (__ea_local_store + 0x40000))
    {
      SET_IS_LS (entry, way);
      entry->base[way] =
	(void *) ((unsigned int) ((addr) ea -
				  (addr) __ea_local_store) & ~0x7f);
    }
  else
    {
      entry->base[way] = __cache_fill (way, tag);
    }
}

void *
__cache_fetch_dirty (__ea void *ea, int n_bytes_dirty)
{
#ifdef __EA64__
  unsigned int tag_hi;
  qword etag_hi;
#endif
  unsigned int tag_lo;
  struct __cache_tag_array *entry;

  qword etag_lo;
  qword equal;
  qword bit_mask;
  qword way;

  /* In this first chunk, we merely set up the entry pointer and the tag.  */

  entry = GET_ENTRY (ea);

#ifndef __EA64__
  tag_lo =
    si_to_uint (si_andc
		(si_shufb
		 (si_from_uint ((addr) ea), si_from_uint (0),
		  si_from_uint (0x00010203)), si_from_uint (TAG_MASK)));
#else
  tag_lo =
    si_to_uint (si_andc
		(si_shufb
		 (si_from_ullong ((addr) ea), si_from_uint (0),
		  si_from_uint (0x04050607)), si_from_uint (TAG_MASK)));

  tag_hi =
    si_to_uint (si_shufb
		(si_from_ullong ((addr) ea), si_from_uint (0),
		 si_from_uint (0x00010203)));
#endif
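
  /* The si_shufb patterns select the word of the EA holding each tag
     half: 0x00010203 picks bytes 0-3 (the whole EA on EA32, the high
     word on EA64) and 0x04050607 picks bytes 4-7 (the low word on
     EA64).  si_andc then clears the line-offset bits from the low
     half.  */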

  /* Increment LRU in reserved bytes.  */
  si_stqd (si_ai (si_lqd (si_from_ptr (entry), 48), 1),
	   si_from_ptr (entry), 48);

missreturn:
  /* Check if the entry's lo_tag is equal to the address' lo_tag.  */
  etag_lo = si_lqd (si_from_ptr (entry), 0);
  equal = si_ceq (etag_lo, si_from_uint (tag_lo));
#ifdef __EA64__
  /* And the high tag too.  */
  etag_hi = si_lqd (si_from_ptr (entry), 16);
  equal = si_and (equal, (si_ceq (etag_hi, si_from_uint (tag_hi))));
#endif

  if (si_to_uint (si_orx (equal)) == 0)
    goto misshandler;

  if (n_bytes_dirty)
    {
      /* way = 0x40,0x50,0x60,0x70 for each way, which is also the
         offset of the appropriate dirty bits.  */
      way = si_shli (si_clz (si_gbb (equal)), 2);
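
      /* si_gbb gathers the least significant bit of each byte of
         EQUAL into bits 16-31 of the preferred word, so a match in
         way w makes si_clz return 16 + 4 * w; shifting left by 2
         gives 64 + 16 * w, the byte offset of dirty_bits[w].  */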

      /* To create the bit_mask, we set it to all 1s (uint -1), then
         shift it left by (LINE_SIZE - n_bytes_dirty) bits.  */

      bit_mask = si_from_uint (-1);

      bit_mask =
	si_shlqby (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) / 8));

      bit_mask =
	si_shlqbi (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) % 8));

      /* Rotate it around to the correct offset.  */
      bit_mask =
	si_rotqby (bit_mask,
		   si_from_uint (-1 * ((addr) ea & TAG_MASK) / 8));

      bit_mask =
	si_rotqbi (bit_mask,
		   si_from_uint (-1 * ((addr) ea & TAG_MASK) % 8));
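
      /* Worked example: with n_bytes_dirty == 4 and (ea & TAG_MASK)
         == 8, the shifts leave four 1-bits at the top of the qword
         and the rotates move them right by 8 bit positions, marking
         bytes 8-11 of the line dirty.  */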

      /* Update the dirty bits.  */
      si_stqx (si_or (si_lqx (si_from_ptr (entry), way), bit_mask),
	       si_from_ptr (entry), way);
    }

  /* We've definitely found the right entry; set the LRU (reserved)
     field to 0, preserving the LS flag (MSB).  */

  si_stqd (si_andc
	   (si_lqd (si_from_ptr (entry), 48),
	    si_and (equal, si_from_uint (~(LS_FLAG)))),
	   si_from_ptr (entry), 48);

  return (void *)
    si_to_uint (si_a
		(si_orx
		 (si_and (si_lqd (si_from_ptr (entry), 32), equal)),
		 si_from_uint (((unsigned int) (addr) ea) & TAG_MASK)));

misshandler:
  equal = si_ceqi (etag_lo, 1);
  __cache_miss (ea, entry, (si_to_uint (si_clz (si_gbb (equal))) - 16) >> 2);
  goto missreturn;
}

void *
__cache_fetch (__ea void *ea)
{
  return __cache_fetch_dirty (ea, 0);
}
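
#if 0
/* Usage sketch (illustrative only, never compiled): a program built
   with -mea32 or -mea64 could pin an effective address into the
   software cache, write through the local copy, and flush.  The
   function and variable names here are hypothetical.  */
static void
example (__ea int *remote)
{
  int *local = __cache_fetch_dirty (remote, sizeof (int));
  *local = 42;		/* The write lands in the cached copy.  */
  __cache_flush ();	/* Write all dirty lines back to memory.  */
}
#endif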

void
__cache_touch (__ea void *ea __attribute__ ((unused)))
{
  /* NO-OP for now.  */
}

void __cache_flush (void) __attribute__ ((destructor));
void
__cache_flush (void)
{
  struct __cache_tag_array *entry = __cache_tag_array;
  unsigned int i;
  int j;

  /* Cycle through each cache entry and evict all used ways.  */

  for (i = 0; i < CACHE_LINES / WAYS; i++)
    {
      for (j = 0; j < WAYS; j++)
	if (!CHECK_EMPTY (entry, j))
	  __cache_evict_entry (entry, j);

      entry++;
    }
}