diff gcc/config/sparc/niagara7.md @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents
children 84e7813d76e9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gcc/config/sparc/niagara7.md	Fri Oct 27 22:46:09 2017 +0900
@@ -0,0 +1,205 @@
+;; Scheduling description for Niagara-7
+;;   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; The automaton to which the Niagara-7 issue-slot units are assigned.
+(define_automaton "niagara7_0")
+
+;; The S4 core has a dual-issue queue.  This queue is divided into two
+;; slots.  One instruction can be issued each cycle to each slot, and
+;; up to 2 instructions are committed each cycle.  Each slot serves
+;; several execution units, as depicted below:
+;;
+;;
+;;                 n7_slot0 - Integer unit.
+;;                          - Load/Store unit.
+;; === QUEUE ==>
+;;
+;;                 n7_slot1 - Integer unit.
+;;                          - Branch unit.
+;;                          - Floating-point and graphics unit.
+;;                          - 3-cycle crypto unit.
+
+;; One CPU unit per issue slot, both assigned to the niagara7_0 automaton.
+(define_cpu_unit "n7_slot0,n7_slot1" "niagara7_0")
+
+;; Some instructions stall the pipeline and prevent any other
+;; instruction from being issued in the same cycle.  We assume the
+;; same for multi-instruction insns.
+
+;; Reserves both slots in the same cycle ("+" means simultaneously), so
+;; no other insn can be issued alongside one that uses this reservation.
+(define_reservation "n7_single_issue" "n7_slot0 + n7_slot1")
+
+;; Insns of these types take both issue slots for the cycle in which
+;; they issue; their latency is one cycle.
+(define_insn_reservation "n7_single" 1
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "multi,savew,flushw,trap"))
+  "n7_single_issue")
+
+;; Simple integer-unit operations: 1-cycle latency, and either issue
+;; slot can take them.
+(define_insn_reservation "n7_integer" 1
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
+  "(n7_slot0 | n7_slot1)")
+
+;; Flushing the instruction memory takes 27 cycles.  The insn may issue
+;; in either slot; the 26 trailing "nothing" cycles reserve no unit and
+;; only pad the reservation out to the 27-cycle latency.
+
+(define_insn_reservation "n7_iflush" 27
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "iflush"))
+  "(n7_slot0 | n7_slot1), nothing*26")
+
+;; Integer multiplies run in the integer unit with a 12-cycle latency.
+;; The array*, edge* and pdistn instructions are modelled the same way.
+(define_insn_reservation "n7_imul" 12
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
+  "(n7_slot0 | n7_slot1), nothing*11")
+
+;; Integer divides also run in the integer unit, with a latency of
+;; 35 cycles.
+(define_insn_reservation "n7_idiv" 35
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "idiv"))
+  "(n7_slot0 | n7_slot1), nothing*34")
+
+;; Both integer and floating-point load instructions have a latency of
+;; 5 cycles, and execute in the load/store unit in slot0.
+;;
+;; The prefetch instruction also executes in the load/store unit, but
+;; its latency is only 1 cycle.
+
+;; Matches types "fpload" and "sload", plus type "load" with subtype
+;; "regular".  Issues from slot0 only; the four "nothing" cycles that
+;; follow reserve no unit and cover the rest of the 5-cycle latency.
+(define_insn_reservation "n7_load" 5
+  (and (eq_attr "cpu" "niagara7")
+       (ior (eq_attr "type" "fpload,sload")
+            (and (eq_attr "type" "load")
+                 (eq_attr "subtype" "regular"))))
+  "n7_slot0, nothing*4")
+
+;; Prefetches (type "load", subtype "prefetch") issue from slot0 and
+;; have a latency of a single cycle.
+(define_insn_reservation "n7_prefetch" 1
+  (and (eq_attr "cpu" "niagara7")
+       (and (eq_attr "type" "load")
+            (eq_attr "subtype" "prefetch")))
+  "n7_slot0")
+
+;; Integer and floating-point stores alike: 1-cycle latency, executed
+;; by the load/store unit, which is reached through slot0.
+(define_insn_reservation "n7_store" 1
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "store,fpstore"))
+  "n7_slot0")
+
+;; Control transfers go to the Branch Unit, which is served by slot1.
+(define_insn_reservation "n7_cti" 1
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
+  "n7_slot1")
+
+;; Many instructions executing in the Floating-point and Graphics unit
+;; in slot1 feature a latency of 11 cycles.  This reservation covers the
+;; listed FP/graphics types, plus "fga" insns whose subtype is "fpu" or
+;; "maxmin".
+
+(define_insn_reservation "n7_fp" 11
+  (and (eq_attr "cpu" "niagara7")
+       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
+            (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "fpu,maxmin"))))
+  "n7_slot1, nothing*10")
+
+;; Floating-point division and floating-point square-root instructions
+;; have high latencies.  They execute in the floating-point and
+;; graphics unit in the slot1.
+
+
+;; Types "fpdivs" and "fpsqrts" (presumably the single-precision
+;; divide and square root -- confirm against sparc.md): 24-cycle
+;; latency, issued from slot1.
+(define_insn_reservation "n7_fpdivs" 24
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "fpdivs,fpsqrts"))
+  "n7_slot1, nothing*23")
+
+;; Types "fpdivd" and "fpsqrtd": 37-cycle latency, issued from slot1.
+(define_insn_reservation "n7_fpdivd" 37
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "fpdivd,fpsqrtd"))
+  "n7_slot1, nothing*36")
+
+;; SIMD VIS instructions executing in the Floating-point and graphics
+;; unit (FPG) in slot1 usually have a latency of either 11 or 12
+;; cycles.
+;;
+;; However, the latency for many instructions is only 3 cycles if the
+;; consumer can also be executed in 3 cycles.  We model this with a
+;; bypass.  In these cases the instructions are executed in the
+;; 3-cycle crypto unit which also serves slot1.
+
+;; VIS insns with an 11-cycle latency, issued from slot1.  The
+;; alternatives are spelled as explicitly nested two-operand "ior"s.
+(define_insn_reservation "n7_vis_11cycles" 11
+  (and (eq_attr "cpu" "niagara7")
+       (ior (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "addsub64,other"))
+            (ior (and (eq_attr "type" "vismv")
+                      (eq_attr "subtype" "double,single"))
+                 (and (eq_attr "type" "visl")
+                      (eq_attr "subtype" "double,single")))))
+  "n7_slot1, nothing*10")
+
+;; VIS insns with a 12-cycle latency, issued from slot1.  The
+;; alternatives are spelled as explicitly nested two-operand "ior"s.
+(define_insn_reservation "n7_vis_12cycles" 12
+  (and (eq_attr "cpu" "niagara7")
+       (ior (eq_attr "type" "bmask,viscmp")
+            (ior (and (eq_attr "type" "fga")
+                      (eq_attr "subtype" "cmask"))
+                 (and (eq_attr "type" "vismv")
+                      (eq_attr "subtype" "movstouw")))))
+  "n7_slot1, nothing*11")
+
+;; Latency drops to 3 cycles when both producer and consumer use a
+;; reservation whose name matches "n7_vis_*"; in this file those are
+;; n7_vis_11cycles and n7_vis_12cycles.
+(define_bypass 3 "n7_vis_*" "n7_vis_*")
+
+;; Some other VIS instructions have a latency of 12 cycles, and won't
+;; be executed in the 3-cycle crypto pipe.  The reservation name does
+;; not start with "n7_vis_", so a bypass keyed on that prefix does not
+;; apply to it.
+
+(define_insn_reservation "n7_lzd" 12
+  (and (eq_attr "cpu" "niagara7")
+       (ior (eq_attr "type" "lzd")
+            (and (eq_attr "type" "gsr")
+                 (eq_attr "subtype" "alignaddr"))))
+  "n7_slot1, nothing*11")
+
+;; A couple of VIS instructions feature very low latencies in the M7.
+
+;; "vismv" insns with subtype "movxtod": 1-cycle latency, slot1.
+(define_insn_reservation "n7_single_vis" 1
+  (and (eq_attr "cpu" "niagara7")
+       (and (eq_attr "type" "vismv")
+            (eq_attr "subtype" "movxtod")))
+  "n7_slot1")
+
+;; "vismv" insns with subtype "movdtox": 2-cycle latency, slot1.
+(define_insn_reservation "n7_double_vis" 2
+  (and (eq_attr "cpu" "niagara7")
+       (and (eq_attr "type" "vismv")
+            (eq_attr "subtype" "movdtox")))
+  "n7_slot1, nothing")
+
+;; Reading and writing to the gsr register takes a high number of
+;; cycles that is not documented in the PRM.  Let's use the same value
+;; as the M8.
+;;
+;; As with the other reservations in this file, the issue cycle plus
+;; the trailing "nothing" cycles add up to the latency (1 + 69 = 70).
+
+(define_insn_reservation "n7_gsr_reg" 70
+  (and (eq_attr "cpu" "niagara7")
+       (eq_attr "type" "gsr")
+       (eq_attr "subtype" "reg"))
+  "n7_slot1, nothing*69")