0
|
1 ;; Scheduling description for cell processor.
|
|
2 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
|
|
3 ;; Free Software Foundation, Inc.
|
|
4 ;; Contributed by Sony Computer Entertainment, Inc.,
|
|
5
|
|
6
|
|
7 ;; This file is free software; you can redistribute it and/or modify it under
|
|
8 ;; the terms of the GNU General Public License as published by the Free
|
|
9 ;; Software Foundation; either version 3 of the License, or (at your option)
|
|
10 ;; any later version.
|
|
11
|
|
12 ;; This file is distributed in the hope that it will be useful, but WITHOUT
|
|
13 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
14 ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
15 ;; for more details.
|
|
16
|
|
17 ;; You should have received a copy of the GNU General Public License
|
|
18 ;; along with GCC; see the file COPYING3. If not see
|
|
19 ;; <http://www.gnu.org/licenses/>.
|
|
20
|
|
21 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
|
|
22
|
|
23 ;; BE Architecture *DD3.0 and DD3.1*
|
|
;; This file simulates the back end of the PPU pipeline (manual p. 24;
;; see manual p. 27 for stall and flush points).
;; IU, XU, VSU: the dispatcher decodes and dispatches 2 insns per cycle in
;; program order; the grouped addresses are aligned by 8.
;; This file only simulates the single-thread situation.
|
|
29 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
|
|
30 ;; and load/store unit)
|
|
31 ;; VSU executes all scalar floating points insn(a float unit),
|
|
32 ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
|
|
33
|
|
34 ;; Dual issue combination
|
|
35
|
|
36 ;; FXU LSU BR VMX VMX
|
|
37 ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
|
|
38 ;;FXU X
|
|
39 ;;LSU X X X
|
|
40 ;;BR X
|
|
41 ;;VMX(sx,cx,vsu_fp,fp_arth) X
|
|
42 ;;VMX(perm,vsu_ls, fp_ls) X
|
|
43 ;; X are illegal combination.
|
|
44
|
|
45 ;; Dual issue exceptions:
|
|
;;(1) non-pipelined FXU instr in slot 0
;;(2) non-pipelined FPU inst in slot 0
;;    CSI instr (context-synchronizing insn)
;;    Microcode insn
|
|
50
|
|
51 ;; BRU unit: bru(none register stall), bru_cr(cr register stall)
|
|
52 ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
|
|
53 ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
|
|
54 ;; nonpipelined simulation
|
|
;; Microcoded insns stall at least 7 cycles to get the first instr from ROM;
;; microcoded instructions are not dual issued.
|
|
57
|
|
58 ;; slot0 is older than slot1
|
|
59 ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
|
|
60
|
|
;; There are different stall points:
|
|
62 ;; IB2, only stall one thread if stall here, so try to stall here as much as
|
|
63 ;; we can
|
|
64 ;; condition(1) insert nop, OR and ORI instruction form
|
|
65 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
|
|
66 ;; CR0-access while stdcx, or stwcx
|
|
67 ;; IS2 stall ;; Page91 for details
|
|
68 ;; VQ8 stall
|
|
69 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
|
|
70 ;; the vsu issue queue
|
|
71
|
|
72 ;;(define_automaton "cellxu")
|
|
73
|
|
74 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
|
|
75
|
|
76 ;; ndfa
|
|
;; Four automata: one per functional-unit group, plus "cell_mis" for the
;; issue slots.
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")

;; Functional units, each attached to the automaton that tracks it.
(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
(define_cpu_unit "bru_cell" "cellbru")
(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")

;; The two issue slots.
(define_cpu_unit "slot0,slot1" "cell_mis")

;; slot0 cannot be reserved on a cycle where slot1 is already reserved.
(absence_set "slot0" "slot1")

;; "nonpipeline" claims the FXU, the LSU and both VSU units simultaneously.
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
;; "slot01" takes either issue slot.
(define_reservation "slot01" "slot0|slot1")
|
|
89
|
|
90
|
|
;; Load/store
;; lmw, lswi and lswx (MC) are only generated when optimizing for space;
;; these instrs are not simulated.
;; Plain loads: latency 2; an issue slot, then the LSU for one cycle.
(define_insn_reservation "cell-load" 2
  (and (eq_attr "type" "load")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell")
|
|
98
|
|
;; ldux, ldu, lbzux, lbzu: hardware breaks them down into two instrs;
;; if with 32-byte alignment, CMC.
;; Modelled as an issue slot followed by one cycle on both the FXU and LSU.
(define_insn_reservation "cell-load-ux" 2
  (and (eq_attr "type" "load_ux,load_u")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")
|
|
105
|
|
;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux: MC, latency unknown
;; (11/7, 11/8, 11/12).
;; Modelled like the update-form loads: latency 2, FXU and LSU together.
(define_insn_reservation "cell-load-ext" 2
  (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")
|
|
112
|
|
;; lfs, lfsx, lfd, lfdx: 1 cycle.
;; The issue slot, the LSU and vsu2 are all taken on the same cycle.
(define_insn_reservation "cell-fpload" 1
  (and (eq_attr "type" "fpload")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")
|
|
118
|
|
;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr).
;; Only the update forms belong here.  Plain "fpload" is intentionally not
;; listed: it is already claimed by "cell-fpload", and the reservation
;; chosen for an insn that matches two define_insn_reservation conditions
;; is not defined.
(define_insn_reservation "cell-fpload-update" 1
  (and (eq_attr "type" "fpload_u,fpload_ux")
       (eq_attr "cpu" "cell"))
  "fxu_cell+vsu2_cell+lsu_cell+slot01")
|
|
124
|
|
;; Vector loads: latency 2; an issue slot, then vsu2 and the LSU together
;; for one cycle.
(define_insn_reservation "cell-vecload" 2
  (and (eq_attr "type" "vecload")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell+lsu_cell")
|
|
129
|
|
;; st?, stw (MC)
;; Plain stores: latency 1; the LSU and an issue slot on the same cycle.
(define_insn_reservation "cell-store" 1
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "cell"))
  "lsu_cell+slot01")
|
|
135
|
|
;; stdux, stdu: hardware breaks these into a store and an add; 2 for the
;; update reg.
;; The FXU is reserved alongside the LSU and the issue slot.
(define_insn_reservation "cell-store-update" 1
  (and (eq_attr "type" "store_ux,store_u")
       (eq_attr "cpu" "cell"))
  "fxu_cell+lsu_cell+slot01")
|
|
141
|
|
;; Floating-point stores: latency 1; vsu2, the LSU and an issue slot on the
;; same cycle.
(define_insn_reservation "cell-fpstore" 1
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")
|
|
146
|
|
;; Update-form floating-point stores: as "cell-fpstore", with the FXU
;; reserved as well.
(define_insn_reservation "cell-fpstore-update" 1
  (and (eq_attr "type" "fpstore_ux,fpstore_u")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+fxu_cell+lsu_cell+slot01")
|
|
151
|
|
;; Vector stores: latency 1; vsu2, the LSU and an issue slot on the same
;; cycle.
(define_insn_reservation "cell-vecstore" 1
  (and (eq_attr "type" "vecstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")
|
|
156
|
|
;; Integer latency is 2 cycles: an issue slot, then one cycle on the FXU.
(define_insn_reservation "cell-integer" 2
  (and (eq_attr "type" "integer,insert_dword,shift,trap,\
                        var_shift_rotate,cntlz,exts")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")
|
|
163
|
|
;; Two-integer latency is 4 cycles: an issue slot, then three consecutive
;; cycles on the FXU.
(define_insn_reservation "cell-two" 4
  (and (eq_attr "type" "two")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell*3")
|
|
169
|
|
;; Three-integer latency is 6 cycles: an issue slot, then five consecutive
;; cycles on the FXU.
(define_insn_reservation "cell-three" 6
  (and (eq_attr "type" "three")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell*5")
|
|
175
|
|
;; rlwimi, alter cr0.
;; Latency 2: an issue slot, then one cycle on the FXU.
(define_insn_reservation "cell-insert" 2
  (and (eq_attr "type" "insert_word")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")
|
|
181
|
|
;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0.
;; Latency 1: the FXU and an issue slot are taken on the same cycle.
(define_insn_reservation "cell-cmp" 1
  (and (eq_attr "type" "cmp")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")
|
|
187
|
|
;; add, addo, sub, subo, alter cr0, rldcli, rlwinm.
;; Only the forms whose cell_micro attribute is "not" are matched here.
(define_insn_reservation "cell-fast-cmp" 2
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
                             var_delayed_compare")
            (eq_attr "cpu" "cell"))
       (eq_attr "cell_micro" "not"))
  "slot01,fxu_cell")
|
|
195
|
|
;; The same compare types when cell_micro is "always": latency 9.  Both
;; issue slots are taken on the first cycle, then the FXU is held for
;; eight consecutive cycles.
(define_insn_reservation "cell-cmp-microcoded" 9
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
                             var_delayed_compare")
            (eq_attr "cpu" "cell"))
       (eq_attr "cell_micro" "always"))
  "slot0+slot1,fxu_cell*8")
|
|
202
|
|
;; mulld
;; Latency 15: issued from slot1, then every "nonpipeline" unit is held
;; for 14 consecutive cycles.
(define_insn_reservation "cell-lmul" 15
  (and (eq_attr "type" "lmul")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*14")
|
|
208
|
|
;; mulld. is microcoded.
;; Latency 22: both issue slots on the first cycle, then every
;; "nonpipeline" unit is held for 21 consecutive cycles.
(define_insn_reservation "cell-lmul-cmp" 22
  (and (eq_attr "type" "lmul_compare")
       (eq_attr "cpu" "cell"))
  "slot0+slot1,nonpipeline*21")
|
|
214
|
|
;; mulli, 6 cycles.
;; Issued from slot1, then every "nonpipeline" unit is held for 5
;; consecutive cycles.
(define_insn_reservation "cell-imul23" 6
  (and (eq_attr "type" "imul2,imul3")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*5")
|
|
220
|
|
;; mullw, 9 cycles.
;; Issued from slot1, then every "nonpipeline" unit is held for 8
;; consecutive cycles.
(define_insn_reservation "cell-imul" 9
  (and (eq_attr "type" "imul")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*8")
|
|
226
|
|
;; divide
;; Integer divide: latency 32; issued from slot1, then every "nonpipeline"
;; unit is held for 31 consecutive cycles.
(define_insn_reservation "cell-idiv" 32
  (and (eq_attr "type" "idiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*31")
|
|
232
|
|
;; Insns of type "ldiv": latency 64; issued from slot1, then every
;; "nonpipeline" unit is held for 63 consecutive cycles.
(define_insn_reservation "cell-ldiv" 64
  (and (eq_attr "type" "ldiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*63")
|
|
237
|
|
;; mflr and mfctr are pipelined.
;; Latency 1: an issue slot and the branch unit on the same cycle.
(define_insn_reservation "cell-mfjmpr" 1
  (and (eq_attr "type" "mfjmpr")
       (eq_attr "cpu" "cell"))
  "slot01+bru_cell")
|
|
243
|
|
;; mtlr and mtctr; mtspr is fully pipelined.
;; Latency 1: the branch unit and an issue slot on the same cycle.
(define_insn_reservation "cell-mtjmpr" 1
  (and (eq_attr "type" "mtjmpr")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")
|
|
250
|
|
;; Branches
;; b, ba, bl, bla: an unconditional branch always predicts correctly, so
;; latency is n/a.
;; bcctr, bcctrl: latency 2, actually adjusted by the BE to 4.
;; Branches are reserved on slot1 only, together with the branch unit.
(define_insn_reservation "cell-branch" 1
  (and (eq_attr "type" "branch")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")
|
|
258
|
|
;; Insns of type "jmpreg": latency 1; the branch unit and slot1 on the
;; same cycle.
(define_insn_reservation "cell-branchreg" 1
  (and (eq_attr "type" "jmpreg")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")
|
|
263
|
|
;; CR hazard
;; Page 90, special cases for the CR hazard: only one instr can access the
;; CR per cycle.  If an insn reads the CR following a stwcx, the pipeline
;; stalls until the stwcx finishes.
(define_insn_reservation "cell-crlogical" 1
  (and (eq_attr "type" "cr_logical,delayed_cr")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")
|
|
271
|
|
;; mfcrf and mfcr take about 34 cycles and are nonpipelined.
;; Issued from slot1, then every "nonpipeline" unit is held for 33
;; consecutive cycles.
(define_insn_reservation "cell-mfcr" 34
  (and (eq_attr "type" "mfcrf,mfcr")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*33")
|
|
277
|
|
;; mtcrf (1 field).
;; Latency 1: the FXU and an issue slot on the same cycle.
(define_insn_reservation "cell-mtcrf" 1
  (and (eq_attr "type" "mtcr")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")
|
|
283
|
|
;; Basic FP latency is 10 cycles, throughput is 1/cycle.
;; NOTE(review): the reservation below holds vsu1_cell for nine consecutive
;; cycles after issue, which does not model a 1/cycle throughput; confirm
;; that this is intended.
(define_insn_reservation "cell-fp" 10
  (and (eq_attr "type" "fp,dmul")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell*9")
|
|
289
|
|
;; Floating-point compares: latency 1; vsu1 and an issue slot on the same
;; cycle.
(define_insn_reservation "cell-fpcompare" 1
  (and (eq_attr "type" "fpcompare")
       (eq_attr "cpu" "cell"))
  "vsu1_cell+slot01")
|
|
294
|
|
;; sdiv throughput is 1/74; not pipelined, but only in the FPU.
;; Modelled as slot1, then every "nonpipeline" unit held for 73
;; consecutive cycles.
(define_insn_reservation "cell-sdiv" 74
  (and (eq_attr "type" "sdiv,ddiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*73")
|
|
300
|
|
;; fsqrt throughput is 1/84; not pipelined, but only in the FPU.
;; Modelled as slot1, then every "nonpipeline" unit held for 83
;; consecutive cycles.
(define_insn_reservation "cell-sqrt" 84
  (and (eq_attr "type" "ssqrt,dsqrt")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline*83")
|
|
306
|
|
;; VMX
;; Simple vector ops: latency 4; an issue slot, then vsu1 for three
;; consecutive cycles.
(define_insn_reservation "cell-vecsimple" 4
  (and (eq_attr "type" "vecsimple")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell*3")
|
|
312
|
|
;; mult, div, madd
;; Complex vector ops: latency 10; an issue slot, then vsu1 for nine
;; consecutive cycles.
(define_insn_reservation "cell-veccomplex" 10
  (and (eq_attr "type" "veccomplex")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell*9")
|
|
318
|
|
;; TODO: add support for recording instructions
;; Vector compares: latency 4; an issue slot, then vsu1 for three
;; consecutive cycles.
(define_insn_reservation "cell-veccmp" 4
  (and (eq_attr "type" "veccmp")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell*3")
|
|
324
|
|
;; Vector float ops: latency 12; an issue slot, then vsu1 for eleven
;; consecutive cycles.
(define_insn_reservation "cell-vecfloat" 12
  (and (eq_attr "type" "vecfloat")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell*11")
|
|
329
|
|
;; Vector permutes: latency 4; an issue slot, then vsu2 for three
;; consecutive cycles.
(define_insn_reservation "cell-vecperm" 4
  (and (eq_attr "type" "vecperm")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell*3")
|
|
334
|
|
;; New for 4.2, syncs

;; sync: latency 11; an issue slot, then the LSU for ten consecutive
;; cycles.
(define_insn_reservation "cell-sync" 11
  (and (eq_attr "type" "sync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell*10")
|
|
341
|
|
;; isync: latency 11; an issue slot, then the LSU for ten consecutive
;; cycles.
(define_insn_reservation "cell-isync" 11
  (and (eq_attr "type" "isync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell*10")
|
|
346
|
|
;; Insns of type "load_l": latency 11; an issue slot, then the LSU for ten
;; consecutive cycles.
(define_insn_reservation "cell-load_l" 11
  (and (eq_attr "type" "load_l")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell*10")
|
|
351
|
|
;; Insns of type "store_c": latency 11; an issue slot, then the LSU for
;; ten consecutive cycles.
(define_insn_reservation "cell-store_c" 11
  (and (eq_attr "type" "store_c")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell*10")
|
|
356
|
|
;; RAW register dependency

;;   addi r3, r3, 1
;;   lw r4,offset(r3)
;; There is a 5 cycle delay for r3 bypassing, and a 5 cycle delay for a
;; dependent load after a load.
(define_bypass 5 "cell-integer" "cell-load,cell-load-ext")
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
|
|
366
|
|
;; There is a 6 cycle delay after an fp compare until the CR can be used.
(define_bypass 6 "cell-fpcompare"
  "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
|
|
369
|
|
;; VXU float RAW: 11 cycles from one vector float op to a dependent one.
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")
|
|
372
|
|
;; VXU and FPU
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; Disabled.  Note that the vector-compare reservation in this file is
;; named "cell-veccmp", not "cell-veccompare".
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
;; This is not correct: it is a stall in general and does not depend on
;; the result.
(define_bypass 13 "cell-vecstore" "cell-fpstore")
;; This is not correct: it can never be true, as it does not depend on the
;; result.
(define_bypass 7 "cell-fp" "cell-fpload")
|
|
382 ;; vsu1 should avoid writing to the same target register as vsu2 insn
|
|
383 ;; within 12 cycles.
|
|
384
|
|
385 ;; WAW hazard
|
|
386
|
|
;; The target of a VSU estimate should not be reused within 10 dispatch groups.
;; The target of a VSU float should not be reused within 8 dispatch groups.
;; The target of a VSU complex should not be reused within 5 dispatch groups.
;; An FP load should not reuse an FPU arithmetic target within 6 dispatch
;; groups.
|
|
391
|
|
;; mtctr followed by bcctr/bcctrl: the branch-target ctr register shadow is
;; updated at the ex4 stage (10 cycles).
(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
|
|
395
|
|
396 ;;Things are not simulated:
|
|
397 ;; update instruction, update address gpr are not simulated
|
|
398 ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
|
|
399 ;; insns
|
|
400
|