Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/powerpcspe/cell.md @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | |
children | 84e7813d76e9 |
comparison
equal
deleted
inserted
replaced
68:561a7518be6b | 111:04ced10e8804 |
---|---|
1 ;; Scheduling description for cell processor. | |
2 ;; Copyright (C) 2001-2017 Free Software Foundation, Inc. | |
3 ;; Contributed by Sony Computer Entertainment, Inc., | |
4 | |
5 | |
6 ;; This file is free software; you can redistribute it and/or modify it under | |
7 ;; the terms of the GNU General Public License as published by the Free | |
8 ;; Software Foundation; either version 3 of the License, or (at your option) | |
9 ;; any later version. | |
10 | |
11 ;; This file is distributed in the hope that it will be useful, but WITHOUT | |
12 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 ;; for more details. | |
15 | |
16 ;; You should have received a copy of the GNU General Public License | |
17 ;; along with GCC; see the file COPYING3. If not see | |
18 ;; <http://www.gnu.org/licenses/>. | |
19 | |
20 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf) | |
21 | |
22 ;; BE Architecture *DD3.0 and DD3.1* | |
23 ;; This file simulates the PPU processor unit backend of the pipeline, manual P24. | |
24 ;; manual P27, stall and flush points | |
25 ;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program | |
26 ;; order, the grouped address are aligned by 8 | |
27 ;; This file only simulates the single-thread situation | |
28 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit, | |
29 ;; and load/store unit) | |
30 ;; VSU executes all scalar floating points insn(a float unit), | |
31 ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point) | |
32 | |
33 ;; Dual issue combination | |
34 | |
35 ;; FXU LSU BR VMX VMX | |
36 ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls) | |
37 ;;FXU X | |
38 ;;LSU X X X | |
39 ;;BR X | |
40 ;;VMX(sx,cx,vsu_fp,fp_arith) X | |
41 ;;VMX(perm,vsu_ls, fp_ls) X | |
42 ;; X marks an illegal combination. | |
43 | |
44 ;; Dual issue exceptions: | |
45 ;;(1) non-pipelined FXU instr in slot 0 | |
46 ;;(2) non-pipelined FPU inst in slot 0 | |
47 ;; CSI instr(context-synchronizing insn) | |
48 ;; Microcode insn | |
49 | |
50 ;; BRU unit: bru(none register stall), bru_cr(cr register stall) | |
51 ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex), | |
52 ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for | |
53 ;; nonpipelined simulation | |
54 ;; micro insns will stall at least 7 cycles to get the first instr from ROM, | |
55 ;; micro instructions are not dual issued. | |
56 | |
57 ;; slot0 is older than slot1 | |
58 ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall | |
59 | |
60 ;; There are different stall points | |
61 ;; IB2, only stall one thread if stall here, so try to stall here as much as | |
62 ;; we can | |
63 ;; condition(1) insert nop, OR and ORI instruction form | |
64 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or | |
65 ;; CR0-access while stdcx, or stwcx | |
66 ;; IS2 stall ;; Page91 for details | |
67 ;; VQ8 stall | |
68 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to | |
69 ;; the vsu issue queue | |
70 | |
71 ;;(define_automaton "cellxu") | |
72 | |
73 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu") | |
74 | |
75 ;; ndfa | |
76 (define_automaton "cellxu,cellvsu,cellbru,cell_mis") | |
77 | |
78 (define_cpu_unit "fxu_cell,lsu_cell" "cellxu") | |
79 (define_cpu_unit "bru_cell" "cellbru") | |
80 (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu") | |
81 | |
82 (define_cpu_unit "slot0,slot1" "cell_mis") | |
83 | |
84 (absence_set "slot0" "slot1") | |
85 | |
86 (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell") | |
87 (define_reservation "slot01" "slot0|slot1") | |
88 | |
89 | |
90 ;; Load/store | |
91 ;; lmw, lswi, lswx are only generated when optimizing for space, MC, | |
92 ;; these instr are not simulated | |
93 (define_insn_reservation "cell-load" 2 | |
94 (and (eq_attr "type" "load") | |
95 (eq_attr "sign_extend" "no") | |
96 (eq_attr "update" "no") | |
97 (eq_attr "cpu" "cell")) | |
98 "slot01,lsu_cell") | |
99 | |
100 ;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs, | |
101 ;; if with 32bytes alignment, CMC | |
102 (define_insn_reservation "cell-load-ux" 2 | |
103 (and (eq_attr "type" "load") | |
104 (eq_attr "sign_extend" "no") | |
105 (eq_attr "update" "yes") | |
106 (eq_attr "cpu" "cell")) | |
107 "slot01,fxu_cell+lsu_cell") | |
108 | |
109 ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown | |
110 ;; 11/7, 11/8, 11/12 | |
111 (define_insn_reservation "cell-load-ext" 2 | |
112 (and (eq_attr "type" "load") | |
113 (eq_attr "sign_extend" "yes") | |
114 (eq_attr "cpu" "cell")) | |
115 "slot01,fxu_cell+lsu_cell") | |
116 | |
117 ;;lfs,lfsx,lfd,lfdx, 1 cycle | |
118 (define_insn_reservation "cell-fpload" 1 | |
119 (and (eq_attr "type" "fpload") | |
120 (eq_attr "update" "no") | |
121 (eq_attr "cpu" "cell")) | |
122 "vsu2_cell+lsu_cell+slot01") | |
123 | |
124 ;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr) | |
125 (define_insn_reservation "cell-fpload-update" 1 | |
126 (and (eq_attr "type" "fpload") | |
127 (eq_attr "update" "yes") | |
128 (eq_attr "cpu" "cell")) | |
129 "fxu_cell+vsu2_cell+lsu_cell+slot01") | |
130 | |
131 (define_insn_reservation "cell-vecload" 2 | |
132 (and (eq_attr "type" "vecload") | |
133 (eq_attr "cpu" "cell")) | |
134 "slot01,vsu2_cell+lsu_cell") | |
135 | |
136 ;;st? stw(MC) | |
137 (define_insn_reservation "cell-store" 1 | |
138 (and (eq_attr "type" "store") | |
139 (eq_attr "update" "no") | |
140 (eq_attr "cpu" "cell")) | |
141 "lsu_cell+slot01") | |
142 | |
143 ;;stdux, stdu, (hardware breaks into store and add) 2 for update reg | |
144 (define_insn_reservation "cell-store-update" 1 | |
145 (and (eq_attr "type" "store") | |
146 (eq_attr "update" "yes") | |
147 (eq_attr "cpu" "cell")) | |
148 "fxu_cell+lsu_cell+slot01") | |
149 | |
150 (define_insn_reservation "cell-fpstore" 1 | |
151 (and (eq_attr "type" "fpstore") | |
152 (eq_attr "update" "no") | |
153 (eq_attr "cpu" "cell")) | |
154 "vsu2_cell+lsu_cell+slot01") | |
155 | |
156 (define_insn_reservation "cell-fpstore-update" 1 | |
157 (and (eq_attr "type" "fpstore") | |
158 (eq_attr "update" "yes") | |
159 (eq_attr "cpu" "cell")) | |
160 "vsu2_cell+fxu_cell+lsu_cell+slot01") | |
161 | |
162 (define_insn_reservation "cell-vecstore" 1 | |
163 (and (eq_attr "type" "vecstore") | |
164 (eq_attr "cpu" "cell")) | |
165 "vsu2_cell+lsu_cell+slot01") | |
166 | |
167 ;; Integer latency is 2 cycles | |
168 (define_insn_reservation "cell-integer" 2 | |
169 (and (ior (eq_attr "type" "integer,trap,cntlz,isel") | |
170 (and (eq_attr "type" "add,logical,shift,exts") | |
171 (eq_attr "dot" "no")) | |
172 (and (eq_attr "type" "insert") | |
173 (eq_attr "size" "64"))) | |
174 (eq_attr "cpu" "cell")) | |
175 "slot01,fxu_cell") | |
176 | |
177 ;; Two integer latency is 4 cycles | |
178 (define_insn_reservation "cell-two" 4 | |
179 (and (eq_attr "type" "two") | |
180 (eq_attr "cpu" "cell")) | |
181 "slot01,fxu_cell,fxu_cell*2") | |
182 | |
183 ;; Three integer latency is 6 cycles | |
184 (define_insn_reservation "cell-three" 6 | |
185 (and (eq_attr "type" "three") | |
186 (eq_attr "cpu" "cell")) | |
187 "slot01,fxu_cell,fxu_cell*4") | |
188 | |
189 ;; rlwimi, alter cr0 | |
190 (define_insn_reservation "cell-insert" 2 | |
191 (and (eq_attr "type" "insert") | |
192 (eq_attr "size" "32") | |
193 (eq_attr "cpu" "cell")) | |
194 "slot01,fxu_cell") | |
195 | |
196 ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0 | |
197 (define_insn_reservation "cell-cmp" 1 | |
198 (and (eq_attr "type" "cmp") | |
199 (eq_attr "cpu" "cell")) | |
200 "fxu_cell+slot01") | |
201 | |
202 ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm | |
203 (define_insn_reservation "cell-fast-cmp" 2 | |
204 (and (eq_attr "type" "add,logical,shift,exts") | |
205 (eq_attr "dot" "yes") | |
206 (eq_attr "cpu" "cell") | |
207 (eq_attr "cell_micro" "not")) | |
208 "slot01,fxu_cell") | |
209 | |
210 (define_insn_reservation "cell-cmp-microcoded" 9 | |
211 (and (eq_attr "type" "add,logical,shift,exts") | |
212 (eq_attr "dot" "yes") | |
213 (eq_attr "cpu" "cell") | |
214 (eq_attr "cell_micro" "always")) | |
215 "slot0+slot1,fxu_cell,fxu_cell*7") | |
216 | |
217 ;; mulld | |
218 (define_insn_reservation "cell-lmul" 15 | |
219 (and (eq_attr "type" "mul") | |
220 (eq_attr "dot" "no") | |
221 (eq_attr "size" "64") | |
222 (eq_attr "cpu" "cell")) | |
223 "slot1,nonpipeline,nonpipeline*13") | |
224 | |
225 ;; mulld. is microcoded | |
226 (define_insn_reservation "cell-lmul-cmp" 22 | |
227 (and (eq_attr "type" "mul") | |
228 (eq_attr "dot" "yes") | |
229 (eq_attr "size" "64") | |
230 (eq_attr "cpu" "cell")) | |
231 "slot0+slot1,nonpipeline,nonpipeline*20") | |
232 | |
233 ;; mulli, 6 cycles | |
234 (define_insn_reservation "cell-imul23" 6 | |
235 (and (eq_attr "type" "mul") | |
236 (eq_attr "size" "8,16") | |
237 (eq_attr "cpu" "cell")) | |
238 "slot1,nonpipeline,nonpipeline*4") | |
239 | |
240 ;; mullw, 9 | |
241 (define_insn_reservation "cell-imul" 9 | |
242 (and (eq_attr "type" "mul") | |
243 (eq_attr "dot" "no") | |
244 (eq_attr "size" "32") | |
245 (eq_attr "cpu" "cell")) | |
246 "slot1,nonpipeline,nonpipeline*7") | |
247 | |
248 ;; divide | |
249 (define_insn_reservation "cell-idiv" 32 | |
250 (and (eq_attr "type" "div") | |
251 (eq_attr "size" "32") | |
252 (eq_attr "cpu" "cell")) | |
253 "slot1,nonpipeline,nonpipeline*30") | |
254 | |
255 (define_insn_reservation "cell-ldiv" 64 | |
256 (and (eq_attr "type" "div") | |
257 (eq_attr "size" "64") | |
258 (eq_attr "cpu" "cell")) | |
259 "slot1,nonpipeline,nonpipeline*62") | |
260 | |
261 ;;mflr and mfctr are pipelined | |
262 (define_insn_reservation "cell-mfjmpr" 1 | |
263 (and (eq_attr "type" "mfjmpr") | |
264 (eq_attr "cpu" "cell")) | |
265 "slot01+bru_cell") | |
266 | |
267 ;;mtlr and mtctr, | |
268 ;;mtspr fully pipelined | |
269 (define_insn_reservation "cell-mtjmpr" 1 | |
270 (and (eq_attr "type" "mtjmpr") | |
271 (eq_attr "cpu" "cell")) | |
272 "bru_cell+slot01") | |
273 | |
274 ;; Branches | |
275 ;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency | |
276 ;; bcctr, bcctrl, latency 2, actually adjust by be to 4 | |
277 (define_insn_reservation "cell-branch" 1 | |
278 (and (eq_attr "type" "branch") | |
279 (eq_attr "cpu" "cell")) | |
280 "bru_cell+slot1") | |
281 | |
282 (define_insn_reservation "cell-branchreg" 1 | |
283 (and (eq_attr "type" "jmpreg") | |
284 (eq_attr "cpu" "cell")) | |
285 "bru_cell+slot1") | |
286 | |
287 ;; cr hazard | |
288 ;; page 90, special cases for CR hazard, only one instr can access cr per cycle | |
289 ;; if insn reads CR following a stwcx, pipeline stall till stwcx finish | |
290 (define_insn_reservation "cell-crlogical" 1 | |
291 (and (eq_attr "type" "cr_logical,delayed_cr") | |
292 (eq_attr "cpu" "cell")) | |
293 "bru_cell+slot01") | |
294 | |
295 ;; mfcrf and mfcr is about 34 cycles and nonpipelined | |
296 (define_insn_reservation "cell-mfcr" 34 | |
297 (and (eq_attr "type" "mfcrf,mfcr") | |
298 (eq_attr "cpu" "cell")) | |
299 "slot1,nonpipeline,nonpipeline*32") | |
300 | |
301 ;; mtcrf (1 field) | |
302 (define_insn_reservation "cell-mtcrf" 1 | |
303 (and (eq_attr "type" "mtcr") | |
304 (eq_attr "cpu" "cell")) | |
305 "fxu_cell+slot01") | |
306 | |
307 ; Basic FP latency is 10 cycles, throughput is 1/cycle | |
308 (define_insn_reservation "cell-fp" 10 | |
309 (and (eq_attr "type" "fp,fpsimple,dmul") | |
310 (eq_attr "cpu" "cell")) | |
311 "slot01,vsu1_cell,vsu1_cell*8") | |
312 | |
313 (define_insn_reservation "cell-fpcompare" 1 | |
314 (and (eq_attr "type" "fpcompare") | |
315 (eq_attr "cpu" "cell")) | |
316 "vsu1_cell+slot01") | |
317 | |
318 ;; sdiv throughput 1/74, not pipelined but only in the FPU | |
319 (define_insn_reservation "cell-sdiv" 74 | |
320 (and (eq_attr "type" "sdiv,ddiv") | |
321 (eq_attr "cpu" "cell")) | |
322 "slot1,nonpipeline,nonpipeline*72") | |
323 | |
324 ;; fsqrt throughput 1/84, not pipelined but only in the FPU | |
325 (define_insn_reservation "cell-sqrt" 84 | |
326 (and (eq_attr "type" "ssqrt,dsqrt") | |
327 (eq_attr "cpu" "cell")) | |
328 "slot1,nonpipeline,nonpipeline*82") | |
329 | |
330 ; VMX | |
331 (define_insn_reservation "cell-vecsimple" 4 | |
332 (and (eq_attr "type" "vecsimple,veclogical,vecmove") | |
333 (eq_attr "cpu" "cell")) | |
334 "slot01,vsu1_cell,vsu1_cell*2") | |
335 | |
336 ;; mult, div, madd | |
337 (define_insn_reservation "cell-veccomplex" 10 | |
338 (and (eq_attr "type" "veccomplex") | |
339 (eq_attr "cpu" "cell")) | |
340 "slot01,vsu1_cell,vsu1_cell*8") | |
341 | |
342 ;; TODO: add support for recording instructions | |
343 (define_insn_reservation "cell-veccmp" 4 | |
344 (and (eq_attr "type" "veccmp,veccmpfx") | |
345 (eq_attr "cpu" "cell")) | |
346 "slot01,vsu1_cell,vsu1_cell*2") | |
347 | |
348 (define_insn_reservation "cell-vecfloat" 12 | |
349 (and (eq_attr "type" "vecfloat") | |
350 (eq_attr "cpu" "cell")) | |
351 "slot01,vsu1_cell,vsu1_cell*10") | |
352 | |
353 (define_insn_reservation "cell-vecperm" 4 | |
354 (and (eq_attr "type" "vecperm") | |
355 (eq_attr "cpu" "cell")) | |
356 "slot01,vsu2_cell,vsu2_cell*2") | |
357 | |
358 ;; New for 4.2, syncs | |
359 | |
360 (define_insn_reservation "cell-sync" 11 | |
361 (and (eq_attr "type" "sync") | |
362 (eq_attr "cpu" "cell")) | |
363 "slot01,lsu_cell,lsu_cell*9") | |
364 | |
365 (define_insn_reservation "cell-isync" 11 | |
366 (and (eq_attr "type" "isync") | |
367 (eq_attr "cpu" "cell")) | |
368 "slot01,lsu_cell,lsu_cell*9") | |
369 | |
370 (define_insn_reservation "cell-load_l" 11 | |
371 (and (eq_attr "type" "load_l") | |
372 (eq_attr "cpu" "cell")) | |
373 "slot01,lsu_cell,lsu_cell*9") | |
374 | |
375 (define_insn_reservation "cell-store_c" 11 | |
376 (and (eq_attr "type" "store_c") | |
377 (eq_attr "cpu" "cell")) | |
378 "slot01,lsu_cell,lsu_cell*9") | |
379 | |
380 ;; RAW register dependency | |
381 | |
382 ;; addi r3, r3, 1 | |
383 ;; lw r4,offset(r3) | |
384 ;; there is a 5 cycle delay for r3 bypassing | |
385 ;; there is a 5 cycle delay for a dependent load after a load | |
386 (define_bypass 5 "cell-integer" "cell-load") | |
387 (define_bypass 5 "cell-integer" "cell-load-ext") | |
388 (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext") | |
389 | |
390 ;; there is a 6 cycle delay after a fp compare until you can use the cr. | |
391 (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical") | |
392 | |
393 ;; VXU float RAW | |
394 (define_bypass 11 "cell-vecfloat" "cell-vecfloat") | |
395 | |
396 ;; VXU and FPU | |
397 (define_bypass 6 "cell-veccomplex" "cell-vecsimple") | |
398 ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg") | |
399 (define_bypass 3 "cell-vecfloat" "cell-veccomplex") | |
400 ; this is not correct, | |
401 ;; this is a stall in general and not dependent on result | |
402 (define_bypass 13 "cell-vecstore" "cell-fpstore") | |
403 ; this is not correct, this can never be true, not dependent on result | |
404 (define_bypass 7 "cell-fp" "cell-fpload") | |
405 ;; vsu1 should avoid writing to the same target register as vsu2 insn | |
406 ;; within 12 cycles. | |
407 | |
408 ;; WAW hazard | |
409 | |
410 ;; the target of VSU estimate should not be reused within 10 dispatch groups | |
411 ;; the target of VSU float should not be reused within 8 dispatch groups | |
412 ;; the target of VSU complex should not be reused within 5 dispatch groups | |
413 ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups | |
414 | |
415 ;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at | |
416 ;; ex4 stage(10 cycles) | |
417 (define_bypass 10 "cell-mtjmpr" "cell-branchreg") | |
418 | |
419 ;;Things that are not simulated: | |
420 ;; update instruction, update address gpr are not simulated | |
421 ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float | |
422 ;; insns | |
423 |