Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/rs6000/cell.md @ 0:a06113de4d67
first commit
author | kent <kent@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 17 Jul 2009 14:47:48 +0900 |
parents | |
children | 77e2b8dfacca |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a06113de4d67 |
---|---|
1 ;; Scheduling description for cell processor. | |
2 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 | |
3 ;; Free Software Foundation, Inc. | |
4 ;; Contributed by Sony Computer Entertainment, Inc., | |
5 | |
6 | |
7 ;; This file is free software; you can redistribute it and/or modify it under | |
8 ;; the terms of the GNU General Public License as published by the Free | |
9 ;; Software Foundation; either version 3 of the License, or (at your option) | |
10 ;; any later version. | |
11 | |
12 ;; This file is distributed in the hope that it will be useful, but WITHOUT | |
13 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 ;; for more details. | |
16 | |
17 ;; You should have received a copy of the GNU General Public License | |
18 ;; along with GCC; see the file COPYING3. If not see | |
19 ;; <http://www.gnu.org/licenses/>. | |
20 | |
21 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf) | |
22 | |
23 ;; BE Architecture *DD3.0 and DD3.1* | |
24 ;; This file simulates the PPU processor unit backend of the pipeline, manual P24. | |
25 ;; manual P27, stall and flush points | |
26 ;; IU, XU, VSU, dispatcher decodes and dispatches 2 insns per cycle in program | |
27 ;; order, the grouped addresses are aligned by 8 | |
28 ;; This file only simulates the single-thread situation | |
29 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit, | |
30 ;; and load/store unit) | |
31 ;; VSU executes all scalar floating point insns (a float unit), | |
32 ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point) | |
33 | |
34 ;; Dual issue combination | |
35 | |
36 ;; FXU LSU BR VMX VMX | |
37 ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls) | |
38 ;;FXU X | |
39 ;;LSU X X X | |
40 ;;BR X | |
41 ;;VMX(sx,cx,vsu_fp,fp_arith) X | |
42 ;;VMX(perm,vsu_ls, fp_ls) X | |
43 ;; X marks an illegal combination. | |
44 | |
45 ;; Dual issue exceptions: | |
46 ;;(1) non-pipelined FXU instr in slot 0 | |
47 ;;(2) non-pipelined FPU inst in slot 0 | |
48 ;; CSI instr(context-synchronizing insn) | |
49 ;; Microcode insn | |
50 | |
51 ;; BRU unit: bru(none register stall), bru_cr(cr register stall) | |
52 ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex), | |
53 ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for | |
54 ;; nonpipelined simulation | |
55 ;; microcoded insns will stall at least 7 cycles to get the first instr from ROM, | |
56 ;; micro instructions are not dual issued. | |
57 | |
58 ;; slot0 is older than slot1 | |
59 ;; non-pipelined insns need to be in slot1 to avoid a 1-cycle stall | |
60 | |
61 ;; There are different stall points | |
62 ;; IB2, only stall one thread if stall here, so try to stall here as much as | |
63 ;; we can | |
64 ;; condition(1) insert nop, OR and ORI instruction form | |
65 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or | |
66 ;; CR0-access while stdcx, or stwcx | |
67 ;; IS2 stall ;; Page91 for details | |
68 ;; VQ8 stall | |
69 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to | |
70 ;; the vsu issue queue | |
71 | |
72 ;;(define_automaton "cellxu") | |
73 | |
74 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu") | |
75 | |
76 ;; ndfa | |
77 (define_automaton "cellxu,cellvsu,cellbru,cell_mis") | |
78 | |
79 (define_cpu_unit "fxu_cell,lsu_cell" "cellxu") | |
80 (define_cpu_unit "bru_cell" "cellbru") | |
81 (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu") | |
82 | |
83 (define_cpu_unit "slot0,slot1" "cell_mis") | |
84 | |
85 (absence_set "slot0" "slot1") | |
86 | |
87 (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell") | |
88 (define_reservation "slot01" "slot0|slot1") | |
89 | |
90 | |
91 ;; Load/store | |
92 ;; lmw, lswi, lswx are only generated when optimizing for space, MC, | |
93 ;; these instr are not simulated | |
94 (define_insn_reservation "cell-load" 2 | |
95 (and (eq_attr "type" "load") | |
96 (eq_attr "cpu" "cell")) | |
97 "slot01,lsu_cell") | |
98 | |
99 ;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs, | |
100 ;; if with 32bytes alignment, CMC | |
101 (define_insn_reservation "cell-load-ux" 2 | |
102 (and (eq_attr "type" "load_ux,load_u") | |
103 (eq_attr "cpu" "cell")) | |
104 "slot01,fxu_cell+lsu_cell") | |
105 | |
106 ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown | |
107 ;; 11/7, 11/8, 11/12 | |
108 (define_insn_reservation "cell-load-ext" 2 | |
109 (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux") | |
110 (eq_attr "cpu" "cell")) | |
111 "slot01,fxu_cell+lsu_cell") | |
112 | |
113 ;;lfs,lfsx,lfd,lfdx, 1 cycle | |
114 (define_insn_reservation "cell-fpload" 1 | |
115 (and (eq_attr "type" "fpload") | |
116 (eq_attr "cpu" "cell")) | |
117 "vsu2_cell+lsu_cell+slot01") | |
118 | |
119 ;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr); plain fpload is matched by cell-fpload | |
120 (define_insn_reservation "cell-fpload-update" 1 | |
121 (and (eq_attr "type" "fpload_u,fpload_ux") | |
122 (eq_attr "cpu" "cell")) | |
123 "fxu_cell+vsu2_cell+lsu_cell+slot01") | |
124 | |
125 (define_insn_reservation "cell-vecload" 2 | |
126 (and (eq_attr "type" "vecload") | |
127 (eq_attr "cpu" "cell")) | |
128 "slot01,vsu2_cell+lsu_cell") | |
129 | |
130 ;;st? stw(MC) | |
131 (define_insn_reservation "cell-store" 1 | |
132 (and (eq_attr "type" "store") | |
133 (eq_attr "cpu" "cell")) | |
134 "lsu_cell+slot01") | |
135 | |
136 ;;stdux, stdu, (hardware breaks into store and add) 2 for update reg | |
137 (define_insn_reservation "cell-store-update" 1 | |
138 (and (eq_attr "type" "store_ux,store_u") | |
139 (eq_attr "cpu" "cell")) | |
140 "fxu_cell+lsu_cell+slot01") | |
141 | |
142 (define_insn_reservation "cell-fpstore" 1 | |
143 (and (eq_attr "type" "fpstore") | |
144 (eq_attr "cpu" "cell")) | |
145 "vsu2_cell+lsu_cell+slot01") | |
146 | |
147 (define_insn_reservation "cell-fpstore-update" 1 | |
148 (and (eq_attr "type" "fpstore_ux,fpstore_u") | |
149 (eq_attr "cpu" "cell")) | |
150 "vsu2_cell+fxu_cell+lsu_cell+slot01") | |
151 | |
152 (define_insn_reservation "cell-vecstore" 1 | |
153 (and (eq_attr "type" "vecstore") | |
154 (eq_attr "cpu" "cell")) | |
155 "vsu2_cell+lsu_cell+slot01") | |
156 | |
157 ;; Integer latency is 2 cycles | |
158 (define_insn_reservation "cell-integer" 2 | |
159 (and (eq_attr "type" "integer,insert_dword,shift,trap,\ | |
160 var_shift_rotate,cntlz,exts") | |
161 (eq_attr "cpu" "cell")) | |
162 "slot01,fxu_cell") | |
163 | |
164 ;; Two integer latency is 4 cycles | |
165 (define_insn_reservation "cell-two" 4 | |
166 (and (eq_attr "type" "two") | |
167 (eq_attr "cpu" "cell")) | |
168 "slot01,fxu_cell,fxu_cell*2") | |
169 | |
170 ;; Three integer latency is 6 cycles | |
171 (define_insn_reservation "cell-three" 6 | |
172 (and (eq_attr "type" "three") | |
173 (eq_attr "cpu" "cell")) | |
174 "slot01,fxu_cell,fxu_cell*4") | |
175 | |
176 ;; rlwimi, alter cr0 | |
177 (define_insn_reservation "cell-insert" 2 | |
178 (and (eq_attr "type" "insert_word") | |
179 (eq_attr "cpu" "cell")) | |
180 "slot01,fxu_cell") | |
181 | |
182 ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0 | |
183 (define_insn_reservation "cell-cmp" 1 | |
184 (and (eq_attr "type" "cmp") | |
185 (eq_attr "cpu" "cell")) | |
186 "fxu_cell+slot01") | |
187 | |
188 ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm | |
189 (define_insn_reservation "cell-fast-cmp" 2 | |
190 (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\ | |
191 var_delayed_compare") | |
192 (eq_attr "cpu" "cell")) | |
193 (eq_attr "cell_micro" "not")) | |
194 "slot01,fxu_cell") | |
195 | |
196 (define_insn_reservation "cell-cmp-microcoded" 9 | |
197 (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\ | |
198 var_delayed_compare") | |
199 (eq_attr "cpu" "cell")) | |
200 (eq_attr "cell_micro" "always")) | |
201 "slot0+slot1,fxu_cell,fxu_cell*7") | |
202 | |
203 ;; mulld | |
204 (define_insn_reservation "cell-lmul" 15 | |
205 (and (eq_attr "type" "lmul") | |
206 (eq_attr "cpu" "cell")) | |
207 "slot1,nonpipeline,nonpipeline*13") | |
208 | |
209 ;; mulld. is microcoded | |
210 (define_insn_reservation "cell-lmul-cmp" 22 | |
211 (and (eq_attr "type" "lmul_compare") | |
212 (eq_attr "cpu" "cell")) | |
213 "slot0+slot1,nonpipeline,nonpipeline*20") | |
214 | |
215 ;; mulli, 6 cycles | |
216 (define_insn_reservation "cell-imul23" 6 | |
217 (and (eq_attr "type" "imul2,imul3") | |
218 (eq_attr "cpu" "cell")) | |
219 "slot1,nonpipeline,nonpipeline*4") | |
220 | |
221 ;; mullw, 9 | |
222 (define_insn_reservation "cell-imul" 9 | |
223 (and (eq_attr "type" "imul") | |
224 (eq_attr "cpu" "cell")) | |
225 "slot1,nonpipeline,nonpipeline*7") | |
226 | |
227 ;; divide | |
228 (define_insn_reservation "cell-idiv" 32 | |
229 (and (eq_attr "type" "idiv") | |
230 (eq_attr "cpu" "cell")) | |
231 "slot1,nonpipeline,nonpipeline*30") | |
232 | |
233 (define_insn_reservation "cell-ldiv" 64 | |
234 (and (eq_attr "type" "ldiv") | |
235 (eq_attr "cpu" "cell")) | |
236 "slot1,nonpipeline,nonpipeline*62") | |
237 | |
238 ;;mflr and mfctr are pipelined | |
239 (define_insn_reservation "cell-mfjmpr" 1 | |
240 (and (eq_attr "type" "mfjmpr") | |
241 (eq_attr "cpu" "cell")) | |
242 "slot01+bru_cell") | |
243 | |
244 ;;mtlr and mtctr, | |
245 ;;mtspr fully pipelined | |
246 (define_insn_reservation "cell-mtjmpr" 1 | |
247 (and (eq_attr "type" "mtjmpr") | |
248 (eq_attr "cpu" "cell")) | |
249 "bru_cell+slot01") | |
250 | |
251 ;; Branches | |
252 ;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency | |
253 ;; bcctr, bcctrl, latency 2, actually adjust by be to 4 | |
254 (define_insn_reservation "cell-branch" 1 | |
255 (and (eq_attr "type" "branch") | |
256 (eq_attr "cpu" "cell")) | |
257 "bru_cell+slot1") | |
258 | |
259 (define_insn_reservation "cell-branchreg" 1 | |
260 (and (eq_attr "type" "jmpreg") | |
261 (eq_attr "cpu" "cell")) | |
262 "bru_cell+slot1") | |
263 | |
264 ;; cr hazard | |
265 ;; page 90, special cases for CR hazard, only one instr can access cr per cycle | |
266 ;; if insn reads CR following a stwcx, pipeline stall till stwcx finish | |
267 (define_insn_reservation "cell-crlogical" 1 | |
268 (and (eq_attr "type" "cr_logical,delayed_cr") | |
269 (eq_attr "cpu" "cell")) | |
270 "bru_cell+slot01") | |
271 | |
272 ;; mfcrf and mfcr is about 34 cycles and nonpipelined | |
273 (define_insn_reservation "cell-mfcr" 34 | |
274 (and (eq_attr "type" "mfcrf,mfcr") | |
275 (eq_attr "cpu" "cell")) | |
276 "slot1,nonpipeline,nonpipeline*32") | |
277 | |
278 ;; mtcrf (1 field) | |
279 (define_insn_reservation "cell-mtcrf" 1 | |
280 (and (eq_attr "type" "mtcr") | |
281 (eq_attr "cpu" "cell")) | |
282 "fxu_cell+slot01") | |
283 | |
284 ; Basic FP latency is 10 cycles, throughput is 1/cycle | |
285 (define_insn_reservation "cell-fp" 10 | |
286 (and (eq_attr "type" "fp,dmul") | |
287 (eq_attr "cpu" "cell")) | |
288 "slot01,vsu1_cell,vsu1_cell*8") | |
289 | |
290 (define_insn_reservation "cell-fpcompare" 1 | |
291 (and (eq_attr "type" "fpcompare") | |
292 (eq_attr "cpu" "cell")) | |
293 "vsu1_cell+slot01") | |
294 | |
295 ;; sdiv throughput 1/74, not pipelined but only in the FPU | |
296 (define_insn_reservation "cell-sdiv" 74 | |
297 (and (eq_attr "type" "sdiv,ddiv") | |
298 (eq_attr "cpu" "cell")) | |
299 "slot1,nonpipeline,nonpipeline*72") | |
300 | |
301 ;; fsqrt throughput 1/84, not pipelined but only in the FPU | |
302 (define_insn_reservation "cell-sqrt" 84 | |
303 (and (eq_attr "type" "ssqrt,dsqrt") | |
304 (eq_attr "cpu" "cell")) | |
305 "slot1,nonpipeline,nonpipeline*82") | |
306 | |
307 ; VMX | |
308 (define_insn_reservation "cell-vecsimple" 4 | |
309 (and (eq_attr "type" "vecsimple") | |
310 (eq_attr "cpu" "cell")) | |
311 "slot01,vsu1_cell,vsu1_cell*2") | |
312 | |
313 ;; mult, div, madd | |
314 (define_insn_reservation "cell-veccomplex" 10 | |
315 (and (eq_attr "type" "veccomplex") | |
316 (eq_attr "cpu" "cell")) | |
317 "slot01,vsu1_cell,vsu1_cell*8") | |
318 | |
319 ;; TODO: add support for recording instructions | |
320 (define_insn_reservation "cell-veccmp" 4 | |
321 (and (eq_attr "type" "veccmp") | |
322 (eq_attr "cpu" "cell")) | |
323 "slot01,vsu1_cell,vsu1_cell*2") | |
324 | |
325 (define_insn_reservation "cell-vecfloat" 12 | |
326 (and (eq_attr "type" "vecfloat") | |
327 (eq_attr "cpu" "cell")) | |
328 "slot01,vsu1_cell,vsu1_cell*10") | |
329 | |
330 (define_insn_reservation "cell-vecperm" 4 | |
331 (and (eq_attr "type" "vecperm") | |
332 (eq_attr "cpu" "cell")) | |
333 "slot01,vsu2_cell,vsu2_cell*2") | |
334 | |
335 ;; New for 4.2, syncs | |
336 | |
337 (define_insn_reservation "cell-sync" 11 | |
338 (and (eq_attr "type" "sync") | |
339 (eq_attr "cpu" "cell")) | |
340 "slot01,lsu_cell,lsu_cell*9") | |
341 | |
342 (define_insn_reservation "cell-isync" 11 | |
343 (and (eq_attr "type" "isync") | |
344 (eq_attr "cpu" "cell")) | |
345 "slot01,lsu_cell,lsu_cell*9") | |
346 | |
347 (define_insn_reservation "cell-load_l" 11 | |
348 (and (eq_attr "type" "load_l") | |
349 (eq_attr "cpu" "cell")) | |
350 "slot01,lsu_cell,lsu_cell*9") | |
351 | |
352 (define_insn_reservation "cell-store_c" 11 | |
353 (and (eq_attr "type" "store_c") | |
354 (eq_attr "cpu" "cell")) | |
355 "slot01,lsu_cell,lsu_cell*9") | |
356 | |
357 ;; RAW register dependency | |
358 | |
359 ;; addi r3, r3, 1 | |
360 ;; lw r4,offset(r3) | |
361 ;; there is a 5 cycle delay for r3 bypassing | |
362 ;; there is a 5 cycle delay for a dependent load after a load | |
363 (define_bypass 5 "cell-integer" "cell-load") | |
364 (define_bypass 5 "cell-integer" "cell-load-ext") | |
365 (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext") | |
366 | |
367 ;; there is a 6 cycle delay after a fp compare until you can use the cr. | |
368 (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical") | |
369 | |
370 ;; VXU float RAW | |
371 (define_bypass 11 "cell-vecfloat" "cell-vecfloat") | |
372 | |
373 ;; VXU and FPU | |
374 (define_bypass 6 "cell-veccomplex" "cell-vecsimple") | |
375 ;;(define_bypass 6 "cell-veccmp" "cell-branch,cell-branchreg") | |
376 (define_bypass 3 "cell-vecfloat" "cell-veccomplex") | |
377 ; this is not correct, | |
378 ;; this is a stall in general and not dependent on result | |
379 (define_bypass 13 "cell-vecstore" "cell-fpstore") | |
380 ; this is not correct, this can never be true, not dependent on result | |
381 (define_bypass 7 "cell-fp" "cell-fpload") | |
382 ;; vsu1 should avoid writing to the same target register as vsu2 insn | |
383 ;; within 12 cycles. | |
384 | |
385 ;; WAW hazard | |
386 | |
387 ;; the target of VSU estimate should not be reused within 10 dispatch groups | |
388 ;; the target of VSU float should not be reused within 8 dispatch groups | |
389 ;; the target of VSU complex should not be reused within 5 dispatch groups | |
390 ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups | |
391 | |
392 ;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at | |
393 ;; ex4 stage(10 cycles) | |
394 (define_bypass 10 "cell-mtjmpr" "cell-branchreg") | |
395 | |
396 ;;Things that are not simulated: | |
397 ;; update instruction, update address gpr are not simulated | |
398 ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float | |
399 ;; insns | |
400 |