1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
|
;; ARM 1136J[F]-S Pipeline Description
;; Copyright (C) 2003 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING. If not, write to the Free
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
;; 02110-1301, USA.
;; These descriptions are based on the information contained in the
;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;
;; This automaton provides a pipeline description for the ARM
;; 1136J-S and 1136JF-S cores.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.
;; The automaton to which the CPU units declared below are assigned.
(define_automaton "arm1136jfs")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; There are three distinct execution pipelines (page 1-26 and following),
;; all fed by a common decode pipeline:
;;
;; - A 4-stage decode pipeline, shared by all three. It has fetch (1),
;; fetch (2), decode, and issue stages. Since this is always involved,
;; we do not model it in the scheduler.
;;
;; - A 4-stage ALU pipeline. It has shifter, ALU (main integer operations),
;; and saturation stages. The fourth stage is writeback; see below.
;;
;; - A 4-stage multiply-accumulate pipeline. It has three stages, called
;; MAC1 through MAC3, and a fourth writeback stage.
;;
;; The 4th-stage writeback is shared between the ALU and MAC pipelines,
;; which operate in lockstep. Results from either pipeline will be
;; moved into the writeback stage. Because the two pipelines operate
;; in lockstep, we schedule them as a single "execute" pipeline.
;;
;; - A 4-stage LSU pipeline. It has address generation, data cache (1),
;; data cache (2), and writeback stages. (Note that this pipeline,
;; including the writeback stage, is independent from the ALU & MAC pipes.)
;; Execute pipeline, shared by the ALU and the MAC:
;;   e_1  - Sh / Mac1
;;   e_2  - ALU / Mac2
;;   e_3  - SAT / Mac3
;;   e_wb - writeback
(define_cpu_unit "e_1,e_2,e_3,e_wb"
                 "arm1136jfs")

;; Load/store pipeline. The unit names presumably follow the LSU stages
;; described above: address generation (l_a), data cache 1 and 2
;; (l_dc1, l_dc2) and writeback (l_wb).
(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb"
                 "arm1136jfs")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions require eight cycles to execute in total: the four
;; shared fetch/decode/issue stages followed by the four stages of the
;; ALU pipeline. Only the ALU pipeline stages are reserved here. The
;; results are available after the ALU stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles. That case is not modelled here.
;; ALU operation without a shifted operand.
(define_insn_reservation "11_alu_op" 2
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "alu"))
  "e_1,e_2,e_3,e_wb")

;; ALU operation with an operand shifted by a constant amount.
(define_insn_reservation "11_alu_shift_op" 2
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "alu_shift"))
  "e_1,e_2,e_3,e_wb")

;; ALU operation whose shift amount comes from a register. The hardware
;; really stalls in the decoder so that it can read the shift value in a
;; second cycle; because the decoder is not modelled, the stall is
;; approximated by holding the shift stage (e_1) for two cycles.
(define_insn_reservation "11_alu_shift_reg_op" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "alu_shift_reg"))
  "e_1*2,e_2,e_3,e_wb")
;; A consumer of an ALU result can start sooner when there is no
;; shifter dependency on that result. Where a third operand is given,
;; it names the guard that must hold for the bypass to apply.

;; Producers: plain and shift-by-constant ALU operations (latency 1).
(define_bypass 1 "11_alu_op,11_alu_shift_op"
                 "11_alu_op")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")

;; Producer: shift-by-register ALU operations (latency 2).
(define_bypass 2 "11_alu_shift_reg_op"
                 "11_alu_op")
(define_bypass 2 "11_alu_shift_reg_op"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_alu_shift_reg_op"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")

;; The same producers feeding any of the multiply reservations.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
(define_bypass 2 "11_alu_shift_reg_op"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication instructions loop in the first two execute stages until
;; the instruction has been passed through the multiplier array enough
;; times.
;; MUL and MLA: the result is available after four stages; the first
;; execute stage (e_1) is held for two cycles.
(define_insn_reservation "11_mult1" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "mul,mla"))
  "e_1*2,e_2,e_3,e_wb")

;; Flag-setting variants (MULS, MLAS). Setting the condition flags is
;; stated to require three more cycles.
;; NOTE(review): the latency and reservation below are identical to
;; 11_mult1, so those extra cycles are not represented -- confirm intent.
(define_insn_reservation "11_mult2" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "muls,mlas"))
  "e_1*2,e_2,e_3,e_wb")

;; Bypasses: the consumers listed here see the product after three
;; cycles instead of four, subject to the guard where one is given.
(define_bypass 3 "11_mult1,11_mult2"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
(define_bypass 3 "11_mult1,11_mult2"
                 "11_alu_op")
(define_bypass 3 "11_mult1,11_mult2"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult1,11_mult2"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult1,11_mult2"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
;; Signed and unsigned multiply long, and the two long multiply-accumulate
;; instructions. The result arrives over two cycles, the less significant
;; word one cycle ahead of the more significant word. The model is
;; conservative and waits for both words, i.e. until after three
;; iterations and the memory cycle.
(define_insn_reservation "11_mult3" 5
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smull,umull,smlal,umlal"))
  "e_1*3,e_2,e_3,e_wb*2")

;; Flag-setting variants. Setting the condition flags is stated to
;; require three more cycles.
;; NOTE(review): the latency and reservation below are identical to
;; 11_mult3, so those extra cycles are not represented -- confirm intent.
(define_insn_reservation "11_mult4" 5
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smulls,umulls,smlals,umlals"))
  "e_1*3,e_2,e_3,e_wb*2")

;; Bypasses: the consumers listed here see the result after four
;; cycles instead of five, subject to the guard where one is given.
(define_bypass 4 "11_mult3,11_mult4"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
(define_bypass 4 "11_mult3,11_mult4"
                 "11_alu_op")
(define_bypass 4 "11_mult3,11_mult4"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 4 "11_mult3,11_mult4"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")
(define_bypass 4 "11_mult3,11_mult4"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
;; The assorted 16x16->32 multiplies and multiply-accumulates, which
;; combine high and low halves of the argument registers. A single pass
;; through the pipeline is enough, and the result is available after
;; three cycles.
(define_insn_reservation "11_mult5" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
  "e_1,e_2,e_3,e_wb")

;; Bypasses: the consumers listed here see the result after two
;; cycles instead of three, subject to the guard where one is given.
(define_bypass 2 "11_mult5"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
(define_bypass 2 "11_mult5"
                 "11_alu_op")
(define_bypass 2 "11_mult5"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_mult5"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_mult5"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
;; As for the 16x16->32 multiplies, except that the 32-bit result is
;; then added to a 64-bit quantity.
(define_insn_reservation "11_mult6" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smlalxy"))
  "e_1*2,e_2,e_3,e_wb*2")

;; Signed 32x32 multiply from which the most significant 32 bits are
;; extracted; they are available after the memory stage.
(define_insn_reservation "11_mult7" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smmul,smmulr"))
  "e_1*2,e_2,e_3,e_wb")

;; Bypasses: the consumers listed here see the result after three
;; cycles instead of four, subject to the guard where one is given.
(define_bypass 3 "11_mult6,11_mult7"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
(define_bypass 3 "11_mult6,11_mult7"
                 "11_alu_op")
(define_bypass 3 "11_mult6,11_mult7"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult6,11_mult7"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult6,11_mult7"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; These vary greatly depending on their arguments and the results of
;; branch prediction. Cycle count ranges from zero (unconditional branch,
;; folded dynamic prediction) to seven (incorrect predictions, etc). We
;; assume an optimal case for now, because the cost of a cache miss
;; overwhelms the cost of everything else anyhow.
;; Branches: zero latency and no unit reserved.
(define_insn_reservation "11_branches" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "branch"))
  "nothing")

;; Call latency cannot be predicted. The value 32 is a semi-arbitrary,
;; very large number standing in for "positive infinity", so that
;; everything should have finished by the time of the return.
(define_insn_reservation "11_call" 32
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "call"))
  "nothing")
;; Branches are predicted, and a correctly predicted branch costs
;; nothing. The model is nevertheless conservative and uses the timings
;; that a late-register would give.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
                 "11_branches")
(define_bypass 2 "11_alu_shift_reg_op"
                 "11_branches")
(define_bypass 2 "11_load1,11_load2"
                 "11_branches")
(define_bypass 3 "11_load34"
                 "11_branches")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The models for load/store instructions do not accurately describe
;; the differences between operations with and without a base register
;; writeback. These models assume that all memory references hit in
;; dcache. Also, if the PC is one of the registers involved, there are
;; additional stalls not modelled here. Addressing modes are also not
;; modelled.
;; Loads of type "load1". l_a and e_1 are reserved in the same cycle,
;; followed by l_dc1, l_dc2 and l_wb in turn.
(define_insn_reservation "11_load1" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load1"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Byte loads. The result only becomes available in the writeback
;; stage, where the correct byte is extracted, hence the latency of 4
;; rather than 3.
(define_insn_reservation "11_loadb" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load_byte"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Stores of type "store1": zero latency, same unit usage as the loads
;; above.
(define_insn_reservation "11_store1" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "store1"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")
;; Double-word loads and stores to or from adjacent registers. Timing
;; and latency depend on whether the address is 64-bit aligned; the
;; model assumes that it is.
(define_insn_reservation "11_load2" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load2"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

(define_insn_reservation "11_store2" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "store2"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")
;; Load/store multiple. Two registers are stored per cycle. The real
;; timing depends on how many registers are transferred, so a low
;; latency is scheduled optimistically. l_dc1 is held for two cycles.
(define_insn_reservation "11_load34" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load3,load4"))
  "l_a+e_1,l_dc1*2,l_dc2,l_wb")

(define_insn_reservation "11_store34" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "store3,store4"))
  "l_a+e_1,l_dc1*2,l_dc2,l_wb")
;; When an ALU operation does not supply part of the address being
;; accessed, a store may start immediately after it.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
(define_bypass 2 "11_alu_shift_reg_op"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
;; When an ALU operation has no early register dependency on a
;; preceding load, it may start sooner after that load.

;; Producer: 11_load1 (latency 2 instead of 3).
(define_bypass 2 "11_load1"
                 "11_alu_op")
(define_bypass 2 "11_load1"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_load1"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")

;; Producer: 11_loadb (latency 3 instead of 4).
(define_bypass 3 "11_loadb"
                 "11_alu_op")
(define_bypass 3 "11_loadb"
                 "11_alu_shift_op"
                 "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_loadb"
                 "11_alu_shift_reg_op"
                 "arm_no_early_alu_shift_dep")
;; When a multiply has no early multiply dependency on a preceding
;; load, it may start sooner after that load.
(define_bypass 2 "11_load1"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
(define_bypass 3 "11_load34"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
(define_bypass 3 "11_loadb"
                 "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
                 "arm_no_early_mul_dep")
;; When a load does not produce part of the address being accessed,
;; a store may start sooner after that load.
(define_bypass 2 "11_load1"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
(define_bypass 3 "11_loadb"
                 "11_store1"
                 "arm_no_early_store_addr_dep")
|