File: x86Shuffle.sml

(* NOTE on xchg on the x86
 *
 * From Allen Leung:
 * Here's why I didn't use xchg: 
 * 
 * o  According to the optimization guide, xchg mem, reg is complex and
 *    cannot be pipelined or paired at all; xchg reg, reg requires 3 uops.
 *    In contrast, mov mem, reg requires 1 or 2 uops.
 *    So xchg loses out, at least on paper.
 *    [I haven't done any measurements though]
 * 
 * o  Secondly, unlike on other architectures, parallel copies are split
 *    into individual copies during instruction selection.  Here's why
 *    I did this: I found that more copies are retained and more spills
 *    are generated when keeping the parallel copies.  My guess is that
 *    the copy temporaries for parallel copies create additional
 *    interferences [even when they are not needed].
 *    This is not a problem on RISC machines, because of plentiful registers.
 *   
 * o  Spilling of parallel copies is also a very complex business when
 *    memory coalescing is turned on.  I think I have implemented a solution
 *    to this, but not using parallel copies keeps life simple.  This problem
 *    could be simpler with xchg...but I haven't thought about it much.
 * 
 * From Fermin Reig:
 * On the java@gcc.gnu.org and GC mailing lists there has been a discussion
 * about the cost of xchg.  Here are some extracts from it:
 * 
 * ----------------
 * > From: Emery Berger [mailto:emery@cs.utexas.edu]
 * > 
 * > http://developer.intel.com/design/pentium4/manuals/24547203.pdf
 * > 
 * > See Chapter 7.1. "For the P6 family processors, locked operations
 * > serialize all outstanding load and store operations (that is, wait for
 * > them to complete). This rule is also true for the Pentium 4 processor,
 * > with one exception: load operations that reference weakly ordered memory
 * > types (such as the WC memory type) may not be serialized."
 * > 
 * -----------------
 * I just tried this on a 500 MHz Pentium III.  I get about 23 cycles for
 *
 *   lock; cmpxchg
 *
 * and about 19 or 20 cycles for xchg (which has an implicit lock prefix).
 *  
 * I got consistent results by timing a loop and by looking at an instruction
 * level profile.  Putting other stuff in the loop didn't seem to affect the
 * time taken by xchg much.  Here's the code in case someone else wants to try.
 * (This requires Linux/gcc)
 * -------------------
 * Chris Dodd pointed out on the GC mailing list that on recent Intel X86
 * processors:
 *  
 * - cmpxchg without a lock prefix is much faster (roughly 3x or close to 15
 * cycles by my measurements) than either xchg (implied lock prefix) or
 * lock; cmpxchg.
 *  
 * - cmpxchg without the lock prefix is atomic on uniprocessors, i.e. it's not
 * interruptible.
 *  
 * As far as I can tell, none of the GNU libraries currently take advantage of
 * this fact.  Should they?
 *  
 * This argues, for example, that I could get a noticeable additional speedup
 * from Java hash synchronization on X86 by overwriting a few strategic "lock"
 * prefixes with "nop"s when I notice that there's only one processor.
 *
 *
 * From John Reppy:
 *
 * Disregard what I said.  The xchg instruction has an implicit lock prefix,
 * so it is not useful for normal programming tasks.
 *)

functor X86Shuffle(I : X86INSTR) : X86SHUFFLE =
struct
  structure I = I
  structure C = I.C
  structure CB = CellsBasis
  structure Shuffle = Shuffle(I)

  type t = {tmp:I.operand option, dst:CellsBasis.cell list, src:CellsBasis.cell list}

  exception foo
  val shuffle =
    Shuffle.shuffle
        {mvInstr = fn {dst, src} => [I.move {mvOp=I.MOVL, src=src, dst=dst}],
         ea = I.Direct}
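
  (* Shuffle.shuffle (MLRISC's generic parallel-copy resolver) turns a
   * {tmp, dst, src} request into a sequence of the mvInstr moves realizing
   * dst[i] <- src[i]; presumably it uses the optional tmp operand to break
   * cycles, e.g. the swap src=[r1, r2], dst=[r2, r1], which cannot be
   * sequenced with plain moves alone.  With the MOVL instantiation above,
   * each copy therefore becomes one 32-bit move, plus moves through tmp
   * whenever a cycle has to be broken.
   *)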

  (*
   * These assume that the ''registers'' are mapped onto memory.
   *)

  (* Note: this only works with double-precision floating point. *)
  val shufflefpNormalAndSlow =
    Shuffle.shuffle
        {mvInstr = fn {dst, src} => [I.fldl src, I.fstpl dst],
         ea = I.FDirect}

  (* 
   * This version makes use of the x86 floating point stack for hardware
   * renaming! 
   *)
  fun shufflefpNormal {tmp, src, dst} = let
    val n = length src
  in
    if n <= 7 then let
        fun gen (s::ss, d::ds, pushes, pops) =
            if CB.sameColor (s, d) then gen (ss, ds, pushes, pops)
            else
              gen (ss, ds,
                   I.fldl (I.FDirect s)::pushes,
                   I.fstpl (I.FDirect d)::pops)
          | gen (_, _, pushes, pops) = List.revAppend (pushes, pops)
      in  gen (src, dst, [], [])
      end
    else shufflefpNormalAndSlow {tmp=tmp, src=src, dst=dst}
  end
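
  (* A small worked example of the fast path above (illustrative only;
   * f1 and f2 stand for two distinct, differently colored FP cells):
   *
   *   shufflefpNormal {tmp=NONE, src=[f1, f2], dst=[f2, f1]}
   *
   * yields
   *
   *   [fldl (FDirect f1), fldl (FDirect f2),
   *    fstpl (FDirect f1), fstpl (FDirect f2)]
   *
   * i.e. every source is pushed onto the x87 stack first and the
   * destinations are then popped in reverse order, so any permutation
   * (cycles included) is realized without an explicit temporary.  The
   * n <= 7 guard presumably keeps the pushes within the eight x87 stack
   * slots while leaving one slot free.
   *)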

  (*
   * These assume that the ''registers'' are mapped onto the pseudo
   * %fpr registers.  This only works with double-precision floating point
   * for now...
   *)
  val shufflefpFast =
    Shuffle.shuffle
        {mvInstr = fn {dst, src} => [I.fmove {fsize=I.FP64, src=src, dst=dst}],
         ea = I.FPR}

  fun shufflefp(x as {tmp=SOME(I.FPR _), ...}) = shufflefpFast x
    | shufflefp x = shufflefpNormal x
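
  (* The dispatch above keys on the shape of the temporary: a SOME(I.FPR _)
   * temporary (presumably supplied when the register allocator is working
   * with the pseudo %fpr bank) selects the single-fmove version; any other
   * temporary, or NONE, falls back to the memory-mapped x87 version above.
   *)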

end