1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
|
{-# LANGUAGE MagicHash #-}
-- | Evaluate an array by dividing it into rectangular blocks and filling
-- each block in parallel.
module Data.Array.Repa.Eval.Cursored
( fillBlock2P
, fillCursoredBlock2P
, fillCursoredBlock2S )
where
import Data.Array.Repa.Index
import Data.Array.Repa.Shape
import Data.Array.Repa.Eval.Elt
import Data.Array.Repa.Eval.Gang
import GHC.Base
-- Non-cursored interface -----------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
--
-- * Blockwise filling can be more cache-efficient than linear filling for
-- rank-2 arrays.
--
-- * Coordinates given are of the filled edges of the block.
--
-- * We divide the block into columns, and give one column to each thread.
--
-- * Each column is filled in row major order from top to bottom.
--
fillBlock2P
:: Elt a
=> (Int -> a -> IO ()) -- ^ Update function to write into result buffer.
-> (DIM2 -> a) -- ^ Function to evaluate the element at an index.
-> Int# -- ^ Width of the whole array.
-> Int# -- ^ x0 lower left corner of block to fill
-> Int# -- ^ y0
-> Int# -- ^ w0 width of block to fill.
-> Int# -- ^ h0 height of block to fill.
-> IO ()
{-# INLINE [0] fillBlock2P #-}
fillBlock2P write getElem !imageWidth !x0 !y0 !w0 h0
= fillCursoredBlock2P
write id addDim getElem
imageWidth x0 y0 w0 h0
{-
-- | Fill a block in a rank-2 array sequentially.
--
-- * Blockwise filling can be more cache-efficient than linear filling for
-- rank-2 arrays.
--
-- * Coordinates given are of the filled edges of the block.
--
-- * The block is filled in row major order from top to bottom.
--
fillBlock2S
:: Elt a
=> (Int -> a -> IO ()) -- ^ Update function to write into result buffer.
-> (DIM2 -> a) -- ^ Function to evaluate the element at an index.
-> Int# -- ^ Width of the whole array.
-> Int# -- ^ x0 lower left corner of block to fill
-> Int# -- ^ y0
-> Int# -- ^ w0 width of block to fill
-> Int# -- ^ h0 height of block to filll
-> IO ()
{-# INLINE [0] fillBlock2S #-}
fillBlock2S write getElem !imageWidth !x0 !y0 !w0 !h0
= fillCursoredBlock2S
write id addDim getElem
imageWidth x0 y0 w0 h0
-}
-- Block filling ----------------------------------------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
--
-- * Blockwise filling can be more cache-efficient than linear filling for rank-2 arrays.
--
-- * Using cursor functions can help to expose inter-element indexing computations to
-- the GHC and LLVM optimisers.
--
-- * Coordinates given are of the filled edges of the block.
--
-- * We divide the block into columns, and give one column to each thread.
--
-- * Each column is filled in row major order from top to bottom.
--
fillCursoredBlock2P
:: Elt a
=> (Int -> a -> IO ()) -- ^ Update function to write into result buffer.
-> (DIM2 -> cursor) -- ^ Make a cursor to a particular element.
-> (DIM2 -> cursor -> cursor) -- ^ Shift the cursor by an offset.
-> (cursor -> a) -- ^ Function to evaluate the element at an index.
-> Int# -- ^ Width of the whole array.
-> Int# -- ^ x0 lower left corner of block to fill
-> Int# -- ^ y0
-> Int# -- ^ w0 width of block to fill
-> Int# -- ^ h0 height of block to fill
-> IO ()
{-# INLINE [0] fillCursoredBlock2P #-}
fillCursoredBlock2P
write
makeCursorFCB shiftCursorFCB getElemFCB
!imageWidth !x0 !y0 !w0 !h0
= gangIO theGang fillBlock
where
!(I# threads) = gangSize theGang
-- All columns have at least this many pixels.
!colChunkLen = w0 `quotInt#` threads
-- Extra pixels that we have to divide between some of the threads.
!colChunkSlack = w0 `remInt#` threads
-- Get the starting pixel of a column in the image.
{-# INLINE colIx #-}
colIx !ix
| 1# <- ix <# colChunkSlack = x0 +# (ix *# (colChunkLen +# 1#))
| otherwise = x0 +# (ix *# colChunkLen) +# colChunkSlack
-- Give one column to each thread
{-# INLINE fillBlock #-}
fillBlock :: Int -> IO ()
fillBlock !(I# ix)
= let !x0' = colIx ix
!w0' = colIx (ix +# 1#) -# x0'
!y0' = y0
!h0' = h0
in fillCursoredBlock2S
write
makeCursorFCB shiftCursorFCB getElemFCB
imageWidth x0' y0' w0' h0'
-- | Fill a block in a rank-2 array, sequentially.
--
-- * Blockwise filling can be more cache-efficient than linear filling for rank-2 arrays.
--
-- * Using cursor functions can help to expose inter-element indexing computations to
-- the GHC and LLVM optimisers.
--
-- * Coordinates given are of the filled edges of the block.
--
-- * The block is filled in row major order from top to bottom.
--
fillCursoredBlock2S
:: Elt a
=> (Int -> a -> IO ()) -- ^ Update function to write into result buffer.
-> (DIM2 -> cursor) -- ^ Make a cursor to a particular element.
-> (DIM2 -> cursor -> cursor) -- ^ Shift the cursor by an offset.
-> (cursor -> a) -- ^ Function to evaluate an element at the given index.
-> Int# -- ^ Width of the whole array.
-> Int# -- ^ x0 lower left corner of block to fill.
-> Int# -- ^ y0
-> Int# -- ^ w0 width of block to fill
-> Int# -- ^ h0 height of block to fill
-> IO ()
{-# INLINE [0] fillCursoredBlock2S #-}
fillCursoredBlock2S
write
makeCursor shiftCursor getElem
!imageWidth !x0 !y0 !w0 h0
= do fillBlock y0
where !x1 = x0 +# w0
!y1 = y0 +# h0
{-# INLINE fillBlock #-}
fillBlock !y
| 1# <- y >=# y1 = return ()
| otherwise
= do fillLine4 x0
fillBlock (y +# 1#)
where {-# INLINE fillLine4 #-}
fillLine4 !x
| 1# <- x +# 4# >=# x1 = fillLine1 x
| otherwise
= do -- Compute each source cursor based on the previous one so that
-- the variable live ranges in the generated code are shorter.
let srcCur0 = makeCursor (Z :. (I# y) :. (I# x))
let srcCur1 = shiftCursor (Z :. 0 :. 1) srcCur0
let srcCur2 = shiftCursor (Z :. 0 :. 1) srcCur1
let srcCur3 = shiftCursor (Z :. 0 :. 1) srcCur2
-- Get the result value for each cursor.
let val0 = getElem srcCur0
let val1 = getElem srcCur1
let val2 = getElem srcCur2
let val3 = getElem srcCur3
-- Ensure that we've computed each of the result values before we
-- write into the array. If the backend code generator can't tell
-- our destination array doesn't alias with the source then writing
-- to it can prevent the sharing of intermediate computations.
touch val0
touch val1
touch val2
touch val3
-- Compute cursor into destination array.
let !dstCur0 = x +# (y *# imageWidth)
write (I# dstCur0) val0
write (I# (dstCur0 +# 1#)) val1
write (I# (dstCur0 +# 2#)) val2
write (I# (dstCur0 +# 3#)) val3
fillLine4 (x +# 4#)
{-# INLINE fillLine1 #-}
fillLine1 !x
| 1# <- x >=# x1 = return ()
| otherwise
= do let val0 = (getElem $ makeCursor (Z :. (I# y) :. (I# x)))
write (I# (x +# (y *# imageWidth))) val0
fillLine1 (x +# 1#)
|