1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
|
C Initial Fortran 77 version of the Lattice QCD benchmark.
C
C For every site k = 1..V the 3x3 complex matrix M(*,*,k) is
C multiplied into each of the two 3-element columns src(*,1,k)
C and src(*,2,k); the products are stored in res(*,1,k) and
C res(*,2,k).  The whole sweep over the sites is repeated iters
C times.  M and src are only read, so each repetition stores the
C same values.  Nothing is written when V or iters is below 1.
C
C Arguments:
C   M      complex*16 (3,3,V)  matrices, read only
C   res    complex*16 (3,2,V)  products, written
C   src    complex*16 (3,2,V)  source columns, read only
C   V      integer             number of sites
C   iters  integer             number of repetitions of the sweep
      subroutine qcdf(M, res, src, V, iters)
      integer V, iters
      complex*16 M(3,3,V), res(3,2,V), src(3,2,V)
      integer rep, k, js, ic
      complex*16 x1, x2, x3
      do rep = 1, iters
        do k = 1, V
          do js = 1, 2
C           The source column is the same for all three rows of
C           the matrix, so fetch it once per (spin, site) pair.
            x1 = src(1,js,k)
            x2 = src(2,js,k)
            x3 = src(3,js,k)
C           Innermost loop runs over the first (fastest) index.
            do ic = 1, 3
              res(ic,js,k) = M(ic,1,k) * x1
     &                     + M(ic,2,k) * x2
     &                     + M(ic,3,k) * x3
            end do
          end do
        end do
      end do
      return
      end
C Hand-tuned version of the Lattice QCD benchmark kernel.
C
C Author's notes on the changes made while tuning:
C   o Ordering of the arrays altered to improve the layout of the
C     data in memory.
C   o The col and spin loops are unwound; it was found that
C     unwinding the col loop inside the spin loop was marginally
C     faster (by 1.1%).
C   o Unwinding both loops was faster than unwinding just one.
C
C For every site k = 1..V the 3x3 complex matrix M(*,*,k) is
C multiplied into the two 3-element columns src(*,1,k) and
C src(*,2,k); the products are stored in res(*,1,k) and
C res(*,2,k).  The sweep over the sites is repeated iters times.
C Nothing is written when V or iters is below 1.
C
C Arguments:
C   M      complex*16 (3,3,V)  matrices, read only
C   res    complex*16 (3,2,V)  products, written
C   src    complex*16 (3,2,V)  source columns, read only
C   V      integer             number of sites
C   iters  integer             number of repetitions of the sweep
      subroutine qcdf2(M, res, src, V, iters)
      integer V, iters
      complex*16 M(3,3,V), res(3,2,V), src(3,2,V)
      integer rep, k
      complex*16 a1, a2, a3, b1, b2, b3
      do rep = 1, iters
        do k = 1, V
C         Source column for spin=1 at this site.
          a1 = src(1,1,k)
          a2 = src(2,1,k)
          a3 = src(3,1,k)
C         spin=1, col=1,2,3
          res(1,1,k) = M(1,1,k)*a1 + M(1,2,k)*a2 + M(1,3,k)*a3
          res(2,1,k) = M(2,1,k)*a1 + M(2,2,k)*a2 + M(2,3,k)*a3
          res(3,1,k) = M(3,1,k)*a1 + M(3,2,k)*a2 + M(3,3,k)*a3
C         Source column for spin=2 at this site.
          b1 = src(1,2,k)
          b2 = src(2,2,k)
          b3 = src(3,2,k)
C         spin=2, col=1,2,3
          res(1,2,k) = M(1,1,k)*b1 + M(1,2,k)*b2 + M(1,3,k)*b3
          res(2,2,k) = M(2,1,k)*b1 + M(2,2,k)*b2 + M(2,3,k)*b3
          res(3,2,k) = M(3,1,k)*b1 + M(3,2,k)*b2 + M(3,3,k)*b3
        end do
      end do
      return
      end
|