File: transpose4x4-opt.py

package info (click to toggle)
python-peachpy 0.0~git20211013.257881e-1.1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 2,452 kB
  • sloc: python: 29,286; ansic: 54; makefile: 44; cpp: 31
file content (58 lines) | stat: -rw-r--r-- 1,847 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# This file is part of PeachPy package and is licensed under the Simplified BSD license.
#    See license.rst for the full text of the license.

from peachpy.x86_64 import *
from peachpy import *

# NOTE(review): PeachPy appears to infer the argument name ("matrix") from
# this assignment when no explicit name is passed -- keep the variable name
# and the single-line form; confirm against peachpy.Argument.
matrix = Argument(ptr(float_))

with Function("transpose4x4_opt", (matrix,)):
    # In-place transpose of a 4x4 float matrix: four 16-byte rows are read
    # from `matrix`, shuffled in XMM registers, and written back to the same
    # four slots. MOVUPS is used for every memory access, so no particular
    # alignment of the pointer is required by the instructions themselves.
    base = GeneralPurposeRegister64()
    LOAD.ARGUMENT(base, matrix)

    row_bytes = XMMRegister.size

    # One XMM register per input row: r<i> = ( m<i>0, m<i>1, m<i>2, m<i>3 )
    r0, r1, r2, r3 = (XMMRegister() for _ in range(4))
    for index, row in enumerate((r0, r1, r2, r3)):
        MOVUPS(row, [base + index * row_bytes])

    # Stage 1: interleave row pairs (0, 1) and (2, 3).
    # The unpack instructions overwrite their first operand, so rows 0 and 2
    # are duplicated first; the duplicates receive the high-half interleave
    # while the originals receive the low-half interleave.
    hi01, hi23 = (XMMRegister() for _ in range(2))
    MOVAPS(hi01, r0)    # hi01 = ( m00, m01, m02, m03 )
    MOVAPS(hi23, r2)    # hi23 = ( m20, m21, m22, m23 )

    UNPCKLPS(r0, r1)    # r0   = ( m00, m10, m01, m11 )
    UNPCKLPS(r2, r3)    # r2   = ( m20, m30, m21, m31 )

    UNPCKHPS(hi01, r1)  # hi01 = ( m02, m12, m03, m13 )
    UNPCKHPS(hi23, r3)  # hi23 = ( m22, m32, m23, m33 )

    # Stage 2: combine 64-bit halves into the transposed rows.
    # MOVLHPS below destroys the upper halves of r0 and hi01, which the
    # following MOVHLPS steps still need, so snapshot both beforehand.
    keep01, keep23 = (XMMRegister() for _ in range(2))
    MOVAPS(keep01, r0)    # keep01 = ( m00, m10, m01, m11 )
    MOVAPS(keep23, hi01)  # keep23 = ( m02, m12, m03, m13 )

    # (shuffle, destination, source); after the shuffle the destination holds
    # output row `index`, which is stored immediately.
    steps = (
        (MOVLHPS, r0, r2),         # ( m00, m10, m20, m30 )
        (MOVHLPS, r2, keep01),     # ( m01, m11, m21, m31 )
        (MOVLHPS, hi01, hi23),     # ( m02, m12, m22, m32 )
        (MOVHLPS, hi23, keep23),   # ( m03, m13, m23, m33 )
    )
    for index, (shuffle, dst, src) in enumerate(steps):
        shuffle(dst, src)
        MOVUPS([base + index * row_bytes], dst)

    RETURN()