1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
import sys, pathlib, time
sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
from m1n1.setup import *
from m1n1 import asm
# Iteration count handed to the timing loop on every benchmark run.
LOOPS = 10000000

# Value of CNTFRQ_EL0; bench_cpu() divides counter deltas by it.
freq = u.mrs(CNTFRQ_EL0)

# Register base address of each CPU cluster.  Index 0 is reported as
# "E-Core" and index 1 as "P-Core" by the status prints further down.
CREG = [0x210e00000, 0x211e00000]

# Offset of the pstate register within a cluster's register block.
CLUSTER_PSTATE = 0x20020

# e-core pstates
# 600 972 1332 1704 2064
# p-core pstates
# 600 828 1056 1284 1500 1728 1956 2184 2388 2592 2772 2988 3096 3144 3204

# Timing routine: x0 carries the iteration count in and the CNTPCT_EL0 delta
# (counter ticks spent in the countdown loop) out.
BENCH_SRC = """
bench:
mrs x1, CNTPCT_EL0
1:
sub x0, x0, #1
cbnz x0, 1b
mrs x2, CNTPCT_EL0
sub x0, x2, x1
ret
"""

# Assemble the routine at a freshly allocated target buffer and upload it.
code = u.malloc(0x1000)
util = asm.ARMAsm(BENCH_SRC, code)
iface.writemem(code, util.data)

# NOTE(review): dc_cvau/ic_ivau presumably mirror the DC CVAU / IC IVAU cache
# maintenance instructions, making the uploaded code visible to instruction
# fetch -- confirm against the proxy implementation.
code_len = len(util.data)
p.dc_cvau(code, code_len)
p.ic_ivau(code, code_len)
def bench_cpu(idx):
    """Run the ``bench`` timing loop on CPU ``idx`` and return its rate.

    CPU 0 runs the routine through ``p.call``; every other CPU goes through
    ``p.smp_call_sync``.  The routine's return value (a CNTPCT_EL0 delta) is
    divided by ``freq`` to get seconds.

    Returns:
        Loop iterations per microsecond, which callers print as "MHz", or
        the integer 0 if no time elapsed.
        NOTE(review): this only equals the core clock if the loop retires
        one iteration per cycle -- confirm for the target core.
    """
    if idx == 0:
        ticks = p.call(util.bench, LOOPS)
    else:
        ticks = p.smp_call_sync(idx, util.bench, LOOPS)

    seconds = ticks / freq
    if not seconds:
        # Avoid dividing by zero when the counter did not advance.
        return 0

    return LOOPS / seconds / 1000000
print()

# Dump the raw pstate register of both clusters before changing anything.
e_pstate, p_pstate = [p.read64(CREG[i] + CLUSTER_PSTATE) for i in (0, 1)]
print(f"E-Core pstate: {e_pstate:x}")
print(f"P-Core pstate: {p_pstate:x}")

# Disabled early-initialisation experiments, kept for reference.
#for cluster in range(2):
#print(f"Initializing cluster {cluster} (early)")
#p.write64(CREG[cluster] + 0x20660, 0x1000000015)
#p.write64(CREG[cluster] + 0x48000, 0)
#p.write64(CREG[cluster] + 0x48080, 0xa000000000000000)
#p.clear64(CREG[cluster] + CLUSTER_PSTATE, 1<<22)
#p.set32(PMGR + 0x48000, 1)
#p.set32(PMGR + 0x48c00, 1)
#p.set32(PMGR + 0x48800, 1)
#p.set32(PMGR + 0x48400, 1)

# Register offsets.  Only CLUSTER_DVMR is used by the live code in this
# section; the others are referenced solely from the disabled lines below.
CLUSTER_DVMR = 0x206b8
CLUSTER_LIMIT2 = 0x40240
CLUSTER_LIMIT3 = 0x40250
CLUSTER_LIMIT1 = 0x48400
PMGR_CPUGATING = 0x1c080
CLUSTER_CTRL = 0x440f8
CLUSTER_PSCTRL = 0x200f8

for cluster in range(2):
    print(f"Initializing cluster {cluster}")

    dvmr_addr = CREG[cluster] + CLUSTER_DVMR

    # Bit 63 is wanted on both clusters; cluster 1 additionally wants
    # bits 32 and 31.
    ena = (1 << 63) | (((1 << 32) | (1 << 31)) if cluster == 1 else 0)
    val = p.read64(dvmr_addr)

    # Only touch the register when some wanted bit is still clear.
    if (val & ena) != ena:
        print(f"DVMR: {val:#x} -> {val|ena:#x}")
        p.set64(dvmr_addr, ena)

    # Disabled per-cluster experiments, kept for reference.
    #p.set64(CREG[cluster] + CLUSTER_LIMIT1, 1<<63)
    #p.clear64(CREG[cluster] + CLUSTER_LIMIT2, 1<<63)
    #p.set64(CREG[cluster] + CLUSTER_LIMIT3, 1<<63)
    #p.set64(CREG[cluster] + CLUSTER_PSTATE, 0)
    #p.set32(PMGR + PMGR_CPUGATING + 8 * cluster, 1<<31)
    #p.write64(CREG[cluster] + CLUSTER_CTRL, 1)
    #p.set64(CREG[cluster] + CLUSTER_PSCTRL, 1<<40)
    #pstate = p.read64(CREG[cluster] + CLUSTER_PSTATE) & 0xf

# Bring up the other CPUs so bench_cpu() can run code on them.
p.smp_start_secondaries()

print("== Initial CPU frequencies ==")
for cpu in range(8):
    print(f"CPU {cpu}: {bench_cpu(cpu):.2f} MHz")
def set_pstate(cluster, pstate):
    """Request performance state ``pstate`` on CPU cluster ``cluster``.

    The value is placed in two 4-bit fields (bits 0-3 and bits 12-15) of the
    cluster's pstate register, together with bit 25, through ``p.mask64``
    with field mask ``0xf00f``.
    NOTE(review): ``mask64`` presumably clears the masked bits and then ORs
    in the new value -- confirm against the proxy implementation.

    For cluster 1 only, two 32-bit values are additionally written to eight
    register pairs spaced 0x40000 apart; one value pair is used when
    ``pstate > 8`` and another otherwise.

    Args:
        cluster: index into ``CREG`` selecting the cluster.
        pstate: performance state number, 0..15 inclusive.

    Raises:
        ValueError: if ``pstate`` does not fit into a 4-bit field.  Raised
            before any register is touched.
    """
    # Each destination field is only 4 bits wide (mask 0xf00f).  A larger or
    # negative value would set bits outside the cleared fields and silently
    # corrupt neighbouring parts of the register.
    if not 0 <= pstate <= 0xf:
        raise ValueError(f"pstate must be in range 0..15, got {pstate!r}")

    # This really seems to be all that's needed
    p.mask64(
        CREG[cluster] + CLUSTER_PSTATE,
        0xf00f,
        (1 << 25) | pstate | (pstate << 12),
    )

    # Optionally, adjust MCC performance in higher p-core pstates
    if cluster == 1:
        if pstate > 8:
            p0, p1 = 0x133, 0x55555340
        else:
            p0, p1 = 0x813057f, 0x1800180

        # Same two registers repeated in eight blocks, 0x40000 apart.
        for lane in range(8):
            lane_offset = lane * 0x40000
            p.write32(0x200200dc4 + lane_offset, p0)
            p.write32(0x200200dbc + lane_offset, p1)

    # This seems to be about notifying PMP
    #p.write32(0x23b738004 + cluster*4, pstate)
    #p.write32(0x23bc34000, 1 << cluster)
# Switch cluster 1 (reported as "P-Core") to pstate 15.
set_pstate(1, 15)

# Read back both clusters' pstate registers to show the effect.
e_pstate, p_pstate = [p.read64(CREG[i] + CLUSTER_PSTATE) for i in (0, 1)]
print(f"E-Core pstate: {e_pstate:x}")
print(f"P-Core pstate: {p_pstate:x}")

# NOTE(review): presumably gives the pstate change time to take effect
# before measuring again -- confirm whether the delay is required.
time.sleep(0.5)

print("== Final CPU frequencies ==")
#elapsed = p.smp_call(7, util.bench, 80000000)
for cpu in range(8):
    print(f"CPU {cpu}: {bench_cpu(cpu):.2f} MHz")
#elapsed = p.smp_wait(7)
|