// (removed: stray line-number residue from file extraction)
// Copyright 2017 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "SHA1.h"
#include <array>
#include <memory>
#include <mbedtls/sha1.h>
#include "Common/Assert.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Swap.h"
#ifdef _MSC_VER
#include <intrin.h>
#else
#ifdef _M_X86_64
#include <immintrin.h>
#elif defined(_M_ARM_64)
#include <arm_acle.h>
#include <arm_neon.h>
#endif
#endif
#ifdef _MSC_VER
#define ATTRIBUTE_TARGET(x)
#else
#define ATTRIBUTE_TARGET(x) [[gnu::target(x)]]
#endif
namespace Common::SHA1
{
// Portable SHA-1 backend built on mbed TLS; selected when the host CPU has
// no hardware SHA extensions.
class ContextMbed final : public Context
{
public:
  ContextMbed()
  {
    mbedtls_sha1_init(&ctx);
    ASSERT(mbedtls_sha1_starts_ret(&ctx) == 0);
  }
  ~ContextMbed() { mbedtls_sha1_free(&ctx); }

  // Feed len bytes of message into the running hash.
  virtual void Update(const u8* msg, size_t len) override
  {
    ASSERT(mbedtls_sha1_update_ret(&ctx, msg, len) == 0);
  }

  // Apply SHA-1 padding and produce the 20-byte digest.
  virtual Digest Finish() override
  {
    Digest digest{};
    ASSERT(mbedtls_sha1_finish_ret(&ctx, digest.data()) == 0);
    return digest;
  }

  virtual bool HwAccelerated() const override { return false; }

private:
  mbedtls_sha1_context ctx{};
};
class BlockContext : public Context
{
protected:
static constexpr size_t BLOCK_LEN = 64;
static constexpr u32 K[4]{0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
static constexpr u32 H[5]{0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0};
virtual void ProcessBlock(const u8* msg) = 0;
virtual Digest GetDigest() = 0;
virtual void Update(const u8* msg, size_t len) override
{
if (len == 0)
return;
msg_len += len;
if (block_used)
{
if (block_used + len >= block.size())
{
size_t rem = block.size() - block_used;
std::memcpy(&block[block_used], msg, rem);
ProcessBlock(&block[0]);
block_used = 0;
msg += rem;
len -= rem;
}
else
{
std::memcpy(&block[block_used], msg, len);
block_used += len;
return;
}
}
while (len >= BLOCK_LEN)
{
ProcessBlock(msg);
msg += BLOCK_LEN;
len -= BLOCK_LEN;
}
if (len)
{
std::memcpy(&block[0], msg, len);
block_used = len;
}
}
virtual Digest Finish() override
{
// block_used is guaranteed < BLOCK_LEN
block[block_used++] = 0x80;
constexpr size_t MSG_LEN_POS = BLOCK_LEN - sizeof(u64);
if (block_used > MSG_LEN_POS)
{
// Pad current block and process it
std::memset(&block[block_used], 0, BLOCK_LEN - block_used);
ProcessBlock(&block[0]);
// Pad a new block
std::memset(&block[0], 0, MSG_LEN_POS);
}
else
{
// Pad current block
std::memset(&block[block_used], 0, MSG_LEN_POS - block_used);
}
Common::BigEndianValue<u64> msg_bitlen(msg_len * 8);
std::memcpy(&block[MSG_LEN_POS], &msg_bitlen, sizeof(msg_bitlen));
ProcessBlock(&block[0]);
return GetDigest();
}
alignas(64) std::array<u8, BLOCK_LEN> block{};
size_t block_used{};
size_t msg_len{};
};
// Fixed-size array whose indices wrap modulo Size. Lets the SHA-1 message
// schedule address logical round indices (w[i], w[i+1], ...) while only
// keeping the last Size entries live.
template <typename ValueType, size_t Size>
class CyclicArray
{
public:
  inline ValueType operator[](size_t i) const { return data[i % Size]; }
  inline ValueType& operator[](size_t i) { return data[i % Size]; }
  // static + const-callable; the original was a non-const member function and
  // could not be invoked through a const object or reference.
  static constexpr size_t size() { return Size; }

private:
  // Intentionally left uninitialized; callers fully overwrite the entries
  // before reading them.
  std::array<ValueType, Size> data;
};
#ifdef _M_X86_64
// Uses the dedicated SHA1 instructions. Normal SSE(AVX*) would be needed for parallel
// multi-message processing. While Dolphin could gain from such implementation, it requires the
// calling code to be modified and/or making the SHA1 implementation asynchronous so it can
// optimistically batch.
// Uses the dedicated SHA1 instructions. Normal SSE(AVX*) would be needed for parallel
// multi-message processing. While Dolphin could gain from such implementation, it requires the
// calling code to be modified and/or making the SHA1 implementation asynchronous so it can
// optimistically batch.
class ContextX64SHA1 final : public BlockContext
{
public:
  ContextX64SHA1()
  {
    // state[0] packs a,b,c,d into one xmm register; state[1] keeps e in the
    // most-significant lane (remaining lanes zero).
    state[0] = _mm_set_epi32(H[0], H[1], H[2], H[3]);
    state[1] = _mm_set_epi32(H[4], 0, 0, 0);
  }

private:
  struct XmmReg
  {
    // Allows aliasing attributes to be respected in the
    // face of templates.
    __m128i data;
    XmmReg& operator=(const __m128i& d)
    {
      data = d;
      return *this;
    }
    operator __m128i() const { return data; }
  };
  // Four xmm registers = 16 message-schedule words, addressed cyclically.
  using WorkBlock = CyclicArray<XmmReg, 4>;

  // Byte-reverse all 16 bytes of a lane (SHA-1 operates on big-endian words).
  ATTRIBUTE_TARGET("ssse3")
  static inline __m128i byterev_16B(__m128i x)
  {
    return _mm_shuffle_epi8(x, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
  }

  // Compute the next four message-schedule words, updating slot I of the
  // cyclic work block in place.
  template <size_t I>
  ATTRIBUTE_TARGET("sha")
  static inline __m128i MsgSchedule(WorkBlock* wblock)
  {
    auto& w = *wblock;
    // Update and return this location
    auto& wx = w[I];
    // Do all the xors and rol(x,1) required for 4 rounds of msg schedule
    wx = _mm_sha1msg1_epu32(wx, w[I + 1]);
    wx = _mm_xor_si128(wx, w[I + 2]);
    wx = _mm_sha1msg2_epu32(wx, w[I + 3]);
    return wx;
  }

  ATTRIBUTE_TARGET("sha")
  virtual void ProcessBlock(const u8* msg) override
  {
    // There are 80 rounds with 4 bytes per round, giving 0x140 byte work space, but we can keep
    // active state in just 0x40 bytes.
    // see FIPS 180-4 6.1.3 Alternate Method for Computing a SHA-1 Message Digest
    WorkBlock w;
    auto msg_block = (const __m128i*)msg;
    for (size_t i = 0; i < w.size(); i++)
      w[i] = byterev_16B(_mm_loadu_si128(&msg_block[i]));

    // 0: abcd, 1: e
    auto abcde = state;

    // Not sure of a (non-ugly) way to have constant-evaluated for-loop, so just rely on inlining.
    // Problem is that sha1rnds4 requires imm8 arg, and first/last rounds have different behavior.
    // The imm8 (0..3) selects the round function/constant for each group of 20
    // rounds; the two abcde slots ping-pong between destination and source.
    // clang-format off
    // E0 += MSG0, special case of "nexte", can do normal add
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_add_epi32(abcde[1], w[0]), 0);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], w[1]), 0);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], w[2]), 0);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], w[3]), 0);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<4>(&w)), 0);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<5>(&w)), 1);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<6>(&w)), 1);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<7>(&w)), 1);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<8>(&w)), 1);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<9>(&w)), 1);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<10>(&w)), 2);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<11>(&w)), 2);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<12>(&w)), 2);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<13>(&w)), 2);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<14>(&w)), 2);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<15>(&w)), 3);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<16>(&w)), 3);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<17>(&w)), 3);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<18>(&w)), 3);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<19>(&w)), 3);
    // state += abcde
    state[1] = _mm_sha1nexte_epu32(abcde[1], state[1]);
    state[0] = _mm_add_epi32(abcde[0], state[0]);
    // clang-format on
  }

  virtual Digest GetDigest() override
  {
    Digest digest;
    // First 16 bytes: a,b,c,d swapped back to big-endian byte order.
    _mm_storeu_si128((__m128i*)&digest[0], byterev_16B(state[0]));
    // Final 4 bytes: e, taken from the low dword after full byte reversal
    // (e was stored in the most-significant lane).
    u32 hi = _mm_cvtsi128_si32(byterev_16B(state[1]));
    std::memcpy(&digest[sizeof(__m128i)], &hi, sizeof(hi));
    return digest;
  }

  virtual bool HwAccelerated() const override { return true; }

  std::array<XmmReg, 2> state{};
};
#endif
#ifdef _M_ARM_64
// SHA-1 backend using the ARMv8 Cryptographic Extension instructions
// (vsha1*), structured to mirror the x64 implementation above.
class ContextNeon final : public BlockContext
{
public:
  ContextNeon()
  {
    state.abcd = vld1q_u32(&H[0]);
    state.e = H[4];
  }

private:
  // Four NEON registers = 16 message-schedule words, addressed cyclically.
  using WorkBlock = CyclicArray<uint32x4_t, 4>;

  struct State
  {
    // ARM thought they were being clever by exposing e as u32, but it actually makes non-asm
    // implementations pretty annoying/makes compiler's life needlessly difficult.
    uint32x4_t abcd{};
    u32 e{};
  };

  // Compute the next four message-schedule words, updating slot i of the
  // cyclic work block in place.
  static inline uint32x4_t MsgSchedule(WorkBlock* wblock, size_t i)
  {
    auto& w = *wblock;
    // Update and return this location
    auto& wx = w[0 + i];
    wx = vsha1su0q_u32(wx, w[1 + i], w[2 + i]);
    wx = vsha1su1q_u32(wx, w[3 + i]);
    return wx;
  }

  // Four rounds of the SHA-1 round function selected by Func (0..3):
  // vsha1c (choose), vsha1p (parity), vsha1m (majority), each with its own
  // round constant K[Func] pre-added to the schedule words.
  template <size_t Func>
  static inline constexpr uint32x4_t f(State state, uint32x4_t w)
  {
    const auto wk = vaddq_u32(w, vdupq_n_u32(K[Func]));
    if constexpr (Func == 0)
      return vsha1cq_u32(state.abcd, state.e, wk);
    if constexpr (Func == 1 || Func == 3)
      return vsha1pq_u32(state.abcd, state.e, wk);
    if constexpr (Func == 2)
      return vsha1mq_u32(state.abcd, state.e, wk);
  }

  // One group of four rounds: new abcd from f<Func>, new e via vsha1h
  // (fixed rotate of the old a, per the SHA1H instruction).
  template <size_t Func>
  static inline constexpr State FourRounds(State state, uint32x4_t w)
  {
    return {f<Func>(state, w), vsha1h_u32(vgetq_lane_u32(state.abcd, 0))};
  }

  virtual void ProcessBlock(const u8* msg) override
  {
    WorkBlock w;
    // Load the block and byte-swap to big-endian words.
    for (size_t i = 0; i < w.size(); i++)
      w[i] = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&msg[sizeof(uint32x4_t) * i])));

    // states[0] starts as the incoming hash state; states[1] is scratch.
    std::array<State, 2> states{state};

    // Fashioned to look like x64 impl.
    // In each case the goal is to have compiler inline + unroll everything.
    states[1] = FourRounds<0>(states[0], w[0]);
    states[0] = FourRounds<0>(states[1], w[1]);
    states[1] = FourRounds<0>(states[0], w[2]);
    states[0] = FourRounds<0>(states[1], w[3]);
    states[1] = FourRounds<0>(states[0], MsgSchedule(&w, 4));
    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 5));
    states[1] = FourRounds<1>(states[0], MsgSchedule(&w, 6));
    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 7));
    states[1] = FourRounds<1>(states[0], MsgSchedule(&w, 8));
    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 9));
    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 10));
    states[0] = FourRounds<2>(states[1], MsgSchedule(&w, 11));
    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 12));
    states[0] = FourRounds<2>(states[1], MsgSchedule(&w, 13));
    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 14));
    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 15));
    states[1] = FourRounds<3>(states[0], MsgSchedule(&w, 16));
    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 17));
    states[1] = FourRounds<3>(states[0], MsgSchedule(&w, 18));
    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 19));

    // state += result of the 80 rounds.
    state = {vaddq_u32(state.abcd, states[0].abcd), state.e + states[0].e};
  }

  virtual Digest GetDigest() override
  {
    Digest digest;
    // First 16 bytes: a,b,c,d swapped back to big-endian byte order.
    vst1q_u8(&digest[0], vrev32q_u8(vreinterpretq_u8_u32(state.abcd)));
    // Final 4 bytes: e, byte-swapped to big-endian (swap is its own inverse,
    // so FromBigEndian serves as "to big-endian" here).
    u32 e = Common::FromBigEndian(state.e);
    std::memcpy(&digest[sizeof(state.abcd)], &e, sizeof(e));
    return digest;
  }

  virtual bool HwAccelerated() const override { return true; }

  State state;
};
#endif
// Factory: picks the fastest SHA-1 implementation supported by the host CPU
// (per cpu_info feature flags), falling back to the portable mbed TLS backend.
std::unique_ptr<Context> CreateContext()
{
  if (cpu_info.bSHA1)
  {
#ifdef _M_X86_64
    // Note: As of mid 2022, > 99% of CPUs reporting to Steam survey have SSSE3, ~40% have SHA.
    // Seems unlikely we'll see any cpus supporting SHA but not SSSE3 (in the foreseeable future at
    // least).
    if (cpu_info.bSSSE3)
      return std::make_unique<ContextX64SHA1>();
#elif defined(_M_ARM_64)
    return std::make_unique<ContextNeon>();
#endif
  }
  return std::make_unique<ContextMbed>();
}
// One-shot convenience: hash an entire buffer with the best available backend.
Digest CalculateDigest(const u8* msg, size_t len)
{
  const auto context = CreateContext();
  context->Update(msg, len);
  return context->Finish();
}
std::string DigestToString(const Digest& digest)
{
static constexpr std::array<char, 16> lookup = {'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
std::string hash;
hash.reserve(digest.size() * 2);
for (size_t i = 0; i < digest.size(); ++i)
{
const u8 upper = static_cast<u8>((digest[i] >> 4) & 0xf);
const u8 lower = static_cast<u8>(digest[i] & 0xf);
hash.push_back(lookup[upper]);
hash.push_back(lookup[lower]);
}
return hash;
}
} // namespace Common::SHA1
// (removed: stray '|' residue from file extraction)