File: SHA1.cpp

package info (click to toggle)
dolphin-emu 2503%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 111,624 kB
  • sloc: cpp: 787,747; ansic: 217,914; xml: 31,400; python: 4,226; yacc: 3,985; javascript: 2,430; makefile: 777; asm: 726; sh: 281; pascal: 257; perl: 97; objc: 75
file content (404 lines) | stat: -rw-r--r-- 12,535 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
// Copyright 2017 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "SHA1.h"

#include <array>
#include <cstring>
#include <memory>

#include <mbedtls/sha1.h>

#include "Common/Assert.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Swap.h"

#ifdef _MSC_VER
#include <intrin.h>
#else
#ifdef _M_X86_64
#include <immintrin.h>
#elif defined(_M_ARM_64)
#include <arm_acle.h>
#include <arm_neon.h>
#endif
#endif

#ifdef _MSC_VER
#define ATTRIBUTE_TARGET(x)
#else
#define ATTRIBUTE_TARGET(x) [[gnu::target(x)]]
#endif

namespace Common::SHA1
{
class ContextMbed final : public Context
{
public:
  ContextMbed()
  {
    mbedtls_sha1_init(&ctx);
    ASSERT(!mbedtls_sha1_starts_ret(&ctx));
  }
  ~ContextMbed() { mbedtls_sha1_free(&ctx); }
  virtual void Update(const u8* msg, size_t len) override
  {
    ASSERT(!mbedtls_sha1_update_ret(&ctx, msg, len));
  }
  virtual Digest Finish() override
  {
    Digest digest;
    ASSERT(!mbedtls_sha1_finish_ret(&ctx, digest.data()));
    return digest;
  }
  virtual bool HwAccelerated() const override { return false; }

private:
  mbedtls_sha1_context ctx{};
};

// Common scaffolding for the hardware-accelerated implementations below:
// buffers incoming data into 64-byte blocks, performs the final message
// padding (FIPS 180-4 section 5.1.1), and delegates per-block compression
// to the derived class via ProcessBlock().
class BlockContext : public Context
{
protected:
  static constexpr size_t BLOCK_LEN = 64;
  // Round constants, one per group of 20 rounds (FIPS 180-4 section 4.2.1).
  static constexpr u32 K[4]{0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
  // Initial hash value (FIPS 180-4 section 5.3.1).
  static constexpr u32 H[5]{0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0};

  // Compress one full 64-byte block into the derived class's running state.
  virtual void ProcessBlock(const u8* msg) = 0;
  // Convert the derived class's internal state into the final digest.
  virtual Digest GetDigest() = 0;

  virtual void Update(const u8* msg, size_t len) override
  {
    if (len == 0)
      return;
    msg_len += len;

    // First, top up any partially-filled block left over from a prior call.
    if (block_used)
    {
      if (block_used + len >= block.size())
      {
        size_t rem = block.size() - block_used;
        std::memcpy(&block[block_used], msg, rem);
        ProcessBlock(&block[0]);
        block_used = 0;
        msg += rem;
        len -= rem;
      }
      else
      {
        // Still not enough for a full block; just buffer and return.
        std::memcpy(&block[block_used], msg, len);
        block_used += len;
        return;
      }
    }
    // Process whole blocks directly from the caller's buffer (no copy).
    while (len >= BLOCK_LEN)
    {
      ProcessBlock(msg);
      msg += BLOCK_LEN;
      len -= BLOCK_LEN;
    }
    // Stash the remaining tail for a later Update()/Finish().
    if (len)
    {
      std::memcpy(&block[0], msg, len);
      block_used = len;
    }
  }

  virtual Digest Finish() override
  {
    // Append the mandatory 0x80 terminator byte (FIPS 180-4 section 5.1.1).
    // block_used is guaranteed < BLOCK_LEN
    block[block_used++] = 0x80;

    // The message bit-length occupies the final 8 bytes of the last block.
    constexpr size_t MSG_LEN_POS = BLOCK_LEN - sizeof(u64);
    if (block_used > MSG_LEN_POS)
    {
      // The length no longer fits after the terminator byte:
      // Pad current block and process it
      std::memset(&block[block_used], 0, BLOCK_LEN - block_used);
      ProcessBlock(&block[0]);

      // Pad a new block
      std::memset(&block[0], 0, MSG_LEN_POS);
    }
    else
    {
      // Pad current block
      std::memset(&block[block_used], 0, MSG_LEN_POS - block_used);
    }

    // Message length is stored in bits, big-endian.
    Common::BigEndianValue<u64> msg_bitlen(msg_len * 8);
    std::memcpy(&block[MSG_LEN_POS], &msg_bitlen, sizeof(msg_bitlen));

    ProcessBlock(&block[0]);

    return GetDigest();
  }

  // Staging buffer for a partially-filled block (64-byte aligned).
  alignas(64) std::array<u8, BLOCK_LEN> block{};
  // Number of bytes currently buffered in 'block'; always < BLOCK_LEN.
  size_t block_used{};
  // Total message bytes consumed so far; needed for the final length padding.
  size_t msg_len{};
};

// Fixed-size array whose indices wrap modulo Size. Lets the SHA-1 message
// schedule below be addressed with monotonically increasing round indices
// while only Size entries of storage are ever live.
template <typename ValueType, size_t Size>
class CyclicArray
{
public:
  inline ValueType operator[](size_t i) const { return data[i % Size]; }
  inline ValueType& operator[](size_t i) { return data[i % Size]; }
  // static (was a non-const member): depends on no instance state, and this
  // makes it callable on const objects too.
  static constexpr size_t size() { return Size; }

private:
  // Intentionally not value-initialized: every slot is fully written before
  // being read (see the ProcessBlock implementations), and zeroing would add
  // avoidable per-block work.
  std::array<ValueType, Size> data;
};

#ifdef _M_X86_64

// Uses the dedicated SHA1 instructions. Normal SSE(AVX*) would be needed for parallel
// multi-message processing. While Dolphin could gain from such implementation, it requires the
// calling code to be modified and/or making the SHA1 implementation asynchronous so it can
// optimistically batch.
class ContextX64SHA1 final : public BlockContext
{
public:
  ContextX64SHA1()
  {
    // _mm_set_epi32 places its first argument in the highest 32-bit lane, so
    // state[0] holds a,b,c,d with 'a' in the top lane (the layout sha1rnds4
    // operates on), and state[1] holds 'e' in its top lane.
    state[0] = _mm_set_epi32(H[0], H[1], H[2], H[3]);
    state[1] = _mm_set_epi32(H[4], 0, 0, 0);
  }

private:
  struct XmmReg
  {
    // Allows aliasing attributes to be respected in the
    // face of templates.
    __m128i data;

    XmmReg& operator=(const __m128i& d)
    {
      data = d;
      return *this;
    }
    operator __m128i() const { return data; }
  };
  // Rolling 4 x 16-byte window over the 80-word message schedule
  // (the w[0..3] of FIPS 180-4 6.1.3); indices wrap via CyclicArray.
  using WorkBlock = CyclicArray<XmmReg, 4>;

  // Reverse all 16 bytes of x: the message words are big-endian, x86 is not.
  ATTRIBUTE_TARGET("ssse3")
  static inline __m128i byterev_16B(__m128i x)
  {
    return _mm_shuffle_epi8(x, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
  }

  // Compute the next 4 schedule words in place at w[I] and return them.
  template <size_t I>
  ATTRIBUTE_TARGET("sha")
  static inline __m128i MsgSchedule(WorkBlock* wblock)
  {
    auto& w = *wblock;
    // Update and return this location
    auto& wx = w[I];
    // Do all the xors and rol(x,1) required for 4 rounds of msg schedule
    wx = _mm_sha1msg1_epu32(wx, w[I + 1]);
    wx = _mm_xor_si128(wx, w[I + 2]);
    wx = _mm_sha1msg2_epu32(wx, w[I + 3]);
    return wx;
  }

  ATTRIBUTE_TARGET("sha")
  virtual void ProcessBlock(const u8* msg) override
  {
    // There are 80 rounds with 4 bytes per round, giving 0x140 byte work space, but we can keep
    // active state in just 0x40 bytes.
    // see FIPS 180-4 6.1.3 Alternate Method for Computing a SHA-1 Message Digest
    WorkBlock w;
    auto msg_block = (const __m128i*)msg;
    for (size_t i = 0; i < w.size(); i++)
      w[i] = byterev_16B(_mm_loadu_si128(&msg_block[i]));

    // 0: abcd, 1: e
    auto abcde = state;

    // Not sure of a (non-ugly) way to have constant-evaluated for-loop, so just rely on inlining.
    // Problem is that sha1rnds4 requires imm8 arg, and first/last rounds have different behavior.

    // Each sha1rnds4 executes 4 rounds; its imm8 operand (0..3) selects the
    // round function + constant for rounds 0-19/20-39/40-59/60-79. The two
    // abcde slots are ping-ponged so each group consumes the previous output,
    // and sha1nexte folds the rotated 'e' into the next group's first word.
    // clang-format off
    // E0 += MSG0, special case of "nexte", can do normal add
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_add_epi32(abcde[1], w[0]), 0);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], w[1]), 0);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], w[2]), 0);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], w[3]), 0);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<4>(&w)), 0);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<5>(&w)), 1);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<6>(&w)), 1);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<7>(&w)), 1);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<8>(&w)), 1);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<9>(&w)), 1);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<10>(&w)), 2);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<11>(&w)), 2);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<12>(&w)), 2);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<13>(&w)), 2);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<14>(&w)), 2);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<15>(&w)), 3);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<16>(&w)), 3);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<17>(&w)), 3);
    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<18>(&w)), 3);
    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<19>(&w)), 3);
    // state += abcde
    state[1] = _mm_sha1nexte_epu32(abcde[1], state[1]);
    state[0] = _mm_add_epi32(abcde[0], state[0]);
    // clang-format on
  }

  virtual Digest GetDigest() override
  {
    Digest digest;
    // First 16 bytes: a,b,c,d swapped back to big-endian byte order.
    _mm_storeu_si128((__m128i*)&digest[0], byterev_16B(state[0]));
    // Last 4 bytes: e, which the reversal moved into the lowest lane.
    u32 hi = _mm_cvtsi128_si32(byterev_16B(state[1]));
    std::memcpy(&digest[sizeof(__m128i)], &hi, sizeof(hi));
    return digest;
  }

  virtual bool HwAccelerated() const override { return true; }

  // state[0] = abcd (a in top lane), state[1] = e (top lane).
  std::array<XmmReg, 2> state{};
};

#endif

#ifdef _M_ARM_64

// SHA-1 using the ARMv8 cryptography extension (vsha1* intrinsics).
class ContextNeon final : public BlockContext
{
public:
  ContextNeon()
  {
    state.abcd = vld1q_u32(&H[0]);
    state.e = H[4];
  }

private:
  // Rolling 4 x 4-word window over the 80-word message schedule
  // (the w[0..3] of FIPS 180-4 6.1.3); indices wrap via CyclicArray.
  using WorkBlock = CyclicArray<uint32x4_t, 4>;

  struct State
  {
    // ARM thought they were being clever by exposing e as u32, but it actually makes non-asm
    // implementations pretty annoying/makes compiler's life needlessly difficult.
    uint32x4_t abcd{};
    u32 e{};
  };

  // Compute the next 4 schedule words in place at w[i] and return them.
  static inline uint32x4_t MsgSchedule(WorkBlock* wblock, size_t i)
  {
    auto& w = *wblock;
    // Update and return this location
    auto& wx = w[0 + i];
    wx = vsha1su0q_u32(wx, w[1 + i], w[2 + i]);
    wx = vsha1su1q_u32(wx, w[3 + i]);
    return wx;
  }

  // Four SHA-1 rounds; Func (0..3) selects the round function and constant
  // K[Func] for round groups 0-19/20-39/40-59/60-79 (c = choose, p = parity,
  // m = majority, per the vsha1{c,p,m}q intrinsic naming).
  template <size_t Func>
  static inline constexpr uint32x4_t f(State state, uint32x4_t w)
  {
    const auto wk = vaddq_u32(w, vdupq_n_u32(K[Func]));
    if constexpr (Func == 0)
      return vsha1cq_u32(state.abcd, state.e, wk);
    if constexpr (Func == 1 || Func == 3)
      return vsha1pq_u32(state.abcd, state.e, wk);
    if constexpr (Func == 2)
      return vsha1mq_u32(state.abcd, state.e, wk);
  }

  // Advance the state by four rounds; vsha1h produces the next 'e'
  // (rol(a, 30)) from the current 'a' lane.
  template <size_t Func>
  static inline constexpr State FourRounds(State state, uint32x4_t w)
  {
    return {f<Func>(state, w), vsha1h_u32(vgetq_lane_u32(state.abcd, 0))};
  }

  virtual void ProcessBlock(const u8* msg) override
  {
    // Load the block and byte-swap each 32-bit word to host order.
    WorkBlock w;
    for (size_t i = 0; i < w.size(); i++)
      w[i] = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&msg[sizeof(uint32x4_t) * i])));

    std::array<State, 2> states{state};

    // Fashioned to look like x64 impl.
    // In each case the goal is to have compiler inline + unroll everything.
    states[1] = FourRounds<0>(states[0], w[0]);
    states[0] = FourRounds<0>(states[1], w[1]);
    states[1] = FourRounds<0>(states[0], w[2]);
    states[0] = FourRounds<0>(states[1], w[3]);
    states[1] = FourRounds<0>(states[0], MsgSchedule(&w, 4));
    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 5));
    states[1] = FourRounds<1>(states[0], MsgSchedule(&w, 6));
    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 7));
    states[1] = FourRounds<1>(states[0], MsgSchedule(&w, 8));
    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 9));
    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 10));
    states[0] = FourRounds<2>(states[1], MsgSchedule(&w, 11));
    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 12));
    states[0] = FourRounds<2>(states[1], MsgSchedule(&w, 13));
    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 14));
    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 15));
    states[1] = FourRounds<3>(states[0], MsgSchedule(&w, 16));
    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 17));
    states[1] = FourRounds<3>(states[0], MsgSchedule(&w, 18));
    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 19));

    // Fold the 80-round result back into the running state (mod 2^32 adds).
    state = {vaddq_u32(state.abcd, states[0].abcd), state.e + states[0].e};
  }

  virtual Digest GetDigest() override
  {
    Digest digest;
    // First 16 bytes: a,b,c,d swapped back to big-endian; last 4 bytes: e.
    vst1q_u8(&digest[0], vrev32q_u8(vreinterpretq_u8_u32(state.abcd)));
    u32 e = Common::FromBigEndian(state.e);
    std::memcpy(&digest[sizeof(state.abcd)], &e, sizeof(e));
    return digest;
  }

  virtual bool HwAccelerated() const override { return true; }

  State state;
};

#endif

// Pick the fastest SHA-1 implementation the current CPU supports,
// falling back to the mbedtls software implementation.
std::unique_ptr<Context> CreateContext()
{
#ifdef _M_X86_64
  // Note: As of mid 2022, > 99% of CPUs reporting to Steam survey have SSSE3, ~40% have SHA.
  // Seems unlikely we'll see any cpus supporting SHA but not SSSE3 (in the foreseeable future at
  // least).
  if (cpu_info.bSHA1 && cpu_info.bSSSE3)
    return std::make_unique<ContextX64SHA1>();
#elif defined(_M_ARM_64)
  if (cpu_info.bSHA1)
    return std::make_unique<ContextNeon>();
#endif
  return std::make_unique<ContextMbed>();
}

// One-shot convenience wrapper: hash len bytes of msg in a single pass.
Digest CalculateDigest(const u8* msg, size_t len)
{
  const auto context = CreateContext();
  context->Update(msg, len);
  return context->Finish();
}

std::string DigestToString(const Digest& digest)
{
  static constexpr std::array<char, 16> lookup = {'0', '1', '2', '3', '4', '5', '6', '7',
                                                  '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
  std::string hash;
  hash.reserve(digest.size() * 2);
  for (size_t i = 0; i < digest.size(); ++i)
  {
    const u8 upper = static_cast<u8>((digest[i] >> 4) & 0xf);
    const u8 lower = static_cast<u8>(digest[i] & 0xf);
    hash.push_back(lookup[upper]);
    hash.push_back(lookup[lower]);
  }
  return hash;
}
}  // namespace Common::SHA1