From: YOKOTA Hiroshi <yokota.hgml@gmail.com>
Date: Sun, 21 Aug 2022 16:50:54 +0900
Subject: Manually de-reference pointers

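The function type declared in C/Aes.h and the function definitions in
C/AesOpt.c use different pointer parameter types (listed below), so the
pointers are implicitly reinterpreted when the functions are called.

This pointer type mismatch breaks link time optimization (LTO) because it
violates strict-aliasing rules. Make the definitions take the declared
parameter types and convert the pointers to the vector types explicitly
inside each function.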

C/Aes.h:
  typedef void (MY_FAST_CALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks);
C/AesOpt.c:
  void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks)
  void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks)
---
 C/AesOpt.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/C/AesOpt.c b/C/AesOpt.c
index 60058bc..1a81546 100755
--- a/C/AesOpt.c
+++ b/C/AesOpt.c
@@ -61,7 +61,7 @@
 #endif
 
 #define AES_FUNC_START(name) \
-    void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks)
+    void MY_FAST_CALL name(UInt32 *d_p, Byte *d_data, size_t numBlocks)
 
 #define AES_FUNC_START2(name) \
 AES_FUNC_START (name); \
@@ -77,6 +77,9 @@ AES_FUNC_START (name)
 
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
+  __m128i *p    = (__m128i *)(void *)d_p;
+  __m128i *data = (__m128i *)(void *)d_data;
+
   __m128i m = *p;
   const __m128i k0 = p[2];
   const __m128i k1 = p[3];
@@ -218,6 +221,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 AES_FUNC_START2 (AesCbc_Decode_HW)
 {
+  __m128i *p    = (__m128i *)(void *)d_p;
+  __m128i *data = (__m128i *)(void *)d_data;
+
   __m128i iv = *p;
   const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
   const __m128i *dataEnd;
@@ -271,6 +277,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
 
 AES_FUNC_START2 (AesCtr_Code_HW)
 {
+  __m128i *p    = (__m128i *)(void *)d_p;
+  __m128i *data = (__m128i *)(void *)d_data;
+
   __m128i ctr = *p;
   UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
   const __m128i *dataEnd;
@@ -344,6 +353,9 @@ AES_FUNC_START (name)
 
 VAES_FUNC_START2 (AesCbc_Decode_HW_256)
 {
+  __m128i *p    = (__m128i *)(void *)d_p;
+  __m128i *data = (__m128i *)(void *)d_data;
+
   __m128i iv = *p;
   const __m128i *dataEnd;
   UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
@@ -415,6 +427,9 @@ AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
  
 VAES_FUNC_START2 (AesCtr_Code_HW_256)
 {
+  __m128i *p    = (__m128i *)(void *)d_p;
+  __m128i *data = (__m128i *)(void *)d_data;
+
   __m128i ctr = *p;
   UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
   const __m128i *dataEnd;
@@ -553,7 +568,7 @@ VAES_COMPAT_STUB (AesCtr_Code_HW)
 typedef uint8x16_t v128;
 
 #define AES_FUNC_START(name) \
-    void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks)
+    void MY_FAST_CALL name(UInt32 *d_p, Byte *d_data, size_t numBlocks)
 
 #define AES_FUNC_START2(name) \
 AES_FUNC_START (name); \
@@ -573,6 +588,9 @@ AES_FUNC_START (name)
 
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
+  v128 *p    = (v128 *)(void *)d_p;
+  v128 *data = (v128 *)(void *)d_data;
+
   v128 m = *p;
   const v128 k0 = p[2];
   const v128 k1 = p[3];
@@ -674,6 +692,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 AES_FUNC_START2 (AesCbc_Decode_HW)
 {
+  v128 *p    = (v128 *)(void *)d_p;
+  v128 *data = (v128 *)(void *)d_data;
+
   v128 iv = *p;
   const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
   const v128 *dataEnd;
@@ -726,6 +747,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
 
 AES_FUNC_START2 (AesCtr_Code_HW)
 {
+  v128 *p    = (v128 *)(void *)d_p;
+  v128 *data = (v128 *)(void *)d_data;
+
   uint64x2_t ctr = vreinterpretq_u64_u8(*p);
   const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
   const v128 *dataEnd;
