1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
|
//+build !noasm,!appengine,gc
// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
package md5simd
import (
"fmt"
"math"
"unsafe"
"github.com/klauspost/cpuid/v2"
)
// hasAVX512 records whether the CPU offers the AVX-512 feature sets needed
// by the 16-lane code path. It is assigned once, in init.
var hasAVX512 bool

func init() {
	// AVX512F alone is not enough: VANDNPD needs AVX512DQ as well.
	// (Technically VPTERNLOGQ, which is plain AVX512F, could be used instead.)
	cpu := cpuid.CPU
	hasAVX512 = cpu.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
}
// block8 has no Go body; it is the routine that blockMd5_avx2 (the
// "interface function to AVX2 assembly code") hands its work to.
//
// As called from blockMd5_avx2:
//   - state points at the first word of a digest8 working copy,
//   - base is the address the lane offsets are relative to,
//   - bufs points at the first of eight int32 lane offsets,
//   - cache points at the first byte of a cache8 scratch buffer,
//   - n is 64 times the round count of the current mask/rounds entry.
//
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
// block16 has no Go body; it is the routine that blockMd5_avx512 (the
// "interface function to AVX512 assembly code") hands its work to.
//
// As called from blockMd5_avx512:
//   - state points at the first word of a digest16 working copy,
//   - base is the address the lane offsets are relative to,
//   - ptrs points at the first of sixteen int32 lane offsets,
//   - mask is the lane bit mask of the current mask/rounds entry,
//   - n is 64 times the round count of that entry.
//
//go:noescape
func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)
// digest8 is the state of eight MD5 digests processed side by side.
// Each field holds one of the four uint32 state words for all eight lanes,
// so the struct corresponds to four ymm registers (ymm0, ymm1, ymm2, ymm3).
type digest8 struct {
	v0 [8]uint32
	v1 [8]uint32
	v2 [8]uint32
	v3 [8]uint32
}
// cache8 is stack scratch space for eight 64-byte (md5.BlockSize) blocks.
// The area in use has to be 32-byte aligned, so 32 spare bytes are added
// to the 512 needed and the start is aligned upwards at runtime.
type cache8 [8*64 + 32]byte
// md5consts holds the 64 MD5 magic numbers for a single hashing lane.
// avx256md5consts and avx512md5consts are built from this table by
// repeating every entry 8 and 16 times respectively.
var md5consts = [64]uint32{
	// entries 0-15
	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
	// entries 16-31
	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
	// entries 32-47
	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
	// entries 48-63
	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
}
// avx256md5consts is md5consts widened for 8x md5: every constant appears
// eight times in a row, filling one 256-bit ymm register per constant.
var avx256md5consts = func(c []uint32) []uint32 {
	const lanes = 8
	wide := make([]uint32, 0, lanes*len(c))
	for _, k := range c {
		for n := 0; n < lanes; n++ {
			wide = append(wide, k)
		}
	}
	return wide
}(md5consts[:])
// digest16 is the state of sixteen MD5 digests processed side by side.
// Each field holds one of the four uint32 state words for all sixteen
// lanes, so the struct corresponds to four zmm registers.
type digest16 struct {
	v0 [16]uint32
	v1 [16]uint32
	v2 [16]uint32
	v3 [16]uint32
}
// avx512md5consts is md5consts widened for 16x md5: every constant appears
// sixteen times in a row, filling one 512-bit zmm register per constant.
var avx512md5consts = func(c []uint32) []uint32 {
	const lanes = 16
	wide := make([]uint32, 0, lanes*len(c))
	for _, k := range c {
		for n := 0; n < lanes; n++ {
			wide = append(wide, k)
		}
	}
	return wide
}(md5consts[:])
// blockMd5_x16 is the entry point to the assembly code for up to sixteen
// lanes: it hashes input and updates the lane states held in d.
//
// When hasAVX512 is set, all sixteen lanes go through blockMd5_avx512 in a
// single call. Otherwise blockMd5_avx2 is used: with half set only lanes 0-7
// are processed (on the calling goroutine); without it lanes 0-7 and 8-15
// are processed as two 8-lane batches on separate goroutines.
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
	if hasAVX512 {
		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
		return
	}

	// The data below is moved with explicit loops on purpose: preparing it
	// with copy is slower since copies aren't inlined.

	if half {
		// Calculate on this goroutine.
		for lane := range s.i8[0] {
			s.i8[0][lane] = input[lane]
		}
		for lane := range s.d8a.v0 {
			s.d8a.v0[lane] = d.v0[lane]
			s.d8a.v1[lane] = d.v1[lane]
			s.d8a.v2[lane] = d.v2[lane]
			s.d8a.v3[lane] = d.v3[lane]
		}

		blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)

		for lane := range s.d8a.v0 {
			d.v0[lane] = s.d8a.v0[lane]
			d.v1[lane] = s.d8a.v1[lane]
			d.v2[lane] = s.d8a.v2[lane]
			d.v3[lane] = s.d8a.v3[lane]
		}
		return
	}

	// Split the sixteen lanes into a lower (0-7) and an upper (8-15) batch.
	for lane := range s.i8[0] {
		s.i8[0][lane] = input[lane]
		s.i8[1][lane] = input[lane+8]
	}
	for lane := range s.d8a.v0 {
		upper := lane + 8

		s.d8a.v0[lane] = d.v0[lane]
		s.d8a.v1[lane] = d.v1[lane]
		s.d8a.v2[lane] = d.v2[lane]
		s.d8a.v3[lane] = d.v3[lane]

		s.d8b.v0[lane] = d.v0[upper]
		s.d8b.v1[lane] = d.v1[upper]
		s.d8b.v2[lane] = d.v2[upper]
		s.d8b.v3[lane] = d.v3[upper]
	}

	// Benchmarks appear to be slightly faster when spinning up 2 goroutines
	// instead of using the current one for one of the blocks.
	s.wg.Add(2)
	go func() {
		blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
		s.wg.Done()
	}()
	go func() {
		blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b)
		s.wg.Done()
	}()
	s.wg.Wait()

	// Merge both batches back into the caller's sixteen-lane digest.
	for lane := range s.d8a.v0 {
		upper := lane + 8

		d.v0[lane] = s.d8a.v0[lane]
		d.v1[lane] = s.d8a.v1[lane]
		d.v2[lane] = s.d8a.v2[lane]
		d.v3[lane] = s.d8a.v3[lane]

		d.v0[upper] = s.d8b.v0[lane]
		d.v1[upper] = s.d8b.v1[lane]
		d.v2[upper] = s.d8b.v2[lane]
		d.v3[upper] = s.d8b.v3[lane]
	}
}
// blockMd5_avx512 is the Go-side driver for the AVX512 assembly code.
//
// Every non-empty input lane is expressed as a 32-bit offset from the start
// of base, the mask/rounds schedule is generated into maskRounds, and
// block16 is called once per schedule entry. After each call the state of
// the lanes that were active in that entry is copied back into s.
//
// It panics if a lane is longer than internalBlockSize or if its offset
// from base does not fit in 32 bits.
func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
	origin := uint64(uintptr(unsafe.Pointer(&base[0])))

	var ptrs [16]int32
	for lane := range ptrs {
		buf := input[lane]
		if len(buf) == 0 {
			continue // empty lane: offset stays zero
		}
		if len(buf) > internalBlockSize {
			panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", lane))
		}

		delta := uint64(uintptr(unsafe.Pointer(&buf[0]))) - origin
		if delta > math.MaxUint32 {
			panic(fmt.Sprintf("invalid buffer sent with offset %x", delta))
		}
		ptrs[lane] = int32(delta)
	}

	// The assembly works on a copy of the initial states; results are only
	// copied to s for lanes that are active in the current round.
	work := *s

	total := generateMaskAndRounds16(input, maskRounds)
	for r := 0; r < total; r++ {
		m := maskRounds[r]
		step := int32(64 * m.rounds)

		block16(&work.v0[0], uintptr(origin), &ptrs[0], m.mask, int(64*m.rounds))

		for lane := range ptrs {
			// Every offset moves on to the next round, active or not.
			ptrs[lane] += step

			if m.mask&(1<<lane) == 0 {
				continue // lane not active in this round: keep its digest
			}
			s.v0[lane] = work.v0[lane]
			s.v1[lane] = work.v1[lane]
			s.v2[lane] = work.v2[lane]
			s.v3[lane] = work.v3[lane]
		}
	}
}
// blockMd5_avx2 is the Go-side driver for the AVX2 assembly code.
//
// Every non-empty input lane is expressed as a 32-bit offset from an origin
// four bytes before the start of base, the mask/rounds schedule is generated
// into maskRounds, and block8 is called once per schedule entry. After each
// call the state of the lanes that were active in that entry is copied back
// into s.
//
// It panics if a lane is longer than internalBlockSize or if its offset
// does not fit in 32 bits.
func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
	// NOTE(review): the 4-byte bias is presumably something block8 relies
	// on — confirm against the assembly before changing it.
	origin := uint64(uintptr(unsafe.Pointer(&base[0]))) - 4

	var ptrs [8]int32
	for lane := range ptrs {
		buf := input[lane]
		if len(buf) == 0 {
			continue // empty lane: offset stays zero
		}
		if len(buf) > internalBlockSize {
			panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", lane))
		}

		delta := uint64(uintptr(unsafe.Pointer(&buf[0]))) - origin
		if delta > math.MaxUint32 {
			panic(fmt.Sprintf("invalid buffer sent with offset %x", delta))
		}
		ptrs[lane] = int32(delta)
	}

	// The assembly works on a copy of the initial states; results are only
	// copied to s for lanes that are active in the current round.
	work := *s

	total := generateMaskAndRounds8(input, maskRounds)
	for r := 0; r < total; r++ {
		m := maskRounds[r]
		step := int32(64 * m.rounds)

		// Fresh stack storage for block8's temporary state on every round.
		var cache cache8
		block8(&work.v0[0], uintptr(origin), &ptrs[0], &cache[0], int(64*m.rounds))

		for lane := range ptrs {
			// Every offset moves on to the next round, active or not.
			ptrs[lane] += step

			if m.mask&(1<<lane) == 0 {
				continue // lane not active in this round: keep its digest
			}
			s.v0[lane] = work.v0[lane]
			s.v1[lane] = work.v1[lane]
			s.v2[lane] = work.v2[lane]
			s.v3[lane] = work.v3[lane]
		}
	}
}
|