2C2EF2GKTOAT7QI56KKDFGRGSFZRDPKUASNQFB6XQDROAEPPZW4AC
VKLGQREYOZDV46F672RFE5XJO3OEOP4EHTCWZYOJY24HVPQX3L6QC
PEUS54XQ5KJQYAVUYBG5MWLEHIOVPMZ3ANVC7HPQP6JUWWPRDW5AC
X36ICMJNYKJF35ZUEVCCR33JOZPCPZQ6KSGEQI3RZND2P5EDASLQC
RIWSVVASWLJQQTSVRHIIUPENOZWOMHQLZMTQVGJUS2ZUGDPSWWIQC
Q7TKZCJP2Z75EICZYKCEZDHKGERSOKZGMTSU3UXETBHTF663T66AC
3NA345CN3HKNUQOWTUMUTINQMFLYATWPO4H74J4AFGEUQKGQYBWQC
B3XLVPNC4COLLC3FUE34Y7HIKTMF6CJZUASZOU3YM2YGPZKJZP7QC
3OHR6ZPHN53SVWJL4GUKKUC223IPXA73UPBFJCM4ENGS27AKHN6AC
#include "blake3_impl.h"
#include <string.h>
// Rotate the 32-bit word `w` right by `c` bit positions and return the result.
//
// The rotation count is reduced modulo 32, and the complementary left-shift
// amount is masked as well, so neither shift is ever performed by 32 or more
// bits (which is undefined behavior for a 32-bit operand). In particular, a
// count of 0 returns `w` unchanged. For counts 1..31 the result is the same
// as the plain `(w >> c) | (w << (32 - c))` formulation.
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
  c &= 31;
  return (w >> c) | (w << ((32 - c) & 31));
}
// The mixing function applied to four words of the 16-word state.
//
// `a`, `b`, `c` and `d` are indices into `state`; `x` and `y` are the two
// message words folded in by this call. Additions wrap modulo 2^32. The
// sequence is: add, xor-rotate by 16, add, xor-rotate by 12 (absorbing `x`),
// then add, xor-rotate by 8, add, xor-rotate by 7 (absorbing `y`).
INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
              uint32_t x, uint32_t y) {
  uint32_t *const wa = &state[a];
  uint32_t *const wb = &state[b];
  uint32_t *const wc = &state[c];
  uint32_t *const wd = &state[d];

  // First half: absorb x.
  *wa += *wb + x;
  *wd = rotr32(*wd ^ *wa, 16);
  *wc += *wd;
  *wb = rotr32(*wb ^ *wc, 12);

  // Second half: absorb y.
  *wa += *wb + y;
  *wd = rotr32(*wd ^ *wa, 8);
  *wc += *wd;
  *wb = rotr32(*wb ^ *wc, 7);
}
// Apply one full round to the 16-word `state`.
//
// `msg` points to the 16 message words of the block and `round` selects the
// row of MSG_SCHEDULE that decides which message words feed each g() call.
// Viewing the state as a 4x4 matrix stored row by row, the first four g()
// calls mix the columns and the last four mix the diagonals.
INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
  const uint8_t *order = MSG_SCHEDULE[round];

  // Columns: (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15).
  for (size_t col = 0; col < 4; col++) {
    g(state, col, col + 4, col + 8, col + 12, msg[order[2 * col]],
      msg[order[2 * col + 1]]);
  }

  // Diagonals: (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).
  for (size_t diag = 0; diag < 4; diag++) {
    g(state, diag, 4 + (diag + 1) % 4, 8 + (diag + 2) % 4,
      12 + (diag + 3) % 4, msg[order[8 + 2 * diag]],
      msg[order[8 + 2 * diag + 1]]);
  }
}
// Build the 16-word compression state and run all seven rounds over it.
//
// On return `state` holds the post-round words; the callers apply the final
// XOR (feed-forward) themselves. State layout before the rounds:
//   state[0..7]   = cv[0..7]
//   state[8..11]  = IV[0..3]
//   state[12..13] = low / high 32 bits of `counter`
//   state[14]     = block_len
//   state[15]     = flags
// The 64-byte `block` is decoded into sixteen words with load32().
INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  uint32_t block_words[16];
  for (size_t i = 0; i < 16; i++) {
    block_words[i] = load32(block + 4 * i);
  }

  for (size_t i = 0; i < 8; i++) {
    state[i] = cv[i];
  }
  for (size_t i = 0; i < 4; i++) {
    state[8 + i] = IV[i];
  }
  state[12] = counter_low(counter);
  state[13] = counter_high(counter);
  state[14] = (uint32_t)block_len;
  state[15] = (uint32_t)flags;

  for (size_t round = 0; round < 7; round++) {
    round_fn(state, &block_words[0], round);
  }
}
// Portable compression function that overwrites the chaining value.
//
// Compresses `block` (with its `block_len`, `counter` and `flags`) under the
// input chaining value `cv`, then replaces `cv` with the new chaining value:
// word i of the result is the XOR of post-round state words i and i + 8.
void blake3_compress_in_place_portable(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags) {
  uint32_t state[16];
  compress_pre(state, cv, block, block_len, counter, flags);
  for (size_t i = 0; i < 8; i++) {
    cv[i] = state[i] ^ state[i + 8];
  }
}
// Portable compression function producing the full 64-byte output.
//
// `cv` is left untouched. The first 32 bytes of `out` are the words
// state[i] ^ state[i + 8]; the last 32 bytes are state[i + 8] ^ cv[i], for
// i in 0..7. Every word is written with store32().
void blake3_compress_xof_portable(const uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags, uint8_t out[64]) {
  uint32_t state[16];
  compress_pre(state, cv, block, block_len, counter, flags);

  // Lower half: same words the in-place variant would store into the CV.
  for (size_t i = 0; i < 8; i++) {
    store32(&out[i * 4], state[i] ^ state[i + 8]);
  }
  // Upper half: second half of the state fed forward with the input CV.
  for (size_t i = 0; i < 8; i++) {
    store32(&out[(8 + i) * 4], state[8 + i] ^ cv[i]);
  }
}
// Hash a single input consisting of `blocks` full 64-byte blocks.
//
// The chaining value starts as `key` and is updated block by block. Every
// block is compressed with `flags`; the first block additionally gets
// `flags_start` and the last block additionally gets `flags_end` (a single
// block gets both). `counter` is passed unchanged to every compression. The
// final chaining value is written to `out` as bytes.
INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
                              const uint32_t key[8], uint64_t counter,
                              uint8_t flags, uint8_t flags_start,
                              uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);

  for (size_t remaining = blocks; remaining > 0; remaining--) {
    uint8_t block_flags = flags;
    if (remaining == blocks) {
      block_flags |= flags_start;
    }
    if (remaining == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
                                      block_flags);
    input += BLAKE3_BLOCK_LEN;
  }

  store_cv_words(out, cv);
}
// Portable implementation of the hash_many interface.
//
// Hashes `num_inputs` inputs one after another, each `blocks` blocks long,
// and writes the resulting BLAKE3_OUT_LEN-byte chaining values back to back
// into `out`. When `increment_counter` is true the counter is advanced by one
// after each input; otherwise every input uses the same counter.
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                               size_t blocks, const uint32_t key[8],
                               uint64_t counter, bool increment_counter,
                               uint8_t flags, uint8_t flags_start,
                               uint8_t flags_end, uint8_t *out) {
  for (size_t i = 0; i < num_inputs; i++) {
    hash_one_portable(inputs[i], blocks, key, counter, flags, flags_start,
                      flags_end, &out[i * BLAKE3_OUT_LEN]);
    if (increment_counter) {
      counter += 1;
    }
  }
}
#ifndef BLAKE3_IMPL_H
#define BLAKE3_IMPL_H
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "blake3.h"
// Internal flags. Each one occupies its own bit so that several can be OR-ed
// together into the `flags` byte passed to the compression functions.
enum blake3_flags {
  CHUNK_START = 0x01,         // bit 0
  CHUNK_END = 0x02,           // bit 1
  PARENT = 0x04,              // bit 2
  ROOT = 0x08,                // bit 3
  KEYED_HASH = 0x10,          // bit 4
  DERIVE_KEY_CONTEXT = 0x20,  // bit 5
  DERIVE_KEY_MATERIAL = 0x40, // bit 6
};
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
// INLINE marks a helper as file-local and requests forced inlining:
// `static __forceinline` under MSVC, `static inline` plus the `always_inline`
// attribute elsewhere.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif
// Architecture detection. IS_X86 is defined for both 64-bit and 32-bit x86;
// IS_X86_64 and IS_X86_32 tell the two apart.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif
// On x86, pull in the intrinsics headers (plus <intrin.h> under MSVC).
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
#endif
#if !defined(BLAKE3_USE_NEON)
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
#if defined(IS_AARCH64)
#define BLAKE3_USE_NEON 1
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
// Compile-time upper bound on the SIMD degree: 16 on x86, 4 when NEON is
// enabled, 1 otherwise. It is used to size fixed arrays.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
// The eight 32-bit initialization constants. compress_pre() loads the first
// four of them into state words 8..11.
static const uint32_t IV[8] = {
    0x6a09e667u, 0xbb67ae85u, 0x3c6ef372u, 0xa54ff53au,
    0x510e527fu, 0x9b05688cu, 0x1f83d9abu, 0x5be0cd19u,
};
// Message word order for each of the 7 rounds. Row r lists, in the order the
// g() calls of round_fn() consume them, the indices of the 16 message words
// used in round r. Row 0 is the identity order.
static const uint8_t MSG_SCHEDULE[7][16] = {
    /* round 0 */ { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    /* round 1 */ { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
    /* round 2 */ { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
    /* round 3 */ {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
    /* round 4 */ {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
    /* round 5 */ { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
    /* round 6 */ {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
};
/* Return the zero-based index of the most significant set bit of x. */
/* The caller must pass a nonzero x. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* For a leading-zero count in 0..63, 63 - count is the top bit index. */
  return 63 - __builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit;
  _BitScanReverse64(&bit, x);
  return bit;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Only the 32-bit scan is used here: look at the upper half first. */
  unsigned long bit;
  const unsigned long upper = (unsigned long)(x >> 32);
  if (upper != 0) {
    _BitScanReverse(&bit, upper);
    return 32 + bit;
  }
  _BitScanReverse(&bit, (unsigned long)x);
  return bit;
#else
  /* Binary search: halve the window (32, 16, 8, 4, 2, 1 bits) each step. */
  unsigned int index = 0;
  for (unsigned int width = 32; width > 0; width >>= 1) {
    if ((x >> width) != 0) {
      x >>= width;
      index += width;
    }
  }
  return index;
#endif
}
// Return how many bits of x are set.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(x);
#else
  // Each iteration clears the lowest set bit, so the loop body runs once per
  // set bit.
  unsigned int ones = 0;
  for (; x != 0; x &= x - 1) {
    ones += 1;
  }
  return ones;
#endif
}
// Return the largest power of two that is <= x. An input of 0 yields 1,
// because the low bit is forced on before the top bit is located.
INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
  const uint64_t nonzero = x | 1;
  return 1ULL << highest_one(nonzero);
}
// Return the low 32 bits of the 64-bit counter.
INLINE uint32_t counter_low(uint64_t counter) {
  return (uint32_t)(counter & 0xFFFFFFFFu);
}
// Return the high 32 bits of the 64-bit counter.
INLINE uint32_t counter_high(uint64_t counter) {
  const uint64_t upper = counter >> 32;
  return (uint32_t)upper;
}
// Read four bytes from `src` and assemble them into a 32-bit word, least
// significant byte first (little-endian), independent of host byte order and
// of the alignment of `src`.
INLINE uint32_t load32(const void *src) {
  const uint8_t *bytes = (const uint8_t *)src;
  uint32_t word = 0;
  // Start from the most significant byte so each shift makes room for the
  // next lower one.
  for (int i = 3; i >= 0; i--) {
    word = (word << 8) | (uint32_t)bytes[i];
  }
  return word;
}
// Decode the BLAKE3_KEY_LEN-byte `key` into eight 32-bit words using load32()
// and store them in `key_words`.
INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
                           uint32_t key_words[8]) {
  for (size_t i = 0; i < 8; i++) {
    key_words[i] = load32(&key[i * 4]);
  }
}
// Write the 32-bit word `w` to the four bytes at `dst`, least significant
// byte first (little-endian), independent of host byte order and of the
// alignment of `dst`.
INLINE void store32(void *dst, uint32_t w) {
  uint8_t *bytes = (uint8_t *)dst;
  for (unsigned int i = 0; i < 4; i++) {
    bytes[i] = (uint8_t)(w >> (8 * i));
  }
}
// Serialize the eight chaining-value words `cv_words` into the 32 bytes at
// `bytes_out`, each word written with store32().
//
// `cv_words` is only read, so it is declared const; callers passing a
// non-const array are unaffected.
INLINE void store_cv_words(uint8_t bytes_out[32], const uint32_t cv_words[8]) {
  for (size_t i = 0; i < 8; i++) {
    store32(&bytes_out[i * 4], cv_words[i]);
  }
}
// Runtime-dispatched entry points. Each definition selects an implementation
// based on the detected CPU features and compile-time switches, and falls
// back to the *_portable variant.

// Compress one block and overwrite `cv` with the resulting chaining value.
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
// Compress one block and write the full 64-byte output to `out`; `cv` is
// not modified.
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]);
// Hash `num_inputs` inputs of `blocks` blocks each, writing one
// BLAKE3_OUT_LEN-byte chaining value per input to `out`.
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out);
// Dynamically detected SIMD degree of the current platform.
size_t blake3_simd_degree(void);
// Declarations for implementation-specific functions.
// The *_portable functions are plain C implementations and serve as the
// fallback of the dispatching entry points.
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
// x86 SIMD implementations. Each instruction-set group can be compiled out
// by defining the matching BLAKE3_NO_* macro. The definitions are not part of
// this header.
#if defined(IS_X86)
// SSE2 variants.
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
// SSE4.1 variants.
#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse41(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
// AVX2 variant: only hash_many is declared for this instruction set.
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
// AVX-512 variants.
#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#endif
// NEON variant: only hash_many is declared, and only when BLAKE3_USE_NEON
// is 1.
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#endif /* BLAKE3_IMPL_H */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "blake3_impl.h"
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
#error "Unimplemented!"
#endif
#endif
// Evaluate `x` and discard the result, so that a variable that ends up unused
// in some build configurations does not trigger an unused-variable warning.
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
// Execute the XGETBV instruction with ECX = 0 and return its 64-bit result
// (EDX forms the high 32 bits, EAX the low 32 bits). Under MSVC the
// `_xgetbv(0)` intrinsic is used instead of inline assembly.
// get_cpu_features() tests bits of the returned mask before enabling the AVX
// and AVX-512 feature flags.
// NOTE(review): index 0 presumably selects the XCR0 register per the x86 ISA
// — confirm against the Intel SDM.
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
return _xgetbv(0);
#else
uint32_t eax = 0, edx = 0;
__asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64_t)edx << 32) | eax;
#endif
}
// Execute CPUID for leaf `id` and store EAX, EBX, ECX and EDX in
// out[0], out[1], out[2] and out[3] respectively.
//
// On 32-bit x86 with inline assembly, EBX is not named as an output: its
// value is saved into a scratch register before CPUID and swapped back
// afterwards, which leaves the CPUID result for EBX in out[1] and restores
// the original EBX.
// NOTE(review): presumably this is because EBX may be reserved by the
// compiler on that target (e.g. for position-independent code) — confirm.
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
__cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#endif
}
// Like cpuid(), but additionally passes the sub-leaf `sid` in ECX. Stores
// EAX, EBX, ECX and EDX in out[0..3]. The 32-bit x86 path saves and restores
// EBX around the instruction in the same way as cpuid().
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
__cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#endif
}
#endif
// Bitmask of CPU features reported by get_cpu_features(). UNDEFINED is the
// sentinel stored in the cache variable before detection has run.
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
// Cached result of CPU feature detection. It starts out as UNDEFINED and is
// overwritten by get_cpu_features() on x86 once detection has run. The
// variable has internal linkage unless BLAKE3_TESTING is defined.
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
enum cpu_feature g_cpu_features = UNDEFINED;
// Return the set of SIMD features usable on this CPU as a bitmask of
// `enum cpu_feature` values.
//
// The result is cached in g_cpu_features: if that variable is not UNDEFINED
// it is returned as is. Otherwise, on x86, the features are probed with CPUID
// and XGETBV and the cache is filled:
//   - SSE2 is assumed on x86-64 and read from CPUID.1:EDX bit 26 otherwise.
//   - SSSE3 and SSE4.1 come from CPUID.1:ECX bits 9 and 19.
//   - AVX, AVX2 and AVX-512 are reported only when OSXSAVE (CPUID.1:ECX bit
//     27) is set and the XGETBV mask shows the required register states.
// On other architectures 0 is returned and nothing is cached.
//
// The function has internal linkage unless BLAKE3_TESTING is defined.
#if !defined(BLAKE3_TESTING)
static
#endif
    enum cpu_feature
    get_cpu_features(void) {
  if (g_cpu_features != UNDEFINED) {
    return g_cpu_features;
  } else {
#if defined(IS_X86)
    uint32_t regs[4] = {0};
    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
    (void)edx; // only read on 32-bit builds
    enum cpu_feature features = 0;
    cpuid(regs, 0);
    const int max_id = *eax; // highest supported basic CPUID leaf
    cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
    features |= SSE2;
#else
    if (*edx & (1UL << 26))
      features |= SSE2;
#endif
    // CPUID.1:ECX bit 9 is SSSE3 (bit 0 is SSE3, a different extension).
    if (*ecx & (1UL << 9))
      features |= SSSE3;
    if (*ecx & (1UL << 19))
      features |= SSE41;
    if (*ecx & (1UL << 27)) { // OSXSAVE
      const uint64_t mask = xgetbv();
      if ((mask & 6) == 6) { // SSE and AVX states
        if (*ecx & (1UL << 28))
          features |= AVX;
        if (max_id >= 7) {
          cpuidex(regs, 7, 0);
          if (*ebx & (1UL << 5))
            features |= AVX2;
          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
            if (*ebx & (1UL << 31))
              features |= AVX512VL;
            if (*ebx & (1UL << 16))
              features |= AVX512F;
          }
        }
      }
    }
    g_cpu_features = features;
    return features;
#else
    /* How to detect NEON? */
    return 0;
#endif
  }
}
// Compress one block and overwrite `cv` with the new chaining value, using
// the first available implementation in this order: AVX-512 (when the
// AVX512VL feature bit is set), SSE4.1, SSE2, portable. The SIMD candidates
// are only considered on x86, and each can be compiled out with the matching
// BLAKE3_NO_* macro.
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
// `features` is unused when every SIMD implementation is compiled out.
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
return;
}
#endif
#endif
// Fallback when no SIMD implementation was selected above.
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
// Compress one block and write the 64-byte output to `out`, using the first
// available implementation in this order: AVX-512 (when the AVX512VL feature
// bit is set), SSE4.1, SSE2, portable. The SIMD candidates are only
// considered on x86, and each can be compiled out with the matching
// BLAKE3_NO_* macro.
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
// `features` is unused when every SIMD implementation is compiled out.
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
return;
}
#endif
#endif
// Fallback when no SIMD implementation was selected above.
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
// Hash `num_inputs` inputs of `blocks` blocks each and write one chaining
// value per input to `out`, forwarding all arguments to the first available
// implementation. On x86 the order is AVX-512 (needs both AVX512F and
// AVX512VL), AVX2, SSE4.1, SSE2. If none of those is taken, the NEON
// implementation is used when BLAKE3_USE_NEON is 1, and the portable one
// otherwise.
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512F) && (cpu & AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (cpu & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (cpu & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (cpu & SSE2) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

  // No x86 SIMD implementation was taken: pick NEON or portable at compile
  // time.
#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
// Return the SIMD degree detected for the current platform. It mirrors the
// selection order of blake3_hash_many(): 16 with AVX-512 (AVX512F and
// AVX512VL), 8 with AVX2, 4 with SSE4.1 or SSE2, 4 when BLAKE3_USE_NEON is 1,
// and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512F) && (cpu & AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (cpu & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (cpu & SSE41) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (cpu & SSE2) {
    return 4;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
#ifndef BLAKE3_H
#define BLAKE3_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// Version string returned by blake3_version().
#define BLAKE3_VERSION_STRING "1.3.1"
// Size in bytes of a key (eight 32-bit words).
#define BLAKE3_KEY_LEN 32
// Size in bytes of one chaining value / default output.
#define BLAKE3_OUT_LEN 32
// Size in bytes of one compression-function input block.
#define BLAKE3_BLOCK_LEN 64
// Size in bytes of one chunk (16 blocks).
#define BLAKE3_CHUNK_LEN 1024
// Used to size the chaining-value stack in blake3_hasher.
#define BLAKE3_MAX_DEPTH 54
// This struct is a private implementation detail. It has to be here because
// it's part of blake3_hasher below.
typedef struct {
// Chaining value of the chunk so far, as words; starts out as the key.
uint32_t cv[8];
// Index of this chunk, passed as the counter to every block compression.
uint64_t chunk_counter;
// Bytes of the current block that have not been compressed yet.
uint8_t buf[BLAKE3_BLOCK_LEN];
// Number of valid bytes in `buf`.
uint8_t buf_len;
// Number of blocks of this chunk that have already been compressed.
uint8_t blocks_compressed;
// Flags OR-ed into every compression of this chunk.
uint8_t flags;
} blake3_chunk_state;
// Incremental hasher state. The code that maintains it is outside this view;
// field notes marked "presumably" are inferred from names and should be
// confirmed against the implementation.
typedef struct {
// Key words used for this hasher.
uint32_t key[8];
// State of the chunk currently being filled.
blake3_chunk_state chunk;
// Presumably the number of chaining values currently held in cv_stack.
uint8_t cv_stack_len;
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because we
// don't know whether more input is coming. This is different from how the
// reference implementation does things.
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
} blake3_hasher;
// Public API.
// NOTE(review): except for blake3_version(), the definitions of these
// functions are not visible here; the notes below are inferred from the
// signatures and should be confirmed against the implementation.

// Returns BLAKE3_VERSION_STRING.
const char *blake3_version(void);
// Presumably initializes `self` for unkeyed hashing.
void blake3_hasher_init(blake3_hasher *self);
// Presumably initializes `self` for keyed hashing with a BLAKE3_KEY_LEN-byte
// key.
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
// Presumably initializes `self` for key derivation; `context` is a C string
// in the first variant and a (pointer, length) pair in the _raw variant.
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len);
// Presumably absorbs `input_len` bytes of input.
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
// Presumably writes `out_len` bytes of output to `out`; `self` is const, so
// the hasher state is not modified. The _seek variant additionally takes an
// output offset `seek`.
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
// Presumably returns `self` to its freshly initialized state.
void blake3_hasher_reset(blake3_hasher *self);
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_H */
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include "blake3.h"
#include "blake3_impl.h"
// Return the library version as a NUL-terminated string literal
// (BLAKE3_VERSION_STRING).
const char *blake3_version(void) {
  return BLAKE3_VERSION_STRING;
}
// Initialize `self` as an empty chunk number 0: the chaining value is set to
// `key`, the block buffer is zeroed and empty, no blocks are counted as
// compressed, and `flags` is recorded for all later compressions.
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
                             uint8_t flags) {
  self->flags = flags;
  self->chunk_counter = 0;
  self->blocks_compressed = 0;
  self->buf_len = 0;
  memset(self->buf, 0, sizeof self->buf);
  memcpy(self->cv, key, BLAKE3_KEY_LEN);
}
// Re-arm `self` for a new chunk with index `chunk_counter`: the chaining
// value is set back to `key`, the block buffer is zeroed and emptied, and the
// compressed-block count is cleared. The stored flags are left as they are.
INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
                              uint64_t chunk_counter) {
  self->chunk_counter = chunk_counter;
  self->blocks_compressed = 0;
  self->buf_len = 0;
  memset(self->buf, 0, sizeof self->buf);
  memcpy(self->cv, key, BLAKE3_KEY_LEN);
}
// Return the number of input bytes absorbed into this chunk so far: the
// already compressed blocks plus the bytes waiting in the buffer.
INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
  const size_t compressed_bytes =
      (size_t)self->blocks_compressed * BLAKE3_BLOCK_LEN;
  const size_t buffered_bytes = (size_t)self->buf_len;
  return compressed_bytes + buffered_bytes;
}
// Copy as many bytes of `input` as fit into the free space of the block
// buffer (at most `input_len`), advance `buf_len` accordingly, and return the
// number of bytes copied.
INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
                                   const uint8_t *input, size_t input_len) {
  const size_t used = (size_t)self->buf_len;
  const size_t room = BLAKE3_BLOCK_LEN - used;
  const size_t take = input_len < room ? input_len : room;
  memcpy(&self->buf[used], input, take);
  self->buf_len = (uint8_t)(used + take);
  return take;
}
// Return CHUNK_START if no block of this chunk has been compressed yet (so
// the next compression is the chunk's first), and 0 otherwise.
INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
  return self->blocks_compressed == 0 ? CHUNK_START : 0;
}
// The captured inputs of one pending compression (see make_output()). The
// compression itself is performed later, either by output_chaining_value()
// or, with the ROOT flag added, by output_root_bytes().
typedef struct {
// Input chaining value, as words.
uint32_t input_cv[8];
// Counter for the compression. output_root_bytes() passes its own output
// block counter instead of this field.
uint64_t counter;
// The block to compress.
uint8_t block[BLAKE3_BLOCK_LEN];
// Block length passed to the compression function.
uint8_t block_len;
// Flags for the compression.
uint8_t flags;
} output_t;
// Bundle the inputs of a compression into an output_t, copying the chaining
// value and the full block, and return it by value.
INLINE output_t make_output(const uint32_t input_cv[8],
                            const uint8_t block[BLAKE3_BLOCK_LEN],
                            uint8_t block_len, uint64_t counter,
                            uint8_t flags) {
  output_t result;
  result.counter = counter;
  result.block_len = block_len;
  result.flags = flags;
  memcpy(result.input_cv, input_cv, sizeof result.input_cv);
  memcpy(result.block, block, sizeof result.block);
  return result;
}
// Inside a chunk, chaining values are kept as words (that is what the
// compress_in_place interface takes), which spares the portable
// implementation repeated bytes<->words conversions. The hash_many interface,
// in contrast, consumes both user input and parent node blocks and therefore
// works on bytes, so chaining values stored in the CV stack are bytes.
//
// This function performs the compression described by `self` on a copy of
// its input chaining value and writes the resulting chaining value to `cv`
// as 32 bytes.
INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
  uint32_t words[8];
  memcpy(words, self->input_cv, sizeof words);
  blake3_compress_in_place(words, self->block, self->block_len, self->counter,
                           self->flags);
  store_cv_words(cv, words);
}
// Produce `out_len` bytes of root output starting at byte offset `seek` of
// the output stream and write them to `out`.
//
// The stream is generated in 64-byte blocks: block number k is the XOF
// compression of `self` with the ROOT flag added and k as the counter. The
// first block may be entered part-way (at seek % 64); all following blocks
// are consumed from their start.
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
                              size_t out_len) {
  uint64_t block_index = seek / 64;
  size_t skip = seek % 64;
  uint8_t block_out[64];

  for (; out_len > 0; block_index++, skip = 0) {
    blake3_compress_xof(self->input_cv, self->block, self->block_len,
                        block_index, self->flags | ROOT, block_out);
    const size_t available = 64 - skip;
    const size_t n = out_len > available ? available : out_len;
    memcpy(out, block_out + skip, n);
    out += n;
    out_len -= n;
  }
}
// Absorb `input_len` bytes of `input` into the chunk state.
//
// A block is compressed only once it is known that more input follows it, so
// the final (possibly full) block of the data seen so far always stays in the
// buffer for chunk_state_output() to finish.
INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
                               size_t input_len) {
  // Step 1: top up a partially filled buffer.
  if (self->buf_len > 0) {
    const size_t copied = chunk_state_fill_buf(self, input, input_len);
    input += copied;
    input_len -= copied;
    if (input_len > 0) {
      // Input is left over, so the buffer is full and is not the last block.
      const uint8_t block_flags =
          (uint8_t)(self->flags | chunk_state_maybe_start_flag(self));
      blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN,
                               self->chunk_counter, block_flags);
      self->blocks_compressed += 1;
      memset(self->buf, 0, BLAKE3_BLOCK_LEN);
      self->buf_len = 0;
    }
  }

  // Step 2: compress whole blocks straight from the input, as long as at
  // least one more byte follows each of them.
  for (; input_len > BLAKE3_BLOCK_LEN;
       input += BLAKE3_BLOCK_LEN, input_len -= BLAKE3_BLOCK_LEN) {
    const uint8_t block_flags =
        (uint8_t)(self->flags | chunk_state_maybe_start_flag(self));
    blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
                             self->chunk_counter, block_flags);
    self->blocks_compressed += 1;
  }

  // Step 3: keep the remainder (at most one block) in the buffer.
  (void)chunk_state_fill_buf(self, input, input_len);
}
// Describe the final compression of this chunk: the buffered block, with the
// chunk's flags, CHUNK_END, and also CHUNK_START if no block has been
// compressed before it.
INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
  const uint8_t start_flag = chunk_state_maybe_start_flag(self);
  const uint8_t final_flags = self->flags | start_flag | CHUNK_END;
  return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
                     final_flags);
}
// Describe the compression of a parent node: a full block (`block`) compressed
// under `key` with counter 0 and the PARENT flag added to `flags`.
INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
                              const uint32_t key[8], uint8_t flags) {
  const uint8_t parent_flags = flags | PARENT;
  return make_output(key, block, BLAKE3_BLOCK_LEN, 0, parent_flags);
}
// For an input longer than one chunk, return how many bytes belong to the
// left subtree: the largest power-of-2 number of whole chunks that still
// leaves at least one byte for the right subtree.
INLINE size_t left_len(size_t content_len) {
  // Excluding the final byte guarantees the right side is never empty.
  // content_len is expected to exceed BLAKE3_CHUNK_LEN.
  const size_t chunks_before_last_byte = (content_len - 1) / BLAKE3_CHUNK_LEN;
  const uint64_t left_chunks =
      round_down_to_power_of_2(chunks_before_last_byte);
  return left_chunks * BLAKE3_CHUNK_LEN;
}
// Hash up to MAX_SIMD_DEGREE chunks at once on a single thread, using SIMD
// parallelism where available. The chunk chaining values are written to `out`
// back to back and the number of chunks hashed is returned. The chunks handled
// here are never the root and never empty; those cases go through a different
// codepath.
INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
                                       const uint32_t key[8],
                                       uint64_t chunk_counter, uint8_t flags,
                                       uint8_t *out) {
#if defined(BLAKE3_TESTING)
  assert(0 < input_len);
  assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
#endif

  // Collect a pointer to the start of every complete chunk.
  const uint8_t *chunk_ptrs[MAX_SIMD_DEGREE];
  const size_t whole_chunks = input_len / BLAKE3_CHUNK_LEN;
  for (size_t i = 0; i < whole_chunks; i++) {
    chunk_ptrs[i] = input + i * BLAKE3_CHUNK_LEN;
  }

  blake3_hash_many(chunk_ptrs, whole_chunks,
                   BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
                   true, flags, CHUNK_START, CHUNK_END, out);

  const size_t consumed = whole_chunks * BLAKE3_CHUNK_LEN;
  if (input_len <= consumed) {
    return whole_chunks;
  }

  // A partial chunk is left over; hash it through a chunk state. (The empty
  // chunk, i.e. the empty message, is handled by a different codepath.)
  blake3_chunk_state tail;
  chunk_state_init(&tail, key, flags);
  tail.chunk_counter = chunk_counter + (uint64_t)whole_chunks;
  chunk_state_update(&tail, input + consumed, input_len - consumed);
  output_t tail_output = chunk_state_output(&tail);
  output_chaining_value(&tail_output, &out[whole_chunks * BLAKE3_OUT_LEN]);
  return whole_chunks + 1;
}
// Hash up to MAX_SIMD_DEGREE parents at once on a single thread, using SIMD
// parallelism where available. Each parent is a pair of adjacent child
// chaining values. The parent chaining values are written to `out` and their
// count is returned; an unpaired child at the end is copied to `out` as one
// extra output. The parents handled here are never the root and never empty;
// those cases go through a different codepath.
INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
                                        size_t num_chaining_values,
                                        const uint32_t key[8], uint8_t flags,
                                        uint8_t *out) {
#if defined(BLAKE3_TESTING)
  assert(2 <= num_chaining_values);
  assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
#endif

  // Each parent block is two consecutive child chaining values.
  const uint8_t *parent_blocks[MAX_SIMD_DEGREE_OR_2];
  const size_t pair_count = num_chaining_values / 2;
  for (size_t i = 0; i < pair_count; i++) {
    parent_blocks[i] = child_chaining_values + i * (2 * BLAKE3_OUT_LEN);
  }

  // Parents are single blocks, always use counter 0, and carry no start or
  // end flags.
  blake3_hash_many(parent_blocks, pair_count, 1, key, 0, false, flags | PARENT,
                   0, 0, out);

  if (num_chaining_values % 2 == 0) {
    return pair_count;
  }

  // Odd child left over: pass it through unchanged as an extra output.
  memcpy(out + pair_count * BLAKE3_OUT_LEN,
         child_chaining_values + pair_count * (2 * BLAKE3_OUT_LEN),
         BLAKE3_OUT_LEN);
  return pair_count + 1;
}
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
static size_t blake3_compress_subtree_wide(const uint8_t *input,
                                           size_t input_len,
                                           const uint32_t key[8],
                                           uint64_t chunk_counter,
                                           uint8_t flags, uint8_t *out) {
  const size_t simd_degree = blake3_simd_degree();

  // Base case: the input fits in one batch of chunks. The SIMD degree is
  // deliberately *not* raised to 2 here when it is 1; should multi-threading
  // be added later, that keeps the option of threading even the 2-chunk case,
  // which can help on smaller platforms.
  if (input_len <= simd_degree * BLAKE3_CHUNK_LEN) {
    return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
                                    out);
  }

  // More than simd_degree chunks: split into a left and a right subtree and
  // recurse. (This split is only optimal while the SIMD degree is a power of
  // 2; a degree such as 3 would need a more elaborate strategy.)
  const size_t left_bytes = left_len(input_len);
  const size_t right_bytes = input_len - left_bytes;
  const uint64_t right_counter =
      chunk_counter + (uint64_t)(left_bytes / BLAKE3_CHUNK_LEN);

  // Buffer for the children's outputs. It is sized with MAX_SIMD_DEGREE_OR_2
  // because of the special case below that yields 2 outputs at degree 1.
  uint8_t child_cvs[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];

  // Special case: above the chunk level the effective degree is at least 2,
  // so that there are always two outputs. (At the chunk level degree 1 is
  // allowed; the 1-chunk input is a different codepath.)
  size_t degree = simd_degree;
  if (degree == 1 && left_bytes > BLAKE3_CHUNK_LEN) {
    degree = 2;
  }
  uint8_t *right_cvs = child_cvs + degree * BLAKE3_OUT_LEN;

  // Recurse into both halves. Multi-threading support, if ever added, would
  // go here.
  const size_t left_n = blake3_compress_subtree_wide(
      input, left_bytes, key, chunk_counter, flags, child_cvs);
  const size_t right_n = blake3_compress_subtree_wide(
      input + left_bytes, right_bytes, key, right_counter, flags, right_cvs);

  // The special case again: with simd_degree == 1 both halves return a single
  // chaining value. Hand the two back as they are instead of merging them, so
  // the caller always receives at least two outputs.
  if (left_n == 1) {
    memcpy(out, child_cvs, 2 * BLAKE3_OUT_LEN);
    return 2;
  }

  // Otherwise compress one layer of parent nodes.
  return compress_parents_parallel(child_cvs, left_n + right_n, key, flags,
                                   out);
}
// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
INLINE void compress_subtree_to_parent_node(
    const uint8_t *input, size_t input_len, const uint32_t key[8],
    uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
#if defined(BLAKE3_TESTING)
  assert(input_len > BLAKE3_CHUNK_LEN);
#endif

  // First pass: hash the whole subtree down to a short list of CVs.
  uint8_t cvs[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t cv_count = blake3_compress_subtree_wide(input, input_len, key,
                                                 chunk_counter, flags, cvs);
  assert(cv_count <= MAX_SIMD_DEGREE_OR_2);

  // When MAX_SIMD_DEGREE exceeds 2 and the input is long enough, the list
  // above holds more than 2 CVs. Keep forming parent layers until only 2 are
  // left.
  uint8_t parents[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];

  // The upper-bound half of this condition can never be false (it was just
  // asserted). It is kept because GCC cannot prove that, and with NDEBUG on
  // platforms where MAX_SIMD_DEGREE_OR_2 == 2 it otherwise emits spurious
  // warnings. GCC 8.5 is especially touchy; test against it when editing
  // this loop.
  while (cv_count > 2 && cv_count <= MAX_SIMD_DEGREE_OR_2) {
    cv_count = compress_parents_parallel(cvs, cv_count, key, flags, parents);
    memcpy(cvs, parents, cv_count * BLAKE3_OUT_LEN);
  }

  // Return the two remaining CVs, i.e. the message bytes of the top parent.
  memcpy(out, cvs, 2 * BLAKE3_OUT_LEN);
}
// Common initializer behind the public init functions: empties the CV stack,
// stores the key words in the hasher, and initializes the chunk state with
// the same key words and flags.
INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
                             uint8_t flags) {
  self->cv_stack_len = 0;
  memcpy(self->key, key, BLAKE3_KEY_LEN);
  chunk_state_init(&self->chunk, key, flags);
}
// Initialize `self` for the default mode: IV as the key words and no flags.
void blake3_hasher_init(blake3_hasher *self) {
  const uint8_t no_flags = 0;
  hasher_init_base(self, IV, no_flags);
}
// Initialize `self` for keyed hashing. The BLAKE3_KEY_LEN key bytes are
// converted to 8 words with load_key_words() and installed together with the
// KEYED_HASH flag.
void blake3_hasher_init_keyed(blake3_hasher *self,
                              const uint8_t key[BLAKE3_KEY_LEN]) {
  uint32_t words[8];
  load_key_words(key, words);
  hasher_init_base(self, words, KEYED_HASH);
}
// Initialize `self` for key derivation from a context of context_len bytes.
// This is a two-stage setup: the context is hashed on its own with the
// DERIVE_KEY_CONTEXT flag, and the BLAKE3_KEY_LEN-byte result becomes the key
// of `self`, which is then initialized with the DERIVE_KEY_MATERIAL flag.
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
                                       size_t context_len) {
  blake3_hasher ctx_hasher;
  uint8_t derived_key[BLAKE3_KEY_LEN];
  uint32_t derived_words[8];

  // Stage 1: hash the context string under the IV.
  hasher_init_base(&ctx_hasher, IV, DERIVE_KEY_CONTEXT);
  blake3_hasher_update(&ctx_hasher, context, context_len);
  blake3_hasher_finalize(&ctx_hasher, derived_key, BLAKE3_KEY_LEN);

  // Stage 2: key the real hasher with the stage-1 output.
  load_key_words(derived_key, derived_words);
  hasher_init_base(self, derived_words, DERIVE_KEY_MATERIAL);
}
// Convenience wrapper around blake3_hasher_init_derive_key_raw() for a
// NUL-terminated context string; the terminator itself is not hashed.
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
  const size_t context_len = strlen(context);
  blake3_hasher_init_derive_key_raw(self, context, context_len);
}
// As described in hasher_push_cv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger stack
// order is maintained. Instead of the "count the trailing 0-bits" algorithm
// described in the spec, we use a "count the total number of 1-bits" variant
// that doesn't require us to retain the subtree size of the CV on top of the
// stack. The principle is the same: each CV that should remain in the stack is
// represented by a 1-bit in the total number of chunks (or bytes) so far.
INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
  // One stack entry should survive for every 1-bit in total_len.
  const size_t target_len = (size_t)popcnt(total_len);

  // Each pass merges the top two CVs: the parent's chaining value is written
  // over the lower of the two, and the stack shrinks by one entry.
  for (; self->cv_stack_len > target_len; self->cv_stack_len -= 1) {
    uint8_t *top_pair =
        &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
    output_t merged = parent_output(top_pair, self->key, self->chunk.flags);
    output_chaining_value(&merged, top_pair);
  }
}
// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compress_subtree_wide(), without setting aside anything for the chunk_state.
// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
// as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update.
// In this case, the root node of the 64 KiB subtree would be the root node
// of the whole tree, and it would need to be ROOT finalized. We can't
// compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
// similarly going to be the root of the whole tree. For example, maybe
// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
// node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalize().
//
// Solving the first problem requires an additional tool,
// compress_subtree_to_parent_node(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
                           uint64_t chunk_counter) {
  // Lazy merge: settle the CVs already on the stack before adding the new one.
  hasher_merge_cv_stack(self, chunk_counter);

  // Append the new CV, unmerged, in the next free slot.
  uint8_t *slot = &self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN];
  memcpy(slot, new_cv, BLAKE3_OUT_LEN);
  self->cv_stack_len += 1;
}
// Absorb `input_len` bytes starting at `input` into the hasher.
//
// The work happens in three phases:
//   1. If the internal chunk state already holds a partial chunk, top it up
//      first (and push its CV if more input follows).
//   2. While more than one chunk of input remains, hash the largest whole
//      power-of-2 subtree that is aligned with the chunks hashed so far.
//   3. Buffer whatever is left (at most one chunk) in the chunk state.
//
// A zero-length call returns immediately without touching `input`, so a null
// `input` is acceptable in that case.
void blake3_hasher_update(blake3_hasher *self, const void *input,
                          size_t input_len) {
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   blake3_hasher_update(&hasher, v.data(), v.size());
  if (input_len == 0) {
    return;
  }

  const uint8_t *input_bytes = (const uint8_t *)input;

  // If we have some partial chunk bytes in the internal chunk_state, we need
  // to finish that chunk first.
  if (chunk_state_len(&self->chunk) > 0) {
    size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
    if (take > input_len) {
      take = input_len;
    }
    chunk_state_update(&self->chunk, input_bytes, take);
    input_bytes += take;
    input_len -= take;
    // If we've filled the current chunk and there's more coming, finalize this
    // chunk and proceed. In this case we know it's not the root.
    if (input_len > 0) {
      output_t output = chunk_state_output(&self->chunk);
      // Sized with BLAKE3_OUT_LEN, matching the CV buffers further down and
      // the stride hasher_push_cv() uses for the CV stack.
      uint8_t chunk_cv[BLAKE3_OUT_LEN];
      output_chaining_value(&output, chunk_cv);
      hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
      chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
    } else {
      return;
    }
  }

  // Now the chunk_state is clear, and we have more input. If there's more than
  // a single chunk (so, definitely not the root chunk), hash the largest whole
  // subtree we can, with the full benefits of SIMD (and maybe in the future,
  // multi-threading) parallelism. Two restrictions:
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
  //   the right edge can be incomplete, and we don't know where the right edge
  //   is going to be until we get to finalize().
  // - The subtree must evenly divide the total number of chunks up until this
  //   point (if total is not 0). If the current incomplete subtree is only
  //   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
  //   to complete the current subtree first.
  // Because we might need to break up the input to form powers of 2, or to
  // evenly divide what we already have, this part runs in a loop.
  while (input_len > BLAKE3_CHUNK_LEN) {
    size_t subtree_len = round_down_to_power_of_2(input_len);
    uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
    // Shrink the subtree_len until it evenly divides the count so far. We know
    // that subtree_len itself is a power of 2, so we can use a bitmasking
    // trick instead of an actual remainder operation. (Note that if the caller
    // consistently passes power-of-2 inputs of the same size, as is hopefully
    // typical, this loop condition will always fail, and subtree_len will
    // always be the full length of the input.)
    //
    // An aside: We don't have to shrink subtree_len quite this much. For
    // example, if count_so_far is 1, we could pass 2 chunks to
    // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
    // get the right answer in the end, and we might get to use 2-way SIMD
    // parallelism. The problem with this optimization, is that it gets us
    // stuck always hashing 2 chunks. The total number of chunks will remain
    // odd, and we'll never graduate to higher degrees of parallelism. See
    // https://github.com/BLAKE3-team/BLAKE3/issues/69.
    while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
      subtree_len /= 2;
    }
    // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
    // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
    uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
    if (subtree_len <= BLAKE3_CHUNK_LEN) {
      // Use a temporary chunk state so self->chunk stays clear.
      blake3_chunk_state chunk_state;
      chunk_state_init(&chunk_state, self->key, self->chunk.flags);
      chunk_state.chunk_counter = self->chunk.chunk_counter;
      chunk_state_update(&chunk_state, input_bytes, subtree_len);
      output_t output = chunk_state_output(&chunk_state);
      uint8_t cv[BLAKE3_OUT_LEN];
      output_chaining_value(&output, cv);
      hasher_push_cv(self, cv, chunk_state.chunk_counter);
    } else {
      // This is the high-performance happy path, though getting here depends
      // on the caller giving us a long enough input.
      uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
      compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
                                      self->chunk.chunk_counter,
                                      self->chunk.flags, cv_pair);
      // Push the two halves separately so that the second CV stays unmerged.
      hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
      hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
                     self->chunk.chunk_counter + (subtree_chunks / 2));
    }
    self->chunk.chunk_counter += subtree_chunks;
    input_bytes += subtree_len;
    input_len -= subtree_len;
  }

  // If there's any remaining input less than a full chunk, add it to the chunk
  // state. In that case, also do a final merge loop to make sure the subtree
  // stack doesn't contain any unmerged pairs. The remaining input means we
  // know these merges are non-root. This merge loop isn't strictly necessary
  // here, because hasher_push_cv already does its own merge loop, but it
  // simplifies blake3_hasher_finalize below.
  if (input_len > 0) {
    chunk_state_update(&self->chunk, input_bytes, input_len);
    hasher_merge_cv_stack(self, self->chunk.chunk_counter);
  }
}
// Write `out_len` bytes of output into `out`, starting at output offset 0.
// This is blake3_hasher_finalize_seek() with a seek position of zero; the
// hasher is taken by const pointer.
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
                            size_t out_len) {
  const uint64_t start_of_output = 0;
  blake3_hasher_finalize_seek(self, start_of_output, out, out_len);
}
// Write `out_len` bytes of root output into `out`, starting `seek` bytes into
// the output stream. The hasher is taken by const pointer.
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
                                 uint8_t *out, size_t out_len) {
  // A zero-length request is answered up front. Without this guard a null
  // `out` could reach memcpy, which is UB. Typical trigger:
  //   std::vector<uint8_t> v;
  //   blake3_hasher_finalize(&hasher, v.data(), v.size());
  if (out_len == 0) {
    return;
  }

  // Empty subtree stack: the current chunk is itself the root.
  if (self->cv_stack_len == 0) {
    output_t root = chunk_state_output(&self->chunk);
    output_root_bytes(&root, seek, out, out_len);
    return;
  }

  // Pick the node the roll-up starts from.
  // - Bytes pending in the chunk state: start from that chunk's output and
  //   merge it with every stack entry. The extra merge loop at the end of
  //   blake3_hasher_update guarantees the entries need no merging among
  //   themselves first.
  // - Chunk state empty: the top of the stack is a chunk hash, so start from
  //   the parent of the top two entries.
  // The literal 32 below is the byte length of one CV slot in cv_stack.
  output_t node;
  size_t pending;
  if (chunk_state_len(&self->chunk) > 0) {
    node = chunk_state_output(&self->chunk);
    pending = self->cv_stack_len;
  } else {
    // There are always at least 2 CVs in the stack in this case.
    pending = self->cv_stack_len - 2;
    node = parent_output(&self->cv_stack[pending * 32], self->key,
                         self->chunk.flags);
  }

  // Walk down the stack. Each step builds a parent block whose left half is
  // the next stack entry and whose right half is the running node's CV.
  while (pending-- > 0) {
    uint8_t parent_block[BLAKE3_BLOCK_LEN];
    memcpy(parent_block, &self->cv_stack[pending * 32], 32);
    output_chaining_value(&node, &parent_block[32]);
    node = parent_output(parent_block, self->key, self->chunk.flags);
  }

  output_root_bytes(&node, seek, out, out_len);
}
// Return `self` to an empty state: the CV stack is emptied and the chunk state
// is reset with the hasher's existing key words and a chunk counter of 0.
// self->key itself is left untouched.
void blake3_hasher_reset(blake3_hasher *self) {
  self->cv_stack_len = 0;
  chunk_state_reset(&self->chunk, self->key, 0);
}
/*
 * On ELF/Linux targets, emit an empty .note.GNU-stack section.
 * NOTE(review): this is presumably the usual marker that the object does not
 * need an executable stack -- confirm against the toolchain documentation.
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
#endif
/*
 * Include <cet.h> when building for ELF with __CET__ defined and the header
 * is available. NOTE(review): the header is presumably what supplies
 * _CET_ENDBR -- confirm.
 */
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
#endif
#endif
/* Fallback: if nothing above defined _CET_ENDBR, make it expand to nothing. */
#if !defined(_CET_ENDBR)
#define _CET_ENDBR
#endif
/* The instructions that follow use Intel syntax without register prefixes. */
.intel_syntax noprefix
/*
 * Export every entry point under both its plain name and a
 * leading-underscore alias. NOTE(review): the underscore forms presumably
 * serve platforms whose C symbols carry that prefix -- confirm.
 */
.global blake3_hash_many_sse41
.global _blake3_hash_many_sse41
.global blake3_compress_in_place_sse41
.global _blake3_compress_in_place_sse41
.global blake3_compress_xof_sse41
.global _blake3_compress_xof_sse41
/* Switch to the text section; Apple targets use the bare .text directive. */
#ifdef __APPLE__
.text
#else
.section .text
#endif
/* Align what follows to a 2^6 = 64-byte boundary. */
.p2align 6
_blake3_hash_many_sse41:
blake3_hash_many_sse41:
_CET_ENDBR
push r15
push r14
push r13
push r12
push rbx
push rbp
mov rbp, rsp
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
movdqa xmm1, xmm0
pand xmm1, xmmword ptr [ADD0+rip]
pand xmm0, xmmword ptr [ADD1+rip]
movdqa xmmword ptr [rsp+0x150], xmm0
movd xmm0, r8d
pshufd xmm0, xmm0, 0x00
paddd xmm0, xmm1
movdqa xmmword ptr [rsp+0x110], xmm0
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm1, xmm0
shr r8, 32
movd xmm2, r8d
pshufd xmm2, xmm2, 0x00
psubd xmm2, xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
mov rbx, qword ptr [rbp+0x50]
mov r15, rdx
shl r15, 6
movzx r13d, byte ptr [rbp+0x38]
movzx r12d, byte ptr [rbp+0x48]
cmp rsi, 4
jc 3f
2:
movdqu xmm3, xmmword ptr [rcx]
pshufd xmm0, xmm3, 0x00
pshufd xmm1, xmm3, 0x55
pshufd xmm2, xmm3, 0xAA
pshufd xmm3, xmm3, 0xFF
movdqu xmm7, xmmword ptr [rcx+0x10]
pshufd xmm4, xmm7, 0x00
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
9:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp], xmm8
movdqa xmmword ptr [rsp+0x10], xmm9
movdqa xmmword ptr [rsp+0x20], xmm12
movdqa xmmword ptr [rsp+0x30], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x40], xmm8
movdqa xmmword ptr [rsp+0x50], xmm9
movdqa xmmword ptr [rsp+0x60], xmm12
movdqa xmmword ptr [rsp+0x70], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x80], xmm8
movdqa xmmword ptr [rsp+0x90], xmm9
movdqa xmmword ptr [rsp+0xA0], xmm12
movdqa xmmword ptr [rsp+0xB0], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0xC0], xmm8
movdqa xmmword ptr [rsp+0xD0], xmm9
movdqa xmmword ptr [rsp+0xE0], xmm12
movdqa xmmword ptr [rsp+0xF0], xmm13
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
movdqa xmm12, xmmword ptr [rsp+0x110]
movdqa xmm13, xmmword ptr [rsp+0x120]
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
movd xmm15, eax
pshufd xmm15, xmm15, 0x00
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x80]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x70]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xB0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x50]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xC0]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xA0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0x60]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xF0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
pxor xmm0, xmm8
pxor xmm1, xmm9
pxor xmm2, xmm10
pxor xmm3, xmm11
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
pxor xmm4, xmm12
pxor xmm5, xmm13
pxor xmm6, xmm14
pxor xmm7, xmm15
mov eax, r13d
jne 9b
movdqa xmm9, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm9, xmm1
movdqa xmm11, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm11, xmm3
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm2
punpckhqdq xmm1, xmm2
movdqa xmm3, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm3, xmm11
movdqu xmmword ptr [rbx], xmm0
movdqu xmmword ptr [rbx+0x20], xmm1
movdqu xmmword ptr [rbx+0x40], xmm9
movdqu xmmword ptr [rbx+0x60], xmm3
movdqa xmm9, xmm4
punpckldq xmm4, xmm5
punpckhdq xmm9, xmm5
movdqa xmm11, xmm6
punpckldq xmm6, xmm7
punpckhdq xmm11, xmm7
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm7, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm7, xmm11
movdqu xmmword ptr [rbx+0x10], xmm4
movdqu xmmword ptr [rbx+0x30], xmm5
movdqu xmmword ptr [rbx+0x50], xmm9
movdqu xmmword ptr [rbx+0x70], xmm7
movdqa xmm1, xmmword ptr [rsp+0x110]
movdqa xmm0, xmm1
paddd xmm1, xmmword ptr [rsp+0x150]
movdqa xmmword ptr [rsp+0x110], xmm1
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm0, xmm1
movdqa xmm1, xmmword ptr [rsp+0x120]
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
add rdi, 32
sub rsi, 4
cmp rsi, 4
jnc 2b
test rsi, rsi
jnz 3f
4:
mov rsp, rbp
pop rbp
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
.p2align 5
3:
test esi, 0x2
je 3f
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
pinsrd xmm13, dword ptr [rsp+0x120], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
pinsrd xmm14, dword ptr [rsp+0x124], 1
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm10, xmm2
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm3, xmm4
shufps xmm4, xmm5, 136
shufps xmm3, xmm5, 221
movaps xmm5, xmm3
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm3, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm3, xmm7, 221
pshufd xmm7, xmm3, 0x93
movups xmm12, xmmword ptr [r9+rdx-0x40]
movups xmm13, xmmword ptr [r9+rdx-0x30]
movaps xmm11, xmm12
shufps xmm12, xmm13, 136
shufps xmm11, xmm13, 221
movaps xmm13, xmm11
movups xmm14, xmmword ptr [r9+rdx-0x20]
movups xmm15, xmmword ptr [r9+rdx-0x10]
movaps xmm11, xmm14
shufps xmm14, xmm15, 136
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
pinsrd xmm3, eax, 3
pinsrd xmm11, eax, 3
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm8, xmm12
movaps xmmword ptr [rsp+0x20], xmm4
movaps xmmword ptr [rsp+0x30], xmm12
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm12, xmmword ptr [ROT16+rip]
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm5
paddd xmm8, xmm13
movaps xmmword ptr [rsp+0x40], xmm5
movaps xmmword ptr [rsp+0x50], xmm13
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm13, xmmword ptr [ROT8+rip]
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 0x93
pshufd xmm8, xmm8, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x39
pshufd xmm10, xmm10, 0x39
paddd xmm0, xmm6
paddd xmm8, xmm14
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm7
paddd xmm8, xmm15
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 0x39
pshufd xmm8, xmm8, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x93
pshufd xmm10, xmm10, 0x93
dec al
je 9f
movdqa xmm12, xmmword ptr [rsp+0x20]
movdqa xmm5, xmmword ptr [rsp+0x40]
pshufd xmm13, xmm12, 0x0F
shufps xmm12, xmm5, 214
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
pblendw xmm13, xmm12, 0xCC
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
pblendw xmm12, xmm6, 0xC0
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmmword ptr [rsp+0x20], xmm13
movdqa xmmword ptr [rsp+0x40], xmm12
movdqa xmm5, xmmword ptr [rsp+0x30]
movdqa xmm13, xmmword ptr [rsp+0x50]
pshufd xmm6, xmm5, 0x0F
shufps xmm5, xmm13, 214
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
pblendw xmm6, xmm5, 0xCC
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
pblendw xmm5, xmm14, 0xC0
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
pshufd xmm15, xmm14, 0x1E
movdqa xmm13, xmm6
movdqa xmm14, xmm5
movdqa xmm5, xmmword ptr [rsp+0x20]
movdqa xmm6, xmmword ptr [rsp+0x40]
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm8, xmm10
pxor xmm9, xmm11
mov eax, r13d
cmp rdx, r15
jne 2b
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
movups xmmword ptr [rbx+0x20], xmm8
movups xmmword ptr [rbx+0x30], xmm9
movdqa xmm0, xmmword ptr [rsp+0x130]
movdqa xmm1, xmmword ptr [rsp+0x110]
movdqa xmm2, xmmword ptr [rsp+0x120]
movdqu xmm3, xmmword ptr [rsp+0x118]
movdqu xmm4, xmmword ptr [rsp+0x128]
blendvps xmm1, xmm3, xmm0
blendvps xmm2, xmm4, xmm0
movdqa xmmword ptr [rsp+0x110], xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
add rdi, 16
add rbx, 64
sub rsi, 2
3:
test esi, 0x1
je 4b
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
pinsrd xmm13, dword ptr [rsp+0x120], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm3, xmm13
pinsrd xmm3, eax, 3
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
mov eax, r13d
cmp rdx, r15
jne 2b
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
jmp 4b
.p2align 6
/*
 * blake3_compress_in_place_sse41
 *
 * Runs the 7-round compression on one 64-byte block and overwrites the
 * 32-byte chaining value with the result.
 *
 * Register arguments as used by the code below:
 *   rdi - pointer to 8 x 32-bit chaining value; read on entry, written on exit
 *   rsi - pointer to the 64-byte message block (unaligned loads are used)
 *   dl  - block length byte
 *   rcx - 64-bit counter
 *   r8b - flags byte
 * NOTE(review): the names block_len/counter/flags follow the argument order
 * of compress_pre() in the portable C code; confirm against the prototype.
 *
 * State layout: xmm0 = row a (words 0-3), xmm1 = row b (words 4-7),
 * xmm2 = row c (BLAKE3_IV), xmm3 = row d {counter_lo, counter_hi,
 * block_len, flags}.  Message words live in xmm4-xmm7.
 * Clobbers rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15.
 */
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
_CET_ENDBR
/* Rows a and b come from the chaining value, row c from the IV constants. */
movups xmm0, xmmword ptr [rdi]
movups xmm1, xmmword ptr [rdi+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/*
 * Pack block_len (low dword) and flags (high dword) into rdx.  Both are
 * byte-sized arguments, and the upper bits of their registers are not
 * guaranteed to be zero, so zero-extend from the low byte first.  Without
 * this, stale register bits would end up in row d of the state.
 */
movzx eax, r8b
movzx edx, dl
shl rax, 32
add rdx, rax
/* Row d = {counter (64 bits), block_len, flags}. */
movq xmm3, rcx
movq xmm4, rdx
punpcklqdq xmm3, xmm4
/* Message words 0-7: xmm4 = even-indexed words, xmm5 = odd-indexed words. */
movups xmm4, xmmword ptr [rsi]
movups xmm5, xmmword ptr [rsi+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
/*
 * Message words 8-15: split into even/odd words the same way, then rotate
 * the lanes by one position (pshufd 0x93) to match the diagonal step.
 */
movups xmm6, xmmword ptr [rsi+0x20]
movups xmm7, xmmword ptr [rsi+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
/* Byte-shuffle masks for the 8-bit and 16-bit right rotations. */
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
/* al counts the remaining rounds. */
mov al, 7
9:
/* Column step, first half: a += b + m; d = rotr(d ^ a, 16); c += d; b = rotr(b ^ c, 12). */
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
/* Column step, second half: a += b + m; d = rotr(d ^ a, 8); c += d; b = rotr(b ^ c, 7). */
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Rotate the lanes of rows a, d and c so the diagonals line up as columns. */
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
/* Diagonal step, first half (same operations, message words from xmm6). */
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
/* Diagonal step, second half (message words from xmm7). */
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Undo the lane rotation so the rows are back in column order. */
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
/*
 * Rearrange the 16 message words in xmm4-xmm7 for the next round.
 * NOTE(review): presumably the vector equivalent of stepping through
 * MSG_SCHEDULE in the portable round_fn(); verify against that table.
 */
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
/* Fold the state: new chaining value = (a ^ c, b ^ d), stored back over the input. */
pxor xmm0, xmm2
pxor xmm1, xmm3
movups xmmword ptr [rdi], xmm0
movups xmmword ptr [rdi+0x10], xmm1
ret
.p2align 6
/*
 * blake3_compress_xof_sse41
 *
 * Runs the 7-round compression on one 64-byte block and writes a 64-byte
 * result to a separate output buffer; the chaining value is only read.
 *
 * Register arguments as used by the code below:
 *   rdi - pointer to 8 x 32-bit chaining value (read at entry and again at exit)
 *   rsi - pointer to the 64-byte message block (unaligned loads are used)
 *   dl  - block length byte
 *   rcx - 64-bit counter
 *   r8b - flags byte
 *   r9  - pointer to the 64-byte output buffer
 * NOTE(review): the names block_len/counter/flags follow the argument order
 * of compress_pre() in the portable C code; confirm against the prototype.
 *
 * State layout: xmm0 = row a (words 0-3), xmm1 = row b (words 4-7),
 * xmm2 = row c (BLAKE3_IV), xmm3 = row d {counter_lo, counter_hi,
 * block_len, flags}.  Message words live in xmm4-xmm7.
 */
blake3_compress_xof_sse41:
_blake3_compress_xof_sse41:
_CET_ENDBR
/* Rows a and b come from the chaining value, row c from the IV constants. */
movups xmm0, xmmword ptr [rdi]
movups xmm1, xmmword ptr [rdi+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/*
 * Zero-extend the two byte arguments, then pack them into rdx with
 * block_len in the low dword and flags in the high dword.
 */
movzx eax, r8b
movzx edx, dl
shl rax, 32
add rdx, rax
/* Row d = {counter (64 bits), block_len, flags}. */
movq xmm3, rcx
movq xmm4, rdx
punpcklqdq xmm3, xmm4
/* Message words 0-7: xmm4 = even-indexed words, xmm5 = odd-indexed words. */
movups xmm4, xmmword ptr [rsi]
movups xmm5, xmmword ptr [rsi+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
/*
 * Message words 8-15: split into even/odd words the same way, then rotate
 * the lanes by one position (pshufd 0x93) to match the diagonal step.
 */
movups xmm6, xmmword ptr [rsi+0x20]
movups xmm7, xmmword ptr [rsi+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
/* Byte-shuffle masks for the 8-bit and 16-bit right rotations. */
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
/* al counts the remaining rounds (this overwrites the low byte of rax, which is no longer needed). */
mov al, 7
9:
/* Column step, first half: a += b + m; d = rotr(d ^ a, 16); c += d; b = rotr(b ^ c, 12). */
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
/* Column step, second half: a += b + m; d = rotr(d ^ a, 8); c += d; b = rotr(b ^ c, 7). */
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Rotate the lanes of rows a, d and c so the diagonals line up as columns. */
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
/* Diagonal step, first half (same operations, message words from xmm6). */
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
/* Diagonal step, second half (message words from xmm7). */
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Undo the lane rotation so the rows are back in column order. */
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
/*
 * Rearrange the 16 message words in xmm4-xmm7 for the next round.
 * NOTE(review): presumably the vector equivalent of stepping through
 * MSG_SCHEDULE in the portable round_fn(); verify against that table.
 */
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
/* Reload the untouched input chaining value for the second half of the output. */
movdqu xmm4, xmmword ptr [rdi]
movdqu xmm5, xmmword ptr [rdi+0x10]
/* Output bytes 0-31: rows a ^ c and b ^ d. */
pxor xmm0, xmm2
pxor xmm1, xmm3
/* Output bytes 32-63: rows c and d, each XORed with the input chaining value. */
pxor xmm2, xmm4
pxor xmm3, xmm5
movups xmmword ptr [r9], xmm0
movups xmmword ptr [r9+0x10], xmm1
movups xmmword ptr [r9+0x20], xmm2
movups xmmword ptr [r9+0x30], xmm3
ret
/*
 * Constant tables used by the SSE4.1 routines.  The section directive is
 * chosen per platform: .static_data on Apple targets, .rodata elsewhere.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
/* Align the tables to 64 bytes; every table below is 16 bytes long, so each stays 16-byte aligned for movdqa/movaps. */
.p2align 6
/* Four 32-bit IV words; loaded as state row c by the single-block compress routines. */
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
/* pshufb mask: within each 32-bit lane, output byte i takes input byte (i + 2) mod 4, i.e. rotate right by 16 bits. */
ROT16:
.byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: within each 32-bit lane, output byte i takes input byte (i + 1) mod 4, i.e. rotate right by 8 bits. */
ROT8:
.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* NOTE(review): presumably the per-lane counter offsets (lane i gets +i) for the 4-way hash_many path; the code that reads it is outside this section. */
ADD0:
.long 0, 1, 2, 3
/* NOTE(review): presumably the per-iteration counter step (4 inputs per pass) for the 4-way hash_many path; confirm against its setup code. */
ADD1:
.long 4, 4, 4, 4
/* Each of the four IV words broadcast across all four lanes, for code that keeps one state word per register. */
BLAKE3_IV_0:
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane; read both as a full vector and as a single dword. */
BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
/*
 * Sign-bit mask.  XORing both operands with it before pcmpgtd turns the
 * signed compare into an unsigned one; the counter update uses this to
 * detect a carry out of the low 32-bit counter word.
 */
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * On ELF/Linux, emit an empty .note.GNU-stack section.  By GNU toolchain
 * convention this marks the object as not requiring an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
#endif
/*
 * When building for ELF with CET enabled and <cet.h> is available, include
 * it; it is expected to supply the _CET_ENDBR macro placed at each entry
 * point below.
 */
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
#endif
#endif
/* Fallback: without <cet.h>, _CET_ENDBR expands to nothing. */
#if !defined(_CET_ENDBR)
#define _CET_ENDBR
#endif
/* Use Intel operand order (destination first) and unprefixed register names. */
.intel_syntax noprefix
/*
 * Export each SSE2 entry point under both its plain name and an
 * underscore-prefixed alias.
 * NOTE(review): the underscore form is presumably for platforms whose C
 * symbols carry a leading underscore (e.g. Mach-O); confirm.
 */
.global blake3_hash_many_sse2
.global _blake3_hash_many_sse2
.global blake3_compress_in_place_sse2
.global _blake3_compress_in_place_sse2
.global blake3_compress_xof_sse2
.global _blake3_compress_xof_sse2
/* Switch to the code section, using the directive form each platform expects. */
#ifdef __APPLE__
.text
#else
.section .text
#endif
.p2align 6
_blake3_hash_many_sse2:
blake3_hash_many_sse2:
_CET_ENDBR
push r15
push r14
push r13
push r12
push rbx
push rbp
mov rbp, rsp
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
movdqa xmm1, xmm0
pand xmm1, xmmword ptr [ADD0+rip]
pand xmm0, xmmword ptr [ADD1+rip]
movdqa xmmword ptr [rsp+0x150], xmm0
movd xmm0, r8d
pshufd xmm0, xmm0, 0x00
paddd xmm0, xmm1
movdqa xmmword ptr [rsp+0x110], xmm0
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm1, xmm0
shr r8, 32
movd xmm2, r8d
pshufd xmm2, xmm2, 0x00
psubd xmm2, xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
mov rbx, qword ptr [rbp+0x50]
mov r15, rdx
shl r15, 6
movzx r13d, byte ptr [rbp+0x38]
movzx r12d, byte ptr [rbp+0x48]
cmp rsi, 4
jc 3f
2:
movdqu xmm3, xmmword ptr [rcx]
pshufd xmm0, xmm3, 0x00
pshufd xmm1, xmm3, 0x55
pshufd xmm2, xmm3, 0xAA
pshufd xmm3, xmm3, 0xFF
movdqu xmm7, xmmword ptr [rcx+0x10]
pshufd xmm4, xmm7, 0x00
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
9:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp], xmm8
movdqa xmmword ptr [rsp+0x10], xmm9
movdqa xmmword ptr [rsp+0x20], xmm12
movdqa xmmword ptr [rsp+0x30], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x40], xmm8
movdqa xmmword ptr [rsp+0x50], xmm9
movdqa xmmword ptr [rsp+0x60], xmm12
movdqa xmmword ptr [rsp+0x70], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x80], xmm8
movdqa xmmword ptr [rsp+0x90], xmm9
movdqa xmmword ptr [rsp+0xA0], xmm12
movdqa xmmword ptr [rsp+0xB0], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0xC0], xmm8
movdqa xmmword ptr [rsp+0xD0], xmm9
movdqa xmmword ptr [rsp+0xE0], xmm12
movdqa xmmword ptr [rsp+0xF0], xmm13
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
movdqa xmm12, xmmword ptr [rsp+0x110]
movdqa xmm13, xmmword ptr [rsp+0x120]
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
movd xmm15, eax
pshufd xmm15, xmm15, 0x00
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x80]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x70]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xB0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x50]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xC0]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xA0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0x60]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xF0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0xB1
pshufhw xmm15, xmm15, 0xB1
pshuflw xmm12, xmm12, 0xB1
pshufhw xmm12, xmm12, 0xB1
pshuflw xmm13, xmm13, 0xB1
pshufhw xmm13, xmm13, 0xB1
pshuflw xmm14, xmm14, 0xB1
pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
pxor xmm0, xmm8
pxor xmm1, xmm9
pxor xmm2, xmm10
pxor xmm3, xmm11
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
pxor xmm4, xmm12
pxor xmm5, xmm13
pxor xmm6, xmm14
pxor xmm7, xmm15
mov eax, r13d
jne 9b
/*
 * End of one 4-input pass: transpose the eight result vectors, write them
 * to the output buffer at rbx, advance the per-lane counters kept on the
 * stack, and decide whether another group of four inputs remains.
 */
/*
 * 4x4 dword transpose of xmm0..xmm3.  Afterwards xmm0, xmm1, xmm9 and xmm3
 * hold dword lanes 0, 1, 2 and 3 respectively of the original four
 * registers.
 */
movdqa xmm9, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm9, xmm1
movdqa xmm11, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm11, xmm3
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm2
punpckhqdq xmm1, xmm2
movdqa xmm3, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm3, xmm11
/* First 16 bytes of each 32-byte output slot (lanes 0..3 at stride 0x20). */
movdqu xmmword ptr [rbx], xmm0
movdqu xmmword ptr [rbx+0x20], xmm1
movdqu xmmword ptr [rbx+0x40], xmm9
movdqu xmmword ptr [rbx+0x60], xmm3
/* Same transpose for xmm4..xmm7; results land in xmm4, xmm5, xmm9, xmm7. */
movdqa xmm9, xmm4
punpckldq xmm4, xmm5
punpckhdq xmm9, xmm5
movdqa xmm11, xmm6
punpckldq xmm6, xmm7
punpckhdq xmm11, xmm7
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm7, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm7, xmm11
/* Second 16 bytes of each 32-byte output slot. */
movdqu xmmword ptr [rbx+0x10], xmm4
movdqu xmmword ptr [rbx+0x30], xmm5
movdqu xmmword ptr [rbx+0x50], xmm9
movdqu xmmword ptr [rbx+0x70], xmm7
/*
 * Advance the four 32-bit values at [rsp+0x110] by the vector at
 * [rsp+0x150] and propagate any wrap-around into [rsp+0x120] (used here as
 * the low and high halves of a per-lane 64-bit counter).  xmm0 keeps the
 * old low words, xmm1 the new ones.
 */
movdqa xmm1, xmmword ptr [rsp+0x110]
movdqa xmm0, xmm1
paddd xmm1, xmmword ptr [rsp+0x150]
movdqa xmmword ptr [rsp+0x110], xmm1
/*
 * pcmpgtd is a signed compare; both operands are XORed with CMP_MSB_MASK
 * first (presumably the sign bit of every lane, which would make this an
 * unsigned old > new test -- confirm against the constant's definition).
 * Lanes that wrapped become all-ones.
 */
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm0, xmm1
/* Subtracting the all-ones mask (-1) adds 1 to the high word of wrapped lanes. */
movdqa xmm1, xmmword ptr [rsp+0x120]
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
/*
 * Step past four 32-byte outputs and four 8-byte input pointers, then
 * reduce the remaining-input count in rsi by four.
 */
add rbx, 128
add rdi, 32
sub rsi, 4
/* At least four inputs left (unsigned rsi >= 4): run another 4-wide pass. */
cmp rsi, 4
jnc 2b
/* One to three inputs left: go to the tail code; zero: fall into the epilogue. */
test rsi, rsi
jnz 3f
/*
 * Common exit path.  Reached by fall-through when no inputs remain and by
 * the "je 4b" / "jmp 4b" branches of the tail code below.
 */
4:
/* Discard the stack working area by restoring rsp from rbp. */
mov rsp, rbp
/* Restore the registers saved on entry (the pushes are outside this view). */
pop rbp
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
/*
 * Tail path for two inputs at once.  Entered when fewer than four inputs
 * remain; skipped entirely when bit 1 of the remaining count is clear.
 *
 * Register use as visible in this section:
 *   rcx        pointer to the 32 bytes loaded as the initial state rows 0-1
 *   rdi        pointer to the array of input pointers
 *   rbx        output pointer (64 bytes written here)
 *   rsi        number of inputs still to process
 *   r15        byte offset at which the block loop stops
 *   r13d       flag bits applied to every block
 *   [rbp+0x40] extra flag byte OR-ed into the first block only
 *              (presumably flags_start -- set up outside this view)
 *   r12d       extra flag bits OR-ed into the last block only
 *              (presumably flags_end -- set up outside this view)
 * First input:  state rows in xmm0..xmm3,  message vectors in xmm4..xmm7.
 * Second input: state rows in xmm8..xmm11, message vectors in xmm12..xmm15.
 */
.p2align 5
3:
test esi, 0x2
je 3f
/* Both inputs start from the same 32 bytes at rcx (state rows 0 and 1). */
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movaps xmm8, xmm0
movaps xmm9, xmm1
/* [rsp] = { [rsp+0x110] lane 0, [rsp+0x120] lane 0, 0, 0 }: counter pair for input 0. */
movd xmm13, dword ptr [rsp+0x110]
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
/* [rsp+0x10] = the same pair taken from lane 1: counter pair for input 1. */
movd xmm14, dword ptr [rsp+0x114]
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
/* r8 / r9 = data pointers of the two inputs. */
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
/* Flags for the first block: the byte at [rbp+0x40] combined with r13d. */
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
/* rdx = running byte offset into both inputs. */
xor edx, edx
/* ---- per-block loop ---- */
2:
/*
 * Advance rdx by one 64-byte block.  eax keeps the r12d bits only when
 * this is the final block (rdx == r15); otherwise cmovne restores the
 * value saved in r14d.  The flags of this cmp are not consumed again
 * later: the loop exit below re-does the compare.
 */
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
/* State row 2 for both inputs comes from BLAKE3_IV. */
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm10, xmm2
/*
 * Load the 64-byte block of input 0 and split it by word parity:
 * shufps 136 (0x88) keeps dwords 0 and 2 of each source, 221 (0xDD) keeps
 * dwords 1 and 3.  xmm4 = words 0,2,4,6 and xmm5 = words 1,3,5,7.
 */
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm3, xmm4
shufps xmm4, xmm5, 136
shufps xmm3, xmm5, 221
movaps xmm5, xmm3
/*
 * Second half of the block: even words into xmm6, odd words into xmm7,
 * each then rotated by pshufd 0x93 (dword 3 moves to position 0) so they
 * line up with the rotated state used in the second half of a round.
 */
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm3, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm3, xmm7, 221
pshufd xmm7, xmm3, 0x93
/* Same loading scheme for input 1, into xmm12..xmm15 (xmm11 is scratch). */
movups xmm12, xmmword ptr [r9+rdx-0x40]
movups xmm13, xmmword ptr [r9+rdx-0x30]
movaps xmm11, xmm12
shufps xmm12, xmm13, 136
shufps xmm11, xmm13, 221
movaps xmm13, xmm11
movups xmm14, xmmword ptr [r9+rdx-0x20]
movups xmm15, xmmword ptr [r9+rdx-0x10]
movaps xmm11, xmm14
shufps xmm14, xmm15, 136
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
/*
 * Build state row 3 = { counter pair, 0x40, flags } for each input.
 * rax = flags << 32 | 0x40 is parked in [rsp+0x20] and merged as the upper
 * qword behind the counter pairs saved at [rsp] and [rsp+0x10].
 * (0x40 = 64, the block size in bytes used by this loop.)
 */
shl rax, 0x20
or rax, 0x40
movq xmm3, rax
movdqa xmmword ptr [rsp+0x20], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
punpcklqdq xmm3, xmmword ptr [rsp+0x20]
punpcklqdq xmm11, xmmword ptr [rsp+0x20]
/* al counts the rounds: 7 per block. */
mov al, 7
/* ---- one round, both inputs interleaved ---- */
9:
/*
 * First half of the round on the state as laid out in rows.
 * Step 1: row0 += message vector; row0 += row1.  xmm4 and xmm12 are spilled
 * to [rsp+0x20] / [rsp+0x30] because xmm4 is reused as rotate scratch and
 * both are needed again for the message permutation.
 */
paddd xmm0, xmm4
paddd xmm8, xmm12
movaps xmmword ptr [rsp+0x20], xmm4
movaps xmmword ptr [rsp+0x30], xmm12
paddd xmm0, xmm1
paddd xmm8, xmm9
/* row3 ^= row0, then rotate each dword right by 16 (swap its 16-bit halves). */
pxor xmm3, xmm0
pxor xmm11, xmm8
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
pshuflw xmm11, xmm11, 0xB1
pshufhw xmm11, xmm11, 0xB1
/* row2 += row3; row1 ^= row2; rotate row1 right by 12. */
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
/*
 * Step 2: row0 += second message vector; row0 += row1.  xmm5 and xmm13 are
 * spilled to [rsp+0x40] / [rsp+0x50] (xmm13 becomes rotate scratch below).
 */
paddd xmm0, xmm5
paddd xmm8, xmm13
movaps xmmword ptr [rsp+0x40], xmm5
movaps xmmword ptr [rsp+0x50], xmm13
paddd xmm0, xmm1
paddd xmm8, xmm9
/* row3 ^= row0; rotate row3 right by 8. */
pxor xmm3, xmm0
pxor xmm11, xmm8
movdqa xmm13, xmm3
psrld xmm3, 8
pslld xmm13, 24
pxor xmm3, xmm13
movdqa xmm13, xmm11
psrld xmm11, 8
pslld xmm13, 24
pxor xmm11, xmm13
/* row2 += row3; row1 ^= row2; rotate row1 right by 7. */
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
/*
 * Rotate rows 0, 3 and 2 by one, two and three dword positions (row 1
 * stays put) so the second half of the round mixes a different grouping
 * of state words.
 */
pshufd xmm0, xmm0, 0x93
pshufd xmm8, xmm8, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x39
pshufd xmm10, xmm10, 0x39
/* Second half: the same add / xor / rotate sequence with xmm6 / xmm14 ... */
paddd xmm0, xmm6
paddd xmm8, xmm14
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
pshuflw xmm11, xmm11, 0xB1
pshufhw xmm11, xmm11, 0xB1
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
/* ... and then with xmm7 / xmm15. */
paddd xmm0, xmm7
paddd xmm8, xmm15
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movdqa xmm13, xmm3
psrld xmm3, 8
pslld xmm13, 24
pxor xmm3, xmm13
movdqa xmm13, xmm11
psrld xmm11, 8
pslld xmm13, 24
pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
/* Undo the row rotation (inverse shuffles of the ones above). */
pshufd xmm0, xmm0, 0x39
pshufd xmm8, xmm8, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x93
pshufd xmm10, xmm10, 0x93
/* Leave after the seventh round; no message permutation is needed then. */
dec al
je 9f
/*
 * Rearrange the four message vectors of input 0 into the order the next
 * round consumes.  The pand/pand/por triples select dwords from two
 * sources using the PBLENDW_*_MASK constants (by their names, an SSE2
 * stand-in for pblendw -- confirm against the constants' definitions).
 * New xmm4 is produced directly; the new xmm5 and xmm6 are parked in
 * [rsp+0x20] and [rsp+0x40] until the end of the permutation.
 */
movdqa xmm12, xmmword ptr [rsp+0x20]
movdqa xmm5, xmmword ptr [rsp+0x40]
pshufd xmm13, xmm12, 0x0F
shufps xmm12, xmm5, 214
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm13, xmm12
movdqa xmmword ptr [rsp+0x20], xmm13
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
movdqa xmm13, xmm6
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm12, xmm13
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmmword ptr [rsp+0x40], xmm12
/*
 * Same rearrangement for input 1, reading its spilled vectors from
 * [rsp+0x30] and [rsp+0x50].  xmm5 and xmm6 serve as scratch here, which
 * is why input 0's new values were parked on the stack above.
 */
movdqa xmm5, xmmword ptr [rsp+0x30]
movdqa xmm13, xmmword ptr [rsp+0x50]
pshufd xmm6, xmm5, 0x0F
shufps xmm5, xmm13, 214
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
/* No free register remains: borrow xmm2 (state row 2), saving it in [rsp+0x30]. */
movdqa xmmword ptr [rsp+0x30], xmm2
movdqa xmm2, xmm14
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm5, xmm2
/* Give state row 2 back to xmm2. */
movdqa xmm2, xmmword ptr [rsp+0x30]
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
pshufd xmm15, xmm14, 0x1E
/* Move everything to its home register: xmm13/xmm14 for input 1, xmm5/xmm6 for input 0. */
movdqa xmm13, xmm6
movdqa xmm14, xmm5
movdqa xmm5, xmmword ptr [rsp+0x20]
movdqa xmm6, xmmword ptr [rsp+0x40]
jmp 9b
/* ---- block finished ---- */
9:
/* Fold rows 2 and 3 into rows 0 and 1; the result feeds the next block as rows 0-1. */
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm8, xmm10
pxor xmm9, xmm11
/* Blocks after the first use only the r13d flag bits as their base. */
mov eax, r13d
cmp rdx, r15
jne 2b
/* All blocks consumed: write 32 bytes per input. */
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
movups xmmword ptr [rbx+0x20], xmm8
movups xmmword ptr [rbx+0x30], xmm9
/*
 * Prepare counter lane 0 for a possible final single input.  neg turns the
 * dword at [rsp+0x130] into an index: 0 stays 0 (lane 0 is rewritten with
 * itself), -1 becomes 1, so the +8 byte offset copies lane 2 into lane 0.
 * NOTE(review): [rsp+0x130] is presumably the 0 / -1 "increment counter"
 * mask written by the prologue -- confirm there.
 */
mov eax, dword ptr [rsp+0x130]
neg eax
mov r10d, dword ptr [rsp+0x110+8*rax]
mov r11d, dword ptr [rsp+0x120+8*rax]
mov dword ptr [rsp+0x110], r10d
mov dword ptr [rsp+0x120], r11d
/* Consume two input pointers and two 32-byte output slots. */
add rdi, 16
add rbx, 64
sub rsi, 2
/*
 * Tail path for a single remaining input.  Returns through the shared
 * epilogue at "4:" when bit 0 of the remaining count is clear.
 *
 * State rows live in xmm0..xmm3 and the four message vectors in
 * xmm4..xmm7; xmm8..xmm12 and xmm14 are scratch.  rcx, rdi, rbx, r12d,
 * r13d, r15 and [rbp+0x40] are used exactly as in the two-input path that
 * precedes this label.
 */
3:
test esi, 0x1
je 4b
/* State rows 0 and 1 start from the 32 bytes at rcx. */
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
/* xmm13 = { [rsp+0x110] lane 0, [rsp+0x120] lane 0, 0, 0 }: the counter pair. */
movd xmm13, dword ptr [rsp+0x110]
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
/* r8 = data pointer of the input. */
mov r8, qword ptr [rdi]
/* Flags for the first block: the byte at [rbp+0x40] combined with r13d. */
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
/* rdx = running byte offset into the input. */
xor edx, edx
/* ---- per-block loop ---- */
2:
/*
 * Advance by one 64-byte block; the r12d bits stay in eax only when this
 * is the final block (rdx == r15), otherwise cmovne restores r14d.
 */
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
/* State row 2 comes from BLAKE3_IV. */
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* State row 3 = { counter pair, 64, flags }. */
shl rax, 32
or rax, 64
movq xmm12, rax
movdqa xmm3, xmm13
punpcklqdq xmm3, xmm12
/*
 * Load the block and split it by word parity: xmm4 = words 0,2,4,6 and
 * xmm5 = words 1,3,5,7 (shufps 136 keeps dwords 0 and 2 of each source,
 * 221 keeps dwords 1 and 3).
 */
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
/*
 * Second half of the block: even words into xmm6, odd words into xmm7,
 * each rotated by pshufd 0x93 (dword 3 moves to position 0).
 */
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
/* al counts the rounds: 7 per block. */
mov al, 7
/* ---- one round ---- */
9:
/* row0 += m; row0 += row1; row3 ^= row0; rotate row3 right by 16. */
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
/* row2 += row3; row1 ^= row2; rotate row1 right by 12. */
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
/* row0 += m; row0 += row1; row3 ^= row0; rotate row3 right by 8. */
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
/* row2 += row3; row1 ^= row2; rotate row1 right by 7. */
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/*
 * Rotate rows 0, 3 and 2 by one, two and three dword positions (row 1
 * stays put) so the second half mixes a different grouping of state words.
 */
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
/* Second half: the same sequence with message vectors xmm6 and xmm7. */
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Undo the row rotation. */
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
/* Leave after the seventh round; no message permutation is needed then. */
dec al
jz 9f
/*
 * Rearrange the four message vectors into the order the next round
 * consumes.  The pand/pand/por triples select dwords from two sources
 * using the PBLENDW_*_MASK constants (by their names, an SSE2 stand-in for
 * pblendw -- confirm against the constants' definitions).
 */
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
/* New xmm5 / xmm6 were assembled in xmm9 / xmm8. */
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
/* ---- block finished ---- */
9:
/* Fold rows 2 and 3 into rows 0 and 1; the result feeds the next block as rows 0-1. */
pxor xmm0, xmm2
pxor xmm1, xmm3
/* Blocks after the first use only the r13d flag bits as their base. */
mov eax, r13d
cmp rdx, r15
jne 2b
/* All blocks consumed: write the 32-byte result and return via the epilogue. */
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
jmp 4b
.p2align 6
/*
 * blake3_compress_in_place_sse2
 *
 * Runs the 7-round compression function on one 64-byte block and writes the
 * resulting 8-word chaining value back over the input chaining value.
 *
 * Register arguments as used below:
 *   rdi - pointer to the 8-word chaining value (32 bytes read, 32 bytes written)
 *   rsi - pointer to the 64-byte message block (read with unaligned loads)
 *   dl  - 8-bit value placed in state word 14 (presumably block_len, following
 *         the parameter order of compress_pre)
 *   rcx - 64-bit value placed in state words 12-13 (presumably the counter)
 *   r8b - 8-bit value placed in state word 15 (presumably flags)
 *
 * Scratch registers: rax, rdx, xmm0-xmm11 and xmm14.
 *
 * State layout: xmm0..xmm3 hold rows 0..3 of the 4x4 word state, so every
 * SIMD instruction performs the same step of g() on four columns at once.
 */
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
_CET_ENDBR
/* Rows 0-1: the incoming chaining value. Row 2: the IV constants. */
movups xmm0, xmmword ptr [rdi]
movups xmm1, xmmword ptr [rdi+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/*
 * Row 3 = { rcx low, rcx high, dl, r8b }.
 * Only the low 8 bits of rdx and r8 carry the arguments, so zero-extend them
 * before combining. Shifting and adding the full 64-bit registers would let
 * stray upper bits leak into state words 14 and 15. This mirrors the
 * zero-extension done in blake3_compress_xof_sse2.
 */
movzx eax, r8b
movzx edx, dl
shl rax, 32
add rdx, rax
movq xmm3, rcx
movq xmm4, rdx
punpcklqdq xmm3, xmm4
/*
 * Load the 16 message words and split them by parity:
 *   shufps imm 136 (0x88) keeps the even-indexed dwords of the pair,
 *   shufps imm 221 (0xDD) keeps the odd-indexed dwords.
 * xmm4/xmm5 feed the column step; xmm6/xmm7 are additionally rotated with
 * pshufd 0x93 so they line up with the diagonalized state.
 */
movups xmm4, xmmword ptr [rsi]
movups xmm5, xmmword ptr [rsi+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rsi+0x20]
movups xmm7, xmmword ptr [rsi+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
/* Round counter: the loop body below runs 7 times. */
mov al, 7
9:
/* Column step, first half of g(): a += b + x; d = rotr(d ^ a, 16). */
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
/* Swapping the 16-bit halves of every dword is a rotation by 16. */
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
/* c += d; b = rotr(b ^ c, 12), built from two shifts and an OR. */
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
/* Second half of g(): a += b + y; d = rotr(d ^ a, 8). */
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
/* c += d; b = rotr(b ^ c, 7). */
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Rotate rows 0, 3 and 2 relative to row 1 so the diagonals become columns. */
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
/* Diagonal step: the same g() sequence with message vectors xmm6 / xmm7. */
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Undo the row rotation (inverse shuffles of the ones above). */
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
/* Leave the loop after the last round; no message permutation is needed then. */
dec al
jz 9f
/*
 * Permute the message vectors xmm4..xmm7 for the next round.
 * Each pand/pand/por triple merges two vectors lane-wise using the
 * PBLENDW_*_MASK constants, standing in for a pblendw instruction.
 */
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
/* Feed-forward: new chaining value = rows 0-1 XOR rows 2-3, stored in place. */
pxor xmm0, xmm2
pxor xmm1, xmm3
movups xmmword ptr [rdi], xmm0
movups xmmword ptr [rdi+0x10], xmm1
ret
.p2align 6
/*
 * blake3_compress_xof_sse2
 *
 * Runs the 7-round compression function on one 64-byte block and writes the
 * full 64-byte result to a separate output buffer. The input chaining value
 * is read but never modified.
 *
 * Register arguments as used below:
 *   rdi - pointer to the 8-word chaining value (32 bytes, read only)
 *   rsi - pointer to the 64-byte message block (read with unaligned loads)
 *   dl  - 8-bit value placed in state word 14 (presumably block_len, following
 *         the parameter order of compress_pre)
 *   rcx - 64-bit value placed in state words 12-13 (presumably the counter)
 *   r8b - 8-bit value placed in state word 15 (presumably flags)
 *   r9  - pointer to the 64-byte output buffer (written with unaligned stores)
 *
 * Scratch registers: rax, rdx, xmm0-xmm11 and xmm14.
 *
 * State layout: xmm0..xmm3 hold rows 0..3 of the 4x4 word state, so every
 * SIMD instruction performs the same step of g() on four columns at once.
 */
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
_CET_ENDBR
/* Rows 0-1: the incoming chaining value. Row 2: the IV constants. */
movups xmm0, xmmword ptr [rdi]
movups xmm1, xmmword ptr [rdi+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/*
 * Row 3 = { rcx low, rcx high, dl, r8b }. The two byte arguments are
 * zero-extended first so that upper register bits cannot reach the state.
 */
movzx eax, r8b
movzx edx, dl
shl rax, 32
add rdx, rax
movq xmm3, rcx
movq xmm4, rdx
punpcklqdq xmm3, xmm4
/*
 * Load the 16 message words and split them by parity:
 *   shufps imm 136 (0x88) keeps the even-indexed dwords of the pair,
 *   shufps imm 221 (0xDD) keeps the odd-indexed dwords.
 * xmm4/xmm5 feed the column step; xmm6/xmm7 are additionally rotated with
 * pshufd 0x93 so they line up with the diagonalized state.
 */
movups xmm4, xmmword ptr [rsi]
movups xmm5, xmmword ptr [rsi+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rsi+0x20]
movups xmm7, xmmword ptr [rsi+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
/* Round counter: the loop body below runs 7 times. */
mov al, 7
9:
/* Column step, first half of g(): a += b + x; d = rotr(d ^ a, 16). */
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
/* Swapping the 16-bit halves of every dword is a rotation by 16. */
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
/* c += d; b = rotr(b ^ c, 12), built from two shifts and an OR. */
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
/* Second half of g(): a += b + y; d = rotr(d ^ a, 8). */
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
/* c += d; b = rotr(b ^ c, 7). */
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Rotate rows 0, 3 and 2 relative to row 1 so the diagonals become columns. */
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
/* Diagonal step: the same g() sequence with message vectors xmm6 / xmm7. */
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0xB1
pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
/* Undo the row rotation (inverse shuffles of the ones above). */
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
/* Leave the loop after the last round; no message permutation is needed then. */
dec al
jz 9f
/*
 * Permute the message vectors xmm4..xmm7 for the next round.
 * Each pand/pand/por triple merges two vectors lane-wise using the
 * PBLENDW_*_MASK constants, standing in for a pblendw instruction.
 */
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
/*
 * Output layout (64 bytes at r9):
 *   bytes  0..31 = rows 0-1 XOR rows 2-3
 *   bytes 32..63 = rows 2-3 XOR the original chaining value
 * The message vectors are dead here, so xmm4/xmm5 are reused to reload the
 * chaining value. Rows 0-1 are combined before rows 2-3 are overwritten.
 */
movdqu xmm4, xmmword ptr [rdi]
movdqu xmm5, xmmword ptr [rdi+0x10]
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm2, xmm4
pxor xmm3, xmm5
movups xmmword ptr [r9], xmm0
movups xmmword ptr [r9+0x10], xmm1
movups xmmword ptr [r9+0x20], xmm2
movups xmmword ptr [r9+0x30], xmm3
ret
/*
 * Read-only constant tables for the SSE2 routines.
 * Every table below is exactly four 32-bit words (16 bytes) and the first one
 * is 64-byte aligned, so each label is 16-byte aligned. That alignment is
 * needed because the code uses them as memory operands of movaps and pand.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align 6
/* IV words 0..3 as one vector; loaded into state row 2 by the compress routines. */
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
/* Lane offsets 0..3 and a per-lane step of 4. */
/* NOTE(review): presumably used for per-lane block counters in the hash_many path; the use site is outside this excerpt. */
ADD0:
.long 0, 1, 2, 3
ADD1:
.long 4, 4, 4, 4
/* Each of the four IV words above, replicated across all four lanes. */
BLAKE3_IV_0:
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 replicated across all four lanes. */
BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
/* Only the most significant bit of each lane set. */
/* NOTE(review): presumably used to turn a signed SSE2 compare into an unsigned one; confirm at the use site. */
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Lane-select masks used in pand/pand/por sequences to merge two vectors.
 * The names give the equivalent pblendw immediate, where bit i selects
 * 16-bit word i:
 *   0x33 -> dwords 0 and 2      0xCC -> dwords 1 and 3
 *   0x3F -> dwords 0, 1 and 2   0xC0 -> dword 3
 * Each pair (0x33/0xCC and 0x3F/0xC0) is complementary.
 */
PBLENDW_0x33_MASK:
.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
/*
 * On Linux ELF targets emit an empty .note.GNU-stack section. By GNU
 * toolchain convention this marks the object as not needing an executable
 * stack.
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
#endif
/*
 * Preamble for the AVX-512 routines.
 *
 * When building ELF with CET enabled, include <cet.h> if the preprocessor
 * supports __has_include and the header is present; it is expected to supply
 * _CET_ENDBR.
 */
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
#endif
#endif
/* Otherwise fall back to an empty _CET_ENDBR so the entry points still assemble. */
#if !defined(_CET_ENDBR)
#define _CET_ENDBR
#endif
/* Intel operand order (destination first), register names without a % prefix. */
.intel_syntax noprefix
/*
 * Export every entry point under both its plain name and an
 * underscore-prefixed alias.
 * NOTE(review): the alias is presumably for platforms that prepend an
 * underscore to C symbols (e.g. Mach-O); confirm against the build targets.
 */
.global _blake3_hash_many_avx512
.global blake3_hash_many_avx512
.global blake3_compress_in_place_avx512
.global _blake3_compress_in_place_avx512
.global blake3_compress_xof_avx512
.global _blake3_compress_xof_avx512
/* Switch to the code section; Apple and non-Apple builds use different directive spellings. */
#ifdef __APPLE__
.text
#else
.section .text
#endif
/* Align the first function that follows to a 64-byte boundary. */
.p2align 6
_blake3_hash_many_avx512:
blake3_hash_many_avx512:
_CET_ENDBR
push r15
push r14
push r13
push r12
push rbx
push rbp
mov rbp, rsp
sub rsp, 144
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9
kmovw k1, r9d
vmovd xmm0, r8d
vpbroadcastd ymm0, xmm0
shr r8, 32
vmovd xmm1, r8d
vpbroadcastd ymm1, xmm1
vmovdqa ymm4, ymm1
vmovdqa ymm5, ymm1
vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
vpcmpltud k2, ymm2, ymm0
vpcmpltud k3, ymm3, ymm0
vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
knotw k2, k1
vmovdqa32 ymm2 {k2}, ymm0
vmovdqa32 ymm3 {k2}, ymm0
vmovdqa32 ymm4 {k2}, ymm1
vmovdqa32 ymm5 {k2}, ymm1
vmovdqa ymmword ptr [rsp], ymm2
vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
shl rdx, 6
mov qword ptr [rsp+0x80], rdx
cmp rsi, 16
jc 3f
2:
vpbroadcastd zmm0, dword ptr [rcx]
vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
xor edx, edx
.p2align 5
9:
movzx ebx, byte ptr [rbp+0x48]
or ebx, eax
add rdx, 64
cmp rdx, qword ptr [rsp+0x80]
cmove eax, ebx
mov dword ptr [rsp+0x88], eax
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
mov r12, qword ptr [rdi+0x40]
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm8, zmm16, zmm17
vpunpckhqdq zmm9, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
mov r11, qword ptr [rdi+0x38]
mov r12, qword ptr [rdi+0x60]
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm12, zmm16, zmm17
vpunpckhqdq zmm13, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm14, zmm18, zmm19
vpunpckhqdq zmm15, zmm18, zmm19
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
vshufps zmm16, zmm8, zmm10, 136
vshufps zmm17, zmm12, zmm14, 136
vmovdqa32 zmm20, zmm16
vpermt2d zmm16, zmm27, zmm17
vpermt2d zmm20, zmm31, zmm17
vshufps zmm17, zmm8, zmm10, 221
vshufps zmm30, zmm12, zmm14, 221
vmovdqa32 zmm21, zmm17
vpermt2d zmm17, zmm27, zmm30
vpermt2d zmm21, zmm31, zmm30
vshufps zmm18, zmm9, zmm11, 136
vshufps zmm8, zmm13, zmm15, 136
vmovdqa32 zmm22, zmm18
vpermt2d zmm18, zmm27, zmm8
vpermt2d zmm22, zmm31, zmm8
vshufps zmm19, zmm9, zmm11, 221
vshufps zmm8, zmm13, zmm15, 221
vmovdqa32 zmm23, zmm19
vpermt2d zmm19, zmm27, zmm8
vpermt2d zmm23, zmm31, zmm8
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
mov r12, qword ptr [rdi+0x40]
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm8, zmm24, zmm25
vpunpckhqdq zmm9, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm10, zmm24, zmm25
vpunpckhqdq zmm11, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r12+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r13+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
mov r11, qword ptr [rdi+0x38]
mov r12, qword ptr [rdi+0x60]
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm12, zmm24, zmm25
vpunpckhqdq zmm13, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm14, zmm24, zmm25
vpunpckhqdq zmm15, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r12+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r13+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
vshufps zmm24, zmm8, zmm10, 136
vshufps zmm30, zmm12, zmm14, 136
vmovdqa32 zmm28, zmm24
vpermt2d zmm24, zmm27, zmm30
vpermt2d zmm28, zmm31, zmm30
vshufps zmm25, zmm8, zmm10, 221
vshufps zmm30, zmm12, zmm14, 221
vmovdqa32 zmm29, zmm25
vpermt2d zmm25, zmm27, zmm30
vpermt2d zmm29, zmm31, zmm30
vshufps zmm26, zmm9, zmm11, 136
vshufps zmm8, zmm13, zmm15, 136
vmovdqa32 zmm30, zmm26
vpermt2d zmm26, zmm27, zmm8
vpermt2d zmm30, zmm31, zmm8
vshufps zmm8, zmm9, zmm11, 221
vshufps zmm10, zmm13, zmm15, 221
vpermi2d zmm27, zmm8, zmm10
vpermi2d zmm31, zmm8, zmm10
vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
vmovdqa32 zmm12, zmmword ptr [rsp]
vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
vpaddd zmm0, zmm0, zmm16
vpaddd zmm1, zmm1, zmm18
vpaddd zmm2, zmm2, zmm20
vpaddd zmm3, zmm3, zmm22
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vprord zmm15, zmm15, 16
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 12
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vpaddd zmm0, zmm0, zmm17
vpaddd zmm1, zmm1, zmm19
vpaddd zmm2, zmm2, zmm21
vpaddd zmm3, zmm3, zmm23
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vprord zmm15, zmm15, 8
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 7
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vpaddd zmm0, zmm0, zmm24
vpaddd zmm1, zmm1, zmm26
vpaddd zmm2, zmm2, zmm28
vpaddd zmm3, zmm3, zmm30
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 16
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vprord zmm4, zmm4, 12
vpaddd zmm0, zmm0, zmm25
vpaddd zmm1, zmm1, zmm27
vpaddd zmm2, zmm2, zmm29
vpaddd zmm3, zmm3, zmm31
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 8
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vprord zmm4, zmm4, 7
vpaddd zmm0, zmm0, zmm18
vpaddd zmm1, zmm1, zmm19
vpaddd zmm2, zmm2, zmm23
vpaddd zmm3, zmm3, zmm20
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vprord zmm15, zmm15, 16
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 12
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vpaddd zmm0, zmm0, zmm22
vpaddd zmm1, zmm1, zmm26
vpaddd zmm2, zmm2, zmm16
vpaddd zmm3, zmm3, zmm29
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vprord zmm15, zmm15, 8
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 7
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vpaddd zmm0, zmm0, zmm17
vpaddd zmm1, zmm1, zmm28
vpaddd zmm2, zmm2, zmm25
vpaddd zmm3, zmm3, zmm31
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 16
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vprord zmm4, zmm4, 12
vpaddd zmm0, zmm0, zmm27
vpaddd zmm1, zmm1, zmm21
vpaddd zmm2, zmm2, zmm30
vpaddd zmm3, zmm3, zmm24
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 8
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vprord zmm4, zmm4, 7
vpaddd zmm0, zmm0, zmm19
vpaddd zmm1, zmm1, zmm26
vpaddd zmm2, zmm2, zmm29
vpaddd zmm3, zmm3, zmm23
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vprord zmm15, zmm15, 16
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 12
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vpaddd zmm0, zmm0, zmm20
vpaddd zmm1, zmm1, zmm28
vpaddd zmm2, zmm2, zmm18
vpaddd zmm3, zmm3, zmm30
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vprord zmm15, zmm15, 8
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 7
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vpaddd zmm0, zmm0, zmm22
vpaddd zmm1, zmm1, zmm25
vpaddd zmm2, zmm2, zmm27
vpaddd zmm3, zmm3, zmm24
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 16
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vprord zmm4, zmm4, 12
vpaddd zmm0, zmm0, zmm21
vpaddd zmm1, zmm1, zmm16
vpaddd zmm2, zmm2, zmm31
vpaddd zmm3, zmm3, zmm17
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 8
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vprord zmm4, zmm4, 7
vpaddd zmm0, zmm0, zmm26
vpaddd zmm1, zmm1, zmm28
vpaddd zmm2, zmm2, zmm30
vpaddd zmm3, zmm3, zmm29
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vprord zmm15, zmm15, 16
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 12
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vpaddd zmm0, zmm0, zmm23
vpaddd zmm1, zmm1, zmm25
vpaddd zmm2, zmm2, zmm19
vpaddd zmm3, zmm3, zmm31
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vprord zmm15, zmm15, 8
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 7
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vpaddd zmm0, zmm0, zmm20
vpaddd zmm1, zmm1, zmm27
vpaddd zmm2, zmm2, zmm21
vpaddd zmm3, zmm3, zmm17
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 16
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vprord zmm4, zmm4, 12
vpaddd zmm0, zmm0, zmm16
vpaddd zmm1, zmm1, zmm18
vpaddd zmm2, zmm2, zmm24
vpaddd zmm3, zmm3, zmm22
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 8
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vprord zmm4, zmm4, 7
vpaddd zmm0, zmm0, zmm28
vpaddd zmm1, zmm1, zmm25
vpaddd zmm2, zmm2, zmm31
vpaddd zmm3, zmm3, zmm30
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vprord zmm15, zmm15, 16
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 12
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vpaddd zmm0, zmm0, zmm29
vpaddd zmm1, zmm1, zmm27
vpaddd zmm2, zmm2, zmm26
vpaddd zmm3, zmm3, zmm24
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vprord zmm15, zmm15, 8
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 7
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vpaddd zmm0, zmm0, zmm23
vpaddd zmm1, zmm1, zmm21
vpaddd zmm2, zmm2, zmm16
vpaddd zmm3, zmm3, zmm22
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 16
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vprord zmm4, zmm4, 12
vpaddd zmm0, zmm0, zmm18
vpaddd zmm1, zmm1, zmm19
vpaddd zmm2, zmm2, zmm17
vpaddd zmm3, zmm3, zmm20
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 8
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vprord zmm4, zmm4, 7
vpaddd zmm0, zmm0, zmm25
vpaddd zmm1, zmm1, zmm27
vpaddd zmm2, zmm2, zmm24
vpaddd zmm3, zmm3, zmm31
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vprord zmm15, zmm15, 16
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 12
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vpaddd zmm0, zmm0, zmm30
vpaddd zmm1, zmm1, zmm21
vpaddd zmm2, zmm2, zmm28
vpaddd zmm3, zmm3, zmm17
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vprord zmm15, zmm15, 8
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 7
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vpaddd zmm0, zmm0, zmm29
vpaddd zmm1, zmm1, zmm16
vpaddd zmm2, zmm2, zmm18
vpaddd zmm3, zmm3, zmm20
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 16
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vprord zmm4, zmm4, 12
vpaddd zmm0, zmm0, zmm19
vpaddd zmm1, zmm1, zmm26
vpaddd zmm2, zmm2, zmm22
vpaddd zmm3, zmm3, zmm23
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 8
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vprord zmm4, zmm4, 7
vpaddd zmm0, zmm0, zmm27
vpaddd zmm1, zmm1, zmm21
vpaddd zmm2, zmm2, zmm17
vpaddd zmm3, zmm3, zmm24
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vprord zmm15, zmm15, 16
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 12
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vpaddd zmm0, zmm0, zmm31
vpaddd zmm1, zmm1, zmm16
vpaddd zmm2, zmm2, zmm25
vpaddd zmm3, zmm3, zmm22
vpaddd zmm0, zmm0, zmm4
vpaddd zmm1, zmm1, zmm5
vpaddd zmm2, zmm2, zmm6
vpaddd zmm3, zmm3, zmm7
vpxord zmm12, zmm12, zmm0
vpxord zmm13, zmm13, zmm1
vpxord zmm14, zmm14, zmm2
vpxord zmm15, zmm15, zmm3
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vprord zmm15, zmm15, 8
vpaddd zmm8, zmm8, zmm12
vpaddd zmm9, zmm9, zmm13
vpaddd zmm10, zmm10, zmm14
vpaddd zmm11, zmm11, zmm15
vpxord zmm4, zmm4, zmm8
vpxord zmm5, zmm5, zmm9
vpxord zmm6, zmm6, zmm10
vpxord zmm7, zmm7, zmm11
vprord zmm4, zmm4, 7
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vpaddd zmm0, zmm0, zmm30
vpaddd zmm1, zmm1, zmm18
vpaddd zmm2, zmm2, zmm19
vpaddd zmm3, zmm3, zmm23
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 16
vprord zmm12, zmm12, 16
vprord zmm13, zmm13, 16
vprord zmm14, zmm14, 16
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 12
vprord zmm6, zmm6, 12
vprord zmm7, zmm7, 12
vprord zmm4, zmm4, 12
vpaddd zmm0, zmm0, zmm26
vpaddd zmm1, zmm1, zmm28
vpaddd zmm2, zmm2, zmm20
vpaddd zmm3, zmm3, zmm29
vpaddd zmm0, zmm0, zmm5
vpaddd zmm1, zmm1, zmm6
vpaddd zmm2, zmm2, zmm7
vpaddd zmm3, zmm3, zmm4
vpxord zmm15, zmm15, zmm0
vpxord zmm12, zmm12, zmm1
vpxord zmm13, zmm13, zmm2
vpxord zmm14, zmm14, zmm3
vprord zmm15, zmm15, 8
vprord zmm12, zmm12, 8
vprord zmm13, zmm13, 8
vprord zmm14, zmm14, 8
vpaddd zmm10, zmm10, zmm15
vpaddd zmm11, zmm11, zmm12
vpaddd zmm8, zmm8, zmm13
vpaddd zmm9, zmm9, zmm14
vpxord zmm5, zmm5, zmm10
vpxord zmm6, zmm6, zmm11
vpxord zmm7, zmm7, zmm8
vpxord zmm4, zmm4, zmm9
vprord zmm5, zmm5, 7
vprord zmm6, zmm6, 7
vprord zmm7, zmm7, 7
vprord zmm4, zmm4, 7
vpxord zmm0, zmm0, zmm8
vpxord zmm1, zmm1, zmm9
vpxord zmm2, zmm2, zmm10
vpxord zmm3, zmm3, zmm11
vpxord zmm4, zmm4, zmm12
vpxord zmm5, zmm5, zmm13
vpxord zmm6, zmm6, zmm14
vpxord zmm7, zmm7, zmm15
movzx eax, byte ptr [rbp+0x38]
jne 9b
mov rbx, qword ptr [rbp+0x50]
vpunpckldq zmm16, zmm0, zmm1
vpunpckhdq zmm17, zmm0, zmm1
vpunpckldq zmm18, zmm2, zmm3
vpunpckhdq zmm19, zmm2, zmm3
vpunpckldq zmm20, zmm4, zmm5
vpunpckhdq zmm21, zmm4, zmm5
vpunpckldq zmm22, zmm6, zmm7
vpunpckhdq zmm23, zmm6, zmm7
vpunpcklqdq zmm0, zmm16, zmm18
vpunpckhqdq zmm1, zmm16, zmm18
vpunpcklqdq zmm2, zmm17, zmm19
vpunpckhqdq zmm3, zmm17, zmm19
vpunpcklqdq zmm4, zmm20, zmm22
vpunpckhqdq zmm5, zmm20, zmm22
vpunpcklqdq zmm6, zmm21, zmm23
vpunpckhqdq zmm7, zmm21, zmm23
vshufi32x4 zmm16, zmm0, zmm4, 0x88
vshufi32x4 zmm17, zmm1, zmm5, 0x88
vshufi32x4 zmm18, zmm2, zmm6, 0x88
vshufi32x4 zmm19, zmm3, zmm7, 0x88
vshufi32x4 zmm20, zmm0, zmm4, 0xDD
vshufi32x4 zmm21, zmm1, zmm5, 0xDD
vshufi32x4 zmm22, zmm2, zmm6, 0xDD
vshufi32x4 zmm23, zmm3, zmm7, 0xDD
vshufi32x4 zmm0, zmm16, zmm17, 0x88
vshufi32x4 zmm1, zmm18, zmm19, 0x88
vshufi32x4 zmm2, zmm20, zmm21, 0x88
vshufi32x4 zmm3, zmm22, zmm23, 0x88
vshufi32x4 zmm4, zmm16, zmm17, 0xDD
vshufi32x4 zmm5, zmm18, zmm19, 0xDD
vshufi32x4 zmm6, zmm20, zmm21, 0xDD
vshufi32x4 zmm7, zmm22, zmm23, 0xDD
vmovdqu32 zmmword ptr [rbx], zmm0
vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
vmovdqa32 zmm0, zmmword ptr [rsp]
vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
vmovdqa32 zmm2, zmm0
vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
vpcmpltud k2, zmm2, zmm0
vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
add rdi, 128
add rbx, 512
mov qword ptr [rbp+0x50], rbx
sub rsi, 16
cmp rsi, 16
jnc 2b
test rsi, rsi
jnz 3f
/* Common exit path. Reached by fall-through from the wide loop above once
   rsi (remaining inputs) is zero; other jumps to this label may exist
   outside this view. */
4:
/* Clear the upper bits of the vector registers before returning, so the
   caller does not pay AVX/SSE transition penalties. */
vzeroupper
/* Drop the scratch area addressed through rsp: rbp holds the stack pointer
   value to return to (presumably saved by the prologue, not visible here). */
mov rsp, rbp
/* Restore the saved general-purpose registers and return. */
pop rbp
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
.p2align 6
/* Tail handling: entered when fewer than 16 inputs remain (rsi != 0).
   This section processes 8 inputs at once if bit 3 of the remaining
   count is set, otherwise it jumps to the next, narrower section. */
3:
test esi, 0x8
je 3f
/* State rows 0..7: broadcast the eight 32-bit words at [rcx] so every lane
   (one lane per input) starts from the same value.
   NOTE(review): rcx is presumably the key / chaining value pointer -- confirm
   against the prologue and the C prototype. */
vpbroadcastd ymm0, dword ptr [rcx]
vpbroadcastd ymm1, dword ptr [rcx+0x4]
vpbroadcastd ymm2, dword ptr [rcx+0x8]
vpbroadcastd ymm3, dword ptr [rcx+0xC]
vpbroadcastd ymm4, dword ptr [rcx+0x10]
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
/* Fetch the next eight input pointers from the pointer array at rdi. */
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
mov r12, qword ptr [rdi+0x20]
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
/* Flags for the first block: the byte at [rbp+0x38] OR the byte at
   [rbp+0x40] (presumably `flags` and `flags_start` -- confirm). */
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
/* rdx = byte offset of the current block within each input. */
xor edx, edx
/* Top of the per-block loop for the 8-input path. */
2:
/* ebx = current flags with the byte at [rbp+0x48] OR-ed in (presumably
   `flags_end` -- confirm). It replaces eax only when the advanced offset
   equals the value at [rsp+0x80], i.e. on the last block. The flags set by
   this cmp are also what the loop's closing `jne 2b` tests: nothing in
   between writes EFLAGS. */
movzx ebx, byte ptr [rbp+0x48]
or ebx, eax
add rdx, 64
cmp rdx, qword ptr [rsp+0x80]
cmove eax, ebx
/* Spill the flags word so it can be broadcast into a vector register. */
mov dword ptr [rsp+0x88], eax
/* Load and transpose the 64-byte block of all eight inputs, 16 bytes per
   input at a time. Inputs r8..r11 fill the low 128-bit halves and r12..r15
   the high halves; unpck*pd + shufps (136 = even dwords, 221 = odd dwords)
   turn four dwords x eight inputs into four registers that each hold one
   message word for all eight inputs. */
/* Block bytes 0..15: message words 0..3 -> ymm16..ymm19. */
vmovups xmm8, xmmword ptr [r8+rdx-0x40]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x40]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x40]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x40]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm16, ymm12, ymm14, 136
vshufps ymm17, ymm12, ymm14, 221
vshufps ymm18, ymm13, ymm15, 136
vshufps ymm19, ymm13, ymm15, 221
/* Block bytes 16..31: message words 4..7 -> ymm20..ymm23. */
vmovups xmm8, xmmword ptr [r8+rdx-0x30]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x30]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x30]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x30]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm20, ymm12, ymm14, 136
vshufps ymm21, ymm12, ymm14, 221
vshufps ymm22, ymm13, ymm15, 136
vshufps ymm23, ymm13, ymm15, 221
/* Block bytes 32..47: message words 8..11 -> ymm24..ymm27. */
vmovups xmm8, xmmword ptr [r8+rdx-0x20]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x20]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x20]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x20]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm24, ymm12, ymm14, 136
vshufps ymm25, ymm12, ymm14, 221
vshufps ymm26, ymm13, ymm15, 136
vshufps ymm27, ymm13, ymm15, 221
/* Block bytes 48..63: message words 12..15 -> ymm28..ymm31. */
vmovups xmm8, xmmword ptr [r8+rdx-0x10]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x10]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x10]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x10]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm28, ymm12, ymm14, 136
vshufps ymm29, ymm12, ymm14, 221
vshufps ymm30, ymm13, ymm15, 136
vshufps ymm31, ymm13, ymm15, 221
/* Initialise state rows 8..15 for this block (rows 0..7 carry the running
   chaining value in ymm0..ymm7). */
/* Rows 8..11: the first four IV constants, identical in every lane. */
vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
/* Rows 12..13: per-lane values kept on the stack at [rsp] and [rsp+0x40]
   (presumably the low and high 32 bits of each input's counter -- the code
   that fills them is outside this view). */
vmovdqa ymm12, ymmword ptr [rsp]
vmovdqa ymm13, ymmword ptr [rsp+0x40]
/* Row 14: block length constant; row 15: the flags word spilled at the top
   of the loop. */
vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
vpbroadcastd ymm15, dword ptr [rsp+0x88]
/* Round 1 of 7 (8-input path). Registers: a = ymm0..3, b = ymm4..7,
   c = ymm8..11, d = ymm12..15, one input per 32-bit lane. Each half-round
   is the same add / xor / rotate-right sequence (16, 12, 8, 7) as g() in the
   C code at the top of this file, applied to four columns (or diagonals)
   at once. Message words live in ymm16..ymm31 (word i in ymm(16+i)). */
/* Column step: a += m0, m2, m4, m6. */
vpaddd ymm0, ymm0, ymm16
vpaddd ymm1, ymm1, ymm18
vpaddd ymm2, ymm2, ymm20
vpaddd ymm3, ymm3, ymm22
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vprord ymm15, ymm15, 16
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 12
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
/* Column step, second half: a += m1, m3, m5, m7. */
vpaddd ymm0, ymm0, ymm17
vpaddd ymm1, ymm1, ymm19
vpaddd ymm2, ymm2, ymm21
vpaddd ymm3, ymm3, ymm23
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vprord ymm15, ymm15, 8
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 7
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
/* Diagonal step: same sequence with b, c, d rotated by one, two and three
   positions (b = ymm5,6,7,4; c = ymm10,11,8,9; d = ymm15,12,13,14).
   a += m8, m10, m12, m14. */
vpaddd ymm0, ymm0, ymm24
vpaddd ymm1, ymm1, ymm26
vpaddd ymm2, ymm2, ymm28
vpaddd ymm3, ymm3, ymm30
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 16
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
vprord ymm4, ymm4, 12
/* Diagonal step, second half: a += m9, m11, m13, m15. */
vpaddd ymm0, ymm0, ymm25
vpaddd ymm1, ymm1, ymm27
vpaddd ymm2, ymm2, ymm29
vpaddd ymm3, ymm3, ymm31
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 8
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
vprord ymm4, ymm4, 7
/* Round 2 of 7 (8-input path). Same structure as every round: a = ymm0..3,
   b = ymm4..7, c = ymm8..11, d = ymm12..15, rotations 16, 12, 8, 7. Only the
   message registers change; instead of physically permuting the message,
   the unrolled code names the permuted registers directly (word i is in
   ymm(16+i)). */
/* Column step: a += m2, m3, m7, m4. */
vpaddd ymm0, ymm0, ymm18
vpaddd ymm1, ymm1, ymm19
vpaddd ymm2, ymm2, ymm23
vpaddd ymm3, ymm3, ymm20
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vprord ymm15, ymm15, 16
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 12
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
/* Column step, second half: a += m6, m10, m0, m13. */
vpaddd ymm0, ymm0, ymm22
vpaddd ymm1, ymm1, ymm26
vpaddd ymm2, ymm2, ymm16
vpaddd ymm3, ymm3, ymm29
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vprord ymm15, ymm15, 8
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 7
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
/* Diagonal step (b = ymm5,6,7,4; c = ymm10,11,8,9; d = ymm15,12,13,14):
   a += m1, m12, m9, m15. */
vpaddd ymm0, ymm0, ymm17
vpaddd ymm1, ymm1, ymm28
vpaddd ymm2, ymm2, ymm25
vpaddd ymm3, ymm3, ymm31
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 16
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
vprord ymm4, ymm4, 12
/* Diagonal step, second half: a += m11, m5, m14, m8. */
vpaddd ymm0, ymm0, ymm27
vpaddd ymm1, ymm1, ymm21
vpaddd ymm2, ymm2, ymm30
vpaddd ymm3, ymm3, ymm24
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 8
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
vprord ymm4, ymm4, 7
/* Round 3 of 7 (8-input path). a = ymm0..3, b = ymm4..7, c = ymm8..11,
   d = ymm12..15; rotations 16, 12, 8, 7. Message word i is in ymm(16+i);
   the permuted schedule is expressed by register choice. */
/* Column step: a += m3, m10, m13, m7. */
vpaddd ymm0, ymm0, ymm19
vpaddd ymm1, ymm1, ymm26
vpaddd ymm2, ymm2, ymm29
vpaddd ymm3, ymm3, ymm23
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vprord ymm15, ymm15, 16
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 12
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
/* Column step, second half: a += m4, m12, m2, m14. */
vpaddd ymm0, ymm0, ymm20
vpaddd ymm1, ymm1, ymm28
vpaddd ymm2, ymm2, ymm18
vpaddd ymm3, ymm3, ymm30
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vprord ymm15, ymm15, 8
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 7
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
/* Diagonal step (b = ymm5,6,7,4; c = ymm10,11,8,9; d = ymm15,12,13,14):
   a += m6, m9, m11, m8. */
vpaddd ymm0, ymm0, ymm22
vpaddd ymm1, ymm1, ymm25
vpaddd ymm2, ymm2, ymm27
vpaddd ymm3, ymm3, ymm24
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 16
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
vprord ymm4, ymm4, 12
/* Diagonal step, second half: a += m5, m0, m15, m1. */
vpaddd ymm0, ymm0, ymm21
vpaddd ymm1, ymm1, ymm16
vpaddd ymm2, ymm2, ymm31
vpaddd ymm3, ymm3, ymm17
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 8
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
vprord ymm4, ymm4, 7
/* Round 4 of 7 (8-input path). a = ymm0..3, b = ymm4..7, c = ymm8..11,
   d = ymm12..15; rotations 16, 12, 8, 7. Message word i is in ymm(16+i);
   the permuted schedule is expressed by register choice. */
/* Column step: a += m10, m12, m14, m13. */
vpaddd ymm0, ymm0, ymm26
vpaddd ymm1, ymm1, ymm28
vpaddd ymm2, ymm2, ymm30
vpaddd ymm3, ymm3, ymm29
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vprord ymm15, ymm15, 16
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 12
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
/* Column step, second half: a += m7, m9, m3, m15. */
vpaddd ymm0, ymm0, ymm23
vpaddd ymm1, ymm1, ymm25
vpaddd ymm2, ymm2, ymm19
vpaddd ymm3, ymm3, ymm31
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vprord ymm15, ymm15, 8
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 7
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
/* Diagonal step (b = ymm5,6,7,4; c = ymm10,11,8,9; d = ymm15,12,13,14):
   a += m4, m11, m5, m1. */
vpaddd ymm0, ymm0, ymm20
vpaddd ymm1, ymm1, ymm27
vpaddd ymm2, ymm2, ymm21
vpaddd ymm3, ymm3, ymm17
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 16
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
vprord ymm4, ymm4, 12
/* Diagonal step, second half: a += m0, m2, m8, m6. */
vpaddd ymm0, ymm0, ymm16
vpaddd ymm1, ymm1, ymm18
vpaddd ymm2, ymm2, ymm24
vpaddd ymm3, ymm3, ymm22
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 8
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
vprord ymm4, ymm4, 7
/* Round 5 of 7 (8-input path). a = ymm0..3, b = ymm4..7, c = ymm8..11,
   d = ymm12..15; rotations 16, 12, 8, 7. Message word i is in ymm(16+i);
   the permuted schedule is expressed by register choice. */
/* Column step: a += m12, m9, m15, m14. */
vpaddd ymm0, ymm0, ymm28
vpaddd ymm1, ymm1, ymm25
vpaddd ymm2, ymm2, ymm31
vpaddd ymm3, ymm3, ymm30
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vprord ymm15, ymm15, 16
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 12
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
/* Column step, second half: a += m13, m11, m10, m8. */
vpaddd ymm0, ymm0, ymm29
vpaddd ymm1, ymm1, ymm27
vpaddd ymm2, ymm2, ymm26
vpaddd ymm3, ymm3, ymm24
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vprord ymm15, ymm15, 8
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 7
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
/* Diagonal step (b = ymm5,6,7,4; c = ymm10,11,8,9; d = ymm15,12,13,14):
   a += m7, m5, m0, m6. */
vpaddd ymm0, ymm0, ymm23
vpaddd ymm1, ymm1, ymm21
vpaddd ymm2, ymm2, ymm16
vpaddd ymm3, ymm3, ymm22
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 16
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
vprord ymm4, ymm4, 12
/* Diagonal step, second half: a += m2, m3, m1, m4. */
vpaddd ymm0, ymm0, ymm18
vpaddd ymm1, ymm1, ymm19
vpaddd ymm2, ymm2, ymm17
vpaddd ymm3, ymm3, ymm20
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 8
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
vprord ymm4, ymm4, 7
/* Round 6 of 7 (8-input path). a = ymm0..3, b = ymm4..7, c = ymm8..11,
   d = ymm12..15; rotations 16, 12, 8, 7. Message word i is in ymm(16+i);
   the permuted schedule is expressed by register choice. */
/* Column step: a += m9, m11, m8, m15. */
vpaddd ymm0, ymm0, ymm25
vpaddd ymm1, ymm1, ymm27
vpaddd ymm2, ymm2, ymm24
vpaddd ymm3, ymm3, ymm31
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vprord ymm15, ymm15, 16
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 12
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
/* Column step, second half: a += m14, m5, m12, m1. */
vpaddd ymm0, ymm0, ymm30
vpaddd ymm1, ymm1, ymm21
vpaddd ymm2, ymm2, ymm28
vpaddd ymm3, ymm3, ymm17
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vprord ymm15, ymm15, 8
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 7
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
/* Diagonal step (b = ymm5,6,7,4; c = ymm10,11,8,9; d = ymm15,12,13,14):
   a += m13, m0, m2, m4. */
vpaddd ymm0, ymm0, ymm29
vpaddd ymm1, ymm1, ymm16
vpaddd ymm2, ymm2, ymm18
vpaddd ymm3, ymm3, ymm20
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 16
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
vprord ymm4, ymm4, 12
/* Diagonal step, second half: a += m3, m10, m6, m7. */
vpaddd ymm0, ymm0, ymm19
vpaddd ymm1, ymm1, ymm26
vpaddd ymm2, ymm2, ymm22
vpaddd ymm3, ymm3, ymm23
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 8
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
vprord ymm4, ymm4, 7
/* Round 7 of 7 (8-input path), the last round before the feed-forward.
   a = ymm0..3, b = ymm4..7, c = ymm8..11, d = ymm12..15; rotations
   16, 12, 8, 7. Message word i is in ymm(16+i); the permuted schedule is
   expressed by register choice. */
/* Column step: a += m11, m5, m1, m8. */
vpaddd ymm0, ymm0, ymm27
vpaddd ymm1, ymm1, ymm21
vpaddd ymm2, ymm2, ymm17
vpaddd ymm3, ymm3, ymm24
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vprord ymm15, ymm15, 16
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 12
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
/* Column step, second half: a += m15, m0, m9, m6. */
vpaddd ymm0, ymm0, ymm31
vpaddd ymm1, ymm1, ymm16
vpaddd ymm2, ymm2, ymm25
vpaddd ymm3, ymm3, ymm22
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxord ymm12, ymm12, ymm0
vpxord ymm13, ymm13, ymm1
vpxord ymm14, ymm14, ymm2
vpxord ymm15, ymm15, ymm3
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vprord ymm15, ymm15, 8
vpaddd ymm8, ymm8, ymm12
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxord ymm4, ymm4, ymm8
vpxord ymm5, ymm5, ymm9
vpxord ymm6, ymm6, ymm10
vpxord ymm7, ymm7, ymm11
vprord ymm4, ymm4, 7
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
/* Diagonal step (b = ymm5,6,7,4; c = ymm10,11,8,9; d = ymm15,12,13,14):
   a += m14, m2, m3, m7. */
vpaddd ymm0, ymm0, ymm30
vpaddd ymm1, ymm1, ymm18
vpaddd ymm2, ymm2, ymm19
vpaddd ymm3, ymm3, ymm23
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 16
vprord ymm12, ymm12, 16
vprord ymm13, ymm13, 16
vprord ymm14, ymm14, 16
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 12
vprord ymm6, ymm6, 12
vprord ymm7, ymm7, 12
vprord ymm4, ymm4, 12
/* Diagonal step, second half: a += m10, m12, m4, m13. */
vpaddd ymm0, ymm0, ymm26
vpaddd ymm1, ymm1, ymm28
vpaddd ymm2, ymm2, ymm20
vpaddd ymm3, ymm3, ymm29
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxord ymm15, ymm15, ymm0
vpxord ymm12, ymm12, ymm1
vpxord ymm13, ymm13, ymm2
vpxord ymm14, ymm14, ymm3
vprord ymm15, ymm15, 8
vprord ymm12, ymm12, 8
vprord ymm13, ymm13, 8
vprord ymm14, ymm14, 8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm8, ymm13
vpaddd ymm9, ymm9, ymm14
vpxord ymm5, ymm5, ymm10
vpxord ymm6, ymm6, ymm11
vpxord ymm7, ymm7, ymm8
vpxord ymm4, ymm4, ymm9
vprord ymm5, ymm5, 7
vprord ymm6, ymm6, 7
vprord ymm7, ymm7, 7
vprord ymm4, ymm4, 7
/* Feed-forward: fold state rows 8..15 into rows 0..7. ymm0..ymm7 now hold
   the chaining value that feeds the next block (or the final output). */
vpxor ymm0, ymm0, ymm8
vpxor ymm1, ymm1, ymm9
vpxor ymm2, ymm2, ymm10
vpxor ymm3, ymm3, ymm11
vpxor ymm4, ymm4, ymm12
vpxor ymm5, ymm5, ymm13
vpxor ymm6, ymm6, ymm14
vpxor ymm7, ymm7, ymm15
/* Reset the flags accumulator to the base byte for the next block (movzx
   does not touch EFLAGS). The jne below still tests the `cmp rdx, ...` done
   at the top of the loop: loop again unless this was the last block. */
movzx eax, byte ptr [rbp+0x38]
jne 2b
/* All blocks done. rbx = current output pointer, kept at [rbp+0x50]. */
mov rbx, qword ptr [rbp+0x50]
/* Transpose back from "one row across eight inputs" to "eight rows of one
   input" and store. Each 32-byte store below writes rows 0..7 for a single
   input; inputs 0..3 come from the low 128-bit halves (vperm2f128 0x20),
   inputs 4..7 from the high halves (0x31). */
vunpcklps ymm8, ymm0, ymm1
vunpcklps ymm9, ymm2, ymm3
vunpckhps ymm10, ymm0, ymm1
vunpcklps ymm11, ymm4, ymm5
vunpcklps ymm0, ymm6, ymm7
vshufps ymm12, ymm8, ymm9, 78
vblendps ymm1, ymm8, ymm12, 0xCC
vshufps ymm8, ymm11, ymm0, 78
vunpckhps ymm13, ymm2, ymm3
vblendps ymm2, ymm11, ymm8, 0xCC
vblendps ymm3, ymm12, ymm9, 0xCC
vperm2f128 ymm12, ymm1, ymm2, 0x20
vmovups ymmword ptr [rbx], ymm12
vunpckhps ymm14, ymm4, ymm5
vblendps ymm4, ymm8, ymm0, 0xCC
vunpckhps ymm15, ymm6, ymm7
vperm2f128 ymm7, ymm3, ymm4, 0x20
vmovups ymmword ptr [rbx+0x20], ymm7
vshufps ymm5, ymm10, ymm13, 78
vblendps ymm6, ymm5, ymm13, 0xCC
vshufps ymm13, ymm14, ymm15, 78
vblendps ymm10, ymm10, ymm5, 0xCC
vblendps ymm14, ymm14, ymm13, 0xCC
vperm2f128 ymm8, ymm10, ymm14, 0x20
vmovups ymmword ptr [rbx+0x40], ymm8
vblendps ymm15, ymm13, ymm15, 0xCC
vperm2f128 ymm13, ymm6, ymm15, 0x20
vmovups ymmword ptr [rbx+0x60], ymm13
vperm2f128 ymm9, ymm1, ymm2, 0x31
vperm2f128 ymm11, ymm3, ymm4, 0x31
vmovups ymmword ptr [rbx+0x80], ymm9
vperm2f128 ymm14, ymm10, ymm14, 0x31
vperm2f128 ymm15, ymm6, ymm15, 0x31
vmovups ymmword ptr [rbx+0xA0], ymm11
vmovups ymmword ptr [rbx+0xC0], ymm14
vmovups ymmword ptr [rbx+0xE0], ymm15
/* Advance the per-lane values at [rsp] / [rsp+0x40] by eight lanes: under
   mask k1 the second 32 bytes of each 64-byte vector replace the first 32.
   k1 is set outside this view (presumably from the increment-counter
   argument -- confirm); with k1 clear the values are left unchanged. */
vmovdqa ymm0, ymmword ptr [rsp]
vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
vmovdqa ymmword ptr [rsp], ymm0
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
/* Consumed 8 inputs: 8 x 32 output bytes, 8 x 8 pointer bytes. */
add rbx, 256
mov qword ptr [rbp+0x50], rbx
add rdi, 64
sub rsi, 8
/* Entry for the remaining 0..7 inputs. The values cached here stay live for
   the narrower paths that follow: rbx = output pointer, r15 = the value at
   [rsp+0x80] that the block offset is compared against (end-of-input
   offset), r13 = byte at [rbp+0x38], r12 = byte at [rbp+0x48] (presumably
   `flags` and `flags_end` -- confirm against the prototype). */
3:
mov rbx, qword ptr [rbp+0x50]
mov r15, qword ptr [rsp+0x80]
movzx r13, byte ptr [rbp+0x38]
movzx r12, byte ptr [rbp+0x48]
/* 4-input path: taken only if bit 2 of the remaining count is set. */
test esi, 0x4
je 3f
/* Row-oriented layout: each 128-bit lane of a zmm register is one 4-word
   state row of one input. zmm0 / zmm1 = the two 16-byte halves at [rcx],
   replicated to all four lanes (rows 0 and 1). */
vbroadcasti32x4 zmm0, xmmword ptr [rcx]
vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
/* Build the row-3 template in zmm13: lane i = { [rsp][i], [rsp+0x40][i],
   BLAKE3_BLOCK_LEN, 0 }. The interleave pairs the two per-input dwords, and
   vpermq 0xDC spreads the pairs so each sits at the bottom of its own
   128-bit lane with zeros above. */
vmovdqa xmm12, xmmword ptr [rsp]
vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
vpunpckldq xmm14, xmm12, xmm13
vpunpckhdq xmm15, xmm12, xmm13
vpermq ymm14, ymm14, 0xDC
vpermq ymm15, ymm15, 0xDC
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
vinserti64x4 zmm13, zmm14, ymm15, 0x01
/* k2 = 0x4444: dword 2 of every lane receives the block length. */
mov eax, 17476
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
/* zmm15 = 16 bytes at BLAKE3_IV in every lane (row 2 initial value). */
vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
/* Four input pointers. */
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
/* k3 = 0xAAAA (odd dwords), k4 = 0x8888 (dword 3 of every lane); both are
   used by the blends in the block loop and the message permutation. */
mov eax, 43690
kmovw k3, eax
mov eax, 34952
kmovw k4, eax
/* Flags for the first block: byte at [rbp+0x40] OR the base byte in r13. */
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
/* rdx = byte offset of the current block. */
xor edx, edx
.p2align 5
/* Top of the per-block loop for the 4-input path.
   In:  eax = flags for this block before the last-block bits are applied,
        r12d = extra bits for the last block, r15 = end offset,
        rdx = offset of this block, r8..r11 = input pointers,
        zmm13 = row-3 template, zmm15 = IV row, k4 = 0x8888.
   Out: zmm2 / zmm3 = state rows 2 and 3, zmm4..zmm7 = message operands for
        the first round, al = 7 (round counter), rdx advanced by 64. */
2:
/* Keep the OR-ed flags only when the advanced offset equals r15 (last
   block); otherwise restore the saved value. */
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
mov dword ptr [rsp+0x88], eax
/* Row 2 = IV; row 3 = template with the flags word blended into dword 3 of
   every lane ([rsp+0x22*0x4] is the dword just stored at [rsp+0x88]). */
vmovdqa32 zmm2, zmm15
vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
vpblendmd zmm3 {k4}, zmm13, zmm8
/* Gather 16 bytes of the block from each of the four inputs into one zmm
   (lane i = input i). Only the low 128 bits of the first load are kept --
   lanes 1..3 are overwritten by the inserts -- so load exactly 16 bytes.
   A 64-byte load here would read up to 48 bytes past the end of the current
   block, i.e. past the end of input 0 on its last block. The VEX-encoded
   xmm load zeroes the upper lanes, so the final register contents are
   unchanged. */
vmovups xmm8, xmmword ptr [r8+rdx-0x1*0x40]
vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
vmovups xmm9, xmmword ptr [r8+rdx-0x30]
vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
/* zmm4 = even words (m0, m2, m4, m6), zmm5 = odd words (m1, m3, m5, m7). */
vshufps zmm4, zmm8, zmm9, 136
vshufps zmm5, zmm8, zmm9, 221
vmovups xmm8, xmmword ptr [r8+rdx-0x20]
vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
vmovups xmm9, xmmword ptr [r8+rdx-0x10]
vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
/* zmm6 = (m8, m10, m12, m14), zmm7 = (m9, m11, m13, m15), each then rotated
   by one dword (0x93) to line up with the diagonalised state rows. */
vshufps zmm6, zmm8, zmm9, 136
vshufps zmm7, zmm8, zmm9, 221
vpshufd zmm6, zmm6, 0x93
vpshufd zmm7, zmm7, 0x93
/* Round counter. This clobbers the low byte of eax; the flags value is
   reloaded from r13d after the rounds. */
mov al, 7
/* Round loop for the 4-input path; al counts down from 7. Rows: zmm0 = a,
   zmm1 = b, zmm2 = c, zmm3 = d, one input per 128-bit lane. zmm4..zmm7 hold
   the four message operand vectors for the current round. */
9:
/* Column step: the add / xor / rotate-right sequence (16, 12, 8, 7) of g()
   from the C code at the top of this file, on all four columns at once,
   using message operands zmm4 then zmm5. */
vpaddd zmm0, zmm0, zmm4
vpaddd zmm0, zmm0, zmm1
vpxord zmm3, zmm3, zmm0
vprord zmm3, zmm3, 16
vpaddd zmm2, zmm2, zmm3
vpxord zmm1, zmm1, zmm2
vprord zmm1, zmm1, 12
vpaddd zmm0, zmm0, zmm5
vpaddd zmm0, zmm0, zmm1
vpxord zmm3, zmm3, zmm0
vprord zmm3, zmm3, 8
vpaddd zmm2, zmm2, zmm3
vpxord zmm1, zmm1, zmm2
vprord zmm1, zmm1, 7
/* Diagonalise: rotate rows a, d and c within each lane (row b stays) so the
   diagonals line up as columns. */
vpshufd zmm0, zmm0, 0x93
vpshufd zmm3, zmm3, 0x4E
vpshufd zmm2, zmm2, 0x39
/* Diagonal step: same sequence with message operands zmm6 then zmm7. */
vpaddd zmm0, zmm0, zmm6
vpaddd zmm0, zmm0, zmm1
vpxord zmm3, zmm3, zmm0
vprord zmm3, zmm3, 16
vpaddd zmm2, zmm2, zmm3
vpxord zmm1, zmm1, zmm2
vprord zmm1, zmm1, 12
vpaddd zmm0, zmm0, zmm7
vpaddd zmm0, zmm0, zmm1
vpxord zmm3, zmm3, zmm0
vprord zmm3, zmm3, 8
vpaddd zmm2, zmm2, zmm3
vpxord zmm1, zmm1, zmm2
vprord zmm1, zmm1, 7
/* Undo the diagonalisation (inverse rotations). */
vpshufd zmm0, zmm0, 0x39
vpshufd zmm3, zmm3, 0x4E
vpshufd zmm2, zmm2, 0x93
/* Leave after the seventh round; the message shuffle below is skipped on
   the final pass. */
dec al
jz 9f
/* Rearrange zmm4..zmm7 in place into the operand vectors for the next round
   (presumably the BLAKE3 message permutation applied to this operand
   layout -- TODO confirm against MSG_SCHEDULE). zmm8 / zmm9 are scratch;
   k3 and k4 select odd dwords and dword 3 of each lane. */
vshufps zmm8, zmm4, zmm5, 214
vpshufd zmm9, zmm4, 0x0F
vpshufd zmm4, zmm8, 0x39
vshufps zmm8, zmm6, zmm7, 250
vpblendmd zmm9 {k3}, zmm9, zmm8
vpunpcklqdq zmm8, zmm7, zmm5
vpblendmd zmm8 {k4}, zmm8, zmm6
vpshufd zmm8, zmm8, 0x78
vpunpckhdq zmm5, zmm5, zmm7
vpunpckldq zmm6, zmm6, zmm5
vpshufd zmm7, zmm6, 0x1E
vmovdqa32 zmm5, zmm9
vmovdqa32 zmm6, zmm8
jmp 9b
/* Rounds finished for this block (4-input path). */
9:
/* Feed-forward: rows 0 and 1 ^= rows 2 and 3; zmm0 / zmm1 become the
   chaining value for the next block. */
vpxord zmm0, zmm0, zmm2
vpxord zmm1, zmm1, zmm3
/* Restore the base flags into eax (its low byte was used as the round
   counter) and loop until the block offset reaches r15. */
mov eax, r13d
cmp rdx, r15
jne 2b
/* Store 32 bytes per input: 128-bit lane i of zmm0 and zmm1 goes to
   [rbx + 32*i] and [rbx + 32*i + 16]. */
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
/* Advance the per-lane values at [rsp] / [rsp+0x40] by four lanes: under
   mask k1 the next 16 bytes of each vector replace the first 16. k1 is set
   outside this view (presumably from the increment-counter argument). */
vmovdqa xmm0, xmmword ptr [rsp]
vmovdqa xmm2, xmmword ptr [rsp+0x40]
vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x40], xmm2
/* Consumed 4 inputs: 4 x 32 output bytes, 4 x 8 pointer bytes. The output
   pointer stays in rbx here; it is not written back to [rbp+0x50]. */
add rbx, 128
add rdi, 32
sub rsi, 4
3:
test esi, 0x2
je 3f
vbroadcasti128 ymm0, xmmword ptr [rcx]
vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
vmovd xmm13, dword ptr [rsp]
vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovd xmm14, dword ptr [rsp+0x4]
vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vinserti128 ymm13, ymm13, xmm14, 0x01
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
.p2align 5
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
mov dword ptr [rsp+0x88], eax
vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
vpbroadcastd ymm8, dword ptr [rsp+0x88]
vpblendd ymm3, ymm13, ymm8, 0x88
vmovups ymm8, ymmword ptr [r8+rdx-0x40]
vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
vmovups ymm9, ymmword ptr [r8+rdx-0x30]
vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
vshufps ymm4, ymm8, ymm9, 136
vshufps ymm5, ymm8, ymm9, 221
vmovups ymm8, ymmword ptr [r8+rdx-0x20]
vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
vmovups ymm9, ymmword ptr [r8+rdx-0x10]
vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
vshufps ymm6, ymm8, ymm9, 136
vshufps ymm7, ymm8, ymm9, 221
vpshufd ymm6, ymm6, 0x93
vpshufd ymm7, ymm7, 0x93
mov al, 7
9:
vpaddd ymm0, ymm0, ymm4
vpaddd ymm0, ymm0, ymm1
vpxord ymm3, ymm3, ymm0
vprord ymm3, ymm3, 16
vpaddd ymm2, ymm2, ymm3
vpxord ymm1, ymm1, ymm2
vprord ymm1, ymm1, 12
vpaddd ymm0, ymm0, ymm5
vpaddd ymm0, ymm0, ymm1
vpxord ymm3, ymm3, ymm0
vprord ymm3, ymm3, 8
vpaddd ymm2, ymm2, ymm3
vpxord ymm1, ymm1, ymm2
vprord ymm1, ymm1, 7
vpshufd ymm0, ymm0, 0x93
vpshufd ymm3, ymm3, 0x4E
vpshufd ymm2, ymm2, 0x39
vpaddd ymm0, ymm0, ymm6
vpaddd ymm0, ymm0, ymm1
vpxord ymm3, ymm3, ymm0
vprord ymm3, ymm3, 16
vpaddd ymm2, ymm2, ymm3
vpxord ymm1, ymm1, ymm2
vprord ymm1, ymm1, 12
vpaddd ymm0, ymm0, ymm7
vpaddd ymm0, ymm0, ymm1
vpxord ymm3, ymm3, ymm0
vprord ymm3, ymm3, 8
vpaddd ymm2, ymm2, ymm3
vpxord ymm1, ymm1, ymm2
vprord ymm1, ymm1, 7
vpshufd ymm0, ymm0, 0x39
vpshufd ymm3, ymm3, 0x4E
vpshufd ymm2, ymm2, 0x93
dec al
jz 9f
vshufps ymm8, ymm4, ymm5, 214
vpshufd ymm9, ymm4, 0x0F
vpshufd ymm4, ymm8, 0x39
vshufps ymm8, ymm6, ymm7, 250
vpblendd ymm9, ymm9, ymm8, 0xAA
vpunpcklqdq ymm8, ymm7, ymm5
vpblendd ymm8, ymm8, ymm6, 0x88
vpshufd ymm8, ymm8, 0x78
vpunpckhdq ymm5, ymm5, ymm7
vpunpckldq ymm6, ymm6, ymm5
vpshufd ymm7, ymm6, 0x1E
vmovdqa ymm5, ymm9
vmovdqa ymm6, ymm8
jmp 9b
9:
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
mov eax, r13d
cmp rdx, r15
jne 2b
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
vmovdqa xmm0, xmmword ptr [rsp]
vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
add rbx, 64
add rdi, 16
sub rsi, 2
3:
test esi, 0x1
je 4b
vmovdqu xmm0, xmmword ptr [rcx]
vmovdqu xmm1, xmmword ptr [rcx+0x10]
vmovd xmm14, dword ptr [rsp]
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
.p2align 5
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
vpinsrd xmm3, xmm14, eax, 3
vmovdqa xmm2, xmm15
vmovups xmm8, xmmword ptr [r8+rdx-0x40]
vmovups xmm9, xmmword ptr [r8+rdx-0x30]
vshufps xmm4, xmm8, xmm9, 136
vshufps xmm5, xmm8, xmm9, 221
vmovups xmm8, xmmword ptr [r8+rdx-0x20]
vmovups xmm9, xmmword ptr [r8+rdx-0x10]
vshufps xmm6, xmm8, xmm9, 136
vshufps xmm7, xmm8, xmm9, 221
vpshufd xmm6, xmm6, 0x93
vpshufd xmm7, xmm7, 0x93
mov al, 7
9:
vpaddd xmm0, xmm0, xmm4
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm5
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
vpshufd xmm0, xmm0, 0x93
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x39
vpaddd xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm7
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
vpshufd xmm0, xmm0, 0x39
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x93
dec al
jz 9f
vshufps xmm8, xmm4, xmm5, 214
vpshufd xmm9, xmm4, 0x0F
vpshufd xmm4, xmm8, 0x39
vshufps xmm8, xmm6, xmm7, 250
vpblendd xmm9, xmm9, xmm8, 0xAA
vpunpcklqdq xmm8, xmm7, xmm5
vpblendd xmm8, xmm8, xmm6, 0x88
vpshufd xmm8, xmm8, 0x78
vpunpckhdq xmm5, xmm5, xmm7
vpunpckldq xmm6, xmm6, xmm5
vpshufd xmm7, xmm6, 0x1E
vmovdqa xmm5, xmm9
vmovdqa xmm6, xmm8
jmp 9b
9:
vpxor xmm0, xmm0, xmm2
vpxor xmm1, xmm1, xmm3
mov eax, r13d
cmp rdx, r15
jne 2b
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
jmp 4b
.p2align 6
/*
 * blake3_compress_in_place_avx512: compress one 64-byte block and write the
 * 32-byte result back over the input at rdi.
 *
 * Registers read on entry:
 *   rdi - pointer to 32 bytes, loaded as state rows 0-1 and overwritten
 *         with the result
 *   rsi - pointer to the 64-byte message block (only read)
 *   dl  - byte that becomes state word 14
 *   rcx - 64-bit value that becomes state words 12-13
 *   r8b - byte that becomes state word 15
 * NOTE(review): by position these look like (cv, block, block_len, counter,
 * flags), mirroring compress_pre() in the portable C code - confirm against
 * the prototype in blake3_impl.h.
 *
 * Clobbers rax, rdx, the flags and xmm0-xmm9. Both labels below (with and
 * without the leading underscore) name the same entry point.
 */
_blake3_compress_in_place_avx512:
blake3_compress_in_place_avx512:
_CET_ENDBR
/* Rows 0 and 1 of the 4x4 dword state come from the 32 bytes at rdi. */
vmovdqu xmm0, xmmword ptr [rdi]
vmovdqu xmm1, xmmword ptr [rdi+0x10]
/* rdx = dl | (r8b << 32): two zero-extended bytes packed into one qword. */
movzx eax, r8b
movzx edx, dl
shl rax, 32
add rdx, rax
/* Row 3 = { rcx low dword, rcx high dword, dl, r8b }. */
vmovq xmm3, rcx
vmovq xmm4, rdx
vpunpcklqdq xmm3, xmm3, xmm4
/* Row 2 = the four dwords stored at BLAKE3_IV (aligned 16-byte load). */
vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/*
 * Load message dwords m0..m15 and split them into even and odd words:
 *   xmm4 = m0, m2, m4, m6       xmm5 = m1, m3, m5, m7
 *   xmm6 = m14, m8, m10, m12    xmm7 = m15, m9, m11, m13
 * xmm6/xmm7 are rotated by one lane (vpshufd 0x93) so that they line up
 * with the lane order the state has during the second half of a round.
 */
vmovups xmm8, xmmword ptr [rsi]
vmovups xmm9, xmmword ptr [rsi+0x10]
vshufps xmm4, xmm8, xmm9, 136
vshufps xmm5, xmm8, xmm9, 221
vmovups xmm8, xmmword ptr [rsi+0x20]
vmovups xmm9, xmmword ptr [rsi+0x30]
vshufps xmm6, xmm8, xmm9, 136
vshufps xmm7, xmm8, xmm9, 221
vpshufd xmm6, xmm6, 0x93
vpshufd xmm7, xmm7, 0x93
/* al is the round counter: the loop body runs seven times. */
mov al, 7
/* Loop head. "9b"/"9f" below refer to the nearest "9:" label backward/forward. */
9:
/*
 * First half of the round: the g() mixing step applied to all four columns
 * at once, with xmm0..xmm3 acting as a, b, c, d and xmm4/xmm5 supplying the
 * two message words per column. Rotations are right-rotates by 16, 12, 8, 7.
 */
vpaddd xmm0, xmm0, xmm4
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm5
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
/*
 * Rotate the lanes of rows 0, 3 and 2 (row 1 is left alone) so that each
 * diagonal of the state ends up in a single lane.
 */
vpshufd xmm0, xmm0, 0x93
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x39
/* Second half of the round: the same mixing step on the diagonals, fed by xmm6/xmm7. */
vpaddd xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm7
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
/* Undo the lane rotation so the rows are back in column order. */
vpshufd xmm0, xmm0, 0x39
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x93
/* Leave the loop after the seventh round; the shuffle below is only needed between rounds. */
dec al
jz 9f
/*
 * Rearrange the message dwords held in xmm4-xmm7 for the next round, using
 * xmm8/xmm9 as scratch.
 * NOTE(review): presumably equivalent to advancing through MSG_SCHEDULE as
 * round_fn() does in the portable code - not verified lane by lane here.
 */
vshufps xmm8, xmm4, xmm5, 214
vpshufd xmm9, xmm4, 0x0F
vpshufd xmm4, xmm8, 0x39
vshufps xmm8, xmm6, xmm7, 250
vpblendd xmm9, xmm9, xmm8, 0xAA
vpunpcklqdq xmm8, xmm7, xmm5
vpblendd xmm8, xmm8, xmm6, 0x88
vpshufd xmm8, xmm8, 0x78
vpunpckhdq xmm5, xmm5, xmm7
vpunpckldq xmm6, xmm6, xmm5
vpshufd xmm7, xmm6, 0x1E
vmovdqa xmm5, xmm9
vmovdqa xmm6, xmm8
jmp 9b
/* Loop exit. */
9:
/* Fold the state (rows 0-1 ^= rows 2-3) and store the 32-byte result back to rdi. */
vpxor xmm0, xmm0, xmm2
vpxor xmm1, xmm1, xmm3
vmovdqu xmmword ptr [rdi], xmm0
vmovdqu xmmword ptr [rdi+0x10], xmm1
ret
.p2align 6
/*
 * blake3_compress_xof_avx512: compress one 64-byte block and write the full
 * 64-byte result to a separate buffer; the 32 bytes at rdi are only read.
 *
 * Registers read on entry:
 *   rdi - pointer to 32 bytes, loaded as state rows 0-1 and read again at
 *         the end to form the upper half of the output
 *   rsi - pointer to the 64-byte message block (only read)
 *   dl  - byte that becomes state word 14
 *   rcx - 64-bit value that becomes state words 12-13
 *   r8b - byte that becomes state word 15
 *   r9  - pointer to the 64-byte output buffer
 * NOTE(review): by position these look like (cv, block, block_len, counter,
 * flags, out) - confirm against the prototype in blake3_impl.h.
 *
 * Clobbers rax, rdx, the flags and xmm0-xmm9. Both labels below (with and
 * without the leading underscore) name the same entry point.
 */
_blake3_compress_xof_avx512:
blake3_compress_xof_avx512:
_CET_ENDBR
/* Rows 0 and 1 of the 4x4 dword state come from the 32 bytes at rdi. */
vmovdqu xmm0, xmmword ptr [rdi]
vmovdqu xmm1, xmmword ptr [rdi+0x10]
/* rdx = dl | (r8b << 32): two zero-extended bytes packed into one qword. */
movzx eax, r8b
movzx edx, dl
shl rax, 32
add rdx, rax
/* Row 3 = { rcx low dword, rcx high dword, dl, r8b }. */
vmovq xmm3, rcx
vmovq xmm4, rdx
vpunpcklqdq xmm3, xmm3, xmm4
/* Row 2 = the four dwords stored at BLAKE3_IV (aligned 16-byte load). */
vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/*
 * Load message dwords m0..m15 and split them into even and odd words:
 *   xmm4 = m0, m2, m4, m6       xmm5 = m1, m3, m5, m7
 *   xmm6 = m14, m8, m10, m12    xmm7 = m15, m9, m11, m13
 * xmm6/xmm7 are rotated by one lane (vpshufd 0x93) so that they line up
 * with the lane order the state has during the second half of a round.
 */
vmovups xmm8, xmmword ptr [rsi]
vmovups xmm9, xmmword ptr [rsi+0x10]
vshufps xmm4, xmm8, xmm9, 136
vshufps xmm5, xmm8, xmm9, 221
vmovups xmm8, xmmword ptr [rsi+0x20]
vmovups xmm9, xmmword ptr [rsi+0x30]
vshufps xmm6, xmm8, xmm9, 136
vshufps xmm7, xmm8, xmm9, 221
vpshufd xmm6, xmm6, 0x93
vpshufd xmm7, xmm7, 0x93
/* al is the round counter: the loop body runs seven times. */
mov al, 7
/* Loop head. "9b"/"9f" below refer to the nearest "9:" label backward/forward. */
9:
/*
 * First half of the round: the g() mixing step applied to all four columns
 * at once, with xmm0..xmm3 acting as a, b, c, d and xmm4/xmm5 supplying the
 * two message words per column. Rotations are right-rotates by 16, 12, 8, 7.
 */
vpaddd xmm0, xmm0, xmm4
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm5
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
/*
 * Rotate the lanes of rows 0, 3 and 2 (row 1 is left alone) so that each
 * diagonal of the state ends up in a single lane.
 */
vpshufd xmm0, xmm0, 0x93
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x39
/* Second half of the round: the same mixing step on the diagonals, fed by xmm6/xmm7. */
vpaddd xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm7
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
/* Undo the lane rotation so the rows are back in column order. */
vpshufd xmm0, xmm0, 0x39
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x93
/* Leave the loop after the seventh round; the shuffle below is only needed between rounds. */
dec al
jz 9f
/*
 * Rearrange the message dwords held in xmm4-xmm7 for the next round, using
 * xmm8/xmm9 as scratch.
 * NOTE(review): presumably equivalent to advancing through MSG_SCHEDULE as
 * round_fn() does in the portable code - not verified lane by lane here.
 */
vshufps xmm8, xmm4, xmm5, 214
vpshufd xmm9, xmm4, 0x0F
vpshufd xmm4, xmm8, 0x39
vshufps xmm8, xmm6, xmm7, 250
vpblendd xmm9, xmm9, xmm8, 0xAA
vpunpcklqdq xmm8, xmm7, xmm5
vpblendd xmm8, xmm8, xmm6, 0x88
vpshufd xmm8, xmm8, 0x78
vpunpckhdq xmm5, xmm5, xmm7
vpunpckldq xmm6, xmm6, xmm5
vpshufd xmm7, xmm6, 0x1E
vmovdqa xmm5, xmm9
vmovdqa xmm6, xmm8
jmp 9b
/* Loop exit. */
9:
/* Lower 32 output bytes: rows 0-1 XOR rows 2-3. */
vpxor xmm0, xmm0, xmm2
vpxor xmm1, xmm1, xmm3
/* Upper 32 output bytes: rows 2-3 XOR the original 32 input bytes at rdi. */
vpxor xmm2, xmm2, [rdi]
vpxor xmm3, xmm3, [rdi+0x10]
/* Store all 64 bytes to the buffer at r9 (unaligned stores). */
vmovdqu xmmword ptr [r9], xmm0
vmovdqu xmmword ptr [r9+0x10], xmm1
vmovdqu xmmword ptr [r9+0x20], xmm2
vmovdqu xmmword ptr [r9+0x30], xmm3
ret
/*
 * Constant data. When __APPLE__ is defined the .static_data directive is
 * used; every other target places the tables in .rodata.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
/* Start the tables on a 64-byte boundary. */
.p2align 6
/*
 * INDEX0 / INDEX1: sixteen dword indices each; together they contain every
 * value from 0 to 31 exactly once.
 * NOTE(review): presumably control vectors for a two-source dword permute
 * (values 16..31 selecting from the second source) - confirm at the use
 * sites.
 */
INDEX0:
.long 0, 1, 2, 3, 16, 17, 18, 19
.long 8, 9, 10, 11, 24, 25, 26, 27
INDEX1:
.long 4, 5, 6, 7, 20, 21, 22, 23
.long 12, 13, 14, 15, 28, 29, 30, 31
/*
 * ADD0: the values 0..15, one per dword lane.
 * NOTE(review): presumably per-lane offsets added to a broadcast counter -
 * confirm at the use sites.
 */
ADD0:
.long 0, 1, 2, 3, 4, 5, 6, 7
.long 8, 9, 10, 11, 12, 13, 14, 15
/* Single-dword constants 1 and 16. */
ADD1: .long 1
ADD16: .long 16
/* The value 64 as a single dword, for code that needs it as a memory operand. */
BLAKE3_BLOCK_LEN:
.long 64
/*
 * Four consecutive dwords on a 64-byte boundary. BLAKE3_IV addresses them as
 * one 16-byte vector (the compress routines load it with aligned moves as
 * row 2 of the state); BLAKE3_IV_0..BLAKE3_IV_3 address the words one by one.
 */
.p2align 6
BLAKE3_IV:
BLAKE3_IV_0:
.long 0x6A09E667
BLAKE3_IV_1:
.long 0xBB67AE85
BLAKE3_IV_2:
.long 0x3C6EF372
BLAKE3_IV_3:
.long 0xA54FF53A
/*
 * On Linux ELF targets emit an empty .note.GNU-stack section, which GNU
 * toolchains take to mean that this object does not need an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
#endif
/*
 * When building for ELF with __CET__ defined, include <cet.h> if the
 * preprocessor can find it. NOTE(review): that header is expected to supply
 * _CET_ENDBR - confirm for the toolchains in use.
 */
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
#endif
#endif
/* Fallback: if nothing defined _CET_ENDBR, make it expand to nothing. */
#if !defined(_CET_ENDBR)
#define _CET_ENDBR
#endif
/* Intel operand order, register names written without the % prefix. */
.intel_syntax noprefix
/* Export the entry point under both spellings, with and without a leading underscore. */
.global _blake3_hash_many_avx2
.global blake3_hash_many_avx2
/* Switch to the code section (plain .text when __APPLE__ is defined). */
#ifdef __APPLE__
.text
#else
.section .text
#endif
/* Align the function entry that follows to 64 bytes. */
.p2align 6
_blake3_hash_many_avx2:
blake3_hash_many_avx2:
_CET_ENDBR
push r15
push r14
push r13
push r12
push rbx
push rbp
mov rbp, rsp
sub rsp, 680
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
vmovd xmm0, r9d
vpbroadcastd ymm0, xmm0
vmovdqa ymmword ptr [rsp+0x280], ymm0
vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
vmovdqa ymmword ptr [rsp+0x220], ymm2
vmovd xmm2, r8d
vpbroadcastd ymm2, xmm2
vpaddd ymm2, ymm2, ymm1
vmovdqa ymmword ptr [rsp+0x240], ymm2
vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
vpcmpgtd ymm2, ymm1, ymm2
shr r8, 32
vmovd xmm3, r8d
vpbroadcastd ymm3, xmm3
vpsubd ymm3, ymm3, ymm2
vmovdqa ymmword ptr [rsp+0x260], ymm3
shl rdx, 6
mov qword ptr [rsp+0x2A0], rdx
cmp rsi, 8
jc 3f
2:
vpbroadcastd ymm0, dword ptr [rcx]
vpbroadcastd ymm1, dword ptr [rcx+0x4]
vpbroadcastd ymm2, dword ptr [rcx+0x8]
vpbroadcastd ymm3, dword ptr [rcx+0xC]
vpbroadcastd ymm4, dword ptr [rcx+0x10]
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
mov r12, qword ptr [rdi+0x20]
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
xor edx, edx
.p2align 5
9:
movzx ebx, byte ptr [rbp+0x48]
or ebx, eax
add rdx, 64
cmp rdx, qword ptr [rsp+0x2A0]
cmove eax, ebx
mov dword ptr [rsp+0x200], eax
vmovups xmm8, xmmword ptr [r8+rdx-0x40]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x40]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x40]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x40]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm8, ymm12, ymm14, 136
vmovaps ymmword ptr [rsp], ymm8
vshufps ymm9, ymm12, ymm14, 221
vmovaps ymmword ptr [rsp+0x20], ymm9
vshufps ymm10, ymm13, ymm15, 136
vmovaps ymmword ptr [rsp+0x40], ymm10
vshufps ymm11, ymm13, ymm15, 221
vmovaps ymmword ptr [rsp+0x60], ymm11
vmovups xmm8, xmmword ptr [r8+rdx-0x30]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x30]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x30]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x30]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm8, ymm12, ymm14, 136
vmovaps ymmword ptr [rsp+0x80], ymm8
vshufps ymm9, ymm12, ymm14, 221
vmovaps ymmword ptr [rsp+0xA0], ymm9
vshufps ymm10, ymm13, ymm15, 136
vmovaps ymmword ptr [rsp+0xC0], ymm10
vshufps ymm11, ymm13, ymm15, 221
vmovaps ymmword ptr [rsp+0xE0], ymm11
vmovups xmm8, xmmword ptr [r8+rdx-0x20]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x20]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x20]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x20]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm8, ymm12, ymm14, 136
vmovaps ymmword ptr [rsp+0x100], ymm8
vshufps ymm9, ymm12, ymm14, 221
vmovaps ymmword ptr [rsp+0x120], ymm9
vshufps ymm10, ymm13, ymm15, 136
vmovaps ymmword ptr [rsp+0x140], ymm10
vshufps ymm11, ymm13, ymm15, 221
vmovaps ymmword ptr [rsp+0x160], ymm11
vmovups xmm8, xmmword ptr [r8+rdx-0x10]
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
vmovups xmm9, xmmword ptr [r9+rdx-0x10]
vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
vunpcklpd ymm12, ymm8, ymm9
vunpckhpd ymm13, ymm8, ymm9
vmovups xmm10, xmmword ptr [r10+rdx-0x10]
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
vmovups xmm11, xmmword ptr [r11+rdx-0x10]
vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
vunpcklpd ymm14, ymm10, ymm11
vunpckhpd ymm15, ymm10, ymm11
vshufps ymm8, ymm12, ymm14, 136
vmovaps ymmword ptr [rsp+0x180], ymm8
vshufps ymm9, ymm12, ymm14, 221
vmovaps ymmword ptr [rsp+0x1A0], ymm9
vshufps ymm10, ymm13, ymm15, 136
vmovaps ymmword ptr [rsp+0x1C0], ymm10
vshufps ymm11, ymm13, ymm15, 221
vmovaps ymmword ptr [rsp+0x1E0], ymm11
vpbroadcastd ymm15, dword ptr [rsp+0x200]
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r12+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r13+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
vpaddd ymm0, ymm0, ymmword ptr [rsp]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm0, ymmword ptr [rsp+0x240]
vpxor ymm13, ymm1, ymmword ptr [rsp+0x260]
vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
vpxor ymm15, ymm3, ymm15
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
vpaddd ymm2, ymm2, ymmword ptr [rsp]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
vpaddd ymm1, ymm1, ymmword ptr [rsp]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
vpaddd ymm2, ymm2, ymmword ptr [rsp]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
vpaddd ymm1, ymm1, ymmword ptr [rsp]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
vpaddd ymm1, ymm1, ymmword ptr [rsp]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
vpaddd ymm0, ymm0, ymm4
vpaddd ymm1, ymm1, ymm5
vpaddd ymm2, ymm2, ymm6
vpaddd ymm3, ymm3, ymm7
vpxor ymm12, ymm12, ymm0
vpxor ymm13, ymm13, ymm1
vpxor ymm14, ymm14, ymm2
vpxor ymm15, ymm15, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpshufb ymm15, ymm15, ymm8
vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm13
vpaddd ymm10, ymm10, ymm14
vpaddd ymm11, ymm11, ymm15
vpxor ymm4, ymm4, ymm8
vpxor ymm5, ymm5, ymm9
vpxor ymm6, ymm6, ymm10
vpxor ymm7, ymm7, ymm11
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vmovdqa ymmword ptr [rsp+0x200], ymm8
vpsrld ymm8, ymm5, 12
vpslld ymm5, ymm5, 20
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 12
vpslld ymm6, ymm6, 20
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 12
vpslld ymm7, ymm7, 20
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 12
vpslld ymm4, ymm4, 20
vpor ymm4, ymm4, ymm8
vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
vpaddd ymm0, ymm0, ymm5
vpaddd ymm1, ymm1, ymm6
vpaddd ymm2, ymm2, ymm7
vpaddd ymm3, ymm3, ymm4
vpxor ymm15, ymm15, ymm0
vpxor ymm12, ymm12, ymm1
vpxor ymm13, ymm13, ymm2
vpxor ymm14, ymm14, ymm3
vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
vpshufb ymm15, ymm15, ymm8
vpshufb ymm12, ymm12, ymm8
vpshufb ymm13, ymm13, ymm8
vpshufb ymm14, ymm14, ymm8
vpaddd ymm10, ymm10, ymm15
vpaddd ymm11, ymm11, ymm12
vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
vpaddd ymm9, ymm9, ymm14
vpxor ymm5, ymm5, ymm10
vpxor ymm6, ymm6, ymm11
vpxor ymm7, ymm7, ymm8
vpxor ymm4, ymm4, ymm9
vpxor ymm0, ymm0, ymm8
vpxor ymm1, ymm1, ymm9
vpxor ymm2, ymm2, ymm10
vpxor ymm3, ymm3, ymm11
vpsrld ymm8, ymm5, 7
vpslld ymm5, ymm5, 25
vpor ymm5, ymm5, ymm8
vpsrld ymm8, ymm6, 7
vpslld ymm6, ymm6, 25
vpor ymm6, ymm6, ymm8
vpsrld ymm8, ymm7, 7
vpslld ymm7, ymm7, 25
vpor ymm7, ymm7, ymm8
vpsrld ymm8, ymm4, 7
vpslld ymm4, ymm4, 25
vpor ymm4, ymm4, ymm8
vpxor ymm4, ymm4, ymm12
vpxor ymm5, ymm5, ymm13
vpxor ymm6, ymm6, ymm14
vpxor ymm7, ymm7, ymm15
movzx eax, byte ptr [rbp+0x38]
jne 9b
mov rbx, qword ptr [rbp+0x50]
vunpcklps ymm8, ymm0, ymm1
vunpcklps ymm9, ymm2, ymm3
vunpckhps ymm10, ymm0, ymm1
vunpcklps ymm11, ymm4, ymm5
vunpcklps ymm0, ymm6, ymm7
vshufps ymm12, ymm8, ymm9, 78
vblendps ymm1, ymm8, ymm12, 0xCC
vshufps ymm8, ymm11, ymm0, 78
vunpckhps ymm13, ymm2, ymm3
vblendps ymm2, ymm11, ymm8, 0xCC
vblendps ymm3, ymm12, ymm9, 0xCC
vperm2f128 ymm12, ymm1, ymm2, 0x20
vmovups ymmword ptr [rbx], ymm12
vunpckhps ymm14, ymm4, ymm5
vblendps ymm4, ymm8, ymm0, 0xCC
vunpckhps ymm15, ymm6, ymm7
vperm2f128 ymm7, ymm3, ymm4, 0x20
vmovups ymmword ptr [rbx+0x20], ymm7
vshufps ymm5, ymm10, ymm13, 78
vblendps ymm6, ymm5, ymm13, 0xCC
vshufps ymm13, ymm14, ymm15, 78
vblendps ymm10, ymm10, ymm5, 0xCC
vblendps ymm14, ymm14, ymm13, 0xCC
vperm2f128 ymm8, ymm10, ymm14, 0x20
vmovups ymmword ptr [rbx+0x40], ymm8
vblendps ymm15, ymm13, ymm15, 0xCC
vperm2f128 ymm13, ymm6, ymm15, 0x20
vmovups ymmword ptr [rbx+0x60], ymm13
vperm2f128 ymm9, ymm1, ymm2, 0x31
vperm2f128 ymm11, ymm3, ymm4, 0x31
vmovups ymmword ptr [rbx+0x80], ymm9
vperm2f128 ymm14, ymm10, ymm14, 0x31
vperm2f128 ymm15, ymm6, ymm15, 0x31
vmovups ymmword ptr [rbx+0xA0], ymm11
vmovups ymmword ptr [rbx+0xC0], ymm14
vmovups ymmword ptr [rbx+0xE0], ymm15
vmovdqa ymm0, ymmword ptr [rsp+0x220]
vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240]
vmovdqa ymmword ptr [rsp+0x240], ymm1
vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
vpcmpgtd ymm2, ymm0, ymm2
vmovdqa ymm0, ymmword ptr [rsp+0x260]
vpsubd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp+0x260], ymm2
add rdi, 64
add rbx, 256
mov qword ptr [rbp+0x50], rbx
sub rsi, 8
cmp rsi, 8
jnc 2b
test rsi, rsi
jnz 3f
/*
 * Shared exit.  Reached by fall-through once no inputs remain (rsi == 0)
 * and by the "4b" branches at the end of the tail paths.
 */
4:
/* Zero the upper halves of all YMM registers before returning. */
vzeroupper
/* Drop the scratch area addressed via rsp; rbp holds the frame base. */
mov rsp, rbp
/*
 * Restore saved registers.
 * NOTE(review): presumably the exact reverse of the prologue pushes, which
 * are outside this excerpt - confirm when changing either side.
 */
pop rbp
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
.p2align 5
/*
 * Tail handling, entered from the 8-way loop exit when 0 < rsi < 8.
 * The remainder is processed as an optional group of 4 inputs (this
 * section), then 2, then 1, selected by bits 2, 1 and 0 of rsi.
 *
 * Values loaded here stay live for all three tail groups:
 *   rbx  - output pointer, reloaded from [rbp+0x50]
 *   r15  - end value for the per-block byte offset rdx (rdx advances by 64
 *          per block and each block loop ends when rdx == r15)
 *   r13d - flag byte from [rbp+0x38], OR-ed into the flags of every block
 *   r12d - flag byte from [rbp+0x48], OR-ed in only for the last block
 * NOTE(review): the [rbp+0x38..0x50] slots look like stack-passed
 * flags / flags_start / flags_end / out arguments; the prologue is outside
 * this excerpt - confirm.
 */
3:
mov rbx, qword ptr [rbp+0x50]
mov r15, qword ptr [rsp+0x2A0]
movzx r13d, byte ptr [rbp+0x38]
movzx r12d, byte ptr [rbp+0x48]
/* Bit 2 of the remaining count clear: skip to the group-of-two check. */
test rsi, 0x4
je 3f
/*
 * Four inputs are hashed two per YMM register (one per 128-bit lane):
 * ymm0-ymm3 hold state rows 0-3 for inputs 0/1, ymm8-ymm11 for inputs 2/3.
 * Rows 0 and 1 start as the 32 bytes at [rcx], copied to every lane.
 */
vbroadcasti128 ymm0, xmmword ptr [rcx]
vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
vmovdqa ymm8, ymm0
vmovdqa ymm9, ymm1
/*
 * Build the row-3 template per lane: { counter_lo, counter_hi, 64, <flags> }.
 * [rsp+0x240] holds the low and [rsp+0x260] the high 32-bit counter words,
 * one dword per input.  The unpacks pair lo/hi for inputs 0,1 (ymm14) and
 * 2,3 (ymm15); vpermq 0x50 gives each lane its own pair.
 */
vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
vpunpckldq ymm14, ymm12, ymm13
vpunpckhdq ymm15, ymm12, ymm13
vpermq ymm14, ymm14, 0x50
vpermq ymm15, ymm15, 0x50
/* Mask 0x44 writes the block length (64) into dword 2 of each lane. */
vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
vpblendd ymm14, ymm14, ymm12, 0x44
vpblendd ymm15, ymm15, ymm12, 0x44
/* Keep the templates on the stack; dword 3 is filled with flags per block. */
vmovdqa ymmword ptr [rsp], ymm14
vmovdqa ymmword ptr [rsp+0x20], ymm15
/* Fetch the four input pointers from the pointer array at rdi. */
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
/* Flags for the first block: byte at [rbp+0x40] combined with r13d. */
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
/* rdx = byte offset of the current block within each input. */
xor edx, edx
.p2align 5
/* Per-block loop. */
2:
/*
 * Final flags for this block: eax | r12d if this is the last block
 * (rdx + 64 == r15), otherwise eax unchanged (restored by cmovne).
 */
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
mov dword ptr [rsp+0x200], eax
/*
 * Load the 64-byte block of inputs 0 (r8, low lane) and 1 (r9, high lane).
 * vshufps 136 gathers the even-indexed words and 221 the odd-indexed words
 * of each 8-word half; vpshufd 0x93 rotates the second half by one dword.
 */
vmovups ymm2, ymmword ptr [r8+rdx-0x40]
vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
vmovups ymm3, ymmword ptr [r8+rdx-0x30]
vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
vshufps ymm4, ymm2, ymm3, 136
vshufps ymm5, ymm2, ymm3, 221
vmovups ymm2, ymmword ptr [r8+rdx-0x20]
vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
vmovups ymm3, ymmword ptr [r8+rdx-0x10]
vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
vshufps ymm6, ymm2, ymm3, 136
vshufps ymm7, ymm2, ymm3, 221
vpshufd ymm6, ymm6, 0x93
vpshufd ymm7, ymm7, 0x93
/* Same for inputs 2 (r10, low lane) and 3 (r11, high lane) into ymm12-ymm15. */
vmovups ymm10, ymmword ptr [r10+rdx-0x40]
vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
vmovups ymm11, ymmword ptr [r10+rdx-0x30]
vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
vshufps ymm12, ymm10, ymm11, 136
vshufps ymm13, ymm10, ymm11, 221
vmovups ymm10, ymmword ptr [r10+rdx-0x20]
vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
vmovups ymm11, ymmword ptr [r10+rdx-0x10]
vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
vshufps ymm14, ymm10, ymm11, 136
vshufps ymm15, ymm10, ymm11, 221
vpshufd ymm14, ymm14, 0x93
vpshufd ymm15, ymm15, 0x93
/* Prefetch input data 0x80 bytes past the current offset. */
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
/* Row 3 = saved template with this block's flags in dword 3 (mask 0x88). */
vpbroadcastd ymm2, dword ptr [rsp+0x200]
vmovdqa ymm3, ymmword ptr [rsp]
vmovdqa ymm11, ymmword ptr [rsp+0x20]
vpblendd ymm3, ymm3, ymm2, 0x88
vpblendd ymm11, ymm11, ymm2, 0x88
/* Row 2 = BLAKE3_IV in every lane. */
vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
vmovdqa ymm10, ymm2
/*
 * Round counter: 7 rounds.  al may be reused because the block flags were
 * already stored to [rsp+0x200] and blended into row 3 above.
 */
mov al, 7
/* Round loop. */
9:
/*
 * Column step, first half: row0 += m; row0 += row1; row3 ^= row0;
 * row3 rotated right by 16 (ROT16 byte shuffle); row2 += row3;
 * row1 ^= row2; row1 rotated right by 12 (shift pair + or).
 * ymm4 and ymm12 are spilled first because ymm4 is reused as scratch.
 */
vpaddd ymm0, ymm0, ymm4
vpaddd ymm8, ymm8, ymm12
vmovdqa ymmword ptr [rsp+0x40], ymm4
/* NOTE(review): the two nops look like deliberate padding - confirm before removing. */
nop
vmovdqa ymmword ptr [rsp+0x60], ymm12
nop
vpaddd ymm0, ymm0, ymm1
vpaddd ymm8, ymm8, ymm9
vpxor ymm3, ymm3, ymm0
vpxor ymm11, ymm11, ymm8
vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
vpshufb ymm3, ymm3, ymm4
vpshufb ymm11, ymm11, ymm4
vpaddd ymm2, ymm2, ymm3
vpaddd ymm10, ymm10, ymm11
vpxor ymm1, ymm1, ymm2
vpxor ymm9, ymm9, ymm10
vpsrld ymm4, ymm1, 12
vpslld ymm1, ymm1, 20
vpor ymm1, ymm1, ymm4
vpsrld ymm4, ymm9, 12
vpslld ymm9, ymm9, 20
vpor ymm9, ymm9, ymm4
/* Column step, second half: same pattern with rotations by 8 and 7. */
vpaddd ymm0, ymm0, ymm5
vpaddd ymm8, ymm8, ymm13
vpaddd ymm0, ymm0, ymm1
vpaddd ymm8, ymm8, ymm9
vmovdqa ymmword ptr [rsp+0x80], ymm5
vmovdqa ymmword ptr [rsp+0xA0], ymm13
vpxor ymm3, ymm3, ymm0
vpxor ymm11, ymm11, ymm8
vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
vpshufb ymm3, ymm3, ymm4
vpshufb ymm11, ymm11, ymm4
vpaddd ymm2, ymm2, ymm3
vpaddd ymm10, ymm10, ymm11
vpxor ymm1, ymm1, ymm2
vpxor ymm9, ymm9, ymm10
vpsrld ymm4, ymm1, 7
vpslld ymm1, ymm1, 25
vpor ymm1, ymm1, ymm4
vpsrld ymm4, ymm9, 7
vpslld ymm9, ymm9, 25
vpor ymm9, ymm9, ymm4
/* Rotate rows 0, 3 and 2 within each lane so the diagonals line up as columns. */
vpshufd ymm0, ymm0, 0x93
vpshufd ymm8, ymm8, 0x93
vpshufd ymm3, ymm3, 0x4E
vpshufd ymm11, ymm11, 0x4E
vpshufd ymm2, ymm2, 0x39
vpshufd ymm10, ymm10, 0x39
/* Diagonal step, first half (rotations by 16 and 12). */
vpaddd ymm0, ymm0, ymm6
vpaddd ymm8, ymm8, ymm14
vpaddd ymm0, ymm0, ymm1
vpaddd ymm8, ymm8, ymm9
vpxor ymm3, ymm3, ymm0
vpxor ymm11, ymm11, ymm8
vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
vpshufb ymm3, ymm3, ymm4
vpshufb ymm11, ymm11, ymm4
vpaddd ymm2, ymm2, ymm3
vpaddd ymm10, ymm10, ymm11
vpxor ymm1, ymm1, ymm2
vpxor ymm9, ymm9, ymm10
vpsrld ymm4, ymm1, 12
vpslld ymm1, ymm1, 20
vpor ymm1, ymm1, ymm4
vpsrld ymm4, ymm9, 12
vpslld ymm9, ymm9, 20
vpor ymm9, ymm9, ymm4
/* Diagonal step, second half (rotations by 8 and 7). */
vpaddd ymm0, ymm0, ymm7
vpaddd ymm8, ymm8, ymm15
vpaddd ymm0, ymm0, ymm1
vpaddd ymm8, ymm8, ymm9
vpxor ymm3, ymm3, ymm0
vpxor ymm11, ymm11, ymm8
vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
vpshufb ymm3, ymm3, ymm4
vpshufb ymm11, ymm11, ymm4
vpaddd ymm2, ymm2, ymm3
vpaddd ymm10, ymm10, ymm11
vpxor ymm1, ymm1, ymm2
vpxor ymm9, ymm9, ymm10
vpsrld ymm4, ymm1, 7
vpslld ymm1, ymm1, 25
vpor ymm1, ymm1, ymm4
vpsrld ymm4, ymm9, 7
vpslld ymm9, ymm9, 25
vpor ymm9, ymm9, ymm4
/* Undo the row rotation (0x39 reverses 0x93 and vice versa). */
vpshufd ymm0, ymm0, 0x39
vpshufd ymm8, ymm8, 0x39
vpshufd ymm3, ymm3, 0x4E
vpshufd ymm11, ymm11, 0x4E
vpshufd ymm2, ymm2, 0x93
vpshufd ymm10, ymm10, 0x93
/* Leave after the 7th round; the flags set by dec feed the branch. */
dec al
je 9f
/*
 * Rearrange the message vectors for the next round, first for inputs 0/1
 * (reloading the spilled ymm4/ymm5 contents), then for inputs 2/3.
 * NOTE(review): presumably the vectorized form of the per-round message
 * schedule - confirm against the reference implementation.
 */
vmovdqa ymm4, ymmword ptr [rsp+0x40]
vmovdqa ymm5, ymmword ptr [rsp+0x80]
vshufps ymm12, ymm4, ymm5, 214
vpshufd ymm13, ymm4, 0x0F
vpshufd ymm4, ymm12, 0x39
vshufps ymm12, ymm6, ymm7, 250
vpblendd ymm13, ymm13, ymm12, 0xAA
vpunpcklqdq ymm12, ymm7, ymm5
vpblendd ymm12, ymm12, ymm6, 0x88
vpshufd ymm12, ymm12, 0x78
vpunpckhdq ymm5, ymm5, ymm7
vpunpckldq ymm6, ymm6, ymm5
vpshufd ymm7, ymm6, 0x1E
/* Park the new ymm5/ymm6 values; ymm12/ymm13 were used as scratch above. */
vmovdqa ymmword ptr [rsp+0x40], ymm13
vmovdqa ymmword ptr [rsp+0x80], ymm12
vmovdqa ymm12, ymmword ptr [rsp+0x60]
vmovdqa ymm13, ymmword ptr [rsp+0xA0]
vshufps ymm5, ymm12, ymm13, 214
vpshufd ymm6, ymm12, 0x0F
vpshufd ymm12, ymm5, 0x39
vshufps ymm5, ymm14, ymm15, 250
vpblendd ymm6, ymm6, ymm5, 0xAA
vpunpcklqdq ymm5, ymm15, ymm13
vpblendd ymm5, ymm5, ymm14, 0x88
vpshufd ymm5, ymm5, 0x78
vpunpckhdq ymm13, ymm13, ymm15
vpunpckldq ymm14, ymm14, ymm13
vpshufd ymm15, ymm14, 0x1E
vmovdqa ymm13, ymm6
vmovdqa ymm14, ymm5
vmovdqa ymm5, ymmword ptr [rsp+0x40]
vmovdqa ymm6, ymmword ptr [rsp+0x80]
jmp 9b
/* Block finished: fold rows 2/3 into rows 0/1 to form the new chaining value. */
9:
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
vpxor ymm8, ymm8, ymm10
vpxor ymm9, ymm9, ymm11
/* Later blocks start from r13d only (the [rbp+0x40] byte applies to block 0). */
mov eax, r13d
/* More blocks remain while rdx has not reached r15. */
cmp rdx, r15
jne 2b
/* Store 32 bytes per input: low lanes first (inputs 0 and 2), then high lanes. */
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
vmovdqu xmmword ptr [rbx+0x40], xmm8
vmovdqu xmmword ptr [rbx+0x50], xmm9
vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
/*
 * Where the mask at [rsp+0x280] is set, move the counter words of lanes
 * 4-7 down into lanes 0-3 (low words at 0x240, high words at 0x260) so the
 * following tail groups read their counters from lane 0 upward.
 * NOTE(review): the mask is written outside this excerpt; presumably it
 * reflects the increment_counter argument - confirm.
 */
vmovaps xmm8, xmmword ptr [rsp+0x280]
vmovaps xmm0, xmmword ptr [rsp+0x240]
vmovaps xmm1, xmmword ptr [rsp+0x250]
vmovaps xmm2, xmmword ptr [rsp+0x260]
vmovaps xmm3, xmmword ptr [rsp+0x270]
vblendvps xmm0, xmm0, xmm1, xmm8
vblendvps xmm2, xmm2, xmm3, xmm8
vmovaps xmmword ptr [rsp+0x240], xmm0
vmovaps xmmword ptr [rsp+0x260], xmm2
/* Advance: 4 outputs of 32 bytes, 4 input pointers of 8 bytes, 4 inputs done. */
add rbx, 128
add rdi, 32
sub rsi, 4
/*
 * Tail group of two inputs, one per 128-bit lane of ymm0-ymm3.
 * Relies on rbx, r15, r13d and r12d as loaded at the start of the tail
 * handling (output pointer, block-offset bound, per-block flags and
 * last-block flags respectively).
 */
3:
/* Bit 1 of the remaining count clear: skip to the single-input check. */
test rsi, 0x2
je 3f
/* Rows 0 and 1 start as the 32 bytes at [rcx], copied to both lanes. */
vbroadcasti128 ymm0, xmmword ptr [rcx]
vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
/*
 * Row-3 template in ymm13: { counter_lo, counter_hi, 64, 0 } per lane,
 * taken from counter lane 0 (offsets 0x240/0x260) for the low lane and
 * counter lane 1 (offsets 0x244/0x264) for the high lane.
 */
vmovd xmm13, dword ptr [rsp+0x240]
vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovd xmm14, dword ptr [rsp+0x244]
vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vinserti128 ymm13, ymm13, xmm14, 0x01
/* Keep both rotation shuffle masks in registers for the whole group. */
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
/* Two input pointers from the pointer array at rdi. */
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
/* Flags for the first block: byte at [rbp+0x40] combined with r13d. */
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
/* rdx = byte offset of the current block. */
xor edx, edx
.p2align 5
/* Per-block loop. */
2:
/* Add r12d to the flags only when this is the last block (rdx + 64 == r15). */
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
mov dword ptr [rsp+0x200], eax
/* Row 2 = BLAKE3_IV; row 3 = template with the flags in dword 3 of each lane. */
vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
vpbroadcastd ymm8, dword ptr [rsp+0x200]
vpblendd ymm3, ymm13, ymm8, 0x88
/*
 * Load the 64-byte block of input 0 (r8, low lane) and input 1 (r9, high
 * lane); vshufps 136/221 split even- and odd-indexed words, vpshufd 0x93
 * rotates the second half by one dword.
 */
vmovups ymm8, ymmword ptr [r8+rdx-0x40]
vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
vmovups ymm9, ymmword ptr [r8+rdx-0x30]
vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
vshufps ymm4, ymm8, ymm9, 136
vshufps ymm5, ymm8, ymm9, 221
vmovups ymm8, ymmword ptr [r8+rdx-0x20]
vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
vmovups ymm9, ymmword ptr [r8+rdx-0x10]
vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
vshufps ymm6, ymm8, ymm9, 136
vshufps ymm7, ymm8, ymm9, 221
vpshufd ymm6, ymm6, 0x93
vpshufd ymm7, ymm7, 0x93
/* Round counter: 7 rounds (flags in eax were already stored above). */
mov al, 7
/* Round loop. */
9:
/* Column step: add message, mix rows, rotations by 16 and 12 ... */
vpaddd ymm0, ymm0, ymm4
vpaddd ymm0, ymm0, ymm1
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm14
vpaddd ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpsrld ymm8, ymm1, 12
vpslld ymm1, ymm1, 20
vpor ymm1, ymm1, ymm8
/* ... then rotations by 8 and 7. */
vpaddd ymm0, ymm0, ymm5
vpaddd ymm0, ymm0, ymm1
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm15
vpaddd ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpsrld ymm8, ymm1, 7
vpslld ymm1, ymm1, 25
vpor ymm1, ymm1, ymm8
/* Rotate rows 0, 3 and 2 so the diagonals line up as columns. */
vpshufd ymm0, ymm0, 0x93
vpshufd ymm3, ymm3, 0x4E
vpshufd ymm2, ymm2, 0x39
/* Diagonal step: rotations by 16 and 12 ... */
vpaddd ymm0, ymm0, ymm6
vpaddd ymm0, ymm0, ymm1
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm14
vpaddd ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpsrld ymm8, ymm1, 12
vpslld ymm1, ymm1, 20
vpor ymm1, ymm1, ymm8
/* ... then rotations by 8 and 7. */
vpaddd ymm0, ymm0, ymm7
vpaddd ymm0, ymm0, ymm1
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm15
vpaddd ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpsrld ymm8, ymm1, 7
vpslld ymm1, ymm1, 25
vpor ymm1, ymm1, ymm8
/* Undo the row rotation. */
vpshufd ymm0, ymm0, 0x39
vpshufd ymm3, ymm3, 0x4E
vpshufd ymm2, ymm2, 0x93
/* Leave after the 7th round. */
dec al
jz 9f
/*
 * Rearrange the message vectors ymm4-ymm7 for the next round, using
 * ymm8/ymm9 as scratch.
 * NOTE(review): presumably the vectorized form of the per-round message
 * schedule - confirm against the reference implementation.
 */
vshufps ymm8, ymm4, ymm5, 214
vpshufd ymm9, ymm4, 0x0F
vpshufd ymm4, ymm8, 0x39
vshufps ymm8, ymm6, ymm7, 250
vpblendd ymm9, ymm9, ymm8, 0xAA
vpunpcklqdq ymm8, ymm7, ymm5
vpblendd ymm8, ymm8, ymm6, 0x88
vpshufd ymm8, ymm8, 0x78
vpunpckhdq ymm5, ymm5, ymm7
vpunpckldq ymm6, ymm6, ymm5
vpshufd ymm7, ymm6, 0x1E
vmovdqa ymm5, ymm9
vmovdqa ymm6, ymm8
jmp 9b
/* Block finished: fold rows 2/3 into rows 0/1 to form the new chaining value. */
9:
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
/* Later blocks start from r13d only. */
mov eax, r13d
/* More blocks remain while rdx has not reached r15. */
cmp rdx, r15
jne 2b
/* Store 32 bytes per input: low lane for input 0, high lane for input 1. */
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
/*
 * Where the mask at [rsp+0x280] is set, shift the counter words down by
 * two lanes: the +8-byte unaligned loads supply lane i+2 for lane i.
 * NOTE(review): those 32-byte loads run 8 bytes past each 32-byte counter
 * array, so the top two lanes of the result are not meaningful; only dword
 * 0 is read by the single-input path that follows - confirm no other
 * reader exists.
 */
vmovaps ymm8, ymmword ptr [rsp+0x280]
vmovaps ymm0, ymmword ptr [rsp+0x240]
vmovups ymm1, ymmword ptr [rsp+0x248]
vmovaps ymm2, ymmword ptr [rsp+0x260]
vmovups ymm3, ymmword ptr [rsp+0x268]
vblendvps ymm0, ymm0, ymm1, ymm8
vblendvps ymm2, ymm2, ymm3, ymm8
vmovaps ymmword ptr [rsp+0x240], ymm0
vmovaps ymmword ptr [rsp+0x260], ymm2
/* Advance: 2 outputs of 32 bytes, 2 input pointers of 8 bytes, 2 inputs done. */
add rbx, 64
add rdi, 16
sub rsi, 2
/*
 * Final tail group: a single input, processed in XMM registers with one
 * state row per register (xmm0-xmm3).
 * Relies on rbx, r15, r13d and r12d as loaded at the start of the tail
 * handling (output pointer, block-offset bound, per-block flags and
 * last-block flags respectively).
 */
3:
/* Bit 0 of the remaining count clear: nothing left, go to the shared exit. */
test rsi, 0x1
je 4b
/* Rows 0 and 1 start as the 32 bytes at [rcx]. */
vmovdqu xmm0, xmmword ptr [rcx]
vmovdqu xmm1, xmmword ptr [rcx+0x10]
/* Row-3 template in xmm13: { counter_lo, counter_hi, 64, 0 } from counter lane 0. */
vmovd xmm3, dword ptr [rsp+0x240]
vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
/* Rotation shuffle masks; these aligned loads need 16-byte aligned tables. */
vmovdqa xmm14, xmmword ptr [ROT16+rip]
vmovdqa xmm15, xmmword ptr [ROT8+rip]
/* The one remaining input pointer. */
mov r8, qword ptr [rdi]
/* Flags for the first block: byte at [rbp+0x40] combined with r13d. */
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
/* rdx = byte offset of the current block. */
xor edx, edx
.p2align 5
/* Per-block loop. */
2:
/* Add r12d to the flags only when this is the last block (rdx + 64 == r15). */
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
/* Row 2 = BLAKE3_IV; row 3 = template with the flags inserted as dword 3. */
vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
vmovdqa xmm3, xmm13
vpinsrd xmm3, xmm3, eax, 3
/*
 * Load the 64-byte block; vshufps 136/221 split even- and odd-indexed
 * words, vpshufd 0x93 rotates the second half by one dword.
 */
vmovups xmm8, xmmword ptr [r8+rdx-0x40]
vmovups xmm9, xmmword ptr [r8+rdx-0x30]
vshufps xmm4, xmm8, xmm9, 136
vshufps xmm5, xmm8, xmm9, 221
vmovups xmm8, xmmword ptr [r8+rdx-0x20]
vmovups xmm9, xmmword ptr [r8+rdx-0x10]
vshufps xmm6, xmm8, xmm9, 136
vshufps xmm7, xmm8, xmm9, 221
vpshufd xmm6, xmm6, 0x93
vpshufd xmm7, xmm7, 0x93
/* Round counter: 7 rounds (the flags in eax were already inserted into xmm3). */
mov al, 7
/* Round loop. */
9:
/* Column step: add message, mix rows, rotations by 16 and 12 ... */
vpaddd xmm0, xmm0, xmm4
vpaddd xmm0, xmm0, xmm1
vpxor xmm3, xmm3, xmm0
vpshufb xmm3, xmm3, xmm14
vpaddd xmm2, xmm2, xmm3
vpxor xmm1, xmm1, xmm2
vpsrld xmm8, xmm1, 12
vpslld xmm1, xmm1, 20
vpor xmm1, xmm1, xmm8
/* ... then rotations by 8 and 7. */
vpaddd xmm0, xmm0, xmm5
vpaddd xmm0, xmm0, xmm1
vpxor xmm3, xmm3, xmm0
vpshufb xmm3, xmm3, xmm15
vpaddd xmm2, xmm2, xmm3
vpxor xmm1, xmm1, xmm2
vpsrld xmm8, xmm1, 7
vpslld xmm1, xmm1, 25
vpor xmm1, xmm1, xmm8
/* Rotate rows 0, 3 and 2 so the diagonals line up as columns. */
vpshufd xmm0, xmm0, 0x93
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x39
/* Diagonal step: rotations by 16 and 12 ... */
vpaddd xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm1
vpxor xmm3, xmm3, xmm0
vpshufb xmm3, xmm3, xmm14
vpaddd xmm2, xmm2, xmm3
vpxor xmm1, xmm1, xmm2
vpsrld xmm8, xmm1, 12
vpslld xmm1, xmm1, 20
vpor xmm1, xmm1, xmm8
/* ... then rotations by 8 and 7. */
vpaddd xmm0, xmm0, xmm7
vpaddd xmm0, xmm0, xmm1
vpxor xmm3, xmm3, xmm0
vpshufb xmm3, xmm3, xmm15
vpaddd xmm2, xmm2, xmm3
vpxor xmm1, xmm1, xmm2
vpsrld xmm8, xmm1, 7
vpslld xmm1, xmm1, 25
vpor xmm1, xmm1, xmm8
/* Undo the row rotation. */
vpshufd xmm0, xmm0, 0x39
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x93
/* Leave after the 7th round. */
dec al
jz 9f
/*
 * Rearrange the message vectors xmm4-xmm7 for the next round, using
 * xmm8/xmm9 as scratch.
 * NOTE(review): presumably the vectorized form of the per-round message
 * schedule - confirm against the reference implementation.
 */
vshufps xmm8, xmm4, xmm5, 214
vpshufd xmm9, xmm4, 0x0F
vpshufd xmm4, xmm8, 0x39
vshufps xmm8, xmm6, xmm7, 250
vpblendd xmm9, xmm9, xmm8, 0xAA
vpunpcklqdq xmm8, xmm7, xmm5
vpblendd xmm8, xmm8, xmm6, 0x88
vpshufd xmm8, xmm8, 0x78
vpunpckhdq xmm5, xmm5, xmm7
vpunpckldq xmm6, xmm6, xmm5
vpshufd xmm7, xmm6, 0x1E
vmovdqa xmm5, xmm9
vmovdqa xmm6, xmm8
jmp 9b
/* Block finished: fold rows 2/3 into rows 0/1 to form the new chaining value. */
9:
vpxor xmm0, xmm0, xmm2
vpxor xmm1, xmm1, xmm3
/* Later blocks start from r13d only. */
mov eax, r13d
/* More blocks remain while rdx has not reached r15. */
cmp rdx, r15
jne 2b
/* Store the 32-byte result and leave through the shared exit. */
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
jmp 4b
/*
 * Read-only constant tables used by the code above.  Apple targets use
 * the .static_data directive; everything else uses .rodata.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
/*
 * 64-byte alignment for the first table.  Every table is a multiple of
 * 16 bytes long, so each label stays 16-byte aligned; the aligned loads
 * (vmovdqa) of ROT16, ROT8 and BLAKE3_IV depend on that.  Keep this in
 * mind when inserting or resizing entries.
 */
.p2align 6
/*
 * Eight dwords 0..7.
 * NOTE(review): presumably per-lane counter offsets; the use site is
 * outside this excerpt - confirm.
 */
ADD0:
.long 0, 1, 2, 3, 4, 5, 6, 7
/*
 * Eight dwords of 8.
 * NOTE(review): presumably the counter stride per batch of eight inputs;
 * the use site is outside this excerpt - confirm.
 */
ADD1:
.long 8, 8, 8, 8, 8, 8, 8, 8
/* BLAKE3_IV_0..3: each of the four BLAKE3_IV words repeated across eight dword lanes. */
BLAKE3_IV_0:
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* Block length in bytes (0x40 = 64) repeated across eight dword lanes. */
BLAKE3_BLOCK_LEN:
.long 0x00000040, 0x00000040, 0x00000040, 0x00000040
.long 0x00000040, 0x00000040, 0x00000040, 0x00000040
/* vpshufb mask: rotates every 32-bit element by 16 bits. */
ROT16:
.byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* vpshufb mask: rotates every 32-bit element right by 8 bits. */
ROT8:
.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/*
 * Sign-bit mask.  XOR-ing both operands with it lets the signed vpcmpgtd
 * act as an unsigned compare when detecting carry out of the low counter
 * words.
 */
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/* Four 32-bit initialization words, loaded as state row 2 by the tail paths. */
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
# Assembly sources for the x86-64 Unix build: AVX2, AVX-512, SSE2 and SSE4.1
# variants, listed by bare file name.
ASM = blake3_avx2_x86-64_unix.S
ASM += blake3_avx512_x86-64_unix.S
ASM += blake3_sse2_x86-64_unix.S
ASM += blake3_sse41_x86-64_unix.S
# Same four sources under vendor/blake3/.
# NOTE(review): this plain "=" assignment replaces the list built above, so
# if both groups really live in one makefile only the vendor/ paths take
# effect. They may instead be the old and new versions of the same list -
# confirm which one is intended.
ASM = vendor/blake3/blake3_avx2_x86-64_unix.S
ASM += vendor/blake3/blake3_avx512_x86-64_unix.S
ASM += vendor/blake3/blake3_sse2_x86-64_unix.S
ASM += vendor/blake3/blake3_sse41_x86-64_unix.S