move blake3 files to vendor/blake3

laumann
Sep 2, 2022, 3:06 PM
2C2EF2GKTOAT7QI56KKDFGRGSFZRDPKUASNQFB6XQDROAEPPZW4AC

Dependencies

  • [2] VKLGQREY change: add base32 decode, initial deconstruction of hashed
  • [3] FB67XX5E add argument parsing setup
  • [4] RRUEU4S3 init
  • [5] 3OHR6ZPH make: prettify output
  • [6] RIWSVVAS change: decompress the 'contents' with zstd_seekable
  • [7] X36ICMJN Initial import for blake3
  • [8] B3XLVPNC Add ani.c and initial Makefile
  • [9] Q7TKZCJP Add initial support for reading the offsets from a (fixed) change
  • [10] Y26WT3ZF change: decode message, description and timestamp
  • [11] 3NA345CN Add zstd_seekable + many headers
  • [12] PEUS54XQ

Change contents

  • file deletion: blake3_portable.c (----------)
    [3.1][3.0:41](),[3.41][3.42:42]()
    #include "blake3_impl.h"
    #include <string.h>
    // Rotate the 32-bit word `w` right by `c` bit positions. The callers in this
    // file only pass the constants 16, 12, 8 and 7.
    INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
      const uint32_t shifted_down = w >> c;
      const uint32_t wrapped_up = w << (32 - c);
      return shifted_down | wrapped_up;
    }
    // Mixing function: stirs the four state words at indices a, b, c and d
    // together with the two message words x and y. All additions wrap modulo
    // 2^32 because the operands are uint32_t.
    INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
                  uint32_t x, uint32_t y) {
      // First half: fold in x, rotating by 16 and then by 12.
      state[a] += state[b] + x;
      state[d] = rotr32(state[a] ^ state[d], 16);
      state[c] += state[d];
      state[b] = rotr32(state[c] ^ state[b], 12);
      // Second half: fold in y, rotating by 8 and then by 7.
      state[a] += state[b] + y;
      state[d] = rotr32(state[a] ^ state[d], 8);
      state[c] += state[d];
      state[b] = rotr32(state[c] ^ state[b], 7);
    }
    // Run one round over the 16-word state. The row MSG_SCHEDULE[round] decides
    // which message words feed each of the eight g() calls: entries 0..7 serve
    // the first four calls, entries 8..15 the last four.
    INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
      const uint8_t *order = MSG_SCHEDULE[round];
      // First step: lane i mixes state words i, 4 + i, 8 + i and 12 + i.
      for (size_t i = 0; i < 4; i++) {
        g(state, i, 4 + i, 8 + i, 12 + i, msg[order[2 * i]],
          msg[order[2 * i + 1]]);
      }
      // Second step: each later group of four is rotated one position further,
      // giving (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).
      for (size_t i = 0; i < 4; i++) {
        g(state, i, 4 + (i + 1) % 4, 8 + (i + 2) % 4, 12 + (i + 3) % 4,
          msg[order[8 + 2 * i]], msg[order[8 + 2 * i + 1]]);
      }
    }
    // Build the 16-word working state for one block and run all seven rounds
    // over it. The caller applies the final XOR step itself.
    //   state     - output: the mixed 16-word state
    //   cv        - input chaining value, copied into state[0..7]
    //   block     - BLAKE3_BLOCK_LEN bytes, decoded with load32 into 16 words
    //   block_len - stored in state[14]
    //   counter   - split into low/high halves in state[12] and state[13]
    //   flags     - stored in state[15]
    INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
                             uint8_t block_len, uint64_t counter, uint8_t flags) {
      // Decode the block into sixteen 32-bit message words.
      uint32_t block_words[16];
      for (size_t i = 0; i < 16; i++) {
        block_words[i] = load32(block + 4 * i);
      }

      // Words 0..7: chaining value. Words 8..11: first four IV constants.
      for (size_t i = 0; i < 8; i++) {
        state[i] = cv[i];
      }
      for (size_t i = 0; i < 4; i++) {
        state[8 + i] = IV[i];
      }
      // Words 12..15: counter halves, block length and flags.
      state[12] = counter_low(counter);
      state[13] = counter_high(counter);
      state[14] = (uint32_t)block_len;
      state[15] = (uint32_t)flags;

      // Seven rounds, one per MSG_SCHEDULE row.
      for (size_t round = 0; round < 7; round++) {
        round_fn(state, &block_words[0], round);
      }
    }
    // Portable compression that overwrites `cv` with the new chaining value:
    // after compress_pre, word i of the result is state[i] XOR state[i + 8].
    void blake3_compress_in_place_portable(uint32_t cv[8],
                                           const uint8_t block[BLAKE3_BLOCK_LEN],
                                           uint8_t block_len, uint64_t counter,
                                           uint8_t flags) {
      uint32_t state[16];
      compress_pre(state, cv, block, block_len, counter, flags);
      for (size_t i = 0; i < 8; i++) {
        cv[i] = state[i] ^ state[i + 8];
      }
    }
    // Portable compression producing the full 64-byte output block. `cv` is
    // left unmodified. Output words 0..7 are state[i] XOR state[i + 8]; output
    // words 8..15 are state[i + 8] XOR cv[i]. Each word is written with store32.
    void blake3_compress_xof_portable(const uint32_t cv[8],
                                      const uint8_t block[BLAKE3_BLOCK_LEN],
                                      uint8_t block_len, uint64_t counter,
                                      uint8_t flags, uint8_t out[64]) {
      uint32_t state[16];
      compress_pre(state, cv, block, block_len, counter, flags);
      // Lower half of the output.
      for (size_t i = 0; i < 8; i++) {
        store32(&out[i * 4], state[i] ^ state[i + 8]);
      }
      // Upper half of the output, feeding the input chaining value forward.
      for (size_t i = 0; i < 8; i++) {
        store32(&out[(8 + i) * 4], state[8 + i] ^ cv[i]);
      }
    }
    // Hash `blocks` consecutive BLAKE3_BLOCK_LEN-byte blocks of `input` into a
    // single chaining value, starting from `key`, and write it to `out` as
    // bytes. `flags_start` is OR-ed into the first block's flags and
    // `flags_end` into the last block's flags; with a single block both apply.
    INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
                                  const uint32_t key[8], uint64_t counter,
                                  uint8_t flags, uint8_t flags_start,
                                  uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
      uint32_t cv[8];
      memcpy(cv, key, BLAKE3_KEY_LEN);

      uint8_t block_flags = flags | flags_start;
      for (; blocks > 0; blocks--) {
        if (blocks == 1) {
          block_flags |= flags_end;
        }
        blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
                                          block_flags);
        input += BLAKE3_BLOCK_LEN;
        // Only the first block carries the start flag.
        block_flags = flags;
      }
      store_cv_words(out, cv);
    }
    // Portable hash_many: hash each of the `num_inputs` inputs (every one
    // `blocks` blocks long) one after another. Result i is written at
    // out + i * BLAKE3_OUT_LEN. When `increment_counter` is true the counter
    // grows by one per input; otherwise every input uses the same counter.
    void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                                   size_t blocks, const uint32_t key[8],
                                   uint64_t counter, bool increment_counter,
                                   uint8_t flags, uint8_t flags_start,
                                   uint8_t flags_end, uint8_t *out) {
      for (size_t i = 0; i < num_inputs; i++) {
        hash_one_portable(inputs[i], blocks, key, counter, flags, flags_start,
                          flags_end, out + i * BLAKE3_OUT_LEN);
        if (increment_counter) {
          counter += 1;
        }
      }
    }
  • file deletion: blake3_impl.h (----------)
    [3.1][3.5933:5970](),[3.5970][3.5971:5971]()
    #ifndef BLAKE3_IMPL_H
    #define BLAKE3_IMPL_H
    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>
    #include "blake3.h"
    // Internal flag bits. Every enumerator occupies its own bit, so values can
    // be combined with bitwise OR and still fit in a uint8_t.
    enum blake3_flags {
      CHUNK_START = 0x01,
      CHUNK_END = 0x02,
      PARENT = 0x04,
      ROOT = 0x08,
      KEYED_HASH = 0x10,
      DERIVE_KEY_CONTEXT = 0x20,
      DERIVE_KEY_MATERIAL = 0x40,
    };
    // This C implementation tries to support recent versions of GCC, Clang, and
    // MSVC.
    #if defined(_MSC_VER)
    #define INLINE static __forceinline
    #else
    #define INLINE static inline __attribute__((always_inline))
    #endif
    #if defined(__x86_64__) || defined(_M_X64)
    #define IS_X86
    #define IS_X86_64
    #endif
    #if defined(__i386__) || defined(_M_IX86)
    #define IS_X86
    #define IS_X86_32
    #endif
    #if defined(__aarch64__) || defined(_M_ARM64)
    #define IS_AARCH64
    #endif
    #if defined(IS_X86)
    #if defined(_MSC_VER)
    #include <intrin.h>
    #endif
    #include <immintrin.h>
    #endif
    #if !defined(BLAKE3_USE_NEON)
    // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
    #if defined(IS_AARCH64)
    #define BLAKE3_USE_NEON 1
    #else
    #define BLAKE3_USE_NEON 0
    #endif
    #endif
    #if defined(IS_X86)
    #define MAX_SIMD_DEGREE 16
    #elif BLAKE3_USE_NEON == 1
    #define MAX_SIMD_DEGREE 4
    #else
    #define MAX_SIMD_DEGREE 1
    #endif
    // There are some places where we want a static size that's equal to the
    // MAX_SIMD_DEGREE, but also at least 2.
    #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
    // Eight 32-bit initialization constants. compress_pre (blake3_portable.c)
    // loads IV[0..3] into state words 8..11.
    static const uint32_t IV[8] = {
      [0] = 0x6A09E667UL,
      [1] = 0xBB67AE85UL,
      [2] = 0x3C6EF372UL,
      [3] = 0xA54FF53AUL,
      [4] = 0x510E527FUL,
      [5] = 0x9B05688CUL,
      [6] = 0x1F83D9ABUL,
      [7] = 0x5BE0CD19UL,
    };
    // Message word ordering, one row per round. round_fn (blake3_portable.c)
    // selects row `round` and uses its 16 entries, two at a time, as indices
    // into the 16-word message block. There are seven rows, matching the seven
    // round_fn calls made by compress_pre.
    static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
    };
    /* Return the zero-based position of the most significant set bit of x. */
    /* The caller must pass a nonzero x. */
    static unsigned int highest_one(uint64_t x) {
    #if defined(__GNUC__) || defined(__clang__)
      /* clz is in 0..63 for nonzero x, so subtracting from 63 gives the index. */
      return 63 - __builtin_clzll(x);
    #elif defined(_MSC_VER) && defined(IS_X86_64)
      unsigned long bit_index;
      _BitScanReverse64(&bit_index, x);
      return bit_index;
    #elif defined(_MSC_VER) && defined(IS_X86_32)
      /* Only 32-bit scans exist here: try the upper half first. */
      unsigned long bit_index;
      const unsigned long upper_half = (unsigned long)(x >> 32);
      if (upper_half) {
        _BitScanReverse(&bit_index, upper_half);
        return 32 + bit_index;
      }
      _BitScanReverse(&bit_index, (unsigned long)x);
      return bit_index;
    #else
      /* Binary search: halve the window each step, keeping the upper part */
      /* whenever it contains a set bit. */
      unsigned int position = 0;
      for (unsigned int width = 32; width != 0; width >>= 1) {
        if ((x >> width) != 0) {
          x >>= width;
          position += width;
        }
      }
      return position;
    #endif
    }
    // Return how many bits of x are set.
    INLINE unsigned int popcnt(uint64_t x) {
    #if defined(__GNUC__) || defined(__clang__)
      return __builtin_popcountll(x);
    #else
      // Each `x &= x - 1` clears the lowest set bit, so the loop runs once per
      // set bit.
      unsigned int ones = 0;
      for (; x != 0; x &= x - 1) {
        ones += 1;
      }
      return ones;
    #endif
    }
    // Return the largest power of two that does not exceed x. x == 0 yields 1,
    // because bit 0 is forced on before the highest set bit is located.
    INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
      const unsigned int top_bit = highest_one(x | 1);
      return 1ULL << top_bit;
    }
    // Return the low 32 bits of the 64-bit counter.
    INLINE uint32_t counter_low(uint64_t counter) {
      return (uint32_t)(counter & 0xFFFFFFFFULL);
    }
    // Return the high 32 bits of the 64-bit counter.
    INLINE uint32_t counter_high(uint64_t counter) {
      const uint64_t upper = counter >> 32;
      return (uint32_t)upper;
    }
    // Read four bytes at `src` as a little-endian 32-bit word (byte 0 is the
    // least significant). Works byte by byte, so `src` need not be aligned.
    INLINE uint32_t load32(const void *src) {
      const uint8_t *bytes = (const uint8_t *)src;
      uint32_t word = 0;
      // Accumulate from the most significant byte down to the least.
      for (int i = 3; i >= 0; i--) {
        word = (word << 8) | (uint32_t)bytes[i];
      }
      return word;
    }
    // Decode the BLAKE3_KEY_LEN-byte key into eight 32-bit words using load32.
    INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
                               uint32_t key_words[8]) {
      for (size_t i = 0; i < 8; i++) {
        key_words[i] = load32(&key[i * 4]);
      }
    }
    // Write the 32-bit word `w` to `dst` as four little-endian bytes (least
    // significant byte first). Works byte by byte, so `dst` need not be aligned.
    INLINE void store32(void *dst, uint32_t w) {
      uint8_t *bytes = (uint8_t *)dst;
      for (unsigned int i = 0; i < 4; i++) {
        bytes[i] = (uint8_t)(w >> (8 * i));
      }
    }
    // Serialize an 8-word chaining value into 32 bytes, each word written
    // little-endian by store32.
    //   bytes_out - destination, 32 bytes
    //   cv_words  - source words; only read, hence const-qualified (existing
    //               callers passing non-const arrays are unaffected)
    INLINE void store_cv_words(uint8_t bytes_out[32], const uint32_t cv_words[8]) {
      for (size_t i = 0; i < 8; i++) {
        store32(&bytes_out[i * 4], cv_words[i]);
      }
    }
    void blake3_compress_in_place(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags);
    void blake3_compress_xof(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags,
    uint8_t out[64]);
    void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8], uint64_t counter,
    bool increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
    size_t blake3_simd_degree(void);
    // Declarations for implementation-specific functions.
    void blake3_compress_in_place_portable(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags);
    void blake3_compress_xof_portable(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags, uint8_t out[64]);
    void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8],
    uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start,
    uint8_t flags_end, uint8_t *out);
    #if defined(IS_X86)
    #if !defined(BLAKE3_NO_SSE2)
    void blake3_compress_in_place_sse2(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags);
    void blake3_compress_xof_sse2(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags, uint8_t out[64]);
    void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8],
    uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start,
    uint8_t flags_end, uint8_t *out);
    #endif
    #if !defined(BLAKE3_NO_SSE41)
    void blake3_compress_in_place_sse41(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags);
    void blake3_compress_xof_sse41(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags, uint8_t out[64]);
    void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8],
    uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start,
    uint8_t flags_end, uint8_t *out);
    #endif
    #if !defined(BLAKE3_NO_AVX2)
    void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8],
    uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start,
    uint8_t flags_end, uint8_t *out);
    #endif
    #if !defined(BLAKE3_NO_AVX512)
    void blake3_compress_in_place_avx512(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags);
    void blake3_compress_xof_avx512(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags, uint8_t out[64]);
    void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8],
    uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start,
    uint8_t flags_end, uint8_t *out);
    #endif
    #endif
    #if BLAKE3_USE_NEON == 1
    void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8],
    uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start,
    uint8_t flags_end, uint8_t *out);
    #endif
    #endif /* BLAKE3_IMPL_H */
  • file deletion: blake3_dispatch.c (----------)
    [3.1][3.16192:16233](),[3.16233][3.16234:16234]()
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include "blake3_impl.h"
    #if defined(IS_X86)
    #if defined(_MSC_VER)
    #include <intrin.h>
    #elif defined(__GNUC__)
    #include <immintrin.h>
    #else
    #error "Unimplemented!"
    #endif
    #endif
    #define MAYBE_UNUSED(x) (void)((x))
    #if defined(IS_X86)
    // Execute XGETBV with register index 0 and return the 64-bit result.
    // The MSVC path uses the _xgetbv(0) intrinsic; the other path issues the
    // instruction with ecx = 0 and combines edx (high half) with eax (low half).
    // get_cpu_features tests bits of this value before trusting AVX/AVX-512.
    static uint64_t xgetbv(void) {
    #if defined(_MSC_VER)
    return _xgetbv(0);
    #else
    uint32_t eax = 0, edx = 0;
    __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
    return ((uint64_t)edx << 32) | eax;
    #endif
    }
    // Run CPUID for leaf `id` and store eax, ebx, ecx, edx in out[0..3].
    static void cpuid(uint32_t out[4], uint32_t id) {
    #if defined(_MSC_VER)
    __cpuid((int *)out, id);
    #elif defined(__i386__) || defined(_M_IX86)
    // 32-bit x86: ebx is not named as an output. Its value is copied into the
    // operand register before CPUID and swapped back afterwards, so the operand
    // ends up with CPUID's ebx and ebx itself is restored.
    // NOTE(review): presumably because ebx can be reserved (e.g. as the PIC
    // register) on this target -- confirm.
    __asm__ __volatile__("movl %%ebx, %1\n"
    "cpuid\n"
    "xchgl %1, %%ebx\n"
    : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
    : "a"(id));
    #else
    // Other targets: bind all four registers directly as outputs.
    __asm__ __volatile__("cpuid\n"
    : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
    : "a"(id));
    #endif
    }
    // Run CPUID for leaf `id` with sub-leaf `sid` (passed in ecx) and store
    // eax, ebx, ecx, edx in out[0..3].
    static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
    #if defined(_MSC_VER)
    __cpuidex((int *)out, id, sid);
    #elif defined(__i386__) || defined(_M_IX86)
    // 32-bit x86: same save/swap of ebx around the instruction as in cpuid().
    __asm__ __volatile__("movl %%ebx, %1\n"
    "cpuid\n"
    "xchgl %1, %%ebx\n"
    : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
    : "a"(id), "c"(sid));
    #else
    __asm__ __volatile__("cpuid\n"
    : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
    : "a"(id), "c"(sid));
    #endif
    }
    #endif
    // Bit mask of detected CPU features. One bit per instruction-set extension;
    // UNDEFINED marks the mask as "not probed yet".
    enum cpu_feature {
      SSE2 = 0x01,
      SSSE3 = 0x02,
      SSE41 = 0x04,
      AVX = 0x08,
      AVX2 = 0x10,
      AVX512F = 0x20,
      AVX512VL = 0x40,
      /* ... */
      UNDEFINED = 0x40000000
    };
    // Cached result of CPU feature detection. Starts as UNDEFINED and is
    // overwritten by get_cpu_features after its first successful x86 probe.
    // File-local unless BLAKE3_TESTING is defined, in which case it has external
    // linkage so tests can set it directly.
    #if !defined(BLAKE3_TESTING)
    static /* Allow the variable to be controlled manually for testing */
    #endif
    enum cpu_feature g_cpu_features = UNDEFINED;
    // Return the bit mask of usable SIMD features (see enum cpu_feature).
    //
    // On x86 the mask is probed once with CPUID/XGETBV and cached in
    // g_cpu_features; later calls return the cached value. On other targets the
    // function returns 0 and leaves the cache untouched.
    //
    // File-local unless BLAKE3_TESTING is defined.
    #if !defined(BLAKE3_TESTING)
    static
    #endif
    enum cpu_feature
    get_cpu_features(void) {
      if (g_cpu_features != UNDEFINED) {
        return g_cpu_features;
      } else {
    #if defined(IS_X86)
        uint32_t regs[4] = {0};
        uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
        // edx is only read on 32-bit builds; silence the unused warning on x86-64.
        (void)edx;
        enum cpu_feature features = 0;
        // Leaf 0: eax holds the highest supported standard leaf.
        cpuid(regs, 0);
        const int max_id = *eax;
        // Leaf 1: basic feature bits in ecx/edx.
        cpuid(regs, 1);
    #if defined(__amd64__) || defined(_M_X64)
        // x86-64 builds take SSE2 for granted.
        features |= SSE2;
    #else
        if (*edx & (1UL << 26))
          features |= SSE2;
    #endif
        // CPUID.01H:ECX bit 9 is SSSE3. (Bit 0, tested previously, is SSE3.)
        if (*ecx & (1UL << 9))
          features |= SSSE3;
        if (*ecx & (1UL << 19))
          features |= SSE41;
        if (*ecx & (1UL << 27)) { // OSXSAVE
          // The OS must have enabled the register state before AVX is usable.
          const uint64_t mask = xgetbv();
          if ((mask & 6) == 6) { // SSE and AVX states
            if (*ecx & (1UL << 28))
              features |= AVX;
            if (max_id >= 7) {
              // Leaf 7, sub-leaf 0: extended feature bits in ebx.
              cpuidex(regs, 7, 0);
              if (*ebx & (1UL << 5))
                features |= AVX2;
              if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
                if (*ebx & (1UL << 31))
                  features |= AVX512VL;
                if (*ebx & (1UL << 16))
                  features |= AVX512F;
              }
            }
          }
        }
        g_cpu_features = features;
        return features;
    #else
        /* How to detect NEON? */
        return 0;
    #endif
      }
    }
    // Runtime dispatcher for in-place compression. On x86 it calls the first
    // implementation whose feature bit is set, in the order AVX-512 (AVX512VL),
    // SSE4.1, SSE2; each candidate can be compiled out with the matching
    // BLAKE3_NO_* macro. If none applies, or on non-x86 targets, it falls back
    // to the portable implementation. Arguments are forwarded unchanged.
    void blake3_compress_in_place(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter,
    uint8_t flags) {
    #if defined(IS_X86)
    const enum cpu_feature features = get_cpu_features();
    // Keeps the build warning-free when every BLAKE3_NO_* macro is defined.
    MAYBE_UNUSED(features);
    #if !defined(BLAKE3_NO_AVX512)
    if (features & AVX512VL) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE41)
    if (features & SSE41) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE2)
    if (features & SSE2) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
    }
    #endif
    #endif
    blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
    }
    // Runtime dispatcher for the 64-byte-output compression. Same selection
    // order as blake3_compress_in_place: AVX-512 (AVX512VL), SSE4.1, SSE2 on
    // x86, each removable with its BLAKE3_NO_* macro, then the portable
    // fallback. Arguments are forwarded unchanged.
    void blake3_compress_xof(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags,
    uint8_t out[64]) {
    #if defined(IS_X86)
    const enum cpu_feature features = get_cpu_features();
    // Keeps the build warning-free when every BLAKE3_NO_* macro is defined.
    MAYBE_UNUSED(features);
    #if !defined(BLAKE3_NO_AVX512)
    if (features & AVX512VL) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE41)
    if (features & SSE41) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE2)
    if (features & SSE2) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
    }
    #endif
    #endif
    blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
    }
    // Runtime dispatcher for hashing several equally sized inputs. On x86 it
    // tries, in order: AVX-512 (needs both AVX512F and AVX512VL), AVX2, SSE4.1,
    // SSE2; each can be compiled out with its BLAKE3_NO_* macro. When
    // BLAKE3_USE_NEON == 1 the NEON implementation is called unconditionally.
    // Otherwise the portable implementation runs. Arguments are forwarded
    // unchanged.
    void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8], uint64_t counter,
    bool increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
    #if defined(IS_X86)
    const enum cpu_feature features = get_cpu_features();
    // Keeps the build warning-free when every BLAKE3_NO_* macro is defined.
    MAYBE_UNUSED(features);
    #if !defined(BLAKE3_NO_AVX512)
    if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
    increment_counter, flags, flags_start, flags_end,
    out);
    return;
    }
    #endif
    #if !defined(BLAKE3_NO_AVX2)
    if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
    increment_counter, flags, flags_start, flags_end,
    out);
    return;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE41)
    if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
    increment_counter, flags, flags_start, flags_end,
    out);
    return;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE2)
    if (features & SSE2) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
    increment_counter, flags, flags_start, flags_end,
    out);
    return;
    }
    #endif
    #endif
    // With NEON enabled this branch always returns, so the portable call below
    // is unreachable in that configuration.
    #if BLAKE3_USE_NEON == 1
    blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
    increment_counter, flags, flags_start, flags_end, out);
    return;
    #endif
    blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
    increment_counter, flags, flags_start, flags_end,
    out);
    }
    // The dynamically detected SIMD degree of the current platform.
    // Returns 16 for AVX-512 (AVX512F and AVX512VL both present), 8 for AVX2,
    // 4 for SSE4.1 or SSE2, 4 when BLAKE3_USE_NEON == 1, and 1 otherwise. The
    // x86 checks follow the same priority order and BLAKE3_NO_* gating as
    // blake3_hash_many.
    size_t blake3_simd_degree(void) {
    #if defined(IS_X86)
    const enum cpu_feature features = get_cpu_features();
    // Keeps the build warning-free when every BLAKE3_NO_* macro is defined.
    MAYBE_UNUSED(features);
    #if !defined(BLAKE3_NO_AVX512)
    if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
    return 16;
    }
    #endif
    #if !defined(BLAKE3_NO_AVX2)
    if (features & AVX2) {
    return 8;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE41)
    if (features & SSE41) {
    return 4;
    }
    #endif
    #if !defined(BLAKE3_NO_SSE2)
    if (features & SSE2) {
    return 4;
    }
    #endif
    #endif
    #if BLAKE3_USE_NEON == 1
    return 4;
    #endif
    return 1;
    }
  • file deletion: blake3.h (----------)
    [3.1][3.23773:23805](),[3.23805][3.23806:23806]()
    #ifndef BLAKE3_H
    #define BLAKE3_H
    #include <stddef.h>
    #include <stdint.h>
    #ifdef __cplusplus
    extern "C" {
    #endif
    #define BLAKE3_VERSION_STRING "1.3.1"
    #define BLAKE3_KEY_LEN 32
    #define BLAKE3_OUT_LEN 32
    #define BLAKE3_BLOCK_LEN 64
    #define BLAKE3_CHUNK_LEN 1024
    #define BLAKE3_MAX_DEPTH 54
    // This struct is a private implementation detail. It has to be here because
    // it's part of blake3_hasher below.
    // It tracks the progress of hashing a single chunk (see the chunk_state_*
    // helpers in blake3.c).
    typedef struct {
    // Running chaining value; initialized from the key and updated by each
    // block compression.
    uint32_t cv[8];
    // Counter value passed to every compression of this chunk.
    uint64_t chunk_counter;
    // Holds the bytes of the block that has not been compressed yet.
    uint8_t buf[BLAKE3_BLOCK_LEN];
    // Number of valid bytes currently in buf.
    uint8_t buf_len;
    // Number of blocks of this chunk compressed so far; zero means the next
    // compression carries CHUNK_START.
    uint8_t blocks_compressed;
    // Base flags OR-ed into every compression of this chunk.
    uint8_t flags;
    } blake3_chunk_state;
    // Hasher state handed to every blake3_hasher_* function declared below.
    typedef struct {
    // Eight key words. NOTE(review): presumably the IV for unkeyed hashing --
    // confirm against the init functions in blake3.c.
    uint32_t key[8];
    // State of the chunk currently being filled.
    blake3_chunk_state chunk;
    // NOTE(review): presumably the number of chaining values currently held in
    // cv_stack -- confirm against blake3.c.
    uint8_t cv_stack_len;
    // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
    // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
    // requires a 4th entry, rather than merging everything down to 1, because we
    // don't know whether more input is coming. This is different from how the
    // reference implementation does things.
    // Each entry occupies BLAKE3_OUT_LEN bytes.
    uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
    } blake3_hasher;
    const char *blake3_version(void);
    void blake3_hasher_init(blake3_hasher *self);
    void blake3_hasher_init_keyed(blake3_hasher *self,
    const uint8_t key[BLAKE3_KEY_LEN]);
    void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
    void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
    size_t context_len);
    void blake3_hasher_update(blake3_hasher *self, const void *input,
    size_t input_len);
    void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
    size_t out_len);
    void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
    uint8_t *out, size_t out_len);
    void blake3_hasher_reset(blake3_hasher *self);
    #ifdef __cplusplus
    }
    #endif
    #endif /* BLAKE3_H */
  • file deletion: blake3.c (----------)
    [3.1][3.25793:25825](),[3.25825][3.25826:25826]()
    #include <assert.h>
    #include <stdbool.h>
    #include <string.h>
    #include "blake3.h"
    #include "blake3_impl.h"
    // Return the library version as a NUL-terminated string literal
    // (BLAKE3_VERSION_STRING). The caller must not modify or free it.
    const char *blake3_version(void) {
      const char *version = BLAKE3_VERSION_STRING;
      return version;
    }
    // Put `self` into its initial state: chaining value taken from `key`,
    // chunk counter 0, empty zeroed buffer, no blocks compressed, and `flags`
    // as the base flags.
    INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
                                 uint8_t flags) {
      self->flags = flags;
      self->chunk_counter = 0;
      self->blocks_compressed = 0;
      self->buf_len = 0;
      memset(self->buf, 0, BLAKE3_BLOCK_LEN);
      memcpy(self->cv, key, BLAKE3_KEY_LEN);
    }
    // Prepare `self` for a new chunk numbered `chunk_counter`: reload the
    // chaining value from `key` and clear the buffer and block count. The base
    // flags are left as they were.
    INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
                                  uint64_t chunk_counter) {
      self->chunk_counter = chunk_counter;
      self->blocks_compressed = 0;
      self->buf_len = 0;
      memset(self->buf, 0, BLAKE3_BLOCK_LEN);
      memcpy(self->cv, key, BLAKE3_KEY_LEN);
    }
    // Return the number of input bytes absorbed into this chunk so far:
    // the already-compressed blocks plus the bytes waiting in the buffer.
    INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
      const size_t compressed_bytes =
          BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed;
      const size_t buffered_bytes = (size_t)self->buf_len;
      return compressed_bytes + buffered_bytes;
    }
    // Append as much of `input` as fits into the block buffer and return the
    // number of bytes copied (the smaller of the free space and input_len).
    INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
                                       const uint8_t *input, size_t input_len) {
      const size_t used = (size_t)self->buf_len;
      const size_t room = BLAKE3_BLOCK_LEN - used;
      const size_t take = (input_len < room) ? input_len : room;
      memcpy(&self->buf[used], input, take);
      self->buf_len += (uint8_t)take;
      return take;
    }
    // Return CHUNK_START while no block of this chunk has been compressed yet,
    // and 0 afterwards.
    INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
      return (self->blocks_compressed == 0) ? CHUNK_START : 0;
    }
    // A deferred compression: everything needed to compress one final block
    // later, either into a chaining value (output_chaining_value) or into root
    // output bytes (output_root_bytes). Built by make_output.
    typedef struct {
    // Chaining value fed into the compression.
    uint32_t input_cv[8];
    // Counter passed to output_chaining_value's compression.
    uint64_t counter;
    // The block to compress.
    uint8_t block[BLAKE3_BLOCK_LEN];
    // Value passed as the block length argument.
    uint8_t block_len;
    // Flags for the compression; output_root_bytes additionally ORs in ROOT.
    uint8_t flags;
    } output_t;
    // Bundle the arguments of a pending compression into an output_t, copying
    // the chaining value and the block so the result is self-contained.
    INLINE output_t make_output(const uint32_t input_cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags) {
      output_t result;
      result.counter = counter;
      result.block_len = block_len;
      result.flags = flags;
      memcpy(result.block, block, BLAKE3_BLOCK_LEN);
      memcpy(result.input_cv, input_cv, 32);
      return result;
    }
    // Compress the pending block described by `self` and write the resulting
    // chaining value to `cv` as 32 bytes.
    //
    // Inside a chunk, chaining values are kept as words (that is what the
    // compress_in_place interface takes), which spares the portable code
    // repeated bytes<->words conversions. The hash_many interface, however,
    // consumes both user input and parent node blocks and therefore works on
    // bytes, so chaining values stored in the CV stack are bytes as well.
    INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
      uint32_t words[8];
      // Work on a copy so the output_t stays untouched.
      memcpy(words, self->input_cv, 32);
      blake3_compress_in_place(words, self->block, self->block_len,
                               self->counter, self->flags);
      store_cv_words(cv, words);
    }
    // Produce `out_len` bytes of root output starting at byte offset `seek`
    // of the output stream. The stream is generated in 64-byte blocks by
    // compressing the pending block with the ROOT flag and an increasing
    // output block counter; the first block may be entered part-way through.
    INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
                                  size_t out_len) {
      uint64_t block_index = seek / 64;
      size_t skip = seek % 64;
      uint8_t block_out[64];
      while (out_len > 0) {
        blake3_compress_xof(self->input_cv, self->block, self->block_len,
                            block_index, self->flags | ROOT, block_out);
        // Copy what is left of this block, capped by what the caller wants.
        const size_t available = 64 - skip;
        const size_t chunk = (out_len > available) ? available : out_len;
        memcpy(out, block_out + skip, chunk);
        out += chunk;
        out_len -= chunk;
        block_index += 1;
        // Every block after the first is consumed from its start.
        skip = 0;
      }
    }
    // Absorb `input_len` bytes of `input` into the chunk state.
    //
    // Full blocks are compressed into self->cv as they become available, but
    // the final block of the input -- even when it is exactly
    // BLAKE3_BLOCK_LEN bytes -- always stays in self->buf, because
    // chunk_state_output compresses the buffered block with CHUNK_END.
    INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
                                   size_t input_len) {
      // Top up a partially filled buffer first.
      if (self->buf_len > 0) {
        size_t take = chunk_state_fill_buf(self, input, input_len);
        input += take;
        input_len -= take;
        // Input remaining after the fill means the buffer is now full. Compress
        // it only in that case, so the last block is never flushed early.
        if (input_len > 0) {
          blake3_compress_in_place(
              self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
              self->flags | chunk_state_maybe_start_flag(self));
          self->blocks_compressed += 1;
          self->buf_len = 0;
          memset(self->buf, 0, BLAKE3_BLOCK_LEN);
        }
      }

      // Compress whole blocks straight from the input. The comparison is
      // strictly greater-than so that a trailing full block is buffered below.
      while (input_len > BLAKE3_BLOCK_LEN) {
        blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
                                 self->chunk_counter,
                                 self->flags | chunk_state_maybe_start_flag(self));
        self->blocks_compressed += 1;
        input += BLAKE3_BLOCK_LEN;
        input_len -= BLAKE3_BLOCK_LEN;
      }

      // Buffer whatever is left (at most one block). The number of bytes taken
      // is not needed any more, so the result is discarded.
      (void)chunk_state_fill_buf(self, input, input_len);
    }
    // Describe the final compression of this chunk: the buffered block with
    // the chunk's flags plus CHUNK_END (and CHUNK_START if it is also the first
    // block of the chunk).
    INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
      uint8_t final_flags = self->flags;
      final_flags |= chunk_state_maybe_start_flag(self);
      final_flags |= CHUNK_END;
      return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
                         final_flags);
    }
    // Describe the compression of a parent node: `block` is compressed as a
    // full block under `key`, with counter 0 and the PARENT flag added.
    INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
                                  const uint32_t key[8], uint8_t flags) {
      const uint8_t parent_flags = flags | PARENT;
      return make_output(key, block, BLAKE3_BLOCK_LEN, 0, parent_flags);
    }
    // For an input longer than one chunk, return how many bytes belong in the
    // left subtree: the largest power-of-two number of whole chunks that still
    // leaves at least one byte for the right subtree.
    INLINE size_t left_len(size_t content_len) {
      // Taking one byte off first guarantees the right side is never empty.
      // content_len is expected to exceed BLAKE3_CHUNK_LEN.
      const size_t chunks_before_last_byte = (content_len - 1) / BLAKE3_CHUNK_LEN;
      const uint64_t left_chunks =
          round_down_to_power_of_2(chunks_before_last_byte);
      return left_chunks * BLAKE3_CHUNK_LEN;
    }
    // Hash up to MAX_SIMD_DEGREE chunks of `input` in one blake3_hash_many
    // call on the current thread, write their chaining values to `out`
    // (BLAKE3_OUT_LEN bytes each) and return how many were written. A trailing
    // partial chunk is hashed separately and counted too. The chunks handled
    // here are never the root and never empty; those cases take another path.
    INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
                                           const uint32_t key[8],
                                           uint64_t chunk_counter, uint8_t flags,
                                           uint8_t *out) {
    #if defined(BLAKE3_TESTING)
      assert(0 < input_len);
      assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
    #endif
      // Collect a pointer to the start of every complete chunk.
      const uint8_t *chunk_ptrs[MAX_SIMD_DEGREE];
      const size_t whole_chunks = input_len / BLAKE3_CHUNK_LEN;
      for (size_t i = 0; i < whole_chunks; i++) {
        chunk_ptrs[i] = input + i * BLAKE3_CHUNK_LEN;
      }

      blake3_hash_many(chunk_ptrs, whole_chunks,
                       BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
                       true, flags, CHUNK_START, CHUNK_END, out);

      const size_t consumed = whole_chunks * BLAKE3_CHUNK_LEN;
      if (input_len <= consumed) {
        return whole_chunks;
      }

      // A partial chunk remains: run it through a chunk state whose counter
      // continues after the whole chunks. (The empty chunk, i.e. the empty
      // message, is handled by a different code path.)
      blake3_chunk_state partial;
      chunk_state_init(&partial, key, flags);
      partial.chunk_counter = chunk_counter + (uint64_t)whole_chunks;
      chunk_state_update(&partial, input + consumed, input_len - consumed);
      output_t partial_output = chunk_state_output(&partial);
      output_chaining_value(&partial_output, out + whole_chunks * BLAKE3_OUT_LEN);
      return whole_chunks + 1;
    }
    // Hash up to MAX_SIMD_DEGREE parents at once on a single thread using SIMD
    // parallelism. The parent chaining values are written to `out` and the number
    // of values written is returned. (An odd child chaining value left over is
    // appended to `out` as an additional output.) These parents are never the
    // root and never empty; those cases use a different codepath.
    INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
                                            size_t num_chaining_values,
                                            const uint32_t key[8], uint8_t flags,
                                            uint8_t *out) {
    #if defined(BLAKE3_TESTING)
      assert(2 <= num_chaining_values);
      assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
    #endif

      // Each parent consumes two adjacent child chaining values.
      const uint8_t *pairs[MAX_SIMD_DEGREE_OR_2];
      const size_t pair_count = num_chaining_values / 2;
      for (size_t i = 0; i < pair_count; i++) {
        pairs[i] = child_chaining_values + 2 * i * BLAKE3_OUT_LEN;
      }

      blake3_hash_many(pairs, pair_count, 1, key,
                       0, // Parents always use counter 0.
                       false, flags | PARENT,
                       0, // Parents have no start flags.
                       0, // Parents have no end flags.
                       out);

      // Even number of children: every one of them was paired up.
      if (num_chaining_values == 2 * pair_count) {
        return pair_count;
      }

      // Odd child left over: pass it through unchanged as an extra output.
      memcpy(&out[pair_count * BLAKE3_OUT_LEN],
             &child_chaining_values[2 * pair_count * BLAKE3_OUT_LEN],
             BLAKE3_OUT_LEN);
      return pair_count + 1;
    }
    // The wide helper function returns (writes out) an array of chaining values
    // and returns the length of that array. The number of chaining values returned
    // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
    // if the input is shorter than that many chunks. The reason for maintaining a
    // wide array of chaining values going back up the tree, is to allow the
    // implementation to hash as many parents in parallel as possible.
    //
    // As a special case when the SIMD degree is 1, this function will still return
    // at least 2 outputs. This guarantees that this function doesn't perform the
    // root compression. (If it did, it would use the wrong flags, and also we
    // wouldn't be able to implement extendable output.) Note that this function is
    // not used when the whole input is only 1 chunk long; that's a different
    // codepath.
    //
    // Why not just have the caller split the input on the first update(), instead
    // of implementing this special rule? Because we don't want to limit SIMD or
    // multi-threading parallelism for that update().
    //
    // Parameters:
    //   input, input_len - the bytes of this subtree.
    //   key, flags       - forwarded unchanged to the chunk/parent helpers and to
    //                      the recursive calls.
    //   chunk_counter    - counter of the first chunk in `input`; the right half
    //                      uses this value advanced by the number of chunks in the
    //                      left half.
    //   out              - receives the resulting chaining values, BLAKE3_OUT_LEN
    //                      bytes each.
    // Returns the number of chaining values written to `out`.
    static size_t blake3_compress_subtree_wide(const uint8_t *input,
    size_t input_len,
    const uint32_t key[8],
    uint64_t chunk_counter,
    uint8_t flags, uint8_t *out) {
    // Note that the single chunk case does *not* bump the SIMD degree up to 2
    // when it is 1. If this implementation adds multi-threading in the future,
    // this gives us the option of multi-threading even the 2-chunk case, which
    // can help performance on smaller platforms.
    if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
    return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
    out);
    }
    // With more than simd_degree chunks, we need to recurse. Start by dividing
    // the input into left and right subtrees. (Note that this is only optimal
    // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
    // of 3 or something, we'll need a more complicated strategy.)
    size_t left_input_len = left_len(input_len);
    size_t right_input_len = input_len - left_input_len;
    const uint8_t *right_input = &input[left_input_len];
    uint64_t right_chunk_counter =
    chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
    // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
    // account for the special case of returning 2 outputs when the SIMD degree
    // is 1.
    uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
    size_t degree = blake3_simd_degree();
    if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
    // The special case: We always use a degree of at least two, to make
    // sure there are two outputs. Except, as noted above, at the chunk
    // level, where we allow degree=1. (Note that the 1-chunk-input case is
    // a different codepath.)
    degree = 2;
    }
    // The left call writes from the start of cv_array; the right call writes
    // starting `degree` chaining values in.
    // NOTE(review): the parent compression at the end reads left_n + right_n
    // values contiguously from cv_array, which relies on the left call
    // returning exactly `degree` values -- confirm against left_len().
    uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
    // Recurse! If this implementation adds multi-threading support in the
    // future, this is where it will go.
    size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
    chunk_counter, flags, cv_array);
    size_t right_n = blake3_compress_subtree_wide(
    right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
    // The special case again. If simd_degree=1, then we'll have left_n=1 and
    // right_n=1. Rather than compressing them into a single output, return
    // them directly, to make sure we always have at least two outputs.
    if (left_n == 1) {
    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
    return 2;
    }
    // Otherwise, do one layer of parent node compression.
    size_t num_chaining_values = left_n + right_n;
    return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
    out);
    }
    // Hash a subtree with compress_subtree_wide(), and then condense the resulting
    // list of chaining values down to a single parent node. Don't compress that
    // last parent node, however. Instead, return its message bytes (the
    // concatenated chaining values of its children). This is necessary when the
    // first call to update() supplies a complete subtree, because the topmost
    // parent node of that subtree could end up being the root. It's also necessary
    // for extended output in the general case.
    //
    // As with compress_subtree_wide(), this function is not used on inputs of 1
    // chunk or less. That's a different codepath.
    //
    // Parameters:
    //   input, input_len, key, chunk_counter, flags - passed straight through to
    //       blake3_compress_subtree_wide().
    //   out - receives exactly 2 * BLAKE3_OUT_LEN bytes: the two remaining
    //       chaining values, copied from the front of the working array.
    INLINE void compress_subtree_to_parent_node(
    const uint8_t *input, size_t input_len, const uint32_t key[8],
    uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
    #if defined(BLAKE3_TESTING)
    assert(input_len > BLAKE3_CHUNK_LEN);
    #endif
    uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
    size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
    chunk_counter, flags, cv_array);
    assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
    // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
    // compress_subtree_wide() returns more than 2 chaining values. Condense
    // them into 2 by forming parent nodes repeatedly.
    // Scratch buffer for one round of parent outputs; it is sized for half as
    // many chaining values as cv_array.
    uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
    // The second half of this loop condition is always true, and we just
    // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
    // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
    // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
    // this code, test it against that version.
    while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
    num_cvs =
    compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
    // Copy this round's outputs back so the next round (or the final copy
    // below) reads them from cv_array.
    memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
    }
    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
    }
    // Shared initializer used by every init variant below: empties the CV stack,
    // stores the key words in the hasher, and initializes the chunk state with
    // the same key and flags.
    INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
                                 uint8_t flags) {
      self->cv_stack_len = 0;
      memcpy(self->key, key, BLAKE3_KEY_LEN);
      chunk_state_init(&self->chunk, key, flags);
    }
    // Initialize a hasher with IV as the key words and no mode flags set.
    void blake3_hasher_init(blake3_hasher *self) {
      hasher_init_base(self, IV, 0);
    }
    // Initialize a hasher in KEYED_HASH mode from a BLAKE3_KEY_LEN-byte key.
    void blake3_hasher_init_keyed(blake3_hasher *self,
                                  const uint8_t key[BLAKE3_KEY_LEN]) {
      // Convert the key bytes into the 8-word form hasher_init_base() takes.
      uint32_t words[8];
      load_key_words(key, words);

      hasher_init_base(self, words, KEYED_HASH);
    }
    // Initialize a hasher for key derivation from a raw context buffer of
    // `context_len` bytes. The context is hashed first, and the result of that
    // hash becomes the key of `self`.
    void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
                                           size_t context_len) {
      uint8_t derived_key[BLAKE3_KEY_LEN];
      uint32_t derived_words[8];
      blake3_hasher ctx_hasher;

      // Stage 1: hash the context in DERIVE_KEY_CONTEXT mode to obtain a key.
      hasher_init_base(&ctx_hasher, IV, DERIVE_KEY_CONTEXT);
      blake3_hasher_update(&ctx_hasher, context, context_len);
      blake3_hasher_finalize(&ctx_hasher, derived_key, BLAKE3_KEY_LEN);

      // Stage 2: key the caller's hasher with it, in DERIVE_KEY_MATERIAL mode.
      load_key_words(derived_key, derived_words);
      hasher_init_base(self, derived_words, DERIVE_KEY_MATERIAL);
    }
    // Convenience wrapper around blake3_hasher_init_derive_key_raw() for a
    // NUL-terminated context string; the terminator itself is not hashed.
    void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
      const size_t context_len = strlen(context);
      blake3_hasher_init_derive_key_raw(self, context, context_len);
    }
    // As described in hasher_push_cv() below, we do "lazy merging", delaying
    // merges until right before the next CV is about to be added. This is
    // different from the reference implementation. Another difference is that we
    // aren't always merging 1 chunk at a time. Instead, each CV might represent
    // any power-of-two number of chunks, as long as the smaller-above-larger stack
    // order is maintained. Instead of the "count the trailing 0-bits" algorithm
    // described in the spec, we use a "count the total number of 1-bits" variant
    // that doesn't require us to retain the subtree size of the CV on top of the
    // stack. The principle is the same: each CV that should remain in the stack is
    // represented by a 1-bit in the total number of chunks (or bytes) so far.
    INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
      // The stack should hold one CV per 1-bit in total_len.
      const size_t target_len = (size_t)popcnt(total_len);

      // Each pass replaces the top two CVs with their parent's CV, written in
      // place over the lower of the two, shrinking the stack by one.
      for (; self->cv_stack_len > target_len; self->cv_stack_len -= 1) {
        uint8_t *top_pair =
            &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
        output_t merged = parent_output(top_pair, self->key, self->chunk.flags);
        output_chaining_value(&merged, top_pair);
      }
    }
    // In reference_impl.rs, we merge the new CV with existing CVs from the stack
    // before pushing it. We can do that because we know more input is coming, so
    // we know none of the merges are root.
    //
    // This setting is different. We want to feed as much input as possible to
    // compress_subtree_wide(), without setting aside anything for the chunk_state.
    // If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
    // as a single subtree, if at all possible.
    //
    // This leads to two problems:
    // 1) This 64 KiB input might be the only call that ever gets made to update.
    // In this case, the root node of the 64 KiB subtree would be the root node
    // of the whole tree, and it would need to be ROOT finalized. We can't
    // compress it until we know.
    // 2) This 64 KiB input might complete a larger tree, whose root node is
    // similarly going to be the root of the whole tree. For example, maybe
    // we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
    // node at the root of the 256 KiB subtree until we know how to finalize it.
    //
    // The second problem is solved with "lazy merging". That is, when we're about
    // to add a CV to the stack, we don't merge it with anything first, as the
    // reference impl does. Instead we do merges using the *previous* CV that was
    // added, which is sitting on top of the stack, and we put the new CV
    // (unmerged) on top of the stack afterwards. This guarantees that we never
    // merge the root node until finalize().
    //
    // Solving the first problem requires an additional tool,
    // compress_subtree_to_parent_node(). That function always returns the top
    // *two* chaining values of the subtree it's compressing. We then do lazy
    // merging with each of them separately, so that the second CV will always
    // remain unmerged. (That also helps us support extendable output when we're
    // hashing an input all-at-once.)
    INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
                               uint64_t chunk_counter) {
      // Lazy merging: settle the CVs already on the stack first...
      hasher_merge_cv_stack(self, chunk_counter);

      // ...then place the new CV, unmerged, on top. The slot must be computed
      // after the merge because merging changes cv_stack_len.
      uint8_t *top_slot = &self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN];
      memcpy(top_slot, new_cv, BLAKE3_OUT_LEN);
      self->cv_stack_len += 1;
    }
    // Absorb `input_len` bytes from `input` into the hasher state.
    //
    // The work happens in three stages:
    //   1. Top up a partially filled chunk left in self->chunk.
    //   2. Hash as many whole power-of-2 subtrees as the remaining input allows,
    //      pushing their chaining values onto the CV stack.
    //   3. Buffer whatever is left (at most one chunk) in self->chunk.
    // A zero-length call returns immediately without touching the state.
    void blake3_hasher_update(blake3_hasher *self, const void *input,
                              size_t input_len) {
      // Explicitly checking for zero avoids causing UB by passing a null pointer
      // to memcpy. This comes up in practice with things like:
      //   std::vector<uint8_t> v;
      //   blake3_hasher_update(&hasher, v.data(), v.size());
      if (input_len == 0) {
        return;
      }

      const uint8_t *input_bytes = (const uint8_t *)input;

      // If we have some partial chunk bytes in the internal chunk_state, we need
      // to finish that chunk first.
      if (chunk_state_len(&self->chunk) > 0) {
        size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
        if (take > input_len) {
          take = input_len;
        }
        chunk_state_update(&self->chunk, input_bytes, take);
        input_bytes += take;
        input_len -= take;
        // If we've filled the current chunk and there's more coming, finalize this
        // chunk and proceed. In this case we know it's not the root.
        if (input_len > 0) {
          output_t output = chunk_state_output(&self->chunk);
          uint8_t chunk_cv[BLAKE3_OUT_LEN];
          output_chaining_value(&output, chunk_cv);
          hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
          chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
        } else {
          return;
        }
      }

      // Now the chunk_state is clear, and we have more input. If there's more than
      // a single chunk (so, definitely not the root chunk), hash the largest whole
      // subtree we can, with the full benefits of SIMD (and maybe in the future,
      // multi-threading) parallelism. Two restrictions:
      // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
      //   the right edge can be incomplete, and we don't know where the right edge
      //   is going to be until we get to finalize().
      // - The subtree must evenly divide the total number of chunks up until this
      //   point (if total is not 0). If the current incomplete subtree is only
      //   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
      //   to complete the current subtree first.
      // Because we might need to break up the input to form powers of 2, or to
      // evenly divide what we already have, this part runs in a loop.
      while (input_len > BLAKE3_CHUNK_LEN) {
        size_t subtree_len = round_down_to_power_of_2(input_len);
        uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
        // Shrink the subtree_len until it evenly divides the count so far. We know
        // that subtree_len itself is a power of 2, so we can use a bitmasking
        // trick instead of an actual remainder operation. (Note that if the caller
        // consistently passes power-of-2 inputs of the same size, as is hopefully
        // typical, this loop condition will always fail, and subtree_len will
        // always be the full length of the input.)
        //
        // An aside: We don't have to shrink subtree_len quite this much. For
        // example, if count_so_far is 1, we could pass 2 chunks to
        // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
        // get the right answer in the end, and we might get to use 2-way SIMD
        // parallelism. The problem with this optimization, is that it gets us
        // stuck always hashing 2 chunks. The total number of chunks will remain
        // odd, and we'll never graduate to higher degrees of parallelism. See
        // https://github.com/BLAKE3-team/BLAKE3/issues/69.
        while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
          subtree_len /= 2;
        }
        // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
        // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
        uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
        if (subtree_len <= BLAKE3_CHUNK_LEN) {
          blake3_chunk_state chunk_state;
          chunk_state_init(&chunk_state, self->key, self->chunk.flags);
          chunk_state.chunk_counter = self->chunk.chunk_counter;
          chunk_state_update(&chunk_state, input_bytes, subtree_len);
          output_t output = chunk_state_output(&chunk_state);
          uint8_t cv[BLAKE3_OUT_LEN];
          output_chaining_value(&output, cv);
          hasher_push_cv(self, cv, chunk_state.chunk_counter);
        } else {
          // This is the high-performance happy path, though getting here depends
          // on the caller giving us a long enough input.
          uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
          compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
                                          self->chunk.chunk_counter,
                                          self->chunk.flags, cv_pair);
          // Push the two halves separately so the second one stays unmerged.
          hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
          hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
                         self->chunk.chunk_counter + (subtree_chunks / 2));
        }
        self->chunk.chunk_counter += subtree_chunks;
        input_bytes += subtree_len;
        input_len -= subtree_len;
      }

      // If there's any remaining input less than a full chunk, add it to the chunk
      // state. In that case, also do a final merge loop to make sure the subtree
      // stack doesn't contain any unmerged pairs. The remaining input means we
      // know these merges are non-root. This merge loop isn't strictly necessary
      // here, because hasher_push_cv already does its own merge loop, but it
      // simplifies blake3_hasher_finalize below.
      if (input_len > 0) {
        chunk_state_update(&self->chunk, input_bytes, input_len);
        hasher_merge_cv_stack(self, self->chunk.chunk_counter);
      }
    }
    // Write `out_len` bytes of hash output to `out`, starting from output
    // position 0. Thin wrapper around blake3_hasher_finalize_seek().
    void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
                                size_t out_len) {
      const uint64_t seek_from_start = 0;
      blake3_hasher_finalize_seek(self, seek_from_start, out, out_len);
    }
    void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
    uint8_t *out, size_t out_len) {
    // Explicitly checking for zero avoids causing UB by passing a null pointer
    // to memcpy. This comes up in practice with things like:
    // std::vector<uint8_t> v;
    // blake3_hasher_finalize(&hasher, v.data(), v.size());
    if (out_len == 0) {
    return;
    }
    // If the subtree stack is empty, then the current chunk is the root.
    if (self->cv_stack_len == 0) {
    output_t output = chunk_state_output(&self->chunk);
    output_root_bytes(&output, seek, out, out_len);
    return;
    }
    // If there are any bytes in the chunk state, finalize that chunk and do a
    // roll-up merge between that chunk hash and every subtree in the stack. In
    // this case, the extra merge loop at the end of blake3_hasher_update
    // guarantees that none of the subtrees in the stack need to be merged with
    // each other first. Otherwise, if there are no bytes in the chunk state,
    // then the top of the stack is a chunk hash, and we start the merge from
    // that.
    output_t output;
    size_t cvs_remaining;
    if (chunk_state_len(&self->chunk) > 0) {
    cvs_remaining = self->cv_stack_len;
    output = chunk_state_output(&self->chunk);
    } else {
    // There are always at least 2 CVs in the stack in this case.
    cvs_remaining = self->cv_stack_len - 2;
    output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key,
    self->chunk.flags);
    }
    while (cvs_remaining > 0) {
    cvs_remaining -= 1;
    uint8_t parent_block[BLAKE3_BLOCK_LEN];
    memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
    output_chaining_value(&output, &parent_block[32]);
    output = parent_output(parent_block, self->key, self->chunk.flags);
    }
    output_root_bytes(&output, seek, out, out_len);
    }
    // Return the hasher to its just-initialized condition while keeping the
    // stored key: the CV stack is emptied and the chunk state is reset with
    // chunk counter 0.
    void blake3_hasher_reset(blake3_hasher *self) {
      self->cv_stack_len = 0;
      chunk_state_reset(&self->chunk, self->key, 0);
    }
  • file addition: vendor (d--r------)
    [3.1]
  • file addition: blake3 (d--r------)
    [0.18]
  • file addition: blake3_sse41_x86-64_unix.S (----------)
    [0.38]
    /* On ELF/Linux builds, emit an empty .note.GNU-stack section
       (conventionally how an object declares it needs no executable stack). */
    #if defined(__ELF__) && defined(__linux__)
    .section .note.GNU-stack,"",%progbits
    #endif
    /* When CET is enabled and <cet.h> is available, include it; it is expected
       to provide _CET_ENDBR. */
    #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
    #if __has_include(<cet.h>)
    #include <cet.h>
    #endif
    #endif
    /* Otherwise fall back to an empty _CET_ENDBR so its uses expand to nothing. */
    #if !defined(_CET_ENDBR)
    #define _CET_ENDBR
    #endif
    /* Everything below is written in Intel syntax without register prefixes. */
    .intel_syntax noprefix
    /* Export each entry point under both its plain and its underscore-prefixed
       name. */
    .global blake3_hash_many_sse41
    .global _blake3_hash_many_sse41
    .global blake3_compress_in_place_sse41
    .global _blake3_compress_in_place_sse41
    .global blake3_compress_xof_sse41
    .global _blake3_compress_xof_sse41
    /* Select the code section: bare .text on Apple, .section .text elsewhere. */
    #ifdef __APPLE__
    .text
    #else
    .section .text
    #endif
    /* Align what follows to a 2^6 = 64-byte boundary. */
    .p2align 6
    _blake3_hash_many_sse41:
    blake3_hash_many_sse41:
    _CET_ENDBR
    push r15
    push r14
    push r13
    push r12
    push rbx
    push rbp
    mov rbp, rsp
    sub rsp, 360
    and rsp, 0xFFFFFFFFFFFFFFC0
    neg r9d
    movd xmm0, r9d
    pshufd xmm0, xmm0, 0x00
    movdqa xmmword ptr [rsp+0x130], xmm0
    movdqa xmm1, xmm0
    pand xmm1, xmmword ptr [ADD0+rip]
    pand xmm0, xmmword ptr [ADD1+rip]
    movdqa xmmword ptr [rsp+0x150], xmm0
    movd xmm0, r8d
    pshufd xmm0, xmm0, 0x00
    paddd xmm0, xmm1
    movdqa xmmword ptr [rsp+0x110], xmm0
    pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
    pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
    pcmpgtd xmm1, xmm0
    shr r8, 32
    movd xmm2, r8d
    pshufd xmm2, xmm2, 0x00
    psubd xmm2, xmm1
    movdqa xmmword ptr [rsp+0x120], xmm2
    mov rbx, qword ptr [rbp+0x50]
    mov r15, rdx
    shl r15, 6
    movzx r13d, byte ptr [rbp+0x38]
    movzx r12d, byte ptr [rbp+0x48]
    cmp rsi, 4
    jc 3f
    2:
    movdqu xmm3, xmmword ptr [rcx]
    pshufd xmm0, xmm3, 0x00
    pshufd xmm1, xmm3, 0x55
    pshufd xmm2, xmm3, 0xAA
    pshufd xmm3, xmm3, 0xFF
    movdqu xmm7, xmmword ptr [rcx+0x10]
    pshufd xmm4, xmm7, 0x00
    pshufd xmm5, xmm7, 0x55
    pshufd xmm6, xmm7, 0xAA
    pshufd xmm7, xmm7, 0xFF
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    9:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    movdqu xmm8, xmmword ptr [r8+rdx-0x40]
    movdqu xmm9, xmmword ptr [r9+rdx-0x40]
    movdqu xmm10, xmmword ptr [r10+rdx-0x40]
    movdqu xmm11, xmmword ptr [r11+rdx-0x40]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp], xmm8
    movdqa xmmword ptr [rsp+0x10], xmm9
    movdqa xmmword ptr [rsp+0x20], xmm12
    movdqa xmmword ptr [rsp+0x30], xmm13
    movdqu xmm8, xmmword ptr [r8+rdx-0x30]
    movdqu xmm9, xmmword ptr [r9+rdx-0x30]
    movdqu xmm10, xmmword ptr [r10+rdx-0x30]
    movdqu xmm11, xmmword ptr [r11+rdx-0x30]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp+0x40], xmm8
    movdqa xmmword ptr [rsp+0x50], xmm9
    movdqa xmmword ptr [rsp+0x60], xmm12
    movdqa xmmword ptr [rsp+0x70], xmm13
    movdqu xmm8, xmmword ptr [r8+rdx-0x20]
    movdqu xmm9, xmmword ptr [r9+rdx-0x20]
    movdqu xmm10, xmmword ptr [r10+rdx-0x20]
    movdqu xmm11, xmmword ptr [r11+rdx-0x20]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp+0x80], xmm8
    movdqa xmmword ptr [rsp+0x90], xmm9
    movdqa xmmword ptr [rsp+0xA0], xmm12
    movdqa xmmword ptr [rsp+0xB0], xmm13
    movdqu xmm8, xmmword ptr [r8+rdx-0x10]
    movdqu xmm9, xmmword ptr [r9+rdx-0x10]
    movdqu xmm10, xmmword ptr [r10+rdx-0x10]
    movdqu xmm11, xmmword ptr [r11+rdx-0x10]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp+0xC0], xmm8
    movdqa xmmword ptr [rsp+0xD0], xmm9
    movdqa xmmword ptr [rsp+0xE0], xmm12
    movdqa xmmword ptr [rsp+0xF0], xmm13
    movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
    movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
    movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
    movdqa xmm12, xmmword ptr [rsp+0x110]
    movdqa xmm13, xmmword ptr [rsp+0x120]
    movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
    movd xmm15, eax
    pshufd xmm15, xmm15, 0x00
    prefetcht0 [r8+rdx+0x80]
    prefetcht0 [r9+rdx+0x80]
    prefetcht0 [r10+rdx+0x80]
    prefetcht0 [r11+rdx+0x80]
    paddd xmm0, xmmword ptr [rsp]
    paddd xmm1, xmmword ptr [rsp+0x20]
    paddd xmm2, xmmword ptr [rsp+0x40]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x10]
    paddd xmm1, xmmword ptr [rsp+0x30]
    paddd xmm2, xmmword ptr [rsp+0x50]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x80]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp+0xC0]
    paddd xmm3, xmmword ptr [rsp+0xE0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x90]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0xD0]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x20]
    paddd xmm1, xmmword ptr [rsp+0x30]
    paddd xmm2, xmmword ptr [rsp+0x70]
    paddd xmm3, xmmword ptr [rsp+0x40]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x60]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp]
    paddd xmm3, xmmword ptr [rsp+0xD0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x10]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0x90]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xB0]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp+0xE0]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x30]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp+0xD0]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x40]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0x20]
    paddd xmm3, xmmword ptr [rsp+0xE0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x60]
    paddd xmm1, xmmword ptr [rsp+0x90]
    paddd xmm2, xmmword ptr [rsp+0xB0]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x50]
    paddd xmm1, xmmword ptr [rsp]
    paddd xmm2, xmmword ptr [rsp+0xF0]
    paddd xmm3, xmmword ptr [rsp+0x10]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xA0]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0xE0]
    paddd xmm3, xmmword ptr [rsp+0xD0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x70]
    paddd xmm1, xmmword ptr [rsp+0x90]
    paddd xmm2, xmmword ptr [rsp+0x30]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x40]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0x50]
    paddd xmm3, xmmword ptr [rsp+0x10]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp]
    paddd xmm1, xmmword ptr [rsp+0x20]
    paddd xmm2, xmmword ptr [rsp+0x80]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xC0]
    paddd xmm1, xmmword ptr [rsp+0x90]
    paddd xmm2, xmmword ptr [rsp+0xF0]
    paddd xmm3, xmmword ptr [rsp+0xE0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xD0]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0xA0]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x70]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x20]
    paddd xmm1, xmmword ptr [rsp+0x30]
    paddd xmm2, xmmword ptr [rsp+0x10]
    paddd xmm3, xmmword ptr [rsp+0x40]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x90]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0x80]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xE0]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp+0xC0]
    paddd xmm3, xmmword ptr [rsp+0x10]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xD0]
    paddd xmm1, xmmword ptr [rsp]
    paddd xmm2, xmmword ptr [rsp+0x20]
    paddd xmm3, xmmword ptr [rsp+0x40]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x30]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp+0x60]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xB0]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp+0x10]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xF0]
    paddd xmm1, xmmword ptr [rsp]
    paddd xmm2, xmmword ptr [rsp+0x90]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    pshufb xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xE0]
    paddd xmm1, xmmword ptr [rsp+0x20]
    paddd xmm2, xmmword ptr [rsp+0x30]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT16+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xA0]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0x40]
    paddd xmm3, xmmword ptr [rsp+0xD0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmmword ptr [ROT8+rip]
    pshufb xmm15, xmm8
    pshufb xmm12, xmm8
    pshufb xmm13, xmm8
    pshufb xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    pxor xmm0, xmm8
    pxor xmm1, xmm9
    pxor xmm2, xmm10
    pxor xmm3, xmm11
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    pxor xmm4, xmm12
    pxor xmm5, xmm13
    pxor xmm6, xmm14
    pxor xmm7, xmm15
    mov eax, r13d
    jne 9b
    /*
     * Output stage of the four-input loop.
     * xmm0-xmm3 and xmm4-xmm7 each carry one dword per lane. The two 4x4
     * dword transposes below gather the dwords belonging to lane i into
     * contiguous 16-byte vectors, which are then stored to output slot i
     * (slots are 0x20 bytes apart, starting at rbx).
     */
    /* Transpose xmm0..xmm3; lanes 0/1/2/3 end up in xmm0/xmm1/xmm9/xmm3. */
    movdqa xmm9, xmm0
    punpckldq xmm0, xmm1
    punpckhdq xmm9, xmm1
    movdqa xmm11, xmm2
    punpckldq xmm2, xmm3
    punpckhdq xmm11, xmm3
    movdqa xmm1, xmm0
    punpcklqdq xmm0, xmm2
    punpckhqdq xmm1, xmm2
    movdqa xmm3, xmm9
    punpcklqdq xmm9, xmm11
    punpckhqdq xmm3, xmm11
    /* First 16 bytes of each of the four 32-byte output slots. */
    movdqu xmmword ptr [rbx], xmm0
    movdqu xmmword ptr [rbx+0x20], xmm1
    movdqu xmmword ptr [rbx+0x40], xmm9
    movdqu xmmword ptr [rbx+0x60], xmm3
    /* Transpose xmm4..xmm7; lanes 0/1/2/3 end up in xmm4/xmm5/xmm9/xmm7. */
    movdqa xmm9, xmm4
    punpckldq xmm4, xmm5
    punpckhdq xmm9, xmm5
    movdqa xmm11, xmm6
    punpckldq xmm6, xmm7
    punpckhdq xmm11, xmm7
    movdqa xmm5, xmm4
    punpcklqdq xmm4, xmm6
    punpckhqdq xmm5, xmm6
    movdqa xmm7, xmm9
    punpcklqdq xmm9, xmm11
    punpckhqdq xmm7, xmm11
    /* Second 16 bytes of each output slot. */
    movdqu xmmword ptr [rbx+0x10], xmm4
    movdqu xmmword ptr [rbx+0x30], xmm5
    movdqu xmmword ptr [rbx+0x50], xmm9
    movdqu xmmword ptr [rbx+0x70], xmm7
    /*
     * Per-lane 64-bit add with carry: low dwords live at [rsp+0x110], high
     * dwords at [rsp+0x120], and the per-lane addend at [rsp+0x150].
     * NOTE(review): these look like the per-input chunk counters - confirm
     * against the setup code earlier in the function.
     */
    movdqa xmm1, xmmword ptr [rsp+0x110]
    movdqa xmm0, xmm1
    paddd xmm1, xmmword ptr [rsp+0x150]
    movdqa xmmword ptr [rsp+0x110], xmm1
    /*
     * SSE has no unsigned dword compare, so both old (xmm0) and new (xmm1)
     * low words are XORed with CMP_MSB_MASK before the signed pcmpgtd.
     * NOTE(review): assumes CMP_MSB_MASK is 0x80000000 in every lane - the
     * constant is defined elsewhere. With that, xmm0 becomes -1 in each lane
     * whose low word wrapped (old > new), else 0.
     */
    pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
    pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
    pcmpgtd xmm0, xmm1
    /* Subtracting the -1/0 mask adds the carry into the high words. */
    movdqa xmm1, xmmword ptr [rsp+0x120]
    psubd xmm1, xmm0
    movdqa xmmword ptr [rsp+0x120], xmm1
    /* Advance: 4 x 32 bytes of output, 4 x 8-byte input pointers, 4 inputs. */
    add rbx, 128
    add rdi, 32
    sub rsi, 4
    /* Unsigned rsi >= 4: run another four-input iteration (label 2 earlier). */
    cmp rsi, 4
    jnc 2b
    /* 1-3 inputs left: fall into the narrower paths at the next label 3. */
    test rsi, rsi
    jnz 3f
    /* Common exit: drop the stack frame and restore the saved registers. */
    4:
    mov rsp, rbp
    pop rbp
    pop rbx
    pop r12
    pop r13
    pop r14
    pop r15
    ret
    .p2align 5
    /*
     * Two-input path: taken when bit 1 of the remaining input count (esi) is
     * set. Two compressions run interleaved, instruction by instruction:
     *   input 0: state rows in xmm0,xmm1,xmm2,xmm3    message in xmm4-xmm7
     *   input 1: state rows in xmm8,xmm9,xmm10,xmm11  message in xmm12-xmm15
     * NOTE(review): rcx, r12d, r13d, r15 and the byte at [rbp+0x40] are set up
     * earlier in the function; the comments below describe only how they are
     * used here. They look like key pointer, end-flags, base flags, input
     * length in bytes and start-flags respectively - confirm.
     */
    3:
    test esi, 0x2
    je 3f
    /* Both inputs start from the same 32 bytes at [rcx] (rows 0 and 1). */
    movups xmm0, xmmword ptr [rcx]
    movups xmm1, xmmword ptr [rcx+0x10]
    movaps xmm8, xmm0
    movaps xmm9, xmm1
    /*
     * Row-3 templates, kept on the stack across blocks:
     *   [rsp]      = { dword0 of [rsp+0x110], dword0 of [rsp+0x120], BLAKE3_BLOCK_LEN, 0 }
     *   [rsp+0x10] = { dword1 of [rsp+0x110], dword1 of [rsp+0x120], BLAKE3_BLOCK_LEN, 0 }
     * Lane 3 is filled with the per-block flags inside the loop.
     */
    movd xmm13, dword ptr [rsp+0x110]
    pinsrd xmm13, dword ptr [rsp+0x120], 1
    pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    movaps xmmword ptr [rsp], xmm13
    movd xmm14, dword ptr [rsp+0x114]
    pinsrd xmm14, dword ptr [rsp+0x124], 1
    pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    movaps xmmword ptr [rsp+0x10], xmm14
    /* r8/r9 = the next two input pointers from the array at rdi. */
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    /* Flags for the first block: byte at [rbp+0x40] OR r13d. */
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    /* rdx = byte offset just past the current 64-byte block. */
    xor edx, edx
    /* Per-block loop. */
    2:
    /* Keep the r12d bits in eax only when this block ends at offset r15. */
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    /* Row 2 of both states is reloaded from BLAKE3_IV for every block. */
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    movaps xmm10, xmm2
    /*
     * Load input 0's 64-byte block and de-interleave it: shufps 136 (0x88)
     * keeps the even-indexed dwords, 221 (0xDD) the odd-indexed ones.
     * xmm4/xmm5 = even/odd words of the first 32 bytes; xmm6/xmm7 = even/odd
     * words of the last 32 bytes, additionally lane-rotated by pshufd 0x93.
     */
    movups xmm4, xmmword ptr [r8+rdx-0x40]
    movups xmm5, xmmword ptr [r8+rdx-0x30]
    movaps xmm3, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm3, xmm5, 221
    movaps xmm5, xmm3
    movups xmm6, xmmword ptr [r8+rdx-0x20]
    movups xmm7, xmmword ptr [r8+rdx-0x10]
    movaps xmm3, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm3, xmm7, 221
    pshufd xmm7, xmm3, 0x93
    /* Same for input 1, into xmm12-xmm15 (xmm11 is the scratch register). */
    movups xmm12, xmmword ptr [r9+rdx-0x40]
    movups xmm13, xmmword ptr [r9+rdx-0x30]
    movaps xmm11, xmm12
    shufps xmm12, xmm13, 136
    shufps xmm11, xmm13, 221
    movaps xmm13, xmm11
    movups xmm14, xmmword ptr [r9+rdx-0x20]
    movups xmm15, xmmword ptr [r9+rdx-0x10]
    movaps xmm11, xmm14
    shufps xmm14, xmm15, 136
    pshufd xmm14, xmm14, 0x93
    shufps xmm11, xmm15, 221
    pshufd xmm15, xmm11, 0x93
    /* Row 3 = saved template with this block's flags inserted in lane 3. */
    movaps xmm3, xmmword ptr [rsp]
    movaps xmm11, xmmword ptr [rsp+0x10]
    pinsrd xmm3, eax, 3
    pinsrd xmm11, eax, 3
    /* al = round counter (7 rounds); eax is restored from r13d after them. */
    mov al, 7
    /* Round loop. */
    9:
    /*
     * Column step, first half: row0 += message + row1; row3 ^= row0 and is
     * shuffled with the ROT16 byte mask; row2 += row3; row1 ^= row2 and is
     * rotated right by 12 (pslld 20 | psrld 12).
     * xmm4/xmm12 are spilled to [rsp+0x20]/[rsp+0x30] because they are reused
     * right away as scratch (xmm4) and as the ROT16 mask (xmm12).
     */
    paddd xmm0, xmm4
    paddd xmm8, xmm12
    movaps xmmword ptr [rsp+0x20], xmm4
    movaps xmmword ptr [rsp+0x30], xmm12
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    movaps xmm12, xmmword ptr [ROT16+rip]
    pshufb xmm3, xmm12
    pshufb xmm11, xmm12
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 20
    psrld xmm4, 12
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 20
    psrld xmm4, 12
    por xmm9, xmm4
    /*
     * Column step, second half: same pattern with the second message vector,
     * the ROT8 byte mask and a rotate right by 7 (pslld 25 | psrld 7).
     * xmm5/xmm13 are spilled to [rsp+0x40]/[rsp+0x50]; xmm13 then holds ROT8.
     */
    paddd xmm0, xmm5
    paddd xmm8, xmm13
    movaps xmmword ptr [rsp+0x40], xmm5
    movaps xmmword ptr [rsp+0x50], xmm13
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    movaps xmm13, xmmword ptr [ROT8+rip]
    pshufb xmm3, xmm13
    pshufb xmm11, xmm13
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 25
    psrld xmm4, 7
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 25
    psrld xmm4, 7
    por xmm9, xmm4
    /*
     * Rotate the lanes of rows 0, 3 and 2 by one, two and three positions
     * (pshufd 0x93 / 0x4E / 0x39) so the same vertical code now mixes the
     * diagonals; row 1 stays in place.
     */
    pshufd xmm0, xmm0, 0x93
    pshufd xmm8, xmm8, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm11, xmm11, 0x4E
    pshufd xmm2, xmm2, 0x39
    pshufd xmm10, xmm10, 0x39
    /* Diagonal step, first half (message in xmm6/xmm14, masks still live). */
    paddd xmm0, xmm6
    paddd xmm8, xmm14
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    pshufb xmm3, xmm12
    pshufb xmm11, xmm12
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 20
    psrld xmm4, 12
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 20
    psrld xmm4, 12
    por xmm9, xmm4
    /* Diagonal step, second half (message in xmm7/xmm15). */
    paddd xmm0, xmm7
    paddd xmm8, xmm15
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    pshufb xmm3, xmm13
    pshufb xmm11, xmm13
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 25
    psrld xmm4, 7
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 25
    psrld xmm4, 7
    por xmm9, xmm4
    /* Undo the lane rotations (inverse shuffles: 0x39 / 0x4E / 0x93). */
    pshufd xmm0, xmm0, 0x39
    pshufd xmm8, xmm8, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm11, xmm11, 0x4E
    pshufd xmm2, xmm2, 0x93
    pshufd xmm10, xmm10, 0x93
    /* Leave after the 7th round; otherwise reorder the message words. */
    dec al
    je 9f
    /*
     * Input 0: build the next round's message vectors from the spilled
     * xmm4/xmm5 ([rsp+0x20]/[rsp+0x40]) and the live xmm6/xmm7. New xmm4 and
     * xmm7 are produced in place; the new xmm5/xmm6 (in xmm13/xmm12) are
     * parked in [rsp+0x20]/[rsp+0x40] because xmm5/xmm6 serve as scratch for
     * input 1 below.
     */
    movdqa xmm12, xmmword ptr [rsp+0x20]
    movdqa xmm5, xmmword ptr [rsp+0x40]
    pshufd xmm13, xmm12, 0x0F
    shufps xmm12, xmm5, 214
    pshufd xmm4, xmm12, 0x39
    movdqa xmm12, xmm6
    shufps xmm12, xmm7, 250
    pblendw xmm13, xmm12, 0xCC
    movdqa xmm12, xmm7
    punpcklqdq xmm12, xmm5
    pblendw xmm12, xmm6, 0xC0
    pshufd xmm12, xmm12, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmmword ptr [rsp+0x20], xmm13
    movdqa xmmword ptr [rsp+0x40], xmm12
    /*
     * Input 1: the same shuffle sequence applied to the spilled xmm12/xmm13
     * ([rsp+0x30]/[rsp+0x50]) and the live xmm14/xmm15, using xmm5/xmm6 as
     * temporaries for the new xmm14/xmm13.
     */
    movdqa xmm5, xmmword ptr [rsp+0x30]
    movdqa xmm13, xmmword ptr [rsp+0x50]
    pshufd xmm6, xmm5, 0x0F
    shufps xmm5, xmm13, 214
    pshufd xmm12, xmm5, 0x39
    movdqa xmm5, xmm14
    shufps xmm5, xmm15, 250
    pblendw xmm6, xmm5, 0xCC
    movdqa xmm5, xmm15
    punpcklqdq xmm5, xmm13
    pblendw xmm5, xmm14, 0xC0
    pshufd xmm5, xmm5, 0x78
    punpckhdq xmm13, xmm15
    punpckldq xmm14, xmm13
    pshufd xmm15, xmm14, 0x1E
    movdqa xmm13, xmm6
    movdqa xmm14, xmm5
    /* Bring input 0's parked xmm5/xmm6 back into their registers. */
    movdqa xmm5, xmmword ptr [rsp+0x20]
    movdqa xmm6, xmmword ptr [rsp+0x40]
    jmp 9b
    /* End of block: fold rows 2/3 into rows 0/1 for both inputs. */
    9:
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    pxor xmm8, xmm10
    pxor xmm9, xmm11
    /* Later blocks use r13d alone as flags (the [rbp+0x40] byte is dropped). */
    mov eax, r13d
    /* Loop until the block offset reaches r15. */
    cmp rdx, r15
    jne 2b
    /* Store the two 32-byte results to rbx and rbx+0x20. */
    movups xmmword ptr [rbx], xmm0
    movups xmmword ptr [rbx+0x10], xmm1
    movups xmmword ptr [rbx+0x20], xmm8
    movups xmmword ptr [rbx+0x30], xmm9
    /*
     * Conditionally shift the lane data at [rsp+0x110]/[rsp+0x120] down by
     * two dwords: the unaligned loads at +0x118/+0x128 read each vector
     * starting at its dword 2, and blendvps takes those values in every lane
     * where the sign bit of the mask at [rsp+0x130] is set.
     * NOTE(review): the mask is written earlier in the function; presumably
     * it is all-ones when the counters advance per input - confirm.
     */
    movdqa xmm0, xmmword ptr [rsp+0x130]
    movdqa xmm1, xmmword ptr [rsp+0x110]
    movdqa xmm2, xmmword ptr [rsp+0x120]
    movdqu xmm3, xmmword ptr [rsp+0x118]
    movdqu xmm4, xmmword ptr [rsp+0x128]
    blendvps xmm1, xmm3, xmm0
    blendvps xmm2, xmm4, xmm0
    movdqa xmmword ptr [rsp+0x110], xmm1
    movdqa xmmword ptr [rsp+0x120], xmm2
    /* Advance: 2 x 8-byte input pointers, 2 x 32 bytes of output, 2 inputs. */
    add rdi, 16
    add rbx, 64
    sub rsi, 2
    /*
     * Single-input path: taken when bit 0 of the remaining input count (esi)
     * is set; otherwise jump straight to the common exit at label 4.
     * State rows live in xmm0-xmm3, the message in xmm4-xmm7, the ROT8/ROT16
     * byte-shuffle masks in xmm14/xmm15, and the row-3 template in xmm13.
     * NOTE(review): rcx, r12d, r13d, r15 and the byte at [rbp+0x40] are set up
     * earlier in the function; the comments below describe only how they are
     * used here. They look like key pointer, end-flags, base flags, input
     * length in bytes and start-flags respectively - confirm.
     */
    3:
    test esi, 0x1
    je 4b
    /* Rows 0 and 1 start from the 32 bytes at [rcx]. */
    movups xmm0, xmmword ptr [rcx]
    movups xmm1, xmmword ptr [rcx+0x10]
    /*
     * Row-3 template: { dword0 of [rsp+0x110], dword0 of [rsp+0x120],
     * BLAKE3_BLOCK_LEN, 0 }; lane 3 receives the flags inside the loop.
     */
    movd xmm13, dword ptr [rsp+0x110]
    pinsrd xmm13, dword ptr [rsp+0x120], 1
    pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    /* The masks stay in registers for the whole path (loaded once). */
    movaps xmm14, xmmword ptr [ROT8+rip]
    movaps xmm15, xmmword ptr [ROT16+rip]
    /* r8 = the input pointer; first-block flags = byte [rbp+0x40] OR r13d. */
    mov r8, qword ptr [rdi]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    /* rdx = byte offset just past the current 64-byte block. */
    xor edx, edx
    /* Per-block loop. */
    2:
    /* Keep the r12d bits in eax only when this block ends at offset r15. */
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    /* Row 2 is reloaded from BLAKE3_IV; row 3 = template + flags in lane 3. */
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    movaps xmm3, xmm13
    pinsrd xmm3, eax, 3
    /*
     * Load the 64-byte block and de-interleave it: shufps 136 (0x88) keeps
     * the even-indexed dwords, 221 (0xDD) the odd-indexed ones.
     * xmm4/xmm5 = even/odd words of the first 32 bytes; xmm6/xmm7 = even/odd
     * words of the last 32 bytes, additionally lane-rotated by pshufd 0x93.
     */
    movups xmm4, xmmword ptr [r8+rdx-0x40]
    movups xmm5, xmmword ptr [r8+rdx-0x30]
    movaps xmm8, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm8, xmm5, 221
    movaps xmm5, xmm8
    movups xmm6, xmmword ptr [r8+rdx-0x20]
    movups xmm7, xmmword ptr [r8+rdx-0x10]
    movaps xmm8, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm8, xmm7, 221
    pshufd xmm7, xmm8, 0x93
    /* al = round counter (7 rounds); eax is restored from r13d after them. */
    mov al, 7
    /* Round loop. */
    9:
    /*
     * Column step, first half: row0 += message + row1; row3 ^= row0, shuffled
     * with ROT16; row2 += row3; row1 ^= row2, rotated right by 12
     * (pslld 20 | psrld 12, xmm11 as scratch).
     */
    paddd xmm0, xmm4
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm15
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* Column step, second half: ROT8 mask and rotate right by 7. */
    paddd xmm0, xmm5
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /*
     * Rotate the lanes of rows 0, 3 and 2 by one, two and three positions so
     * the same vertical code now mixes the diagonals; row 1 stays in place.
     */
    pshufd xmm0, xmm0, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x39
    /* Diagonal step, first half (message in xmm6). */
    paddd xmm0, xmm6
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm15
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* Diagonal step, second half (message in xmm7). */
    paddd xmm0, xmm7
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Undo the lane rotations (inverse shuffles: 0x39 / 0x4E / 0x93). */
    pshufd xmm0, xmm0, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x93
    /* Leave after the 7th round; otherwise reorder the message words. */
    dec al
    jz 9f
    /*
     * Build the next round's xmm4-xmm7 from the current ones. xmm4 and xmm7
     * are produced in place; the new xmm5/xmm6 are assembled in xmm9/xmm8 and
     * copied over at the end, since the old xmm5/xmm6 are still read here.
     */
    movdqa xmm8, xmm4
    shufps xmm8, xmm5, 214
    pshufd xmm9, xmm4, 0x0F
    pshufd xmm4, xmm8, 0x39
    movdqa xmm8, xmm6
    shufps xmm8, xmm7, 250
    pblendw xmm9, xmm8, 0xCC
    movdqa xmm8, xmm7
    punpcklqdq xmm8, xmm5
    pblendw xmm8, xmm6, 0xC0
    pshufd xmm8, xmm8, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmm5, xmm9
    movdqa xmm6, xmm8
    jmp 9b
    /* End of block: fold rows 2/3 into rows 0/1. */
    9:
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    /* Later blocks use r13d alone as flags (the [rbp+0x40] byte is dropped). */
    mov eax, r13d
    /* Loop until the block offset reaches r15. */
    cmp rdx, r15
    jne 2b
    /* Store the 32-byte result and leave through the common exit. */
    movups xmmword ptr [rbx], xmm0
    movups xmmword ptr [rbx+0x10], xmm1
    jmp 4b
    /*
     * blake3_compress_in_place_sse41 -- one BLAKE3 block compression (SSE4.1),
     * writing the new chaining value back over the old one.
     *
     * Register usage as read from the code (System V AMD64 argument order):
     *   rdi  32-byte chaining value; loaded into state rows 0-1 and overwritten
     *        with the 32-byte result (unaligned access is fine)
     *   rsi  64-byte message block (unaligned loads)
     *   dl   becomes state word 14 (assumed to be block_len -- confirm against
     *        the prototype in blake3_impl.h)
     *   rcx  64-bit value placed in state words 12-13 (assumed block counter)
     *   r8b  becomes state word 15 (assumed to be the flags byte)
     *
     * Only the low byte of rdx and r8 is consumed: the psABI leaves the upper
     * bits of registers carrying 8-bit arguments unspecified, so both values
     * are zero-extended explicitly before being packed.
     *
     * Clobbers rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15. Uses no stack.
     */
    .p2align 6
    blake3_compress_in_place_sse41:
    _blake3_compress_in_place_sse41:
    _CET_ENDBR
    /* State rows: xmm0 = cv[0..3], xmm1 = cv[4..7], xmm2 = IV[0..3]. */
    movups xmm0, xmmword ptr [rdi]
    movups xmm1, xmmword ptr [rdi+0x10]
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    /* rdx = zext(dl) | zext(r8b) << 32, i.e. state words 14 and 15. */
    movzx eax, r8b
    movzx edx, dl
    shl rax, 32
    add rdx, rax
    /* Row 3: xmm3 = { rcx low, rcx high, word 14, word 15 }. */
    movq xmm3, rcx
    movq xmm4, rdx
    punpcklqdq xmm3, xmm4
    /*
     * Load the 16 message words and de-interleave them:
     *   xmm4 = m0 m2 m4 m6      xmm5 = m1 m3 m5 m7       (column step)
     *   xmm6 = m14 m8 m10 m12   xmm7 = m15 m9 m11 m13    (diagonal step,
     *                                                     pre-rotated by 0x93)
     */
    movups xmm4, xmmword ptr [rsi]
    movups xmm5, xmmword ptr [rsi+0x10]
    movaps xmm8, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm8, xmm5, 221
    movaps xmm5, xmm8
    movups xmm6, xmmword ptr [rsi+0x20]
    movups xmm7, xmmword ptr [rsi+0x30]
    movaps xmm8, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm8, xmm7, 221
    pshufd xmm7, xmm8, 0x93
    /* pshufb masks: xmm14 rotates each dword right by 8, xmm15 by 16. */
    movaps xmm14, xmmword ptr [ROT8+rip]
    movaps xmm15, xmmword ptr [ROT16+rip]
    /* al counts the 7 rounds down to zero. */
    mov al, 7
    9:
    /* ---- Column step, first half: a += m + b; d = rotr(d ^ a, 16);
     *      c += d; b = rotr(b ^ c, 12). */
    paddd xmm0, xmm4
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm15
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* ---- Column step, second half: a += m + b; d = rotr(d ^ a, 8);
     *      c += d; b = rotr(b ^ c, 7). */
    paddd xmm0, xmm5
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Diagonalize: rotate rows a, d and c while row b stays in place. */
    pshufd xmm0, xmm0, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x39
    /* ---- Diagonal step, first half (rotations 16 and 12). */
    paddd xmm0, xmm6
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm15
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* ---- Diagonal step, second half (rotations 8 and 7). */
    paddd xmm0, xmm7
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Undo the diagonalization (inverse lane rotations). */
    pshufd xmm0, xmm0, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x93
    /* Leave after the last round; the message shuffle below is skipped. */
    dec al
    jz 9f
    /* Re-arrange the message words in xmm4-xmm7 for the next round, keeping
     * the same column/diagonal layout; xmm8 and xmm9 are scratch. */
    movdqa xmm8, xmm4
    shufps xmm8, xmm5, 214
    pshufd xmm9, xmm4, 0x0F
    pshufd xmm4, xmm8, 0x39
    movdqa xmm8, xmm6
    shufps xmm8, xmm7, 250
    pblendw xmm9, xmm8, 0xCC
    movdqa xmm8, xmm7
    punpcklqdq xmm8, xmm5
    pblendw xmm8, xmm6, 0xC0
    pshufd xmm8, xmm8, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmm5, xmm9
    movdqa xmm6, xmm8
    jmp 9b
    9:
    /* Feed-forward: new cv = rows(0,1) ^ rows(2,3); store it over the input. */
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    movups xmmword ptr [rdi], xmm0
    movups xmmword ptr [rdi+0x10], xmm1
    ret
    /*
     * blake3_compress_xof_sse41 -- one BLAKE3 block compression (SSE4.1) that
     * emits the full 64-byte state instead of only a 32-byte chaining value.
     *
     * Register usage as read from the code (System V AMD64 argument order):
     *   rdi  32-byte chaining value; read at entry and again for the final
     *        feed-forward, never written
     *   rsi  64-byte message block (unaligned loads)
     *   dl   zero-extended into state word 14 (assumed to be block_len --
     *        confirm against the prototype in blake3_impl.h)
     *   rcx  64-bit value placed in state words 12-13 (assumed block counter)
     *   r8b  zero-extended into state word 15 (assumed to be the flags byte)
     *   r9   destination for 64 output bytes (unaligned stores)
     *
     * Clobbers rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15. Uses no stack.
     */
    .p2align 6
    blake3_compress_xof_sse41:
    _blake3_compress_xof_sse41:
    _CET_ENDBR
    /* State rows: xmm0 = cv[0..3], xmm1 = cv[4..7], xmm2 = IV[0..3]. */
    movups xmm0, xmmword ptr [rdi]
    movups xmm1, xmmword ptr [rdi+0x10]
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    /* rdx = zext(dl) | zext(r8b) << 32, i.e. state words 14 and 15. */
    movzx eax, r8b
    movzx edx, dl
    shl rax, 32
    add rdx, rax
    /* Row 3: xmm3 = { rcx low, rcx high, word 14, word 15 }. */
    movq xmm3, rcx
    movq xmm4, rdx
    punpcklqdq xmm3, xmm4
    /*
     * Load the 16 message words and de-interleave them:
     *   xmm4 = m0 m2 m4 m6      xmm5 = m1 m3 m5 m7       (column step)
     *   xmm6 = m14 m8 m10 m12   xmm7 = m15 m9 m11 m13    (diagonal step,
     *                                                     pre-rotated by 0x93)
     */
    movups xmm4, xmmword ptr [rsi]
    movups xmm5, xmmword ptr [rsi+0x10]
    movaps xmm8, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm8, xmm5, 221
    movaps xmm5, xmm8
    movups xmm6, xmmword ptr [rsi+0x20]
    movups xmm7, xmmword ptr [rsi+0x30]
    movaps xmm8, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm8, xmm7, 221
    pshufd xmm7, xmm8, 0x93
    /* pshufb masks: xmm14 rotates each dword right by 8, xmm15 by 16. */
    movaps xmm14, xmmword ptr [ROT8+rip]
    movaps xmm15, xmmword ptr [ROT16+rip]
    /* al counts the 7 rounds down to zero. */
    mov al, 7
    9:
    /* ---- Column step, first half: a += m + b; d = rotr(d ^ a, 16);
     *      c += d; b = rotr(b ^ c, 12). */
    paddd xmm0, xmm4
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm15
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* ---- Column step, second half: a += m + b; d = rotr(d ^ a, 8);
     *      c += d; b = rotr(b ^ c, 7). */
    paddd xmm0, xmm5
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Diagonalize: rotate rows a, d and c while row b stays in place. */
    pshufd xmm0, xmm0, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x39
    /* ---- Diagonal step, first half (rotations 16 and 12). */
    paddd xmm0, xmm6
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm15
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* ---- Diagonal step, second half (rotations 8 and 7). */
    paddd xmm0, xmm7
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshufb xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Undo the diagonalization (inverse lane rotations). */
    pshufd xmm0, xmm0, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x93
    /* Leave after the last round; the message shuffle below is skipped. */
    dec al
    jz 9f
    /* Re-arrange the message words in xmm4-xmm7 for the next round, keeping
     * the same column/diagonal layout; xmm8 and xmm9 are scratch. */
    movdqa xmm8, xmm4
    shufps xmm8, xmm5, 214
    pshufd xmm9, xmm4, 0x0F
    pshufd xmm4, xmm8, 0x39
    movdqa xmm8, xmm6
    shufps xmm8, xmm7, 250
    pblendw xmm9, xmm8, 0xCC
    movdqa xmm8, xmm7
    punpcklqdq xmm8, xmm5
    pblendw xmm8, xmm6, 0xC0
    pshufd xmm8, xmm8, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmm5, xmm9
    movdqa xmm6, xmm8
    jmp 9b
    9:
    /* Reload the input chaining value; the message registers are free now. */
    movdqu xmm4, xmmword ptr [rdi]
    movdqu xmm5, xmmword ptr [rdi+0x10]
    /* Output bytes 0-31: rows(0,1) ^ rows(2,3). */
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    /* Output bytes 32-63: rows(2,3) ^ input chaining value. */
    pxor xmm2, xmm4
    pxor xmm3, xmm5
    movups xmmword ptr [r9], xmm0
    movups xmmword ptr [r9+0x10], xmm1
    movups xmmword ptr [r9+0x20], xmm2
    movups xmmword ptr [r9+0x30], xmm3
    ret
    /*
     * Read-only constant tables for the SSE4.1 routines. Apple targets use the
     * .static_data directive; everything else places them in .rodata.
     */
    #ifdef __APPLE__
    .static_data
    #else
    .section .rodata
    #endif
    /* Align to 64 bytes. Every table below is exactly 16 bytes, so each label
     * stays 16-byte aligned, which the movaps loads of BLAKE3_IV, ROT8 and
     * ROT16 require. Do not insert entries of any other size. */
    .p2align 6
    /* Four 32-bit IV words; loaded as row 2 of the compression state. */
    BLAKE3_IV:
    .long 0x6A09E667, 0xBB67AE85
    .long 0x3C6EF372, 0xA54FF53A
    /* pshufb mask: rotates every 32-bit lane right by 16 bits. */
    ROT16:
    .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
    /* pshufb mask: rotates every 32-bit lane right by 8 bits. */
    ROT8:
    .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
    /* Lane indices 0..3. NOTE(review): presumably the per-lane counter offsets
     * of the 4-way hash_many path; its use is outside this excerpt. */
    ADD0:
    .long 0, 1, 2, 3
    /* 4 in every lane. NOTE(review): presumably the counter stride per 4-way
     * batch; its use is outside this excerpt. */
    ADD1:
    .long 4, 4, 4, 4
    /* The four words of BLAKE3_IV above, each broadcast to all four lanes. */
    BLAKE3_IV_0:
    .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
    BLAKE3_IV_1:
    .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
    BLAKE3_IV_2:
    .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
    BLAKE3_IV_3:
    .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
    /* 64 in every lane; the hash_many tail inserts one dword of this table
     * into the third lane of its state row 3. */
    BLAKE3_BLOCK_LEN:
    .long 64, 64, 64, 64
    /* Sign bit set in every lane. NOTE(review): presumably XORed into both
     * operands so that signed pcmpgtd acts as an unsigned compare -- confirm
     * at the use site. */
    CMP_MSB_MASK:
    .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
  • file addition: blake3_sse2_x86-64_unix.S (----------)
    [0.38]
    /*
     * Preamble for the SSE2 implementation: object-file notes, the CET
     * landing-pad macro, assembler syntax, exported symbols and section
     * selection.
     */
    /* On Linux ELF, emit an empty .note.GNU-stack section so the linker does
     * not mark the stack executable because of this object. */
    #if defined(__ELF__) && defined(__linux__)
    .section .note.GNU-stack,"",%progbits
    #endif
    /* When building ELF with CET enabled and <cet.h> is available, pull in its
     * definition of _CET_ENDBR. */
    #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
    #if __has_include(<cet.h>)
    #include <cet.h>
    #endif
    #endif
    /* Otherwise _CET_ENDBR expands to nothing, so the entry points can use it
     * unconditionally. */
    #if !defined(_CET_ENDBR)
    #define _CET_ENDBR
    #endif
    /* Intel operand order, register names without the % prefix. */
    .intel_syntax noprefix
    /* Each entry point is exported under both its plain name and an
     * underscore-prefixed alias; presumably the alias serves platforms whose
     * C symbols carry a leading underscore (e.g. Mach-O). */
    .global blake3_hash_many_sse2
    .global _blake3_hash_many_sse2
    .global blake3_compress_in_place_sse2
    .global _blake3_compress_in_place_sse2
    .global blake3_compress_xof_sse2
    .global _blake3_compress_xof_sse2
    /* Switch to the code section, using the directive spelling each
     * toolchain accepts. */
    #ifdef __APPLE__
    .text
    #else
    .section .text
    #endif
    .p2align 6
    _blake3_hash_many_sse2:
    blake3_hash_many_sse2:
    _CET_ENDBR
    push r15
    push r14
    push r13
    push r12
    push rbx
    push rbp
    mov rbp, rsp
    sub rsp, 360
    and rsp, 0xFFFFFFFFFFFFFFC0
    neg r9d
    movd xmm0, r9d
    pshufd xmm0, xmm0, 0x00
    movdqa xmmword ptr [rsp+0x130], xmm0
    movdqa xmm1, xmm0
    pand xmm1, xmmword ptr [ADD0+rip]
    pand xmm0, xmmword ptr [ADD1+rip]
    movdqa xmmword ptr [rsp+0x150], xmm0
    movd xmm0, r8d
    pshufd xmm0, xmm0, 0x00
    paddd xmm0, xmm1
    movdqa xmmword ptr [rsp+0x110], xmm0
    pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
    pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
    pcmpgtd xmm1, xmm0
    shr r8, 32
    movd xmm2, r8d
    pshufd xmm2, xmm2, 0x00
    psubd xmm2, xmm1
    movdqa xmmword ptr [rsp+0x120], xmm2
    mov rbx, qword ptr [rbp+0x50]
    mov r15, rdx
    shl r15, 6
    movzx r13d, byte ptr [rbp+0x38]
    movzx r12d, byte ptr [rbp+0x48]
    cmp rsi, 4
    jc 3f
    2:
    movdqu xmm3, xmmword ptr [rcx]
    pshufd xmm0, xmm3, 0x00
    pshufd xmm1, xmm3, 0x55
    pshufd xmm2, xmm3, 0xAA
    pshufd xmm3, xmm3, 0xFF
    movdqu xmm7, xmmword ptr [rcx+0x10]
    pshufd xmm4, xmm7, 0x00
    pshufd xmm5, xmm7, 0x55
    pshufd xmm6, xmm7, 0xAA
    pshufd xmm7, xmm7, 0xFF
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    9:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    movdqu xmm8, xmmword ptr [r8+rdx-0x40]
    movdqu xmm9, xmmword ptr [r9+rdx-0x40]
    movdqu xmm10, xmmword ptr [r10+rdx-0x40]
    movdqu xmm11, xmmword ptr [r11+rdx-0x40]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp], xmm8
    movdqa xmmword ptr [rsp+0x10], xmm9
    movdqa xmmword ptr [rsp+0x20], xmm12
    movdqa xmmword ptr [rsp+0x30], xmm13
    movdqu xmm8, xmmword ptr [r8+rdx-0x30]
    movdqu xmm9, xmmword ptr [r9+rdx-0x30]
    movdqu xmm10, xmmword ptr [r10+rdx-0x30]
    movdqu xmm11, xmmword ptr [r11+rdx-0x30]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp+0x40], xmm8
    movdqa xmmword ptr [rsp+0x50], xmm9
    movdqa xmmword ptr [rsp+0x60], xmm12
    movdqa xmmword ptr [rsp+0x70], xmm13
    movdqu xmm8, xmmword ptr [r8+rdx-0x20]
    movdqu xmm9, xmmword ptr [r9+rdx-0x20]
    movdqu xmm10, xmmword ptr [r10+rdx-0x20]
    movdqu xmm11, xmmword ptr [r11+rdx-0x20]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp+0x80], xmm8
    movdqa xmmword ptr [rsp+0x90], xmm9
    movdqa xmmword ptr [rsp+0xA0], xmm12
    movdqa xmmword ptr [rsp+0xB0], xmm13
    movdqu xmm8, xmmword ptr [r8+rdx-0x10]
    movdqu xmm9, xmmword ptr [r9+rdx-0x10]
    movdqu xmm10, xmmword ptr [r10+rdx-0x10]
    movdqu xmm11, xmmword ptr [r11+rdx-0x10]
    movdqa xmm12, xmm8
    punpckldq xmm8, xmm9
    punpckhdq xmm12, xmm9
    movdqa xmm14, xmm10
    punpckldq xmm10, xmm11
    punpckhdq xmm14, xmm11
    movdqa xmm9, xmm8
    punpcklqdq xmm8, xmm10
    punpckhqdq xmm9, xmm10
    movdqa xmm13, xmm12
    punpcklqdq xmm12, xmm14
    punpckhqdq xmm13, xmm14
    movdqa xmmword ptr [rsp+0xC0], xmm8
    movdqa xmmword ptr [rsp+0xD0], xmm9
    movdqa xmmword ptr [rsp+0xE0], xmm12
    movdqa xmmword ptr [rsp+0xF0], xmm13
    movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
    movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
    movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
    movdqa xmm12, xmmword ptr [rsp+0x110]
    movdqa xmm13, xmmword ptr [rsp+0x120]
    movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
    movd xmm15, eax
    pshufd xmm15, xmm15, 0x00
    prefetcht0 [r8+rdx+0x80]
    prefetcht0 [r9+rdx+0x80]
    prefetcht0 [r10+rdx+0x80]
    prefetcht0 [r11+rdx+0x80]
    paddd xmm0, xmmword ptr [rsp]
    paddd xmm1, xmmword ptr [rsp+0x20]
    paddd xmm2, xmmword ptr [rsp+0x40]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x10]
    paddd xmm1, xmmword ptr [rsp+0x30]
    paddd xmm2, xmmword ptr [rsp+0x50]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x80]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp+0xC0]
    paddd xmm3, xmmword ptr [rsp+0xE0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x90]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0xD0]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x20]
    paddd xmm1, xmmword ptr [rsp+0x30]
    paddd xmm2, xmmword ptr [rsp+0x70]
    paddd xmm3, xmmword ptr [rsp+0x40]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x60]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp]
    paddd xmm3, xmmword ptr [rsp+0xD0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x10]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0x90]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xB0]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp+0xE0]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x30]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp+0xD0]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x40]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0x20]
    paddd xmm3, xmmword ptr [rsp+0xE0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x60]
    paddd xmm1, xmmword ptr [rsp+0x90]
    paddd xmm2, xmmword ptr [rsp+0xB0]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x50]
    paddd xmm1, xmmword ptr [rsp]
    paddd xmm2, xmmword ptr [rsp+0xF0]
    paddd xmm3, xmmword ptr [rsp+0x10]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xA0]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0xE0]
    paddd xmm3, xmmword ptr [rsp+0xD0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x70]
    paddd xmm1, xmmword ptr [rsp+0x90]
    paddd xmm2, xmmword ptr [rsp+0x30]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x40]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0x50]
    paddd xmm3, xmmword ptr [rsp+0x10]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp]
    paddd xmm1, xmmword ptr [rsp+0x20]
    paddd xmm2, xmmword ptr [rsp+0x80]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xC0]
    paddd xmm1, xmmword ptr [rsp+0x90]
    paddd xmm2, xmmword ptr [rsp+0xF0]
    paddd xmm3, xmmword ptr [rsp+0xE0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xD0]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0xA0]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0x70]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x20]
    paddd xmm1, xmmword ptr [rsp+0x30]
    paddd xmm2, xmmword ptr [rsp+0x10]
    paddd xmm3, xmmword ptr [rsp+0x40]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x90]
    paddd xmm1, xmmword ptr [rsp+0xB0]
    paddd xmm2, xmmword ptr [rsp+0x80]
    paddd xmm3, xmmword ptr [rsp+0xF0]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xE0]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp+0xC0]
    paddd xmm3, xmmword ptr [rsp+0x10]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xD0]
    paddd xmm1, xmmword ptr [rsp]
    paddd xmm2, xmmword ptr [rsp+0x20]
    paddd xmm3, xmmword ptr [rsp+0x40]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0x30]
    paddd xmm1, xmmword ptr [rsp+0xA0]
    paddd xmm2, xmmword ptr [rsp+0x60]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xB0]
    paddd xmm1, xmmword ptr [rsp+0x50]
    paddd xmm2, xmmword ptr [rsp+0x10]
    paddd xmm3, xmmword ptr [rsp+0x80]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xF0]
    paddd xmm1, xmmword ptr [rsp]
    paddd xmm2, xmmword ptr [rsp+0x90]
    paddd xmm3, xmmword ptr [rsp+0x60]
    paddd xmm0, xmm4
    paddd xmm1, xmm5
    paddd xmm2, xmm6
    paddd xmm3, xmm7
    pxor xmm12, xmm0
    pxor xmm13, xmm1
    pxor xmm14, xmm2
    pxor xmm15, xmm3
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm12
    paddd xmm9, xmm13
    paddd xmm10, xmm14
    paddd xmm11, xmm15
    pxor xmm4, xmm8
    pxor xmm5, xmm9
    pxor xmm6, xmm10
    pxor xmm7, xmm11
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    paddd xmm0, xmmword ptr [rsp+0xE0]
    paddd xmm1, xmmword ptr [rsp+0x20]
    paddd xmm2, xmmword ptr [rsp+0x30]
    paddd xmm3, xmmword ptr [rsp+0x70]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    pshuflw xmm15, xmm15, 0xB1
    pshufhw xmm15, xmm15, 0xB1
    pshuflw xmm12, xmm12, 0xB1
    pshufhw xmm12, xmm12, 0xB1
    pshuflw xmm13, xmm13, 0xB1
    pshufhw xmm13, xmm13, 0xB1
    pshuflw xmm14, xmm14, 0xB1
    pshufhw xmm14, xmm14, 0xB1
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    movdqa xmmword ptr [rsp+0x100], xmm8
    movdqa xmm8, xmm5
    psrld xmm8, 12
    pslld xmm5, 20
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 12
    pslld xmm6, 20
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 12
    pslld xmm7, 20
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 12
    pslld xmm4, 20
    por xmm4, xmm8
    paddd xmm0, xmmword ptr [rsp+0xA0]
    paddd xmm1, xmmword ptr [rsp+0xC0]
    paddd xmm2, xmmword ptr [rsp+0x40]
    paddd xmm3, xmmword ptr [rsp+0xD0]
    paddd xmm0, xmm5
    paddd xmm1, xmm6
    paddd xmm2, xmm7
    paddd xmm3, xmm4
    pxor xmm15, xmm0
    pxor xmm12, xmm1
    pxor xmm13, xmm2
    pxor xmm14, xmm3
    movdqa xmm8, xmm15
    psrld xmm15, 8
    pslld xmm8, 24
    pxor xmm15, xmm8
    movdqa xmm8, xmm12
    psrld xmm12, 8
    pslld xmm8, 24
    pxor xmm12, xmm8
    movdqa xmm8, xmm13
    psrld xmm13, 8
    pslld xmm8, 24
    pxor xmm13, xmm8
    movdqa xmm8, xmm14
    psrld xmm14, 8
    pslld xmm8, 24
    pxor xmm14, xmm8
    paddd xmm10, xmm15
    paddd xmm11, xmm12
    movdqa xmm8, xmmword ptr [rsp+0x100]
    paddd xmm8, xmm13
    paddd xmm9, xmm14
    pxor xmm5, xmm10
    pxor xmm6, xmm11
    pxor xmm7, xmm8
    pxor xmm4, xmm9
    pxor xmm0, xmm8
    pxor xmm1, xmm9
    pxor xmm2, xmm10
    pxor xmm3, xmm11
    movdqa xmm8, xmm5
    psrld xmm8, 7
    pslld xmm5, 25
    por xmm5, xmm8
    movdqa xmm8, xmm6
    psrld xmm8, 7
    pslld xmm6, 25
    por xmm6, xmm8
    movdqa xmm8, xmm7
    psrld xmm8, 7
    pslld xmm7, 25
    por xmm7, xmm8
    movdqa xmm8, xmm4
    psrld xmm8, 7
    pslld xmm4, 25
    por xmm4, xmm8
    pxor xmm4, xmm12
    pxor xmm5, xmm13
    pxor xmm6, xmm14
    pxor xmm7, xmm15
    mov eax, r13d
    jne 9b
    movdqa xmm9, xmm0
    punpckldq xmm0, xmm1
    punpckhdq xmm9, xmm1
    movdqa xmm11, xmm2
    punpckldq xmm2, xmm3
    punpckhdq xmm11, xmm3
    movdqa xmm1, xmm0
    punpcklqdq xmm0, xmm2
    punpckhqdq xmm1, xmm2
    movdqa xmm3, xmm9
    punpcklqdq xmm9, xmm11
    punpckhqdq xmm3, xmm11
    movdqu xmmword ptr [rbx], xmm0
    movdqu xmmword ptr [rbx+0x20], xmm1
    movdqu xmmword ptr [rbx+0x40], xmm9
    movdqu xmmword ptr [rbx+0x60], xmm3
    movdqa xmm9, xmm4
    punpckldq xmm4, xmm5
    punpckhdq xmm9, xmm5
    movdqa xmm11, xmm6
    punpckldq xmm6, xmm7
    punpckhdq xmm11, xmm7
    movdqa xmm5, xmm4
    punpcklqdq xmm4, xmm6
    punpckhqdq xmm5, xmm6
    movdqa xmm7, xmm9
    punpcklqdq xmm9, xmm11
    punpckhqdq xmm7, xmm11
    movdqu xmmword ptr [rbx+0x10], xmm4
    movdqu xmmword ptr [rbx+0x30], xmm5
    movdqu xmmword ptr [rbx+0x50], xmm9
    movdqu xmmword ptr [rbx+0x70], xmm7
    movdqa xmm1, xmmword ptr [rsp+0x110]
    movdqa xmm0, xmm1
    paddd xmm1, xmmword ptr [rsp+0x150]
    movdqa xmmword ptr [rsp+0x110], xmm1
    pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
    pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
    pcmpgtd xmm0, xmm1
    movdqa xmm1, xmmword ptr [rsp+0x120]
    psubd xmm1, xmm0
    movdqa xmmword ptr [rsp+0x120], xmm1
    add rbx, 128
    add rdi, 32
    sub rsi, 4
    cmp rsi, 4
    jnc 2b
    test rsi, rsi
    jnz 3f
    4:
    mov rsp, rbp
    pop rbp
    pop rbx
    pop r12
    pop r13
    pop r14
    pop r15
    ret
    .p2align 5
    3:
    test esi, 0x2
    je 3f
    movups xmm0, xmmword ptr [rcx]
    movups xmm1, xmmword ptr [rcx+0x10]
    movaps xmm8, xmm0
    movaps xmm9, xmm1
    movd xmm13, dword ptr [rsp+0x110]
    movd xmm14, dword ptr [rsp+0x120]
    punpckldq xmm13, xmm14
    movaps xmmword ptr [rsp], xmm13
    movd xmm14, dword ptr [rsp+0x114]
    movd xmm13, dword ptr [rsp+0x124]
    punpckldq xmm14, xmm13
    movaps xmmword ptr [rsp+0x10], xmm14
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    movaps xmm10, xmm2
    movups xmm4, xmmword ptr [r8+rdx-0x40]
    movups xmm5, xmmword ptr [r8+rdx-0x30]
    movaps xmm3, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm3, xmm5, 221
    movaps xmm5, xmm3
    movups xmm6, xmmword ptr [r8+rdx-0x20]
    movups xmm7, xmmword ptr [r8+rdx-0x10]
    movaps xmm3, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm3, xmm7, 221
    pshufd xmm7, xmm3, 0x93
    movups xmm12, xmmword ptr [r9+rdx-0x40]
    movups xmm13, xmmword ptr [r9+rdx-0x30]
    movaps xmm11, xmm12
    shufps xmm12, xmm13, 136
    shufps xmm11, xmm13, 221
    movaps xmm13, xmm11
    movups xmm14, xmmword ptr [r9+rdx-0x20]
    movups xmm15, xmmword ptr [r9+rdx-0x10]
    movaps xmm11, xmm14
    shufps xmm14, xmm15, 136
    pshufd xmm14, xmm14, 0x93
    shufps xmm11, xmm15, 221
    pshufd xmm15, xmm11, 0x93
    shl rax, 0x20
    or rax, 0x40
    movq xmm3, rax
    movdqa xmmword ptr [rsp+0x20], xmm3
    movaps xmm3, xmmword ptr [rsp]
    movaps xmm11, xmmword ptr [rsp+0x10]
    punpcklqdq xmm3, xmmword ptr [rsp+0x20]
    punpcklqdq xmm11, xmmword ptr [rsp+0x20]
    mov al, 7
    9:
    paddd xmm0, xmm4
    paddd xmm8, xmm12
    movaps xmmword ptr [rsp+0x20], xmm4
    movaps xmmword ptr [rsp+0x30], xmm12
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    pshuflw xmm11, xmm11, 0xB1
    pshufhw xmm11, xmm11, 0xB1
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 20
    psrld xmm4, 12
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 20
    psrld xmm4, 12
    por xmm9, xmm4
    paddd xmm0, xmm5
    paddd xmm8, xmm13
    movaps xmmword ptr [rsp+0x40], xmm5
    movaps xmmword ptr [rsp+0x50], xmm13
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    movdqa xmm13, xmm3
    psrld xmm3, 8
    pslld xmm13, 24
    pxor xmm3, xmm13
    movdqa xmm13, xmm11
    psrld xmm11, 8
    pslld xmm13, 24
    pxor xmm11, xmm13
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 25
    psrld xmm4, 7
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 25
    psrld xmm4, 7
    por xmm9, xmm4
    pshufd xmm0, xmm0, 0x93
    pshufd xmm8, xmm8, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm11, xmm11, 0x4E
    pshufd xmm2, xmm2, 0x39
    pshufd xmm10, xmm10, 0x39
    paddd xmm0, xmm6
    paddd xmm8, xmm14
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    pshuflw xmm11, xmm11, 0xB1
    pshufhw xmm11, xmm11, 0xB1
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 20
    psrld xmm4, 12
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 20
    psrld xmm4, 12
    por xmm9, xmm4
    paddd xmm0, xmm7
    paddd xmm8, xmm15
    paddd xmm0, xmm1
    paddd xmm8, xmm9
    pxor xmm3, xmm0
    pxor xmm11, xmm8
    movdqa xmm13, xmm3
    psrld xmm3, 8
    pslld xmm13, 24
    pxor xmm3, xmm13
    movdqa xmm13, xmm11
    psrld xmm11, 8
    pslld xmm13, 24
    pxor xmm11, xmm13
    paddd xmm2, xmm3
    paddd xmm10, xmm11
    pxor xmm1, xmm2
    pxor xmm9, xmm10
    movdqa xmm4, xmm1
    pslld xmm1, 25
    psrld xmm4, 7
    por xmm1, xmm4
    movdqa xmm4, xmm9
    pslld xmm9, 25
    psrld xmm4, 7
    por xmm9, xmm4
    pshufd xmm0, xmm0, 0x39
    pshufd xmm8, xmm8, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm11, xmm11, 0x4E
    pshufd xmm2, xmm2, 0x93
    pshufd xmm10, xmm10, 0x93
    dec al
    je 9f
    movdqa xmm12, xmmword ptr [rsp+0x20]
    movdqa xmm5, xmmword ptr [rsp+0x40]
    pshufd xmm13, xmm12, 0x0F
    shufps xmm12, xmm5, 214
    pshufd xmm4, xmm12, 0x39
    movdqa xmm12, xmm6
    shufps xmm12, xmm7, 250
    pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
    pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
    por xmm13, xmm12
    movdqa xmmword ptr [rsp+0x20], xmm13
    movdqa xmm12, xmm7
    punpcklqdq xmm12, xmm5
    movdqa xmm13, xmm6
    pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
    pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
    por xmm12, xmm13
    pshufd xmm12, xmm12, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmmword ptr [rsp+0x40], xmm12
    movdqa xmm5, xmmword ptr [rsp+0x30]
    movdqa xmm13, xmmword ptr [rsp+0x50]
    pshufd xmm6, xmm5, 0x0F
    shufps xmm5, xmm13, 214
    pshufd xmm12, xmm5, 0x39
    movdqa xmm5, xmm14
    shufps xmm5, xmm15, 250
    pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
    pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
    por xmm6, xmm5
    movdqa xmm5, xmm15
    punpcklqdq xmm5, xmm13
    movdqa xmmword ptr [rsp+0x30], xmm2
    movdqa xmm2, xmm14
    pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
    pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
    por xmm5, xmm2
    movdqa xmm2, xmmword ptr [rsp+0x30]
    pshufd xmm5, xmm5, 0x78
    punpckhdq xmm13, xmm15
    punpckldq xmm14, xmm13
    pshufd xmm15, xmm14, 0x1E
    movdqa xmm13, xmm6
    movdqa xmm14, xmm5
    movdqa xmm5, xmmword ptr [rsp+0x20]
    movdqa xmm6, xmmword ptr [rsp+0x40]
    jmp 9b
    9:
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    pxor xmm8, xmm10
    pxor xmm9, xmm11
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    movups xmmword ptr [rbx], xmm0
    movups xmmword ptr [rbx+0x10], xmm1
    movups xmmword ptr [rbx+0x20], xmm8
    movups xmmword ptr [rbx+0x30], xmm9
    mov eax, dword ptr [rsp+0x130]
    neg eax
    mov r10d, dword ptr [rsp+0x110+8*rax]
    mov r11d, dword ptr [rsp+0x120+8*rax]
    mov dword ptr [rsp+0x110], r10d
    mov dword ptr [rsp+0x120], r11d
    add rdi, 16
    add rbx, 64
    sub rsi, 2
    3:
    test esi, 0x1
    je 4b
    movups xmm0, xmmword ptr [rcx]
    movups xmm1, xmmword ptr [rcx+0x10]
    movd xmm13, dword ptr [rsp+0x110]
    movd xmm14, dword ptr [rsp+0x120]
    punpckldq xmm13, xmm14
    mov r8, qword ptr [rdi]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    shl rax, 32
    or rax, 64
    movq xmm12, rax
    movdqa xmm3, xmm13
    punpcklqdq xmm3, xmm12
    movups xmm4, xmmword ptr [r8+rdx-0x40]
    movups xmm5, xmmword ptr [r8+rdx-0x30]
    movaps xmm8, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm8, xmm5, 221
    movaps xmm5, xmm8
    movups xmm6, xmmword ptr [r8+rdx-0x20]
    movups xmm7, xmmword ptr [r8+rdx-0x10]
    movaps xmm8, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm8, xmm7, 221
    pshufd xmm7, xmm8, 0x93
    mov al, 7
    9:
    paddd xmm0, xmm4
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    paddd xmm0, xmm5
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    movdqa xmm14, xmm3
    psrld xmm3, 8
    pslld xmm14, 24
    pxor xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    pshufd xmm0, xmm0, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x39
    paddd xmm0, xmm6
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    paddd xmm0, xmm7
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    movdqa xmm14, xmm3
    psrld xmm3, 8
    pslld xmm14, 24
    pxor xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    pshufd xmm0, xmm0, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x93
    dec al
    jz 9f
    movdqa xmm8, xmm4
    shufps xmm8, xmm5, 214
    pshufd xmm9, xmm4, 0x0F
    pshufd xmm4, xmm8, 0x39
    movdqa xmm8, xmm6
    shufps xmm8, xmm7, 250
    pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
    pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
    por xmm9, xmm8
    movdqa xmm8, xmm7
    punpcklqdq xmm8, xmm5
    movdqa xmm10, xmm6
    pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
    pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
    por xmm8, xmm10
    pshufd xmm8, xmm8, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmm5, xmm9
    movdqa xmm6, xmm8
    jmp 9b
    9:
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    movups xmmword ptr [rbx], xmm0
    movups xmmword ptr [rbx+0x10], xmm1
    jmp 4b
    .p2align 6
    /*
     * blake3_compress_in_place_sse2 (also exported with a leading underscore).
     *
     * Runs a 7-round compression loop over one 64-byte block and overwrites
     * the 32 bytes at [rdi] with the result.
     *
     * Register inputs, as consumed by the code below:
     *   rdi - 32 bytes read as state rows 0 and 1; overwritten on exit
     *   rsi - 64-byte message block (read with unaligned loads)
     *   dl  - becomes dword 2 of state row 3 (presumably block_len - confirm
     *         against the C prototype)
     *   rcx - 64-bit value placed in dwords 0-1 of state row 3 (presumably
     *         the counter - confirm against the C prototype)
     *   r8b - becomes dword 3 of state row 3 (presumably flags - confirm)
     *
     * Registers written: al, rdx, r8, xmm0-xmm11, xmm14.
     */
    blake3_compress_in_place_sse2:
    _blake3_compress_in_place_sse2:
    _CET_ENDBR
    /* Rows 0 and 1 come from [rdi]; row 2 is the first 16 bytes of BLAKE3_IV. */
    movups xmm0, xmmword ptr [rdi]
    movups xmm1, xmmword ptr [rdi+0x10]
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    /*
     * Only the low byte of r8 and rdx is used. Zero-extend both before they
     * are combined, exactly as blake3_compress_xof_sse2 does, so that stale
     * upper register bits cannot leak into row 3.
     */
    movzx r8d, r8b
    movzx edx, dl
    shl r8, 32
    add rdx, r8
    /* Row 3 = { rcx low, rcx high, dl, r8b }. */
    movq xmm3, rcx
    movq xmm4, rdx
    punpcklqdq xmm3, xmm4
    /*
     * Load the 16 message words and split them into even/odd vectors:
     *   xmm4 = { m0, m2, m4, m6 }     xmm5 = { m1, m3, m5, m7 }
     *   xmm6 = { m14, m8, m10, m12 }  xmm7 = { m15, m9, m11, m13 }
     * (the last two are rotated by one lane to line up with the rotated
     * rows used in the second half of each round).
     */
    movups xmm4, xmmword ptr [rsi]
    movups xmm5, xmmword ptr [rsi+0x10]
    movaps xmm8, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm8, xmm5, 221
    movaps xmm5, xmm8
    movups xmm6, xmmword ptr [rsi+0x20]
    movups xmm7, xmmword ptr [rsi+0x30]
    movaps xmm8, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm8, xmm7, 221
    pshufd xmm7, xmm8, 0x93
    /* al counts the remaining rounds. */
    mov al, 7
    9:
    /* First half-round on columns: row0 += xmm4 + row1. */
    paddd xmm0, xmm4
    paddd xmm0, xmm1
    /* row3 = rotr32(row3 ^ row0, 16): swap the 16-bit halves of each dword. */
    pxor xmm3, xmm0
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    /* row2 += row3; row1 = rotr32(row1 ^ row2, 12). */
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* row0 += xmm5 + row1; row3 = rotr32(row3 ^ row0, 8). */
    paddd xmm0, xmm5
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    movdqa xmm14, xmm3
    psrld xmm3, 8
    pslld xmm14, 24
    pxor xmm3, xmm14
    /* row2 += row3; row1 = rotr32(row1 ^ row2, 7). */
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Rotate the lanes of rows 0, 3 and 2 so each lane now holds a diagonal. */
    pshufd xmm0, xmm0, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x39
    /* Second half-round: same mixing sequence with xmm6 / xmm7. */
    paddd xmm0, xmm6
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    paddd xmm0, xmm7
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    movdqa xmm14, xmm3
    psrld xmm3, 8
    pslld xmm14, 24
    pxor xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Undo the lane rotation so the rows are column-aligned again. */
    pshufd xmm0, xmm0, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x93
    /* Leave the loop after the seventh round; no reshuffle is needed then. */
    dec al
    jz 9f
    /*
     * Rearrange the four message vectors (xmm4-xmm7) for the next round.
     * The pand/por pairs with the PBLENDW_* masks merge two vectors
     * lane-wise using SSE2-only instructions.
     */
    movdqa xmm8, xmm4
    shufps xmm8, xmm5, 214
    pshufd xmm9, xmm4, 0x0F
    pshufd xmm4, xmm8, 0x39
    movdqa xmm8, xmm6
    shufps xmm8, xmm7, 250
    pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
    pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
    por xmm9, xmm8
    movdqa xmm8, xmm7
    punpcklqdq xmm8, xmm5
    movdqa xmm10, xmm6
    pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
    pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
    por xmm8, xmm10
    pshufd xmm8, xmm8, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmm5, xmm9
    movdqa xmm6, xmm8
    jmp 9b
    9:
    /* Fold the state: rows 0-1 ^= rows 2-3, then store the 32 bytes to [rdi]. */
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    movups xmmword ptr [rdi], xmm0
    movups xmmword ptr [rdi+0x10], xmm1
    ret
    .p2align 6
    /*
     * blake3_compress_xof_sse2 (also exported with a leading underscore).
     *
     * Runs a 7-round compression loop over one 64-byte block and writes a
     * 64-byte result to [r9]. The 32 bytes at [rdi] are only read.
     *
     * Register inputs, as consumed by the code below:
     *   rdi - 32 bytes read as state rows 0 and 1, and read again at the end
     *   rsi - 64-byte message block (read with unaligned loads)
     *   dl  - zero-extended into dword 2 of state row 3 (presumably
     *         block_len - confirm against the C prototype)
     *   rcx - 64-bit value placed in dwords 0-1 of state row 3 (presumably
     *         the counter - confirm against the C prototype)
     *   r8b - zero-extended into dword 3 of state row 3 (presumably flags)
     *   r9  - destination for the 64 output bytes
     *
     * Registers written: rax, rdx, xmm0-xmm11, xmm14.
     */
    blake3_compress_xof_sse2:
    _blake3_compress_xof_sse2:
    _CET_ENDBR
    /* Rows 0 and 1 come from [rdi]; row 2 is the first 16 bytes of BLAKE3_IV. */
    movups xmm0, xmmword ptr [rdi]
    movups xmm1, xmmword ptr [rdi+0x10]
    movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    /* Zero-extend the two byte-sized inputs and pack them as r8b:dl in rdx. */
    movzx eax, r8b
    movzx edx, dl
    shl rax, 32
    add rdx, rax
    /* Row 3 = { rcx low, rcx high, dl, r8b }. */
    movq xmm3, rcx
    movq xmm4, rdx
    punpcklqdq xmm3, xmm4
    /*
     * Load the 16 message words and split them into even/odd vectors:
     *   xmm4 = { m0, m2, m4, m6 }     xmm5 = { m1, m3, m5, m7 }
     *   xmm6 = { m14, m8, m10, m12 }  xmm7 = { m15, m9, m11, m13 }
     * (the last two are rotated by one lane to line up with the rotated
     * rows used in the second half of each round).
     */
    movups xmm4, xmmword ptr [rsi]
    movups xmm5, xmmword ptr [rsi+0x10]
    movaps xmm8, xmm4
    shufps xmm4, xmm5, 136
    shufps xmm8, xmm5, 221
    movaps xmm5, xmm8
    movups xmm6, xmmword ptr [rsi+0x20]
    movups xmm7, xmmword ptr [rsi+0x30]
    movaps xmm8, xmm6
    shufps xmm6, xmm7, 136
    pshufd xmm6, xmm6, 0x93
    shufps xmm8, xmm7, 221
    pshufd xmm7, xmm8, 0x93
    /* al counts the remaining rounds. */
    mov al, 7
    9:
    /* First half-round on columns: row0 += xmm4 + row1. */
    paddd xmm0, xmm4
    paddd xmm0, xmm1
    /* row3 = rotr32(row3 ^ row0, 16): swap the 16-bit halves of each dword. */
    pxor xmm3, xmm0
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    /* row2 += row3; row1 = rotr32(row1 ^ row2, 12). */
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    /* row0 += xmm5 + row1; row3 = rotr32(row3 ^ row0, 8). */
    paddd xmm0, xmm5
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    movdqa xmm14, xmm3
    psrld xmm3, 8
    pslld xmm14, 24
    pxor xmm3, xmm14
    /* row2 += row3; row1 = rotr32(row1 ^ row2, 7). */
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Rotate the lanes of rows 0, 3 and 2 so each lane now holds a diagonal. */
    pshufd xmm0, xmm0, 0x93
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x39
    /* Second half-round: same mixing sequence with xmm6 / xmm7. */
    paddd xmm0, xmm6
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    pshuflw xmm3, xmm3, 0xB1
    pshufhw xmm3, xmm3, 0xB1
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 20
    psrld xmm11, 12
    por xmm1, xmm11
    paddd xmm0, xmm7
    paddd xmm0, xmm1
    pxor xmm3, xmm0
    movdqa xmm14, xmm3
    psrld xmm3, 8
    pslld xmm14, 24
    pxor xmm3, xmm14
    paddd xmm2, xmm3
    pxor xmm1, xmm2
    movdqa xmm11, xmm1
    pslld xmm1, 25
    psrld xmm11, 7
    por xmm1, xmm11
    /* Undo the lane rotation so the rows are column-aligned again. */
    pshufd xmm0, xmm0, 0x39
    pshufd xmm3, xmm3, 0x4E
    pshufd xmm2, xmm2, 0x93
    /* Leave the loop after the seventh round; no reshuffle is needed then. */
    dec al
    jz 9f
    /*
     * Rearrange the four message vectors (xmm4-xmm7) for the next round.
     * The pand/por pairs with the PBLENDW_* masks merge two vectors
     * lane-wise using SSE2-only instructions.
     */
    movdqa xmm8, xmm4
    shufps xmm8, xmm5, 214
    pshufd xmm9, xmm4, 0x0F
    pshufd xmm4, xmm8, 0x39
    movdqa xmm8, xmm6
    shufps xmm8, xmm7, 250
    pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
    pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
    por xmm9, xmm8
    movdqa xmm8, xmm7
    punpcklqdq xmm8, xmm5
    movdqa xmm10, xmm6
    pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
    pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
    por xmm8, xmm10
    pshufd xmm8, xmm8, 0x78
    punpckhdq xmm5, xmm7
    punpckldq xmm6, xmm5
    pshufd xmm7, xmm6, 0x1E
    movdqa xmm5, xmm9
    movdqa xmm6, xmm8
    jmp 9b
    9:
    /* Reload the original 32 input bytes; the message registers are free now. */
    movdqu xmm4, xmmword ptr [rdi]
    movdqu xmm5, xmmword ptr [rdi+0x10]
    /* Output bytes 0-31: rows 0-1 XOR rows 2-3. */
    pxor xmm0, xmm2
    pxor xmm1, xmm3
    /* Output bytes 32-63: rows 2-3 XOR the original input at [rdi]. */
    pxor xmm2, xmm4
    pxor xmm3, xmm5
    movups xmmword ptr [r9], xmm0
    movups xmmword ptr [r9+0x10], xmm1
    movups xmmword ptr [r9+0x20], xmm2
    movups xmmword ptr [r9+0x30], xmm3
    ret
    /*
     * Read-only constant tables used by the SSE2 routines in this file.
     * Apple targets use the .static_data directive; everything else places
     * the tables in .rodata.
     */
    #ifdef __APPLE__
    .static_data
    #else
    .section .rodata
    #endif
    /* 64-byte alignment, so the aligned 16-byte loads of these tables are safe. */
    .p2align 6
    /* Four 32-bit IV words; loaded as one vector into state row 2. */
    BLAKE3_IV:
    .long 0x6A09E667, 0xBB67AE85
    .long 0x3C6EF372, 0xA54FF53A
    /* Per-lane offsets 0..3 (presumably added to the counter in the
       multi-input routine - its setup is outside this excerpt). */
    ADD0:
    .long 0, 1, 2, 3
    /* The value 4 in every lane (presumably the per-iteration counter step). */
    ADD1:
    .long 4, 4, 4, 4
    /* Each IV word repeated across all four lanes. */
    BLAKE3_IV_0:
    .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
    BLAKE3_IV_1:
    .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
    BLAKE3_IV_2:
    .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
    BLAKE3_IV_3:
    .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
    /* The value 64 in every lane. */
    BLAKE3_BLOCK_LEN:
    .long 64, 64, 64, 64
    /* Sign-bit mask: XORed into both operands before pcmpgtd so the signed
       compare orders the values as unsigned. */
    CMP_MSB_MASK:
    .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
    /*
     * Lane-select masks used in pand/por pairs to merge two vectors. Each
     * name gives the pblendw immediate with the same lane selection (two
     * immediate bits per dword).
     */
    /* Keeps dwords 0 and 2. */
    PBLENDW_0x33_MASK:
    .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
    /* Keeps dwords 1 and 3. */
    PBLENDW_0xCC_MASK:
    .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
    /* Keeps dwords 0, 1 and 2. */
    PBLENDW_0x3F_MASK:
    .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
    /* Keeps dword 3 only. */
    PBLENDW_0xC0_MASK:
    .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
  • file addition: blake3_avx512_x86-64_unix.S (----------)
    [0.38]
    /*
     * File preamble for the AVX-512 routines: linker notes, CET support,
     * assembler syntax, exported symbols and code section selection.
     */
    /* On Linux ELF, emit an empty .note.GNU-stack section so the object is
       not marked as needing an executable stack. */
    #if defined(__ELF__) && defined(__linux__)
    .section .note.GNU-stack,"",%progbits
    #endif
    /* When CET is enabled and <cet.h> is available, include it; it is
       expected to supply _CET_ENDBR. */
    #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
    #if __has_include(<cet.h>)
    #include <cet.h>
    #endif
    #endif
    /* Otherwise _CET_ENDBR expands to nothing at each entry point. */
    #if !defined(_CET_ENDBR)
    #define _CET_ENDBR
    #endif
    /* Intel operand order, register names without the % prefix. */
    .intel_syntax noprefix
    /* Every entry point is exported both with and without a leading
       underscore (presumably to match both Mach-O and ELF C symbol naming). */
    .global _blake3_hash_many_avx512
    .global blake3_hash_many_avx512
    .global blake3_compress_in_place_avx512
    .global _blake3_compress_in_place_avx512
    .global blake3_compress_xof_avx512
    .global _blake3_compress_xof_avx512
    /* Switch to the code section, using the directive form each platform takes. */
    #ifdef __APPLE__
    .text
    #else
    .section .text
    #endif
    .p2align 6
    _blake3_hash_many_avx512:
    blake3_hash_many_avx512:
    _CET_ENDBR
    push r15
    push r14
    push r13
    push r12
    push rbx
    push rbp
    mov rbp, rsp
    sub rsp, 144
    and rsp, 0xFFFFFFFFFFFFFFC0
    neg r9
    kmovw k1, r9d
    vmovd xmm0, r8d
    vpbroadcastd ymm0, xmm0
    shr r8, 32
    vmovd xmm1, r8d
    vpbroadcastd ymm1, xmm1
    vmovdqa ymm4, ymm1
    vmovdqa ymm5, ymm1
    vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
    vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
    vpcmpltud k2, ymm2, ymm0
    vpcmpltud k3, ymm3, ymm0
    vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
    vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
    knotw k2, k1
    vmovdqa32 ymm2 {k2}, ymm0
    vmovdqa32 ymm3 {k2}, ymm0
    vmovdqa32 ymm4 {k2}, ymm1
    vmovdqa32 ymm5 {k2}, ymm1
    vmovdqa ymmword ptr [rsp], ymm2
    vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
    vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
    vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
    shl rdx, 6
    mov qword ptr [rsp+0x80], rdx
    cmp rsi, 16
    jc 3f
    2:
    vpbroadcastd zmm0, dword ptr [rcx]
    vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
    vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
    vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
    vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
    vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
    vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
    vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
    movzx eax, byte ptr [rbp+0x38]
    movzx ebx, byte ptr [rbp+0x40]
    or eax, ebx
    xor edx, edx
    .p2align 5
    9:
    movzx ebx, byte ptr [rbp+0x48]
    or ebx, eax
    add rdx, 64
    cmp rdx, qword ptr [rsp+0x80]
    cmove eax, ebx
    mov dword ptr [rsp+0x88], eax
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    mov r12, qword ptr [rdi+0x40]
    mov r13, qword ptr [rdi+0x48]
    mov r14, qword ptr [rdi+0x50]
    mov r15, qword ptr [rdi+0x58]
    vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
    vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
    vpunpcklqdq zmm8, zmm16, zmm17
    vpunpckhqdq zmm9, zmm16, zmm17
    vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
    vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
    vpunpcklqdq zmm10, zmm18, zmm19
    vpunpckhqdq zmm11, zmm18, zmm19
    mov r8, qword ptr [rdi+0x20]
    mov r9, qword ptr [rdi+0x28]
    mov r10, qword ptr [rdi+0x30]
    mov r11, qword ptr [rdi+0x38]
    mov r12, qword ptr [rdi+0x60]
    mov r13, qword ptr [rdi+0x68]
    mov r14, qword ptr [rdi+0x70]
    mov r15, qword ptr [rdi+0x78]
    vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
    vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
    vpunpcklqdq zmm12, zmm16, zmm17
    vpunpckhqdq zmm13, zmm16, zmm17
    vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
    vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
    vpunpcklqdq zmm14, zmm18, zmm19
    vpunpckhqdq zmm15, zmm18, zmm19
    vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
    vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
    vshufps zmm16, zmm8, zmm10, 136
    vshufps zmm17, zmm12, zmm14, 136
    vmovdqa32 zmm20, zmm16
    vpermt2d zmm16, zmm27, zmm17
    vpermt2d zmm20, zmm31, zmm17
    vshufps zmm17, zmm8, zmm10, 221
    vshufps zmm30, zmm12, zmm14, 221
    vmovdqa32 zmm21, zmm17
    vpermt2d zmm17, zmm27, zmm30
    vpermt2d zmm21, zmm31, zmm30
    vshufps zmm18, zmm9, zmm11, 136
    vshufps zmm8, zmm13, zmm15, 136
    vmovdqa32 zmm22, zmm18
    vpermt2d zmm18, zmm27, zmm8
    vpermt2d zmm22, zmm31, zmm8
    vshufps zmm19, zmm9, zmm11, 221
    vshufps zmm8, zmm13, zmm15, 221
    vmovdqa32 zmm23, zmm19
    vpermt2d zmm19, zmm27, zmm8
    vpermt2d zmm23, zmm31, zmm8
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    mov r12, qword ptr [rdi+0x40]
    mov r13, qword ptr [rdi+0x48]
    mov r14, qword ptr [rdi+0x50]
    mov r15, qword ptr [rdi+0x58]
    vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
    vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
    vpunpcklqdq zmm8, zmm24, zmm25
    vpunpckhqdq zmm9, zmm24, zmm25
    vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
    vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
    vpunpcklqdq zmm10, zmm24, zmm25
    vpunpckhqdq zmm11, zmm24, zmm25
    prefetcht0 [r8+rdx+0x80]
    prefetcht0 [r12+rdx+0x80]
    prefetcht0 [r9+rdx+0x80]
    prefetcht0 [r13+rdx+0x80]
    prefetcht0 [r10+rdx+0x80]
    prefetcht0 [r14+rdx+0x80]
    prefetcht0 [r11+rdx+0x80]
    prefetcht0 [r15+rdx+0x80]
    mov r8, qword ptr [rdi+0x20]
    mov r9, qword ptr [rdi+0x28]
    mov r10, qword ptr [rdi+0x30]
    mov r11, qword ptr [rdi+0x38]
    mov r12, qword ptr [rdi+0x60]
    mov r13, qword ptr [rdi+0x68]
    mov r14, qword ptr [rdi+0x70]
    mov r15, qword ptr [rdi+0x78]
    vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
    vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
    vpunpcklqdq zmm12, zmm24, zmm25
    vpunpckhqdq zmm13, zmm24, zmm25
    vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
    vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
    vpunpcklqdq zmm14, zmm24, zmm25
    vpunpckhqdq zmm15, zmm24, zmm25
    prefetcht0 [r8+rdx+0x80]
    prefetcht0 [r12+rdx+0x80]
    prefetcht0 [r9+rdx+0x80]
    prefetcht0 [r13+rdx+0x80]
    prefetcht0 [r10+rdx+0x80]
    prefetcht0 [r14+rdx+0x80]
    prefetcht0 [r11+rdx+0x80]
    prefetcht0 [r15+rdx+0x80]
    vshufps zmm24, zmm8, zmm10, 136
    vshufps zmm30, zmm12, zmm14, 136
    vmovdqa32 zmm28, zmm24
    vpermt2d zmm24, zmm27, zmm30
    vpermt2d zmm28, zmm31, zmm30
    vshufps zmm25, zmm8, zmm10, 221
    vshufps zmm30, zmm12, zmm14, 221
    vmovdqa32 zmm29, zmm25
    vpermt2d zmm25, zmm27, zmm30
    vpermt2d zmm29, zmm31, zmm30
    vshufps zmm26, zmm9, zmm11, 136
    vshufps zmm8, zmm13, zmm15, 136
    vmovdqa32 zmm30, zmm26
    vpermt2d zmm26, zmm27, zmm8
    vpermt2d zmm30, zmm31, zmm8
    vshufps zmm8, zmm9, zmm11, 221
    vshufps zmm10, zmm13, zmm15, 221
    vpermi2d zmm27, zmm8, zmm10
    vpermi2d zmm31, zmm8, zmm10
    vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
    vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
    vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
    vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
    vmovdqa32 zmm12, zmmword ptr [rsp]
    vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
    vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
    vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
    vpaddd zmm0, zmm0, zmm16
    vpaddd zmm1, zmm1, zmm18
    vpaddd zmm2, zmm2, zmm20
    vpaddd zmm3, zmm3, zmm22
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vprord zmm15, zmm15, 16
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 12
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vpaddd zmm0, zmm0, zmm17
    vpaddd zmm1, zmm1, zmm19
    vpaddd zmm2, zmm2, zmm21
    vpaddd zmm3, zmm3, zmm23
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vprord zmm15, zmm15, 8
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 7
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vpaddd zmm0, zmm0, zmm24
    vpaddd zmm1, zmm1, zmm26
    vpaddd zmm2, zmm2, zmm28
    vpaddd zmm3, zmm3, zmm30
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 16
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vprord zmm4, zmm4, 12
    vpaddd zmm0, zmm0, zmm25
    vpaddd zmm1, zmm1, zmm27
    vpaddd zmm2, zmm2, zmm29
    vpaddd zmm3, zmm3, zmm31
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 8
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vprord zmm4, zmm4, 7
    vpaddd zmm0, zmm0, zmm18
    vpaddd zmm1, zmm1, zmm19
    vpaddd zmm2, zmm2, zmm23
    vpaddd zmm3, zmm3, zmm20
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vprord zmm15, zmm15, 16
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 12
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vpaddd zmm0, zmm0, zmm22
    vpaddd zmm1, zmm1, zmm26
    vpaddd zmm2, zmm2, zmm16
    vpaddd zmm3, zmm3, zmm29
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vprord zmm15, zmm15, 8
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 7
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vpaddd zmm0, zmm0, zmm17
    vpaddd zmm1, zmm1, zmm28
    vpaddd zmm2, zmm2, zmm25
    vpaddd zmm3, zmm3, zmm31
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 16
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vprord zmm4, zmm4, 12
    vpaddd zmm0, zmm0, zmm27
    vpaddd zmm1, zmm1, zmm21
    vpaddd zmm2, zmm2, zmm30
    vpaddd zmm3, zmm3, zmm24
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 8
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vprord zmm4, zmm4, 7
    vpaddd zmm0, zmm0, zmm19
    vpaddd zmm1, zmm1, zmm26
    vpaddd zmm2, zmm2, zmm29
    vpaddd zmm3, zmm3, zmm23
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vprord zmm15, zmm15, 16
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 12
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vpaddd zmm0, zmm0, zmm20
    vpaddd zmm1, zmm1, zmm28
    vpaddd zmm2, zmm2, zmm18
    vpaddd zmm3, zmm3, zmm30
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vprord zmm15, zmm15, 8
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 7
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vpaddd zmm0, zmm0, zmm22
    vpaddd zmm1, zmm1, zmm25
    vpaddd zmm2, zmm2, zmm27
    vpaddd zmm3, zmm3, zmm24
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 16
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vprord zmm4, zmm4, 12
    vpaddd zmm0, zmm0, zmm21
    vpaddd zmm1, zmm1, zmm16
    vpaddd zmm2, zmm2, zmm31
    vpaddd zmm3, zmm3, zmm17
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 8
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vprord zmm4, zmm4, 7
    vpaddd zmm0, zmm0, zmm26
    vpaddd zmm1, zmm1, zmm28
    vpaddd zmm2, zmm2, zmm30
    vpaddd zmm3, zmm3, zmm29
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vprord zmm15, zmm15, 16
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 12
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vpaddd zmm0, zmm0, zmm23
    vpaddd zmm1, zmm1, zmm25
    vpaddd zmm2, zmm2, zmm19
    vpaddd zmm3, zmm3, zmm31
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vprord zmm15, zmm15, 8
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 7
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vpaddd zmm0, zmm0, zmm20
    vpaddd zmm1, zmm1, zmm27
    vpaddd zmm2, zmm2, zmm21
    vpaddd zmm3, zmm3, zmm17
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 16
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vprord zmm4, zmm4, 12
    vpaddd zmm0, zmm0, zmm16
    vpaddd zmm1, zmm1, zmm18
    vpaddd zmm2, zmm2, zmm24
    vpaddd zmm3, zmm3, zmm22
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 8
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vprord zmm4, zmm4, 7
    vpaddd zmm0, zmm0, zmm28
    vpaddd zmm1, zmm1, zmm25
    vpaddd zmm2, zmm2, zmm31
    vpaddd zmm3, zmm3, zmm30
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vprord zmm15, zmm15, 16
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 12
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vpaddd zmm0, zmm0, zmm29
    vpaddd zmm1, zmm1, zmm27
    vpaddd zmm2, zmm2, zmm26
    vpaddd zmm3, zmm3, zmm24
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vprord zmm15, zmm15, 8
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 7
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vpaddd zmm0, zmm0, zmm23
    vpaddd zmm1, zmm1, zmm21
    vpaddd zmm2, zmm2, zmm16
    vpaddd zmm3, zmm3, zmm22
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 16
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vprord zmm4, zmm4, 12
    vpaddd zmm0, zmm0, zmm18
    vpaddd zmm1, zmm1, zmm19
    vpaddd zmm2, zmm2, zmm17
    vpaddd zmm3, zmm3, zmm20
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 8
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vprord zmm4, zmm4, 7
    vpaddd zmm0, zmm0, zmm25
    vpaddd zmm1, zmm1, zmm27
    vpaddd zmm2, zmm2, zmm24
    vpaddd zmm3, zmm3, zmm31
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vprord zmm15, zmm15, 16
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 12
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vpaddd zmm0, zmm0, zmm30
    vpaddd zmm1, zmm1, zmm21
    vpaddd zmm2, zmm2, zmm28
    vpaddd zmm3, zmm3, zmm17
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vprord zmm15, zmm15, 8
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 7
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vpaddd zmm0, zmm0, zmm29
    vpaddd zmm1, zmm1, zmm16
    vpaddd zmm2, zmm2, zmm18
    vpaddd zmm3, zmm3, zmm20
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 16
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vprord zmm4, zmm4, 12
    vpaddd zmm0, zmm0, zmm19
    vpaddd zmm1, zmm1, zmm26
    vpaddd zmm2, zmm2, zmm22
    vpaddd zmm3, zmm3, zmm23
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 8
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vprord zmm4, zmm4, 7
    vpaddd zmm0, zmm0, zmm27
    vpaddd zmm1, zmm1, zmm21
    vpaddd zmm2, zmm2, zmm17
    vpaddd zmm3, zmm3, zmm24
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vprord zmm15, zmm15, 16
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 12
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vpaddd zmm0, zmm0, zmm31
    vpaddd zmm1, zmm1, zmm16
    vpaddd zmm2, zmm2, zmm25
    vpaddd zmm3, zmm3, zmm22
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm1, zmm1, zmm5
    vpaddd zmm2, zmm2, zmm6
    vpaddd zmm3, zmm3, zmm7
    vpxord zmm12, zmm12, zmm0
    vpxord zmm13, zmm13, zmm1
    vpxord zmm14, zmm14, zmm2
    vpxord zmm15, zmm15, zmm3
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vprord zmm15, zmm15, 8
    vpaddd zmm8, zmm8, zmm12
    vpaddd zmm9, zmm9, zmm13
    vpaddd zmm10, zmm10, zmm14
    vpaddd zmm11, zmm11, zmm15
    vpxord zmm4, zmm4, zmm8
    vpxord zmm5, zmm5, zmm9
    vpxord zmm6, zmm6, zmm10
    vpxord zmm7, zmm7, zmm11
    vprord zmm4, zmm4, 7
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vpaddd zmm0, zmm0, zmm30
    vpaddd zmm1, zmm1, zmm18
    vpaddd zmm2, zmm2, zmm19
    vpaddd zmm3, zmm3, zmm23
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 16
    vprord zmm12, zmm12, 16
    vprord zmm13, zmm13, 16
    vprord zmm14, zmm14, 16
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 12
    vprord zmm6, zmm6, 12
    vprord zmm7, zmm7, 12
    vprord zmm4, zmm4, 12
    vpaddd zmm0, zmm0, zmm26
    vpaddd zmm1, zmm1, zmm28
    vpaddd zmm2, zmm2, zmm20
    vpaddd zmm3, zmm3, zmm29
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm1, zmm1, zmm6
    vpaddd zmm2, zmm2, zmm7
    vpaddd zmm3, zmm3, zmm4
    vpxord zmm15, zmm15, zmm0
    vpxord zmm12, zmm12, zmm1
    vpxord zmm13, zmm13, zmm2
    vpxord zmm14, zmm14, zmm3
    vprord zmm15, zmm15, 8
    vprord zmm12, zmm12, 8
    vprord zmm13, zmm13, 8
    vprord zmm14, zmm14, 8
    vpaddd zmm10, zmm10, zmm15
    vpaddd zmm11, zmm11, zmm12
    vpaddd zmm8, zmm8, zmm13
    vpaddd zmm9, zmm9, zmm14
    vpxord zmm5, zmm5, zmm10
    vpxord zmm6, zmm6, zmm11
    vpxord zmm7, zmm7, zmm8
    vpxord zmm4, zmm4, zmm9
    vprord zmm5, zmm5, 7
    vprord zmm6, zmm6, 7
    vprord zmm7, zmm7, 7
    vprord zmm4, zmm4, 7
    vpxord zmm0, zmm0, zmm8
    vpxord zmm1, zmm1, zmm9
    vpxord zmm2, zmm2, zmm10
    vpxord zmm3, zmm3, zmm11
    vpxord zmm4, zmm4, zmm12
    vpxord zmm5, zmm5, zmm13
    vpxord zmm6, zmm6, zmm14
    vpxord zmm7, zmm7, zmm15
    movzx eax, byte ptr [rbp+0x38]
    jne 9b
    mov rbx, qword ptr [rbp+0x50]
    vpunpckldq zmm16, zmm0, zmm1
    vpunpckhdq zmm17, zmm0, zmm1
    vpunpckldq zmm18, zmm2, zmm3
    vpunpckhdq zmm19, zmm2, zmm3
    vpunpckldq zmm20, zmm4, zmm5
    vpunpckhdq zmm21, zmm4, zmm5
    vpunpckldq zmm22, zmm6, zmm7
    vpunpckhdq zmm23, zmm6, zmm7
    vpunpcklqdq zmm0, zmm16, zmm18
    vpunpckhqdq zmm1, zmm16, zmm18
    vpunpcklqdq zmm2, zmm17, zmm19
    vpunpckhqdq zmm3, zmm17, zmm19
    vpunpcklqdq zmm4, zmm20, zmm22
    vpunpckhqdq zmm5, zmm20, zmm22
    vpunpcklqdq zmm6, zmm21, zmm23
    vpunpckhqdq zmm7, zmm21, zmm23
    vshufi32x4 zmm16, zmm0, zmm4, 0x88
    vshufi32x4 zmm17, zmm1, zmm5, 0x88
    vshufi32x4 zmm18, zmm2, zmm6, 0x88
    vshufi32x4 zmm19, zmm3, zmm7, 0x88
    vshufi32x4 zmm20, zmm0, zmm4, 0xDD
    vshufi32x4 zmm21, zmm1, zmm5, 0xDD
    vshufi32x4 zmm22, zmm2, zmm6, 0xDD
    vshufi32x4 zmm23, zmm3, zmm7, 0xDD
    vshufi32x4 zmm0, zmm16, zmm17, 0x88
    vshufi32x4 zmm1, zmm18, zmm19, 0x88
    vshufi32x4 zmm2, zmm20, zmm21, 0x88
    vshufi32x4 zmm3, zmm22, zmm23, 0x88
    vshufi32x4 zmm4, zmm16, zmm17, 0xDD
    vshufi32x4 zmm5, zmm18, zmm19, 0xDD
    vshufi32x4 zmm6, zmm20, zmm21, 0xDD
    vshufi32x4 zmm7, zmm22, zmm23, 0xDD
    vmovdqu32 zmmword ptr [rbx], zmm0
    vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
    vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
    vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
    vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
    vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
    vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
    vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
    vmovdqa32 zmm0, zmmword ptr [rsp]
    vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
    vmovdqa32 zmm2, zmm0
    vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
    vpcmpltud k2, zmm2, zmm0
    vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
    vmovdqa32 zmmword ptr [rsp], zmm2
    vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
    add rdi, 128
    add rbx, 512
    mov qword ptr [rbp+0x50], rbx
    sub rsi, 16
    cmp rsi, 16
    jnc 2b
    test rsi, rsi
    jnz 3f
    4:
    vzeroupper
    mov rsp, rbp
    pop rbp
    pop rbx
    pop r12
    pop r13
    pop r14
    pop r15
    ret
    .p2align 6
    3:
    test esi, 0x8
    je 3f
    vpbroadcastd ymm0, dword ptr [rcx]
    vpbroadcastd ymm1, dword ptr [rcx+0x4]
    vpbroadcastd ymm2, dword ptr [rcx+0x8]
    vpbroadcastd ymm3, dword ptr [rcx+0xC]
    vpbroadcastd ymm4, dword ptr [rcx+0x10]
    vpbroadcastd ymm5, dword ptr [rcx+0x14]
    vpbroadcastd ymm6, dword ptr [rcx+0x18]
    vpbroadcastd ymm7, dword ptr [rcx+0x1C]
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    mov r12, qword ptr [rdi+0x20]
    mov r13, qword ptr [rdi+0x28]
    mov r14, qword ptr [rdi+0x30]
    mov r15, qword ptr [rdi+0x38]
    movzx eax, byte ptr [rbp+0x38]
    movzx ebx, byte ptr [rbp+0x40]
    or eax, ebx
    xor edx, edx
    2:
    movzx ebx, byte ptr [rbp+0x48]
    or ebx, eax
    add rdx, 64
    cmp rdx, qword ptr [rsp+0x80]
    cmove eax, ebx
    mov dword ptr [rsp+0x88], eax
    vmovups xmm8, xmmword ptr [r8+rdx-0x40]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x40]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x40]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x40]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm16, ymm12, ymm14, 136
    vshufps ymm17, ymm12, ymm14, 221
    vshufps ymm18, ymm13, ymm15, 136
    vshufps ymm19, ymm13, ymm15, 221
    vmovups xmm8, xmmword ptr [r8+rdx-0x30]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x30]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x30]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x30]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm20, ymm12, ymm14, 136
    vshufps ymm21, ymm12, ymm14, 221
    vshufps ymm22, ymm13, ymm15, 136
    vshufps ymm23, ymm13, ymm15, 221
    vmovups xmm8, xmmword ptr [r8+rdx-0x20]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x20]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x20]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x20]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm24, ymm12, ymm14, 136
    vshufps ymm25, ymm12, ymm14, 221
    vshufps ymm26, ymm13, ymm15, 136
    vshufps ymm27, ymm13, ymm15, 221
    vmovups xmm8, xmmword ptr [r8+rdx-0x10]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x10]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x10]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x10]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm28, ymm12, ymm14, 136
    vshufps ymm29, ymm12, ymm14, 221
    vshufps ymm30, ymm13, ymm15, 136
    vshufps ymm31, ymm13, ymm15, 221
    vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
    vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
    vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
    vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
    vmovdqa ymm12, ymmword ptr [rsp]
    vmovdqa ymm13, ymmword ptr [rsp+0x40]
    vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
    vpbroadcastd ymm15, dword ptr [rsp+0x88]
    vpaddd ymm0, ymm0, ymm16
    vpaddd ymm1, ymm1, ymm18
    vpaddd ymm2, ymm2, ymm20
    vpaddd ymm3, ymm3, ymm22
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vprord ymm15, ymm15, 16
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 12
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vpaddd ymm0, ymm0, ymm17
    vpaddd ymm1, ymm1, ymm19
    vpaddd ymm2, ymm2, ymm21
    vpaddd ymm3, ymm3, ymm23
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vprord ymm15, ymm15, 8
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 7
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vpaddd ymm0, ymm0, ymm24
    vpaddd ymm1, ymm1, ymm26
    vpaddd ymm2, ymm2, ymm28
    vpaddd ymm3, ymm3, ymm30
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 16
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vprord ymm4, ymm4, 12
    vpaddd ymm0, ymm0, ymm25
    vpaddd ymm1, ymm1, ymm27
    vpaddd ymm2, ymm2, ymm29
    vpaddd ymm3, ymm3, ymm31
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 8
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vprord ymm4, ymm4, 7
    vpaddd ymm0, ymm0, ymm18
    vpaddd ymm1, ymm1, ymm19
    vpaddd ymm2, ymm2, ymm23
    vpaddd ymm3, ymm3, ymm20
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vprord ymm15, ymm15, 16
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 12
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vpaddd ymm0, ymm0, ymm22
    vpaddd ymm1, ymm1, ymm26
    vpaddd ymm2, ymm2, ymm16
    vpaddd ymm3, ymm3, ymm29
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vprord ymm15, ymm15, 8
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 7
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vpaddd ymm0, ymm0, ymm17
    vpaddd ymm1, ymm1, ymm28
    vpaddd ymm2, ymm2, ymm25
    vpaddd ymm3, ymm3, ymm31
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 16
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vprord ymm4, ymm4, 12
    vpaddd ymm0, ymm0, ymm27
    vpaddd ymm1, ymm1, ymm21
    vpaddd ymm2, ymm2, ymm30
    vpaddd ymm3, ymm3, ymm24
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 8
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vprord ymm4, ymm4, 7
    vpaddd ymm0, ymm0, ymm19
    vpaddd ymm1, ymm1, ymm26
    vpaddd ymm2, ymm2, ymm29
    vpaddd ymm3, ymm3, ymm23
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vprord ymm15, ymm15, 16
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 12
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vpaddd ymm0, ymm0, ymm20
    vpaddd ymm1, ymm1, ymm28
    vpaddd ymm2, ymm2, ymm18
    vpaddd ymm3, ymm3, ymm30
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vprord ymm15, ymm15, 8
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 7
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vpaddd ymm0, ymm0, ymm22
    vpaddd ymm1, ymm1, ymm25
    vpaddd ymm2, ymm2, ymm27
    vpaddd ymm3, ymm3, ymm24
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 16
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vprord ymm4, ymm4, 12
    vpaddd ymm0, ymm0, ymm21
    vpaddd ymm1, ymm1, ymm16
    vpaddd ymm2, ymm2, ymm31
    vpaddd ymm3, ymm3, ymm17
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 8
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vprord ymm4, ymm4, 7
    vpaddd ymm0, ymm0, ymm26
    vpaddd ymm1, ymm1, ymm28
    vpaddd ymm2, ymm2, ymm30
    vpaddd ymm3, ymm3, ymm29
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vprord ymm15, ymm15, 16
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 12
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vpaddd ymm0, ymm0, ymm23
    vpaddd ymm1, ymm1, ymm25
    vpaddd ymm2, ymm2, ymm19
    vpaddd ymm3, ymm3, ymm31
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vprord ymm15, ymm15, 8
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 7
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vpaddd ymm0, ymm0, ymm20
    vpaddd ymm1, ymm1, ymm27
    vpaddd ymm2, ymm2, ymm21
    vpaddd ymm3, ymm3, ymm17
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 16
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vprord ymm4, ymm4, 12
    vpaddd ymm0, ymm0, ymm16
    vpaddd ymm1, ymm1, ymm18
    vpaddd ymm2, ymm2, ymm24
    vpaddd ymm3, ymm3, ymm22
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 8
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vprord ymm4, ymm4, 7
    vpaddd ymm0, ymm0, ymm28
    vpaddd ymm1, ymm1, ymm25
    vpaddd ymm2, ymm2, ymm31
    vpaddd ymm3, ymm3, ymm30
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vprord ymm15, ymm15, 16
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 12
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vpaddd ymm0, ymm0, ymm29
    vpaddd ymm1, ymm1, ymm27
    vpaddd ymm2, ymm2, ymm26
    vpaddd ymm3, ymm3, ymm24
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vprord ymm15, ymm15, 8
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 7
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vpaddd ymm0, ymm0, ymm23
    vpaddd ymm1, ymm1, ymm21
    vpaddd ymm2, ymm2, ymm16
    vpaddd ymm3, ymm3, ymm22
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 16
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vprord ymm4, ymm4, 12
    vpaddd ymm0, ymm0, ymm18
    vpaddd ymm1, ymm1, ymm19
    vpaddd ymm2, ymm2, ymm17
    vpaddd ymm3, ymm3, ymm20
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 8
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vprord ymm4, ymm4, 7
    vpaddd ymm0, ymm0, ymm25
    vpaddd ymm1, ymm1, ymm27
    vpaddd ymm2, ymm2, ymm24
    vpaddd ymm3, ymm3, ymm31
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vprord ymm15, ymm15, 16
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 12
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vpaddd ymm0, ymm0, ymm30
    vpaddd ymm1, ymm1, ymm21
    vpaddd ymm2, ymm2, ymm28
    vpaddd ymm3, ymm3, ymm17
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vprord ymm15, ymm15, 8
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 7
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vpaddd ymm0, ymm0, ymm29
    vpaddd ymm1, ymm1, ymm16
    vpaddd ymm2, ymm2, ymm18
    vpaddd ymm3, ymm3, ymm20
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 16
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vprord ymm4, ymm4, 12
    vpaddd ymm0, ymm0, ymm19
    vpaddd ymm1, ymm1, ymm26
    vpaddd ymm2, ymm2, ymm22
    vpaddd ymm3, ymm3, ymm23
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 8
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vprord ymm4, ymm4, 7
    vpaddd ymm0, ymm0, ymm27
    vpaddd ymm1, ymm1, ymm21
    vpaddd ymm2, ymm2, ymm17
    vpaddd ymm3, ymm3, ymm24
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vprord ymm15, ymm15, 16
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 12
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vpaddd ymm0, ymm0, ymm31
    vpaddd ymm1, ymm1, ymm16
    vpaddd ymm2, ymm2, ymm25
    vpaddd ymm3, ymm3, ymm22
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxord ymm12, ymm12, ymm0
    vpxord ymm13, ymm13, ymm1
    vpxord ymm14, ymm14, ymm2
    vpxord ymm15, ymm15, ymm3
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vprord ymm15, ymm15, 8
    vpaddd ymm8, ymm8, ymm12
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxord ymm4, ymm4, ymm8
    vpxord ymm5, ymm5, ymm9
    vpxord ymm6, ymm6, ymm10
    vpxord ymm7, ymm7, ymm11
    vprord ymm4, ymm4, 7
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vpaddd ymm0, ymm0, ymm30
    vpaddd ymm1, ymm1, ymm18
    vpaddd ymm2, ymm2, ymm19
    vpaddd ymm3, ymm3, ymm23
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 16
    vprord ymm12, ymm12, 16
    vprord ymm13, ymm13, 16
    vprord ymm14, ymm14, 16
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 12
    vprord ymm6, ymm6, 12
    vprord ymm7, ymm7, 12
    vprord ymm4, ymm4, 12
    vpaddd ymm0, ymm0, ymm26
    vpaddd ymm1, ymm1, ymm28
    vpaddd ymm2, ymm2, ymm20
    vpaddd ymm3, ymm3, ymm29
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxord ymm15, ymm15, ymm0
    vpxord ymm12, ymm12, ymm1
    vpxord ymm13, ymm13, ymm2
    vpxord ymm14, ymm14, ymm3
    vprord ymm15, ymm15, 8
    vprord ymm12, ymm12, 8
    vprord ymm13, ymm13, 8
    vprord ymm14, ymm14, 8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm9, ymm9, ymm14
    vpxord ymm5, ymm5, ymm10
    vpxord ymm6, ymm6, ymm11
    vpxord ymm7, ymm7, ymm8
    vpxord ymm4, ymm4, ymm9
    vprord ymm5, ymm5, 7
    vprord ymm6, ymm6, 7
    vprord ymm7, ymm7, 7
    vprord ymm4, ymm4, 7
    vpxor ymm0, ymm0, ymm8
    vpxor ymm1, ymm1, ymm9
    vpxor ymm2, ymm2, ymm10
    vpxor ymm3, ymm3, ymm11
    vpxor ymm4, ymm4, ymm12
    vpxor ymm5, ymm5, ymm13
    vpxor ymm6, ymm6, ymm14
    vpxor ymm7, ymm7, ymm15
    movzx eax, byte ptr [rbp+0x38]
    jne 2b
    mov rbx, qword ptr [rbp+0x50]
    vunpcklps ymm8, ymm0, ymm1
    vunpcklps ymm9, ymm2, ymm3
    vunpckhps ymm10, ymm0, ymm1
    vunpcklps ymm11, ymm4, ymm5
    vunpcklps ymm0, ymm6, ymm7
    vshufps ymm12, ymm8, ymm9, 78
    vblendps ymm1, ymm8, ymm12, 0xCC
    vshufps ymm8, ymm11, ymm0, 78
    vunpckhps ymm13, ymm2, ymm3
    vblendps ymm2, ymm11, ymm8, 0xCC
    vblendps ymm3, ymm12, ymm9, 0xCC
    vperm2f128 ymm12, ymm1, ymm2, 0x20
    vmovups ymmword ptr [rbx], ymm12
    vunpckhps ymm14, ymm4, ymm5
    vblendps ymm4, ymm8, ymm0, 0xCC
    vunpckhps ymm15, ymm6, ymm7
    vperm2f128 ymm7, ymm3, ymm4, 0x20
    vmovups ymmword ptr [rbx+0x20], ymm7
    vshufps ymm5, ymm10, ymm13, 78
    vblendps ymm6, ymm5, ymm13, 0xCC
    vshufps ymm13, ymm14, ymm15, 78
    vblendps ymm10, ymm10, ymm5, 0xCC
    vblendps ymm14, ymm14, ymm13, 0xCC
    vperm2f128 ymm8, ymm10, ymm14, 0x20
    vmovups ymmword ptr [rbx+0x40], ymm8
    vblendps ymm15, ymm13, ymm15, 0xCC
    vperm2f128 ymm13, ymm6, ymm15, 0x20
    vmovups ymmword ptr [rbx+0x60], ymm13
    vperm2f128 ymm9, ymm1, ymm2, 0x31
    vperm2f128 ymm11, ymm3, ymm4, 0x31
    vmovups ymmword ptr [rbx+0x80], ymm9
    vperm2f128 ymm14, ymm10, ymm14, 0x31
    vperm2f128 ymm15, ymm6, ymm15, 0x31
    vmovups ymmword ptr [rbx+0xA0], ymm11
    vmovups ymmword ptr [rbx+0xC0], ymm14
    vmovups ymmword ptr [rbx+0xE0], ymm15
    vmovdqa ymm0, ymmword ptr [rsp]
    vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
    vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
    vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
    vmovdqa ymmword ptr [rsp], ymm0
    vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
    add rbx, 256
    mov qword ptr [rbp+0x50], rbx
    add rdi, 64
    sub rsi, 8
    3:
    mov rbx, qword ptr [rbp+0x50]
    mov r15, qword ptr [rsp+0x80]
    movzx r13, byte ptr [rbp+0x38]
    movzx r12, byte ptr [rbp+0x48]
    test esi, 0x4
    je 3f
    vbroadcasti32x4 zmm0, xmmword ptr [rcx]
    vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
    vmovdqa xmm12, xmmword ptr [rsp]
    vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
    vpunpckldq xmm14, xmm12, xmm13
    vpunpckhdq xmm15, xmm12, xmm13
    vpermq ymm14, ymm14, 0xDC
    vpermq ymm15, ymm15, 0xDC
    vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
    vinserti64x4 zmm13, zmm14, ymm15, 0x01
    mov eax, 17476
    kmovw k2, eax
    vpblendmd zmm13 {k2}, zmm13, zmm12
    vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    mov eax, 43690
    kmovw k3, eax
    mov eax, 34952
    kmovw k4, eax
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    .p2align 5
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    mov dword ptr [rsp+0x88], eax
    vmovdqa32 zmm2, zmm15
    vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
    vpblendmd zmm3 {k4}, zmm13, zmm8
    vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
    vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
    vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
    vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
    vmovups zmm9, zmmword ptr [r8+rdx-0x30]
    vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
    vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
    vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
    vshufps zmm4, zmm8, zmm9, 136
    vshufps zmm5, zmm8, zmm9, 221
    vmovups zmm8, zmmword ptr [r8+rdx-0x20]
    vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
    vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
    vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
    vmovups zmm9, zmmword ptr [r8+rdx-0x10]
    vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
    vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
    vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
    vshufps zmm6, zmm8, zmm9, 136
    vshufps zmm7, zmm8, zmm9, 221
    vpshufd zmm6, zmm6, 0x93
    vpshufd zmm7, zmm7, 0x93
    mov al, 7
    9:
    vpaddd zmm0, zmm0, zmm4
    vpaddd zmm0, zmm0, zmm1
    vpxord zmm3, zmm3, zmm0
    vprord zmm3, zmm3, 16
    vpaddd zmm2, zmm2, zmm3
    vpxord zmm1, zmm1, zmm2
    vprord zmm1, zmm1, 12
    vpaddd zmm0, zmm0, zmm5
    vpaddd zmm0, zmm0, zmm1
    vpxord zmm3, zmm3, zmm0
    vprord zmm3, zmm3, 8
    vpaddd zmm2, zmm2, zmm3
    vpxord zmm1, zmm1, zmm2
    vprord zmm1, zmm1, 7
    vpshufd zmm0, zmm0, 0x93
    vpshufd zmm3, zmm3, 0x4E
    vpshufd zmm2, zmm2, 0x39
    vpaddd zmm0, zmm0, zmm6
    vpaddd zmm0, zmm0, zmm1
    vpxord zmm3, zmm3, zmm0
    vprord zmm3, zmm3, 16
    vpaddd zmm2, zmm2, zmm3
    vpxord zmm1, zmm1, zmm2
    vprord zmm1, zmm1, 12
    vpaddd zmm0, zmm0, zmm7
    vpaddd zmm0, zmm0, zmm1
    vpxord zmm3, zmm3, zmm0
    vprord zmm3, zmm3, 8
    vpaddd zmm2, zmm2, zmm3
    vpxord zmm1, zmm1, zmm2
    vprord zmm1, zmm1, 7
    vpshufd zmm0, zmm0, 0x39
    vpshufd zmm3, zmm3, 0x4E
    vpshufd zmm2, zmm2, 0x93
    dec al
    jz 9f
    vshufps zmm8, zmm4, zmm5, 214
    vpshufd zmm9, zmm4, 0x0F
    vpshufd zmm4, zmm8, 0x39
    vshufps zmm8, zmm6, zmm7, 250
    vpblendmd zmm9 {k3}, zmm9, zmm8
    vpunpcklqdq zmm8, zmm7, zmm5
    vpblendmd zmm8 {k4}, zmm8, zmm6
    vpshufd zmm8, zmm8, 0x78
    vpunpckhdq zmm5, zmm5, zmm7
    vpunpckldq zmm6, zmm6, zmm5
    vpshufd zmm7, zmm6, 0x1E
    vmovdqa32 zmm5, zmm9
    vmovdqa32 zmm6, zmm8
    jmp 9b
    9:
    vpxord zmm0, zmm0, zmm2
    vpxord zmm1, zmm1, zmm3
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    vmovdqu xmmword ptr [rbx], xmm0
    vmovdqu xmmword ptr [rbx+0x10], xmm1
    vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
    vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
    vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
    vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
    vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
    vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
    vmovdqa xmm0, xmmword ptr [rsp]
    vmovdqa xmm2, xmmword ptr [rsp+0x40]
    vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
    vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
    vmovdqa xmmword ptr [rsp], xmm0
    vmovdqa xmmword ptr [rsp+0x40], xmm2
    add rbx, 128
    add rdi, 32
    sub rsi, 4
    3:
    test esi, 0x2
    je 3f
    vbroadcasti128 ymm0, xmmword ptr [rcx]
    vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
    vmovd xmm13, dword ptr [rsp]
    vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
    vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    vmovd xmm14, dword ptr [rsp+0x4]
    vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
    vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    vinserti128 ymm13, ymm13, xmm14, 0x01
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    .p2align 5
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    mov dword ptr [rsp+0x88], eax
    vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
    vpbroadcastd ymm8, dword ptr [rsp+0x88]
    vpblendd ymm3, ymm13, ymm8, 0x88
    vmovups ymm8, ymmword ptr [r8+rdx-0x40]
    vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
    vmovups ymm9, ymmword ptr [r8+rdx-0x30]
    vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
    vshufps ymm4, ymm8, ymm9, 136
    vshufps ymm5, ymm8, ymm9, 221
    vmovups ymm8, ymmword ptr [r8+rdx-0x20]
    vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
    vmovups ymm9, ymmword ptr [r8+rdx-0x10]
    vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
    vshufps ymm6, ymm8, ymm9, 136
    vshufps ymm7, ymm8, ymm9, 221
    vpshufd ymm6, ymm6, 0x93
    vpshufd ymm7, ymm7, 0x93
    mov al, 7
    9:
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm0, ymm0, ymm1
    vpxord ymm3, ymm3, ymm0
    vprord ymm3, ymm3, 16
    vpaddd ymm2, ymm2, ymm3
    vpxord ymm1, ymm1, ymm2
    vprord ymm1, ymm1, 12
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm0, ymm0, ymm1
    vpxord ymm3, ymm3, ymm0
    vprord ymm3, ymm3, 8
    vpaddd ymm2, ymm2, ymm3
    vpxord ymm1, ymm1, ymm2
    vprord ymm1, ymm1, 7
    vpshufd ymm0, ymm0, 0x93
    vpshufd ymm3, ymm3, 0x4E
    vpshufd ymm2, ymm2, 0x39
    vpaddd ymm0, ymm0, ymm6
    vpaddd ymm0, ymm0, ymm1
    vpxord ymm3, ymm3, ymm0
    vprord ymm3, ymm3, 16
    vpaddd ymm2, ymm2, ymm3
    vpxord ymm1, ymm1, ymm2
    vprord ymm1, ymm1, 12
    vpaddd ymm0, ymm0, ymm7
    vpaddd ymm0, ymm0, ymm1
    vpxord ymm3, ymm3, ymm0
    vprord ymm3, ymm3, 8
    vpaddd ymm2, ymm2, ymm3
    vpxord ymm1, ymm1, ymm2
    vprord ymm1, ymm1, 7
    vpshufd ymm0, ymm0, 0x39
    vpshufd ymm3, ymm3, 0x4E
    vpshufd ymm2, ymm2, 0x93
    dec al
    jz 9f
    vshufps ymm8, ymm4, ymm5, 214
    vpshufd ymm9, ymm4, 0x0F
    vpshufd ymm4, ymm8, 0x39
    vshufps ymm8, ymm6, ymm7, 250
    vpblendd ymm9, ymm9, ymm8, 0xAA
    vpunpcklqdq ymm8, ymm7, ymm5
    vpblendd ymm8, ymm8, ymm6, 0x88
    vpshufd ymm8, ymm8, 0x78
    vpunpckhdq ymm5, ymm5, ymm7
    vpunpckldq ymm6, ymm6, ymm5
    vpshufd ymm7, ymm6, 0x1E
    vmovdqa ymm5, ymm9
    vmovdqa ymm6, ymm8
    jmp 9b
    9:
    vpxor ymm0, ymm0, ymm2
    vpxor ymm1, ymm1, ymm3
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    vmovdqu xmmword ptr [rbx], xmm0
    vmovdqu xmmword ptr [rbx+0x10], xmm1
    vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
    vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
    vmovdqa xmm0, xmmword ptr [rsp]
    vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
    vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
    vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
    vmovdqa xmmword ptr [rsp], xmm0
    vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
    add rbx, 64
    add rdi, 16
    sub rsi, 2
    3:
    test esi, 0x1
    je 4b
    vmovdqu xmm0, xmmword ptr [rcx]
    vmovdqu xmm1, xmmword ptr [rcx+0x10]
    vmovd xmm14, dword ptr [rsp]
    vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
    vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
    mov r8, qword ptr [rdi]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    .p2align 5
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    vpinsrd xmm3, xmm14, eax, 3
    vmovdqa xmm2, xmm15
    vmovups xmm8, xmmword ptr [r8+rdx-0x40]
    vmovups xmm9, xmmword ptr [r8+rdx-0x30]
    vshufps xmm4, xmm8, xmm9, 136
    vshufps xmm5, xmm8, xmm9, 221
    vmovups xmm8, xmmword ptr [r8+rdx-0x20]
    vmovups xmm9, xmmword ptr [r8+rdx-0x10]
    vshufps xmm6, xmm8, xmm9, 136
    vshufps xmm7, xmm8, xmm9, 221
    vpshufd xmm6, xmm6, 0x93
    vpshufd xmm7, xmm7, 0x93
    mov al, 7
    9:
    vpaddd xmm0, xmm0, xmm4
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 16
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 12
    vpaddd xmm0, xmm0, xmm5
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 8
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 7
    vpshufd xmm0, xmm0, 0x93
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x39
    vpaddd xmm0, xmm0, xmm6
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 16
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 12
    vpaddd xmm0, xmm0, xmm7
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 8
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 7
    vpshufd xmm0, xmm0, 0x39
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x93
    dec al
    jz 9f
    vshufps xmm8, xmm4, xmm5, 214
    vpshufd xmm9, xmm4, 0x0F
    vpshufd xmm4, xmm8, 0x39
    vshufps xmm8, xmm6, xmm7, 250
    vpblendd xmm9, xmm9, xmm8, 0xAA
    vpunpcklqdq xmm8, xmm7, xmm5
    vpblendd xmm8, xmm8, xmm6, 0x88
    vpshufd xmm8, xmm8, 0x78
    vpunpckhdq xmm5, xmm5, xmm7
    vpunpckldq xmm6, xmm6, xmm5
    vpshufd xmm7, xmm6, 0x1E
    vmovdqa xmm5, xmm9
    vmovdqa xmm6, xmm8
    jmp 9b
    9:
    vpxor xmm0, xmm0, xmm2
    vpxor xmm1, xmm1, xmm3
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    vmovdqu xmmword ptr [rbx], xmm0
    vmovdqu xmmword ptr [rbx+0x10], xmm1
    jmp 4b
    /*
     * blake3_compress_in_place_avx512
     *
     * Runs the 7-round compression on one 64-byte block and overwrites the
     * 32 bytes at [rdi] with the result.
     *
     * Inputs, as read by the code below:
     *   rdi - pointer to 32 bytes; loaded into state rows 0/1, overwritten on exit
     *   rsi - pointer to 64 message bytes (unaligned loads are used)
     *   dl  - low byte of the 3rd argument, placed in lane 2 of state row 3
     *   rcx - 64-bit value, placed in lanes 0..1 of state row 3
     *   r8b - low byte of the 5th argument, placed in lane 3 of state row 3
     * NOTE(review): the C prototype is not visible in this chunk; presumably
     * these are (cv, block, block_len, counter, flags) - confirm against
     * blake3_impl.h.
     *
     * State layout: xmm0..xmm3 hold the four 4-lane rows of the 16-word state
     * (the a/b/c/d operands of g() in the portable implementation).
     * xmm4..xmm7 hold the 16 message words, xmm8/xmm9 are temporaries.
     * Scratch: rax, rdx, xmm0-xmm9, flags. No stack is used.
     */
    .p2align 6
    /* Both spellings label the same entry point (presumably the underscore
       form is for object formats that prefix C symbols, e.g. Mach-O). */
    _blake3_compress_in_place_avx512:
    blake3_compress_in_place_avx512:
    _CET_ENDBR
    /* Rows 0 and 1 come from the caller's 32-byte buffer. */
    vmovdqu xmm0, xmmword ptr [rdi]
    vmovdqu xmm1, xmmword ptr [rdi+0x10]
    /* Build rdx = dl | (r8b << 32), then row 3 = { rcx.lo, rcx.hi, dl, r8b }. */
    movzx eax, r8b
    movzx edx, dl
    shl rax, 32
    add rdx, rax
    vmovq xmm3, rcx
    vmovq xmm4, rdx
    vpunpcklqdq xmm3, xmm3, xmm4
    /* Row 2 is the 16-byte constant at BLAKE3_IV (aligned load). */
    vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    /* De-interleave the message: imm 136 (0x88) keeps the even-indexed dwords
       of the two sources, imm 221 (0xDD) keeps the odd-indexed ones.
       xmm4 = m0,m2,m4,m6   xmm5 = m1,m3,m5,m7 */
    vmovups xmm8, xmmword ptr [rsi]
    vmovups xmm9, xmmword ptr [rsi+0x10]
    vshufps xmm4, xmm8, xmm9, 136
    vshufps xmm5, xmm8, xmm9, 221
    /* Same split for m8..m15, then rotate each vector by one lane (0x93),
       the same lane rotation applied to xmm0 before the diagonal half-round. */
    vmovups xmm8, xmmword ptr [rsi+0x20]
    vmovups xmm9, xmmword ptr [rsi+0x30]
    vshufps xmm6, xmm8, xmm9, 136
    vshufps xmm7, xmm8, xmm9, 221
    vpshufd xmm6, xmm6, 0x93
    vpshufd xmm7, xmm7, 0x93
    /* al is the round counter; rax is free again now that rdx has been built. */
    mov al, 7
    /* Top of the round loop (target of "jmp 9b"). */
    9:
    /* Column half-round, first half of g on all four columns at once:
       row0 += msg + row1; row3 = ror(row3 ^ row0, 16);
       row2 += row3;       row1 = ror(row1 ^ row2, 12) */
    vpaddd xmm0, xmm0, xmm4
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 16
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 12
    /* Second half of g: same shape with rotations 8 and 7. */
    vpaddd xmm0, xmm0, xmm5
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 8
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 7
    /* Diagonalize: rotate rows 0, 3 and 2 by different lane counts (row 1
       stays put) so the next four g's operate on the diagonals. */
    vpshufd xmm0, xmm0, 0x93
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x39
    /* Diagonal half-round: identical arithmetic, message words from xmm6/xmm7. */
    vpaddd xmm0, xmm0, xmm6
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 16
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 12
    vpaddd xmm0, xmm0, xmm7
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 8
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 7
    /* Undo the diagonalization (inverse lane rotations of the ones above). */
    vpshufd xmm0, xmm0, 0x39
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x93
    /* Leave after the 7th round; the message shuffle below is skipped then. */
    dec al
    jz 9f
    /* Re-order the message words in xmm4..xmm7 for the next round, using
       xmm8/xmm9 as temporaries. Presumably equivalent to stepping to the next
       MSG_SCHEDULE row of the portable code - TODO confirm. */
    vshufps xmm8, xmm4, xmm5, 214
    vpshufd xmm9, xmm4, 0x0F
    vpshufd xmm4, xmm8, 0x39
    vshufps xmm8, xmm6, xmm7, 250
    vpblendd xmm9, xmm9, xmm8, 0xAA
    vpunpcklqdq xmm8, xmm7, xmm5
    vpblendd xmm8, xmm8, xmm6, 0x88
    vpshufd xmm8, xmm8, 0x78
    vpunpckhdq xmm5, xmm5, xmm7
    vpunpckldq xmm6, xmm6, xmm5
    vpshufd xmm7, xmm6, 0x1E
    vmovdqa xmm5, xmm9
    vmovdqa xmm6, xmm8
    jmp 9b
    /* Loop exit (target of "jz 9f"). */
    9:
    /* Feed-forward: result = (row0 ^ row2, row1 ^ row3), stored back over the
       32 input bytes at [rdi]. */
    vpxor xmm0, xmm0, xmm2
    vpxor xmm1, xmm1, xmm3
    vmovdqu xmmword ptr [rdi], xmm0
    vmovdqu xmmword ptr [rdi+0x10], xmm1
    ret
    /*
     * blake3_compress_xof_avx512
     *
     * Runs the 7-round compression on one 64-byte block and writes a 64-byte
     * result to [r9]. The 32 bytes at [rdi] are read only, never written.
     *
     * Inputs, as read by the code below:
     *   rdi - pointer to 32 bytes; loaded into state rows 0/1 and read again
     *         for the final XOR
     *   rsi - pointer to 64 message bytes (unaligned loads are used)
     *   dl  - low byte of the 3rd argument, placed in lane 2 of state row 3
     *   rcx - 64-bit value, placed in lanes 0..1 of state row 3
     *   r8b - low byte of the 5th argument, placed in lane 3 of state row 3
     *   r9  - pointer to the 64-byte output buffer (unaligned stores are used)
     * NOTE(review): the C prototype is not visible in this chunk; presumably
     * these are (cv, block, block_len, counter, flags, out) - confirm against
     * blake3_impl.h.
     *
     * State layout: xmm0..xmm3 are the four state rows, xmm4..xmm7 the message
     * words, xmm8/xmm9 temporaries.
     * Scratch: rax, rdx, xmm0-xmm9, flags. No stack is used.
     */
    .p2align 6
    /* Both spellings label the same entry point (presumably the underscore
       form is for object formats that prefix C symbols, e.g. Mach-O). */
    _blake3_compress_xof_avx512:
    blake3_compress_xof_avx512:
    _CET_ENDBR
    /* Rows 0 and 1 come from the caller's 32-byte buffer. */
    vmovdqu xmm0, xmmword ptr [rdi]
    vmovdqu xmm1, xmmword ptr [rdi+0x10]
    /* Build rdx = dl | (r8b << 32), then row 3 = { rcx.lo, rcx.hi, dl, r8b }. */
    movzx eax, r8b
    movzx edx, dl
    shl rax, 32
    add rdx, rax
    vmovq xmm3, rcx
    vmovq xmm4, rdx
    vpunpcklqdq xmm3, xmm3, xmm4
    /* Row 2 is the 16-byte constant at BLAKE3_IV (aligned load). */
    vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
    /* De-interleave the message: imm 136 (0x88) keeps even-indexed dwords,
       imm 221 (0xDD) keeps odd-indexed ones.
       xmm4 = m0,m2,m4,m6   xmm5 = m1,m3,m5,m7 */
    vmovups xmm8, xmmword ptr [rsi]
    vmovups xmm9, xmmword ptr [rsi+0x10]
    vshufps xmm4, xmm8, xmm9, 136
    vshufps xmm5, xmm8, xmm9, 221
    /* Same split for m8..m15, then rotate each vector by one lane (0x93),
       the same lane rotation applied to xmm0 before the diagonal half-round. */
    vmovups xmm8, xmmword ptr [rsi+0x20]
    vmovups xmm9, xmmword ptr [rsi+0x30]
    vshufps xmm6, xmm8, xmm9, 136
    vshufps xmm7, xmm8, xmm9, 221
    vpshufd xmm6, xmm6, 0x93
    vpshufd xmm7, xmm7, 0x93
    /* al is the round counter; rax is free again now that rdx has been built. */
    mov al, 7
    /* Top of the round loop (target of "jmp 9b"). */
    9:
    /* Column half-round, first half of g on all four columns at once:
       row0 += msg + row1; row3 = ror(row3 ^ row0, 16);
       row2 += row3;       row1 = ror(row1 ^ row2, 12) */
    vpaddd xmm0, xmm0, xmm4
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 16
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 12
    /* Second half of g: same shape with rotations 8 and 7. */
    vpaddd xmm0, xmm0, xmm5
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 8
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 7
    /* Diagonalize: rotate rows 0, 3 and 2 by different lane counts (row 1
       stays put) so the next four g's operate on the diagonals. */
    vpshufd xmm0, xmm0, 0x93
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x39
    /* Diagonal half-round: identical arithmetic, message words from xmm6/xmm7. */
    vpaddd xmm0, xmm0, xmm6
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 16
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 12
    vpaddd xmm0, xmm0, xmm7
    vpaddd xmm0, xmm0, xmm1
    vpxord xmm3, xmm3, xmm0
    vprord xmm3, xmm3, 8
    vpaddd xmm2, xmm2, xmm3
    vpxord xmm1, xmm1, xmm2
    vprord xmm1, xmm1, 7
    /* Undo the diagonalization (inverse lane rotations of the ones above). */
    vpshufd xmm0, xmm0, 0x39
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x93
    /* Leave after the 7th round; the message shuffle below is skipped then. */
    dec al
    jz 9f
    /* Re-order the message words in xmm4..xmm7 for the next round, using
       xmm8/xmm9 as temporaries. Presumably equivalent to stepping to the next
       MSG_SCHEDULE row of the portable code - TODO confirm. */
    vshufps xmm8, xmm4, xmm5, 214
    vpshufd xmm9, xmm4, 0x0F
    vpshufd xmm4, xmm8, 0x39
    vshufps xmm8, xmm6, xmm7, 250
    vpblendd xmm9, xmm9, xmm8, 0xAA
    vpunpcklqdq xmm8, xmm7, xmm5
    vpblendd xmm8, xmm8, xmm6, 0x88
    vpshufd xmm8, xmm8, 0x78
    vpunpckhdq xmm5, xmm5, xmm7
    vpunpckldq xmm6, xmm6, xmm5
    vpshufd xmm7, xmm6, 0x1E
    vmovdqa xmm5, xmm9
    vmovdqa xmm6, xmm8
    jmp 9b
    /* Loop exit (target of "jz 9f"). */
    9:
    /* First 32 output bytes: (row0 ^ row2, row1 ^ row3). */
    vpxor xmm0, xmm0, xmm2
    vpxor xmm1, xmm1, xmm3
    /* Last 32 output bytes: rows 2 and 3 XORed with the original 32 bytes
       still sitting at [rdi] (this function never stores to rdi). */
    vpxor xmm2, xmm2, [rdi]
    vpxor xmm3, xmm3, [rdi+0x10]
    vmovdqu xmmword ptr [r9], xmm0
    vmovdqu xmmword ptr [r9+0x10], xmm1
    vmovdqu xmmword ptr [r9+0x20], xmm2
    vmovdqu xmmword ptr [r9+0x30], xmm3
    ret
    /* Constant tables for the AVX-512 kernels. Apple toolchains get
       .static_data; everything else gets .rodata. */
    #ifdef __APPLE__
    .static_data
    #else
    .section .rodata
    #endif
    /* 2^6 = 64-byte alignment for the tables that follow. */
    .p2align 6
    /* INDEX0 / INDEX1: two tables of 16 dword indices each. Their consumers are
       outside this chunk; presumably two-source permute index vectors (values
       >= 16 selecting from the second source) - TODO confirm. */
    INDEX0:
    .long 0, 1, 2, 3, 16, 17, 18, 19
    .long 8, 9, 10, 11, 24, 25, 26, 27
    INDEX1:
    .long 4, 5, 6, 7, 20, 21, 22, 23
    .long 12, 13, 14, 15, 28, 29, 30, 31
    /* ADD0: the lane numbers 0..15. ADD1 / ADD16: single dwords 1 and 16.
       Consumers are outside this chunk; presumably per-lane counter
       offsets/increments - TODO confirm. */
    ADD0:
    .long 0, 1, 2, 3, 4, 5, 6, 7
    .long 8, 9, 10, 11, 12, 13, 14, 15
    ADD1: .long 1
    ADD16: .long 16
    /* Block length in bytes (64) as one dword; the hash_many tail code inserts
       it into lane 2 of state row 3 with vpinsrd. */
    BLAKE3_BLOCK_LEN:
    .long 64
    /* Re-align so the IV starts on a 64-byte boundary; the compress routines
       fetch it with an aligned 16-byte load (vmovaps). */
    .p2align 6
    /* Four-dword IV used as state row 2. BLAKE3_IV names the whole vector and
       BLAKE3_IV_0..3 name the individual words (their users are not visible
       in this chunk). */
    BLAKE3_IV:
    BLAKE3_IV_0:
    .long 0x6A09E667
    BLAKE3_IV_1:
    .long 0xBB67AE85
    BLAKE3_IV_2:
    .long 0x3C6EF372
    BLAKE3_IV_3:
    .long 0xA54FF53A
  • file addition: blake3_avx2_x86-64_unix.S (----------)
    [0.38]
    /* On ELF/Linux emit an empty .note.GNU-stack section (the conventional
       marker that this object does not need an executable stack). */
    #if defined(__ELF__) && defined(__linux__)
    .section .note.GNU-stack,"",%progbits
    #endif
    /* Pull in <cet.h> only when building ELF with CET enabled, the
       preprocessor supports __has_include, and the header actually exists. */
    #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
    #if __has_include(<cet.h>)
    #include <cet.h>
    #endif
    #endif
    /* Fallback: if nothing defined _CET_ENDBR, make it expand to nothing so
       the function entry below still assembles. */
    #if !defined(_CET_ENDBR)
    #define _CET_ENDBR
    #endif
    /* Intel operand order, register names without the % prefix. */
    .intel_syntax noprefix
    /* Export the entry point under both the underscore-prefixed and the plain
       symbol name. */
    .global _blake3_hash_many_avx2
    .global blake3_hash_many_avx2
    /* Switch to the code section (spelled differently for Apple toolchains). */
    #ifdef __APPLE__
    .text
    #else
    .section .text
    #endif
    .p2align 6
    _blake3_hash_many_avx2:
    blake3_hash_many_avx2:
    _CET_ENDBR
    push r15
    push r14
    push r13
    push r12
    push rbx
    push rbp
    mov rbp, rsp
    sub rsp, 680
    and rsp, 0xFFFFFFFFFFFFFFC0
    neg r9d
    vmovd xmm0, r9d
    vpbroadcastd ymm0, xmm0
    vmovdqa ymmword ptr [rsp+0x280], ymm0
    vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
    vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
    vmovdqa ymmword ptr [rsp+0x220], ymm2
    vmovd xmm2, r8d
    vpbroadcastd ymm2, xmm2
    vpaddd ymm2, ymm2, ymm1
    vmovdqa ymmword ptr [rsp+0x240], ymm2
    vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
    vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
    vpcmpgtd ymm2, ymm1, ymm2
    shr r8, 32
    vmovd xmm3, r8d
    vpbroadcastd ymm3, xmm3
    vpsubd ymm3, ymm3, ymm2
    vmovdqa ymmword ptr [rsp+0x260], ymm3
    shl rdx, 6
    mov qword ptr [rsp+0x2A0], rdx
    cmp rsi, 8
    jc 3f
    2:
    vpbroadcastd ymm0, dword ptr [rcx]
    vpbroadcastd ymm1, dword ptr [rcx+0x4]
    vpbroadcastd ymm2, dword ptr [rcx+0x8]
    vpbroadcastd ymm3, dword ptr [rcx+0xC]
    vpbroadcastd ymm4, dword ptr [rcx+0x10]
    vpbroadcastd ymm5, dword ptr [rcx+0x14]
    vpbroadcastd ymm6, dword ptr [rcx+0x18]
    vpbroadcastd ymm7, dword ptr [rcx+0x1C]
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    mov r12, qword ptr [rdi+0x20]
    mov r13, qword ptr [rdi+0x28]
    mov r14, qword ptr [rdi+0x30]
    mov r15, qword ptr [rdi+0x38]
    movzx eax, byte ptr [rbp+0x38]
    movzx ebx, byte ptr [rbp+0x40]
    or eax, ebx
    xor edx, edx
    .p2align 5
    9:
    movzx ebx, byte ptr [rbp+0x48]
    or ebx, eax
    add rdx, 64
    cmp rdx, qword ptr [rsp+0x2A0]
    cmove eax, ebx
    mov dword ptr [rsp+0x200], eax
    vmovups xmm8, xmmword ptr [r8+rdx-0x40]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x40]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x40]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x40]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm8, ymm12, ymm14, 136
    vmovaps ymmword ptr [rsp], ymm8
    vshufps ymm9, ymm12, ymm14, 221
    vmovaps ymmword ptr [rsp+0x20], ymm9
    vshufps ymm10, ymm13, ymm15, 136
    vmovaps ymmword ptr [rsp+0x40], ymm10
    vshufps ymm11, ymm13, ymm15, 221
    vmovaps ymmword ptr [rsp+0x60], ymm11
    vmovups xmm8, xmmword ptr [r8+rdx-0x30]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x30]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x30]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x30]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm8, ymm12, ymm14, 136
    vmovaps ymmword ptr [rsp+0x80], ymm8
    vshufps ymm9, ymm12, ymm14, 221
    vmovaps ymmword ptr [rsp+0xA0], ymm9
    vshufps ymm10, ymm13, ymm15, 136
    vmovaps ymmword ptr [rsp+0xC0], ymm10
    vshufps ymm11, ymm13, ymm15, 221
    vmovaps ymmword ptr [rsp+0xE0], ymm11
    vmovups xmm8, xmmword ptr [r8+rdx-0x20]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x20]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x20]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x20]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm8, ymm12, ymm14, 136
    vmovaps ymmword ptr [rsp+0x100], ymm8
    vshufps ymm9, ymm12, ymm14, 221
    vmovaps ymmword ptr [rsp+0x120], ymm9
    vshufps ymm10, ymm13, ymm15, 136
    vmovaps ymmword ptr [rsp+0x140], ymm10
    vshufps ymm11, ymm13, ymm15, 221
    vmovaps ymmword ptr [rsp+0x160], ymm11
    vmovups xmm8, xmmword ptr [r8+rdx-0x10]
    vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
    vmovups xmm9, xmmword ptr [r9+rdx-0x10]
    vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
    vunpcklpd ymm12, ymm8, ymm9
    vunpckhpd ymm13, ymm8, ymm9
    vmovups xmm10, xmmword ptr [r10+rdx-0x10]
    vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
    vmovups xmm11, xmmword ptr [r11+rdx-0x10]
    vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
    vunpcklpd ymm14, ymm10, ymm11
    vunpckhpd ymm15, ymm10, ymm11
    vshufps ymm8, ymm12, ymm14, 136
    vmovaps ymmword ptr [rsp+0x180], ymm8
    vshufps ymm9, ymm12, ymm14, 221
    vmovaps ymmword ptr [rsp+0x1A0], ymm9
    vshufps ymm10, ymm13, ymm15, 136
    vmovaps ymmword ptr [rsp+0x1C0], ymm10
    vshufps ymm11, ymm13, ymm15, 221
    vmovaps ymmword ptr [rsp+0x1E0], ymm11
    vpbroadcastd ymm15, dword ptr [rsp+0x200]
    prefetcht0 [r8+rdx+0x80]
    prefetcht0 [r12+rdx+0x80]
    prefetcht0 [r9+rdx+0x80]
    prefetcht0 [r13+rdx+0x80]
    prefetcht0 [r10+rdx+0x80]
    prefetcht0 [r14+rdx+0x80]
    prefetcht0 [r11+rdx+0x80]
    prefetcht0 [r15+rdx+0x80]
    vpaddd ymm0, ymm0, ymmword ptr [rsp]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm0, ymmword ptr [rsp+0x240]
    vpxor ymm13, ymm1, ymmword ptr [rsp+0x260]
    vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
    vpxor ymm15, ymm3, ymm15
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
    vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
    vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
    vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
    vpaddd ymm2, ymm2, ymmword ptr [rsp]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
    vpaddd ymm2, ymm2, ymmword ptr [rsp]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm1, ymm1, ymm5
    vpaddd ymm2, ymm2, ymm6
    vpaddd ymm3, ymm3, ymm7
    vpxor ymm12, ymm12, ymm0
    vpxor ymm13, ymm13, ymm1
    vpxor ymm14, ymm14, ymm2
    vpxor ymm15, ymm15, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpshufb ymm15, ymm15, ymm8
    vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm13
    vpaddd ymm10, ymm10, ymm14
    vpaddd ymm11, ymm11, ymm15
    vpxor ymm4, ymm4, ymm8
    vpxor ymm5, ymm5, ymm9
    vpxor ymm6, ymm6, ymm10
    vpxor ymm7, ymm7, ymm11
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vmovdqa ymmword ptr [rsp+0x200], ymm8
    vpsrld ymm8, ymm5, 12
    vpslld ymm5, ymm5, 20
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 12
    vpslld ymm6, ymm6, 20
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 12
    vpslld ymm7, ymm7, 20
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 12
    vpslld ymm4, ymm4, 20
    vpor ymm4, ymm4, ymm8
    vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
    vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
    vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
    vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm1, ymm1, ymm6
    vpaddd ymm2, ymm2, ymm7
    vpaddd ymm3, ymm3, ymm4
    vpxor ymm15, ymm15, ymm0
    vpxor ymm12, ymm12, ymm1
    vpxor ymm13, ymm13, ymm2
    vpxor ymm14, ymm14, ymm3
    vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
    vpshufb ymm15, ymm15, ymm8
    vpshufb ymm12, ymm12, ymm8
    vpshufb ymm13, ymm13, ymm8
    vpshufb ymm14, ymm14, ymm8
    vpaddd ymm10, ymm10, ymm15
    vpaddd ymm11, ymm11, ymm12
    vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
    vpaddd ymm9, ymm9, ymm14
    vpxor ymm5, ymm5, ymm10
    vpxor ymm6, ymm6, ymm11
    vpxor ymm7, ymm7, ymm8
    vpxor ymm4, ymm4, ymm9
    vpxor ymm0, ymm0, ymm8
    vpxor ymm1, ymm1, ymm9
    vpxor ymm2, ymm2, ymm10
    vpxor ymm3, ymm3, ymm11
    vpsrld ymm8, ymm5, 7
    vpslld ymm5, ymm5, 25
    vpor ymm5, ymm5, ymm8
    vpsrld ymm8, ymm6, 7
    vpslld ymm6, ymm6, 25
    vpor ymm6, ymm6, ymm8
    vpsrld ymm8, ymm7, 7
    vpslld ymm7, ymm7, 25
    vpor ymm7, ymm7, ymm8
    vpsrld ymm8, ymm4, 7
    vpslld ymm4, ymm4, 25
    vpor ymm4, ymm4, ymm8
    vpxor ymm4, ymm4, ymm12
    vpxor ymm5, ymm5, ymm13
    vpxor ymm6, ymm6, ymm14
    vpxor ymm7, ymm7, ymm15
    movzx eax, byte ptr [rbp+0x38]
    jne 9b
    mov rbx, qword ptr [rbp+0x50]
    vunpcklps ymm8, ymm0, ymm1
    vunpcklps ymm9, ymm2, ymm3
    vunpckhps ymm10, ymm0, ymm1
    vunpcklps ymm11, ymm4, ymm5
    vunpcklps ymm0, ymm6, ymm7
    vshufps ymm12, ymm8, ymm9, 78
    vblendps ymm1, ymm8, ymm12, 0xCC
    vshufps ymm8, ymm11, ymm0, 78
    vunpckhps ymm13, ymm2, ymm3
    vblendps ymm2, ymm11, ymm8, 0xCC
    vblendps ymm3, ymm12, ymm9, 0xCC
    vperm2f128 ymm12, ymm1, ymm2, 0x20
    vmovups ymmword ptr [rbx], ymm12
    vunpckhps ymm14, ymm4, ymm5
    vblendps ymm4, ymm8, ymm0, 0xCC
    vunpckhps ymm15, ymm6, ymm7
    vperm2f128 ymm7, ymm3, ymm4, 0x20
    vmovups ymmword ptr [rbx+0x20], ymm7
    vshufps ymm5, ymm10, ymm13, 78
    vblendps ymm6, ymm5, ymm13, 0xCC
    vshufps ymm13, ymm14, ymm15, 78
    vblendps ymm10, ymm10, ymm5, 0xCC
    vblendps ymm14, ymm14, ymm13, 0xCC
    vperm2f128 ymm8, ymm10, ymm14, 0x20
    vmovups ymmword ptr [rbx+0x40], ymm8
    vblendps ymm15, ymm13, ymm15, 0xCC
    vperm2f128 ymm13, ymm6, ymm15, 0x20
    vmovups ymmword ptr [rbx+0x60], ymm13
    vperm2f128 ymm9, ymm1, ymm2, 0x31
    vperm2f128 ymm11, ymm3, ymm4, 0x31
    vmovups ymmword ptr [rbx+0x80], ymm9
    vperm2f128 ymm14, ymm10, ymm14, 0x31
    vperm2f128 ymm15, ymm6, ymm15, 0x31
    vmovups ymmword ptr [rbx+0xA0], ymm11
    vmovups ymmword ptr [rbx+0xC0], ymm14
    vmovups ymmword ptr [rbx+0xE0], ymm15
    vmovdqa ymm0, ymmword ptr [rsp+0x220]
    vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240]
    vmovdqa ymmword ptr [rsp+0x240], ymm1
    vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
    vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
    vpcmpgtd ymm2, ymm0, ymm2
    vmovdqa ymm0, ymmword ptr [rsp+0x260]
    vpsubd ymm2, ymm0, ymm2
    vmovdqa ymmword ptr [rsp+0x260], ymm2
    add rdi, 64
    add rbx, 256
    mov qword ptr [rbp+0x50], rbx
    sub rsi, 8
    cmp rsi, 8
    jnc 2b
    test rsi, rsi
    jnz 3f
    4:
    vzeroupper
    mov rsp, rbp
    pop rbp
    pop rbx
    pop r12
    pop r13
    pop r14
    pop r15
    ret
    .p2align 5
    3:
    mov rbx, qword ptr [rbp+0x50]
    mov r15, qword ptr [rsp+0x2A0]
    movzx r13d, byte ptr [rbp+0x38]
    movzx r12d, byte ptr [rbp+0x48]
    test rsi, 0x4
    je 3f
    vbroadcasti128 ymm0, xmmword ptr [rcx]
    vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
    vmovdqa ymm8, ymm0
    vmovdqa ymm9, ymm1
    vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
    vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
    vpunpckldq ymm14, ymm12, ymm13
    vpunpckhdq ymm15, ymm12, ymm13
    vpermq ymm14, ymm14, 0x50
    vpermq ymm15, ymm15, 0x50
    vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
    vpblendd ymm14, ymm14, ymm12, 0x44
    vpblendd ymm15, ymm15, ymm12, 0x44
    vmovdqa ymmword ptr [rsp], ymm14
    vmovdqa ymmword ptr [rsp+0x20], ymm15
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    mov r10, qword ptr [rdi+0x10]
    mov r11, qword ptr [rdi+0x18]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    .p2align 5
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    mov dword ptr [rsp+0x200], eax
    vmovups ymm2, ymmword ptr [r8+rdx-0x40]
    vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
    vmovups ymm3, ymmword ptr [r8+rdx-0x30]
    vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
    vshufps ymm4, ymm2, ymm3, 136
    vshufps ymm5, ymm2, ymm3, 221
    vmovups ymm2, ymmword ptr [r8+rdx-0x20]
    vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
    vmovups ymm3, ymmword ptr [r8+rdx-0x10]
    vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
    vshufps ymm6, ymm2, ymm3, 136
    vshufps ymm7, ymm2, ymm3, 221
    vpshufd ymm6, ymm6, 0x93
    vpshufd ymm7, ymm7, 0x93
    vmovups ymm10, ymmword ptr [r10+rdx-0x40]
    vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
    vmovups ymm11, ymmword ptr [r10+rdx-0x30]
    vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
    vshufps ymm12, ymm10, ymm11, 136
    vshufps ymm13, ymm10, ymm11, 221
    vmovups ymm10, ymmword ptr [r10+rdx-0x20]
    vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
    vmovups ymm11, ymmword ptr [r10+rdx-0x10]
    vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
    vshufps ymm14, ymm10, ymm11, 136
    vshufps ymm15, ymm10, ymm11, 221
    vpshufd ymm14, ymm14, 0x93
    vpshufd ymm15, ymm15, 0x93
    prefetcht0 [r8+rdx+0x80]
    prefetcht0 [r9+rdx+0x80]
    prefetcht0 [r10+rdx+0x80]
    prefetcht0 [r11+rdx+0x80]
    vpbroadcastd ymm2, dword ptr [rsp+0x200]
    vmovdqa ymm3, ymmword ptr [rsp]
    vmovdqa ymm11, ymmword ptr [rsp+0x20]
    vpblendd ymm3, ymm3, ymm2, 0x88
    vpblendd ymm11, ymm11, ymm2, 0x88
    vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
    vmovdqa ymm10, ymm2
    mov al, 7
    9:
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm8, ymm8, ymm12
    vmovdqa ymmword ptr [rsp+0x40], ymm4
    nop
    vmovdqa ymmword ptr [rsp+0x60], ymm12
    nop
    vpaddd ymm0, ymm0, ymm1
    vpaddd ymm8, ymm8, ymm9
    vpxor ymm3, ymm3, ymm0
    vpxor ymm11, ymm11, ymm8
    vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
    vpshufb ymm3, ymm3, ymm4
    vpshufb ymm11, ymm11, ymm4
    vpaddd ymm2, ymm2, ymm3
    vpaddd ymm10, ymm10, ymm11
    vpxor ymm1, ymm1, ymm2
    vpxor ymm9, ymm9, ymm10
    vpsrld ymm4, ymm1, 12
    vpslld ymm1, ymm1, 20
    vpor ymm1, ymm1, ymm4
    vpsrld ymm4, ymm9, 12
    vpslld ymm9, ymm9, 20
    vpor ymm9, ymm9, ymm4
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm8, ymm8, ymm13
    vpaddd ymm0, ymm0, ymm1
    vpaddd ymm8, ymm8, ymm9
    vmovdqa ymmword ptr [rsp+0x80], ymm5
    vmovdqa ymmword ptr [rsp+0xA0], ymm13
    vpxor ymm3, ymm3, ymm0
    vpxor ymm11, ymm11, ymm8
    vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
    vpshufb ymm3, ymm3, ymm4
    vpshufb ymm11, ymm11, ymm4
    vpaddd ymm2, ymm2, ymm3
    vpaddd ymm10, ymm10, ymm11
    vpxor ymm1, ymm1, ymm2
    vpxor ymm9, ymm9, ymm10
    vpsrld ymm4, ymm1, 7
    vpslld ymm1, ymm1, 25
    vpor ymm1, ymm1, ymm4
    vpsrld ymm4, ymm9, 7
    vpslld ymm9, ymm9, 25
    vpor ymm9, ymm9, ymm4
    vpshufd ymm0, ymm0, 0x93
    vpshufd ymm8, ymm8, 0x93
    vpshufd ymm3, ymm3, 0x4E
    vpshufd ymm11, ymm11, 0x4E
    vpshufd ymm2, ymm2, 0x39
    vpshufd ymm10, ymm10, 0x39
    vpaddd ymm0, ymm0, ymm6
    vpaddd ymm8, ymm8, ymm14
    vpaddd ymm0, ymm0, ymm1
    vpaddd ymm8, ymm8, ymm9
    vpxor ymm3, ymm3, ymm0
    vpxor ymm11, ymm11, ymm8
    vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
    vpshufb ymm3, ymm3, ymm4
    vpshufb ymm11, ymm11, ymm4
    vpaddd ymm2, ymm2, ymm3
    vpaddd ymm10, ymm10, ymm11
    vpxor ymm1, ymm1, ymm2
    vpxor ymm9, ymm9, ymm10
    vpsrld ymm4, ymm1, 12
    vpslld ymm1, ymm1, 20
    vpor ymm1, ymm1, ymm4
    vpsrld ymm4, ymm9, 12
    vpslld ymm9, ymm9, 20
    vpor ymm9, ymm9, ymm4
    vpaddd ymm0, ymm0, ymm7
    vpaddd ymm8, ymm8, ymm15
    vpaddd ymm0, ymm0, ymm1
    vpaddd ymm8, ymm8, ymm9
    vpxor ymm3, ymm3, ymm0
    vpxor ymm11, ymm11, ymm8
    vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
    vpshufb ymm3, ymm3, ymm4
    vpshufb ymm11, ymm11, ymm4
    vpaddd ymm2, ymm2, ymm3
    vpaddd ymm10, ymm10, ymm11
    vpxor ymm1, ymm1, ymm2
    vpxor ymm9, ymm9, ymm10
    vpsrld ymm4, ymm1, 7
    vpslld ymm1, ymm1, 25
    vpor ymm1, ymm1, ymm4
    vpsrld ymm4, ymm9, 7
    vpslld ymm9, ymm9, 25
    vpor ymm9, ymm9, ymm4
    vpshufd ymm0, ymm0, 0x39
    vpshufd ymm8, ymm8, 0x39
    vpshufd ymm3, ymm3, 0x4E
    vpshufd ymm11, ymm11, 0x4E
    vpshufd ymm2, ymm2, 0x93
    vpshufd ymm10, ymm10, 0x93
    dec al
    je 9f
    vmovdqa ymm4, ymmword ptr [rsp+0x40]
    vmovdqa ymm5, ymmword ptr [rsp+0x80]
    vshufps ymm12, ymm4, ymm5, 214
    vpshufd ymm13, ymm4, 0x0F
    vpshufd ymm4, ymm12, 0x39
    vshufps ymm12, ymm6, ymm7, 250
    vpblendd ymm13, ymm13, ymm12, 0xAA
    vpunpcklqdq ymm12, ymm7, ymm5
    vpblendd ymm12, ymm12, ymm6, 0x88
    vpshufd ymm12, ymm12, 0x78
    vpunpckhdq ymm5, ymm5, ymm7
    vpunpckldq ymm6, ymm6, ymm5
    vpshufd ymm7, ymm6, 0x1E
    vmovdqa ymmword ptr [rsp+0x40], ymm13
    vmovdqa ymmword ptr [rsp+0x80], ymm12
    vmovdqa ymm12, ymmword ptr [rsp+0x60]
    vmovdqa ymm13, ymmword ptr [rsp+0xA0]
    vshufps ymm5, ymm12, ymm13, 214
    vpshufd ymm6, ymm12, 0x0F
    vpshufd ymm12, ymm5, 0x39
    vshufps ymm5, ymm14, ymm15, 250
    vpblendd ymm6, ymm6, ymm5, 0xAA
    vpunpcklqdq ymm5, ymm15, ymm13
    vpblendd ymm5, ymm5, ymm14, 0x88
    vpshufd ymm5, ymm5, 0x78
    vpunpckhdq ymm13, ymm13, ymm15
    vpunpckldq ymm14, ymm14, ymm13
    vpshufd ymm15, ymm14, 0x1E
    vmovdqa ymm13, ymm6
    vmovdqa ymm14, ymm5
    vmovdqa ymm5, ymmword ptr [rsp+0x40]
    vmovdqa ymm6, ymmword ptr [rsp+0x80]
    jmp 9b
    9:
    vpxor ymm0, ymm0, ymm2
    vpxor ymm1, ymm1, ymm3
    vpxor ymm8, ymm8, ymm10
    vpxor ymm9, ymm9, ymm11
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    vmovdqu xmmword ptr [rbx], xmm0
    vmovdqu xmmword ptr [rbx+0x10], xmm1
    vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
    vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
    vmovdqu xmmword ptr [rbx+0x40], xmm8
    vmovdqu xmmword ptr [rbx+0x50], xmm9
    vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
    vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
    vmovaps xmm8, xmmword ptr [rsp+0x280]
    vmovaps xmm0, xmmword ptr [rsp+0x240]
    vmovaps xmm1, xmmword ptr [rsp+0x250]
    vmovaps xmm2, xmmword ptr [rsp+0x260]
    vmovaps xmm3, xmmword ptr [rsp+0x270]
    vblendvps xmm0, xmm0, xmm1, xmm8
    vblendvps xmm2, xmm2, xmm3, xmm8
    vmovaps xmmword ptr [rsp+0x240], xmm0
    vmovaps xmmword ptr [rsp+0x260], xmm2
    add rbx, 128
    add rdi, 32
    sub rsi, 4
    3:
    test rsi, 0x2
    je 3f
    vbroadcasti128 ymm0, xmmword ptr [rcx]
    vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
    vmovd xmm13, dword ptr [rsp+0x240]
    vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
    vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    vmovd xmm14, dword ptr [rsp+0x244]
    vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
    vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    vinserti128 ymm13, ymm13, xmm14, 0x01
    vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
    vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
    mov r8, qword ptr [rdi]
    mov r9, qword ptr [rdi+0x8]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    .p2align 5
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    mov dword ptr [rsp+0x200], eax
    vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
    vpbroadcastd ymm8, dword ptr [rsp+0x200]
    vpblendd ymm3, ymm13, ymm8, 0x88
    vmovups ymm8, ymmword ptr [r8+rdx-0x40]
    vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
    vmovups ymm9, ymmword ptr [r8+rdx-0x30]
    vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
    vshufps ymm4, ymm8, ymm9, 136
    vshufps ymm5, ymm8, ymm9, 221
    vmovups ymm8, ymmword ptr [r8+rdx-0x20]
    vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
    vmovups ymm9, ymmword ptr [r8+rdx-0x10]
    vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
    vshufps ymm6, ymm8, ymm9, 136
    vshufps ymm7, ymm8, ymm9, 221
    vpshufd ymm6, ymm6, 0x93
    vpshufd ymm7, ymm7, 0x93
    mov al, 7
    9:
    vpaddd ymm0, ymm0, ymm4
    vpaddd ymm0, ymm0, ymm1
    vpxor ymm3, ymm3, ymm0
    vpshufb ymm3, ymm3, ymm14
    vpaddd ymm2, ymm2, ymm3
    vpxor ymm1, ymm1, ymm2
    vpsrld ymm8, ymm1, 12
    vpslld ymm1, ymm1, 20
    vpor ymm1, ymm1, ymm8
    vpaddd ymm0, ymm0, ymm5
    vpaddd ymm0, ymm0, ymm1
    vpxor ymm3, ymm3, ymm0
    vpshufb ymm3, ymm3, ymm15
    vpaddd ymm2, ymm2, ymm3
    vpxor ymm1, ymm1, ymm2
    vpsrld ymm8, ymm1, 7
    vpslld ymm1, ymm1, 25
    vpor ymm1, ymm1, ymm8
    vpshufd ymm0, ymm0, 0x93
    vpshufd ymm3, ymm3, 0x4E
    vpshufd ymm2, ymm2, 0x39
    vpaddd ymm0, ymm0, ymm6
    vpaddd ymm0, ymm0, ymm1
    vpxor ymm3, ymm3, ymm0
    vpshufb ymm3, ymm3, ymm14
    vpaddd ymm2, ymm2, ymm3
    vpxor ymm1, ymm1, ymm2
    vpsrld ymm8, ymm1, 12
    vpslld ymm1, ymm1, 20
    vpor ymm1, ymm1, ymm8
    vpaddd ymm0, ymm0, ymm7
    vpaddd ymm0, ymm0, ymm1
    vpxor ymm3, ymm3, ymm0
    vpshufb ymm3, ymm3, ymm15
    vpaddd ymm2, ymm2, ymm3
    vpxor ymm1, ymm1, ymm2
    vpsrld ymm8, ymm1, 7
    vpslld ymm1, ymm1, 25
    vpor ymm1, ymm1, ymm8
    vpshufd ymm0, ymm0, 0x39
    vpshufd ymm3, ymm3, 0x4E
    vpshufd ymm2, ymm2, 0x93
    dec al
    jz 9f
    vshufps ymm8, ymm4, ymm5, 214
    vpshufd ymm9, ymm4, 0x0F
    vpshufd ymm4, ymm8, 0x39
    vshufps ymm8, ymm6, ymm7, 250
    vpblendd ymm9, ymm9, ymm8, 0xAA
    vpunpcklqdq ymm8, ymm7, ymm5
    vpblendd ymm8, ymm8, ymm6, 0x88
    vpshufd ymm8, ymm8, 0x78
    vpunpckhdq ymm5, ymm5, ymm7
    vpunpckldq ymm6, ymm6, ymm5
    vpshufd ymm7, ymm6, 0x1E
    vmovdqa ymm5, ymm9
    vmovdqa ymm6, ymm8
    jmp 9b
    9:
    vpxor ymm0, ymm0, ymm2
    vpxor ymm1, ymm1, ymm3
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    vmovdqu xmmword ptr [rbx], xmm0
    vmovdqu xmmword ptr [rbx+0x10], xmm1
    vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
    vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
    vmovaps ymm8, ymmword ptr [rsp+0x280]
    vmovaps ymm0, ymmword ptr [rsp+0x240]
    vmovups ymm1, ymmword ptr [rsp+0x248]
    vmovaps ymm2, ymmword ptr [rsp+0x260]
    vmovups ymm3, ymmword ptr [rsp+0x268]
    vblendvps ymm0, ymm0, ymm1, ymm8
    vblendvps ymm2, ymm2, ymm3, ymm8
    vmovaps ymmword ptr [rsp+0x240], ymm0
    vmovaps ymmword ptr [rsp+0x260], ymm2
    add rbx, 64
    add rdi, 16
    sub rsi, 2
    3:
    test rsi, 0x1
    je 4b
    vmovdqu xmm0, xmmword ptr [rcx]
    vmovdqu xmm1, xmmword ptr [rcx+0x10]
    vmovd xmm3, dword ptr [rsp+0x240]
    vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
    vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
    vmovdqa xmm14, xmmword ptr [ROT16+rip]
    vmovdqa xmm15, xmmword ptr [ROT8+rip]
    mov r8, qword ptr [rdi]
    movzx eax, byte ptr [rbp+0x40]
    or eax, r13d
    xor edx, edx
    .p2align 5
    2:
    mov r14d, eax
    or eax, r12d
    add rdx, 64
    cmp rdx, r15
    cmovne eax, r14d
    vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
    vmovdqa xmm3, xmm13
    vpinsrd xmm3, xmm3, eax, 3
    vmovups xmm8, xmmword ptr [r8+rdx-0x40]
    vmovups xmm9, xmmword ptr [r8+rdx-0x30]
    vshufps xmm4, xmm8, xmm9, 136
    vshufps xmm5, xmm8, xmm9, 221
    vmovups xmm8, xmmword ptr [r8+rdx-0x20]
    vmovups xmm9, xmmword ptr [r8+rdx-0x10]
    vshufps xmm6, xmm8, xmm9, 136
    vshufps xmm7, xmm8, xmm9, 221
    vpshufd xmm6, xmm6, 0x93
    vpshufd xmm7, xmm7, 0x93
    mov al, 7
    9:
    vpaddd xmm0, xmm0, xmm4
    vpaddd xmm0, xmm0, xmm1
    vpxor xmm3, xmm3, xmm0
    vpshufb xmm3, xmm3, xmm14
    vpaddd xmm2, xmm2, xmm3
    vpxor xmm1, xmm1, xmm2
    vpsrld xmm8, xmm1, 12
    vpslld xmm1, xmm1, 20
    vpor xmm1, xmm1, xmm8
    vpaddd xmm0, xmm0, xmm5
    vpaddd xmm0, xmm0, xmm1
    vpxor xmm3, xmm3, xmm0
    vpshufb xmm3, xmm3, xmm15
    vpaddd xmm2, xmm2, xmm3
    vpxor xmm1, xmm1, xmm2
    vpsrld xmm8, xmm1, 7
    vpslld xmm1, xmm1, 25
    vpor xmm1, xmm1, xmm8
    vpshufd xmm0, xmm0, 0x93
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x39
    vpaddd xmm0, xmm0, xmm6
    vpaddd xmm0, xmm0, xmm1
    vpxor xmm3, xmm3, xmm0
    vpshufb xmm3, xmm3, xmm14
    vpaddd xmm2, xmm2, xmm3
    vpxor xmm1, xmm1, xmm2
    vpsrld xmm8, xmm1, 12
    vpslld xmm1, xmm1, 20
    vpor xmm1, xmm1, xmm8
    vpaddd xmm0, xmm0, xmm7
    vpaddd xmm0, xmm0, xmm1
    vpxor xmm3, xmm3, xmm0
    vpshufb xmm3, xmm3, xmm15
    vpaddd xmm2, xmm2, xmm3
    vpxor xmm1, xmm1, xmm2
    vpsrld xmm8, xmm1, 7
    vpslld xmm1, xmm1, 25
    vpor xmm1, xmm1, xmm8
    vpshufd xmm0, xmm0, 0x39
    vpshufd xmm3, xmm3, 0x4E
    vpshufd xmm2, xmm2, 0x93
    dec al
    jz 9f
    vshufps xmm8, xmm4, xmm5, 214
    vpshufd xmm9, xmm4, 0x0F
    vpshufd xmm4, xmm8, 0x39
    vshufps xmm8, xmm6, xmm7, 250
    vpblendd xmm9, xmm9, xmm8, 0xAA
    vpunpcklqdq xmm8, xmm7, xmm5
    vpblendd xmm8, xmm8, xmm6, 0x88
    vpshufd xmm8, xmm8, 0x78
    vpunpckhdq xmm5, xmm5, xmm7
    vpunpckldq xmm6, xmm6, xmm5
    vpshufd xmm7, xmm6, 0x1E
    vmovdqa xmm5, xmm9
    vmovdqa xmm6, xmm8
    jmp 9b
    9:
    vpxor xmm0, xmm0, xmm2
    vpxor xmm1, xmm1, xmm3
    mov eax, r13d
    cmp rdx, r15
    jne 2b
    vmovdqu xmmword ptr [rbx], xmm0
    vmovdqu xmmword ptr [rbx+0x10], xmm1
    jmp 4b
    /*
     * Read-only constant tables for the vector code above.
     * They are placed in .static_data on Apple targets and in .rodata elsewhere.
     */
    #ifdef __APPLE__
    .static_data
    #else
    .section .rodata
    #endif
    /*
     * Align the start of the tables to 64 bytes (2^6).  Every table before
     * ROT16 is a multiple of 32 bytes and ROT16/ROT8 are 16 bytes each, so
     * ROT16, ROT8 and BLAKE3_IV stay 16-byte aligned, which the aligned
     * vmovdqa xmm loads of those labels rely on.
     */
    .p2align 6
    /* Eight 32-bit lanes holding 0..7.
     * NOTE(review): presumably per-lane counter offsets; the use is outside this excerpt, confirm. */
    ADD0:
    .long 0, 1, 2, 3, 4, 5, 6, 7
    /* Eight 32-bit lanes each holding 8.
     * NOTE(review): presumably the counter step per 8-lane batch; the use is outside this excerpt, confirm. */
    ADD1:
    .long 8, 8, 8, 8, 8, 8, 8, 8
    /* BLAKE3_IV_0..BLAKE3_IV_3: the four words of BLAKE3_IV (below), each
     * replicated across eight 32-bit lanes. */
    BLAKE3_IV_0:
    .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
    .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
    BLAKE3_IV_1:
    .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
    .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
    BLAKE3_IV_2:
    .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
    .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
    BLAKE3_IV_3:
    .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
    .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
    /* The value 0x40 (64) replicated across eight 32-bit lanes.  The
     * single-input path above reads just the first dword and inserts it into
     * lane 2 of the last state row. */
    BLAKE3_BLOCK_LEN:
    .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
    .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
    /* 16-byte vpshufb control mask: within each 32-bit lane, output byte 0
     * comes from input byte 2, and so on, i.e. a rotate by 16 bits. */
    ROT16:
    .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
    /* 16-byte vpshufb control mask: within each 32-bit lane, output byte 0
     * comes from input byte 1, and so on, i.e. a rotate right by 8 bits. */
    ROT8:
    .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
    /* Only the most-significant bit set in each of eight 32-bit lanes.
     * NOTE(review): presumably used to bias signed compares for unsigned
     * counter-carry detection; the use is outside this excerpt, confirm. */
    CMP_MSB_MASK:
    .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
    .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
    /* Four 32-bit IV words as one 16-byte vector; the single-input path above
     * loads it whole into xmm2 at the start of each block. */
    BLAKE3_IV:
    .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
  • replacement in change.c at line 12
    [3.66][2.670:690]()
    #include "blake3.h"
    [3.66]
    [3.1192]
    #include "vendor/blake3/blake3.h"
  • replacement in Makefile at line 20
    [3.78838][3.53400:53469]()
    OBJS += blake3.o
    OBJS += blake3_dispatch.o
    OBJS += blake3_portable.o
    [3.78838]
    [2.3725]
    OBJS += vendor/blake3/blake3.o
    OBJS += vendor/blake3/blake3_dispatch.o
    OBJS += vendor/blake3/blake3_portable.o
  • replacement in Makefile at line 27
    [3.53470][3.53470:53605]()
    ASM = blake3_avx2_x86-64_unix.S
    ASM += blake3_avx512_x86-64_unix.S
    ASM += blake3_sse2_x86-64_unix.S
    ASM += blake3_sse41_x86-64_unix.S
    [3.53470]
    [3.620]
    ASM = vendor/blake3/blake3_avx2_x86-64_unix.S
    ASM += vendor/blake3/blake3_avx512_x86-64_unix.S
    ASM += vendor/blake3/blake3_sse2_x86-64_unix.S
    ASM += vendor/blake3/blake3_sse41_x86-64_unix.S
  • replacement in Makefile at line 38
    [3.700][3.188:235]()
    @printf "CC\t%s\n" $@
    @$(CC) $(CFLAGS) -c $<
    [3.700]
    [3.723]
    @printf " CC\t%s\n" $@
    @$(CC) $(CFLAGS) -o $*.o -c $<