/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
 * AWS Cryptographic Algorithms Group.
 */

#if defined(S2N_BIKE_R3_AVX512)

#include <assert.h>

#include "sampling_internal.h"

#define AVX512_INTERNAL
#include "x86_64_intrinsic.h"

// For improved performance, we process NUM_ZMMS registers' worth of data in
// parallel.
#define NUM_ZMMS    (8)
#define ZMMS_QWORDS (QWORDS_IN_ZMM * NUM_ZMMS)

void secure_set_bits_avx512(OUT pad_r_t *r,
                            IN const size_t first_pos,
                            IN const idx_t *wlist,
                            IN const size_t w_size)
{
  // The function assumes that the size of r is a multiple
  // of the cumulative size of the used ZMM registers.
  assert((sizeof(*r) / sizeof(uint64_t)) % ZMMS_QWORDS == 0);

  // The va vectors hold the bits of the output array "r".
  // The va_pos_qw vectors hold the qword position indices of "r".
  // The algorithm works as follows:
  //   1. Initialize va_pos_qw with the starting qword positions of "r":
  //      va_pos_qw = (7, 6, 5, 4, 3, 2, 1, 0)
  //   2. While the size of "r" is not exceeded:
  //   3.   For each w in wlist:
  //   4.     Compare the qword position of w with the positions in va_pos_qw,
  //          and for the position that matches, set the appropriate bit in
  //          the corresponding va vector.
  //   5.   Advance va_pos_qw to the next qword positions of "r".
  __m512i  va[NUM_ZMMS], va_pos_qw[NUM_ZMMS];
  __m512i  w_pos_qw, w_pos_bit, one, inc;
  __mmask8 va_mask;

  uint64_t *r64 = (uint64_t *)r;

  one = SET1_I64(1);
  inc = SET1_I64(QWORDS_IN_ZMM);

  // 1. Initialize
  va_pos_qw[0] = SET_I64(7, 6, 5, 4, 3, 2, 1, 0);
  for(size_t i = 1; i < NUM_ZMMS; i++) {
    va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc);
  }

  // The va_pos_qw vectors now hold qword positions
  // 0 .. (NUM_ZMMS * QWORDS_IN_ZMM - 1). Therefore, we set the increment
  // vector inc such that adding it to the va_pos_qw vectors makes them hold
  // the next ZMMS_QWORDS qword positions.
  inc = SET1_I64(ZMMS_QWORDS);

  for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += ZMMS_QWORDS) {
    for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
      va[va_iter] = SET_ZERO;
    }

    for(size_t w_iter = 0; w_iter < w_size; w_iter++) {
      int32_t w = wlist[w_iter] - first_pos;
      w_pos_qw  = SET1_I64(w >> 6);
#if (defined(__GNUC__) && ((__GNUC__ == 6) || (__GNUC__ == 5)) && \
     !defined(__clang__)) ||                                      \
  (defined(__clang__) && __clang_major__ == 3 && __clang_minor__ == 9)
      // Workaround for gcc-5, gcc-6, and clang-3.9, which do not allow the
      // second argument of SLLI to be a non-immediate value.
      __m512i temp = SET1_I64(w & MASK(6));
      w_pos_bit    = SLLV_I64(one, temp);
#else
      w_pos_bit = SLLI_I64(one, w & MASK(6));
#endif

      // 4. Compare the positions in va_pos_qw with w_pos_qw
      //    and set the appropriate bit in va.
      for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
        va_mask     = CMPMEQ_I64(va_pos_qw[va_iter], w_pos_qw);
        va[va_iter] = MOR_I64(va[va_iter], va_mask, va[va_iter], w_pos_bit);
      }
    }

    // 5. Advance va_pos_qw to the next qword positions of "r"
    //    and store the previously computed data in "r".
    for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
      STORE(&r64[i + (va_iter * QWORDS_IN_ZMM)], va[va_iter]);
      va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc);
    }
  }
}
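/*
 * Illustrative sketch, not part of the library: a portable scalar version
 * with the same constant-time structure as secure_set_bits_avx512 above.
 * Every output qword is visited for every wlist entry, so the memory access
 * pattern is independent of the secret indices; lane selection happens via a
 * branchless mask instead of the AVX512 compare/masked-or. The function name
 * secure_set_bits_scalar_sketch and its guard macro are assumptions
 * introduced here for illustration; pad_r_t, idx_t, and MASK come from the
 * surrounding code.
 */
#if defined(S2N_BIKE_R3_SCALAR_SKETCH) // hypothetical guard, normally undefined
void secure_set_bits_scalar_sketch(OUT pad_r_t *r,
                                   IN const size_t first_pos,
                                   IN const idx_t *wlist,
                                   IN const size_t w_size)
{
  uint64_t *r64 = (uint64_t *)r;

  for(size_t qw = 0; qw < (sizeof(*r) / sizeof(uint64_t)); qw++) {
    uint64_t acc = 0;
    for(size_t w_iter = 0; w_iter < w_size; w_iter++) {
      uint64_t w = wlist[w_iter] - first_pos;
      // All-ones when this index falls into qword "qw", all-zeros otherwise.
      // (A production version must also ensure the compiler keeps this
      // selection branchless.)
      uint64_t select = (uint64_t)0 - (uint64_t)((w >> 6) == qw);
      acc |= (((uint64_t)1 << (w & MASK(6))) & select);
    }
    r64[qw] = acc;
  }
}
#endif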
int is_new_avx512(IN const idx_t *wlist, IN const size_t ctr)
{
  bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t);

  REG_T idx_ctr = SET1_I32(wlist[ctr]);

  for(size_t i = 0; i < ctr; i += REG_DWORDS) {
    // Comparisons are done with SIMD instructions, with each SIMD register
    // containing REG_DWORDS elements. We compare registers element-wise:
    //   idx_ctr = {REG_DWORDS repetitions of wlist[ctr]}, with
    //   idx_cur = {REG_DWORDS consecutive elements from wlist}.
    // In the last iteration we consider wlist elements only up to ctr.
    REG_T    idx_cur = LOAD(&wlist[i]);
    uint16_t mask    = (ctr < (i + REG_DWORDS)) ? MASK(ctr - i) : 0xffff;
    uint16_t check   = MCMPMEQ_I32(mask, idx_ctr, idx_cur);

    if(check != 0) {
      return 0;
    }
  }

  return 1;
}

#endif

typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
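/*
 * Illustrative sketch, not part of the library: the plain scalar equivalent
 * of is_new_avx512 above. It returns 1 iff wlist[ctr] does not appear among
 * wlist[0 .. ctr-1]; the AVX512 version performs the same duplicate check
 * REG_DWORDS comparisons at a time, masking off the lanes past ctr in the
 * last iteration. The function name is_new_scalar_sketch and its guard macro
 * are assumptions introduced here for illustration; idx_t and the IN
 * annotation come from the headers included above.
 */
#if defined(S2N_BIKE_R3_SCALAR_SKETCH) // hypothetical guard, normally undefined
int is_new_scalar_sketch(IN const idx_t *wlist, IN const size_t ctr)
{
  for(size_t i = 0; i < ctr; i++) {
    if(wlist[i] == wlist[ctr]) {
      return 0;
    }
  }
  return 1;
}
#endif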