/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
 * AWS Cryptographic Algorithms Group.
 */

#if defined(S2N_BIKE_R3_AVX2)

#include <assert.h>

#include "sampling_internal.h"

#define AVX2_INTERNAL
#include "x86_64_intrinsic.h"

// For improved performance, we process NUM_YMMS YMM registers' worth of data
// in parallel.
#define NUM_YMMS    (4)
#define YMMS_QWORDS (QWORDS_IN_YMM * NUM_YMMS)

void secure_set_bits_avx2(OUT pad_r_t *r,
                          IN const size_t first_pos,
                          IN const idx_t *wlist,
                          IN const size_t w_size)
{
  // The function assumes that the size of r is a multiple
  // of the cumulative size of used YMM registers.
  assert((sizeof(*r) / sizeof(uint64_t)) % YMMS_QWORDS == 0);

  // va vectors hold the bits of the output array "r"
  // va_pos_qw vectors hold the qw position indices of "r"
  // The algorithm works as follows:
  //   1. Initialize va_pos_qw with starting positions of qw's of "r"
  //      va_pos_qw = (3, 2, 1, 0);
  //   2. While the size of "r" is not exceeded:
  //   3.   For each w in wlist:
  //   4.     Compare the pos_qw of w with positions in va_pos_qw
  //          and for the position which is equal set the appropriate
  //          bit in va vector.
  //   5.   Set va_pos_qw to the next qw positions of "r"
  __m256i va[NUM_YMMS], va_pos_qw[NUM_YMMS], va_mask;
  __m256i w_pos_qw, w_pos_bit;
  __m256i one, inc;

  uint64_t *r64 = (uint64_t *)r;

  one = SET1_I64(1);
  inc = SET1_I64(QWORDS_IN_YMM);

  // 1. Initialize
  va_pos_qw[0] = SET_I64(3, 2, 1, 0);
  for(size_t i = 1; i < NUM_YMMS; i++) {
    va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc);
  }

  // va_pos_qw vectors hold qw positions 0 .. (NUM_YMMS * QWORDS_IN_YMM - 1)
  // Therefore, we set the increment vector inc such that by adding it to
  // va_pos_qw vectors, they hold the next YMMS_QWORDS qw positions.
  inc = SET1_I64(YMMS_QWORDS);

  for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += YMMS_QWORDS) {
    for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) {
      va[va_iter] = SET_ZERO;
    }

    for(size_t w_iter = 0; w_iter < w_size; w_iter++) {
      int32_t w = wlist[w_iter] - first_pos;
      w_pos_qw  = SET1_I64(w >> 6);
      w_pos_bit = SLLI_I64(one, w & MASK(6));

      // 4. Compare the positions in va_pos_qw with w_pos_qw
      //    and set the appropriate bit in va
      for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) {
        va_mask = CMPEQ_I64(va_pos_qw[va_iter], w_pos_qw);
        va[va_iter] |= (va_mask & w_pos_bit);
      }
    }

    // 5. Set the va_pos_qw to the next qw positions of r
    //    and store the previously computed data in r
    for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) {
      STORE(&r64[i + (va_iter * QWORDS_IN_YMM)], va[va_iter]);
      va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc);
    }
  }
}
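
// For reference only: a minimal (hypothetical, non-vectorized) scalar sketch
// of what the loop above computes would set each wlist bit directly. Unlike
// the AVX2 code, such a direct write is NOT constant-time with respect to the
// secret indices, because its memory access pattern depends on wlist:
//
//   for(size_t j = 0; j < w_size; j++) {
//     uint32_t w = wlist[j] - first_pos;
//     r64[w >> 6] |= (1ULL << (w & MASK(6)));
//   }
//
// The AVX2 version instead writes every quadword of "r" and selects the
// matching one through a CMPEQ mask, so the access pattern is independent
// of the values in wlist.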
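
// is_new_avx2 (below) reports whether wlist[ctr] already appears among
// wlist[0 .. ctr - 1]. A straightforward (hypothetical) scalar equivalent,
// shown here only to illustrate the semantics, would be:
//
//   for(size_t i = 0; i < ctr; i++) {
//     if(wlist[i] == wlist[ctr]) { return 0; }
//   }
//   return 1;
//
// The AVX2 version performs the same check REG_DWORDS elements at a time.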
int is_new_avx2(IN const idx_t *wlist, IN const size_t ctr)
{
  bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t);

  REG_T idx_ctr = SET1_I32(wlist[ctr]);

  for(size_t i = 0; i < ctr; i += REG_DWORDS) {
    // Comparisons are done with SIMD instructions with each SIMD register
    // containing REG_DWORDS elements. We compare registers element-wise:
    //   idx_ctr = {REG_DWORDS repetitions of wlist[ctr]}, with
    //   idx_cur = {REG_DWORDS consecutive elements from wlist}.
    // In the last iteration we consider wlist elements only up to ctr.
    REG_T idx_cur = LOAD(&wlist[i]);
    REG_T cmp_res = CMPEQ_I32(idx_ctr, idx_cur);

    uint32_t check = MOVEMASK(cmp_res);

    // Handle the last iteration by appropriate masking.
    if(ctr < (i + REG_DWORDS)) {
      // The MOVEMASK instruction in AVX2 produces a 32-bit mask with one bit
      // per byte of its input register. On the other hand, we compare idx_t
      // elements, not bytes, so each element accounts for sizeof(idx_t) mask
      // bits and we multiply by sizeof(idx_t).
      check &= MASK((ctr - i) * sizeof(idx_t));
    }

    if(check != 0) {
      return 0;
    }
  }

  return 1;
}

#endif

typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;