/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
 * AWS Cryptographic Algorithms Group.
 *
 * The rotate functions are based on the Barrel shifter described in [1] and
 * some code snippets from [2]:
 *
 * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography.
 *     In: Gierlichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware
 *     and Embedded Systems – CHES 2016. pp. 280–300. Springer Berlin
 *     Heidelberg, Berlin, Heidelberg (2016)
 *
 * [2] Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019.
 *     “Optimized Implementation of QC-MDPC Code-Based Cryptography.”
 *     Concurrency and Computation: Practice and Experience 31 (18):
 *     e5089. https://doi.org/10.1002/cpe.5089.
 */

#if defined(S2N_BIKE_R3_AVX2)

#include "decode.h"
#include "decode_internal.h"
#include "utilities.h"

#define AVX2_INTERNAL
#include "x86_64_intrinsic.h"

#define R_YMM_HALF_LOG2 UPTOPOW2(R_YMM / 2)

_INLINE_ void rotate256_big(OUT syndrome_t *out,
                            IN const syndrome_t *in,
                            IN size_t ymm_num)
{
  // For preventing overflows (comparison in bytes)
  bike_static_assert(
    sizeof(*out) > (BYTES_IN_YMM * (R_YMM + (2 * R_YMM_HALF_LOG2))),
    rotr_big_err);

  *out = *in;

  for(uint32_t idx = R_YMM_HALF_LOG2; idx >= 1; idx >>= 1) {
    const uint8_t mask       = secure_l32_mask(ymm_num, idx);
    const __m256i blend_mask = SET1_I8(mask);
    ymm_num = ymm_num - (idx & mask);

    for(size_t i = 0; i < (R_YMM + idx); i++) {
      __m256i a = LOAD(&out->qw[4 * (i + idx)]);
      __m256i b = LOAD(&out->qw[4 * i]);
      b = BLENDV_I8(b, a, blend_mask);
      STORE(&out->qw[4 * i], b);
    }
  }
}

_INLINE_ void rotate256_small(OUT syndrome_t *out,
                              IN const syndrome_t *in,
                              size_t count)
{
  __m256i        carry_in   = SET_ZERO;
  const int      count64    = (int)count & 0x3f;
  const uint64_t count_mask = (count >> 5) & 0xe;
  __m256i        idx        = SET_I32(7, 6, 5, 4, 3, 2, 1, 0);
  const __m256i  zero_mask  = SET_I64(-1, -1, -1, 0);
  const __m256i  count_vet  = SET1_I8(count_mask);

  ALIGN(ALIGN_BYTES)
  const uint8_t zero_mask2_buf[] = {
    0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86,
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
    0x82, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

  __m256i zero_mask2 = LOAD(zero_mask2_buf);

  zero_mask2 = SUB_I8(zero_mask2, count_vet);
  idx        = ADD_I8(idx, count_vet);

  for(int i = R_YMM; i >= 0; i--) {
    // Load the next 256 bits
    __m256i in256 = LOAD(&in->qw[4 * i]);

    // Rotate the current and previous 256-bit registers so that their
    // quadwords end up in the right positions.
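    //
    // Note: PERMVAR_I32 rotates the eight 32-bit lanes of in256 down by
    // 2*(count/64) positions, i.e. by count/64 quadwords (idx was pre-biased
    // by count_vet, and vpermd only looks at the low three bits of each
    // lane). The byte-wise blend with zero_mask2 then replaces the lanes
    // that wrapped around with the matching lanes of the previously
    // processed, higher-indexed 256-bit word, carried over in carry_in.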
    __m256i carry_out = PERMVAR_I32(in256, idx);
    in256 = BLENDV_I8(carry_in, carry_out, zero_mask2);

    // Shift by less than 64 bits (inside the quadwords)
    __m256i inner_carry = BLENDV_I8(carry_in, in256, zero_mask);
    inner_carry = PERM_I64(inner_carry, 0x39);
    const __m256i out256 =
      SRLI_I64(in256, count64) | SLLI_I64(inner_carry, (int)64 - count64);

    // Store the rotated value
    STORE(&out->qw[4 * i], out256);
    carry_in = carry_out;
  }
}

void rotate_right_avx2(OUT syndrome_t *out,
                       IN const syndrome_t *in,
                       IN const uint32_t bitscount)
{
  // 1) Rotate in granularity of 256-bit blocks, using YMMs
  rotate256_big(out, in, (bitscount / BITS_IN_YMM));
  // 2) Rotate in smaller granularity (less than 256 bits), using YMMs
  rotate256_small(out, out, (bitscount % BITS_IN_YMM));
}

// Duplicates the first R_BITS of the syndrome three times
// |------------------------------------------|
// |  Third copy | Second copy | first R_BITS |
// |------------------------------------------|
// This is required by the rotate functions.
void dup_avx2(IN OUT syndrome_t *s)
{
  s->qw[R_QWORDS - 1] =
    (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK);

  for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) {
    s->qw[R_QWORDS + i] =
      (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD);
  }
}

// Use the half-adder as described in [1].
void bit_sliced_adder_avx2(OUT upc_t *upc,
                           IN OUT syndrome_t *rotated_syndrome,
                           IN const size_t num_of_slices)
{
  // From a cache-memory perspective this loop should be the outer loop
  for(size_t j = 0; j < num_of_slices; j++) {
    for(size_t i = 0; i < R_QWORDS; i++) {
      const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]);
      upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i];
      rotated_syndrome->qw[i] = carry;
    }
  }
}

void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val)
{
  // Borrow
  uint64_t br[R_QWORDS] = {0};

  for(size_t j = 0; j < SLICES; j++) {

    const uint64_t lsb_mask = 0 - (val & 0x1);
    val >>= 1;

    // Perform a - b with c as the input/output carry
    // br = 0 0 0 0 1 1 1 1
    // a  = 0 0 1 1 0 0 1 1
    // b  = 0 1 0 1 0 1 0 1
    // -------------------
    // o  = 0 1 1 0 0 1 1 1
    // c  = 0 1 0 0 1 1 0 1
    //
    // o  = a^b^c
    //      _     __    _ _         _ _     _
    // br = abc + abc + abc + abc = abc + ((a+b))c
    for(size_t i = 0; i < R_QWORDS; i++) {
      const uint64_t a   = upc->slice[j].u.qw[i];
      const uint64_t b   = lsb_mask;
      const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i]));

      upc->slice[j].u.qw[i] = a ^ b ^ br[i];
      br[i] = tmp;
    }
  }
}

#endif

typedef int dummy_typedef_to_avoid_empty_translation_unit_warning;
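
/* Usage sketch (illustrative only, not part of this translation unit): a
 * bit-flipping decoder iteration typically duplicates the syndrome once and
 * then, for every set bit of the sparse secret key, rotates the syndrome to
 * that position and accumulates it into the bit-sliced UPC counters before
 * subtracting the decoding threshold. The names D, wlist, threshold and
 * LOG2_MSB below are placeholders for the caller's own definitions:
 *
 *   dup_avx2(&syndrome);
 *   for(size_t j = 0; j < D; j++) {
 *     rotate_right_avx2(&rotated_syndrome, &syndrome, wlist[j]);
 *     bit_sliced_adder_avx2(&upc, &rotated_syndrome, LOG2_MSB(j + 1));
 *   }
 *   bit_slice_full_subtract_avx2(&upc, threshold);
 */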