    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks used below to isolate the high halves of the cross terms
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));
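    /*
     * Note (added commentary, not in the original source): the 64x64-bit
     * carry-less multiply below is assembled from 8x8-bit vmull_p8 partial
     * products. d holds the aligned product A0*B0; e..k are cross products
     * of byte-rotated operands, which are XOR-combined, masked, and shifted
     * into place to reconstruct the full 128-bit polynomial product.
     */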
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));      // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
    // XOR-combine the cross products (addition in GF(2))
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J
    // Interleave: vzip1/vzip2 keep Clang from emitting TBL instructions
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8, t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24, t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products into position
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
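
/*
 * Illustration (added, not part of the original header): a carry-less
 * multiply wrapper would typically pick one 64-bit half of each operand and
 * hand it to the helper above. This sketch assumes the helper is named
 * _sse2neon_vmull_p64 and takes two uint64x1_t operands, returning the
 * 128-bit polynomial product as uint64x2_t; the function below is
 * hypothetical.
 */
#if 0
static inline uint64x2_t example_clmul_low_halves(uint64x2_t a, uint64x2_t b)
{
    // Multiply the low 64-bit lanes of a and b as GF(2) polynomials.
    return _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b));
}
#endif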
#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    /* ... hardware directed-rounding conversions elided ... */
#else
    float *f = (float *) &a;
    switch (_MM_GET_ROUNDING_MODE()) {
    case _MM_ROUND_NEAREST: {
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5] */
        int32x4_t r_trunc =
            vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(vreinterpretq_f32_m128(a),
                                      vcvtq_f32_s32(r_trunc)); /* a - [a] */
        uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/-0.5 */
        return vreinterpretq_m128i_s32(
            vbslq_s32(is_delta_half, r_even, r_normal));
    }
    case _MM_ROUND_DOWN:
        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
                             floorf(f[0]));
    default: /* _MM_ROUND_TOWARD_ZERO */
        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
                             (int32_t) f[0]);
    }
#endif
}
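
/*
 * Illustration (added, not part of the original header): under the default
 * _MM_ROUND_NEAREST mode the fallback above is meant to round halfway values
 * to the nearest even integer, matching x86 CVTPS2DQ. The helper name below
 * is hypothetical.
 */
#if 0
static inline void example_cvtps_epi32_ties_to_even(void)
{
    __m128 v = _mm_set_ps(2.5f, 1.5f, 0.5f, -0.5f);
    __m128i r = _mm_cvtps_epi32(v);  // expected lanes, low to high: 0, 0, 2, 2
    (void) r;
}
#endif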
#if defined(__aarch64__)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
    // Zero-extend a: split odd and even bytes into 16-bit lanes
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
    // Sign-extend b by shifting left then arithmetic-shifting right
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);
    // Multiply matching pairs, then saturating-add the adjacent products
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
#endif
}
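
/*
 * Illustration (added, not part of the original header): _mm_maddubs_epi16
 * multiplies unsigned bytes of its first operand by signed bytes of its
 * second and saturating-adds each adjacent pair into a signed 16-bit lane.
 * The helper name below is hypothetical.
 */
#if 0
static inline void example_maddubs(void)
{
    __m128i u = _mm_set1_epi8((char) 200);  // treated as unsigned 200
    __m128i s = _mm_set1_epi8(-3);          // treated as signed -3
    __m128i r = _mm_maddubs_epi16(u, s);    // each lane: 200*(-3) + 200*(-3) = -1200
    (void) r;
}
#endif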
    switch (imm & 0x4) {
    /* ... case 0 / case 4: select the 4-byte offset of a (imm bit 2) elided ... */
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#endif
    }
    switch (imm & 0x3) {
    // Broadcast the 4-byte block of b selected by imm bits [1:0]
    case 0: _b = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); break;
    case 1: _b = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); break;
    case 2: _b = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); break;
    case 3: _b = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#endif
    }
    int16x8_t c04, c15, c26, c37;
    uint8x8_t low_b = vget_low_u8(_b);
    // Absolute differences between each byte-shifted window of _a and low_b
    c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
#if defined(__aarch64__)
    // Pairwise-add into |0|4|2|6| and |1|5|3|7|, then transpose and add
    c04 = vpaddq_s16(c04, c26);
    c15 = vpaddq_s16(c15, c37);
    int32x4_t trn1_c =
        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    int32x4_t trn2_c =
        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_s32(trn1_c),
                                             vreinterpretq_s16_s32(trn2_c)));
#else
    int16x4_t c01, c23, c45, c67;
    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
#endif
}
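
/*
 * Illustration (added, not part of the original header): _mm_mpsadbw_epu8
 * produces eight 16-bit sums of absolute differences, each comparing a
 * 4-byte window of `a` (sliding one byte at a time) against the 4-byte block
 * of `b` selected by the immediate. The helper name below is hypothetical.
 */
#if 0
static inline void example_mpsadbw(void)
{
    __m128i a = _mm_setzero_si128();
    __m128i b = _mm_set1_epi8(1);
    __m128i r = _mm_mpsadbw_epu8(a, b, 0);  // every 16-bit lane: |0 - 1| * 4 = 4
    (void) r;
}
#endif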
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
    /* ... sub bytes (sbox table lookups into v) elided ... */

    // mix columns
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;
#else /* ARMv7-A NEON implementation */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_DATA(SSE2NEON_AES_U0), SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U2), SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3
    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
    return _mm_xor_si128(out, RoundKey);
#endif
}
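
/*
 * Illustration (added, not part of the original header): _mm_aesenc_si128
 * applies one AES encryption round (ShiftRows, SubBytes, MixColumns, then
 * AddRoundKey) to the 128-bit state, mirroring the x86 AESENC instruction.
 * The helper name below is hypothetical.
 */
#if 0
static inline __m128i example_aes_two_rounds(__m128i state,
                                             __m128i rk1,
                                             __m128i rk2)
{
    state = _mm_aesenc_si128(state, rk1);  // round 1
    state = _mm_aesenc_si128(state, rk2);  // round 2
    return state;
}
#endif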