Vector Optimized Library of Kernels 3.1.1
Architecture-tuned implementations of math kernels
 
volk_16i_x4_quad_max_star_16i.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

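/*
 * Overview (a summary inferred from the generic kernel below): for each
 * index i the kernel selects one of the four input samples by a two-stage
 * tournament of pairwise compares,
 *
 *   target[i] = max*(max*(src0[i], src1[i]), max*(src2[i], src3[i])),
 *
 * where max*(a, b) picks a when the wrapped 16-bit difference a - b is
 * positive and b otherwise. Because the difference wraps, this is not a
 * plain maximum: with a = 32000 and b = -32000 the difference 64000 wraps
 * to -1536, so b is selected even though a > b.
 */
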
#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = (num_bytes >> 4);         /* full blocks of 8 samples */
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7; /* samples after the last full block */

    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;

    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    while (bound_copy > 0) {
        xmm1 = _mm_load_si128(p_src0);
        xmm2 = _mm_load_si128(p_src1);
        xmm3 = _mm_load_si128(p_src2);
        xmm4 = _mm_load_si128(p_src3);

        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = xmm1;
        xmm8 = xmm3;

        /* wrapped differences src1 - src0 and src3 - src2 */
        xmm1 = _mm_sub_epi16(xmm2, xmm1);
        xmm3 = _mm_sub_epi16(xmm4, xmm3);

        /* per-lane masks: all-ones where the difference is positive */
        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);

        /* branch-free select: keep src1/src3 where the mask is set,
         * src0/src2 elsewhere */
        xmm2 = _mm_and_si128(xmm5, xmm2);
        xmm4 = _mm_and_si128(xmm6, xmm4);
        xmm5 = _mm_andnot_si128(xmm5, xmm7);
        xmm6 = _mm_andnot_si128(xmm6, xmm8);

        xmm5 = _mm_add_epi16(xmm2, xmm5); /* pairwise max* of src0, src1 */
        xmm6 = _mm_add_epi16(xmm4, xmm6); /* pairwise max* of src2, src3 */

        /* second stage: the same select between the two pairwise results
         * (pointer bumps are interleaved with the arithmetic) */
        xmm1 = _mm_xor_si128(xmm1, xmm1);
        xmm2 = xmm5;
        xmm5 = _mm_sub_epi16(xmm6, xmm5);
        p_src0 += 1;
        bound_copy -= 1;

        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
        p_src1 += 1;

        xmm6 = _mm_and_si128(xmm1, xmm6);

        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        p_src2 += 1;

        xmm1 = _mm_add_epi16(xmm6, xmm1);
        p_src3 += 1;

        _mm_store_si128(p_target, xmm1);
        p_target += 1;
    }

    /* finish any samples that did not fill a 128-bit block */
    short temp0 = 0;
    short temp1 = 0;
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
    return;
}

#endif /*LV_HAVE_SSE2*/
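
/*
 * A minimal scalar sketch of the branch-free select the SSE2 loop performs
 * in each 16-bit lane (illustrative only; quad_max_star_select_sketch is
 * not part of the VOLK API): the compare yields an all-ones or all-zeros
 * mask, AND/ANDNOT keep exactly one operand, and the add merges the halves.
 */
static inline short quad_max_star_select_sketch(short a, short b)
{
    /* all-ones when the wrapped difference b - a is positive, else zero */
    short mask = ((short)(b - a) > 0) ? (short)0xFFFF : (short)0x0000;
    /* (b & mask) + (a & ~mask) == ((short)(b - a) > 0) ? b : a */
    return (short)((b & mask) + (a & (short)~mask));
}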

#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned i;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    int16x8_t zeros;
    zeros = vdupq_n_s16(0);
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        /* wrapped differences decide each pairwise max* */
        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);
        /* complementary per-lane masks: exactly one of each pair is set */
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        result1_vec = vaddq_s16(comp0, comp1); /* pairwise max* of src0, src1 */
        result2_vec = vaddq_s16(comp2, comp3); /* pairwise max* of src2, src3 */

        /* second stage: the same select between the two pairwise results */
        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* finish any samples that did not fill a vector */
    short temp0 = 0;
    short temp1 = 0;
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
#endif /* LV_HAVE_NEON */
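
/*
 * The NEON path builds the same select from two complementary masks rather
 * than AND/ANDNOT; a sketch of the per-lane identity it relies on
 * (illustrative notation, not code):
 *
 *   result = (a & mask(a - b >= 0)) + (b & mask(a - b < 0))
 *
 * vcgeq_s16 and vcltq_s16 return all-ones or all-zeros per lane, so exactly
 * one operand survives its AND in each lane and the add merges them.
 */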


#ifdef LV_HAVE_GENERIC
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1; /* num_bytes / 2 == num_points */

    short temp0 = 0;
    short temp1 = 0;
    for (i = 0; i < bound; ++i) {
        /* each pairwise max* compares the wrapped 16-bit difference,
         * not the raw operands */
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_GENERIC*/
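
/*
 * A minimal usage sketch, assuming the generated VOLK dispatcher
 * volk_16i_x4_quad_max_star_16i and the allocation helpers declared in
 * <volk/volk.h>:
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int num_points = 1024;
 *   size_t align = volk_get_alignment();
 *   short* out = (short*)volk_malloc(sizeof(short) * num_points, align);
 *   short* in[4];
 *   for (int n = 0; n < 4; n++) {
 *       in[n] = (short*)volk_malloc(sizeof(short) * num_points, align);
 *       // fill in[n] with input samples here
 *   }
 *
 *   volk_16i_x4_quad_max_star_16i(out, in[0], in[1], in[2], in[3], num_points);
 *
 *   for (int n = 0; n < 4; n++)
 *       volk_free(in[n]);
 *   volk_free(out);
 */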

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/