99 #ifndef INCLUDED_volk_32f_exp_32f_a_H
100 #define INCLUDED_volk_32f_exp_32f_a_H
103 #include <emmintrin.h>
/*
 * Body of the aligned SSE2 kernel: writes an approximation of exp(aVector[i])
 * to bVector[i], four floats per iteration, then finishes the remaining
 * (num_points % 4) elements with scalar expf().
 *
 * NOTE(review): the enclosing function signature, the closing braces and any
 * statements between the numbered lines are not part of this excerpt; the
 * name is inferred from the include guard. bVector, aVector and num_points
 * are presumably the function parameters - confirm against the full file.
 */
108 float* bPtr = bVector;
109 const float* aPtr = aVector;
111 unsigned int number = 0;
/* Number of complete groups of four floats handled by the vector loop. */
112 unsigned int quarterPoints = num_points / 4;
115 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
116 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
117 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
118 __m128i emm0, pi32_0x7f;
/*
 * Broadcast constants.
 *   exp_hi / exp_lo : bounds the input is clamped to before evaluation.
 *   log2EF          : log2(e), converts x into a power-of-two exponent.
 *   exp_C1 + exp_C2 : two-part split of ln(2) (their sum is ~0.6931471806),
 *                     subtracted in two steps during range reduction.
 *   pi32_0x7f       : 127, the exponent bias of an IEEE-754 single.
 */
120 one = _mm_set1_ps(1.0);
121 exp_hi = _mm_set1_ps(88.3762626647949);
122 exp_lo = _mm_set1_ps(-88.3762626647949);
123 log2EF = _mm_set1_ps(1.44269504088896341);
124 half = _mm_set1_ps(0.5);
125 exp_C1 = _mm_set1_ps(0.693359375);
126 exp_C2 = _mm_set1_ps(-2.12194440e-4);
127 pi32_0x7f = _mm_set1_epi32(0x7f);
/* Polynomial coefficients, consumed in Horner form inside the loop. */
129 exp_p0 = _mm_set1_ps(1.9875691500e-4);
130 exp_p1 = _mm_set1_ps(1.3981999507e-3);
131 exp_p2 = _mm_set1_ps(8.3334519073e-3);
132 exp_p3 = _mm_set1_ps(4.1665795894e-2);
133 exp_p4 = _mm_set1_ps(1.6666665459e-1);
134 exp_p5 = _mm_set1_ps(5.0000001201e-1);
/*
 * Vector loop: one __m128 (four floats) per iteration.
 * NOTE(review): no advance of aPtr/bPtr and no closing brace for this loop
 * appear in the excerpt - presumably they sit on the omitted lines; verify.
 */
136 for (; number < quarterPoints; number++) {
/* Aligned load: _mm_load_ps expects a 16-byte aligned address. */
137 aVal = _mm_load_ps(aPtr);
/* NOTE(review): tmp is overwritten below before it is ever read, so this
 * zeroing looks like a dead store. */
138 tmp = _mm_setzero_ps();
/*
 * Clamp x into [exp_lo, exp_hi].
 * NOTE(review): _mm_min_ps presumably yields its second operand when a lane
 * is NaN, which would turn a NaN input into exp_hi - confirm against the
 * Intel intrinsics documentation.
 */
140 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
/* fx = x * log2(e) + 0.5, prepared for rounding to the nearest integer n. */
143 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
/* Truncate toward zero and convert back to float. */
145 emm0 = _mm_cvttps_epi32(fx);
146 tmp = _mm_cvtepi32_ps(emm0);
/* Where truncation produced a value above fx (negative fx), subtract 1 so
 * that fx becomes floor(x * log2(e) + 0.5). */
148 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
149 fx = _mm_sub_ps(tmp, mask);
/* Range reduction: x <- x - n*exp_C1 - n*exp_C2, i.e. x - n*ln(2). */
151 tmp = _mm_mul_ps(fx, exp_C1);
152 z = _mm_mul_ps(fx, exp_C2);
153 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
/* z = x^2 for the reduced x. */
154 z = _mm_mul_ps(aVal, aVal);
/*
 * Horner evaluation; after the five statements below
 *   y = 1 + x + x^2 * (p5 + p4*x + p3*x^2 + p2*x^3 + p1*x^4 + p0*x^5).
 */
156 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
157 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
158 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
159 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
160 y = _mm_add_ps(y, one);
/* Build 2^n directly as float bits: (n + 127) shifted into the exponent
 * field (bit 23 upward), then reinterpreted as a float vector. */
162 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
164 pow2n = _mm_castsi128_ps(emm0);
/* exp(x) ~= polynomial(reduced x) * 2^n. */
165 bVal = _mm_mul_ps(y, pow2n);
/* Aligned store of the four results. */
167 _mm_store_ps(bPtr, bVal);
/*
 * Scalar tail for the elements left over after the groups of four.
 * NOTE(review): this path calls expf() without the clamp applied in the
 * vector path, so inputs beyond exp_hi / exp_lo are treated differently
 * depending on their position in the buffer.
 */
172 number = quarterPoints * 4;
173 for (; number < num_points; number++) {
174 *bPtr++ = expf(*aPtr++);
181 #ifdef LV_HAVE_GENERIC
/*
 * Body of the generic (non-SIMD) implementation: stores expf(aVector[i]) in
 * bVector[i] for each of the num_points elements.
 *
 * NOTE(review): the function signature and closing braces are not part of
 * this excerpt; bVector, aVector and num_points are presumably parameters.
 */
186 float* bPtr = bVector;
187 const float* aPtr = aVector;
/* The initial value here is set again by the for-loop initialiser below. */
188 unsigned int number = 0;
/* One scalar expf() per element; both cursors advance together. */
190 for (number = 0; number < num_points; number++) {
191 *bPtr++ = expf(*aPtr++);
199 #ifndef INCLUDED_volk_32f_exp_32f_u_H
200 #define INCLUDED_volk_32f_exp_32f_u_H
203 #include <emmintrin.h>
/*
 * Body of the unaligned SSE2 kernel: approximates exp() of each input float
 * in groups of four using unaligned loads/stores, then handles the final
 * (num_points % 4) elements with scalar expf().
 *
 * NOTE(review): the function signature, closing braces and any statements
 * between the numbered lines are missing from this excerpt; the name is
 * inferred from the include guard. bVector, aVector and num_points are
 * presumably the function parameters - confirm against the full file.
 */
208 float* bPtr = bVector;
209 const float* aPtr = aVector;
211 unsigned int number = 0;
/* Count of full four-float groups processed by the vector loop. */
212 unsigned int quarterPoints = num_points / 4;
215 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
216 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
217 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
218 __m128i emm0, pi32_0x7f;
/*
 * Broadcast constants.
 *   exp_hi / exp_lo : clamp bounds applied to the input.
 *   log2EF          : log2(e); x * log2EF is the exponent of 2 equal to e^x.
 *   exp_C1 + exp_C2 : ln(2) split into a coarse part and a small correction
 *                     (sum ~0.6931471806), applied as two subtractions.
 *   pi32_0x7f       : 127, the IEEE-754 single-precision exponent bias.
 */
220 one = _mm_set1_ps(1.0);
221 exp_hi = _mm_set1_ps(88.3762626647949);
222 exp_lo = _mm_set1_ps(-88.3762626647949);
223 log2EF = _mm_set1_ps(1.44269504088896341);
224 half = _mm_set1_ps(0.5);
225 exp_C1 = _mm_set1_ps(0.693359375);
226 exp_C2 = _mm_set1_ps(-2.12194440e-4);
227 pi32_0x7f = _mm_set1_epi32(0x7f);
/* Coefficients of the polynomial evaluated in the loop (Horner form). */
229 exp_p0 = _mm_set1_ps(1.9875691500e-4);
230 exp_p1 = _mm_set1_ps(1.3981999507e-3);
231 exp_p2 = _mm_set1_ps(8.3334519073e-3);
232 exp_p3 = _mm_set1_ps(4.1665795894e-2);
233 exp_p4 = _mm_set1_ps(1.6666665459e-1);
234 exp_p5 = _mm_set1_ps(5.0000001201e-1);
/*
 * Vector loop over groups of four floats.
 * NOTE(review): the excerpt shows neither an increment of aPtr/bPtr nor the
 * loop's closing brace - presumably on the omitted lines; verify.
 */
237 for (; number < quarterPoints; number++) {
/* Unaligned load: _mm_loadu_ps has no alignment requirement on aPtr. */
238 aVal = _mm_loadu_ps(aPtr);
/* NOTE(review): tmp is reassigned before any read, so this zeroing appears
 * to be a dead store. */
239 tmp = _mm_setzero_ps();
/*
 * Restrict x to [exp_lo, exp_hi].
 * NOTE(review): a NaN lane presumably becomes exp_hi because _mm_min_ps
 * returns its second operand for NaN inputs - confirm with Intel docs.
 */
241 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
/* fx = x * log2(e) + 0.5 (bias so the floor below rounds to nearest). */
244 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
/* Truncating float -> int32 conversion, then back to float. */
246 emm0 = _mm_cvttps_epi32(fx);
247 tmp = _mm_cvtepi32_ps(emm0);
/* Truncation rounds toward zero; subtract 1 in lanes where that overshot fx
 * so the result is floor(fx), the integer exponent n. */
249 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
250 fx = _mm_sub_ps(tmp, mask);
/* Reduced argument: x - n*exp_C1 - n*exp_C2 = x - n*ln(2). */
252 tmp = _mm_mul_ps(fx, exp_C1);
253 z = _mm_mul_ps(fx, exp_C2);
254 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
/* Square of the reduced argument. */
255 z = _mm_mul_ps(aVal, aVal);
/*
 * Horner steps; once all five statements have run
 *   y = 1 + x + x^2 * (p5 + p4*x + p3*x^2 + p2*x^3 + p1*x^4 + p0*x^5).
 */
257 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
258 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
259 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
260 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
261 y = _mm_add_ps(y, one);
/* 2^n assembled as raw float bits: biased exponent (n + 127) moved to bit 23
 * and reinterpreted as single-precision values. */
263 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
265 pow2n = _mm_castsi128_ps(emm0);
/* Scale the polynomial result by 2^n to obtain the exp() approximation. */
266 bVal = _mm_mul_ps(y, pow2n);
/* Unaligned store of the four results. */
268 _mm_storeu_ps(bPtr, bVal);
/*
 * Scalar remainder loop.
 * NOTE(review): expf() is applied here without the input clamp used by the
 * vector path, so out-of-range inputs give different results in the tail.
 */
273 number = quarterPoints * 4;
274 for (; number < num_points; number++) {
275 *bPtr++ = expf(*aPtr++);
282 #ifdef LV_HAVE_GENERIC
/*
 * Body of the generic scalar implementation in the unaligned section: each
 * of the num_points inputs read through aPtr is passed to expf() and the
 * result written through bPtr.
 *
 * NOTE(review): signature and closing braces are outside this excerpt;
 * bVector, aVector and num_points are presumably the function parameters.
 */
287 float* bPtr = bVector;
288 const float* aPtr = aVector;
/* Initialised here and again in the for-loop initialiser. */
289 unsigned int number = 0;
/* Element-wise expf(); input and output cursors step in lockstep. */
291 for (number = 0; number < num_points; number++) {
292 *bPtr++ = expf(*aPtr++);