Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_exp_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015-2020 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10/* SIMD (SSE4) implementation of exp
11 Inspired by Intel Approximate Math library, and based on the
12 corresponding algorithms of the cephes math library
13*/
14
15/* Copyright (C) 2007 Julien Pommier
16
17 This software is provided 'as-is', without any express or implied
18 warranty. In no event will the authors be held liable for any damages
19 arising from the use of this software.
20
21 Permission is granted to anyone to use this software for any purpose,
22 including commercial applications, and to alter it and redistribute it
23 freely, subject to the following restrictions:
24
25 1. The origin of this software must not be misrepresented; you must not
26 claim that you wrote the original software. If you use this software
27 in a product, an acknowledgment in the product documentation would be
28 appreciated but is not required.
29 2. Altered source versions must be plainly marked as such, and must not be
30 misrepresented as being the original software.
31 3. This notice may not be removed or altered from any source distribution.
32
33 (this is the zlib license)
34*/
35
82#include <inttypes.h>
83#include <math.h>
84#include <stdio.h>
85
86#ifndef INCLUDED_volk_32f_exp_32f_a_H
87#define INCLUDED_volk_32f_exp_32f_a_H
88
89#ifdef LV_HAVE_SSE2
90#include <emmintrin.h>
91
/*!
 * Computes bVector[i] = e^(aVector[i]) for num_points single-precision values.
 *
 * Aligned SSE2 implementation: processes 4 floats per iteration with an
 * expf() scalar tail for the remainder. Algorithm is the cephes-style
 * range reduction exp(x) = 2^n * exp(g), with a degree-5 polynomial for
 * exp(g) and 2^n built directly in the IEEE-754 exponent bits.
 *
 * \param bVector    output buffer (assumes 16-byte aligned — required by
 *                   _mm_store_ps)
 * \param aVector    input buffer (assumes 16-byte aligned — required by
 *                   _mm_load_ps)
 * \param num_points number of floats to process
 */
static inline void
volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;

    // Declare variables and constants
    __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    // Clamp bounds: beyond +/-88.376 expf overflows/underflows float range.
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    // log2(e): converts the argument to a power-of-2 exponent.
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    // ln(2) split into a high part (exp_C1) and a small correction
    // (exp_C2) so x - n*ln2 is computed with extra effective precision
    // (Cody-Waite style argument reduction).
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    // 127: IEEE-754 single-precision exponent bias.
    pi32_0x7f = _mm_set1_epi32(0x7f);

    // Polynomial coefficients for exp(g) on the reduced range.
    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        tmp = _mm_setzero_ps();

        // Clamp input so the result stays within float range.
        aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);

        /* express exp(x) as exp(g + n*log(2)) */
        fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);

        // Emulate floor(fx): cvttps truncates toward zero, so when the
        // truncated value exceeds fx (negative inputs) subtract 1.
        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        // Reduced argument: aVal = x - n*ln2 (two-part subtraction).
        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
        z = _mm_mul_ps(aVal, aVal);

        // Evaluate the polynomial (Horner-style interleaving of the
        // p0..p5 coefficients), then add the linear term and 1.
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
        y = _mm_add_ps(y, one);

        // Build 2^n: place (n + bias) into the exponent field (bit 23).
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        // Reinterpret the integer bits as float and scale: exp(x) = y * 2^n.
        pow2n = _mm_castsi128_ps(emm0);
        bVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    // Scalar tail for the final num_points % 4 elements.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
164
165#endif /* LV_HAVE_SSE2 for aligned */
166
167
168#ifdef LV_HAVE_GENERIC
169
170static inline void
171volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points)
172{
173 float* bPtr = bVector;
174 const float* aPtr = aVector;
175 unsigned int number = 0;
176
177 for (number = 0; number < num_points; number++) {
178 *bPtr++ = expf(*aPtr++);
179 }
180}
181
182#endif /* LV_HAVE_GENERIC */
183
184#endif /* INCLUDED_volk_32f_exp_32f_a_H */
185
186#ifndef INCLUDED_volk_32f_exp_32f_u_H
187#define INCLUDED_volk_32f_exp_32f_u_H
188
189#ifdef LV_HAVE_SSE2
190#include <emmintrin.h>
191
/*!
 * Computes bVector[i] = e^(aVector[i]) for num_points single-precision values.
 *
 * Unaligned SSE2 implementation: identical math to the aligned variant
 * but uses _mm_loadu_ps/_mm_storeu_ps, so no alignment requirement on
 * the buffers. Processes 4 floats per iteration with an expf() scalar
 * tail. Algorithm: cephes-style range reduction exp(x) = 2^n * exp(g),
 * degree-5 polynomial for exp(g), 2^n assembled in the IEEE-754
 * exponent bits.
 *
 * \param bVector    output buffer (no alignment required)
 * \param aVector    input buffer (no alignment required)
 * \param num_points number of floats to process
 */
static inline void
volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;

    // Declare variables and constants
    __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    // Clamp bounds: beyond +/-88.376 expf overflows/underflows float range.
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    // log2(e): converts the argument to a power-of-2 exponent.
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    // ln(2) split into high part + small correction for extra effective
    // precision in the reduction (Cody-Waite style).
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    // 127: IEEE-754 single-precision exponent bias.
    pi32_0x7f = _mm_set1_epi32(0x7f);

    // Polynomial coefficients for exp(g) on the reduced range.
    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);


    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        tmp = _mm_setzero_ps();

        // Clamp input so the result stays within float range.
        aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);

        /* express exp(x) as exp(g + n*log(2)) */
        fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);

        // Emulate floor(fx): cvttps truncates toward zero, so when the
        // truncated value exceeds fx (negative inputs) subtract 1.
        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        // Reduced argument: aVal = x - n*ln2 (two-part subtraction).
        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
        z = _mm_mul_ps(aVal, aVal);

        // Evaluate the polynomial (Horner-style interleaving of the
        // p0..p5 coefficients), then add the linear term and 1.
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
        y = _mm_add_ps(y, one);

        // Build 2^n: place (n + bias) into the exponent field (bit 23).
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        // Reinterpret the integer bits as float and scale: exp(x) = y * 2^n.
        pow2n = _mm_castsi128_ps(emm0);
        bVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    // Scalar tail for the final num_points % 4 elements.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
265
266#endif /* LV_HAVE_SSE2 for unaligned */
267
268
269#ifdef LV_HAVE_GENERIC
270
271static inline void
272volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
273{
274 float* bPtr = bVector;
275 const float* aPtr = aVector;
276 unsigned int number = 0;
277
278 for (number = 0; number < num_points; number++) {
279 *bPtr++ = expf(*aPtr++);
280 }
281}
282
283#endif /* LV_HAVE_GENERIC */
284
285#endif /* INCLUDED_volk_32f_exp_32f_u_H */
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition sse2neon.h:5565
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition sse2neon.h:2834
float32x4_t __m128
Definition sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition sse2neon.h:2984
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition sse2neon.h:2205
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition sse2neon.h:4324
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition sse2neon.h:1064
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition sse2neon.h:1858
int64x2_t __m128i
Definition sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition sse2neon.h:2080
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition sse2neon.h:3937
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition sse2neon.h:2025
static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_exp_32f.h:93
static void volk_32f_exp_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_exp_32f.h:272
static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_exp_32f.h:193
static void volk_32f_exp_32f_a_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_exp_32f.h:171