Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
 
volk_32fc_s32fc_x2_rotator_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <volk/volk_complex.h>
#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)

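/* ROTATOR_RELOAD sets how often the running phase is renormalized. Each output
 * sample multiplies the accumulated phase by phase_inc once, so after
 * ROTATOR_RELOAD samples |phase| has drifted slightly from 1.0 due to float
 * rounding; dividing by hypotf(re, im) (or the SIMD equivalent) pulls it back
 * onto the unit circle. ROTATOR_RELOAD_2 and ROTATOR_RELOAD_4 express the same
 * period in 2- and 4-sample SIMD iterations. */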

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i = 0;
    int j = 0;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }

        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
    if (i) {
        // Make sure we normalize the phase on every call!
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
}

#endif /* LV_HAVE_GENERIC */
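
/*
 * Usage sketch (illustrative, not part of the original header): shift a block of
 * samples by 0.1 cycles/sample through the public dispatcher. The buffer size and
 * the chosen offset are assumptions for the example; the dispatcher, volk_malloc,
 * volk_free, and lv_cmake are the usual VOLK API.
 *
 *   #include <volk/volk.h>
 *   #include <math.h>
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   // ... fill `in` with samples ...
 *   float f = 0.1f; // normalized frequency offset, cycles/sample
 *   lv_32fc_t phase_inc = lv_cmake(cosf(2.0f * (float)M_PI * f),
 *                                  sinf(2.0f * (float)M_PI * f));
 *   lv_32fc_t phase = lv_cmake(1.0f, 0.0f); // start on the unit circle
 *   volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, N);
 *   // `phase` now holds the rotator state for the next block
 *   volk_free(in);
 *   volk_free(out);
 */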


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
                                                        const lv_32fc_t* inVector,
                                                        const lv_32fc_t phase_inc,
                                                        lv_32fc_t* phase,
                                                        unsigned int num_points)

{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
    // const unsigned int quarter_points = num_points / 4;

    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (phase_inc);
    }

    // Notice that incr has been incremented in the previous loop, so it now
    // equals phase_inc^4 and advances all four lanes by four samples at once.
    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; j++) {
            input_vec = vld2q_f32((float*)inputVectorPtr);
            // Prefetch next one, speeds things up
            __VOLK_PREFETCH(inputVectorPtr + 4);
            // Rotate
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            // Increase phase
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            // Store output
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }
        // normalize phase so magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);
        // Prefetch next one, speeds things up
        __VOLK_PREFETCH(inputVectorPtr + 4);
        // Rotate
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        // Increase phase
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        // Store output
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;
    }
    // if(i) == true means we looped above
    if (i) {
        // normalize phase so magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }
    // Store current phase
    vst2q_f32((float*)phasePtr, phase_vec);

    // Deal with the rest
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    // For continuous phase next time we need to call this function
    (*phase) = phasePtr[0];
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
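    // phase_Ptr now holds { phase, phase * phase_inc } and incr == phase_inc^2,
    // so each 2-sample iteration below advances both lanes by two samples.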

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

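    // Complex multiply via SSE3-style addsub: yl/yh hold the duplicated real and
    // imaginary parts of the phase, the 0xB1 shuffle swaps re/im in the samples,
    // and _mm_addsub_ps combines them into (ac - bd, ad + bc) for (a+jb)*(c+jd).
    // After each ROTATOR_RELOAD_2 block, the hadd/shuffle/sqrt/div sequence
    // rescales both phases back to unit magnitude.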
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

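    // _mm256_complexmul_ps and _mm256_normalize_ps come from VOLK's AVX helper
    // header (volk_avx_intrinsics.h); they perform the complex product and the
    // unit-magnitude rescaling that the SSE4.1 kernels above spell out by hand.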
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

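    // Same addsub scheme as the SSE4.1 kernels, but _mm256_fmaddsub_ps fuses the
    // multiply by the duplicated real part with the add/subtract of the swapped
    // imaginary product, saving one multiply per complex product.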
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */