/*
 * Architecture detection: exactly one of SIMD_ARCH_X86, SIMD_ARCH_ARM
 * (with SIMD_ARCH_ARM64 or SIMD_ARCH_ARM32) or SIMD_ARCH_SCALAR is defined.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
#define SIMD_ARCH_X86
#include <immintrin.h>
#if defined(__SSE4_1__)
#define SIMD_HAS_SSE41 /* enables the SSE4.1-only paths below */
#endif
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#define SIMD_ARCH_ARM
#define SIMD_ARCH_ARM64
#include <arm_neon.h>
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#define SIMD_ARCH_ARM
#define SIMD_ARCH_ARM32
#include <arm_neon.h>
#else
#define SIMD_ARCH_SCALAR
#endif

/* Vector types: four float lanes and four int32 lanes on every backend. */
#if defined(SIMD_ARCH_X86)
typedef __m128 simd_vec_t;
typedef __m128i simd_ivec_t;
#elif defined(SIMD_ARCH_ARM)
typedef float32x4_t simd_vec_t;
typedef int32x4_t simd_ivec_t;
#else
typedef struct { float f[4]; } simd_vec_t;   /* scalar fallback, lanes accessed via .f[] */
typedef struct { int32_t i[4]; } simd_ivec_t;
#endif

/* Return a vector with all four lanes set to 0.0f. */
static inline simd_vec_t simd_set_zero(void) {
#if defined(SIMD_ARCH_X86)
  return _mm_setzero_ps();
#elif defined(SIMD_ARCH_ARM)
  return vdupq_n_f32(0.0f);
#else
  return (simd_vec_t){{0, 0, 0, 0}};
#endif
}

/* Build a vector from four scalars (lane 0 = x, lane 3 = w). */
static inline simd_vec_t simd_set(float x, float y, float z, float w) {
#if defined(SIMD_ARCH_X86)
  /* _mm_set_ps takes arguments from the highest lane down, hence the reversed order. */
  return _mm_set_ps(w, z, y, x);
#elif defined(SIMD_ARCH_ARM)
  float d[4] = {x, y, z, w};
  return vld1q_f32(d);
#else
  return (simd_vec_t){{x, y, z, w}};
#endif
}

/* Broadcast a single scalar to all four lanes. */
static inline simd_vec_t simd_set1(float s) {
#if defined(SIMD_ARCH_X86)
  return _mm_set1_ps(s);
#elif defined(SIMD_ARCH_ARM)
  return vdupq_n_f32(s);
#else
  return (simd_vec_t){{s, s, s, s}};
#endif
}

/* Load four floats from (possibly unaligned) memory. */
static inline simd_vec_t simd_load(const float* ptr) {
#if defined(SIMD_ARCH_X86)
  return _mm_loadu_ps(ptr);
#elif defined(SIMD_ARCH_ARM)
  return vld1q_f32(ptr);
#else
  simd_vec_t r;
  memcpy(r.f, ptr, 4 * sizeof(float));
  return r;
#endif
}

/* Store four floats to (possibly unaligned) memory. */
static inline void simd_store(float* ptr, simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  _mm_storeu_ps(ptr, v);
#elif defined(SIMD_ARCH_ARM)
  vst1q_f32(ptr, v);
#else
  memcpy(ptr, v.f, 4 * sizeof(float));
#endif
}

/* Lane-wise addition. */
static inline simd_vec_t simd_add(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_add_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vaddq_f32(a, b);
#else
  return (simd_vec_t){{a.f[0] + b.f[0], a.f[1] + b.f[1], a.f[2] + b.f[2], a.f[3] + b.f[3]}};
#endif
}

/* Lane-wise subtraction. */
static inline simd_vec_t simd_sub(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_sub_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vsubq_f32(a, b);
#else
  return (simd_vec_t){{a.f[0] - b.f[0], a.f[1] - b.f[1], a.f[2] - b.f[2], a.f[3] - b.f[3]}};
#endif
}

/* Lane-wise multiplication. */
static inline simd_vec_t simd_mul(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_mul_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vmulq_f32(a, b);
#else
  return (simd_vec_t){{a.f[0] * b.f[0], a.f[1] * b.f[1], a.f[2] * b.f[2], a.f[3] * b.f[3]}};
#endif
}

/* Lane-wise division. ARM32 NEON has no divide instruction, so use a
 * reciprocal estimate refined by one Newton-Raphson step (approximate). */
static inline simd_vec_t simd_div(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_div_ps(a, b);
#elif defined(SIMD_ARCH_ARM64)
  return vdivq_f32(a, b);
#elif defined(SIMD_ARCH_ARM32)
  simd_vec_t recip = vrecpeq_f32(b);
  recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
  return vmulq_f32(a, recip);
#else
  return (simd_vec_t){{a.f[0] / b.f[0], a.f[1] / b.f[1], a.f[2] / b.f[2], a.f[3] / b.f[3]}};
#endif
}

/* Multiply-add: a * b + c, fused where the hardware supports it. */
static inline simd_vec_t simd_madd(simd_vec_t a, simd_vec_t b, simd_vec_t c) {
#if defined(SIMD_ARCH_X86)
#if defined(__FMA__)
  return _mm_fmadd_ps(a, b, c);
#else
  return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
#elif defined(SIMD_ARCH_ARM)
  return vmlaq_f32(c, a, b);
#else
  return simd_add(simd_mul(a, b), c);
#endif
}

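/*
 * Usage sketch (illustrative, not part of the header's API): y[i] = a * x[i] + y[i]
 * processed four lanes at a time with the wrappers above. Assumes count is a
 * multiple of 4; the function name example_saxpy4 is hypothetical.
 */
static inline void example_saxpy4(float* y, const float* x, float a, int count) {
  simd_vec_t va = simd_set1(a);
  for (int i = 0; i < count; i += 4) {
    simd_vec_t vy = simd_madd(va, simd_load(x + i), simd_load(y + i)); /* a*x + y */
    simd_store(y + i, vy);
  }
}
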
/* Lane-wise negation. */
static inline simd_vec_t simd_neg(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  return _mm_sub_ps(_mm_setzero_ps(), v);
#elif defined(SIMD_ARCH_ARM)
  return vnegq_f32(v);
#else
  return (simd_vec_t){{-v.f[0], -v.f[1], -v.f[2], -v.f[3]}};
#endif
}

/* Lane-wise absolute value (clears the sign bit). */
static inline simd_vec_t simd_abs(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  const __m128 sign_mask = _mm_set1_ps(-0.0f);
  return _mm_andnot_ps(sign_mask, v);
#elif defined(SIMD_ARCH_ARM)
  return vabsq_f32(v);
#else
  return (simd_vec_t){{fabsf(v.f[0]), fabsf(v.f[1]), fabsf(v.f[2]), fabsf(v.f[3])}};
#endif
}

/* Lane-wise minimum. */
static inline simd_vec_t simd_min(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_min_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vminq_f32(a, b);
#else
  return (simd_vec_t){{a.f[0] < b.f[0] ? a.f[0] : b.f[0], a.f[1] < b.f[1] ? a.f[1] : b.f[1],
                       a.f[2] < b.f[2] ? a.f[2] : b.f[2], a.f[3] < b.f[3] ? a.f[3] : b.f[3]}};
#endif
}

/* Lane-wise maximum. */
static inline simd_vec_t simd_max(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_max_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vmaxq_f32(a, b);
#else
  return (simd_vec_t){{a.f[0] > b.f[0] ? a.f[0] : b.f[0], a.f[1] > b.f[1] ? a.f[1] : b.f[1],
                       a.f[2] > b.f[2] ? a.f[2] : b.f[2], a.f[3] > b.f[3] ? a.f[3] : b.f[3]}};
#endif
}

/* Lane-wise square root. ARM32 NEON has no sqrt instruction, so fall back to scalar sqrtf. */
static inline simd_vec_t simd_sqrt(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  return _mm_sqrt_ps(v);
#elif defined(SIMD_ARCH_ARM64)
  return vsqrtq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
  float temp[4];
  vst1q_f32(temp, v);
  for (int i = 0; i < 4; ++i) temp[i] = sqrtf(temp[i]);
  return vld1q_f32(temp);
#else
  return (simd_vec_t){{sqrtf(v.f[0]), sqrtf(v.f[1]), sqrtf(v.f[2]), sqrtf(v.f[3])}};
#endif
}

/* Approximate reciprocal square root (hardware estimate; limited precision). */
static inline simd_vec_t simd_rsqrt(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  return _mm_rsqrt_ps(v);
#elif defined(SIMD_ARCH_ARM)
  return vrsqrteq_f32(v);
#else
  return (simd_vec_t){{1.0f / sqrtf(v.f[0]), 1.0f / sqrtf(v.f[1]),
                       1.0f / sqrtf(v.f[2]), 1.0f / sqrtf(v.f[3])}};
#endif
}

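/*
 * Usage sketch (illustrative, not part of the header): callers that need more
 * precision than the raw estimate can apply one Newton-Raphson step,
 * y' = y * (1.5 - 0.5 * x * y * y), built from the wrappers above.
 * The function name example_rsqrt_refined is hypothetical.
 */
static inline simd_vec_t example_rsqrt_refined(simd_vec_t x) {
  simd_vec_t y = simd_rsqrt(x);
  simd_vec_t half_x_yy = simd_mul(simd_mul(simd_set1(0.5f), x), simd_mul(y, y));
  return simd_mul(y, simd_sub(simd_set1(1.5f), half_x_yy));
}
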
/* Approximate reciprocal (hardware estimate; limited precision). */
static inline simd_vec_t simd_rcp(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  return _mm_rcp_ps(v);
#elif defined(SIMD_ARCH_ARM)
  return vrecpeq_f32(v);
#else
  return (simd_vec_t){{1.0f / v.f[0], 1.0f / v.f[1], 1.0f / v.f[2], 1.0f / v.f[3]}};
#endif
}

/* Lane-wise floor. Uses the SSE4.1 / ARMv8 rounding instructions when available. */
static inline simd_vec_t simd_floor(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_floor_ps(v);
#else
  float temp[4];
  _mm_storeu_ps(temp, v);
  for (int i = 0; i < 4; ++i) temp[i] = floorf(temp[i]);
  return _mm_loadu_ps(temp);
#endif
#elif defined(SIMD_ARCH_ARM64)
  return vrndmq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
  float temp[4];
  vst1q_f32(temp, v);
  for (int i = 0; i < 4; ++i) temp[i] = floorf(temp[i]);
  return vld1q_f32(temp);
#else
  return (simd_vec_t){{floorf(v.f[0]), floorf(v.f[1]), floorf(v.f[2]), floorf(v.f[3])}};
#endif
}

/* Lane-wise ceiling. */
static inline simd_vec_t simd_ceil(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_ceil_ps(v);
#else
  float temp[4];
  _mm_storeu_ps(temp, v);
  for (int i = 0; i < 4; ++i) temp[i] = ceilf(temp[i]);
  return _mm_loadu_ps(temp);
#endif
#elif defined(SIMD_ARCH_ARM64)
  return vrndpq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
  float temp[4];
  vst1q_f32(temp, v);
  for (int i = 0; i < 4; ++i) temp[i] = ceilf(temp[i]);
  return vld1q_f32(temp);
#else
  return (simd_vec_t){{ceilf(v.f[0]), ceilf(v.f[1]), ceilf(v.f[2]), ceilf(v.f[3])}};
#endif
}

/* Lane-wise round to nearest. The SIMD paths round ties to even; the scalar
 * fallbacks use roundf, which rounds ties away from zero. */
static inline simd_vec_t simd_round(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
  float temp[4];
  _mm_storeu_ps(temp, v);
  for (int i = 0; i < 4; ++i) temp[i] = roundf(temp[i]);
  return _mm_loadu_ps(temp);
#endif
#elif defined(SIMD_ARCH_ARM64)
  return vrndnq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
  float temp[4];
  vst1q_f32(temp, v);
  for (int i = 0; i < 4; ++i) temp[i] = roundf(temp[i]);
  return vld1q_f32(temp);
#else
  return (simd_vec_t){{roundf(v.f[0]), roundf(v.f[1]), roundf(v.f[2]), roundf(v.f[3])}};
#endif
}

/* Comparisons return a per-lane mask: all bits set (reinterpreted as a float)
 * where the predicate holds, zero where it does not. */
static inline simd_vec_t simd_cmpeq(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_cmpeq_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vceqq_f32(a, b));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t mask = (a.f[i] == b.f[i]) ? 0xFFFFFFFF : 0;
    memcpy(&result.f[i], &mask, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Lane-wise a != b (also true for unordered/NaN lanes). */
static inline simd_vec_t simd_cmpneq(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_cmpneq_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a, b)));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t mask = (a.f[i] != b.f[i]) ? 0xFFFFFFFF : 0;
    memcpy(&result.f[i], &mask, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Lane-wise a < b. */
static inline simd_vec_t simd_cmplt(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_cmplt_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vcltq_f32(a, b));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t mask = (a.f[i] < b.f[i]) ? 0xFFFFFFFF : 0;
    memcpy(&result.f[i], &mask, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Lane-wise a <= b. */
static inline simd_vec_t simd_cmple(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_cmple_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vcleq_f32(a, b));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t mask = (a.f[i] <= b.f[i]) ? 0xFFFFFFFF : 0;
    memcpy(&result.f[i], &mask, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Lane-wise a > b. */
static inline simd_vec_t simd_cmpgt(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_cmpgt_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vcgtq_f32(a, b));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t mask = (a.f[i] > b.f[i]) ? 0xFFFFFFFF : 0;
    memcpy(&result.f[i], &mask, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Lane-wise a >= b. */
static inline simd_vec_t simd_cmpge(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_cmpge_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vcgeq_f32(a, b));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t mask = (a.f[i] >= b.f[i]) ? 0xFFFFFFFF : 0;
    memcpy(&result.f[i], &mask, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Bitwise AND of the raw lane bits (useful for combining comparison masks). */
static inline simd_vec_t simd_and(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_and_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t ai, bi;
    memcpy(&ai, &a.f[i], sizeof(uint32_t));
    memcpy(&bi, &b.f[i], sizeof(uint32_t));
    uint32_t ri = ai & bi;
    memcpy(&result.f[i], &ri, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Bitwise OR of the raw lane bits. */
static inline simd_vec_t simd_or(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_or_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t ai, bi;
    memcpy(&ai, &a.f[i], sizeof(uint32_t));
    memcpy(&bi, &b.f[i], sizeof(uint32_t));
    uint32_t ri = ai | bi;
    memcpy(&result.f[i], &ri, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Bitwise XOR of the raw lane bits. */
static inline simd_vec_t simd_xor(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_xor_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t ai, bi;
    memcpy(&ai, &a.f[i], sizeof(uint32_t));
    memcpy(&bi, &b.f[i], sizeof(uint32_t));
    uint32_t ri = ai ^ bi;
    memcpy(&result.f[i], &ri, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Bitwise (~a) & b, matching _mm_andnot_ps semantics. */
static inline simd_vec_t simd_andnot(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  return _mm_andnot_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a)));
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t ai, bi;
    memcpy(&ai, &a.f[i], sizeof(uint32_t));
    memcpy(&bi, &b.f[i], sizeof(uint32_t));
    uint32_t ri = ~ai & bi;
    memcpy(&result.f[i], &ri, sizeof(uint32_t));
  }
  return result;
#endif
}

/* Duplicate a single lane across all four lanes. */
#if defined(SIMD_ARCH_X86)
#define simd_dup_x(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0))
#define simd_dup_y(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1))
#define simd_dup_z(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))
#define simd_dup_w(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3))
#elif defined(SIMD_ARCH_ARM)
#define simd_dup_x(v) vdupq_n_f32(vgetq_lane_f32(v, 0))
#define simd_dup_y(v) vdupq_n_f32(vgetq_lane_f32(v, 1))
#define simd_dup_z(v) vdupq_n_f32(vgetq_lane_f32(v, 2))
#define simd_dup_w(v) vdupq_n_f32(vgetq_lane_f32(v, 3))
#else
#define simd_dup_x(v) simd_set1((v).f[0])
#define simd_dup_y(v) simd_set1((v).f[1])
#define simd_dup_z(v) simd_set1((v).f[2])
#define simd_dup_w(v) simd_set1((v).f[3])
#endif

/* Per-lane select: returns true_vec where the mask lane is set, false_vec otherwise.
 * Masks are expected to come from the comparison functions above (all bits set or clear). */
static inline simd_vec_t simd_blend(simd_vec_t false_vec, simd_vec_t true_vec, simd_vec_t mask) {
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_blendv_ps(false_vec, true_vec, mask);
#else
  return _mm_or_ps(_mm_and_ps(mask, true_vec), _mm_andnot_ps(mask, false_vec));
#endif
#elif defined(SIMD_ARCH_ARM)
  uint32x4_t uint_mask = vreinterpretq_u32_f32(mask);
  return vbslq_f32(uint_mask, true_vec, false_vec);
#else
  simd_vec_t result;
  for (int i = 0; i < 4; i++) {
    uint32_t mask_int;
    memcpy(&mask_int, &mask.f[i], sizeof(uint32_t));
    result.f[i] = (mask_int & 0x80000000) ? true_vec.f[i] : false_vec.f[i];
  }
  return result;
#endif
}

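/*
 * Usage sketch (illustrative, not part of the header): branchless per-lane
 * select built from the compare-and-blend pattern above. This reproduces
 * simd_min for non-NaN inputs and is shown only to illustrate how masks feed
 * into simd_blend. The function name example_branchless_min is hypothetical.
 */
static inline simd_vec_t example_branchless_min(simd_vec_t a, simd_vec_t b) {
  simd_vec_t lt = simd_cmplt(a, b); /* all-ones lanes where a < b */
  return simd_blend(b, a, lt);      /* pick a where a < b, else b */
}
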
/* Horizontal add: returns the sum of all four lanes. */
static inline float simd_hadd(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  /* Shuffle-and-add reduction: pair-wise sums, then combine the halves. */
  __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
  __m128 sums = _mm_add_ps(v, shuf);
  shuf = _mm_movehl_ps(shuf, sums);
  sums = _mm_add_ps(sums, shuf);
  return _mm_cvtss_f32(sums);
#elif defined(SIMD_ARCH_ARM64)
  return vaddvq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
  float32x2_t r = vadd_f32(vget_high_f32(v), vget_low_f32(v));
  r = vpadd_f32(r, r);
  return vget_lane_f32(r, 0);
#else
  return v.f[0] + v.f[1] + v.f[2] + v.f[3];
#endif
}

/* Horizontal minimum across the four lanes. */
static inline float simd_hmin(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
  __m128 mins = _mm_min_ps(v, shuf);
  shuf = _mm_movehl_ps(shuf, mins);
  mins = _mm_min_ps(mins, shuf);
  return _mm_cvtss_f32(mins);
#elif defined(SIMD_ARCH_ARM64)
  return vminvq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
  float32x2_t r = vmin_f32(vget_high_f32(v), vget_low_f32(v));
  r = vpmin_f32(r, r);
  return vget_lane_f32(r, 0);
#else
  float min = v.f[0];
  if (v.f[1] < min) min = v.f[1];
  if (v.f[2] < min) min = v.f[2];
  if (v.f[3] < min) min = v.f[3];
  return min;
#endif
}

/* Horizontal maximum across the four lanes. */
static inline float simd_hmax(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
  __m128 maxs = _mm_max_ps(v, shuf);
  shuf = _mm_movehl_ps(shuf, maxs);
  maxs = _mm_max_ps(maxs, shuf);
  return _mm_cvtss_f32(maxs);
#elif defined(SIMD_ARCH_ARM64)
  return vmaxvq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
  float32x2_t r = vmax_f32(vget_high_f32(v), vget_low_f32(v));
  r = vpmax_f32(r, r);
  return vget_lane_f32(r, 0);
#else
  float max = v.f[0];
  if (v.f[1] > max) max = v.f[1];
  if (v.f[2] > max) max = v.f[2];
  if (v.f[3] > max) max = v.f[3];
  return max;
#endif
}

/* 3-component dot product (the w lane is ignored). */
static inline float simd_dot3(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
#else
  __m128 mul = _mm_mul_ps(a, b);
  /* Zero the w lane before the horizontal add. */
  const __m128i imask = _mm_set_epi32(0, ~0, ~0, ~0);
  mul = _mm_and_ps(mul, _mm_castsi128_ps(imask));
  return simd_hadd(mul);
#endif
#elif defined(SIMD_ARCH_ARM)
  simd_vec_t mul = vmulq_f32(a, b);
  mul = vsetq_lane_f32(0.0f, mul, 3);
#if defined(SIMD_ARCH_ARM64)
  return vaddvq_f32(mul);
#else
  float32x2_t r = vadd_f32(vget_high_f32(mul), vget_low_f32(mul));
  r = vpadd_f32(r, r);
  return vget_lane_f32(r, 0);
#endif
#else
  return a.f[0] * b.f[0] + a.f[1] * b.f[1] + a.f[2] * b.f[2];
#endif
}

/* 4-component dot product. */
static inline float simd_dot4(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
#else
  return simd_hadd(_mm_mul_ps(a, b));
#endif
#elif defined(SIMD_ARCH_ARM)
  simd_vec_t mul = vmulq_f32(a, b);
#if defined(SIMD_ARCH_ARM64)
  return vaddvq_f32(mul);
#else
  float32x2_t r = vadd_f32(vget_high_f32(mul), vget_low_f32(mul));
  r = vpadd_f32(r, r);
  return vget_lane_f32(r, 0);
#endif
#else
  return a.f[0] * b.f[0] + a.f[1] * b.f[1] + a.f[2] * b.f[2] + a.f[3] * b.f[3];
#endif
}

/* 3D cross product; the w lane of the result is set to 0. */
static inline simd_vec_t simd_cross(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
  /* cross(a, b) = a.yzx * b.zxy - a.zxy * b.yzx */
  __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
  __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
  __m128 a_zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
  __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));
  __m128 mul1 = _mm_mul_ps(a_yzx, b_zxy);
  __m128 mul2 = _mm_mul_ps(a_zxy, b_yzx);
  return _mm_sub_ps(mul1, mul2);
#elif defined(SIMD_ARCH_ARM)
  /* Computed through scalar temporaries; arbitrary lane shuffles are awkward on NEON. */
  float temp_a[4], temp_b[4];
  vst1q_f32(temp_a, a);
  vst1q_f32(temp_b, b);
  float result[4];
  result[0] = temp_a[1] * temp_b[2] - temp_a[2] * temp_b[1];
  result[1] = temp_a[2] * temp_b[0] - temp_a[0] * temp_b[2];
  result[2] = temp_a[0] * temp_b[1] - temp_a[1] * temp_b[0];
  result[3] = 0.0f;
  return vld1q_f32(result);
#else
  return (simd_vec_t){{a.f[1] * b.f[2] - a.f[2] * b.f[1], a.f[2] * b.f[0] - a.f[0] * b.f[2],
                       a.f[0] * b.f[1] - a.f[1] * b.f[0], 0.0f}};
#endif
}

/* Squared length / length of the xyz components and of the full 4-vector. */
static inline float simd_length_sq3(simd_vec_t v) { return simd_dot3(v, v); }

static inline float simd_length3(simd_vec_t v) { return sqrtf(simd_dot3(v, v)); }

static inline float simd_length_sq4(simd_vec_t v) { return simd_dot4(v, v); }

static inline float simd_length4(simd_vec_t v) { return sqrtf(simd_dot4(v, v)); }

/* Normalize the xyz components; the original w lane is preserved.
 * Assumes a non-zero length (no guard against division by zero). */
static inline simd_vec_t simd_normalize3(simd_vec_t v) {
  float len_sq = simd_dot3(v, v);
  float inv_len = 1.0f / sqrtf(len_sq);
  simd_vec_t scale = simd_set1(inv_len);
  simd_vec_t result = simd_mul(v, scale);
  /* Restore the untouched w lane. */
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_blend_ps(result, v, 0x8);
#else
  const __m128 wmask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
  return _mm_or_ps(_mm_andnot_ps(wmask, result), _mm_and_ps(wmask, v));
#endif
#elif defined(SIMD_ARCH_ARM)
  return vsetq_lane_f32(vgetq_lane_f32(v, 3), result, 3);
#else
  result.f[3] = v.f[3];
  return result;
#endif
}

/* Normalize all four components. Assumes a non-zero length. */
static inline simd_vec_t simd_normalize4(simd_vec_t v) {
  float len_sq = simd_dot4(v, v);
  float inv_len = 1.0f / sqrtf(len_sq);
  return simd_mul(v, simd_set1(inv_len));
}

/* Fast approximate normalize of xyz using the reciprocal square-root estimate.
 * Lower precision than simd_normalize3; the w lane is preserved. */
static inline simd_vec_t simd_normalize3_fast(simd_vec_t v) {
  simd_vec_t len_sq = simd_set1(simd_dot3(v, v));
  simd_vec_t inv_len = simd_rsqrt(len_sq);
  simd_vec_t result = simd_mul(v, inv_len);
#if defined(SIMD_ARCH_X86)
#if defined(SIMD_HAS_SSE41)
  return _mm_blend_ps(result, v, 0x8);
#else
  const __m128 wmask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
  return _mm_or_ps(_mm_andnot_ps(wmask, result), _mm_and_ps(wmask, v));
#endif
#elif defined(SIMD_ARCH_ARM)
  return vsetq_lane_f32(vgetq_lane_f32(v, 3), result, 3);
#else
  result.f[3] = v.f[3];
  return result;
#endif
}

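/*
 * Usage sketch (illustrative, not part of the header): unit normal of the
 * triangle (p0, p1, p2), built from two edge vectors with the cross product
 * and 3-component normalize above. The function name example_triangle_normal
 * is hypothetical, and the result is undefined for degenerate triangles.
 */
static inline simd_vec_t example_triangle_normal(simd_vec_t p0, simd_vec_t p1, simd_vec_t p2) {
  simd_vec_t e1 = simd_sub(p1, p0);
  simd_vec_t e2 = simd_sub(p2, p0);
  return simd_normalize3(simd_cross(e1, e2));
}
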
/* Returns true when every lane of a and b differs by less than epsilon. */
static inline bool simd_equals_eps(simd_vec_t a, simd_vec_t b, float epsilon) {
  simd_vec_t sub = simd_sub(a, b);
#if defined(SIMD_ARCH_X86)
  const __m128 sign_mask = _mm_set1_ps(-0.0f);
  __m128 abs_diff = _mm_andnot_ps(sign_mask, sub);
  __m128 eps_vec = _mm_set1_ps(epsilon);
  __m128 cmp = _mm_cmplt_ps(abs_diff, eps_vec);
  /* All four lanes must pass the comparison. */
  return _mm_movemask_ps(cmp) == 0xF;
#elif defined(SIMD_ARCH_ARM)
  simd_vec_t abs_diff = vabsq_f32(sub);
  simd_vec_t eps_vec = vdupq_n_f32(epsilon);
  uint32x4_t cmp = vcltq_f32(abs_diff, eps_vec);
  /* Reduce the lane masks: every lane must be 0xFFFFFFFF. */
  uint32x2_t min = vmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
  return vget_lane_u32(min, 0) == 0xFFFFFFFF && vget_lane_u32(min, 1) == 0xFFFFFFFF;
#else
  return fabsf(a.f[0] - b.f[0]) < epsilon && fabsf(a.f[1] - b.f[1]) < epsilon &&
         fabsf(a.f[2] - b.f[2]) < epsilon && fabsf(a.f[3] - b.f[3]) < epsilon;
#endif
}

/* Generic lane swizzle: simd_swizzle(v, x, y, z, w) yields {v[x], v[y], v[z], v[w]}.
 * Indices must be compile-time constants in 0..3. */
#if defined(SIMD_ARCH_X86)
#define simd_swizzle(v, x, y, z, w) _mm_shuffle_ps((v), (v), _MM_SHUFFLE((w), (z), (y), (x)))
#elif defined(SIMD_ARCH_ARM)
#if defined(__clang__) || defined(__GNUC__)
#define simd_swizzle(v, x, y, z, w) __builtin_shufflevector((v), (v), (x), (y), (z), (w))
#else
static inline simd_vec_t simd_swizzle_fallback(simd_vec_t v, int i0, int i1, int i2, int i3) {
  float d[4];
  vst1q_f32(d, v);
  float r[4] = {d[i0], d[i1], d[i2], d[i3]};
  return vld1q_f32(r);
}
#define simd_swizzle(v, x, y, z, w) simd_swizzle_fallback((v), (x), (y), (z), (w))
#endif
#else
#define simd_swizzle(v, x, y, z, w) ((simd_vec_t){{(v).f[(x)], (v).f[(y)], (v).f[(z)], (v).f[(w)]}})
#endif

/* Common swizzle shorthands. */
#define simd_splat_x(v) simd_swizzle(v, 0, 0, 0, 0)
#define simd_splat_y(v) simd_swizzle(v, 1, 1, 1, 1)
#define simd_splat_z(v) simd_swizzle(v, 2, 2, 2, 2)
#define simd_splat_w(v) simd_swizzle(v, 3, 3, 3, 3)

#define simd_yzxw(v) simd_swizzle(v, 1, 2, 0, 3)
#define simd_zxyw(v) simd_swizzle(v, 2, 0, 1, 3)

/* Extract lane 0 as a scalar float. */
static inline float simd_get_x(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
  return _mm_cvtss_f32(v);
#elif defined(SIMD_ARCH_ARM)
  return vgetq_lane_f32(v, 0);
#else
  return v.f[0];
#endif
}

/* Returns true when every lane of a comparison mask is set. */
static inline bool simd_check_all(simd_vec_t mask) {
#if defined(SIMD_ARCH_X86)
  return _mm_movemask_ps(mask) == 0xF;
#elif defined(SIMD_ARCH_ARM)
  uint32x4_t u = vreinterpretq_u32_f32(mask);
  uint32x2_t min = vmin_u32(vget_low_u32(u), vget_high_u32(u));
  return (vget_lane_u32(min, 0) & vget_lane_u32(min, 1)) == 0xFFFFFFFF;
#else
  uint32_t i[4];
  memcpy(i, mask.f, sizeof(i));
  return i[0] && i[1] && i[2] && i[3];
#endif
}

/* Transpose a 4x4 matrix stored as four row vectors, in place. */
#if defined(SIMD_ARCH_X86)
#define simd_transpose4(r0, r1, r2, r3) _MM_TRANSPOSE4_PS(r0, r1, r2, r3)
#elif defined(SIMD_ARCH_ARM)
#define simd_transpose4(r0, r1, r2, r3)                                     \
  do {                                                                      \
    float32x4x2_t t0 = vtrnq_f32(r0, r1);                                   \
    float32x4x2_t t1 = vtrnq_f32(r2, r3);                                   \
    r0 = vcombine_f32(vget_low_f32(t0.val[0]), vget_low_f32(t1.val[0]));    \
    r1 = vcombine_f32(vget_low_f32(t0.val[1]), vget_low_f32(t1.val[1]));    \
    r2 = vcombine_f32(vget_high_f32(t0.val[0]), vget_high_f32(t1.val[0]));  \
    r3 = vcombine_f32(vget_high_f32(t0.val[1]), vget_high_f32(t1.val[1]));  \
  } while (0)
#else
#define simd_transpose4(r0, r1, r2, r3)                       \
  do {                                                        \
    simd_vec_t t0 = r0, t1 = r1, t2 = r2, t3 = r3;            \
    r0 = (simd_vec_t){{t0.f[0], t1.f[0], t2.f[0], t3.f[0]}};  \
    r1 = (simd_vec_t){{t0.f[1], t1.f[1], t2.f[1], t3.f[1]}};  \
    r2 = (simd_vec_t){{t0.f[2], t1.f[2], t2.f[2], t3.f[2]}};  \
    r3 = (simd_vec_t){{t0.f[3], t1.f[3], t2.f[3], t3.f[3]}};  \
  } while (0)
#endif

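/*
 * Usage sketch (illustrative, not part of the header): transpose a row-major
 * 4x4 matrix stored in a flat float[16] array using the macro above. The
 * function name example_transpose_mat4 is hypothetical.
 */
static inline void example_transpose_mat4(float* m) {
  simd_vec_t r0 = simd_load(m + 0);
  simd_vec_t r1 = simd_load(m + 4);
  simd_vec_t r2 = simd_load(m + 8);
  simd_vec_t r3 = simd_load(m + 12);
  simd_transpose4(r0, r1, r2, r3);
  simd_store(m + 0, r0);
  simd_store(m + 4, r1);
  simd_store(m + 8, r2);
  simd_store(m + 12, r3);
}
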
/*
 * Memory alignment utilities and macros for cross-platform alignment.
 */