solidc
A robust collection of general-purpose, cross-platform C libraries and data structures designed for rapid and safe development in C
simd.h
#ifndef SIMD_H
#define SIMD_H

#include "align.h"

#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* =========================================================
   Architecture Detection
   ========================================================= */

/*
 * Detect x86_64 or x86.
 * Note: We assume SSE2 is available on all modern x86 targets.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
#define SIMD_ARCH_X86
#include <immintrin.h>

/* Check for SSE4.1 support (Standard on Core 2 Duo Penryn and later) */
#if defined(__SSE4_1__)
#define SIMD_HAS_SSE41
#endif

/*
 * Detect ARM NEON.
 * Distinguishes between AArch64 (ARMv8) and ARMv7 (32-bit).
 */
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#define SIMD_ARCH_ARM
#define SIMD_ARCH_ARM64
#include <arm_neon.h>
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#define SIMD_ARCH_ARM
#define SIMD_ARCH_ARM32
#include <arm_neon.h>

/* Fallback for generic C implementation */
#else
#define SIMD_ARCH_SCALAR
#endif
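
/*
 * Build note (illustrative, not exhaustive): the optional fast paths above are
 * gated on predefined compiler macros, so they have to be enabled at compile
 * time. With GCC/Clang this typically means flags such as:
 *
 *   -msse4.1      defines __SSE4_1__  -> SIMD_HAS_SSE41 (x86)
 *   -mfma         defines __FMA__     -> fused multiply-add path in simd_madd
 *   -mfpu=neon    defines __ARM_NEON  -> SIMD_ARCH_ARM32 (32-bit ARMv7)
 *
 * AArch64 targets provide NEON unconditionally. Without these flags the header
 * still compiles and silently falls back to the SSE2 or scalar implementations.
 */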

/* =========================================================
   Type Definitions
   ========================================================= */

#if defined(SIMD_ARCH_X86)
/* Maps to __m128 (XMM registers) */
typedef __m128 simd_vec_t;
typedef __m128i simd_ivec_t;
#elif defined(SIMD_ARCH_ARM)
/* Maps to float32x4_t (Q registers) */
typedef float32x4_t simd_vec_t;
typedef int32x4_t simd_ivec_t;
#else
/* Maps to a struct of 4 floats */
typedef struct {
    float f[4];
} simd_vec_t;
typedef struct {
    int i[4];
} simd_ivec_t;
#endif

/* =========================================================
   Load / Store / Set
   ========================================================= */

static inline simd_vec_t simd_set_zero(void) {
#if defined(SIMD_ARCH_X86)
    return _mm_setzero_ps();
#elif defined(SIMD_ARCH_ARM)
    return vdupq_n_f32(0.0f);
#else
    return (simd_vec_t){{0, 0, 0, 0}};
#endif
}

static inline simd_vec_t simd_set(float x, float y, float z, float w) {
#if defined(SIMD_ARCH_X86)
    /*
     * _mm_set_ps inputs are reversed (w, z, y, x) to map to
     * registers [3, 2, 1, 0]. We flip them here for API consistency.
     */
    return _mm_set_ps(w, z, y, x);
#elif defined(SIMD_ARCH_ARM)
    float d[4] = {x, y, z, w};
    return vld1q_f32(d);
#else
    return (simd_vec_t){{x, y, z, w}};
#endif
}
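
/*
 * Example (illustrative): lane order is the same on every backend, so
 *
 *   float out[4];
 *   simd_vec_t v = simd_set(1.0f, 2.0f, 3.0f, 4.0f);
 *   simd_store(out, v);   // out == {1, 2, 3, 4}; x lives in lane 0
 *
 * regardless of whether the implementation is SSE, NEON, or scalar.
 */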

static inline simd_vec_t simd_set1(float s) {
#if defined(SIMD_ARCH_X86)
    return _mm_set1_ps(s);
#elif defined(SIMD_ARCH_ARM)
    return vdupq_n_f32(s);
#else
    return (simd_vec_t){{s, s, s, s}};
#endif
}

static inline simd_vec_t simd_load(const float* ptr) {
#if defined(SIMD_ARCH_X86)
    return _mm_loadu_ps(ptr);
#elif defined(SIMD_ARCH_ARM)
    return vld1q_f32(ptr);
#else
    simd_vec_t r;
    memcpy(r.f, ptr, 4 * sizeof(float));
    return r;
#endif
}

static inline void simd_store(float* ptr, simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    _mm_storeu_ps(ptr, v);
#elif defined(SIMD_ARCH_ARM)
    vst1q_f32(ptr, v);
#else
    memcpy(ptr, v.f, 4 * sizeof(float));
#endif
}
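
/*
 * Example (illustrative sketch): processing a float array four lanes at a
 * time with the load/store pair. Assumes `n` is a multiple of 4; a real
 * caller would handle the remaining tail elements with scalar code.
 *
 *   void scale_array(float* data, size_t n, float factor) {
 *       simd_vec_t k = simd_set1(factor);
 *       for (size_t i = 0; i < n; i += 4) {
 *           simd_vec_t v = simd_load(data + i);    // unaligned load is fine
 *           simd_store(data + i, simd_mul(v, k));
 *       }
 *   }
 */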

/* =========================================================
   Arithmetic
   ========================================================= */

static inline simd_vec_t simd_add(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_add_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vaddq_f32(a, b);
#else
    return (simd_vec_t){{a.f[0] + b.f[0], a.f[1] + b.f[1], a.f[2] + b.f[2], a.f[3] + b.f[3]}};
#endif
}

static inline simd_vec_t simd_sub(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_sub_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vsubq_f32(a, b);
#else
    return (simd_vec_t){{a.f[0] - b.f[0], a.f[1] - b.f[1], a.f[2] - b.f[2], a.f[3] - b.f[3]}};
#endif
}

static inline simd_vec_t simd_mul(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_mul_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vmulq_f32(a, b);
#else
    return (simd_vec_t){{a.f[0] * b.f[0], a.f[1] * b.f[1], a.f[2] * b.f[2], a.f[3] * b.f[3]}};
#endif
}

static inline simd_vec_t simd_div(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_div_ps(a, b);
#elif defined(SIMD_ARCH_ARM64)
    return vdivq_f32(a, b); // Native division on AArch64
#elif defined(SIMD_ARCH_ARM32)
    /*
     * ARMv7 optimization: x / y = x * (1/y)
     * 1. Estimate reciprocal (vrecpeq)
     * 2. Refine estimate using Newton-Raphson step (vrecpsq)
     * 3. Multiply
     */
    simd_vec_t recip = vrecpeq_f32(b);
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    return vmulq_f32(a, recip);
#else
    return (simd_vec_t){{a.f[0] / b.f[0], a.f[1] / b.f[1], a.f[2] / b.f[2], a.f[3] / b.f[3]}};
#endif
}
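
/*
 * Note on the ARMv7 path above: vrecpsq_f32(b, r) returns (2 - b*r), so the
 * refinement is one Newton-Raphson iteration for 1/b:
 *
 *   r1 = r0 * (2 - b * r0)
 *
 * If more precision is needed, the same step can be applied once more before
 * the final multiply (illustrative; roughly doubles the number of accurate bits):
 *
 *   recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
 */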

static inline simd_vec_t simd_madd(simd_vec_t a, simd_vec_t b, simd_vec_t c) {
#if defined(SIMD_ARCH_X86)
#if defined(__FMA__)
    return _mm_fmadd_ps(a, b, c);
#else
    return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
#elif defined(SIMD_ARCH_ARM)
    return vmlaq_f32(c, a, b); // NEON multiply-accumulate: returns c + a*b
#else
    return simd_add(simd_mul(a, b), c);
#endif
}
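
/*
 * Example (illustrative): simd_madd maps naturally onto Horner's scheme for
 * polynomial evaluation, e.g. c2*x^2 + c1*x + c0 evaluated per lane:
 *
 *   simd_vec_t poly2(simd_vec_t x, simd_vec_t c2, simd_vec_t c1, simd_vec_t c0) {
 *       return simd_madd(x, simd_madd(x, c2, c1), c0);  // (c2*x + c1)*x + c0
 *   }
 *
 * When the __FMA__ path is taken, each multiply-add is fused and rounds once.
 */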

static inline simd_vec_t simd_neg(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    return _mm_sub_ps(_mm_setzero_ps(), v);
#elif defined(SIMD_ARCH_ARM)
    return vnegq_f32(v);
#else
    return (simd_vec_t){{-v.f[0], -v.f[1], -v.f[2], -v.f[3]}};
#endif
}

static inline simd_vec_t simd_abs(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    /* Clear the sign bit (MSB) using a mask */
    const __m128 sign_mask = _mm_set1_ps(-0.0f); // 0x80000000
    return _mm_andnot_ps(sign_mask, v);
#elif defined(SIMD_ARCH_ARM)
    return vabsq_f32(v);
#else
    return (simd_vec_t){{fabsf(v.f[0]), fabsf(v.f[1]), fabsf(v.f[2]), fabsf(v.f[3])}};
#endif
}

/* =========================================================
   Min / Max
   ========================================================= */

static inline simd_vec_t simd_min(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_min_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vminq_f32(a, b);
#else
    return (simd_vec_t){{a.f[0] < b.f[0] ? a.f[0] : b.f[0], a.f[1] < b.f[1] ? a.f[1] : b.f[1],
                         a.f[2] < b.f[2] ? a.f[2] : b.f[2], a.f[3] < b.f[3] ? a.f[3] : b.f[3]}};
#endif
}

static inline simd_vec_t simd_max(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_max_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vmaxq_f32(a, b);
#else
    return (simd_vec_t){{a.f[0] > b.f[0] ? a.f[0] : b.f[0], a.f[1] > b.f[1] ? a.f[1] : b.f[1],
                         a.f[2] > b.f[2] ? a.f[2] : b.f[2], a.f[3] > b.f[3] ? a.f[3] : b.f[3]}};
#endif
}

/* =========================================================
   Math Functions
   ========================================================= */

static inline simd_vec_t simd_sqrt(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    return _mm_sqrt_ps(v);
#elif defined(SIMD_ARCH_ARM64)
    return vsqrtq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
    /* ARMv7 NEON lacks native vector sqrt, fall back to scalar */
    float temp[4];
    vst1q_f32(temp, v);
    for (int i = 0; i < 4; ++i) temp[i] = sqrtf(temp[i]);
    return vld1q_f32(temp);
#else
    return (simd_vec_t){{sqrtf(v.f[0]), sqrtf(v.f[1]), sqrtf(v.f[2]), sqrtf(v.f[3])}};
#endif
}

static inline simd_vec_t simd_rsqrt(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    return _mm_rsqrt_ps(v);
#elif defined(SIMD_ARCH_ARM)
    return vrsqrteq_f32(v);
#else
    return (simd_vec_t){{1.0f / sqrtf(v.f[0]), 1.0f / sqrtf(v.f[1]), 1.0f / sqrtf(v.f[2]), 1.0f / sqrtf(v.f[3])}};
#endif
}

static inline simd_vec_t simd_rcp(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    return _mm_rcp_ps(v);
#elif defined(SIMD_ARCH_ARM)
    return vrecpeq_f32(v);
#else
    return (simd_vec_t){{1.0f / v.f[0], 1.0f / v.f[1], 1.0f / v.f[2], 1.0f / v.f[3]}};
#endif
}

static inline simd_vec_t simd_floor(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
    return _mm_floor_ps(v);
#else
    /* SSE2 Fallback: conversion via scalar math */
    float temp[4];
    _mm_storeu_ps(temp, v);
    for (int i = 0; i < 4; ++i) temp[i] = floorf(temp[i]);
    return _mm_loadu_ps(temp);
#endif
#elif defined(SIMD_ARCH_ARM64)
    return vrndmq_f32(v); // Round towards minus infinity
#elif defined(SIMD_ARCH_ARM32)
    float temp[4];
    vst1q_f32(temp, v);
    for (int i = 0; i < 4; ++i) temp[i] = floorf(temp[i]);
    return vld1q_f32(temp);
#else
    return (simd_vec_t){{floorf(v.f[0]), floorf(v.f[1]), floorf(v.f[2]), floorf(v.f[3])}};
#endif
}

static inline simd_vec_t simd_ceil(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
    return _mm_ceil_ps(v);
#else
    float temp[4];
    _mm_storeu_ps(temp, v);
    for (int i = 0; i < 4; ++i) temp[i] = ceilf(temp[i]);
    return _mm_loadu_ps(temp);
#endif
#elif defined(SIMD_ARCH_ARM64)
    return vrndpq_f32(v); // Round towards plus infinity
#elif defined(SIMD_ARCH_ARM32)
    float temp[4];
    vst1q_f32(temp, v);
    for (int i = 0; i < 4; ++i) temp[i] = ceilf(temp[i]);
    return vld1q_f32(temp);
#else
    return (simd_vec_t){{ceilf(v.f[0]), ceilf(v.f[1]), ceilf(v.f[2]), ceilf(v.f[3])}};
#endif
}

static inline simd_vec_t simd_round(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
    float temp[4];
    _mm_storeu_ps(temp, v);
    for (int i = 0; i < 4; ++i) temp[i] = roundf(temp[i]);
    return _mm_loadu_ps(temp);
#endif
#elif defined(SIMD_ARCH_ARM64)
    return vrndnq_f32(v); // Round nearest, ties to even
#elif defined(SIMD_ARCH_ARM32)
    float temp[4];
    vst1q_f32(temp, v);
    for (int i = 0; i < 4; ++i) temp[i] = roundf(temp[i]);
    return vld1q_f32(temp);
#else
    return (simd_vec_t){{roundf(v.f[0]), roundf(v.f[1]), roundf(v.f[2]), roundf(v.f[3])}};
#endif
}

/* =========================================================
   Comparison Operations
   ========================================================= */

/*
 * NOTE ON COMPARISONS:
 * SIMD comparisons do NOT return true/false booleans.
 * They return a per-lane bitmask:
 *   - If true:  all 32 bits of the lane are set (0xFFFFFFFF, which reads as a
 *               NaN pattern if reinterpreted as a float)
 *   - If false: all bits are clear (0x00000000, i.e. 0.0f)
 *
 * This enables branchless programming via bitwise masking (simd_and,
 * simd_blend); see the example below.
 */

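/*
 * Example (illustrative): a branchless per-lane minimum built from a compare
 * mask and a blend. Where a < b the mask lane is all-ones, so simd_blend picks
 * a; elsewhere it picks b. (simd_blend is defined further down in this header.)
 *
 *   simd_vec_t lane_min(simd_vec_t a, simd_vec_t b) {
 *       simd_vec_t mask = simd_cmplt(a, b);   // 0xFFFFFFFF where a < b
 *       return simd_blend(b, a, mask);        // mask ? a : b, per lane
 *   }
 */
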
static inline simd_vec_t simd_cmpeq(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_cmpeq_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vceqq_f32(a, b));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t mask = (a.f[i] == b.f[i]) ? 0xFFFFFFFF : 0;
        memcpy(&result.f[i], &mask, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_cmpneq(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_cmpneq_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a, b))); // Bitwise NOT of Equals
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t mask = (a.f[i] != b.f[i]) ? 0xFFFFFFFF : 0;
        memcpy(&result.f[i], &mask, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_cmplt(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_cmplt_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vcltq_f32(a, b));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t mask = (a.f[i] < b.f[i]) ? 0xFFFFFFFF : 0;
        memcpy(&result.f[i], &mask, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_cmple(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_cmple_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vcleq_f32(a, b));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t mask = (a.f[i] <= b.f[i]) ? 0xFFFFFFFF : 0;
        memcpy(&result.f[i], &mask, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_cmpgt(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_cmpgt_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vcgtq_f32(a, b));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t mask = (a.f[i] > b.f[i]) ? 0xFFFFFFFF : 0;
        memcpy(&result.f[i], &mask, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_cmpge(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_cmpge_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vcgeq_f32(a, b));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t mask = (a.f[i] >= b.f[i]) ? 0xFFFFFFFF : 0;
        memcpy(&result.f[i], &mask, sizeof(uint32_t));
    }
    return result;
#endif
}

/* =========================================================
   Bitwise Operations
   ========================================================= */

static inline simd_vec_t simd_and(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_and_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    /* NEON float types require reinterpretation to integer types for bitwise ops */
    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t ai, bi, ri;
        memcpy(&ai, &a.f[i], sizeof(uint32_t));
        memcpy(&bi, &b.f[i], sizeof(uint32_t));
        ri = ai & bi;
        memcpy(&result.f[i], &ri, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_or(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_or_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t ai, bi, ri;
        memcpy(&ai, &a.f[i], sizeof(uint32_t));
        memcpy(&bi, &b.f[i], sizeof(uint32_t));
        ri = ai | bi;
        memcpy(&result.f[i], &ri, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_xor(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_xor_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t ai, bi, ri;
        memcpy(&ai, &a.f[i], sizeof(uint32_t));
        memcpy(&bi, &b.f[i], sizeof(uint32_t));
        ri = ai ^ bi;
        memcpy(&result.f[i], &ri, sizeof(uint32_t));
    }
    return result;
#endif
}

static inline simd_vec_t simd_andnot(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    return _mm_andnot_ps(a, b);
#elif defined(SIMD_ARCH_ARM)
    return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a)));
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t ai, bi, ri;
        memcpy(&ai, &a.f[i], sizeof(uint32_t));
        memcpy(&bi, &b.f[i], sizeof(uint32_t));
        ri = (~ai) & bi;
        memcpy(&result.f[i], &ri, sizeof(uint32_t));
    }
    return result;
#endif
}

/* =========================================================
   Shuffles / Permutations
   ========================================================= */

/* Macros to duplicate a specific component across the vector */
#if defined(SIMD_ARCH_X86)
/* _MM_SHUFFLE parameters are (w, z, y, x) indices */
#define simd_dup_x(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0))
#define simd_dup_y(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1))
#define simd_dup_z(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))
#define simd_dup_w(v) _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3))
#elif defined(SIMD_ARCH_ARM)
/* NEON lane extraction requires compile-time constant indices */
#define simd_dup_x(v) vdupq_n_f32(vgetq_lane_f32(v, 0))
#define simd_dup_y(v) vdupq_n_f32(vgetq_lane_f32(v, 1))
#define simd_dup_z(v) vdupq_n_f32(vgetq_lane_f32(v, 2))
#define simd_dup_w(v) vdupq_n_f32(vgetq_lane_f32(v, 3))
#else
#define simd_dup_x(v) simd_set1(v.f[0])
#define simd_dup_y(v) simd_set1(v.f[1])
#define simd_dup_z(v) simd_set1(v.f[2])
#define simd_dup_w(v) simd_set1(v.f[3])
#endif

static inline simd_vec_t simd_blend(simd_vec_t false_vec, simd_vec_t true_vec, simd_vec_t mask) {
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
    /* SSE4.1 dedicated blend instruction */
    return _mm_blendv_ps(false_vec, true_vec, mask);
#else
    /* SSE2 Fallback: (mask & true) | (~mask & false) */
    return _mm_or_ps(_mm_and_ps(mask, true_vec), _mm_andnot_ps(mask, false_vec));
#endif
#elif defined(SIMD_ARCH_ARM)
    /* vbslq: Bitwise Select. Selects from true_vec if mask bit is 1, else false_vec */
    uint32x4_t uint_mask = vreinterpretq_u32_f32(mask);
    return vbslq_f32(uint_mask, true_vec, false_vec);
#else
    simd_vec_t result;
    for (int i = 0; i < 4; i++) {
        uint32_t mask_int;
        memcpy(&mask_int, &mask.f[i], sizeof(uint32_t));
        /* Check sign bit for quick selection */
        result.f[i] = (mask_int & 0x80000000) ? true_vec.f[i] : false_vec.f[i];
    }
    return result;
#endif
}
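
/*
 * Example (illustrative): clamping every lane to the range [lo, hi] with no
 * branches, using compares as selection masks:
 *
 *   simd_vec_t clamp_vec(simd_vec_t v, simd_vec_t lo, simd_vec_t hi) {
 *       v = simd_blend(v, lo, simd_cmplt(v, lo));  // lift lanes below lo
 *       v = simd_blend(v, hi, simd_cmpgt(v, hi));  // cap lanes above hi
 *       return v;
 *   }
 *
 * (simd_min/simd_max would do the same job here; the point is the mask+blend
 * pattern, which generalizes to arbitrary conditions.)
 */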

/* =========================================================
   Reductions (Horizontal operations)
   ========================================================= */

static inline float simd_hadd(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    /*
     * Butterfly reduction:
     * 1. Swap adjacent pairs:                  shuf = [y, x, w, z]
     * 2. Add:                                  sums = [x+y, x+y, z+w, z+w]
     * 3. Move the high pair down and add again
     * 4. Lane 0 now holds x+y+z+w
     */
    __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 sums = _mm_add_ps(v, shuf);
    shuf = _mm_movehl_ps(shuf, sums);
    sums = _mm_add_ps(sums, shuf);
    return _mm_cvtss_f32(sums);
#elif defined(SIMD_ARCH_ARM64)
    return vaddvq_f32(v); /* AArch64 hardware instruction */
#elif defined(SIMD_ARCH_ARM32)
    /* Add the two halves ([x+z, y+w]), then pairwise add to get the total */
    float32x2_t r = vadd_f32(vget_high_f32(v), vget_low_f32(v));
    r = vpadd_f32(r, r);
    return vget_lane_f32(r, 0);
#else
    return v.f[0] + v.f[1] + v.f[2] + v.f[3];
#endif
}
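
/*
 * Example (illustrative): horizontal reductions are comparatively expensive,
 * so keep them out of hot loops. Accumulate vertically and reduce once at the
 * end. Assumes `n` is a multiple of 4.
 *
 *   float sum_array(const float* data, size_t n) {
 *       simd_vec_t acc = simd_set_zero();
 *       for (size_t i = 0; i < n; i += 4) {
 *           acc = simd_add(acc, simd_load(data + i));
 *       }
 *       return simd_hadd(acc);  // single horizontal reduction
 *   }
 */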

static inline float simd_hmin(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 mins = _mm_min_ps(v, shuf);
    shuf = _mm_movehl_ps(shuf, mins);
    mins = _mm_min_ps(mins, shuf);
    return _mm_cvtss_f32(mins);
#elif defined(SIMD_ARCH_ARM64)
    return vminvq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
    float32x2_t r = vmin_f32(vget_high_f32(v), vget_low_f32(v));
    r = vpmin_f32(r, r);
    return vget_lane_f32(r, 0);
#else
    float min = v.f[0];
    if (v.f[1] < min) min = v.f[1];
    if (v.f[2] < min) min = v.f[2];
    if (v.f[3] < min) min = v.f[3];
    return min;
#endif
}

static inline float simd_hmax(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 maxs = _mm_max_ps(v, shuf);
    shuf = _mm_movehl_ps(shuf, maxs);
    maxs = _mm_max_ps(maxs, shuf);
    return _mm_cvtss_f32(maxs);
#elif defined(SIMD_ARCH_ARM64)
    return vmaxvq_f32(v);
#elif defined(SIMD_ARCH_ARM32)
    float32x2_t r = vmax_f32(vget_high_f32(v), vget_low_f32(v));
    r = vpmax_f32(r, r);
    return vget_lane_f32(r, 0);
#else
    float max = v.f[0];
    if (v.f[1] > max) max = v.f[1];
    if (v.f[2] > max) max = v.f[2];
    if (v.f[3] > max) max = v.f[3];
    return max;
#endif
}

/* =========================================================
   Dot Product Operations
   ========================================================= */

/* 3D dot product (ignores the W component) */
static inline float simd_dot3(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71)); /* 0x71: multiply lanes x,y,z; write the sum to lane 0 */
#else
    __m128 mul = _mm_mul_ps(a, b);

    /*
     * FIX: Zero the W lane with a bitwise AND instead of multiplication
     * by 0.0f. Multiplication propagates NaN/Inf; AND does not.
     *
     * Mask layout (little-endian lanes): [~0, ~0, ~0, 0]
     *                                      X   Y   Z  W
     */
    const __m128i imask = _mm_set_epi32(0, ~0, ~0, ~0);
    mul = _mm_and_ps(mul, _mm_castsi128_ps(imask));

    return simd_hadd(mul);
#endif
#elif defined(SIMD_ARCH_ARM)
    simd_vec_t mul = vmulq_f32(a, b);
    mul = vsetq_lane_f32(0.0f, mul, 3); /* zero W lane (no NaN risk: 0 literal) */
#if defined(SIMD_ARCH_ARM64)
    return vaddvq_f32(mul);
#else
    float32x2_t r = vadd_f32(vget_high_f32(mul), vget_low_f32(mul));
    r = vpadd_f32(r, r);
    return vget_lane_f32(r, 0);
#endif
#else
    return a.f[0] * b.f[0] + a.f[1] * b.f[1] + a.f[2] * b.f[2];
#endif
}

static inline float simd_dot4(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
    /* _mm_dp_ps mask 0xF1: SrcMask=1111 (xyzw), DstMask=0001 (store in x) */
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
#else
    return simd_hadd(_mm_mul_ps(a, b));
#endif
#elif defined(SIMD_ARCH_ARM)
    simd_vec_t mul = vmulq_f32(a, b);
#if defined(SIMD_ARCH_ARM64)
    return vaddvq_f32(mul);
#else
    float32x2_t r = vadd_f32(vget_high_f32(mul), vget_low_f32(mul));
    r = vpadd_f32(r, r);
    return vget_lane_f32(r, 0);
#endif
#else
    return a.f[0] * b.f[0] + a.f[1] * b.f[1] + a.f[2] * b.f[2] + a.f[3] * b.f[3];
#endif
}

/* =========================================================
   3D Vector Operations
   ========================================================= */

static inline simd_vec_t simd_cross(simd_vec_t a, simd_vec_t b) {
#if defined(SIMD_ARCH_X86)
    /*
     * Efficient shuffle method for SSE:
     * 1. Shuffle A to [y, z, x]
     * 2. Shuffle B to [z, x, y]
     * 3. Mul
     * 4. Subtract the reverse shuffle multiplication
     */
    __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 a_zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
    __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));

    __m128 mul1 = _mm_mul_ps(a_yzx, b_zxy);
    __m128 mul2 = _mm_mul_ps(a_zxy, b_yzx);

    return _mm_sub_ps(mul1, mul2);
#elif defined(SIMD_ARCH_ARM)
    /*
     * ARM implementation uses a manual float shuffle approach for clarity
     * and because NEON shuffles (tbl) can be high latency on older chips.
     */
    float temp_a[4], temp_b[4];
    vst1q_f32(temp_a, a);
    vst1q_f32(temp_b, b);

    float result[4];
    result[0] = temp_a[1] * temp_b[2] - temp_a[2] * temp_b[1];
    result[1] = temp_a[2] * temp_b[0] - temp_a[0] * temp_b[2];
    result[2] = temp_a[0] * temp_b[1] - temp_a[1] * temp_b[0];
    result[3] = 0.0f;

    return vld1q_f32(result);
#else
    return (simd_vec_t){{a.f[1] * b.f[2] - a.f[2] * b.f[1], a.f[2] * b.f[0] - a.f[0] * b.f[2],
                         a.f[0] * b.f[1] - a.f[1] * b.f[0], 0.0f}};
#endif
}

static inline float simd_length_sq3(simd_vec_t v) { return simd_dot3(v, v); }

static inline float simd_length3(simd_vec_t v) { return sqrtf(simd_dot3(v, v)); }

static inline float simd_length_sq4(simd_vec_t v) { return simd_dot4(v, v); }

static inline float simd_length4(simd_vec_t v) { return sqrtf(simd_dot4(v, v)); }

static inline simd_vec_t simd_normalize3(simd_vec_t v) {
    float len_sq = simd_dot3(v, v);
    if (len_sq > 0.0f) {
        float inv_len = 1.0f / sqrtf(len_sq);
        simd_vec_t scale = simd_set1(inv_len);
        simd_vec_t result = simd_mul(v, scale);

        /* Restore original W component */
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
        return _mm_blend_ps(result, v, 0x8); // Mask 1000: copy W from v
#else
        /* SSE2 fallback: keep X/Y/Z from result, W from v */
        const __m128 wmask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
        return _mm_or_ps(_mm_andnot_ps(wmask, result), _mm_and_ps(wmask, v));
#endif
#elif defined(SIMD_ARCH_ARM)
        return vsetq_lane_f32(vgetq_lane_f32(v, 3), result, 3);
#else
        result.f[3] = v.f[3];
        return result;
#endif
    }
    return v;
}

static inline simd_vec_t simd_normalize4(simd_vec_t v) {
    float len_sq = simd_dot4(v, v);
    if (len_sq > 0.0f) {
        float inv_len = 1.0f / sqrtf(len_sq);
        return simd_mul(v, simd_set1(inv_len));
    }
    return v;
}

static inline simd_vec_t simd_normalize3_fast(simd_vec_t v) {
    simd_vec_t len_sq = simd_set1(simd_dot3(v, v));
    simd_vec_t inv_len = simd_rsqrt(len_sq);
    simd_vec_t result = simd_mul(v, inv_len);

    /* Restore W */
#if defined(SIMD_ARCH_X86)
#ifdef SIMD_HAS_SSE41
    return _mm_blend_ps(result, v, 0x8);
#else
    const __m128 wmask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
    return _mm_or_ps(_mm_andnot_ps(wmask, result), _mm_and_ps(wmask, v));
#endif
#elif defined(SIMD_ARCH_ARM)
    return vsetq_lane_f32(vgetq_lane_f32(v, 3), result, 3);
#else
    result.f[3] = v.f[3];
    return result;
#endif
}
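
/*
 * Accuracy note (illustrative): simd_normalize3_fast relies on the hardware
 * reciprocal-square-root estimate (_mm_rsqrt_ps / vrsqrteq_f32), which is only
 * accurate to roughly 12 bits on SSE (less on some NEON parts) and does not
 * guard against a zero-length input. One Newton-Raphson step recovers close to
 * full float precision if needed:
 *
 *   // y = y * 0.5 * (3 - len_sq * y * y)
 *   simd_vec_t half  = simd_set1(0.5f);
 *   simd_vec_t three = simd_set1(3.0f);
 *   inv_len = simd_mul(simd_mul(half, inv_len),
 *                      simd_sub(three, simd_mul(len_sq, simd_mul(inv_len, inv_len))));
 *
 * Prefer simd_normalize3 when exact results matter.
 */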

/* =========================================================
   Comparison Helper
   ========================================================= */

static inline bool simd_equals_eps(simd_vec_t a, simd_vec_t b, float epsilon) {
    simd_vec_t sub = simd_sub(a, b);

#if defined(SIMD_ARCH_X86)
    /* Abs(sub) < epsilon */
    const __m128 sign_mask = _mm_set1_ps(-0.0f);
    __m128 abs_diff = _mm_andnot_ps(sign_mask, sub);
    __m128 eps_vec = _mm_set1_ps(epsilon);
    __m128 cmp = _mm_cmplt_ps(abs_diff, eps_vec);

    /* _mm_movemask_ps returns an int where bits 0-3 correspond to vector lanes */
    return _mm_movemask_ps(cmp) == 0xF;
#elif defined(SIMD_ARCH_ARM)
    simd_vec_t abs_diff = vabsq_f32(sub);
    simd_vec_t eps_vec = vdupq_n_f32(epsilon);
    uint32x4_t cmp = vcltq_f32(abs_diff, eps_vec);

    /* Check if all bits are set in the comparison result */
    uint32x2_t min = vmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
    return vget_lane_u32(min, 0) == 0xFFFFFFFF && vget_lane_u32(min, 1) == 0xFFFFFFFF;
#else
    return fabsf(a.f[0] - b.f[0]) < epsilon && fabsf(a.f[1] - b.f[1]) < epsilon && fabsf(a.f[2] - b.f[2]) < epsilon &&
           fabsf(a.f[3] - b.f[3]) < epsilon;
#endif
}

/* =========================================================
   Swizzling (Rearranging Components)
   ========================================================= */

/* Indices for readability */
#define SIMD_X 0
#define SIMD_Y 1
#define SIMD_Z 2
#define SIMD_W 3

#if defined(SIMD_ARCH_X86)

/*
 * _MM_SHUFFLE(w, z, y, x) takes indices in reverse order.
 * We pass v for both inputs to shuffle from the same vector.
 */
#define simd_swizzle(v, x, y, z, w) _mm_shuffle_ps((v), (v), _MM_SHUFFLE((w), (z), (y), (x)))

#elif defined(SIMD_ARCH_ARM)

/*
 * Clang, and GCC 12 or newer (which covers the vast majority of Android, iOS
 * and Linux ARM toolchains), support __builtin_shufflevector. It maps to the
 * optimal NEON instruction (usually vtbl, vext, or simple moves).
 */
#if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12)
#define simd_swizzle(v, x, y, z, w) __builtin_shufflevector((v), (v), (x), (y), (z), (w))
#else
/* MSVC ARM, older GCC, or other compilers: fall back to slow extract/set */
static inline simd_vec_t simd_swizzle_fallback(simd_vec_t v, int i0, int i1, int i2, int i3) {
    float d[4];
    vst1q_f32(d, v);
    float r[4] = {d[i0], d[i1], d[i2], d[i3]};
    return vld1q_f32(r);
}
#define simd_swizzle(v, x, y, z, w) simd_swizzle_fallback((v), (x), (y), (z), (w))
#endif

#else

/* Scalar fallback */
#define simd_swizzle(v, x, y, z, w) ((simd_vec_t){{(v).f[(x)], (v).f[(y)], (v).f[(z)], (v).f[(w)]}})

#endif

/* Broadcast macros */
#define simd_splat_x(v) simd_swizzle(v, 0, 0, 0, 0)
#define simd_splat_y(v) simd_swizzle(v, 1, 1, 1, 1)
#define simd_splat_z(v) simd_swizzle(v, 2, 2, 2, 2)
#define simd_splat_w(v) simd_swizzle(v, 3, 3, 3, 3)

/* Common rotations */
#define simd_yzxw(v) simd_swizzle(v, 1, 2, 0, 3) // Useful for cross product
#define simd_zxyw(v) simd_swizzle(v, 2, 0, 1, 3) // Useful for cross product
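
/*
 * Example (illustrative): indices are given in x, y, z, w order on every
 * backend, so the following reverses the lanes and broadcasts lane 1:
 *
 *   simd_vec_t v   = simd_set(1.0f, 2.0f, 3.0f, 4.0f);
 *   simd_vec_t rev = simd_swizzle(v, SIMD_W, SIMD_Z, SIMD_Y, SIMD_X); // {4, 3, 2, 1}
 *   simd_vec_t yy  = simd_splat_y(v);                                 // {2, 2, 2, 2}
 *
 * Note that the indices must be compile-time constants on the SIMD backends.
 */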

/* =========================================================
   Matrix Helpers
   ========================================================= */

static inline float simd_get_x(simd_vec_t v) {
#if defined(SIMD_ARCH_X86)
    return _mm_cvtss_f32(v);
#elif defined(SIMD_ARCH_ARM)
    return vgetq_lane_f32(v, 0);
#else
    return v.f[0];
#endif
}

static inline bool simd_check_all(simd_vec_t mask) {
#if defined(SIMD_ARCH_X86)
    return _mm_movemask_ps(mask) == 0xF;
#elif defined(SIMD_ARCH_ARM)
    // Reduce with an unsigned min, then require both halves to be all-ones
    uint32x4_t u = vreinterpretq_u32_f32(mask);
    uint32x2_t min = vmin_u32(vget_low_u32(u), vget_high_u32(u));
    return (vget_lane_u32(min, 0) & vget_lane_u32(min, 1)) == 0xFFFFFFFF;
#else
    uint32_t bits[4];
    memcpy(bits, mask.f, sizeof(bits));
    return bits[0] && bits[1] && bits[2] && bits[3];
#endif
}

#if defined(SIMD_ARCH_X86)
#define simd_transpose4(r0, r1, r2, r3) _MM_TRANSPOSE4_PS(r0, r1, r2, r3)
#elif defined(SIMD_ARCH_ARM)
#define simd_transpose4(r0, r1, r2, r3)                                          \
    do {                                                                         \
        float32x4x2_t t0 = vtrnq_f32(r0, r1);                                    \
        float32x4x2_t t1 = vtrnq_f32(r2, r3);                                    \
        r0 = vcombine_f32(vget_low_f32(t0.val[0]), vget_low_f32(t1.val[0]));     \
        r1 = vcombine_f32(vget_low_f32(t0.val[1]), vget_low_f32(t1.val[1]));     \
        r2 = vcombine_f32(vget_high_f32(t0.val[0]), vget_high_f32(t1.val[0]));   \
        r3 = vcombine_f32(vget_high_f32(t0.val[1]), vget_high_f32(t1.val[1]));   \
    } while (0)
#else
#define simd_transpose4(r0, r1, r2, r3)                              \
    do {                                                             \
        simd_vec_t t0 = r0, t1 = r1, t2 = r2, t3 = r3;               \
        r0 = (simd_vec_t){{t0.f[0], t1.f[0], t2.f[0], t3.f[0]}};     \
        r1 = (simd_vec_t){{t0.f[1], t1.f[1], t2.f[1], t3.f[1]}};     \
        r2 = (simd_vec_t){{t0.f[2], t1.f[2], t2.f[2], t3.f[2]}};     \
        r3 = (simd_vec_t){{t0.f[3], t1.f[3], t2.f[3], t3.f[3]}};     \
    } while (0)
#endif
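
/*
 * Example (illustrative): transposing a row-major 4x4 matrix in place.
 * Assumes `m` points to 16 contiguous floats (four rows of four).
 *
 *   void mat4_transpose(float* m) {
 *       simd_vec_t r0 = simd_load(m + 0);
 *       simd_vec_t r1 = simd_load(m + 4);
 *       simd_vec_t r2 = simd_load(m + 8);
 *       simd_vec_t r3 = simd_load(m + 12);
 *       simd_transpose4(r0, r1, r2, r3);   // rows become columns
 *       simd_store(m + 0, r0);
 *       simd_store(m + 4, r1);
 *       simd_store(m + 8, r2);
 *       simd_store(m + 12, r3);
 *   }
 */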

#endif /* SIMD_H */