23 #ifdef BT_USE_DOUBLE_PRECISION
24 #define btVector3Data btVector3DoubleData
25 #define btVector3DataName "btVector3DoubleData"
27 #define btVector3Data btVector3FloatData
28 #define btVector3DataName "btVector3FloatData"
29 #endif //BT_USE_DOUBLE_PRECISION
31 #if defined BT_USE_SSE
36 #pragma warning(disable : 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
39 #define BT_SHUFFLE(x, y, z, w) (((w) << 6 | (z) << 4 | (y) << 2 | (x)) & 0xff)
41 #define bt_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask))
42 #define bt_splat3_ps(_a, _i) bt_pshufd_ps((_a), BT_SHUFFLE(_i, _i, _i, 3))
43 #define bt_splat_ps(_a, _i) bt_pshufd_ps((_a), BT_SHUFFLE(_i, _i, _i, _i))
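// BT_SHUFFLE packs four 2-bit lane indices into an _mm_shuffle_ps immediate (same
// encoding as _MM_SHUFFLE, with arguments given in x,y,z,w order). bt_splat3_ps(a, i)
// broadcasts lane i into x/y/z while w takes lane 3; bt_splat_ps(a, i) broadcasts
// lane i into all four lanes.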
45 #define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
46 #define btvAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
47 #define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
48 #define btv3AbsfMask btCastiTo128f(btv3AbsiMask)
49 #define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
50 #define btvxyzMaskf btvFFF0fMask
51 #define btvAbsfMask btCastiTo128f(btvAbsMask)
54 #define btvMzeroMask (_mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f))
55 #define v1110 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f))
56 #define vHalf (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f))
57 #define v1_5 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f))
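// Helper constants for the SSE code below: btvMzeroMask flips sign bits via XOR,
// v1110 supplies the 1.0f numerator for reciprocals (_mm_div_ss(v1110, x)), and
// vHalf / v1_5 feed the Newton-Raphson refinement of _mm_rsqrt_ss in normalize().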
68 const float32x4_t ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
69 const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){static_cast<int32_t>(0xFFFFFFFF),
70 static_cast<int32_t>(0xFFFFFFFF), static_cast<int32_t>(0xFFFFFFFF), 0x0};
71 const int32x4_t ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
72 const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
86 #if defined(__SPU__) && defined(__CELLOS_LV2__)
92 return *((const vec_float4*)&m_floats[0]);
96 #else //__CELLOS_LV2__ __SPU__
97 #if defined(BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM
113 #endif //__CELLOS_LV2__ __SPU__
134 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
144 mVec128 = rhs.mVec128;
155 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
161 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
162 mVec128 = _mm_add_ps(mVec128, v.mVec128);
163 #elif defined(BT_USE_NEON)
164 mVec128 = vaddq_f32(mVec128, v.mVec128);
177 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
178 mVec128 = _mm_sub_ps(mVec128, v.mVec128);
179 #elif defined(BT_USE_NEON)
180 mVec128 = vsubq_f32(mVec128, v.mVec128);
193 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
194 __m128 vs = _mm_load_ss(&s);
195 vs = bt_pshufd_ps(vs, 0x80);
196 mVec128 = _mm_mul_ps(mVec128, vs);
197 #elif defined(BT_USE_NEON)
198 mVec128 = vmulq_n_f32(mVec128, s);
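// operator*=(btScalar): the SSE path loads s into lane 0 and shuffles with
// 0x80 == BT_SHUFFLE(0, 0, 0, 2), giving (s, s, s, 0) so the unused w lane stays
// zero; the NEON path multiplies by the scalar directly with vmulq_n_f32.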
213 #if 0 //defined(BT_USE_SSE_IN_API)
215 __m128 vs = _mm_load_ss(&s);
216 vs = _mm_div_ss(v1110, vs);
217 vs = bt_pshufd_ps(vs, 0x00);
219 mVec128 = _mm_mul_ps(mVec128, vs);
231 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
232 __m128 vd = _mm_mul_ps(mVec128, v.mVec128);
233 __m128 z = _mm_movehl_ps(vd, vd);
234 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
235 vd = _mm_add_ss(vd, y);
236 vd = _mm_add_ss(vd, z);
237 return _mm_cvtss_f32(vd);
238 #elif defined(BT_USE_NEON)
239 float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
240 float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));
241 x = vadd_f32(x, vget_high_f32(vd));
242 return vget_lane_f32(x, 0);
244 return m_floats[0] * v.m_floats[0] +
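// dot(): vd holds the component-wise products; the 0x55 shuffle and _mm_movehl_ps
// bring lanes 1 and 2 down to lane 0 so two _mm_add_ss calls yield x+y+z, extracted
// with _mm_cvtss_f32. The NEON path does the same reduction with a pairwise add.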
307 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
309 __m128 vd = _mm_mul_ps(mVec128, mVec128);
310 __m128 z = _mm_movehl_ps(vd, vd);
311 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
312 vd = _mm_add_ss(vd, y);
313 vd = _mm_add_ss(vd, z);
316 vd = _mm_sqrt_ss(vd);
317 vd = _mm_div_ss(v1110, vd);
318 vd = bt_splat_ps(vd, 0x80);
319 mVec128 = _mm_mul_ps(mVec128, vd);
323 y = _mm_rsqrt_ss(vd);
327 vd = _mm_mul_ss(vd, vHalf);
329 vd = _mm_mul_ss(vd, y);
330 vd = _mm_mul_ss(vd, y);
331 z = _mm_sub_ss(z, vd);
333 y = _mm_mul_ss(y, z);
335 y = bt_splat_ps(y, 0x80);
336 mVec128 = _mm_mul_ps(mVec128, y);
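// normalize(): lane 0 of vd is the squared length. The block under "#if 0"
// (lines 316-319) is the exact sqrt/divide variant; the live path takes an
// _mm_rsqrt_ss estimate and applies one Newton-Raphson step,
// y = y * (1.5 - 0.5 * len2 * y * y), then splats y across x/y/z and scales.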
366 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
367 return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
368 #elif defined(BT_USE_NEON)
382 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
385 T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));
386 V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));
388 V = _mm_mul_ps(V, mVec128);
389 T = _mm_mul_ps(T, v.mVec128);
390 V = _mm_sub_ps(V, T);
392 V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
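// cross(): rotate both operands to (y, z, x, w), multiply each against the other
// unrotated vector, subtract, then rotate the difference the same way once more;
// this yields (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x) using only
// three shuffles.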
394 #elif defined(BT_USE_NEON)
397 float32x2_t Tlow = vget_low_f32(mVec128);
398 float32x2_t Vlow = vget_low_f32(v.mVec128);
399 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
400 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
402 V = vmulq_f32(V, mVec128);
403 T = vmulq_f32(T, v.mVec128);
405 Vlow = vget_low_f32(V);
407 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
408 V = (float32x4_t)vandq_s32((int32x4_t)V, btvFFF0Mask);
421 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
423 __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));
424 __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));
426 V = _mm_mul_ps(V, v1.mVec128);
427 T = _mm_mul_ps(T, v2.mVec128);
428 V = _mm_sub_ps(V, T);
430 V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));
433 V = _mm_mul_ps(V, mVec128);
434 __m128 z = _mm_movehl_ps(V, V);
435 __m128 y = _mm_shuffle_ps(V, V, 0x55);
436 V = _mm_add_ss(V, y);
437 V = _mm_add_ss(V, z);
438 return _mm_cvtss_f32(V);
440 #elif defined(BT_USE_NEON)
444 float32x2_t Tlow = vget_low_f32(v1.mVec128);
445 float32x2_t Vlow = vget_low_f32(v2.mVec128);
446 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
447 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
449 V = vmulq_f32(V, v1.mVec128);
450 T = vmulq_f32(T, v2.mVec128);
452 Vlow = vget_low_f32(V);
454 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
457 V = vmulq_f32(mVec128, V);
458 float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
459 x = vadd_f32(x, vget_high_f32(V));
460 return vget_lane_f32(x, 0);
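// triple(v1, v2): the scalar triple product this->dot(v1.cross(v2)). The first half
// reuses the cross-product shuffle sequence on v1 and v2, the second half is the
// same horizontal add used by dot().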
472 return m_floats[0] < m_floats[1] ? (m_floats[0] < m_floats[2] ? 0 : 2) : (m_floats[1] < m_floats[2] ? 1 : 2);
479 return m_floats[0] < m_floats[1] ? (m_floats[1] < m_floats[2] ? 2 : 1) : (m_floats[0] < m_floats[2] ? 2 : 0);
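// minAxis()/maxAxis() return the index (0, 1 or 2) of the smallest/largest component;
// furthestAxis() and closestAxis() below apply them to absolute().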
484 return absolute().minAxis();
489 return absolute().maxAxis();
494 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
495 __m128 vrt = _mm_load_ss(&rt);
497 __m128 vs = _mm_load_ss(&s);
498 vs = bt_pshufd_ps(vs, 0x80);
499 __m128 r0 = _mm_mul_ps(v0.mVec128, vs);
500 vrt = bt_pshufd_ps(vrt, 0x80);
501 __m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
502 __m128 tmp3 = _mm_add_ps(r0, r1);
504 #elif defined(BT_USE_NEON)
505 float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
506 vl = vmulq_n_f32(vl, rt);
507 mVec128 = vaddq_f32(vl, v0.mVec128);
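// setInterpolate3(v0, v1, rt): the SSE path blends s*v0 + rt*v1 with s = 1 - rt
// (computed on a line not shown in this excerpt); the NEON path folds it into
// v0 + rt*(v1 - v0).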
523 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
524 __m128 vt = _mm_load_ss(&t);
525 vt = bt_pshufd_ps(vt, 0x80);
526 __m128 vl = _mm_sub_ps(v.mVec128, mVec128);
527 vl = _mm_mul_ps(vl, vt);
528 vl = _mm_add_ps(vl, mVec128);
531 #elif defined(BT_USE_NEON)
532 float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
533 vl = vmulq_n_f32(vl, t);
534 vl = vaddq_f32(vl, mVec128);
539 m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
540 m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
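// lerp(v, t): same interpolation, returning a new vector; the SIMD paths compute
// this + t*(v - this) and the scalar path above spells it out per component.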
548 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
549 mVec128 = _mm_mul_ps(mVec128, v.mVec128);
550 #elif defined(BT_USE_NEON)
551 mVec128 = vmulq_f32(mVec128, v.mVec128);
591 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
592 return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
594 return ((m_floats[3] == other.m_floats[3]) &&
595 (m_floats[2] == other.m_floats[2]) &&
596 (m_floats[1] == other.m_floats[1]) &&
597 (m_floats[0] == other.m_floats[0]));
603 return !(*this == other);
611 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
612 mVec128 = _mm_max_ps(mVec128, other.mVec128);
613 #elif defined(BT_USE_NEON)
614 mVec128 = vmaxq_f32(mVec128, other.mVec128);
628 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
629 mVec128 = _mm_min_ps(mVec128, other.mVec128);
630 #elif defined(BT_USE_NEON)
631 mVec128 = vminq_f32(mVec128, other.mVec128);
650 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
652 __m128 V = _mm_and_ps(mVec128, btvFFF0fMask);
653 __m128 V0 = _mm_xor_ps(btvMzeroMask, V);
654 __m128 V2 = _mm_movelh_ps(V0, V);
656 __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
658 V0 = _mm_shuffle_ps(V0, V, 0xDB);
659 V2 = _mm_shuffle_ps(V2, V, 0xF9);
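// getSkewSymmetricMatrix(v0, v1, v2): fills the three rows of the skew-symmetric
// cross-product matrix of this vector (so that matrix * w == this->cross(w));
// V0 holds the negated copy and the shuffles assemble each output row from the
// positive and negated components.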
673 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
674 mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
675 #elif defined(BT_USE_NEON)
676 int32x4_t vi = vdupq_n_s32(0);
677 mVec128 = vreinterpretq_f32_s32(vi);
722 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
724 __m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128);
725 __m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128);
726 __m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128);
727 __m128 b0 = _mm_unpacklo_ps(a0, a1);
728 __m128 b1 = _mm_unpackhi_ps(a0, a1);
729 __m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps());
730 __m128 r = _mm_movelh_ps(b0, b2);
731 r = _mm_add_ps(r, _mm_movehl_ps(b2, b0));
732 a2 = _mm_and_ps(a2, btvxyzMaskf);
733 r = _mm_add_ps(r, btCastdTo128f(_mm_move_sd(btCastfTo128d(a2), btCastfTo128d(b1))));
736 #elif defined(BT_USE_NEON)
737 static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0};
738 float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128);
739 float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128);
740 float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128);
741 float32x2x2_t zLo = vtrn_f32(vget_high_f32(a0), vget_high_f32(a1));
742 a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask);
743 float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]);
744 float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
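// dot3(v0, v1, v2): returns btVector3(dot(v0), dot(v1), dot(v2)); the unpack /
// movehl / move_sd sequence is a three-wide horizontal add of the product vectors
// a0, a1, a2.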
756 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
757 return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
758 #elif defined(BT_USE_NEON)
759 return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
772 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
773 return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
774 #elif defined(BT_USE_NEON)
775 return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
788 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
791 __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
792 return btVector3(_mm_and_ps(r, btvFFF0fMask));
793 #elif defined(BT_USE_NEON)
794 float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
795 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
808 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
809 __m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
810 return btVector3(_mm_and_ps(r, btvFFF0fMask));
811 #elif defined(BT_USE_NEON)
812 return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
822 #if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
823 __m128 vs = _mm_load_ss(&s);
824 vs = bt_pshufd_ps(vs, 0x80);
825 return btVector3(_mm_mul_ps(v.mVec128, vs));
826 #elif defined(BT_USE_NEON)
827 float32x4_t r = vmulq_n_f32(v.mVec128, s);
828 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
846 #if 0 //defined(BT_USE_SSE_IN_API)
848 __m128 vs = _mm_load_ss(&s);
849 vs = _mm_div_ss(v1110, vs);
850 vs = bt_pshufd_ps(vs, 0x00);
852 return btVector3(_mm_mul_ps(v.mVec128, vs));
862 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
863 __m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
864 vec = _mm_and_ps(vec, btvFFF0fMask);
866 #elif defined(BT_USE_NEON)
867 float32x4_t x, y, v, m;
873 m = vrecpsq_f32(y, v);
875 m = vrecpsq_f32(y, v);
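// operator/(v1, v2): SSE divides directly with _mm_div_ps and masks off w. The NEON
// path instead starts from a vrecpeq_f32 estimate of 1/v2 (not shown in this
// excerpt); each vrecpsq_f32 call above produces one Newton-Raphson correction
// factor, applied by a following vmulq_f32, before the result is multiplied by v1.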
936 return v1.lerp(v2, t);
941 return (v - *this).length2();
946 return (v - *this).length();
960 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
962 __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
964 __m128 C = wAxis.cross(mVec128).mVec128;
965 O = _mm_and_ps(O, btvFFF0fMask);
968 __m128 vsin = _mm_load_ss(&ssin);
969 __m128 vcos = _mm_load_ss(&scos);
971 __m128 Y = bt_pshufd_ps(O, 0xC9);
972 __m128 Z = bt_pshufd_ps(O, 0xD2);
973 O = _mm_add_ps(O, Y);
974 vsin = bt_pshufd_ps(vsin, 0x80);
975 O = _mm_add_ps(O, Z);
976 vcos = bt_pshufd_ps(vcos, 0x80);
979 O = O * wAxis.mVec128;
980 __m128 X = mVec128 - O;
992 _y = wAxis.cross(*this);
994 return (o + _x * btCos(_angle) + _y * btSin(_angle));
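// rotate(wAxis, angle): axis/angle rotation of this vector about the unit axis
// wAxis. In the SSE path O ends up as the projection onto the axis, X is the
// perpendicular remainder and C = wAxis x this, so the result is
// O + X*cos(angle) + C*sin(angle); the scalar fallback at line 994 is the same
// formula.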
1000 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1001 #if defined _WIN32 || defined(BT_USE_SSE)
1002 const long scalar_cutoff = 10;
1003 long _maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1004 #elif defined BT_USE_NEON
1005 const long scalar_cutoff = 4;
1006 extern long (*_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1008 if (array_count < scalar_cutoff)
1014 for (i = 0; i < array_count; i++)
1028 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1029 return _maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
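// maxDot(array, array_count, dotOut): short arrays use the scalar loop; beyond
// scalar_cutoff the search is handed to the platform-specific _maxdot_large kernel,
// which returns the index of the array element with the largest dot product against
// this vector and stores that dot product in dotOut.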
1035 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1036 #if defined BT_USE_SSE
1037 const long scalar_cutoff = 10;
1038 long _mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1039 #elif defined BT_USE_NEON
1040 const long scalar_cutoff = 4;
1041 extern long (*_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1043 #error unhandled arch!
1046 if (array_count < scalar_cutoff)
1053 for (i = 0; i < array_count; i++)
1068 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined(BT_USE_NEON)
1069 return _mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
1070 #endif //BT_USE_SIMD_VECTOR3
1084 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
1092 mVec128 = rhs.mVec128;
1098 mVec128 = v.mVec128;
1101 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1105 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
1106 return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
1107 #elif defined(BT_USE_NEON)
1210 #ifdef BT_USE_DOUBLE_PRECISION
1211 unsigned char* dest = (unsigned char*)&destVal;
1212 const unsigned char* src = (const unsigned char*)&sourceVal;
1222 unsigned char* dest = (unsigned char*)&destVal;
1223 const unsigned char* src = (const unsigned char*)&sourceVal;
1228 #endif //BT_USE_DOUBLE_PRECISION
1233 for (int i = 0; i < 4; i++)
1243 for (int i = 0; i < 4; i++)
1247 vector = swappedVec;
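// btSwapScalarEndian copies sourceVal into destVal byte-reversed (8 bytes in double
// precision builds, 4 otherwise); btSwapVector3Endian / btUnSwapVector3Endian apply
// it to all four components, the un-swap variant writing the result back through
// "vector = swappedVec".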
1256 btScalar a = n[1] * n[1] + n[2] * n[2];
1263 q[1] = -n[0] * p[2];
1269 btScalar a = n[0] * n[0] + n[1] * n[1];
1275 q[0] = -n[2] * p[1];
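// btPlaneSpace1(n, p, q): builds two vectors p and q spanning the plane orthogonal
// to the normal n (unit length when n is unit length). The branch on |n.z| picks
// which pair of components to normalize so that k = btRecipSqrt(a) never divides by
// a value near zero.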
1294 for (int i = 0; i < 4; i++)
1300 for (int i = 0; i < 4; i++)
1307 for (int i = 0; i < 4; i++)
1313 for (int i = 0; i < 4; i++)
1320 for (int i = 0; i < 4; i++)
1326 for (int i = 0; i < 4; i++)
1332 for (int i = 0; i < 4; i++)
1336 #endif //BT_VECTOR3_H