#ifndef BT_SIMD__QUATERNION_H_
#define BT_SIMD__QUATERNION_H_
#ifdef BT_USE_DOUBLE_PRECISION
#define btQuaternionData btQuaternionDoubleData
#define btQuaternionDataName "btQuaternionDoubleData"
#else
#define btQuaternionData btQuaternionFloatData
#define btQuaternionDataName "btQuaternionFloatData"
#endif  //BT_USE_DOUBLE_PRECISION
#define vOnes (_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f))

#if defined(BT_USE_SSE)

#define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f))
#define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f))
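// _mm_set_ps lists lanes from high (w) to low (x), so vQInv carries a sign bit
// in the x, y and z lanes only, and vPPPM only in the w lane. XOR-ing a
// quaternion with one of these masks negates just those components, which is
// how the conjugate/inverse and the product's w term are formed below.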
#elif defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)

	mVec128 = rhs.mVec128;

#ifndef BT_EULER_DEFAULT_ZYX
	setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
		cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
		sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
		cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
	setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
		cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
		cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
		cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
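	/* Sanity check for the ZYX ordering above (illustrative, assuming sinRoll,
	   cosYaw, etc. are the usual half-angle sines and cosines): with zero roll
	   and pitch every term containing sinRoll or sinPitch vanishes, leaving a
	   pure yaw rotation about Z,
	       (x, y, z, w) = (0, 0, sin(yaw / 2), cos(yaw / 2)). */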
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	mVec128 = _mm_add_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
	mVec128 = vaddq_f32(mVec128, q.mVec128);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	mVec128 = _mm_sub_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
	mVec128 = vsubq_f32(mVec128, q.mVec128);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vs = _mm_load_ss(&s);
	vs = bt_pshufd_ps(vs, 0);
	mVec128 = _mm_mul_ps(mVec128, vs);
#elif defined(BT_USE_NEON)
	mVec128 = vmulq_n_f32(mVec128, s);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vQ2 = q.get128();

	__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0, 1, 2, 0));
	__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0));

	__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 1));
	__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));

	B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2, 0, 1, 2));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));

	mVec128 = bt_splat_ps(mVec128, 3);
	mVec128 = mVec128 * vQ2;

	mVec128 = mVec128 - B1;
	A1 = _mm_xor_ps(A1, vPPPM);
	mVec128 = mVec128 + A1;
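	// The shuffles above line up the lanes of the Hamilton product
	//   (x, y, z, w)' = (w1*x2 + x1*w2 + y1*z2 - z1*y2, ..., w1*w2 - x1*x2 - y1*y2 - z1*z2):
	// the splat of lane 3 contributes the w1 * q2 term, the shuffled pair
	// products add the positive cross terms and subtract the negative ones,
	// and the XOR with vPPPM flips only the w lane of A1 so the dot-product
	// part enters with a minus sign.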
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = mVec128;
	float32x4_t vQ2 = q.get128();
	float32x4_t A0, A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;

	tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));
	tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));

	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);

	A1 = vaddq_f32(A1, A2);
	A0 = vsubq_f32(A0, A3);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A0 = vaddq_f32(A0, A1);
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	vd = _mm_mul_ps(mVec128, q.mVec128);

	__m128 t = _mm_movehl_ps(vd, vd);
	vd = _mm_add_ps(vd, t);
	t = _mm_shuffle_ps(vd, vd, 0x55);
	vd = _mm_add_ss(vd, t);

	return _mm_cvtss_f32(vd);
#elif defined(BT_USE_NEON)
	float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
	float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));
	x = vpadd_f32(x, x);
	return vget_lane_f32(x, 0);
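	// Both branches do a horizontal add of the four lane-wise products:
	// SSE folds the high pair onto the low pair and then adds the shuffled
	// second lane; NEON reduces with two pairwise adds and reads lane 0.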
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	vd = _mm_mul_ps(mVec128, mVec128);

	__m128 t = _mm_movehl_ps(vd, vd);
	vd = _mm_add_ps(vd, t);
	t = _mm_shuffle_ps(vd, vd, 0x55);
	vd = _mm_add_ss(vd, t);

	vd = _mm_sqrt_ss(vd);
	vd = _mm_div_ss(vOnes, vd);
	vd = bt_pshufd_ps(vd, 0);
	mVec128 = _mm_mul_ps(mVec128, vd);
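	// At this point vd holds 1/length() broadcast to all four lanes; the exact
	// _mm_sqrt_ss + _mm_div_ss pair is used rather than the faster but
	// approximate _mm_rsqrt_ss, so normalization keeps full float precision.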
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vs = _mm_load_ss(&s);
	vs = bt_pshufd_ps(vs, 0x00);
#elif defined(BT_USE_NEON)
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)
	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
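	// XOR with vQInv flips the sign bits of the x, y and z lanes only, i.e. it
	// forms the conjugate (-x, -y, -z, w); for unit quaternions this equals the inverse.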
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)
	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask));
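	// btvMzeroMask has the sign bit set in every lane, so this negates all four
	// components at once; note that -q represents the same rotation as q.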
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vQ1 = q1.get128();
	__m128 vQ2 = q2.get128();
	__m128 A0, A1, B1, A2, B2;

	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0, 1, 2, 0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));

	B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));

	A0 = bt_splat_ps(vQ1, 3);

	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = q1.get128();
	float32x4_t vQ2 = q2.get128();
	float32x4_t A0, A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;

	tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));
	tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));

	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);

	A1 = vaddq_f32(A1, A2);
	A0 = vsubq_f32(A0, A3);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A0 = vaddq_f32(A0, A1);
	q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
	q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
	q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
	q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z());
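/* Composition sketch (illustrative; quatRotate, the axis-angle constructor and
   SIMD_HALF_PI are assumed from elsewhere in this header and btScalar.h):
   rotating by the product q1 * q2 applies q2 first and then q1:

     btQuaternion q1(btVector3(0, 0, 1), SIMD_HALF_PI);  // 90 deg about Z
     btQuaternion q2(btVector3(1, 0, 0), SIMD_HALF_PI);  // 90 deg about X
     btVector3 v(0, 1, 0);
     btVector3 a = quatRotate(q1 * q2, v);  // == quatRotate(q1, quatRotate(q2, v))
*/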
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vQ1 = q.get128();
	__m128 vQ2 = w.get128();
	__m128 A1, B1, A2, B2, A3, B3;

	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3, 3, 3, 0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0, 1, 2, 0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));

	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));

	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = q.get128();
	float32x4_t vQ2 = w.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;

	vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);
	tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));
	tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));

	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx);
	B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);

	A1 = vaddq_f32(A1, A2);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);

	A1 = vsubq_f32(A1, A3);
	q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
	q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
	q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
	-q.x() * w.x() - q.y() * w.y() - q.z() * w.z());
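	// The vector w is treated as a pure quaternion (zero w component), so the
	// terms of the full Hamilton product that would multiply by it drop out;
	// this is the q * v half of the q * v * q^-1 sandwich used to rotate vectors.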
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vQ1 = w.get128();
	__m128 vQ2 = q.get128();
	__m128 A1, B1, A2, B2, A3, B3;

	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0, 1, 2, 0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));

	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));

	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = w.get128();
	float32x4_t vQ2 = q.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;

	tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));
	tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));

	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);

	A1 = vaddq_f32(A1, A2);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);

	A1 = vsubq_f32(A1, A3);
	+w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
	+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
	+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
	-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
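	// Mirror image of the overload above: the pure-quaternion vector sits on the
	// left of the product, and since quaternion multiplication is non-commutative
	// the cross-product part of the result changes sign relative to q * w.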
	return q1.slerp(q2, t);
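/* Interpolation sketch (illustrative; assumes the wrapper above is the free
   function slerp(q1, q2, t), plus the axis-angle constructor and SIMD_HALF_PI
   from btScalar.h): slerp blends two orientations at constant angular speed,
   so t = 0.5 lands at the halfway rotation.

     btQuaternion from(btVector3(0, 0, 1), btScalar(0.0));  // identity
     btQuaternion to(btVector3(0, 0, 1), SIMD_HALF_PI);     // 90 deg about Z
     btQuaternion mid = slerp(from, to, btScalar(0.5));     // ~45 deg about Z
*/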
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
#elif defined(BT_USE_NEON)
	return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
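	// btvFFF0fMask / btvFFF0Mask keep the x, y and z lanes and zero the w lane,
	// turning the 128-bit quaternion register back into a btVector3 (in the full
	// header this is the tail of quatRotate, which discards the w of q * v * q^-1).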
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
#endif  //BT_SIMD__QUATERNION_H_