#ifndef BT_MATRIX3x3_H
#define BT_MATRIX3x3_H

#define vMPPP (_mm_set_ps(+0.0f, +0.0f, +0.0f, -0.0f))

#if defined(BT_USE_SSE)
#define v1000 (_mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f))
#define v0100 (_mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f))
#define v0010 (_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f))
#elif defined(BT_USE_NEON)
#ifdef BT_USE_DOUBLE_PRECISION
#define btMatrix3x3Data btMatrix3x3DoubleData
#else
#define btMatrix3x3Data btMatrix3x3FloatData
#endif  //BT_USE_DOUBLE_PRECISION
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
m_el[0].mVec128 = rhs.m_el[0].mVec128;
m_el[1].mVec128 = rhs.m_el[1].mVec128;
m_el[2].mVec128 = rhs.m_el[2].mVec128;

m_el[0].mVec128 = m.m_el[0].mVec128;
m_el[1].mVec128 = m.m_el[1].mVec128;
m_el[2].mVec128 = m.m_el[2].mVec128;

m_el[0] = other.m_el[0];
m_el[1] = other.m_el[1];
m_el[2] = other.m_el[2];

m_el[0] = other.m_el[0];
m_el[1] = other.m_el[1];
m_el[2] = other.m_el[2];
return btVector3(m_el[0][i], m_el[1][i], m_el[2][i]);

m_el[2].setValue(m[2], m[6], m[10]);
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vs, Q = q.get128();
__m128i Qi = btCastfTo128i(Q);
__m128 V11, V21, V31;
__m128 NQ = _mm_xor_ps(Q, btvMzeroMask);
__m128i NQi = btCastfTo128i(NQ);

V1 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 2, 3)));
V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0, 0, 1, 3));
V3 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(2, 1, 0, 3)));
V1 = _mm_xor_ps(V1, vMPPP);

V11 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 1, 0, 3)));
V21 = _mm_unpackhi_ps(Q, Q);
V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0, 2, 0, 3));

V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2, 3, 1, 3));
V21 = _mm_xor_ps(V21, vMPPP);
V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3, 3, 1, 3));
V31 = _mm_xor_ps(V31, vMPPP);
Y = btCastiTo128f(_mm_shuffle_epi32(NQi, BT_SHUFFLE(3, 2, 0, 3)));
Z = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 1, 3)));

vs = _mm_load_ss(&s);
vs = bt_splat3_ps(vs, 0);
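// Scalar path: the standard quaternion-to-rotation-matrix expansion. For a unit
// quaternion q = (x, y, z, w) the basis is
//   [ 1-2(y^2+z^2)   2(xy-wz)       2(xz+wy)     ]
//   [ 2(xy+wz)       1-2(x^2+z^2)   2(yz-wx)     ]
//   [ 2(xz-wy)       2(yz+wx)       1-2(x^2+y^2) ]
// The scale s (computed from the quaternion earlier in this function) folds the
// factor 2, plus a 1/|q|^2 normalization for non-unit input, into xs/ys/zs, so the
// rows built below can be handed to setValue() without further scaling.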
btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s;
btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs;
btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs;
btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs;

btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
setEulerZYX(roll, pitch, yaw);

setValue(cj * ch, sj * sc - cs, sj * cc + ss,
cj * sh, sj * ss + cc, sj * cs - sc,
-sj, cj * si, cj * ci);
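// The setValue above is the row-major expansion of Rz * Ry * Rx, where ci/si are
// cos/sin of the X rotation, cj/sj of the Y rotation, ch/sh of the Z rotation, and
// cc, cs, sc, ss are their pairwise products (e.g. cc = ci * ch). Note the telltale
// -sj in element [2][0], which getEulerZYX reads back further down.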
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
identityMatrix(v1000, v0100, v0010);

return identityMatrix;
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128;
__m128* vm = (__m128*)m;

v2 = _mm_and_ps(v2, btvFFF0fMask);
vT = _mm_unpackhi_ps(v0, v1);
v0 = _mm_unpacklo_ps(v0, v1);
v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));
v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));
v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));
#elif defined(BT_USE_NEON)
static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
float32x4_t* vm = (float32x4_t*)m;
float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);
float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));
float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
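// getRotation: convert this (assumed orthonormal) basis back to a quaternion.
// With a positive trace, the scalar part is recovered first (w = sqrt(trace + 1) / 2)
// and the vector part follows from the antisymmetric differences below; otherwise
// the largest diagonal element decides which quaternion component to solve for
// first, which keeps the square roots well conditioned.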
btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();

temp.f[0] = m_el[2].y() - m_el[1].z();
temp.f[1] = m_el[0].z() - m_el[2].x();
temp.f[2] = m_el[1].x() - m_el[0].y();
if (m_el[0].x() < m_el[1].y())
if (m_el[1].y() < m_el[2].z())
if (m_el[0].x() < m_el[2].z())

x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);

temp.f[3] = (m_el[k][j] - m_el[j][k]);
temp.f[j] = (m_el[j][i] + m_el[i][j]);
temp.f[k] = (m_el[k][i] + m_el[i][k]);
btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();

temp[0] = ((m_el[2].y() - m_el[1].z()) * s);
temp[1] = ((m_el[0].z() - m_el[2].x()) * s);
temp[2] = ((m_el[1].x() - m_el[0].y()) * s);

int i = m_el[0].x() < m_el[1].y() ? (m_el[1].y() < m_el[2].z() ? 2 : 1) : (m_el[0].x() < m_el[2].z() ? 2 : 0);

temp[3] = (m_el[k][j] - m_el[j][k]) * s;
temp[j] = (m_el[j][i] + m_el[i][j]) * s;
temp[k] = (m_el[k][i] + m_el[i][k]) * s;

q.setValue(temp[0], temp[1], temp[2], temp[3]);
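// Example (sketch): round-tripping a rotation through a quaternion:
//   btQuaternion q;
//   basis.getRotation(q);       // matrix -> quaternion (unique up to sign)
//   btMatrix3x3 restored(q);    // quaternion -> matrix, via setRotation above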
if (btFabs(m_el[2].x()) >= 1)

euler_out.roll = euler_out.pitch + delta;
euler_out2.roll = euler_out.pitch + delta;

euler_out.roll = -euler_out.pitch + delta;
euler_out2.roll = -euler_out.pitch + delta;

euler_out.pitch = -btAsin(m_el[2].x());
euler_out2.pitch = SIMD_PI - euler_out.pitch;

euler_out.roll = btAtan2(m_el[2].y() / btCos(euler_out.pitch), m_el[2].z() / btCos(euler_out.pitch));
euler_out2.roll = btAtan2(m_el[2].y() / btCos(euler_out2.pitch), m_el[2].z() / btCos(euler_out2.pitch));

euler_out.yaw = btAtan2(m_el[1].x() / btCos(euler_out.pitch), m_el[0].x() / btCos(euler_out.pitch));
euler_out2.yaw = btAtan2(m_el[1].x() / btCos(euler_out2.pitch), m_el[0].x() / btCos(euler_out2.pitch));

if (solution_number == 1)

pitch = euler_out.pitch;
roll = euler_out.roll;

yaw = euler_out2.yaw;
pitch = euler_out2.pitch;
roll = euler_out2.roll;
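// getEulerZYX computes both valid (yaw, pitch, roll) decompositions (euler_out and
// euler_out2, with the second pitch = SIMD_PI - pitch) and writes the one selected
// by solution_number to the output references. Example (sketch):
//   btScalar yaw, pitch, roll;
//   m.getEulerZYX(yaw, pitch, roll);   // first solution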
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);

m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());

return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();

return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();

return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
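// tdotx/tdoty/tdotz above dot v with the first, second and third column of this
// matrix, i.e. they are the components of transpose(*this) * v.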
for (iter = 0; iter < maxIter; iter++)

for (int step = maxSteps; step > 0; step--)
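// diagonalize performs classic Jacobi rotations: p and q (chosen in the elided code
// above) index the off-diagonal element to annihilate, and theta/t/cos/sin below
// define a plane rotation that zeroes m_el[p][q] exactly, while the same rotation is
// accumulated into the caller's rotation matrix in the row loop further down.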
btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);

t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2)) : 1 / (theta - btSqrt(1 + theta2));
cos = 1 / btSqrt(1 + t * t);

t = 1 / (theta * (2 + btScalar(0.5) / theta2));

m_el[p][q] = m_el[q][p] = 0;
m_el[p][p] -= t * mpq;
m_el[q][q] += t * mpq;

m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;

for (int i = 0; i < 3; i++)

row[p] = cos * mrp - sin * mrq;
row[q] = cos * mrq + sin * mrp;

return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
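// cofac(r1, c1, r2, c2) above is the 2x2 minor m[r1][c1]*m[r2][c2] - m[r1][c2]*m[r2][c1];
// adjoint() and inverse() further down assemble the full cofactor matrix from it.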
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 rv00, rv01, rv02;
__m128 rv10, rv11, rv12;
__m128 rv20, rv21, rv22;
__m128 mv0, mv1, mv2;

rv02 = m_el[0].mVec128;
rv12 = m_el[1].mVec128;
rv22 = m_el[2].mVec128;

mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);

rv00 = bt_splat_ps(rv02, 0);
rv01 = bt_splat_ps(rv02, 1);
rv02 = bt_splat_ps(rv02, 2);

rv00 = _mm_mul_ps(rv00, mv0);
rv01 = _mm_mul_ps(rv01, mv1);
rv02 = _mm_mul_ps(rv02, mv2);

rv10 = bt_splat_ps(rv12, 0);
rv11 = bt_splat_ps(rv12, 1);
rv12 = bt_splat_ps(rv12, 2);

rv10 = _mm_mul_ps(rv10, mv0);
rv11 = _mm_mul_ps(rv11, mv1);
rv12 = _mm_mul_ps(rv12, mv2);

rv20 = bt_splat_ps(rv22, 0);
rv21 = bt_splat_ps(rv22, 1);
rv22 = bt_splat_ps(rv22, 2);

rv20 = _mm_mul_ps(rv20, mv0);
rv21 = _mm_mul_ps(rv21, mv1);
rv22 = _mm_mul_ps(rv22, mv2);

rv00 = _mm_add_ps(rv00, rv01);
rv10 = _mm_add_ps(rv10, rv11);
rv20 = _mm_add_ps(rv20, rv21);

m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
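// Row-major SIMD multiply: result row i = this[i].x * m[0] + this[i].y * m[1] +
// this[i].z * m[2]. Each component of a row of *this is splatted across a vector
// and multiplied against an entire (w-masked) row of m; the NEON path below uses
// vmulq_lane_f32 / vmlaq_lane_f32 for the same pattern.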
#elif defined(BT_USE_NEON)
float32x4_t rv0, rv1, rv2;
float32x4_t v0, v1, v2;
float32x4_t mv0, mv1, mv2;

v0 = m_el[0].mVec128;
v1 = m_el[1].mVec128;
v2 = m_el[2].mVec128;

mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

m_el[0].mVec128 = rv0;
m_el[1].mVec128 = rv1;
m_el[2].mVec128 = rv2;
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 vk = bt_splat_ps(_mm_load_ss((float*)&k), 0x80);
_mm_mul_ps(m[0].mVec128, vk),
_mm_mul_ps(m[1].mVec128, vk),
_mm_mul_ps(m[2].mVec128, vk));
#elif defined(BT_USE_NEON)
vmulq_n_f32(m[0].mVec128, k),
vmulq_n_f32(m[1].mVec128, k),
vmulq_n_f32(m[2].mVec128, k));

m[0].x() * k, m[0].y() * k, m[0].z() * k,
m[1].x() * k, m[1].y() * k, m[1].z() * k,
m[2].x() * k, m[2].y() * k, m[2].z() * k);
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
m1[0].mVec128 + m2[0].mVec128,
m1[1].mVec128 + m2[1].mVec128,
m1[2].mVec128 + m2[2].mVec128);

m1[2][2] + m2[2][2]);

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
m1[0].mVec128 - m2[0].mVec128,
m1[1].mVec128 - m2[1].mVec128,
m1[2].mVec128 - m2[2].mVec128);

m1[2][2] - m2[2][2]);

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)

return btTriple((*this)[0], (*this)[1], (*this)[2]);
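// determinant() is the scalar triple product of the rows: row0 . (row1 x row2).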
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
_mm_and_ps(m_el[0].mVec128, btvAbsfMask),
_mm_and_ps(m_el[1].mVec128, btvAbsfMask),
_mm_and_ps(m_el[2].mVec128, btvAbsfMask));
#elif defined(BT_USE_NEON)
(float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
(float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
(float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
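// absolute() clears the IEEE sign bit of every element with a bitwise AND
// (btvAbsfMask / btv3AbsMask), i.e. an element-wise fabs.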
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128;

v2 = _mm_and_ps(v2, btvFFF0fMask);
vT = _mm_unpackhi_ps(v0, v1);
v0 = _mm_unpacklo_ps(v0, v1);
v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));
v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));
v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));
#elif defined(BT_USE_NEON)
static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);
float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));
float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);
return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));

btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));

co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
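// inverse(): co holds the cofactors of the first row, the determinant is the dot of
// row 0 with co, and the matrix returned above is the adjugate scaled by s = 1/det.
// Example (sketch), solving A * x = b for a non-singular A:
//   btVector3 x = A.inverse() * b;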
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 row = m_el[0].mVec128;
__m128 m0 = _mm_and_ps(m.getRow(0).mVec128, btvFFF0fMask);
__m128 m1 = _mm_and_ps(m.getRow(1).mVec128, btvFFF0fMask);
__m128 m2 = _mm_and_ps(m.getRow(2).mVec128, btvFFF0fMask);
__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
row = m_el[1].mVec128;
r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
row = m_el[2].mVec128;
r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
#elif defined BT_USE_NEON
static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0};
float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
float32x4_t row = m_el[0].mVec128;
float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
row = m_el[1].mVec128;
r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
row = m_el[2].mVec128;
r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
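// transposeTimes(m) returns transpose(*this) * m: element (i, j) is the dot product
// of column i of *this with column j of m, as spelled out in the scalar fallback above.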
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 a0 = m_el[0].mVec128;
__m128 a1 = m_el[1].mVec128;
__m128 a2 = m_el[2].mVec128;

__m128 mx = mT[0].mVec128;
__m128 my = mT[1].mVec128;
__m128 mz = mT[2].mVec128;

__m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
__m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
__m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
#elif defined BT_USE_NEON
float32x4_t a0 = m_el[0].mVec128;
float32x4_t a1 = m_el[1].mVec128;
float32x4_t a2 = m_el[2].mVec128;

float32x4_t mx = mT[0].mVec128;
float32x4_t my = mT[1].mVec128;
float32x4_t mz = mT[2].mVec128;

float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
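// timesTranspose(m) computes *this * transpose(m): m is transposed once (mT above)
// and the product is then formed row-major with the same splat / multiply-accumulate
// pattern as operator*= earlier in this file.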
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
return v.dot3(m[0], m[1], m[2]);
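// m * v: each result component is a row of m dotted with v (dot3 performs the three
// row dots at once). The v * m overload below dots v against the columns instead,
// matching the tdotx/tdoty/tdotz helpers.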
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
const __m128 vv = v.mVec128;

__m128 c0 = bt_splat_ps(vv, 0);
__m128 c1 = bt_splat_ps(vv, 1);
__m128 c2 = bt_splat_ps(vv, 2);

c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask));
c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask));
c0 = _mm_add_ps(c0, c1);
c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask));
#elif defined(BT_USE_NEON)
const float32x4_t vv = v.mVec128;
const float32x2_t vlo = vget_low_f32(vv);
const float32x2_t vhi = vget_high_f32(vv);

float32x4_t c0, c1, c2;

c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

c0 = vmulq_lane_f32(c0, vlo, 0);
c1 = vmulq_lane_f32(c1, vlo, 1);
c2 = vmulq_lane_f32(c2, vhi, 0);
c0 = vaddq_f32(c0, c1);
c0 = vaddq_f32(c0, c2);
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 m10 = m1[0].mVec128;
__m128 m11 = m1[1].mVec128;
__m128 m12 = m1[2].mVec128;

__m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);

__m128 c0 = bt_splat_ps(m10, 0);
__m128 c1 = bt_splat_ps(m11, 0);
__m128 c2 = bt_splat_ps(m12, 0);

c0 = _mm_mul_ps(c0, m2v);
c1 = _mm_mul_ps(c1, m2v);
c2 = _mm_mul_ps(c2, m2v);

m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);

__m128 c0_1 = bt_splat_ps(m10, 1);
__m128 c1_1 = bt_splat_ps(m11, 1);
__m128 c2_1 = bt_splat_ps(m12, 1);

c0_1 = _mm_mul_ps(c0_1, m2v);
c1_1 = _mm_mul_ps(c1_1, m2v);
c2_1 = _mm_mul_ps(c2_1, m2v);

m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);

c0 = _mm_add_ps(c0, c0_1);
c1 = _mm_add_ps(c1, c1_1);
c2 = _mm_add_ps(c2, c2_1);

m10 = bt_splat_ps(m10, 2);
m11 = bt_splat_ps(m11, 2);
m12 = bt_splat_ps(m12, 2);

m10 = _mm_mul_ps(m10, m2v);
m11 = _mm_mul_ps(m11, m2v);
m12 = _mm_mul_ps(m12, m2v);

c0 = _mm_add_ps(c0, m10);
c1 = _mm_add_ps(c1, m11);
c2 = _mm_add_ps(c2, m12);
#elif defined(BT_USE_NEON)
float32x4_t rv0, rv1, rv2;
float32x4_t v0, v1, v2;
float32x4_t mv0, mv1, mv2;

mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);

rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);

c0 = _mm_and_ps(c0, c1);
c0 = _mm_and_ps(c0, c2);

int m = _mm_movemask_ps((__m128)c0);
return (0x7 == (m & 0x7));
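// _mm_movemask_ps packs the sign bit of each comparison lane into an integer; after
// AND-ing the three row comparisons, the matrices are equal iff the x, y and z lanes
// all compared equal, i.e. the low three mask bits are set (the unused w lane is ignored).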
return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
for (int i = 0; i < 3; i++)

for (int i = 0; i < 3; i++)

for (int i = 0; i < 3; i++)

for (int i = 0; i < 3; i++)

for (int i = 0; i < 3; i++)

#endif  //BT_MATRIX3x3_H