#include <mtx.h>
/*
 * PSMTXMultVec - transform one vector by a matrix with paired-single math.
 *
 * For each 16-byte row R of m (byte offsets 0x00, 0x10, 0x20) this computes
 *
 *     out[R] = mR0*x + mR1*y + mR2*z + mR3*1.0
 *
 * and stores the three scalars to dst at byte offsets 0x0, 0x4 and 0x8.
 *
 * Notation used in the comments below:
 *   (a, b)  a paired-single register with ps0 = a and ps1 = b
 *   mRC     the value loaded from m at byte offset 0x10*R + 4*C
 *   x, y, z the values loaded from src at byte offsets 0x0, 0x4, 0x8
 *
 * m    matrix, read only (presumably f32[3][4] - TODO confirm in mtx.h)
 * src  input vector, read only (presumably three consecutive f32 - TODO confirm)
 * dst  output vector; both loads from src are issued before the first
 *      store to dst, so dst may be the same object as src
 *
 * NOTE(review): every quantized load/store selects GQR 0 (last operand 0);
 * the description above assumes GQR0 is configured for unscaled floats.
 *
 * nofralloc: no stack frame is generated, so the explicit blr is the return.
 * Uses f0-f6 and f8-f12 as scratch. Loads are interleaved with arithmetic.
 */
asm void PSMTXMultVec(const register Mtx m, const register Vec *src, register Vec *dst)
{
nofralloc
psq_l f0,0x0(src),0,0 /* f0 = (x, y) */
psq_l f2,0x0(m),0,0 /* f2 = (m00, m01) */
psq_l f1,0x8(src),0x1,0 /* W=1: f1 = (z, 1.0) */
ps_mul f4, f2, f0 /* f4 = (m00*x, m01*y) */
psq_l f3,0x8(m),0,0 /* f3 = (m02, m03) */
ps_madd f5, f3, f1, f4 /* f5 = (m02*z + m00*x, m03*1.0 + m01*y) */
psq_l f8,0x10(m),0,0 /* f8 = (m10, m11) */
ps_sum0 f6, f5, f6, f5 /* f6.ps0 = f5.ps0 + f5.ps1 = out[0]; ps1 keeps old f6.ps1 (unused) */
psq_l f9,0x18(m),0,0 /* f9 = (m12, m13) */
ps_mul f10, f8, f0 /* f10 = (m10*x, m11*y) */
psq_st f6,0x0(dst),0x1,0 /* W=1: store ps0 only -> out[0] at dst+0x0 */
ps_madd f11, f9, f1, f10 /* f11 = (m12*z + m10*x, m13*1.0 + m11*y) */
psq_l f2,0x20(m),0,0 /* f2 = (m20, m21); row-0 value no longer needed */
ps_sum0 f12, f11, f12, f11 /* f12.ps0 = f11.ps0 + f11.ps1 = out[1] */
psq_l f3,0x28(m),0,0 /* f3 = (m22, m23) */
ps_mul f4, f2, f0 /* f4 = (m20*x, m21*y) */
psq_st f12,0x4(dst),0x1,0 /* store out[1] at dst+0x4 */
ps_madd f5, f3, f1, f4 /* f5 = (m22*z + m20*x, m23*1.0 + m21*y) */
ps_sum0 f6, f5, f6, f5 /* f6.ps0 = f5.ps0 + f5.ps1 = out[2] */
psq_st f6,0x8(dst),0x1,0 /* store out[2] at dst+0x8 */
blr
}
/*
 * PSMTXMultVecSR - transform one vector by the first three columns of a
 * matrix, ignoring the fourth column.
 *
 * For each 16-byte row R of m (byte offsets 0x00, 0x10, 0x20) this computes
 *
 *     out[R] = mR0*x + mR1*y + mR2*z
 *
 * and stores the three scalars to dst at byte offsets 0x0, 0x4 and 0x8.
 * The fourth value of each row (mR3) is loaded alongside mR2 but only
 * contributes to the ps1 lane, which is never stored.
 *
 * Notation used in the comments below:
 *   (a, b)  a paired-single register with ps0 = a and ps1 = b
 *   mRC     the value loaded from m at byte offset 0x10*R + 4*C
 *   x, y, z the values loaded from src at byte offsets 0x0, 0x4, 0x8
 *
 * m    matrix, read only (presumably f32[3][4] - TODO confirm in mtx.h)
 * src  input vector, read only (presumably three consecutive f32 - TODO confirm)
 * dst  output vector; every load from src and m is issued before the first
 *      store to dst, so dst may be the same object as src
 *
 * NOTE(review): every quantized load/store selects GQR 0 (last operand 0);
 * the description above assumes GQR0 is configured for unscaled floats.
 *
 * nofralloc: no stack frame is generated, so the explicit blr is the return.
 * Uses f0-f13 as scratch. Loads are interleaved with arithmetic.
 */
asm void PSMTXMultVecSR(const register Mtx m, const register Vec *src, register Vec *dst)
{
nofralloc
psq_l f0,0x0(m),0,0 /* f0 = (m00, m01) */
psq_l f6,0x0(src),0,0 /* f6 = (x, y) */
psq_l f2,0x10(m),0,0 /* f2 = (m10, m11) */
ps_mul f8, f0, f6 /* f8 = (m00*x, m01*y) */
psq_l f4,0x20(m),0,0 /* f4 = (m20, m21) */
ps_mul f10, f2, f6 /* f10 = (m10*x, m11*y) */
psq_l f7,0x8(src),0x1,0 /* W=1: f7 = (z, 1.0) */
ps_mul f12, f4, f6 /* f12 = (m20*x, m21*y) */
psq_l f3,0x18(m),0,0 /* f3 = (m12, m13) */
ps_sum0 f8, f8, f8, f8 /* f8.ps0 = m00*x + m01*y */
psq_l f5,0x28(m),0,0 /* f5 = (m22, m23) */
ps_sum0 f10, f10, f10, f10 /* f10.ps0 = m10*x + m11*y */
psq_l f1,0x8(m),0,0 /* f1 = (m02, m03) */
ps_sum0 f12, f12, f12, f12 /* f12.ps0 = m20*x + m21*y */
ps_madd f9, f1, f7, f8 /* f9.ps0 = m02*z + f8.ps0 = out[0]; ps1 (uses m03) is discarded */
psq_st f9,0x0(dst),0x1,0 /* W=1: store ps0 only -> out[0] at dst+0x0 */
ps_madd f11, f3, f7, f10 /* f11.ps0 = m12*z + f10.ps0 = out[1] */
psq_st f11,0x4(dst),0x1,0 /* store out[1] at dst+0x4 */
ps_madd f13, f5, f7, f12 /* f13.ps0 = m22*z + f12.ps0 = out[2] */
psq_st f13,0x8(dst),0x1,0 /* store out[2] at dst+0x8 */
blr
}
asm void PSMTXMultVecArraySR(const register Mtx m, const register Vec *src, register Vec *dst, register u32 n)
{
nofralloc
psq_l f13,0x0(m),0,0
psq_l f12,0x10(m),0,0
subi n, n, 0x1
psq_l f11,0x8(m),0x1,0
ps_merge00 f0, f13, f12
subi r5, dst, 0x4
psq_l f10,0x18(m),0x1,0
ps_merge11 f1, f13, f12
mtctr n
psq_l f3,0x20(m),0,0
ps_merge00 f2, f11, f10
psq_l f4,0x28(m),0x1,0
psq_l f6,0x0(src),0,0
psq_lu f7,0x8(src),0x1,0
ps_muls0 f8, f0, f6
ps_mul f9, f3, f6
ps_madds1 f8, f1, f6, f8
ps_madd f10, f4, f7, f9
loop:
psq_lu f6,0x4(src),0,0
ps_madds0 f12, f2, f7, f8
psq_lu f7,0x8(src),0x1,0
ps_sum0 f13, f10, f9, f9
ps_muls0 f8, f0, f6
ps_mul f9, f3, f6
psq_stu f12,0x4(dst),0,0
ps_madds1 f8, f1, f6, f8
psq_stu f13,0x8(dst),0x1,0
ps_madd f10, f4, f7, f9
bdnz+ loop
ps_madds0 f12, f2, f7, f8
ps_sum0 f13, f10, f9, f9
psq_stu f12,0x4(dst),0,0
psq_stu f13,0x8(dst),0x1,0
blr
}