#include asm void PSMTXMultVec(const register Mtx m, const register Vec *src, register Vec *dst) { nofralloc psq_l f0,0x0(src),0,0 psq_l f2,0x0(m),0,0 psq_l f1,0x8(src),0x1,0 ps_mul f4, f2, f0 psq_l f3,0x8(m),0,0 ps_madd f5, f3, f1, f4 psq_l f8,0x10(m),0,0 ps_sum0 f6, f5, f6, f5 psq_l f9,0x18(m),0,0 ps_mul f10, f8, f0 psq_st f6,0x0(dst),0x1,0 ps_madd f11, f9, f1, f10 psq_l f2,0x20(m),0,0 ps_sum0 f12, f11, f12, f11 psq_l f3,0x28(m),0,0 ps_mul f4, f2, f0 psq_st f12,0x4(dst),0x1,0 ps_madd f5, f3, f1, f4 ps_sum0 f6, f5, f6, f5 psq_st f6,0x8(dst),0x1,0 blr } asm void PSMTXMultVecSR(const register Mtx m, const register Vec *src, register Vec *dst) { nofralloc psq_l f0,0x0(m),0,0 psq_l f6,0x0(src),0,0 psq_l f2,0x10(m),0,0 ps_mul f8, f0, f6 psq_l f4,0x20(m),0,0 ps_mul f10, f2, f6 psq_l f7,0x8(src),0x1,0 ps_mul f12, f4, f6 psq_l f3,0x18(m),0,0 ps_sum0 f8, f8, f8, f8 psq_l f5,0x28(m),0,0 ps_sum0 f10, f10, f10, f10 psq_l f1,0x8(m),0,0 ps_sum0 f12, f12, f12, f12 ps_madd f9, f1, f7, f8 psq_st f9,0x0(dst),0x1,0 ps_madd f11, f3, f7, f10 psq_st f11,0x4(dst),0x1,0 ps_madd f13, f5, f7, f12 psq_st f13,0x8(dst),0x1,0 blr } asm void PSMTXMultVecArraySR(const register Mtx m, const register Vec *src, register Vec *dst, register u32 n) { nofralloc psq_l f13,0x0(m),0,0 psq_l f12,0x10(m),0,0 subi n, n, 0x1 psq_l f11,0x8(m),0x1,0 ps_merge00 f0, f13, f12 subi r5, dst, 0x4 psq_l f10,0x18(m),0x1,0 ps_merge11 f1, f13, f12 mtctr n psq_l f3,0x20(m),0,0 ps_merge00 f2, f11, f10 psq_l f4,0x28(m),0x1,0 psq_l f6,0x0(src),0,0 psq_lu f7,0x8(src),0x1,0 ps_muls0 f8, f0, f6 ps_mul f9, f3, f6 ps_madds1 f8, f1, f6, f8 ps_madd f10, f4, f7, f9 loop: psq_lu f6,0x4(src),0,0 ps_madds0 f12, f2, f7, f8 psq_lu f7,0x8(src),0x1,0 ps_sum0 f13, f10, f9, f9 ps_muls0 f8, f0, f6 ps_mul f9, f3, f6 psq_stu f12,0x4(dst),0,0 ps_madds1 f8, f1, f6, f8 psq_stu f13,0x8(dst),0x1,0 ps_madd f10, f4, f7, f9 bdnz+ loop ps_madds0 f12, f2, f7, f8 ps_sum0 f13, f10, f9, f9 psq_stu f12,0x4(dst),0,0 psq_stu f13,0x8(dst),0x1,0 blr }