/* src/Dolphin/mtx/mtxvec.c */

#include <mtx.h>

/*
 * PSMTXMultVec - transform one vector by a matrix using paired-single
 * instructions.
 *
 * For each matrix row r (0..2) the value stored to dst is
 *     m[r][0]*src.x + m[r][1]*src.y + m[r][2]*src.z + m[r][3]
 * i.e. the fourth matrix column is added in as a translation (the third
 * source component is loaded with an implicit 1.0 in the upper slot).
 *
 * m   - matrix; rows are read at byte offsets 0x00, 0x10 and 0x20, four
 *       floats per row (layout assumed from the offsets used here -
 *       confirm against mtx.h).
 * src - input vector; read as three consecutive floats at offsets 0, 4, 8.
 * dst - output vector; written as three floats at offsets 0, 4, 8.
 *
 * Both src loads are issued before the first store, so the result is still
 * correct when dst and src point at the same vector.
 *
 * NOTE(review): every psq_l/psq_st selects GQR0 (last operand 0); this
 * presumably relies on GQR0 being set up for unscaled floats - confirm.
 *
 * "ps0"/"ps1" below are the low/high slots of a paired-single register.
 * Loads are interleaved with arithmetic, presumably to hide load latency;
 * the instruction order should not be changed casually.
 */
asm void PSMTXMultVec(const register Mtx m, const register Vec *src, register Vec *dst)
{
	// No compiler-generated prologue/epilogue; the function returns via the explicit blr.
	nofralloc
	psq_l     f0,0x0(src),0,0          // f0 = (src.x, src.y)
	psq_l     f2,0x0(m),0,0            // f2 = (m[0][0], m[0][1])
	psq_l     f1,0x8(src),0x1,0        // f1 = (src.z, 1.0): single-value load fills ps1 with 1.0

	// Row 0.
	ps_mul    f4, f2, f0               // f4 = (m00*x, m01*y)
	psq_l     f3,0x8(m),0,0            // f3 = (m[0][2], m[0][3])
	ps_madd   f5, f3, f1, f4           // f5 = (m02*z + m00*x, m03*1 + m01*y)
	psq_l     f8,0x10(m),0,0           // f8 = (m[1][0], m[1][1])
	ps_sum0   f6, f5, f6, f5           // f6.ps0 = f5.ps0 + f5.ps1 (row-0 result); ps1 is never stored
	psq_l     f9,0x18(m),0,0           // f9 = (m[1][2], m[1][3])
	// Row 1 (same pattern, different registers).
	ps_mul    f10, f8, f0              // f10 = (m10*x, m11*y)
	psq_st    f6,0x0(dst),0x1,0        // store ps0 only -> first float of dst
	ps_madd   f11, f9, f1, f10         // f11 = (m12*z + m10*x, m13 + m11*y)
	psq_l     f2,0x20(m),0,0           // f2 = (m[2][0], m[2][1])
	ps_sum0   f12, f11, f12, f11       // f12.ps0 = row-1 result
	psq_l     f3,0x28(m),0,0           // f3 = (m[2][2], m[2][3])
	// Row 2 (reuses f2..f6).
	ps_mul    f4, f2, f0               // f4 = (m20*x, m21*y)
	psq_st    f12,0x4(dst),0x1,0       // store ps0 only -> second float of dst
	ps_madd   f5, f3, f1, f4           // f5 = (m22*z + m20*x, m23 + m21*y)
	ps_sum0   f6, f5, f6, f5           // f6.ps0 = row-2 result
	psq_st    f6,0x8(dst),0x1,0        // store ps0 only -> third float of dst
	blr
}

/*
 * PSMTXMultVecSR - transform one vector by the upper-left 3x3 part of a
 * matrix using paired-single instructions.
 *
 * For each matrix row r (0..2) the value stored to dst is
 *     m[r][0]*src.x + m[r][1]*src.y + m[r][2]*src.z
 * The fourth matrix column is loaded (it shares a register pair with
 * column 2) but only ever contributes to the ps1 slot, which is never
 * stored - so no translation is applied.
 *
 * m   - matrix; rows are read at byte offsets 0x00, 0x10 and 0x20, four
 *       floats per row (layout assumed from the offsets used here -
 *       confirm against mtx.h).
 * src - input vector; read as three consecutive floats at offsets 0, 4, 8.
 * dst - output vector; written as three floats at offsets 0, 4, 8.
 *
 * Both src loads are issued before the first store, so the result is still
 * correct when dst and src point at the same vector.
 *
 * NOTE(review): every psq_l/psq_st selects GQR0 (last operand 0); this
 * presumably relies on GQR0 being set up for unscaled floats - confirm.
 *
 * "ps0"/"ps1" below are the low/high slots of a paired-single register.
 */
asm void PSMTXMultVecSR(const register Mtx m, const register Vec *src, register Vec *dst)
{
	// No compiler-generated prologue/epilogue; the function returns via the explicit blr.
	nofralloc
	psq_l     f0,0x0(m),0,0            // f0 = (m[0][0], m[0][1])
	psq_l     f6,0x0(src),0,0          // f6 = (src.x, src.y)
	psq_l     f2,0x10(m),0,0           // f2 = (m[1][0], m[1][1])
	ps_mul    f8, f0, f6               // f8  = (m00*x, m01*y)
	psq_l     f4,0x20(m),0,0           // f4 = (m[2][0], m[2][1])
	ps_mul    f10, f2, f6              // f10 = (m10*x, m11*y)
	psq_l     f7,0x8(src),0x1,0        // f7 = (src.z, 1.0): single-value load fills ps1 with 1.0
	ps_mul    f12, f4, f6              // f12 = (m20*x, m21*y)
	psq_l     f3,0x18(m),0,0           // f3 = (m[1][2], m[1][3])
	ps_sum0   f8, f8, f8, f8           // f8.ps0  = m00*x + m01*y (ps1 unchanged)
	psq_l     f5,0x28(m),0,0           // f5 = (m[2][2], m[2][3])
	ps_sum0   f10, f10, f10, f10       // f10.ps0 = m10*x + m11*y
	psq_l     f1,0x8(m),0,0            // f1 = (m[0][2], m[0][3])
	ps_sum0   f12, f12, f12, f12       // f12.ps0 = m20*x + m21*y
	ps_madd   f9, f1, f7, f8           // f9.ps0  = m02*z + (m00*x + m01*y); ps1 is discarded
	psq_st    f9,0x0(dst),0x1,0        // store ps0 only -> first float of dst
	ps_madd   f11, f3, f7, f10         // f11.ps0 = m12*z + (m10*x + m11*y)
	psq_st    f11,0x4(dst),0x1,0       // store ps0 only -> second float of dst
	ps_madd   f13, f5, f7, f12         // f13.ps0 = m22*z + (m20*x + m21*y)
	psq_st    f13,0x8(dst),0x1,0       // store ps0 only -> third float of dst
	blr
}

/*
 * PSMTXMultVecArraySR - transform an array of vectors by the upper-left 3x3
 * part of a matrix using paired-single instructions.
 *
 * For every vector i and matrix row r (0..2) the value stored is
 *     m[r][0]*src[i].x + m[r][1]*src[i].y + m[r][2]*src[i].z
 * The fourth matrix column is never loaded, so no translation is applied.
 *
 * m   - matrix; rows are read at byte offsets 0x00, 0x10 and 0x20
 *       (layout assumed from the offsets used here - confirm against mtx.h).
 * src - input array; vectors are read as three floats each, packed at a
 *       12-byte stride (the update-form loads advance src by 8 + 4 bytes
 *       per vector).
 * dst - output array; written with the same 12-byte stride.
 * n   - number of vectors.
 *
 * NOTE(review): n must be at least 2. CTR is loaded with n - 1 and bdnz
 * decrements CTR *before* testing it, so n == 1 (CTR = 0) and n == 0
 * (CTR = 0xFFFFFFFF) both wrap and make the loop run far past the arrays.
 * No guard is present here; confirm callers honour this.
 *
 * The loop is software-pipelined: the prologue loads vector 0 and starts its
 * products; each loop pass finishes and stores vector i while loading and
 * starting vector i+1; the epilogue finishes and stores the last vector.
 * Because vector i+1 is always loaded before vector i is stored, dst == src
 * (exact in-place use) does not corrupt inputs.
 *
 * Clobbers: src, dst and n are modified in place; CTR is used as the loop
 * counter.
 *
 * NOTE(review): every psq_* selects GQR0 (last operand 0); this presumably
 * relies on GQR0 being set up for unscaled floats - confirm.
 *
 * "ps0"/"ps1" below are the low/high slots of a paired-single register.
 */
asm void PSMTXMultVecArraySR(const register Mtx m, const register Vec *src, register Vec *dst, register u32 n)
{
	// No compiler-generated prologue/epilogue; the function returns via the explicit blr.
	nofralloc
	// --- Matrix setup: rows 0 and 1 are repacked column-wise so that x and y
	// --- outputs can be produced together in one register pair.
	psq_l     f13,0x0(m),0,0           // f13 = (m[0][0], m[0][1])
	psq_l     f12,0x10(m),0,0          // f12 = (m[1][0], m[1][1])
	subi      n, n, 0x1                // loop count = n - 1 (last vector is finished in the epilogue)
	psq_l     f11,0x8(m),0x1,0         // f11 = (m[0][2], 1.0)
	ps_merge00 f0, f13, f12            // f0 = (m00, m10): column 0 of rows 0/1
	subi      r5, dst, 0x4             // r5 holds dst (third argument); pre-bias by -4 so the first psq_stu 4(dst) hits dst[0]
	psq_l     f10,0x18(m),0x1,0        // f10 = (m[1][2], 1.0)
	ps_merge11 f1, f13, f12            // f1 = (m01, m11): column 1 of rows 0/1
	mtctr     n                        // CTR = n - 1
	psq_l     f3,0x20(m),0,0           // f3 = (m[2][0], m[2][1])
	ps_merge00 f2, f11, f10            // f2 = (m02, m12): column 2 of rows 0/1
	psq_l     f4,0x28(m),0x1,0         // f4 = (m[2][2], 1.0)
	// --- Prologue: load vector 0 and start its products.
	psq_l     f6,0x0(src),0,0          // f6 = (x, y) of vector 0
	psq_lu    f7,0x8(src),0x1,0        // f7 = (z, 1.0); src += 8 (now points at z of vector 0)
	ps_muls0  f8, f0, f6               // f8 = (m00*x, m10*x)
	ps_mul    f9, f3, f6               // f9 = (m20*x, m21*y)
	ps_madds1 f8, f1, f6, f8           // f8 = (m00*x + m01*y, m10*x + m11*y)
	ps_madd   f10, f4, f7, f9          // f10.ps0 = m22*z + m20*x (ps1 is unused)

	loop:
	psq_lu    f6,0x4(src),0,0          // src += 4 -> next vector; f6 = its (x, y)
	ps_madds0 f12, f2, f7, f8          // f12 = finished (out.x, out.y) of the current vector
	psq_lu    f7,0x8(src),0x1,0        // f7 = (z, 1.0) of next vector; src += 8
	ps_sum0   f13, f10, f9, f9         // f13.ps0 = f10.ps0 + f9.ps1 = finished out.z of the current vector
	ps_muls0  f8, f0, f6               // start next vector: (m00*x, m10*x)
	ps_mul    f9, f3, f6               // (m20*x, m21*y) for next vector
	psq_stu   f12,0x4(dst),0,0         // dst += 4; store (out.x, out.y)
	ps_madds1 f8, f1, f6, f8           // add the y terms for rows 0/1
	psq_stu   f13,0x8(dst),0x1,0       // dst += 8; store out.z (ps0 only)
	ps_madd   f10, f4, f7, f9          // f10.ps0 = m22*z + m20*x for next vector
	bdnz+     loop                     // --CTR; branch while CTR != 0 ('+' = predict taken)
	// --- Epilogue: finish and store the last vector (no further loads).
	ps_madds0 f12, f2, f7, f8          // (out.x, out.y)
	ps_sum0   f13, f10, f9, f9         // out.z in ps0
	psq_stu   f12,0x4(dst),0,0         // dst += 4; store (out.x, out.y)
	psq_stu   f13,0x8(dst),0x1,0       // dst += 8; store out.z (ps0 only)
	blr
}