From 55487832c614edb61ebf22d0c153944ba864feaa Mon Sep 17 00:00:00 2001
From: mrb0nk500 <b0nk@b0nk.xyz>
Date: Thu, 2 Feb 2023 11:30:25 -0400
Subject: sdk: Add `mtx`

We're slowly getting there.
---
 include/dolphin/mtx.h    |   3 +-
 include/dolphin/vec.h    |  36 +++
 include/fdlibm.h         | 227 +++++++++++++++++
 obj_files.mk             |   6 +-
 src/Dolphin/mtx/mtx.c    | 648 +++++++++++++++++++++++++++++++++++++++++++++++
 src/Dolphin/mtx/mtx44.c  |  98 +++++++
 src/Dolphin/mtx/mtxvec.c |  95 +++++++
 src/Dolphin/mtx/vec.c    | 146 +++++++++++
 8 files changed, 1256 insertions(+), 3 deletions(-)
 create mode 100644 include/dolphin/vec.h
 create mode 100644 include/fdlibm.h
 create mode 100644 src/Dolphin/mtx/mtx.c
 create mode 100644 src/Dolphin/mtx/mtx44.c
 create mode 100644 src/Dolphin/mtx/mtxvec.c
 create mode 100644 src/Dolphin/mtx/vec.c

diff --git a/include/dolphin/mtx.h b/include/dolphin/mtx.h
index f57beea..b1c5525 100644
--- a/include/dolphin/mtx.h
+++ b/include/dolphin/mtx.h
@@ -68,9 +68,8 @@ void PSMTXTranspose(const Mtx src, Mtx xPose);
 u32 PSMTXInverse(const Mtx src, Mtx inv);
 u32 PSMTXInvXpose(const Mtx src, Mtx invX);
 void PSMTXMultVec(const Mtx m, const Vec* src, Vec* dst);
-void PSMTXMultVecArray(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count);
 void PSMTXMultVecSR(const Mtx m, const Vec* src, Vec* dst);
-void PSMTXMultVecArraySR(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count);
+void PSMTXMultVecArraySR(const Mtx m, const Vec* src, Vec* dst, u32 n);
 void PSMTXQuat(Mtx m, const Quaternion* q);
 void PSMTXReflect(Mtx m, const Vec* p, const Vec* n);
 void PSMTXTrans(Mtx m, f32 xT, f32 yT, f32 zT);
diff --git a/include/dolphin/vec.h b/include/dolphin/vec.h
new file mode 100644
index 0000000..27a0037
--- /dev/null
+++ b/include/dolphin/vec.h
@@ -0,0 +1,36 @@
+#ifndef _DOLPHIN_VEC_H
+#define _DOLPHIN_VEC_H
+
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // ifdef __cplusplus
+
+typedef struct Vec {
+	f32 x;
+	f32 y;
+	f32 z;
+} Vec;
+
+void PSVECAdd(const Vec*, const Vec*, Vec*);
+void PSVECSubtract(const Vec*, const Vec*, Vec*);
+void PSVECNormalize(const Vec*, Vec*);
+f32 PSVECMag(const Vec*);
+void PSVECCrossProduct(const Vec*, const Vec*, Vec*);
+
+#ifdef __cplusplus
+}
+#endif
+
+//  lfs     f1,0(r3)
+//  lfs     f0,4(r3)
+//  fmuls   f1,f1,f1
+//  lfs     f2,8(r3)
+//  fmuls   f0,f0,f0
+//  fmuls   f2,f2,f2
+//  fadds   f0,f1,f0
+//  fadds   f1,f2,f0
+//  blr
+
+#endif
diff --git a/include/fdlibm.h b/include/fdlibm.h
new file mode 100644
index 0000000..2b3388c
--- /dev/null
+++ b/include/fdlibm.h
@@ -0,0 +1,227 @@
+#ifndef _FDLIBM_H
+#define _FDLIBM_H
+
+/* @(#)fdlibm.h 1.5 04/04/22 */
+/*
+ * ====================================================
+ * Copyright (C) 2004 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif // ifdef __cplusplus
+
+/* Sometimes it's necessary to define __LITTLE_ENDIAN explicitly
+   but these catch some common cases. */
+
+#if defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
+#define __LITTLE_ENDIAN
+#endif
+
+#ifdef __LITTLE_ENDIAN
+#define __HI(x)  *(1 + (int*)&x)
+#define __LO(x)  *(int*)&x
+#define __HIp(x) *(1 + (int*)x)
+#define __LOp(x) *(int*)x
+#else
+#define __HI(x)  *(int*)&x
+#define __LO(x)  *(1 + (int*)&x)
+#define __HIp(x) *(int*)x
+#define __LOp(x) *(1 + (int*)x)
+#endif
+
+// TODO: should __STDC__ actually be defined?
+// #ifdef __STDC__
+#define __P(p) p
+// #else
+// #define __P(p) ()
+// #endif
+
+/*
+ * ANSI/POSIX
+ */
+
+extern int signgam;
+
+#define MAXFLOAT ((float)3.40282346638528860e+38)
+
+enum fdversion { fdlibmIeee = -1, fdlibmSvid, fdlibmXopen, fdlibmPosix };
+
+#define _LIB_VERSION_TYPE enum fdversion
+#define _LIB_VERSION      _fdlib_version
+
+/* if global variable _LIB_VERSION is not desirable, one may
+ * change the following to be a constant by:
+ *	#define _LIB_VERSION_TYPE const enum version
+ * In that case, after one initializes the value _LIB_VERSION (see
+ * s_lib_version.c) during compile time, it cannot be modified
+ * in the middle of a program
+ */
+extern _LIB_VERSION_TYPE _LIB_VERSION;
+
+#define _IEEE_  fdlibmIeee
+#define _SVID_  fdlibmSvid
+#define _XOPEN_ fdlibmXopen
+#define _POSIX_ fdlibmPosix
+
+struct exception {
+	int type;
+	char* name;
+	double arg1;
+	double arg2;
+	double retval;
+};
+
+#define HUGE MAXFLOAT
+
+/*
+ * set X_TLOSS = pi*2**52, which is possibly defined in <values.h>
+ * (one may replace the following line by "#include <values.h>")
+ */
+
+#define X_TLOSS 1.41484755040568800000e+16
+
+#define DOMAIN    1
+#define SING      2
+#define OVERFLOW  3
+#define UNDERFLOW 4
+#define TLOSS     5
+#define PLOSS     6
+
+/*
+ * ANSI/POSIX
+ */
+extern double acos __P((double));
+extern double asin __P((double));
+extern double atan __P((double));
+extern double atan2 __P((double, double));
+extern double cos __P((double));
+extern double sin __P((double));
+extern double tan __P((double));
+
+extern double cosh __P((double));
+extern double sinh __P((double));
+extern double tanh __P((double));
+
+extern double exp __P((double));
+extern double frexp __P((double, int*));
+extern double ldexp __P((double, int));
+extern double log __P((double));
+extern double log10 __P((double));
+extern double modf __P((double, double*));
+
+extern double pow __P((double, double));
+extern double sqrt __P((double));
+
+extern double ceil __P((double));
+extern double fabs __P((double));
+extern double floor __P((double));
+extern double fmod __P((double, double));
+
+extern double erf __P((double));
+extern double erfc __P((double));
+extern double gamma __P((double));
+extern double hypot __P((double, double));
+extern int isnan __P((double));
+extern int finite __P((double));
+extern double j0 __P((double));
+extern double j1 __P((double));
+extern double jn __P((int, double));
+extern double lgamma __P((double));
+extern double y0 __P((double));
+extern double y1 __P((double));
+extern double yn __P((int, double));
+
+extern double acosh __P((double));
+extern double asinh __P((double));
+extern double atanh __P((double));
+extern double cbrt __P((double));
+extern double logb __P((double));
+extern double nextafter __P((double, double));
+extern double remainder __P((double, double));
+#ifdef _SCALB_INT
+extern double scalb __P((double, int));
+#else
+extern double scalb __P((double, double));
+#endif
+
+extern int matherr __P((struct exception*));
+
+/*
+ * IEEE Test Vector
+ */
+extern double significand __P((double));
+
+/*
+ * Functions callable from C, intended to support IEEE arithmetic.
+ */
+extern double copysign __P((double, double));
+extern int ilogb __P((double));
+extern double rint __P((double));
+extern double scalbn __P((double, int));
+
+/*
+ * BSD math library entry points
+ */
+extern double expm1 __P((double));
+extern double log1p __P((double));
+
+/*
+ * Reentrant version of gamma & lgamma; passes signgam back by reference
+ * as the second argument; user must allocate space for signgam.
+ */
+#ifdef _REENTRANT
+extern double gamma_r __P((double, int*));
+extern double lgamma_r __P((double, int*));
+#endif /* _REENTRANT */
+
+/* ieee style elementary functions */
+extern double __ieee754_sqrt __P((double));
+extern double __ieee754_acos __P((double));
+extern double __ieee754_acosh __P((double));
+extern double __ieee754_log __P((double));
+extern double __ieee754_atanh __P((double));
+extern double __ieee754_asin __P((double));
+extern double __ieee754_atan2 __P((double, double));
+extern double __ieee754_exp __P((double));
+extern double __ieee754_cosh __P((double));
+extern double __ieee754_fmod __P((double, double));
+extern double __ieee754_pow __P((double, double));
+extern double __ieee754_lgamma_r __P((double, int*));
+extern double __ieee754_gamma_r __P((double, int*));
+extern double __ieee754_lgamma __P((double));
+extern double __ieee754_gamma __P((double));
+extern double __ieee754_log10 __P((double));
+extern double __ieee754_sinh __P((double));
+extern double __ieee754_hypot __P((double, double));
+extern double __ieee754_j0 __P((double));
+extern double __ieee754_j1 __P((double));
+extern double __ieee754_y0 __P((double));
+extern double __ieee754_y1 __P((double));
+extern double __ieee754_jn __P((int, double));
+extern double __ieee754_yn __P((int, double));
+extern double __ieee754_remainder __P((double, double));
+extern int __ieee754_remPio2 __P((double, double*));
+#ifdef _SCALB_INT
+extern double __ieee754_scalb __P((double, int));
+#else
+extern double __ieee754_scalb __P((double, double));
+#endif
+
+/* fdlibm kernel function */
+extern double __kernel_standard __P((double, double, int));
+extern double __kernel_sin __P((double, double, int));
+extern double __kernel_cos __P((double, double));
+extern double __kernel_tan __P((double, double, int));
+extern int __kernel_remPio2 __P((double*, double*, int, int, int, const int*));
+
+#ifdef __cplusplus
+};
+#endif // ifdef __cplusplus
+
+#endif
diff --git a/obj_files.mk b/obj_files.mk
index 99022d0..24a0fb6 100644
--- a/obj_files.mk
+++ b/obj_files.mk
@@ -30,4 +30,8 @@ O_FILES := $(BUILD_DIR)/src/main.o \
 	$(BUILD_DIR)/src/Dolphin/os/OSSync.o \
 	$(BUILD_DIR)/src/Dolphin/os/OSThread.o \
 	$(BUILD_DIR)/src/Dolphin/os/OSTime.o \
-	$(BUILD_DIR)/src/Dolphin/os/__ppc_eabi_init.o
+	$(BUILD_DIR)/src/Dolphin/os/__ppc_eabi_init.o \
+	$(BUILD_DIR)/src/Dolphin/mtx/mtx.o \
+	$(BUILD_DIR)/src/Dolphin/mtx/mtxvec.o \
+	$(BUILD_DIR)/src/Dolphin/mtx/mtx44.o \
+	$(BUILD_DIR)/src/Dolphin/mtx/vec.o
diff --git a/src/Dolphin/mtx/mtx.c b/src/Dolphin/mtx/mtx.c
new file mode 100644
index 0000000..4a229bd
--- /dev/null
+++ b/src/Dolphin/mtx/mtx.c
@@ -0,0 +1,648 @@
+// This file was taken from the Pikmin 2 decompilation project.
+// https://github.com/projectPiki/pikmin2/blob/main/src/Dolphin/mtx/mtx.c
+#include "types.h"
+#include "fdlibm.h"
+#include "Dolphin/mtx.h"
+static f32 Unit01[] = { 0.0f, 1.0f };
+
+void PSMTXIdentity
+(
+    register Mtx m
+)
+{
+    register f32 zero_c = 0.0f;
+    register f32 one_c  = 1.0f;
+    register f32 c_01;
+    register f32 c_10;
+
+    asm
+    {
+        psq_st      zero_c, 8(m),   0, 0
+        ps_merge01  c_01, zero_c, one_c
+        psq_st      zero_c, 24(m),  0, 0
+        ps_merge10  c_10, one_c, zero_c
+        psq_st      zero_c, 32(m),  0, 0
+        psq_st      c_01,   16(m),  0, 0
+        psq_st      c_10,   0(m),   0, 0
+        psq_st      c_10,   40(m),  0, 0
+    }
+}
+
+asm void PSMTXCopy
+(
+    const register Mtx src,
+    register Mtx dst
+)
+{
+    nofralloc
+
+    psq_l       fp0, 0(src),   0, 0
+    psq_st      fp0, 0(dst),   0, 0
+    psq_l       fp1, 8(src),   0, 0
+    psq_st      fp1, 8(dst),   0, 0
+    psq_l       fp2, 16(src),  0, 0
+    psq_st      fp2, 16(dst),  0, 0
+    psq_l       fp3, 24(src),  0, 0
+    psq_st      fp3, 24(dst),  0, 0
+    psq_l       fp4, 32(src),  0, 0
+    psq_st      fp4, 32(dst),  0, 0
+    psq_l       fp5, 40(src),  0, 0
+    psq_st      fp5, 40(dst),  0, 0
+
+    blr
+}
+
+asm void PSMTXConcat
+(
+    const register Mtx mA,    // r3
+    const register Mtx mB,    // r4
+          register Mtx mAB    // r5
+)
+{
+    nofralloc
+
+#define FP0 fp0
+#define FP1 fp1
+#define FP2 fp2
+#define FP3 fp3
+#define FP4 fp4
+#define FP5 fp5
+#define FP6 fp6
+#define FP7 fp7
+#define FP8 fp8
+#define FP9 fp9
+#define FP10 fp10
+#define FP11 fp11
+#define FP12 fp12
+#define FP13 fp13
+#define FP14 fp14
+#define FP15 fp15
+#define FP31 fp31
+    stwu    r1, -64(r1)
+    psq_l   FP0, 0(mA), 0, 0
+    stfd    fp14, 8(r1)
+    psq_l   FP6, 0(mB), 0, 0
+    addis   r6, 0, Unit01@ha
+    psq_l   FP7, 8(mB), 0, 0
+    stfd    fp15, 16(r1)
+    addi    r6, r6, Unit01@l
+    stfd    fp31, 40(r1)
+    psq_l   FP8, 16(mB), 0, 0
+    ps_muls0 FP12, FP6, FP0
+    psq_l   FP2, 16(mA), 0, 0
+    ps_muls0 FP13, FP7, FP0
+    psq_l   FP31, 0(r6), 0, 0
+    ps_muls0 FP14, FP6, FP2
+    psq_l   FP9, 24(mB), 0, 0
+    ps_muls0 FP15, FP7, FP2
+    psq_l   FP1, 8(mA), 0, 0
+        ps_madds1 FP12, FP8, FP0, FP12
+    psq_l   FP3, 24(mA), 0, 0
+        ps_madds1 FP14, FP8, FP2, FP14
+    psq_l   FP10, 32(mB), 0, 0
+        ps_madds1 FP13, FP9, FP0, FP13
+    psq_l   FP11, 40(mB), 0, 0
+        ps_madds1 FP15, FP9, FP2, FP15
+    psq_l   FP4, 32(mA), 0, 0
+    psq_l   FP5, 40(mA), 0, 0
+            ps_madds0 FP12, FP10, FP1, FP12
+            ps_madds0 FP13, FP11, FP1, FP13
+            ps_madds0 FP14, FP10, FP3, FP14
+            ps_madds0 FP15, FP11, FP3, FP15
+    psq_st  FP12, 0(mAB), 0, 0
+
+    ps_muls0 FP2, FP6, FP4
+                ps_madds1 FP13, FP31, FP1, FP13
+    ps_muls0 FP0, FP7, FP4
+    psq_st  FP14, 16(mAB), 0, 0
+                ps_madds1 FP15, FP31, FP3, FP15
+
+    psq_st  FP13, 8(mAB), 0, 0
+
+        ps_madds1 FP2, FP8, FP4, FP2
+        ps_madds1 FP0, FP9, FP4, FP0
+            ps_madds0 FP2, FP10, FP5, FP2
+    lfd    fp14, 8(r1)
+    psq_st  FP15, 24(mAB), 0, 0
+            ps_madds0 FP0, FP11, FP5, FP0
+    psq_st  FP2, 32(mAB), 0, 0
+                ps_madds1 FP0, FP31, FP5, FP0
+    lfd    fp15, 16(r1)
+    psq_st  FP0, 40(mAB), 0, 0
+
+    lfd    fp31, 40(r1)
+    addi   r1, r1, 64
+
+    blr
+
+#undef FP0
+#undef FP1
+#undef FP2
+#undef FP3
+#undef FP4
+#undef FP5
+#undef FP6
+#undef FP7
+#undef FP8
+#undef FP9
+#undef FP10
+#undef FP11
+#undef FP12
+#undef FP13
+#undef FP14
+#undef FP15
+#undef FP31
+}
+
+#pragma scheduling off
+void PSMTXTranspose ( const register Mtx src, register Mtx xPose )
+{
+    register f32 c_zero = 0.0f;
+    register f32 row0a, row1a, row0b, row1b;
+    register f32 trns0, trns1, trns2;
+
+    asm
+    {
+        psq_l       row0a, 0(src),  0, 0
+        stfs        c_zero, 44(xPose)
+        psq_l       row1a, 16(src), 0, 0
+        ps_merge00  trns0, row0a, row1a
+        psq_l       row0b, 8(src),  1, 0
+        ps_merge11  trns1, row0a, row1a
+        psq_l       row1b, 24(src), 1, 0
+        psq_st      trns0, 0(xPose),  0, 0
+        psq_l       row0a, 32(src), 0, 0
+        ps_merge00  trns2, row0b, row1b
+        psq_st      trns1, 16(xPose), 0, 0
+        ps_merge00  trns0, row0a, c_zero
+        psq_st      trns2, 32(xPose), 0, 0
+        ps_merge10  trns1, row0a, c_zero
+        psq_st      trns0, 8(xPose),  0, 0
+        lfs         row0b, 40(src)
+        psq_st      trns1, 24(xPose), 0, 0
+        stfs        row0b, 40(xPose)
+    }
+}
+
+asm u32 PSMTXInverse ( const register Mtx src, register Mtx inv )
+{
+    nofralloc
+    psq_l       fp0, 0(src), 1, 0
+    psq_l       fp1, 4(src), 0, 0
+    psq_l       fp2, 16(src), 1, 0
+        ps_merge10  fp6, fp1, fp0
+    psq_l       fp3, 20(src), 0, 0
+    psq_l       fp4, 32(src), 1, 0
+        ps_merge10  fp7, fp3, fp2
+    psq_l       fp5, 36(src), 0, 0
+    ps_mul      fp11, fp3, fp6
+    ps_mul      fp13, fp5, fp7
+        ps_merge10  fp8, fp5, fp4
+    ps_msub     fp11, fp1, fp7, fp11
+    ps_mul      fp12, fp1, fp8
+    ps_msub     fp13, fp3, fp8, fp13
+        ps_mul      fp10, fp3, fp4
+    ps_msub     fp12, fp5, fp6, fp12
+        ps_mul      fp9,  fp0, fp5
+        ps_mul      fp8,  fp1, fp2
+    ps_sub      fp6, fp6, fp6
+        ps_msub     fp10, fp2, fp5, fp10
+    ps_mul      fp7, fp0, fp13
+        ps_msub     fp9,  fp1, fp4, fp9
+    ps_madd     fp7, fp2, fp12, fp7
+        ps_msub     fp8,  fp0, fp3, fp8
+    ps_madd     fp7, fp4, fp11, fp7
+    ps_cmpo0    cr0, fp7, fp6
+    bne         _regular
+    addi        r3, 0, 0
+    blr
+  _regular:
+    fres        fp0, fp7
+    ps_add      fp6, fp0, fp0
+    ps_mul      fp5, fp0, fp0
+    ps_nmsub    fp0, fp7, fp5, fp6
+    lfs         fp1, 12(src)
+        ps_muls0    fp13, fp13, fp0
+    lfs         fp2, 28(src)
+        ps_muls0    fp12, fp12, fp0
+    lfs         fp3, 44(src)
+        ps_muls0    fp11, fp11, fp0
+    ps_merge00  fp5, fp13, fp12
+        ps_muls0    fp10, fp10, fp0
+    ps_merge11  fp4, fp13, fp12
+        ps_muls0    fp9,  fp9,  fp0
+    psq_st      fp5,  0(inv), 0, 0
+        ps_mul      fp6, fp13, fp1
+    psq_st      fp4,  16(inv), 0, 0
+        ps_muls0    fp8,  fp8,  fp0
+        ps_madd     fp6, fp12, fp2, fp6
+    psq_st      fp10, 32(inv), 1, 0
+        ps_nmadd    fp6, fp11, fp3, fp6
+    psq_st      fp9,  36(inv), 1, 0
+        ps_mul      fp7, fp10, fp1
+        ps_merge00  fp5, fp11, fp6
+    psq_st      fp8,  40(inv), 1, 0
+        ps_merge11  fp4, fp11, fp6
+    psq_st      fp5,  8(inv), 0, 0
+        ps_madd     fp7, fp9,  fp2, fp7
+    psq_st      fp4,  24(inv), 0, 0
+        ps_nmadd    fp7, fp8,  fp3, fp7
+            addi        r3, 0, 1
+    psq_st      fp7,  44(inv), 1, 0
+    blr
+}
+
+void PSMTXRotRad ( Mtx m, char axis, f32 rad )
+{
+    f32 sinA, cosA;
+
+    sinA = sinf(rad);
+    cosA = cosf(rad);
+
+    PSMTXRotTrig( m, axis, sinA, cosA );
+}
+
+void PSMTXRotTrig (
+    register Mtx    m,
+    register char   axis,
+    register f32    sinA,
+    register f32    cosA )
+{
+    register f32    fc0, fc1, nsinA;
+    register f32    fw0, fw1, fw2, fw3;
+
+    asm
+    {
+        frsp        sinA, sinA
+        frsp        cosA, cosA
+    }
+
+    fc0 = 0.0f;
+    fc1 = 1.0f;
+
+    asm
+    {
+        ori         axis, axis, 0x20
+        ps_neg      nsinA, sinA
+        cmplwi      axis, 'x'
+        beq         _case_x
+        cmplwi      axis, 'y'
+        beq         _case_y
+        cmplwi      axis, 'z'
+        beq         _case_z
+        b           _end
+
+    _case_x:
+        psq_st      fc1,  0(m), 1, 0
+        psq_st      fc0,  4(m), 0, 0
+        ps_merge00  fw0, sinA, cosA
+        psq_st      fc0, 12(m), 0, 0
+        ps_merge00  fw1, cosA, nsinA
+        psq_st      fc0, 28(m), 0, 0
+        psq_st      fc0, 44(m), 1, 0
+        psq_st      fw0, 36(m), 0, 0
+        psq_st      fw1, 20(m), 0, 0
+        b           _end;
+
+    _case_y:
+        ps_merge00  fw0, cosA, fc0
+        ps_merge00  fw1, fc0, fc1
+        psq_st      fc0, 24(m), 0, 0
+        psq_st      fw0,  0(m), 0, 0
+        ps_merge00  fw2, nsinA, fc0
+        ps_merge00  fw3, sinA, fc0
+        psq_st      fw0, 40(m), 0, 0;
+        psq_st      fw1, 16(m), 0, 0;
+        psq_st      fw3,  8(m), 0, 0;
+        psq_st      fw2, 32(m), 0, 0;
+        b           _end;
+
+    _case_z:
+        psq_st      fc0,  8(m), 0, 0
+        ps_merge00  fw0, sinA, cosA
+        ps_merge00  fw2, cosA, nsinA
+        psq_st      fc0, 24(m), 0, 0
+        psq_st      fc0, 32(m), 0, 0
+        ps_merge00  fw1, fc1, fc0
+        psq_st      fw0, 16(m), 0, 0
+        psq_st      fw2,  0(m), 0, 0
+        psq_st      fw1, 40(m), 0, 0
+
+    _end:
+
+    }
+}
+
+static void __PSMTXRotAxisRadInternal(
+    register Mtx    m,
+    const register Vec *axis,
+    register f32    sT,
+    register f32    cT )
+{
+    register f32    tT, fc0;
+    register f32    tmp0, tmp1, tmp2, tmp3, tmp4;
+    register f32    tmp5, tmp6, tmp7, tmp8, tmp9;
+
+    tmp9 = 0.5f;
+    tmp8 = 3.0f;
+
+    asm
+    {
+        frsp        cT, cT
+        psq_l       tmp0, 0(axis), 0, 0
+        frsp        sT, sT
+        lfs         tmp1, 8(axis)
+        ps_mul      tmp2, tmp0, tmp0
+        fadds       tmp7, tmp9, tmp9
+        ps_madd     tmp3, tmp1, tmp1, tmp2
+        fsubs       fc0, tmp9, tmp9
+        ps_sum0     tmp4, tmp3, tmp1, tmp2
+        fsubs       tT, tmp7, cT
+        frsqrte     tmp5, tmp4
+        fmuls       tmp2, tmp5, tmp5
+        fmuls       tmp3, tmp5, tmp9
+        fnmsubs     tmp2, tmp2, tmp4, tmp8
+        fmuls       tmp5, tmp2, tmp3
+        ps_merge00  cT, cT, cT
+        ps_muls0    tmp0, tmp0, tmp5
+        ps_muls0    tmp1, tmp1, tmp5
+        ps_muls0    tmp4, tmp0, tT
+        ps_muls0    tmp9, tmp0, sT
+        ps_muls0    tmp5, tmp1, tT
+        ps_muls1    tmp3, tmp4, tmp0
+        ps_muls0    tmp2, tmp4, tmp0
+        ps_muls0    tmp4, tmp4, tmp1
+        fnmsubs     tmp6, tmp1, sT, tmp3
+        fmadds      tmp7, tmp1, sT, tmp3
+        ps_neg      tmp0, tmp9
+        ps_sum0     tmp8, tmp4, fc0, tmp9
+        ps_sum0     tmp2, tmp2, tmp6, cT
+        ps_sum1     tmp3, cT, tmp7, tmp3
+        ps_sum0     tmp6, tmp0, fc0 ,tmp4
+            psq_st      tmp8, 8(m), 0, 0
+        ps_sum0     tmp0, tmp4, tmp4, tmp0
+            psq_st      tmp2, 0(m), 0, 0
+        ps_muls0    tmp5, tmp5, tmp1
+            psq_st      tmp3, 16(m), 0, 0
+        ps_sum1     tmp4, tmp9, tmp0, tmp4
+            psq_st      tmp6, 24(m), 0, 0
+        ps_sum0     tmp5, tmp5, fc0, cT
+            psq_st      tmp4, 32(m), 0, 0
+            psq_st      tmp5, 40(m), 0, 0
+    }
+}
+
+// clang-format on
+void PSMTXRotAxisRad(Mtx m, const Vec* axis, f32 rad)
+{
+	f32 sinT, cosT;
+
+	sinT = sinf(rad);
+	cosT = cosf(rad);
+
+	__PSMTXRotAxisRadInternal(m, axis, sinT, cosT);
+}
+
+// clang-format off
+void PSMTXTrans(
+    register Mtx m,
+    register f32 xT,
+    register f32 yT,
+    register f32 zT
+)
+{
+    register f32 c0 = 0.0f;
+    register f32 c1 = 1.0f;
+
+    asm
+    {
+        stfs        xT,     12(m)
+        stfs        yT,     28(m)
+        psq_st      c0,      4(m), 0, 0
+        psq_st      c0,     32(m), 0, 0
+        stfs        c0,     16(m)
+        stfs        c1,     20(m)
+        stfs        c0,     24(m)
+        stfs        c1,     40(m)
+        stfs        zT,     44(m)
+        stfs        c1,      0(m)
+    }
+}
+
+asm void PSMTXTransApply(
+    const register Mtx src,
+    register Mtx dst,
+    register f32 xT,
+    register f32 yT,
+    register f32 zT )
+{
+    nofralloc;
+    psq_l       fp4, 0(src),        0, 0;
+    frsp        xT, xT;
+    psq_l       fp5, 8(src),        0, 0;
+    frsp        yT, yT;
+    psq_l       fp7, 24(src),       0, 0;
+    frsp        zT, zT;
+    psq_l       fp8, 40(src),       0, 0;
+    psq_st      fp4, 0(dst),        0, 0;
+    ps_sum1     fp5, xT, fp5, fp5;
+    psq_l       fp6, 16(src),       0, 0;
+    psq_st      fp5, 8(dst),        0, 0;
+    ps_sum1     fp7, yT, fp7, fp7;
+    psq_l       fp9, 32(src),       0, 0;
+    psq_st      fp6, 16(dst),       0, 0;
+    ps_sum1     fp8, zT, fp8, fp8;
+    psq_st      fp7, 24(dst),       0, 0;
+    psq_st      fp9, 32(dst),       0, 0;
+    psq_st      fp8, 40(dst),       0, 0;
+    blr;
+}
+
+void PSMTXScale(
+    register Mtx m,
+    register f32 xS,
+    register f32 yS,
+    register f32 zS
+)
+{
+    register f32 c0 = 0.0f;
+
+    asm
+    {
+        stfs        xS,      0(m)
+        psq_st      c0,      4(m), 0, 0
+        psq_st      c0,     12(m), 0, 0
+        stfs        yS,     20(m)
+        psq_st      c0,     24(m), 0, 0
+        psq_st      c0,     32(m), 0, 0
+        stfs        zS,     40(m)
+        stfs        c0,     44(m)
+    }
+}
+
+asm void PSMTXScaleApply (
+    const register Mtx src,
+    register Mtx dst,
+    register f32 xS,
+    register f32 yS,
+    register f32 zS )
+{
+    nofralloc;
+    frsp        xS, xS;
+    psq_l       fp4, 0(src),        0, 0;
+    frsp        yS, yS;
+    psq_l       fp5, 8(src),        0, 0;
+    frsp        zS, zS;
+    ps_muls0    fp4, fp4, xS;
+    psq_l       fp6, 16(src),       0, 0;
+    ps_muls0    fp5, fp5, xS;
+    psq_l       fp7, 24(src),       0, 0;
+    ps_muls0    fp6, fp6, yS;
+    psq_l       fp8, 32(src),       0, 0;
+    psq_st      fp4, 0(dst),        0, 0;
+    ps_muls0    fp7, fp7, yS;
+    psq_l       fp2, 40(src),       0, 0;
+    psq_st      fp5, 8(dst),        0, 0;
+    ps_muls0    fp8, fp8, zS;
+    psq_st      fp6, 16(dst),       0, 0;
+    ps_muls0    fp2, fp2, zS;
+    psq_st      fp7, 24(dst),       0, 0;
+    psq_st      fp8, 32(dst),       0, 0;
+    psq_st      fp2, 40(dst),       0, 0;
+    blr;
+}
+
+void PSMTXQuat ( register Mtx m, const register Quaternion *q )
+{
+    register f32    c_zero, c_one, c_two, scale;
+    register f32    tmp0, tmp1, tmp2, tmp3, tmp4;
+    register f32    tmp5, tmp6, tmp7, tmp8, tmp9;
+    c_one = 1.0f;
+    asm
+    {
+        psq_l       tmp0, 0(q), 0, 0
+        psq_l       tmp1, 8(q), 0, 0
+        fsubs       c_zero, c_one, c_one
+        fadds       c_two, c_one, c_one
+        ps_mul      tmp2, tmp0, tmp0
+        ps_merge10  tmp5, tmp0, tmp0
+        ps_madd     tmp4, tmp1, tmp1, tmp2
+        ps_mul      tmp3, tmp1, tmp1
+            ps_sum0     scale, tmp4, tmp4, tmp4
+        ps_muls1    tmp7, tmp5, tmp1
+            fres        tmp9, scale
+        ps_sum1     tmp4, tmp3, tmp4, tmp2
+            ps_nmsub    scale, scale, tmp9, c_two
+        ps_muls1    tmp6, tmp1, tmp1
+            ps_mul      scale, tmp9, scale
+        ps_sum0     tmp2, tmp2, tmp2, tmp2
+            fmuls       scale, scale, c_two
+        ps_madd     tmp8, tmp0, tmp5, tmp6
+        ps_msub     tmp6, tmp0, tmp5, tmp6
+            psq_st      c_zero, 12(m), 1, 0
+        ps_nmsub    tmp2, tmp2, scale, c_one
+        ps_nmsub    tmp4, tmp4, scale, c_one
+            psq_st      c_zero, 44(m), 1, 0
+        ps_mul      tmp8, tmp8, scale
+        ps_mul      tmp6, tmp6, scale
+            psq_st      tmp2, 40(m), 1, 0
+        ps_madds0   tmp5, tmp0, tmp1, tmp7
+        ps_merge00  tmp1, tmp8, tmp4
+        ps_nmsub    tmp7, tmp7, c_two, tmp5
+        ps_merge10  tmp0, tmp4, tmp6
+            psq_st      tmp1, 16(m), 0, 0
+        ps_mul      tmp5, tmp5, scale
+        ps_mul      tmp7, tmp7, scale
+            psq_st      tmp0,  0(m), 0, 0
+            psq_st      tmp5,  8(m), 1, 0
+        ps_merge10  tmp3, tmp7, c_zero
+        ps_merge01  tmp9, tmp7, tmp5
+            psq_st      tmp3, 24(m), 0, 0
+            psq_st      tmp9, 32(m), 0, 0
+    }
+}
+
+// clang-format on
+void C_MTXLookAt(Mtx m, const Vec* camPos, const Vec* camUp, const Vec* target)
+{
+
+	Vec vLook, vRight, vUp;
+
+	vLook.x = camPos->x - target->x;
+	vLook.y = camPos->y - target->y;
+	vLook.z = camPos->z - target->z;
+	PSVECNormalize(&vLook, &vLook);
+
+	PSVECCrossProduct(camUp, &vLook, &vRight);
+	PSVECNormalize(&vRight, &vRight);
+
+	PSVECCrossProduct(&vLook, &vRight, &vUp);
+
+	m[0][0] = vRight.x;
+	m[0][1] = vRight.y;
+	m[0][2] = vRight.z;
+	m[0][3] = -(camPos->x * vRight.x + camPos->y * vRight.y + camPos->z * vRight.z);
+
+	m[1][0] = vUp.x;
+	m[1][1] = vUp.y;
+	m[1][2] = vUp.z;
+	m[1][3] = -(camPos->x * vUp.x + camPos->y * vUp.y + camPos->z * vUp.z);
+
+	m[2][0] = vLook.x;
+	m[2][1] = vLook.y;
+	m[2][2] = vLook.z;
+	m[2][3] = -(camPos->x * vLook.x + camPos->y * vLook.y + camPos->z * vLook.z);
+}
+
+void C_MTXLightFrustum(Mtx m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 scaleS, f32 scaleT, f32 transS, f32 transT)
+{
+
+}
+
+void C_MTXLightPerspective(Mtx m, f32 fovY, f32 aspect, float scaleS, float scaleT, float transS, float transT)
+{
+	f32 angle;
+	f32 cot;
+
+	angle = fovY * 0.5f;
+	angle = MTXDegToRad(angle);
+
+	cot = 1.0f / tanf(angle);
+
+	m[0][0] = (cot / aspect) * scaleS;
+	m[0][1] = 0.0f;
+	m[0][2] = -transS;
+	m[0][3] = 0.0f;
+
+	m[1][0] = 0.0f;
+	m[1][1] = cot * scaleT;
+	m[1][2] = -transT;
+	m[1][3] = 0.0f;
+
+	m[2][0] = 0.0f;
+	m[2][1] = 0.0f;
+	m[2][2] = -1.0f;
+	m[2][3] = 0.0f;
+}
+
+void C_MTXLightOrtho(Mtx m, f32 t, f32 b, f32 l, f32 r, float scaleS, float scaleT, float transS, float transT)
+{
+	f32 tmp;
+
+	tmp     = 1.0f / (r - l);
+	m[0][0] = (2.0f * tmp * scaleS);
+	m[0][1] = 0.0f;
+	m[0][2] = 0.0f;
+	m[0][3] = ((-(r + l) * tmp) * scaleS) + transS;
+
+	tmp     = 1.0f / (t - b);
+	m[1][0] = 0.0f;
+	m[1][1] = (2.0f * tmp) * scaleT;
+	m[1][2] = 0.0f;
+	m[1][3] = ((-(t + b) * tmp) * scaleT) + transT;
+
+	m[2][0] = 0.0f;
+	m[2][1] = 0.0f;
+	m[2][2] = 0.0f;
+	m[2][3] = 1.0f;
+}
diff --git a/src/Dolphin/mtx/mtx44.c b/src/Dolphin/mtx/mtx44.c
new file mode 100644
index 0000000..b974888
--- /dev/null
+++ b/src/Dolphin/mtx/mtx44.c
@@ -0,0 +1,98 @@
+// This file was taken from the Melee decompilation project.
+// https://github.com/doldecomp/melee/blob/master/src/dolphin/mtx/mtx44.c
+#include <mtx.h>
+#include <math.h>
+
+void C_MTXFrustum(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f)
+{
+    f32 temp_f8;
+    f32 temp_f6;
+    f32 temp_f9;
+    f32 temp_f11;
+
+    temp_f11 = 1.0F / (r - l);
+    temp_f8 = 2.0F * n;
+    temp_f9 = 1.0F / (t - b);
+    temp_f6 = 1.0F / (f - n);
+
+    m[0][0] = temp_f8 * temp_f11;
+    m[0][1] = 0.0F;
+    m[0][2] = temp_f11 * (r + l);
+    m[0][3] = 0.0F;
+
+    m[1][0] = 0.0F;
+    m[1][1] = temp_f8 * temp_f9;
+    m[1][2] = temp_f9 * (t + b);
+    m[1][3] = 0.0F;
+
+    m[2][0] = 0.0F;
+    m[2][1] = 0.0F;
+    m[2][2] = -n * temp_f6;
+    m[2][3] = temp_f6 * -(f * n);
+
+    m[3][0] = 0.0F;
+    m[3][1] = 0.0F;
+    m[3][2] = -1.0F;
+    m[3][3] = 0.0F;
+}
+
+void C_MTXPerspective(Mtx44 m, f32 fovY, f32 aspect, f32 n, f32 f)
+{
+    f32 temp_f3;
+    f32 temp_f4;
+
+    fovY = 0.5F * fovY;
+
+    temp_f4 = 1.0F / tanf(0.017453292F * (fovY));
+    temp_f3 = 1.0F / (f - n);
+
+    m[0][0] = temp_f4 / aspect;
+    m[0][1] = 0.0F;
+    m[0][2] = 0.0F;
+    m[0][3] = 0.0F;
+
+    m[1][0] = 0.0F;
+    m[1][1] = temp_f4;
+    m[1][2] = 0.0F;
+    m[1][3] = 0.0F;
+
+    m[2][0] = 0.0F;
+    m[2][1] = 0.0F;
+    m[2][2] = -n * temp_f3;
+    m[2][3] = temp_f3 * -(f * n);
+
+    m[3][0] = 0.0F;
+    m[3][1] = 0.0F;
+    m[3][2] = -1.0F;
+    m[3][3] = 0.0F;
+}
+
+void C_MTXOrtho(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f)
+{
+    f32 temp_f8;
+    f32 temp_f10;
+    f32 temp_f4;
+
+    temp_f10 = 1.0F / (r - l);
+    m[0][0] = 2.0F * temp_f10;
+    m[0][1] = 0.0F;
+    m[0][2] = 0.0F;
+    m[0][3] = temp_f10 * -(r + l);
+
+    temp_f8 = 1.0F / (t - b);
+    m[1][0] = 0.0F;
+    m[1][1] = 2.0F * temp_f8;
+    m[1][2] = 0.0F;
+    m[1][3] = temp_f8 * -(t + b);
+
+    temp_f4 = 1.0F / (f - n);
+    m[2][0] = 0.0F;
+    m[2][1] = 0.0F;
+    m[2][2] = -1.0F * temp_f4;
+    m[2][3] = -f * temp_f4;
+
+    m[3][0] = 0.0F;
+    m[3][1] = 0.0F;
+    m[3][2] = 0.0F;
+    m[3][3] = 1.0F;
+}
diff --git a/src/Dolphin/mtx/mtxvec.c b/src/Dolphin/mtx/mtxvec.c
new file mode 100644
index 0000000..bb2e1d4
--- /dev/null
+++ b/src/Dolphin/mtx/mtxvec.c
@@ -0,0 +1,95 @@
+#include <mtx.h>
+
+asm void PSMTXMultVec(const register Mtx m, const register Vec *src, register Vec *dst)
+{
+	nofralloc
+	psq_l     f0,0x0(src),0,0
+	psq_l     f2,0x0(m),0,0
+	psq_l     f1,0x8(src),0x1,0
+
+	ps_mul    f4, f2, f0
+	psq_l     f3,0x8(m),0,0
+	ps_madd   f5, f3, f1, f4
+	psq_l     f8,0x10(m),0,0
+	ps_sum0   f6, f5, f6, f5
+	psq_l     f9,0x18(m),0,0
+	ps_mul    f10, f8, f0
+	psq_st    f6,0x0(dst),0x1,0
+	ps_madd   f11, f9, f1, f10
+	psq_l     f2,0x20(m),0,0
+	ps_sum0   f12, f11, f12, f11
+	psq_l     f3,0x28(m),0,0
+	ps_mul    f4, f2, f0
+	psq_st    f12,0x4(dst),0x1,0
+	ps_madd   f5, f3, f1, f4
+	ps_sum0   f6, f5, f6, f5
+	psq_st    f6,0x8(dst),0x1,0
+	blr
+}
+
+asm void PSMTXMultVecSR(const register Mtx m, const register Vec *src, register Vec *dst)
+{
+	nofralloc
+	psq_l     f0,0x0(m),0,0
+	psq_l     f6,0x0(src),0,0
+	psq_l     f2,0x10(m),0,0
+	ps_mul    f8, f0, f6
+	psq_l     f4,0x20(m),0,0
+	ps_mul    f10, f2, f6
+	psq_l     f7,0x8(src),0x1,0
+	ps_mul    f12, f4, f6
+	psq_l     f3,0x18(m),0,0
+	ps_sum0   f8, f8, f8, f8
+	psq_l     f5,0x28(m),0,0
+	ps_sum0   f10, f10, f10, f10
+	psq_l     f1,0x8(m),0,0
+	ps_sum0   f12, f12, f12, f12
+	ps_madd   f9, f1, f7, f8
+	psq_st    f9,0x0(dst),0x1,0
+	ps_madd   f11, f3, f7, f10
+	psq_st    f11,0x4(dst),0x1,0
+	ps_madd   f13, f5, f7, f12
+	psq_st    f13,0x8(dst),0x1,0
+	blr
+}
+
+asm void PSMTXMultVecArraySR(const register Mtx m, const register Vec *src, register Vec *dst, register u32 n)
+{
+	nofralloc
+	psq_l     f13,0x0(m),0,0
+	psq_l     f12,0x10(m),0,0
+	subi      n, n, 0x1
+	psq_l     f11,0x8(m),0x1,0
+	ps_merge00 f0, f13, f12
+	subi      r5, dst, 0x4
+	psq_l     f10,0x18(m),0x1,0
+	ps_merge11 f1, f13, f12
+	mtctr     n
+	psq_l     f3,0x20(m),0,0
+	ps_merge00 f2, f11, f10
+	psq_l     f4,0x28(m),0x1,0
+	psq_l     f6,0x0(src),0,0
+	psq_lu    f7,0x8(src),0x1,0
+	ps_muls0  f8, f0, f6
+	ps_mul    f9, f3, f6
+	ps_madds1 f8, f1, f6, f8
+	ps_madd   f10, f4, f7, f9
+
+	loop:
+	psq_lu    f6,0x4(src),0,0
+	ps_madds0 f12, f2, f7, f8
+	psq_lu    f7,0x8(src),0x1,0
+	ps_sum0   f13, f10, f9, f9
+	ps_muls0  f8, f0, f6
+	ps_mul    f9, f3, f6
+	psq_stu   f12,0x4(dst),0,0
+	ps_madds1 f8, f1, f6, f8
+	psq_stu   f13,0x8(dst),0x1,0
+	ps_madd   f10, f4, f7, f9
+	bdnz+     loop
+	ps_madds0 f12, f2, f7, f8
+	ps_sum0   f13, f10, f9, f9
+	psq_stu   f12,0x4(dst),0,0
+	psq_stu   f13,0x8(dst),0x1,0
+	blr
+}
diff --git a/src/Dolphin/mtx/vec.c b/src/Dolphin/mtx/vec.c
new file mode 100644
index 0000000..af42c0b
--- /dev/null
+++ b/src/Dolphin/mtx/vec.c
@@ -0,0 +1,146 @@
+#include "types.h"
+#include "Dolphin/vec.h"
+
+#define R_RET fp1
+#define FP2   fp2
+#define FP3   fp3
+#define FP4   fp4
+#define FP5   fp5
+#define FP6   fp6
+#define FP7   fp7
+#define FP8   fp8
+#define FP9   fp9
+#define FP10  fp10
+#define FP11  fp11
+#define FP12  fp12
+#define FP13  fp13
+// clang-format off
+asm void PSVECAdd
+(
+    const register Vec *vec1,
+    const register Vec *vec2,
+    register Vec *ret
+)
+{
+    nofralloc;
+    psq_l     FP2,  0(vec1), 0, 0;
+    psq_l     FP4,  0(vec2), 0, 0;
+    ps_add    FP6, FP2, FP4;
+    psq_st    FP6,  0(ret), 0, 0;
+    psq_l     FP3,   8(vec1), 1, 0;
+    psq_l     FP5,   8(vec2), 1, 0;
+    ps_add    FP7, FP3, FP5;
+    psq_st    FP7,   8(ret), 1, 0;
+    blr
+}
+
+asm void PSVECSubtract
+(
+    const register Vec *vec1,
+    const register Vec *vec2,
+          register Vec *ret
+)
+{
+    nofralloc;
+    psq_l     FP2,  0(vec1), 0, 0;
+    psq_l     FP4,  0(vec2), 0, 0;
+    ps_sub    FP6, FP2, FP4;
+    psq_st    FP6, 0(ret), 0, 0;
+    psq_l     FP3,   8(vec1), 1, 0;
+    psq_l     FP5,   8(vec2), 1, 0;
+    ps_sub    FP7, FP3, FP5;
+    psq_st    FP7,  8(ret), 1, 0;
+    blr
+}
+
+void PSVECNormalize
+(
+    const register Vec *vec1,
+          register Vec *ret
+)
+{
+    register f32 half  = 0.5f;
+    register f32 three = 3.0f;
+    register f32 xx_zz, xx_yy;
+    register f32 square_sum;
+    register f32 ret_sqrt;
+    register f32 n_0, n_1;
+    asm
+    {
+        psq_l       FP2, 0(vec1), 0, 0;
+        ps_mul      xx_yy, FP2, FP2;
+        psq_l       FP3, 8(vec1), 1, 0;
+        ps_madd     xx_zz, FP3, FP3, xx_yy;
+        ps_sum0     square_sum, xx_zz, FP3, xx_yy;
+        frsqrte     ret_sqrt, square_sum;
+        fmuls       n_0, ret_sqrt, ret_sqrt;
+        fmuls       n_1, ret_sqrt, half;
+        fnmsubs     n_0, n_0, square_sum, three;
+        fmuls       ret_sqrt, n_0, n_1;
+        ps_muls0    FP2, FP2, ret_sqrt;
+        psq_st      FP2, 0(ret), 0, 0;
+        ps_muls0    FP3, FP3, ret_sqrt;
+        psq_st      FP3, 8(ret), 1, 0;
+    }
+}
+
+f32 PSVECMag ( const register Vec *v )
+{
+    register f32    v_xy, v_zz, square_mag;
+    register f32    ret_mag, n_0, n_1;
+    register f32    three, half, zero;
+    half = 0.5f;
+    asm
+    {
+        psq_l       v_xy, 0(v), 0, 0
+        ps_mul      v_xy, v_xy, v_xy
+        lfs         v_zz, 8(v)
+        fsubs       zero, half, half
+        ps_madd     square_mag, v_zz, v_zz, v_xy
+        ps_sum0     square_mag, square_mag, v_xy, v_xy
+        fcmpu       cr0, square_mag, zero
+        beq-        __exit
+        frsqrte     ret_mag, square_mag
+    }
+    three = 3.0f;
+    asm
+    {
+        fmuls       n_0, ret_mag, ret_mag
+        fmuls       n_1, ret_mag, half
+        fnmsubs     n_0, n_0, square_mag, three
+        fmuls       ret_mag, n_0, n_1
+        fmuls       square_mag, square_mag, ret_mag
+    __exit:
+    }
+    return square_mag;
+}
+
+asm void PSVECCrossProduct
+(
+    const register Vec *vec1,
+    const register Vec *vec2,
+          register Vec *ret
+)
+{
+    nofralloc;
+    psq_l       fp1, 0(vec2), 0, 0
+    lfs         fp2, 8(vec1)
+    psq_l       fp0, 0(vec1), 0, 0
+    ps_merge10  fp6, fp1, fp1
+    lfs         fp3, 8(vec2)
+    ps_mul      fp4, fp1, fp2
+    ps_muls0    fp7, fp1, fp0
+    ps_msub     fp5, fp0, fp3, fp4
+    ps_msub     fp8, fp0, fp6, fp7
+    ps_merge11  fp9, fp5, fp5
+    ps_merge01  fp10, fp5, fp8
+    psq_st      fp9, 0(ret), 1, 0
+    ps_neg      fp10, fp10
+    psq_st      fp10, 4(ret), 0, 0
+    blr;
+}
+
+f32 PSVECDistance(const register Vec *a, const register Vec *b)
+{
+	return 0.0f;
+}
-- 
cgit v1.2.3-13-gbd6f