Latest compatible version of Classicube from the original GitHub repository (https://github.com/ClassiCube/ClassiCube) that can be compiled on Classicube for PowerMac PPC running Mac OS X 10.4.

2025-12-17 13:17:57 +02:00
commit c71492f846
1248 changed files with 422858 additions and 0 deletions
--- a/misc/dreamcast/Makefile
+++ b/misc/dreamcast/Makefile
@@ -0,0 +1,96 @@
+ifeq (,$(strip $(KOS_BASE)))
+$(warning Please set KOS variables in your environment. For example:)
+$(warning source /opt/toolchains/dc/kos/environ.sh)
+$(error   Failed to find KallistiOS installation)
+endif # KOS_BASE must be set (non-blank) before anything else is parsed
+
+
+#---------------------------------------------------------------------------------
+# Configurable options
+#---------------------------------------------------------------------------------
+# Where compiled object files end up
+BUILD_DIR   := build/dc
+# Directories that are scanned for source code
+SOURCE_DIRS := src third_party/bearssl/src misc/dreamcast
+# Base name shared by the generated binaries and disc images
+TARGET      := ClassiCube-dc
+# Extra libraries handed to the link step
+LIBS        := -lm -lppp -lkosfat
+# Extra header search paths
+INCLUDES    := -Ithird_party/bearssl/inc
+
+
+#---------------------------------------------------------------------------------
+# Code generation
+#---------------------------------------------------------------------------------
+S_FILES  := $(wildcard $(addsuffix /*.S,$(SOURCE_DIRS)))
+C_FILES  := $(wildcard $(addsuffix /*.c,$(SOURCE_DIRS)))
+OBJS     := $(addprefix $(BUILD_DIR)/,$(notdir $(patsubst %.c,%.o,$(C_FILES)) $(patsubst %.S,%.o,$(S_FILES))))
+CFLAGS   := -g -DNDEBUG -O3 -fipa-pta -fno-pie -flto=auto -fomit-frame-pointer -fbuiltin -ffast-math -ffp-contract=fast -mfsrra -mfsca -pipe -fno-math-errno
+LDFLAGS  := -g
+
+# Per-object dependency files (kept recursive, since $@ and $* only exist inside a recipe)
+DEPFLAGS = -MT $@ -MMD -MP -MF $(BUILD_DIR)/$*.d
+DEPFILES := $(OBJS:.o=.d)
+
+
+#---------------------------------------------------------------------------------
+# Main targets (default/clean are phony; clean tolerates missing outputs via rm -f)
+#---------------------------------------------------------------------------------
+default: $(BUILD_DIR) $(TARGET).cdi
+.PHONY: default clean
+clean:
+	rm -f $(TARGET).cdi $(TARGET).iso $(TARGET).elf $(TARGET)-scr.bin $(TARGET).bin $(OBJS) $(DEPFILES)
+
+$(BUILD_DIR):
+	mkdir -p $(BUILD_DIR)
+
+
+#---------------------------------------------------------------------------------
+# Executable generation
+#---------------------------------------------------------------------------------
+$(TARGET).elf: $(OBJS)
+	kos-cc $(LDFLAGS) $^ -o $@ $(LIBS)
+
+$(TARGET).bin: $(TARGET).elf
+	sh-elf-objcopy -R .stack -O binary $< $@
+
+# Scrambled copy of the raw binary, see https://dcemulation.org/phpBB/viewtopic.php?t=105269
+$(TARGET)-scr.bin: $(TARGET).bin
+	$(KOS_BASE)/utils/scramble/scramble $< $@
+	
+# Stages the disc contents in ISO_FILES and masters them with mkisofs:
+#  the scrambled binary becomes 1ST_READ.BIN, classicube.zip the default texture pack.
+# classicube.zip and IP.BIN are prerequisites so that editing either one rebuilds the ISO
+$(TARGET).iso: $(TARGET)-scr.bin misc/dreamcast/classicube.zip misc/dreamcast/IP.BIN
+	mkdir -p ISO_FILES/audio ISO_FILES/maps ISO_FILES/texpacks ISO_FILES/texturecache
+	cp $< ISO_FILES/1ST_READ.BIN
+	cp misc/dreamcast/classicube.zip ISO_FILES/texpacks/default.zip
+	cp misc/dreamcast/IP.BIN IP.BIN
+	mkisofs -G IP.BIN -C 0,11702 -J -l -r -quiet -o $@ ISO_FILES
+# genisoimage -V ClassiCube -G IP.BIN -joliet -rock -l -o $(TARGET).iso ISO_FILES
+
+# Converts the ISO into the final .cdi disc image
+$(TARGET).cdi: $(TARGET).iso
+	cdi4dc $< $@
+
+
+#---------------------------------------------------------------------------------
+# Object generation (BUILD_DIR is order-only: it must exist first, its timestamp is ignored)
+#---------------------------------------------------------------------------------
+$(BUILD_DIR)/%.o: src/%.c | $(BUILD_DIR)
+	kos-cc $(CFLAGS) $(INCLUDES) $(DEPFLAGS) -c $< -o $@
+
+$(BUILD_DIR)/%.o: third_party/bearssl/src/%.c | $(BUILD_DIR)
+	kos-cc $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+$(BUILD_DIR)/%.o: misc/dreamcast/%.S | $(BUILD_DIR)
+	kos-cc $(DEPFLAGS) -c $< -o $@
+
+
+#---------------------------------------------------------------------------------
+# Dependency tracking: pulls in the .d files written by compiles that use DEPFLAGS
+#---------------------------------------------------------------------------------
+$(DEPFILES): # declares every .d file as a target that has no recipe
+
+include $(wildcard $(DEPFILES)) # wildcard: only .d files that already exist are included
--- a/misc/dreamcast/VertexClip.S
+++ b/misc/dreamcast/VertexClip.S
@@ -0,0 +1,196 @@
+! r1 = v1,    CLOBBERS
+! r2 = v2,    CLOBBERS
+! r3 = OUT,   CLOBBERS
+! r4 = TMP 2, preserved
+! r5 = CLO,   preserved
+! r0 = TMP 1, CLOBBERS
+
+! FR0  = 0
+! FR1  = 0
+! FR2  = A.1
+! FR3  = B.1
+! FR4  = 0
+! FR5  = 0
+! FR6  = A.2
+! FR7  = B.2
+! FR8  = 0
+! FR9  = 0
+! FR10 = invT
+! FR11 = t
+
+#define TM1 r0 // CLOBBERED, temp register 1
+#define TM2 r4 // PRESERVED, temp register 2
+#define CLO r5 // PRESERVED, output colour
+
+#define IN1 r1 // CLOBBERED, input vertex 1
+#define IN2 r2 // CLOBBERED, input vertex 2
+#define OUT r3 // CLOBBERED, output vertex
+
+#define CL1 r1 // CLOBBERED, input colour 1
+#define CL2 r2 // CLOBBERED, input colour 2
+
+! Calculates the near plane intersection point between two points:
+!    float t  = fabsf(v1->z) / fabsf(v2->z - v1->z)
+!    float invt = 1.0f - t;
+!    
+!    out->x = invt * v1->x + t * v2->x;
+!    out->y = invt * v1->y + t * v2->y;
+!    out->z = 0.0f; // clipped against near plane anyways (I.e Z/W = 0 --> Z = 0)
+!    
+!    out->u = invt * v1->u + t * v2->u;
+!    out->v = invt * v1->v + t * v2->v;
+!    out->w = invt * v1->w + t * v2->w;
+!    
+!    out->b = invt * v1->b + t * v2->b;
+!    out->g = invt * v1->g + t * v2->g;
+!    out->r = invt * v1->r + t * v2->r;
+!    out->a = invt * v1->a + t * v2->a;
+! To optimise these calculations, FIPR is used:
+!   FIPR = FVm.x*FVn.x + FVm.y*FVn.y + FVm.z*FVn.z + FVm.w*FVn.w --> FVn.w
+! FIPR can be used to accomplish "vout->Q = invt * v1->Q + t * v2->Q" by:
+!   - assigning x/y components to 0 for both vectors
+!   - assigning invT and t to z/w of FVm vector
+!   - assigning v1 and v2  to z/w  of FVn vector
+!   FIPR = 0*0 + 0*0 + invT*v1->Q + t*v2->Q --> FVn.w
+!   FIPR = invT*v1->Q + t*v2->Q --> FVn.w
+
+.global _ClipLine
+.align 4
+_ClipLine:
+	mov.l    r4,@-r15 ! LS, push(r4), r4 is used as temp register 2 below
+	mov.l    r5,@-r15 ! LS, push(r5), r5 is used as the output colour below
+	mov      IN1, TM1 ! MT, tmp  = &v1
+	fldi0    fr4      ! LS, fr4  = 0
+	add      #12, TM1 ! EX, tmp  = &v1->z
+	fmov.s  @TM1, fr2 ! LS, fr2  = v1->z
+	mov      IN2, TM1 ! MT, tmp  = &v2
+	fldi0    fr5      ! LS, fr5  = 0
+	add      #12, TM1 ! EX, tmp  = &v2->z
+	fmov.s  @TM1,fr11 ! LS, fr11 = v2->z
+	fsub     fr2,fr11 ! FE, fr11 = v2->z - v1->z
+	fldi0    fr8      ! LS, fr8  = 0
+	fmul    fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z - v1->z)
+	fldi0    fr9      ! LS, fr9  = 0
+	fldi0    fr0      ! LS, fr0  = 0
+	fldi0    fr1      ! LS, fr1  = 0
+	fsrra   fr11      ! FE, fr11 = 1 / sqrt(fr11) = 1 / abs(v2->z - v1->z)
+	fabs     fr2      ! LS, fr2  = abs(v1->z)
+	fmul     fr2,fr11 ! FE, fr11 = abs(v1->z) / abs(v2->z - v1->z)  --> t
+	add       #4, IN1 ! EX, v1   += 4 (now at &v1->x)
+	fldi1   fr10      ! LS, fr10 = 1
+	add       #4, IN2 ! EX, v2   += 4 (now at &v2->x)
+	add       #4, OUT ! EX, OUT  += 4 (now at &OUT->x)
+	fsub    fr11,fr10 ! FE, fr10 = 1.0 - t  --> invT
+	
+	fmov.s @IN1+, fr2 ! LS, A1 = v1->x, v1 += 4
+	fmov.s @IN2+, fr3 ! LS, B1 = v2->x, v2 += 4
+	fipr     fv8, fv0 ! FE, LERP(A1, B1) --> fr3
+	fmov.s @IN1+, fr6 ! LS, A2 = v1->y, v1 += 4
+	fmov.s @IN2+, fr7 ! LS, B2 = v2->y, v2 += 4
+
+	fmov.s   fr3,@OUT ! LS, OUT->x = LERP
+	add       #4, OUT ! EX, OUT += 4
+	fipr     fv8, fv4 ! FE, LERP(A2, B2) --> fr7
+	add       #4, IN1 ! EX, v1 += 4 (skips z, now at &v1->u)
+	add       #4, IN2 ! EX, v2 += 4 (skips z, now at &v2->u)
+
+	fmov.s   fr7,@OUT ! LS, OUT->y = LERP
+	add       #4, OUT ! EX, OUT += 4
+	fmov.s   fr1,@OUT ! LS, OUT->z = 0 (fr1 still holds 0)
+	add       #4, OUT ! EX, OUT += 4
+	
+	fmov.s @IN1+, fr2 ! LS, A1 = v1->u, v1 += 4
+	fmov.s @IN2+, fr3 ! LS, B1 = v2->u, v2 += 4
+	fipr     fv8, fv0 ! FE, LERP(A1, B1) --> fr3
+	fmov.s @IN1+, fr6 ! LS, A2 = v1->v, v1 += 4
+	fmov.s @IN2+, fr7 ! LS, B2 = v2->v, v2 += 4
+
+	fmov.s   fr3,@OUT ! LS, OUT->u = LERP
+	add       #4, OUT ! EX, OUT += 4
+	fipr     fv8, fv4 ! FE, LERP(A2, B2) --> fr7
+	add       #4, IN1 ! EX, v1 += 4 (skips bgra, now at &v1->w)
+	add       #4, IN2 ! EX, v2 += 4 (skips bgra, now at &v2->w)
+	fmov.s  @IN1,fr2  ! LS, A1 = v1->w
+	fmov.s  @IN2,fr3  ! LS, B1 = v2->w
+	fmov.s   fr7,@OUT ! LS, OUT->v = LERP
+	add       #8, OUT ! EX, OUT += 8 (skips colour, now at &OUT->w)
+	
+	fipr     fv8, fv0 ! FE, LERP(A1, B1) --> fr3
+	add      #-4, IN1 ! EX, v1 -= 4 (back to &v1->bgra)
+	add      #-4, IN2 ! EX, v2 -= 4 (back to &v2->bgra)
+	fmov.s   fr3,@OUT ! LS, OUT->w = lerp
+	add      #-4, OUT ! EX, OUT -= 4 (back to the colour field of OUT)
+	
+	mov.l  @IN1,CL1   ! LS, ACOLOR = v1->bgra
+	mov.l  @IN2,CL2   ! LS, BCOLOR = v2->bgra
+! Bypass interpolation if unnecessary
+	cmp/eq  CL1,CL2   ! MT, T = ACOLOR == BCOLOR
+	bt.s    1f        ! BR, if (T) goto 1;
+	mov     CL1,CLO   ! MT, OUTCOLOR = ACOLOR (branch delay instruction)
+! Interpolate B
+	extu.b  CL1,TM1   ! EX, val = ACOLOR.b
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr2  ! EX, fr2 = float(FPUL)
+	extu.b  CL2,TM1   ! EX, val = BCOLOR.b
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr3  ! EX, fr3 = float(FPUL)
+	fipr    fv8, fv0  ! FE, LERP(A1, B1)
+	shlr8   CL1       ! EX, ACOLOR >>= 8
+	ftrc    fr3,fpul  ! FE, FPUL = int(lerp)
+	shlr8   CL2       ! EX, BCOLOR >>= 8
+	sts     fpul,TM2  ! CO, tmp = FPUL
+! Interpolate G
+	extu.b  CL1,TM1   ! EX, val = ACOLOR.g
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr2  ! EX, fr2 = float(FPUL)
+	extu.b  CL2,TM1   ! EX, val = BCOLOR.g
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr3  ! EX, fr3 = float(FPUL)
+	fipr    fv8, fv0  ! FE, LERP(A1, B1)
+	shlr8   CL1       ! EX, ACOLOR >>= 8
+	ftrc    fr3,fpul  ! FE, FPUL = int(lerp)
+	extu.b  TM2,TM2   ! EX, tmp = (uint8)tmp
+	mov     TM2,CLO   ! MT, OUTCOLOR.b = tmp
+	shlr8   CL2       ! EX, BCOLOR >>= 8
+	sts     fpul,TM2  ! CO, tmp = FPUL
+! Interpolate R
+	extu.b  CL1,TM1   ! EX, val = ACOLOR.r
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr2  ! EX, fr2 = float(FPUL)
+	extu.b  CL2,TM1   ! EX, val = BCOLOR.r
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr3  ! EX, fr3 = float(FPUL)
+	fipr    fv8, fv0  ! FE, LERP(A1, B1)
+	shlr8   CL1       ! EX, ACOLOR >>= 8
+	ftrc    fr3,fpul  ! FE, FPUL = int(lerp)
+	extu.b  TM2,TM2   ! EX, tmp = (uint8)tmp
+	shll8   TM2       ! EX, tmp <<= 8
+	or      TM2,CLO   ! EX, OUTCOLOR.g |= tmp
+	shlr8   CL2       ! EX, BCOLOR >>= 8
+	sts     fpul,TM2  ! CO, tmp = FPUL
+! Interpolate A
+	extu.b  CL1,TM1   ! EX, val = ACOLOR.a
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr2  ! EX, fr2 = float(FPUL)
+	extu.b  CL2,TM1   ! EX, val = BCOLOR.a
+	lds     TM1,fpul  ! CO, FPUL = val
+	float   fpul,fr3  ! EX, fr3 = float(FPUL)
+	fipr    fv8, fv0  ! FE, LERP(A1, B1)
+	ftrc    fr3,fpul  ! FE, FPUL = int(lerp)
+	extu.b  TM2,TM2   ! EX, tmp = (uint8)tmp
+	shll16  TM2       ! EX, tmp <<= 16
+	or      TM2,CLO   ! EX, OUTCOLOR.r |= tmp
+	sts     fpul,TM2  ! CO, tmp = FPUL
+	extu.b  TM2,TM2   ! EX, tmp = (uint8)tmp
+	shll16  TM2       ! EX, tmp <<= 16
+	shll8   TM2       ! EX, tmp <<= 8
+	or      TM2,CLO   ! EX, OUTCOLOR.a |= tmp
+
+1:
+	mov.l  CLO,@OUT   ! LS, OUT->color = OUTCOLOR
+	mov.l   @r15+,r5  ! LS, pop(r5)
+	rts               ! CO, return after executing instruction in delay slot
+	mov.l   @r15+,r4  ! LS, pop(r4)
+
+.size _ClipLine, .-_ClipLine
+.type _ClipLine,%function
--- a/misc/dreamcast/VertexClip2.S
+++ b/misc/dreamcast/VertexClip2.S
@@ -0,0 +1,210 @@
+! Calculates vertex as the near plane intersection point between two points:
+!    float t = fabsf(v1->z) / fabsf(v2->z - v1->z)
+!    float w = (1 - t) * v1->w + t * v2->w;
+!    
+!    out->c = type << 24
+!    out->x = ((1 - t) * v1->x + t * v2->x) * 1/w
+!    out->y = ((1 - t) * v1->y + t * v2->y) * 1/w
+!    out->w = 1/w  
+!    
+!    out->u = (1 - t) * v1->u + t * v2->u;
+!    out->v = (1 - t) * v1->v + t * v2->v;
+!    
+!    out->b = (1 - t) * v1->b + t * v2->b;
+!    out->g = (1 - t) * v1->g + t * v2->g;
+!    out->r = (1 - t) * v1->r + t * v2->r;
+!    out->a = (1 - t) * v1->a + t * v2->a;
+
+! Optimisations:
+! - w always ends up being zNear
+! - Calculations of (1 - t) * v1 + t * v2 can be rearranged to t * (v2 - v1) + v1
+! - These rearranged calculations can then take advantage of FMAC 
+
+! Final calculation:
+!    out->c = type << 24
+!    out->x = (t * (v2->x - v1->x) + v1->x) * 1/zNear
+!    out->y = (t * (v2->y - v1->y) + v1->y) * 1/zNear
+!    out->w = 1/zNear  
+!    
+!    out->u = t * (v2->u - v1->u) + v1->u;
+!    out->v = t * (v2->v - v1->v) + v1->v;
+!    
+!    out->b = t * (v2->b - v1->b) + v1->b;
+!    out->g = t * (v2->g - v1->g) + v1->g;
+!    out->r = t * (v2->r - v1->r) + v1->r;
+!    out->a = t * (v2->a - v1->a) + v1->a;
+
+! INPUT ARGUMENTS
+#define IN1 r4 // input vertex 1
+#define IN2 r5 // input vertex 2
+#define OUT r6 // output vertex
+#define TYP r7 // type/flags for output vertex
+
+#define TM1 r1 // temp register 1
+#define TM2 r3 // temp register 2
+#define CL1 r4 // input colour 1
+#define CL2 r5 // input colour 2
+#define CLO r7 // output colour
+
+#define F_T  fr0
+#define F_W  fr1
+#define F_X1 fr2
+#define F_X2 fr3
+#define F_Y1 fr4
+#define F_Y2 fr5
+#define F_U1 fr6
+#define F_U2 fr7
+#define F_V1 fr8
+#define F_V2 fr9
+#define F_Z1 fr10
+#define F_Z2 fr11
+#define Ftmp fr11
+
+.global _ClipEdge
+.align 4
+_ClipEdge:
+	fschg              ! FE (swap to 32 bit FPU loads/stores)
+! Start calculating interpolation factor
+	add      #28, IN1  ! EX, IN1 = &v1->z
+	mov.l _NEAR_CLIP_W,TM1 ! LS, tmp = invW (1/zNear)
+	fmov.s  @IN1, F_Z1 ! LS, Z1  = v1->z
+	add      #28, IN2  ! EX, IN2 = &v2->z
+	fmov.s  @IN2, F_Z2 ! LS, Z2  = v2->z
+	shll16   TYP       ! EX, TYP <<= 16
+	fsub    F_Z1, Ftmp ! FE, tmp = v2->z - v1->z
+! Load W
+	lds 	 TM1,fpul  ! LS, FPUL = invW (1/zNear)
+	add     #-24, IN1  ! EX, IN1  = &v1->x
+	fsts     fpul,F_W  ! LS, invW = FPUL
+	add     #-24, IN2  ! EX, IN2  = &v2->x
+! Finish calculating interpolation factor
+	shll8    TYP       ! EX, TYP <<= 8 (type << 24 in total)
+	fmul    Ftmp,Ftmp  ! FE, tmp = (v2->z - v1->z) * (v2->z - v1->z)
+	mov.l  TYP,@OUT    ! LS, dst->cmd = TYPE
+
+! Load components
+	fmov.s @IN1+, F_X1 ! LS, X1 = v1->x
+	fmov.s @IN2+, F_X2 ! LS, X2 = v2->x
+	fmov.s @IN1+, F_Y1 ! LS, Y1 = v1->y
+	fmov.s @IN2+, F_Y2 ! LS, Y2 = v2->y
+	fsrra   Ftmp       ! FE, tmp = 1 / sqrt(tmp) = 1 / abs(v2->z - v1->z)
+	add       #4, IN1  ! EX, skip over W
+	fabs    F_Z1       ! LS, z1 = abs(v1->z)
+	add       #4, IN2  ! EX, skip over W
+	fmov.s @IN1+, F_U1 ! LS, U1 = v1->u
+	fmov.s @IN2+, F_U2 ! LS, U2 = v2->u
+	fmov.s @IN1+, F_V1 ! LS, V1 = v1->v
+	fmul    F_Z1,Ftmp  ! FE, tmp = abs(v1->z) / abs(v2->z - v1->z)
+	fmov.s @IN2+, F_V2 ! LS, V2 = v2->v
+
+! Interpolate vertices
+	fsub    F_X1, F_X2 ! FE, X2 = X2 - X1
+	fsub    F_Y1, F_Y2 ! FE, Y2 = Y2 - Y1
+	fsub    F_U1, F_U2 ! FE, U2 = U2 - U1
+	fmov   Ftmp, F_T   ! LS,   T  = tmp
+	fsub    F_V1, F_V2 ! FE, V2 = V2 - V1
+
+	fmac F_T,F_X2,F_X1 ! FE, X = T * (X2 - X1) + X1
+	fmac F_T,F_Y2,F_Y1 ! FE, Y = T * (Y2 - Y1) + Y1
+	fmac F_T,F_U2,F_U1 ! FE, U = T * (U2 - U1) + U1
+	fmac F_T,F_V2,F_V1 ! FE, V = T * (V2 - V1) + V1
+	
+! Adjust by w
+	fmul	 F_W, F_X1 ! FE, x = invW * x
+	fmul	 F_W, F_Y1 ! FE, y = invW * y
+
+! Load colours and check if equal
+	mov.l  @IN1,CL1    ! LS, ACOLOR = v1->bgra
+	mov.l  @IN2,CL2    ! LS, BCOLOR = v2->bgra
+	cmp/eq  CL1,CL2    ! MT, T = ACOLOR == BCOLOR
+	add		 #28,  OUT ! EX, dst = &dst->padding
+	
+! Bypass RGBA interpolation if unnecessary
+	bt.s    1f         ! BR, if (T) goto 1;
+	mov     CL1,CLO    ! MT, OUTCOLOR = ACOLOR (branch delay instruction)
+
+! Interpolate B
+	extu.b  CL1,TM1    ! EX, val  = ACOLOR.b
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z1  ! FE,  C1  = float(val)
+	extu.b  CL2,TM1    ! EX, val  = BCOLOR.b
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z2  ! FE,  C2  = float(val)
+	fsub    F_Z1, F_Z2 ! FE,  C2  = C2 - C1
+	fmac F_T,F_Z2,F_Z1 ! FE,  C   = T * (C2 - C1) + C1
+	shlr8   CL1        ! EX, ACOLOR >>= 8
+	shlr8   CL2        ! EX, BCOLOR >>= 8
+	ftrc    F_Z1,fpul  ! FE, FPUL = int(C)
+	sts     fpul,TM2   ! LS, tmp  = FPUL
+
+! Interpolate G
+	extu.b  CL1,TM1    ! EX, val  = ACOLOR.g
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z1  ! FE,  C1  = float(val)
+	extu.b  CL2,TM1    ! EX, val  = BCOLOR.g
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z2  ! FE,  C2  = float(val)
+	fsub    F_Z1, F_Z2 ! FE,  C2  = C2 - C1
+	fmac F_T,F_Z2,F_Z1 ! FE,  C   = T * (C2 - C1) + C1
+	shlr8   CL1        ! EX, ACOLOR >>= 8
+	extu.b  TM2,TM2    ! EX, tmp  = (uint8)tmp
+	mov     TM2,CLO    ! MT, OUTCOLOR.b = tmp
+	shlr8   CL2        ! EX, BCOLOR >>= 8
+	ftrc    F_Z1,fpul  ! FE, FPUL = int(C)
+	sts     fpul,TM2   ! LS, tmp  = FPUL
+
+! Interpolate R
+	extu.b  CL1,TM1    ! EX, val  = ACOLOR.r
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z1  ! FE,  C1  = float(val)
+	extu.b  CL2,TM1    ! EX, val  = BCOLOR.r
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z2  ! FE,  C2  = float(val)
+	fsub    F_Z1, F_Z2 ! FE,  C2  = C2 - C1
+	fmac F_T,F_Z2,F_Z1 ! FE,  C   = T * (C2 - C1) + C1
+	shlr8   CL1        ! EX, ACOLOR >>= 8
+	extu.b  TM2,TM2    ! EX, tmp  = (uint8)tmp
+	shll8   TM2        ! EX, tmp <<= 8
+	or      TM2,CLO    ! EX, OUTCOLOR.g |= tmp
+	shlr8   CL2        ! EX, BCOLOR >>= 8
+	ftrc    F_Z1,fpul  ! FE, FPUL = int(C)
+	sts     fpul,TM2   ! LS, tmp  = FPUL
+	
+! Interpolate A
+	extu.b  CL1,TM1    ! EX, val  = ACOLOR.a
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z1  ! FE,  C1  = float(val)
+	extu.b  CL2,TM1    ! EX, val  = BCOLOR.a
+	lds     TM1,fpul   ! LS, FPUL = val
+	float   fpul,F_Z2  ! FE,  C2  = float(val)
+	fsub    F_Z1, F_Z2 ! FE,  C2  = C2 - C1
+	fmac F_T,F_Z2,F_Z1 ! FE,  C   = T * (C2 - C1) + C1
+	extu.b  TM2,TM2    ! EX, tmp  = (uint8)tmp
+	shll16  TM2        ! EX, tmp <<= 16
+	or      TM2,CLO    ! EX, OUTCOLOR.r |= tmp
+	ftrc    F_Z1,fpul  ! FE, FPUL = int(C)
+	sts     fpul,TM2   ! LS, tmp  = FPUL
+	extu.b  TM2,TM2    ! EX, tmp  = (uint8)tmp
+	shll16  TM2        ! EX, tmp <<= 16
+	shll8   TM2        ! EX, tmp <<= 8
+	or      TM2,CLO    ! EX, OUTCOLOR.a |= tmp
+
+1:
+! Store output	
+	mov.l    CLO,@-OUT ! LS, dst->color = OUTCOLOR
+	fmov.s  F_V1,@-OUT ! LS, dst->v = v	
+	fmov.s  F_U1,@-OUT ! LS, dst->u = u
+	fmov.s  F_W ,@-OUT ! LS, dst->w = invW
+	fmov.s  F_Y1,@-OUT ! LS, dst->y = y	
+	fmov.s  F_X1,@-OUT ! LS, dst->x = x
+
+	fschg              ! FE (swap to 64 bit FPU loads/stores)
+	rts                ! CO, return after executing instruction in delay slot
+	pref   @OUT        ! LS, (delay slot) trigger store queue flush
+.size _ClipEdge, .-_ClipEdge
+.type _ClipEdge, %function
+
+.align 4
+_NEAR_CLIP_W:
+        .float 0 ! value loaded by _ClipEdge as invW (1/zNear); assembled as 0
+.global _NEAR_CLIP_W ! exported; presumably assigned from outside this file - TODO confirm
--- a/misc/dreamcast/VertexDraw.S
+++ b/misc/dreamcast/VertexDraw.S
@@ -0,0 +1,611 @@
+! r8  = STORE_QUEUE
+! r9  = num vertices left
+! r10 = PVR_CMD_VERTEX
+! r11 = PVR_CMD_VERTEX_EOL
+! r12 = ClipLine function
+! r13 = cur vertex (the _Case_ routines also use it to hold pr across ClipLine calls)
+! r14 = next vertex (prefetch)
+
+#define R_VTX        r10
+#define R_EOL        r11
+#define REG_CMD_VTX  r10
+#define REG_CMD_EOL  r11
+#define REG_CLIPFUNC r12
+
+
+.align 4
+
+! Copies a 32 byte vertex to the store queue as is (no transformation)
+!   CLOBBERS: r2
+!   INPUTS:   R (vertex), r8 (SQ global)
+!   OUTPUTS:  r8 += 32 (R is left unchanged)
+.macro PushVertex R
+	! memcpy(r8, \R, 32), one 32 bit word at a time through r2
+	mov.l   @(0,\R), r2
+	mov.l   r2, @(0,r8)
+	mov.l   @(4,\R), r2
+	mov.l   r2, @(4,r8)
+	mov.l   @(8,\R), r2
+	mov.l   r2, @(8,r8)
+	mov.l   @(12,\R),r2
+	mov.l   r2,@(12,r8)
+	mov.l   @(16,\R),r2
+	mov.l   r2,@(16,r8)
+	mov.l   @(20,\R),r2
+	mov.l   r2,@(20,r8)
+	mov.l   @(24,\R),r2
+	mov.l   r2,@(24,r8)
+	mov.l   @(28,\R),r2
+	mov.l   r2,@(28,r8) 
+	pref    @r8         ! LS, Trigger SQ
+	add     #32,r8      ! EX, SQ += 32
+.endm
+
+! Transforms then pushes a vertex to the store queue
+! note: Vertices are assumed as pre viewport transformed already
+!   CLOBBERS: r2, fr0, fr4
+!   INPUTS:   R (vertex), r8 (SQ global)
+!   OUTPUTS:  r8 += 32; R is modified during the macro but restored before it ends
+.macro TransformVertex R
+! INVERSE W CALCULATION
+    add #28, \R       ! EX, SRC += 28
+    fmov.s  @\R,fr0   ! LS, fr0 = v->w
+    fmul    fr0,fr0   ! FE, fr0 = fr0 * fr0
+    add #-28, \R      ! EX, SRC -= 28
+	mov.l   @\R+, r2  ! LS, tmp = SRC->flags, SRC += 4
+	mov.l   r2,@r8    ! LS, DST->flags = tmp
+    fsrra   fr0       ! FE, invW = 1 / sqrt(SRC->W * SRC->W)
+    add    #4, r8     ! EX, DST += 4
+
+! COPY U,V
+	mov.l @(12,\R),r2 ! LS, tmp = SRC->u
+	mov.l r2,@(12,r8) ! LS, DST->u = tmp
+	mov.l @(16,\R),r2 ! LS, tmp = SRC->v
+	mov.l r2,@(16,r8) ! LS, DST->v = tmp
+
+! TRANSFORM X
+    fmov.s @\R,fr4    ! LS, fr4 = SRC->x
+    fmul   fr0,fr4    ! FE, fr4 = invW * SRC->x
+	mov.l @(20,\R),r2 ! LS, tmp = SRC->bgra
+	mov.l r2,@(20,r8) ! LS, DST->bgra = tmp
+    add    #4, \R     ! EX, SRC += 4
+    fmov.s fr4,@r8    ! LS, DST->x = fr4
+
+! TRANSFORM Y
+    fmov.s @\R,fr4    ! LS, fr4  = SRC->y
+    add    #8, r8     ! EX, DST += 8
+    fmul   fr0,fr4    ! FE, fr4 = invW * SRC->y
+    fmov.s fr0,@r8    ! LS, DST->z = invW
+    add   #-4, r8     ! EX, DST -= 4
+    add   #-8, \R     ! EX, src -= 8 (back to start of vertex)
+    fmov.s fr4,@r8    ! LS, DST->y = fr4
+
+    add   #-8,r8      ! EX, DST -= 8 (back to start of vertex)	
+	pref    @r8       ! LS, Trigger SQ
+	add     #32,r8    ! EX, SQ += 32
+.endm
+
+
+#define REG_CLIP1 r1
+#define REG_CLIP2 r2
+
+#define REG_V0 r4
+#define REG_V1 r5
+#define REG_V2 r6
+#define REG_V3 r7
+
+! r3 also matches out parameter for ClipLine
+#define REG_TMP r3
+#define TMP_SET_A \
+	mov r15, REG_TMP
+
+#define TMP_SET_B \
+	mov r15, REG_TMP; add #32, REG_TMP
+
+
+_Case_0_0_0_1: ! only v0 visible, pushes v0, B, A
+	!          v0
+	!         / |
+	!       /   |
+	! .....A....B...
+	!    /      |
+	!  v3--v2---v1
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V3, REG_CLIP1
+	mov    REG_V0, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v3, v0)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX_EOL
+
+	TMP_SET_B
+	mov    REG_V0, REG_CLIP1
+	mov    REG_V1, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v0, v1)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX
+
+	TransformVertex REG_V0
+	TMP_SET_B
+	TransformVertex REG_TMP
+	TMP_SET_A
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_0_0_1_0: ! only v1 visible, pushes A, v1, B
+	!          v1
+	!         / |
+	!       /   |
+	! ....A.....B...
+	!    /      |
+	!  v0--v3---v2
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V0, REG_CLIP1
+	mov    REG_V1, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v0, v1)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V1, REG_CLIP1
+	mov    REG_V2, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v1, v2)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX_EOL
+
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TransformVertex REG_V1
+	TMP_SET_B
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_0_1_0_0: ! only v2 visible, pushes A, v2, B
+ 	!          v2
+	!         / |
+	!       /   |
+	! ....A.....B...
+	!    /      |
+	!  v1--v0---v3
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V1, REG_CLIP1
+	mov    REG_V2, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v1, v2)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V2, REG_CLIP1
+	mov    REG_V3, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v2, v3)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX_EOL
+
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TransformVertex REG_V2
+	TMP_SET_B
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_1_0_0_0: ! only v3 visible, pushes B, A, v3 (v3 keeps its own flags)
+	!          v3
+	!         / |
+	!       /   |
+	! ....A.....B...
+	!    /      |
+	!  v2--v1---v0
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V2, REG_CLIP1
+	mov    REG_V3, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v2, v3)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V3, REG_CLIP1
+	mov    REG_V0, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v3, v0)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	TransformVertex REG_TMP
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TransformVertex REG_V3
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+
+_Case_0_0_1_1: ! v0 and v1 visible, pushes v1, A, v0, B
+	!    v0-----------v1
+	!      \           |
+	!   ....B..........A...
+	!         \        |
+	!          v3-----v2
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V1, REG_CLIP1
+	mov    REG_V2, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v1, v2)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V3, REG_CLIP1
+	mov    REG_V0, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v3, v0)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX_EOL
+
+	TransformVertex REG_V1
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TransformVertex REG_V0
+	TMP_SET_B
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_1_0_0_1: ! v3 and v0 visible, pushes A, B, v0, v3 (v3 keeps its own flags)
+	!    v3-----------v0
+	!      \           |
+	!   ....B..........A...
+	!         \        |
+	!          v2-----v1
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V0, REG_CLIP1
+	mov    REG_V1, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v0, v1)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V2, REG_CLIP1
+	mov    REG_V3, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v2, v3)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX
+
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TMP_SET_B
+	TransformVertex REG_TMP
+	TransformVertex REG_V0
+	TransformVertex REG_V3
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_0_1_1_0: ! v1 and v2 visible, pushes v1, v2, B, A
+	!    v1-----------v2
+	!      \           |
+	!   ....B..........A...
+	!         \        |
+ 	!          v0-----v3
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V2, REG_CLIP1
+	mov    REG_V3, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v2, v3)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX_EOL
+
+	TMP_SET_B
+	mov    REG_V0, REG_CLIP1
+	mov    REG_V1, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v0, v1)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX
+
+	TransformVertex REG_V1
+	TransformVertex REG_V2
+	TMP_SET_B
+	TransformVertex REG_TMP
+	TMP_SET_A
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_1_1_0_0: ! v2 and v3 visible, pushes B, v2, A, v3 (v3 keeps its own flags)
+	!    v2-----------v3
+	!      \           |
+	!   ....B..........A...
+	!         \        |
+	!          v1-----v0
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V3, REG_CLIP1
+	mov    REG_V0, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v3, v0)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V1, REG_CLIP1
+	mov    REG_V2, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v1, v2)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	TransformVertex REG_TMP
+	TransformVertex REG_V2
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TransformVertex REG_V3
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_0_1_1_1: ! only v3 clipped, pushes v1, v2, v0, A, B
+	!        --v1--
+	!    v0--      --v2
+	!      \        |
+	!   .....B.....A...
+	!          \   |
+	!            v3
+	! v1,v2,v0  v2,v0,A  v0,A,B
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V2, REG_CLIP1
+	mov    REG_V3, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v2, v3)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V3, REG_CLIP1
+	mov    REG_V0, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v3, v0)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX_EOL
+
+	TransformVertex REG_V1
+	TransformVertex REG_V2
+	TransformVertex REG_V0
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TMP_SET_B
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_1_0_1_1: ! only v2 clipped, pushes v0, v1, v3, A, B
+	!        --v0--
+	!    v3--      --v1
+	!      \        |
+	!   .....B.....A...
+	!          \   |
+	!            v2
+	! v0,v1,v3  v1,v3,A  v3,A,B
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V1, REG_CLIP1
+	mov    REG_V2, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v1, v2)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V2, REG_CLIP1
+	mov    REG_V3, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v2, v3)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX_EOL
+	mov.l  REG_CMD_VTX, @REG_V3 ! v3 is no longer pushed last, overwrite v3->flags with PVR_CMD_VERTEX
+
+	TransformVertex REG_V0
+	TransformVertex REG_V1
+	TransformVertex REG_V3
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TMP_SET_B
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_1_1_0_1: ! only v1 clipped, pushes v3, v0, v2, A, B
+	!        --v3--
+	!    v2--      --v0
+	!      \        |
+	!   .....B.....A...
+	!          \   |
+	!            v1
+	! v3,v0,v2  v0,v2,A  v2,A,B
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V0, REG_CLIP1
+	mov    REG_V1, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v0, v1)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V1, REG_CLIP1
+	mov    REG_V2, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v1, v2)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX_EOL
+	mov.l  REG_CMD_VTX, @REG_V3 ! v3 is no longer pushed last, overwrite v3->flags with PVR_CMD_VERTEX
+
+	TransformVertex REG_V3
+	TransformVertex REG_V0
+	TransformVertex REG_V2
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TMP_SET_B
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_1_1_1_0: ! only v0 clipped, pushes v2, v3, v1, A, B
+	!        --v2--
+	!    v1--      --v3
+	!      \        |
+	!   .....B.....A...
+	!          \   |
+	!            v0
+	! v2,v3,v1  v3,v1,A  v1,A,B
+	sts    pr,r13 ! save return address in r13, the jsr calls below overwrite pr
+
+	TMP_SET_A
+	mov    REG_V3, REG_CLIP1
+	mov    REG_V0, REG_CLIP2
+	jsr @REG_CLIPFUNC ! A = ClipLine(v3, v0)
+	mov.l  REG_CMD_VTX, @REG_TMP ! (delay slot) A->flags = PVR_CMD_VERTEX
+
+	TMP_SET_B
+	mov    REG_V0, REG_CLIP1
+	mov    REG_V1, REG_CLIP2
+	jsr @REG_CLIPFUNC ! B = ClipLine(v0, v1)
+	mov.l  REG_CMD_EOL, @REG_TMP ! (delay slot) B->flags = PVR_CMD_VERTEX_EOL
+	mov.l  REG_CMD_VTX, @REG_V3 ! v3 is no longer pushed last, overwrite v3->flags with PVR_CMD_VERTEX
+
+	TransformVertex REG_V2
+	TransformVertex REG_V3
+	TransformVertex REG_V1
+	TMP_SET_A
+	TransformVertex REG_TMP
+	TMP_SET_B
+	TransformVertex REG_TMP
+
+	lds   r13,pr ! restore return address
+	rts
+	nop
+
+_Case_1_1_1_1: ! all 4 visible, no clipping and no calls (so pr is not saved)
+! Triangle strip: {1,2,0} {2,0,3}
+	TransformVertex REG_V1
+	TransformVertex REG_V2
+	TransformVertex REG_V0
+	TransformVertex REG_V3
+	rts
+	nop
+
+.global _ProcessVertexList
+.align 4
+_ProcessVertexList: ! args: r4 = first list entry, r5 = number of entries, r6 = store queue address
+! STORE CPU REGISTERS
+	mov.l    r8,@-r15
+	mov.l    r9,@-r15
+	mov.l   r10,@-r15
+	mov.l   r11,@-r15
+	mov.l   r12,@-r15
+	mov.l   r13,@-r15
+	mov.l   r14,@-r15
+	sts.l    pr,@-r15
+! REGISTER SETUP
+	mov      r4,r14       ! NEXT = first entry
+	mov      r4,r13       ! CUR  = first entry
+	mov.l  .CLIPFUNC,r12  ! r12 = address of ClipLine
+	mov.l  .PVR_EOL, r11  ! r11 = PVR_CMD_VERTEX_EOL
+	mov.l  .PVR_VTX, r10  ! r10 = PVR_CMD_VERTEX
+	mov      r5,r9        ! NUM = number of entries
+	mov      r6,r8        ! SQ  = store queue address
+	bra     SUBMIT_LOOP
+	add    #-64,r15       ! (delay slot) reserve 64 bytes of stack, two 32 byte scratch vertices for clipping output
+
+! Submits a PVR2 TA GPU command (pushes the current entry r13, whose flags were just tested; r4 is stale after any quad)
+DO_CMD:
+	PushVertex r13
+	bra     NEXT_ITER
+	nop
+
+SUBMIT_LOOP:
+	mov.l   @r13,r0   ! FLAGS = CUR->flags
+	add     #32,r14   ! NEXT += sizeof(Vertex)
+	mov     r0,r2     ! TYPE = FLAGS
+	and     r11,r2    ! TYPE = FLAGS & 0xF0000000 (reuse PVR_CMD_VERTEX_EOL as type mask)
+! Check for PVR_CMD_VERTEX
+	cmp/eq  r10,r2    ! T = r2 == PVR_CMD_VERTEX
+	bt.s    NEXT_ITER ! if (T) goto NEXT_ITER
+	pref    @r14      ! prefetch(NEXT) -- always executed
+! Check for non PVR_CMD_VERTEX_EOL
+	cmp/eq  r11,r2    ! T = r2 == PVR_CMD_VERTEX_EOL
+	bf.s    DO_CMD    ! if (!T) goto DO_CMD
+! PVR_CMD_VERTEX_EOL case
+	extu.b  r0,r1     ! EX, MASK = FLAGS & 0xFF (branch delay slot)
+
+! Prepare and then jump to quad drawing function, based on quad clipflags
+	mova    .CASES,r0   ! LS, r0 = CASES
+	mov     r13,r7      ! MT, r7 = v3
+	shll2   r1          ! EX, MASK <<= 2
+	mov     r13,r6      ! MT, r6 = v3
+	mov.l   @(r0,r1),r2 ! LS, r2 = CASES[MASK]
+	mov     r13,r5      ! MT, r5 = v3
+	add     #-32,r6     ! EX, r6 = v3 - 1 (v2)
+	mov     r13,r4      ! MT, r4 = v3
+	add     #-64,r5     ! EX, r5 = v3 - 2 (v1)
+	jsr     @r2         ! C0, jump CASES[MASK]
+	add     #-96,r4     ! EX, r4 = v3 - 3 (v0) (branch delay slot)
+NEXT_ITER:
+	dt r9               ! NUM--; T = NUM == 0
+	bf.s    SUBMIT_LOOP
+	mov     r14,r13     ! CUR = NEXT (branch delay slot)
+
+	add      #64,r15    ! release the 64 bytes of scratch stack
+! RESTORE CPU REGISTERS
+	lds.l   @r15+,pr
+	mov.l   @r15+,r14
+	mov.l   @r15+,r13
+	mov.l   @r15+,r12
+	mov.l   @r15+,r11
+	mov.l   @r15+,r10
+	mov.l   @r15+,r9
+	rts     
+	mov.l   @r15+,r8    ! (delay slot) last register restore
+.size _ProcessVertexList, .-_ProcessVertexList
+.type _ProcessVertexList, %function
+
+.align 4
+.VP_1: ! not referenced by any code in this file
+        .long   _vp
+.PVR_VTX: ! PVR_CMD_VERTEX, loaded into r10 by _ProcessVertexList
+        .long   0xE0000000
+.PVR_EOL: ! PVR_CMD_VERTEX_EOL, loaded into r11 (also used there as the command type mask)
+        .long   0xF0000000
+.CLIPFUNC: ! address of _ClipLine, loaded into r12 and called by the _Case_ routines
+        .long   _ClipLine
+
+BUGGY_CASE: ! handler for clip masks that should never occur, returns without pushing anything
+rts
+nop
+
+! CASES table holds the functions to transfer a quad,
+!  indexed by the visibility clipflags of the 4 vertices (bit N set = vN visible)
+!  e.g. CASES[15] = V0_VIS | V1_VIS | V2_VIS | V3_VIS (all 4 visible)
+.CASES:
+	.long   BUGGY_CASE ! 0: Should never happen
+	.long   _Case_0_0_0_1 ! 1: V0_VIS
+	.long   _Case_0_0_1_0 ! 2: V1_VIS
+	.long   _Case_0_0_1_1 ! 3: V0_VIS | V1_VIS
+	.long   _Case_0_1_0_0 ! 4: V2_VIS
+	.long   BUGGY_CASE ! 5: V0_VIS | V2_VIS, Should never happen
+	.long   _Case_0_1_1_0 ! 6: V1_VIS | V2_VIS
+	.long   _Case_0_1_1_1 ! 7: V0_VIS | V1_VIS | V2_VIS
+	.long   _Case_1_0_0_0 ! 8: V3_VIS
+	.long   _Case_1_0_0_1 ! 9: V0_VIS | V3_VIS
+	.long   BUGGY_CASE ! 10: V1_VIS | V3_VIS, Should never happen
+	.long   _Case_1_0_1_1 ! 11: V0_VIS | V1_VIS | V3_VIS
+	.long   _Case_1_1_0_0 ! 12: V2_VIS | V3_VIS
+	.long   _Case_1_1_0_1 ! 13: V0_VIS | V2_VIS | V3_VIS
+	.long   _Case_1_1_1_0 ! 14: V1_VIS | V2_VIS | V3_VIS
+	.long   _Case_1_1_1_1 ! 15: all 4 visible
--- a/misc/dreamcast/VertexTransform.S
+++ b/misc/dreamcast/VertexTransform.S
@@ -0,0 +1,230 @@
+#define FLG r0 // clip flags (kept in r0 because and #imm and cmp/eq #imm only accept r0)
+#define TMP r1 // temp (holds the T bit while it is shifted to its clip flag position)
+#define VTX r2 // PVR_CMD_VERTEX (0xE0000000, built in TransformSetup)
+#define EOS r3 // PVR_CMD_VERTEX_EOL (0xF0000000, built in TransformSetup)
+#define SRC r4 // src pointer ARG (post-incremented by every vertex attribute load)
+#define DST r5 // dst pointer ARG (pre-decremented by every vertex word store)
+#define CNT r6 // quads count ARG
+#define PFT r7 // prefetch address (advanced to the next source vertex before each pref)
+
+#define ZERO fr0 // 0.0 (compared against Z for the clip test)
+#define F_U  fr1 // vertex.u
+#define F_V  fr2 // vertex.v
+#define F_C  fr3 // vertex.colour
+#define F_X  fr4 // vertex.x
+#define F_Y  fr5 // vertex.y
+#define F_Z  fr6 // vertex.z
+#define F_W  fr7 // vertex.w
+
+#define XYZW fv4 // vertex.xyzw (vector register made of fr4..fr7, i.e. F_X, F_Y, F_Z, F_W)
+
+
+! =========================================================
+! ========================= TRANSFORM SETUP ===============
+! =========================================================
+.macro TransformSetup ! Shared prologue: prefetches the first vertex, biases DST and builds the VTX / EOS / ZERO constants
+    mov   SRC, PFT    ! MT, pft = src
+    add  #-32, DST    ! EX, dst -= sizeof(VERTEX) (each vertex load adds 64, then the 8 pre-decrement stores take 32 back off)
+    mov #0xE0, VTX    ! EX, VTX = 0xFF FF FF E0 (the 8 bit immediate is sign extended)
+    pref  @PFT        ! LS, PREFETCH pft (first vertex)
+    shll16 VTX        ! EX, VTX = 0xFF E0 00 00
+    shll8  VTX        ! EX, VTX = 0xE0 00 00 00 (PVR_CMD_VERTEX)
+    mov #0xF0, EOS    ! EX, EOS = 0xFF FF FF F0 (the 8 bit immediate is sign extended)
+    shll16 EOS        ! EX, EOS = 0xFF F0 00 00
+    shll8  EOS        ! EX, EOS = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL)
+    fldi0  ZERO       ! LS, fr0 = 0.0 (operand of the Z > 0 clip test)
+.endm
+
+.macro TransformEnd ! Shared epilogue: returns the address just past the last vertex kept in dst
+    add #32, DST      ! EX, DST += sizeof(VERTEX) (DST was left at the first word of the last kept vertex)
+    rts               ! CO, return after executing instruction in delay slot
+    mov DST, r0       ! MT, r0 = DST (return value, set in the rts delay slot)
+.endm
+
+
+! =========================================================
+! ========================= VERTEX LOADING ================
+! =========================================================
+.macro LoadColouredVertex ! Reads one 16 byte source vertex (x, y, z, colour) and transforms its position, U and V are left untouched
+! LOAD XYZ (SRC advances by 4 bytes per load)
+    fmov @SRC+, F_X   ! LS, X = src->x
+    fmov @SRC+, F_Y   ! LS, Y = src->y
+    fmov @SRC+, F_Z   ! LS, Z = src->z
+    fldi1 F_W         ! LS, W = 1.0
+! PREPARE NEXT VERTEX
+    add    #16, PFT   ! EX, pft += VERTEX_STRIDE (16 bytes = x, y, z, colour)
+    pref   @PFT       ! LS, PREFETCH pft (next vertex)
+    add    #64, DST   ! EX, dst += 2 * sizeof(VERTEX) (now the end of the next output slot, stores pre-decrement from here)
+! TRANSFORM VERTEX
+    ftrv xmtrx, XYZW  ! FE, TRANSFORM(XYZW), XYZW = xmtrx * XYZW
+! LOAD ATTRIBUTES
+    fmov @SRC+, F_C   ! LS, C = src->color
+.endm
+
+.macro LoadTexturedVertex ! Reads one 24 byte source vertex (x, y, z, colour, u, v) and transforms its position
+! LOAD XYZ (SRC advances by 4 bytes per load)
+    fmov @SRC+, F_X   ! LS, X = src->x
+    fmov @SRC+, F_Y   ! LS, Y = src->y
+    fmov @SRC+, F_Z   ! LS, Z = src->z
+    fldi1 F_W         ! LS, W = 1.0
+! PREPARE NEXT VERTEX
+    add    #24, PFT   ! EX, pft += VERTEX_STRIDE (24 bytes = x, y, z, colour, u, v)
+    pref   @PFT       ! LS, PREFETCH pft (next vertex)
+    add    #64, DST   ! EX, dst += 2 * sizeof(VERTEX) (now the end of the next output slot, stores pre-decrement from here)
+! TRANSFORM VERTEX
+    ftrv xmtrx, XYZW  ! FE, TRANSFORM(XYZW), XYZW = xmtrx * XYZW
+! LOAD ATTRIBUTES
+    fmov @SRC+, F_C   ! LS, C = src->color
+    fmov @SRC+, F_U   ! LS, U = src->u
+    fmov @SRC+, F_V   ! LS, V = src->v
+.endm
+
+! =========================================================
+! ========================= VERTEX OUTPUT =================
+! =========================================================
+! To take advantage of SH4 dual instruction processing, 
+!  clipflag calculation and vertex output are interleaved
+.macro ProcessVertex1 ! Writes vertex 1 of a quad to dst and starts CLIPFLAGS afresh: bit 0 = (Z > 0)
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z (stores run backwards, so z is the last word of the 32 byte vertex)
+    fmov.s  F_C,@-DST ! LS, dst->c = C
+    fmov.s  F_V,@-DST ! LS, dst->v = V
+    fcmp/gt ZERO, F_Z ! FE, T = Z > 0
+    fmov.s  F_U,@-DST ! LS, dst->u = U
+    movt    FLG       ! EX, CLIPFLAGS = T (replaces whatever the previous quad left in FLG)
+    fmov.s  F_W,@-DST ! LS, dst->w = W
+    fmov.s  F_Y,@-DST ! LS, dst->y = Y
+    fmov.s  F_X,@-DST ! LS, dst->x = X
+    mov.l   VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX (first word of the vertex)
+.endm
+
+.macro ProcessVertex2 ! Writes vertex 2 of a quad to dst and sets CLIPFLAGS bit 1 when Z > 0
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z (stores run backwards, so z is the last word of the 32 byte vertex)
+    fmov.s  F_C,@-DST ! LS, dst->c = C
+    fmov.s  F_V,@-DST ! LS, dst->v = V
+    fcmp/gt ZERO,F_Z  ! FE, T = Z > 0
+    fmov.s  F_U,@-DST ! LS, dst->u = U
+    movt    TMP       ! EX, tmp = T
+    fmov.s  F_W,@-DST ! LS, dst->w = W
+    add     TMP,TMP   ! EX, tmp = tmp + tmp (same as T << 1)
+    fmov.s  F_Y,@-DST ! LS, dst->y = Y
+    or      TMP,FLG   ! EX, CLIPFLAGS |= tmp (T << 1)
+    fmov.s  F_X,@-DST ! LS, dst->x = X
+    mov.l   VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX (first word of the vertex)
+.endm
+
+.macro ProcessVertex3 ! Writes vertex 3 of a quad to dst and sets CLIPFLAGS bit 2 when Z > 0
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z (stores run backwards, so z is the last word of the 32 byte vertex)
+    fmov.s  F_C,@-DST ! LS, dst->c = C
+    fmov.s  F_V,@-DST ! LS, dst->v = V
+    fcmp/gt ZERO, F_Z ! FE, T = Z > 0
+    fmov.s  F_U,@-DST ! LS, dst->u = U
+    movt    TMP       ! EX, tmp = T
+    fmov.s  F_W,@-DST ! LS, dst->w = W
+    fmov.s  F_Y,@-DST ! LS, dst->y = Y
+    shll2   TMP       ! EX, tmp = tmp << 2
+    fmov.s  F_X,@-DST ! LS, dst->x = X
+    or      TMP,FLG   ! EX, CLIPFLAGS |= tmp (T << 2)
+    mov.l   VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX (first word of the vertex)
+.endm
+
+.macro ProcessVertex4 ! Writes vertex 4 of a quad to dst, sets CLIPFLAGS bit 3 when Z > 0 and stores EOL | CLIPFLAGS as its flags word
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z (stores run backwards, so z is the last word of the 32 byte vertex)
+    or      EOS,FLG   ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
+    fmov.s  F_C,@-DST ! LS, dst->c = C
+    fmov.s  F_V,@-DST ! LS, dst->v = V
+    fcmp/gt ZERO, F_Z ! FE, T = Z > 0
+    fmov.s  F_U,@-DST ! LS, dst->u = U
+    movt    TMP       ! EX, tmp = T
+    fmov.s  F_W,@-DST ! LS, dst->w = W
+    shll2   TMP       ! EX, tmp = tmp << 2
+    fmov.s  F_Y,@-DST ! LS, dst->y = Y
+    add     TMP,TMP   ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
+    fmov.s  F_X,@-DST ! LS, dst->x = X
+    or      TMP,FLG   ! EX, CLIPFLAGS |= tmp (T << 3)
+    mov.l   FLG,@-DST ! LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS (low 4 bits hold the flags of all 4 vertices)
+.endm
+
+
+! =========================================================
+! ==================== TEXTURED VERTEX TRANSFORM ==========
+! =========================================================
+.global _DrawTexturedQuads
+.align 4
+! In: r4 = src (24 byte textured vertices), r5 = dst (32 byte output vertices), r6 = quads count. Out: r0 = address just past the last vertex kept in dst
+_DrawTexturedQuads:
+! Setup (NOTE(review): CNT is only tested after a quad has been processed, so count is presumably never 0 - confirm with callers)
+    TransformSetup
+
+.T_TRANSFORM_QUAD:
+    LoadTexturedVertex
+    ProcessVertex1
+
+    LoadTexturedVertex
+    ProcessVertex2
+
+    LoadTexturedVertex
+    ProcessVertex3
+
+    LoadTexturedVertex
+    ProcessVertex4
+
+! CLIPFLAGS TESTING
+    and     #15,FLG      ! keep only the 4 visibility bits (drops the PVR_CMD_VERTEX_EOL bits)
+    cmp/eq   #0,FLG      ! T = CLIPFLAGS == 0 (all points invisible)
+    bf/s    .T_LOOP_END  ! if !T goto LOOP_END
+    nop                  ! (branch delay slot)
+
+! No points visible case
+    add #-128, DST       ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration
+
+.T_LOOP_END:
+    dt CNT               ! count--; T = count == 0
+    bf .T_TRANSFORM_QUAD ! if !T then goto T_TRANSFORM_QUAD
+    nop                  ! plain bf has no delay slot, so this only runs once the loop has finished
+    
+    TransformEnd
+.size _DrawTexturedQuads, .-_DrawTexturedQuads
+.type _DrawTexturedQuads, %function
+
+! =========================================================
+! ==================== COLOURED VERTEX TRANSFORM ==========
+! =========================================================
+.global _DrawColouredQuads
+.align 4
+! In: r4 = src (16 byte coloured vertices), r5 = dst (32 byte output vertices), r6 = quads count. Out: r0 = address just past the last vertex kept in dst
+_DrawColouredQuads:
+! Setup (NOTE(review): CNT is only tested after a quad has been processed, so count is presumably never 0 - confirm with callers)
+    fldi0 F_U     ! U = 0 (set once, LoadColouredVertex never changes it)
+    fldi0 F_V     ! V = 0 (set once, LoadColouredVertex never changes it)
+    TransformSetup
+
+.C_TRANSFORM_QUAD:
+    LoadColouredVertex
+    ProcessVertex1
+
+    LoadColouredVertex
+    ProcessVertex2
+
+    LoadColouredVertex
+    ProcessVertex3
+
+    LoadColouredVertex
+    ProcessVertex4
+
+! CLIPFLAGS TESTING
+    and     #15,FLG      ! keep only the 4 visibility bits (drops the PVR_CMD_VERTEX_EOL bits)
+    cmp/eq   #0,FLG      ! T = CLIPFLAGS == 0 (all points invisible)
+    bf/s    .C_LOOP_END  ! if !T goto LOOP_END
+    nop                  ! (branch delay slot)
+
+! No points visible case
+    add #-128, DST       ! dst -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad, so that this invisible quad gets overwritten in next iteration
+
+.C_LOOP_END:
+    dt CNT               ! count--; T = count == 0
+    bf .C_TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
+    nop                  ! plain bf has no delay slot, so this only runs once the loop has finished
+    
+    TransformEnd
+.size _DrawColouredQuads, .-_DrawColouredQuads
+.type _DrawColouredQuads, %function
--- a/misc/dreamcast/boot_logo.png
+++ b/misc/dreamcast/boot_logo.png
--- a/misc/dreamcast/ip.txt
+++ b/misc/dreamcast/ip.txt
@@ -0,0 +1,3 @@
+Version       : V1.360
+SW Maker Name : ClassiCube team
+Game Title    : ClassiCube
--- a/misc/dreamcast/readme.txt
+++ b/misc/dreamcast/readme.txt
@@ -0,0 +1,9 @@
+The dreamcast build requires an initial bootstrap program (Initial Program) named IP.bin
+
+To generate a custom IP.bin, compile https://github.com/Dreamcast-Projects/makeip
+
+Then run: makeip ip.txt IP.BIN -l boot_logo.png
+
+---
+
+For more details about IP.bin, see https://mc.pp.se/dc/ip.bin.html
--- a/misc/dreamcast/sh4_notes.txt
+++ b/misc/dreamcast/sh4_notes.txt
@@ -0,0 +1,31 @@
+=========================================================
+======================== PROCESSOR INFO =================
+=========================================================
+The SH4 can dual issue (i.e. parallel execution) two instructions
+as long as the groups of the two instructions are different:
+* LS - most ALU and FPU register load/stores
+* EX - most ALU arithmetic instructions
+* MT - TST, CMP, NOP, MOV Rm,Rn (NOTE: Can execute in parallel with other MT)
+* FE - most FPU arithmetic instructions
+* CO - other instructions (NOTE: Can never execute in parallel)
+
+The following general aspects of instructions are important to note per the SH4 manual:
+* Issue rate: Interval between the issue of an instruction and that of the next instruction
+* Latency: Interval between the issue of an instruction and the generation of its result (completion)
+* Latency is also the interval between the execution of two instructions with an interdependent relationship.
+  (although different cases may either increase or decrease Latency)
+
+
+=========================================================
+======================== REGISTER USAGES ================
+=========================================================
+SH4 C ABI:
+-  R0  to  R3 are return values (can be overwritten)
+-  R4  to  R7 are input arguments (can be overwritten)
+-  R8  to R13 are non-volatile (must be restored at end)
+- R14  is the frame pointer (must be restored at end)
+- R15  is the stack pointer (must be restored at end)
+- FR0  to FR3 are return values (can be overwritten)
+- FR4  to FR11 are input arguments (can be overwritten)
+- FR12 to FR15 are non-volatile (must be restored at end)
+