diff --git a/avail_mem.py b/avail_mem.py index c0ceef5..bc66dc5 100644 --- a/avail_mem.py +++ b/avail_mem.py @@ -21,9 +21,9 @@ continue addr = int(toks[0], 16) sym = toks[1] - if sym == "endVariableDmemUse": + if sym == "startFreeDmem": dmemAvail = addr - elif sym == "rdpCmdBuffer1": + elif sym == "endFreeDmem": dmemAvail = addr - dmemAvail elif sym == "startFreeImem": imemAvail = addr diff --git a/docs/Documentation/Performance.md b/docs/Documentation/Performance.md index 22d7ff5..4932e87 100644 --- a/docs/Documentation/Performance.md +++ b/docs/Documentation/Performance.md @@ -7,24 +7,21 @@ visual effects are desired and increasing the RSP time a bit does not affect the overall performance. If your game is RSP bound, using the base version of F3DEX3 will make it slower. -Conversely, F3DEX3_LVP_NOC was created with the goal of matching the RSP -performance of F3DEX2 on all critical paths in the microcode: command dispatch, -vertex processing, and triangle processing. Then, the RDP and memory traffic -performance improvements of F3DEX3--56 vertex buffer, auto-batched rendering, -etc.--should improve performance from there. This means that F3DEX3_LVP_NOC can -improve performance regardless of whether your game is RSP bound or RDP bound. - -Note that F3DEX3_LVP_NOC is still slightly slower than F3DEX2 for various other -tasks--for example, the one-time setup when loading vertices, outside the loop -over vertices, is a little slower. +Conversely, F3DEX3_LVP_NOC matches or beats the RSP performance of F3DEX2 on all +critical paths in the microcode, including command dispatch, vertex processing, +and triangle processing. Then, the RDP and memory traffic performance +improvements of F3DEX3--56 vertex buffer, auto-batched rendering, etc.--should +further improve performance from there. This means that switching from F3DEX2 to +F3DEX3_LVP_NOC should always improve performance regardless of whether your game +is RSP bound or RDP bound. 
# Performance Results -These are cycle counts for all the critical paths in the microcode. Lower is +These are cycle counts for many key paths in the microcode. Lower numbers are better. The timings are hand-counted taking into account all pipeline stalls and -all dual-issue conditions. Instruction alignment is sometimes taken into -account, otherwise assumed to be optimal. +all dual-issue conditions. Instruction alignment after branches is sometimes +taken into account, otherwise assumed to be optimal. Vertex / lighting numbers assume no special features (texgen, packed normals, etc.) Tri numbers assume texture, shade, and Z, and not flushing the buffer. @@ -33,6 +30,9 @@ measured yet". | | F3DEX2 | F3DEX3_LVP_NOC | F3DEX3_LVP | F3DEX3_NOC | F3DEX3 | |----------------------------|--------|----------------|------------|------------|--------| +| Command dispatch | 12 | 12 | 12 | 12 | 12 | +| Small RDP command | 14 | 5 | 5 | 5 | 5 | +| Vtx before DMA start | 16 | 17 | 17 | 17 | 17 | | Vtx pair, no lighting | 54 | 54 | 81 | 79 | 98 | | Vtx pair, 0 dir lts | Can't | 64 | | | | | Vtx pair, 1 dir lt | 73 | 70 | 96 | 182 | 201 | @@ -44,20 +44,28 @@ measured yet". 
| Vtx pair, 7 dir lts | 118 | 112 | 138 | 356 | 375 | | Vtx pair, 8 dir lts | Can't | 119 | 145 | 385 | 404 | | Vtx pair, 9 dir lts | Can't | 126 | 152 | 414 | 433 | -| Command dispatch | 12 | 12 | 12 | 12 | 12 | -| Small RDP command | 14 | 5 | 5 | 5 | 5 | -| Only/2nd tri to offscreen | 27 | 29 | 29 | 29 | 29 | -| 1st tri to offscreen | 28 | 29 | 29 | 29 | 29 | +| Light dir xfrm, 0 dir lts | Can't | 95 | 95 | None | None | +| Light dir xfrm, 1 dir lt | 141 | 95 | 95 | None | None | +| Light dir xfrm, 2 dir lts | 180 | 96 | 96 | None | None | +| Light dir xfrm, 3 dir lts | 219 | 121 | 121 | None | None | +| Light dir xfrm, 4 dir lts | 258 | 122 | 122 | None | None | +| Light dir xfrm, 5 dir lts | 297 | 147 | 147 | None | None | +| Light dir xfrm, 6 dir lts | 336 | 148 | 148 | None | None | +| Light dir xfrm, 7 dir lts | 375 | 173 | 173 | None | None | +| Light dir xfrm, 8 dir lts | Can't | 174 | 174 | None | None | +| Light dir xfrm, 9 dir lts | Can't | 199 | 199 | None | None | +| Only/2nd tri to offscreen | 27 | 26 | 26 | 26 | 26 | +| 1st tri to offscreen | 28 | 27 | 27 | 27 | 27 | | Only/2nd tri to clip | 32 | 31 | 31 | 31 | 31 | -| 1st tri to clip | 33 | 31 | 31 | 31 | 31 | -| Only/2nd tri to backface | 38 | 40 | 40 | 40 | 40 | -| 1st tri to backface | 39 | 40 | 40 | 40 | 40 | -| Only/2nd tri to degenerate | 42 | 42 | 42 | 42 | 42 | -| 1st tri to degenerate | 43 | 42 | 42 | 42 | 42 | +| 1st tri to clip | 33 | 32 | 32 | 32 | 32 | +| Only/2nd tri to backface | 38 | 38 | 38 | 38 | 38 | +| 1st tri to backface | 39 | 39 | 39 | 39 | 39 | +| Only/2nd tri to degenerate | 42 | 40 | 40 | 40 | 40 | +| 1st tri to degenerate | 43 | 41 | 41 | 41 | 41 | | Only/2nd tri to occluded | Can't | Can't | 49 | Can't | 49 | -| 1st tri to occluded | Can't | Can't | 49 | Can't | 49 | -| Only/2nd tri to draw | 172 | 166 | 167 | 166 | 167 | -| 1st tri to draw | 173 | 166 | 167 | 166 | 167 | +| 1st tri to occluded | Can't | Can't | 50 | Can't | 50 | +| Only/2nd tri to draw | 172 | 165 | 168 
| 165 | 168 | +| 1st tri to draw | 173 | 165 | 168 | 165 | 168 | Tri numbers are measured from the first cycle of the command handler inclusive, @@ -74,12 +82,12 @@ configuration. | Microcode | Scene 1 | Scene 2 | Scene 3 | |----------------|---------|---------|---------| -| F3DEX3 | 7.64ms | 3.13ms | 2.37ms | -| F3DEX3_NOC | 7.07ms | 2.89ms | 2.14ms | -| F3DEX3_LVP | 4.57ms | 1.77ms | 1.67ms | -| F3DEX3_LVP_NOC | Outdated | | | -| F3DEX2 | No* | No* | No* | -| Vertex count | 3664 | 1608 | 1608 | +| F3DEX3 | 7.41ms | 2.99ms | 2.22ms | +| F3DEX3_NOC | 6.85ms | 2.75ms | 1.98ms | +| F3DEX3_LVP | 4.12ms | 1.59ms | 1.48ms | +| F3DEX3_LVP_NOC | 3.34ms | 1.27ms | 1.16ms | +| F3DEX2 | Can't* | Can't* | Can't* | +| Vertex count | 3557 | 1548 | 1548 | *F3DEX2 does not contain performance counters, so the portion of the RSP time taken for vertex processing cannot be measured. diff --git a/f3dex3.s b/f3dex3.s index a9a80fc..c171df7 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -453,6 +453,9 @@ normalsMode: lastMatDLPhyAddr: .dw 0 +activeClipPlanes: + .dh CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri write, set to zero when clipping + // Constants for clipping algorithm clipCondShifts: .db CLIP_SCAL_NY_SHIFT @@ -460,17 +463,14 @@ clipCondShifts: .db CLIP_SCAL_NX_SHIFT .db CLIP_SCAL_PX_SHIFT -// "Forward declaration" of temporary matrix in clipTempVerts scratch space, aligned to 16 bytes -tempMemRounded equ ((clipTempVerts + 15) & ~15) - // Movemem table movememTable: - .dh tempMemRounded // G_MTX multiply temp matrix (model) - .dh mMatrix // G_MV_MMTX - .dh tempMemRounded // G_MTX multiply temp matrix (projection) - .dh vpMatrix // G_MV_PMTX - .dh viewport // G_MV_VIEWPORT - .dh cameraWorldPos // G_MV_LIGHT + .dh tempMatrix // G_MTX multiply temp matrix (model) + .dh mMatrix // G_MV_MMTX + .dh tempMatrix // G_MTX multiply temp matrix (projection) + .dh vpMatrix // G_MV_PMTX + .dh viewport // G_MV_VIEWPORT + .dh cameraWorldPos // G_MV_LIGHT // moveword table movewordTable: @@ -558,7 
+558,6 @@ miniTableEntry G_TRIFAN_handler miniTableEntry G_LIGHTTORDP_handler miniTableEntry G_RELSEGMENT_handler - .align 2 // for everything following // The maximum number of generated vertices in a clip polygon. In reality, this // is equal to MAX_CLIP_POLY_VERTS, but for testing we can change them separately. @@ -593,54 +592,58 @@ MAX_CLIP_GEN_VERTS equ 7 // tris) if this occurs. Because this is caused by extreme/degenerate cases like // the camera exactly on a tri, not drawing anything is an okay result. MAX_CLIP_POLY_VERTS equ 7 -clipPoly: - .skip (MAX_CLIP_POLY_VERTS+1) * 2 // 3 5 7 + term 0 -clipPoly2: // \ / \ / \ - .skip (MAX_CLIP_POLY_VERTS+1) * 2 // 4 6 7 + term 0 - -// Vertex buffer in RSP internal format -vertexBuffer: - .skip (G_MAX_VERTS * vtxSize) - -.if . > yieldDataFooter - // OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved. The last data in that is - // the footer, which contains four perf counters, taskDataPtr, and ucode. - // So, any data starting from the address of this footer will be clobbered, - // so the vertex buffer and other data which needs to be save across yield - // can't extend here. (The input buffer will be reloaded from the next - // command in the source DL.) - .error "Important things in DMEM will not be saved at yield!" -.endif +CLIP_POLY_SIZE_BYTES equ (MAX_CLIP_POLY_VERTS+1) * 2 +CLIP_TEMP_VERTS_SIZE_BYTES equ (MAX_CLIP_GEN_VERTS * vtxSize) -// Space for temporary verts for clipping code -// tempMemRounded defined above = this rounded up to 16 bytes, for temp mtx etc. -clipTempVerts: - .skip MAX_CLIP_GEN_VERTS * vtxSize -clipTempVertsEnd: - -.if (. - tempMemRounded) < 0x40 - .error "Not enough space for temp matrix!" 
-.endif - -memsetBufferStart equ ((vertexBuffer + 0xF) & 0xFF0) -memsetBufferEnd equ (clipTempVertsEnd & 0xFF0) -memsetBufferSize equ (memsetBufferEnd - memsetBufferStart) +VERTEX_BUFFER_SIZE_BYTES equ (G_MAX_VERTS * vtxSize) RDP_CMD_BUFSIZE equ 0xB0 RDP_CMD_BUFSIZE_EXCESS equ 0xB0 // Maximum size of an RDP triangle command RDP_CMD_BUFSIZE_TOTAL equ (RDP_CMD_BUFSIZE + RDP_CMD_BUFSIZE_EXCESS) + INPUT_BUFFER_CMDS equ 21 -INPUT_BUFFER_LEN equ (INPUT_BUFFER_CMDS * 8) -END_VARIABLE_LEN_DMEM equ (0xFC0 - INPUT_BUFFER_LEN - (2 * RDP_CMD_BUFSIZE_TOTAL)) +INPUT_BUFFER_SIZE_BYTES equ (INPUT_BUFFER_CMDS * 8) + +END_VARIABLE_LEN_DMEM equ (0xFC0 - INPUT_BUFFER_SIZE_BYTES - (2 * RDP_CMD_BUFSIZE_TOTAL) - (2 * CLIP_POLY_SIZE_BYTES) - CLIP_TEMP_VERTS_SIZE_BYTES - VERTEX_BUFFER_SIZE_BYTES) -endVariableDmemUse: +startFreeDmem: +.org END_VARIABLE_LEN_DMEM +endFreeDmem: + +// Main vertex buffer in RSP internal format +vertexBuffer: + .skip VERTEX_BUFFER_SIZE_BYTES + +// Space for temporary verts for clipping code, and reused for other things +clipTempVerts: -.if . > END_VARIABLE_LEN_DMEM - .error "Out of DMEM space" +// Round up to 0x10 +.org ((clipTempVerts + 0xF) & 0xFF0) +// Vertex addresses, to avoid a multiply-add for each vertex index lookup +vertexTable: + .skip ((G_MAX_VERTS + 8) * 2) // halfword for each vertex; need 1 extra end addr, easier to write 8 extra + +.if . > yieldDataFooter + // Need to fit everything through vertex buffer in yield buffer, would like + // to also fit vertexTable to avoid recompute after yield + .error "Too much being stored in yieldable DMEM" .endif -.org END_VARIABLE_LEN_DMEM +tempMatrix: + .skip 0x40 + +.if . 
> (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES) + .error "Too much in clipTempVerts" +.endif +.org (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES) +clipTempVertsEnd: +clipPoly: + .skip CLIP_POLY_SIZE_BYTES // 3 5 7 + term 0 +clipPoly2: // \ / \ / \ + .skip CLIP_POLY_SIZE_BYTES // 4 6 7 + term 0 + + // First RDP Command Buffer rdpCmdBuffer1: .skip RDP_CMD_BUFSIZE @@ -665,7 +668,7 @@ rdpCmdBuffer2EndPlus1Word: // Input buffer. After RDP cmd buffers so it can be vector addressed from end. inputBuffer: - .skip INPUT_BUFFER_LEN + .skip INPUT_BUFFER_SIZE_BYTES inputBufferEnd: inputBufferEndSgn equ -(0x1000 - inputBufferEnd) // Underflow DMEM address @@ -687,6 +690,13 @@ startCounterTime equ (OSTask + OSTask_ucode_size) // These two words are used by boot, but not by F3DEX3 or S2DEX. xfrmLookatDirs equ -(0x1000 - (OSTask + OSTask_ucode_data)) // and OSTask_ucode_data_size + +memsetBufferStart equ ((vertexBuffer + 0xF) & 0xFF0) +memsetBufferMaxEnd equ (rdpCmdBuffer1 & 0xFF0) +memsetBufferMaxSize equ (memsetBufferMaxEnd - memsetBufferStart) +memsetBufferSize equ (memsetBufferMaxSize > 0x800 ? 0x800 : memsetBufferMaxSize) + + .close // DATA_FILE //////////////////////////////////////////////////////////////////////////////// @@ -929,9 +939,10 @@ vLookat1 equ vAAA tempViewportScale equ 0x00 tempViewportOffset equ 0x10 tempOccPlusMinus equ 0x20 -tempXfrmSingle equ 0x30 -tempVpRGBA equ 0x40 -tempVpPkNorm equ 0x50 +tempVpRGBA equ 0x30 +tempVpPkNorm equ 0x40 +tempXfrmSingle equ 0x50 +tempPrevVtxGarbage equ 0x50 // Up to 2 * 0x26 = 0x4C used -> to 0x9C //////////////////////////////////////////////////////////////////////////////// @@ -943,7 +954,7 @@ tempVpPkNorm equ 0x50 .macro instantiate_mtx_end_begin // Multiplies the temp loaded matrix into the M or VP matrix lhu $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP. 
- li $3, tempMemRounded // Input 1 = temp mem (loaded mtx) + li $3, tempMatrix // Input 1 = temp mem (loaded mtx) jal while_wait_dma_busy move $2, $6 // Input 0 = output // Followed immediately by instantiate_mtx_multiply. These need to be broken @@ -985,8 +996,7 @@ tempVpPkNorm equ 0x50 .endmacro .macro instantiate_branch_wz - jal vtx_addrs_from_cmd // byte 3 = vtx being tested; addr -> $10 - nop + lhu $10, (vertexTable)(cmd_w0) // Vertex addr from byte 3 .if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2 lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex) .else @@ -1018,7 +1028,7 @@ tempVpPkNorm equ 0x50 li $3, memsetBufferStart + 0x10 // Last qword set is memsetBufferStart jal @@clamp_to_memset_buffer vmudh $v2, vOne, $v2[1] // Move element 1 (lower bytes) to all - addi $2, $2, memsetBufferStart // First qword set is one below memsetBufferEnd + addi $2, $2, memsetBufferStart // First qword set is one below end @@pre_loop: sqv $v2, (-0x10)($2) bne $2, $3, @@pre_loop @@ -1034,11 +1044,11 @@ tempVpPkNorm equ 0x50 j wait_for_dma_and_run_next_command // Delay slot harmless @@clamp_to_memset_buffer: - addi $11, cmd_w0, -memsetBufferSize // Is more than a whole buffer left? 
- bltz $11, return_routine - move $2, cmd_w0 // No, use partial buffer + addi $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize) + sra $10, $11, 31 + and $11, $11, $10 jr $ra - li $2, memsetBufferSize + addi $2, $11, memsetBufferSize .endmacro @@ -1115,7 +1125,8 @@ continue_from_os_task: lw perfCounterB, mITMatrix + YDF_OFFSET_PERFCOUNTERB lw perfCounterC, mITMatrix + YDF_OFFSET_PERFCOUNTERC lw perfCounterD, mITMatrix + YDF_OFFSET_PERFCOUNTERD - lw taskDataPtr, OSTask + OSTask_data_ptr + jal fill_vertex_table + lw taskDataPtr, OSTask + OSTask_data_ptr finish_setup: .if CFG_PROFILING_C mfc0 $11, DPC_CLOCK @@ -1137,8 +1148,8 @@ ovl01_end: displaylist_dma_with_count: andi inputBufferPos, cmd_w0, 0x00F8 // Byte 3, how many cmds to drop from load (max 0xA0) displaylist_dma: - // Load INPUT_BUFFER_LEN - inputBufferPos cmds (inputBufferPos >= 0, mult of 8) - addi inputBufferPos, inputBufferPos, -INPUT_BUFFER_LEN // inputBufferPos = - num cmds + // Load INPUT_BUFFER_SIZE_BYTES - inputBufferPos cmds (inputBufferPos >= 0, mult of 8) + addi inputBufferPos, inputBufferPos, -INPUT_BUFFER_SIZE_BYTES // inputBufferPos = - num cmds .if CFG_PROFILING_A sll $11, inputBufferPos, 16 - 3 // Divide by 8 for num cmds to load, then move to upper 16 sub perfCounterB, perfCounterB, $11 // Negative so subtract @@ -1179,7 +1190,7 @@ check_rdp_buffer_full_and_run_next_cmd: vertex_end: .endif .if !CFG_PROFILING_A -tri_end: +tris_end: .endif .if ENABLE_PROFILING G_LIGHTTORDP_handler: @@ -1232,56 +1243,6 @@ call_ret_common: j displaylist_dma_with_count sb $1, displayListStackLength -G_LOAD_UCODE_handler: - j load_overlay_0_and_enter // Delay slot is harmless -G_MODIFYVTX_handler: - // Command byte 3 = vtx being modified; its addr -> $10 - li $ra, do_moveword // Moveword adds cmd_w0 to $10 for final addr - lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx -vtx_addrs_from_cmd: - // Treat eight bytes of last command each as vertex indices << 1 - // inputBufferEnd 
is close enough to the end of DMEM to fit in signed offset - lpv $v27[0], (inputBufferEndSgn - 8)(inputBufferPos) - // Also out elem 3 -> $10, elem 7 -> $3 because these are used more than once - vmudn $v29, vOne, $v30[0] // Address of vertex buffer - vmadl $v27, $v27, $v30[1] // Plus vtx indices times length - sb $zero, materialCullMode // This covers modify vtx, branchZ, cull - jr $ra - mfc2 $10, $v27[6] - -G_TRIFAN_handler: - li $1, 0x8000 // $ra negative = flag for G_TRIFAN -G_TRISTRIP_handler: - addi $ra, $1, tri_strip_fan_loop // otherwise $1 == 0 - addi cmd_w0, inputBufferPos, inputBufferEnd - 12 // Start pointing so elems 5-7 are tris 1-3 -tri_strip_fan_loop: - lb $3, (7)(cmd_w0) // Load signed index of last of 3 tris - bgez $ra, @@skip_copy_1 // Skip if G_TRISTRIP - lbu $1, (inputBufferEnd - 7)(inputBufferPos) // Load tri 1 index - sb $1, (5)(cmd_w0) // Store as first tri of the three current tris -@@skip_copy_1: - bltz $3, tri_end // If third tri index is negative, exit - addi $11, inputBufferPos, inputBufferEnd - 7 // Off end of command - beq $11, cmd_w0, tri_end // If off end of command, exit - lpv $v27[0], (0)(cmd_w0) // Load the three tris to elems 5-7 - bltz $ra, tri_main // Draw if G_TRIFAN - addi cmd_w0, cmd_w0, 1 // Increment - andi $11, cmd_w0, 1 // If odd after increment, this is the 1st/3rd/5th tri - bnez $11, tri_main // in that case draw directly - sll $3, $3, 8 // Move tri 3 index into bits 15:8 - vmov $v27[7], $v27[6] // Move tri 2 to tri 3 - j tri_main - mtc2 $3, $v27[12] // Move tri 3 to tri 2 - -G_TRI2_handler: -G_QUAD_handler: - jal tri_main // Send second tri; return here for first tri - lpv $v27[0], (inputBufferEndSgn - 8)(inputBufferPos) // Second tri idxs elems 5, 6, 7 -G_TRI1_handler: - lpv $v27[4], (inputBufferEndSgn - 8)(inputBufferPos) // First tri idxs elems 5, 6, 7 - j tri_main - li $ra, tri_end // After done with this tri, exit tri processing - .if !ENABLE_PROFILING G_LIGHTTORDP_handler: lbu $11, numLightsxSize // Ambient 
light @@ -1321,740 +1282,489 @@ G_MEMSET_handler: .endif +G_LOAD_UCODE_handler: + j load_overlay_0_and_enter // Delay slot is harmless +G_MODIFYVTX_handler: + lhu $10, (vertexTable)(cmd_w0) // Byte 3 = vtx being modified + j do_moveword // Moveword adds cmd_w0 to $10 for final addr + lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx, bit 15 clear + G_VTX_handler: - srl $2, cmd_w0, 11 // n << 1 - sub $2, cmd_w0, $2 // = v0 << 1 - vmudn $v29, vOne, $v30[0] // Address of vertex buffer - sb $2, (inputBufferEnd - 8)(inputBufferPos) // Store v0 << 1 as byte 0 - lpv $v27[0], (inputBufferEndSgn - 8)(inputBufferPos) // (v0 + n) << 1 is byte 3 - sb $zero, materialCullMode // This covers vtx - lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10 + lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive jal segmented_to_physical // Convert address in cmd_w1_dram to physical - vmadl $v27, $v27, $v30[1] // Plus vtx indices times length - mfc2 $10, $v27[6] + lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10 + sub dmemAddr, dmemAddr, $1 // Start addr = end addr - size. Rounded down to DMA word by H/W addi dmaLen, $1, -1 // DMA length is always offset by -1 - lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode - andi $10, $10, 0xFFF8 // Round down end addr to DMA word; one input vtx still fits in one internal vtx - jal dma_read_write - sub dmemAddr, $10, $1 // Start addr = end addr - size - mfc2 outputVtxPos, $v27[0] // Address of start -.if COUNTER_A_UPPER_VERTEX_COUNT - sll $11, $1, 12 // Vtx count * 0x10000 - add perfCounterA, perfCounterA, $11 // Add to vertex count -.endif - li $ra, 0 // Flag to not return to clipping -vtx_setup_constants: - // Computes modified viewport scale and offset including fog info, and stores - // these to temp memory in the RDP buffer. 
This is only used during vertex write - // and the first half of clipping, so that memory is not used then. -.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE - veq $v29, $v31, $v31[3h] // VCC = 00010001 -.elseif !CFG_NO_OCCLUSION_PLANE - vge $v29, $v31, $v31[2h] // VCC = 00110011 + j dma_read_write + li $ra, 0x8000 | vtx_after_dma // Negative = flag to not return to clipping in vtx_setup_constants + +G_TRIFAN_handler: + li $1, 0x8000 // $ra negative = flag for G_TRIFAN +G_TRISTRIP_handler: + addi $ra, $1, tri_strip_fan_loop // otherwise $1 == 0 + addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte +tri_strip_fan_loop: + lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word + addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command + beq $11, cmd_w0, tris_end // If off end of command, exit + sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit + bltz $10, tris_end // If negative, exit + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices + bltz $ra, tri_fan_store // Finish handling G_TRIFAN + addi cmd_w0, cmd_w0, 1 // Increment + andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri + bnez $11, tri_main // Draw as is + srl $10, cmd_w1_dram, 8 // Move vtx 2 to LSBs + sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2 + j tri_main + sb $10, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3 + +tV1AtF equ $v5 +tV2AtF equ $v7 +tV3AtF equ $v9 +tV1AtI equ $v18 +tV2AtI equ $v19 +tV3AtI equ $v21 + +G_TRI2_handler: +G_QUAD_handler: + jal tri_main // Send second tri; return here for first tri + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store second tri indices +G_TRI1_handler: + li $ra, tris_end // After done with this tri, exit tri processing + sw cmd_w0, 4(rdpCmdBufPtr) // Store first tri indices +tri_main: + lpv $v27[0], 0(rdpCmdBufPtr) // To vector unit + lbu $1, 5(rdpCmdBufPtr) + lbu $2, 6(rdpCmdBufPtr) + lbu $3, 7(rdpCmdBufPtr) + vclr vZero + lhu $1, (vertexTable)($1) + vmudn $v29, vOne, 
$v30[0] // Address of vertex buffer + lhu $2, (vertexTable)($2) + vmadl $v27, $v27, $v30[1] // Plus vtx indices times length + lhu $3, (vertexTable)($3) + vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6 +.if !ENABLE_PROFILING + addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested + move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading .endif - ldv sVPO[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7 -.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE -// sFGM is $v12 // FoG Mask - vmrg sFGM, vOne, $v31[2] // sFGM is 0,0,0,1,0,0,0,1 -.elseif !CFG_NO_OCCLUSION_PLANE - vmrg sOPMs, vOne, $v31[1] // Signs of sOPMs are --++--++ +tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping + vnxor tV1AtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding + llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y) + vnxor tV2AtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding + llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4 + vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr + llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8 + vnxor tV3AtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding + lhu $5, VTX_CLIP($1) + vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr + lhu $7, VTX_CLIP($2) + // vnop + lhu $8, VTX_CLIP($3) + vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1 + andi $11, $5, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane + vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr) + and $11, $11, $7 + vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr) + and $11, $11, $8 + vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr) + vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y + bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull + // 22 cycles + 
vmrg $v14, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2) + vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... + lhu $24, activeClipPlanes + vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing + lw $6, geometryModeLabel // Load full geometry mode word + vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y + or $10, $5, $7 + vmrg $v10, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2) + or $10, $10, $8 // $10 = all clip bits which are true for any verts + vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y + and $10, $10, $24 // If clipping is enabled, check clip flags + vmrg $v4, $v14, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3) + mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended + vmrg $v14, $v8, $v14 // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2) + bnez $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip + // 30 cycles + sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull + vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) + srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing) + vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor + sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing + vmrg $v2, $v4, $v10 // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2) + bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing) + // 34 cycles + vmrg $v10, $v10, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? 
highest(vert1, vert2, vert3) + vmudn $v4, $v14, $v31[5] // 0x4000 + beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull. + // 36 cycles + mfc2 $1, $v14[12] // $v14 = lowest Y value = highest on screen (x, y, addr) + vsub $v6, $v2, $v14 + mfc2 $2, $v2[12] // $v2 = mid vertex (x, y, addr) + vsub $v8, $v10, $v14 +.if !ENABLE_PROFILING + sll $11, $6, 10 // Moves the value of G_SHADING_SMOOTH into the sign bit .endif - ldv sVPO[8], (viewport + 8)($zero) - lw $10, (geometryModeLabel)($zero) - ldv sVPS[0], (viewport)($zero) // Load vscale duplicated in 0-3 and 4-7 - ldv sVPS[8], (viewport)($zero) + vsub $v11, $v14, $v2 + andi $6, $6, (G_SHADE | G_ZBUFFER) + vsub $v12, $v14, $v10 // VH - VL (negative) + mfc2 $3, $v10[12] // $v10 = highest Y value = lowest on screen (x, y, addr) + vsub $v15, $v10, $v2 .if !CFG_NO_OCCLUSION_PLANE - vmudh sOPMs, sOPMs, $v31[5] // sOPMs is 0xC000, 0xC000, 0x4000, 0x4000, repeat -.endif - llv $v23[0], (fogFactor)($zero) // Load fog multiplier 0 and offset 1 - vne $v29, $v31, $v31[3h] // VCC = 11101110 - lqv $v30, (fxParams - altBase)(altBaseReg) // Parameters for vtx and lighting - vmudh $v20, sVPS, $v31[1] // -1; -vscale -.if CFG_LEGACY_VTX_PIPE - lbu $7, mITValid -.else - andi $11, $10, G_AMBOCCLUSION -.endif - vmrg sVPS, sVPS, $v23[0] // Put fog multiplier in elements 3,7 of vscale -.if !CFG_NO_OCCLUSION_PLANE && !CFG_LEGACY_VTX_PIPE - sqv sOPMs, (tempOccPlusMinus)(rdpCmdBufEndP1) // Store occlusion plane -/+4000 constants + and $5, $5, $7 + and $5, $5, $8 + andi $5, $5, CLIP_OCCLUDED .endif -.if CFG_LEGACY_VTX_PIPE - llv sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1 + vmudh $v29, $v6, $v8[0] +.if !CFG_NO_OCCLUSION_PLANE + bnez $5, tri_culled_by_occlusion_plane // Cull if all verts occluded .endif - vmrg sVPO, sVPO, $v23[1] // Put fog offset in elements 3,7 of vtrans -.if CFG_LEGACY_VTX_PIPE - llv sSTS[8], (textureSettings2)($zero) // Texture ST scale in 4, 5 -.else - vge $v29, $v31, 
$v31[3] // VCC = 00011111 + llv $v13[0], VTX_INV_W_VEC($1) + vmadh $v29, $v8, $v11[0] + lpv tV1AtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1 + vreadacc $v17, ACC_UPPER + lpv tV2AtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2 + vreadacc $v16, ACC_MIDDLE + lpv tV3AtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3 + vrcp $v20[0], $v15[1] +.if !ENABLE_PROFILING + lpv $v25[0], VTX_COLOR_VEC($4) // Load RGB from vertex 4 (flat shading vtx) .endif - vmov sVPS[1], $v20[1] // Negate vscale[1] because RDP top = y=0 -.if CFG_LEGACY_VTX_PIPE - bnez $ra, clip_after_constants // Return to clipping if from there + vmov $v15[2], $v6[0] + llv $v13[8], VTX_INV_W_VEC($2) + vrcph $v22[0], $v17[1] + llv $v13[12], VTX_INV_W_VEC($3) + vrcpl $v23[1], $v16[1] +.if !ENABLE_PROFILING + bltz $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set .endif - vmov sVPS[5], $v20[1] // Same for second half -vtx_matrix_load: -.if CFG_LEGACY_VTX_PIPE - bnez $7, skip_vtx_mvp - li $2, vpMatrix - li $3, mMatrix - j mtx_multiply - li $6, mITMatrix -vtx_after_mtx_multiply: - sqv $v5[0], (fourthQWMVP + 0)($zero) - sb $10, mITValid // $10 is nonzero from mtx_multiply, in fact 0x18 -skip_vtx_mvp: - andi $11, $5, G_LIGHTING >> 8 - bnez $11, ovl234_lighting_entrypoint // Lighting setup, incl. 
transform - move inputVtxPos, dmemAddr // Must be before overlay load -vtx_after_lt_setup: - lqv vM0I, (mITMatrix + 0x00)($zero) // Load MVP matrix - lqv vM2I, (mITMatrix + 0x10)($zero) - lqv vM0F, (mITMatrix + 0x20)($zero) - lqv vM2F, (fourthQWMVP + 0)($zero) -.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC - addi outputVtxPos, outputVtxPos, -vtxSize // Will inc by 2, but need point to 2nd -.else - addi outputVtxPos, outputVtxPos, -2*vtxSize // Going to increment this by 2 verts in loop + vrcph $v24[1], $v31[2] // 0 +.if !ENABLE_PROFILING + vlt $v29, $v31, $v31[3] // Set vcc to 11100000 + vmrg tV1AtI, $v25, tV1AtI // RGB from $4, alpha from $1 + vmrg tV2AtI, $v25, tV2AtI // RGB from $4, alpha from $2 + vmrg tV3AtI, $v25, tV3AtI // RGB from $4, alpha from $3 +tri_skip_flat_shading: .endif - vcopy vM1I, vM0I - vcopy vM3I, vM2I - ldv vM1I[0], (mITMatrix + 0x08)($zero) - vcopy vM1F, vM0F - ldv vM3I[0], (mITMatrix + 0x18)($zero) - vcopy vM3F, vM2F - ldv vM1F[0], (mITMatrix + 0x28)($zero) - ldv vM3F[0], (fourthQWMVP + 8)($zero) - ldv vM0I[8], (mITMatrix + 0x00)($zero) - ldv vM2I[8], (mITMatrix + 0x10)($zero) - ldv vM0F[8], (mITMatrix + 0x20)($zero) - ldv vM2F[8], (fourthQWMVP + 0)($zero) -.else - bnez $11, @@skipzeroao // Continue if AO disabled - sqv sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Store viewport offset - vmrg $v30, $v30, $v31[2] // 0; zero AO values -@@skipzeroao: - bnez $ra, clip_after_constants // Return to clipping if from there - sqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Store viewport scale - lqv vM0I, (mMatrix + 0x00)($zero) // Load M matrix - lqv vM2I, (mMatrix + 0x10)($zero) - lqv vM0F, (mMatrix + 0x20)($zero) - lqv vM2F, (mMatrix + 0x30)($zero) - lbu $11, mITValid // 0 if matrix invalid, 1 if valid - vcopy vM1I, vM0I - lbu $10, normalsMode // bit 0 clear if don't compute mIT, set if do - vcopy vM3I, vM2I - ldv vM1I[0], (mMatrix + 0x08)($zero) - vcopy vM1F, vM0F - ldv vM3I[0], (mMatrix + 0x18)($zero) - vcopy vM3F, vM2F - ldv vM1F[0], (mMatrix + 
0x28)($zero) - sltiu $11, $11, 1 // 0 if matrix valid, 1 if invalid - srl $7, $5, 9 // G_LIGHTING in bit 1 - and $7, $7, $11 // If lighting enabled and need to update matrix, - and $7, $7, $10 // and computing mIT, - move inputVtxPos, dmemAddr // this must be before overlay load, can be clobbered - ldv vM3F[0], (mMatrix + 0x38)($zero) - ldv vM0I[8], (mMatrix + 0x00)($zero) - ldv vM2I[8], (mMatrix + 0x10)($zero) - ldv vM0F[8], (mMatrix + 0x20)($zero) - bnez $7, ovl234_ovl4_entrypoint // run overlay 4 to compute M inverse transpose - ldv vM2F[8], (mMatrix + 0x30)($zero) -vtx_after_calc_mit: - lqv vVP0I, (vpMatrix + 0x00)($zero) - lqv vVP2I, (vpMatrix + 0x10)($zero) - lqv vVP0F, (vpMatrix + 0x20)($zero) - lqv vVP2F, (vpMatrix + 0x30)($zero) - addi outputVtxPos, outputVtxPos, -2*vtxSize // Going to increment this by 2 verts in loop - vcopy vVP1I, vVP0I - vcopy vVP3I, vVP2I - ldv vVP1I[0], (vpMatrix + 0x08)($zero) - vcopy vVP1F, vVP0F - ldv vVP3I[0], (vpMatrix + 0x18)($zero) - vcopy vVP3F, vVP2F - ldv vVP1F[0], (vpMatrix + 0x28)($zero) - ldv vVP3F[0], (vpMatrix + 0x38)($zero) - ldv vVP0I[8], (vpMatrix + 0x00)($zero) - ldv vVP2I[8], (vpMatrix + 0x10)($zero) - ldv vVP0F[8], (vpMatrix + 0x20)($zero) - ldv vVP2F[8], (vpMatrix + 0x30)($zero) -.endif -vtx_after_matrix_load: -.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC - andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled - srl $7, $7, 5 // 8 if G_FOG is set, 0 otherwise - li $19, clipTempVerts + vtxSize // Temp mem; fog writes up to vtxSize before - jal while_wait_dma_busy // Wait for vertex load to finish - move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos - andi $11, $5, G_LIGHTING >> 8 - beqz $11, @@skip_lighting - li $ra, vtx_loop_no_lighting - li $ra, lt_vtx_pair -@@skip_lighting: - ldv vPairPosI[0], (VTX_IN_OB + 0 * inputVtxSize)(inputVtxPos) // 1st vec pos - ldv vPairPosI[8], (VTX_IN_OB + 1 * inputVtxSize)(inputVtxPos) // 2nd vec pos - llv sTCL[8], (VTX_IN_CN + 0 * 
inputVtxSize)(inputVtxPos) // RGBA in 4:5 - llv sTCL[12], (VTX_IN_CN + 1 * inputVtxSize)(inputVtxPos) // RGBA in 6:7 - llv vPairST[0], (VTX_IN_TC + 0 * inputVtxSize)(inputVtxPos) // ST in 0:1 - j vtx_store_loop_entry - llv vPairST[8], (VTX_IN_TC + 1 * inputVtxSize)(inputVtxPos) // ST in 4:5 -.else - andi $11, $5, G_LIGHTING >> 8 - beqz $11, @@skip_lighting - li $16, vtx_return_from_lighting // This is clipFlags, but not modified - li $16, lt_vtx_pair // during vtx_store -@@skip_lighting: - andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled - jal while_wait_dma_busy // Wait for vertex load to finish - li $19, clipTempVerts // Temp mem we can freely overwrite replaces outputVtxPos - j vtx_store_loop_entry - move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos -.endif - -.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC - -// $v0:$v7 = MVP, $v8:$v10 = sVPS/sVPO/sSTS, $v11 = available, $v12 = sFGM, -// $v13 = first light dir, $v14:$v16 = Y/Z/vPairNrml/temp, $v17 = vPairLt/temp, -// $v18:$v19 = available, $v20:$v21 = vPairPosI/F/temp, -// $v22 = vPairST, $v23:$v24 = vPairTPosF/I/temp, $v25:$v26 = temps, $v27 = vPairRGBA, -// $v28 = vOne, $v29 = garbage, $v30 = params, $v31 = constants -// $1: 0x10 vtx count, $2: need for clipping, $3: init lt ptr, $4: vtx1/perf, -// $5: geom mode mid, $6: need for clipping, $7: fog flag, $8: secondVtxPos, -// $9: need for clipping, $10:$11: temp, $12: perf, $13: altBaseReg, $14: inputVtxPos, -// $15: outputVtxPos, $16: lt jump addr, $17:$18: need for clipping, $19: shadow out vtx, -// $20: temp, $21: need for clipping, $22:$23: cmd buf, $24: temp, $25: cmd_w0 global, -// $26: taskDataPtr, $27: inputBufferPos, $28:$30: perf, $ra return addr - -.align 8 -vtx_loop_no_lighting: - vmadh $v29, vM1I, vPairPosI[1h] - andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about - vmadn vPairTPosF, vM2F, vPairPosI[2h] - or $10, $10, $11 // Combine results for first vertex - vmadh vPairTPosI, vM2I, vPairPosI[2h] - 
sh $10, (VTX_CLIP )($19) // Store first vertex flags -// sKPI is $v11 // vtx_store Keep Int (keep across pipelining) -// sKPG is vBBB = $v21 // vtx_store Keep Fog - vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) - luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA -// sCLZ is $v19 - vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 - addi $1, $1, -2*inputVtxSize // Decrement vertex count by 2 -vtx_return_from_lighting: -vtx_store_for_clip: - vmudl $v29, vPairTPosF, $v30[3] // Persp norm - sub $20, secondVtxPos, $7 // Points 8 before secondVtxPos if fog, else 0 -// s1WI is $v16 // vtx_store 1/W Int - vmadm s1WI, vPairTPosI, $v30[3] // Persp norm - addi outputVtxPos, outputVtxPos, 2*vtxSize // Points to SECOND output vtx -// s1WF is $v17 // vtx_store 1/W Frac - vmadn s1WF, $v31, $v31[2] // 0 - sbv sKPG[15], (VTX_COLOR_A + 8)($20) // In VTX_SCR_Y if fog disabled... -// sKPF is $v18 // vtx_store Keep Frac - vmov sKPF[1], sCLZ[2] - sbv sKPG[7], (VTX_COLOR_A + 8 - vtxSize)($20) // ...which gets overwritten below -// sSCF is $v20 // vtx_store Scaled Clipping Frac - vmudn sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping - ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) -// sSCI is $v21 // vtx_store Scaled Clipping Int - vmadh sSCI, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping - slv sKPI[8], (VTX_SCR_VEC )(secondVtxPos) - vrcph $v29[0], s1WI[3] - slv sKPI[0], (VTX_SCR_VEC )($19) -// sRTF is $v25 // vtx_store Reciprocal Temp Frac - vrcpl sRTF[2], s1WF[3] - ssv sKPF[12], (VTX_SCR_Z_FRAC )(secondVtxPos) -// sRTI is $v26 // vtx_store Reciprocal Temp Int - vrcph sRTI[3], s1WI[7] - slv sKPF[2], (VTX_SCR_Z )($19) - vrcpl sRTF[6], s1WF[7] - sra $24, $1, 31 // All 1s if on last iter - vrcph sRTI[7], $v31[2] // 0 - andi $24, $24, vtxSize // vtxSize if on last iter, else normally 0 - vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high - sub secondVtxPos, outputVtxPos, $24 // First output vtx on last iter, else second - vcl 
$v29, vPairTPosF, vPairTPosF[3h] // Clip screen low - addi $19, outputVtxPos, -vtxSize // First output vtx always - vmudl $v29, s1WF, sRTF[2h] - cfc2 $10, $vcc // Screen clip results - vmadm $v29, s1WI, sRTF[2h] - sdv vPairTPosF[8], (VTX_FRAC_VEC )(secondVtxPos) - vmadn s1WF, s1WF, sRTI[3h] -// sTCL is $v19 // vtx_store Temp CoLor - ldv sTCL[0], (VTX_IN_TC + 2 * inputVtxSize)(inputVtxPos) // ST in 0:1, RGBA in 2:3 - vmadh s1WI, s1WI, sRTI[3h] - sdv vPairTPosF[0], (VTX_FRAC_VEC )($19) -// sST2 equ $v11 // vtx_store ST coordinates copy 2 - vmudm sST2, vPairST, sSTS // Scale ST - lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below - vmudh $v29, vOne, $v31[4] // 4 - sdv vPairTPosI[8], (VTX_INT_VEC )(secondVtxPos) - vmadn s1WF, s1WF, $v31[0] // -4 - lsv vPairTPosF[6], (VTX_Z_FRAC )($19) // load Z into W slot, will be for fog below - vmadh s1WI, s1WI, $v31[0] // -4 - sdv vPairTPosI[0], (VTX_INT_VEC )($19) - // vnop - ldv sTCL[8], (VTX_IN_TC + 3 * inputVtxSize)(inputVtxPos) // ST in 4:5, RGBA in 6:7 - vch $v29, vPairTPosI, sSCI[3h] // Clip scaled high - suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos) // Store RGBA for second vtx - vmudl $v29, s1WF, sRTF[2h] - lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below - vmadm $v29, s1WI, sRTF[2h] - suv vPairRGBA[0], (VTX_COLOR_VEC )($19) // Store RGBA for first vtx - vmadn s1WF, s1WF, sRTI[3h] - lsv vPairTPosI[6], (VTX_Z_INT )($19) // load Z into W slot, will be for fog below - vmadh s1WI, s1WI, sRTI[3h] - srl $24, $10, 4 // Shift second vertex screen clipping to first slots - vcl $v29, vPairTPosF, sSCF[3h] // Clip scaled low - andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about - vcopy vPairST, sTCL - cfc2 $20, $vcc // Scaled clip results - vmudl $v29, vPairTPosF, s1WF[3h] // Pos times inv W - ssv s1WF[14], (VTX_INV_W_FRAC)(secondVtxPos) - vmadm $v29, vPairTPosI, s1WF[3h] // Pos times inv W -// vPairPosI is $v20 
- ldv vPairPosI[0], (VTX_IN_OB + 2 * inputVtxSize)(inputVtxPos) // Pos of 1st vector for next iteration - vmadn vPairTPosF, vPairTPosF, s1WI[3h] - ldv vPairPosI[8], (VTX_IN_OB + 3 * inputVtxSize)(inputVtxPos) // Pos of 2nd vector on next iteration - vmadh vPairTPosI, vPairTPosI, s1WI[3h] // vPairTPosI:vPairTPosF = pos times inv W - addi inputVtxPos, inputVtxPos, (2 * inputVtxSize) // Advance two positions forward in the input vertices - vmov sTCL[4], vPairST[2] // First vtx RG to elem 4 - andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about - vmov sTCL[5], vPairST[3] // First vtx BA to elem 5 - sll $11, $20, 4 // Shift first vertex scaled clipping to second slots - vmudl $v29, vPairTPosF, $v30[3] // Persp norm - ssv s1WF[6], (VTX_INV_W_FRAC)($19) - vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm - ssv s1WI[14], (VTX_INV_W_INT )(secondVtxPos) - vmadn vPairTPosF, $v31, $v31[2] // 0; Now vPairTPosI:vPairTPosF = projected position - ssv s1WI[6], (VTX_INV_W_INT )($19) - // vnop - slv sST2[8], (VTX_TC_VEC )(secondVtxPos) // Store scaled S, T vertex 2 - vmudh $v29, sVPO, vOne // offset * 1 - slv sST2[0], (VTX_TC_VEC )($19) // Store scaled S, T vertex 1 - vmadh $v29, sFGM, $v31[6] // + (0,0,0,1,0,0,0,1) * 0x7F00 - andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about - vmadn sKPF, vPairTPosF, sVPS // + pos frac * scale - or $24, $24, $20 // Combine results for second vertex - vmadh sKPI, vPairTPosI, sVPS // int part, sKPI:sKPF is now screen space pos - sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags -vtx_store_loop_entry: - vmudn $v29, vM3F, vOne - blez $1, vtx_epilogue - vmadh $v29, vM3I, vOne - vmadn $v29, vM0F, vPairPosI[0h] - sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order - vmadh $v29, vM0I, vPairPosI[0h] - jr $ra - vmadn $v29, vM1F, vPairPosI[1h] - -vtx_epilogue: - vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) - andi $11, $11, CLIP_SCAL_NPXY // Mask 
to only bits we care about - vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 - or $10, $10, $11 // Combine results for first vertex - beqz $7, @@skip_fog - slv sKPI[8], (VTX_SCR_VEC )(secondVtxPos) - sbv sKPG[15], (VTX_COLOR_A )(secondVtxPos) - sbv sKPG[7], (VTX_COLOR_A )($19) -@@skip_fog: - vmov sKPF[1], sCLZ[2] - ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) - slv sKPI[0], (VTX_SCR_VEC )($19) - ssv sKPF[12], (VTX_SCR_Z_FRAC )(secondVtxPos) - bltz $ra, clip_after_vtx_store - slv sKPF[2], (VTX_SCR_Z )($19) - sh $10, (VTX_CLIP )($19) // Store first vertex flags - j vertex_end - lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store - -.else // end of new LVP_NOC - -.if CFG_LEGACY_VTX_PIPE -vtx_early_return_from_lighting: - vmrg vPairRGBA, vPairLt, vPairRGBA // RGB = light, A = vtx alpha -.endif -vtx_return_from_lighting: - li $ra, vertex_end -.if CFG_LEGACY_VTX_PIPE - vmudm vPairST, vPairST, sSTS // Scale ST; must be after texgen -@@skipsecond: -.else - vclr sSTO - andi $11, $5, G_ATTROFFSET_ST_ENABLE >> 8 - vmudn $v29, vVP3F, vOne - beqz $11, @@skipoffset - vmadh $v29, vVP3I, vOne - llv sSTO[0], (attrOffsetST - altBase)(altBaseReg) // elems 0, 1 = S, T offset - llv sSTO[8], (attrOffsetST - altBase)(altBaseReg) // elems 4, 5 = S, T offset -@@skipoffset: - vmadl $v29, vVP0F, vPairPosF[0h] - llv sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1 - vmadm $v29, vVP0I, vPairPosF[0h] - llv sSTS[8], (textureSettings2)($zero) // Texture ST scale in 4, 5 - vmadn $v29, vVP0F, vPairPosI[0h] - vmadh $v29, vVP0I, vPairPosI[0h] - vmadl $v29, vVP1F, vPairPosF[1h] - vmadm $v29, vVP1I, vPairPosF[1h] - vmadn $v29, vVP1F, vPairPosI[1h] - vmadh $v29, vVP1I, vPairPosI[1h] - vmadl $v29, vVP2F, vPairPosF[2h] - vmadm $v29, vVP2I, vPairPosF[2h] - vmadn vPairTPosF, vVP2F, vPairPosI[2h] - vmadh vPairTPosI, vVP2I, vPairPosI[2h] - vmudm $v29, vPairST, sSTS // Scale ST; must be after texgen - vmadh vPairST, sSTO, vOne // + 1 * (ST offset or zero) -.endif - addi outputVtxPos, 
outputVtxPos, 2*vtxSize -vtx_store_for_clip: - // Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA - // Locals: $v20, $v21, $v25, $v26, $v16, $v17 ($v29 is temp). Also vPairST and - // vPairRGBA can be used as temps once stored ($v22, $v27). - // Scalar regs: secondVtxPos, outputVtxPos; set to the same thing if only write 1 vtx - // temps $10, $11, $20, $24 - vmudl $v29, vPairTPosF, $v30[3] // Persp norm - move secondVtxPos, outputVtxPos // Second and output vertices write to same mem... - vmadm s1WI, vPairTPosI, $v30[3] // Persp norm - bltz $1, @@skipsecond // ...if < 0 verts remain, ... - vmadn s1WF, $v31, $v31[2] // 0 - addi secondVtxPos, outputVtxPos, vtxSize // ...otherwise, second vtx is next vtx -@@skipsecond: - vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high - suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos) - vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low - suv vPairRGBA[0], (VTX_COLOR_VEC )(outputVtxPos) - vrcph $v29[0], s1WI[3] - cfc2 $10, $vcc // Load screen clipping results - vrcpl sRTF[2], s1WF[3] - sdv vPairTPosF[8], (VTX_FRAC_VEC )(secondVtxPos) - vrcph sRTI[3], s1WI[7] - move $19, outputVtxPos // Else $19 is initialized to temp memory on first pre-loop - vrcpl sRTF[6], s1WF[7] - sdv vPairTPosF[0], (VTX_FRAC_VEC )(outputVtxPos) - vrcph sRTI[7], $v31[2] // 0 - sdv vPairTPosI[8], (VTX_INT_VEC )(secondVtxPos) - vmudn sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping - sdv vPairTPosI[0], (VTX_INT_VEC )(outputVtxPos) - vmadh sSCI, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping - slv vPairST[8], (VTX_TC_VEC )(secondVtxPos) - vmudl $v29, s1WF, sRTF[2h] - slv vPairST[0], (VTX_TC_VEC )(outputVtxPos) - vmadm $v29, s1WI, sRTF[2h] - -.if CFG_NO_OCCLUSION_PLANE - vmadn s1WF, s1WF, sRTI[3h] - addi inputVtxPos, inputVtxPos, 2*inputVtxSize - vmadh s1WI, s1WI, sRTI[3h] -vtx_store_loop_entry: -// vPairST is $v22 - ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3 - vch $v29, vPairTPosI, 
sSCI[3h] // Clip scaled high - ldv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5, RGBA in 6:7 - vmudh $v29, vOne, $v31[4] // 4 * 1 in elems 3, 7 - lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below - vmadn s1WF, s1WF, $v31[0] // -4 - lsv vPairTPosI[6], (VTX_Z_INT )($19) // load Z into W slot, will be for fog below - vmadh s1WI, s1WI, $v31[0] // -4 - srl $24, $10, 4 // Shift second vertex screen clipping to first slots - vcl $v29, vPairTPosF, sSCF[3h] // Clip scaled low - andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about -// sTCL is $v21 - vcopy sTCL, vPairST - cfc2 $20, $vcc // Load scaled clipping results - vmudl $v29, s1WF, sRTF[2h] - lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below - vmadm $v29, s1WI, sRTF[2h] - lsv vPairTPosF[6], (VTX_Z_FRAC )($19) // load Z into W slot, will be for fog below - vmadn s1WF, s1WF, sRTI[3h] -// vPairPosI is $v20 - ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) - vmadh s1WI, s1WI, sRTI[3h] // s1WI:s1WF is 1/W - ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) - vmov sTCL[4], vPairST[2] - andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about - vmov sTCL[5], vPairST[3] - ori $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts - vmudl $v29, vPairTPosF, s1WF[3h] - ssv s1WF[14], (VTX_INV_W_FRAC)(secondVtxPos) - vmadm $v29, vPairTPosI, s1WF[3h] - ssv s1WF[6], (VTX_INV_W_FRAC)($19) - vmadn vPairTPosF, vPairTPosF, s1WI[3h] - ssv s1WI[14], (VTX_INV_W_INT )(secondVtxPos) - vmadh vPairTPosI, vPairTPosI, s1WI[3h] // pos * 1/W - ssv s1WI[6], (VTX_INV_W_INT )($19) - // vnop - sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA - // vnop -.if CFG_LEGACY_VTX_PIPE - lpv $v14[7], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4 -.else -// sVPO is $v17 // vtx_store ViewPort Offset - lqv sVPO, 
(tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset -.endif - vmudl $v29, vPairTPosF, $v30[3] // Persp norm -.if CFG_LEGACY_VTX_PIPE - lpv $v15[6], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4 -.else -// sVPS is $v26 // vtx_store ViewPort Scale - lqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale -.endif - vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm -// vPairRGBA is $v27 - luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA - vmadn vPairTPosF, $v31, $v31[2] // 0 - sll $11, $20, 4 // Shift first vertex scaled clipping to second slots -.if !CFG_LEGACY_VTX_PIPE -// sTPN is $v16 - vmov sTPN[2], vPairPosI[7] // Move vtx 1 packed normals to elem 2 -.endif - andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about -.if !CFG_LEGACY_VTX_PIPE - vmov sTPN[0], vPairPosI[3] // Move vtx 0 packed normals to elem 0 -.endif - andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about - vmudh $v29, sVPO, vOne // offset * 1 - or $24, $24, $20 // Combine results for second vertex - vmadn vPairTPosF, vPairTPosF, sVPS // + XYZ * scale - or $10, $10, $11 // Combine results for first vertex - vmadh vPairTPosI, vPairTPosI, sVPS - sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags -// sFOG is $v25 - vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog -.if !CFG_LEGACY_VTX_PIPE - sdv sTPN[0], (tempVpPkNorm)(rdpCmdBufEndP1) // Vtx 0 and 1 packed normals -.endif - // vnop - sh $10, (VTX_CLIP )($19) // Store first vertex results -// vPairNrml is $v16 - vmudn vPairNrml, vPairRGBA, $v31[3] // 2; left shift RGBA without clamp; vtx pair normals - ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos) -// sCLZ is $v21 // vtx_store CLamped Z - vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0 - ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19) - vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only) - slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos) - vmudn $v29, vM3F, vOne - slv 
vPairTPosI[0], (VTX_SCR_VEC )($19) - vmadh $v29, vM3I, vOne - blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping - vmadn $v29, vM0F, vPairPosI[0h] - move $ra, $16 // Normally $ra = loop or lighting -skip_return_to_lt_or_loop: - vmadh $v29, vM0I, vPairPosI[0h] - addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize - vmadn $v29, vM1F, vPairPosI[1h] - ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) - vmadh $v29, vM1I, vPairPosI[1h] - ssv sCLZ[4], (VTX_SCR_Z )($19) -// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23 - vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords - beqz $7, return_routine // fog disabled -// sOUTI = vPairPosI is $v20, or vPairTPosI is $v24 - vmadh sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords - sbv sFOG[15], (VTX_COLOR_A )(secondVtxPos) - jr $ra - sbv sFOG[7], (VTX_COLOR_A )($19) - -.else // CFG_NO_OCCLUSION_PLANE - -// sOCM is $v22 // vtx_store OCclusion Mid, $v22 = vPairST - ldv sOCM[0], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) - vmadn s1WF, s1WF, sRTI[3h] - ldv sOCM[8], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) - vmadh s1WI, s1WI, sRTI[3h] - srl $24, $10, 4 // Shift second vertex screen clipping to first slots - vch $v29, vPairTPosI, sSCI[3h] // Clip scaled high - andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about - vcl $v29, vPairTPosF, sSCF[3h] // Clip scaled low - andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about - vmudh $v29, vOne, $v31[4] // 4 * 1 in elems 3, 7 - cfc2 $20, $vcc // Load scaled clipping results - vmadn s1WF, s1WF, $v31[0] // -4 - ori $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts - vmadh s1WI, s1WI, $v31[0] // -4 - addi inputVtxPos, inputVtxPos, 2*inputVtxSize - vmudn $v29, vPairTPosF, sOCM // X * kx, Y * ky, Z * kz - vmadh $v29, vPairTPosI, sOCM // Int * int - lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z 
into W slot, will be for fog below -// sOC1 is $v21 // vtx_store OCclusion temp 1 - vreadacc sOC1, ACC_UPPER // Load int * int portion - lsv vPairTPosF[6], (VTX_Z_FRAC )(outputVtxPos) // load Z into W slot, will be for fog below - vmudl $v29, s1WF, sRTF[2h] - lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below - vmadm $v29, s1WI, sRTF[2h] - lsv vPairTPosI[6], (VTX_Z_INT )(outputVtxPos) // load Z into W slot, will be for fog below - vmadn s1WF, s1WF, sRTI[3h] - sll $11, $20, 4 // Shift first vertex scaled clipping to second slots - vmadh s1WI, s1WI, sRTI[3h] // s1WI:s1WF is 1/W - andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about - veq $v29, $v31, $v31[3h] // Set VCC to 00010001 - blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping - vmrg sOC1, sOCM, sOC1 // Put constant factor in elems 3, 7 -vtx_store_loop_entry: - move $ra, $16 // Normally $ra = loop or lighting -skip_return_to_lt_or_loop: - vmudl $v29, vPairTPosF, s1WF[3h] // W must be overwritten with Z before here - ssv s1WF[14], (VTX_INV_W_FRAC)(secondVtxPos) - vmadm $v29, vPairTPosI, s1WF[3h] - ssv s1WF[6], (VTX_INV_W_FRAC)($19) - vmadn vPairTPosF, vPairTPosF, s1WI[3h] - ssv s1WI[14], (VTX_INV_W_INT )(secondVtxPos) - vmadh vPairTPosI, vPairTPosI, s1WI[3h] // pos * 1/W - ssv s1WI[6], (VTX_INV_W_INT )($19) - vadd sOC1, sOC1, sOC1[0q] // Add pairs upwards -.if !CFG_LEGACY_VTX_PIPE -// sVPO is $v17 // vtx_store ViewPort Offset - lqv sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset -.endif - // vnop -.if CFG_LEGACY_VTX_PIPE - addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize -.else -// sVPS is $v16 // vtx_store ViewPort Scale - lqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale -.endif - vmudl $v29, vPairTPosF, $v30[3] // Persp norm -// vPairST is $v22 - ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3 - vmadm vPairTPosI, vPairTPosI, $v30[3] // 
Persp norm - ldv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5, RGBA in 6:7 - vmadn vPairTPosF, $v31, $v31[2] // 0 -// vPairPosI is $v20 - ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) - vadd sOC1, sOC1, sOC1[1h] // Add elems 1, 5 to 3, 7 - ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) - // vnop -// sO03 is $v26 // vtx_store Occlusion coeffs 0-3 - ldv sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3 - vmudh $v29, sVPO, vOne // offset * 1 - ldv sO03[8], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // and for vtx 2 - vmadn vPairTPosF, vPairTPosF, sVPS // + XYZ * scale -.if !CFG_LEGACY_VTX_PIPE -// sOPM is $v17 // vtx_store Occlusion Plus Minus constants - lqv sOPM, (tempOccPlusMinus)(rdpCmdBufEndP1) // Load occlusion plane -/+4000 constants -.endif - vmadh vPairTPosI, vPairTPosI, sVPS - andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about -// sFOG is $v16 - vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog - or $10, $10, $11 // Combine results for first vertex - vlt $v29, sOC1, $v31[2] // Occlusion plane equation < 0 in elems 3, 7 - slv vPairST[4], (tempVpRGBA + 0)(rdpCmdBufEndP1) // Store vtx 0 RGBA to temp mem -.if !CFG_LEGACY_VTX_PIPE -// sTPN is $v18 - vmov sTPN[2], vPairPosI[7] // Move vtx 1 packed normals to elem 2 -.endif - slv vPairST[12], (tempVpRGBA + 4)(rdpCmdBufEndP1) // Store vtx 1 RGBA to temp mem -.if !CFG_LEGACY_VTX_PIPE - vmov sTPN[0], vPairPosI[3] // Move vtx 0 packed normals to elem 0 -.endif - cfc2 $11, $vcc // Load occlusion plane mid results to bits 3 and 7 -// sOSC is $v21 // vtx_store Occlusion SCaled up - vmudh sOSC, vPairTPosI, $v31[4] // 4; scale up x and y - ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos) - vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only) - or $24, $24, $20 // Combine results for second vertex -// sCLZ is $v25 // vtx_store CLamped Z - vge sCLZ, vPairTPosI, $v31[2] // 0; 
clamp Z to >= 0 - ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19) - vmulf $v29, sOPM, vPairTPosI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2 -// sO47 is $v23 // vtx_store Occlusion coeffs 0-3; $v23 = vPairTPosF - ldv sO47[0], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // Load coeffs 4-7 -// sOC2 is $v27 // vtx_store OCclusion temp 2; $v27 = vPairRGBA - vmacf sOC2, sO03, sOSC[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2 - ldv sO47[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2 - vmulf $v29, sOPM, vPairTPosI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2 - beqz $7, @@skipfog // fog disabled -// sOC3 is $v21 // vtx_store OCclusion temp 3 - vmacf sOC3, sO03, sOSC[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2 - sbv sFOG[15], (VTX_COLOR_A )(secondVtxPos) - sbv sFOG[7], (VTX_COLOR_A )($19) -@@skipfog: - slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos) - veq $v29, $v31, $v31[0q] // Set VCC to 10101010 - slv vPairTPosI[0], (VTX_SCR_VEC )($19) - vmrg sOC2, sOC2, sOC3 // Elems 0-3 are results for vtx 0, 4-7 for vtx 1 -.if CFG_LEGACY_VTX_PIPE - lpv $v14[7], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4 -.else - sdv sTPN[0], (tempVpPkNorm)(rdpCmdBufEndP1) // Vtx 0 and 1 packed normals + // 52 cycles + vrcp $v20[2], $v6[1] + lb $20, (alphaCompareCullMode)($zero) + vrcph $v22[2], $v6[1] + lw $5, VTX_INV_W_VEC($1) + vrcp $v20[3], $v8[1] + lw $7, VTX_INV_W_VEC($2) + vrcph $v22[3], $v8[1] + lw $8, VTX_INV_W_VEC($3) + vmudl tV1AtI, tV1AtI, $v30[3] // 0x0100; vertex color 1 >>= 8 + lbu $9, textureSettings1 + 3 + vmudl tV2AtI, tV2AtI, $v30[3] // 0x0100; vertex color 2 >>= 8 + sub $11, $5, $7 + vmudl tV3AtI, tV3AtI, $v30[3] // 0x0100; vertex color 3 >>= 8 + sra $10, $11, 31 + vmov $v15[3], $v8[0] + and $11, $11, $10 + vmudl $v29, $v20, $v30[7] // 0x0020 + beqz $20, tri_skip_alpha_compare_cull + sub $5, $5, $11 + // Alpha compare culling + vge $v26, tV1AtI, tV2AtI + lbu $19, alphaCompareCullThresh + vlt $v27, tV1AtI, tV2AtI + bgtz $20, 
@@skip1 + vge $v26, $v26, tV3AtI // If alphaCompareCullMode > 0, $v26 = max of 3 verts + vlt $v26, $v27, tV3AtI // else if < 0, $v26 = min of 3 verts +@@skip1: // $v26 elem 3 has max or min alpha value + mfc2 $24, $v26[6] + sub $24, $24, $19 // sign bit set if (max/min) < thresh + xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull + bltz $24, return_and_end_mat // if max < thresh or if min >= thresh. +tri_skip_alpha_compare_cull: + // 63 cycles + vmadm $v22, $v22, $v30[7] // 0x0020 + sub $11, $5, $8 // Four instr: $5 = max($5, $8) + vmadn $v20, $v31, $v31[2] // 0 + sra $10, $11, 31 + vmudm $v25, $v15, $v30[2] // 0x1000 + and $11, $11, $10 + vmadn $v15, $v31, $v31[2] // 0 + sub $5, $5, $11 + vsubc $v4, vZero, $v4 + sw $5, 0x0010(rdpCmdBufPtr) + vsub $v26, vZero, vZero + llv $v27[0], 0x0010(rdpCmdBufPtr) + vmudm $v29, $v25, $v20 + mfc2 $5, $v17[1] + vmadl $v29, $v15, $v20 + lbu $7, textureSettings1 + 2 + vmadn $v20, $v15, $v22 + lsv tV2AtI[14], VTX_SCR_Z($2) + vmadh $v15, $v25, $v22 + lsv tV3AtI[14], VTX_SCR_Z($3) + vmudl $v29, $v23, $v16 + lsv tV2AtF[14], VTX_SCR_Z_FRAC($2) + vmadm $v29, $v24, $v16 + lsv tV3AtF[14], VTX_SCR_Z_FRAC($3) + vmadn $v16, $v23, $v17 + ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id + vmadh $v17, $v24, $v17 + or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id + vand $v22, $v20, $v30[5] // 0xFFF8 + // nop + vcr $v15, $v15, $v30[3] // 0x0100 + sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id + vmudh $v29, vOne, $v30[6] // 0x0010 + ssv $v10[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient + vmadn $v16, $v16, $v30[4] // -16 + ssv $v2[2], 0x0004(rdpCmdBufPtr) // Store YM edge coefficient + vmadh $v17, $v17, $v30[4] // -16 + ssv $v14[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient + vmudn $v29, $v3, $v14[0] + lw $20, otherMode1 + vmadl $v29, $v22, $v4[1] + andi $10, $5, 
0x0080 // Extract the left major flag from $5 + vmadm $v29, $v15, $v4[1] + or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings + vmadn $v2, $v22, $v26[1] + sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings + vmadh $v3, $v15, $v26[1] + sb $zero, materialCullMode // This covers tri write out + vrcph $v29[0], $v27[0] + andi $20, ZMODE_DEC + vrcpl $v10[0], $v27[1] + addi $20, $20, -ZMODE_DEC + vmudh $v14, vOne, $v13[1q] + beqz $9, tri_skip_tex // If textures are not enabled, skip texture coefficient calculation + vrcph $v27[0], $v31[2] // 0 + vmudh $v22, vOne, $v31[7] // 0x7FFF + vmudm $v29, $v13, $v10[0] + vmadl $v29, $v14, $v10[0] + llv $v22[0], VTX_TC_VEC($1) + vmadn $v14, $v14, $v27[0] + llv $v22[8], VTX_TC_VEC($2) + vmadh $v13, $v13, $v27[0] + vmudh $v10, vOne, $v31[7] // 0x7FFF + vge $v29, $v30, $v30[7] // Set VCC to 11110001; select RGBA___Z or ____STW_ + llv $v10[8], VTX_TC_VEC($3) + vmudm $v29, $v22, $v14[0h] + vmadh $v22, $v22, $v13[0h] + vmadn $v25, $v31, $v31[2] // 0 + vmudm $v29, $v10, $v14[6] // acc = (v10 * v14[6]); v29 = mid(clamp(acc)) + vmadh $v10, $v10, $v13[6] // acc += (v10 * v13[6]) << 16; v10 = mid(clamp(acc)) + vmadn $v13, $v31, $v31[2] // 0; v13 = lo(clamp(acc)) + sdv $v22[0], 0x0020(rdpCmdBufPtr) + vmrg tV2AtI, tV2AtI, $v22 // Merge S, T, W into elems 4-6 + sdv $v25[0], 0x0028(rdpCmdBufPtr) // 8 + vmrg tV2AtF, tV2AtF, $v25 // Merge S, T, W into elems 4-6 + ldv tV1AtI[8], 0x0020(rdpCmdBufPtr) // 8 + vmrg tV3AtI, tV3AtI, $v10 // Merge S, T, W into elems 4-6 + ldv tV1AtF[8], 0x0028(rdpCmdBufPtr) // 8 + vmrg tV3AtF, tV3AtF, $v13 // Merge S, T, W into elems 4-6 +tri_skip_tex: +.if !ENABLE_PROFILING + addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP .endif - // vnop - ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) - // vnop -.if CFG_LEGACY_VTX_PIPE - lpv $v15[6], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4 -.else - addi $1, $1, -2*inputVtxSize 
// Counter of remaining verts * inputVtxSize + // 108 cycles + vmudl $v29, $v16, $v23 + lsv tV1AtF[14], VTX_SCR_Z_FRAC($1) + vmadm $v29, $v17, $v23 + lsv tV1AtI[14], VTX_SCR_Z($1) + vmadn $v23, $v16, $v24 + lh $1, VTX_SCR_VEC($2) + vmadh $v24, $v17, $v24 + addi $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients) +// tV*At* contains R, G, B, A, S, T, W, Z. tD31* = vtx 3 - vtx 1, tD21* = vtx 2 - vtx 1 +tD31F equ $v10 +tD31I equ $v9 +tD21F equ $v13 +tD21I equ $v7 + vsubc tD31F, tV3AtF, tV1AtF + andi $3, $6, G_SHADE + vsub tD31I, tV3AtI, tV1AtI + sll $1, $1, 14 + vsubc tD21F, tV2AtF, tV1AtF + sw $1, 0x0008(rdpCmdBufPtr) // Store XL edge coefficient + vsub tD21I, tV2AtI, tV1AtI + ssv $v3[6], 0x0010(rdpCmdBufPtr) // Store XH edge coefficient (integer part) +// DaDx = (v3 - v1) * factor + (v2 - v1) * factor +tDaDxF equ $v2 +tDaDxI equ $v3 + vmudn $v29, tD31F, $v6[1] + ssv $v2[6], 0x0012(rdpCmdBufPtr) // Store XH edge coefficient (fractional part) + vmadh $v29, tD31I, $v6[1] + ssv $v3[4], 0x0018(rdpCmdBufPtr) // Store XM edge coefficient (integer part) + vmadn $v29, tD21F, $v12[1] + ssv $v2[4], 0x001A(rdpCmdBufPtr) // Store XM edge coefficient (fractional part) + vmadh $v29, tD21I, $v12[1] + ssv $v15[0], 0x000C(rdpCmdBufPtr) // Store DxLDy edge coefficient (integer part) + vreadacc tDaDxF, ACC_MIDDLE + ssv $v20[0], 0x000E(rdpCmdBufPtr) // Store DxLDy edge coefficient (fractional part) + vreadacc tDaDxI, ACC_UPPER + ssv $v15[6], 0x0014(rdpCmdBufPtr) // Store DxHDy edge coefficient (integer part) +// DaDy = (v2 - v1) * factor + (v3 - v1) * factor +tDaDyF equ $v6 +tDaDyI equ $v7 + vmudn $v29, tD21F, $v8[0] + ssv $v20[6], 0x0016(rdpCmdBufPtr) // Store DxHDy edge coefficient (fractional part) + vmadh $v29, tD21I, $v8[0] + ssv $v15[4], 0x001C(rdpCmdBufPtr) // Store DxMDy edge coefficient (integer part) + vmadn $v29, tD31F, $v11[0] + ssv $v20[4], 0x001E(rdpCmdBufPtr) // Store DxMDy edge coefficient (fractional part) + vmadh $v29, tD31I, 
$v11[0] + sll $11, $3, 4 // Shift (geometry mode & G_SHADE) by 4 to get 0x40 if G_SHADE is set + vreadacc tDaDyF, ACC_MIDDLE + add $1, $2, $11 // Increment the triangle pointer by 0x40 bytes (shade coefficients) if G_SHADE is set + vreadacc tDaDyI, ACC_UPPER + sll $11, $9, 5 // Shift texture enabled (which is 2 when on) by 5 to get 0x40 if textures are on +// DaDx, DaDy *= more factors + vmudl $v29, tDaDxF, $v23[1] + add rdpCmdBufPtr, $1, $11 // Increment the triangle pointer by 0x40 bytes (texture coefficients) if textures are on + vmadm $v29, tDaDxI, $v23[1] + andi $6, $6, G_ZBUFFER // Get the value of G_ZBUFFER from the current geometry mode + vmadn tDaDxF, tDaDxF, $v24[1] + sll $11, $6, 4 // Shift (geometry mode & G_ZBUFFER) by 4 to get 0x10 if G_ZBUFFER is set + vmadh tDaDxI, tDaDxI, $v24[1] + move $10, rdpCmdBufPtr // Write Z here + vmudl $v29, tDaDyF, $v23[1] + add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set + vmadm $v29, tDaDyI, $v23[1] + sub $8, rdpCmdBufPtr, rdpCmdBufEndP1 // Check if we need to write out to RDP + vmadn tDaDyF, tDaDyF, $v24[1] + sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional) + vmadh tDaDyI, tDaDyI, $v24[1] + sdv tDaDxI[0], 0x0008($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (integer) +// DaDe = DaDx * factor +tDaDeF equ $v8 +tDaDeI equ $v9 + // 136 cycles + vmadl $v29, tDaDxF, $v20[3] + sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional) + vmadm $v29, tDaDxI, $v20[3] + sdv tDaDxI[8], 0x0008($1) // Store DsDx, DtDx, DwDx texture coefficients (integer) + vmadn tDaDeF, tDaDxF, $v15[3] + sdv tDaDyF[0], 0x0038($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (fractional) + vmadh tDaDeI, tDaDxI, $v15[3] + sdv tDaDyI[0], 0x0028($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (integer) +// Base value += DaDe * factor + vmudn $v29, tV1AtF, vOne[0] + sdv tDaDyF[8], 
0x0038($1) // Store DsDy, DtDy, DwDy texture coefficients (fractional) + vmadh $v29, tV1AtI, vOne[0] + sdv tDaDyI[8], 0x0028($1) // Store DsDy, DtDy, DwDy texture coefficients (integer) + vmadl $v29, tDaDeF, $v4[1] + sdv tDaDeF[0], 0x0030($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (fractional) + vmadm $v29, tDaDeI, $v4[1] + sdv tDaDeI[0], 0x0020($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (integer) + vmadn tV1AtF, tDaDeF, $v26[1] + sdv tDaDeF[8], 0x0030($1) // Store DsDe, DtDe, DwDe texture coefficients (fractional) + vmadh tV1AtI, tDaDeI, $v26[1] + sdv tDaDeI[8], 0x0020($1) // Store DsDe, DtDe, DwDe texture coefficients (integer) + // All values start in element 7. "a", attribute, is Z. Need + // tV1AtI, tV1AtF, tDaDxI, tDaDxF, tDaDeI, tDaDeF, tDaDyI, tDaDyF + vmudn tDaDyF, tDaDyF, $v30[7] // 0x0020 + beqz $20, tri_decal_fix_z + vmadh tDaDyI, tDaDyI, $v30[7] // 0x0020 +tri_return_from_decal_fix_z: +tV1AtFF equ $v10 + vmudn tV1AtFF, tDaDeF, $v4[1] // Super-frac (frac * frac) part; assumes v4 factor >= 0 + sdv tV1AtF[0], 0x0010($2) // Store RGBA shade color (fractional) + vmudn tDaDeF, tDaDeF, $v30[7] // 0x0020 + sdv tV1AtI[0], 0x0000($2) // Store RGBA shade color (integer) + vmadh tDaDeI, tDaDeI, $v30[7] // 0x0020 + sdv tV1AtF[8], 0x0010($1) // Store S, T, W texture coefficients (fractional) + vmudn tDaDxF, tDaDxF, $v30[7] // 0x0020 + sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer) + vmadh tDaDxI, tDaDxI, $v30[7] // 0x0020 + ssv tDaDyF[14], 0x0E($10) + vmudl $v29, tV1AtFF, $v30[7] // 0x0020 + ssv tDaDyI[14], 0x0C($10) + vmadn tV1AtF, tV1AtF, $v30[7] // 0x0020 + ssv tDaDeF[14], 0x0A($10) + vmadh tV1AtI, tV1AtI, $v30[7] // 0x0020 + ssv tDaDeI[14], 0x08($10) + ssv tDaDxF[14], 0x06($10) + ssv tDaDxI[14], 0x04($10) + ssv tV1AtF[14], 0x02($10) +tri_end_check_rdp_buffer_full: + bltz $8, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. 
ptr <= end + ssv tV1AtI[14], 0x00($10) // If returning from no-Z, this is okay b/c $10 is at end + // 161 cycles +flush_rdp_buffer: // $8 = rdpCmdBufPtr - rdpCmdBufEndP1 + mfc0 $10, SP_DMA_BUSY // Check if any DMA is in flight + lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write + addi dmaLen, $8, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy +.if CFG_PROFILING_C + // This is a wait for DMA busy loop, but written inline to avoid overwriting ra. + addi perfCounterD, perfCounterD, 10 // 6 instr + 2 between end load and mfc + 0 taken branch overlaps with last + 2 between mfc and load .endif - // vnop - ssv sCLZ[4], (VTX_SCR_Z )($19) - vge $v29, sOC2, sO47 // Each compare to coeffs 4-7 -// vPairNrml is $v16 - lpv vPairNrml[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair normals - vmudn $v29, vM3F, vOne - cfc2 $20, $vcc - vmadh $v29, vM3I, vOne -// vPairRGBA is $v27 - luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair colors - vmadn $v29, vM0F, vPairPosI[0h] - andi $11, $11, CLIP_OCCLUDED | (CLIP_OCCLUDED >> 4) // Only bits 3, 7 from occlusion - vmadh $v29, vM0I, vPairPosI[0h] - or $20, $20, $11 // Combine occlusion results. 
Any set in 0-3, 4-7 = not occluded - vmadn $v29, vM1F, vPairPosI[1h] - andi $11, $20, 0x00F0 // Bits 4-7 for vtx 2 - vmadh $v29, vM1I, vPairPosI[1h] - bnez $11, @@skipv2 // If nonzero, at least one equation false, don't set occluded flag - andi $20, $20, 0x000F // Bits 0-3 for vtx 1 - ori $24, $24, CLIP_OCCLUDED // All equations true, set vtx 2 occluded flag -@@skipv2: -// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23 - vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords - bnez $20, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag - sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags - ori $10, $10, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag -@@skipv1: -// sOUTI = vPairPosI is $v20, or vPairTPosI is $v24 - vmadh sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords - jr $ra - sh $10, (VTX_CLIP )($19) // Store first vertex results - -.endif // CFG_NO_OCCLUSION_PLANE - -.endif // New LVP_NOC - -.if !CFG_PROFILING_A && (!CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE) -vertex_end: - j run_next_DL_command - lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store + bnez $10, flush_rdp_buffer // Wait until no DMAs are active + lw $10, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr) + mtc0 cmd_w1_dram, DPC_END // Set RDP to execute until FIFO end (buf pushed last time) + add $11, cmd_w1_dram, dmaLen // $11 = future FIFO pointer if we append this new buffer + sub $10, $10, $11 // $10 = FIFO end addr - future pointer + bgez $10, @@has_room // Branch if we can fit this +@@await_rdp_dblbuf_avail: + mfc0 $11, DPC_STATUS // Read RDP status + andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf + bnez $11, @@await_rdp_dblbuf_avail // Wait until double buffered start/end available +.if COUNTER_C_FIFO_FULL + addi perfCounterC, perfCounterC, 7 // 4 instr + 2 after mfc + 1 taken branch .endif - -.if CFG_PROFILING_A 
-vertex_end: - li $ra, 0 // Flag for coming from vtx -.if !CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE - lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store + lw cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO +@@await_past_first_instr: + mfc0 $11, DPC_CURRENT // Load RDP current pointer + beq $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start +.if COUNTER_C_FIFO_FULL + addi perfCounterC, perfCounterC, 6 // 3 instr + 2 after mfc + 1 taken branch +.else + nop .endif -tri_end: - mfc0 $11, DPC_CLOCK - lw $10, startCounterTime - sub $11, $11, $10 - beqz $ra, run_next_DL_command // $ra != 0 if from tri cmds - add perfCounterA, perfCounterA, $11 // Add to vert cycles perf counter - sub perfCounterA, perfCounterA, $11 // From tris, undo add to vert perf counter - sub $10, perfCounterC, $4 // How long we stalled for RDP FIFO during this cmd - sub $11, $11, $10 // Subtract that from the tri cycles - j run_next_DL_command - add perfCounterD, perfCounterD, $11 // Add to tri cycles perf counter + // Start was previously the start of the FIFO, unless this is the first buffer, + // in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we + // have a new end value waiting (END_VALID), it'll load end but leave current. By + // setting start here, it will also load current with start. 
+ mtc0 cmd_w1_dram, DPC_START // Set RDP start to start of FIFO +@@keep_waiting: +.if COUNTER_C_FIFO_FULL + // This is here so we only count it when stalling below or on FIFO end codepath + addi perfCounterC, perfCounterC, 10 // 7 instr + 2 after mfc + 1 taken branch .endif +@@has_room: + mfc0 $11, DPC_CURRENT // Load RDP current pointer + sub $11, $11, cmd_w1_dram // Current - current end (rdpFifoPos or start) + blez $11, @@copy_buffer // Current is behind or at current end, can do copy + sub $11, $11, dmaLen // If amount current is ahead of current end + blez $11, @@keep_waiting // is <= size of buffer to copy, keep waiting +@@copy_buffer: + add $11, cmd_w1_dram, dmaLen // New end is current end + buffer size + sw $11, rdpFifoPos + // Set up the DMA from DMEM to the RDP fifo in RDRAM + addi dmaLen, dmaLen, -1 // subtract 1 from the length + addi dmemAddr, rdpCmdBufEndP1, -(0x2000 | (RDP_CMD_BUFSIZE + 8)) // The 0x2000 is meaningless, negative means write + xori rdpCmdBufEndP1, rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word ^ rdpCmdBuffer2EndPlus1Word // Swap between the two RDP command buffers + j dma_read_write + addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8) -.if CFG_LEGACY_VTX_PIPE || CFG_NO_OCCLUSION_PLANE -G_MTX_end: - instantiate_mtx_end_begin -mtx_multiply: - instantiate_mtx_multiply +tri_decal_fix_z: + // Valid range of tV1AtI = 0 to 3FF, but most of the scene is large values + vmudm $v25, tV1AtI, $v31[5] // 0x4000; right shift 2; now 0 to FF + vsub $v25, $v25, $v30[3] // 0x0100; (0 to FF) - 100 = -100 to -1 + j tri_return_from_decal_fix_z + vcr tDaDyI, tDaDyI, $v25[7] + +tri_culled_by_occlusion_plane: +.if CFG_PROFILING_B + addi perfCounterB, perfCounterB, 0x4000 .endif - +return_and_end_mat: + jr $ra + sb $zero, materialCullMode // This covers all tri early exits except clipping + +tri_fan_store: + lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1 + j tri_main + sb $11, 5(rdpCmdBufPtr) // Store vtx 1 + .if (. 
& 4) .warning "One instruction of padding before ovl234" .endif @@ -2088,9 +1798,11 @@ ovl234_ovl4_entrypoint_ovl3ver: // same IMEM address as ovl234_ovl4_e // Jump here to do clipping. If overlay 3 is loaded (this code), directly starts // the clipping code. ovl234_clipping_entrypoint: + sh $ra, tempTriRA // Tri return after clipping .if CFG_PROFILING_B addi perfCounterB, perfCounterB, 1 // Increment clipped (input) tris count .endif + sb $zero, materialCullMode // In case only/all tri(s) clip then offscreen jal vtx_setup_constants li clipMaskIdx, 4 clip_after_constants: @@ -2125,7 +1837,7 @@ clip_edgelooptop: // Loop over edges connecting verts, possibly subdivide the ed beq $11, clipFlags, clip_nextedge // Both set or both clear = both off screen or both on screen, no subdivision move clipFlags, $11 // clipFlags = masked V2's flags // Going to subdivide this edge. Find available temp vertex slot. - li outputVtxPos, clipTempVerts + MAX_CLIP_GEN_VERTS * vtxSize + li outputVtxPos, clipTempVertsEnd clip_find_unused_loop: lhu $11, (VTX_CLIP - vtxSize)(outputVtxPos) addi $10, outputVtxPos, -clipTempVerts // This is within the loop rather than before b/c delay after lhu @@ -2263,530 +1975,819 @@ clip_skipxy: .else vmadm vPairST, vPairST, vClFade2[3] // + Fade factor for on screen vert * on screen vert TC .endif - vmudl $v29, $v6, vClFade1[3] // Fade factor for off screen vert * off screen vert pos frac - vmadm $v29, $v7, vClFade1[3] // + Fade factor for off screen vert * off screen vert pos int - vmadl $v29, $v4, vClFade2[3] // + Fade factor for on screen vert * on screen vert pos frac - vmadm vPairTPosI, $v5, vClFade2[3] // + Fade factor for on screen vert * on screen vert pos int + vmudl $v29, $v6, vClFade1[3] // Fade factor for off screen vert * off screen vert pos frac + vmadm $v29, $v7, vClFade1[3] // + Fade factor for off screen vert * off screen vert pos int + vmadl $v29, $v4, vClFade2[3] // + Fade factor for on screen vert * on screen vert pos frac + vmadm 
vPairTPosI, $v5, vClFade2[3] // + Fade factor for on screen vert * on screen vert pos int +.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC + j vtx_store_for_clip +.else + jal vtx_store_for_clip +.endif + vmadn vPairTPosF, $v31, $v31[2] // 0; load resulting frac pos +.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC +clip_after_vtx_store: + ori $10, $10, CLIP_VTX_USED // Mark generated vtx as used + slv sSTS[0], (VTX_TC_VEC )($19) // Store not-twice-scaled ST + sh $10, (VTX_CLIP )($19) // Store generated vertex flags +.endif +clip_nextedge: + bnez clipFlags, clip_edgelooptop // Discard V2 if it was off screen (whether inserted vtx or not) + move $3, $2 // Move what was the end of the edge to be the new start of the edge + sub $11, clipPolyWrite, clipPolySelect // Make sure we are not overflowing + addi $11, $11, 6 - ((MAX_CLIP_POLY_VERTS) * 2) // Write ptr to last zero slot + bgez $11, clip_done // If so, give up + sh $3, (clipPoly)(clipPolyWrite) // Former V2 was on screen, so add it to the output polygon + j clip_edgelooptop + addi clipPolyWrite, clipPolyWrite, 2 + +clip_w: + vcopy vClBaseF, $v4 // Result is just W + j clip_skipxy + vcopy vClBaseI, $v5 + +clip_nextcond: + sub $11, clipPolyWrite, clipPolySelect // Are there less than 3 verts in the output polygon? 
+ bltz $11, clip_done // If so, degenerate result, quit + sh $zero, (clipPoly)(clipPolyWrite) // Terminate the output polygon with a 0 + lhu $3, (clipPoly - 2)(clipPolyWrite) // Initialize the edge start (V3) to the last vert + beqz clipMaskIdx, clip_draw_tris + lbu $11, (clipCondShifts - 1)(clipMaskIdx) // Load next clip condition shift amount + li $9, 1 + sllv $9, $9, $11 // $9 is clip mask + j clip_condlooptop + addi clipMaskIdx, clipMaskIdx, -1 + +clip_draw_tris: + vclr vZero // TODO may not need this + sh $zero, activeClipPlanes + lqv $v30, (v30Value)($zero) +// Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite +// Draws verts in pattern like 0-1-4, 1-2-4, 2-3-4 +clip_draw_tris_loop: + lhu $1, (clipPoly - 6)(clipPolySelect) + lhu $2, (clipPoly - 4)(clipPolySelect) + lhu $3, (clipPoly - 2)(clipPolyWrite) + mtc2 $1, $v27[10] // Addresses go in vector regs too + mtc2 $2, $v4[12] + jal tri_noinit + mtc2 $3, $v27[14] + bne clipPolyWrite, clipPolySelect, clip_draw_tris_loop + addi clipPolySelect, clipPolySelect, 2 +clip_done: + li $11, CLIP_SCAL_NPXY | CLIP_CAMPLANE + sh $11, activeClipPlanes + lqv $v30, (v30Value)($zero) // Need this repeated here in case we exited early + lh $ra, tempTriRA + +fill_vertex_table: + // Create bytes 00-07 + li $1, 7 +@@loop1: + sb $1, (vertexTable)($1) + bgtz $1, @@loop1 + addi $1, $1, -1 + // Load to vu and multiply by 2 to get vertex indexes. It would be more cycles + // to change the loop above to count by 2s than the stalls here. 
+ li $2, vertexTable + lpv $v3[0], (0)($2) + li $3, vertexTable + ((G_MAX_VERTS + 8) * 2) // Need 0-56 inclusive, so do 0-63 + vmudh $v3, $v3, $v31[3] // 2; now 0x0000, 0x0200, ..., 0x0E00 +@@loop2: + vmudn $v29, vOne, $v30[0] // Address of vertex buffer + vmadl $v4, $v3, $v30[1] // Plus vtx indices times length + vadd $v3, $v3, $v30[2] // 0x1000; increment by 8 verts = 16 + addi $2, $2, 0x10 + bne $2, $3, @@loop2 + sqv $v4[0], (-0x10)($2) + jr $ra + nop + +ovl3_end: +.align 8 +ovl3_padded_end: + +.orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga()) +ovl234_end: + +vtx_after_dma: + andi inputVtxPos, dmemAddr, 0xFFF8 // Round down input start addr to DMA word + lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode + srl $2, cmd_w0, 11 // n << 1 + sub $2, cmd_w0, $2 // = v0 << 1 + lhu outputVtxPos, (vertexTable)($2) // Address of output start +.if COUNTER_A_UPPER_VERTEX_COUNT + sll $11, $1, 12 // Vtx count * 0x10000 + add perfCounterA, perfCounterA, $11 // Add to vertex count +.endif +vtx_setup_constants: + // Computes modified viewport scale and offset including fog info, and stores + // these to temp memory in the RDP buffer. This is only used during vertex write + // and the first half of clipping, so that memory is not used then. 
+.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE + veq $v29, $v31, $v31[3h] // VCC = 00010001 +.elseif !CFG_NO_OCCLUSION_PLANE + vge $v29, $v31, $v31[2h] // VCC = 00110011 +.endif + ldv sVPO[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7 +.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE +// sFGM is $v12 // FoG Mask + vmrg sFGM, vOne, $v31[2] // sFGM is 0,0,0,1,0,0,0,1 +.elseif !CFG_NO_OCCLUSION_PLANE + vmrg sOPMs, vOne, $v31[1] // Signs of sOPMs are --++--++ +.endif + ldv sVPO[8], (viewport + 8)($zero) + lw $10, (geometryModeLabel)($zero) + ldv sVPS[0], (viewport)($zero) // Load vscale duplicated in 0-3 and 4-7 + ldv sVPS[8], (viewport)($zero) +.if !CFG_NO_OCCLUSION_PLANE + vmudh sOPMs, sOPMs, $v31[5] // sOPMs is 0xC000, 0xC000, 0x4000, 0x4000, repeat +.endif + llv $v23[0], (fogFactor)($zero) // Load fog multiplier 0 and offset 1 + vne $v29, $v31, $v31[3h] // VCC = 11101110 + lqv $v30, (fxParams - altBase)(altBaseReg) // Parameters for vtx and lighting + vmudh $v20, sVPS, $v31[1] // -1; -vscale +.if CFG_LEGACY_VTX_PIPE + lbu $7, mITValid +.else + andi $11, $10, G_AMBOCCLUSION +.endif + vmrg sVPS, sVPS, $v23[0] // Put fog multiplier in elements 3,7 of vscale +.if !CFG_NO_OCCLUSION_PLANE && !CFG_LEGACY_VTX_PIPE + sqv sOPMs, (tempOccPlusMinus)(rdpCmdBufEndP1) // Store occlusion plane -/+4000 constants +.endif +.if CFG_LEGACY_VTX_PIPE + llv sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1 +.endif + vmrg sVPO, sVPO, $v23[1] // Put fog offset in elements 3,7 of vtrans +.if CFG_LEGACY_VTX_PIPE + llv sSTS[8], (textureSettings2)($zero) // Texture ST scale in 4, 5 +.else + vge $v29, $v31, $v31[3] // VCC = 00011111 +.endif + vmov sVPS[1], $v20[1] // Negate vscale[1] because RDP top = y=0 +.if CFG_LEGACY_VTX_PIPE + bgtz $ra, clip_after_constants // Return to clipping if from there + vmov sVPS[5], $v20[1] // Same for second half +.else + vmov sVPS[5], $v20[1] // Same for second half + bnez $11, @@skipzeroao // Continue if AO disabled + sqv 
sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Store viewport offset + vmrg $v30, $v30, $v31[2] // 0; zero AO values +@@skipzeroao: + bgtz $ra, clip_after_constants // Return to clipping if from there + sqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Store viewport scale +.endif + +vtx_after_setup_constants: + andi $8, $5, G_LIGHTING >> 8 // Temp to be reused below, is secondVtxPos + beqz $8, @@skip_lighting + li $16, vtx_loop_no_lighting // This is clipFlags, but not modified + li $16, lt_vtx_pair // during vtx_store +@@skip_lighting: +.if CFG_LEGACY_VTX_PIPE + bnez $7, skip_vtx_mvp + li $2, vpMatrix + li $3, mMatrix + j mtx_multiply + li $6, mITMatrix +vtx_after_mtx_multiply: + sqv $v5[0], (fourthQWMVP + 0)($zero) + sb $10, mITValid // $10 is nonzero from mtx_multiply, in fact 0x18 +skip_vtx_mvp: + bnez $8, ovl234_lighting_entrypoint // Lighting setup, incl. transform + sb $zero, materialCullMode // Vtx ends material +vtx_after_lt_setup: + lqv vM0I, (mITMatrix + 0x00)($zero) // Load MVP matrix + lqv vM2I, (mITMatrix + 0x10)($zero) + lqv vM0F, (mITMatrix + 0x20)($zero) + lqv vM2F, (fourthQWMVP + 0)($zero) +.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC + addi outputVtxPos, outputVtxPos, -vtxSize // Will inc by 2, but need point to 2nd +.else + addi outputVtxPos, outputVtxPos, -2*vtxSize // Going to increment this by 2 verts in loop +.endif + vcopy vM1I, vM0I + vcopy vM3I, vM2I + ldv vM1I[0], (mITMatrix + 0x08)($zero) + vcopy vM1F, vM0F + ldv vM3I[0], (mITMatrix + 0x18)($zero) + vcopy vM3F, vM2F + ldv vM1F[0], (mITMatrix + 0x28)($zero) + ldv vM3F[0], (fourthQWMVP + 8)($zero) + ldv vM0I[8], (mITMatrix + 0x00)($zero) + ldv vM2I[8], (mITMatrix + 0x10)($zero) + ldv vM0F[8], (mITMatrix + 0x20)($zero) + ldv vM2F[8], (fourthQWMVP + 0)($zero) +.else + sb $zero, materialCullMode // Vtx ends material + lqv vM0I, (mMatrix + 0x00)($zero) // Load M matrix + lqv vM2I, (mMatrix + 0x10)($zero) + lqv vM0F, (mMatrix + 0x20)($zero) + lqv vM2F, (mMatrix + 0x30)($zero) + lbu $11, mITValid 
// 0 if matrix invalid, 1 if valid + vcopy vM1I, vM0I + lbu $10, normalsMode // bit 0 clear if don't compute mIT, set if do + vcopy vM3I, vM2I + ldv vM1I[0], (mMatrix + 0x08)($zero) + vcopy vM1F, vM0F + ldv vM3I[0], (mMatrix + 0x18)($zero) + vcopy vM3F, vM2F + ldv vM1F[0], (mMatrix + 0x28)($zero) + sltiu $11, $11, 1 // 0 if matrix valid, 1 if invalid + srl $7, $5, 9 // G_LIGHTING in bit 1 + and $7, $7, $11 // If lighting enabled and need to update matrix, + and $7, $7, $10 // and computing mIT, + ldv vM3F[0], (mMatrix + 0x38)($zero) + ldv vM0I[8], (mMatrix + 0x00)($zero) + ldv vM2I[8], (mMatrix + 0x10)($zero) + ldv vM0F[8], (mMatrix + 0x20)($zero) + bnez $7, ovl234_ovl4_entrypoint // run overlay 4 to compute M inverse transpose + ldv vM2F[8], (mMatrix + 0x30)($zero) +vtx_after_calc_mit: + lqv vVP0I, (vpMatrix + 0x00)($zero) + lqv vVP2I, (vpMatrix + 0x10)($zero) + lqv vVP0F, (vpMatrix + 0x20)($zero) + lqv vVP2F, (vpMatrix + 0x30)($zero) + addi outputVtxPos, outputVtxPos, -2*vtxSize // Going to increment this by 2 verts in loop + vcopy vVP1I, vVP0I + vcopy vVP3I, vVP2I + ldv vVP1I[0], (vpMatrix + 0x08)($zero) + vcopy vVP1F, vVP0F + ldv vVP3I[0], (vpMatrix + 0x18)($zero) + vcopy vVP3F, vVP2F + ldv vVP1F[0], (vpMatrix + 0x28)($zero) + ldv vVP3F[0], (vpMatrix + 0x38)($zero) + ldv vVP0I[8], (vpMatrix + 0x00)($zero) + ldv vVP2I[8], (vpMatrix + 0x10)($zero) + ldv vVP0F[8], (vpMatrix + 0x20)($zero) + ldv vVP2F[8], (vpMatrix + 0x30)($zero) +.endif + andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC - j vtx_store_for_clip + srl $7, $7, 5 // 8 if G_FOG is set, 0 otherwise + addi $19, rdpCmdBufEndP1, vtxSize // Temp mem; fog writes up to vtxSize before + jal while_wait_dma_busy // Wait for vertex load to finish + move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos + ldv vPairPosI[0], (VTX_IN_OB + 0 * inputVtxSize)(inputVtxPos) // 1st vec pos + ldv vPairPosI[8], (VTX_IN_OB + 1 * 
inputVtxSize)(inputVtxPos) // 2nd vec pos + llv sTCL[8], (VTX_IN_CN + 0 * inputVtxSize)(inputVtxPos) // RGBA in 4:5 + llv sTCL[12], (VTX_IN_CN + 1 * inputVtxSize)(inputVtxPos) // RGBA in 6:7 + llv vPairST[0], (VTX_IN_TC + 0 * inputVtxSize)(inputVtxPos) // ST in 0:1 + j vtx_store_loop_entry + llv vPairST[8], (VTX_IN_TC + 1 * inputVtxSize)(inputVtxPos) // ST in 4:5 .else - jal vtx_store_for_clip -.endif - vmadn vPairTPosF, $v31, $v31[2] // 0; load resulting frac pos -.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC -clip_after_vtx_store: - ori $10, $10, CLIP_VTX_USED // Mark generated vtx as used - slv sSTS[0], (VTX_TC_VEC )($19) // Store not-twice-scaled ST - sh $10, (VTX_CLIP )($19) // Store generated vertex flags + jal while_wait_dma_busy // Wait for vertex load to finish + addi $19, rdpCmdBufEndP1, tempPrevVtxGarbage // Temp mem we can freely overwrite replaces outputVtxPos + j vtx_store_loop_entry + move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos .endif -clip_nextedge: - bnez clipFlags, clip_edgelooptop // Discard V2 if it was off screen (whether inserted vtx or not) - move $3, $2 // Move what was the end of the edge to be the new start of the edge - sub $11, clipPolyWrite, clipPolySelect // Make sure we are not overflowing - addi $11, $11, 6 - ((MAX_CLIP_POLY_VERTS) * 2) // Write ptr to last zero slot - bgez $11, clip_done // If so, give up - sh $3, (clipPoly)(clipPolyWrite) // Former V2 was on screen, so add it to the output polygon - j clip_edgelooptop - addi clipPolyWrite, clipPolyWrite, 2 -clip_w: - vcopy vClBaseF, $v4 // Result is just W - j clip_skipxy - vcopy vClBaseI, $v5 +.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC -clip_nextcond: - sub $11, clipPolyWrite, clipPolySelect // Are there less than 3 verts in the output polygon? 
- bltz $11, clip_done // If so, degenerate result, quit - sh $zero, (clipPoly)(clipPolyWrite) // Terminate the output polygon with a 0 - lhu $3, (clipPoly - 2)(clipPolyWrite) // Initialize the edge start (V3) to the last vert - beqz clipMaskIdx, clip_draw_tris - lbu $11, (clipCondShifts - 1)(clipMaskIdx) // Load next clip condition shift amount - li $9, 1 - sllv $9, $9, $11 // $9 is clip mask - j clip_condlooptop - addi clipMaskIdx, clipMaskIdx, -1 - -clip_draw_tris: - vclr vZero // TODO may not need this - lqv $v30, (v30Value)($zero) -// Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite -// Draws verts in pattern like 0-1-4, 1-2-4, 2-3-4 -clip_draw_tris_loop: - lhu $1, (clipPoly - 6)(clipPolySelect) - lhu $2, (clipPoly - 4)(clipPolySelect) - lhu $3, (clipPoly - 2)(clipPolyWrite) - mtc2 $1, $v6[12] // Addresses go in vector regs too - mtc2 $2, $v4[12] - lw $6, geometryModeLabel // Load full geometry mode word - sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull - li $24, 0 // Init clipping flags for tri draw--no repeat clipping - jal tri_noinit - mtc2 $3, $v8[12] - bne clipPolyWrite, clipPolySelect, clip_draw_tris_loop - addi clipPolySelect, clipPolySelect, 2 -clip_done: - lh $ra, tempTriRA - jr $ra - lqv $v30, (v30Value)($zero) // Need this repeated here in case we exited early +// $v0:$v7 = MVP, $v8:$v10 = sVPS/sVPO/sSTS, $v11 = available, $v12 = sFGM, +// $v13 = first light dir, $v14:$v16 = Y/Z/vPairNrml/temp, $v17 = vPairLt/temp, +// $v18:$v19 = available, $v20:$v21 = vPairPosI/F/temp, +// $v22 = vPairST, $v23:$v24 = vPairTPosF/I/temp, $v25:$v26 = temps, $v27 = vPairRGBA, +// $v28 = vOne, $v29 = garbage, $v30 = params, $v31 = constants +// $1: 0x10 vtx count, $2: need for clipping, $3: init lt ptr, $4: vtx1/perf, +// $5: geom mode mid, $6: need for clipping, $7: fog flag, $8: secondVtxPos, +// $9: need for clipping, $10:$11: temp, $12: perf, $13: altBaseReg, $14: inputVtxPos, +// $15: outputVtxPos, $16: lt jump 
addr, $17:$18: need for clipping, $19: shadow out vtx, +// $20: temp, $21: need for clipping, $22:$23: cmd buf, $24: temp, $25: cmd_w0 global, +// $26: taskDataPtr, $27: inputBufferPos, $28:$30: perf, $ra return addr -ovl3_end: .align 8 -ovl3_padded_end: - -.orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga()) -ovl234_end: +vtx_loop_no_lighting: + vmadh $v29, vM1I, vPairPosI[1h] + andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about + vmadn vPairTPosF, vM2F, vPairPosI[2h] + or $10, $10, $11 // Combine results for first vertex + vmadh vPairTPosI, vM2I, vPairPosI[2h] + sh $10, (VTX_CLIP )($19) // Store first vertex flags +// sKPI is $v11 // vtx_store Keep Int (keep across pipelining) +// sKPG is vBBB = $v21 // vtx_store Keep Fog + vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) + luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA +// sCLZ is $v19 + vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 + addi $1, $1, -2*inputVtxSize // Decrement vertex count by 2 +vtx_return_from_lighting: +vtx_store_for_clip: + vmudl $v29, vPairTPosF, $v30[3] // Persp norm + sub $20, secondVtxPos, $7 // Points 8 before secondVtxPos if fog, else 0 +// s1WI is $v16 // vtx_store 1/W Int + vmadm s1WI, vPairTPosI, $v30[3] // Persp norm + addi outputVtxPos, outputVtxPos, 2*vtxSize // Points to SECOND output vtx +// s1WF is $v17 // vtx_store 1/W Frac + vmadn s1WF, $v31, $v31[2] // 0 + sbv sKPG[15], (VTX_COLOR_A + 8)($20) // In VTX_SCR_Y if fog disabled... 
+// sKPF is $v18 // vtx_store Keep Frac + vmov sKPF[1], sCLZ[2] + sbv sKPG[7], (VTX_COLOR_A + 8 - vtxSize)($20) // ...which gets overwritten below +// sSCF is $v20 // vtx_store Scaled Clipping Frac + vmudn sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping + ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) +// sSCI is $v21 // vtx_store Scaled Clipping Int + vmadh sSCI, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping + slv sKPI[8], (VTX_SCR_VEC )(secondVtxPos) + vrcph $v29[0], s1WI[3] + slv sKPI[0], (VTX_SCR_VEC )($19) +// sRTF is $v25 // vtx_store Reciprocal Temp Frac + vrcpl sRTF[2], s1WF[3] + ssv sKPF[12], (VTX_SCR_Z_FRAC )(secondVtxPos) +// sRTI is $v26 // vtx_store Reciprocal Temp Int + vrcph sRTI[3], s1WI[7] + slv sKPF[2], (VTX_SCR_Z )($19) + vrcpl sRTF[6], s1WF[7] + sra $24, $1, 31 // All 1s if on last iter + vrcph sRTI[7], $v31[2] // 0 + andi $24, $24, vtxSize // vtxSize if on last iter, else normally 0 + vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high + sub secondVtxPos, outputVtxPos, $24 // First output vtx on last iter, else second + vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low + addi $19, outputVtxPos, -vtxSize // First output vtx always + vmudl $v29, s1WF, sRTF[2h] + cfc2 $10, $vcc // Screen clip results + vmadm $v29, s1WI, sRTF[2h] + sdv vPairTPosF[8], (VTX_FRAC_VEC )(secondVtxPos) + vmadn s1WF, s1WF, sRTI[3h] +// sTCL is $v19 // vtx_store Temp CoLor + ldv sTCL[0], (VTX_IN_TC + 2 * inputVtxSize)(inputVtxPos) // ST in 0:1, RGBA in 2:3 + vmadh s1WI, s1WI, sRTI[3h] + sdv vPairTPosF[0], (VTX_FRAC_VEC )($19) +// sST2 equ $v11 // vtx_store ST coordinates copy 2 + vmudm sST2, vPairST, sSTS // Scale ST + lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below + vmudh $v29, vOne, $v31[4] // 4 + sdv vPairTPosI[8], (VTX_INT_VEC )(secondVtxPos) + vmadn s1WF, s1WF, $v31[0] // -4 + lsv vPairTPosF[6], (VTX_Z_FRAC )($19) // load Z into W slot, will be for fog below + vmadh s1WI, s1WI, $v31[0] // 
-4 + sdv vPairTPosI[0], (VTX_INT_VEC )($19) + // vnop + ldv sTCL[8], (VTX_IN_TC + 3 * inputVtxSize)(inputVtxPos) // ST in 4:5, RGBA in 6:7 + vch $v29, vPairTPosI, sSCI[3h] // Clip scaled high + suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos) // Store RGBA for second vtx + vmudl $v29, s1WF, sRTF[2h] + lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below + vmadm $v29, s1WI, sRTF[2h] + suv vPairRGBA[0], (VTX_COLOR_VEC )($19) // Store RGBA for first vtx + vmadn s1WF, s1WF, sRTI[3h] + lsv vPairTPosI[6], (VTX_Z_INT )($19) // load Z into W slot, will be for fog below + vmadh s1WI, s1WI, sRTI[3h] + srl $24, $10, 4 // Shift second vertex screen clipping to first slots + vcl $v29, vPairTPosF, sSCF[3h] // Clip scaled low + andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + vcopy vPairST, sTCL + cfc2 $20, $vcc // Scaled clip results + vmudl $v29, vPairTPosF, s1WF[3h] // Pos times inv W + ssv s1WF[14], (VTX_INV_W_FRAC)(secondVtxPos) + vmadm $v29, vPairTPosI, s1WF[3h] // Pos times inv W +// vPairPosI is $v20 + ldv vPairPosI[0], (VTX_IN_OB + 2 * inputVtxSize)(inputVtxPos) // Pos of 1st vector for next iteration + vmadn vPairTPosF, vPairTPosF, s1WI[3h] + ldv vPairPosI[8], (VTX_IN_OB + 3 * inputVtxSize)(inputVtxPos) // Pos of 2nd vector on next iteration + vmadh vPairTPosI, vPairTPosI, s1WI[3h] // vPairTPosI:vPairTPosF = pos times inv W + addi inputVtxPos, inputVtxPos, (2 * inputVtxSize) // Advance two positions forward in the input vertices + vmov sTCL[4], vPairST[2] // First vtx RG to elem 4 + andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + vmov sTCL[5], vPairST[3] // First vtx BA to elem 5 + sll $11, $20, 4 // Shift first vertex scaled clipping to second slots + vmudl $v29, vPairTPosF, $v30[3] // Persp norm + ssv s1WF[6], (VTX_INV_W_FRAC)($19) + vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm + ssv s1WI[14], (VTX_INV_W_INT )(secondVtxPos) + vmadn 
vPairTPosF, $v31, $v31[2] // 0; Now vPairTPosI:vPairTPosF = projected position + ssv s1WI[6], (VTX_INV_W_INT )($19) + // vnop + slv sST2[8], (VTX_TC_VEC )(secondVtxPos) // Store scaled S, T vertex 2 + vmudh $v29, sVPO, vOne // offset * 1 + slv sST2[0], (VTX_TC_VEC )($19) // Store scaled S, T vertex 1 + vmadh $v29, sFGM, $v31[6] // + (0,0,0,1,0,0,0,1) * 0x7F00 + andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about + vmadn sKPF, vPairTPosF, sVPS // + pos frac * scale + or $24, $24, $20 // Combine results for second vertex + vmadh sKPI, vPairTPosI, sVPS // int part, sKPI:sKPF is now screen space pos + sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags +vtx_store_loop_entry: + vmudn $v29, vM3F, vOne + blez $1, vtx_epilogue + vmadh $v29, vM3I, vOne + vmadn $v29, vM0F, vPairPosI[0h] + sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order + vmadh $v29, vM0I, vPairPosI[0h] + jr $16 // lt_vtx_pair or vtx_loop_no_lighting + vmadn $v29, vM1F, vPairPosI[1h] + +vtx_epilogue: + vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) + andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about + vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 + or $10, $10, $11 // Combine results for first vertex + beqz $7, @@skip_fog + slv sKPI[8], (VTX_SCR_VEC )(secondVtxPos) + sbv sKPG[15], (VTX_COLOR_A )(secondVtxPos) + sbv sKPG[7], (VTX_COLOR_A )($19) +@@skip_fog: + vmov sKPF[1], sCLZ[2] + ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) + slv sKPI[0], (VTX_SCR_VEC )($19) + ssv sKPF[12], (VTX_SCR_Z_FRAC )(secondVtxPos) + bltz $ra, clip_after_vtx_store // $ra - from clipping or + from while_wait_dma_busy + slv sKPF[2], (VTX_SCR_Z )($19) + sh $10, (VTX_CLIP )($19) // Store first vertex flags + j vertex_end + lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store -tV1AtF equ $v5 -tV2AtF equ $v7 -tV3AtF equ $v9 -tV1AtI equ $v18 -tV2AtI equ $v19 -tV3AtI equ $v21 +.else // end of new LVP_NOC -tri_main: - vmudn $v29, vOne, 
$v30[0] // Address of vertex buffer - lw $6, geometryModeLabel // Load full geometry mode word - vmadl $v27, $v27, $v30[1] // Plus vtx indices times length - sb $zero, materialCullMode // This covers all tri cmds - vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6 - li $24, CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri draw, check clipping - vclr vZero - sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull - // vnop - sh $ra, tempTriRA // For tri cmds; where to go after clipping - mfc2 $1, $v27[10] - mfc2 $2, $v27[12] -.if !ENABLE_PROFILING - addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested - move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading -.endif - vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr - mfc2 $3, $v27[14] - vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr -tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping - llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y) - vnxor tV1AtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding - llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4 - vnxor tV2AtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding - llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8 - vnxor tV3AtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding - lhu $5, VTX_CLIP($1) - vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1 - lhu $7, VTX_CLIP($2) - vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr) - lhu $8, VTX_CLIP($3) - vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr) - andi $11, $5, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane - vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr) - and $11, $11, $7 - vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y - and $11, $11, $8 - vmrg $v14, $v6, $v4 // v14 = v1.y < v2.y ? 
v1 : v2 (lower vertex of v1, v2) - bnez $11, return_routine // Then the whole tri is offscreen, cull - // 24 cycles - vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... - vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing - or $10, $5, $7 - vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y - or $10, $10, $8 // $10 = all clip bits which are true for any verts - vmrg $v10, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2) - and $10, $10, $24 // If clipping is enabled, check clip flags - vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y - bnez $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip - // 29 cycles - mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended - vmrg $v4, $v14, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3) - and $5, $5, $7 - vmrg $v14, $v8, $v14 // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2) - and $5, $5, $8 - vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) - srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing) - vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor - sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing - vmrg $v2, $v4, $v10 // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2) - bltz $11, return_routine // Cull if bit is set (culled based on facing) - // 35 cycles - vmrg $v10, $v10, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3) - vmudn $v4, $v14, $v31[5] // 0x4000 - beqz $9, return_routine // If cross product is 0, tri is degenerate (zero area), cull. 
- // 37 cycles - mfc2 $1, $v14[12] // $v14 = lowest Y value = highest on screen (x, y, addr) - vsub $v6, $v2, $v14 - mfc2 $2, $v2[12] // $v2 = mid vertex (x, y, addr) - vsub $v8, $v10, $v14 -.if !ENABLE_PROFILING - sll $11, $6, 10 // Moves the value of G_SHADING_SMOOTH into the sign bit -.endif - vsub $v11, $v14, $v2 - andi $6, $6, (G_SHADE | G_ZBUFFER) - vsub $v12, $v14, $v10 // VH - VL (negative) - mfc2 $3, $v10[12] // $v10 = highest Y value = lowest on screen (x, y, addr) - vsub $v15, $v10, $v2 -.if !CFG_NO_OCCLUSION_PLANE - andi $5, $5, CLIP_OCCLUDED -.endif - vmudh $v29, $v6, $v8[0] -.if !CFG_NO_OCCLUSION_PLANE - bnez $5, tri_culled_by_occlusion_plane // Cull if all verts occluded -.endif - llv $v13[0], VTX_INV_W_VEC($1) - vmadh $v29, $v8, $v11[0] - lpv tV1AtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1 - vreadacc $v17, ACC_UPPER - lpv tV2AtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2 - vreadacc $v16, ACC_MIDDLE - lpv tV3AtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3 - vrcp $v20[0], $v15[1] -.if !ENABLE_PROFILING - lpv $v25[0], VTX_COLOR_VEC($4) // Load RGB from vertex 4 (flat shading vtx) -.endif - vmov $v15[2], $v6[0] - llv $v13[8], VTX_INV_W_VEC($2) - vrcph $v22[0], $v17[1] - llv $v13[12], VTX_INV_W_VEC($3) - vrcpl $v23[1], $v16[1] -.if !ENABLE_PROFILING - bltz $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set +.if CFG_LEGACY_VTX_PIPE +vtx_early_return_from_lighting: + vmrg vPairRGBA, vPairLt, vPairRGBA // RGB = light, A = vtx alpha .endif - vrcph $v24[1], $v31[2] // 0 -.if !ENABLE_PROFILING - vlt $v29, $v31, $v31[3] // Set vcc to 11100000 - vmrg tV1AtI, $v25, tV1AtI // RGB from $4, alpha from $1 - vmrg tV2AtI, $v25, tV2AtI // RGB from $4, alpha from $2 - vmrg tV3AtI, $v25, tV3AtI // RGB from $4, alpha from $3 -tri_skip_flat_shading: +vtx_loop_no_lighting: +vtx_return_from_lighting: + li $ra, vertex_end +.if CFG_LEGACY_VTX_PIPE + vmudm vPairST, vPairST, sSTS // Scale ST; must be after texgen +@@skipsecond: +.else + 
vclr sSTO + andi $11, $5, G_ATTROFFSET_ST_ENABLE >> 8 + vmudn $v29, vVP3F, vOne + beqz $11, @@skipoffset + vmadh $v29, vVP3I, vOne + llv sSTO[0], (attrOffsetST - altBase)(altBaseReg) // elems 0, 1 = S, T offset + llv sSTO[8], (attrOffsetST - altBase)(altBaseReg) // elems 4, 5 = S, T offset +@@skipoffset: + vmadl $v29, vVP0F, vPairPosF[0h] + llv sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1 + vmadm $v29, vVP0I, vPairPosF[0h] + llv sSTS[8], (textureSettings2)($zero) // Texture ST scale in 4, 5 + vmadn $v29, vVP0F, vPairPosI[0h] + vmadh $v29, vVP0I, vPairPosI[0h] + vmadl $v29, vVP1F, vPairPosF[1h] + vmadm $v29, vVP1I, vPairPosF[1h] + vmadn $v29, vVP1F, vPairPosI[1h] + vmadh $v29, vVP1I, vPairPosI[1h] + vmadl $v29, vVP2F, vPairPosF[2h] + vmadm $v29, vVP2I, vPairPosF[2h] + vmadn vPairTPosF, vVP2F, vPairPosI[2h] + vmadh vPairTPosI, vVP2I, vPairPosI[2h] + vmudm $v29, vPairST, sSTS // Scale ST; must be after texgen + vmadh vPairST, sSTO, vOne // + 1 * (ST offset or zero) .endif - // 53 cycles - vrcp $v20[2], $v6[1] - lb $20, (alphaCompareCullMode)($zero) - vrcph $v22[2], $v6[1] - lw $5, VTX_INV_W_VEC($1) - vrcp $v20[3], $v8[1] - lw $7, VTX_INV_W_VEC($2) - vrcph $v22[3], $v8[1] - lw $8, VTX_INV_W_VEC($3) - vmudl tV1AtI, tV1AtI, $v30[3] // 0x0100; vertex color 1 >>= 8 - lbu $9, textureSettings1 + 3 - vmudl tV2AtI, tV2AtI, $v30[3] // 0x0100; vertex color 2 >>= 8 - sub $11, $5, $7 - vmudl tV3AtI, tV3AtI, $v30[3] // 0x0100; vertex color 3 >>= 8 - sra $10, $11, 31 - vmov $v15[3], $v8[0] - and $11, $11, $10 - vmudl $v29, $v20, $v30[7] // 0x0020 - beqz $20, tri_skip_alpha_compare_cull - sub $5, $5, $11 - // Alpha compare culling - vge $v26, tV1AtI, tV2AtI - lbu $19, alphaCompareCullThresh - vlt $v27, tV1AtI, tV2AtI - bgtz $20, @@skip1 - vge $v26, $v26, tV3AtI // If alphaCompareCullMode > 0, $v26 = max of 3 verts - vlt $v26, $v27, tV3AtI // else if < 0, $v26 = min of 3 verts -@@skip1: // $v26 elem 3 has max or min alpha value - mfc2 $24, $v26[6] - sub $24, $24, $19 
// sign bit set if (max/min) < thresh - xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull - bltz $24, return_routine // if max < thresh or if min >= thresh. -tri_skip_alpha_compare_cull: - // 64 cycles - vmadm $v22, $v22, $v30[7] // 0x0020 - sub $11, $5, $8 - vmadn $v20, $v31, $v31[2] // 0 - sra $10, $11, 31 - vmudm $v25, $v15, $v30[2] // 0x1000 - and $11, $11, $10 - vmadn $v15, $v31, $v31[2] // 0 - sub $5, $5, $11 - vsubc $v4, vZero, $v4 - sw $5, 0x0010(rdpCmdBufPtr) - vsub $v26, vZero, vZero - llv $v27[0], 0x0010(rdpCmdBufPtr) - vmudm $v29, $v25, $v20 - mfc2 $5, $v17[1] - vmadl $v29, $v15, $v20 - lbu $7, textureSettings1 + 2 - vmadn $v20, $v15, $v22 - lsv tV2AtI[14], VTX_SCR_Z($2) - vmadh $v15, $v25, $v22 - lsv tV3AtI[14], VTX_SCR_Z($3) - vmudl $v29, $v23, $v16 - lsv tV2AtF[14], VTX_SCR_Z_FRAC($2) - vmadm $v29, $v24, $v16 - lsv tV3AtF[14], VTX_SCR_Z_FRAC($3) - vmadn $v16, $v23, $v17 - ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id - vmadh $v17, $v24, $v17 - or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id - vand $v22, $v20, $v30[5] // 0xFFF8 - // nop - vcr $v15, $v15, $v30[3] // 0x0100 - sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id - vmudh $v29, vOne, $v30[6] // 0x0010 - ssv $v10[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient - vmadn $v16, $v16, $v30[4] // -16 - ssv $v2[2], 0x0004(rdpCmdBufPtr) // Store YM edge coefficient - vmadh $v17, $v17, $v30[4] // -16 - ssv $v14[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient - vmudn $v29, $v3, $v14[0] - lw $20, otherMode1 - vmadl $v29, $v22, $v4[1] - andi $10, $5, 0x0080 // Extract the left major flag from $5 - vmadm $v29, $v15, $v4[1] - or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings - vmadn $v2, $v22, $v26[1] - sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile 
settings - vmadh $v3, $v15, $v26[1] - andi $20, ZMODE_DEC - vrcph $v29[0], $v27[0] - addi $20, $20, -ZMODE_DEC - vrcpl $v10[0], $v27[1] - beqz $9, tri_skip_tex // If textures are not enabled, skip texture coefficient calculation - vmudh $v14, vOne, $v13[1q] - // 91 cycles - vrcph $v27[0], $v31[2] // 0 - vmudh $v22, vOne, $v31[7] // 0x7FFF - vmudm $v29, $v13, $v10[0] - vmadl $v29, $v14, $v10[0] - llv $v22[0], VTX_TC_VEC($1) - vmadn $v14, $v14, $v27[0] - llv $v22[8], VTX_TC_VEC($2) - vmadh $v13, $v13, $v27[0] - vmudh $v10, vOne, $v31[7] // 0x7FFF - vge $v29, $v30, $v30[7] // Set VCC to 11110001; select RGBA___Z or ____STW_ - llv $v10[8], VTX_TC_VEC($3) - vmudm $v29, $v22, $v14[0h] - vmadh $v22, $v22, $v13[0h] - vmadn $v25, $v31, $v31[2] // 0 - vmudm $v29, $v10, $v14[6] // acc = (v10 * v14[6]); v29 = mid(clamp(acc)) - vmadh $v10, $v10, $v13[6] // acc += (v10 * v13[6]) << 16; v10 = mid(clamp(acc)) - vmadn $v13, $v31, $v31[2] // 0; v13 = lo(clamp(acc)) - sdv $v22[0], 0x0020(rdpCmdBufPtr) - vmrg tV2AtI, tV2AtI, $v22 // Merge S, T, W into elems 4-6 - sdv $v25[0], 0x0028(rdpCmdBufPtr) // 8 - vmrg tV2AtF, tV2AtF, $v25 // Merge S, T, W into elems 4-6 - ldv tV1AtI[8], 0x0020(rdpCmdBufPtr) // 8 - vmrg tV3AtI, tV3AtI, $v10 // Merge S, T, W into elems 4-6 - ldv tV1AtF[8], 0x0028(rdpCmdBufPtr) // 8 - vmrg tV3AtF, tV3AtF, $v13 // Merge S, T, W into elems 4-6 -tri_skip_tex: -.if !ENABLE_PROFILING - addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP + addi outputVtxPos, outputVtxPos, 2*vtxSize +vtx_store_for_clip: + // Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA + // Locals: $v20, $v21, $v25, $v26, $v16, $v17 ($v29 is temp). Also vPairST and + // vPairRGBA can be used as temps once stored ($v22, $v27). + // Scalar regs: secondVtxPos, outputVtxPos; set to the same thing if only write 1 vtx + // temps $10, $11, $20, $24 + vmudl $v29, vPairTPosF, $v30[3] // Persp norm + move secondVtxPos, outputVtxPos // Second and output vertices write to same mem... 
+ vmadm s1WI, vPairTPosI, $v30[3] // Persp norm + bltz $1, @@skipsecond // ...if < 0 verts remain, ... + vmadn s1WF, $v31, $v31[2] // 0 + addi secondVtxPos, outputVtxPos, vtxSize // ...otherwise, second vtx is next vtx +@@skipsecond: + vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high + suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos) + vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low + suv vPairRGBA[0], (VTX_COLOR_VEC )(outputVtxPos) + vrcph $v29[0], s1WI[3] + cfc2 $10, $vcc // Load screen clipping results + vrcpl sRTF[2], s1WF[3] + sdv vPairTPosF[8], (VTX_FRAC_VEC )(secondVtxPos) + vrcph sRTI[3], s1WI[7] + move $19, outputVtxPos // Else $19 is initialized to temp memory on first pre-loop + vrcpl sRTF[6], s1WF[7] + sdv vPairTPosF[0], (VTX_FRAC_VEC )(outputVtxPos) + vrcph sRTI[7], $v31[2] // 0 + sdv vPairTPosI[8], (VTX_INT_VEC )(secondVtxPos) + vmudn sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping + sdv vPairTPosI[0], (VTX_INT_VEC )(outputVtxPos) + vmadh sSCI, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping + slv vPairST[8], (VTX_TC_VEC )(secondVtxPos) + vmudl $v29, s1WF, sRTF[2h] + slv vPairST[0], (VTX_TC_VEC )(outputVtxPos) + vmadm $v29, s1WI, sRTF[2h] + +.if CFG_NO_OCCLUSION_PLANE + vmadn s1WF, s1WF, sRTI[3h] + addi inputVtxPos, inputVtxPos, 2*inputVtxSize + vmadh s1WI, s1WI, sRTI[3h] +vtx_store_loop_entry: +// vPairST is $v22 + ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3 + vch $v29, vPairTPosI, sSCI[3h] // Clip scaled high + ldv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5, RGBA in 6:7 + vmudh $v29, vOne, $v31[4] // 4 * 1 in elems 3, 7 + lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below + vmadn s1WF, s1WF, $v31[0] // -4 + lsv vPairTPosI[6], (VTX_Z_INT )($19) // load Z into W slot, will be for fog below + vmadh s1WI, s1WI, $v31[0] // -4 + srl $24, $10, 4 // Shift second vertex screen clipping to first slots + vcl 
$v29, vPairTPosF, sSCF[3h] // Clip scaled low + andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about +// sTCL is $v21 + vcopy sTCL, vPairST + cfc2 $20, $vcc // Load scaled clipping results + vmudl $v29, s1WF, sRTF[2h] + lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below + vmadm $v29, s1WI, sRTF[2h] + lsv vPairTPosF[6], (VTX_Z_FRAC )($19) // load Z into W slot, will be for fog below + vmadn s1WF, s1WF, sRTI[3h] +// vPairPosI is $v20 + ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) + vmadh s1WI, s1WI, sRTI[3h] // s1WI:s1WF is 1/W + ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) + vmov sTCL[4], vPairST[2] + andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + vmov sTCL[5], vPairST[3] + ori $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts + vmudl $v29, vPairTPosF, s1WF[3h] + ssv s1WF[14], (VTX_INV_W_FRAC)(secondVtxPos) + vmadm $v29, vPairTPosI, s1WF[3h] + ssv s1WF[6], (VTX_INV_W_FRAC)($19) + vmadn vPairTPosF, vPairTPosF, s1WI[3h] + ssv s1WI[14], (VTX_INV_W_INT )(secondVtxPos) + vmadh vPairTPosI, vPairTPosI, s1WI[3h] // pos * 1/W + ssv s1WI[6], (VTX_INV_W_INT )($19) + // vnop + sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA + // vnop +.if CFG_LEGACY_VTX_PIPE + lpv $v14[7], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4 +.else +// sVPO is $v17 // vtx_store ViewPort Offset + lqv sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset .endif - // 109 cycles - vmudl $v29, $v16, $v23 - lsv tV1AtF[14], VTX_SCR_Z_FRAC($1) - vmadm $v29, $v17, $v23 - lsv tV1AtI[14], VTX_SCR_Z($1) - vmadn $v23, $v16, $v24 - lh $1, VTX_SCR_VEC($2) - vmadh $v24, $v17, $v24 - addi $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients) -// tV*At* contains R, G, B, A, S, T, W, Z. 
tD31* = vtx 3 - vtx 1, tD21* = vtx 2 - vtx 1 -tD31F equ $v10 -tD31I equ $v9 -tD21F equ $v13 -tD21I equ $v7 - vsubc tD31F, tV3AtF, tV1AtF - andi $3, $6, G_SHADE - vsub tD31I, tV3AtI, tV1AtI - sll $1, $1, 14 - vsubc tD21F, tV2AtF, tV1AtF - sw $1, 0x0008(rdpCmdBufPtr) // Store XL edge coefficient - vsub tD21I, tV2AtI, tV1AtI - ssv $v3[6], 0x0010(rdpCmdBufPtr) // Store XH edge coefficient (integer part) -// DaDx = (v3 - v1) * factor + (v2 - v1) * factor -tDaDxF equ $v2 -tDaDxI equ $v3 - vmudn $v29, tD31F, $v6[1] - ssv $v2[6], 0x0012(rdpCmdBufPtr) // Store XH edge coefficient (fractional part) - vmadh $v29, tD31I, $v6[1] - ssv $v3[4], 0x0018(rdpCmdBufPtr) // Store XM edge coefficient (integer part) - vmadn $v29, tD21F, $v12[1] - ssv $v2[4], 0x001A(rdpCmdBufPtr) // Store XM edge coefficient (fractional part) - vmadh $v29, tD21I, $v12[1] - ssv $v15[0], 0x000C(rdpCmdBufPtr) // Store DxLDy edge coefficient (integer part) - vreadacc tDaDxF, ACC_MIDDLE - ssv $v20[0], 0x000E(rdpCmdBufPtr) // Store DxLDy edge coefficient (fractional part) - vreadacc tDaDxI, ACC_UPPER - ssv $v15[6], 0x0014(rdpCmdBufPtr) // Store DxHDy edge coefficient (integer part) -// DaDy = (v2 - v1) * factor + (v3 - v1) * factor -tDaDyF equ $v6 -tDaDyI equ $v7 - vmudn $v29, tD21F, $v8[0] - ssv $v20[6], 0x0016(rdpCmdBufPtr) // Store DxHDy edge coefficient (fractional part) - vmadh $v29, tD21I, $v8[0] - ssv $v15[4], 0x001C(rdpCmdBufPtr) // Store DxMDy edge coefficient (integer part) - vmadn $v29, tD31F, $v11[0] - ssv $v20[4], 0x001E(rdpCmdBufPtr) // Store DxMDy edge coefficient (fractional part) - vmadh $v29, tD31I, $v11[0] - sll $11, $3, 4 // Shift (geometry mode & G_SHADE) by 4 to get 0x40 if G_SHADE is set - vreadacc tDaDyF, ACC_MIDDLE - add $1, $2, $11 // Increment the triangle pointer by 0x40 bytes (shade coefficients) if G_SHADE is set - vreadacc tDaDyI, ACC_UPPER - sll $11, $9, 5 // Shift texture enabled (which is 2 when on) by 5 to get 0x40 if textures are on -// DaDx, DaDy *= more factors - vmudl 
$v29, tDaDxF, $v23[1] - add rdpCmdBufPtr, $1, $11 // Increment the triangle pointer by 0x40 bytes (texture coefficients) if textures are on - vmadm $v29, tDaDxI, $v23[1] - andi $6, $6, G_ZBUFFER // Get the value of G_ZBUFFER from the current geometry mode - vmadn tDaDxF, tDaDxF, $v24[1] - sll $11, $6, 4 // Shift (geometry mode & G_ZBUFFER) by 4 to get 0x10 if G_ZBUFFER is set - vmadh tDaDxI, tDaDxI, $v24[1] - move $10, rdpCmdBufPtr // Write Z here - vmudl $v29, tDaDyF, $v23[1] - add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set - vmadm $v29, tDaDyI, $v23[1] - sub $8, rdpCmdBufPtr, rdpCmdBufEndP1 // Check if we need to write out to RDP - vmadn tDaDyF, tDaDyF, $v24[1] - sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional) - vmadh tDaDyI, tDaDyI, $v24[1] - sdv tDaDxI[0], 0x0008($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (integer) -// DaDe = DaDx * factor -tDaDeF equ $v8 -tDaDeI equ $v9 - // 137 cycles - vmadl $v29, tDaDxF, $v20[3] - sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional) - vmadm $v29, tDaDxI, $v20[3] - sdv tDaDxI[8], 0x0008($1) // Store DsDx, DtDx, DwDx texture coefficients (integer) - vmadn tDaDeF, tDaDxF, $v15[3] - sdv tDaDyF[0], 0x0038($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (fractional) - vmadh tDaDeI, tDaDxI, $v15[3] - sdv tDaDyI[0], 0x0028($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (integer) -// Base value += DaDe * factor - vmudn $v29, tV1AtF, vOne[0] - sdv tDaDyF[8], 0x0038($1) // Store DsDy, DtDy, DwDy texture coefficients (fractional) - vmadh $v29, tV1AtI, vOne[0] - sdv tDaDyI[8], 0x0028($1) // Store DsDy, DtDy, DwDy texture coefficients (integer) - vmadl $v29, tDaDeF, $v4[1] - sdv tDaDeF[0], 0x0030($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (fractional) - vmadm $v29, tDaDeI, $v4[1] - sdv tDaDeI[0], 0x0020($2) // Store DrDe, DgDe, DbDe, DaDe shade 
coefficients (integer) - vmadn tV1AtF, tDaDeF, $v26[1] - sdv tDaDeF[8], 0x0030($1) // Store DsDe, DtDe, DwDe texture coefficients (fractional) - vmadh tV1AtI, tDaDeI, $v26[1] - sdv tDaDeI[8], 0x0020($1) // Store DsDe, DtDe, DwDe texture coefficients (integer) - // All values start in element 7. "a", attribute, is Z. Need - // tV1AtI, tV1AtF, tDaDxI, tDaDxF, tDaDeI, tDaDeF, tDaDyI, tDaDyF - vmudn tDaDyF, tDaDyF, $v30[7] // 0x0020 - beqz $20, tri_decal_fix_z - vmadh tDaDyI, tDaDyI, $v30[7] // 0x0020 -tri_return_from_decal_fix_z: -tV1AtFF equ $v10 - vmudn tV1AtFF, tDaDeF, $v4[1] // Super-frac (frac * frac) part; assumes v4 factor >= 0 - sdv tV1AtF[0], 0x0010($2) // Store RGBA shade color (fractional) - vmudn tDaDeF, tDaDeF, $v30[7] // 0x0020 - sdv tV1AtI[0], 0x0000($2) // Store RGBA shade color (integer) - vmadh tDaDeI, tDaDeI, $v30[7] // 0x0020 - sdv tV1AtF[8], 0x0010($1) // Store S, T, W texture coefficients (fractional) - vmudn tDaDxF, tDaDxF, $v30[7] // 0x0020 - sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer) - vmadh tDaDxI, tDaDxI, $v30[7] // 0x0020 - ssv tDaDyF[14], 0x0E($10) - vmudl $v29, tV1AtFF, $v30[7] // 0x0020 - ssv tDaDyI[14], 0x0C($10) - vmadn tV1AtF, tV1AtF, $v30[7] // 0x0020 - ssv tDaDeF[14], 0x0A($10) - vmadh tV1AtI, tV1AtI, $v30[7] // 0x0020 - ssv tDaDeI[14], 0x08($10) - ssv tDaDxF[14], 0x06($10) - ssv tDaDxI[14], 0x04($10) - ssv tV1AtF[14], 0x02($10) -tri_end_check_rdp_buffer_full: - bltz $8, return_routine // Return if rdpCmdBufPtr < end+1 i.e. 
ptr <= end - ssv tV1AtI[14], 0x00($10) // If returning from no-Z, this is okay b/c $10 is at end - // 162 cycles -flush_rdp_buffer: // $8 = rdpCmdBufPtr - rdpCmdBufEndP1 - mfc0 $10, SP_DMA_BUSY // Check if any DMA is in flight - lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write - addi dmaLen, $8, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy -.if CFG_PROFILING_C - // This is a wait for DMA busy loop, but written inline to avoid overwriting ra. - addi perfCounterD, perfCounterD, 10 // 6 instr + 2 between end load and mfc + 0 taken branch overlaps with last + 2 between mfc and load + vmudl $v29, vPairTPosF, $v30[3] // Persp norm +.if CFG_LEGACY_VTX_PIPE + lpv $v15[6], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4 +.else +// sVPS is $v26 // vtx_store ViewPort Scale + lqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale +.endif + vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm +// vPairRGBA is $v27 + luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA + vmadn vPairTPosF, $v31, $v31[2] // 0 + sll $11, $20, 4 // Shift first vertex scaled clipping to second slots +.if !CFG_LEGACY_VTX_PIPE +// sTPN is $v16 + vmov sTPN[2], vPairPosI[7] // Move vtx 1 packed normals to elem 2 +.endif + andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about +.if !CFG_LEGACY_VTX_PIPE + vmov sTPN[0], vPairPosI[3] // Move vtx 0 packed normals to elem 0 +.endif + andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about + vmudh $v29, sVPO, vOne // offset * 1 + or $24, $24, $20 // Combine results for second vertex + vmadn vPairTPosF, vPairTPosF, sVPS // + XYZ * scale + or $10, $10, $11 // Combine results for first vertex + vmadh vPairTPosI, vPairTPosI, sVPS + sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags +// sFOG is $v25 + vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog +.if !CFG_LEGACY_VTX_PIPE + sdv sTPN[0], (tempVpPkNorm)(rdpCmdBufEndP1) // Vtx 0 
and 1 packed normals +.endif + // vnop + sh $10, (VTX_CLIP )($19) // Store first vertex results +// vPairNrml is $v16 + vmudn vPairNrml, vPairRGBA, $v31[3] // 2; left shift RGBA without clamp; vtx pair normals + ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos) +// sCLZ is $v21 // vtx_store CLamped Z + vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0 + ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19) + vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only) + slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos) + vmudn $v29, vM3F, vOne + slv vPairTPosI[0], (VTX_SCR_VEC )($19) + vmadh $v29, vM3I, vOne + blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping + vmadn $v29, vM0F, vPairPosI[0h] + move $ra, $16 // Normally $ra = loop or lighting +skip_return_to_lt_or_loop: + vmadh $v29, vM0I, vPairPosI[0h] + addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize + vmadn $v29, vM1F, vPairPosI[1h] + ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) + vmadh $v29, vM1I, vPairPosI[1h] + ssv sCLZ[4], (VTX_SCR_Z )($19) +// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23 + vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords + beqz $7, return_and_end_mat // fog disabled +// sOUTI = vPairPosI is $v20, or vPairTPosI is $v24 + vmadh sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords + sbv sFOG[15], (VTX_COLOR_A )(secondVtxPos) + jr $ra + sbv sFOG[7], (VTX_COLOR_A )($19) + +.else // CFG_NO_OCCLUSION_PLANE + +// sOCM is $v22 // vtx_store OCclusion Mid, $v22 = vPairST + ldv sOCM[0], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) + vmadn s1WF, s1WF, sRTI[3h] + ldv sOCM[8], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) + vmadh s1WI, s1WI, sRTI[3h] + srl $24, $10, 4 // Shift second vertex screen clipping to first slots + vch $v29, vPairTPosI, sSCI[3h] // Clip scaled high + andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + vcl $v29, vPairTPosF, sSCF[3h] // Clip scaled low 
+ andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + vmudh $v29, vOne, $v31[4] // 4 * 1 in elems 3, 7 + cfc2 $20, $vcc // Load scaled clipping results + vmadn s1WF, s1WF, $v31[0] // -4 + ori $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts + vmadh s1WI, s1WI, $v31[0] // -4 + addi inputVtxPos, inputVtxPos, 2*inputVtxSize + vmudn $v29, vPairTPosF, sOCM // X * kx, Y * ky, Z * kz + vmadh $v29, vPairTPosI, sOCM // Int * int + lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below +// sOC1 is $v21 // vtx_store OCclusion temp 1 + vreadacc sOC1, ACC_UPPER // Load int * int portion + lsv vPairTPosF[6], (VTX_Z_FRAC )(outputVtxPos) // load Z into W slot, will be for fog below + vmudl $v29, s1WF, sRTF[2h] + lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below + vmadm $v29, s1WI, sRTF[2h] + lsv vPairTPosI[6], (VTX_Z_INT )(outputVtxPos) // load Z into W slot, will be for fog below + vmadn s1WF, s1WF, sRTI[3h] + sll $11, $20, 4 // Shift first vertex scaled clipping to second slots + vmadh s1WI, s1WI, sRTI[3h] // s1WI:s1WF is 1/W + andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about + veq $v29, $v31, $v31[3h] // Set VCC to 00010001 + blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping + vmrg sOC1, sOCM, sOC1 // Put constant factor in elems 3, 7 +vtx_store_loop_entry: + move $ra, $16 // Normally $ra = loop or lighting +skip_return_to_lt_or_loop: + vmudl $v29, vPairTPosF, s1WF[3h] // W must be overwritten with Z before here + ssv s1WF[14], (VTX_INV_W_FRAC)(secondVtxPos) + vmadm $v29, vPairTPosI, s1WF[3h] + ssv s1WF[6], (VTX_INV_W_FRAC)($19) + vmadn vPairTPosF, vPairTPosF, s1WI[3h] + ssv s1WI[14], (VTX_INV_W_INT )(secondVtxPos) + vmadh vPairTPosI, vPairTPosI, s1WI[3h] // pos * 1/W + ssv s1WI[6], (VTX_INV_W_INT )($19) + vadd sOC1, sOC1, sOC1[0q] // Add pairs upwards +.if !CFG_LEGACY_VTX_PIPE +// sVPO 
is $v17 // vtx_store ViewPort Offset + lqv sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset .endif - bnez $10, flush_rdp_buffer // Wait until no DMAs are active - lw $10, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr) - mtc0 cmd_w1_dram, DPC_END // Set RDP to execute until FIFO end (buf pushed last time) - add $11, cmd_w1_dram, dmaLen // $11 = future FIFO pointer if we append this new buffer - sub $10, $10, $11 // $10 = FIFO end addr - future pointer - bgez $10, @@has_room // Branch if we can fit this -@@await_rdp_dblbuf_avail: - mfc0 $11, DPC_STATUS // Read RDP status - andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf - bnez $11, @@await_rdp_dblbuf_avail // Wait until double buffered start/end available -.if COUNTER_C_FIFO_FULL - addi perfCounterC, perfCounterC, 7 // 4 instr + 2 after mfc + 1 taken branch + // vnop +.if CFG_LEGACY_VTX_PIPE + addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize +.else +// sVPS is $v16 // vtx_store ViewPort Scale + lqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale .endif - lw cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO -@@await_past_first_instr: - mfc0 $11, DPC_CURRENT // Load RDP current pointer - beq $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start -.if COUNTER_C_FIFO_FULL - addi perfCounterC, perfCounterC, 6 // 3 instr + 2 after mfc + 1 taken branch + vmudl $v29, vPairTPosF, $v30[3] // Persp norm +// vPairST is $v22 + ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3 + vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm + ldv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5, RGBA in 6:7 + vmadn vPairTPosF, $v31, $v31[2] // 0 +// vPairPosI is $v20 + ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) + vadd sOC1, sOC1, sOC1[1h] // Add elems 1, 5 to 3, 7 + ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 
1)(inputVtxPos) + // vnop +// sO03 is $v26 // vtx_store Occlusion coeffs 0-3 + ldv sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3 + vmudh $v29, sVPO, vOne // offset * 1 + ldv sO03[8], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // and for vtx 2 + vmadn vPairTPosF, vPairTPosF, sVPS // + XYZ * scale +.if !CFG_LEGACY_VTX_PIPE +// sOPM is $v17 // vtx_store Occlusion Plus Minus constants + lqv sOPM, (tempOccPlusMinus)(rdpCmdBufEndP1) // Load occlusion plane -/+4000 constants +.endif + vmadh vPairTPosI, vPairTPosI, sVPS + andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about +// sFOG is $v16 + vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog + or $10, $10, $11 // Combine results for first vertex + vlt $v29, sOC1, $v31[2] // Occlusion plane equation < 0 in elems 3, 7 + slv vPairST[4], (tempVpRGBA + 0)(rdpCmdBufEndP1) // Store vtx 0 RGBA to temp mem +.if !CFG_LEGACY_VTX_PIPE +// sTPN is $v18 + vmov sTPN[2], vPairPosI[7] // Move vtx 1 packed normals to elem 2 +.endif + slv vPairST[12], (tempVpRGBA + 4)(rdpCmdBufEndP1) // Store vtx 1 RGBA to temp mem +.if !CFG_LEGACY_VTX_PIPE + vmov sTPN[0], vPairPosI[3] // Move vtx 0 packed normals to elem 0 +.endif + cfc2 $11, $vcc // Load occlusion plane mid results to bits 3 and 7 +// sOSC is $v21 // vtx_store Occlusion SCaled up + vmudh sOSC, vPairTPosI, $v31[4] // 4; scale up x and y + ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos) + vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only) + or $24, $24, $20 // Combine results for second vertex +// sCLZ is $v25 // vtx_store CLamped Z + vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0 + ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19) + vmulf $v29, sOPM, vPairTPosI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2 +// sO47 is $v23 // vtx_store Occlusion coeffs 0-3; $v23 = vPairTPosF + ldv sO47[0], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // Load coeffs 4-7 +// sOC2 is $v27 // 
vtx_store OCclusion temp 2; $v27 = vPairRGBA + vmacf sOC2, sO03, sOSC[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2 + ldv sO47[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2 + vmulf $v29, sOPM, vPairTPosI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2 + beqz $7, @@skipfog // fog disabled +// sOC3 is $v21 // vtx_store OCclusion temp 3 + vmacf sOC3, sO03, sOSC[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2 + sbv sFOG[15], (VTX_COLOR_A )(secondVtxPos) + sbv sFOG[7], (VTX_COLOR_A )($19) +@@skipfog: + slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos) + veq $v29, $v31, $v31[0q] // Set VCC to 10101010 + slv vPairTPosI[0], (VTX_SCR_VEC )($19) + vmrg sOC2, sOC2, sOC3 // Elems 0-3 are results for vtx 0, 4-7 for vtx 1 +.if CFG_LEGACY_VTX_PIPE + lpv $v14[7], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4 .else - nop + sdv sTPN[0], (tempVpPkNorm)(rdpCmdBufEndP1) // Vtx 0 and 1 packed normals .endif - // Start was previously the start of the FIFO, unless this is the first buffer, - // in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we - // have a new end value waiting (END_VALID), it'll load end but leave current. By - // setting start here, it will also load current with start. 
- mtc0 cmd_w1_dram, DPC_START // Set RDP start to start of FIFO -@@keep_waiting: -.if COUNTER_C_FIFO_FULL - // This is here so we only count it when stalling below or on FIFO end codepath - addi perfCounterC, perfCounterC, 10 // 7 instr + 2 after mfc + 1 taken branch + // vnop + ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) + // vnop +.if CFG_LEGACY_VTX_PIPE + lpv $v15[6], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4 +.else + addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize .endif -@@has_room: - mfc0 $11, DPC_CURRENT // Load RDP current pointer - sub $11, $11, cmd_w1_dram // Current - current end (rdpFifoPos or start) - blez $11, @@copy_buffer // Current is behind or at current end, can do copy - sub $11, $11, dmaLen // If amount current is ahead of current end - blez $11, @@keep_waiting // is <= size of buffer to copy, keep waiting -@@copy_buffer: - add $11, cmd_w1_dram, dmaLen // New end is current end + buffer size - sw $11, rdpFifoPos - // Set up the DMA from DMEM to the RDP fifo in RDRAM - addi dmaLen, dmaLen, -1 // subtract 1 from the length - addi dmemAddr, rdpCmdBufEndP1, -(0x2000 | (RDP_CMD_BUFSIZE + 8)) // The 0x2000 is meaningless, negative means write - xori rdpCmdBufEndP1, rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word ^ rdpCmdBuffer2EndPlus1Word // Swap between the two RDP command buffers - j dma_read_write - addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8) + // vnop + ssv sCLZ[4], (VTX_SCR_Z )($19) + vge $v29, sOC2, sO47 // Each compare to coeffs 4-7 +// vPairNrml is $v16 + lpv vPairNrml[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair normals + vmudn $v29, vM3F, vOne + cfc2 $20, $vcc + vmadh $v29, vM3I, vOne +// vPairRGBA is $v27 + luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair colors + vmadn $v29, vM0F, vPairPosI[0h] + andi $11, $11, CLIP_OCCLUDED | (CLIP_OCCLUDED >> 4) // Only bits 3, 7 from occlusion + vmadh $v29, vM0I, vPairPosI[0h] + or $20, $20, $11 // Combine occlusion results. 
Any set in 0-3, 4-7 = not occluded + vmadn $v29, vM1F, vPairPosI[1h] + andi $11, $20, 0x00F0 // Bits 4-7 for vtx 2 + vmadh $v29, vM1I, vPairPosI[1h] + bnez $11, @@skipv2 // If nonzero, at least one equation false, don't set occluded flag + andi $20, $20, 0x000F // Bits 0-3 for vtx 1 + ori $24, $24, CLIP_OCCLUDED // All equations true, set vtx 2 occluded flag +@@skipv2: +// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23 + vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords + bnez $20, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag + sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags + ori $10, $10, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag +@@skipv1: +// sOUTI = vPairPosI is $v20, or vPairTPosI is $v24 + vmadh sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords + jr $ra + sh $10, (VTX_CLIP )($19) // Store first vertex results -tri_decal_fix_z: - /* - vrsqh $v29[0], tV1AtI[7] - vrsql $v26[0], tV1AtF[7] - vrsqh $v25[0], $v31[2] // 0 - vmudn $v29, $v26, $v31[0] // -4 - vmadh $v25, $v25, $v31[0] // -4 - */ - /* - vrcph $v29[0], tV1AtI[7] - vrcpl $v25[0], tV1AtF[7] - vmudh $v25, $v25, $v31[1] // -1 - */ - mfc2 $20, tV1AtI[7] // Z int part; maybe 0000 to 03FF - li $11, 0xFE00 - srl $20, $20, 7 // Now 00 to 07 - srav $11, $11, $20 // 00 -> FF00 = -512; 07 -> FFFE = -4 - mtc2 $11, $v25[0] - j tri_return_from_decal_fix_z - vcr tDaDyI, tDaDyI, $v25[0] +.endif // CFG_NO_OCCLUSION_PLANE -.if CFG_PROFILING_B -tri_culled_by_occlusion_plane: - jr $ra - addi perfCounterB, perfCounterB, 0x4000 +.endif // New LVP_NOC + +.if !CFG_PROFILING_A && (!CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE) +vertex_end: + j run_next_DL_command + lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store .endif -// This routine is used to return via conditional branch -.if !CFG_PROFILING_B -tri_culled_by_occlusion_plane: +.if CFG_PROFILING_A +vertex_end: + li $ra, 0 // Flag for coming 
from vtx +.if !CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE + lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store .endif -return_routine: - jr $ra - nop +tris_end: + mfc0 $11, DPC_CLOCK + lw $10, startCounterTime + sub $11, $11, $10 + beqz $ra, run_next_DL_command // $ra != 0 if from tri cmds + add perfCounterA, perfCounterA, $11 // Add to vert cycles perf counter + sub perfCounterA, perfCounterA, $11 // From tris, undo add to vert perf counter + sub $10, perfCounterC, $4 // How long we stalled for RDP FIFO during this cmd + sub $11, $11, $10 // Subtract that from the tri cycles + j run_next_DL_command + add perfCounterD, perfCounterD, $11 // Add to tri cycles perf counter +.endif + +.if CFG_LEGACY_VTX_PIPE || CFG_NO_OCCLUSION_PLANE +G_MTX_end: + instantiate_mtx_end_begin +mtx_multiply: + instantiate_mtx_multiply +.endif + .if CFG_PROFILING_B loadOverlayInstrs equ 13 @@ -3110,7 +3111,8 @@ segmented_to_physical: add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address G_CULLDL_handler: - jal vtx_addrs_from_cmd // Load start vtx addr in $10 + lhu $10, (vertexTable)(cmd_w0) // Start vtx addr + lhu $3, (vertexTable)(cmd_w1_dram) // End vertex /* CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1 verts which are behind the occlusion plane, and 1 vert which is behind the camera @@ -3120,8 +3122,7 @@ G_CULLDL_handler: the occlusion plane if the vert is behind the camera, because this only matters for G_CULLDL and not for tris. */ - li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) - mfc2 $3, $v27[14] // End vertex + li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) lhu $11, VTX_CLIP($10) culldl_loop: and $1, $1, $11 @@ -3185,6 +3186,7 @@ ovl234_ovl4_entrypoint_ovl2ver: // same IMEM address as ovl234_ovl4_e // Jump here to do clipping. If overlay 2 is loaded (this code), loads overlay 3 // and jumps to right here, which is now in the new code. 
ovl234_clipping_entrypoint_ovl2ver: // same IMEM address as ovl234_clipping_entrypoint + sh $ra, tempTriRA // Tri return after clipping .if CFG_PROFILING_B addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load .endif @@ -3201,96 +3203,117 @@ lt_continue_setup: addi $3, $3, altBase // Point to ambient light; stored through vtx proc andi $17, $5, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store and $11, $11, $7 // Zero if either matrix or lights invalid - bnez $11, lt_setup_skip_xfrm + bnez $11, lt_setup_after_xfrm sb $10, dirLightsXfrmValid xfrm_dir_lights: // Transform directional lights' direction by M transpose. // First, load M transpose. Can use any regs except $v8-$v12, $v28-$v31. - // This algorithm clobbers all of $v0-$v7 and $v16-$v23 with the transposes. - // The F3DEX2 implementation takes 18 instructions and about 11 cycles. - // This implementation is 16 instructions and about 10 cycles. However, since - // this code is in an overlay and is not run per vertex, that doesn't really - // matter and it's really just an excuse to use the rare ltv instructions. + // This algorithm clobbers all of $v0-$v7 and $v16-$v23 with the transposes; + // it's mainly just an excuse to use the rare ltv and swv instructions. + // The F3DEX2 implementation takes 18 instructions and 11 cycles. + // This implementation is 23 instructions and 17 cycles, but this version + // loads M transpose to both halves of each vector so we can process two + // lights at a time, which matters because there's always at least 3 lights + // (technically 2 for EX3)--the lookat directions. Plus, those 17 cycles + // also include a few instructions starting the loop. 
// Memory at mMatrix contains, in shorts within qwords, for the elements we care about: // A B C - D E F - (X int, Y int) // G H I - - - - - (Z int, W int) // M N O - P Q R - (X frac, Y frac) // S T U - - - - - (Z frac, W frac) // First, make $v0-$v7 contain this, and same for $v16-$v23 frac parts. - // $v0 A - G - - - - - $v16 M - S - - - - - - // $v1 - B - H - - - - $v17 - N - T - - - - - // $v2 - - C - I - - - $v18 - - O - U - - - + // $v0 A - G - A - G - $v16 M - S - M - S - + // $v1 - B - H - B - H $v17 - N - T - N - T + // $v2 I - C - I - C - $v18 U - O - U - O - // $v3 - - - - - - - - $v19 - - - - - - - - - // $v4 - - - - D - - - $v20 - - - - P - - - - // $v5 - - - - - E - - $v21 - - - - - Q - - - // $v6 - - - - - - F - $v22 - - - - - - R - + // $v4 D - - - D - - - $v20 P - - - P - - - + // $v5 - E - - - E - - $v21 - Q - - - Q - - + // $v6 - - F - - - F - $v22 - - R - - - R - // $v7 - - - - - - - - $v23 - - - - - - - - - ltv $v0[0], (mMatrix + 0x00)($zero) - ltv $v0[12], (mMatrix + 0x10)($zero) + ltv $v0[0], (mMatrix + 0x00)($zero) // A to $v0[0] etc. + ltv $v0[12], (mMatrix + 0x10)($zero) // G to $v0[2] etc. + ltv $v0[8], (mMatrix + 0x00)($zero) // A to $v0[4] etc. + ltv $v0[4], (mMatrix + 0x10)($zero) // G to $v0[6] etc. 
ltv $v16[0], (mMatrix + 0x20)($zero) ltv $v16[12], (mMatrix + 0x30)($zero) - move curLight, $3 - lsv $v0[2], (mMatrix + 0x08)($zero) // Place D into $v0 element 1 - vmudh $v1, vOne, $v1[1q] // Shift $v1 left one element (B, H) - lsv $v2[0], (mMatrix + 0x04)($zero) // Place C into $v2 element 0 - vmov $v1[1], $v5[5] // Move E into $v1 element 1 - lsv $v2[4], (mMatrix + 0x14)($zero) // Place I into $v2 element 2 - vmov $v2[1], $v6[6] // Move F into $v2 element 2 - lsv $v16[2], (mMatrix + 0x28)($zero) // Place P into $v16 element 1 - vmudh $v17, vOne, $v17[1q] // Shift $v17 left one element (N, T) - lsv $v18[0], (mMatrix + 0x24)($zero) // Place O into $v18 element 0 - vmov $v17[1], $v21[5] // Move Q into $v17 element 1 - lsv $v18[4], (mMatrix + 0x34)($zero) // Place U into $v18 element 2 - vmov $v18[1], $v22[6] // Move R into $v18 element 1 - // Resulting matrix (M transpose) in $v0:$v2 int, $v16:$v18 frac. -xfrm_light_loop: - beq curLight, altBaseReg, xfrm_light_post - lpv $v3, (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 0-2 - addi $20, curLight, (ltBufOfs + 12 - lightSize) // Target = last word of light - addi curLight, curLight, -lightSize - j xfrm_single_dir - li $ra, xfrm_light_loop - -xfrm_light_post: - // Lookat 0: input already in $v3, target is xfrmLookatDirs. - jal xfrm_single_dir - li $20, OSTask + OSTask_ucode_data //xfrmLookatDirs - // Lookat 1: curLight still pointing to light 0, target is 4 bytes later. - lpv $v3[4], (ltBufOfs + 0 - lightSize)(curLight) // Lookat 1 dir in elems 0-2 - jal xfrm_single_dir - li $20, OSTask + OSTask_ucode_data_size -lt_setup_skip_xfrm: - // Load first light direction to $v13, which is not used throughout vtx processing. 
- j vtx_after_lt_setup - lpv $v13[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6 - -xfrm_single_dir: - vmudn $v29, $v16, $v3[0] - vmadh $v29, $v0, $v3[0] - vmadn $v29, $v17, $v3[1] - vmadh $v29, $v1, $v3[1] - vmadn $v29, $v18, $v3[2] - vmadh $v4, $v2, $v3[2] // $v4[0:2] = light dir in model space + ltv $v16[8], (mMatrix + 0x20)($zero) + ltv $v16[4], (mMatrix + 0x30)($zero) + veq $v29, $v31, $v31[0q] // Set VCC to 10101010 + vmudh $v1, vOne, $v1[1q] // B - H - B - H - + lsv $v18[6], (mMatrix + 0x2C)($zero) // U - O(R)U - O - + vmrg $v0, $v0, $v4[0q] // A D G - A D G - + lsv $v18[14], (mMatrix + 0x2C)($zero) // U - O R U - O(R) + vmrg $v2, $v2, $v6[0q] // I - C F I - C F + lpv $v3[0], (lightBufferLookat - altBase)(altBaseReg) // Lookat 0 and 1 + vmudh $v17, vOne, $v17[1q] // N - T - N - T - + li curLight, altBase - 4 * lightSize // + ltBufOfs = light -4; write pointer + vmrg $v1, $v1, $v5 // B E H - B E H - + // nop + // Interleave the start of transforming pairs of dir lights, including lookat. + vmrg $v16, $v16, $v20[0q] // M P S - M P S - + swv $v18[4], (tempXfrmSingle)(rdpCmdBufEndP1) // Stores O R U - O R U - + vmudh $v29, $v0, $v3[0h] + lqv $v18, (tempXfrmSingle)(rdpCmdBufEndP1) + vmrg $v17, $v17, $v21 // N Q T - N Q T - + swv $v2[4], (tempXfrmSingle)(rdpCmdBufEndP1) // Stores C F I - C F I - + vmadh $v29, $v1, $v3[1h] + lqv $v2, (tempXfrmSingle)(rdpCmdBufEndP1) + vmadn $v29, $v16, $v3[0h] + // 18 cycles +xfrm_light_loop_1: + vmadn $v29, $v18, $v3[2h] +xfrm_light_loop_2: + vmadn $v29, $v17, $v3[1h] + vmadh $v4, $v2, $v3[2h] // $v4[0:2] and [4:6] = two lights dir in model space + vrsqh $v29[0], $v20[0] + vrsql $v23[0], $v21[0] + vrsqh $v22[0], $v20[4] + addi curLight, curLight, 2 * lightSize // Iters: -2, 0, 2, ... 
+ vrsql $v23[4], $v21[4] + lw $20, (ltBufOfs + 8 + 2 * lightSize)(curLight) // First iter = light 0 + vrsqh $v22[4], $v31[2] // 0 + lw $24, (ltBufOfs + 8 + 3 * lightSize)(curLight) // First iter = light 1 vmudh $v29, $v4, $v4 // Squared + sub $10, curLight, altBaseReg // Is curLight (write ptr) <= 0? vreadacc $v7, ACC_MIDDLE // Read not-clamped value + sub $11, curLight, $3 // Is curLight (write ptr) <, =, or > ambient light? vreadacc $v6, ACC_UPPER - vmudm $v29, vOne, $v7[2] // Sum of squared components - vmadh $v29, vOne, $v6[2] - vmadm $v29, vOne, $v7[1] - vmadh $v29, vOne, $v6[1] - vmadn $v7, $v7, vOne // elem 0; swapped so we can do vmadn and get result - vmadh $v6, $v6, vOne - vrsqh $v29[0], $v6[0] - vrsql $v7[0], $v7[0] - vrsqh $v6[0], $v31[2] // 0 - vmudm $v29, $v4, $v7[0] // Vec int * frac scaling - vmadh $v4, $v4, $v6[0] // Vec int * int scaling - spv $v4[0], (tempXfrmSingle)(rdpCmdBufEndP1) // Store elem 0-2 as bytes to temp memory - lw $11, (tempXfrmSingle)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit - jr $ra - sw $11, (0)($20) // Store 3 (4) bytes to target address - // This clobbers the specular size + sw $20, (tempXfrmSingle)(rdpCmdBufEndP1) // Store light 0 + vmudm $v29, $v19, $v23[0h] // Vec int * frac scaling + sw $24, (tempXfrmSingle + 4)(rdpCmdBufEndP1) // Store light 1 + vmadh $v5, $v19, $v22[0h] // Vec int * int scaling + lpv $v3[0], (tempXfrmSingle)(rdpCmdBufEndP1) // Load dirs 0-2, 4-6 + vmudm $v29, vOne, $v7[2h] // Sum of squared components + vmadh $v29, vOne, $v6[2h] + vmadm $v29, vOne, $v7[1h] + vmadh $v29, vOne, $v6[1h] + spv $v5[0], (tempXfrmSingle)(rdpCmdBufEndP1) // Store elem 0-2, 4-6 as bytes to temp memory + vmadn $v21, $v7, vOne // elem 0, 4; swapped so we can do vmadn and get result + lw $20, (tempXfrmSingle)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit + vmadh $v20, $v6, vOne + lw $24, (tempXfrmSingle + 4)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit + vcopy $v19, $v4 + blez $10, xfrm_light_store_lookat // 
curLight = -2 or 0 + vmudh $v29, $v0, $v3[0h] + // 20 cycles from xfrm_light_loop_2 not counting land + vmadh $v29, $v1, $v3[1h] + bgtz $11, lt_setup_after_xfrm // curLight > ambient; only one light valid + sw $20, (ltBufOfs + 0xC - 2 * lightSize)(curLight) // Write light relative -2 + vmadn $v29, $v16, $v3[0h] + bltz $11, xfrm_light_loop_1 // curLight < ambient; more lights to compute + sw $24, (ltBufOfs + 0xC - 1 * lightSize)(curLight) // Write light relative -1 +lt_setup_after_xfrm: + // Load first light direction to $v13, which is not used throughout vtx processing. + j vtx_after_lt_setup + lpv $v13[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6 +xfrm_light_store_lookat: + vmadh $v29, $v1, $v3[1h] + spv $v5[0], (xfrmLookatDirs)($zero) // First time is garbage; second actual + vmadn $v29, $v16, $v3[0h] + j xfrm_light_loop_2 + vmadn $v29, $v18, $v3[2h] + .if CFG_NO_OCCLUSION_PLANE // New LVP_NOC .align 8 @@ -3727,6 +3750,7 @@ G_MTX_end: // Jump here to do clipping. If overlay 4 is loaded (this code), loads overlay 3 // and jumps to right here, which is now in the new code. ovl234_clipping_entrypoint_ovl4ver: // same IMEM address as ovl234_clipping_entrypoint + sh $ra, tempTriRA // Tri return after clipping .if CFG_PROFILING_B addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load .endif diff --git a/gbi.h b/gbi.h index c592fd2..dc8069a 100644 --- a/gbi.h +++ b/gbi.h @@ -1,9 +1,16 @@ /** * @file gbi.h * @brief Modded GBI for use with F3DEX3 custom microcode - * */ +/* List of options; the documentation for each is where it is used below. 
*/ +/* #define REQUIRE_SEMICOLONS_AFTER_GBI_COMMANDS */ /* recommended */ +/* #define NO_SYNCS_IN_TEXTURE_LOADS */ /* see documentation */ +/* #define F3DEX2_SEGMENTS */ /* see documentation */ +/* #define DISABLE_AA */ /* developer taste */ +/* #define RISKY_RDP_SYNCS */ /* see documentation */ +/* #define KAZE_GBI_HACKS */ /* not recommended unless you are Kaze */ + #include "ultra64/mbi.h" #ifndef F3DEX3_H @@ -625,6 +632,20 @@ longer a multiple of 8 (DMA word). This was not used in any command anyway. */ #define G_ZS_PIXEL (0 << G_MDSFT_ZSRCSEL) #define G_ZS_PRIM (1 << G_MDSFT_ZSRCSEL) +#ifdef DISABLE_AA +/* Disables antialiasing in all preset rendermodes, saving RDP time. Note that +this does NOT disable antialiasing in manually written rendermodes, e.g. +exported from fast64 with advanced options enabled. We can't redefine the real +IM_RD because IM_RD is needed for transparency also, and we can't distinguish +between a manually written rendermode using IM_RD for transparency and one using +it for antialiasing. */ +#define AA_DEF 0 +#define RD_DEF 0 +#else +#define AA_DEF AA_EN +#define RD_DEF IM_RD +#endif + /* G_SETOTHERMODE_L gSetRenderMode */ #define AA_EN 0x0008 #define Z_CMP 0x0010 @@ -642,7 +663,7 @@ longer a multiple of 8 (DMA word). This was not used in any command anyway. */ #define CVG_X_ALPHA 0x1000 #define ALPHA_CVG_SEL 0x2000 #define FORCE_BL 0x4000 -#define TEX_EDGE 0x0000 /* used to be 0x8000 */ +#define TEX_EDGE 0x0000 /* not in HW V2; is 0x8000 in older HW */ #define G_BL_CLR_IN 0 #define G_BL_CLR_MEM 1 @@ -662,148 +683,150 @@ longer a multiple of 8 (DMA word). This was not used in any command anyway. 
*/ (m1a) << 28 | (m1b) << 24 | (m2a) << 20 | (m2b) << 16 #define RM_AA_ZB_OPA_SURF(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | RD_DEF | CVG_DST_CLAMP | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_RA_ZB_OPA_SURF(clk) \ - AA_EN | Z_CMP | Z_UPD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | CVG_DST_CLAMP | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_ZB_XLU_SURF(clk) \ - AA_EN | Z_CMP | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | \ + AA_DEF | Z_CMP | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | \ FORCE_BL | ZMODE_XLU | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_ZB_OPA_DECAL(clk) \ - AA_EN | Z_CMP | IM_RD | CVG_DST_WRAP | ALPHA_CVG_SEL | \ + AA_DEF | Z_CMP | RD_DEF | CVG_DST_WRAP | ALPHA_CVG_SEL | \ ZMODE_DEC | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_RA_ZB_OPA_DECAL(clk) \ - AA_EN | Z_CMP | CVG_DST_WRAP | ALPHA_CVG_SEL | \ + AA_DEF | Z_CMP | CVG_DST_WRAP | ALPHA_CVG_SEL | \ ZMODE_DEC | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_ZB_XLU_DECAL(clk) \ - AA_EN | Z_CMP | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | \ + AA_DEF | Z_CMP | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | \ FORCE_BL | ZMODE_DEC | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_ZB_OPA_INTER(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | RD_DEF | CVG_DST_CLAMP | \ ALPHA_CVG_SEL | ZMODE_INTER | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_RA_ZB_OPA_INTER(clk) \ - AA_EN | Z_CMP | Z_UPD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | CVG_DST_CLAMP | \ ALPHA_CVG_SEL | ZMODE_INTER | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_ZB_XLU_INTER(clk) \ - AA_EN | Z_CMP | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | \ + AA_DEF | Z_CMP | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | \ 
FORCE_BL | ZMODE_INTER | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_ZB_XLU_LINE(clk) \ - AA_EN | Z_CMP | IM_RD | CVG_DST_CLAMP | CVG_X_ALPHA | \ + AA_DEF | Z_CMP | IM_RD | CVG_DST_CLAMP | CVG_X_ALPHA | \ ALPHA_CVG_SEL | FORCE_BL | ZMODE_XLU | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_ZB_DEC_LINE(clk) \ - AA_EN | Z_CMP | IM_RD | CVG_DST_SAVE | CVG_X_ALPHA | \ + AA_DEF | Z_CMP | IM_RD | CVG_DST_SAVE | CVG_X_ALPHA | \ ALPHA_CVG_SEL | FORCE_BL | ZMODE_DEC | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) +/* Note that this uses AA_EN not AA_DEF */ #define RM_AA_ZB_TEX_EDGE(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ + AA_EN | Z_CMP | Z_UPD | RD_DEF | CVG_DST_CLAMP | \ CVG_X_ALPHA | ALPHA_CVG_SEL | ZMODE_OPA | TEX_EDGE | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_ZB_TEX_INTER(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | RD_DEF | CVG_DST_CLAMP | \ CVG_X_ALPHA | ALPHA_CVG_SEL | ZMODE_INTER | TEX_EDGE | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_ZB_SUB_SURF(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_FULL | \ + AA_DEF | Z_CMP | Z_UPD | IM_RD | CVG_DST_FULL | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_ZB_PCL_SURF(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ ZMODE_OPA | G_AC_DITHER | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_ZB_OPA_TERR(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | RD_DEF | CVG_DST_CLAMP | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_ZB_TEX_TERR(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | Z_CMP | Z_UPD | RD_DEF | CVG_DST_CLAMP | \ CVG_X_ALPHA | ALPHA_CVG_SEL | ZMODE_OPA 
| TEX_EDGE | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_ZB_SUB_TERR(clk) \ - AA_EN | Z_CMP | Z_UPD | IM_RD | CVG_DST_FULL | \ + AA_DEF | Z_CMP | Z_UPD | IM_RD | CVG_DST_FULL | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_OPA_SURF(clk) \ - AA_EN | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | RD_DEF | CVG_DST_CLAMP | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_RA_OPA_SURF(clk) \ - AA_EN | CVG_DST_CLAMP | \ + AA_DEF | CVG_DST_CLAMP | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_XLU_SURF(clk) \ - AA_EN | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | FORCE_BL | \ + AA_DEF | IM_RD | CVG_DST_WRAP | CLR_ON_CVG | FORCE_BL | \ ZMODE_OPA | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_XLU_LINE(clk) \ - AA_EN | IM_RD | CVG_DST_CLAMP | CVG_X_ALPHA | \ + AA_DEF | IM_RD | CVG_DST_CLAMP | CVG_X_ALPHA | \ ALPHA_CVG_SEL | FORCE_BL | ZMODE_OPA | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_DEC_LINE(clk) \ - AA_EN | IM_RD | CVG_DST_FULL | CVG_X_ALPHA | \ + AA_DEF | IM_RD | CVG_DST_FULL | CVG_X_ALPHA | \ ALPHA_CVG_SEL | FORCE_BL | ZMODE_OPA | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) +/* Note that this uses AA_EN not AA_DEF */ #define RM_AA_TEX_EDGE(clk) \ - AA_EN | IM_RD | CVG_DST_CLAMP | \ + AA_EN | RD_DEF | CVG_DST_CLAMP | \ CVG_X_ALPHA | ALPHA_CVG_SEL | ZMODE_OPA | TEX_EDGE | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_SUB_SURF(clk) \ - AA_EN | IM_RD | CVG_DST_FULL | \ + AA_DEF | IM_RD | CVG_DST_FULL | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_A_MEM) #define RM_AA_PCL_SURF(clk) \ - AA_EN | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | IM_RD | CVG_DST_CLAMP | \ ZMODE_OPA | G_AC_DITHER | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) 
#define RM_AA_OPA_TERR(clk) \ - AA_EN | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | RD_DEF | CVG_DST_CLAMP | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_TEX_TERR(clk) \ - AA_EN | IM_RD | CVG_DST_CLAMP | \ + AA_DEF | RD_DEF | CVG_DST_CLAMP | \ CVG_X_ALPHA | ALPHA_CVG_SEL | ZMODE_OPA | TEX_EDGE | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) #define RM_AA_SUB_TERR(clk) \ - AA_EN | IM_RD | CVG_DST_FULL | \ + AA_DEF | IM_RD | CVG_DST_FULL | \ ZMODE_OPA | ALPHA_CVG_SEL | \ GBL_c##clk(G_BL_CLR_IN, G_BL_A_IN, G_BL_CLR_MEM, G_BL_1MA) @@ -2624,11 +2647,22 @@ _DW({ \ /* * Moveword commands */ -/* not strictly a moveword command anymore */ +#ifdef F3DEX2_SEGMENTS +/* Use F3DEX2 style segment setup binary encoding. F3DEX3 supports both the +F3DEX2 encoding and the F3DEX3 encoding, but the former does not have the +relative segment resolution behavior. */ +#define gSPSegment(pkt, segment, base) \ + gMoveWd(pkt, G_MW_SEGMENT, (segment) * 4, (base)) +#define gsSPSegment(segment, base) \ + gsMoveWd( G_MW_SEGMENT, (segment) * 4, (base)) +#else +/* F3DEX3 style segment setup, which resolves segment addresses relative to +other segments. */ #define gSPSegment(pkt, segment, base) \ gDma1p((pkt), G_RELSEGMENT, (base), ((segment) * 4) & 0xFFF, G_MW_SEGMENT) #define gsSPSegment(segment, base) \ gsDma1p( G_RELSEGMENT, (base), ((segment) * 4) & 0xFFF, G_MW_SEGMENT) +#endif #define gSPPerspNormalize(pkt, s) gMoveHalfwd(pkt, G_MW_FX, G_MWO_PERSPNORM, (s)) #define gsSPPerspNormalize(s) gsMoveHalfwd( G_MW_FX, G_MWO_PERSPNORM, (s)) @@ -2924,7 +2958,8 @@ _DW({ \ * * Internally, a material is defined to start with any set image command, and * end on any of the following: call, branch, return, vertex, all tri commands, - * modify vertex, branch Z/W, or cull. The physical address of the display list + * tex/fill rectangles, and successes on cull or branch w/z (which are usually + * preceded by vertex loads anyway). 
The physical address of the display list * --not the address of the image--is stored when a material is started. If a * material starts and its physical address is the same as the stored last start * address, i.e. we're executing the same material display list as the last @@ -3326,7 +3361,11 @@ _DW({ \ #define gSPSetLights0(pkt, name) gSPSetLights(pkt, 0, name) #define gsSPSetLights0(name) gsSPSetLights( 0, name) #define gSPSetLights1(pkt, name) gSPSetLights(pkt, 1, name) +#ifdef KAZE_GBI_HACKS +#define gsSPSetLights1(name) gsSPNoOp() +#else #define gsSPSetLights1(name) gsSPSetLights( 1, name) +#endif #define gSPSetLights2(pkt, name) gSPSetLights(pkt, 2, name) #define gsSPSetLights2(name) gsSPSetLights( 2, name) #define gSPSetLights3(pkt, name) gSPSetLights(pkt, 3, name) @@ -3639,11 +3678,11 @@ _DW({ \ * Fri May 26 13:45:55 PDT 1995 * @deprecated */ -#define gDPSetBlendMask(pkt, mask) gDPNoOp(pkt) +#define gDPSetBlendMask(pkt, mask) gSPNoOp(pkt) /** * @copydetails gDPSetBlendMask */ -#define gsDPSetBlendMask(mask) gsDPNoOp() +#define gsDPSetBlendMask(mask) gsSPNoOp() #define gDPSetAlphaCompare(pkt, type) \ gSPSetOtherMode(pkt, G_SETOTHERMODE_L, G_MDSFT_ALPHACOMPARE, 2, type) @@ -3815,9 +3854,14 @@ _DW({ \ #define gDPSetEnvColor(pkt, r, g, b, a) \ DPRGBColor(pkt, G_SETENVCOLOR, r, g, b, a) - + +#ifdef KAZE_GBI_HACKS +#define gsDPSetEnvColor(r, g, b, a) \ + gsSPNoOp() +#else #define gsDPSetEnvColor(r, g, b, a) \ sDPRGBColor( G_SETENVCOLOR, r, g, b, a) +#endif #define gDPSetBlendColor(pkt, r, g, b, a) \ DPRGBColor(pkt, G_SETBLENDCOLOR, r, g, b, a) @@ -5358,17 +5402,28 @@ _DW({ #define gDPWord(pkt, wordhi, wordlo) \ _DW({ \ Gfx *_g = (Gfx *)(pkt); \ - \ gImmp1(pkt, G_RDPHALF_1, (unsigned int)(wordhi)); \ gImmp1(pkt, G_RDPHALF_2, (unsigned int)(wordlo)); \ }) +#ifdef RISKY_RDP_SYNCS +/* + * The community has found that in nearly all instances, a tile sync is + * sufficient where a pipe sync is normally used--between rendering something + * and changing critical RDP 
settings. However, we are not 100% sure this is + * true for all obscure settings, so it is risky. +*/ +#define G_USEASPIPESYNC G_RDPTILESYNC +#else +#define G_USEASPIPESYNC G_RDPPIPESYNC +#endif + #define gDPFullSync(pkt) gDPNoParam(pkt, G_RDPFULLSYNC) #define gsDPFullSync() gsDPNoParam( G_RDPFULLSYNC) #define gDPTileSync(pkt) gDPNoParam(pkt, G_RDPTILESYNC) #define gsDPTileSync() gsDPNoParam( G_RDPTILESYNC) -#define gDPPipeSync(pkt) gDPNoParam(pkt, G_RDPPIPESYNC) -#define gsDPPipeSync() gsDPNoParam( G_RDPPIPESYNC) +#define gDPPipeSync(pkt) gDPNoParam(pkt, G_USEASPIPESYNC) +#define gsDPPipeSync() gsDPNoParam( G_USEASPIPESYNC) #define gDPLoadSync(pkt) gDPNoParam(pkt, G_RDPLOADSYNC) #define gsDPLoadSync() gsDPNoParam( G_RDPLOADSYNC) #define gDPNoOp(pkt) gDPNoParam(pkt, G_NOOP)