diff --git a/src/ARM.cpp b/src/ARM.cpp index f75c1e27..9424366c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -87,7 +87,7 @@ void ARM::GdbCheckC() {} -const u32 ARM::ConditionTable[16] = +alignas(64) const u32 ARM::ConditionTable[16] = { 0xF0F0, // EQ 0x0F0F, // NE @@ -158,8 +158,11 @@ void ARM::Reset() { Cycles = 0; Halted = 0; + DataCycles = 0; + CheckInterlock = false; IRQ = 0; + IRQTimestamp = -1; for (int i = 0; i < 16; i++) R[i] = 0; @@ -197,6 +200,13 @@ void ARM::Reset() BreakReq = false; #endif + memset(&MRTrack, 0, sizeof(MRTrack)); + + FuncQueueFill = 0; + FuncQueueEnd = 0; + FuncQueueProg = 0; + FuncQueueActive = false; + // zorp JumpTo(ExceptionBase); } @@ -204,6 +214,31 @@ void ARM::Reset() void ARMv5::Reset() { PU_Map = PU_PrivMap; + Store = false; + + ITCMTimestamp = 0; + TimestampMemory = 0; + ILCurrReg = 16; + ILPrevReg = 16; + + ICacheStreamPtr = 7; + DCacheStreamPtr = 7; + + WBWritePointer = 16; + WBFillPointer = 0; + WBDelay = 0; + WBTimestamp = 0; + WBReleaseTS = 0; + WBLastRegion = Mem9_Null; + WBWriting = false; + WBInitialTS = 0; + + ARM::Reset(); +} + +void ARMv4::Reset() +{ + Nonseq = true; ARM::Reset(); } @@ -228,7 +263,7 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_ABT, 3*sizeof(u32)); file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); - file->Var32(&CurInstr); + file->Var64(&CurInstr); #ifdef JIT_ENABLED if (file->Saving && NDS.IsJITEnabled()) { @@ -238,8 +273,24 @@ void ARM::DoSavestate(Savestate* file) FillPipeline(); } #endif - file->VarArray(NextInstr, 2*sizeof(u32)); + file->VarArray(NextInstr, 2*sizeof(u64)); + file->VarArray(&MRTrack, sizeof(MRTrack)); + file->Var32(&BranchAddr); + file->VarArray(QueueMode, sizeof(QueueMode)); + file->Var8(&ExtReg); + file->Var8(&ExtROROffs); + file->Var64(&RetVal); + file->Var16(&LDRRegs); + file->Var16(&LDRFailedRegs); + file->VarArray(FetchAddr, sizeof(FetchAddr)); + file->VarArray(STRVal, sizeof(STRVal)); + file->Var64(&IRQTimestamp); + file->Var8(&FuncQueueFill); + file->Var8(&FuncQueueEnd); + file->Var8(&ExecuteCycles); + file->Bool32(&FuncQueueActive); + file->Bool32(&CheckInterlock); file->Var32(&ExceptionBase); if (!file->Saving) @@ -254,7 +305,7 @@ void ARM::DoSavestate(Savestate* file) if (!Num) { SetupCodeMem(R[15]); // should fix it - ((ARMv5*)this)->RegionCodeCycles = ((ARMv5*)this)->MemTimings[R[15] >> 12][0]; + ((ARMv5*)this)->RegionCodeCycles = ((ARMv5*)this)->MemTimings[R[15] >> 12][2]; if ((CPSR & 0x1F) == 0x10) ((ARMv5*)this)->PU_Map = ((ARMv5*)this)->PU_UserMap; @@ -271,8 +322,105 @@ void ARM::DoSavestate(Savestate* file) void ARMv5::DoSavestate(Savestate* file) { + file->Var64(&ITCMTimestamp); + file->Var64(&TimestampMemory); + file->Bool32(&Store); + file->Var8((u8*)&ITCMDelay); + file->Var32(&QueuedDCacheLine); + file->Var32(&CP15Queue); + + file->Var8(&ILCurrReg); + file->Var8(&ILPrevReg); + file->Var64(&ILCurrTime); + file->Var64(&ILPrevTime); + file->Var8(&ILQueueReg); + file->Var8((u8*)&ILQueueDelay); + file->Var8(&ILQueueMemReg); + file->VarArray(ILQueueTimes, sizeof(ILQueueTimes)); + file->Var16(&ILQueueMask); + + file->Var8(&ICacheStreamPtr); + file->Var8(&DCacheStreamPtr); + file->VarArray(ICacheStreamTimes, sizeof(ICacheStreamTimes)); + file->VarArray(DCacheStreamTimes, sizeof(DCacheStreamTimes)); + + file->Var8((u8*)&ILForceDelay); + file->Var8(&WBWritePointer); + file->Var8(&WBFillPointer); + file->Var8(&WBWriting); + file->Var32(&WBCurAddr); + file->Var64(&WBCurVal); + file->VarArray(WBAddrQueued, sizeof(WBAddrQueued)); + file->VarArray(storeaddr, sizeof(storeaddr)); + file->VarArray(WBValQueued, sizeof(WBValQueued)); + file->VarArray(WriteBufferFifo, sizeof(WriteBufferFifo)); + file->Var64(&WBTimestamp); + file->Var64(&WBDelay); + file->Var32(&WBLastRegion); + file->Var64(&WBReleaseTS); + file->Var64(&WBInitialTS); + ARM::DoSavestate(file); CP15DoSavestate(file); + + if (!file->Saving) + { + int id; + file->Var32((u32*)&id); + DelayedQueue = GetQueueFuncFromID(id); + + file->Var32((u32*)&id); + StartExec = GetQueueFuncFromID(id); + + for (int i = 0; i <= FuncQueueEnd; i++) + { + file->Var32((u32*)&id); + FuncQueue[i] = GetQueueFuncFromID(id); + } + } + else + { + int id = GetIDFromQueueFunc(DelayedQueue); + file->Var32((u32*)&id); + + id = GetIDFromQueueFunc(StartExec); + file->Var32((u32*)&id); + + for (int i = 0; i <= FuncQueueEnd; i++) + { + id = GetIDFromQueueFunc(FuncQueue[i]); + file->Var32((u32*)&id); + } + } +} + +void ARMv4::DoSavestate(Savestate* file) +{ + file->Bool32(&Nonseq); + + ARM::DoSavestate(file); + + if (!file->Saving) + { + int id; + file->Var32((u32*)&id); + StartExec = GetQueueFuncFromID(id); + for (int i = 0; i <= FuncQueueEnd; i++) + { + file->Var32((u32*)&id); + FuncQueue[i] = GetQueueFuncFromID(id); + } + } + else + { + int id = GetIDFromQueueFunc(StartExec); + file->Var32((u32*)&id); + for (int i = 0; i <= FuncQueueEnd; i++) + { + id = GetIDFromQueueFunc(FuncQueue[i]); + file->Var32((u32*)&id); + } + } } @@ -291,118 +439,186 @@ void ARM::SetupCodeMem(u32 addr) } } -void ARMv5::JumpTo(u32 addr, bool restorecpsr) +void ARMv5::JumpTo(u32 addr, bool restorecpsr, u8 R15) { - if (restorecpsr) + //printf("JUMP! %08X %i %i\n", addr, restorecpsr, R15); + NDS.MonitorARM9Jump(addr); + + BranchRestore = restorecpsr; + BranchUpdate = R15; + BranchAddr = addr; + QueueFunction(&ARMv5::JumpTo_2); +} + +void ARMv5::JumpTo_2() +{ + if (BranchUpdate) + { + if (CP15Control & (1<<15)) + { + if (BranchUpdate == 1) BranchAddr = R[15] & ~1; + else BranchAddr = R[15] | 1; + } + else BranchAddr = R[15]; + } + + if (BranchRestore) { RestoreCPSR(); - if (CPSR & 0x20) addr |= 0x1; - else addr &= ~0x1; + if (CPSR & 0x20) BranchAddr |= 0x1; + else BranchAddr &= ~0x1; } // aging cart debug crap //if (addr == 0x0201764C) printf("capture test %d: R1=%08X\n", R[6], R[1]); //if (addr == 0x020175D8) printf("capture test %d: res=%08X\n", R[6], R[0]); - u32 oldregion = R[15] >> 24; - u32 newregion = addr >> 24; - - RegionCodeCycles = MemTimings[addr >> 12][0]; - - if (addr & 0x1) + // jumps count as nonsequential accesses on the instruction bus on the arm9 + // thus it requires waiting for the current ICache line fill to complete before continuing + if (ICacheStreamPtr < 7) { - addr &= ~0x1; - R[15] = addr+2; + u64 fillend = ICacheStreamTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; + ICacheStreamPtr = 7; + } - if (newregion != oldregion) SetupCodeMem(addr); + if (BranchAddr & 0x1) + { + StartExec = &ARMv5::StartExecTHUMB; + FuncQueue[0] = StartExec; + + BranchAddr &= ~0x1; + R[15] = BranchAddr+2; + + CPSR |= 0x20; // two-opcodes-at-once fetch // doesn't matter if we put garbage in the MSbs there - if (addr & 0x2) + if (BranchAddr & 0x2) { - NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; - NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + DelayedQueue = &ARMv5::JumpTo_3A; + CodeRead32(BranchAddr-2); } else { - NextInstr[0] = CodeRead32(addr, true); - NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + DelayedQueue = &ARMv5::JumpTo_3B; + CodeRead32(BranchAddr); } - - CPSR |= 0x20; } else { - addr &= ~0x3; - R[15] = addr+4; + StartExec = &ARMv5::StartExecARM; + FuncQueue[0] = StartExec; - if (newregion != oldregion) SetupCodeMem(addr); - - NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; - NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + BranchAddr &= ~0x3; + R[15] = BranchAddr+4; CPSR &= ~0x20; + + DelayedQueue = &ARMv5::JumpTo_3C; + CodeRead32(BranchAddr); } - - if (!(PU_Map[addr>>12] & 0x04)) - { - PrefetchAbort(); - return; - } - - NDS.MonitorARM9Jump(addr); } -void ARMv4::JumpTo(u32 addr, bool restorecpsr) +void ARMv5::JumpTo_3A() { - if (restorecpsr) + NextInstr[0] = RetVal >> 16; + DelayedQueue = &ARMv5::JumpTo_4; + CodeRead32(BranchAddr+2); +} + +void ARMv5::JumpTo_3B() +{ + NextInstr[0] = RetVal; + NextInstr[1] = NextInstr[0] >> 16; +} + +void ARMv5::JumpTo_3C() +{ + NextInstr[0] = RetVal; + DelayedQueue = &ARMv5::JumpTo_4; + CodeRead32(BranchAddr+4); +} + +void ARMv5::JumpTo_4() +{ + NextInstr[1] = RetVal; +} + +void ARMv4::JumpTo(u32 addr, bool restorecpsr, u8 R15) +{ + //printf("JUMP! %08X %08X %i %i\n", addr, R[15], restorecpsr, R15); + BranchRestore = restorecpsr; + BranchUpdate = R15; + BranchAddr = addr; + QueueFunction(&ARMv4::JumpTo_2); +} + +void ARMv4::JumpTo_2() +{ + if (BranchUpdate) + { + if (BranchUpdate == 1) BranchAddr = R[15] & ~1; + else BranchAddr = R[15] | 1; + } + + if (BranchRestore) { RestoreCPSR(); - if (CPSR & 0x20) addr |= 0x1; - else addr &= ~0x1; + if (CPSR & 0x20) BranchAddr |= 0x1; + else BranchAddr &= ~0x1; } + + //printf("JUMP2! %08X\n", BranchAddr); - u32 oldregion = R[15] >> 23; - u32 newregion = addr >> 23; - - CodeRegion = addr >> 24; - CodeCycles = addr >> 15; // cheato - - if (addr & 0x1) + if (BranchAddr & 0x1) { - addr &= ~0x1; - R[15] = addr+2; + StartExec = &ARMv4::StartExecTHUMB; + FuncQueue[0] = StartExec; - //if (newregion != oldregion) SetupCodeMem(addr); - - NextInstr[0] = CodeRead16(addr); - NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS.ARM7MemTimings[CodeCycles][0] + NDS.ARM7MemTimings[CodeCycles][1]; + BranchAddr &= ~0x1; + R[15] = BranchAddr+2; CPSR |= 0x20; + + Nonseq = true; + CodeRead16(BranchAddr); + QueueFunction(&ARMv4::JumpTo_3A); } else { - addr &= ~0x3; - R[15] = addr+4; + StartExec = &ARMv4::StartExecARM; + FuncQueue[0] = StartExec; - //if (newregion != oldregion) SetupCodeMem(addr); - - NextInstr[0] = CodeRead32(addr); - NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS.ARM7MemTimings[CodeCycles][2] + NDS.ARM7MemTimings[CodeCycles][3]; + BranchAddr &= ~0x3; + R[15] = BranchAddr+4; CPSR &= ~0x20; + + Nonseq = true; + CodeRead32(BranchAddr); + QueueFunction(&ARMv4::JumpTo_3B); } } +void ARMv4::JumpTo_3A() +{ + NextInstr[0] = RetVal; + Nonseq = false; + CodeRead16(BranchAddr+2); + QueueFunction(&ARMv4::UpdateNextInstr1); +} + +void ARMv4::JumpTo_3B() +{ + NextInstr[0] = RetVal; + Nonseq = false; + CodeRead32(BranchAddr+4); + QueueFunction(&ARMv4::UpdateNextInstr1); +} + void ARM::RestoreCPSR() { u32 oldcpsr = CPSR; @@ -524,8 +740,10 @@ void ARM::UpdateMode(u32 oldmode, u32 newmode, bool phony) } } +template void ARM::TriggerIRQ() { + AddCycles_C(); if (CPSR & 0x80) return; @@ -535,7 +753,12 @@ void ARM::TriggerIRQ() UpdateMode(oldcpsr, CPSR); R_IRQ[2] = oldcpsr; - R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); +#ifdef JIT_ENABLED + if constexpr (mode == CPUExecuteMode::JIT) + R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); + else +#endif + R[14] = R[15] - (oldcpsr & 0x20 ? 0 : 4); JumpTo(ExceptionBase + 0x18); // ARDS cheat support @@ -546,9 +769,15 @@ void ARM::TriggerIRQ() NDS.AREngine.RunCheats(); } } +template void ARM::TriggerIRQ(); +template void ARM::TriggerIRQ(); +#ifdef JIT_ENABLED +template void ARM::TriggerIRQ(); +#endif void ARMv5::PrefetchAbort() { + AddCycles_C(); Log(LogLevel::Warn, "ARM9: prefetch abort (%08X)\n", R[15]); u32 oldcpsr = CPSR; @@ -556,23 +785,14 @@ void ARMv5::PrefetchAbort() CPSR |= 0x97; UpdateMode(oldcpsr, CPSR); - // this shouldn't happen, but if it does, we're stuck in some nasty endless loop - // so better take care of it - if (!(PU_Map[ExceptionBase>>12] & 0x04)) - { - Log(LogLevel::Error, "!!!!! EXCEPTION REGION NOT EXECUTABLE. THIS IS VERY BAD!!\n"); - NDS.Stop(Platform::StopReason::BadExceptionRegion); - return; - } - R_ABT[2] = oldcpsr; - R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); + R[14] = R[15] - (oldcpsr & 0x20 ? 0 : 4); JumpTo(ExceptionBase + 0x0C); } void ARMv5::DataAbort() { - Log(LogLevel::Warn, "ARM9: data abort (%08X)\n", R[15]); + Log(LogLevel::Warn, "ARM9: data abort (%08X) %08llX\n", R[15], CurInstr); u32 oldcpsr = CPSR; CPSR &= ~0xBF; @@ -589,13 +809,134 @@ void ARM::CheckGdbIncoming() GdbCheckA(); } +void ARMv5::StartExecTHUMB() +{ + // prefetch + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + + CheckInterlock = true; + // check for interlocks + if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions + { + // abt + } + else [[likely]] // actually execute + { + u32 icode = (CurInstr >> 6) & 0x3FF; + ARMInterpreter::THUMBInstrTable[icode](this); + } + + if (R[15] & 0x2) + { + // the value we need is cached by the bus + // in practice we can treat this as a 1 cycle fetch, with no penalties + RetVal = NextInstr[1] >> 16; + NDS.ARM9Timestamp++; + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; + Store = false; + DataRegion = Mem9_Null; + + QueueFunction(&ARMv5::ContExecTHUMB); + } + else + { + DelayedQueue = &ARMv5::ContExecTHUMB; + CodeRead32(R[15]); + } +} + +void ARMv5::ContExecTHUMB() +{ + NextInstr[1] = RetVal; + + CheckInterlock = false; + if ((NDS.ARM9Timestamp >= IRQTimestamp) && !(CPSR & 0x80)) TriggerIRQ(); + else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions + { + PrefetchAbort(); + } + else [[likely]] // actually execute + { + u32 icode = (CurInstr >> 6) & 0x3FF; + ARMInterpreter::THUMBInstrTable[icode](this); + } + QueueFunction(&ARMv5::WBCheck_2); +} + +void ARMv5::StartExecARM() +{ + // prefetch + R[15] += 4; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + + CheckInterlock = true; + // check for interlocks + if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions + { + // abt + } + else if (CheckCondition(CurInstr >> 28)) [[likely]] // actually execute + { + u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); + ARMInterpreter::ARMInstrTable[icode](this); + } + else if ((CurInstr & 0xFE000000) == 0xFA000000) + { + ARMInterpreter::A_BLX_IMM(this); + } + else if ((CurInstr & 0x0FF000F0) == 0x01200070) + { + ARMInterpreter::A_BKPT(this); // always passes regardless of condition code + } + + DelayedQueue = &ARMv5::ContExecARM; + CodeRead32(R[15]); +} + +void ARMv5::ContExecARM() +{ + NextInstr[1] = RetVal; + + CheckInterlock = false; + if ((NDS.ARM9Timestamp >= IRQTimestamp) && !(CPSR & 0x80)) TriggerIRQ(); + else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions + { + PrefetchAbort(); + } + else if (CheckCondition(CurInstr >> 28)) [[likely]] // actually execute + { + u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); + ARMInterpreter::ARMInstrTable[icode](this); + } + else if ((CurInstr & 0xFE000000) == 0xFA000000) + { + ARMInterpreter::A_BLX_IMM(this); + } + else if ((CurInstr & 0x0FF000F0) == 0x01200070) + { + ARMInterpreter::A_BKPT(this); // always passes regardless of condition code + } + else + AddCycles_C(); + + QueueFunction(&ARMv5::WBCheck_2); +} + +void ARMv5::WBCheck_2() +{ + WriteBufferCheck(); +} + template void ARMv5::Execute() { if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckB(); - if (Halted) + if (Halted && !FuncQueueActive) { if (Halted == 2) { @@ -603,9 +944,16 @@ void ARMv5::Execute() } else if (NDS.HaltInterrupted(0)) { + NDS.ARM9Timestamp = IRQTimestamp; Halted = 0; if (NDS.IME[0] & 0x1) - TriggerIRQ(); + { +#ifdef JIT_ENABLED + if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); + //else +#endif + //IRQ = 1; + } } else { @@ -640,7 +988,7 @@ void ARMv5::Execute() { // this order is crucial otherwise idle loops waiting for an IRQ won't function if (IRQ) - TriggerIRQ(); + TriggerIRQ(); if (Halted || IdleLoop) { @@ -657,70 +1005,90 @@ void ARMv5::Execute() else #endif { - if (CPSR & 0x20) // THUMB + if (FuncQueueActive) { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); + while (FuncQueueActive) + { + //printf("A9: A:%i, F:%i, P:%i, E:%i, I:%08llX, N:%08llX, 7:%08llX 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, NextInstr[0], NDS.ARM7.CurInstr, R[15]); + (this->*FuncQueue[FuncQueueProg])(); - // prefetch - R[15] += 2; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } - else NextInstr[1] = CodeRead32(R[15], false); + if (FuncQueueFill == FuncQueueProg) + { + // we did not get a new addition to the queue; increment and reset ptrs + FuncQueueFill = ++FuncQueueProg; - // actually execute - u32 icode = (CurInstr >> 6) & 0x3FF; - ARMInterpreter::THUMBInstrTable[icode](this); + // check if we're done with the queue, if so, reset everything + if (FuncQueueProg >= FuncQueueEnd) + { + FuncQueueFill = 0; + FuncQueueProg = 0; + FuncQueueEnd = 0; + FuncQueueActive = false; + FuncQueue[0] = StartExec; + if (Halted) + { + if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target) + { + NDS.ARM9Timestamp = NDS.ARM9Target; + } + goto exit; + } + } + } + else + { + // we got a new addition to the list; redo the current entry and exit to resolve main ram + if (FuncQueueEnd < FuncQueueFill) FuncQueueEnd = FuncQueueFill; + FuncQueueFill = FuncQueueProg; + return; + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram + } } - else + else { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); - - // prefetch - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15], false); - - // actually execute - if (CheckCondition(CurInstr >> 28)) + while (NDS.ARM9Timestamp < NDS.ARM9Target) { - u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); - ARMInterpreter::ARMInstrTable[icode](this); - } - else if ((CurInstr & 0xFE000000) == 0xFA000000) - { - ARMInterpreter::A_BLX_IMM(this); - } - else - AddCycles_C(); - } + if constexpr (mode == CPUExecuteMode::InterpreterGDB) + GdbCheckC(); // gdb might throw a hissy fit about this change but idc - // TODO optimize this shit!!! - if (Halted) - { - if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target) - { - NDS.ARM9Timestamp = NDS.ARM9Target; + //printf("A9: A:%i, F:%i, P:%i, E:%i, I:%08llX, N:%08llX, 7:%08llX 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, NextInstr[0], NDS.ARM7.CurInstr, R[15]); + (this->*FuncQueue[FuncQueueProg])(); + + if (FuncQueueFill > 0) // check if we started the queue up + { + FuncQueueEnd = FuncQueueFill; + FuncQueueFill = 0; + FuncQueueActive = true; + return; // exit to resolve main ram + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram + + // TODO optimize this shit!!! + if (Halted && !FuncQueueActive) + { + if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target) + { + NDS.ARM9Timestamp = NDS.ARM9Target; + } + goto exit; + } } - break; } /*if (NDS::IF[0] & NDS::IE[0]) { if (NDS::IME[0] & 0x1) - TriggerIRQ(); + TriggerIRQ(); }*/ - if (IRQ) TriggerIRQ(); - } - NDS.ARM9Timestamp += Cycles; - Cycles = 0; + //NDS.ARM9Timestamp += Cycles; + //Cycles = 0; } - if (Halted == 2) + exit: + + if (Halted == 2 && !FuncQueueActive) Halted = 0; } template void ARMv5::Execute(); @@ -729,13 +1097,50 @@ template void ARMv5::Execute(); template void ARMv5::Execute(); #endif +void ARMv4::StartExecTHUMB() +{ + // prefetch + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + CodeRead16(R[15]); + QueueFunction(&ARMv4::UpdateNextInstr1); + + if ((NDS.ARM7Timestamp >= IRQTimestamp) && !(CPSR & 0x80)) TriggerIRQ(); + else + { + // actually execute + u32 icode = (CurInstr >> 6); + ARMInterpreter::THUMBInstrTable[icode](this); + } +} + +void ARMv4::StartExecARM() +{ + // prefetch + R[15] += 4; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + CodeRead32(R[15]); + QueueFunction(&ARMv4::UpdateNextInstr1); + + if ((NDS.ARM7Timestamp >= IRQTimestamp) && !(CPSR & 0x80)) TriggerIRQ(); + else if (CheckCondition(CurInstr >> 28)) // actually execute + { + u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); + ARMInterpreter::ARMInstrTable[icode](this); + } + else + AddCycles_C(); +} + template void ARMv4::Execute() { if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckB(); - - if (Halted) + + if (Halted && !FuncQueueActive) { if (Halted == 2) { @@ -743,9 +1148,16 @@ void ARMv4::Execute() } else if (NDS.HaltInterrupted(1)) { + NDS.ARM7Timestamp = IRQTimestamp; Halted = 0; if (NDS.IME[1] & 0x1) - TriggerIRQ(); + { +#ifdef JIT_ENABLED + if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); + //else +#endif + //IRQ = 1; + } } else { @@ -779,7 +1191,7 @@ void ARMv4::Execute() if (StopExecution) { if (IRQ) - TriggerIRQ(); + TriggerIRQ(); if (Halted || IdleLoop) { @@ -796,67 +1208,84 @@ void ARMv4::Execute() else #endif { - if (CPSR & 0x20) // THUMB + if (FuncQueueActive) { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); - - // prefetch - R[15] += 2; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead16(R[15]); - - // actually execute - u32 icode = (CurInstr >> 6); - ARMInterpreter::THUMBInstrTable[icode](this); - } - else - { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); - - // prefetch - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15]); - - // actually execute - if (CheckCondition(CurInstr >> 28)) + while (FuncQueueActive) { - u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); - ARMInterpreter::ARMInstrTable[icode](this); - } - else - AddCycles_C(); - } + //printf("A7: A:%i, F:%i, P:%i, E:%i, I:%08llX, N:%08llX 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, NextInstr[0], R[15]); + (this->*FuncQueue[FuncQueueProg])(); - // TODO optimize this shit!!! - if (Halted) - { - if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target) - { - NDS.ARM7Timestamp = NDS.ARM7Target; + if (FuncQueueFill == FuncQueueProg) + { + // we did not get a new addition to the queue; increment and reset ptrs + FuncQueueFill = ++FuncQueueProg; + + // check if we're done with the queue, if so, reset everything + if (FuncQueueProg >= FuncQueueEnd) + { + FuncQueueFill = 0; + FuncQueueProg = 0; + FuncQueueEnd = 0; + FuncQueueActive = false; + FuncQueue[0] = StartExec; + if (Halted) + { + if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target) + { + NDS.ARM7Timestamp = NDS.ARM7Target; + } + goto exit; + } + } + } + else + { + // we got a new addition to the list; redo the current entry and exit to resolve main ram + FuncQueueFill = FuncQueueProg; + return; + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram } - break; } - /*if (NDS::IF[1] & NDS::IE[1]) + else { - if (NDS::IME[1] & 0x1) - TriggerIRQ(); - }*/ - if (IRQ) TriggerIRQ(); + while (NDS.ARM7Timestamp < NDS.ARM7Target) + { + if constexpr (mode == CPUExecuteMode::InterpreterGDB) + GdbCheckC(); + + //printf("A7: A:%i, F:%i, P:%i, E:%i, I:%08llX, N:%08llX 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, NextInstr[0], R[15]); + (this->*FuncQueue[FuncQueueProg])(); + + if (FuncQueueFill > 0) // check if we started the queue up + { + FuncQueueEnd = FuncQueueFill; + FuncQueueFill = 0; + FuncQueueActive = true; + return; // exit to resolve main ram + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram + + // TODO optimize this shit!!! + if (Halted && !FuncQueueActive) + { + if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target) + { + NDS.ARM7Timestamp = NDS.ARM7Target; + } + goto exit; + } + } + } } - - NDS.ARM7Timestamp += Cycles; - Cycles = 0; } - if (Halted == 2) + exit: + + if (Halted == 2 && !FuncQueueActive) Halted = 0; - if (Halted == 4) + if (Halted == 4 && !FuncQueueActive) { assert(NDS.ConsoleType == 1); auto& dsi = dynamic_cast(NDS); @@ -873,31 +1302,31 @@ template void ARMv4::Execute(); void ARMv5::FillPipeline() { - SetupCodeMem(R[15]); + /*SetupCodeMem(R[15]); if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) { - NextInstr[0] = CodeRead32(R[15] - 4, false) >> 16; - NextInstr[1] = CodeRead32(R[15], false); + NextInstr[0] = CodeRead32(R[15] - 4) >> 16; + NextInstr[1] = CodeRead32(R[15]); } else { - NextInstr[0] = CodeRead32(R[15] - 2, false); + NextInstr[0] = CodeRead32(R[15] - 2); NextInstr[1] = NextInstr[0] >> 16; } } else { - NextInstr[0] = CodeRead32(R[15] - 4, false); - NextInstr[1] = CodeRead32(R[15], false); - } + NextInstr[0] = CodeRead32(R[15] - 4); + NextInstr[1] = CodeRead32(R[15]); + }*/ } void ARMv4::FillPipeline() { - SetupCodeMem(R[15]); + /*SetupCodeMem(R[15]); if (CPSR & 0x20) { @@ -908,7 +1337,7 @@ void ARMv4::FillPipeline() { NextInstr[0] = CodeRead32(R[15] - 4); NextInstr[1] = CodeRead32(R[15]); - } + }*/ } #ifdef GDBSTUB_ENABLED @@ -1119,133 +1548,409 @@ u32 ARMv5::ReadMem(u32 addr, int size) } #endif -void ARMv4::DataRead8(u32 addr, u32* val) + +void ARMv5::CodeFetch() { - *val = BusRead8(addr); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; } -void ARMv4::DataRead16(u32 addr, u32* val) +void ARMv5::AddExecute() { - addr &= ~1; - - *val = BusRead16(addr); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; + NDS.ARM9Timestamp += ExecuteCycles; } -void ARMv4::DataRead32(u32 addr, u32* val) +void ARMv5::AddCycles_MW_2() { - addr &= ~3; + TimestampMemory = NDS.ARM9Timestamp; - *val = BusRead32(addr); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][2]; + NDS.ARM9Timestamp -= DataCycles; } -void ARMv4::DataRead32S(u32 addr, u32* val) +void ARMv5::DelayIfITCM_2() { - addr &= ~3; - - *val = BusRead32(addr); - DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; + if (DataRegion == Mem9_ITCM) NDS.ARM9Timestamp += ITCMDelay; } -void ARMv4::DataWrite8(u32 addr, u8 val) +void ARMv5::SetupInterlock_2() { - BusWrite8(addr, val); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; + ILCurrReg = ILQueueReg; + ILCurrTime = TimestampMemory + ILQueueDelay; } -void ARMv4::DataWrite16(u32 addr, u16 val) +void ARMv5::HandleInterlocksExecute_2() { - addr &= ~1; + if (ILQueueMask & (1<> 15][0]; + ILCurrReg = 16; + ILPrevReg = 16; + return; + } + } + + if (ILQueueMask & (1<> 15][2]; + if ((ILQueueMemReg != ILPrevReg) || (NDS.ARM9Timestamp >= ILPrevTime)) return; + + u64 diff = ILPrevTime - NDS.ARM9Timestamp; // should always be 1? + NDS.ARM9Timestamp = ILPrevTime; + ITCMTimestamp += diff; // checkme + ILPrevTime = 16; } -void ARMv4::DataWrite32S(u32 addr, u32 val) +void ARMv5::ForceInterlock_2() { - addr &= ~3; + NDS.ARM9Timestamp = TimestampMemory + ILForceDelay; +} - BusWrite32(addr, val); - DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; +void ARMv5::QueueFunction(void (ARMv5::*QueueEntry)(void)) +{ + if ((NDS.ARM9Timestamp >= NDS.ARM9Target) || (MRTrack.Type != MainRAMType::Null)) + FuncQueue[FuncQueueFill++] = QueueEntry; + else + (this->*QueueEntry)(); +} + +void ARMv4::QueueFunction(void (ARMv4::*QueueEntry)(void)) +{ + if ((NDS.ARM7Timestamp >= NDS.ARM7Target) || (MRTrack.Type != MainRAMType::Null)) + FuncQueue[FuncQueueFill++] = QueueEntry; + else + (this->*QueueEntry)(); +} + +void ARMv4::CodeRead16(u32 addr) +{ + if ((addr >> 24) == 0x02) + { + FetchAddr[16] = addr; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRCodeFetch | MR16; + if (!Nonseq) MRTrack.Var |= MRSequential; + } + else + { + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr>>15][Nonseq?0:1]; + RetVal = BusRead16(addr); + } +} + +void ARMv4::CodeRead32(u32 addr) +{ + if ((addr >> 24) == 0x02) + { + FetchAddr[16] = addr; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRCodeFetch | MR32; + if (!Nonseq) MRTrack.Var |= MRSequential; + } + else + { + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr>>15][Nonseq?2:3]; + RetVal = BusRead32(addr); + } +} + +bool ARMv4::DataRead8(u32 addr, u8 reg) +{ + FetchAddr[reg] = addr; + LDRRegs = 1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR8; + MRTrack.Progress = reg; + } + else + { + u32 dummy; u32* val = (LDRFailedRegs & (1<> 15][0]; + *val = BusRead8(addr); + } + +} + +bool ARMv4::DataRead16(u32 addr, u8 reg) +{ + FetchAddr[reg] = addr; + LDRRegs = 1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR16; + MRTrack.Progress = reg; + } + else + { + u32 dummy; + u32* val = (LDRFailedRegs & (1<> 15][0]; + *val = BusRead16(addr); + } +} + +bool ARMv4::DataRead32(u32 addr, u8 reg) +{ + FetchAddr[reg] = addr; + LDRRegs = 1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32; + MRTrack.Progress = reg; + } + else + { + u32 dummy; + u32* val = (LDRFailedRegs & (1<> 15][2]; + *val = BusRead32(addr); + } + LDRRegs &= ~1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32 | MRSequential; + MRTrack.Progress = reg; + } + else + { + u32 dummy; + u32* val = (LDRFailedRegs & (1<> 15][3]; + *val = BusRead32(addr); + } + LDRRegs &= ~1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR8; + MRTrack.Progress = reg; + } + else + { + u8 val = STRVal[reg]; + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + BusWrite8(addr, val); + } +} + +bool ARMv4::DataWrite16(u32 addr, u16 val, u8 reg) +{ + FetchAddr[reg] = addr; + STRRegs = 1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR16; + MRTrack.Progress = reg; + } + else + { + u16 val = STRVal[reg]; + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + BusWrite16(addr, val); + } +} + +bool ARMv4::DataWrite32(u32 addr, u32 val, u8 reg) +{ + FetchAddr[reg] = addr; + STRRegs = 1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32; + MRTrack.Progress = reg; + } + else + { + u32 val = STRVal[reg]; + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][2]; + BusWrite32(addr, val); + } + STRRegs &= ~1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32 | MRSequential; + MRTrack.Progress = reg; + } + else + { + u32 val = STRVal[reg]; + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][3]; + BusWrite32(addr, val); + } + STRRegs &= ~1<> 24) == 0x02) // mainRAM - { - if (CodeRegion == 0x02) - Cycles += numC + numD; - else - { - numC++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - } - else if (CodeRegion == 0x02) - { - numD++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - else - { - Cycles += numC + numD + 1; - } + Nonseq = true; + QueueFunction(&ARMv4::AddExtraCycle); +} + +void ARMv4::AddExtraCycle() +{ + NDS.ARM7Timestamp += 1; } void ARMv4::AddCycles_CD() { // TODO: max gain should be 5c when writing to mainRAM - s32 numC = NDS.ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; - s32 numD = DataCycles; - - if ((DataRegion >> 24) == 0x02) - { - if (CodeRegion == 0x02) - Cycles += numC + numD; - else - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - else if (CodeRegion == 0x02) - { - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - else - { - Cycles += numC + numD; - } + + Nonseq = true; } u8 ARMv5::BusRead8(u32 addr) diff --git a/src/ARM.h b/src/ARM.h index f18d7650..8c119afc 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -21,10 +21,13 @@ #include #include +#include #include "types.h" #include "MemRegion.h" #include "MemConstants.h" +#include "CP15_Constants.h" +#include "Platform.h" #ifdef GDBSTUB_ENABLED #include "debug/GdbStub.h" @@ -52,6 +55,50 @@ enum class CPUExecuteMode : u32 #endif }; +enum class WBMode +{ + Check, + Force, + SingleBurst, + WaitEntry, +}; + +enum class MainRAMType : u8 +{ + Null = 0, + Fetch, + ICacheStream, + DCacheStream, + DMA16, + DMA32, + + WriteBufferCmds, // all write buffer commands must be below this one; wb cmds are not strictly used for main ram + + WBDrain, + WBWrite, + WBCheck, + WBWaitRead, + WBWaitWrite, +}; + +// each one represents a bit in the field +enum FetchFlags +{ + MR8 = 0x00, // tbh it only exists because it felt wrong to write nothing to the field for 8 bit reads + MR16 = 0x01, + MR32 = 0x02, + MRWrite = 0x20, + MRSequential = 0x40, + MRCodeFetch = 0x80, +}; + +struct MainRAMTrackers +{ + MainRAMType Type; + u8 Var; + u8 Progress; +}; + struct GDBArgs; class ARMJIT; class GPU; @@ -76,7 +123,7 @@ public: virtual void FillPipeline() = 0; - virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; + virtual void JumpTo(u32 addr, bool restorecpsr = false, u8 R15 = 0) = 0; void RestoreCPSR(); void Halt(u32 halt) @@ -130,19 +177,20 @@ public: void UpdateMode(u32 oldmode, u32 newmode, bool phony = false); + template void TriggerIRQ(); void SetupCodeMem(u32 addr); - virtual void DataRead8(u32 addr, u32* val) = 0; - virtual void DataRead16(u32 addr, u32* val) = 0; - virtual void DataRead32(u32 addr, u32* val) = 0; - virtual void DataRead32S(u32 addr, u32* val) = 0; - virtual void DataWrite8(u32 addr, u8 val) = 0; - virtual void DataWrite16(u32 addr, u16 val) = 0; - virtual void DataWrite32(u32 addr, u32 val) = 0; - virtual void DataWrite32S(u32 addr, u32 val) = 0; + virtual bool DataRead8(u32 addr, u8 reg) = 0; + virtual bool DataRead16(u32 addr, u8 reg) = 0; + virtual bool DataRead32(u32 addr, u8 reg) = 0; + virtual bool DataRead32S(u32 addr, u8 reg) = 0; + virtual bool DataWrite8(u32 addr, u8 val, u8 reg) = 0; + virtual bool DataWrite16(u32 addr, u16 val, u8 reg) = 0; + virtual bool DataWrite32(u32 addr, u32 val, u8 reg) = 0; + virtual bool DataWrite32S(u32 addr, u32 val, u8 reg) = 0; virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; @@ -171,20 +219,47 @@ public: u32 DataRegion; s32 DataCycles; - u32 R[16]; // heh + alignas(64) u32 R[16]; // heh u32 CPSR; u32 R_FIQ[8]; // holding SPSR too u32 R_SVC[3]; u32 R_ABT[3]; u32 R_IRQ[3]; u32 R_UND[3]; - u32 CurInstr; - u32 NextInstr[2]; + u64 CurInstr; + u64 NextInstr[2]; u32 ExceptionBase; MemRegion CodeMem; + MainRAMTrackers MRTrack; + + u32 BranchAddr; + u8 BranchUpdate; + bool BranchRestore; + + u32 QueueMode[2]; + u8 ExtReg; + u8 ExtROROffs; + + u64 RetVal; + + u16 LDRRegs; + u16 LDRFailedRegs; + u16 STRRegs; + u32 FetchAddr[17]; + u32 STRVal[16]; + + u64 IRQTimestamp; + + u8 FuncQueueFill; + u8 FuncQueueEnd; + u8 FuncQueueProg; + u8 ExecuteCycles; + bool FuncQueueActive; + bool CheckInterlock; + #ifdef JIT_ENABLED u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; @@ -243,7 +318,7 @@ public: void FillPipeline() override; - void JumpTo(u32 addr, bool restorecpsr = false) override; + void JumpTo(u32 addr, bool restorecpsr = false, u8 R15 = 0) override; void PrefetchAbort(); void DataAbort(); @@ -252,120 +327,732 @@ public: void Execute(); // all code accesses are forced nonseq 32bit - u32 CodeRead32(u32 addr, bool branch); + void CodeRead32(const u32 addr); - void DataRead8(u32 addr, u32* val) override; - void DataRead16(u32 addr, u32* val) override; - void DataRead32(u32 addr, u32* val) override; - void DataRead32S(u32 addr, u32* val) override; - void DataWrite8(u32 addr, u8 val) override; - void DataWrite16(u32 addr, u16 val) override; - void DataWrite32(u32 addr, u32 val) override; - void DataWrite32S(u32 addr, u32 val) override; + bool DataRead8(u32 addr, u8 reg) override; + bool DataRead16(u32 addr, u8 reg) override; + bool DataRead32(u32 addr, u8 reg) override; + bool DataRead32S(u32 addr, u8 reg) override; + bool DataWrite8(u32 addr, u8 val, u8 reg) override; + bool DataWrite16(u32 addr, u16 val, u8 reg) override; + bool DataWrite32(u32 addr, u32 val, u8 reg) override; + bool DataWrite32S(u32 addr, u32 val, u8 reg) override; + + void CodeFetch(); void AddCycles_C() override { - // code only. always nonseq 32-bit for ARM9. - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + //ExecuteCycles = 0; + //CodeFetch(); } - void AddCycles_CI(s32 numI) override + void AddCycles_CI(s32 numX) override { - // code+internal - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + ExecuteCycles = numX; + QueueFunction(&ARMv5::AddExecute); + } + + void AddCycles_MW(s32 numM) + { + DataCycles = numM; + QueueFunction(&ARMv5::AddCycles_MW_2); } void AddCycles_CDI() override { - // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. - // TODO: ITCM data fetches shouldn't be parallelized, they say - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); - //else - // Cycles += numC + numD; + AddCycles_MW(DataCycles); } void AddCycles_CD() override { - // TODO: ITCM data fetches shouldn't be parallelized, they say - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); - //else - // Cycles += numC + numD; + Store = true; // todo: queue this + AddCycles_MW(DataCycles); } - void GetCodeMemRegion(u32 addr, MemRegion* region); + void DelayIfITCM(s8 delay) + { + ITCMDelay = delay; + QueueFunction(&ARMv5::DelayIfITCM_2); + } + inline void SetupInterlock(u8 reg, s8 delay = 0) + { + ILQueueReg = reg; + ILQueueDelay = delay; + + QueueFunction(&ARMv5::SetupInterlock_2); + } + + template + inline void HandleInterlocksExecute(u16 ilmask, u8* times = NULL) + { + if constexpr (bitfield) ILQueueMask = ilmask; + else ILQueueMask = 1< bool WriteBufferHandle(); + template void WriteBufferCheck(); + void WriteBufferWrite(u32 val, u8 flag, u32 addr = 0); + void WriteBufferDrain(); - void CP15Write(u32 id, u32 val); - u32 CP15Read(u32 id) const; + /** + * @brief Invalidates the instruction cacheline containing + * the data of an address. + * @details + * Searches the cacheline containing the data of an address, and + * if found clears the @ref CACHE_FLAG_VALID of this cache line. + * Nothing is done if the address is not present in the cache. + * @param [in] addr Memory address of the data in the cache line + * @par Returns + * Nothing + */ + void ICacheInvalidateByAddr(const u32 addr); - u32 CP15Control; + /** + * @brief Invalidates an instruction cache line + * @details + * Clears the @ref CACHE_FLAG_VALID of the cacheline given by + * set and index within the set. Nothing is done if the cache + * line does not exist. + * @param [in] cacheSet index of the internal cache set from + * 0 to @ref ICACHE_SETS - 1 + * @param [in] cacheLine index of the line within the cache set + * from 0 to @ref ICACHE_LINESPERSET - 1 + * @par Returns + * Nothing + */ + void ICacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine); - u32 RNGSeed; - u32 TraceProcessID; + /** + * @brief Perform an data cache lookup handle + * @details + * A cache lookup is performed, if not disabled in + * @ref CP15BISTTestStateRegister, a hit will returned the + * cached data, otherwise it returns the result of an memory + * access instead. + * If the cache lookup results in a cachemiss and linefill is + * not disabled in @ref CP15BISTTestStateRegister, will fill + * fetch all data to fill the entire cacheline directly + * from the ITCM, DTCM or bus + * @param [in] addr Address of the memory to be retreived from + * cache. The address is internally aligned to an word boundary + * @return Value of the word at addr + */ + bool DCacheLookup(const u32 addr); - u32 DTCMSetting, ITCMSetting; + /** + * @brief Updates a word in the data cache if present + * @param [in] addr Memory address which is written + * @param [in] val Word value to be written + * @retval true, if the data was written into the cache and + * does not need to be updated until cache is + * cleaned + * false, to write through + */ + bool DCacheWrite32(const u32 addr, const u32 val); + + /** + * @brief Updates a word in the data cache if present + * @param [in] addr Memory address which is written + * @param [in] val Half-Word value to be written + * @retval true, if the data was written into the cache and + * does not need to be updated until cache is + * cleaned + * false, to write through + */ + bool DCacheWrite16(const u32 addr, const u16 val); + + /** + * @brief Updates a word in the data cache if present + * @param [in] addr Memory address which is written + * @param [in] val Byte value to be written + * @retval true, if the data was written into the cache and + * does not need to be updated until cache is + * cleaned + * false, to write through + */ + bool DCacheWrite8(const u32 addr, const u8 val); + + /** + * @brief Check if an address is within a data cachable region + * @details + * Checks the address by looking up the PU_map flags for + * the address and returns the status of the data cache enable + * flag + * + * @param [in] addr Address. May be unaligned. + * @retval true If the address points to a region, that is + * enabled for instruction fetches to be cached. + */ + inline bool IsAddressDCachable(const u32 addr) const; + + /** + * @brief Invalidates the data cacheline containing the data of + * an address. + * @details + * Searches the cacheline containing the data of an address, and + * if found clears the @ref CACHE_FLAG_VALID of this cache line. + * Nothing is done if the address is not present in the cache. + * @par Returns + * Nothing + */ + void DCacheInvalidateAll(); + + /** + * @brief Invalidates the data cacheline containing the data of + * an address. + * @details + * Searches the cacheline containing the data of an address, and + * if found clears the @ref CACHE_FLAG_VALID of this cache line. + * Nothing is done if the address is not present in the cache. + * @par Returns + * Nothing + */ + void DCacheInvalidateByAddr(const u32 addr); + + /** + * @brief Invalidates an data cache line + * @details + * Clears the @ref CACHE_FLAG_VALID of the cacheline given by + * set and index within the set. Nothing is done if the cache + * line does not exist. + * @param [in] cacheSet index of the internal cache set from + * 0 to @ref DCACHE_SETS - 1 + * @param [in] cacheLine index of the line within the cache set + * from 0 to @ref DCACHE_LINESPERSET - 1 + * @par Returns + * Nothing + */ + void DCacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine); + + /** + * @brief Cleans the entire data cache + * @details + * If write-back is enabled in conjunction with the data cache + * the dirty flags in tags are set if the corresponding cache + * line is written to. + * A clean will write the parts of the cache line back + * that is marked dirty and adds the required cycles to the + * @ref DataCyces member. + * @par Returns + * Nothing + */ + void DCacheClearAll(); + + /** + * @brief Cleans a data cache line + * @details + * If write-back is enabled in conjunction with the data cache + * the dirty flags in tags are set if the corresponding cache + * line is written to. + * A clean will write the parts of the cache line back + * that is marked dirty and adds the required cycles to the + * @ref DataCyces member. + * @param [in] addr Memory address of the data in the cache line + * @par Returns + * Nothing + */ + void DCacheClearByAddr(const u32 addr); + + /** + * @brief Cleans a data cache line + * @details + * If write-back is enabled in conjunction with the data cache + * the dirty flags in tags are set if the corresponding cache + * line is written to. + * A clean will write the parts of the cache line back + * that is marked dirty and adds the required cycles to the + * @ref DataCyces member. + * @param [in] cacheSet index of the internal cache set from + * 0 to @ref DCACHE_SETS - 1 + * @param [in] cacheLine index of the line within the cache set + * from 0 to @ref DCACHE_LINESPERSET - 1 + * @par Returns + * Nothing + */ + void DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine); + + /** + * @brief Handles MCR operations writing to cp15 registers + * @details + * This function updates the internal state of the emulator when + * a cp15 register is written, or triggers the corresponding action + * like flushing caches. + * + * @param [in] id the operation id to be performed, consisting of + * (from lower to higher nibble) opcode2, intermediate register, + * register and opcode1. Most write operations just take the first 3 + * into account. + * param [in] val value to be written to the cp15 register + * @par Returns + * Nothing + */ + void CP15Write(const u32 id, const u32 val); + + /** + * @brief handles MRC operations reading from cp15 registers + * @details + * This function accumulates the regsiter states from the internal + * emulator state. It does not modify the internal state of the + * emulator or cp15. + * @param [in] id the operation id to be performed, consisting of + * (from lower to higher nibble) opcode2, intermediate register, + * register and opcode1. Most read operations just take the first 3 + * into account. + * @return Value of the cp15 register + */ + u32 CP15Read(const u32 id) const; + + void QueueFunction(void (ARMv5::*QueueEntry)(void)); + + int GetIDFromQueueFunc(void (ARMv5::*funcptr)(void)) + { + if (funcptr == &ARMv5::StartExecARM) return 0; + else if (funcptr == &ARMv5::ContExecARM) return 1; + else if (funcptr == &ARMv5::StartExecTHUMB) return 2; + else if (funcptr == &ARMv5::ContExecTHUMB) return 3; + else if (funcptr == &ARMv5::AddExecute) return 4; + else if (funcptr == &ARMv5::AddCycles_MW_2) return 5; + else if (funcptr == &ARMv5::DelayIfITCM_2) return 6; + else if (funcptr == &ARMv5::JumpTo_2) return 7; + else if (funcptr == &ARMv5::JumpTo_3A) return 8; + else if (funcptr == &ARMv5::JumpTo_3B) return 9; + else if (funcptr == &ARMv5::JumpTo_3C) return 10; + else if (funcptr == &ARMv5::JumpTo_4) return 11; + else if (funcptr == &ARMv5::CodeRead32_2) return 12; + else if (funcptr == &ARMv5::CodeRead32_3) return 13; + else if (funcptr == &ARMv5::CodeRead32_4) return 14; + else if (funcptr == &ARMv5::ICacheLookup_2) return 15; + else if (funcptr == &ARMv5::DAbortHandle) return 16; + else if (funcptr == &ARMv5::DCacheFin8) return 17; + else if (funcptr == &ARMv5::DRead8_2) return 18; + else if (funcptr == &ARMv5::DRead8_3) return 19; + else if (funcptr == &ARMv5::DRead8_4) return 20; + else if (funcptr == &ARMv5::DRead8_5) return 21; + else if (funcptr == &ARMv5::DCacheFin16) return 22; + else if (funcptr == &ARMv5::DRead16_2) return 23; + else if (funcptr == &ARMv5::DRead16_3) return 24; + else if (funcptr == &ARMv5::DRead16_4) return 25; + else if (funcptr == &ARMv5::DRead16_5) return 26; + else if (funcptr == &ARMv5::DCacheFin32) return 27; + else if (funcptr == &ARMv5::DRead32_2) return 28; + else if (funcptr == &ARMv5::DRead32_3) return 29; + else if (funcptr == &ARMv5::DRead32_4) return 30; + else if (funcptr == &ARMv5::DRead32_5) return 31; + else if (funcptr == &ARMv5::DRead32S_2) return 32; + else if (funcptr == &ARMv5::DRead32S_3) return 33; + else if (funcptr == &ARMv5::DRead32S_4) return 34; + else if (funcptr == &ARMv5::DRead32S_5A) return 35; + else if (funcptr == &ARMv5::DRead32S_5B) return 36; + else if (funcptr == &ARMv5::DWrite8_2) return 37; + else if (funcptr == &ARMv5::DWrite8_3) return 38; + else if (funcptr == &ARMv5::DWrite8_4) return 39; + else if (funcptr == &ARMv5::DWrite8_5) return 40; + else if (funcptr == &ARMv5::DWrite16_2) return 41; + else if (funcptr == &ARMv5::DWrite16_3) return 42; + else if (funcptr == &ARMv5::DWrite16_4) return 43; + else if (funcptr == &ARMv5::DWrite16_5) return 44; + else if (funcptr == &ARMv5::DWrite32_2) return 45; + else if (funcptr == &ARMv5::DWrite32_3) return 46; + else if (funcptr == &ARMv5::DWrite32_4) return 47; + else if (funcptr == &ARMv5::DWrite32_5) return 48; + else if (funcptr == &ARMv5::DWrite32S_2) return 49; + else if (funcptr == &ARMv5::DWrite32S_3) return 50; + else if (funcptr == &ARMv5::DWrite32S_4) return 51; + else if (funcptr == &ARMv5::DWrite32S_5A) return 52; + else if (funcptr == &ARMv5::DWrite32S_5B) return 53; + else if (funcptr == &ARMv5::WBCheck_2) return 54; + else if (funcptr == &ARMv5::ICachePrefetch_2) return 55; + else if (funcptr == &ARMv5::DCacheLookup_2) return 56; + else if (funcptr == &ARMv5::DCacheLookup_3) return 57; + else if (funcptr == &ARMv5::DCClearAddr_2) return 58; + else if (funcptr == &ARMv5::DCClearSetWay_2) return 59; + else if (funcptr == &ARMv5::DCClearInvalidateAddr_2) return 60; + else if (funcptr == &ARMv5::DCClearInvalidateSetWay_2) return 61; + else if (funcptr == &ARMv5::SetupInterlock_2) return 62; + else if (funcptr == &ARMv5::HandleInterlocksExecute_2) return 63; + else if (funcptr == &ARMv5::HandleInterlocksMemory_2) return 64; + else if (funcptr == &ARMv5::ForceInterlock_2) return 65; + else if (funcptr == &ARMv5::QueueUpdateMode) return 66; + else if (funcptr == &ARMv5::SignExtend8) return 67; + else if (funcptr == &ARMv5::SignExtend16) return 68; + else if (funcptr == &ARMv5::ROR32) return 69; + else { Platform::Log(Platform::LogLevel::Error, "ARM9: INVALID FUNCTION POINTER FOR SAVESTATES; DID SOMEONE FORGET TO UPDATE SERIALIZATION?\n"); return -1; } + } + + typedef void (ARMv5::*funcptrA9)(void); + funcptrA9 GetQueueFuncFromID(int funcid) + { + switch(funcid) + { + case 0: return &ARMv5::StartExecARM; + case 1: return &ARMv5::ContExecARM; + case 2: return &ARMv5::StartExecTHUMB; + case 3: return &ARMv5::ContExecTHUMB; + case 4: return &ARMv5::AddExecute; + case 5: return &ARMv5::AddCycles_MW_2; + case 6: return &ARMv5::DelayIfITCM_2; + case 7: return &ARMv5::JumpTo_2; + case 8: return &ARMv5::JumpTo_3A; + case 9: return &ARMv5::JumpTo_3B; + case 10: return &ARMv5::JumpTo_3C; + case 11: return &ARMv5::JumpTo_4; + case 12: return &ARMv5::CodeRead32_2; + case 13: return &ARMv5::CodeRead32_3; + case 14: return &ARMv5::CodeRead32_4; + case 15: return &ARMv5::ICacheLookup_2; + case 16: return &ARMv5::DAbortHandle; + case 17: return &ARMv5::DCacheFin8; + case 18: return &ARMv5::DRead8_2; + case 19: return &ARMv5::DRead8_3; + case 20: return &ARMv5::DRead8_4; + case 21: return &ARMv5::DRead8_5; + case 22: return &ARMv5::DCacheFin16; + case 23: return &ARMv5::DRead16_2; + case 24: return &ARMv5::DRead16_3; + case 25: return &ARMv5::DRead16_4; + case 26: return &ARMv5::DRead16_5; + case 27: return &ARMv5::DCacheFin32; + case 28: return &ARMv5::DRead32_2; + case 29: return &ARMv5::DRead32_3; + case 30: return &ARMv5::DRead32_4; + case 31: return &ARMv5::DRead32_5; + case 32: return &ARMv5::DRead32S_2; + case 33: return &ARMv5::DRead32S_3; + case 34: return &ARMv5::DRead32S_4; + case 35: return &ARMv5::DRead32S_5A; + case 36: return &ARMv5::DRead32S_5B; + case 37: return &ARMv5::DWrite8_2; + case 38: return &ARMv5::DWrite8_3; + case 39: return &ARMv5::DWrite8_4; + case 40: return &ARMv5::DWrite8_5; + case 41: return &ARMv5::DWrite16_2; + case 42: return &ARMv5::DWrite16_3; + case 43: return &ARMv5::DWrite16_4; + case 44: return &ARMv5::DWrite16_5; + case 45: return &ARMv5::DWrite32_2; + case 46: return &ARMv5::DWrite32_3; + case 47: return &ARMv5::DWrite32_4; + case 48: return &ARMv5::DWrite32_5; + case 49: return &ARMv5::DWrite32S_2; + case 50: return &ARMv5::DWrite32S_3; + case 51: return &ARMv5::DWrite32S_4; + case 52: return &ARMv5::DWrite32S_5A; + case 53: return &ARMv5::DWrite32S_5B; + case 54: return &ARMv5::WBCheck_2; + case 55: return &ARMv5::ICachePrefetch_2; + case 56: return &ARMv5::DCacheLookup_2; + case 57: return &ARMv5::DCacheLookup_3; + case 58: return &ARMv5::DCClearAddr_2; + case 59: return &ARMv5::DCClearSetWay_2; + case 60: return &ARMv5::DCClearInvalidateAddr_2; + case 61: return &ARMv5::DCClearInvalidateSetWay_2; + case 62: return &ARMv5::SetupInterlock_2; + case 63: return &ARMv5::HandleInterlocksExecute_2; + case 64: return &ARMv5::HandleInterlocksMemory_2; + case 65: return &ARMv5::ForceInterlock_2; + case 66: return &ARMv5::QueueUpdateMode; + case 67: return &ARMv5::SignExtend8; + case 68: return &ARMv5::SignExtend16; + case 69: return &ARMv5::ROR32; + default: Platform::Log(Platform::LogLevel::Error, "ARM9: INVALID FUNCTION ID FOR LOADING SAVESTATES; EITHER THE SAVESTATE IS BORKED OR SOMEONE FORGOT TO UPDATE SERIALIZATION\n"); return nullptr; + } + } + + // Queue Functions + void StartExecARM(); + void ContExecARM(); + void StartExecTHUMB(); + void ContExecTHUMB(); + void AddExecute(); + void AddCycles_MW_2(); + void DelayIfITCM_2(); + void JumpTo_2(); + void JumpTo_3A(); + void JumpTo_3B(); + void JumpTo_3C(); + void JumpTo_4(); + void CodeRead32_2(); + void CodeRead32_3(); + void CodeRead32_4(); + void ICacheLookup_2(); + void DAbortHandle(); + void DCacheFin8(); + void DRead8_2(); + void DRead8_3(); + void DRead8_4(); + void DRead8_5(); + void DCacheFin16(); + void DRead16_2(); + void DRead16_3(); + void DRead16_4(); + void DRead16_5(); + void DCacheFin32(); + void DRead32_2(); + void DRead32_3(); + void DRead32_4(); + void DRead32_5(); + void DRead32S_2(); + void DRead32S_3(); + void DRead32S_4(); + void DRead32S_5A(); + void DRead32S_5B(); + void DWrite8_2(); + void DWrite8_3(); + void DWrite8_4(); + void DWrite8_5(); + void DWrite16_2(); + void DWrite16_3(); + void DWrite16_4(); + void DWrite16_5(); + void DWrite32_2(); + void DWrite32_3(); + void DWrite32_4(); + void DWrite32_5(); + void DWrite32S_2(); + void DWrite32S_3(); + void DWrite32S_4(); + void DWrite32S_5A(); + void DWrite32S_5B(); + void WBCheck_2(); + void ICachePrefetch_2(); + void DCacheLookup_2(); + void DCacheLookup_3(); + void DCClearAddr_2(); + void DCClearSetWay_2(); + void DCClearInvalidateAddr_2(); + void DCClearInvalidateSetWay_2(); + void SetupInterlock_2(); + void HandleInterlocksExecute_2(); + void HandleInterlocksMemory_2(); + void ForceInterlock_2(); + void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); } + void SignExtend8() { R[ExtReg] = (s8)R[ExtReg]; } + void SignExtend16() { R[ExtReg] = (s16)R[ExtReg]; } + void ROR32() { R[ExtReg] = ROR(R[ExtReg], ExtROROffs); } + + + + u32 CP15Control; //! CP15 Register 1: Control Register + + u32 RNGSeed; //! Global cache line fill seed. Used for pseudo random replacement strategy with the instruction and data cache + + u32 DTCMSetting; //! CP15 Register 9 Intermediate 1 Opcode2 0: Data Tightly-Coupled Memory register + u32 ITCMSetting; //! CP15 Register 9 Intermediate 1 Opcode2 1: Instruction Tightly-Coupled Memory register + u32 DCacheLockDown; //! CP15 Register 9 Intermediate 0 Opcode2 0: Data Cache Lockdown Register + u32 ICacheLockDown; //! CP15 Register 9 Intermediate 0 Opcode2 1: Instruction Cache Lockdown Register + u32 CacheDebugRegisterIndex; //! CP15: Cache Debug Index Register + u32 CP15TraceProcessId; //! CP15: Trace Process Id Register + u32 CP15BISTTestStateRegister; //! CP15: BIST Test State Register // for aarch64 JIT they need to go up here // to be addressable by a 12-bit immediate - u32 ITCMSize; - u32 DTCMBase, DTCMMask; - s32 RegionCodeCycles; + u32 ITCMSize; //! Internal: Size of the memory ITCM is mapped to. @ref ITCM data repeats every @ref ITCMPhysicalSize withhin + u32 DTCMBase; //! Internal: DTCMBase Address. The DTCM can be accessed if the address & ~ @ref DTCMMask is equal to thhis base address + u32 DTCMMask; //! Internal: DTCM Address Mask used in conjunction with @ref DTCMBase to check for DTCM access + s32 RegionCodeCycles; //! Internal: Cached amount of cycles to fetch instruction from the current code region. - u8 ITCM[ITCMPhysicalSize]; - u8* DTCM; + alignas(u32) u8 ITCM[ITCMPhysicalSize]; //! Content of the ITCM + u8* DTCM; //! Content of the DTCM - u8 ICache[0x2000]; - u32 ICacheTags[64*4]; - u8 ICacheCount[64]; + alignas(u32) u8 ICache[ICACHE_SIZE]; //! Instruction Cache Content organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS times @ref ICACHE_LINELENGTH bytes + u32 ICacheTags[ICACHE_LINESPERSET*ICACHE_SETS]; //! Instruction Cache Tags organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS Tags + u8 ICacheCount; //! Global instruction line fill counter. Used for round-robin replacement strategy with the instruction cache - u32 PU_CodeCacheable; - u32 PU_DataCacheable; - u32 PU_DataCacheWrite; + alignas(u32) u8 DCache[DCACHE_SIZE]; //! Data Cache Content organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS times @ref DCACHE_LINELENGTH bytes + u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; //! Data Cache Tags organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS Tags + u8 DCacheCount; //! Global data line fill counter. Used for round-robin replacement strategy with the instruction cache - u32 PU_CodeRW; - u32 PU_DataRW; + u32 PU_CodeCacheable; //! CP15 Register 2 Opcode2 1: Code Cachable Bits + u32 PU_DataCacheable; //! CP15 Register 2 Opcode2 0: Data Cachable Bits + u32 PU_WriteBufferability; //! CP15 Register 3 Opcode2 0: Write Buffer Control Register - u32 PU_Region[8]; + u32 PU_CodeRW; //! CP15 Register 5 Opcode2 3: Code Access Permission register + u32 PU_DataRW; //! CP15 Register 5 Opcode2 2: Data Access Permission register + + u32 PU_Region[CP15_REGION_COUNT]; //! CP15 Register 6 Opcode2 0..7: Protection Region Base and Size Register // 0=dataR 1=dataW 2=codeR 4=datacache 5=datawrite 6=codecache - u8 PU_PrivMap[0x100000]; - u8 PU_UserMap[0x100000]; - - // games operate under system mode, generally - //#define PU_Map PU_PrivMap - u8* PU_Map; + u8 PU_PrivMap[CP15_MAP_ENTRYCOUNT]; /** + * Memory mapping flags for Privileged Modes + * Bits: + * 0 - CP15_MAP_READABLE + * 1 - CP15_MAP_WRITEABLE + * 2 - CP15_MAP_EXECUTABLE + * 4 - CP15_MAP_DCACHEABLE + * 5 - CP15_MAP_BUFFERABLE + * 6 - CP15_MAP_ICACHEABLE + */ + u8 PU_UserMap[CP15_MAP_ENTRYCOUNT]; //! Memory mapping flags for User Mode + u8* PU_Map; //! Current valid Region Mapping (is either @ref PU_PrivMap or PU_UserMap) // code/16N/32N/32S - u8 MemTimings[0x100000][4]; - - u8* CurICacheLine; + u8 MemTimings[0x40000][3]; bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); + alignas(64) void (ARMv5::*DelayedQueue)(void); // adding more than one new entry to the queue while it's already active does not work. so uh. we use this to work around that. it's less than ideal... + void (ARMv5::*StartExec)(void); + void (ARMv5::*FuncQueue[32])(void); + u64 ITCMTimestamp; + u64 TimestampMemory; + bool Store; + s8 ITCMDelay; + u32 QueuedDCacheLine; + u32 CP15Queue; + + u8 ILCurrReg; + u8 ILPrevReg; + u64 ILCurrTime; + u64 ILPrevTime; + u8 ILQueueReg; + s8 ILQueueDelay; + u8 ILQueueMemReg; + u8 ILQueueTimes[16]; + u16 ILQueueMask; + + u8 ICacheStreamPtr; + u8 DCacheStreamPtr; + u64 ICacheStreamTimes[7]; + u64 DCacheStreamTimes[7]; + + s8 ILForceDelay; + u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing + u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing + u8 WBWriting; // whether the buffer is actively trying to perform a write + u32 WBCurAddr; // address the write buffer is currently writing to + u64 WBCurVal; // current value being written; 0-31: val | 61-63: flag; 0 = byte ns; 1 = halfword ns; 2 = word ns; 3 = word s; 4 = address (invalid in this variable) + u32 WBAddrQueued[40]; + u32 storeaddr[16]; // temp until i figure out why using the fifo address entries directly didn't work + u64 WBValQueued[40]; + u64 WriteBufferFifo[16]; // 0-31: val | 61-63: flag; 0 = byte ns; 1 = halfword ns; 2 = word ns; 3 = word s; 4 = address + u64 WBTimestamp; // current timestamp + //u64 WBMainRAMDelay; // timestamp used to emulate the delay before the next main ram write can begin + u64 WBDelay; // timestamp in bus cycles use for the delay before next write to the write buffer can occur (seems to be a 1 cycle delay after a write to it) + u32 WBLastRegion; // the last region written to by the write buffer + u64 WBReleaseTS; // the timestamp on which the write buffer relinquished control of the bus back + u64 WBInitialTS; // what cycle the entry was first sent in + #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; void WriteMem(u32 addr, int size, u32 v) override; @@ -384,36 +1071,116 @@ class ARMv4 : public ARM { public: ARMv4(melonDS::NDS& nds, std::optional gdb, bool jit); + + void Reset() override; + + void DoSavestate(Savestate* file) override; void FillPipeline() override; - void JumpTo(u32 addr, bool restorecpsr = false) override; + void JumpTo(u32 addr, bool restorecpsr = false, u8 R15 = 0) override; template void Execute(); - u16 CodeRead16(u32 addr) - { - return BusRead16(addr); - } + alignas(64) void (ARMv4::*StartExec)(void); + void (ARMv4::*FuncQueue[32])(void); + bool Nonseq; - u32 CodeRead32(u32 addr) - { - return BusRead32(addr); - } + void CodeRead16(u32 addr); + void CodeRead32(u32 addr); - void DataRead8(u32 addr, u32* val) override; - void DataRead16(u32 addr, u32* val) override; - void DataRead32(u32 addr, u32* val) override; - void DataRead32S(u32 addr, u32* val) override; - void DataWrite8(u32 addr, u8 val) override; - void DataWrite16(u32 addr, u16 val) override; - void DataWrite32(u32 addr, u32 val) override; - void DataWrite32S(u32 addr, u32 val) override; + bool DataRead8(u32 addr, u8 reg) override; + bool DataRead16(u32 addr, u8 reg) override; + bool DataRead32(u32 addr, u8 reg) override; + bool DataRead32S(u32 addr, u8 reg) override; + bool DataWrite8(u32 addr, u8 val, u8 reg) override; + bool DataWrite16(u32 addr, u16 val, u8 reg) override; + bool DataWrite32(u32 addr, u32 val, u8 reg) override; + bool DataWrite32S(u32 addr, u32 val, u8 reg) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; void AddCycles_CD() override; + + void QueueFunction(void (ARMv4::*QueueEntry)(void)); + + int GetIDFromQueueFunc(void (ARMv4::*funcptr)(void)) + { + if (funcptr == &ARMv4::StartExecARM) return 0; + else if (funcptr == &ARMv4::StartExecTHUMB) return 1; + else if (funcptr == &ARMv4::UpdateNextInstr1) return 2; + else if (funcptr == &ARMv4::JumpTo_2) return 3; + else if (funcptr == &ARMv4::JumpTo_3A) return 4; + else if (funcptr == &ARMv4::JumpTo_3B) return 5; + else if (funcptr == &ARMv4::DRead8_2) return 6; + else if (funcptr == &ARMv4::DRead16_2) return 7; + else if (funcptr == &ARMv4::DRead32_2) return 8; + else if (funcptr == &ARMv4::DRead32S_2) return 9; + else if (funcptr == &ARMv4::DWrite8_2) return 10; + else if (funcptr == &ARMv4::DWrite16_2) return 11; + else if (funcptr == &ARMv4::DWrite32_2) return 12; + else if (funcptr == &ARMv4::DWrite32S_2) return 13; + else if (funcptr == &ARMv4::AddExecute) return 14; + else if (funcptr == &ARMv4::AddExtraCycle) return 15; + else if (funcptr == &ARMv4::QueueUpdateMode) return 16; + else if (funcptr == &ARMv4::SignExtend8) return 17; + else if (funcptr == &ARMv4::SignExtend16) return 18; + else if (funcptr == &ARMv4::ROR32) return 19; + else { Platform::Log(Platform::LogLevel::Error, "ARM7: INVALID FUNCTION POINTER FOR SAVESTATES; DID SOMEONE FORGET TO UPDATE SERIALIZATION?\n"); return -1; } + } + + typedef void (ARMv4::*funcptrA7)(void); + funcptrA7 GetQueueFuncFromID(int funcid) + { + switch (funcid) + { + case 0: return &ARMv4::StartExecARM; + case 1: return &ARMv4::StartExecTHUMB; + case 2: return &ARMv4::UpdateNextInstr1; + case 3: return &ARMv4::JumpTo_2; + case 4: return &ARMv4::JumpTo_3A; + case 5: return &ARMv4::JumpTo_3B; + case 6: return &ARMv4::DRead8_2; + case 7: return &ARMv4::DRead16_2; + case 8: return &ARMv4::DRead32_2; + case 9: return &ARMv4::DRead32S_2; + case 10: return &ARMv4::DWrite8_2; + case 11: return &ARMv4::DWrite16_2; + case 12: return &ARMv4::DWrite32_2; + case 13: return &ARMv4::DWrite32S_2; + case 14: return &ARMv4::AddExecute; + case 15: return &ARMv4::AddExtraCycle; + case 16: return &ARMv4::QueueUpdateMode; + case 17: return &ARMv4::SignExtend8; + case 18: return &ARMv4::SignExtend16; + case 19: return &ARMv4::ROR32; + default: Platform::Log(Platform::LogLevel::Error, "ARM7: INVALID FUNCTION ID FOR LOADING SAVESTATES; EITHER THE SAVESTATE IS BORKED OR SOMEONE FORGOT TO UPDATE SERIALIZATION\n"); return nullptr; + } + } + + // Queue Functions + void StartExecARM(); + void StartExecTHUMB(); + void UpdateNextInstr1() { NextInstr[1] = RetVal; } + void JumpTo_2(); + void JumpTo_3A(); + void JumpTo_3B(); + void DRead8_2(); + void DRead16_2(); + void DRead32_2(); + void DRead32S_2(); + void DWrite8_2(); + void DWrite16_2(); + void DWrite32_2(); + void DWrite32S_2(); + void AddExecute(); + void AddExtraCycle(); + void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); } + void SignExtend8() { if (!(LDRFailedRegs & 1<CheckInterlock) return; + cpu->AddCycles_C(); Log(LogLevel::Warn, "undefined ARM%d instruction %08X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-8); #ifdef GDBSTUB_ENABLED cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-8); @@ -49,11 +51,14 @@ void A_UNK(ARM* cpu) cpu->R_UND[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 4; + cpu->JumpTo(cpu->ExceptionBase + 0x04); } void T_UNK(ARM* cpu) { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); Log(LogLevel::Warn, "undefined THUMB%d instruction %04X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-4); #ifdef GDBSTUB_ENABLED cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-4); @@ -66,13 +71,27 @@ void T_UNK(ARM* cpu) cpu->R_UND[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 2; + cpu->JumpTo(cpu->ExceptionBase + 0x04); } +void A_BKPT(ARM* cpu) +{ + if (cpu->CheckInterlock) return; + if (cpu->Num == 1) return A_UNK(cpu); // checkme + + Log(LogLevel::Warn, "BKPT: "); // combine with the prefetch abort warning message + ((ARMv5*)cpu)->PrefetchAbort(); +} + void A_MSR_IMM(ARM* cpu) { + if (cpu->CheckInterlock) return; + if ((cpu->Num != 1) && (cpu->CurInstr & ((0x7<<16)|(1<<22)))) cpu->AddCycles_CI(2); // arm9 cpsr_sxc & spsr + else cpu->AddCycles_C(); + u32* psr; if (cpu->CurInstr & (1<<22)) { @@ -90,7 +109,6 @@ void A_MSR_IMM(ARM* cpu) case 0x1A: case 0x1B: psr = &cpu->R_UND[2]; break; default: - cpu->AddCycles_C(); return; } } @@ -101,12 +119,9 @@ void A_MSR_IMM(ARM* cpu) u32 mask = 0; if (cpu->CurInstr & (1<<16)) mask |= 0x000000FF; - if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; - if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; - if (cpu->CurInstr & (1<<19)) mask |= 0xFF000000; - - if (!(cpu->CurInstr & (1<<22))) - mask &= 0xFFFFFFDF; + //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; // unused by arm 7 & 9 + //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; // unused by arm 7 & 9 + if (cpu->CurInstr & (1<<19)) mask |= ((cpu->Num==1) ? 0xF0000000 : 0xF8000000); if ((cpu->CPSR & 0x1F) == 0x10) mask &= 0xFFFFFF00; @@ -121,11 +136,29 @@ void A_MSR_IMM(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - cpu->AddCycles_C(); + if (cpu->CPSR & 0x20) [[unlikely]] + { + if (cpu->Num == 0) + { + cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. + ((ARMv5*)cpu)->StartExec = &ARMv5::StartExecTHUMB; + if (cpu->MRTrack.Type == MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[0] = ((ARMv5*)cpu)->StartExec; + } + else + { + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least + } + } } void A_MSR_REG(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr & 0xF); + + if ((cpu->Num != 1) && (cpu->CurInstr & ((0x7<<16)|(1<<22)))) cpu->AddCycles_CI(2); // arm9 cpsr_sxc & spsr + else cpu->AddCycles_C(); + u32* psr; if (cpu->CurInstr & (1<<22)) { @@ -143,7 +176,6 @@ void A_MSR_REG(ARM* cpu) case 0x1A: case 0x1B: psr = &cpu->R_UND[2]; break; default: - cpu->AddCycles_C(); return; } } @@ -154,12 +186,9 @@ void A_MSR_REG(ARM* cpu) u32 mask = 0; if (cpu->CurInstr & (1<<16)) mask |= 0x000000FF; - if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; - if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; - if (cpu->CurInstr & (1<<19)) mask |= 0xFF000000; - - if (!(cpu->CurInstr & (1<<22))) - mask &= 0xFFFFFFDF; + //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; // unused by arm 7 & 9 + //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; // unused by arm 7 & 9 + if (cpu->CurInstr & (1<<19)) mask |= ((cpu->Num==1) ? 0xF0000000 : 0xF8000000); if ((cpu->CPSR & 0x1F) == 0x10) mask &= 0xFFFFFF00; @@ -174,11 +203,25 @@ void A_MSR_REG(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - cpu->AddCycles_C(); + if (cpu->CPSR & 0x20) [[unlikely]] + { + if (cpu->Num == 0) + { + cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. + ((ARMv5*)cpu)->StartExec = &ARMv5::StartExecTHUMB; + if (cpu->MRTrack.Type == MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[0] = ((ARMv5*)cpu)->StartExec; + } + else + { + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least + } + } } void A_MRS(ARM* cpu) { + if (cpu->CheckInterlock) return; u32 psr; if (cpu->CurInstr & (1<<22)) { @@ -201,8 +244,19 @@ void A_MRS(ARM* cpu) else psr = cpu->CPSR; - cpu->R[(cpu->CurInstr>>12) & 0xF] = psr; - cpu->AddCycles_C(); + if (cpu->Num != 1) // arm9 + { + cpu->AddCycles_CI(2); // 1 X + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + } + else cpu->AddCycles_C(); // arm7 + + if (((cpu->CurInstr>>12) & 0xF) == 15) + { + if (cpu->Num == 1) // doesn't seem to jump on the arm9? checkme + cpu->JumpTo(psr & ~0x1); // checkme: this shouldn't be able to switch to thumb? + } + else cpu->R[(cpu->CurInstr>>12) & 0xF] = psr; } @@ -211,15 +265,19 @@ void A_MCR(ARM* cpu) if ((cpu->CPSR & 0x1F) == 0x10) return A_UNK(cpu); + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr>>12)&0xF); + u32 cp = (cpu->CurInstr >> 8) & 0xF; - //u32 op = (cpu->CurInstr >> 21) & 0x7; + u32 op = (cpu->CurInstr >> 21) & 0x7; u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; + u32 val = cpu->R[(cpu->CurInstr>>12)&0xF]; + if (((cpu->CurInstr>>12) & 0xF) == 15) val += 4; if (cpu->Num==0 && cp==15) { - ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo, cpu->R[(cpu->CurInstr>>12)&0xF]); + ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo|(op<<12), val); } else if (cpu->Num==1 && cp==14) { @@ -227,11 +285,12 @@ void A_MCR(ARM* cpu) } else { - Log(LogLevel::Warn, "bad MCR opcode p%d,%d,%d,%d on ARM%d\n", cp, cn, cm, cpinfo, cpu->Num?7:9); + Log(LogLevel::Warn, "bad MCR opcode p%d, %d, reg, c%d, c%d, %d on ARM%d\n", cp, op, cn, cm, cpinfo, cpu->Num?7:9); return A_UNK(cpu); // TODO: check what kind of exception it really is } - - cpu->AddCycles_CI(1 + 1); // TODO: checkme + + if (cpu->Num==0) cpu->AddCycles_CI(5); // checkme + else /* ARM7 */ cpu->AddCycles_CI(1 + 1); // TODO: checkme } void A_MRC(ARM* cpu) @@ -239,15 +298,24 @@ void A_MRC(ARM* cpu) if ((cpu->CPSR & 0x1F) == 0x10) return A_UNK(cpu); + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr>>12)&0xF); + u32 cp = (cpu->CurInstr >> 8) & 0xF; - //u32 op = (cpu->CurInstr >> 21) & 0x7; + u32 op = (cpu->CurInstr >> 21) & 0x7; u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; + u32 rd = (cpu->CurInstr>>12) & 0xF; if (cpu->Num==0 && cp==15) { - cpu->R[(cpu->CurInstr>>12)&0xF] = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo); + if (rd != 15) cpu->R[rd] = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo|(op<<12)); + else + { + // r15 updates the top 4 bits of the cpsr, done to "allow for conditional branching based on coprocessor status" + u32 flags = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo|(op<<12)) & 0xF0000000; + cpu->CPSR = (cpu->CPSR & ~0xF0000000) | flags; + } } else if (cpu->Num==1 && cp==14) { @@ -255,17 +323,25 @@ void A_MRC(ARM* cpu) } else { - Log(LogLevel::Warn, "bad MRC opcode p%d,%d,%d,%d on ARM%d\n", cp, cn, cm, cpinfo, cpu->Num?7:9); + Log(LogLevel::Warn, "bad MRC opcode p%d, %d, reg, c%d, c%d, %d on ARM%d\n", cp, op, cn, cm, cpinfo, cpu->Num?7:9); return A_UNK(cpu); // TODO: check what kind of exception it really is } - cpu->AddCycles_CI(2 + 1); // TODO: checkme + if (cpu->Num != 1) + { + cpu->AddCycles_CI(2); // 1 Execute cycle + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 Memory cycles + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); + } + else cpu->AddCycles_CI(2 + 1); // TODO: checkme } -void A_SVC(ARM* cpu) +void A_SVC(ARM* cpu) // A_SWI { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); u32 oldcpsr = cpu->CPSR; cpu->CPSR &= ~0xBF; cpu->CPSR |= 0x93; @@ -273,11 +349,14 @@ void A_SVC(ARM* cpu) cpu->R_SVC[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 4; + cpu->JumpTo(cpu->ExceptionBase + 0x08); } -void T_SVC(ARM* cpu) +void T_SVC(ARM* cpu) // T_SWI { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); u32 oldcpsr = cpu->CPSR; cpu->CPSR &= ~0xBF; cpu->CPSR |= 0x93; @@ -285,6 +364,7 @@ void T_SVC(ARM* cpu) cpu->R_SVC[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 2; + cpu->JumpTo(cpu->ExceptionBase + 0x08); } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 1066ac69..4c5ddafe 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -36,6 +36,7 @@ void A_MRS(ARM* cpu); void A_MCR(ARM* cpu); void A_MRC(ARM* cpu); void A_SVC(ARM* cpu); +void A_BKPT(ARM* cpu); void T_SVC(ARM* cpu); diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 167e184e..3e80f423 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -19,6 +19,7 @@ #include #include "ARM.h" #include "NDS.h" +#include "ARMInterpreter_MultiplySuperLLE.h" namespace melonDS::ARMInterpreter { @@ -83,25 +84,25 @@ inline bool OverflowSbc(u32 a, u32 b, u32 carry) #define LSL_IMM_S(x, s) \ if (s > 0) \ { \ - cpu->SetC(x & (1<<(32-s))); \ + if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(32-s))); \ x <<= s; \ } #define LSR_IMM_S(x, s) \ if (s == 0) { \ - cpu->SetC(x & (1<<31)); \ + if (!cpu->CheckInterlock) cpu->SetC(x & (1<<31)); \ x = 0; \ } else { \ - cpu->SetC(x & (1<<(s-1))); \ + if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(s-1))); \ x >>= s; \ } #define ASR_IMM_S(x, s) \ if (s == 0) { \ - cpu->SetC(x & (1<<31)); \ + if (!cpu->CheckInterlock) cpu->SetC(x & (1<<31)); \ x = ((s32)x) >> 31; \ } else { \ - cpu->SetC(x & (1<<(s-1))); \ + if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(s-1))); \ x = ((s32)x) >> s; \ } @@ -110,11 +111,11 @@ inline bool OverflowSbc(u32 a, u32 b, u32 carry) { \ u32 newc = (x & 1); \ x = (x >> 1) | ((cpu->CPSR & 0x20000000) << 2); \ - cpu->SetC(newc); \ + if (!cpu->CheckInterlock) cpu->SetC(newc); \ } \ else \ { \ - cpu->SetC(x & (1<<(s-1))); \ + if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(s-1))); \ x = ROR(x, s); \ } @@ -134,40 +135,49 @@ inline bool OverflowSbc(u32 a, u32 b, u32 carry) x = ROR(x, (s&0x1F)); #define LSL_REG_S(x, s) \ - if (s > 31) { cpu->SetC((s>32) ? 0 : (x & (1<<0))); x = 0; } \ - else if (s > 0) { cpu->SetC(x & (1<<(32-s))); x <<= s; } + if (s > 31) { if (!cpu->CheckInterlock) cpu->SetC((s>32) ? 0 : (x & (1<<0))); x = 0; } \ + else if (s > 0) { if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(32-s))); x <<= s; } #define LSR_REG_S(x, s) \ - if (s > 31) { cpu->SetC((s>32) ? 0 : (x & (1<<31))); x = 0; } \ - else if (s > 0) { cpu->SetC(x & (1<<(s-1))); x >>= s; } + if (s > 31) { if (!cpu->CheckInterlock) cpu->SetC((s>32) ? 0 : (x & (1<<31))); x = 0; } \ + else if (s > 0) { if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(s-1))); x >>= s; } #define ASR_REG_S(x, s) \ - if (s > 31) { cpu->SetC(x & (1<<31)); x = ((s32)x) >> 31; } \ - else if (s > 0) { cpu->SetC(x & (1<<(s-1))); x = ((s32)x) >> s; } + if (s > 31) { if (!cpu->CheckInterlock) cpu->SetC(x & (1<<31)); x = ((s32)x) >> 31; } \ + else if (s > 0) { if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(s-1))); x = ((s32)x) >> s; } #define ROR_REG_S(x, s) \ - if (s > 0) cpu->SetC(x & (1<<(s-1))); \ + if (s > 0) if (!cpu->CheckInterlock) cpu->SetC(x & (1<<(s-1))); \ x = ROR(x, (s&0x1F)); #define A_CALC_OP2_IMM \ - u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); + u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); \ + u16 ilmask = 0; \ + u8 iltime[16]; #define A_CALC_OP2_IMM_S \ u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); \ if ((cpu->CurInstr>>7)&0x1E) \ - cpu->SetC(b & 0x80000000); + if (!cpu->CheckInterlock) cpu->SetC(b & 0x80000000); \ + u16 ilmask = 0; \ + u8 iltime[16]; #define A_CALC_OP2_REG_SHIFT_IMM(shiftop) \ u32 b = cpu->R[cpu->CurInstr&0xF]; \ u32 s = (cpu->CurInstr>>7)&0x1F; \ - shiftop(b, s); + shiftop(b, s); \ + u16 ilmask = 1 << (cpu->CurInstr&0xF); \ + u8 iltime[16]; iltime[cpu->CurInstr&0xF] = 0; #define A_CALC_OP2_REG_SHIFT_REG(shiftop) \ u32 b = cpu->R[cpu->CurInstr&0xF]; \ if ((cpu->CurInstr&0xF)==15) b += 4; \ - shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); + shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); \ + u16 ilmask = (1 << (cpu->CurInstr&0xF)) | (1 << ((cpu->CurInstr>>8)&0xF)); \ + u8 iltime[16]; iltime[(cpu->CurInstr>>8)&0xF] = 0; \ + iltime[cpu->CurInstr&0xF] = 1; // REMINDER: THIS IS WRONG, THIS CAN OVERWRITE LOWER VALUES. #define A_IMPLEMENT_ALU_OP(x,s) \ @@ -314,7 +324,9 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ #define A_AND(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -327,7 +339,9 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ #define A_AND_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ cpu->SetNZ(res & 0x80000000, \ !res); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -345,7 +359,9 @@ A_IMPLEMENT_ALU_OP(AND,_S) #define A_EOR(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a ^ b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -358,7 +374,9 @@ A_IMPLEMENT_ALU_OP(AND,_S) #define A_EOR_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a ^ b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ cpu->SetNZ(res & 0x80000000, \ !res); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -376,7 +394,9 @@ A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a - b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -389,7 +409,9 @@ A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a - b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ cpu->SetNZCV(res & 0x80000000, \ !res, \ CarrySub(a, b), \ @@ -409,7 +431,9 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = b - a; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -422,7 +446,9 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = b - a; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ cpu->SetNZCV(res & 0x80000000, \ !res, \ CarrySub(b, a), \ @@ -442,7 +468,9 @@ A_IMPLEMENT_ALU_OP(RSB,) #define A_ADD(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a + b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -456,6 +484,7 @@ A_IMPLEMENT_ALU_OP(RSB,) #define A_ADD_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ cpu->SetNZCV(res & 0x80000000, \ !res, \ CarryAdd(a, b), \ @@ -474,6 +503,8 @@ A_IMPLEMENT_ALU_OP(ADD,) #define A_ADC(c) \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b + (cpu->CPSR&0x20000000 ? 1:0); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -487,6 +518,8 @@ A_IMPLEMENT_ALU_OP(ADD,) } #define A_ADC_S(c) \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res_tmp = a + b; \ u32 carry = (cpu->CPSR&0x20000000 ? 1:0); \ @@ -509,6 +542,8 @@ A_IMPLEMENT_ALU_OP(ADC,) #define A_SBC(c) \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b - (cpu->CPSR&0x20000000 ? 0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -522,6 +557,8 @@ A_IMPLEMENT_ALU_OP(ADC,) } #define A_SBC_S(c) \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res_tmp = a - b; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ @@ -544,6 +581,8 @@ A_IMPLEMENT_ALU_OP(SBC,) #define A_RSC(c) \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = b - a - (cpu->CPSR&0x20000000 ? 0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -557,6 +596,8 @@ A_IMPLEMENT_ALU_OP(SBC,) } #define A_RSC_S(c) \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res_tmp = b - a; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ @@ -580,51 +621,141 @@ A_IMPLEMENT_ALU_OP(RSC,) #define A_TST(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & b; \ - cpu->SetNZ(res & 0x80000000, \ - !res); \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ + { \ + if (cpu->Num == 1) \ + { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ + else cpu->JumpTo(res & ~1, true); /* TSTP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ + } A_IMPLEMENT_ALU_TEST(TST,_S) #define A_TEQ(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a ^ b; \ - cpu->SetNZ(res & 0x80000000, \ - !res); \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ + { \ + if (cpu->Num == 1) \ + { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ + else cpu->JumpTo(res & ~1, true); /* TEQP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ + } A_IMPLEMENT_ALU_TEST(TEQ,_S) #define A_CMP(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a - b; \ - cpu->SetNZCV(res & 0x80000000, \ - !res, \ - CarrySub(a, b), \ - OverflowSub(a, b)); \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ + { \ + if (cpu->Num == 1) \ + { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarrySub(a, b), \ + OverflowSub(a, b)); \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ + else cpu->JumpTo(res & ~1, true); /* CMPP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarrySub(a, b), \ + OverflowSub(a, b)); \ + } A_IMPLEMENT_ALU_TEST(CMP,) #define A_CMN(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a + b; \ - cpu->SetNZCV(res & 0x80000000, \ - !res, \ - CarryAdd(a, b), \ - OverflowAdd(a, b)); \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ + { \ + if (cpu->Num == 1) \ + { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarryAdd(a, b), \ + OverflowAdd(a, b)); \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ + else cpu->JumpTo(res & ~1, true); /* CMNP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarryAdd(a, b), \ + OverflowAdd(a, b)); \ + } A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a | b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -637,7 +768,9 @@ A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a | b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ cpu->SetNZ(res & 0x80000000, \ !res); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -654,6 +787,7 @@ A_IMPLEMENT_ALU_OP(ORR,_S) #define A_MOV(c) \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -665,6 +799,7 @@ A_IMPLEMENT_ALU_OP(ORR,_S) } #define A_MOV_S(c) \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ cpu->SetNZ(b & 0x80000000, \ !b); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -700,7 +835,9 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & ~b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -713,7 +850,9 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & ~b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ cpu->SetNZ(res & 0x80000000, \ !res); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -731,6 +870,7 @@ A_IMPLEMENT_ALU_OP(BIC,_S) #define A_MVN(c) \ b = ~b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -743,6 +883,7 @@ A_IMPLEMENT_ALU_OP(BIC,_S) #define A_MVN_S(c) \ b = ~b; \ + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ cpu->SetNZ(b & 0x80000000, \ !b); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ @@ -764,28 +905,41 @@ void A_MUL(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u32 res = rm * rs; - - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ(res & 0x80000000, - !res); - if (cpu->Num==1) cpu->SetC(0); - } - - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); // S + else + { + cpu->AddCycles_CI(2); // 1 X + + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 1; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 2; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 3; else cycles = 4; + if (cpu->CurInstr & (1<<20)) cpu->SetC(MULSCarry(rm, rs, 0, cycles==4)); + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); + u32 res = rm * rs; + + // all multiply instructions fail writes to r15 on arm7/9 + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ(res & 0x80000000, + !res); + } } void A_MLA(ARM* cpu) @@ -794,28 +948,43 @@ void A_MLA(ARM* cpu) u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; - u32 res = (rm * rs) + rn; - - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ(res & 0x80000000, - !res); - if (cpu->Num==1) cpu->SetC(0); - } - - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + u8 iltime[16] = {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); + else + { + cpu->AddCycles_CI(2); // 1 X + + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(MULSCarry(rm, rs, rn, cycles==5)); + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); + u32 res = (rm * rs) + rn; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ(res & 0x80000000, + !res); + } } void A_UMULL(ARM* cpu) @@ -823,62 +992,92 @@ void A_UMULL(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u64 res = (u64)rm * (u64)rs; - - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(UMULLSCarry(0, rm, rs, cycles==5)); + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); + u64 res = (u64)rm * (u64)rs; + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + } } void A_UMLAL(ARM* cpu) { u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - - u64 res = (u64)rm * (u64)rs; - u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); - res += rd; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + u8 iltime[16] = {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF))/* | + (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(UMULLSCarry(rd, rm, rs, cycles==5)); + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); + u64 res = (u64)rm * (u64)rs; + + res += rd; + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + } } void A_SMULL(ARM* cpu) @@ -886,62 +1085,92 @@ void A_SMULL(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - s64 res = (s64)(s32)rm * (s64)(s32)rs; - - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(SMULLSCarry(0, rm, rs, cycles==5)); + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); + s64 res = (s64)(s32)rm * (s64)(s32)rs; + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + } } void A_SMLAL(ARM* cpu) { u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - - s64 res = (s64)(s32)rm * (s64)(s32)rs; - s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); - res += rd; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + u8 iltime[16] {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF)) /*| + (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(SMULLSCarry(rd, rm, rs, cycles==5)); + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); + s64 res = (s64)(s32)rm * (s64)(s32)rs; + + res += rd; + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + } } void A_SMLAxy(ARM* cpu) @@ -952,6 +1181,12 @@ void A_SMLAxy(ARM* cpu) u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u8 iltime[16] {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); + if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; if (cpu->CurInstr & (1<<6)) rs >>= 16; @@ -959,12 +1194,17 @@ void A_SMLAxy(ARM* cpu) u32 res_mul = ((s16)rm * (s16)rs); u32 res = res_mul + rn; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMLAWy(ARM* cpu) @@ -975,17 +1215,28 @@ void A_SMLAWy(ARM* cpu) u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u8 iltime[16] = {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); + if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; u32 res_mul = ((s64)(s32)rm * (s16)rs) >> 16; u32 res = res_mul + rn; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMULxy(ARM* cpu) @@ -995,15 +1246,23 @@ void A_SMULxy(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; u32 res = ((s16)rm * (s16)rs); + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMULWy(ARM* cpu) @@ -1013,13 +1272,21 @@ void A_SMULWy(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; u32 res = ((s64)(s32)rm * (s16)rs) >> 16; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMLALxy(ARM* cpu) @@ -1029,6 +1296,13 @@ void A_SMLALxy(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u8 iltime[16] {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF))/* | + (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); + if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; if (cpu->CurInstr & (1<<6)) rs >>= 16; @@ -1039,10 +1313,16 @@ void A_SMLALxy(ARM* cpu) s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); res += rd; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->AddCycles_CI(1); // TODO: interlock?? + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + + cpu->AddCycles_CI(2); // 1 X + + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } @@ -1053,6 +1333,8 @@ void A_CLZ(ARM* cpu) u32 val = cpu->R[cpu->CurInstr & 0xF]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr & 0xF); + u32 res = 0; while ((val & 0xFF000000) == 0) { @@ -1067,8 +1349,10 @@ void A_CLZ(ARM* cpu) val |= 0x1; } - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; cpu->AddCycles_C(); + + if (((cpu->CurInstr >> 12) & 0xF) == 15) cpu->JumpTo(res & ~1); + else cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; } void A_QADD(ARM* cpu) @@ -1078,6 +1362,8 @@ void A_QADD(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); + u32 res = rm + rn; if (OverflowAdd(rm, rn)) { @@ -1085,8 +1371,14 @@ void A_QADD(ARM* cpu) cpu->CPSR |= 0x08000000; } - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + // all saturated math instructions fail writes to r15 + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; + + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } void A_QSUB(ARM* cpu) @@ -1095,6 +1387,8 @@ void A_QSUB(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); u32 res = rm - rn; if (OverflowSub(rm, rn)) @@ -1102,9 +1396,14 @@ void A_QSUB(ARM* cpu) res = (res & 0x80000000) ? 0x7FFFFFFF : 0x80000000; cpu->CPSR |= 0x08000000; } + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } void A_QDADD(ARM* cpu) @@ -1113,6 +1412,8 @@ void A_QDADD(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); if (OverflowAdd(rn, rn)) { @@ -1128,9 +1429,14 @@ void A_QDADD(ARM* cpu) res = (res & 0x80000000) ? 0x7FFFFFFF : 0x80000000; cpu->CPSR |= 0x08000000; } + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } void A_QDSUB(ARM* cpu) @@ -1139,6 +1445,8 @@ void A_QDSUB(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); if (OverflowAdd(rn, rn)) { @@ -1154,9 +1462,14 @@ void A_QDSUB(ARM* cpu) res = (res & 0x80000000) ? 0x7FFFFFFF : 0x80000000; cpu->CPSR |= 0x08000000; } + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } @@ -1169,6 +1482,7 @@ void T_LSL_IMM(ARM* cpu) { u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 s = (cpu->CurInstr >> 6) & 0x1F; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); LSL_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; cpu->SetNZ(op & 0x80000000, @@ -1180,6 +1494,7 @@ void T_LSR_IMM(ARM* cpu) { u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 s = (cpu->CurInstr >> 6) & 0x1F; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); LSR_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; cpu->SetNZ(op & 0x80000000, @@ -1191,6 +1506,7 @@ void T_ASR_IMM(ARM* cpu) { u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 s = (cpu->CurInstr >> 6) & 0x1F; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); ASR_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; cpu->SetNZ(op & 0x80000000, @@ -1202,6 +1518,7 @@ void T_ADD_REG_(ARM* cpu) { u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 6) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << ((cpu->CurInstr >> 3) & 0x7)) | (1 << ((cpu->CurInstr >> 6) & 0x7))); u32 res = a + b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1215,6 +1532,7 @@ void T_SUB_REG_(ARM* cpu) { u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 6) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << ((cpu->CurInstr >> 3) & 0x7)) | (1 << ((cpu->CurInstr >> 6) & 0x7))); u32 res = a - b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1228,6 +1546,7 @@ void T_ADD_IMM_(ARM* cpu) { u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 b = (cpu->CurInstr >> 6) & 0x7; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); u32 res = a + b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1241,6 +1560,7 @@ void T_SUB_IMM_(ARM* cpu) { u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 b = (cpu->CurInstr >> 6) & 0x7; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); u32 res = a - b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1253,6 +1573,7 @@ void T_SUB_IMM_(ARM* cpu) void T_MOV_IMM(ARM* cpu) { u32 b = cpu->CurInstr & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); cpu->R[(cpu->CurInstr >> 8) & 0x7] = b; cpu->SetNZ(0, !b); @@ -1263,6 +1584,7 @@ void T_CMP_IMM(ARM* cpu) { u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; u32 b = cpu->CurInstr & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); u32 res = a - b; cpu->SetNZCV(res & 0x80000000, !res, @@ -1275,6 +1597,7 @@ void T_ADD_IMM(ARM* cpu) { u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; u32 b = cpu->CurInstr & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); u32 res = a + b; cpu->R[(cpu->CurInstr >> 8) & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1288,6 +1611,7 @@ void T_SUB_IMM(ARM* cpu) { u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; u32 b = cpu->CurInstr & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); u32 res = a - b; cpu->R[(cpu->CurInstr >> 8) & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1302,6 +1626,7 @@ void T_AND_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res = a & b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1313,6 +1638,7 @@ void T_EOR_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res = a ^ b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1324,6 +1650,7 @@ void T_LSL_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); LSL_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1335,6 +1662,7 @@ void T_LSR_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); LSR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1346,6 +1674,7 @@ void T_ASR_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); ASR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1357,6 +1686,7 @@ void T_ADC_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res_tmp = a + b; u32 carry = (cpu->CPSR&0x20000000 ? 1:0); u32 res = res_tmp + carry; @@ -1372,6 +1702,7 @@ void T_SBC_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res_tmp = a - b; u32 carry = (cpu->CPSR&0x20000000 ? 0:1); u32 res = res_tmp - carry; @@ -1387,6 +1718,7 @@ void T_ROR_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); ROR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1398,6 +1730,7 @@ void T_TST_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res = a & b; cpu->SetNZ(res & 0x80000000, !res); @@ -1407,6 +1740,7 @@ void T_TST_REG(ARM* cpu) void T_NEG_REG(ARM* cpu) { u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); u32 res = -b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1420,6 +1754,7 @@ void T_CMP_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res = a - b; cpu->SetNZCV(res & 0x80000000, !res, @@ -1432,6 +1767,7 @@ void T_CMN_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res = a + b; cpu->SetNZCV(res & 0x80000000, !res, @@ -1444,6 +1780,7 @@ void T_ORR_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res = a | b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1455,31 +1792,33 @@ void T_MUL_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + + s32 cycles; + if (cpu->Num == 0) + { + cycles = 3; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); + } + else + { + if ((a & 0xFFFFFF00) == 0x00000000 || (a & 0xFFFFFF00) == 0xFFFFFF00) cycles = 1; + else if ((a & 0xFFFF0000) == 0x00000000 || (a & 0xFFFF0000) == 0xFFFF0000) cycles = 2; + else if ((a & 0xFF000000) == 0x00000000 || (a & 0xFF000000) == 0xFF000000) cycles = 3; + else cycles = 4; + cpu->SetC(MULSCarry(b, a, 0, cycles==4)); // carry flag destroyed, they say. whatever that means... + } + cpu->AddCycles_CI(cycles); // implemented as S variant, doesn't interlock u32 res = a * b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, !res); - - s32 cycles = 0; - if (cpu->Num == 0) - { - cycles += 3; - } - else - { - cpu->SetC(0); // carry flag destroyed, they say. whatever that means... - if (a & 0xFF000000) cycles += 4; - else if (a & 0x00FF0000) cycles += 3; - else if (a & 0x0000FF00) cycles += 2; - else cycles += 1; - } - cpu->AddCycles_CI(cycles); } void T_BIC_REG(ARM* cpu) { u32 a = cpu->R[cpu->CurInstr & 0x7]; u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); u32 res = a & ~b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1490,6 +1829,7 @@ void T_BIC_REG(ARM* cpu) void T_MVN_REG(ARM* cpu) { u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); u32 res = ~b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1506,11 +1846,13 @@ void T_ADD_HIREG(ARM* cpu) u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << rd) | (1 << rs)); + u32 a = cpu->R[rd]; u32 b = cpu->R[rs]; cpu->AddCycles_C(); - + if (rd == 15) { cpu->JumpTo((a + b) | 1); @@ -1526,6 +1868,8 @@ void T_CMP_HIREG(ARM* cpu) u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << rd) | (1 << rs)); + u32 a = cpu->R[rd]; u32 b = cpu->R[rs]; u32 res = a - b; @@ -1534,14 +1878,27 @@ void T_CMP_HIREG(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); + cpu->AddCycles_C(); + + if ((cpu->Num == 1) && (rd == 15)) + { + u32 oldpsr = cpu->CPSR; + cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. + if (!(cpu->CPSR & 0x20)) + { + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR |= 0x20; // keep it from crashing the emulator at least + } + } } void T_MOV_HIREG(ARM* cpu) { u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; - + + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((1 << rd) | (1 << rs)); cpu->AddCycles_C(); if (rd == 15) @@ -1569,22 +1926,27 @@ void T_MOV_HIREG(ARM* cpu) void T_ADD_PCREL(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(15); u32 val = cpu->R[15] & ~2; val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; + cpu->AddCycles_C(); } void T_ADD_SPREL(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(13); u32 val = cpu->R[13]; val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; + cpu->AddCycles_C(); } void T_ADD_SP(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(13); u32 val = cpu->R[13]; if (cpu->CurInstr & (1<<7)) val -= ((cpu->CurInstr & 0x7F) << 2); diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp index 623be41a..1f271632 100644 --- a/src/ARMInterpreter_Branch.cpp +++ b/src/ARMInterpreter_Branch.cpp @@ -27,12 +27,16 @@ using Platform::LogLevel; void A_B(ARM* cpu) { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); s32 offset = (s32)(cpu->CurInstr << 8) >> 6; cpu->JumpTo(cpu->R[15] + offset); } void A_BL(ARM* cpu) { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); s32 offset = (s32)(cpu->CurInstr << 8) >> 6; cpu->R[14] = cpu->R[15] - 4; cpu->JumpTo(cpu->R[15] + offset); @@ -40,6 +44,8 @@ void A_BL(ARM* cpu) void A_BLX_IMM(ARM* cpu) { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); s32 offset = (s32)(cpu->CurInstr << 8) >> 6; if (cpu->CurInstr & 0x01000000) offset += 2; cpu->R[14] = cpu->R[15] - 4; @@ -48,11 +54,15 @@ void A_BLX_IMM(ARM* cpu) void A_BX(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr&0xF); + cpu->AddCycles_C(); cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); } void A_BLX_REG(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr&0xF); + cpu->AddCycles_C(); u32 lr = cpu->R[15] - 4; cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); cpu->R[14] = lr; @@ -62,22 +72,26 @@ void A_BLX_REG(ARM* cpu) void T_BCOND(ARM* cpu) { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); if (cpu->CheckCondition((cpu->CurInstr >> 8) & 0xF)) { s32 offset = (s32)(cpu->CurInstr << 24) >> 23; cpu->JumpTo(cpu->R[15] + offset + 1); } - else - cpu->AddCycles_C(); } void T_BX(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0xF); + cpu->AddCycles_C(); cpu->JumpTo(cpu->R[(cpu->CurInstr >> 3) & 0xF]); } void T_BLX_REG(ARM* cpu) { + if (cpu->CheckInterlock) return ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0xF); + cpu->AddCycles_C(); if (cpu->Num==1) { Log(LogLevel::Warn, "!! THUMB BLX_REG ON ARM7\n"); @@ -91,12 +105,15 @@ void T_BLX_REG(ARM* cpu) void T_B(ARM* cpu) { + if (cpu->CheckInterlock) return; + cpu->AddCycles_C(); s32 offset = (s32)((cpu->CurInstr & 0x7FF) << 21) >> 20; cpu->JumpTo(cpu->R[15] + offset + 1); } void T_BL_LONG_1(ARM* cpu) { + if (cpu->CheckInterlock) return; s32 offset = (s32)((cpu->CurInstr & 0x7FF) << 21) >> 9; cpu->R[14] = cpu->R[15] + offset; cpu->AddCycles_C(); @@ -104,6 +121,12 @@ void T_BL_LONG_1(ARM* cpu) void T_BL_LONG_2(ARM* cpu) { + if ((cpu->CurInstr & 0x1801) == 0x0801) // "BLX" with bit 0 set is an undefined instruction. + return T_UNK(cpu); // TODO: Check ARM7 for exceptions + + if (cpu->CheckInterlock) return; + + cpu->AddCycles_C(); s32 offset = (cpu->CurInstr & 0x7FF) << 1; u32 pc = cpu->R[14] + offset; cpu->R[14] = (cpu->R[15] - 2) | 1; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index f7c24312..50ee6c8b 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -18,11 +18,19 @@ #include #include "ARM.h" +#include "NDS.h" namespace melonDS::ARMInterpreter { +template +inline bool ExecuteStage(ARM* cpu, u16 ilmask) +{ + if (cpu->CheckInterlock) { ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); return false;} + return true; +} + // copypasta from ALU. bad #define LSL_IMM(x, s) \ @@ -50,97 +58,209 @@ namespace melonDS::ARMInterpreter #define A_WB_CALC_OFFSET_IMM \ u32 offset = (cpu->CurInstr & 0xFFF); \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 0; #define A_WB_CALC_OFFSET_REG(shiftop) \ u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ u32 shift = ((cpu->CurInstr>>7)&0x1F); \ shiftop(offset, shift); \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 1 << (cpu->CurInstr & 0xF); +enum class Writeback +{ + None = 0, + Pre, + Post, + Trans, +}; + +template +void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 ilmask) +{ + cpu->LDRFailedRegs = 0; + static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); + + if (!ExecuteStage(cpu, (ilmask | (1<R[rn]; + else addr = cpu->R[rn]; + + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_UserMap; + } + + u32 oldrd = cpu->R[rd]; + bool dabort; + if constexpr (size == 8) dabort = !cpu->DataRead8 (addr, rd); + if constexpr (size == 16) dabort = !cpu->DataRead16(addr, rd); + if constexpr (size == 32) dabort = !cpu->DataRead32(addr, rd); + + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0 && (cpu->CPSR & 0x1F) != 0x10) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_PrivMap; + } + + cpu->AddCycles_CDI(); + if (dabort) [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } + + if constexpr (size == 8 && signextend) + { + cpu->ExtReg = rd; + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::SignExtend8); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::SignExtend8); + } + + if constexpr (size == 16) + { + if (cpu->Num == 1) + { + cpu->ExtReg = rd; + cpu->ExtROROffs = (addr & 0x1) * 8; + ((ARMv4*)cpu)->QueueFunction(&ARMv4::ROR32); // unaligned 16 bit loads are ROR'd on arm7 + + if constexpr (signextend) + { + if (addr&0x1) ((ARMv4*)cpu)->QueueFunction(&ARMv4::SignExtend8); // sign extend like an ldrsb if we ror'd the value. + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::SignExtend16); + } + } + else if constexpr (signextend) + { + cpu->ExtReg = rd; + ((ARMv5*)cpu)->QueueFunction(&ARMv5::SignExtend16); + } + } + + if constexpr (size == 32) + { + cpu->ExtReg = rd; + cpu->ExtROROffs = (addr & 0x3) * 8; + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::ROR32); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::ROR32); + } + + if constexpr (writeback >= Writeback::Post) addr += offset; + if constexpr (writeback != Writeback::None) + { + if (rn != 15) [[likely]] // r15 writeback fails on arm9 + { + if (rd != rn) cpu->R[rn] = addr; + } + else if (cpu->Num == 1) // arm 7 + { + cpu->R[rd] = oldrd; // note that at no point does it actually write the value it loaded into a register... + cpu->LDRFailedRegs = 1<JumpTo((addr+4) & ~1); // +4 cause reasons + return; + } + } + + if (rd == 15) + { + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock((size<32) || (addr&0x3)); + + cpu->JumpTo(cpu->R[15], false, 1); + } + else if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(rd, (size < 32) || (addr&0x3)); +} + +template +void StoreSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 ilmask) +{ + static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); + + if (!ExecuteStage(cpu, (ilmask | (1<R[rn]; + else addr = cpu->R[rn]; + + u32 storeval = cpu->R[rd]; + if (rd == 15) storeval += 4; + + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_UserMap; + } + + if (cpu->Num == 0) + ((ARMv5*)cpu)->HandleInterlocksMemory(rd); + + bool dabort; + if constexpr (size == 8) dabort = !cpu->DataWrite8 (addr, storeval, rd); + if constexpr (size == 16) dabort = !cpu->DataWrite16(addr, storeval, rd); + if constexpr (size == 32) dabort = !cpu->DataWrite32(addr, storeval, rd); + + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0 && (cpu->CPSR & 0x1F) != 0x10) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_PrivMap; + } + + cpu->AddCycles_CD(); + if (dabort) [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } + + if constexpr (writeback >= Writeback::Post) addr += offset; + if constexpr (writeback != Writeback::None) + { + if (rn != 15) [[likely]] + { + cpu->R[rn] = addr; + } + else if (cpu->Num == 1) // r15 writeback fails on arm9 + { + cpu->JumpTo(addr & ~1); + } + } +} #define A_STR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ - storeval += 4; \ - cpu->DataWrite32(offset, storeval); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CD(); + if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Pre, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<32, Writeback::None, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); -// TODO: user mode (bit21) #define A_STR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ - storeval += 4; \ - cpu->DataWrite32(addr, storeval); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CD(); + if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Trans, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<32, Writeback::Post, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_STRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite8(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CD(); + if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Pre, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<8, Writeback::None, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); -// TODO: user mode (bit21) #define A_STRB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite8(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CD(); + if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Trans, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<8, Writeback::Post, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead32(offset, &val); \ - val = ROR(val, ((offset&0x3)<<3)); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - { \ - if (cpu->Num==1) val &= ~0x1; \ - cpu->JumpTo(val); \ - } \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - } + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); -// TODO: user mode #define A_LDR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead32(addr, &val); \ - val = ROR(val, ((addr&0x3)<<3)); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - { \ - if (cpu->Num==1) val &= ~0x1; \ - cpu->JumpTo(val); \ - } \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - } + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead8(offset, &val); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CDI(); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRB PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); -// TODO: user mode #define A_LDRB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead8(addr, &val); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CDI(); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRB PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); @@ -215,113 +335,123 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_HD_CALC_OFFSET_IMM \ u32 offset = (cpu->CurInstr & 0xF) | ((cpu->CurInstr >> 4) & 0xF0); \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 0; #define A_HD_CALC_OFFSET_REG \ u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 1 << (cpu->CurInstr & 0xF); #define A_STRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite16(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CD(); + if (cpu->CurInstr & (1<<21)) StoreSingle<16, Writeback::Pre, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<16, Writeback::None, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_STRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite16(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CD(); + StoreSingle<16, Writeback::Post, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); // TODO: CHECK LDRD/STRD TIMINGS!! #define A_LDRD \ if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! MISALIGNED LDRD %d\n", r+1); } \ - cpu->DataRead32 (offset , &cpu->R[r ]); \ - cpu->DataRead32S(offset+4, &cpu->R[r+1]); \ - cpu->AddCycles_CDI(); + if (r&1) { A_UNK(cpu); return; } \ + cpu->LDRFailedRegs = 0; \ + if (!ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF)))) return; \ + bool dabort = !cpu->DataRead32(offset, r); \ + u32 oldval = cpu->R[r+1]; dabort |= !cpu->DataRead32S(offset+4, r+1); \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ + cpu->AddCycles_CDI(); \ + if (dabort) { \ + cpu->R[r+1] = oldval; \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ + if (r+1 == 15) { \ + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); \ + cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ + else { \ + if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(r+1); } \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRD_POST \ if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! MISALIGNED LDRD_POST %d\n", r+1); } \ - cpu->DataRead32 (addr , &cpu->R[r ]); \ - cpu->DataRead32S(addr+4, &cpu->R[r+1]); \ - cpu->AddCycles_CDI(); + if (r&1) { A_UNK(cpu); return; } \ + cpu->LDRFailedRegs = 0; \ + if (!ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF)))) return; \ + bool dabort = !cpu->DataRead32(addr, r); \ + u32 oldval = cpu->R[r+1]; dabort |= !cpu->DataRead32S(addr+4, r+1); \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ + cpu->AddCycles_CDI(); \ + if (dabort) { \ + cpu->R[r+1] = oldval; \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ + if (r+1 == 15) { \ + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); ; \ + cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ + else { \ + if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(r+1); } \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! MISALIGNED STRD %d\n", r+1); } \ - cpu->DataWrite32 (offset , cpu->R[r ]); \ - cpu->DataWrite32S(offset+4, cpu->R[r+1]); \ - cpu->AddCycles_CD(); + if (r&1) { A_UNK(cpu); return; } \ + if (!ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF)))) return; \ + ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ + bool dabort = !cpu->DataWrite32(offset, cpu->R[r], r); \ + u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ + dabort |= !cpu->DataWrite32S (offset+4, storeval, r+1); \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ + cpu->AddCycles_CD(); \ + if (dabort) [[unlikely]] { \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRD_POST \ if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! MISALIGNED STRD_POST %d\n", r+1); } \ - cpu->DataWrite32 (addr , cpu->R[r ]); \ - cpu->DataWrite32S(addr+4, cpu->R[r+1]); \ - cpu->AddCycles_CD(); + if (r&1) { A_UNK(cpu); return; } \ + if (!ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF)))) return; \ + ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ + bool dabort = !cpu->DataWrite32(addr, cpu->R[r], r); \ + u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ + dabort |= !cpu->DataWrite32S (addr+4, storeval, r+1); \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ + cpu->AddCycles_CD(); \ + if (dabort) [[unlikely]] { \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRH PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRH PC %08X\n", cpu->R[15]); \ + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->DataRead8(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSB PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->DataRead8(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSB PC %08X\n", cpu->R[15]); \ + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSH PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - cpu->AddCycles_CDI(); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSH PC %08X\n", cpu->R[15]); \ + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_IMPLEMENT_HD_LDRSTR(x) \ @@ -358,48 +488,142 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH) +template +inline void SWP(ARM* cpu) +{ + if (!ExecuteStage(cpu, ((cpu->CurInstr >> 16) & 0xF))) return; + cpu->LDRFailedRegs = 0; + u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rd = (cpu->CurInstr >> 12) & 0xF; + u32 rm = cpu->CurInstr & 0xF; + u32 storeval = cpu->R[rm]; + if (rm == 15) storeval += 4; + + + u32 oldrd = cpu->R[rd]; + + if ((byte ? cpu->DataRead8 (base, rd) + : cpu->DataRead32(base, rd))) [[likely]] + { + if ((byte ? cpu->DataWrite8 (base, storeval, rm) + : cpu->DataWrite32(base, storeval, rm))) [[likely]] + { + // rd only gets updated if both read and write succeed + + if constexpr (!byte) + { + cpu->ExtReg = rd; + cpu->ExtROROffs = (base & 0x3) * 8; + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::ROR32); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::ROR32); + } + cpu->AddCycles_CDI(); + + if (rd != 15) + { + if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(rd, byte || (base&0x3)); + } + else if (cpu->Num==1) // for some reason these jumps don't seem to work on the arm 9? + { + cpu->JumpTo(cpu->R[rd], false, 1); + } + return; + } + } + + // data abort handling + cpu->R[rd] = oldrd; + cpu->LDRFailedRegs = 1<AddCycles_CDI(); + ((ARMv5*)cpu)->DataAbort(); +} + void A_SWP(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - - u32 val; - cpu->DataRead32(base, &val); - cpu->R[(cpu->CurInstr >> 12) & 0xF] = ROR(val, 8*(base&0x3)); - - u32 numD = cpu->DataCycles; - cpu->DataWrite32(base, rm); - cpu->DataCycles += numD; - - cpu->AddCycles_CDI(); + SWP(cpu); } void A_SWPB(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF] & 0xFF; - - cpu->DataRead8(base, &cpu->R[(cpu->CurInstr >> 12) & 0xF]); - - u32 numD = cpu->DataCycles; - cpu->DataWrite8(base, rm); - cpu->DataCycles += numD; - - cpu->AddCycles_CDI(); + SWP(cpu); } +void EmptyRListLDMSTM(ARM* cpu, const u8 baseid, const u8 flags) +{ + enum // flags + { + load = (1<<0), + writeback = (1<<1), + decrement = (1<<2), + preinc = (1<<3), + restoreorthumb = (1<<4), // specifies restore cpsr for loads, thumb instr for stores + }; + if (cpu->Num == 1) + { + u32 base = cpu->R[baseid]; + bool flagpreinc = flags & preinc; + + if (flags & decrement) + { + flagpreinc = !flagpreinc; + base -= 0x40; + } + if (flagpreinc) base+=4; + + if (flags & load) + { + cpu->DataRead32(base, 15); + + cpu->AddCycles_CDI(); + + cpu->JumpTo(cpu->R[15], flags & restoreorthumb, 1); // TODO: fix this not maintaining current mode properly + } + else + { + cpu->DataWrite32(base, cpu->R[15] + ((flags & restoreorthumb) ? 2 : 4), 15); + + cpu->AddCycles_CD(); + } + } + else + { + cpu->AddCycles_C(); // checkme + } + + if (flags & writeback) + { + if (flags & decrement) cpu->R[baseid] -= 0x40; + else cpu->R[baseid] += 0x40; + } +} void A_LDM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; + if (!ExecuteStage(cpu, baseid)) return; + + cpu->LDRFailedRegs = 0; u32 base = cpu->R[baseid]; u32 wbbase; + u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; - - if (!(cpu->CurInstr & (1<<23))) + bool dabort = false; + + if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] { + EmptyRListLDMSTM(cpu, baseid, ((1 << 0) | // load + (((cpu->CurInstr >> 21) & 1) << 1) | // writeback + ((!(cpu->CurInstr & (1<<23))) << 2) | // decrement + ((preinc >> 24) << 3) | // preinc + (((cpu->CurInstr >> 22) & 1) << 4))); // restore + return; + } + + if (!(cpu->CurInstr & (1<<23))) // decrement + { + // decrement is actually an increment starting from the end address for (int i = 0; i < 16; i++) { if (cpu->CurInstr & (1<CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) + { cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true); + //if (cpu->MRTrack.Type != MainRAMType::Null) printf("AHA, DERES THE PROBLEM\n"); + } for (int i = 0; i < 15; i++) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]); - else cpu->DataRead32S(base, &cpu->R[i]); + u32 oldval = cpu->R[i]; + dabort |= !(first ? cpu->DataRead32 (base, i) + : cpu->DataRead32S(base, i)); + if (dabort) [[unlikely]] { cpu->R[i] = oldval; cpu->LDRFailedRegs |= (1<CurInstr & (1<<15)) { if (preinc) base += 4; - if (first) cpu->DataRead32 (base, &pc); - else cpu->DataRead32S(base, &pc); - if (!preinc) base += 4; + u32 oldval = cpu->R[15]; + dabort |= !(first ? cpu->DataRead32 (base, 15) + : cpu->DataRead32S(base, 15)); + if (dabort) [[unlikely]] { cpu->R[15] = oldval; cpu->LDRFailedRegs |= (1<<15); } - if (cpu->Num == 1) - pc &= ~0x1; + if (!preinc) base += 4; } - if (cpu->CurInstr & (1<<21)) + if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); + cpu->AddCycles_CDI(); + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + else + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); + cpu->AddCycles_CDI(); + } + + // handle data aborts + if (dabort) [[unlikely]] + { + if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) + { + cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; + cpu->QueueMode[1] = cpu->CPSR; + + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::QueueUpdateMode); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::QueueUpdateMode); + } + + ((ARMv5*)cpu)->DataAbort(); + return; + } + + // writeback to base + if (cpu->CurInstr & (1<<21) && (baseid != 15)) { // post writeback if (cpu->CurInstr & (1<<23)) @@ -454,29 +714,56 @@ void A_LDM(ARM* cpu) { u32 rlist = cpu->CurInstr & 0xFFFF; if ((!(rlist & ~(1 << baseid))) || (rlist & ~((2 << baseid) - 1))) - cpu->R[baseid] = wbbase; + { cpu->R[baseid] = wbbase; cpu->LDRFailedRegs = 1<R[baseid] = wbbase; } - + + // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) - cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + { + cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; + cpu->QueueMode[1] = cpu->CPSR; + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::QueueUpdateMode); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::QueueUpdateMode); + } + + // jump if pc got written if (cpu->CurInstr & (1<<15)) - cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); - - cpu->AddCycles_CDI(); + { + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); + cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); + } + else if (cpu->Num == 0) + { + u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0x7FFF); + ((ARMv5*)cpu)->SetupInterlock(lastreg); + } } void A_STM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; + if (!ExecuteStage(cpu, baseid)) return; + u32 base = cpu->R[baseid]; u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; + bool dabort = false; + + if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] + { + EmptyRListLDMSTM(cpu, baseid, ((0 << 0) | // load + (((cpu->CurInstr >> 21) & 1) << 1) | // writeback + ((!(cpu->CurInstr & (1<<23))) << 2) | // decrement + ((preinc >> 24) << 3) | // preinc + (0 << 4))); // thumb + return; + } if (!(cpu->CurInstr & (1<<23))) { @@ -486,7 +773,7 @@ void A_STM(ARM* cpu) base -= 4; } - if (cpu->CurInstr & (1<<21)) + if ((cpu->CurInstr & (1<<21)) && (baseid != 15)) cpu->R[baseid] = base; preinc = !preinc; @@ -504,21 +791,27 @@ void A_STM(ARM* cpu) cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true); } + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksMemory(__builtin_ctz(cpu->CurInstr)); + for (u32 i = 0; i < 16; i++) { if (cpu->CurInstr & (1<Num == 0) || (!(cpu->CurInstr & ((1<DataWrite32(base, oldbase) : cpu->DataWrite32S(base, oldbase); - else - first ? cpu->DataWrite32(base, base) : cpu->DataWrite32S(base, base); // checkme + val = oldbase; + else val = base; } - else - first ? cpu->DataWrite32(base, cpu->R[i]) : cpu->DataWrite32S(base, cpu->R[i]); + else val = cpu->R[i]; + + if (i == 15) val+=4; + + dabort |= !(first ? cpu->DataWrite32 (base, val, i) + : cpu->DataWrite32S(base, val, i)); first = false; @@ -529,10 +822,30 @@ void A_STM(ARM* cpu) if (cpu->CurInstr & (1<<22)) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) - cpu->R[baseid] = base; + if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); + cpu->AddCycles_CD(); + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + else + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); + cpu->AddCycles_CD(); + } - cpu->AddCycles_CD(); + // handle data aborts + if (dabort) [[unlikely]] + { + // restore original value of base + cpu->R[baseid] = oldbase; + ((ARMv5*)cpu)->DataAbort(); + return; + } + + if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21)) && (baseid != 15)) + cpu->R[baseid] = base; } @@ -544,165 +857,113 @@ void A_STM(ARM* cpu) void T_LDR_PCREL(ARM* cpu) { + if (!ExecuteStage(cpu, 15)) return; + + cpu->LDRFailedRegs = 0; u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); - cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); + bool dabort = !cpu->DataRead32(addr, (cpu->CurInstr >> 8) & 0x7); cpu->AddCycles_CDI(); + if (dabort) [[unlikely]] ((ARMv5*)cpu)->DataAbort(); + else if (cpu->Num == 0) + { + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 8) & 0x7); + } } void T_STR_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite32(addr, cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CD(); + StoreSingle<32, Writeback::None, true>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_STRB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite8(addr, cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CD(); + StoreSingle<8, Writeback::None, true>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDR_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - - u32 val; - cpu->DataRead32(addr, &val); - cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_STRH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite16(addr, cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CD(); + StoreSingle<16, Writeback::None, true>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRSB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRSH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_STR_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataWrite32(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<32, Writeback::None, false>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C), 0); } void T_LDR_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - u32 val; - cpu->DataRead32(offset, &val); - cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C), 0); } void T_STRB_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataWrite8(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<8, Writeback::None, false>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F), 0); } void T_LDRB_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataRead8(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F), 0); } void T_STRH_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataWrite16(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<16, Writeback::None, false>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E), 0); } void T_LDRH_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataRead16(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E), 0); } void T_STR_SPREL(ARM* cpu) { - u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->R[13]; - - cpu->DataWrite32(offset, cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<32, Writeback::None, false>(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC), 0); } void T_LDR_SPREL(ARM* cpu) { - u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->R[13]; - - cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CDI(); + LoadSingle(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC), 0); } void T_PUSH(ARM* cpu) { + if (!ExecuteStage(cpu, 13)) return; + int nregs = 0; bool first = true; + bool dabort = false; for (int i = 0; i < 8; i++) { @@ -712,17 +973,30 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) nregs++; + + if (!nregs) [[unlikely]] + { + EmptyRListLDMSTM(cpu, 13, 0b11110); + return; + } u32 base = cpu->R[13]; base -= (nregs<<2); - cpu->R[13] = base; + u32 wbbase = base; + + if (cpu->Num == 0) + { + u8 firstreg = __builtin_ctz(cpu->CurInstr); + if (firstreg == 8) firstreg = 14; + ((ARMv5*)cpu)->HandleInterlocksMemory(firstreg); + } for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]); - else cpu->DataWrite32S(base, cpu->R[i]); + dabort |= !(first ? cpu->DataWrite32 (base, cpu->R[i], i) + : cpu->DataWrite32S(base, cpu->R[i], i)); first = false; base += 4; } @@ -730,24 +1004,56 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) { - if (first) cpu->DataWrite32 (base, cpu->R[14]); - else cpu->DataWrite32S(base, cpu->R[14]); + dabort |= !(first ? cpu->DataWrite32 (base, cpu->R[14], 14) + : cpu->DataWrite32S(base, cpu->R[14], 14)); } - cpu->AddCycles_CD(); + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); + cpu->AddCycles_CD(); + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + else + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); + cpu->AddCycles_CD(); + } + + if (dabort) [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } + + cpu->R[13] = wbbase; } void T_POP(ARM* cpu) { + if (!ExecuteStage(cpu, 13)) return; + + cpu->LDRFailedRegs = 0; u32 base = cpu->R[13]; bool first = true; + bool dabort = false; + + if (!(cpu->CurInstr & 0x1FF)) [[unlikely]] + { + EmptyRListLDMSTM(cpu, 13, 0b00011); + return; + } for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]); - else cpu->DataRead32S(base, &cpu->R[i]); + u32 oldval = cpu->R[i]; + dabort |= !(first ? cpu->DataRead32 (base, i) + : cpu->DataRead32S(base, i)); + if (dabort) [[unlikely]] { cpu->R[i] = oldval; cpu->LDRFailedRegs |= (1<CurInstr & (1<<8)) { - u32 pc; - if (first) cpu->DataRead32 (base, &pc); - else cpu->DataRead32S(base, &pc); - if (cpu->Num==1) pc |= 0x1; - cpu->JumpTo(pc); - base += 4; + u32 oldval = cpu->R[15]; + dabort |= !(first ? cpu->DataRead32 (base, 15) + : cpu->DataRead32S(base, 15)); + + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); + cpu->AddCycles_CDI(); + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + else + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); + cpu->AddCycles_CDI(); + } + + if (!dabort) [[likely]] + { + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); + + cpu->JumpTo(cpu->R[15], false, 2); + base += 4; + } + else [[unlikely]] + { + cpu->R[15] = oldval; + cpu->LDRFailedRegs |= (1<<15); + ((ARMv5*)cpu)->DataAbort(); + return; + } + } + else + { + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); + cpu->AddCycles_CDI(); + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + else + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); + cpu->AddCycles_CDI(); + } + + if (cpu->Num == 0) + { + if (dabort) [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } + else + { + u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); + ((ARMv5*)cpu)->SetupInterlock(lastreg); + } + } } cpu->R[13] = base; - cpu->AddCycles_CDI(); } void T_STMIA(ARM* cpu) { + if (!ExecuteStage(cpu, ((cpu->CurInstr >> 8) & 0x7))) return; + u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; + bool dabort = false; + + if (!(cpu->CurInstr & 0xFF)) [[unlikely]] + { + EmptyRListLDMSTM(cpu, (cpu->CurInstr >> 8) & 0x7, 0b10010); + return; + } + + if (cpu->Num == 0) + { + u8 firstreg = __builtin_ctz(cpu->CurInstr); + ((ARMv5*)cpu)->HandleInterlocksMemory(firstreg); + } for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]); - else cpu->DataWrite32S(base, cpu->R[i]); + dabort |= !(first ? cpu->DataWrite32 (base, cpu->R[i], i) + : cpu->DataWrite32S(base, cpu->R[i], i)); first = false; base += 4; } } + if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); + cpu->AddCycles_CD(); + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + else + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); + cpu->AddCycles_CD(); + } + + if (dabort) [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } + // TODO: check "Rb included in Rlist" case cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - cpu->AddCycles_CD(); } void T_LDMIA(ARM* cpu) { + if (!ExecuteStage(cpu, ((cpu->CurInstr >> 8) & 0x7))) return; + u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; + bool dabort = false; + + if (!(cpu->CurInstr & 0xFF)) [[unlikely]] + { + EmptyRListLDMSTM(cpu, (cpu->CurInstr >> 8) & 0x7, 0b00011); + return; + } for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]); - else cpu->DataRead32S(base, &cpu->R[i]); + u32 oldval = cpu->R[i]; + dabort |= !(first ? cpu->DataRead32 (base, i) + : cpu->DataRead32S(base, i)); + if (dabort) [[unlikely]] { cpu->R[i] = oldval; cpu->LDRFailedRegs |= (1<CurInstr & 0xFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); + cpu->AddCycles_CDI(); + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + else + { + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); + cpu->AddCycles_CDI(); + } + + if (dabort) [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } + + if (cpu->Num == 0) + { + u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); + ((ARMv5*)cpu)->SetupInterlock(lastreg); + } + if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - - cpu->AddCycles_CDI(); } diff --git a/src/ARMInterpreter_MultiplySuperLLE.h b/src/ARMInterpreter_MultiplySuperLLE.h new file mode 100644 index 00000000..21b17bbc --- /dev/null +++ b/src/ARMInterpreter_MultiplySuperLLE.h @@ -0,0 +1,136 @@ +#ifndef ARMINTERPRETER_MULTIPLYSUPERLLE_H +#define ARMINTERPRETER_MULTIPLYSUPERLLE_H + +#include "types.h" + +using namespace melonDS; + +/* + Copyright (c) 2024 zaydlang + + This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + + + + +// code taken from: (also features a few alternative implementations that could maybe be worth looking at?) +// https://github.com/calc84maniac/multiplication-algorithm/blob/master/impl_opt.h +// based on research that can be found here: https://bmchtech.github.io/post/multiply/ + +// the code in this file is dedicated to handling the calculation of the carry flag for multiplication (S variant) instructions on the ARM7TDMI. + + +// Takes a multiplier between -0x01000000 and 0x00FFFFFF, cycles between 0 and 2 +static inline bool booths_multiplication32_opt(u32 multiplicand, u32 multiplier, u32 accumulator) { + // Set the low bit of the multiplicand to cause negation to invert the upper bits, this bit can't propagate to bit 31 + multiplicand |= 1; + + // Optimized first iteration + u32 booth = (s32)(multiplier << 31) >> 31; + u32 carry = booth * multiplicand; + // Pre-populate accumulator for output + u32 output = accumulator; + + u32 sum = output + carry; + int shift = 29; + do { + for (int i = 0; i < 4; i++, shift -= 2) { + // Get next booth factor (-2 to 2, shifted left by 30-shift) + u32 next_booth = (s32)(multiplier << shift) >> shift; + u32 factor = next_booth - booth; + booth = next_booth; + // Get scaled value of booth addend + u32 addend = multiplicand * factor; + // Combine the addend with the CSA + // Not performing any masking seems to work because the lower carries can't propagate to bit 31 + output ^= carry ^ addend; + sum += addend; + carry = sum - output; + } + } while (booth != multiplier); + + return carry >> 31; +} + +// Takes a multiplicand shifted right by 6 and a multiplier shifted right by 26 (zero or sign extended) +static inline bool booths_multiplication64_opt(u32 multiplicand, u32 multiplier, u32 accum_hi) { + // Skipping the first 14 iterations seems to work because the lower carries can't propagate to bit 63 + // This means only magic bits 62-61 are needed (which requires decoding 3 booth chunks), + // and only the last two booth iterations are needed + + // Set the low bit of the multiplicand to cause negation to invert the upper bits + multiplicand |= 1; + + // Pre-populate magic bit 61 for carry + u32 carry = ~accum_hi & UINT32_C(0x20000000); + // Pre-populate magic bits 63-60 for output (with carry magic pre-added in) + u32 output = accum_hi - UINT32_C(0x08000000); + + // Get factors from the top 3 booth chunks + u32 booth0 = (s32)(multiplier << 27) >> 27; + u32 booth1 = (s32)(multiplier << 29) >> 29; + u32 booth2 = (s32)(multiplier << 31) >> 31; + u32 factor0 = multiplier - booth0; + u32 factor1 = booth0 - booth1; + u32 factor2 = booth1 - booth2; + + // Get scaled value of the 3rd top booth addend + u32 addend = multiplicand * factor2; + // Finalize bits 61-60 of output magic using its sign + output -= addend & UINT32_C(0x10000000); + // Get scaled value of the 2nd top booth addend + addend = multiplicand * factor1; + // Finalize bits 63-62 of output magic using its sign + output -= addend & UINT32_C(0x40000000); + + // Get the carry from the CSA in bit 61 and propagate it to bit 62, which is not processed in this iteration + u32 sum = output + (addend & UINT32_C(0x20000000)); + // Subtract out the carry magic to get the actual output magic + output -= carry; + + // Get scaled value of the 1st top booth addend + addend = multiplicand * factor0; + // Add to bit 62 and propagate the carry + sum += addend & UINT32_C(0x40000000); + + // Cancel out the output magic bit 63 to get the carry bit 63 + return (sum ^ output) >> 31; +} + + +// also for MLAS and MUL (thumb ver.) +inline bool MULSCarry(s32 rm, s32 rs, u32 rn, bool lastcycle) +{ + if (lastcycle) + return (rs >> 30) == -2; + else + return booths_multiplication32_opt(rm, rs, rn); +} + +// also for UMLALS +inline bool UMULLSCarry(u64 rd, u32 rm, u32 rs, bool lastcycle) +{ + if (lastcycle) + return booths_multiplication64_opt(rm >> 6, rs >> 26, rd >> 32); + else + return booths_multiplication32_opt(rm, rs, rd & 0xFFFFFFFF); +} + +// also for SMLALS +inline bool SMULLSCarry(u64 rd, s32 rm, s32 rs, bool lastcycle) +{ + if (lastcycle) + return booths_multiplication64_opt(rm >> 6, rs >> 26, rd >> 32); + else + return booths_multiplication32_opt(rm, rs, rd & 0xFFFFFFFF); +} + +#endif diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 9582f7c8..a0afb5d4 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -51,10 +51,10 @@ namespace melonDS using Platform::Log; using Platform::LogLevel; -static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset, ""); +/*static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset, ""); static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset, ""); static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset, ""); - +*/ #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) Platform::Log(Platform::LogLevel::Debug, msg, ## __VA_ARGS__) @@ -586,7 +586,7 @@ void ARMJIT::CompileBlock(ARM* cpu) noexcept u32 numWriteAddrs = 0, writeAddrsTranslated = 0; cpu->FillPipeline(); - u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstr[2] = {(u32)cpu->NextInstr[0], (u32)cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); @@ -644,17 +644,17 @@ void ARMJIT::CompileBlock(ARM* cpu) noexcept } else { - nextInstr[1] = cpuv5->CodeRead32(r15, false); + //nextInstr[1] = cpuv5->CodeRead32(r15, false); instrs[i].CodeCycles = cpu->CodeCycles; } } else { ARMv4* cpuv4 = (ARMv4*)cpu; - if (thumb) - nextInstr[1] = cpuv4->CodeRead16(r15); - else - nextInstr[1] = cpuv4->CodeRead32(r15); + if (thumb); + //nextInstr[1] = cpuv4->CodeRead16(r15); + else; + // nextInstr[1] = cpuv4->CodeRead32(r15); instrs[i].CodeCycles = cpu->CodeCycles; } instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr, LiteralOptimizations); @@ -722,7 +722,7 @@ void ARMJIT::CompileBlock(ARM* cpu) noexcept addressRanges[numAddressRanges++] = translatedAddrRounded; addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); - cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + //cpu->DataRead32(literalAddr, &literalValues[numLiterals]); literalLoadAddrs[numLiterals++] = translatedAddr; } } diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index c83f8161..518442e4 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -83,14 +83,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // doesn't matter if we put garbage in the MSbs there if (addr & 0x2) { - cpu9->CodeRead32(addr-2, true); + //cpu9->CodeRead32(addr-2, true); cycles += cpu9->CodeCycles; - cpu9->CodeRead32(addr+2, false); + //cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } else { - cpu9->CodeRead32(addr, true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; } } @@ -99,9 +99,9 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) addr &= ~0x3; newPC = addr+4; - cpu9->CodeRead32(addr, true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; - cpu9->CodeRead32(addr+4, false); + //cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 6d2c4276..37d6c332 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -79,18 +79,18 @@ bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) CurCPU->R[15] = R15; if (size == 32) { - CurCPU->DataRead32(addr & ~0x3, &val); + //CurCPU->DataRead32(addr & ~0x3, &val); val = melonDS::ROR(val, (addr & 0x3) << 3); } else if (size == 16) { - CurCPU->DataRead16(addr & ~0x1, &val); + //CurCPU->DataRead16(addr & ~0x1, &val); if (signExtend) val = ((s32)val << 16) >> 16; } else { - CurCPU->DataRead8(addr, &val); + // CurCPU->DataRead8(addr, &val); if (signExtend) val = ((s32)val << 24) >> 24; } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index bd73ae71..6e52a1c5 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -72,14 +72,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // doesn't matter if we put garbage in the MSbs there if (addr & 0x2) { - cpu9->CodeRead32(addr-2, true); + //cpu9->CodeRead32(addr-2, true); cycles += cpu9->CodeCycles; - cpu9->CodeRead32(addr+2, false); + //cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } else { - cpu9->CodeRead32(addr, true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; } } @@ -88,9 +88,9 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) addr &= ~0x3; newPC = addr+4; - cpu9->CodeRead32(addr, true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; - cpu9->CodeRead32(addr+4, false); + //cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 71cd0770..11624d9e 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -85,18 +85,18 @@ bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) CurCPU->R[15] = R15; if (size == 32) { - CurCPU->DataRead32(addr & ~0x3, &val); + //CurCPU->DataRead32(addr & ~0x3, &val); val = melonDS::ROR(val, (addr & 0x3) << 3); } else if (size == 16) { - CurCPU->DataRead16(addr & ~0x1, &val); + //CurCPU->DataRead16(addr & ~0x1, &val); if (signExtend) val = ((s32)val << 16) >> 16; } else { - CurCPU->DataRead8(addr, &val); + //CurCPU->DataRead8(addr, &val); if (signExtend) val = ((s32)val << 24) >> 24; } diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 58838307..d1be9761 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -194,6 +194,7 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); +const u32 A_BKPT = A_BranchAlways | A_Link | ak(ak_UNK); const u32 A_MSR_IMM = ak(ak_MSR_IMM); const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); const u32 A_MRS = A_Write12 | ak(ak_MRS); diff --git a/src/ARM_InstrTable.h b/src/ARM_InstrTable.h index 8213c2e0..2c480f8d 100644 --- a/src/ARM_InstrTable.h +++ b/src/ARM_InstrTable.h @@ -130,7 +130,7 @@ INSTRFUNC_PROTO(ARMInstrTable[4096]) = // 0001 0010 0000 A_MSR_REG, A_BX, A_UNK, A_BLX_REG, - A_UNK, A_QSUB, A_UNK, A_UNK, + A_UNK, A_QSUB, A_UNK, A_BKPT, A_SMLAWy, A_UNK, A_SMULWy, A_STRH_REG, A_SMLAWy, A_LDRD_REG, A_SMULWy, A_STRD_REG, diff --git a/src/CP15.cpp b/src/CP15.cpp index e924bff3..a22f0f34 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -18,12 +18,18 @@ #include #include +#if defined(__x86_64__) +#include +#elif defined(__ARM_NEON) +#include +#endif #include "NDS.h" #include "DSi.h" #include "ARM.h" #include "Platform.h" #include "ARMJIT_Memory.h" #include "ARMJIT.h" +#include "CP15_Constants.h" namespace melonDS { @@ -44,33 +50,45 @@ void ARMv5::CP15Reset() CP15Control = 0x2078; // dunno RNGSeed = 44203; - TraceProcessID = 0; + // Memory Regions Protection + PU_CodeRW = 0; + PU_DataRW = 0; + + memset(PU_Region, 0, CP15_REGION_COUNT*sizeof(*PU_Region)); + + // TCM-Settings DTCMSetting = 0; ITCMSetting = 0; memset(ITCM, 0, ITCMPhysicalSize); memset(DTCM, 0, DTCMPhysicalSize); - ITCMSize = 0; - DTCMBase = 0xFFFFFFFF; - DTCMMask = 0; - - memset(ICache, 0, 0x2000); - ICacheInvalidateAll(); - memset(ICacheCount, 0, 64); - + // Cache Settings PU_CodeCacheable = 0; PU_DataCacheable = 0; - PU_DataCacheWrite = 0; + PU_WriteBufferability = 0; - PU_CodeRW = 0; - PU_DataRW = 0; + ICacheLockDown = 0; + DCacheLockDown = 0; - memset(PU_Region, 0, 8*sizeof(u32)); + memset(ICache, 0, ICACHE_SIZE); + ICacheInvalidateAll(); + ICacheCount = 0; + + memset(DCache, 0, DCACHE_SIZE); + DCacheInvalidateAll(); + DCacheCount = 0; + + // Debug / Misc Registers + CacheDebugRegisterIndex = 0; + CP15BISTTestStateRegister = 0; + CP15TraceProcessId = 0; + + // And now Update the internal state + UpdateDTCMSetting(); + UpdateITCMSetting(); UpdatePURegions(true); - - CurICacheLine = NULL; } void ARMv5::CP15DoSavestate(Savestate* file) @@ -85,14 +103,28 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->VarArray(ITCM, ITCMPhysicalSize); file->VarArray(DTCM, DTCMPhysicalSize); + file->VarArray(ICache, sizeof(ICache)); + file->VarArray(ICacheTags, sizeof(ICacheTags)); + file->Var8(&ICacheCount); + + file->VarArray(DCache, sizeof(DCache)); + file->VarArray(DCacheTags, sizeof(DCacheTags)); + file->Var8(&DCacheCount); + + file->Var32(&DCacheLockDown); + file->Var32(&ICacheLockDown); + file->Var32(&CacheDebugRegisterIndex); + file->Var32(&CP15TraceProcessId); + file->Var32(&CP15BISTTestStateRegister); + file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); - file->Var32(&PU_DataCacheWrite); + file->Var32(&PU_WriteBufferability); file->Var32(&PU_CodeRW); file->Var32(&PU_DataRW); - file->VarArray(PU_Region, 8*sizeof(u32)); + file->VarArray(PU_Region, CP15_REGION_COUNT*sizeof(u32)); if (!file->Saving) { @@ -109,15 +141,18 @@ void ARMv5::UpdateDTCMSetting() u32 newDTCMMask; u32 newDTCMSize; - if (CP15Control & (1<<16)) + if (CP15Control & CP15_TCM_CR_DTCM_ENABLE) { - newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); - if (newDTCMSize < 0x1000) newDTCMSize = 0x1000; - newDTCMMask = 0xFFFFF000 & ~(newDTCMSize-1); + newDTCMSize = CP15_DTCM_SIZE_BASE << ((DTCMSetting & CP15_DTCM_SIZE_MASK) >> CP15_DTCM_SIZE_POS); + if (newDTCMSize < (CP15_DTCM_SIZE_BASE << CP15_DTCM_SIZE_MIN)) + newDTCMSize = CP15_DTCM_SIZE_BASE << CP15_DTCM_SIZE_MIN; + + newDTCMMask = CP15_DTCM_BASE_MASK & ~(newDTCMSize-1); newDTCMBase = DTCMSetting & newDTCMMask; } else { + // DTCM Disabled newDTCMSize = 0; newDTCMBase = 0xFFFFFFFF; newDTCMMask = 0; @@ -133,9 +168,9 @@ void ARMv5::UpdateDTCMSetting() void ARMv5::UpdateITCMSetting() { - if (CP15Control & (1<<18)) + if (CP15Control & CP15_TCM_CR_ITCM_ENABLE) { - ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); + ITCMSize = CP15_ITCM_SIZE_BASE << ((ITCMSetting & CP15_ITCM_SIZE_MASK) >> CP15_ITCM_SIZE_POS); #ifdef JIT_ENABLED FastBlockLookupSize = 0; #endif @@ -149,40 +184,43 @@ void ARMv5::UpdateITCMSetting() // covers updates to a specific PU region's cache/etc settings // (not to the region range/enabled status) -void ARMv5::UpdatePURegion(u32 n) +void ARMv5::UpdatePURegion(const u32 n) { - if (!(CP15Control & (1<<0))) + if (!(CP15Control & CP15_CR_MPUENABLE)) return; - u32 coderw = (PU_CodeRW >> (4*n)) & 0xF; - u32 datarw = (PU_DataRW >> (4*n)) & 0xF; + if (n >= CP15_REGION_COUNT) + return; - u32 codecache, datacache, datawrite; + u32 coderw = (PU_CodeRW >> (CP15_REGIONACCESS_BITS_PER_REGION * n)) & CP15_REGIONACCESS_REGIONMASK; + u32 datarw = (PU_DataRW >> (CP15_REGIONACCESS_BITS_PER_REGION * n)) & CP15_REGIONACCESS_REGIONMASK; + + bool codecache, datacache, datawrite; // datacache/datawrite - // 0/0: goes to memory - // 0/1: goes to memory - // 1/0: goes to memory and cache + // 0/0: goes directly to memory + // 0/1: goes to write buffer + // 1/0: goes to write buffer and cache // 1/1: goes to cache - if (CP15Control & (1<<12)) + if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) codecache = (PU_CodeCacheable >> n) & 0x1; else - codecache = 0; + codecache = false; - if (CP15Control & (1<<2)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { datacache = (PU_DataCacheable >> n) & 0x1; - datawrite = (PU_DataCacheWrite >> n) & 0x1; } else { - datacache = 0; - datawrite = 0; + datacache = false; } + + datawrite = (PU_WriteBufferability >> n) & 0x1; u32 rgn = PU_Region[n]; - if (!(rgn & (1<<0))) + if (!(rgn & CP15_REGION_ENABLE)) { return; } @@ -196,55 +234,55 @@ void ARMv5::UpdatePURegion(u32 n) u32 end = start + (1<> 2]; + u8* bustimings = NDS.ARM9MemTimings[i]; - if (pu & 0x40) - { - MemTimings[i][0] = 0xFF;//kCodeCacheTiming; - } - else - { - MemTimings[i][0] = bustimings[2] << NDS.ARM9ClockShift; - } - - if (pu & 0x10) - { - MemTimings[i][1] = kDataCacheTiming; - MemTimings[i][2] = kDataCacheTiming; - MemTimings[i][3] = 1; - } - else - { - MemTimings[i][1] = bustimings[0] << NDS.ARM9ClockShift; - MemTimings[i][2] = bustimings[2] << NDS.ARM9ClockShift; - MemTimings[i][3] = bustimings[3] << NDS.ARM9ClockShift; - } + MemTimings[i][0] = (bustimings[0] << NDS.ARM9ClockShift) - 1; + MemTimings[i][1] = (bustimings[2] << NDS.ARM9ClockShift) - 1; + MemTimings[i][2] = (bustimings[3] << NDS.ARM9ClockShift) - 1; // sequentials technically should probably be -1 as well? + // but it doesn't really matter as long as i also dont force align the start of sequential accesses, now does it? } } @@ -337,142 +348,1110 @@ u32 ARMv5::RandomLineIndex() return (RNGSeed >> 17) & 0x3; } -void ARMv5::ICacheLookup(u32 addr) +bool ARMv5::ICacheLookup(const u32 addr) { - u32 tag = addr & 0xFFFFF800; - u32 id = (addr >> 5) & 0x3F; + const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)); + const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; + +#if defined(__x86_64__) + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop + + __m128i tags; memcpy(&tags, &ICacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer + + if (!set) goto miss; // check if none of them were a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match - id <<= 2; - if (ICacheTags[id+0] == tag) { - CodeCycles = 1; - CurICacheLine = &ICache[(id+0) << 5]; - return; - } - if (ICacheTags[id+1] == tag) +#elif defined(__ARM_NEON) + uint32x4_t tags = { ICacheTags[id+0], ICacheTags[id+1], ICacheTags[id+2], ICacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) goto miss; + else set = __builtin_ctz(set) >> 4; + { - CodeCycles = 1; - CurICacheLine = &ICache[(id+1) << 5]; - return; - } - if (ICacheTags[id+2] == tag) +#else + // fallback for loop; slow + for (int set = 0; set < ICACHE_SETS; set++) { - CodeCycles = 1; - CurICacheLine = &ICache[(id+2) << 5]; - return; - } - if (ICacheTags[id+3] == tag) - { - CodeCycles = 1; - CurICacheLine = &ICache[(id+3) << 5]; - return; + if ((ICacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) +#endif + { + u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; + + if (ICacheStreamPtr >= 7) + { + if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; // does this apply to streamed fetches? + NDS.ARM9Timestamp++; + } + else + { + u64 nextfill = ICacheStreamTimes[ICacheStreamPtr++]; + if (NDS.ARM9Timestamp < nextfill) + { + NDS.ARM9Timestamp = nextfill; + } + else + { + u64 fillend = ICacheStreamTimes[6] + 2; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; + else // checkme + { + if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; + NDS.ARM9Timestamp++; + } + ICacheStreamPtr = 7; + } + } + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; + DataRegion = Mem9_Null; + Store = false; + + RetVal = cacheLine[(addr & (ICACHE_LINELENGTH -1)) / 4]; + if (DelayedQueue != nullptr) QueueFunction(DelayedQueue); + return true; + } } // cache miss + miss: + // We do not fill the cacheline if it is disabled in the + // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] + return false; + + //if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + WriteBufferDrain(); + FetchAddr[16] = addr; + QueueFunction(&ARMv5::ICacheLookup_2); + return true; +} + +void ARMv5::ICacheLookup_2() +{ + u32 addr = FetchAddr[16]; + const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)); + const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; u32 line; - if (CP15Control & (1<<14)) + + if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) [[likely]] { - line = ICacheCount[id>>2]; - ICacheCount[id>>2] = (line+1) & 0x3; + line = ICacheCount; + ICacheCount = (line+1) & (ICACHE_SETS-1); } else { line = RandomLineIndex(); } + if (ICacheLockDown) + { + if (ICacheLockDown & CACHE_LOCKUP_L) [[unlikely]] + { + // load into locked up cache + // into the selected set + line = ICacheLockDown & (ICACHE_SETS-1); + } else + { + u8 minSet = ICacheLockDown & (ICACHE_SETS-1); + line = line | minSet; + } + } + line += id; - addr &= ~0x1F; - u8* ptr = &ICache[line << 5]; - - if (CodeMem.Mem) + u32* ptr = (u32 *)&ICache[line << ICACHE_LINELENGTH_LOG2]; + + // bus reads can only overlap with dcache streaming by 6 cycles + if (DCacheStreamPtr < 7) { - memcpy(ptr, &CodeMem.Mem[addr & CodeMem.Mask], 32); + u64 time = DCacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + + ICacheTags[line] = tag | (line & (ICACHE_SETS-1)) | CACHE_FLAG_VALID; + + // timing logic + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>14] == Mem9_MainRAM) + { + MRTrack.Type = MainRAMType::ICacheStream; + MRTrack.Var = line; + FetchAddr[16] = addr & ~3; + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] + ICacheStreamPtr = 7; + else ICacheStreamPtr = (addr & 0x1F) / 4; } else { - for (int i = 0; i < 32; i+=4) - *(u32*)&ptr[i] = NDS.ARM9Read32(addr+i); + for (int i = 0; i < ICACHE_LINELENGTH; i+=sizeof(u32)) + ptr[i/4] = NDS.ARM9Read32(tag+i); + + if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + NDS.ARM9Timestamp += 1<> 14][1] + stall) + ((MemTimings[tag >> 14][2] + 1) * ((DCACHE_LINELENGTH / 4) - 1)); + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; // this should never trigger in practice + } + else // ICache Streaming logic + { + u32 stall = (4 - NDS.ARM9ClockShift) << NDS.ARM9ClockShift; + u8 ns = MemTimings[addr>>14][1] + stall; + u8 seq = MemTimings[addr>>14][2] + 1; + + u8 linepos = (addr & 0x1F) / 4; // technically this is one too low, but we want that actually + + u64 cycles = ns + (seq * linepos); + NDS.ARM9Timestamp = cycles += NDS.ARM9Timestamp; + + ICacheStreamPtr = linepos; + for (int i = linepos; i < 7; i++) + { + cycles += seq; + ICacheStreamTimes[i] = cycles; + } + } + RetVal = ptr[(addr & (ICACHE_LINELENGTH-1)) / 4]; } - - ICacheTags[line] = tag; - - // ouch :/ - //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); - CodeCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * 7)) << NDS.ARM9ClockShift; - CurICacheLine = ptr; + Store = false; + DataRegion = Mem9_Null; + if (DelayedQueue != nullptr) QueueFunction(DelayedQueue); } -void ARMv5::ICacheInvalidateByAddr(u32 addr) +void ARMv5::ICacheInvalidateByAddr(const u32 addr) { - u32 tag = addr & 0xFFFFF800; - u32 id = (addr >> 5) & 0x3F; + const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; - id <<= 2; - if (ICacheTags[id+0] == tag) + for (int set = 0; set < ICACHE_SETS; set++) { - ICacheTags[id+0] = 1; - return; - } - if (ICacheTags[id+1] == tag) - { - ICacheTags[id+1] = 1; - return; - } - if (ICacheTags[id+2] == tag) - { - ICacheTags[id+2] = 1; - return; - } - if (ICacheTags[id+3] == tag) - { - ICacheTags[id+3] = 1; - return; + if ((ICacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) + { + ICacheTags[id+set] &= ~CACHE_FLAG_VALID; + return; + } } } +void ARMv5::ICacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine) +{ + if (cacheSet >= ICACHE_SETS) + return; + if (cacheLine >= ICACHE_LINESPERSET) + return; + + u32 idx = (cacheLine << ICACHE_SETS_LOG2) + cacheSet; + ICacheTags[idx] &= ~CACHE_FLAG_VALID; +} + + void ARMv5::ICacheInvalidateAll() { - for (int i = 0; i < 64*4; i++) - ICacheTags[i] = 1; + #pragma GCC ivdep + for (int i = 0; i < ICACHE_SIZE / ICACHE_LINELENGTH; i++) + ICacheTags[i] &= ~CACHE_FLAG_VALID; } +bool ARMv5::IsAddressICachable(const u32 addr) const +{ + return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_ICACHEABLE; +} + +bool ARMv5::DCacheLookup(const u32 addr) +{ + //Log(LogLevel::Debug,"DCache load @ %08x\n", addr); + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + +#if defined(__x86_64__) + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop + + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer + + if (!set) goto miss; // check if none of them were a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match + + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) goto miss; + else set = __builtin_ctz(set) >> 4; + + { +#else + // fallback for loop; slow + for (int set = 0; set < DCACHE_SETS; set++) + { + if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) +#endif + { + u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; + + if (DCacheStreamPtr >= 7) + { + NDS.ARM9Timestamp += DataCycles = 1; + } + else + { + u64 nextfill = DCacheStreamTimes[DCacheStreamPtr++]; + //if (NDS.ARM9Timestamp < nextfill) // can this ever really fail? + { + DataCycles = nextfill - NDS.ARM9Timestamp; + if (DataCycles > (3<> 2], set, id>>2); + RetVal = cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2]; + (this->*DelayedQueue)(); + return true; + } + } + + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + + // cache miss + miss: + // We do not fill the cacheline if it is disabled in the + // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) [[unlikely]] + return false; + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + WriteBufferDrain(); // checkme? + + FetchAddr[16] = addr; + QueueFunction(&ARMv5::DCacheLookup_2); + return true; +} + +void ARMv5::DCacheLookup_2() +{ + u32 addr = FetchAddr[16]; + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + u32 line; + + if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) [[likely]] + { + line = DCacheCount; + DCacheCount = (line+1) & (DCACHE_SETS-1); + } + else + { + line = RandomLineIndex(); + } + + if (DCacheLockDown) + { + if (DCacheLockDown & CACHE_LOCKUP_L) [[unlikely]] + { + // load into locked up cache + // into the selected set + line = DCacheLockDown & (DCACHE_SETS-1); + } else + { + u8 minSet = DCacheLockDown & (DCACHE_SETS-1); + line = line | minSet; + } + } + line += id; + + #if !DISABLE_CACHEWRITEBACK + // Before we fill the cacheline, we need to write back dirty content + // Datacycles will be incremented by the required cycles to do so + DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); + #endif + + QueuedDCacheLine = line; + QueueFunction(&ARMv5::DCacheLookup_3); +} + +void ARMv5::DCacheLookup_3() +{ + u32 addr = FetchAddr[16]; + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + u32 line = QueuedDCacheLine; + u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; + DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; + + // timing logic + + if (NDS.ARM9Regions[addr>>14] == Mem9_MainRAM) + { + MRTrack.Type = MainRAMType::DCacheStream; + MRTrack.Var = line; + + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] + DCacheStreamPtr = 7; + else DCacheStreamPtr = (addr & 0x1F) / 4; + + QueueFunction(DelayedQueue); + } + else + { + for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) + { + ptr[i >> 2] = BusRead32(tag+i); + } + // Disabled DCACHE Streaming: + // Wait until the entire cache line is filled before continuing with execution + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] + { + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1] + stall) + ((MemTimings[tag >> 14][2] + 1) * ((DCACHE_LINELENGTH / 4) - 1)); + DataCycles = MemTimings[tag>>14][2]; // checkme + + DataRegion = NDS.ARM9Regions[addr>>14]; + if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + NDS.ARM9Timestamp += 1<>14]; + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>14][1] + stall; + u8 seq = MemTimings[addr>>14][2] + 1; + + u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually + + u64 cycles = ns + (seq * linepos); + DataCycles = 3<> 2]; + (this->*DelayedQueue)(); + } +} + +bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) +{ + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + + //Log(LogLevel::Debug, "Cache write 32: %08lx <= %08lx\n", addr, val); + +#if defined(__x86_64__) + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop + + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer + + if (!set) return false; // check if none of them were a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match + + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return false; + else set = __builtin_ctz(set) >> 4; + + { +#else + // fallback for loop; slow + for (int set = 0; set < DCACHE_SETS; set++) + { + if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) +#endif + { + u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; + cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 2] = val; + NDS.ARM9Timestamp += DataCycles = 1; + DataRegion = Mem9_DCache; + #if !DISABLE_CACHEWRITEBACK + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) + { + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF; + } + // just mark dirty and abort the data write through the bus + return true; + } + #endif + return false; + } + } + return false; +} + +bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) +{ + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + //Log(LogLevel::Debug, "Cache write 16: %08lx <= %04x\n", addr, val); + +#if defined(__x86_64__) + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop + + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer + + if (!set) return false; // check if none of them were a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match + + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return false; + else set = __builtin_ctz(set) >> 4; + + { +#else + // fallback for loop; slow + for (int set = 0; set < DCACHE_SETS; set++) + { + if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) +#endif + { + u16 *cacheLine = (u16 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; + cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 1] = val; + NDS.ARM9Timestamp += DataCycles = 1; + DataRegion = Mem9_DCache; + #if !DISABLE_CACHEWRITEBACK + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) + { + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF; + } + // just mark dirtyand abort the data write through the bus + return true; + } + #endif + return false; + } + } + return false; +} + +bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) +{ + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + + //Log(LogLevel::Debug, "Cache write 8: %08lx <= %02x\n", addr, val); + +#if defined(__x86_64__) + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop + + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer + + if (!set) return false; // check if none of them were a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match + + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return false; + else set = __builtin_ctz(set) >> 4; + + { +#else + // fallback for loop; slow + for (int set = 0; set < DCACHE_SETS; set++) + { + if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) +#endif + { + u8 *cacheLine = &DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; + cacheLine[addr & (DCACHE_LINELENGTH-1)] = val; + NDS.ARM9Timestamp += DataCycles = 1; + DataRegion = Mem9_DCache; + #if !DISABLE_CACHEWRITEBACK + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) + { + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF; + } + + // just mark dirty and abort the data write through the bus + return true; + } + #endif + return false; + } + } + return false; +} + +void ARMv5::DCacheInvalidateByAddr(const u32 addr) +{ + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + +#if defined(__x86_64__) + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop + + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer + + if (!set) return; // check if none of them were a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match + + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return; + else set = __builtin_ctz(set) >> 4; + + { +#else + // fallback for loop; slow + for (int set = 0; set < DCACHE_SETS; set++) + { + if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) +#endif + { + //Log(LogLevel::Debug,"DCache invalidated %08lx\n", addr & ~(ICACHE_LINELENGTH-1)); + DCacheTags[id+set] &= ~CACHE_FLAG_VALID; + return; + } + } +} + +void ARMv5::DCacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine) +{ + if (cacheSet >= DCACHE_SETS) + return; + if (cacheLine >= DCACHE_LINESPERSET) + return; + + u32 idx = (cacheLine << DCACHE_SETS_LOG2) + cacheSet; + DCacheTags[idx] &= ~CACHE_FLAG_VALID; +} + + +void ARMv5::DCacheInvalidateAll() +{ + #pragma GCC ivdep + for (int i = 0; i < DCACHE_SIZE / DCACHE_LINELENGTH; i++) + DCacheTags[i] &= ~CACHE_FLAG_VALID; +} + +void ARMv5::DCacheClearAll() +{ + #if !DISABLE_CACHEWRITEBACK + for (int set = 0; set < DCACHE_SETS; set++) + for (int line = 0; line <= DCACHE_LINESPERSET; line++) + DCacheClearByASetAndWay(set, line); + #endif +} + +void ARMv5::DCacheClearByAddr(const u32 addr) +{ + #if !DISABLE_CACHEWRITEBACK + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + + for (int set = 0; set < DCACHE_SETS; set++) + { + if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) + { + DCacheClearByASetAndWay(set, id >> DCACHE_SETS_LOG2); + return; + } + } + #endif +} + +void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) +{ + #if !DISABLE_CACHEWRITEBACK + const u32 index = cacheSet | (cacheLine << DCACHE_SETS_LOG2); + + // Only write back if valid + if (!(DCacheTags[index] & CACHE_FLAG_VALID)) + return; + + const u32 tag = DCacheTags[index] & ~CACHE_FLAG_MASK; + u32* ptr = (u32 *)&DCache[index << DCACHE_LINELENGTH_LOG2]; + + if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) + { + if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + WriteBufferWrite(tag, 4); + WriteBufferWrite(ptr[0], 2, tag+0x00); + WriteBufferWrite(ptr[1], 3, tag+0x04); + WriteBufferWrite(ptr[2], 3, tag+0x08); + WriteBufferWrite(ptr[3], 3, tag+0x0C); + //NDS.ARM9Timestamp += 4; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? + } + if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) // todo: check how this behaves when both fields need to be written + { + if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) + { + WriteBufferWrite(ptr[4], 3, tag+0x10); + } + else + { + WriteBufferWrite(tag+0x10, 4); + WriteBufferWrite(ptr[4], 2, tag+0x10); + } + WriteBufferWrite(ptr[5], 3, tag+0x14); + WriteBufferWrite(ptr[6], 3, tag+0x18); + WriteBufferWrite(ptr[7], 3, tag+0x1C); + //NDS.ARM9Timestamp += 4; + } + DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); + #endif +} + +bool ARMv5::IsAddressDCachable(const u32 addr) const +{ + return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEABLE; +} + +#define A9WENTLAST (!NDS.MainRAMLastAccess) +#define A7WENTLAST ( NDS.MainRAMLastAccess) +#define A9LAST false +#define A7LAST true +#define A9PRIORITY !(NDS.ExMemCnt[0] & 0x8000) +#define A7PRIORITY (NDS.ExMemCnt[0] & 0x8000) + +template +bool ARMv5::WriteBufferHandle() +{ + while (true) + { + if (WBWriting) + { + if ((mode == WBMode::Check) && ((NDS.A9ContentionTS << NDS.ARM9ClockShift) > NDS.ARM9Timestamp)) return true; + // look up timings + // TODO: handle interrupted bursts? + u32 cycles; + switch (WBCurVal >> 61) + { + case 0: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } + if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } + cycles = 4; + NDS.MainRAMTimestamp = NDS.A9ContentionTS + 9; + NDS.MainRAMLastAccess = A9LAST; + } + else cycles = NDS.ARM9MemTimings[WBCurAddr>>14][0]; // todo: twl timings + break; + } + case 1: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } + if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } + NDS.MainRAMTimestamp = NDS.A9ContentionTS + 8; + cycles = 3; + NDS.MainRAMLastAccess = A9LAST; + } + else cycles = NDS.ARM9MemTimings[WBCurAddr>>14][0]; // todo: twl timings + break; + } + case 3: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } + if (A9WENTLAST) + { + NDS.MainRAMTimestamp += 2; + cycles = 2; + break; + } + } + else + { + cycles = NDS.ARM9MemTimings[WBCurAddr>>14][3]; + break; + } + } + case 2: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } + if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } + NDS.MainRAMTimestamp = NDS.A9ContentionTS + 9; + cycles = 4; + NDS.MainRAMLastAccess = A9LAST; + } + else cycles = NDS.ARM9MemTimings[WBCurAddr>>14][2]; // todo: twl timings + break; + } + } + + NDS.A9ContentionTS += cycles; + WBReleaseTS = (NDS.A9ContentionTS << NDS.ARM9ClockShift) - 1; + if (NDS.ARM9Regions[WBCurAddr>>14] != Mem9_MainRAM && ((WBCurVal >> 61) != 3)) + { + NDS.A9ContentionTS += 1; + WBTimestamp = WBReleaseTS + 2; // todo: twl timings + } + else + { + WBTimestamp = WBReleaseTS; + } + if (WBWritePointer != 16 && (WriteBufferFifo[WBWritePointer] >> 61) != 3) WBInitialTS = WBTimestamp; + + switch (WBCurVal >> 61) + { + case 0: // byte + BusWrite8 (WBCurAddr, WBCurVal); + break; + case 1: // halfword + BusWrite16(WBCurAddr, WBCurVal); + break; + case 2: // word + case 3: + BusWrite32(WBCurAddr, WBCurVal); + break; + default: // invalid + Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE NONSENSE VIA THE WRITE BUFFER! PANIC!!! Flag: %i\n", (u8)(WBCurVal >> 61)); + break; + } + + WBLastRegion = NDS.ARM9Regions[WBCurAddr>>14]; + WBWriting = false; + if ((mode == WBMode::SingleBurst) && ((WriteBufferFifo[WBWritePointer] >> 61) != 3)) return true; + } + + // check if write buffer is empty + if (WBWritePointer == 16) return true; + + // attempt to drain write buffer + if ((WriteBufferFifo[WBWritePointer] >> 61) != 4) // not an address + { + if (WBInitialTS > NDS.ARM9Timestamp) + { + if (mode == WBMode::Check) return true; + else NDS.ARM9Timestamp = WBInitialTS; + } + + //if ((WriteBufferFifo[WBWritePointer] >> 61) == 3) WBCurAddr+=4; // TODO + //if (storeaddr[WBWritePointer] != WBCurAddr) printf("MISMATCH: %08X %08X\n", storeaddr[WBWritePointer], WBCurAddr); + + WBCurAddr = storeaddr[WBWritePointer]; + WBCurVal = WriteBufferFifo[WBWritePointer]; + WBWriting = true; + } + else + { + //WBCurAddr = (u32)WriteBufferFifo[WBWritePointer]; // TODO + } + + WBWritePointer = (WBWritePointer + 1) & 0xF; + if (WBWritePointer == WBFillPointer) + { + WBWritePointer = 16; + WBFillPointer = 0; + } + if ((mode == WBMode::WaitEntry) && (WBWritePointer != WBFillPointer)) return true; + } +} +template bool ARMv5::WriteBufferHandle(); +template bool ARMv5::WriteBufferHandle(); +template bool ARMv5::WriteBufferHandle(); +template bool ARMv5::WriteBufferHandle(); + +#undef A9WENTLAST +#undef A7WENTLAST +#undef A9LAST +#undef A7LAST +#undef A9PRIORITY +#undef A7PRIORITY + +template +void ARMv5::WriteBufferCheck() +{ + if ((WBWritePointer != 16) || WBWriting) + { + if constexpr (next == 0) + { + MRTrack.Type = MainRAMType::WBCheck; + } + else if constexpr (next == 2) + { + MRTrack.Type = MainRAMType::WBWaitWrite; + } + else + { + MRTrack.Type = MainRAMType::WBWaitRead; + } + } + /* + while (!WriteBufferHandle<0>()); // loop until we've cleared out all writeable entries + + if constexpr (next == 1 || next == 3) // check if the next write is occuring + { + if (NDS.ARM9Timestamp >= WBInitialTS)// + (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM)))// || ((NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) && WBWriting)) + { + u64 tsold = NDS.ARM9Timestamp; + while(!WriteBufferHandle<2>()); + + //if constexpr (next == 3) NDS.ARM9Timestamp = std::max(tsold, NDS.ARM9Timestamp - (2<= WBInitialTS) + while(!WriteBufferHandle<2>()); + }*/ +} +template void ARMv5::WriteBufferCheck<3>(); +template void ARMv5::WriteBufferCheck<2>(); +template void ARMv5::WriteBufferCheck<1>(); +template void ARMv5::WriteBufferCheck<0>(); + +void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) +{ + MRTrack.Type = MainRAMType::WBWrite; + WBAddrQueued[MRTrack.Var] = addr; + WBValQueued[MRTrack.Var++] = val | (u64)flag << 61; + /*switch (flag) + { + case 0: // byte + BusWrite8 (addr, val); + break; + case 1: // halfword + BusWrite16(addr, val); + break; + case 2: // word + case 3: + BusWrite32(addr, val); + break; + default: // invalid + //Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE NONSENSE VIA THE WRITE BUFFER! PANIC!!! Flag: %i\n", (u8)(WBCurVal >> 61)); + break; + }*/ + /*WriteBufferCheck<0>(); + + if (WBFillPointer == WBWritePointer) // if the write buffer is full then we stall the cpu until room is made + WriteBufferHandle<1>(); + else if (WBWritePointer == 16) // indicates empty write buffer + { + WBWritePointer = 0; + if (!WBWriting) + { + u64 ts = ((NDS.ARM9Regions[addr>>14] == Mem9_MainRAM) ? std::max(MainRAMTimestamp, (NDS.ARM9Timestamp + 1)) : (NDS.ARM9Timestamp + 1)); + + if (!WBWriting && (WBTimestamp < ((ts + ((1<()); // loop until drained fully +} void ARMv5::CP15Write(u32 id, u32 val) { //if(id!=0x704)printf("CP15 write op %03X %08X %08X\n", id, val, R[15]); - switch (id) + switch (id & 0xFFF) { case 0x100: { u32 old = CP15Control; - val &= 0x000FF085; - CP15Control &= ~0x000FF085; - CP15Control |= val; - //printf("CP15Control = %08X (%08X->%08X)\n", CP15Control, old, val); + CP15Control = (CP15Control & ~CP15_CR_CHANGEABLE_MASK) | (val & CP15_CR_CHANGEABLE_MASK); + //Log(LogLevel::Debug, "CP15Control = %08X (%08X->%08X)\n", CP15Control, old, val); UpdateDTCMSetting(); UpdateITCMSetting(); - if ((old & 0x1005) != (val & 0x1005)) + u32 changedBits = old ^ CP15Control; + if (changedBits & (CP15_CR_MPUENABLE | CP15_CACHE_CR_ICACHEENABLE| CP15_CACHE_CR_DCACHEENABLE)) { - UpdatePURegions((old & 0x1) != (val & 0x1)); + UpdatePURegions(changedBits & CP15_CR_MPUENABLE); } - if (val & (1<<7)) Log(LogLevel::Warn, "!!!! ARM9 BIG ENDIAN MODE. VERY BAD. SHIT GONNA ASPLODE NOW\n"); - if (val & (1<<13)) ExceptionBase = 0xFFFF0000; - else ExceptionBase = 0x00000000; + if (val & CP15_CR_BIGENDIAN) Log(LogLevel::Warn, "!!!! ARM9 BIG ENDIAN MODE. VERY BAD. SHIT GONNA ASPLODE NOW\n"); + if (val & CP15_CR_HIGHEXCEPTIONBASE) ExceptionBase = CP15_EXCEPTIONBASE_HIGH; + else ExceptionBase = CP15_EXCEPTIONBASE_LOW; } return; - case 0x200: // data cacheable { u32 diff = PU_DataCacheable ^ val; PU_DataCacheable = val; - for (u32 i = 0; i < 8; i++) - { - if (diff & (1<> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); + + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries. On HW the changed + // access permissions would not be applied because of a higher priority + // region overwriting them. + // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + + u32 diff = old ^ PU_DataRW; + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + u32 diff = old ^ PU_DataRW; + if (diff) UpdatePURegions(true); + #endif } return; @@ -524,19 +1546,31 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 old = PU_CodeRW; PU_CodeRW = 0; - PU_CodeRW |= (val & 0x0003); - PU_CodeRW |= ((val & 0x000C) << 2); - PU_CodeRW |= ((val & 0x0030) << 4); - PU_CodeRW |= ((val & 0x00C0) << 6); - PU_CodeRW |= ((val & 0x0300) << 8); - PU_CodeRW |= ((val & 0x0C00) << 10); - PU_CodeRW |= ((val & 0x3000) << 12); - PU_CodeRW |= ((val & 0xC000) << 14); - u32 diff = old ^ PU_CodeRW; - for (u32 i = 0; i < 8; i++) - { - if (diff & (0xF<<(i*4))) UpdatePURegion(i); - } + #pragma GCC ivdep + #pragma GCC unroll 8 + for (int i = 0; i < CP15_REGION_COUNT; i++) + PU_CodeRW |= (val >> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); + + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries, because it + // would on HW be overridden by the higher priority region + // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + + u32 diff = old ^ PU_CodeRW; + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + u32 diff = old ^ PU_CodeRW; + if (diff) UpdatePURegions(true); + #endif } return; @@ -544,10 +1578,23 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 diff = PU_DataRW ^ val; PU_DataRW = val; - for (u32 i = 0; i < 8; i++) - { - if (diff & (0xF<<(i*4))) UpdatePURegion(i); - } + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries, because it + // would on HW be overridden by the higher priority region + // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + if (diff) UpdatePURegions(true); + #endif } return; @@ -555,10 +1602,23 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 diff = PU_CodeRW ^ val; PU_CodeRW = val; - for (u32 i = 0; i < 8; i++) - { - if (diff & (0xF<<(i*4))) UpdatePURegion(i); - } + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries, because it + // would on HW be overridden by the higher priority region + // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + if (diff) UpdatePURegions(true); + #endif } return; @@ -579,109 +1639,326 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x661: case 0x670: case 0x671: - char log_output[1024]; - PU_Region[(id >> 4) & 0xF] = val; + { + char log_output[1024]; + u32 old = PU_Region[(id >> 4) & 0xF]; + PU_Region[(id >> 4) & 0xF] = val & ~(0x3F<<6); + u32 diff = old ^ PU_Region[(id >> 4) & 0xF]; - std::snprintf(log_output, - sizeof(log_output), - "PU: region %d = %08X : %s, start: %08X size: %02X\n", - (id >> 4) & 0xF, - val, - val & 1 ? "enabled" : "disabled", - val & 0xFFFFF000, - (val & 0x3E) >> 1 - ); - Log(LogLevel::Debug, "%s", log_output); - // Some implementations of Log imply a newline, so we build up the line before printing it - - // TODO: smarter region update for this? - UpdatePURegions(true); - return; + std::snprintf(log_output, + sizeof(log_output), + "PU: region %d = %08X : %s, start: %08X size: %02X\n", + (id >> 4) & 0xF, + val, + val & 1 ? "enabled" : "disabled", + val & 0xFFFFF000, + (val & 0x3E) >> 1 + ); + // TODO: smarter region update for this? + if (diff) UpdatePURegions(true); + return; + } case 0x704: case 0x782: + //WriteBufferDrain(); // checkme Halt(1); return; case 0x750: + // Can be executed in user and priv mode ICacheInvalidateAll(); - //Halt(255); return; case 0x751: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } ICacheInvalidateByAddr(val); //Halt(255); return; - case 0x752: - Log(LogLevel::Warn, "CP15: ICACHE INVALIDATE WEIRD. %08X\n", val); + /*case 0x752: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + // Cache invalidat by line number and set number + u8 cacheSet = val >> (32 - ICACHE_SETS_LOG2) & (ICACHE_SETS -1); + u8 cacheLine = (val >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET -1); + ICacheInvalidateBySetAndWay(cacheSet, cacheLine); + } //Halt(255); return; + */ - - case 0x761: + case 0x760: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + DCacheInvalidateAll(); //printf("inval data cache %08X\n", val); return; - case 0x762: + case 0x761: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + DCacheInvalidateByAddr(val); //printf("inval data cache SI\n"); return; - + /*case 0x762: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + // Cache invalidat by line number and set number + u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheInvalidateBySetAndWay(cacheSet, cacheLine); + } + return; + */ + /*case 0x770: + // invalidate both caches + // can be called from user and privileged + ICacheInvalidateAll(); + DCacheInvalidateAll(); + break; + */ + /*case 0x7A0: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + //Log(LogLevel::Debug,"clean data cache\n"); + DCacheClearAll(); + return;*/ case 0x7A1: - //printf("flush data cache %08X\n", val); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + //Log(LogLevel::Debug,"clean data cache MVA\n");= + CP15Queue = val; + QueueFunction(&ARMv5::DCClearAddr_2); return; case 0x7A2: - //printf("flush data cache SI\n"); + //Log(LogLevel::Debug,"clean data cache SET/WAY\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + // Cache invalidat by line number and set number + CP15Queue = val; + QueueFunction(&ARMv5::DCClearSetWay_2); + } + return; + case 0x7A3: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + // Test and clean (optional) + // Is not present on the NDS/DSi return; + case 0x7A4: + // Can be used in user and privileged mode + // Drain Write Buffer: Stall until all write back completed + QueueFunction(&ARMv5::WriteBufferDrain); + return; + + case 0x7D1: + //Log(LogLevel::Debug,"Prefetch instruction cache MVA\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + // we force a fill by looking up the value from cache + // if it wasn't cached yet, it will be loaded into cache + // low bits are set to 0x1C to trick cache streaming + CP15Queue = val; + DelayedQueue = nullptr; + QueueFunction(&ARMv5::ICachePrefetch_2); + return; + + /*case 0x7E0: + //Log(LogLevel::Debug,"clean & invalidate data cache\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + DCacheClearAll(); + DCacheInvalidateAll(); + return;*/ + case 0x7E1: + //Log(LogLevel::Debug,"clean & invalidate data cache MVA\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + CP15Queue = val; + QueueFunction(&ARMv5::DCClearInvalidateAddr_2); + return; + case 0x7E2: + //Log(LogLevel::Debug,"clean & invalidate data cache SET/WAY\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + // Cache invalidat by line number and set number + CP15Queue = val; + QueueFunction(&ARMv5::DCClearInvalidateSetWay_2); + } + return; + + case 0x900: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + // Cache Lockdown - Format B + // Bit 31: Lock bit + // Bit 0..Way-1: locked ways + // The Cache is 4 way associative + // But all bits are r/w + DCacheLockDown = val; + Log(LogLevel::Debug,"DCacheLockDown\n"); + return; + case 0x901: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } + // Cache Lockdown - Format B + // Bit 31: Lock bit + // Bit 0..Way-1: locked ways + // The Cache is 4 way associative + // But all bits are r/w + ICacheLockDown = val; + Log(LogLevel::Debug,"ICacheLockDown\n"); + return; case 0x910: - DTCMSetting = val & 0xFFFFF03E; + DTCMSetting = val & (CP15_DTCM_BASE_MASK | CP15_DTCM_SIZE_MASK); UpdateDTCMSetting(); return; case 0x911: - ITCMSetting = val & 0x0000003E; + ITCMSetting = val & (CP15_ITCM_BASE_MASK | CP15_ITCM_SIZE_MASK); UpdateITCMSetting(); return; case 0xD01: - TraceProcessID = val; + case 0xD11: + CP15TraceProcessId = val; return; case 0xF00: - //printf("cache debug index register %08X\n", val); + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + if (((id >> 12) & 0x0f) == 0x03) + CacheDebugRegisterIndex = val; + else if (((id >> 12) & 0x0f) == 0x00) + CP15BISTTestStateRegister = val; + else + { + return ARMInterpreter::A_UNK(this); + } + + } return; case 0xF10: - //printf("cache debug instruction tag %08X\n", val); - return; + // instruction cache Tag register + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + ICacheTags[(index << ICACHE_SETS_LOG2) + segment] = val; + } case 0xF20: - //printf("cache debug data tag %08X\n", val); - return; + // data cache Tag register + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + DCacheTags[(index << DCACHE_SETS_LOG2) + segment] = val; + } + case 0xF30: //printf("cache debug instruction cache %08X\n", val); + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + *(u32 *)&ICache[(((index << ICACHE_SETS_LOG2) + segment) << ICACHE_LINELENGTH_LOG2) + wordAddress*4] = val; + } return; case 0xF40: //printf("cache debug data cache %08X\n", val); + if (PU_Map != PU_PrivMap) + { + return ARMInterpreter::A_UNK(this); + } else + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + *(u32 *)&DCache[(((index << DCACHE_SETS_LOG2) + segment) << DCACHE_LINELENGTH_LOG2) + wordAddress*4] = val; + } return; } - if ((id & 0xF00) == 0xF00) // test/debug shit? - return; - - if ((id & 0xF00) != 0x700) - Log(LogLevel::Debug, "unknown CP15 write op %03X %08X\n", id, val); + Log(LogLevel::Debug, "unknown CP15 write op %04X %08X\n", id, val); } -u32 ARMv5::CP15Read(u32 id) const +u32 ARMv5::CP15Read(const u32 id) const { //printf("CP15 read op %03X %08X\n", id, NDS::ARM9->R[15]); - switch (id) + switch (id & 0xFFF) { case 0x000: // CPU ID case 0x003: @@ -689,13 +1966,15 @@ u32 ARMv5::CP15Read(u32 id) const case 0x005: case 0x006: case 0x007: - return 0x41059461; + return CP15_MAINID_IMPLEMENTOR_ARM | CP15_MAINID_VARIANT_0 | CP15_MAINID_ARCH_v5TE | CP15_MAINID_IMPLEMENTATION_946 | CP15_MAINID_REVISION_1; case 0x001: // cache type - return 0x0F0D2112; + return CACHE_TR_LOCKDOWN_TYPE_B | CACHE_TR_NONUNIFIED + | (DCACHE_LINELENGTH_ENCODED << 12) | (DCACHE_SETS_LOG2 << 15) | ((DCACHE_SIZE_LOG2 - 9) << 18) + | (ICACHE_LINELENGTH_ENCODED << 0) | (ICACHE_SETS_LOG2 << 3) | ((ICACHE_SIZE_LOG2 - 9) << 6); case 0x002: // TCM size - return (6 << 6) | (5 << 18); + return CP15_TCMSIZE_ITCM_32KB | CP15_TCMSIZE_DTCM_16KB; case 0x100: // control reg @@ -707,33 +1986,30 @@ u32 ARMv5::CP15Read(u32 id) const case 0x201: return PU_CodeCacheable; case 0x300: - return PU_DataCacheWrite; + return PU_WriteBufferability; case 0x500: { + // this format has 2 bits per region, but we store 4 per region + // so we reduce and consoldate the bits + // 0x502 returns all 4 bits per region u32 ret = 0; - ret |= (PU_DataRW & 0x00000003); - ret |= ((PU_DataRW & 0x00000030) >> 2); - ret |= ((PU_DataRW & 0x00000300) >> 4); - ret |= ((PU_DataRW & 0x00003000) >> 6); - ret |= ((PU_DataRW & 0x00030000) >> 8); - ret |= ((PU_DataRW & 0x00300000) >> 10); - ret |= ((PU_DataRW & 0x03000000) >> 12); - ret |= ((PU_DataRW & 0x30000000) >> 14); + #pragma GCC ivdep + #pragma GCC unroll 8 + for (int i = 0; i < CP15_REGION_COUNT; i++) + ret |= (PU_DataRW >> (i * CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK) << (i*2); return ret; } case 0x501: { + // this format has 2 bits per region, but we store 4 per region + // so we reduce and consoldate the bits + // 0x503 returns all 4 bits per region u32 ret = 0; - ret |= (PU_CodeRW & 0x00000003); - ret |= ((PU_CodeRW & 0x00000030) >> 2); - ret |= ((PU_CodeRW & 0x00000300) >> 4); - ret |= ((PU_CodeRW & 0x00003000) >> 6); - ret |= ((PU_CodeRW & 0x00030000) >> 8); - ret |= ((PU_CodeRW & 0x00300000) >> 10); - ret |= ((PU_CodeRW & 0x03000000) >> 12); - ret |= ((PU_CodeRW & 0x30000000) >> 14); + #pragma GCC unroll 8 + for (int i = 0; i < CP15_REGION_COUNT; i++) + ret |= (PU_CodeRW >> (i * CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK) << (i*2); return ret; } case 0x502: @@ -760,263 +2036,880 @@ u32 ARMv5::CP15Read(u32 id) const case 0x671: return PU_Region[(id >> 4) & 0xF]; + case 0x7A6: + // read Cache Dirty Bit (optional) + // it is not present on the NDS/DSi + return 0; + + case 0x900: + if (PU_Map != PU_PrivMap) + { + return 0; + } else + return DCacheLockDown; + case 0x901: + if (PU_Map != PU_PrivMap) + { + return 0; + } else + return ICacheLockDown; case 0x910: return DTCMSetting; case 0x911: return ITCMSetting; - case 0xD01: - return TraceProcessID; + case 0xD01: // See arm946E-S Rev 1 technical Reference Manual, Chapter 2.3.13 */ + case 0xD11: // backwards compatible read/write of the same register + return CP15TraceProcessId; + + case 0xF00: + if (PU_Map != PU_PrivMap) + { + return 0; + } else + { + if (((id >> 12) & 0x0f) == 0x03) + return CacheDebugRegisterIndex; + if (((id >> 12) & 0x0f) == 0x00) + return CP15BISTTestStateRegister; + } + case 0xF10: + // instruction cache Tag register + if (PU_Map != PU_PrivMap) + { + return 0; + } else + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + Log(LogLevel::Debug, "Read ICache Tag %08lx -> %08lx\n", CacheDebugRegisterIndex, ICacheTags[(index << ICACHE_SETS_LOG2) + segment]); + return ICacheTags[(index << ICACHE_SETS_LOG2) + segment]; + } + case 0xF20: + // data cache Tag register + if (PU_Map != PU_PrivMap) + { + return 0; + } else + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + Log(LogLevel::Debug, "Read DCache Tag %08lx (%u, %02x, %u) -> %08lx\n", CacheDebugRegisterIndex, segment, index, wordAddress, DCacheTags[(index << DCACHE_SETS_LOG2) + segment]); + return DCacheTags[(index << DCACHE_SETS_LOG2) + segment]; + } + case 0xF30: + if (PU_Map != PU_PrivMap) + { + return 0; + } else + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + return *(u32 *)&ICache[(((index << ICACHE_SETS_LOG2) + segment) << ICACHE_LINELENGTH_LOG2) + wordAddress*4]; + } + case 0xF40: + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + return *(u32 *)&DCache[(((index << DCACHE_SETS_LOG2) + segment) << DCACHE_LINELENGTH_LOG2) + wordAddress*4]; + } } - if ((id & 0xF00) == 0xF00) // test/debug shit? - return 0; - - Log(LogLevel::Debug, "unknown CP15 read op %03X\n", id); + Log(LogLevel::Debug, "unknown CP15 read op %04X\n", id); return 0; } - - -// TCM are handled here. -// TODO: later on, handle PU, and maybe caches - -u32 ARMv5::CodeRead32(u32 addr, bool branch) +void ARMv5::ICachePrefetch_2() { - /*if (branch || (!(addr & 0xFFF))) - { - if (!(PU_Map[addr>>12] & 0x04)) - { - PrefetchAbort(); - return 0; - } - }*/ - - if (addr < ITCMSize) - { - CodeCycles = 1; - return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - } - - CodeCycles = RegionCodeCycles; - if (CodeCycles == 0xFF) // cached memory. hax - { - if (branch || !(addr & 0x1F)) - CodeCycles = kCodeCacheTiming;//ICacheLookup(addr); - else - CodeCycles = 1; - - //return *(u32*)&CurICacheLine[addr & 0x1C]; - } - - if (CodeMem.Mem) return *(u32*)&CodeMem.Mem[addr & CodeMem.Mask]; - - return BusRead32(addr); + u32 val = CP15Queue; + ICacheLookup((val & ~0x03) | 0x1C); } - -void ARMv5::DataRead8(u32 addr, u32* val) +void ARMv5::DCClearAddr_2() { - if (!(PU_Map[addr>>12] & 0x01)) - { - DataAbort(); + u32 val = CP15Queue; + DCacheClearByAddr(val); +} + +void ARMv5::DCClearSetWay_2() +{ + u32 val = CP15Queue; + u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheClearByASetAndWay(cacheSet, cacheLine); +} + +void ARMv5::DCClearInvalidateAddr_2() +{ + u32 val = CP15Queue; + DCacheClearByAddr(val); + DCacheInvalidateByAddr(val); +} + +void ARMv5::DCClearInvalidateSetWay_2() +{ + u32 val = CP15Queue; + u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheClearByASetAndWay(cacheSet, cacheLine); + DCacheInvalidateBySetAndWay(cacheSet, cacheLine); +} + +// TCM are handled here. +// TODO: later on, handle PU + +void ARMv5::CodeRead32(u32 addr) +{ + // prefetch abort + // the actual exception is not raised until the aborted instruction is executed + if (!(PU_Map[addr>>12] & CP15_MAP_EXECUTABLE)) [[unlikely]] + { + NDS.ARM9Timestamp += 1; + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; + DataRegion = Mem9_Null; + Store = false; + RetVal = ((u64)1<<63); + QueueFunction(DelayedQueue); return; } - DataRegion = addr; + if (addr < ITCMSize) + { + if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; + NDS.ARM9Timestamp += 1; + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; + DataRegion = Mem9_Null; + Store = false; + RetVal = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + QueueFunction(DelayedQueue); + return; + } + + #if !DISABLE_ICACHE + #ifdef JIT_ENABLED + //if (!NDS.IsJITEnabled()) + #endif + { + if (IsAddressICachable(addr)) + { + if (ICacheLookup(addr)) return; + } + #endif + } + + FetchAddr[16] = addr; + QueueFunction(&ARMv5::CodeRead32_2); +} + +void ARMv5::CodeRead32_2() +{ + //if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with dcache streaming by 6 cycles + if (DCacheStreamPtr < 7) + { + u64 time = DCacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + + if (PU_Map[FetchAddr[16]>>12] & 0x30) + WriteBufferDrain(); + else + WriteBufferCheck<3>(); + + QueueFunction(&ARMv5::CodeRead32_3); +} + +void ARMv5::CodeRead32_3() +{ + u32 addr = FetchAddr[16]; + + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + FetchAddr[16] = addr; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRCodeFetch | MR32; + + QueueFunction(DelayedQueue); + } + else + { + if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + NDS.ARM9Timestamp += 1<> 14][1]; + + NDS.DMA9Timestamp = NDS.ARM9Timestamp += cycles; + + if (WBTimestamp < ((NDS.ARM9Timestamp - (3<> (8 * (addr & 3))) & 0xff; +} + +bool ARMv5::DataRead8(u32 addr, u8 reg) +{ + // Data Aborts + // Exception is handled in the actual instruction implementation + if (!(PU_Map[addr>>12] & CP15_MAP_READABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; + } + + FetchAddr[reg] = addr; + LDRRegs = 1<> 12][1]; + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + //if (!NDS.IsJITEnabled()) + #endif + { + if (IsAddressDCachable(addr)) + { + DelayedQueue = &ARMv5::DCacheFin8; + if (DCacheLookup(addr)) return; + } + } + #endif + + QueueFunction(&ARMv5::DRead8_3); } -void ARMv5::DataRead16(u32 addr, u32* val) +void ARMv5::DRead8_3() { - if (!(PU_Map[addr>>12] & 0x01)) + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<>12] & 0x30) + WriteBufferDrain(); + else + WriteBufferCheck<1>(); + + QueueFunction(&ARMv5::DRead8_4); +} + +void ARMv5::DRead8_4() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR8; + MRTrack.Progress = reg; + } + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<> 14][0]; + DataCycles = 3<> (8 * (addr & 2))) & 0xffff; +} + +bool ARMv5::DataRead16(u32 addr, u8 reg) +{ + // Data Aborts + // Exception is handled in the actual instruction implementation + if (!(PU_Map[addr>>12] & CP15_MAP_READABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; + } + + FetchAddr[reg] = addr; + LDRRegs = 1<>12] & 0x30) + WriteBufferDrain(); + else + WriteBufferCheck<1>(); + + QueueFunction(&ARMv5::DRead16_4); +} + +void ARMv5::DRead16_4() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR16; + MRTrack.Progress = reg; + } + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<> 12][1]; + + NDS.DMA9Timestamp = NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; + DataCycles = 3<>12] & 0x01)) - { - DataAbort(); - return; - } + u8 reg = __builtin_ctz(LDRRegs); + u32 dummy; u32* val = (LDRFailedRegs & (1<>12] & CP15_MAP_READABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; + } + + FetchAddr[reg] = addr; + LDRRegs = 1<> 12][2]; + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + //if (!NDS.IsJITEnabled()) + #endif + { + if (IsAddressDCachable(addr)) + { + DelayedQueue = &ARMv5::DCacheFin32; + if (DCacheLookup(addr)) return; + } + } + #endif + + QueueFunction(&ARMv5::DRead32_3); } -void ARMv5::DataRead32S(u32 addr, u32* val) +void ARMv5::DRead32_3() { + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<>12] & 0x30) + WriteBufferDrain(); + else + WriteBufferCheck<1>(); + + QueueFunction(&ARMv5::DRead32_4); +} + +void ARMv5::DRead32_4() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32; + MRTrack.Progress = reg; + + LDRRegs &= ~1<>14]; + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<> 14][1]; + DataCycles = 3<>12] & CP15_MAP_READABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; + } + + FetchAddr[reg] = addr; + LDRRegs |= 1<> 12][3]; + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + //if (!NDS.IsJITEnabled()) + #endif + { + if (IsAddressDCachable(addr)) + { + DelayedQueue = &ARMv5::DCacheFin32; + if (DCacheLookup(addr)) return; + } + } + #endif + + QueueFunction(&ARMv5::DRead32S_3); } -void ARMv5::DataWrite8(u32 addr, u8 val) +void ARMv5::DRead32S_3() { - if (!(PU_Map[addr>>12] & 0x02)) + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<>12] & 0x30) // checkme + WriteBufferDrain(); + else + WriteBufferCheck<1>(); + + QueueFunction(&ARMv5::DRead32S_4); +} + +void ARMv5::DRead32S_4() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32 | MRSequential; + MRTrack.Progress = reg; + + LDRRegs &= ~1<>14]; + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32; + MRTrack.Progress = reg; + + LDRRegs &= ~1<>14]; + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>14][2]; + DataCycles = MemTimings[addr>>14][2]; + + if (WBTimestamp < ((NDS.ARM9Timestamp - (3<>14][1]; + DataCycles = 3<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; } - DataRegion = addr; + FetchAddr[reg] = addr; + STRRegs = 1<(addr); - return; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return; - } - - BusWrite8(addr, val); - DataCycles = MemTimings[addr >> 12][1]; -} - -void ARMv5::DataWrite16(u32 addr, u16 val) -{ - if (!(PU_Map[addr>>12] & 0x02)) - { - DataAbort(); - return; - } - - DataRegion = addr; - - addr &= ~1; - - if (addr < ITCMSize) - { - DataCycles = 1; - *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); - return; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return; - } - - BusWrite16(addr, val); - DataCycles = MemTimings[addr >> 12][1]; -} - -void ARMv5::DataWrite32(u32 addr, u32 val) -{ - if (!(PU_Map[addr>>12] & 0x02)) - { - DataAbort(); - return; - } - - DataRegion = addr; - - addr &= ~3; - - if (addr < ITCMSize) - { - DataCycles = 1; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); - return; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return; - } - - BusWrite32(addr, val); - DataCycles = MemTimings[addr >> 12][2]; -} - -void ARMv5::DataWrite32S(u32 addr, u32 val) -{ - addr &= ~3; - - if (addr < ITCMSize) - { - DataCycles += 1; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif @@ -1024,24 +2917,547 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) } if ((addr & DTCMMask) == DTCMBase) { - DataCycles += 1; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + NDS.ARM9Timestamp += DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return; } - BusWrite32(addr, val); - DataCycles += MemTimings[addr >> 12][3]; + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + //if (!NDS.IsJITEnabled()) + #endif + { + if (IsAddressDCachable(addr)) + { + if (DCacheWrite8(addr, val)) + return; + } + } + #endif + + if (!(PU_Map[addr>>12] & (0x30))) + { + QueueFunction(&ARMv5::DWrite8_3); + } + else + { + if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + WriteBufferWrite(addr, 4); + WriteBufferWrite(val, 0, addr); + } } -void ARMv5::GetCodeMemRegion(u32 addr, MemRegion* region) +void ARMv5::DWrite8_3() { - /*if (addr < ITCMSize) + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) { - region->Mem = ITCM; - region->Mask = 0x7FFF; - return; - }*/ + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + WriteBufferCheck<2>(); + QueueFunction(&ARMv5::DWrite8_4); +} + +void ARMv5::DWrite8_4() +{ + u8 reg = __builtin_ctz(STRRegs); + u32 addr = FetchAddr[reg]; + u8 val = STRVal[reg]; + + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR8; + MRTrack.Progress = reg; + } + else + { + QueueFunction(&ARMv5::DWrite8_5); + } +} + +void ARMv5::DWrite8_5() +{ + u8 reg = __builtin_ctz(STRRegs); + u32 addr = FetchAddr[reg]; + u8 val = STRVal[reg]; + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = (NDS.DMA9Timestamp + ((1<> 14][0] + 1; + + BusWrite8(addr, val); + NDS.DMA9Timestamp = NDS.ARM9Timestamp -= 1; + + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; + } + + FetchAddr[reg] = addr; + STRRegs = 1<(addr); +#endif + return; + } + if ((addr & DTCMMask) == DTCMBase) + { + NDS.ARM9Timestamp += DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return; + } + + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + //if (!NDS.IsJITEnabled()) + #endif + { + if (IsAddressDCachable(addr)) + { + if (DCacheWrite16(addr, val)) + return; + } + } + #endif + + if (!(PU_Map[addr>>12] & 0x30)) + { + QueueFunction(&ARMv5::DWrite16_3); + } + else + { + if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + WriteBufferWrite(addr, 4); + WriteBufferWrite(val, 1, addr); + } +} + +void ARMv5::DWrite16_3() +{ + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + + WriteBufferCheck<2>(); + QueueFunction(&ARMv5::DWrite16_4); +} + +void ARMv5::DWrite16_4() +{ + u8 reg = __builtin_ctz(STRRegs); + u32 addr = FetchAddr[reg]; + u16 val = STRVal[reg]; + + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR16; + MRTrack.Progress = reg; + } + else + { + QueueFunction(&ARMv5::DWrite16_5); + } +} + +void ARMv5::DWrite16_5() +{ + u8 reg = __builtin_ctz(STRRegs); + u32 addr = FetchAddr[reg]; + u16 val = STRVal[reg]; + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = (NDS.DMA9Timestamp + ((1<> 14][0] + 1; + + BusWrite16(addr, val); + NDS.DMA9Timestamp = NDS.ARM9Timestamp -= 1; + + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; + } + + FetchAddr[reg] = addr; + STRRegs = 1<(addr); +#endif + STRRegs &= ~1<>12] & 0x30)) + { + QueueFunction(&ARMv5::DWrite32_3); + } + else + { + if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + WriteBufferWrite(addr, 4); + WriteBufferWrite(val, 2, addr); + STRRegs &= ~1<(); + QueueFunction(&ARMv5::DWrite32_4); +} + +void ARMv5::DWrite32_4() +{ + u8 reg = __builtin_ctz(STRRegs); + u32 addr = FetchAddr[reg]; + u32 val = STRVal[reg]; + + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32; + MRTrack.Progress = reg; + + STRRegs &= ~1<> 14][1] + 1; + + BusWrite32(addr, val); + NDS.DMA9Timestamp = NDS.ARM9Timestamp -= 1; + + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + QueueFunction(&ARMv5::DAbortHandle); + return false; + } + + FetchAddr[reg] = addr; + STRRegs |= 1<(addr); +#endif + STRRegs &= ~1<>12] & 0x30)) // non-bufferable + { + QueueFunction(&ARMv5::DWrite32S_3); + } + else + { + WriteBufferWrite(val, 3, addr); + STRRegs &= ~1<(); + QueueFunction(&ARMv5::DWrite32S_4); +} + +void ARMv5::DWrite32S_4() +{ + u8 reg = __builtin_ctz(STRRegs); + u32 addr = FetchAddr[reg]; + u32 val = STRVal[reg]; + + // bursts cannot cross a 1kb boundary + if (addr & 0x3FF) // s + { + if ((addr >> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32 | MRSequential; + MRTrack.Progress = reg; + + STRRegs &= ~1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32; + MRTrack.Progress = reg; + + STRRegs &= ~1<>14][2] + 1; + + BusWrite32(addr, val); + + NDS.DMA9Timestamp = NDS.ARM9Timestamp -= 1; + + DataRegion = NDS.ARM9Regions[addr>>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>14][1] + 1; + + BusWrite32(addr, val); + + NDS.DMA9Timestamp = NDS.ARM9Timestamp -= 1; + + DataCycles = 3 << NDS.ARM9ClockShift; // checkme + DataRegion = NDS.ARM9Regions[addr>>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<Var32(&MRAMBurstCount); file->Bool32(&Executing); file->Bool32(&Stall); + file->Bool32(&DMAQueued); file->VarArray(MRAMBurstTable.data(), sizeof(MRAMBurstTable)); } @@ -137,14 +140,17 @@ void DMA::WriteCnt(u32 val) case 0x01000000: SrcAddrInc = 0; break; case 0x01800000: SrcAddrInc = 1; break; } - + u32 oldstartmode = StartMode; if (CPU == 0) StartMode = (Cnt >> 27) & 0x7; else StartMode = ((Cnt >> 28) & 0x3) | 0x10; if ((StartMode & 0x7) == 0) - Start(); + { + NDS.DMAsQueued[NDS.DMAQueuePtr++] = (CPU*4)+Num; + if (!(NDS.SchedListMask & (1<> 14; u32 dst_id = CurDstAddr >> 14; @@ -209,80 +241,24 @@ u32 DMA::UnitTimings9_16(bool burststart) src_s = NDS.ARM9MemTimings[src_id][5]; dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - - if (src_rgn == Mem9_MainRAM) + + if (src_rgn & dst_rgn) { - if (dst_rgn == Mem9_MainRAM) - return 16; - - if (SrcAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (dst_rgn == Mem9_GBAROM) - { - if (dst_s == 4) - MRAMBurstTable = DMATiming::MRAMRead16Bursts[1]; - else - MRAMBurstTable = DMATiming::MRAMRead16Bursts[2]; - } - else - MRAMBurstTable = DMATiming::MRAMRead16Bursts[0]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); else - { - // TODO: not quite right for GBA slot - return (((CurSrcAddr & 0x1F) == 0x1E) ? 7 : 8) + - (burststart ? dst_n : dst_s); - } - } - else if (dst_rgn == Mem9_MainRAM) - { - if (DstAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (src_rgn == Mem9_GBAROM) - { - if (src_s == 4) - MRAMBurstTable = DMATiming::MRAMWrite16Bursts[1]; - else - MRAMBurstTable = DMATiming::MRAMWrite16Bursts[2]; - } - else - MRAMBurstTable = DMATiming::MRAMWrite16Bursts[0]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } - else - { - return (burststart ? src_n : src_s) + 7; - } - } - else if (src_rgn & dst_rgn) - { - return src_n + dst_n + 1; + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } } -u32 DMA::UnitTimings9_32(bool burststart) +u32 DMA::UnitTimings9_32(int burststart) { u32 src_id = CurSrcAddr >> 14; u32 dst_id = CurDstAddr >> 14; @@ -296,77 +272,17 @@ u32 DMA::UnitTimings9_32(bool burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - if (src_rgn == Mem9_MainRAM) + if (src_rgn & dst_rgn) { - if (dst_rgn == Mem9_MainRAM) - return 18; - - if (SrcAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (dst_rgn == Mem9_GBAROM) - { - if (dst_s == 8) - MRAMBurstTable = DMATiming::MRAMRead32Bursts[2]; - else - MRAMBurstTable = DMATiming::MRAMRead32Bursts[3]; - } - else if (dst_n == 2) - MRAMBurstTable = DMATiming::MRAMRead32Bursts[0]; - else - MRAMBurstTable = DMATiming::MRAMRead32Bursts[1]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); else - { - // TODO: not quite right for GBA slot - return (((CurSrcAddr & 0x1F) == 0x1C) ? (dst_n==2 ? 7:8) : 9) + - (burststart ? dst_n : dst_s); - } - } - else if (dst_rgn == Mem9_MainRAM) - { - if (DstAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (src_rgn == Mem9_GBAROM) - { - if (src_s == 8) - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[2]; - else - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[3]; - } - else if (src_n == 2) - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[0]; - else - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[1]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } - else - { - return (burststart ? src_n : src_s) + 8; - } - } - else if (src_rgn & dst_rgn) - { - return src_n + dst_n + 1; + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } @@ -374,7 +290,7 @@ u32 DMA::UnitTimings9_32(bool burststart) // TODO: the ARM7 ones don't take into account that the two wifi regions have different timings -u32 DMA::UnitTimings7_16(bool burststart) +u32 DMA::UnitTimings7_16(int burststart) { u32 src_id = CurSrcAddr >> 15; u32 dst_id = CurDstAddr >> 15; @@ -388,79 +304,23 @@ u32 DMA::UnitTimings7_16(bool burststart) dst_n = NDS.ARM7MemTimings[dst_id][0]; dst_s = NDS.ARM7MemTimings[dst_id][1]; - if (src_rgn == Mem7_MainRAM) + if (src_rgn & dst_rgn) { - if (dst_rgn == Mem7_MainRAM) - return 16; - - if (SrcAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (dst_rgn == Mem7_GBAROM || dst_rgn == Mem7_Wifi0 || dst_rgn == Mem7_Wifi1) - { - if (dst_s == 4) - MRAMBurstTable = DMATiming::MRAMRead16Bursts[1]; - else - MRAMBurstTable = DMATiming::MRAMRead16Bursts[2]; - } - else - MRAMBurstTable = DMATiming::MRAMRead16Bursts[0]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); else - { - // TODO: not quite right for GBA slot - return (((CurSrcAddr & 0x1F) == 0x1E) ? 7 : 8) + - (burststart ? dst_n : dst_s); - } - } - else if (dst_rgn == Mem7_MainRAM) - { - if (DstAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (src_rgn == Mem7_GBAROM || src_rgn == Mem7_Wifi0 || src_rgn == Mem7_Wifi1) - { - if (src_s == 4) - MRAMBurstTable = DMATiming::MRAMWrite16Bursts[1]; - else - MRAMBurstTable = DMATiming::MRAMWrite16Bursts[2]; - } - else - MRAMBurstTable = DMATiming::MRAMWrite16Bursts[0]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } - else - { - return (burststart ? src_n : src_s) + 7; - } - } - else if (src_rgn & dst_rgn) - { - return src_n + dst_n + 1; + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } } -u32 DMA::UnitTimings7_32(bool burststart) +u32 DMA::UnitTimings7_32(int burststart) { u32 src_id = CurSrcAddr >> 15; u32 dst_id = CurDstAddr >> 15; @@ -474,77 +334,17 @@ u32 DMA::UnitTimings7_32(bool burststart) dst_n = NDS.ARM7MemTimings[dst_id][2]; dst_s = NDS.ARM7MemTimings[dst_id][3]; - if (src_rgn == Mem7_MainRAM) + if (src_rgn & dst_rgn) { - if (dst_rgn == Mem7_MainRAM) - return 18; - - if (SrcAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (dst_rgn == Mem7_GBAROM || dst_rgn == Mem7_Wifi0 || dst_rgn == Mem7_Wifi1) - { - if (dst_s == 8) - MRAMBurstTable = DMATiming::MRAMRead32Bursts[2]; - else - MRAMBurstTable = DMATiming::MRAMRead32Bursts[3]; - } - else if (dst_n == 2) - MRAMBurstTable = DMATiming::MRAMRead32Bursts[0]; - else - MRAMBurstTable = DMATiming::MRAMRead32Bursts[1]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); else - { - // TODO: not quite right for GBA slot - return (((CurSrcAddr & 0x1F) == 0x1C) ? (dst_n==2 ? 7:8) : 9) + - (burststart ? dst_n : dst_s); - } - } - else if (dst_rgn == Mem7_MainRAM) - { - if (DstAddrInc > 0) - { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) - { - MRAMBurstCount = 0; - - if (src_rgn == Mem7_GBAROM || src_rgn == Mem7_Wifi0 || src_rgn == Mem7_Wifi1) - { - if (src_s == 8) - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[2]; - else - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[3]; - } - else if (src_n == 2) - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[0]; - else - MRAMBurstTable = DMATiming::MRAMWrite32Bursts[1]; - } - - u32 ret = MRAMBurstTable[MRAMBurstCount++]; - return ret; - } - else - { - return (burststart ? src_n : src_s) + 8; - } - } - else if (src_rgn & dst_rgn) - { - return src_n + dst_n + 1; + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } @@ -552,20 +352,30 @@ u32 DMA::UnitTimings7_32(bool burststart) void DMA::Run9() { - if (NDS.ARM9Timestamp >= NDS.ARM9Target) return; + //NDS.DMA9Timestamp = std::max(NDS.DMA9Timestamp, NDS.SysTimestamp << NDS.ARM9ClockShift); + //NDS.DMA9Timestamp = (NDS.DMA9Timestamp + ((1<= NDS.ARM9Target) return; Executing = true; // add NS penalty for first accesses in burst - bool burststart = (Running == 2); - Running = 1; + int burststart = Running-1; if (!(Cnt & (1<<26))) { while (IterCount > 0 && !Stall) { - NDS.ARM9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); - burststart = false; + u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + if (rgn & Mem9_MainRAM) + { + NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; + NDS.ARM9.MRTrack.Var = Num; + return; + } + + NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); + burststart -= 1; NDS.ARM9Write16(CurDstAddr, NDS.ARM9Read16(CurSrcAddr)); @@ -574,15 +384,23 @@ void DMA::Run9() IterCount--; RemCount--; - if (NDS.ARM9Timestamp >= NDS.ARM9Target) break; + if (NDS.DMA9Timestamp >= NDS.ARM9Target) break; } } else { while (IterCount > 0 && !Stall) { - NDS.ARM9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); - burststart = false; + u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + if (rgn & Mem9_MainRAM) + { + NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; + NDS.ARM9.MRTrack.Var = Num; + return; + } + + NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); + burststart -= 1; NDS.ARM9Write32(CurDstAddr, NDS.ARM9Read32(CurSrcAddr)); @@ -591,10 +409,15 @@ void DMA::Run9() IterCount--; RemCount--; - if (NDS.ARM9Timestamp >= NDS.ARM9Target) break; + if (NDS.DMA9Timestamp >= NDS.ARM9Target) break; } } + NDS.DMA9Timestamp -= 1; + + if (burststart <= 0) Running = 1; + else Running = 2; + Executing = false; Stall = false; @@ -621,6 +444,8 @@ void DMA::Run9() Running = 0; InProgress = false; NDS.ResumeCPU(0, 1< 0 && !Stall) { + u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + if (rgn & Mem7_MainRAM) + { + NDS.ARM7.MRTrack.Type = MainRAMType::DMA16; + NDS.ARM7.MRTrack.Var = Num+4; + return; + } + NDS.ARM7Timestamp += UnitTimings7_16(burststart); burststart = false; @@ -654,6 +486,14 @@ void DMA::Run7() { while (IterCount > 0 && !Stall) { + u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + if (rgn & Mem7_MainRAM) + { + NDS.ARM7.MRTrack.Type = MainRAMType::DMA32; + NDS.ARM7.MRTrack.Var = Num+4; + return; + } + NDS.ARM7Timestamp += UnitTimings7_32(burststart); burststart = false; @@ -668,6 +508,9 @@ void DMA::Run7() } } + if (burststart <= 0) Running = 1; + else Running = 2; + Executing = false; Stall = false; @@ -691,6 +534,8 @@ void DMA::Run7() Running = 0; InProgress = false; NDS.ResumeCPU(1, 1< 0) Running = 3; + } + u32 SrcAddr {}; u32 DstAddr {}; u32 Cnt {}; - -private: - melonDS::NDS& NDS; - u32 CPU {}; - u32 Num {}; - - u32 StartMode {}; u32 CurSrcAddr {}; u32 CurDstAddr {}; u32 RemCount {}; u32 IterCount {}; s32 SrcAddrInc {}; s32 DstAddrInc {}; - u32 CountMask {}; - u32 Running {}; bool InProgress {}; - + u32 Num {}; + u32 StartMode {}; bool Executing {}; bool Stall {}; +private: + melonDS::NDS& NDS; + u32 CPU {}; + bool DMAQueued; + + u32 CountMask {}; + bool IsGXFIFODMA {}; u32 MRAMBurstCount {}; diff --git a/src/DMA_Timings.cpp b/src/DMA_Timings.cpp index a51fedfb..02539a62 100644 --- a/src/DMA_Timings.cpp +++ b/src/DMA_Timings.cpp @@ -48,7 +48,7 @@ extern const std::array MRAMDummy = {0}; extern const std::array MRAMRead16Bursts[] = { // main RAM to regular 16bit or 32bit bus (similar) - {7, 3, 2, 2, 2, 2, 2, 2, 2, 2, + {6, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -60,7 +60,7 @@ extern const std::array MRAMRead16Bursts[] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 7, 3, 2, 2, 2, 2, 2, 2, 2, 2, + 6, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -72,7 +72,7 @@ extern const std::array MRAMRead16Bursts[] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 7, 3, + 6, 3, 0}, // main RAM to GBA/wifi, seq=4 {8, 6, 5, 5, 5, 5, 5, 5, 5, 5, @@ -181,7 +181,7 @@ extern const std::array MRAMRead32Bursts[] = extern const std::array MRAMWrite16Bursts[] = { // regular 16bit or 32bit bus to main RAM (similar) - {8, 2, 2, 2, 2, 2, 2, 2, 2, 2, + {5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -212,7 +212,7 @@ extern const std::array MRAMWrite16Bursts[] = extern const std::array MRAMWrite32Bursts[4] = { // regular 16bit bus to main RAM - {9, 4, 4, 4, 4, 4, 4, 4, 4, 4, + {6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -220,7 +220,7 @@ extern const std::array MRAMWrite32Bursts[4] = 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0}, // regular 32bit bus to main RAM - {9, 3, 3, 3, 3, 3, 3, 3, 3, 3, + {6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, diff --git a/src/DSi.cpp b/src/DSi.cpp index f9115cbf..d7d43d24 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -129,6 +129,9 @@ void DSi::Reset() //ARM9.CP15Write(0x911, 0x00000020); //ARM9.CP15Write(0x100, ARM9.CP15Read(0x100) | 0x00050000); NDS::Reset(); + + ExMemCnt[0] = 0xEC8C; // checkme: bit 10 should be explicitly set? + ExMemCnt[1] = 0xEC8C; // The SOUNDBIAS register does nothing on DSi SPU.SetApplyBias(false); @@ -160,6 +163,7 @@ void DSi::Reset() SCFG_Clock9 = 0x0187; // CHECKME SCFG_Clock7 = 0x0187; SCFG_EXT[0] = 0x8307F100; + SetVRAMTimings(true); SCFG_EXT[1] = 0x93FFFB06; SCFG_MC = 0x0010 | (~((u32)(NDSCartSlot.GetCart() != nullptr))&1);//0x0011; SCFG_RST = 0; @@ -233,6 +237,7 @@ void DSi::DoSavestateExtra(Savestate* file) Set_SCFG_Clock9(SCFG_Clock9); Set_SCFG_MC(SCFG_MC); DSP.SetRstLine(SCFG_RST & 0x0001); + SetVRAMTimings(SCFG_EXT[0] & (1<<13)); MBK[0][8] = 0; MBK[1][8] = 0; @@ -711,6 +716,7 @@ void DSi::SoftReset() SCFG_Clock9 = 0x0187; // CHECKME SCFG_Clock7 = 0x0187; SCFG_EXT[0] = 0x8307F100; + SetVRAMTimings(true); SCFG_EXT[1] = 0x93FFFB06; SCFG_MC = 0x0010;//0x0011; // TODO: is this actually reset? @@ -1273,7 +1279,22 @@ void DSi::ApplyNewRAMSize(u32 size) void DSi::Set_SCFG_Clock9(u16 val) { ARM9Timestamp >>= ARM9ClockShift; + DMA9Timestamp >>= ARM9ClockShift; ARM9Target >>= ARM9ClockShift; + for (int i = 0; i < 7; i++) + { + ARM9.ICacheStreamTimes[i] >>= ARM9ClockShift; + ARM9.DCacheStreamTimes[i] >>= ARM9ClockShift; + } + ARM9.TimestampMemory >>= ARM9ClockShift; + ARM9.ITCMTimestamp >>= ARM9ClockShift; + ARM9.IRQTimestamp >>= ARM9ClockShift; + ARM9.WBTimestamp >>= ARM9ClockShift; + ARM9.WBDelay >>= ARM9ClockShift; + ARM9.WBReleaseTS >>= ARM9ClockShift; + ARM9.WBInitialTS >>= ARM9ClockShift; + ARM9.ILCurrTime >>= ARM9ClockShift; + ARM9.ILPrevTime >>= ARM9ClockShift; Log(LogLevel::Debug, "CLOCK9=%04X\n", val); SCFG_Clock9 = val & 0x0187; @@ -1282,8 +1303,24 @@ void DSi::Set_SCFG_Clock9(u16 val) else ARM9ClockShift = 1; ARM9Timestamp <<= ARM9ClockShift; + DMA9Timestamp <<= ARM9ClockShift; ARM9Target <<= ARM9ClockShift; - ARM9.UpdateRegionTimings(0x00000, 0x100000); + for (int i = 0; i < 7; i++) + { + ARM9.ICacheStreamTimes[i] <<= ARM9ClockShift; + ARM9.DCacheStreamTimes[i] <<= ARM9ClockShift; + } + ARM9.TimestampMemory <<= ARM9ClockShift; + ARM9.ITCMTimestamp <<= ARM9ClockShift; + ARM9.IRQTimestamp <<= ARM9ClockShift; + ARM9.WBTimestamp <<= ARM9ClockShift; + ARM9.WBDelay <<= ARM9ClockShift; + ARM9.WBReleaseTS <<= ARM9ClockShift; + ARM9.WBInitialTS <<= ARM9ClockShift; + ARM9.ILCurrTime <<= ARM9ClockShift; + ARM9.ILPrevTime <<= ARM9ClockShift; + + ARM9.UpdateRegionTimings(0x00000, 0x40000); } void DSi::Set_SCFG_MC(u32 val) @@ -1301,6 +1338,20 @@ void DSi::Set_SCFG_MC(u32 val) } } +void DSi::SetVRAMTimings(bool extrabuswidth) +{ + if (extrabuswidth) // 32 bit bus; arm9 can do 8 bit writes + { + SetARM9RegionTimings(0x06000, 0x07000, Mem9_VRAM, 32, 1, 1); + SetARM7RegionTimings(0x06000, 0x07000, Mem7_VRAM, 32, 1, 1); + } + else // 16 bit bus; arm9 cannot do 8 bit writes + { + SetARM9RegionTimings(0x06000, 0x07000, Mem9_VRAM, 16, 1, 1); + SetARM7RegionTimings(0x06000, 0x07000, Mem7_VRAM, 16, 1, 1); + } +} + u8 DSi::ARM9Read8(u32 addr) { @@ -1723,7 +1774,7 @@ void DSi::ARM9Write32(u32 addr, u32 val) return NDS::ARM9Write32(addr, val); } -bool DSi::ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) +bool DSi::ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region) { assert(ConsoleType == 1); switch (addr & 0xFF000000) @@ -2539,12 +2590,19 @@ void DSi::ARM9IOWrite32(u32 addr, u32 val) u32 oldram = (SCFG_EXT[0] >> 14) & 0x3; u32 newram = (val >> 14) & 0x3; + u32 oldvram = (SCFG_EXT[0] & (1<<13)); + u32 newvram = (val & (1<<13)); + SCFG_EXT[0] &= ~0x8007F19F; SCFG_EXT[0] |= (val & 0x8007F19F); SCFG_EXT[1] &= ~0x0000F080; SCFG_EXT[1] |= (val & 0x0000F080); Log(LogLevel::Debug, "SCFG_EXT = %08X / %08X (val9 %08X)\n", SCFG_EXT[0], SCFG_EXT[1], val); - /*switch ((SCFG_EXT[0] >> 14) & 0x3) + + if (oldvram != newvram) + SetVRAMTimings(newvram); + + switch ((SCFG_EXT[0] >> 14) & 0x3) { case 0: case 1: @@ -2557,7 +2615,7 @@ void DSi::ARM9IOWrite32(u32 addr, u32 val) NDS::MainRAMMask = 0xFFFFFF; printf("RAM: 16MB\n"); break; - }*/ + } // HAX!! // a change to the RAM size setting is supposed to apply immediately (it does so on hardware) // however, doing so will cause DS-mode app startup to break, because the change happens while the ARM7 @@ -3069,6 +3127,7 @@ void DSi::ARM7IOWrite32(u32 addr, u32 val) SCFG_EXT[0] |= (val & 0x03000000); SCFG_EXT[1] &= ~0x93FF0F07; SCFG_EXT[1] |= (val & 0x93FF0F07); + if (!(val & (1<<24))) { ExMemCnt[0] &= ~(1<<10); ExMemCnt[1] &= ~(1<<10); } // bit 10 of exmemcnt is cleared when disabling second card slot access Log(LogLevel::Debug, "SCFG_EXT = %08X / %08X (val7 %08X)\n", SCFG_EXT[0], SCFG_EXT[1], val); return; case 0x04004010: diff --git a/src/DSi.h b/src/DSi.h index 23a2460c..55d271e0 100644 --- a/src/DSi.h +++ b/src/DSi.h @@ -96,6 +96,7 @@ public: void MapNWRAM_B(u32 num, u8 val); void MapNWRAM_C(u32 num, u8 val); void MapNWRAMRange(u32 cpu, u32 num, u32 val); + void SetVRAMTimings(bool extrabuswidth); u8 ARM9Read8(u32 addr) override; u16 ARM9Read16(u32 addr) override; @@ -104,7 +105,7 @@ public: void ARM9Write16(u32 addr, u16 val) override; void ARM9Write32(u32 addr, u32 val) override; - bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) override; + bool ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region) override; u8 ARM7Read8(u32 addr) override; u16 ARM7Read16(u32 addr) override; diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 4a1426aa..3fc037df 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -2378,13 +2378,13 @@ void GPU3D::Run() noexcept if (!GeometryEnabled || FlushRequest || (CmdPIPE.IsEmpty() && !(GXStat & (1<<27)))) { - Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; + Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; return; } - s32 cycles = (NDS.ARM9Timestamp >> NDS.ARM9ClockShift) - Timestamp; + s32 cycles = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift) - Timestamp; CycleCount -= cycles; - Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; + Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; if (CycleCount <= 0) { @@ -2417,7 +2417,7 @@ void GPU3D::CheckFIFOIRQ() noexcept case 2: irq = CmdFIFO.IsEmpty(); break; } - if (irq) NDS.SetIRQ(0, IRQ_GXFIFO); + if (irq) NDS.SetIRQ(0, IRQ_GXFIFO, CycleCount); else NDS.ClearIRQ(0, IRQ_GXFIFO); } diff --git a/src/NDS.cpp b/src/NDS.cpp index a49d2e17..15af2764 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -124,6 +124,7 @@ NDS::NDS(NDSArgs&& args, int type, void* userdata) noexcept : { RegisterEventFuncs(Event_Div, this, {MakeEventThunk(NDS, DivDone)}); RegisterEventFuncs(Event_Sqrt, this, {MakeEventThunk(NDS, SqrtDone)}); + RegisterEventFuncs(Event_DMA, this, {MakeEventThunk(NDS, QueueDMAs)}); MainRAM = JIT.Memory.GetMainRAM(); SharedWRAM = JIT.Memory.GetSharedWRAM(); @@ -134,6 +135,7 @@ NDS::~NDS() noexcept { UnregisterEventFuncs(Event_Div); UnregisterEventFuncs(Event_Sqrt); + UnregisterEventFuncs(Event_DMA); // The destructor for each component is automatically called by the compiler } @@ -163,9 +165,9 @@ void NDS::SetARM9RegionTimings(u32 addrstart, u32 addrend, u32 region, int buswi for (u32 i = addrstart; i < addrend; i++) { // CPU timings - ARM9MemTimings[i][0] = N16 + cpuN; + ARM9MemTimings[i][0] = N16;// + cpuN; ARM9MemTimings[i][1] = S16; - ARM9MemTimings[i][2] = N32 + cpuN; + ARM9MemTimings[i][2] = N32;// + cpuN; ARM9MemTimings[i][3] = S32; // DMA timings @@ -177,7 +179,7 @@ void NDS::SetARM9RegionTimings(u32 addrstart, u32 addrend, u32 region, int buswi ARM9Regions[i] = region; } - ARM9.UpdateRegionTimings(addrstart<<2, addrend<<2); + ARM9.UpdateRegionTimings(addrstart, addrend); } void NDS::SetARM7RegionTimings(u32 addrstart, u32 addrend, u32 region, int buswidth, int nonseq, int seq) @@ -412,6 +414,18 @@ void NDS::SetupDirectBoot(const std::string& romname) ARM9.JumpTo(header.ARM9EntryAddress); ARM7.JumpTo(header.ARM7EntryAddress); + if (ARM9.FuncQueueFill > 0) // check if we started the queue up + { + ARM9.FuncQueueEnd = ARM9.FuncQueueFill; + ARM9.FuncQueueFill = 0; + ARM9.FuncQueueActive = true; + } + if (ARM7.FuncQueueFill > 0) // check if we started the queue up + { + ARM7.FuncQueueEnd = ARM7.FuncQueueFill; + ARM7.FuncQueueFill = 0; + ARM7.FuncQueueActive = true; + } PostFlag9 = 0x01; PostFlag7 = 0x01; @@ -458,8 +472,10 @@ void NDS::Reset() // unitialised on the first run ARM9.CP15Reset(); - ARM9Timestamp = 0; ARM9Target = 0; + ARM9Timestamp = 0; DMA9Timestamp = 0; ARM9Target = 0; ARM7Timestamp = 0; ARM7Target = 0; + MainRAMTimestamp = 0; + A9ContentionTS = 0; ConTSLock = false; SysTimestamp = 0; InitTimings(); @@ -470,8 +486,8 @@ void NDS::Reset() MapSharedWRAM(0); - ExMemCnt[0] = 0x4000; - ExMemCnt[1] = 0x4000; + ExMemCnt[0] = 0xE88C; // checkme: is this correct? + ExMemCnt[1] = 0xE88C; // note: these should only matter for direct boot; bios sets these values fairly quickly during native boot memset(ROMSeed0, 0, 2*8); memset(ROMSeed1, 0, 2*8); SetGBASlotTimings(); @@ -534,6 +550,9 @@ void NDS::Reset() KeyCnt[1] = 0; RCnt = 0; + memset(DMAsQueued, 0, sizeof(DMAsQueued)); + DMAQueuePtr = 0; + GPU.Reset(); NDSCartSlot.Reset(); GBACartSlot.Reset(); @@ -672,15 +691,26 @@ bool NDS::DoSavestate(Savestate* file) } file->Var32(&SchedListMask); file->Var64(&ARM9Timestamp); + file->Var64(&DMA9Timestamp); file->Var64(&ARM9Target); file->Var64(&ARM7Timestamp); file->Var64(&ARM7Target); file->Var64(&SysTimestamp); + file->Var64(&MainRAMTimestamp); + file->Var64(&MainRAMBurstStart); + file->Var64(&A9ContentionTS); + file->Bool32(&ConTSLock); file->Var64(&LastSysClockCycles); file->Var64(&FrameStartTimestamp); file->Var32(&NumFrames); file->Var32(&NumLagFrames); file->Bool32(&LagFrameFlag); + file->VarArray(DMAReadHold, sizeof(DMAReadHold)); + file->VarArray(DMAsQueued, sizeof(DMAsQueued)); + file->Var8(&DMAQueuePtr); + file->Bool32(&MainRAMBork); + file->Bool32(&MainRAMLastAccess); + file->Bool32(&DMALastWasMainRAM); // TODO: save KeyInput???? file->VarArray(KeyCnt, 2*sizeof(u16)); @@ -776,6 +806,793 @@ void NDS::SetARM9BIOS(const std::array& bios) noexcept ARM9BIOSNative = CRC32(ARM9BIOS.data(), ARM9BIOS.size()) == ARM9BIOSCRC32; } +#define A9WENTLAST (!MainRAMLastAccess) +#define A7WENTLAST ( MainRAMLastAccess) +#define A9LAST false +#define A7LAST true +#define A9PRIORITY !(ExMemCnt[0] & 0x8000) +#define A7PRIORITY (ExMemCnt[0] & 0x8000) + +void NDS::MainRAMHandleARM9() +{ + CurCPU = 0; + switch (ARM9.MRTrack.Type) + { + default: + { + Platform::Log(Platform::LogLevel::Error, "INVALID MAIN RAM TYPE ARM9"); + break; + } + + case MainRAMType::Fetch: + { + u8 var = ARM9.MRTrack.Var; + u32 addr = (var & MRCodeFetch) ? ARM9.FetchAddr[16] : ARM9.FetchAddr[ARM9.MRTrack.Progress]; + + if ((var & MRSequential) && A9WENTLAST && !(MainRAMBork && ((addr & 0x1F) == 0))) + { + A9ContentionTS += 2; + MainRAMTimestamp += 2; + if (!(var & MRWrite)) ARM9.DataCycles = 2 << ARM9ClockShift; + } + else + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + + MainRAMBork = !(var & MRWrite) && ((addr & 0x1F) >= 0x1A); + MainRAMTimestamp = A9ContentionTS + ((var & MR16) ? 8 : 9); // checkme: are these correct for 8bit? + if (var & MRWrite) A9ContentionTS += ((var & MR16) ? 6 : 7); // checkme: is this correct for 133mhz? + else + { + if (ARM9ClockShift == 1) A9ContentionTS += ((var & MR16) ? 8 : 9); + else A9ContentionTS += ((var & MR16) ? 7 : 8); + ARM9.DataCycles = 3 << ARM9ClockShift; + } + MainRAMLastAccess = A9LAST; + } + DMA9Timestamp = ARM9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + + if (var & MRCodeFetch) + { + u32 addr = ARM9.FetchAddr[16]; + ARM9.RetVal = *(u32*)&MainRAM[addr&MainRAMMask]; + } + else + { + ARM9.DataRegion = Mem9_MainRAM; + u8 reg = ARM9.MRTrack.Progress; + u32 addr = ARM9.FetchAddr[reg]; + if (var & MRWrite) // write + { + u32 val = ARM9.STRVal[reg]; + if (var & MR32) *(u32*)&MainRAM[addr&MainRAMMask] = val; + else if (var & MR16) *(u16*)&MainRAM[addr&MainRAMMask] = val; + else *(u8 *)&MainRAM[addr&MainRAMMask] = val; + } + else // read + { + u32 dummy; + u32* val = ((ARM9.LDRFailedRegs & (1< 0) && A9WENTLAST) + { + MainRAMTimestamp += 2; + A9ContentionTS += 2; + } + else + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + + MainRAMTimestamp = A9ContentionTS + 9; + A9ContentionTS += ((ARM9ClockShift == 1) ? 9 : 8); + MainRAMLastAccess = A9LAST; + } + + icache[*prog] = *(u32*)&MainRAM[addr&MainRAMMask]; + + if (*prog == ARM9.ICacheStreamPtr) ARM9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + else if (*prog > ARM9.ICacheStreamPtr) ARM9.ICacheStreamTimes[*prog-1] = (A9ContentionTS << ARM9ClockShift) - 1; + + (*prog)++; + if (*prog >= 8) + { + ARM9.RetVal = icache[(ARM9.FetchAddr[16] & 0x1F) / 4]; + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + } + break; + } + + case MainRAMType::DCacheStream: + { + u8* prog = &ARM9.MRTrack.Progress; + u32 addr = (ARM9.FetchAddr[16] & ~0x1F) | (*prog * 4); + u32* dcache = (u32*)&ARM9.DCache[ARM9.MRTrack.Var << 5]; + + if ((*prog > 0) && A9WENTLAST) + { + MainRAMTimestamp += 2; + A9ContentionTS += 2; + } + else + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + + MainRAMTimestamp = A9ContentionTS + 9; + A9ContentionTS += ((ARM9ClockShift == 1) ? 9 : 8); + MainRAMLastAccess = A9LAST; + } + + dcache[*prog] = *(u32*)&MainRAM[addr&MainRAMMask]; + + if (*prog == ARM9.DCacheStreamPtr) ARM9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + else if (*prog > ARM9.DCacheStreamPtr) ARM9.DCacheStreamTimes[*prog-1] = (A9ContentionTS << ARM9ClockShift) - 1; + + (*prog)++; + if (*prog >= 8) + { + ARM9.DataRegion = Mem9_MainRAM; + ARM9.DataCycles = 3 << ARM9ClockShift; + ARM9.RetVal = dcache[(ARM9.FetchAddr[16] & 0x1F) / 4]; + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + } + break; + } + + case MainRAMType::DMA32: + { + DMA* dma = &DMAs[ARM9.MRTrack.Var]; + int burststart = dma->Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM9Regions[srcaddr>>14]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM9Regions[dstaddr>>14]; + if (!ARM9.MRTrack.Progress) + { + if (srcrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBurstStart = A9ContentionTS; + MainRAMTimestamp = A9ContentionTS + 9; + A9ContentionTS += 6; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 2; + MainRAMTimestamp = A9ContentionTS + 3; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][6] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][6] - 1 + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); + } + else + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][7] - 1; + } + DMALastWasMainRAM = false; + } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; + + DMAReadHold[0] = ARM9Read32(srcaddr); + + ARM9.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242)) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMTimestamp = A9ContentionTS + 9; + MainRAMBurstStart = A9ContentionTS; + A9ContentionTS += 4; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 2; + MainRAMTimestamp = A9ContentionTS + 5; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][6] - (burststart <= 0); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][6] + (burststart == 1); + } + else + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][7] + (burststart == 1); + } + DMALastWasMainRAM = false; + } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; + + ARM9Write32(dstaddr, DMAReadHold[0]); + + dma->CurSrcAddr += dma->SrcAddrInc<<2; + dma->CurDstAddr += dma->DstAddrInc<<2; + dma->IterCount--; + dma->RemCount--; + + if (burststart <= 1) dma->Running = 1; + else dma->Running = 2; + + if ((dma->IterCount == 0) || ((ARM9Regions[dma->CurSrcAddr>>14] != Mem9_MainRAM) && (ARM9Regions[dma->CurDstAddr>>14] != Mem9_MainRAM)) || (DMA9Timestamp >= ARM9Target) || (CPUStop & ((1<Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM9Regions[srcaddr>>14]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM9Regions[dstaddr>>14]; + if (!ARM9.MRTrack.Progress) + { + if (srcrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBurstStart = A9ContentionTS; + MainRAMTimestamp = A9ContentionTS + 8; + A9ContentionTS += 5; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 1; + MainRAMTimestamp = A9ContentionTS + 3; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][4] - 1 + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); + } + else + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][5] - 1; + } + DMALastWasMainRAM = false; + } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; + + DMAReadHold[0] = ARM9Read16(srcaddr); + + ARM9.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242)) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMBurstStart = A9ContentionTS; + MainRAMTimestamp = A9ContentionTS + 8; + A9ContentionTS += 3; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 1; + MainRAMTimestamp = A9ContentionTS + 5; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][4] + (burststart == 1); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][4]; + } + else + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] + (burststart == 1); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][5]; + } + DMALastWasMainRAM = false; + } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; + + ARM9Write16(dstaddr, DMAReadHold[0]); + + dma->CurSrcAddr += dma->SrcAddrInc<<1; + dma->CurDstAddr += dma->DstAddrInc<<1; + dma->IterCount--; + dma->RemCount--; + + if (burststart <= 1) dma->Running = 1; + else dma->Running = 2; + + if ((dma->IterCount == 0) || ((ARM9Regions[dma->CurSrcAddr>>14] != Mem9_MainRAM) && (ARM9Regions[dma->CurDstAddr>>14] != Mem9_MainRAM)) || (DMA9Timestamp >= ARM9Target) || (CPUStop & ((1<()) return; + + if ((ARM9.WBWritePointer == 16) && !ARM9.WBWriting) + { + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + } + break; + } + + case MainRAMType::WBWrite: + { + if (!ARM9.WriteBufferHandle()) return; + + if (ARM9.WBWritePointer == ARM9.WBFillPointer) + { + if (!ARM9.WriteBufferHandle()) return; + } + else if (ARM9.WBWritePointer == 16) + { + ARM9.WBWritePointer = 0; + if (!ARM9.WBWriting) + { + u64 ts = (ARM9Timestamp + 1 + ((1<> 61) != 4) + { + ARM9Timestamp += ARM9.DataCycles = 1; + ARM9.WBDelay = ARM9Timestamp + 1; + } + + ARM9.MRTrack.Progress++; + if (ARM9.MRTrack.Progress >= ARM9.MRTrack.Var) + { + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + } + break; + } + + case MainRAMType::WBCheck: + { + if (!ARM9.WriteBufferHandle()) return; + + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + break; + } + + case MainRAMType::WBWaitRead: + { + if (!ARM9.WriteBufferHandle()) return; + + if (ARM9Timestamp >= ARM9.WBInitialTS) + { + if (!ARM9.WriteBufferHandle()) return; + if (ARM9Timestamp < ARM9.WBReleaseTS) ARM9Timestamp = ARM9.WBReleaseTS; + } + + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + break; + } + + case MainRAMType::WBWaitWrite: + { + if (!ARM9.WriteBufferHandle()) return; + + if (!ARM9.WriteBufferHandle()) return; + if (ARM9Timestamp < ARM9.WBReleaseTS) ARM9Timestamp = ARM9.WBReleaseTS; + + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + break; + } + } +} + +void NDS::MainRAMHandleARM7() +{ + CurCPU = 1; + switch (ARM7.MRTrack.Type) + { + default: + { + Platform::Log(Platform::LogLevel::Error, "INVALID MAIN RAM TYPE ARM7"); + break; + } + + case MainRAMType::Fetch: + { + u8 var = ARM7.MRTrack.Var; + u32 addr = (var & MRCodeFetch) ? ARM7.FetchAddr[16] : ARM7.FetchAddr[ARM7.MRTrack.Progress]; + + if ((var & MRSequential) && A7WENTLAST && !(MainRAMBork && ((addr & 0x1F) == 0)) && ((ARM7Timestamp - MainRAMBurstStart) < 242)) + { + int cycles = ((var & MR32) ? 2 : 1); + MainRAMTimestamp += cycles; + ARM7Timestamp += cycles; + //printf("%lli %lli\n", MainRAMTimestamp, ARM7Timestamp); + } + else + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + + MainRAMBork = !(var & MRWrite) && ((addr & 0x1F) >= 0x1A); + MainRAMBurstStart = ARM7Timestamp; + + MainRAMTimestamp = ARM7Timestamp + ((var & MR16) ? 8 : 9); // checkme: are these correct for 8bit? + if (var & MRWrite) ARM7Timestamp += ((var & MR16) ? 3 : 4); + else ARM7Timestamp += ((var & MR16) ? 5 : 6); + MainRAMLastAccess = A7LAST; + } + + if (var & MRCodeFetch) + { + ARM7.RetVal = ((var & MR32) ? *(u32*)&MainRAM[addr&MainRAMMask] : *(u16*)&MainRAM[addr&MainRAMMask]); + } + else + { + u8 reg = ARM7.MRTrack.Progress; + if (var & MRWrite) // write + { + u32 val = ARM7.STRVal[reg]; + if (var & MR32) *(u32*)&MainRAM[addr&MainRAMMask] = val; + else if (var & MR16) *(u16*)&MainRAM[addr&MainRAMMask] = val; + else *(u8 *)&MainRAM[addr&MainRAMMask] = val; + } + else // read + { + u32 dummy; + u32* val = ((ARM7.LDRFailedRegs & (1<Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM7Regions[srcaddr>>15]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM7Regions[dstaddr>>15]; + if (!ARM7.MRTrack.Progress) + { + if (srcrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBurstStart = ARM7Timestamp; + MainRAMTimestamp = ARM7Timestamp + 9; + ARM7Timestamp += 6; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 2; + MainRAMTimestamp = ARM7Timestamp + 3; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][2] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); + } + else + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][3]; + } + DMALastWasMainRAM = false; + } + + DMAReadHold[1] = ARM7Read32(srcaddr); + + ARM7.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242)) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMBurstStart = ARM7Timestamp; + MainRAMTimestamp = ARM7Timestamp + 9; + ARM7Timestamp += 4; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 2; + MainRAMTimestamp = ARM7Timestamp + 5; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2] - (burststart <= 0); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][2] + (burststart == 1); + } + else + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][3] + (burststart == 1); + } + DMALastWasMainRAM = false; + } + + ARM7Write32(dstaddr, DMAReadHold[1]); + + dma->CurSrcAddr += dma->SrcAddrInc<<2; + dma->CurDstAddr += dma->DstAddrInc<<2; + dma->IterCount--; + dma->RemCount--; + + if (burststart <= 1) dma->Running = 1; + else dma->Running = 2; + + if ((dma->IterCount == 0) || ((ARM7Regions[dma->CurSrcAddr>>15] != Mem7_MainRAM) && (ARM7Regions[dma->CurDstAddr>>15] != Mem7_MainRAM)) || (ARM7Timestamp >= ARM7Target) || (CPUStop & CPUStop_DMA7 & ((1<Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM7Regions[srcaddr>>15]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM7Regions[dstaddr>>15]; + if (!ARM7.MRTrack.Progress) + { + if (srcrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBurstStart = ARM7Timestamp; + MainRAMTimestamp = ARM7Timestamp + 8; + ARM7Timestamp += 5; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 1; + MainRAMTimestamp = ARM7Timestamp + 3; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); + } + else + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][1]; + } + DMALastWasMainRAM = false; + } + + DMAReadHold[1] = ARM7Read16(srcaddr); + + ARM7.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242)) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMBurstStart = ARM7Timestamp; + MainRAMTimestamp = ARM7Timestamp + 8; + ARM7Timestamp += 3; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 1; + MainRAMTimestamp = ARM7Timestamp + 5; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0] + (burststart == 1); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][0]; + } + else + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][1] + (burststart == 1); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][1]; + } + DMALastWasMainRAM = false; + } + + ARM7Write16(dstaddr, DMAReadHold[1]); + + dma->CurSrcAddr += dma->SrcAddrInc<<1; + dma->CurDstAddr += dma->DstAddrInc<<1; + dma->IterCount--; + dma->RemCount--; + + if (burststart <= 1) dma->Running = 1; + else dma->Running = 2; + + if ((dma->IterCount == 0) || ((ARM7Regions[dma->CurSrcAddr>>15] != Mem7_MainRAM) && (ARM7Regions[dma->CurDstAddr>>15] != Mem7_MainRAM)) || (ARM7Timestamp >= ARM7Target) || (CPUStop & CPUStop_DMA7 & ((1< MainRAMType::WriteBufferCmds) + A9ContentionTS = (ARM9.WBTimestamp + ((1<> ARM9ClockShift; + else if (ARM9.MRTrack.Type == MainRAMType::DMA16 || ARM9.MRTrack.Type == MainRAMType::DMA32) + A9ContentionTS = (DMA9Timestamp + ((1<> ARM9ClockShift; + else + A9ContentionTS = (ARM9Timestamp + ((1<> ARM9ClockShift; + } + + if (A7PRIORITY) + { + while (true) + { + if (A9ContentionTS < ARM7Timestamp) + { + if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; + else if (CPUStop & CPUStop_GXStall) + { + // gx stalls can occur during this, and if not handled properly will cause issues + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + A9ContentionTS = std::min(ARM9Target, A9ContentionTS+cycles); + } + else MainRAMHandleARM9(); + } + else + { + if (ARM7.MRTrack.Type == MainRAMType::Null) return 1; + else MainRAMHandleARM7(); + } + } + } + else + { + while (true) + { + if (A9ContentionTS <= ARM7Timestamp) + { + if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; + else if (CPUStop & CPUStop_GXStall) + { + // gx stalls can occur during this, and if not handled properly will cause issues + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + A9ContentionTS = std::min(ARM9Target, A9ContentionTS+cycles); + } + else MainRAMHandleARM9(); + } + else + { + if (ARM7.MRTrack.Type == MainRAMType::Null) return 1; + else MainRAMHandleARM7(); + } + } + } +} + +#undef A9WENTLAST +#undef A7WENTLAST +#undef A9LAST +#undef A7LAST +#undef A9PRIORITY +#undef A7PRIORITY + u64 NDS::NextTarget() { u64 minEvent = UINT64_MAX; @@ -826,6 +1643,21 @@ void NDS::RunSystem(u64 timestamp) } } +void NDS::RunEventManual(u32 id) +{ + if (SchedListMask & (1<> ARM9ClockShift); + SchedEvent& evt = SchedList[id]; + + if (evt.Timestamp <= curts) + { + evt.Funcs[evt.FuncID](evt.That, evt.Param); + SchedListMask &= ~(1<(*this); - dsi.RunNDMAs(0); - } - } - else - { - ARM9.Execute(); - } + CurCPU = 0; + RunTimers(0); + GPU.GPU3D.Run(); - RunTimers(0); - GPU.GPU3D.Run(); - - target = ARM9Timestamp >> ARM9ClockShift; - CurCPU = 1; - - while (ARM7Timestamp < target) - { - ARM7Target = target; // might be changed by a reschedule - - if (CPUStop & CPUStop_DMA7) - { - DMAs[4].Run(); - DMAs[5].Run(); - DMAs[6].Run(); - DMAs[7].Run(); - if (ConsoleType == 1) + if (CPUStop & CPUStop_GXStall) { - auto& dsi = dynamic_cast(*this); - dsi.RunNDMAs(1); + // GXFIFO stall + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp, DMA9Timestamp)+(cycles<(*this); + dsi.RunNDMAs(0); + } + } + else + { + //if (ARM9.abt) ARM9Timestamp = ARM9Target; + ARM9.Execute(); + } + } + + //printf("A9 LOOP: 9 %lli %lli %08X %08llX %i 7 %lli %lli %08X %08llX %i\n", ARM9Timestamp, ARM9Target, ARM9.R[15], ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7Target, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type); + + RunTimers(0); + GPU.GPU3D.Run(); + + if (MainRAMHandle()) break; } - else + + while (ARM7Timestamp < ARM7Target) { - ARM7.Execute(); + //printf("A7 LOOP: 9 %lli %lli %08X %08llX %i 7 %lli %lli %08X %08llX %i\n", ARM9Timestamp, ARM9Target, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7Target, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type); + CurCPU = 1; + RunTimers(1); + + if (ARM7.MRTrack.Type == MainRAMType::Null) + { + if (CPUStop & CPUStop_DMA7) + { + DMAs[4].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[5].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[6].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[7].Run(); + if (ConsoleType == 1) + { + auto& dsi = dynamic_cast(*this); + dsi.RunNDMAs(1); + } + } + else + { + //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; + ARM7.Execute(); + } + } + + RunTimers(1); + + if (!MainRAMHandle()) break; } - - RunTimers(1); } - - RunSystem(target); + + CurCPU = 2; + RunSystem(ARM7Target); if (CPUStop & CPUStop_Sleep) { @@ -1021,14 +1878,15 @@ u32 NDS::RunFrame() #ifdef DEBUG_CHECK_DESYNC Log(LogLevel::Debug, "[%08X%08X] ARM9=%ld, ARM7=%ld, GPU=%ld\n", (u32)(SysTimestamp>>32), (u32)SysTimestamp, - (ARM9Timestamp>>1)-SysTimestamp, + std::max(std::max(ARM9Timestamp,DMA9Timestamp)>>ARM9ClockShift, A9ContentionTS)-SysTimestamp, ARM7Timestamp-SysTimestamp, GPU.GPU3D.Timestamp-SysTimestamp); #endif SPU.TransferOutput(); break; } - + //printf("MAIN LOOP: 9 %lli %lli %08X %08llX %i %i %i 7 %lli %lli %08X %08llX %i %i %i\n", ARM9Timestamp, ARM9Target, ARM9.R[15], ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM9.Halted, ARM9.FuncQueueActive, ARM7Timestamp, ARM7Target, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, ARM7.Halted, ARM7.FuncQueueActive); + // In the context of TASes, frame count is traditionally the primary measure of emulated time, // so it needs to be tracked even if NDS is powered off. NumFrames++; @@ -1061,15 +1919,10 @@ u32 NDS::RunFrame() void NDS::Reschedule(u64 target) { - if (CurCPU == 0) + if (target < ARM7Target) { - if (target < (ARM9Target >> ARM9ClockShift)) - ARM9Target = (target << ARM9ClockShift); - } - else - { - if (target < ARM7Target) - ARM7Target = target; + ARM7Target = target; + ARM9Target = (target << ARM9ClockShift); } } @@ -1110,7 +1963,7 @@ void NDS::ScheduleEvent(u32 id, bool periodic, s32 delay, u32 funcid, u32 param) else { if (CurCPU == 0) - evt.Timestamp = (ARM9Timestamp >> ARM9ClockShift) + delay; + evt.Timestamp = ((std::max(ARM9Timestamp, DMA9Timestamp) + ((1<> ARM9ClockShift) + delay; else evt.Timestamp = ARM7Timestamp + delay; } @@ -1326,26 +2179,51 @@ void NDS::SetGBASlotTimings() } -void NDS::UpdateIRQ(u32 cpu) +void NDS::UpdateIRQ(u32 cpu, s32 delay) { ARM& arm = cpu ? (ARM&)ARM7 : (ARM&)ARM9; + u64 time; - if (IME[cpu] & 0x1) + if (CurCPU == 0) { - arm.IRQ = !!(IE[cpu] & IF[cpu]); - if ((ConsoleType == 1) && cpu) - arm.IRQ |= !!(IE2 & IF2); + time = (std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift) + 4 + delay; + } + else if (CurCPU == 1) + { + time = ARM7Timestamp + 4 + delay; } else { - arm.IRQ = 0; + time = SysTimestamp + 4 + delay; } + + if (IME[cpu] & 0x1) + { + if ((IE[cpu] & IF[cpu])) + { + if (time < arm.IRQTimestamp) arm.IRQTimestamp = time; + } + else if (((ConsoleType == 1) && cpu) && (IE2 & IF2)) + { + if (time < arm.IRQTimestamp) arm.IRQTimestamp = time; + } + else + { + arm.IRQTimestamp = UINT64_MAX; + } + } + else + { + arm.IRQTimestamp = UINT64_MAX; + } + + if (cpu == 0) arm.IRQTimestamp <<= ARM9ClockShift; } -void NDS::SetIRQ(u32 cpu, u32 irq) +void NDS::SetIRQ(u32 cpu, u32 irq, s32 delay) { IF[cpu] |= (1 << irq); - UpdateIRQ(cpu); + UpdateIRQ(cpu, delay); if ((cpu == 1) && (CPUStop & CPUStop_Sleep)) { @@ -1582,9 +2460,10 @@ void NDS::HandleTimerOverflow(u32 tid) { Timer* timer = &Timers[tid]; - timer->Counter += (timer->Reload << 10); if (timer->Cnt & (1<<6)) - SetIRQ(tid >> 2, IRQ_Timer0 + (tid & 0x3)); + SetIRQ(tid >> 2, IRQ_Timer0 + (tid & 0x3), -(timer->Counter >> timer->CycleShift)); + + timer->Counter += (timer->Reload << 10); if ((tid & 0x3) == 3) return; @@ -1629,7 +2508,7 @@ void NDS::RunTimers(u32 cpu) s32 cycles; if (cpu == 0) - cycles = (ARM9Timestamp >> ARM9ClockShift) - TimerTimestamp[0]; + cycles = (std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift) - TimerTimestamp[0]; else cycles = ARM7Timestamp - TimerTimestamp[1]; @@ -1715,6 +2594,15 @@ void NDS::StopDMAs(u32 cpu, u32 mode) DMAs[cpu+3].StopIfNeeded(mode); } +void NDS::QueueDMAs(u32 param) +{ + DMAs[DMAsQueued[0]].Start(); + for(int i = 0; i < 7; i++) DMAsQueued[i] = DMAsQueued[i+1]; + DMAQueuePtr--; + + if (DMAQueuePtr != 0) ScheduleEvent(Event_DMA, false, 1, 0, 0); +} + void NDS::DivDone(u32 param) @@ -2245,7 +3133,7 @@ void NDS::ARM9Write32(u32 addr, u32 val) //Log(LogLevel::Warn, "unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9.R[15]); } -bool NDS::ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) +bool NDS::ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region) { switch (addr & 0xFF000000) { @@ -3408,9 +4296,11 @@ void NDS::ARM9IOWrite16(u32 addr, u16 val) case 0x04000204: { + u16 settablemask = 0x88FF; + if ((ConsoleType == 1) && (((DSi*)this)->SCFG_EXT[1] & (1<<24))) settablemask |= 0x0400; // bit 10 can be set if SCFG_EXT bit 24 is set u16 oldVal = ExMemCnt[0]; - ExMemCnt[0] = val; - ExMemCnt[1] = (ExMemCnt[1] & 0x007F) | (val & 0xFF80); + ExMemCnt[0] = (ExMemCnt[0] & ~settablemask) | (val & settablemask); + ExMemCnt[1] = (ExMemCnt[1] & (~settablemask | 0x7F)) | (val & (settablemask & ~0x7F)); if ((oldVal ^ ExMemCnt[0]) & 0xFF) SetGBASlotTimings(); return; @@ -4203,8 +5093,9 @@ void NDS::ARM7IOWrite16(u32 addr, u16 val) case 0x04000204: { + u16 settablemask = 0x007F; u16 oldVal = ExMemCnt[1]; - ExMemCnt[1] = (ExMemCnt[1] & 0xFF80) | (val & 0x007F); + ExMemCnt[1] = (ExMemCnt[1] & ~settablemask) | (val & settablemask); if ((ExMemCnt[1] ^ oldVal) & 0xFF) SetGBASlotTimings(); return; diff --git a/src/NDS.h b/src/NDS.h index 6e486e28..d94cdd42 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -64,6 +64,7 @@ enum Event_SPITransfer, Event_Div, Event_Sqrt, + Event_DMA, // DSi Event_DSi_SDMMCTransfer, @@ -196,6 +197,8 @@ enum Mem9_VRAM = 0x00000100, Mem9_GBAROM = 0x00020000, Mem9_GBARAM = 0x00040000, + Mem9_DCache = 0x40000000, + Mem9_Null = 0x80000000, Mem7_BIOS = 0x00000001, Mem7_MainRAM = 0x00000002, @@ -241,7 +244,8 @@ public: // TODO: Encapsulate the rest of these members int ConsoleType; int CurCPU; - + + u32 SchedListMask; SchedEvent SchedList[Event_MAX] {}; u8 ARM9MemTimings[0x40000][8]; u32 ARM9Regions[0x40000]; @@ -253,8 +257,11 @@ public: // TODO: Encapsulate the rest of these members bool LagFrameFlag; // no need to worry about those overflowing, they can keep going for atleast 4350 years - u64 ARM9Timestamp, ARM9Target; + u64 ARM9Timestamp, DMA9Timestamp, ARM9Target; u64 ARM7Timestamp, ARM7Target; + u64 MainRAMTimestamp, MainRAMBurstStart; + u64 A9ContentionTS; bool ConTSLock; + u64 SysTimestamp; u32 ARM9ClockShift; u32 IME[2]; @@ -272,11 +279,18 @@ public: // TODO: Encapsulate the rest of these members alignas(u32) u8 ROMSeed0[2*8]; alignas(u32) u8 ROMSeed1[2*8]; + u32 DMAReadHold[2]; + u8 DMAsQueued[8]; + u8 DMAQueuePtr; + bool MainRAMBork; // if a main ram read burst starts in the last 6 bytes of a 32 byte block, and then crosses the 32 byte boundary, the burst forcibly restarts + bool MainRAMLastAccess; // 0 == ARM9 | 1 == ARM7 + bool DMALastWasMainRAM; + protected: // These BIOS arrays should be declared *before* the component objects (JIT, SPI, etc.) // so that they're initialized before the component objects' constructors run. - std::array ARM9BIOS; - std::array ARM7BIOS; + alignas(u32) std::array ARM9BIOS; + alignas(u32) std::array ARM7BIOS; bool ARM9BIOSNative; bool ARM7BIOSNative; public: // TODO: Encapsulate the rest of these members @@ -311,6 +325,11 @@ public: // TODO: Encapsulate the rest of these members GBACart::GBACartSlot GBACartSlot; melonDS::GPU GPU; melonDS::AREngine AREngine; + DMA DMAs[8]; + +#ifdef JIT_ENABLED + bool IsJITEnabled(){return EnableJIT;}; +#endif const u32 ARM7WRAMSize = 0x10000; u8* ARM7WRAM; @@ -390,6 +409,10 @@ public: // TODO: Encapsulate the rest of these members std::unique_ptr EjectGBACart() { return GBACartSlot.EjectCart(); } + void MainRAMHandleARM9(); + void MainRAMHandleARM7(); + bool MainRAMHandle(); + u32 RunFrame(); bool IsRunning() const noexcept { return Running; } @@ -409,6 +432,7 @@ public: // TODO: Encapsulate the rest of these members void UnregisterEventFuncs(u32 id); void ScheduleEvent(u32 id, bool periodic, s32 delay, u32 funcid, u32 param); void CancelEvent(u32 id); + void RunEventManual(u32 id); void debug(u32 p); @@ -416,8 +440,8 @@ public: // TODO: Encapsulate the rest of these members void MapSharedWRAM(u8 val); - void UpdateIRQ(u32 cpu); - void SetIRQ(u32 cpu, u32 irq); + void UpdateIRQ(u32 cpu, s32 delay = 0); + void SetIRQ(u32 cpu, u32 irq, s32 delay = 0); void ClearIRQ(u32 cpu, u32 irq); void SetIRQ2(u32 irq); void ClearIRQ2(u32 irq); @@ -447,7 +471,7 @@ public: // TODO: Encapsulate the rest of these members virtual void ARM9Write16(u32 addr, u16 val); virtual void ARM9Write32(u32 addr, u32 val); - virtual bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region); + virtual bool ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region); virtual u8 ARM7Read8(u32 addr); virtual u16 ARM7Read16(u32 addr); @@ -488,8 +512,6 @@ public: // TODO: Encapsulate the rest of these members private: void InitTimings(); - u32 SchedListMask; - u64 SysTimestamp; u8 WRAMCnt; u8 PostFlag9; u8 PostFlag7; @@ -497,7 +519,6 @@ private: u16 WifiWaitCnt; u8 TimerCheckMask[2]; u64 TimerTimestamp[2]; - DMA DMAs[8]; u32 DMA9Fill[4]; u16 IPCSync9, IPCSync7; u16 IPCFIFOCnt9, IPCFIFOCnt7; @@ -525,6 +546,7 @@ private: void HandleTimerOverflow(u32 tid); u16 TimerGetCounter(u32 timer); void TimerStart(u32 id, u16 cnt); + void QueueDMAs(u32 param); void StartDiv(); void DivDone(u32 param); void SqrtDone(u32 param); diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index 1fa0fbfe..a4100fe9 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -1485,6 +1485,7 @@ void NDSCartSlot::DoSavestate(Savestate* file) noexcept file->Var32(&TransferLen); file->Var32(&TransferDir); file->VarArray(TransferCmd.data(), sizeof(TransferCmd)); + file->Var64(&ROMTransferTime); // cart inserted/len/ROM/etc should be already populated // savestate should be loaded after the right game is loaded @@ -1799,6 +1800,7 @@ void NDSCartSlot::ResetCart() noexcept TransferDir = 0; memset(TransferCmd.data(), 0, sizeof(TransferCmd)); TransferCmd[0] = 0xFF; + ROMTransferTime = -1; if (Cart) Cart->Reset(); } @@ -1835,6 +1837,12 @@ void NDSCartSlot::ROMPrepareData(u32 param) noexcept NDS.CheckDMAs(0, 0x05); } +u32 NDSCartSlot::GetROMCnt() noexcept +{ + NDS.RunEventManual(Event_ROMTransfer); + return ROMCnt; +} + void NDSCartSlot::WriteROMCnt(u32 val) noexcept { u32 xferstart = (val & ~ROMCnt) & (1<<31); @@ -1902,13 +1910,13 @@ void NDSCartSlot::WriteROMCnt(u32 val) noexcept // ROM transfer timings // the bus is parallel with 8 bits - // thus a command would take 8 cycles to be transferred - // and it would take 4 cycles to receive a word of data + // thus a command would take 8 cycles to be transferred (...actually it's 10?? checkme: does this apply to every command?) + // and it would take 4 cycles to receive a word of data (...or 3? does it overlap a cycle somewhere?) // TODO: advance read position if bit28 is set // TODO: during a write transfer, bit23 is set immediately when beginning the transfer(?) u32 xfercycle = (ROMCnt & (1<<27)) ? 8 : 5; - u32 cmddelay = 8; + u32 cmddelay = 10; // delays are only applied when the WR bit is cleared // CHECKME: do the delays apply at the end (instead of start) when WR is set? @@ -1919,9 +1927,17 @@ void NDSCartSlot::WriteROMCnt(u32 val) noexcept } if (datasize == 0) - NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*cmddelay, ROMTransfer_End, 0); + NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*cmddelay+4, ROMTransfer_End, 0); else - NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*(cmddelay+4), ROMTransfer_PrepareData, 0); + { + NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*(cmddelay+3)+3, ROMTransfer_PrepareData, 0); + + u64 curts; + if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp)) >> NDS.ARM9ClockShift; + + ROMTransferTime = (xfercycle*(cmddelay+8)) + curts + 3; + } } void NDSCartSlot::AdvanceROMTransfer() noexcept @@ -1931,14 +1947,21 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept if (TransferPos < TransferLen) { u32 xfercycle = (ROMCnt & (1<<27)) ? 8 : 5; + u32 extdelay = (ROMCnt & (1<<27)) ? 7 : 5; // why is this only 7...? u32 delay = 4; if (!(ROMCnt & (1<<30))) { - if (!(TransferPos & 0x1FF)) + if (!((TransferPos+4) & 0x1FF)) delay += ((ROMCnt >> 16) & 0x3F); } + + u64 curts; + if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp)) >> NDS.ARM9ClockShift; + + NDS.ScheduleEvent(Event_ROMTransfer, false, ROMTransferTime-curts, ROMTransfer_PrepareData, 0); - NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*delay, ROMTransfer_PrepareData, 0); + ROMTransferTime = (xfercycle*delay) + std::max(curts+extdelay, ROMTransferTime); } else ROMEndTransfer(0); @@ -1947,6 +1970,8 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept u32 NDSCartSlot::ReadROMData() noexcept { if (ROMCnt & (1<<30)) return 0; + + NDS.RunEventManual(Event_ROMTransfer); if (ROMCnt & (1<<23)) { @@ -1959,6 +1984,8 @@ u32 NDSCartSlot::ReadROMData() noexcept void NDSCartSlot::WriteROMData(u32 val) noexcept { if (!(ROMCnt & (1<<30))) return; + + NDS.RunEventManual(Event_ROMTransfer); ROMData = val; diff --git a/src/NDSCart.h b/src/NDSCart.h index 3704f659..37bbea27 100644 --- a/src/NDSCart.h +++ b/src/NDSCart.h @@ -414,9 +414,11 @@ public: [[nodiscard]] u8 GetROMCommand(u8 index) const noexcept { return ROMCommand[index]; } void SetROMCommand(u8 index, u8 val) noexcept { ROMCommand[index] = val; } - [[nodiscard]] u32 GetROMCnt() const noexcept { return ROMCnt; } + [[nodiscard]] u32 GetROMCnt() noexcept; + [[nodiscard]] u16 GetSPICnt() const noexcept { return SPICnt; } void SetSPICnt(u16 val) noexcept { SPICnt = val; } + private: friend class CartCommon; melonDS::NDS& NDS; @@ -441,6 +443,7 @@ private: u64 Key2_X = 0; u64 Key2_Y = 0; + u64 ROMTransferTime; void Key1_Encrypt(u32* data) const noexcept; void Key1_Decrypt(u32* data) const noexcept; diff --git a/src/Savestate.h b/src/Savestate.h index dce62844..6d85044c 100644 --- a/src/Savestate.h +++ b/src/Savestate.h @@ -24,7 +24,7 @@ #include #include "types.h" -#define SAVESTATE_MAJOR 12 +#define SAVESTATE_MAJOR 13 #define SAVESTATE_MINOR 1 namespace melonDS