13 files changed, 122 insertions, 103 deletions
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 3d30f0e3e..c538c6415 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -114,6 +114,8 @@ add_library(common STATIC
     threadsafe_queue.h
     timer.cpp
     timer.h
+    uint128.cpp
+    uint128.h
     vector_math.h
     web_result.h
 )
diff --git a/src/common/uint128.cpp b/src/common/uint128.cpp
new file mode 100644
index 000000000..2238a52c5
--- /dev/null
+++ b/src/common/uint128.cpp
@@ -0,0 +1,41 @@
+#ifdef _MSC_VER
+#include <intrin.h>
+
+#pragma intrinsic(_umul128)
+#endif
+#include <cstring>
+#include "common/uint128.h"
+
+namespace Common {
+
+u128 Multiply64Into128(u64 a, u64 b) {
+    u128 result;
+#ifdef _MSC_VER
+    result[0] = _umul128(a, b, &result[1]);
+#else
+    unsigned __int128 tmp = a;
+    tmp *= b;
+    std::memcpy(&result, &tmp, sizeof(u128));
+#endif
+    return result;
+}
+
+std::pair<u64, u64> Divide128On32(u128 dividend, u32 divisor) {
+    u64 remainder = dividend[0] % divisor;
+    u64 accum = dividend[0] / divisor;
+    if (dividend[1] == 0)
+        return {accum, remainder};
+    // Quotient bits >= 64 (dividend[1] / divisor) are dropped; only the low 64 bits are returned
+    const u64 first_segment = (dividend[1] % divisor) << 32;
+    accum += (first_segment / divisor) << 32;
+    const u64 second_segment = (first_segment % divisor) << 32;
+    accum += (second_segment / divisor);
+    remainder += second_segment % divisor;
+    if (remainder >= divisor) {
+        accum++;
+        remainder -= divisor;
+    }
+    return {accum, remainder};
+}
+
+} // namespace Common
diff --git a/src/common/uint128.h b/src/common/uint128.h
new file mode 100644
index 000000000..52e6b46eb
--- /dev/null
+++ b/src/common/uint128.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <utility>
+#include "common/common_types.h"
+
+namespace Common {
+
+// Multiplies two u64 values and returns the full 128-bit product.
+u128 Multiply64Into128(u64 a, u64 b);
+
+// Divides a u128 by a u32; returns {quotient truncated to 64 bits, remainder}.
+std::pair<u64, u64> Divide128On32(u128 dividend, u32 divisor);
+
+} // namespace Common
diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp
index 9b7ca4030..4fdc12f11 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -12,6 +12,7 @@
 #include "core/core.h"
 #include "core/core_cpu.h"
 #include "core/core_timing.h"
+#include "core/core_timing_util.h"
 #include "core/gdbstub/gdbstub.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/svc.h"
@@ -119,7 +120,7 @@ public:
         return std::max(parent.core_timing.GetDowncount(), 0);
     }
     u64 GetCNTPCT() override {
-        return parent.core_timing.GetTicks();
+        return Timing::CpuCyclesToClockCycles(parent.core_timing.GetTicks());
     }
 
     ARM_Dynarmic& parent;
@@ -151,7 +152,7 @@ std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit() const {
     config.tpidr_el0 = &cb->tpidr_el0;
     config.dczid_el0 = 4;
     config.ctr_el0 = 0x8444c004;
-    config.cntfrq_el0 = 19200000; // Value from fusee.
+    config.cntfrq_el0 = Timing::CNTFREQ;
 
     // Unpredictable instructions
     config.define_unpredictable_behaviour = true;
diff --git a/src/core/core_timing_util.cpp b/src/core/core_timing_util.cpp
index 88ff70233..7942f30d6 100644
--- a/src/core/core_timing_util.cpp
+++ b/src/core/core_timing_util.cpp
@@ -7,6 +7,7 @@
 #include <cinttypes>
 #include <limits>
 #include "common/logging/log.h"
+#include "common/uint128.h"
 
 namespace Core::Timing {
 
@@ -60,4 +61,9 @@ s64 nsToCycles(u64 ns) {
     return (BASE_CLOCK_RATE * static_cast<s64>(ns)) / 1000000000;
 }
 
+u64 CpuCyclesToClockCycles(u64 ticks) {
+    const u128 temporal = Common::Multiply64Into128(ticks, CNTFREQ);
+    return Common::Divide128On32(temporal, static_cast<u32>(BASE_CLOCK_RATE)).first;
+}
+
 } // namespace Core::Timing
diff --git a/src/core/core_timing_util.h b/src/core/core_timing_util.h
index 513cfac1b..679aa3123 100644
--- a/src/core/core_timing_util.h
+++ b/src/core/core_timing_util.h
@@ -11,6 +11,7 @@ namespace Core::Timing {
 // The below clock rate is based on Switch's clockspeed being widely known as 1.020GHz
 // The exact value used is of course unverified.
 constexpr u64 BASE_CLOCK_RATE = 1019215872; // Switch clock speed is 1020MHz un/docked
+constexpr u64 CNTFREQ = 19200000;           // Value from fusee.
 
 inline s64 msToCycles(int ms) {
     // since ms is int there is no way to overflow
@@ -61,4 +62,6 @@ inline u64 cyclesToMs(s64 cycles) {
     return cycles * 1000 / BASE_CLOCK_RATE;
 }
 
+u64 CpuCyclesToClockCycles(u64 ticks);
+
 } // namespace Core::Timing
diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp
index 5fccfd9f4..e524509df 100644
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -199,8 +199,7 @@ void Scheduler::YieldWithoutLoadBalancing(Thread* thread) {
     ASSERT(thread->GetPriority() < THREADPRIO_COUNT);
 
     // Yield this thread -- sleep for zero time and force reschedule to different thread
-    WaitCurrentThread_Sleep();
-    GetCurrentThread()->WakeAfterDelay(0);
+    GetCurrentThread()->Sleep(0);
 }
 
 void Scheduler::YieldWithLoadBalancing(Thread* thread) {
@@ -215,8 +214,7 @@ void Scheduler::YieldWithLoadBalancing(Thread* thread) {
     ASSERT(priority < THREADPRIO_COUNT);
 
     // Sleep for zero time to be able to force reschedule to different thread
-    WaitCurrentThread_Sleep();
-    GetCurrentThread()->WakeAfterDelay(0);
+    GetCurrentThread()->Sleep(0);
 
     Thread* suggested_thread = nullptr;
 
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 77d0e3d96..047fa0c19 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -1284,10 +1284,14 @@ static ResultCode StartThread(Handle thread_handle) {
 
 /// Called when a thread exits
 static void ExitThread() {
-    LOG_TRACE(Kernel_SVC, "called, pc=0x{:08X}", Core::CurrentArmInterface().GetPC());
+    auto& system = Core::System::GetInstance();
 
-    ExitCurrentThread();
-    Core::System::GetInstance().PrepareReschedule();
+    LOG_TRACE(Kernel_SVC, "called, pc=0x{:08X}", system.CurrentArmInterface().GetPC());
+
+    auto* const current_thread = system.CurrentScheduler().GetCurrentThread();
+    current_thread->Stop();
+    system.CurrentScheduler().RemoveThread(current_thread);
+    system.PrepareReschedule();
 }
 
 /// Sleep the current thread
@@ -1300,32 +1304,32 @@ static void SleepThread(s64 nanoseconds) {
         YieldAndWaitForLoadBalancing = -2,
     };
 
+    auto& system = Core::System::GetInstance();
+    auto& scheduler = system.CurrentScheduler();
+    auto* const current_thread = scheduler.GetCurrentThread();
+
     if (nanoseconds <= 0) {
-        auto& scheduler{Core::System::GetInstance().CurrentScheduler()};
         switch (static_cast<SleepType>(nanoseconds)) {
         case SleepType::YieldWithoutLoadBalancing:
-            scheduler.YieldWithoutLoadBalancing(GetCurrentThread());
+            scheduler.YieldWithoutLoadBalancing(current_thread);
             break;
         case SleepType::YieldWithLoadBalancing:
-            scheduler.YieldWithLoadBalancing(GetCurrentThread());
+            scheduler.YieldWithLoadBalancing(current_thread);
             break;
         case SleepType::YieldAndWaitForLoadBalancing:
-            scheduler.YieldAndWaitForLoadBalancing(GetCurrentThread());
+            scheduler.YieldAndWaitForLoadBalancing(current_thread);
             break;
         default:
             UNREACHABLE_MSG("Unimplemented sleep yield type '{:016X}'!", nanoseconds);
         }
     } else {
-        // Sleep current thread and check for next thread to schedule
-        WaitCurrentThread_Sleep();
-
-        // Create an event to wake the thread up after the specified nanosecond delay has passed
-        GetCurrentThread()->WakeAfterDelay(nanoseconds);
+        current_thread->Sleep(nanoseconds);
     }
 
     // Reschedule all CPU cores
-    for (std::size_t i = 0; i < Core::NUM_CPU_CORES; ++i)
-        Core::System::GetInstance().CpuCore(i).PrepareReschedule();
+    for (std::size_t i = 0; i < Core::NUM_CPU_CORES; ++i) {
+        system.CpuCore(i).PrepareReschedule();
+    }
 }
 
 /// Wait process wide key atomic
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index eb54d6651..2e712c9cb 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -68,11 +68,6 @@ void Thread::Stop() {
     owner_process->FreeTLSSlot(tls_address);
 }
 
-void WaitCurrentThread_Sleep() {
-    Thread* thread = GetCurrentThread();
-    thread->SetStatus(ThreadStatus::WaitSleep);
-}
-
 void ExitCurrentThread() {
     Thread* thread = GetCurrentThread();
     thread->Stop();
@@ -391,6 +386,14 @@ void Thread::SetActivity(ThreadActivity value) {
     }
 }
 
+void Thread::Sleep(s64 nanoseconds) {
+    // Put this thread into the sleeping wait state
+    SetStatus(ThreadStatus::WaitSleep);
+
+    // Create an event to wake the thread up after the specified nanosecond delay has passed
+    WakeAfterDelay(nanoseconds);
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /**
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index c48b21aba..ccdefeecc 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -383,6 +383,9 @@ public:
 
     void SetActivity(ThreadActivity value);
 
+    /// Sleeps this thread for the given amount of nanoseconds.
+    void Sleep(s64 nanoseconds);
+
 private:
     explicit Thread(KernelCore& kernel);
     ~Thread() override;
@@ -460,14 +463,4 @@ private:
  */
 Thread* GetCurrentThread();
 
-/**
- * Waits the current thread on a sleep
- */
-void WaitCurrentThread_Sleep();
-
-/**
- * Stops the current thread and removes it from the thread_list
- */
-void ExitCurrentThread();
-
 } // namespace Kernel
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 4fde53033..e0cc5175f 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -18,6 +18,7 @@
 #include "core/hle/lock.h"
 #include "core/memory.h"
 #include "core/memory_setup.h"
+#include "video_core/gpu.h"
 #include "video_core/renderer_base.h"
 
 namespace Memory {
@@ -69,8 +70,8 @@ static void MapPages(PageTable& page_table, VAddr base, u64 size, u8* memory, Pa
 
     // During boot, current_page_table might not be set yet, in which case we need not flush
     if (current_page_table) {
-        RasterizerFlushVirtualRegion(base << PAGE_BITS, size * PAGE_SIZE,
-                                     FlushMode::FlushAndInvalidate);
+        Core::System::GetInstance().GPU().FlushAndInvalidateRegion(base << PAGE_BITS,
+                                                                   size * PAGE_SIZE);
     }
 
     VAddr end = base + size;
@@ -183,10 +184,10 @@ T Read(const VAddr vaddr) {
         ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
         break;
     case PageType::RasterizerCachedMemory: {
-        RasterizerFlushVirtualRegion(vaddr, sizeof(T), FlushMode::Flush);
-
+        auto host_ptr{GetPointerFromVMA(vaddr)};
+        Core::System::GetInstance().GPU().FlushRegion(ToCacheAddr(host_ptr), sizeof(T));
         T value;
-        std::memcpy(&value, GetPointerFromVMA(vaddr), sizeof(T));
+        std::memcpy(&value, host_ptr, sizeof(T));
         return value;
     }
     default:
@@ -214,8 +215,9 @@ void Write(const VAddr vaddr, const T data) {
         ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
         break;
     case PageType::RasterizerCachedMemory: {
-        RasterizerFlushVirtualRegion(vaddr, sizeof(T), FlushMode::Invalidate);
-        std::memcpy(GetPointerFromVMA(vaddr), &data, sizeof(T));
+        auto host_ptr{GetPointerFromVMA(vaddr)};
+        Core::System::GetInstance().GPU().InvalidateRegion(ToCacheAddr(host_ptr), sizeof(T));
+        std::memcpy(host_ptr, &data, sizeof(T));
         break;
     }
     default:
@@ -338,47 +340,6 @@ void RasterizerMarkRegionCached(VAddr vaddr, u64 size, bool cached) {
     }
 }
 
-void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) {
-    auto& system_instance = Core::System::GetInstance();
-
-    // Since pages are unmapped on shutdown after video core is shutdown, the renderer may be
-    // null here
-    if (!system_instance.IsPoweredOn()) {
-        return;
-    }
-
-    const VAddr end = start + size;
-
-    const auto CheckRegion = [&](VAddr region_start, VAddr region_end) {
-        if (start >= region_end || end <= region_start) {
-            // No overlap with region
-            return;
-        }
-
-        const VAddr overlap_start = std::max(start, region_start);
-        const VAddr overlap_end = std::min(end, region_end);
-        const VAddr overlap_size = overlap_end - overlap_start;
-
-        auto& gpu = system_instance.GPU();
-        switch (mode) {
-        case FlushMode::Flush:
-            gpu.FlushRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);
-            break;
-        case FlushMode::Invalidate:
-            gpu.InvalidateRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);
-            break;
-        case FlushMode::FlushAndInvalidate:
-            gpu.FlushAndInvalidateRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);
-            break;
-        }
-    };
-
-    const auto& vm_manager = Core::CurrentProcess()->VMManager();
-
-    CheckRegion(vm_manager.GetCodeRegionBaseAddress(), vm_manager.GetCodeRegionEndAddress());
-    CheckRegion(vm_manager.GetHeapRegionBaseAddress(), vm_manager.GetHeapRegionEndAddress());
-}
-
 u8 Read8(const VAddr addr) {
     return Read<u8>(addr);
 }
@@ -424,9 +385,9 @@ void ReadBlock(const Kernel::Process& process, const VAddr src_addr, void* dest_
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
-                                         FlushMode::Flush);
-            std::memcpy(dest_buffer, GetPointerFromVMA(process, current_vaddr), copy_amount);
+            const auto& host_ptr{GetPointerFromVMA(process, current_vaddr)};
+            Core::System::GetInstance().GPU().FlushRegion(ToCacheAddr(host_ptr), copy_amount);
+            std::memcpy(dest_buffer, host_ptr, copy_amount);
             break;
         }
         default:
@@ -487,9 +448,9 @@ void WriteBlock(const Kernel::Process& process, const VAddr dest_addr, const voi
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
-                                         FlushMode::Invalidate);
-            std::memcpy(GetPointerFromVMA(process, current_vaddr), src_buffer, copy_amount);
+            const auto& host_ptr{GetPointerFromVMA(process, current_vaddr)};
+            Core::System::GetInstance().GPU().InvalidateRegion(ToCacheAddr(host_ptr), copy_amount);
+            std::memcpy(host_ptr, src_buffer, copy_amount);
             break;
         }
         default:
@@ -533,9 +494,9 @@ void ZeroBlock(const Kernel::Process& process, const VAddr dest_addr, const std:
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
-                                         FlushMode::Invalidate);
-            std::memset(GetPointerFromVMA(process, current_vaddr), 0, copy_amount);
+            const auto& host_ptr{GetPointerFromVMA(process, current_vaddr)};
+            Core::System::GetInstance().GPU().InvalidateRegion(ToCacheAddr(host_ptr), copy_amount);
+            std::memset(host_ptr, 0, copy_amount);
             break;
         }
         default:
@@ -575,9 +536,9 @@ void CopyBlock(const Kernel::Process& process, VAddr dest_addr, VAddr src_addr,
             break;
         }
         case PageType::RasterizerCachedMemory: {
-            RasterizerFlushVirtualRegion(current_vaddr, static_cast<u32>(copy_amount),
-                                         FlushMode::Flush);
-            WriteBlock(process, dest_addr, GetPointerFromVMA(process, current_vaddr), copy_amount);
+            const auto& host_ptr{GetPointerFromVMA(process, current_vaddr)};
+            Core::System::GetInstance().GPU().FlushRegion(ToCacheAddr(host_ptr), copy_amount);
+            WriteBlock(process, dest_addr, host_ptr, copy_amount);
             break;
         }
         default:
diff --git a/src/core/memory.h b/src/core/memory.h
index 1acf5ce8c..c2c6643ee 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -161,10 +161,4 @@ enum class FlushMode {
  */
 void RasterizerMarkRegionCached(VAddr vaddr, u64 size, bool cached);
 
-/**
- * Flushes and invalidates any externally cached rasterizer resources touching the given virtual
- * address region.
- */
-void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode);
-
 } // namespace Memory
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index b97576309..5e3d862c6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -164,8 +164,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
         // Reset the screen info's display texture to its own permanent texture
         screen_info.display_texture = screen_info.texture.resource.handle;
 
-        Memory::RasterizerFlushVirtualRegion(framebuffer_addr, size_in_bytes,
-                                             Memory::FlushMode::Flush);
+        rasterizer->FlushRegion(ToCacheAddr(Memory::GetPointer(framebuffer_addr)), size_in_bytes);
 
         constexpr u32 linear_bpp = 4;
         VideoCore::MortonCopyPixels128(VideoCore::MortonSwizzleMode::MortonToLinear,