From a9ca39f8591532ba6d37f7a3e068d5eefe416464 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Mon, 7 Feb 2022 07:52:04 +0100
Subject: NVDRV: Further improvements.

---
 src/core/hle/service/nvdrv/core/container.cpp      |   8 +-
 src/core/hle/service/nvdrv/core/container.h        |   8 +-
 src/core/hle/service/nvdrv/core/nvmap.cpp          |   7 +-
 src/core/hle/service/nvdrv/core/nvmap.h            |   7 +-
 .../hle/service/nvdrv/core/syncpoint_manager.cpp   | 112 ++++++++++++++++---
 .../hle/service/nvdrv/core/syncpoint_manager.h     | 120 ++++++++++++++-------
 src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp |  18 ++--
 src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp  |  59 +++++-----
 src/core/hle/service/nvdrv/devices/nvhost_gpu.h    |  19 ++--
 .../hle/service/nvdrv/devices/nvhost_nvdec.cpp     |   2 +-
 .../service/nvdrv/devices/nvhost_nvdec_common.cpp  |  15 ++-
 .../service/nvdrv/devices/nvhost_nvdec_common.h    |   6 +-
 src/core/hle/service/nvdrv/devices/nvhost_vic.cpp  |   2 +-
 src/video_core/engines/maxwell_3d.cpp              |  18 +---
 src/video_core/engines/maxwell_dma.cpp             |  18 +++-
 src/video_core/engines/puller.cpp                  |  18 +---
 16 files changed, 278 insertions(+), 159 deletions(-)

diff --git a/src/core/hle/service/nvdrv/core/container.cpp b/src/core/hle/service/nvdrv/core/container.cpp
index fbd66f001..4175d3d9c 100644
--- a/src/core/hle/service/nvdrv/core/container.cpp
+++ b/src/core/hle/service/nvdrv/core/container.cpp
@@ -1,7 +1,7 @@
-// Copyright 2021 yuzu emulator team
-// Copyright 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
+// SPDX-FileCopyrightText: 2022 yuzu emulator team and Skyline Team and Contributors
+// (https://github.com/skyline-emu/)
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Licensed under GPLv3 or any later version. Refer to the license.txt file included.
 
 #include "core/hle/service/nvdrv/core/container.h"
 #include "core/hle/service/nvdrv/core/nvmap.h"
diff --git a/src/core/hle/service/nvdrv/core/container.h b/src/core/hle/service/nvdrv/core/container.h
index da75d74ff..e069ade4e 100644
--- a/src/core/hle/service/nvdrv/core/container.h
+++ b/src/core/hle/service/nvdrv/core/container.h
@@ -1,7 +1,7 @@
-// Copyright 2021 yuzu emulator team
-// Copyright 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
+// SPDX-FileCopyrightText: 2022 yuzu emulator team and Skyline Team and Contributors
+// (https://github.com/skyline-emu/)
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Licensed under GPLv3 or any later version. Refer to the license.txt file included.
 
 #pragma once
 
diff --git a/src/core/hle/service/nvdrv/core/nvmap.cpp b/src/core/hle/service/nvdrv/core/nvmap.cpp
index 9acec7ba6..86d825af9 100644
--- a/src/core/hle/service/nvdrv/core/nvmap.cpp
+++ b/src/core/hle/service/nvdrv/core/nvmap.cpp
@@ -1,6 +1,7 @@
-// Copyright 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
+// SPDX-FileCopyrightText: 2022 yuzu emulator team and Skyline Team and Contributors
+// (https://github.com/skyline-emu/)
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Licensed under GPLv3 or any later version. Refer to the license.txt file included.
 
 #include "common/alignment.h"
 #include "common/assert.h"
diff --git a/src/core/hle/service/nvdrv/core/nvmap.h b/src/core/hle/service/nvdrv/core/nvmap.h
index 5acdc961e..4f37dcf43 100644
--- a/src/core/hle/service/nvdrv/core/nvmap.h
+++ b/src/core/hle/service/nvdrv/core/nvmap.h
@@ -1,6 +1,7 @@
-// Copyright 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
+// SPDX-FileCopyrightText: 2022 yuzu emulator team and Skyline Team and Contributors
+// (https://github.com/skyline-emu/)
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Licensed under GPLv3 or any later version. Refer to the license.txt file included.
 
 #pragma once
 
diff --git a/src/core/hle/service/nvdrv/core/syncpoint_manager.cpp b/src/core/hle/service/nvdrv/core/syncpoint_manager.cpp
index 61e00448c..b34481b48 100644
--- a/src/core/hle/service/nvdrv/core/syncpoint_manager.cpp
+++ b/src/core/hle/service/nvdrv/core/syncpoint_manager.cpp
@@ -1,5 +1,7 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: 2022 yuzu emulator team and Skyline Team and Contributors
+// (https://github.com/skyline-emu/)
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Licensed under GPLv3 or any later version. Refer to the license.txt file included.
 
 #include "common/assert.h"
 #include "core/hle/service/nvdrv/core/syncpoint_manager.h"
@@ -7,32 +9,108 @@
 
 namespace Service::Nvidia::NvCore {
 
-SyncpointManager::SyncpointManager(Tegra::Host1x::Host1x& host1x_) : host1x{host1x_} {}
+SyncpointManager::SyncpointManager(Tegra::Host1x::Host1x& host1x_) : host1x{host1x_} {
+    constexpr u32 vblank0_syncpoint{26};
+    constexpr u32 vblank1_syncpoint{27};
+
+    // The two vblank syncpoints operate in Continuous Mode, so reserve them as client managed.
+    // Continuous Mode is described in section 14.3.5.3 of the TRM; see also:
+    // https://github.com/Jetson-TX1-AndroidTV/android_kernel_jetson_tx1_hdmi_primary/blob/8f74a72394efb871cb3f886a3de2998cd7ff2990/drivers/gpu/host1x/drm/dc.c#L660
+    ReserveSyncpoint(vblank0_syncpoint, true);
+    ReserveSyncpoint(vblank1_syncpoint, true);
+    // Reserve every fixed per-channel syncpoint; zero entries in the table are skipped.
+    for (const u32 fixed_id : channel_syncpoints) {
+        if (fixed_id != 0) {
+            ReserveSyncpoint(fixed_id, false);
+        }
+    }
+}
 
 SyncpointManager::~SyncpointManager() = default;
 
-u32 SyncpointManager::RefreshSyncpoint(u32 syncpoint_id) {
-    syncpoints[syncpoint_id].min = host1x.GetSyncpointManager().GetHostSyncpointValue(syncpoint_id);
-    return GetSyncpointMin(syncpoint_id);
+u32 SyncpointManager::ReserveSyncpoint(u32 id, bool clientManaged) {
+    SyncpointInfo& syncpoint{syncpoints.at(id)}; // at() rejects out-of-range ids
+    if (syncpoint.reserved) {
+        UNREACHABLE_MSG("Requested syncpoint is in use");
+        return 0;
+    }
+    // Mark the slot as taken and record who manages its counters
+    syncpoint.reserved = true;
+    syncpoint.interfaceManaged = clientManaged;
+    return id;
+}
 
-u32 SyncpointManager::AllocateSyncpoint() {
-    for (u32 syncpoint_id = 1; syncpoint_id < MaxSyncPoints; syncpoint_id++) {
-        if (!syncpoints[syncpoint_id].is_allocated) {
-            syncpoints[syncpoint_id].is_allocated = true;
-            return syncpoint_id;
+u32 SyncpointManager::FindFreeSyncpoint() {
+    for (u32 i{1}; i < syncpoints.size(); i++) {
+        if (!syncpoints[i].reserved) {
+            return i;
         }
     }
-    ASSERT_MSG(false, "No more available syncpoints!");
-    return {};
+    UNREACHABLE_MSG("Failed to find a free syncpoint!");
+    return 0;
+}
+
+u32 SyncpointManager::AllocateSyncpoint(bool clientManaged) {
+    std::lock_guard lock(reservation_lock); // Held for both the search and the reservation
+    return ReserveSyncpoint(FindFreeSyncpoint(), clientManaged);
+}
+
+bool SyncpointManager::IsSyncpointAllocated(u32 id) {
+    return (id < SyncpointCount) && syncpoints[id].reserved; // Valid indices are 0..Count-1
+}
+
+bool SyncpointManager::HasSyncpointExpired(u32 id, u32 threshold) {
+    const SyncpointInfo& syncpoint{syncpoints.at(id)};
+
+    if (!syncpoint.reserved) {
+        UNREACHABLE();
+        return false; // An unreserved syncpoint is never reported as expired
+    }
+
+    // If the interface manages counters then we don't keep track of the maximum value as it handles
+    // sanity checking the values then
+    if (syncpoint.interfaceManaged) {
+        return static_cast<s32>(syncpoint.counterMin - threshold) >= 0;
+    } else {
+        return (syncpoint.counterMax - threshold) >= (syncpoint.counterMin - threshold);
+    }
+}
+
+u32 SyncpointManager::IncrementSyncpointMaxExt(u32 id, u32 amount) {
+    SyncpointInfo& syncpoint{syncpoints.at(id)};
+    if (!syncpoint.reserved) {
+        UNREACHABLE();
+        return 0;
+    }
+    // Compound assignment on the atomic counter yields the post-increment value
+    return syncpoint.counterMax += amount;
+}
+
+u32 SyncpointManager::ReadSyncpointMinValue(u32 id) {
+    const SyncpointInfo& syncpoint{syncpoints.at(id)};
+    if (!syncpoint.reserved) {
+        UNREACHABLE();
+        return 0;
+    }
+    // Returns the cached minimum only; host1x is not queried here
+    return syncpoint.counterMin;
+}
+
+u32 SyncpointManager::UpdateMin(u32 id) {
+    if (!syncpoints.at(id).reserved) {
+        UNREACHABLE();
+        return 0;
+    }
+
+    syncpoints.at(id).counterMin = host1x.GetSyncpointManager().GetHostSyncpointValue(id);
+    return syncpoints.at(id).counterMin;
 }
 
-u32 SyncpointManager::IncreaseSyncpoint(u32 syncpoint_id, u32 value) {
-    for (u32 index = 0; index < value; ++index) {
-        syncpoints[syncpoint_id].max.fetch_add(1, std::memory_order_relaxed);
+NvFence SyncpointManager::GetSyncpointFence(u32 id) {
+    if (!syncpoints.at(id).reserved) {
+        UNREACHABLE();
+        return NvFence{};
     }
 
-    return GetSyncpointMax(syncpoint_id);
+    return {.id = static_cast<s32>(id), .value = syncpoints.at(id).counterMax};
 }
 
 } // namespace Service::Nvidia::NvCore
diff --git a/src/core/hle/service/nvdrv/core/syncpoint_manager.h b/src/core/hle/service/nvdrv/core/syncpoint_manager.h
index f332edc6e..bfc8ba84b 100644
--- a/src/core/hle/service/nvdrv/core/syncpoint_manager.h
+++ b/src/core/hle/service/nvdrv/core/syncpoint_manager.h
@@ -1,10 +1,13 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: 2022 yuzu emulator team and Skyline Team and Contributors
+// (https://github.com/skyline-emu/)
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Licensed under GPLv3 or any later version. Refer to the license.txt file included.
 
 #pragma once
 
 #include <array>
 #include <atomic>
+#include <mutex>
 
 #include "common/common_types.h"
 #include "core/hle/service/nvdrv/nvdata.h"
@@ -19,68 +22,111 @@ class Host1x;
 
 namespace Service::Nvidia::NvCore {
 
+enum class ChannelType : u32 { // Values index SyncpointManager::channel_syncpoints
+    MsEnc = 0,
+    VIC = 1,
+    GPU = 2,
+    NvDec = 3,
+    Display = 4,
+    NvJpg = 5,
+    TSec = 6,
+    Max = 7 // Number of channel types; sizes the channel_syncpoints array
+};
+
+/**
+ * @brief SyncpointManager handles allocating and accessing host1x syncpoints, these are cached
+ * versions of the HW syncpoints which are intermittently synced
+ * @note Refer to Chapter 14 of the Tegra X1 TRM for an exhaustive overview of them
+ * @url https://http.download.nvidia.com/tegra-public-appnotes/host1x.html
+ * @url
+ * https://github.com/Jetson-TX1-AndroidTV/android_kernel_jetson_tx1_hdmi_primary/blob/jetson-tx1/drivers/video/tegra/host/nvhost_syncpt.c
+ */
 class SyncpointManager final {
 public:
     explicit SyncpointManager(Tegra::Host1x::Host1x& host1x);
     ~SyncpointManager();
 
     /**
-     * Returns true if the specified syncpoint is expired for the given value.
-     * @param syncpoint_id Syncpoint ID to check.
-     * @param value Value to check against the specified syncpoint.
-     * @returns True if the specified syncpoint is expired for the given value, otherwise False.
+     * @brief Checks if the given syncpoint is both allocated and below the number of HW syncpoints
      */
-    bool IsSyncpointExpired(u32 syncpoint_id, u32 value) const {
-        return (GetSyncpointMax(syncpoint_id) - value) >= (GetSyncpointMin(syncpoint_id) - value);
-    }
+    bool IsSyncpointAllocated(u32 id);
 
     /**
-     * Gets the lower bound for the specified syncpoint.
-     * @param syncpoint_id Syncpoint ID to get the lower bound for.
-     * @returns The lower bound for the specified syncpoint.
+     * @brief Finds a free syncpoint and reserves it
+     * @return The ID of the reserved syncpoint
      */
-    u32 GetSyncpointMin(u32 syncpoint_id) const {
-        return syncpoints.at(syncpoint_id).min.load(std::memory_order_relaxed);
-    }
+    u32 AllocateSyncpoint(bool clientManaged);
 
     /**
-     * Gets the uper bound for the specified syncpoint.
-     * @param syncpoint_id Syncpoint ID to get the upper bound for.
-     * @returns The upper bound for the specified syncpoint.
+     * @url
+     * https://github.com/Jetson-TX1-AndroidTV/android_kernel_jetson_tx1_hdmi_primary/blob/8f74a72394efb871cb3f886a3de2998cd7ff2990/drivers/gpu/host1x/syncpt.c#L259
      */
-    u32 GetSyncpointMax(u32 syncpoint_id) const {
-        return syncpoints.at(syncpoint_id).max.load(std::memory_order_relaxed);
+    bool HasSyncpointExpired(u32 id, u32 threshold);
+
+    bool IsFenceSignalled(NvFence fence) {
+        return HasSyncpointExpired(fence.id, fence.value);
     }
 
     /**
-     * Refreshes the minimum value for the specified syncpoint.
-     * @param syncpoint_id Syncpoint ID to be refreshed.
-     * @returns The new syncpoint minimum value.
+     * @brief Atomically increments the maximum value of a syncpoint by the given amount
+     * @return The new max value of the syncpoint
      */
-    u32 RefreshSyncpoint(u32 syncpoint_id);
+    u32 IncrementSyncpointMaxExt(u32 id, u32 amount);
 
     /**
-     * Allocates a new syncoint.
-     * @returns The syncpoint ID for the newly allocated syncpoint.
+     * @return The minimum value of the syncpoint
      */
-    u32 AllocateSyncpoint();
+    u32 ReadSyncpointMinValue(u32 id);
 
     /**
-     * Increases the maximum value for the specified syncpoint.
-     * @param syncpoint_id Syncpoint ID to be increased.
-     * @param value Value to increase the specified syncpoint by.
-     * @returns The new syncpoint maximum value.
+     * @brief Synchronises the minimum value of the syncpoint with the GPU
+     * @return The new minimum value of the syncpoint
      */
-    u32 IncreaseSyncpoint(u32 syncpoint_id, u32 value);
+    u32 UpdateMin(u32 id);
+
+    /**
+     * @return A fence that will be signalled once this syncpoint hits its maximum value
+     */
+    NvFence GetSyncpointFence(u32 id);
+
+    static constexpr std::array<u32, static_cast<u32>(ChannelType::Max)> channel_syncpoints{
+        0x0,  // `MsEnc` is unimplemented
+        0xC,  // `VIC`
+        0x0,  // `GPU` syncpoints are allocated per-channel instead
+        0x36, // `NvDec`
+        0x0,  // `Display` is unimplemented
+        0x37, // `NvJpg`
+        0x0,  // `TSec` is unimplemented
+    };        //!< Maps each channel ID to a constant syncpoint
 
 private:
-    struct Syncpoint {
-        std::atomic<u32> min;
-        std::atomic<u32> max;
-        std::atomic<bool> is_allocated;
+    /**
+     * @note reservation_lock should be locked when calling this
+     */
+    u32 ReserveSyncpoint(u32 id, bool clientManaged);
+
+    /**
+     * @return The ID of the first free syncpoint
+     */
+    u32 FindFreeSyncpoint();
+
+    struct SyncpointInfo {
+        std::atomic<u32> counterMin; //!< The least value the syncpoint can be (The value it was
+                                     //!< when it was last synchronized with host1x)
+        std::atomic<u32> counterMax; //!< The maximum value the syncpoint can reach according to the
+                                     //!< current usage
+        bool interfaceManaged; //!< If the syncpoint is managed by a host1x client interface, a
+                               //!< client interface is a HW block that can handle host1x
+                               //!< transactions on behalf of a host1x client (Which would otherwise
+                               //!< need to be manually synced using PIO which is synchronous and
+                               //!< requires direct cooperation of the CPU)
+        bool reserved; //!< If the syncpoint is reserved or not, not to be confused with a reserved
+                       //!< value
     };
 
-    std::array<Syncpoint, MaxSyncPoints> syncpoints{};
+    constexpr static std::size_t SyncpointCount{192};
+    std::array<SyncpointInfo, SyncpointCount> syncpoints{};
+    std::mutex reservation_lock;
 
     Tegra::Host1x::Host1x& host1x;
 };
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
index 076edb02f..a84e4d425 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
@@ -112,17 +112,23 @@ NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector
     }
 
     if (params.fence.value == 0) {
-        params.value.raw = syncpoint_manager.GetSyncpointMin(fence_id);
+        if (!syncpoint_manager.IsSyncpointAllocated(params.fence.id)) {
+            LOG_WARNING(Service_NVDRV,
+                        "Unallocated syncpt_id={}, threshold={}, timeout={}, is_allocation={}",
+                        params.fence.id, params.fence.value, params.timeout, is_allocation);
+        } else {
+            params.value.raw = syncpoint_manager.ReadSyncpointMinValue(fence_id);
+        }
         return NvResult::Success;
     }
 
-    if (syncpoint_manager.IsSyncpointExpired(fence_id, params.fence.value)) {
-        params.value.raw = syncpoint_manager.GetSyncpointMin(fence_id);
+    if (syncpoint_manager.IsFenceSignalled(params.fence)) {
+        params.value.raw = syncpoint_manager.ReadSyncpointMinValue(fence_id);
         return NvResult::Success;
     }
 
-    if (const auto new_value = syncpoint_manager.RefreshSyncpoint(fence_id);
-        syncpoint_manager.IsSyncpointExpired(fence_id, params.fence.value)) {
+    if (const auto new_value = syncpoint_manager.UpdateMin(fence_id);
+        syncpoint_manager.IsFenceSignalled(params.fence)) {
         params.value.raw = new_value;
         return NvResult::Success;
     }
@@ -296,7 +302,7 @@ NvResult nvhost_ctrl::IocCtrlClearEventWait(const std::vector<u8>& input, std::v
         EventState::Waiting) {
         auto& host1x_syncpoint_manager = system.Host1x().GetSyncpointManager();
         host1x_syncpoint_manager.DeregisterHostAction(event.assigned_syncpt, event.wait_handle);
-        syncpoint_manager.RefreshSyncpoint(event.assigned_syncpt);
+        syncpoint_manager.UpdateMin(event.assigned_syncpt);
         event.wait_handle = {};
     }
     event.fails++;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 3f981af5a..c2cc09993 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -31,9 +31,7 @@ nvhost_gpu::nvhost_gpu(Core::System& system_, EventInterface& events_interface_,
     : nvdevice{system_}, events_interface{events_interface_}, core{core_},
       syncpoint_manager{core_.GetSyncpointManager()}, nvmap{core.GetNvMapFile()},
       channel_state{system.GPU().AllocateChannel()} {
-    channel_fence.id = syncpoint_manager.AllocateSyncpoint();
-    channel_fence.value =
-        system_.Host1x().GetSyncpointManager().GetGuestSyncpointValue(channel_fence.id);
+    channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false);
     sm_exception_breakpoint_int_report_event =
         events_interface.CreateEvent("GpuChannelSMExceptionBreakpointInt");
     sm_exception_breakpoint_pause_report_event =
@@ -191,10 +189,8 @@ NvResult nvhost_gpu::AllocGPFIFOEx2(const std::vector<u8>& input, std::vector<u8
     }
 
     system.GPU().InitChannel(*channel_state);
-    channel_fence.value =
-        system.Host1x().GetSyncpointManager().GetGuestSyncpointValue(channel_fence.id);
 
-    params.fence_out = channel_fence;
+    params.fence_out = syncpoint_manager.GetSyncpointFence(channel_syncpoint);
 
     std::memcpy(output.data(), &params, output.size());
     return NvResult::Success;
@@ -222,14 +218,13 @@ static std::vector<Tegra::CommandHeader> BuildWaitCommandList(NvFence fence) {
     };
 }
 
-static std::vector<Tegra::CommandHeader> BuildIncrementCommandList(NvFence fence,
-                                                                   u32 add_increment) {
+static std::vector<Tegra::CommandHeader> BuildIncrementCommandList(NvFence fence) {
     std::vector<Tegra::CommandHeader> result{
         Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1,
                                   Tegra::SubmissionMode::Increasing),
         {}};
 
-    for (u32 count = 0; count < add_increment; ++count) {
+    for (u32 count = 0; count < 2; ++count) {
         result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1,
                                                       Tegra::SubmissionMode::Increasing));
         result.emplace_back(
@@ -239,14 +234,12 @@ static std::vector<Tegra::CommandHeader> BuildIncrementCommandList(NvFence fence
     return result;
 }
 
-static std::vector<Tegra::CommandHeader> BuildIncrementWithWfiCommandList(NvFence fence,
-                                                                          u32 add_increment) {
+static std::vector<Tegra::CommandHeader> BuildIncrementWithWfiCommandList(NvFence fence) {
     std::vector<Tegra::CommandHeader> result{
         Tegra::BuildCommandHeader(Tegra::BufferMethods::WaitForIdle, 1,
                                   Tegra::SubmissionMode::Increasing),
         {}};
-    const std::vector<Tegra::CommandHeader> increment{
-        BuildIncrementCommandList(fence, add_increment)};
+    const std::vector<Tegra::CommandHeader> increment{BuildIncrementCommandList(fence)};
 
     result.insert(result.end(), increment.begin(), increment.end());
 
@@ -260,35 +253,41 @@ NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector<u8>
 
     auto& gpu = system.GPU();
 
+    std::scoped_lock lock(channel_mutex);
+
     const auto bind_id = channel_state->bind_id;
 
-    params.fence_out.id = channel_fence.id;
+    auto& flags = params.flags;
 
-    if (params.flags.add_wait.Value() &&
-        !syncpoint_manager.IsSyncpointExpired(params.fence_out.id, params.fence_out.value)) {
-        gpu.PushGPUEntries(bind_id, Tegra::CommandList{BuildWaitCommandList(params.fence_out)});
-    }
+    if (flags.fence_wait.Value()) {
+        if (flags.increment_value.Value()) {
+            return NvResult::BadParameter;
+        }
 
-    if (params.flags.add_increment.Value() || params.flags.increment.Value()) {
-        const u32 increment_value = params.flags.increment.Value() ? params.fence_out.value : 0;
-        params.fence_out.value = syncpoint_manager.IncreaseSyncpoint(
-            params.fence_out.id, params.AddIncrementValue() + increment_value);
-    } else {
-        params.fence_out.value = syncpoint_manager.GetSyncpointMax(params.fence_out.id);
+        if (!syncpoint_manager.IsFenceSignalled(params.fence)) {
+            gpu.PushGPUEntries(bind_id, Tegra::CommandList{BuildWaitCommandList(params.fence)});
+        }
     }
 
     gpu.PushGPUEntries(bind_id, std::move(entries));
+    params.fence.id = channel_syncpoint;
+
+    u32 increment{(flags.fence_increment.Value() != 0 ? 2 : 0) +
+                  (flags.increment_value.Value() != 0 ? params.fence.value : 0)};
+    params.fence.value = syncpoint_manager.IncrementSyncpointMaxExt(channel_syncpoint, increment);
 
-    if (params.flags.add_increment.Value()) {
-        if (params.flags.suppress_wfi) {
-            gpu.PushGPUEntries(bind_id, Tegra::CommandList{BuildIncrementCommandList(
-                                            params.fence_out, params.AddIncrementValue())});
+    if (flags.fence_increment.Value()) {
+        if (flags.suppress_wfi.Value()) {
+            gpu.PushGPUEntries(bind_id,
+                               Tegra::CommandList{BuildIncrementCommandList(params.fence)});
         } else {
-            gpu.PushGPUEntries(bind_id, Tegra::CommandList{BuildIncrementWithWfiCommandList(
-                                            params.fence_out, params.AddIncrementValue())});
+            gpu.PushGPUEntries(bind_id,
+                               Tegra::CommandList{BuildIncrementWithWfiCommandList(params.fence)});
         }
     }
 
+    flags.raw = 0;
+
     std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo));
     return NvResult::Success;
 }
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
index 3a65ed06d..1e4ecd55b 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
@@ -163,17 +163,13 @@ private:
         u32_le num_entries{}; // number of fence objects being submitted
         union {
             u32_le raw;
-            BitField<0, 1, u32_le> add_wait;      // append a wait sync_point to the list
-            BitField<1, 1, u32_le> add_increment; // append an increment to the list
-            BitField<2, 1, u32_le> new_hw_format; // mostly ignored
-            BitField<4, 1, u32_le> suppress_wfi;  // suppress wait for interrupt
-            BitField<8, 1, u32_le> increment;     // increment the returned fence
+            BitField<0, 1, u32_le> fence_wait;      // append a wait sync_point to the list
+            BitField<1, 1, u32_le> fence_increment; // append an increment to the list
+            BitField<2, 1, u32_le> new_hw_format;   // mostly ignored
+            BitField<4, 1, u32_le> suppress_wfi;    // suppress wait for interrupt
+            BitField<8, 1, u32_le> increment_value; // increment the returned fence
         } flags;
-        NvFence fence_out{}; // returned new fence object for others to wait on
-
-        u32 AddIncrementValue() const {
-            return flags.add_increment.Value() << 1;
-        }
+        NvFence fence{}; // returned new fence object for others to wait on
     };
     static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(NvFence),
                   "IoctlSubmitGpfifo is incorrect size");
@@ -213,7 +209,8 @@ private:
     NvCore::SyncpointManager& syncpoint_manager;
     NvCore::NvMap& nvmap;
     std::shared_ptr<Tegra::Control::ChannelState> channel_state;
-    NvFence channel_fence;
+    u32 channel_syncpoint;
+    std::mutex channel_mutex;
 
     // Events
     Kernel::KEvent* sm_exception_breakpoint_int_report_event;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
index 00947ea19..5e3820085 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -13,7 +13,7 @@ namespace Service::Nvidia::Devices {
 u32 nvhost_nvdec::next_id{};
 
 nvhost_nvdec::nvhost_nvdec(Core::System& system_, NvCore::Container& core)
-    : nvhost_nvdec_common{system_, core} {}
+    : nvhost_nvdec_common{system_, core, NvCore::ChannelType::NvDec} {}
 nvhost_nvdec::~nvhost_nvdec() = default;
 
 NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, const std::vector<u8>& input,
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
index b17589aa3..008092dbb 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -48,9 +48,10 @@ std::size_t WriteVectors(std::vector<u8>& dst, const std::vector<T>& src, std::s
 
 std::unordered_map<DeviceFD, u32> nvhost_nvdec_common::fd_to_id{};
 
-nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_)
-    : nvdevice{system_}, core{core_},
-      syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()} {}
+nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_,
+                                         NvCore::ChannelType channel_type_)
+    : nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()},
+      nvmap{core.GetNvMapFile()}, channel_type{channel_type_} {}
 nvhost_nvdec_common::~nvhost_nvdec_common() = default;
 
 NvResult nvhost_nvdec_common::SetNVMAPfd(const std::vector<u8>& input) {
@@ -88,7 +89,7 @@ NvResult nvhost_nvdec_common::Submit(DeviceFD fd, const std::vector<u8>& input,
         for (std::size_t i = 0; i < syncpt_increments.size(); i++) {
             const SyncptIncr& syncpt_incr = syncpt_increments[i];
             fence_thresholds[i] =
-                syncpoint_manager.IncreaseSyncpoint(syncpt_incr.id, syncpt_incr.increments);
+                syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments);
         }
     }
     for (const auto& cmd_buffer : command_buffers) {
@@ -116,10 +117,8 @@ NvResult nvhost_nvdec_common::GetSyncpoint(const std::vector<u8>& input, std::ve
     std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
     LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", params.param);
 
-    if (device_syncpoints[params.param] == 0 && system.GPU().UseNvdec()) {
-        device_syncpoints[params.param] = syncpoint_manager.AllocateSyncpoint();
-    }
-    params.value = device_syncpoints[params.param];
+    const u32 id{NvCore::SyncpointManager::channel_syncpoints[static_cast<u32>(channel_type)]};
+    params.value = id;
     std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
 
     return NvResult::Success;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
index 53029af6a..51bb7c2cb 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -6,6 +6,7 @@
 #include <vector>
 #include "common/common_types.h"
 #include "common/swap.h"
+#include "core/hle/service/nvdrv/core/syncpoint_manager.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
 
 namespace Service::Nvidia {
@@ -13,14 +14,14 @@ namespace Service::Nvidia {
 namespace NvCore {
 class Container;
 class NvMap;
-class SyncpointManager;
 } // namespace NvCore
 
 namespace Devices {
 
 class nvhost_nvdec_common : public nvdevice {
 public:
-    explicit nvhost_nvdec_common(Core::System& system_, NvCore::Container& core);
+    explicit nvhost_nvdec_common(Core::System& system_, NvCore::Container& core,
+                                 NvCore::ChannelType channel_type);
     ~nvhost_nvdec_common() override;
 
 protected:
@@ -121,6 +122,7 @@ protected:
     NvCore::Container& core;
     NvCore::SyncpointManager& syncpoint_manager;
     NvCore::NvMap& nvmap;
+    NvCore::ChannelType channel_type;
     std::array<u32, MaxSyncPoints> device_syncpoints{};
 };
 }; // namespace Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
index c89ff6b27..490e399f4 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -12,7 +12,7 @@ namespace Service::Nvidia::Devices {
 u32 nvhost_vic::next_id{};
 
 nvhost_vic::nvhost_vic(Core::System& system_, NvCore::Container& core)
-    : nvhost_nvdec_common{system_, core} {}
+    : nvhost_nvdec_common{system_, core, NvCore::ChannelType::VIC} {}
 
 nvhost_vic::~nvhost_vic() = default;
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 632052c53..3c6e44a25 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -453,18 +453,10 @@ void Maxwell3D::ProcessFirmwareCall4() {
 }
 
 void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
-    struct LongQueryResult {
-        u64_le value;
-        u64_le timestamp;
-    };
-    static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size");
     const GPUVAddr sequence_address{regs.query.QueryAddress()};
     if (long_query) {
-        // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast
-        // GPU, this command may actually take a while to complete in real hardware due to GPU
-        // wait queues.
-        LongQueryResult query_result{payload, system.GPU().GetTicks()};
-        memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
+        memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks());
+        memory_manager.Write<u64>(sequence_address, payload);
     } else {
         memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload));
     }
@@ -493,10 +485,10 @@ void Maxwell3D::ProcessQueryGet() {
             const GPUVAddr sequence_address{regs.query.QueryAddress()};
             const u32 payload = regs.query.query_sequence;
             std::function<void()> operation([this, sequence_address, payload] {
-                LongQueryResult query_result{payload, system.GPU().GetTicks()};
-                memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
+                memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks());
+                memory_manager.Write<u64>(sequence_address, payload);
             });
-            rasterizer->SignalFence(std::move(operation));
+            rasterizer->SyncOperation(std::move(operation));
         }
         break;
     case Regs::QueryOperation::Acquire:
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a12a95ce2..bcffd1862 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -274,16 +274,24 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
 void MaxwellDMA::ReleaseSemaphore() {
     const auto type = regs.launch_dma.semaphore_type;
     const GPUVAddr address = regs.semaphore.address;
+    const u32 payload = regs.semaphore.payload;
     switch (type) {
     case LaunchDMA::SemaphoreType::NONE:
         break;
-    case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE:
-        memory_manager.Write<u32>(address, regs.semaphore.payload);
+    case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: {
+        std::function<void()> operation(
+            [this, address, payload] { memory_manager.Write<u32>(address, payload); });
+        rasterizer->SignalFence(std::move(operation));
         break;
-    case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE:
-        memory_manager.Write<u64>(address, static_cast<u64>(regs.semaphore.payload));
-        memory_manager.Write<u64>(address + 8, system.GPU().GetTicks());
+    }
+    case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: {
+        std::function<void()> operation([this, address, payload] {
+            memory_manager.Write<u64>(address + sizeof(u64), system.GPU().GetTicks());
+            memory_manager.Write<u64>(address, payload);
+        });
+        rasterizer->SignalFence(std::move(operation));
         break;
+    }
     default:
         ASSERT_MSG(false, "Unknown semaphore type: {}", static_cast<u32>(type.Value()));
     }
diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp
index dd9494efa..c3ed11c13 100644
--- a/src/video_core/engines/puller.cpp
+++ b/src/video_core/engines/puller.cpp
@@ -59,6 +59,7 @@ void Puller::ProcessFenceActionMethod() {
     case Puller::FenceOperation::Acquire:
         // UNIMPLEMENTED_MSG("Channel Scheduling pending.");
         // WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
+        rasterizer->ReleaseFences();
         break;
     case Puller::FenceOperation::Increment:
         rasterizer->SignalSyncPoint(regs.fence_action.syncpoint_id);
@@ -73,19 +74,11 @@ void Puller::ProcessSemaphoreTriggerMethod() {
     const auto op =
         static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
     if (op == GpuSemaphoreOperation::WriteLong) {
-        struct Block {
-            u32 sequence;
-            u32 zeros = 0;
-            u64 timestamp;
-        };
-
         const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
         const u32 payload = regs.semaphore_sequence;
         std::function<void()> operation([this, sequence_address, payload] {
-            Block block{};
-            block.sequence = payload;
-            block.timestamp = gpu.GetTicks();
-            memory_manager.WriteBlock(sequence_address, &block, sizeof(block));
+            memory_manager.Write<u64>(sequence_address + sizeof(u64), gpu.GetTicks());
+            memory_manager.Write<u64>(sequence_address, payload);
         });
         rasterizer->SignalFence(std::move(operation));
     } else {
@@ -98,7 +91,6 @@ void Puller::ProcessSemaphoreTriggerMethod() {
                 regs.acquire_mode = false;
                 if (word != regs.acquire_value) {
                     rasterizer->ReleaseFences();
-                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                     continue;
                 }
             } else if (op == GpuSemaphoreOperation::AcquireGequal) {
@@ -106,13 +98,11 @@ void Puller::ProcessSemaphoreTriggerMethod() {
                 regs.acquire_mode = true;
                 if (word < regs.acquire_value) {
                     rasterizer->ReleaseFences();
-                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                     continue;
                 }
             } else if (op == GpuSemaphoreOperation::AcquireMask) {
                 if (word && regs.semaphore_sequence == 0) {
                     rasterizer->ReleaseFences();
-                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                     continue;
                 }
             } else {
@@ -128,7 +118,7 @@ void Puller::ProcessSemaphoreRelease() {
     std::function<void()> operation([this, sequence_address, payload] {
         memory_manager.Write<u32>(sequence_address, payload);
     });
-    rasterizer->SignalFence(std::move(operation));
+    rasterizer->SyncOperation(std::move(operation));
 }
 
 void Puller::ProcessSemaphoreAcquire() {
-- 
cgit v1.2.3