summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--.gitmodules3
-rw-r--r--CMakeLists.txt4
-rw-r--r--externals/CMakeLists.txt8
m---------externals/sirit0
m---------externals/xbyak0
-rw-r--r--src/common/CMakeLists.txt4
-rw-r--r--src/common/x64/xbyak_abi.h266
-rw-r--r--src/common/x64/xbyak_util.h47
-rw-r--r--src/core/hle/kernel/readable_event.cpp2
-rw-r--r--src/core/hle/service/hid/hid.cpp13
-rw-r--r--src/core/hle/service/hid/hid.h1
-rw-r--r--src/core/settings.h1
-rw-r--r--src/video_core/CMakeLists.txt10
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h40
-rw-r--r--src/video_core/engines/maxwell_3d.cpp34
-rw-r--r--src/video_core/engines/maxwell_3d.h19
-rw-r--r--src/video_core/macro/macro.cpp45
-rw-r--r--src/video_core/macro/macro.h128
-rw-r--r--src/video_core/macro/macro_interpreter.cpp (renamed from src/video_core/macro_interpreter.cpp)198
-rw-r--r--src/video_core/macro/macro_interpreter.h (renamed from src/video_core/macro_interpreter.h)51
-rw-r--r--src/video_core/macro/macro_jit_x64.cpp640
-rw-r--r--src/video_core/macro/macro_jit_x64.h100
-rw-r--r--src/video_core/renderer_opengl/gl_device.cpp89
-rw-r--r--src/video_core/renderer_opengl/gl_device.h12
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp218
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h8
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.cpp12
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp123
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.h6
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp123
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.h32
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.cpp3
-rw-r--r--src/video_core/renderer_vulkan/fixed_pipeline_state.cpp3
-rw-r--r--src/video_core/renderer_vulkan/maxwell_to_vk.cpp2
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pipeline.cpp3
-rw-r--r--src/video_core/renderer_vulkan/vk_descriptor_pool.cpp1
-rw-r--r--src/video_core/renderer_vulkan/vk_device.cpp143
-rw-r--r--src/video_core/renderer_vulkan/vk_pipeline_cache.cpp20
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.cpp70
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.h16
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.cpp174
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.h12
-rw-r--r--src/video_core/renderer_vulkan/vk_texture_cache.cpp109
-rw-r--r--src/video_core/renderer_vulkan/vk_texture_cache.h33
-rw-r--r--src/video_core/shader/decode/other.cpp16
-rw-r--r--src/video_core/shader/node.h5
-rw-r--r--src/video_core/texture_cache/format_lookup_table.cpp3
-rw-r--r--src/video_core/texture_cache/surface_base.cpp7
-rw-r--r--src/video_core/texture_cache/surface_base.h13
-rw-r--r--src/video_core/texture_cache/surface_params.cpp19
-rw-r--r--src/video_core/texture_cache/texture_cache.h247
-rw-r--r--src/yuzu/bootmanager.cpp3
-rw-r--r--src/yuzu/configuration/config.cpp3
-rw-r--r--src/yuzu/configuration/configure_debug.cpp3
-rw-r--r--src/yuzu/configuration/configure_debug.ui13
-rw-r--r--src/yuzu/configuration/configure_input_player.cpp3
-rw-r--r--src/yuzu_cmd/config.cpp2
-rw-r--r--src/yuzu_cmd/default_ini.h2
-rw-r--r--src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp3
59 files changed, 2401 insertions, 767 deletions
diff --git a/.gitmodules b/.gitmodules
index bf3b80d59..2ec9dda62 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -28,3 +28,6 @@
[submodule "libzip"]
path = externals/libzip/libzip
url = https://github.com/nih-at/libzip.git
+[submodule "xbyak"]
+ path = externals/xbyak
+ url = https://github.com/herumi/xbyak.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61321bf0a..a9f669d56 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.11)
+cmake_minimum_required(VERSION 3.15)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
@@ -13,7 +13,7 @@ project(yuzu)
option(ENABLE_SDL2 "Enable the SDL2 frontend" ON)
option(ENABLE_QT "Enable the Qt frontend" ON)
-CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF)
+CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF)
option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 0b40cd1b0..df7a5e0a9 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -75,3 +75,11 @@ if (ENABLE_WEB_SERVICE)
target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
endif()
+
+if (NOT TARGET xbyak)
+ if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+ add_library(xbyak INTERFACE)
+ target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+ target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+ endif()
+endif()
diff --git a/externals/sirit b/externals/sirit
-Subproject a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167c
+Subproject eefca56afd49379bdebc97ded8b480839f93088
diff --git a/externals/xbyak b/externals/xbyak
new file mode 160000
+Subproject 82b70e665918efc2ee348091742fd0237b3b68c
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 264dff546..24b7a083c 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -171,10 +171,12 @@ if(ARCHITECTURE_x86_64)
PRIVATE
x64/cpu_detect.cpp
x64/cpu_detect.h
+ x64/xbyak_abi.h
+ x64/xbyak_util.h
)
endif()
create_target_directory_groups(common)
target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
-target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd)
+target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h
new file mode 100644
index 000000000..794da8a52
--- /dev/null
+++ b/src/common/x64/xbyak_abi.h
@@ -0,0 +1,266 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <bitset>
+#include <initializer_list>
+#include <xbyak.h>
+#include "common/assert.h"
+
+namespace Common::X64 {
+
+inline int RegToIndex(const Xbyak::Reg& reg) {
+ using Kind = Xbyak::Reg::Kind;
+ ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
+ "RegSet only support GPRs and XMM registers.");
+ ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15.");
+ return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
+}
+
+inline Xbyak::Reg64 IndexToReg64(int reg_index) {
+ ASSERT(reg_index < 16);
+ return Xbyak::Reg64(reg_index);
+}
+
+inline Xbyak::Xmm IndexToXmm(int reg_index) {
+ ASSERT(reg_index >= 16 && reg_index < 32);
+ return Xbyak::Xmm(reg_index - 16);
+}
+
+inline Xbyak::Reg IndexToReg(int reg_index) {
+ if (reg_index < 16) {
+ return IndexToReg64(reg_index);
+ } else {
+ return IndexToXmm(reg_index);
+ }
+}
+
+inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
+ std::bitset<32> bits;
+ for (const Xbyak::Reg& reg : regs) {
+ bits[RegToIndex(reg)] = true;
+ }
+ return bits;
+}
+
+const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
+const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
+
+#ifdef _WIN32
+
+// Microsoft x64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+ // GPRs
+ Xbyak::util::rcx,
+ Xbyak::util::rdx,
+ Xbyak::util::r8,
+ Xbyak::util::r9,
+ Xbyak::util::r10,
+ Xbyak::util::r11,
+ // XMMs
+ Xbyak::util::xmm0,
+ Xbyak::util::xmm1,
+ Xbyak::util::xmm2,
+ Xbyak::util::xmm3,
+ Xbyak::util::xmm4,
+ Xbyak::util::xmm5,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+ // GPRs
+ Xbyak::util::rbx,
+ Xbyak::util::rsi,
+ Xbyak::util::rdi,
+ Xbyak::util::rbp,
+ Xbyak::util::r12,
+ Xbyak::util::r13,
+ Xbyak::util::r14,
+ Xbyak::util::r15,
+ // XMMs
+ Xbyak::util::xmm6,
+ Xbyak::util::xmm7,
+ Xbyak::util::xmm8,
+ Xbyak::util::xmm9,
+ Xbyak::util::xmm10,
+ Xbyak::util::xmm11,
+ Xbyak::util::xmm12,
+ Xbyak::util::xmm13,
+ Xbyak::util::xmm14,
+ Xbyak::util::xmm15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0x20;
+
+#else
+
+// System V x86-64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+ // GPRs
+ Xbyak::util::rcx,
+ Xbyak::util::rdx,
+ Xbyak::util::rdi,
+ Xbyak::util::rsi,
+ Xbyak::util::r8,
+ Xbyak::util::r9,
+ Xbyak::util::r10,
+ Xbyak::util::r11,
+ // XMMs
+ Xbyak::util::xmm0,
+ Xbyak::util::xmm1,
+ Xbyak::util::xmm2,
+ Xbyak::util::xmm3,
+ Xbyak::util::xmm4,
+ Xbyak::util::xmm5,
+ Xbyak::util::xmm6,
+ Xbyak::util::xmm7,
+ Xbyak::util::xmm8,
+ Xbyak::util::xmm9,
+ Xbyak::util::xmm10,
+ Xbyak::util::xmm11,
+ Xbyak::util::xmm12,
+ Xbyak::util::xmm13,
+ Xbyak::util::xmm14,
+ Xbyak::util::xmm15,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+ // GPRs
+ Xbyak::util::rbx,
+ Xbyak::util::rbp,
+ Xbyak::util::r12,
+ Xbyak::util::r13,
+ Xbyak::util::r14,
+ Xbyak::util::r15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0;
+
+#endif
+
+inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+ size_t needed_frame_size, s32* out_subtraction,
+ s32* out_xmm_offset) {
+ const auto count = (regs & ABI_ALL_GPRS).count();
+ rsp_alignment -= count * 8;
+ size_t subtraction = 0;
+ const auto xmm_count = (regs & ABI_ALL_XMMS).count();
+ if (xmm_count) {
+ // If we have any XMMs to save, we must align the stack here.
+ subtraction = rsp_alignment & 0xF;
+ }
+ subtraction += 0x10 * xmm_count;
+ size_t xmm_base_subtraction = subtraction;
+ subtraction += needed_frame_size;
+ subtraction += ABI_SHADOW_SPACE;
+ // Final alignment.
+ rsp_alignment -= subtraction;
+ subtraction += rsp_alignment & 0xF;
+
+ *out_subtraction = (s32)subtraction;
+ *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
+}
+
+inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+ size_t rsp_alignment, size_t needed_frame_size = 0) {
+ s32 subtraction, xmm_offset;
+ ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+ for (std::size_t i = 0; i < regs.size(); ++i) {
+ if (regs[i] && ABI_ALL_GPRS[i]) {
+ code.push(IndexToReg64(static_cast<int>(i)));
+ }
+ }
+ if (subtraction != 0) {
+ code.sub(code.rsp, subtraction);
+ }
+
+ for (int i = 0; i < regs.count(); i++) {
+ if (regs.test(i) & ABI_ALL_GPRS.test(i)) {
+ code.push(IndexToReg64(i));
+ }
+ }
+
+ for (std::size_t i = 0; i < regs.size(); ++i) {
+ if (regs[i] && ABI_ALL_XMMS[i]) {
+ code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
+ xmm_offset += 0x10;
+ }
+ }
+
+ return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+ size_t rsp_alignment, size_t needed_frame_size = 0) {
+ s32 subtraction, xmm_offset;
+ ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+ for (std::size_t i = 0; i < regs.size(); ++i) {
+ if (regs[i] && ABI_ALL_XMMS[i]) {
+ code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
+ xmm_offset += 0x10;
+ }
+ }
+
+ if (subtraction != 0) {
+ code.add(code.rsp, subtraction);
+ }
+
+ // GPRs need to be popped in reverse order
+ for (int i = 15; i >= 0; i--) {
+ if (regs[i]) {
+ code.pop(IndexToReg64(i));
+ }
+ }
+}
+
+inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+ size_t rsp_alignment,
+ size_t needed_frame_size = 0) {
+ s32 subtraction, xmm_offset;
+ ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+ for (std::size_t i = 0; i < regs.size(); ++i) {
+ if (regs[i] && ABI_ALL_GPRS[i]) {
+ code.push(IndexToReg64(static_cast<int>(i)));
+ }
+ }
+
+ if (subtraction != 0) {
+ code.sub(code.rsp, subtraction);
+ }
+
+ return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+ size_t rsp_alignment, size_t needed_frame_size = 0) {
+ s32 subtraction, xmm_offset;
+ ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+ if (subtraction != 0) {
+ code.add(code.rsp, subtraction);
+ }
+
+ // GPRs need to be popped in reverse order
+ for (int i = 15; i >= 0; i--) {
+ if (regs[i]) {
+ code.pop(IndexToReg64(i));
+ }
+ }
+}
+
+} // namespace Common::X64
diff --git a/src/common/x64/xbyak_util.h b/src/common/x64/xbyak_util.h
new file mode 100644
index 000000000..df17f8cbe
--- /dev/null
+++ b/src/common/x64/xbyak_util.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include <xbyak.h>
+#include "common/x64/xbyak_abi.h"
+
+namespace Common::X64 {
+
+// Constants for use with cmpps/cmpss
+enum {
+ CMP_EQ = 0,
+ CMP_LT = 1,
+ CMP_LE = 2,
+ CMP_UNORD = 3,
+ CMP_NEQ = 4,
+ CMP_NLT = 5,
+ CMP_NLE = 6,
+ CMP_ORD = 7,
+};
+
+constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
+ const u64 distance = target - (ref + 5);
+ return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
+}
+
+inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
+ return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
+}
+
+template <typename T>
+inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
+ static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
+ size_t addr = reinterpret_cast<size_t>(f);
+ if (IsWithin2G(code, addr)) {
+ code.call(f);
+ } else {
+ // ABI_RETURN is a safe temp register to use before a call
+ code.mov(ABI_RETURN, addr);
+ code.call(ABI_RETURN);
+ }
+}
+
+} // namespace Common::X64
diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp
index 00860fcbd..ef5e19e63 100644
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -38,7 +38,7 @@ void ReadableEvent::Clear() {
ResultCode ReadableEvent::Reset() {
if (!is_signaled) {
- LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
+ LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
GetObjectId(), GetTypeName(), GetName());
return ERR_INVALID_STATE;
}
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index c84cb1483..72a050de2 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -161,7 +161,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
{40, nullptr, "AcquireXpadIdEventHandle"},
{41, nullptr, "ReleaseXpadIdEventHandle"},
{51, &Hid::ActivateXpad, "ActivateXpad"},
- {55, nullptr, "GetXpadIds"},
+ {55, &Hid::GetXpadIDs, "GetXpadIds"},
{56, nullptr, "ActivateJoyXpad"},
{58, nullptr, "GetJoyXpadLifoHandle"},
{59, nullptr, "GetJoyXpadIds"},
@@ -319,6 +319,17 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) {
rb.Push(RESULT_SUCCESS);
}
+void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) {
+ IPC::RequestParser rp{ctx};
+ const auto applet_resource_user_id{rp.Pop<u64>()};
+
+ LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id);
+
+ IPC::ResponseBuilder rb{ctx, 3};
+ rb.Push(RESULT_SUCCESS);
+ rb.Push(0);
+}
+
void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) {
IPC::RequestParser rp{ctx};
const auto applet_resource_user_id{rp.Pop<u64>()};
diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h
index c8ed4ad8b..d481a75f8 100644
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -86,6 +86,7 @@ public:
private:
void CreateAppletResource(Kernel::HLERequestContext& ctx);
void ActivateXpad(Kernel::HLERequestContext& ctx);
+ void GetXpadIDs(Kernel::HLERequestContext& ctx);
void ActivateDebugPad(Kernel::HLERequestContext& ctx);
void ActivateTouchScreen(Kernel::HLERequestContext& ctx);
void ActivateMouse(Kernel::HLERequestContext& ctx);
diff --git a/src/core/settings.h b/src/core/settings.h
index 78eb33737..36cd66fd4 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -474,6 +474,7 @@ struct Values {
bool reporting_services;
bool quest_flag;
bool disable_cpu_opt;
+ bool disable_macro_jit;
// BCAT
std::string bcat_backend;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index f00c71dae..2bf8d68ce 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -25,6 +25,12 @@ add_library(video_core STATIC
engines/shader_bytecode.h
engines/shader_header.h
engines/shader_type.h
+ macro/macro.cpp
+ macro/macro.h
+ macro/macro_interpreter.cpp
+ macro/macro_interpreter.h
+ macro/macro_jit_x64.cpp
+ macro/macro_jit_x64.h
fence_manager.h
gpu.cpp
gpu.h
@@ -36,8 +42,6 @@ add_library(video_core STATIC
gpu_thread.h
guest_driver.cpp
guest_driver.h
- macro_interpreter.cpp
- macro_interpreter.h
memory_manager.cpp
memory_manager.h
morton.cpp
@@ -229,7 +233,7 @@ endif()
create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PRIVATE glad)
+target_link_libraries(video_core PRIVATE glad xbyak)
if (ENABLE_VULKAN)
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index d9a4a1b4d..b88fce2cd 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -56,24 +56,28 @@ public:
if (use_fast_cbuf || size < max_stream_size) {
if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
auto& memory_manager = system.GPU().MemoryManager();
+ const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
if (use_fast_cbuf) {
- if (memory_manager.IsGranularRange(gpu_addr, size)) {
- const auto host_ptr = memory_manager.GetPointer(gpu_addr);
- return ConstBufferUpload(host_ptr, size);
+ u8* dest;
+ if (is_granular) {
+ dest = memory_manager.GetPointer(gpu_addr);
} else {
staging_buffer.resize(size);
- memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
- return ConstBufferUpload(staging_buffer.data(), size);
+ dest = staging_buffer.data();
+ memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
}
+ return ConstBufferUpload(dest, size);
+ }
+ if (is_granular) {
+ u8* const host_ptr = memory_manager.GetPointer(gpu_addr);
+ return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
+ std::memcpy(dest, host_ptr, size);
+ });
} else {
- if (memory_manager.IsGranularRange(gpu_addr, size)) {
- const auto host_ptr = memory_manager.GetPointer(gpu_addr);
- return StreamBufferUpload(host_ptr, size, alignment);
- } else {
- staging_buffer.resize(size);
- memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
- return StreamBufferUpload(staging_buffer.data(), size, alignment);
- }
+ return StreamBufferUpload(
+ size, alignment, [&memory_manager, gpu_addr, size](u8* dest) {
+ memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
+ });
}
}
}
@@ -101,7 +105,9 @@ public:
BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
std::size_t alignment = 4) {
std::lock_guard lock{mutex};
- return StreamBufferUpload(raw_pointer, size, alignment);
+ return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
+ std::memcpy(dest, raw_pointer, size);
+ });
}
void Map(std::size_t max_size) {
@@ -424,11 +430,11 @@ private:
map->MarkAsModified(false, 0);
}
- BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
- std::size_t alignment) {
+ template <typename Callable>
+ BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
AlignBuffer(alignment);
const std::size_t uploaded_offset = buffer_offset;
- std::memcpy(buffer_ptr, raw_pointer, size);
+ callable(buffer_ptr);
buffer_ptr += size;
buffer_offset += size;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 024c9e43b..e46b153f9 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
MemoryManager& memory_manager)
: system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
- macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+ macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
dirty.flags.flip();
-
InitializeRegisterDefaults();
}
@@ -106,7 +105,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
regs.rasterize_enable = 1;
regs.rt_separate_frag_data = 1;
regs.framebuffer_srgb = 1;
+ regs.line_width_aliased = 1.0f;
+ regs.line_width_smooth = 1.0f;
regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
+ regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
+ regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
shadow_state = regs;
@@ -116,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
}
-void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
// Reset the current macro.
executing_macro = 0;
@@ -125,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
// Execute the current macro.
- macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
+ macro_engine->Execute(macro_positions[entry], parameters);
if (mme_draw.current_mode != MMEDrawMode::Undefined) {
FlushMMEInlineDraw();
}
@@ -161,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
// Call the macro when there are no more parameters in the command buffer
if (is_last_call) {
- CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+ CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
return;
@@ -197,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
break;
}
case MAXWELL3D_REG_INDEX(macros.data): {
- ProcessMacroUpload(arg);
+ macro_engine->AddCode(regs.macros.upload_address, arg);
break;
}
case MAXWELL3D_REG_INDEX(macros.bind): {
@@ -306,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
// Call the macro when there are no more parameters in the command buffer
if (amount == methods_pending) {
- CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+ CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
return;
@@ -420,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
}
void Maxwell3D::ProcessMacroUpload(u32 data) {
- ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
- "upload_address exceeded macro_memory size!");
- macro_memory[regs.macros.upload_address++] = data;
+ macro_engine->AddCode(regs.macros.upload_address++, data);
}
void Maxwell3D::ProcessMacroBind(u32 data) {
@@ -457,8 +458,9 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
void Maxwell3D::ProcessQueryGet() {
// TODO(Subv): Support the other query units.
- ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
- "Units other than CROP are unimplemented");
+ if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
+ LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
+ }
switch (regs.query.query_get.operation) {
case Regs::QueryOperation::Release:
@@ -534,8 +536,8 @@ void Maxwell3D::ProcessCounterReset() {
rasterizer.ResetCounter(QueryType::SamplesPassed);
break;
default:
- LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
- static_cast<int>(regs.counter_reset));
+ LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
+ static_cast<int>(regs.counter_reset));
break;
}
}
@@ -592,8 +594,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
system.GPU().GetTicks());
return {};
default:
- UNIMPLEMENTED_MSG("Unimplemented query select type {}",
- static_cast<u32>(regs.query.query_get.select.Value()));
+ LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
+ static_cast<u32>(regs.query.query_get.select.Value()));
return 1;
}
}
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 05dd6b39b..79fc9bbea 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
#include "video_core/engines/engine_upload.h"
#include "video_core/engines/shader_type.h"
#include "video_core/gpu.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro.h"
#include "video_core/textures/texture.h"
namespace Core {
@@ -598,6 +598,7 @@ public:
BitField<4, 3, u32> block_height;
BitField<8, 3, u32> block_depth;
BitField<12, 1, InvMemoryLayout> type;
+ BitField<16, 1, u32> is_3d;
} memory_layout;
union {
BitField<0, 16, u32> layers;
@@ -1411,15 +1412,6 @@ public:
const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
- /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
- /// we've seen used.
- using MacroMemory = std::array<u32, 0x40000>;
-
- /// Gets a reference to macro memory.
- const MacroMemory& GetMacroMemory() const {
- return macro_memory;
- }
-
bool ShouldExecute() const {
return execute_on;
}
@@ -1468,16 +1460,13 @@ private:
std::array<bool, Regs::NUM_REGS> mme_inline{};
- /// Memory for macro code
- MacroMemory macro_memory;
-
/// Macro method that is currently being executed / being fed parameters.
u32 executing_macro = 0;
/// Parameters that have been submitted to the macro call so far.
std::vector<u32> macro_params;
/// Interpreter for the macro codes uploaded to the GPU.
- MacroInterpreter macro_interpreter;
+ std::unique_ptr<MacroEngine> macro_engine;
static constexpr u32 null_cb_data = 0xFFFFFFFF;
struct {
@@ -1506,7 +1495,7 @@ private:
* @param num_parameters Number of arguments
* @param parameters Arguments to the method call
*/
- void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
+ void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
/// Handles writes to the macro uploading register.
void ProcessMacroUpload(u32 data);
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
new file mode 100644
index 000000000..89077a2d8
--- /dev/null
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,45 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/settings.h"
+#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+namespace Tegra {
+
+void MacroEngine::AddCode(u32 method, u32 data) {
+ uploaded_macro_code[method].push_back(data);
+}
+
+void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+ auto compiled_macro = macro_cache.find(method);
+ if (compiled_macro != macro_cache.end()) {
+ compiled_macro->second->Execute(parameters, method);
+ } else {
+ // Macro not compiled, check if it's uploaded and if so, compile it
+ auto macro_code = uploaded_macro_code.find(method);
+ if (macro_code == uploaded_macro_code.end()) {
+ UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+ return;
+ }
+ macro_cache[method] = Compile(macro_code->second);
+ macro_cache[method]->Execute(parameters, method);
+ }
+}
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
+ if (Settings::values.disable_macro_jit) {
+ return std::make_unique<MacroInterpreter>(maxwell3d);
+ }
+#ifdef ARCHITECTURE_x86_64
+ return std::make_unique<MacroJITx64>(maxwell3d);
+#else
+ return std::make_unique<MacroInterpreter>(maxwell3d);
+#endif
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
new file mode 100644
index 000000000..b76ed891f
--- /dev/null
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,128 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+namespace Engines {
+class Maxwell3D;
+}
+namespace Macro {
+constexpr std::size_t NUM_MACRO_REGISTERS = 8;
+enum class Operation : u32 {
+ ALU = 0,
+ AddImmediate = 1,
+ ExtractInsert = 2,
+ ExtractShiftLeftImmediate = 3,
+ ExtractShiftLeftRegister = 4,
+ Read = 5,
+ Unused = 6, // This operation doesn't seem to be a valid encoding.
+ Branch = 7,
+};
+
+enum class ALUOperation : u32 {
+ Add = 0,
+ AddWithCarry = 1,
+ Subtract = 2,
+ SubtractWithBorrow = 3,
+ // Operations 4-7 don't seem to be valid encodings.
+ Xor = 8,
+ Or = 9,
+ And = 10,
+ AndNot = 11,
+ Nand = 12
+};
+
+enum class ResultOperation : u32 {
+ IgnoreAndFetch = 0,
+ Move = 1,
+ MoveAndSetMethod = 2,
+ FetchAndSend = 3,
+ MoveAndSend = 4,
+ FetchAndSetMethod = 5,
+ MoveAndSetMethodFetchAndSend = 6,
+ MoveAndSetMethodSend = 7
+};
+
+enum class BranchCondition : u32 {
+ Zero = 0,
+ NotZero = 1,
+};
+
+union Opcode {
+ u32 raw;
+ BitField<0, 3, Operation> operation;
+ BitField<4, 3, ResultOperation> result_operation;
+ BitField<4, 1, BranchCondition> branch_condition;
+ // If set on a branch, then the branch doesn't have a delay slot.
+ BitField<5, 1, u32> branch_annul;
+ BitField<7, 1, u32> is_exit;
+ BitField<8, 3, u32> dst;
+ BitField<11, 3, u32> src_a;
+ BitField<14, 3, u32> src_b;
+ // The signed immediate overlaps the second source operand and the alu operation.
+ BitField<14, 18, s32> immediate;
+
+ BitField<17, 5, ALUOperation> alu_operation;
+
+ // Bitfield instructions data
+ BitField<17, 5, u32> bf_src_bit;
+ BitField<22, 5, u32> bf_size;
+ BitField<27, 5, u32> bf_dst_bit;
+
+ u32 GetBitfieldMask() const {
+ return (1 << bf_size) - 1;
+ }
+
+ s32 GetBranchTarget() const {
+ return static_cast<s32>(immediate * sizeof(u32));
+ }
+};
+
+union MethodAddress {
+ u32 raw;
+ BitField<0, 12, u32> address;
+ BitField<12, 6, u32> increment;
+};
+
+} // namespace Macro
+
+class CachedMacro {
+public:
+ virtual ~CachedMacro() = default;
+ /**
+ * Executes the macro code with the specified input parameters.
+ * @param code The macro byte code to execute
+ * @param parameters The parameters of the macro
+ */
+ virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+};
+
+class MacroEngine {
+public:
+ virtual ~MacroEngine() = default;
+
+ // Store the uploaded macro code to compile them when they're called.
+ void AddCode(u32 method, u32 data);
+
+ // Compiles the macro if its not in the cache, and executes the compiled macro
+ void Execute(u32 method, const std::vector<u32>& parameters);
+
+protected:
+ virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+
+private:
+ std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+ std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+};
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+
+} // namespace Tegra
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 947364928..5edff27aa 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
@@ -6,109 +6,46 @@
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro_interpreter.h"
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
namespace Tegra {
-namespace {
-enum class Operation : u32 {
- ALU = 0,
- AddImmediate = 1,
- ExtractInsert = 2,
- ExtractShiftLeftImmediate = 3,
- ExtractShiftLeftRegister = 4,
- Read = 5,
- Unused = 6, // This operation doesn't seem to be a valid encoding.
- Branch = 7,
-};
-} // Anonymous namespace
-
-enum class MacroInterpreter::ALUOperation : u32 {
- Add = 0,
- AddWithCarry = 1,
- Subtract = 2,
- SubtractWithBorrow = 3,
- // Operations 4-7 don't seem to be valid encodings.
- Xor = 8,
- Or = 9,
- And = 10,
- AndNot = 11,
- Nand = 12
-};
-
-enum class MacroInterpreter::ResultOperation : u32 {
- IgnoreAndFetch = 0,
- Move = 1,
- MoveAndSetMethod = 2,
- FetchAndSend = 3,
- MoveAndSend = 4,
- FetchAndSetMethod = 5,
- MoveAndSetMethodFetchAndSend = 6,
- MoveAndSetMethodSend = 7
-};
-
-enum class MacroInterpreter::BranchCondition : u32 {
- Zero = 0,
- NotZero = 1,
-};
-
-union MacroInterpreter::Opcode {
- u32 raw;
- BitField<0, 3, Operation> operation;
- BitField<4, 3, ResultOperation> result_operation;
- BitField<4, 1, BranchCondition> branch_condition;
- // If set on a branch, then the branch doesn't have a delay slot.
- BitField<5, 1, u32> branch_annul;
- BitField<7, 1, u32> is_exit;
- BitField<8, 3, u32> dst;
- BitField<11, 3, u32> src_a;
- BitField<14, 3, u32> src_b;
- // The signed immediate overlaps the second source operand and the alu operation.
- BitField<14, 18, s32> immediate;
-
- BitField<17, 5, ALUOperation> alu_operation;
-
- // Bitfield instructions data
- BitField<17, 5, u32> bf_src_bit;
- BitField<22, 5, u32> bf_size;
- BitField<27, 5, u32> bf_dst_bit;
-
- u32 GetBitfieldMask() const {
- return (1 << bf_size) - 1;
- }
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
- s32 GetBranchTarget() const {
- return static_cast<s32>(immediate * sizeof(u32));
- }
-};
+std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
+ return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
+}
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
+ const std::vector<u32>& code)
+ : maxwell3d(maxwell3d), code(code) {}
-void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) {
+void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
MICROPROFILE_SCOPE(MacroInterp);
Reset();
registers[1] = parameters[0];
+ num_parameters = parameters.size();
if (num_parameters > parameters_capacity) {
parameters_capacity = num_parameters;
this->parameters = std::make_unique<u32[]>(num_parameters);
}
- std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32));
+ std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
this->num_parameters = num_parameters;
// Execute the code until we hit an exit condition.
bool keep_executing = true;
while (keep_executing) {
- keep_executing = Step(offset, false);
+ keep_executing = Step(false);
}
// Assert the the macro used all the input parameters
ASSERT(next_parameter_index == num_parameters);
}
-void MacroInterpreter::Reset() {
+void MacroInterpreterImpl::Reset() {
registers = {};
pc = 0;
delayed_pc = {};
@@ -120,10 +57,10 @@ void MacroInterpreter::Reset() {
carry_flag = false;
}
-bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
u32 base_address = pc;
- Opcode opcode = GetOpcode(offset);
+ Macro::Opcode opcode = GetOpcode();
pc += 4;
// Update the program counter if we were delayed
@@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
}
switch (opcode.operation) {
- case Operation::ALU: {
+ case Macro::Operation::ALU: {
u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
GetRegister(opcode.src_b));
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
- case Operation::AddImmediate: {
+ case Macro::Operation::AddImmediate: {
ProcessResult(opcode.result_operation, opcode.dst,
GetRegister(opcode.src_a) + opcode.immediate);
break;
}
- case Operation::ExtractInsert: {
+ case Macro::Operation::ExtractInsert: {
u32 dst = GetRegister(opcode.src_a);
u32 src = GetRegister(opcode.src_b);
@@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
ProcessResult(opcode.result_operation, opcode.dst, dst);
break;
}
- case Operation::ExtractShiftLeftImmediate: {
+ case Macro::Operation::ExtractShiftLeftImmediate: {
u32 dst = GetRegister(opcode.src_a);
u32 src = GetRegister(opcode.src_b);
@@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
- case Operation::ExtractShiftLeftRegister: {
+ case Macro::Operation::ExtractShiftLeftRegister: {
u32 dst = GetRegister(opcode.src_a);
u32 src = GetRegister(opcode.src_b);
@@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
- case Operation::Read: {
+ case Macro::Operation::Read: {
u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
- case Operation::Branch: {
+ case Macro::Operation::Branch: {
ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
u32 value = GetRegister(opcode.src_a);
bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
@@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
delayed_pc = base_address + opcode.GetBranchTarget();
// Execute one more instruction due to the delay slot.
- return Step(offset, true);
+ return Step(true);
}
break;
}
@@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
// cause an exit if it's executed inside a delay slot.
if (opcode.is_exit && !is_delay_slot) {
// Exit has a delay slot, execute the next instruction
- Step(offset, true);
+ Step(true);
return false;
}
return true;
}
-MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
- const auto& macro_memory{maxwell3d.GetMacroMemory()};
- ASSERT((pc % sizeof(u32)) == 0);
- ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
- return {macro_memory[offset + pc / sizeof(u32)]};
-}
-
-u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
switch (operation) {
- case ALUOperation::Add: {
+ case Macro::ALUOperation::Add: {
const u64 result{static_cast<u64>(src_a) + src_b};
carry_flag = result > 0xffffffff;
return static_cast<u32>(result);
}
- case ALUOperation::AddWithCarry: {
+ case Macro::ALUOperation::AddWithCarry: {
const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
carry_flag = result > 0xffffffff;
return static_cast<u32>(result);
}
- case ALUOperation::Subtract: {
+ case Macro::ALUOperation::Subtract: {
const u64 result{static_cast<u64>(src_a) - src_b};
carry_flag = result < 0x100000000;
return static_cast<u32>(result);
}
- case ALUOperation::SubtractWithBorrow: {
+ case Macro::ALUOperation::SubtractWithBorrow: {
const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
carry_flag = result < 0x100000000;
return static_cast<u32>(result);
}
- case ALUOperation::Xor:
+ case Macro::ALUOperation::Xor:
return src_a ^ src_b;
- case ALUOperation::Or:
+ case Macro::ALUOperation::Or:
return src_a | src_b;
- case ALUOperation::And:
+ case Macro::ALUOperation::And:
return src_a & src_b;
- case ALUOperation::AndNot:
+ case Macro::ALUOperation::AndNot:
return src_a & ~src_b;
- case ALUOperation::Nand:
+ case Macro::ALUOperation::Nand:
return ~(src_a & src_b);
default:
@@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
}
}
-void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
switch (operation) {
- case ResultOperation::IgnoreAndFetch:
+ case Macro::ResultOperation::IgnoreAndFetch:
// Fetch parameter and ignore result.
SetRegister(reg, FetchParameter());
break;
- case ResultOperation::Move:
+ case Macro::ResultOperation::Move:
// Move result.
SetRegister(reg, result);
break;
- case ResultOperation::MoveAndSetMethod:
+ case Macro::ResultOperation::MoveAndSetMethod:
// Move result and use as Method Address.
SetRegister(reg, result);
SetMethodAddress(result);
break;
- case ResultOperation::FetchAndSend:
+ case Macro::ResultOperation::FetchAndSend:
// Fetch parameter and send result.
SetRegister(reg, FetchParameter());
Send(result);
break;
- case ResultOperation::MoveAndSend:
+ case Macro::ResultOperation::MoveAndSend:
// Move and send result.
SetRegister(reg, result);
Send(result);
break;
- case ResultOperation::FetchAndSetMethod:
+ case Macro::ResultOperation::FetchAndSetMethod:
// Fetch parameter and use result as Method Address.
SetRegister(reg, FetchParameter());
SetMethodAddress(result);
break;
- case ResultOperation::MoveAndSetMethodFetchAndSend:
+ case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
// Move result and use as Method Address, then fetch and send parameter.
SetRegister(reg, result);
SetMethodAddress(result);
Send(FetchParameter());
break;
- case ResultOperation::MoveAndSetMethodSend:
+ case Macro::ResultOperation::MoveAndSetMethodSend:
// Move result and use as Method Address, then send bits 12:17 of result.
SetRegister(reg, result);
SetMethodAddress(result);
@@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
}
}
-u32 MacroInterpreter::FetchParameter() {
- ASSERT(next_parameter_index < num_parameters);
- return parameters[next_parameter_index++];
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+ switch (cond) {
+ case Macro::BranchCondition::Zero:
+ return value == 0;
+ case Macro::BranchCondition::NotZero:
+ return value != 0;
+ }
+ UNREACHABLE();
+ return true;
}
-u32 MacroInterpreter::GetRegister(u32 register_id) const {
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+ ASSERT((pc % sizeof(u32)) == 0);
+ ASSERT(pc < code.size() * sizeof(u32));
+ return {code[pc / sizeof(u32)]};
+}
+
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
return registers.at(register_id);
}
-void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
// Register 0 is hardwired as the zero register.
// Ensure no writes to it actually occur.
if (register_id == 0) {
@@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
registers.at(register_id) = value;
}
-void MacroInterpreter::SetMethodAddress(u32 address) {
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
method_address.raw = address;
}
-void MacroInterpreter::Send(u32 value) {
+void MacroInterpreterImpl::Send(u32 value) {
maxwell3d.CallMethodFromMME(method_address.address, value);
// Increment the method address by the method increment.
method_address.address.Assign(method_address.address.Value() +
method_address.increment.Value());
}
-u32 MacroInterpreter::Read(u32 method) const {
+u32 MacroInterpreterImpl::Read(u32 method) const {
return maxwell3d.GetRegisterValue(method);
}
-bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const {
- switch (cond) {
- case BranchCondition::Zero:
- return value == 0;
- case BranchCondition::NotZero:
- return value != 0;
- }
- UNREACHABLE();
- return true;
+u32 MacroInterpreterImpl::FetchParameter() {
+ ASSERT(next_parameter_index < num_parameters);
+ return parameters[next_parameter_index++];
}
} // namespace Tegra
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h
index 631146d89..90217fc89 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -1,44 +1,37 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
-
#include <array>
#include <optional>
-
+#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
+#include "video_core/macro/macro.h"
namespace Tegra {
namespace Engines {
class Maxwell3D;
}
-class MacroInterpreter final {
+class MacroInterpreter final : public MacroEngine {
public:
explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
- /**
- * Executes the macro code with the specified input parameters.
- * @param offset Offset to start execution at.
- * @param parameters The parameters of the macro.
- */
- void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
+protected:
+ std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
private:
- enum class ALUOperation : u32;
- enum class BranchCondition : u32;
- enum class ResultOperation : u32;
-
- union Opcode;
+ Engines::Maxwell3D& maxwell3d;
+};
- union MethodAddress {
- u32 raw;
- BitField<0, 12, u32> address;
- BitField<12, 6, u32> increment;
- };
+class MacroInterpreterImpl : public CachedMacro {
+public:
+ MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+ void Execute(const std::vector<u32>& parameters, u32 method) override;
+private:
/// Resets the execution engine state, zeroing registers, etc.
void Reset();
@@ -49,20 +42,20 @@ private:
* @param is_delay_slot Whether the current step is being executed due to a delay slot in a
* previous instruction.
*/
- bool Step(u32 offset, bool is_delay_slot);
+ bool Step(bool is_delay_slot);
/// Calculates the result of an ALU operation. src_a OP src_b;
- u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
+ u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
/// Performs the result operation on the input result and stores it in the specified register
/// (if necessary).
- void ProcessResult(ResultOperation operation, u32 reg, u32 result);
+ void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
/// Evaluates the branch condition and returns whether the branch should be taken or not.
- bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
+ bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
/// Reads an opcode at the current program counter location.
- Opcode GetOpcode(u32 offset) const;
+ Macro::Opcode GetOpcode() const;
/// Returns the specified register's value. Register 0 is hardcoded to always return 0.
u32 GetRegister(u32 register_id) const;
@@ -89,13 +82,11 @@ private:
/// Program counter to execute at after the delay slot is executed.
std::optional<u32> delayed_pc;
- static constexpr std::size_t NumMacroRegisters = 8;
-
/// General purpose macro registers.
- std::array<u32, NumMacroRegisters> registers = {};
+ std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
/// Method address to use for the next Send instruction.
- MethodAddress method_address = {};
+ Macro::MethodAddress method_address = {};
/// Input parameters of the current macro.
std::unique_ptr<u32[]> parameters;
@@ -105,5 +96,7 @@ private:
u32 next_parameter_index = 0;
bool carry_flag = false;
+ const std::vector<u32>& code;
};
+
} // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
new file mode 100644
index 000000000..11c1cc3be
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,640 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/x64/xbyak_util.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
+MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
+
+namespace Tegra {
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
+static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
+static const Xbyak::Reg64 STATE = Xbyak::util::r11;
+static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
+static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
+static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
+static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
+static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
+static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
+
+static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
+ PARAMETERS,
+ REGISTERS,
+ STATE,
+ NEXT_PARAMETER,
+ RESULT,
+ METHOD_ADDRESS,
+ BRANCH_HOLDER,
+});
+
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+
+std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
+ return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
+}
+
+MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
+ : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
+ Compile();
+}
+
+MacroJITx64Impl::~MacroJITx64Impl() = default;
+
+void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
+ MICROPROFILE_SCOPE(MacroJitExecute);
+ ASSERT_OR_EXECUTE(program != nullptr, { return; });
+ JITState state{};
+ state.maxwell3d = &maxwell3d;
+ state.registers = {};
+ state.parameters = parameters.data();
+ program(&state);
+}
+
+void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
+ const bool is_a_zero = opcode.src_a == 0;
+ const bool is_b_zero = opcode.src_b == 0;
+ const bool valid_operation = !is_a_zero && !is_b_zero;
+ const bool is_move_operation = !is_a_zero && is_b_zero;
+ const bool has_zero_register = is_a_zero || is_b_zero;
+
+ Xbyak::Reg64 src_a;
+ Xbyak::Reg32 src_b;
+
+ if (!optimizer.zero_reg_skip) {
+ src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+ src_b = Compile_GetRegister(opcode.src_b, ebx);
+ } else {
+ if (!is_a_zero) {
+ src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+ }
+ if (!is_b_zero) {
+ src_b = Compile_GetRegister(opcode.src_b, ebx);
+ }
+ }
+ Xbyak::Label skip_carry{};
+
+ bool has_emitted = false;
+
+ switch (opcode.alu_operation) {
+ case Macro::ALUOperation::Add:
+ if (optimizer.zero_reg_skip) {
+ if (valid_operation) {
+ add(src_a, src_b);
+ }
+ } else {
+ add(src_a, src_b);
+ }
+
+ if (!optimizer.can_skip_carry) {
+ setc(byte[STATE + offsetof(JITState, carry_flag)]);
+ }
+ break;
+ case Macro::ALUOperation::AddWithCarry:
+ bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+ adc(src_a, src_b);
+ setc(byte[STATE + offsetof(JITState, carry_flag)]);
+ break;
+ case Macro::ALUOperation::Subtract:
+ if (optimizer.zero_reg_skip) {
+ if (valid_operation) {
+ sub(src_a, src_b);
+ has_emitted = true;
+ }
+ } else {
+ sub(src_a, src_b);
+ has_emitted = true;
+ }
+ if (!optimizer.can_skip_carry && has_emitted) {
+ setc(byte[STATE + offsetof(JITState, carry_flag)]);
+ }
+ break;
+ case Macro::ALUOperation::SubtractWithBorrow:
+ bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+ sbb(src_a, src_b);
+ setc(byte[STATE + offsetof(JITState, carry_flag)]);
+ break;
+ case Macro::ALUOperation::Xor:
+ if (optimizer.zero_reg_skip) {
+ if (valid_operation) {
+ xor_(src_a, src_b);
+ }
+ } else {
+ xor_(src_a, src_b);
+ }
+ break;
+ case Macro::ALUOperation::Or:
+ if (optimizer.zero_reg_skip) {
+ if (valid_operation) {
+ or_(src_a, src_b);
+ }
+ } else {
+ or_(src_a, src_b);
+ }
+ break;
+ case Macro::ALUOperation::And:
+ if (optimizer.zero_reg_skip) {
+ if (!has_zero_register) {
+ and_(src_a, src_b);
+ }
+ } else {
+ and_(src_a, src_b);
+ }
+ break;
+ case Macro::ALUOperation::AndNot:
+ if (optimizer.zero_reg_skip) {
+ if (!is_a_zero) {
+ not_(src_b);
+ and_(src_a, src_b);
+ }
+ } else {
+ not_(src_b);
+ and_(src_a, src_b);
+ }
+ break;
+ case Macro::ALUOperation::Nand:
+ if (optimizer.zero_reg_skip) {
+ if (!is_a_zero) {
+ and_(src_a, src_b);
+ not_(src_a);
+ }
+ } else {
+ and_(src_a, src_b);
+ not_(src_a);
+ }
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
+ static_cast<std::size_t>(opcode.alu_operation.Value()));
+ break;
+ }
+ Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
+ if (optimizer.skip_dummy_addimmediate) {
+ // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
+ // without doing anything. In our case we can just not emit anything.
+ if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
+ return;
+ }
+ }
+ // Check for redundant moves
+ if (optimizer.optimize_for_method_move &&
+ opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+ if (next_opcode.has_value()) {
+ const auto next = *next_opcode;
+ if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+ return;
+ }
+ }
+ }
+ if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+ if (opcode.immediate == 0) {
+ xor_(RESULT, RESULT);
+ } else {
+ mov(RESULT, opcode.immediate);
+ }
+ } else {
+ auto result = Compile_GetRegister(opcode.src_a, RESULT);
+ if (opcode.immediate > 2) {
+ add(result, opcode.immediate);
+ } else if (opcode.immediate == 1) {
+ inc(result);
+ } else if (opcode.immediate < 0) {
+ sub(result, opcode.immediate * -1);
+ }
+ }
+ Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
+ auto dst = Compile_GetRegister(opcode.src_a, RESULT);
+ auto src = Compile_GetRegister(opcode.src_b, eax);
+
+ if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) {
+ shr(src, opcode.bf_src_bit);
+ } else if (opcode.bf_src_bit == 31) {
+ xor_(src, src);
+ }
+ // Don't bother masking the whole register since we're using a 32 bit register
+ if (opcode.bf_size != 31 && opcode.bf_size != 0) {
+ and_(src, opcode.GetBitfieldMask());
+ } else if (opcode.bf_size == 0) {
+ xor_(src, src);
+ }
+ if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) {
+ shl(src, opcode.bf_dst_bit);
+ } else if (opcode.bf_dst_bit == 31) {
+ xor_(src, src);
+ }
+
+ const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
+ if (mask != 0xffffffff) {
+ and_(dst, mask);
+ }
+ or_(dst, src);
+ Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
+ auto dst = Compile_GetRegister(opcode.src_a, eax);
+ auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+ shr(src, al);
+ if (opcode.bf_size != 0 && opcode.bf_size != 31) {
+ and_(src, opcode.GetBitfieldMask());
+ } else if (opcode.bf_size == 0) {
+ xor_(src, src);
+ }
+
+ if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
+ shl(src, opcode.bf_dst_bit);
+ } else if (opcode.bf_dst_bit == 31) {
+ xor_(src, src);
+ }
+ Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
+ auto dst = Compile_GetRegister(opcode.src_a, eax);
+ auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+ if (opcode.bf_src_bit != 0) {
+ shr(src, opcode.bf_src_bit);
+ }
+
+ if (opcode.bf_size != 31) {
+ and_(src, opcode.GetBitfieldMask());
+ }
+ shl(src, al);
+ Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) {
+ return maxwell3d->GetRegisterValue(method);
+}
+
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+ maxwell3d->CallMethodFromMME(method_address.address, value);
+}
+
+void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
+ if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+ if (opcode.immediate == 0) {
+ xor_(RESULT, RESULT);
+ } else {
+ mov(RESULT, opcode.immediate);
+ }
+ } else {
+ auto result = Compile_GetRegister(opcode.src_a, RESULT);
+ if (opcode.immediate > 2) {
+ add(result, opcode.immediate);
+ } else if (opcode.immediate == 1) {
+ inc(result);
+ } else if (opcode.immediate < 0) {
+ sub(result, opcode.immediate * -1);
+ }
+ }
+ Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+ mov(Common::X64::ABI_PARAM1, qword[STATE]);
+ mov(Common::X64::ABI_PARAM2, RESULT);
+ Common::X64::CallFarFunction(*this, &Read);
+ Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+ mov(RESULT, Common::X64::ABI_RETURN.cvt32());
+ Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
+ Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+ mov(Common::X64::ABI_PARAM1, qword[STATE]);
+ mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
+ mov(Common::X64::ABI_PARAM3, value);
+ Common::X64::CallFarFunction(*this, &Send);
+ Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+
+ Xbyak::Label dont_process{};
+ // Get increment
+ test(METHOD_ADDRESS, 0x3f000);
+ // If zero, method address doesn't update
+ je(dont_process);
+
+ mov(ecx, METHOD_ADDRESS);
+ and_(METHOD_ADDRESS, 0xfff);
+ shr(ecx, 12);
+ and_(ecx, 0x3f);
+ lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
+ sal(ecx, 12);
+ or_(eax, ecx);
+
+ mov(METHOD_ADDRESS, eax);
+
+ L(dont_process);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
+ ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+ const s32 jump_address =
+ static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
+
+ Xbyak::Label end;
+ auto value = Compile_GetRegister(opcode.src_a, eax);
+ test(value, value);
+ if (optimizer.has_delayed_pc) {
+ switch (opcode.branch_condition) {
+ case Macro::BranchCondition::Zero:
+ jne(end, T_NEAR);
+ break;
+ case Macro::BranchCondition::NotZero:
+ je(end, T_NEAR);
+ break;
+ }
+
+ if (opcode.branch_annul) {
+ xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+ jmp(labels[jump_address], T_NEAR);
+ } else {
+ Xbyak::Label handle_post_exit{};
+ Xbyak::Label skip{};
+ jmp(skip, T_NEAR);
+ if (opcode.is_exit) {
+ L(handle_post_exit);
+ // Execute 1 instruction
+ mov(BRANCH_HOLDER, end_of_code);
+ // Jump to next instruction to skip delay slot check
+ jmp(labels[jump_address], T_NEAR);
+ } else {
+ L(handle_post_exit);
+ xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+ jmp(labels[jump_address], T_NEAR);
+ }
+ L(skip);
+ mov(BRANCH_HOLDER, handle_post_exit);
+ jmp(delay_skip[pc], T_NEAR);
+ }
+ } else {
+ switch (opcode.branch_condition) {
+ case Macro::BranchCondition::Zero:
+ je(labels[jump_address], T_NEAR);
+ break;
+ case Macro::BranchCondition::NotZero:
+ jne(labels[jump_address], T_NEAR);
+ break;
+ }
+ }
+
+ L(end);
+}
+
+void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
+ optimizer.can_skip_carry = true;
+ optimizer.has_delayed_pc = false;
+ for (auto raw_op : code) {
+ Macro::Opcode op{};
+ op.raw = raw_op;
+
+ if (op.operation == Macro::Operation::ALU) {
+ // Scan for any ALU operations which actually use the carry flag, if they don't exist in
+ // our current code we can skip emitting the carry flag handling operations
+ if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
+ op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
+ optimizer.can_skip_carry = false;
+ }
+ }
+
+ if (op.operation == Macro::Operation::Branch) {
+ if (!op.branch_annul) {
+ optimizer.has_delayed_pc = true;
+ }
+ }
+ }
+}
+
+void MacroJITx64Impl::Compile() {
+ MICROPROFILE_SCOPE(MacroJitCompile);
+ bool keep_executing = true;
+ labels.fill(Xbyak::Label());
+
+ Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+ // JIT state
+ mov(STATE, Common::X64::ABI_PARAM1);
+ mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
+ static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
+ mov(REGISTERS, Common::X64::ABI_PARAM1);
+ add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
+ xor_(RESULT, RESULT);
+ xor_(METHOD_ADDRESS, METHOD_ADDRESS);
+ xor_(NEXT_PARAMETER, NEXT_PARAMETER);
+ xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+
+ mov(dword[REGISTERS + 4], Compile_FetchParameter());
+
+ // Track get register for zero registers and mark it as no-op
+ optimizer.zero_reg_skip = true;
+
+ // AddImmediate tends to be used as a NOP instruction, if we detect this we can
+ // completely skip the entire code path and no emit anything
+ optimizer.skip_dummy_addimmediate = true;
+
+ // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
+ // one if our register isn't "dirty"
+ optimizer.optimize_for_method_move = true;
+
+ // Check to see if we can skip emitting certain instructions
+ Optimizer_ScanFlags();
+
+ const u32 op_count = static_cast<u32>(code.size());
+ for (u32 i = 0; i < op_count; i++) {
+ if (i < op_count - 1) {
+ pc = i + 1;
+ next_opcode = GetOpCode();
+ } else {
+ next_opcode = {};
+ }
+ pc = i;
+ Compile_NextInstruction();
+ }
+
+ L(end_of_code);
+
+ Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+ ret();
+ ready();
+ program = getCode<ProgramType>();
+}
+
+bool MacroJITx64Impl::Compile_NextInstruction() {
+ const auto opcode = GetOpCode();
+ if (labels[pc].getAddress()) {
+ return false;
+ }
+
+ L(labels[pc]);
+
+ switch (opcode.operation) {
+ case Macro::Operation::ALU:
+ Compile_ALU(opcode);
+ break;
+ case Macro::Operation::AddImmediate:
+ Compile_AddImmediate(opcode);
+ break;
+ case Macro::Operation::ExtractInsert:
+ Compile_ExtractInsert(opcode);
+ break;
+ case Macro::Operation::ExtractShiftLeftImmediate:
+ Compile_ExtractShiftLeftImmediate(opcode);
+ break;
+ case Macro::Operation::ExtractShiftLeftRegister:
+ Compile_ExtractShiftLeftRegister(opcode);
+ break;
+ case Macro::Operation::Read:
+ Compile_Read(opcode);
+ break;
+ case Macro::Operation::Branch:
+ Compile_Branch(opcode);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
+ break;
+ }
+
+ if (optimizer.has_delayed_pc) {
+ if (opcode.is_exit) {
+ mov(rax, end_of_code);
+ test(BRANCH_HOLDER, BRANCH_HOLDER);
+ cmove(BRANCH_HOLDER, rax);
+ // Jump to next instruction to skip delay slot check
+ je(labels[pc + 1], T_NEAR);
+ } else {
+ // TODO(ogniK): Optimize delay slot branching
+ Xbyak::Label no_delay_slot{};
+ test(BRANCH_HOLDER, BRANCH_HOLDER);
+ je(no_delay_slot, T_NEAR);
+ mov(rax, BRANCH_HOLDER);
+ xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+ jmp(rax);
+ L(no_delay_slot);
+ }
+ L(delay_skip[pc]);
+ if (opcode.is_exit) {
+ return false;
+ }
+ } else {
+ test(BRANCH_HOLDER, BRANCH_HOLDER);
+ jne(end_of_code, T_NEAR);
+ if (opcode.is_exit) {
+ inc(BRANCH_HOLDER);
+ return false;
+ }
+ }
+ return true;
+}
+
+Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
+ mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
+ inc(NEXT_PARAMETER);
+ return eax;
+}
+
+Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
+ if (index == 0) {
+ // Register 0 is always zero
+ xor_(dst, dst);
+ } else {
+ mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+ }
+
+ return dst;
+}
+
+Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
+ if (index == 0) {
+ // Register 0 is always zero
+ xor_(dst, dst);
+ } else {
+ mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+ }
+
+ return dst;
+}
+
+void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
+ Xbyak::Label zero{}, end{};
+ xor_(ecx, ecx);
+ shr(dst, 32);
+ setne(cl);
+ mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
+}
+
+void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
+ auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
+ // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
+ // register.
+ if (reg == 0) {
+ return;
+ }
+ mov(dword[REGISTERS + reg * sizeof(u32)], result);
+ };
+ auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
+
+ switch (operation) {
+ case Macro::ResultOperation::IgnoreAndFetch:
+ SetRegister(reg, Compile_FetchParameter());
+ break;
+ case Macro::ResultOperation::Move:
+ SetRegister(reg, RESULT);
+ break;
+ case Macro::ResultOperation::MoveAndSetMethod:
+ SetRegister(reg, RESULT);
+ SetMethodAddress(RESULT);
+ break;
+ case Macro::ResultOperation::FetchAndSend:
+ // Fetch parameter and send result.
+ SetRegister(reg, Compile_FetchParameter());
+ Compile_Send(RESULT);
+ break;
+ case Macro::ResultOperation::MoveAndSend:
+ // Move and send result.
+ SetRegister(reg, RESULT);
+ Compile_Send(RESULT);
+ break;
+ case Macro::ResultOperation::FetchAndSetMethod:
+ // Fetch parameter and use result as Method Address.
+ SetRegister(reg, Compile_FetchParameter());
+ SetMethodAddress(RESULT);
+ break;
+ case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+ // Move result and use as Method Address, then fetch and send parameter.
+ SetRegister(reg, RESULT);
+ SetMethodAddress(RESULT);
+ Compile_Send(Compile_FetchParameter());
+ break;
+ case Macro::ResultOperation::MoveAndSetMethodSend:
+ // Move result and use as Method Address, then send bits 12:17 of result.
+ SetRegister(reg, RESULT);
+ SetMethodAddress(RESULT);
+ shr(RESULT, 12);
+ and_(RESULT, 0b111111);
+ Compile_Send(RESULT);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
+ }
+}
+
+Macro::Opcode MacroJITx64Impl::GetOpCode() const {
+ ASSERT(pc < code.size());
+ return {code[pc]};
+}
+
+std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
+ return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
new file mode 100644
index 000000000..21ee157cf
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,100 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <xbyak.h>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/x64/xbyak_abi.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
+constexpr size_t MAX_CODE_SIZE = 0x10000;
+
+class MacroJITx64 final : public MacroEngine {
+public:
+ explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
+
+protected:
+ std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
+
+private:
+ Engines::Maxwell3D& maxwell3d;
+};
+
+class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
+public:
+ MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+ ~MacroJITx64Impl();
+
+ void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+ void Compile_ALU(Macro::Opcode opcode);
+ void Compile_AddImmediate(Macro::Opcode opcode);
+ void Compile_ExtractInsert(Macro::Opcode opcode);
+ void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
+ void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
+ void Compile_Read(Macro::Opcode opcode);
+ void Compile_Branch(Macro::Opcode opcode);
+
+private:
+ void Optimizer_ScanFlags();
+
+ void Compile();
+ bool Compile_NextInstruction();
+
+ Xbyak::Reg32 Compile_FetchParameter();
+ Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
+ Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
+ void Compile_WriteCarry(Xbyak::Reg64 dst);
+
+ void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
+ void Compile_Send(Xbyak::Reg32 value);
+
+ Macro::Opcode GetOpCode() const;
+ std::bitset<32> PersistentCallerSavedRegs() const;
+
+ struct JITState {
+ Engines::Maxwell3D* maxwell3d{};
+ std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
+ const u32* parameters{};
+ u32 carry_flag{};
+ };
+ static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
+ using ProgramType = void (*)(JITState*);
+
+ struct OptimizerState {
+ bool can_skip_carry{};
+ bool has_delayed_pc{};
+ bool zero_reg_skip{};
+ bool skip_dummy_addimmediate{};
+ bool optimize_for_method_move{};
+ };
+ OptimizerState optimizer{};
+
+ std::optional<Macro::Opcode> next_opcode{};
+ ProgramType program{nullptr};
+
+ std::array<Xbyak::Label, MAX_CODE_SIZE> labels{};
+ std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{};
+ Xbyak::Label end_of_code{};
+
+ bool is_delay_slot{};
+ u32 pc{};
+ std::optional<u32> delayed_pc;
+
+ const std::vector<u32>& code;
+ Engines::Maxwell3D& maxwell3d;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 466a911db..890fc6c63 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
#include <array>
#include <cstddef>
#include <cstring>
+#include <limits>
#include <optional>
#include <vector>
@@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
constexpr u32 NumStages = 5;
-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
- GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
- GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = {
+ GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+ GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+ GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
constexpr std::array LimitSSBOs = {
- GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
+ GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
- GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+ GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
-constexpr std::array LimitSamplers = {
- GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
- GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
- GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+ GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+ GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+ GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+ GL_MAX_TEXTURE_IMAGE_UNITS,
+ GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
- GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
- GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
- GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = {
+ GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+ GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+ GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
template <typename T>
T GetInteger(GLenum pname) {
@@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
return std::exchange(base, base + amount);
}
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+ std::array<u32, Tegra::Engines::MaxShaderTypes> max;
+ std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
+ [](GLenum pname) { return GetInteger<u32>(pname); });
+ return max;
+}
+
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
@@ -133,6 +144,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
}
bool IsASTCSupported() {
+ static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
static constexpr std::array formats = {
GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
@@ -149,25 +161,43 @@ bool IsASTCSupported() {
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
};
- return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) {
- GLint supported;
- glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1,
- &supported);
- return supported == GL_TRUE;
- }) == formats.end();
+ static constexpr std::array required_support = {
+ GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
+ GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE,
+ };
+
+ for (const GLenum target : targets) {
+ for (const GLenum format : formats) {
+ for (const GLenum support : required_support) {
+ GLint value;
+ glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
+ if (value != GL_FULL_SUPPORT) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
}
} // Anonymous namespace
-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+ : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
- const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+ const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions();
const bool is_nvidia = vendor == "NVIDIA Corporation";
const bool is_amd = vendor == "ATI Technologies Inc.";
- const bool is_intel = vendor == "Intel";
- const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
+
+ bool disable_fast_buffer_sub_data = false;
+ if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+ LOG_WARNING(
+ Render_OpenGL,
+ "Beta driver 443.24 is known to have issues. There might be performance issues.");
+ disable_fast_buffer_sub_data = true;
+ }
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -182,10 +212,10 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
- has_broken_compute = is_intel_proprietary;
- has_fast_buffer_sub_data = is_nvidia;
+ has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
- GLAD_GL_NV_compute_program5;
+ GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
+ GLAD_GL_NV_transform_feedback2;
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
@@ -197,7 +227,9 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
}
Device::Device(std::nullptr_t) {
- uniform_buffer_alignment = 0;
+ max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+ uniform_buffer_alignment = 4;
+ shader_storage_alignment = 4;
max_vertex_attributes = 16;
max_varyings = 15;
has_warp_intrinsics = true;
@@ -205,9 +237,6 @@ Device::Device(std::nullptr_t) {
has_vertex_viewport_layer = true;
has_image_load_formatted = true;
has_variable_aoffi = true;
- has_component_indexing_bug = false;
- has_broken_compute = false;
- has_precise_bug = false;
}
bool Device::TestVariableAoffi() {
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index e915dbd86..98cca0254 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
explicit Device();
explicit Device(std::nullptr_t);
+ u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+ return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+ }
+
const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
return base_bindings[stage_index];
}
@@ -80,10 +84,6 @@ public:
return has_precise_bug;
}
- bool HasBrokenCompute() const {
- return has_broken_compute;
- }
-
bool HasFastBufferSubData() const {
return has_fast_buffer_sub_data;
}
@@ -96,7 +96,8 @@ private:
static bool TestVariableAoffi();
static bool TestPreciseBug();
- std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+ std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+ std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
std::size_t uniform_buffer_alignment{};
std::size_t shader_storage_alignment{};
u32 max_vertex_attributes{};
@@ -109,7 +110,6 @@ private:
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_precise_bug{};
- bool has_broken_compute{};
bool has_fast_buffer_sub_data{};
bool use_assembly_shaders{};
};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 8116a5daa..f802fd384 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
namespace {
+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+ NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+ NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
constexpr std::size_t NumSupportedVertexAttributes = 16;
template <typename Engine, typename Entry>
@@ -87,6 +93,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
return buffer.size;
}
+/// Translates hardware transform feedback indices
+/// @param location Hardware location
+/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+ const u8 index = location / 4;
+ if (index >= 8 && index <= 39) {
+ return {GL_GENERIC_ATTRIB_NV, index - 8};
+ }
+ if (index >= 48 && index <= 55) {
+ return {GL_TEXTURE_COORD_NV, index - 48};
+ }
+ switch (index) {
+ case 7:
+ return {GL_POSITION, 0};
+ case 40:
+ return {GL_PRIMARY_COLOR_NV, 0};
+ case 41:
+ return {GL_SECONDARY_COLOR_NV, 0};
+ case 42:
+ return {GL_BACK_PRIMARY_COLOR_NV, 0};
+ case 43:
+ return {GL_BACK_SECONDARY_COLOR_NV, 0};
+ }
+ UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
+ return {GL_POSITION, 0};
+}
+
void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
@@ -104,6 +138,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
CheckExtensions();
+ unified_uniform_buffer.Create();
+ glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
if (device.UseAssemblyShaders()) {
glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
for (const GLuint cbuf : staging_cbufs) {
@@ -655,10 +692,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
- if (device.HasBrokenCompute()) {
- return;
- }
-
buffer_cache.Acquire();
current_cbuf = 0;
@@ -846,34 +879,56 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
const auto& shader_stage = stages[stage_index];
+ const auto& entries = shader->GetEntries();
+ const bool use_unified = entries.use_unified_uniforms;
+ const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
- u32 binding =
- device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
- for (const auto& entry : shader->GetEntries().const_buffers) {
- const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
- SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
+ const auto base_bindings = device.GetBaseBindings(stage_index);
+ u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+ for (const auto& entry : entries.const_buffers) {
+ const u32 index = entry.GetIndex();
+ const auto& buffer = shader_stage.const_buffers[index];
+ SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+ base_unified_offset + index * Maxwell::MaxConstBufferSize);
+ ++binding;
+ }
+ if (use_unified) {
+ const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+ entries.global_memory_entries.size());
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+ base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+ const auto& entries = kernel->GetEntries();
+ const bool use_unified = entries.use_unified_uniforms;
u32 binding = 0;
- for (const auto& entry : kernel->GetEntries().const_buffers) {
+ for (const auto& entry : entries.const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
Tegra::Engines::ConstBufferInfo buffer;
buffer.address = config.Address();
buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()];
- SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
+ SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+ use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+ ++binding;
+ }
+ if (use_unified) {
+ const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+ NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
const Tegra::Engines::ConstBufferInfo& buffer,
- const ConstBufferEntry& entry) {
+ const ConstBufferEntry& entry, bool use_unified,
+ std::size_t unified_offset) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
if (device.UseAssemblyShaders()) {
@@ -889,20 +944,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
// UBO alignment requirements.
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
- const auto alignment = device.GetUniformBufferAlignment();
- auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
- device.HasFastBufferSubData());
- if (!device.UseAssemblyShaders()) {
- glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+ const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+ const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+ const GPUVAddr gpu_addr = buffer.address;
+ auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+ if (device.UseAssemblyShaders()) {
+ UNIMPLEMENTED_IF(use_unified);
+ if (offset != 0) {
+ const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+ glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
+ cbuf = staging_cbuf;
+ offset = 0;
+ }
+ glBindBufferRangeNV(stage, binding, cbuf, offset, size);
return;
}
- if (offset != 0) {
- const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
- glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
- cbuf = staging_cbuf;
- offset = 0;
+
+ if (use_unified) {
+ glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+ } else {
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
}
- glBindBufferRangeNV(stage, binding, cbuf, offset, size);
}
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
@@ -977,16 +1041,12 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
glBindTextureUnit(binding, 0);
return;
}
- glBindTextureUnit(binding, view->GetTexture());
-
- if (view->GetSurfaceParams().IsBuffer()) {
- return;
+ const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
+ texture.tic.z_source, texture.tic.w_source);
+ glBindTextureUnit(binding, handle);
+ if (!view->GetSurfaceParams().IsBuffer()) {
+ glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
- // Apply swizzle to textures that are not buffers.
- view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
- texture.tic.w_source);
-
- glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
@@ -1015,14 +1075,11 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
return;
}
- if (!tic.IsBuffer()) {
- view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
- }
if (entry.is_written) {
view->MarkAsModified(texture_cache.Tick());
}
- glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
- view->GetFormat());
+ const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+ glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
}
void RasterizerOpenGL::SyncViewport() {
@@ -1031,6 +1088,26 @@ void RasterizerOpenGL::SyncViewport() {
const auto& regs = gpu.regs;
const bool dirty_viewport = flags[Dirty::Viewports];
+ const bool dirty_clip_control = flags[Dirty::ClipControl];
+
+ if (dirty_clip_control || flags[Dirty::FrontFace]) {
+ flags[Dirty::FrontFace] = false;
+
+ GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
+ if (regs.screen_y_control.triangle_rast_flip != 0 &&
+ regs.viewport_transform[0].scale_y < 0.0f) {
+ switch (mode) {
+ case GL_CW:
+ mode = GL_CCW;
+ break;
+ case GL_CCW:
+ mode = GL_CW;
+ break;
+ }
+ }
+ glFrontFace(mode);
+ }
+
if (dirty_viewport || flags[Dirty::ClipControl]) {
flags[Dirty::ClipControl] = false;
@@ -1128,11 +1205,6 @@ void RasterizerOpenGL::SyncCullMode() {
glDisable(GL_CULL_FACE);
}
}
-
- if (flags[Dirty::FrontFace]) {
- flags[Dirty::FrontFace] = false;
- glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
- }
}
void RasterizerOpenGL::SyncPrimitiveRestart() {
@@ -1503,12 +1575,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
}
+void RasterizerOpenGL::SyncTransformFeedback() {
+ // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal
+ // when this is required.
+ const auto& regs = system.GPU().Maxwell3D().regs;
+
+ static constexpr std::size_t STRIDE = 3;
+ std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+ std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+
+ GLint* cursor = attribs.data();
+ GLint* current_stream = streams.data();
+
+ for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+ const auto& layout = regs.tfb_layouts[feedback];
+ UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+ if (layout.varying_count == 0) {
+ continue;
+ }
+
+ *current_stream = static_cast<GLint>(feedback);
+ if (current_stream != streams.data()) {
+ // When stepping one stream, push the expected token
+ cursor[0] = GL_NEXT_BUFFER_NV;
+ cursor[1] = 0;
+ cursor[2] = 0;
+ cursor += STRIDE;
+ }
+ ++current_stream;
+
+ const auto& locations = regs.tfb_varying_locs[feedback];
+ std::optional<u8> current_index;
+ for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+ const u8 location = locations[offset];
+ const u8 index = location / 4;
+
+ if (current_index == index) {
+ // Increase number of components of the previous attachment
+ ++cursor[-2];
+ continue;
+ }
+ current_index = index;
+
+ std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+ cursor[1] = 1;
+ cursor += STRIDE;
+ }
+ }
+
+ const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+ const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+ glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+ GL_INTERLEAVED_ATTRIBS);
+}
+
void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
const auto& regs = system.GPU().Maxwell3D().regs;
if (regs.tfb_enabled == 0) {
return;
}
+ if (device.UseAssemblyShaders()) {
+ SyncTransformFeedback();
+ }
+
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1535,6 +1665,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
static_cast<GLsizeiptr>(size));
}
+ // We may have to call BeginTransformFeedbackNV here since they seem to call different
+ // implementations on Nvidia's driver (the pointer is different) but we are using
+ // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB
+ // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.
glBeginTransformFeedback(GL_POINTS);
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 87f7fe159..7abc8fdbd 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -107,7 +107,8 @@ private:
/// Configures a constant buffer.
void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
- const ConstBufferEntry& entry);
+ const ConstBufferEntry& entry, bool use_unified,
+ std::size_t unified_offset);
/// Configures the current global memory entries to use for the draw command.
void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -201,6 +202,10 @@ private:
/// Syncs the framebuffer sRGB state to match the guest state
void SyncFramebufferSRGB();
+ /// Syncs transform feedback state to match guest state
+ /// @note Only valid on assembly shaders
+ void SyncTransformFeedback();
+
/// Begin a transform feedback
void BeginTransformFeedback(GLenum primitive_mode);
@@ -253,6 +258,7 @@ private:
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
std::size_t current_cbuf = 0;
+ OGLBuffer unified_uniform_buffer;
/// Number of commands queued to the OpenGL driver. Reseted on flush.
std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4cd0f36cf..a991ca64a 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -241,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
entry.bindless_samplers = registry->GetBindlessSamplers();
params.disk_cache.SaveEntry(std::move(entry));
- return std::shared_ptr<CachedShader>(new CachedShader(
- params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+ return std::shared_ptr<CachedShader>(
+ new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+ MakeEntries(params.device, ir, shader_type), std::move(program)));
}
Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
@@ -265,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
entry.bindless_samplers = registry->GetBindlessSamplers();
params.disk_cache.SaveEntry(std::move(entry));
- return std::shared_ptr<CachedShader>(new CachedShader(
- params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+ return std::shared_ptr<CachedShader>(
+ new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+ MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
}
Shader CachedShader::CreateFromCache(const ShaderParameters& params,
@@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
PrecompiledShader shader;
shader.program = std::move(program);
shader.registry = std::move(registry);
- shader.entries = MakeEntries(ir);
+ shader.entries = MakeEntries(device, ir, entry.type);
std::scoped_lock lock{mutex};
if (callback) {
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 253484968..d6e30b321 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -61,8 +61,8 @@ struct TextureDerivates {};
using TextureArgument = std::pair<Type, Node>;
using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
- static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
#define ftou floatBitsToUint
@@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
}
+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+ const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+ // We waste one UBO for emulation
+ const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
+ return num_ubos > num_available_ubos;
+}
+
struct GenericVaryingDescription {
std::string name;
u8 first_element = 0;
@@ -412,8 +419,9 @@ class GLSLDecompiler final {
public:
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier, std::string_view suffix)
- : device{device}, ir{ir}, registry{registry}, stage{stage},
- identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+ : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+ suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+ UseUnifiedUniforms(device, ir, stage)} {
if (stage != ShaderType::Compute) {
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
}
@@ -618,7 +626,9 @@ private:
break;
}
}
- if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
+
+ if (stage != ShaderType::Geometry &&
+ (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
if (ir.UsesLayer()) {
code.AddLine("int gl_Layer;");
}
@@ -647,6 +657,16 @@ private:
--code.scope;
code.AddLine("}};");
code.AddNewLine();
+
+ if (stage == ShaderType::Geometry) {
+ if (ir.UsesLayer()) {
+ code.AddLine("out int gl_Layer;");
+ }
+ if (ir.UsesViewportIndex()) {
+ code.AddLine("out int gl_ViewportIndex;");
+ }
+ }
+ code.AddNewLine();
}
void DeclareRegisters() {
@@ -834,12 +854,24 @@ private:
}
void DeclareConstantBuffers() {
+ if (use_unified_uniforms) {
+ const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+ static_cast<u32>(ir.GetGlobalMemory().size());
+ code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+ binding);
+ code.AddLine(" uint cbufs[];");
+ code.AddLine("}};");
+ code.AddNewLine();
+ return;
+ }
+
u32 binding = device.GetBaseBindings(stage).uniform_buffer;
- for (const auto& buffers : ir.GetConstantBuffers()) {
- const auto index = buffers.first;
+ for (const auto [index, info] : ir.GetConstantBuffers()) {
+ const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
+ const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
GetConstBufferBlock(index));
- code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+ code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size);
code.AddLine("}};");
code.AddNewLine();
}
@@ -1038,42 +1070,51 @@ private:
if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
const Node offset = cbuf->GetOffset();
+ const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
// Direct access
const u32 offset_imm = immediate->GetValue();
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
- return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
- offset_imm / (4 * 4), (offset_imm / 4) % 4),
+ if (use_unified_uniforms) {
+ return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+ Type::Uint};
+ } else {
+ return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+ offset_imm / (4 * 4), (offset_imm / 4) % 4),
+ Type::Uint};
+ }
+ }
+
+ // Indirect access
+ if (use_unified_uniforms) {
+ return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+ Visit(offset).AsUint()),
Type::Uint};
}
- if (std::holds_alternative<OperationNode>(*offset)) {
- // Indirect access
- const std::string final_offset = code.GenerateTemporary();
- code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+ const std::string final_offset = code.GenerateTemporary();
+ code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
- if (!device.HasComponentIndexingBug()) {
- return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
- final_offset, final_offset),
- Type::Uint};
- }
-
- // AMD's proprietary GLSL compiler emits ill code for variable component access.
- // To bypass this driver bug generate 4 ifs, one per each component.
- const std::string pack = code.GenerateTemporary();
- code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
- final_offset);
-
- const std::string result = code.GenerateTemporary();
- code.AddLine("uint {};", result);
- for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
- code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
- pack, GetSwizzle(swizzle));
- }
- return {result, Type::Uint};
+ if (!device.HasComponentIndexingBug()) {
+ return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+ final_offset, final_offset),
+ Type::Uint};
}
- UNREACHABLE_MSG("Unmanaged offset node type");
+ // AMD's proprietary GLSL compiler emits ill code for variable component access.
+ // To bypass this driver bug generate 4 ifs, one per each component.
+ const std::string pack = code.GenerateTemporary();
+ code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+ final_offset);
+
+ const std::string result = code.GenerateTemporary();
+ code.AddLine("uint {};", result);
+ for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+ code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+ GetSwizzle(swizzle));
+ }
+ return {result, Type::Uint};
}
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -2344,7 +2385,12 @@ private:
return {};
}
- Expression MemoryBarrierGL(Operation) {
+ Expression MemoryBarrierGroup(Operation) {
+ code.AddLine("groupMemoryBarrier();");
+ return {};
+ }
+
+ Expression MemoryBarrierGlobal(Operation) {
code.AddLine("memoryBarrier();");
return {};
}
@@ -2591,7 +2637,8 @@ private:
&GLSLDecompiler::ShuffleIndexed,
&GLSLDecompiler::Barrier,
- &GLSLDecompiler::MemoryBarrierGL,
+ &GLSLDecompiler::MemoryBarrierGroup,
+ &GLSLDecompiler::MemoryBarrierGlobal,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2704,6 +2751,7 @@ private:
const std::string_view identifier;
const std::string_view suffix;
const Header header;
+ const bool use_unified_uniforms;
std::unordered_map<u8, VaryingTFB> transform_feedback;
ShaderWriter code;
@@ -2899,7 +2947,7 @@ void GLSLDecompiler::DecompileAST() {
} // Anonymous namespace
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
ShaderEntries entries;
for (const auto& cbuf : ir.GetConstantBuffers()) {
entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2920,6 +2968,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
}
entries.shader_length = ir.GetLength();
+ entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
return entries;
}
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index e8a178764..451c9689a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
std::vector<GlobalMemoryEntry> global_memory_entries;
std::vector<SamplerEntry> samplers;
std::vector<ImageEntry> images;
- u32 clip_distances{};
std::size_t shader_length{};
+ u32 clip_distances{};
+ bool use_unified_uniforms{};
};
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+ Tegra::Engines::ShaderType stage);
std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
const VideoCommon::Shader::Registry& registry,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 94fbd2a22..61505879b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -35,7 +35,7 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
namespace {
struct FormatTuple {
- GLint internal_format;
+ GLenum internal_format;
GLenum format = GL_NONE;
GLenum type = GL_NONE;
};
@@ -238,6 +238,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
return texture;
}
+constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
+ SwizzleSource w_source) {
+ return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+ (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
} // Anonymous namespace
CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
@@ -257,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
target = GetTextureTarget(params.target);
texture = CreateTexture(params, target, internal_format, texture_buffer);
DecorateSurfaceName();
- main_view = CreateViewInner(
- ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
- true);
+
+ u32 num_layers = 1;
+ if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+ num_layers = params.depth;
+ }
+
+ main_view =
+ CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
}
CachedSurface::~CachedSurface() = default;
@@ -381,7 +392,7 @@ void CachedSurface::DecorateSurfaceName() {
}
void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
- LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
+ LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
}
View CachedSurface::CreateView(const ViewParams& view_key) {
@@ -397,32 +408,33 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
}
CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
- const bool is_proxy)
- : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
- target = GetTextureTarget(params.target);
- format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
+ bool is_proxy)
+ : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
+ target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
if (!is_proxy) {
- texture_view = CreateTextureView();
+ main_view = CreateTextureView();
}
- swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
}
CachedSurfaceView::~CachedSurfaceView() = default;
-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
ASSERT(params.num_levels == 1);
+ if (params.target == SurfaceTarget::Texture3D) {
+ if (params.num_layers > 1) {
+ ASSERT(params.base_layer == 0);
+ glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+ } else {
+ glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+ params.base_level, params.base_layer);
+ }
+ return;
+ }
+
if (params.num_layers > 1) {
- // Layered framebuffer attachments
UNIMPLEMENTED_IF(params.base_layer != 0);
-
- switch (params.target) {
- case SurfaceTarget::Texture2DArray:
- glFramebufferTexture(target, attachment, GetTexture(), 0);
- break;
- default:
- UNIMPLEMENTED();
- }
+ glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
return;
}
@@ -430,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
const GLuint texture = surface.GetTexture();
switch (surface.GetSurfaceParams().target) {
case SurfaceTarget::Texture1D:
- glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+ glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
break;
case SurfaceTarget::Texture2D:
- glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+ glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
break;
case SurfaceTarget::Texture1DArray:
case SurfaceTarget::Texture2DArray:
case SurfaceTarget::TextureCubemap:
case SurfaceTarget::TextureCubeArray:
- glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+ glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
params.base_layer);
break;
default:
@@ -447,35 +459,62 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
}
}
-void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
+GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
SwizzleSource z_source, SwizzleSource w_source) {
- u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
- if (new_swizzle == swizzle)
- return;
- swizzle = new_swizzle;
- const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
- GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
- const GLuint handle = GetTexture();
- const PixelFormat format = surface.GetSurfaceParams().pixel_format;
- switch (format) {
+ if (GetSurfaceParams().IsBuffer()) {
+ return GetTexture();
+ }
+ const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+ if (current_swizzle == new_swizzle) {
+ return current_view;
+ }
+ current_swizzle = new_swizzle;
+
+ const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
+ OGLTextureView& view = entry->second;
+ if (!is_cache_miss) {
+ current_view = view.handle;
+ return view.handle;
+ }
+ view = CreateTextureView();
+ current_view = view.handle;
+
+ std::array swizzle{x_source, y_source, z_source, w_source};
+
+ switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
case PixelFormat::Z24S8:
case PixelFormat::Z32FS8:
case PixelFormat::S8Z24:
- glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
+ UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
+ glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
GetComponent(format, x_source == SwizzleSource::R));
- break;
- default:
- glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
+
+ // Make sure we sample the first component
+ std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
+ return value == SwizzleSource::G ? SwizzleSource::R : value;
+ });
+ [[fallthrough]];
+ default: {
+ const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
+ GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
+ glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
break;
}
+ }
+ return view.handle;
}
OGLTextureView CachedSurfaceView::CreateTextureView() const {
OGLTextureView texture_view;
texture_view.Create();
- glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
- params.num_levels, params.base_layer, params.num_layers);
+ if (target == GL_TEXTURE_3D) {
+ glTextureView(texture_view.handle, target, surface.texture.handle, format,
+ params.base_level, params.num_levels, 0, 1);
+ } else {
+ glTextureView(texture_view.handle, target, surface.texture.handle, format,
+ params.base_level, params.num_levels, params.base_layer, params.num_layers);
+ }
ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);
return texture_view;
@@ -518,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
const Tegra::Engines::Fermi2D::Config& copy_config) {
const auto& src_params{src_view->GetSurfaceParams()};
const auto& dst_params{dst_view->GetSurfaceParams()};
- UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
- UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+ UNIMPLEMENTED_IF(src_params.depth != 1);
+ UNIMPLEMENTED_IF(dst_params.depth != 1);
state_tracker.NotifyScissor0();
state_tracker.NotifyFramebuffer();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 02d9981a1..bfc4ddf5d 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -80,10 +80,12 @@ public:
explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
~CachedSurfaceView();
- /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
- void Attach(GLenum attachment, GLenum target) const;
+ /// @brief Attaches this texture view to the currently bound fb_target framebuffer
+ /// @param attachment Attachment to bind textures to
+ /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER)
+ void Attach(GLenum attachment, GLenum fb_target) const;
- void ApplySwizzle(Tegra::Texture::SwizzleSource x_source,
+ GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
Tegra::Texture::SwizzleSource y_source,
Tegra::Texture::SwizzleSource z_source,
Tegra::Texture::SwizzleSource w_source);
@@ -98,7 +100,7 @@ public:
if (is_proxy) {
return surface.GetTexture();
}
- return texture_view.handle;
+ return main_view.handle;
}
GLenum GetFormat() const {
@@ -110,23 +112,19 @@ public:
}
private:
- u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
- Tegra::Texture::SwizzleSource y_source,
- Tegra::Texture::SwizzleSource z_source,
- Tegra::Texture::SwizzleSource w_source) const {
- return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
- (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
- }
-
OGLTextureView CreateTextureView() const;
CachedSurface& surface;
- GLenum target{};
- GLenum format{};
+ const GLenum format;
+ const GLenum target;
+ const bool is_proxy;
+
+ std::unordered_map<u32, OGLTextureView> view_cache;
+ OGLTextureView main_view;
- OGLTextureView texture_view;
- u32 swizzle{};
- bool is_proxy{};
+ // Use an invalid default so it always fails the comparison test
+ u32 current_swizzle = 0xffffffff;
+ GLuint current_view = 0;
};
class TextureCacheOpenGL final : public TextureCacheBase {
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6b489e6db..6214fcbc3 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -751,8 +751,9 @@ void RendererOpenGL::RenderScreenshot() {
}
bool RendererOpenGL::Init() {
- if (GLAD_GL_KHR_debug) {
+ if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
glEnable(GL_DEBUG_OUTPUT);
+ glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
glDebugMessageCallback(DebugHandler, nullptr);
}
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
index 568744e3c..424278816 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
@@ -71,8 +71,7 @@ void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept {
const u32 topology_index = static_cast<u32>(regs.draw.topology.Value());
u32 packed_front_face = PackFrontFace(regs.front_face);
- if (regs.screen_y_control.triangle_rast_flip != 0 &&
- regs.viewport_transform[0].scale_y > 0.0f) {
+ if (regs.screen_y_control.triangle_rast_flip != 0) {
// Flip front face
packed_front_face = 1 - packed_front_face;
}
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 2871035f5..62e950d31 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -149,7 +149,7 @@ struct FormatTuple {
{VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16F
{VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16U
{VK_FORMAT_UNDEFINED}, // R16S
- {VK_FORMAT_UNDEFINED}, // R16UI
+ {VK_FORMAT_R16_UINT, Attachable | Storage}, // R16UI
{VK_FORMAT_UNDEFINED}, // R16I
{VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // RG16
{VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // RG16F
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 8e1b46277..281bf9ac3 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const {
};
add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size());
add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size());
- add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size());
+ add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size());
add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size());
+ add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size());
add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size());
VkDescriptorSetLayoutCreateInfo ci;
diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
index 890fd52cf..9259b618d 100644
--- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
@@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() {
{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60},
{VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},
{VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64},
+ {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},
{VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}};
VkDescriptorPoolCreateInfo ci;
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index 750e5a0ca..9fd8ac3f6 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -73,76 +73,79 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType
std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) {
- static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32,
- VK_FORMAT_A8B8G8R8_UINT_PACK32,
- VK_FORMAT_A8B8G8R8_SNORM_PACK32,
- VK_FORMAT_A8B8G8R8_SRGB_PACK32,
- VK_FORMAT_B5G6R5_UNORM_PACK16,
- VK_FORMAT_A2B10G10R10_UNORM_PACK32,
- VK_FORMAT_A1R5G5B5_UNORM_PACK16,
- VK_FORMAT_R32G32B32A32_SFLOAT,
- VK_FORMAT_R32G32B32A32_UINT,
- VK_FORMAT_R32G32_SFLOAT,
- VK_FORMAT_R32G32_UINT,
- VK_FORMAT_R16G16B16A16_UINT,
- VK_FORMAT_R16G16B16A16_SNORM,
- VK_FORMAT_R16G16B16A16_UNORM,
- VK_FORMAT_R16G16_UNORM,
- VK_FORMAT_R16G16_SNORM,
- VK_FORMAT_R16G16_SFLOAT,
- VK_FORMAT_R16_UNORM,
- VK_FORMAT_R8G8B8A8_SRGB,
- VK_FORMAT_R8G8_UNORM,
- VK_FORMAT_R8G8_SNORM,
- VK_FORMAT_R8G8_UINT,
- VK_FORMAT_R8_UNORM,
- VK_FORMAT_R8_UINT,
- VK_FORMAT_B10G11R11_UFLOAT_PACK32,
- VK_FORMAT_R32_SFLOAT,
- VK_FORMAT_R32_UINT,
- VK_FORMAT_R32_SINT,
- VK_FORMAT_R16_SFLOAT,
- VK_FORMAT_R16G16B16A16_SFLOAT,
- VK_FORMAT_B8G8R8A8_UNORM,
- VK_FORMAT_B8G8R8A8_SRGB,
- VK_FORMAT_R4G4B4A4_UNORM_PACK16,
- VK_FORMAT_D32_SFLOAT,
- VK_FORMAT_D16_UNORM,
- VK_FORMAT_D16_UNORM_S8_UINT,
- VK_FORMAT_D24_UNORM_S8_UINT,
- VK_FORMAT_D32_SFLOAT_S8_UINT,
- VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
- VK_FORMAT_BC2_UNORM_BLOCK,
- VK_FORMAT_BC3_UNORM_BLOCK,
- VK_FORMAT_BC4_UNORM_BLOCK,
- VK_FORMAT_BC5_UNORM_BLOCK,
- VK_FORMAT_BC5_SNORM_BLOCK,
- VK_FORMAT_BC7_UNORM_BLOCK,
- VK_FORMAT_BC6H_UFLOAT_BLOCK,
- VK_FORMAT_BC6H_SFLOAT_BLOCK,
- VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
- VK_FORMAT_BC2_SRGB_BLOCK,
- VK_FORMAT_BC3_SRGB_BLOCK,
- VK_FORMAT_BC7_SRGB_BLOCK,
- VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
- VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
- VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
- VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
- VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
- VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
- VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
- VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
- VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
- VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
- VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
- VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
- VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
- VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
- VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
- VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
- VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
- VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
- VK_FORMAT_E5B9G9R9_UFLOAT_PACK32};
+ static constexpr std::array formats{
+ VK_FORMAT_A8B8G8R8_UNORM_PACK32,
+ VK_FORMAT_A8B8G8R8_UINT_PACK32,
+ VK_FORMAT_A8B8G8R8_SNORM_PACK32,
+ VK_FORMAT_A8B8G8R8_SRGB_PACK32,
+ VK_FORMAT_B5G6R5_UNORM_PACK16,
+ VK_FORMAT_A2B10G10R10_UNORM_PACK32,
+ VK_FORMAT_A1R5G5B5_UNORM_PACK16,
+ VK_FORMAT_R32G32B32A32_SFLOAT,
+ VK_FORMAT_R32G32B32A32_UINT,
+ VK_FORMAT_R32G32_SFLOAT,
+ VK_FORMAT_R32G32_UINT,
+ VK_FORMAT_R16G16B16A16_UINT,
+ VK_FORMAT_R16G16B16A16_SNORM,
+ VK_FORMAT_R16G16B16A16_UNORM,
+ VK_FORMAT_R16G16_UNORM,
+ VK_FORMAT_R16G16_SNORM,
+ VK_FORMAT_R16G16_SFLOAT,
+ VK_FORMAT_R16_UNORM,
+ VK_FORMAT_R16_UINT,
+ VK_FORMAT_R8G8B8A8_SRGB,
+ VK_FORMAT_R8G8_UNORM,
+ VK_FORMAT_R8G8_SNORM,
+ VK_FORMAT_R8G8_UINT,
+ VK_FORMAT_R8_UNORM,
+ VK_FORMAT_R8_UINT,
+ VK_FORMAT_B10G11R11_UFLOAT_PACK32,
+ VK_FORMAT_R32_SFLOAT,
+ VK_FORMAT_R32_UINT,
+ VK_FORMAT_R32_SINT,
+ VK_FORMAT_R16_SFLOAT,
+ VK_FORMAT_R16G16B16A16_SFLOAT,
+ VK_FORMAT_B8G8R8A8_UNORM,
+ VK_FORMAT_B8G8R8A8_SRGB,
+ VK_FORMAT_R4G4B4A4_UNORM_PACK16,
+ VK_FORMAT_D32_SFLOAT,
+ VK_FORMAT_D16_UNORM,
+ VK_FORMAT_D16_UNORM_S8_UINT,
+ VK_FORMAT_D24_UNORM_S8_UINT,
+ VK_FORMAT_D32_SFLOAT_S8_UINT,
+ VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
+ VK_FORMAT_BC2_UNORM_BLOCK,
+ VK_FORMAT_BC3_UNORM_BLOCK,
+ VK_FORMAT_BC4_UNORM_BLOCK,
+ VK_FORMAT_BC5_UNORM_BLOCK,
+ VK_FORMAT_BC5_SNORM_BLOCK,
+ VK_FORMAT_BC7_UNORM_BLOCK,
+ VK_FORMAT_BC6H_UFLOAT_BLOCK,
+ VK_FORMAT_BC6H_SFLOAT_BLOCK,
+ VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
+ VK_FORMAT_BC2_SRGB_BLOCK,
+ VK_FORMAT_BC3_SRGB_BLOCK,
+ VK_FORMAT_BC7_SRGB_BLOCK,
+ VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
+ VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
+ VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
+ VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
+ VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
+ VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
+ VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
+ VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
+ VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
+ VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
+ VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
+ VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
+ VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
+ VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
+ VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
+ VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
+ VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
+ VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
+ VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
+ };
std::unordered_map<VkFormat, VkFormatProperties> format_properties;
for (const auto format : formats) {
format_properties.emplace(format, physical.GetFormatProperties(format));
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index a5c7b7945..b8ccf164f 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -45,6 +45,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;
constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
@@ -104,8 +105,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,
u32 binding = base_binding;
AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers);
AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers);
- AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers);
+ AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels);
AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers);
+ AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels);
AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images);
return binding;
}
@@ -312,7 +314,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
ASSERT(point_size != 0.0f);
}
for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) {
- specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].Type();
+ const auto& attribute = fixed_state.vertex_input.attributes[i];
+ specialization.enabled_attributes[i] = attribute.enabled.Value() != 0;
+ specialization.attribute_types[i] = attribute.Type();
}
specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
@@ -375,16 +379,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3
return;
}
- if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) {
- // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
- // crash.
+ if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER ||
+ descriptor_type == STORAGE_TEXEL_BUFFER) {
+ // Nvidia has a bug where updating multiple texels at once causes the driver to crash.
+ // Note: Fixed in driver Windows 443.24, Linux 440.66.15
for (u32 i = 0; i < count; ++i) {
VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back();
entry.dstBinding = binding + i;
entry.dstArrayElement = 0;
entry.descriptorCount = 1;
entry.descriptorType = descriptor_type;
- entry.offset = offset + i * entry_size;
+ entry.offset = static_cast<std::size_t>(offset + i * entry_size);
entry.stride = entry_size;
}
} else if (count > 0) {
@@ -405,8 +410,9 @@ void FillDescriptorUpdateTemplateEntries(
std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) {
AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers);
AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers);
- AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers);
+ AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels);
AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers);
+ AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels);
AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images);
}
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index be5b77fae..19b8f9da3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -468,8 +468,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
const auto& entries = pipeline.GetEntries();
SetupComputeConstBuffers(entries);
SetupComputeGlobalBuffers(entries);
- SetupComputeTexelBuffers(entries);
+ SetupComputeUniformTexels(entries);
SetupComputeTextures(entries);
+ SetupComputeStorageTexels(entries);
SetupComputeImages(entries);
buffer_cache.Unmap();
@@ -715,7 +716,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers(
if (!view) {
return false;
}
- key.views.push_back(view->GetHandle());
+ key.views.push_back(view->GetAttachment());
key.width = std::min(key.width, view->GetWidth());
key.height = std::min(key.height, view->GetHeight());
key.layers = std::min(key.layers, view->GetNumLayers());
@@ -787,8 +788,9 @@ void RasterizerVulkan::SetupShaderDescriptors(
const auto& entries = shader->GetEntries();
SetupGraphicsConstBuffers(entries, stage);
SetupGraphicsGlobalBuffers(entries, stage);
- SetupGraphicsTexelBuffers(entries, stage);
+ SetupGraphicsUniformTexels(entries, stage);
SetupGraphicsTextures(entries, stage);
+ SetupGraphicsStorageTexels(entries, stage);
SetupGraphicsImages(entries, stage);
}
texture_cache.GuardSamplers(false);
@@ -838,6 +840,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
if (regs.tfb_enabled == 0) {
return;
}
+ if (!device.IsExtTransformFeedbackSupported()) {
+ LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
+ return;
+ }
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
@@ -866,6 +872,9 @@ void RasterizerVulkan::EndTransformFeedback() {
if (regs.tfb_enabled == 0) {
return;
}
+ if (!device.IsExtTransformFeedbackSupported()) {
+ return;
+ }
scheduler.Record(
[](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
@@ -877,14 +886,10 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
const auto& attrib = regs.vertex_attrib_format[index];
- if (!attrib.IsValid()) {
+ if (attrib.IsConstant()) {
vertex_input.SetAttribute(index, false, 0, 0, {}, {});
continue;
}
-
- [[maybe_unused]] const auto& buffer = regs.vertex_array[attrib.buffer];
- ASSERT(buffer.IsEnabled());
-
vertex_input.SetAttribute(index, true, attrib.buffer, attrib.offset, attrib.type.Value(),
attrib.size.Value());
}
@@ -980,12 +985,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,
}
}
-void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& gpu = system.GPU().Maxwell3D();
- for (const auto& entry : entries.texel_buffers) {
+ for (const auto& entry : entries.uniform_texels) {
const auto image = GetTextureInfo(gpu, entry, stage).tic;
- SetupTexelBuffer(image, entry);
+ SetupUniformTexels(image, entry);
}
}
@@ -1000,6 +1005,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
}
}
+void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) {
+ MICROPROFILE_SCOPE(Vulkan_Textures);
+ const auto& gpu = system.GPU().Maxwell3D();
+ for (const auto& entry : entries.storage_texels) {
+ const auto image = GetTextureInfo(gpu, entry, stage).tic;
+ SetupStorageTexel(image, entry);
+ }
+}
+
void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Images);
const auto& gpu = system.GPU().Maxwell3D();
@@ -1032,12 +1046,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
}
}
-void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) {
+void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& gpu = system.GPU().KeplerCompute();
- for (const auto& entry : entries.texel_buffers) {
+ for (const auto& entry : entries.uniform_texels) {
const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
- SetupTexelBuffer(image, entry);
+ SetupUniformTexels(image, entry);
}
}
@@ -1052,6 +1066,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
}
}
+void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
+ MICROPROFILE_SCOPE(Vulkan_Textures);
+ const auto& gpu = system.GPU().KeplerCompute();
+ for (const auto& entry : entries.storage_texels) {
+ const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
+ SetupStorageTexel(image, entry);
+ }
+}
+
void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Images);
const auto& gpu = system.GPU().KeplerCompute();
@@ -1101,8 +1124,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
update_descriptor_queue.AddBuffer(buffer, offset, size);
}
-void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic,
- const TexelBufferEntry& entry) {
+void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
+ const UniformTexelEntry& entry) {
const auto view = texture_cache.GetTextureSurface(tic, entry);
ASSERT(view->IsBufferView());
@@ -1114,8 +1137,8 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
auto view = texture_cache.GetTextureSurface(texture.tic, entry);
ASSERT(!view->IsBufferView());
- const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source,
- texture.tic.z_source, texture.tic.w_source);
+ const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source,
+ texture.tic.z_source, texture.tic.w_source);
const auto sampler = sampler_cache.GetSampler(texture.tsc);
update_descriptor_queue.AddSampledImage(sampler, image_view);
@@ -1124,6 +1147,14 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
sampled_views.push_back(ImageView{std::move(view), image_layout});
}
+void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic,
+ const StorageTexelEntry& entry) {
+ const auto view = texture_cache.GetImageSurface(tic, entry);
+ ASSERT(view->IsBufferView());
+
+ update_descriptor_queue.AddTexelBuffer(view->GetBufferView());
+}
+
void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {
auto view = texture_cache.GetImageSurface(tic, entry);
@@ -1133,7 +1164,8 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima
UNIMPLEMENTED_IF(tic.IsBuffer());
- const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+ const VkImageView image_view =
+ view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
update_descriptor_queue.AddImage(image_view);
const auto image_layout = update_descriptor_queue.GetLastImageLayout();
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 0ed0e48c6..04be37a5e 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -193,12 +193,15 @@ private:
/// Setup global buffers in the graphics pipeline.
void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
- /// Setup texel buffers in the graphics pipeline.
- void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage);
+ /// Setup uniform texels in the graphics pipeline.
+ void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
/// Setup textures in the graphics pipeline.
void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage);
+ /// Setup storage texels in the graphics pipeline.
+ void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage);
+
/// Setup images in the graphics pipeline.
void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
@@ -209,11 +212,14 @@ private:
void SetupComputeGlobalBuffers(const ShaderEntries& entries);
/// Setup texel buffers in the compute pipeline.
- void SetupComputeTexelBuffers(const ShaderEntries& entries);
+ void SetupComputeUniformTexels(const ShaderEntries& entries);
/// Setup textures in the compute pipeline.
void SetupComputeTextures(const ShaderEntries& entries);
+ /// Setup storage texels in the compute pipeline.
+ void SetupComputeStorageTexels(const ShaderEntries& entries);
+
/// Setup images in the compute pipeline.
void SetupComputeImages(const ShaderEntries& entries);
@@ -222,10 +228,12 @@ private:
void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
- void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry);
+ void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry);
void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry);
+ void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry);
+
void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 890f34a2c..97429cc59 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -400,8 +400,9 @@ private:
u32 binding = specialization.base_binding;
binding = DeclareConstantBuffers(binding);
binding = DeclareGlobalBuffers(binding);
- binding = DeclareTexelBuffers(binding);
+ binding = DeclareUniformTexels(binding);
binding = DeclareSamplers(binding);
+ binding = DeclareStorageTexels(binding);
binding = DeclareImages(binding);
const Id main = OpFunction(t_void, {}, TypeFunction(t_void));
@@ -741,8 +742,10 @@ private:
if (!IsGenericAttribute(index)) {
continue;
}
-
const u32 location = GetGenericAttributeLocation(index);
+ if (!IsAttributeEnabled(location)) {
+ continue;
+ }
const auto type_descriptor = GetAttributeType(location);
Id type;
if (IsInputAttributeArray()) {
@@ -887,7 +890,7 @@ private:
return binding;
}
- u32 DeclareTexelBuffers(u32 binding) {
+ u32 DeclareUniformTexels(u32 binding) {
for (const auto& sampler : ir.GetSamplers()) {
if (!sampler.is_buffer) {
continue;
@@ -908,7 +911,7 @@ private:
Decorate(id, spv::Decoration::Binding, binding++);
Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
- texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id});
+ uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id});
}
return binding;
}
@@ -943,31 +946,48 @@ private:
return binding;
}
- u32 DeclareImages(u32 binding) {
+ u32 DeclareStorageTexels(u32 binding) {
for (const auto& image : ir.GetImages()) {
- const auto [dim, arrayed] = GetImageDim(image);
- constexpr int depth = 0;
- constexpr bool ms = false;
- constexpr int sampled = 2; // This won't be accessed with a sampler
- constexpr auto format = spv::ImageFormat::Unknown;
- const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
- const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
- const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
- AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
-
- Decorate(id, spv::Decoration::Binding, binding++);
- Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
- if (image.is_read && !image.is_written) {
- Decorate(id, spv::Decoration::NonWritable);
- } else if (image.is_written && !image.is_read) {
- Decorate(id, spv::Decoration::NonReadable);
+ if (image.type != Tegra::Shader::ImageType::TextureBuffer) {
+ continue;
}
+ DeclareImage(image, binding);
+ }
+ return binding;
+ }
- images.emplace(image.index, StorageImage{image_type, id});
+ u32 DeclareImages(u32 binding) {
+ for (const auto& image : ir.GetImages()) {
+ if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+ continue;
+ }
+ DeclareImage(image, binding);
}
return binding;
}
+ void DeclareImage(const Image& image, u32& binding) {
+ const auto [dim, arrayed] = GetImageDim(image);
+ constexpr int depth = 0;
+ constexpr bool ms = false;
+ constexpr int sampled = 2; // This won't be accessed with a sampler
+ const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown;
+ const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
+ const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
+ const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
+ AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
+
+ Decorate(id, spv::Decoration::Binding, binding++);
+ Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+ if (image.is_read && !image.is_written) {
+ Decorate(id, spv::Decoration::NonWritable);
+ } else if (image.is_written && !image.is_read) {
+ Decorate(id, spv::Decoration::NonReadable);
+ }
+
+ images.emplace(image.index, StorageImage{image_type, id});
+ }
+
bool IsRenderTargetEnabled(u32 rt) const {
for (u32 component = 0; component < 4; ++component) {
if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
@@ -986,6 +1006,10 @@ private:
return stage == ShaderType::TesselationControl;
}
+ bool IsAttributeEnabled(u32 location) const {
+ return stage != ShaderType::Vertex || specialization.enabled_attributes[location];
+ }
+
u32 GetNumInputVertices() const {
switch (stage) {
case ShaderType::Geometry:
@@ -1201,16 +1225,20 @@ private:
UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
return {v_float_zero, Type::Float};
default:
- if (IsGenericAttribute(attribute)) {
- const u32 location = GetGenericAttributeLocation(attribute);
- const auto type_descriptor = GetAttributeType(location);
- const Type type = type_descriptor.type;
- const Id attribute_id = input_attributes.at(attribute);
- const std::vector elements = {element};
- const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
- return {OpLoad(GetTypeDefinition(type), pointer), type};
+ if (!IsGenericAttribute(attribute)) {
+ break;
}
- break;
+ const u32 location = GetGenericAttributeLocation(attribute);
+ if (!IsAttributeEnabled(location)) {
+ // Disabled attributes (also known as constant attributes) always return zero.
+ return {v_float_zero, Type::Float};
+ }
+ const auto type_descriptor = GetAttributeType(location);
+ const Type type = type_descriptor.type;
+ const Id attribute_id = input_attributes.at(attribute);
+ const std::vector elements = {element};
+ const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
+ return {OpLoad(GetTypeDefinition(type), pointer), type};
}
UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
return {v_float_zero, Type::Float};
@@ -1246,7 +1274,7 @@ private:
} else {
UNREACHABLE_MSG("Unmanaged offset node type");
}
- pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index,
+ pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index,
buffer_element);
}
return {OpLoad(t_float, pointer), Type::Float};
@@ -1601,7 +1629,7 @@ private:
const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b);
const Id carry = OpCompositeExtract(t_uint, result, 1);
- return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool};
+ return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool};
}
Expression LogicalAssign(Operation operation) {
@@ -1664,7 +1692,7 @@ private:
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
const u32 index = meta.sampler.index;
if (meta.sampler.is_buffer) {
- const auto& entry = texel_buffers.at(index);
+ const auto& entry = uniform_texels.at(index);
return OpLoad(entry.image_type, entry.image);
} else {
const auto& entry = sampled_images.at(index);
@@ -1941,39 +1969,20 @@ private:
return {};
}
- Expression AtomicImageAdd(Operation operation) {
- UNIMPLEMENTED();
- return {};
- }
-
- Expression AtomicImageMin(Operation operation) {
- UNIMPLEMENTED();
- return {};
- }
-
- Expression AtomicImageMax(Operation operation) {
- UNIMPLEMENTED();
- return {};
- }
-
- Expression AtomicImageAnd(Operation operation) {
- UNIMPLEMENTED();
- return {};
- }
-
- Expression AtomicImageOr(Operation operation) {
- UNIMPLEMENTED();
- return {};
- }
+ template <Id (Module::*func)(Id, Id, Id, Id, Id)>
+ Expression AtomicImage(Operation operation) {
+ const auto& meta{std::get<MetaImage>(operation.GetMeta())};
+ ASSERT(meta.values.size() == 1);
- Expression AtomicImageXor(Operation operation) {
- UNIMPLEMENTED();
- return {};
- }
+ const Id coordinate = GetCoordinates(operation, Type::Int);
+ const Id image = images.at(meta.image.index).image;
+ const Id sample = v_uint_zero;
+ const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample);
- Expression AtomicImageExchange(Operation operation) {
- UNIMPLEMENTED();
- return {};
+ const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
+ const Id semantics = v_uint_zero;
+ const Id value = AsUint(Visit(meta.values[0]));
+ return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
}
template <Id (Module::*func)(Id, Id, Id, Id, Id)>
@@ -1988,7 +1997,7 @@ private:
return {v_float_zero, Type::Float};
}
const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
- const Id semantics = Constant(t_uint, 0);
+ const Id semantics = v_uint_zero;
const Id value = AsUint(Visit(operation[1]));
return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
@@ -2215,8 +2224,8 @@ private:
return {};
}
- Expression MemoryBarrierGL(Operation) {
- const auto scope = spv::Scope::Device;
+ template <spv::Scope scope>
+ Expression MemoryBarrier(Operation) {
const auto semantics =
spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory |
spv::MemorySemanticsMask::WorkgroupMemory |
@@ -2612,11 +2621,11 @@ private:
&SPIRVDecompiler::ImageLoad,
&SPIRVDecompiler::ImageStore,
- &SPIRVDecompiler::AtomicImageAdd,
- &SPIRVDecompiler::AtomicImageAnd,
- &SPIRVDecompiler::AtomicImageOr,
- &SPIRVDecompiler::AtomicImageXor,
- &SPIRVDecompiler::AtomicImageExchange,
+ &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>,
+ &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>,
+ &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>,
+ &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>,
+ &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>,
&SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
&SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
@@ -2681,7 +2690,8 @@ private:
&SPIRVDecompiler::ShuffleIndexed,
&SPIRVDecompiler::Barrier,
- &SPIRVDecompiler::MemoryBarrierGL,
+ &SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>,
+ &SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2757,8 +2767,11 @@ private:
Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct);
+ const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint);
+
const Id v_float_zero = Constant(t_float, 0.0f);
const Id v_float_one = Constant(t_float, 1.0f);
+ const Id v_uint_zero = Constant(t_uint, 0);
// Nvidia uses these defaults for varyings (e.g. position and generic attributes)
const Id v_varying_default =
@@ -2783,15 +2796,16 @@ private:
std::unordered_map<u8, GenericVaryingDescription> output_attributes;
std::map<u32, Id> constant_buffers;
std::map<GlobalMemoryBase, Id> global_buffers;
- std::map<u32, TexelBuffer> texel_buffers;
+ std::map<u32, TexelBuffer> uniform_texels;
std::map<u32, SampledImage> sampled_images;
+ std::map<u32, TexelBuffer> storage_texels;
std::map<u32, StorageImage> images;
+ std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
Id instance_index{};
Id vertex_index{};
Id base_instance{};
Id base_vertex{};
- std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
Id frag_depth{};
Id frag_coord{};
Id front_facing{};
@@ -3047,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
}
for (const auto& sampler : ir.GetSamplers()) {
if (sampler.is_buffer) {
- entries.texel_buffers.emplace_back(sampler);
+ entries.uniform_texels.emplace_back(sampler);
} else {
entries.samplers.emplace_back(sampler);
}
}
for (const auto& image : ir.GetImages()) {
- entries.images.emplace_back(image);
+ if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+ entries.storage_texels.emplace_back(image);
+ } else {
+ entries.images.emplace_back(image);
+ }
}
for (const auto& attribute : ir.GetInputAttributes()) {
if (IsGenericAttribute(attribute)) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index f4c05ac3c..2b0e90396 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -21,8 +21,9 @@ class VKDevice;
namespace Vulkan {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using TexelBufferEntry = VideoCommon::Shader::Sampler;
+using UniformTexelEntry = VideoCommon::Shader::Sampler;
using SamplerEntry = VideoCommon::Shader::Sampler;
+using StorageTexelEntry = VideoCommon::Shader::Image;
using ImageEntry = VideoCommon::Shader::Image;
constexpr u32 DESCRIPTOR_SET = 0;
@@ -66,13 +67,15 @@ private:
struct ShaderEntries {
u32 NumBindings() const {
return static_cast<u32>(const_buffers.size() + global_buffers.size() +
- texel_buffers.size() + samplers.size() + images.size());
+ uniform_texels.size() + samplers.size() + storage_texels.size() +
+ images.size());
}
std::vector<ConstBufferEntry> const_buffers;
std::vector<GlobalBufferEntry> global_buffers;
- std::vector<TexelBufferEntry> texel_buffers;
+ std::vector<UniformTexelEntry> uniform_texels;
std::vector<SamplerEntry> samplers;
+ std::vector<StorageTexelEntry> storage_texels;
std::vector<ImageEntry> images;
std::set<u32> attributes;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
@@ -88,7 +91,8 @@ struct Specialization final {
u32 shared_memory_size{};
// Graphics specific
- std::optional<float> point_size{};
+ std::optional<float> point_size;
+ std::bitset<Maxwell::NumVertexAttributes> enabled_attributes;
std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
bool ndc_minus_one_to_one{};
};
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 55f43e61b..430031665 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params,
ci.pNext = nullptr;
ci.flags = 0;
ci.size = static_cast<VkDeviceSize>(host_memory_size);
- ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
- VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+ ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
+ VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
ci.queueFamilyIndexCount = 0;
ci.pQueueFamilyIndices = nullptr;
@@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
ci.extent = {params.width, params.height, 1};
break;
case SurfaceTarget::Texture3D:
+ ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
ci.extent = {params.width, params.height, params.depth};
break;
case SurfaceTarget::TextureBuffer:
@@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
return ci;
}
+u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source,
+ Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) {
+ return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+ (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
} // Anonymous namespace
CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
@@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
}
// TODO(Rodrigo): Move this to a virtual function.
- main_view = CreateViewInner(
- ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels),
- true);
+ u32 num_layers = 1;
+ if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+ num_layers = params.depth;
+ }
+ main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels));
}
CachedSurface::~CachedSurface() = default;
@@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() {
}
View CachedSurface::CreateView(const ViewParams& params) {
- return CreateViewInner(params, false);
-}
-
-View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) {
// TODO(Rodrigo): Add name decorations
- return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy);
+ return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params);
}
void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) {
@@ -342,38 +347,44 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const {
}
CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
- const ViewParams& params, bool is_proxy)
+ const ViewParams& params)
: VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()},
image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()},
aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface},
- base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level},
- num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target)
- : VK_IMAGE_VIEW_TYPE_1D} {}
+ base_level{params.base_level}, num_levels{params.num_levels},
+ image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} {
+ if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ base_layer = 0;
+ num_layers = 1;
+ base_slice = params.base_layer;
+ num_slices = params.num_layers;
+ } else {
+ base_layer = params.base_layer;
+ num_layers = params.num_layers;
+ }
+}
CachedSurfaceView::~CachedSurfaceView() = default;
-VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
- SwizzleSource z_source, SwizzleSource w_source) {
- const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
- if (last_image_view && last_swizzle == swizzle) {
+VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source,
+ SwizzleSource z_source, SwizzleSource w_source) {
+ const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+ if (last_image_view && last_swizzle == new_swizzle) {
return last_image_view;
}
- last_swizzle = swizzle;
+ last_swizzle = new_swizzle;
- const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle);
+ const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
auto& image_view = entry->second;
if (!is_cache_miss) {
return last_image_view = *image_view;
}
- auto swizzle_x = MaxwellToVK::SwizzleSource(x_source);
- auto swizzle_y = MaxwellToVK::SwizzleSource(y_source);
- auto swizzle_z = MaxwellToVK::SwizzleSource(z_source);
- auto swizzle_w = MaxwellToVK::SwizzleSource(w_source);
-
+ std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source),
+ MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)};
if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) {
// A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here.
- std::swap(swizzle_x, swizzle_z);
+ std::swap(swizzle[0], swizzle[2]);
}
// Games can sample depth or stencil values on textures. This is decided by the swizzle value on
@@ -395,11 +406,16 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
UNIMPLEMENTED();
}
- // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity
- swizzle_x = VK_COMPONENT_SWIZZLE_R;
- swizzle_y = VK_COMPONENT_SWIZZLE_G;
- swizzle_z = VK_COMPONENT_SWIZZLE_B;
- swizzle_w = VK_COMPONENT_SWIZZLE_A;
+ // Make sure we sample the first component
+ std::transform(
+ swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) {
+ return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component;
+ });
+ }
+
+ if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ ASSERT(base_slice == 0);
+ ASSERT(num_slices == params.depth);
}
VkImageViewCreateInfo ci;
@@ -409,7 +425,7 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
ci.image = surface.GetImageHandle();
ci.viewType = image_view_type;
ci.format = surface.GetImage().GetFormat();
- ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
+ ci.components = {swizzle[0], swizzle[1], swizzle[2], swizzle[3]};
ci.subresourceRange.aspectMask = aspect;
ci.subresourceRange.baseMipLevel = base_level;
ci.subresourceRange.levelCount = num_levels;
@@ -420,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
return last_image_view = *image_view;
}
+VkImageView CachedSurfaceView::GetAttachment() {
+ if (render_target) {
+ return *render_target;
+ }
+
+ VkImageViewCreateInfo ci;
+ ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+ ci.pNext = nullptr;
+ ci.flags = 0;
+ ci.image = surface.GetImageHandle();
+ ci.format = surface.GetImage().GetFormat();
+ ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
+ VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY};
+ ci.subresourceRange.aspectMask = aspect_mask;
+ ci.subresourceRange.baseMipLevel = base_level;
+ ci.subresourceRange.levelCount = num_levels;
+ if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D;
+ ci.subresourceRange.baseArrayLayer = base_slice;
+ ci.subresourceRange.layerCount = num_slices;
+ } else {
+ ci.viewType = image_view_type;
+ ci.subresourceRange.baseArrayLayer = base_layer;
+ ci.subresourceRange.layerCount = num_layers;
+ }
+ render_target = device.GetLogical().CreateImageView(ci);
+ return *render_target;
+}
+
VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
const VKDevice& device, VKResourceManager& resource_manager,
VKMemoryManager& memory_manager, VKScheduler& scheduler,
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index f211ccb1e..807e26c8a 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -91,7 +91,6 @@ protected:
void DecorateSurfaceName();
View CreateView(const ViewParams& params) override;
- View CreateViewInner(const ViewParams& params, bool is_proxy);
private:
void UploadBuffer(const std::vector<u8>& staging_buffer);
@@ -120,23 +119,20 @@ private:
class CachedSurfaceView final : public VideoCommon::ViewBase {
public:
explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
- const ViewParams& params, bool is_proxy);
+ const ViewParams& params);
~CachedSurfaceView();
- VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source,
- Tegra::Texture::SwizzleSource y_source,
- Tegra::Texture::SwizzleSource z_source,
- Tegra::Texture::SwizzleSource w_source);
+ VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source,
+ Tegra::Texture::SwizzleSource y_source,
+ Tegra::Texture::SwizzleSource z_source,
+ Tegra::Texture::SwizzleSource w_source);
+
+ VkImageView GetAttachment();
bool IsSameSurface(const CachedSurfaceView& rhs) const {
return &surface == &rhs.surface;
}
- VkImageView GetHandle() {
- return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G,
- Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A);
- }
-
u32 GetWidth() const {
return params.GetMipWidth(base_level);
}
@@ -180,14 +176,6 @@ public:
}
private:
- static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
- Tegra::Texture::SwizzleSource y_source,
- Tegra::Texture::SwizzleSource z_source,
- Tegra::Texture::SwizzleSource w_source) {
- return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
- (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
- }
-
// Store a copy of these values to avoid double dereference when reading them
const SurfaceParams params;
const VkImage image;
@@ -196,15 +184,18 @@ private:
const VKDevice& device;
CachedSurface& surface;
- const u32 base_layer;
- const u32 num_layers;
const u32 base_level;
const u32 num_levels;
const VkImageViewType image_view_type;
+ u32 base_layer = 0;
+ u32 num_layers = 0;
+ u32 base_slice = 0;
+ u32 num_slices = 0;
VkImageView last_image_view = nullptr;
u32 last_swizzle = 0;
+ vk::ImageView render_target;
std::unordered_map<u32, vk::ImageView> view_cache;
};
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index 694b325e1..c0a8f233f 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -83,7 +83,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
return Operation(OperationCode::YNegate);
case SystemVariable::InvocationInfo:
LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
- return Immediate(0U);
+ return Immediate(0x00ff'0000U);
case SystemVariable::WscaleFactorXY:
UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented");
return Immediate(0U);
@@ -299,9 +299,19 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
break;
}
case OpCode::Id::MEMBAR: {
- UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
- bb.push_back(Operation(OperationCode::MemoryBarrierGL));
+ const OperationCode type = [instr] {
+ switch (instr.membar.type) {
+ case Tegra::Shader::MembarType::CTA:
+ return OperationCode::MemoryBarrierGroup;
+ case Tegra::Shader::MembarType::GL:
+ return OperationCode::MemoryBarrierGlobal;
+ default:
+ UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value()));
+ return OperationCode::MemoryBarrierGlobal;
+ }
+ }();
+ bb.push_back(Operation(type));
break;
}
case OpCode::Id::DEPBAR: {
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c06512413..c5e5165ff 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -233,8 +233,9 @@ enum class OperationCode {
ThreadLtMask, /// () -> uint
ShuffleIndexed, /// (uint value, uint index) -> uint
- Barrier, /// () -> void
- MemoryBarrierGL, /// () -> void
+ Barrier, /// () -> void
+ MemoryBarrierGroup, /// () -> void
+ MemoryBarrierGlobal, /// () -> void
Amount,
};
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 7032e0059..f476f03b0 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -41,7 +41,7 @@ struct Table {
ComponentType alpha_component;
bool is_srgb;
};
-constexpr std::array<Table, 77> DefinitionTable = {{
+constexpr std::array<Table, 78> DefinitionTable = {{
{TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
{TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
{TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
@@ -98,6 +98,7 @@ constexpr std::array<Table, 77> DefinitionTable = {{
{TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F},
{TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16},
{TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
+ {TextureFormat::G24R8, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
{TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8},
{TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1},
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 715f39d0d..94d3a6ae5 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -248,12 +248,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
// Use an extra temporal buffer
auto& tmp_buffer = staging_cache.GetBuffer(1);
- // Special case for 3D Texture Segments
- const bool must_read_current_data =
- params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D;
tmp_buffer.resize(guest_memory_size);
host_ptr = tmp_buffer.data();
- if (must_read_current_data) {
+
+ if (params.target == SurfaceTarget::Texture3D) {
+ // Special case for 3D texture segments
memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);
}
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index 79e10ffbb..173f2edba 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -217,8 +217,8 @@ public:
}
bool IsProtected() const {
- // Only 3D Slices are to be protected
- return is_target && params.block_depth > 0;
+ // Only 3D slices are to be protected
+ return is_target && params.target == SurfaceTarget::Texture3D;
}
bool IsRenderTarget() const {
@@ -250,6 +250,11 @@ public:
return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));
}
+ TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) {
+ return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth,
+ base_level, num_levels));
+ }
+
std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,
const GPUVAddr view_addr,
const std::size_t candidate_size, const u32 mipmap,
@@ -272,8 +277,8 @@ public:
std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,
const std::size_t candidate_size) {
if (params.target == SurfaceTarget::Texture3D ||
- (params.num_levels == 1 && !params.is_layered) ||
- view_params.target == SurfaceTarget::Texture3D) {
+ view_params.target == SurfaceTarget::Texture3D ||
+ (params.num_levels == 1 && !params.is_layered)) {
return {};
}
const auto layer_mipmap{GetLayerMipmap(view_addr)};
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 884fabffe..0b2b2b8c4 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz
params.num_levels = 1;
params.emulated_levels = 1;
- const bool is_layered = config.layers > 1 && params.block_depth == 0;
- params.is_layered = is_layered;
- params.depth = is_layered ? config.layers.Value() : 1;
- params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D;
+ if (config.memory_layout.is_3d != 0) {
+ params.depth = config.layers.Value();
+ params.is_layered = false;
+ params.target = SurfaceTarget::Texture3D;
+ } else if (config.layers > 1) {
+ params.depth = config.layers.Value();
+ params.is_layered = true;
+ params.target = SurfaceTarget::Texture2DArray;
+ } else {
+ params.depth = 1;
+ params.is_layered = false;
+ params.target = SurfaceTarget::Texture2D;
+ }
return params;
}
@@ -237,7 +246,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
params.width = config.width;
params.height = config.height;
params.pitch = config.pitch;
- // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters
+ // TODO(Rodrigo): Try to guess texture arrays from parameters
params.target = SurfaceTarget::Texture2D;
params.depth = 1;
params.num_levels = 1;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d6efc34b2..b543fc8c0 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -14,6 +14,7 @@
#include <unordered_map>
#include <vector>
+#include <boost/container/small_vector.hpp>
#include <boost/icl/interval_map.hpp>
#include <boost/range/iterator_range.hpp>
@@ -53,6 +54,7 @@ using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
template <typename TSurface, typename TView>
class TextureCache {
+ using VectorSurface = boost::container::small_vector<TSurface, 1>;
public:
void InvalidateRegion(VAddr addr, std::size_t size) {
@@ -296,30 +298,30 @@ public:
const GPUVAddr src_gpu_addr = src_config.Address();
const GPUVAddr dst_gpu_addr = dst_config.Address();
DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr);
- const std::optional<VAddr> dst_cpu_addr =
- system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr);
- const std::optional<VAddr> src_cpu_addr =
- system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr);
- std::pair<TSurface, TView> dst_surface =
- GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
- std::pair<TSurface, TView> src_surface =
- GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false);
- ImageBlit(src_surface.second, dst_surface.second, copy_config);
+
+ const auto& memory_manager = system.GPU().MemoryManager();
+ const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr);
+ const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr);
+ std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
+ TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second;
+ ImageBlit(src_surface, dst_surface.second, copy_config);
dst_surface.first->MarkAsModified(true, Tick());
}
- TSurface TryFindFramebufferSurface(VAddr addr) {
+ TSurface TryFindFramebufferSurface(VAddr addr) const {
if (!addr) {
return nullptr;
}
const VAddr page = addr >> registry_page_bits;
- std::vector<TSurface>& list = registry[page];
- for (auto& surface : list) {
- if (surface->GetCpuAddr() == addr) {
- return surface;
- }
+ const auto it = registry.find(page);
+ if (it == registry.end()) {
+ return nullptr;
}
- return nullptr;
+ const auto& list = it->second;
+ const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) {
+ return surface->GetCpuAddr() == addr;
+ });
+ return found != list.end() ? *found : nullptr;
}
u64 Tick() {
@@ -498,18 +500,18 @@ private:
* @param untopological Indicates to the recycler that the texture has no way
* to match the overlaps due to topological reasons.
**/
- RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params,
+ RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params,
const GPUVAddr gpu_addr, const MatchTopologyResult untopological) {
if (Settings::IsGPULevelExtreme()) {
return RecycleStrategy::Flush;
}
// 3D Textures decision
- if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) {
+ if (params.target == SurfaceTarget::Texture3D) {
return RecycleStrategy::Flush;
}
for (const auto& s : overlaps) {
const auto& s_params = s->GetSurfaceParams();
- if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) {
+ if (s_params.target == SurfaceTarget::Texture3D) {
return RecycleStrategy::Flush;
}
}
@@ -538,9 +540,8 @@ private:
* @param untopological Indicates to the recycler that the texture has no way to match the
* overlaps due to topological reasons.
**/
- std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
- const SurfaceParams& params, const GPUVAddr gpu_addr,
- const bool preserve_contents,
+ std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params,
+ const GPUVAddr gpu_addr, const bool preserve_contents,
const MatchTopologyResult untopological) {
const bool do_load = preserve_contents && Settings::IsGPULevelExtreme();
for (auto& surface : overlaps) {
@@ -650,47 +651,65 @@ private:
* @param params The parameters on the new surface.
* @param gpu_addr The starting address of the new surface.
**/
- std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps,
+ std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
const SurfaceParams& params,
- const GPUVAddr gpu_addr) {
+ GPUVAddr gpu_addr) {
if (params.target == SurfaceTarget::Texture3D) {
- return {};
+ return std::nullopt;
}
- bool modified = false;
+ const auto test_modified = [](TSurface& surface) { return surface->IsModified(); };
TSurface new_surface = GetUncachedSurface(gpu_addr, params);
- u32 passed_tests = 0;
+
+ if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) {
+ LoadSurface(new_surface);
+ for (const auto& surface : overlaps) {
+ Unregister(surface);
+ }
+ Register(new_surface);
+ return {{new_surface, new_surface->GetMainView()}};
+ }
+
+ std::size_t passed_tests = 0;
for (auto& surface : overlaps) {
const SurfaceParams& src_params = surface->GetSurfaceParams();
- if (src_params.is_layered || src_params.num_levels > 1) {
- // We send this cases to recycle as they are more complex to handle
- return {};
- }
- const std::size_t candidate_size = surface->GetSizeInBytes();
- auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
+ const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
if (!mipmap_layer) {
continue;
}
- const auto [layer, mipmap] = *mipmap_layer;
- if (new_surface->GetMipmapSize(mipmap) != candidate_size) {
+ const auto [base_layer, base_mipmap] = *mipmap_layer;
+ if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) {
continue;
}
- modified |= surface->IsModified();
- // Now we got all the data set up
- const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
- const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
- const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1);
- passed_tests++;
- ImageCopy(surface, new_surface, copy_params);
+ ++passed_tests;
+
+ // Copy all mipmaps and layers
+ const u32 block_width = params.GetDefaultBlockWidth();
+ const u32 block_height = params.GetDefaultBlockHeight();
+ for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) {
+ const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
+ const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
+ if (width < block_width || height < block_height) {
+ // Current APIs forbid copying small compressed textures, avoid errors
+ break;
+ }
+ const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
+ src_params.depth);
+ ImageCopy(surface, new_surface, copy_params);
+ }
}
if (passed_tests == 0) {
- return {};
+ return std::nullopt;
+ }
+ if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
// In Accurate GPU all tests should pass, else we recycle
- } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
- return {};
+ return std::nullopt;
}
+
+ const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified);
for (const auto& surface : overlaps) {
Unregister(surface);
}
+
new_surface->MarkAsModified(modified, Tick());
Register(new_surface);
return {{new_surface, new_surface->GetMainView()}};
@@ -708,53 +727,11 @@ private:
* @param preserve_contents Indicates that the new surface should be loaded from memory or
* left blank.
*/
- std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
+ std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
const SurfaceParams& params,
- const GPUVAddr gpu_addr,
- const VAddr cpu_addr,
+ GPUVAddr gpu_addr, VAddr cpu_addr,
bool preserve_contents) {
- if (params.target == SurfaceTarget::Texture3D) {
- bool failed = false;
- if (params.num_levels > 1) {
- // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
- return std::nullopt;
- }
- TSurface new_surface = GetUncachedSurface(gpu_addr, params);
- bool modified = false;
- for (auto& surface : overlaps) {
- const SurfaceParams& src_params = surface->GetSurfaceParams();
- if (src_params.target != SurfaceTarget::Texture2D) {
- failed = true;
- break;
- }
- if (src_params.height != params.height) {
- failed = true;
- break;
- }
- if (src_params.block_depth != params.block_depth ||
- src_params.block_height != params.block_height) {
- failed = true;
- break;
- }
- const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
- const auto offsets = params.GetBlockOffsetXYZ(offset);
- const auto z = std::get<2>(offsets);
- modified |= surface->IsModified();
- const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height,
- 1);
- ImageCopy(surface, new_surface, copy_params);
- }
- if (failed) {
- return std::nullopt;
- }
- for (const auto& surface : overlaps) {
- Unregister(surface);
- }
- new_surface->MarkAsModified(modified, Tick());
- Register(new_surface);
- auto view = new_surface->GetMainView();
- return {{std::move(new_surface), view}};
- } else {
+ if (params.target != SurfaceTarget::Texture3D) {
for (const auto& surface : overlaps) {
if (!surface->MatchTarget(params.target)) {
if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) {
@@ -770,11 +747,60 @@ private:
continue;
}
if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) {
- return {{surface, surface->GetMainView()}};
+ return std::make_pair(surface, surface->GetMainView());
}
}
return InitializeSurface(gpu_addr, params, preserve_contents);
}
+
+ if (params.num_levels > 1) {
+ // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
+ return std::nullopt;
+ }
+
+ if (overlaps.size() == 1) {
+ const auto& surface = overlaps[0];
+ const SurfaceParams& overlap_params = surface->GetSurfaceParams();
+ // Don't attempt to render to textures with more than one level for now
+ // The texture has to be to the right or the sample address if we want to render to it
+ if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) {
+ const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr());
+ const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+ if (slice < overlap_params.depth) {
+ auto view = surface->Emplace3DView(slice, params.depth, 0, 1);
+ return std::make_pair(std::move(surface), std::move(view));
+ }
+ }
+ }
+
+ TSurface new_surface = GetUncachedSurface(gpu_addr, params);
+ bool modified = false;
+
+ for (auto& surface : overlaps) {
+ const SurfaceParams& src_params = surface->GetSurfaceParams();
+ if (src_params.target != SurfaceTarget::Texture2D ||
+ src_params.height != params.height ||
+ src_params.block_depth != params.block_depth ||
+ src_params.block_height != params.block_height) {
+ return std::nullopt;
+ }
+ modified |= surface->IsModified();
+
+ const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
+ const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+ const u32 width = params.width;
+ const u32 height = params.height;
+ const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1);
+ ImageCopy(surface, new_surface, copy_params);
+ }
+ for (const auto& surface : overlaps) {
+ Unregister(surface);
+ }
+ new_surface->MarkAsModified(modified, Tick());
+ Register(new_surface);
+
+ TView view = new_surface->GetMainView();
+ return std::make_pair(std::move(new_surface), std::move(view));
}
/**
@@ -810,7 +836,7 @@ private:
TSurface& current_surface = iter->second;
const auto topological_result = current_surface->MatchesTopology(params);
if (topological_result != MatchTopologyResult::FullMatch) {
- std::vector<TSurface> overlaps{current_surface};
+ VectorSurface overlaps{current_surface};
return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
topological_result);
}
@@ -852,7 +878,7 @@ private:
}
}
- // Check if it's a 3D texture
+ // Manage 3D textures
if (params.block_depth > 0) {
auto surface =
Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
@@ -868,12 +894,9 @@ private:
// two things either the candidate surface is a supertexture of the overlap
// or they don't match in any known way.
if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) {
- if (current_surface->GetGpuAddr() == gpu_addr) {
- std::optional<std::pair<TSurface, TView>> view =
- TryReconstructSurface(overlaps, params, gpu_addr);
- if (view) {
- return *view;
- }
+ const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr);
+ if (view) {
+ return *view;
}
return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
MatchTopologyResult::FullMatch);
@@ -991,7 +1014,9 @@ private:
params.target = target;
params.is_tiled = false;
params.srgb_conversion = false;
- params.is_layered = false;
+ params.is_layered =
+ target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray ||
+ target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray;
params.block_width = 0;
params.block_height = 0;
params.block_depth = 0;
@@ -1124,23 +1149,25 @@ private:
}
}
- std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
+ VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
if (size == 0) {
return {};
}
const VAddr cpu_addr_end = cpu_addr + size;
- VAddr start = cpu_addr >> registry_page_bits;
const VAddr end = (cpu_addr_end - 1) >> registry_page_bits;
- std::vector<TSurface> surfaces;
- while (start <= end) {
- std::vector<TSurface>& list = registry[start];
- for (auto& surface : list) {
- if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) {
- surface->MarkAsPicked(true);
- surfaces.push_back(surface);
+ VectorSurface surfaces;
+ for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) {
+ const auto it = registry.find(start);
+ if (it == registry.end()) {
+ continue;
+ }
+ for (auto& surface : it->second) {
+ if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) {
+ continue;
}
+ surface->MarkAsPicked(true);
+ surfaces.push_back(surface);
}
- start++;
}
for (auto& surface : surfaces) {
surface->MarkAsPicked(false);
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 1adf8932b..1f5e43043 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -106,6 +106,9 @@ public:
format.setVersion(4, 3);
format.setProfile(QSurfaceFormat::CompatibilityProfile);
format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions);
+ if (Settings::values.renderer_debug) {
+ format.setOption(QSurfaceFormat::FormatOption::DebugContext);
+ }
// TODO: expose a setting for buffer value (ie default/single/double/triple)
format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior);
format.setSwapInterval(0);
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index b08b87426..7e9073cc3 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() {
Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool();
Settings::values.disable_cpu_opt =
ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool();
+ Settings::values.disable_macro_jit =
+ ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool();
qt_config->endGroup();
}
@@ -1011,6 +1013,7 @@ void Config::SaveDebuggingValues() {
WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false);
WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false);
WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false);
+ WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false);
qt_config->endGroup();
}
diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp
index c2026763e..2c77441fd 100644
--- a/src/yuzu/configuration/configure_debug.cpp
+++ b/src/yuzu/configuration/configure_debug.cpp
@@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() {
ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt);
ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn());
ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug);
+ ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn());
+ ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit);
}
void ConfigureDebug::ApplyConfiguration() {
@@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() {
Settings::values.quest_flag = ui->quest_flag->isChecked();
Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked();
Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
+ Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();
Debugger::ToggleConsole();
Log::Filter filter;
filter.ParseFilterString(Settings::values.log_filter);
diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui
index e0d4c4a44..46f0208c6 100644
--- a/src/yuzu/configuration/configure_debug.ui
+++ b/src/yuzu/configuration/configure_debug.ui
@@ -148,6 +148,19 @@
</property>
</widget>
</item>
+ <item>
+ <widget class="QCheckBox" name="disable_macro_jit">
+ <property name="enabled">
+ <bool>true</bool>
+ </property>
+ <property name="whatsThis">
+ <string>When checked, it disables the macro Just In Time compiler. Enabled this makes games run slower</string>
+ </property>
+ <property name="text">
+ <string>Disable Macro JIT</string>
+ </property>
+ </widget>
+ </item>
</layout>
</widget>
</item>
diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp
index e4eb5594b..a05fa64ba 100644
--- a/src/yuzu/configuration/configure_input_player.cpp
+++ b/src/yuzu/configuration/configure_input_player.cpp
@@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() {
SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]);
}
}
+
UpdateButtonLabels();
+ ApplyConfiguration();
}
void ConfigureInputPlayer::ClearAll() {
@@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() {
}
UpdateButtonLabels();
+ ApplyConfiguration();
}
void ConfigureInputPlayer::UpdateButtonLabels() {
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index c20d48c42..7240270f5 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -432,6 +432,8 @@ void Config::ReadValues() {
Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false);
Settings::values.disable_cpu_opt =
sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false);
+ Settings::values.disable_macro_jit =
+ sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false);
const auto title_list = sdl2_config->Get("AddOns", "title_ids", "");
std::stringstream ss(title_list);
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index abc6e6e65..6f53e9659 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -291,6 +291,8 @@ quest_flag =
# Determines whether or not JIT CPU optimizations are enabled
# false: Optimizations Enabled, true: Optimizations Disabled
disable_cpu_opt =
+# Enables/Disables the macro JIT compiler
+disable_macro_jit=false
[WebService]
# Whether or not to enable telemetry
diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
index 411e7e647..09cc0a3b5 100644
--- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
+++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
@@ -98,6 +98,9 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen)
SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8);
SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0);
SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
+ if (Settings::values.renderer_debug) {
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+ }
SDL_GL_SetSwapInterval(0);
std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname,