30 files changed, 1264 insertions, 314 deletions
diff --git a/externals/sirit b/externals/sirit -Subproject a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167c +Subproject eefca56afd49379bdebc97ded8b480839f93088 diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp index 00860fcbd..ef5e19e63 100644 --- a/src/core/hle/kernel/readable_event.cpp +++ b/src/core/hle/kernel/readable_event.cpp @@ -38,7 +38,7 @@ void ReadableEvent::Clear() { ResultCode ReadableEvent::Reset() { if (!is_signaled) { - LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}", + LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}", GetObjectId(), GetTypeName(), GetName()); return ERR_INVALID_STATE; } diff --git a/src/core/settings.h b/src/core/settings.h index 78eb33737..36cd66fd4 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -474,6 +474,7 @@ struct Values { bool reporting_services; bool quest_flag; bool disable_cpu_opt; + bool disable_macro_jit; // BCAT std::string bcat_backend; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index d6ee82836..2bf8d68ce 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -25,6 +25,12 @@ add_library(video_core STATIC engines/shader_bytecode.h engines/shader_header.h engines/shader_type.h + macro/macro.cpp + macro/macro.h + macro/macro_interpreter.cpp + macro/macro_interpreter.h + macro/macro_jit_x64.cpp + macro/macro_jit_x64.h fence_manager.h gpu.cpp gpu.h @@ -36,8 +42,6 @@ add_library(video_core STATIC gpu_thread.h guest_driver.cpp guest_driver.h - macro_interpreter.cpp - macro_interpreter.h memory_manager.cpp memory_manager.h morton.cpp diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 13ef2e42d..e46b153f9 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, - macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { + macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} { dirty.flags.flip(); - InitializeRegisterDefaults(); } @@ -120,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { +void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) { // Reset the current macro. executing_macro = 0; @@ -129,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3 ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size()); // Execute the current macro. 
- macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters); + macro_engine->Execute(macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } } @@ -165,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Call the macro when there are no more parameters in the command buffer if (is_last_call) { - CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); + CallMacroMethod(executing_macro, macro_params); macro_params.clear(); } return; @@ -201,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { break; } case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(arg); + macro_engine->AddCode(regs.macros.upload_address, arg); break; } case MAXWELL3D_REG_INDEX(macros.bind): { @@ -310,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Call the macro when there are no more parameters in the command buffer if (amount == methods_pending) { - CallMacroMethod(executing_macro, macro_params); + CallMacroMethod(executing_macro, macro_params); macro_params.clear(); } return; @@ -424,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() { } void Maxwell3D::ProcessMacroUpload(u32 data) { - ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), - "upload_address exceeded macro_memory size!"); - macro_memory[regs.macros.upload_address++] = data; + macro_engine->AddCode(regs.macros.upload_address++, data); } void Maxwell3D::ProcessMacroBind(u32 data) { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 05dd6b39b..b827b112f 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -23,7 +23,7 @@ #include "video_core/engines/engine_upload.h" #include "video_core/engines/shader_type.h" #include "video_core/gpu.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro.h" #include "video_core/textures/texture.h" namespace Core { @@ -1411,15 +1411,6 @@ public: const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; - /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than - /// we've seen used. - using MacroMemory = std::array<u32, 0x40000>; - - /// Gets a reference to macro memory. - const MacroMemory& GetMacroMemory() const { - return macro_memory; - } - bool ShouldExecute() const { return execute_on; } @@ -1468,16 +1459,13 @@ private: std::array<bool, Regs::NUM_REGS> mme_inline{}; - /// Memory for macro code - MacroMemory macro_memory; - /// Macro method that is currently being executed / being fed parameters. u32 executing_macro = 0; /// Parameters that have been submitted to the macro call so far. std::vector<u32> macro_params; - /// Interpreter for the macro codes uploaded to the GPU. - MacroInterpreter macro_interpreter; + /// Engine for executing the macro codes uploaded to the GPU. + std::unique_ptr<MacroEngine> macro_engine; static constexpr u32 null_cb_data = 0xFFFFFFFF; struct { @@ -1506,7 +1494,7 @@ private: - * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters); + void CallMacroMethod(u32 method, const std::vector<u32>& parameters); /// Handles writes to the macro uploading register.
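// Illustrative sketch (not part of the diff; the table size 0x80 and helper
// names are assumptions): the dispatch flow above in isolation. The frontend
// latches a macro slot on the first write, buffers parameters until the last
// argument arrives, then hands the whole vector to the engine.
#include <cstdint>
#include <vector>

struct MacroFrontendSketch {
    static constexpr uint32_t MacroRegistersStart = 0xE00;
    uint32_t executing_macro = 0;
    std::vector<uint32_t> macro_params;

    void CallMethod(uint32_t method, uint32_t argument, bool is_last_call) {
        if (executing_macro == 0) {
            executing_macro = method; // even methods select the macro slot
        }
        macro_params.push_back(argument);
        if (is_last_call) {
            // Each macro occupies a (call, argument) method pair, hence >> 1.
            const uint32_t entry = ((executing_macro - MacroRegistersStart) >> 1) % 0x80;
            Execute(entry, macro_params); // stand-in for macro_engine->Execute
            executing_macro = 0;
            macro_params.clear();
        }
    }
    void Execute(uint32_t, const std::vector<uint32_t>&) {}
};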
void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp new file mode 100644 index 000000000..89077a2d8 --- /dev/null +++ b/src/video_core/macro/macro.cpp @@ -0,0 +1,45 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/logging/log.h" +#include "core/settings.h" +#include "video_core/macro/macro.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +namespace Tegra { + +void MacroEngine::AddCode(u32 method, u32 data) { + uploaded_macro_code[method].push_back(data); +} + +void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { + auto compiled_macro = macro_cache.find(method); + if (compiled_macro != macro_cache.end()) { + compiled_macro->second->Execute(parameters, method); + } else { + // Macro not compiled, check if it's uploaded and if so, compile it + auto macro_code = uploaded_macro_code.find(method); + if (macro_code == uploaded_macro_code.end()) { + UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); + return; + } + macro_cache[method] = Compile(macro_code->second); + macro_cache[method]->Execute(parameters, method); + } +} + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) { + if (Settings::values.disable_macro_jit) { + return std::make_unique<MacroInterpreter>(maxwell3d); + } +#ifdef ARCHITECTURE_x86_64 + return std::make_unique<MacroJITx64>(maxwell3d); +#else + return std::make_unique<MacroInterpreter>(maxwell3d); +#endif +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h new file mode 100644 index 000000000..b76ed891f --- /dev/null +++ b/src/video_core/macro/macro.h @@ -0,0 +1,128 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Tegra { +namespace Engines { +class Maxwell3D; +} +namespace Macro { +constexpr std::size_t NUM_MACRO_REGISTERS = 8; +enum class Operation : u32 { + ALU = 0, + AddImmediate = 1, + ExtractInsert = 2, + ExtractShiftLeftImmediate = 3, + ExtractShiftLeftRegister = 4, + Read = 5, + Unused = 6, // This operation doesn't seem to be a valid encoding. + Branch = 7, +}; + +enum class ALUOperation : u32 { + Add = 0, + AddWithCarry = 1, + Subtract = 2, + SubtractWithBorrow = 3, + // Operations 4-7 don't seem to be valid encodings. + Xor = 8, + Or = 9, + And = 10, + AndNot = 11, + Nand = 12 +}; + +enum class ResultOperation : u32 { + IgnoreAndFetch = 0, + Move = 1, + MoveAndSetMethod = 2, + FetchAndSend = 3, + MoveAndSend = 4, + FetchAndSetMethod = 5, + MoveAndSetMethodFetchAndSend = 6, + MoveAndSetMethodSend = 7 +}; + +enum class BranchCondition : u32 { + Zero = 0, + NotZero = 1, +}; + +union Opcode { + u32 raw; + BitField<0, 3, Operation> operation; + BitField<4, 3, ResultOperation> result_operation; + BitField<4, 1, BranchCondition> branch_condition; + // If set on a branch, then the branch doesn't have a delay slot. + BitField<5, 1, u32> branch_annul; + BitField<7, 1, u32> is_exit; + BitField<8, 3, u32> dst; + BitField<11, 3, u32> src_a; + BitField<14, 3, u32> src_b; + // The signed immediate overlaps the second source operand and the alu operation. 
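// Usage sketch (hypothetical caller; the diff never drives the engine this
// directly): MacroEngine compiles a macro on its first Execute and reuses the
// cached CachedMacro afterwards, so repeated calls pay the compile cost once.
#include <vector>
#include "video_core/macro/macro.h"

void RunMacroTwice(Tegra::Engines::Maxwell3D& maxwell3d, const std::vector<u32>& words,
                   const std::vector<u32>& parameters) {
    const auto engine = Tegra::GetMacroEngine(maxwell3d); // JIT on x86_64 unless disabled
    for (const u32 word : words) {
        engine->AddCode(0, word);   // append code words for macro slot 0
    }
    engine->Execute(0, parameters); // first call: Compile(), then run
    engine->Execute(0, parameters); // second call: cache hit, run only
}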
+ BitField<14, 18, s32> immediate; + + BitField<17, 5, ALUOperation> alu_operation; + + // Bitfield instructions data + BitField<17, 5, u32> bf_src_bit; + BitField<22, 5, u32> bf_size; + BitField<27, 5, u32> bf_dst_bit; + + u32 GetBitfieldMask() const { + return (1 << bf_size) - 1; + } + + s32 GetBranchTarget() const { + return static_cast<s32>(immediate * sizeof(u32)); + } +}; + +union MethodAddress { + u32 raw; + BitField<0, 12, u32> address; + BitField<12, 6, u32> increment; +}; + +} // namespace Macro + +class CachedMacro { +public: + virtual ~CachedMacro() = default; + /** + * Executes the macro code with the specified input parameters. + * @param parameters The parameters of the macro + * @param method The macro method to execute + */ + virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0; +}; + +class MacroEngine { +public: + virtual ~MacroEngine() = default; + + // Stores the uploaded macro code so it can be compiled when first called. + void AddCode(u32 method, u32 data); + + // Compiles the macro if it's not in the cache, and executes the compiled macro + void Execute(u32 method, const std::vector<u32>& parameters); + +protected: + virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0; + +private: + std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache; + std::unordered_map<u32, std::vector<u32>> uploaded_macro_code; +}; + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d); + +} // namespace Tegra diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 947364928..5edff27aa 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -1,4 +1,4 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. @@ -6,109 +6,46 @@ #include "common/logging/log.h" #include "common/microprofile.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro_interpreter.h" MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -namespace { -enum class Operation : u32 { - ALU = 0, - AddImmediate = 1, - ExtractInsert = 2, - ExtractShiftLeftImmediate = 3, - ExtractShiftLeftRegister = 4, - Read = 5, - Unused = 6, // This operation doesn't seem to be a valid encoding. - Branch = 7, -}; -} // Anonymous namespace - -enum class MacroInterpreter::ALUOperation : u32 { - Add = 0, - AddWithCarry = 1, - Subtract = 2, - SubtractWithBorrow = 3, - // Operations 4-7 don't seem to be valid encodings. - Xor = 8, - Or = 9, - And = 10, - AndNot = 11, - Nand = 12 -}; - -enum class MacroInterpreter::ResultOperation : u32 { - IgnoreAndFetch = 0, - Move = 1, - MoveAndSetMethod = 2, - FetchAndSend = 3, - MoveAndSend = 4, - FetchAndSetMethod = 5, - MoveAndSetMethodFetchAndSend = 6, - MoveAndSetMethodSend = 7 -}; - -enum class MacroInterpreter::BranchCondition : u32 { - Zero = 0, - NotZero = 1, -}; - -union MacroInterpreter::Opcode { - u32 raw; - BitField<0, 3, Operation> operation; - BitField<4, 3, ResultOperation> result_operation; - BitField<4, 1, BranchCondition> branch_condition; - // If set on a branch, then the branch doesn't have a delay slot.
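// Worked example (illustrative, self-contained): decoding the fields above by
// hand, mirroring what the BitField accessors return.
#include <cassert>
#include <cstdint>

int main() {
    // operation lives in bits 0..2, result_operation in bits 4..6.
    const uint32_t raw = 0x00000011;
    assert((raw & 7u) == 1u);        // Operation::AddImmediate
    assert(((raw >> 4) & 7u) == 1u); // ResultOperation::Move

    // bf_size (bits 22..26) of 5 selects a 5-bit field: GetBitfieldMask().
    const uint32_t bf_size = 5;
    assert(((1u << bf_size) - 1u) == 0x1Fu);

    // immediate is bits 14..31, sign-extended; all-ones decodes to -1, and
    // GetBranchTarget() scales by sizeof(u32), giving a byte offset of -4.
    const int32_t immediate = static_cast<int32_t>(0xFFFFC000u) >> 14; // arithmetic shift assumed
    assert(immediate == -1 && immediate * 4 == -4);
    return 0;
}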
- BitField<5, 1, u32> branch_annul; - BitField<7, 1, u32> is_exit; - BitField<8, 3, u32> dst; - BitField<11, 3, u32> src_a; - BitField<14, 3, u32> src_b; - // The signed immediate overlaps the second source operand and the alu operation. - BitField<14, 18, s32> immediate; - - BitField<17, 5, ALUOperation> alu_operation; - - // Bitfield instructions data - BitField<17, 5, u32> bf_src_bit; - BitField<22, 5, u32> bf_size; - BitField<27, 5, u32> bf_dst_bit; - - u32 GetBitfieldMask() const { - return (1 << bf_size) - 1; - } +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} - s32 GetBranchTarget() const { - return static_cast<s32>(immediate * sizeof(u32)); - } -}; +std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) { + return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); +} -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, + const std::vector<u32>& code) + : maxwell3d(maxwell3d), code(code) {} -void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) { +void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); registers[1] = parameters[0]; + num_parameters = parameters.size(); if (num_parameters > parameters_capacity) { parameters_capacity = num_parameters; this->parameters = std::make_unique<u32[]>(num_parameters); } - std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32)); + std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32)); this->num_parameters = num_parameters; // Execute the code until we hit an exit condition. 
bool keep_executing = true; while (keep_executing) { - keep_executing = Step(offset, false); + keep_executing = Step(false); } // Assert that the macro used all the input parameters ASSERT(next_parameter_index == num_parameters); } -void MacroInterpreter::Reset() { +void MacroInterpreterImpl::Reset() { registers = {}; pc = 0; delayed_pc = {}; @@ -120,10 +57,10 @@ void MacroInterpreter::Reset() { carry_flag = false; } -bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { +bool MacroInterpreterImpl::Step(bool is_delay_slot) { u32 base_address = pc; - Opcode opcode = GetOpcode(offset); + Macro::Opcode opcode = GetOpcode(); pc += 4; // Update the program counter if we were delayed @@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { } switch (opcode.operation) { - case Operation::ALU: { + case Macro::Operation::ALU: { u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), GetRegister(opcode.src_b)); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::AddImmediate: { + case Macro::Operation::AddImmediate: { ProcessResult(opcode.result_operation, opcode.dst, GetRegister(opcode.src_a) + opcode.immediate); break; } - case Operation::ExtractInsert: { + case Macro::Operation::ExtractInsert: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, dst); break; } - case Operation::ExtractShiftLeftImmediate: { + case Macro::Operation::ExtractShiftLeftImmediate: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::ExtractShiftLeftRegister: { + case Macro::Operation::ExtractShiftLeftRegister: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Read: { + case Macro::Operation::Read: { u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Branch: { + case Macro::Operation::Branch: { ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); u32 value = GetRegister(opcode.src_a); bool taken = EvaluateBranchCondition(opcode.branch_condition, value); @@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { delayed_pc = base_address + opcode.GetBranchTarget(); // Execute one more instruction due to the delay slot. - return Step(offset, true); + return Step(true); } break; } @@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { // cause an exit if it's executed inside a delay slot.
if (opcode.is_exit && !is_delay_slot) { // Exit has a delay slot, execute the next instruction - Step(offset, true); + Step(true); return false; } return true; } -MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { - const auto& macro_memory{maxwell3d.GetMacroMemory()}; - ASSERT((pc % sizeof(u32)) == 0); - ASSERT((pc + offset) < macro_memory.size() * sizeof(u32)); - return {macro_memory[offset + pc / sizeof(u32)]}; -} - -u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) { +u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) { switch (operation) { - case ALUOperation::Add: { + case Macro::ALUOperation::Add: { const u64 result{static_cast<u64>(src_a) + src_b}; carry_flag = result > 0xffffffff; return static_cast<u32>(result); } - case ALUOperation::AddWithCarry: { + case Macro::ALUOperation::AddWithCarry: { const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; carry_flag = result > 0xffffffff; return static_cast<u32>(result); } - case ALUOperation::Subtract: { + case Macro::ALUOperation::Subtract: { const u64 result{static_cast<u64>(src_a) - src_b}; carry_flag = result < 0x100000000; return static_cast<u32>(result); } - case ALUOperation::SubtractWithBorrow: { + case Macro::ALUOperation::SubtractWithBorrow: { const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; carry_flag = result < 0x100000000; return static_cast<u32>(result); } - case ALUOperation::Xor: + case Macro::ALUOperation::Xor: return src_a ^ src_b; - case ALUOperation::Or: + case Macro::ALUOperation::Or: return src_a | src_b; - case ALUOperation::And: + case Macro::ALUOperation::And: return src_a & src_b; - case ALUOperation::AndNot: + case Macro::ALUOperation::AndNot: return src_a & ~src_b; - case ALUOperation::Nand: + case Macro::ALUOperation::Nand: return ~(src_a & src_b); default: @@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) } } -void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) { +void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) { switch (operation) { - case ResultOperation::IgnoreAndFetch: + case Macro::ResultOperation::IgnoreAndFetch: // Fetch parameter and ignore result. SetRegister(reg, FetchParameter()); break; - case ResultOperation::Move: + case Macro::ResultOperation::Move: // Move result. SetRegister(reg, result); break; - case ResultOperation::MoveAndSetMethod: + case Macro::ResultOperation::MoveAndSetMethod: // Move result and use as Method Address. SetRegister(reg, result); SetMethodAddress(result); break; - case ResultOperation::FetchAndSend: + case Macro::ResultOperation::FetchAndSend: // Fetch parameter and send result. SetRegister(reg, FetchParameter()); Send(result); break; - case ResultOperation::MoveAndSend: + case Macro::ResultOperation::MoveAndSend: // Move and send result. SetRegister(reg, result); Send(result); break; - case ResultOperation::FetchAndSetMethod: + case Macro::ResultOperation::FetchAndSetMethod: // Fetch parameter and use result as Method Address. SetRegister(reg, FetchParameter()); SetMethodAddress(result); break; - case ResultOperation::MoveAndSetMethodFetchAndSend: + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: // Move result and use as Method Address, then fetch and send parameter. 
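// Worked example (illustrative): the carry/borrow bookkeeping above widens to
// 64 bits so the flag falls out of an ordinary comparison.
#include <cassert>
#include <cstdint>

int main() {
    // Add: carry is set iff the 32-bit result overflowed.
    const uint64_t sum = uint64_t{0xFFFFFFFFu} + uint64_t{1u};
    const bool add_carry = sum > 0xFFFFFFFFull;
    assert(add_carry && static_cast<uint32_t>(sum) == 0u);

    // Subtract: the flag records "no borrow" (result stayed below 2^32).
    const uint64_t diff = uint64_t{1u} - uint64_t{2u}; // wraps around
    const bool no_borrow = diff < 0x100000000ull;
    assert(!no_borrow); // 1 - 2 borrows
    return 0;
}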
SetRegister(reg, result); SetMethodAddress(result); Send(FetchParameter()); break; - case ResultOperation::MoveAndSetMethodSend: + case Macro::ResultOperation::MoveAndSetMethodSend: // Move result and use as Method Address, then send bits 12:17 of result. SetRegister(reg, result); SetMethodAddress(result); @@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res } } -u32 MacroInterpreter::FetchParameter() { - ASSERT(next_parameter_index < num_parameters); - return parameters[next_parameter_index++]; +bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { + switch (cond) { + case Macro::BranchCondition::Zero: + return value == 0; + case Macro::BranchCondition::NotZero: + return value != 0; + } + UNREACHABLE(); + return true; } -u32 MacroInterpreter::GetRegister(u32 register_id) const { +Macro::Opcode MacroInterpreterImpl::GetOpcode() const { + ASSERT((pc % sizeof(u32)) == 0); + ASSERT(pc < code.size() * sizeof(u32)); + return {code[pc / sizeof(u32)]}; +} + +u32 MacroInterpreterImpl::GetRegister(u32 register_id) const { return registers.at(register_id); } -void MacroInterpreter::SetRegister(u32 register_id, u32 value) { +void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) { // Register 0 is hardwired as the zero register. // Ensure no writes to it actually occur. if (register_id == 0) { @@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) { registers.at(register_id) = value; } -void MacroInterpreter::SetMethodAddress(u32 address) { +void MacroInterpreterImpl::SetMethodAddress(u32 address) { method_address.raw = address; } -void MacroInterpreter::Send(u32 value) { +void MacroInterpreterImpl::Send(u32 value) { maxwell3d.CallMethodFromMME(method_address.address, value); // Increment the method address by the method increment. method_address.address.Assign(method_address.address.Value() + method_address.increment.Value()); } -u32 MacroInterpreter::Read(u32 method) const { +u32 MacroInterpreterImpl::Read(u32 method) const { return maxwell3d.GetRegisterValue(method); } -bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const { - switch (cond) { - case BranchCondition::Zero: - return value == 0; - case BranchCondition::NotZero: - return value != 0; - } - UNREACHABLE(); - return true; +u32 MacroInterpreterImpl::FetchParameter() { + ASSERT(next_parameter_index < num_parameters); + return parameters[next_parameter_index++]; } } // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index 631146d89..90217fc89 100644 --- a/src/video_core/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -1,44 +1,37 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once - #include <array> #include <optional> - +#include <vector> #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/macro/macro.h" namespace Tegra { namespace Engines { class Maxwell3D; } -class MacroInterpreter final { +class MacroInterpreter final : public MacroEngine { public: explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); - /** - * Executes the macro code with the specified input parameters. - * @param offset Offset to start execution at. - * @param parameters The parameters of the macro. 
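// Worked example (illustrative): Send() above auto-increments the packed
// method address; bits 0..11 are the register index, bits 12..17 the step.
#include <cassert>
#include <cstdint>

int main() {
    const uint32_t raw = (4u << 12) | 0x6C0u;     // increment = 4, address = 0x6C0
    const uint32_t increment = (raw >> 12) & 0x3Fu;
    uint32_t address = raw & 0xFFFu;
    address = (address + increment) & 0xFFFu;     // post-send auto-increment
    assert(address == 0x6C4u);
    return 0;
}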
- */ - void Execute(u32 offset, std::size_t num_parameters, const u32* parameters); +protected: + std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; private: - enum class ALUOperation : u32; - enum class BranchCondition : u32; - enum class ResultOperation : u32; - - union Opcode; + Engines::Maxwell3D& maxwell3d; +}; - union MethodAddress { - u32 raw; - BitField<0, 12, u32> address; - BitField<12, 6, u32> increment; - }; +class MacroInterpreterImpl : public CachedMacro { +public: + MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + void Execute(const std::vector<u32>& parameters, u32 method) override; +private: /// Resets the execution engine state, zeroing registers, etc. void Reset(); @@ -49,20 +42,20 @@ private: * @param is_delay_slot Whether the current step is being executed due to a delay slot in a * previous instruction. */ - bool Step(u32 offset, bool is_delay_slot); + bool Step(bool is_delay_slot); /// Calculates the result of an ALU operation. src_a OP src_b; - u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); + u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b); /// Performs the result operation on the input result and stores it in the specified register /// (if necessary). - void ProcessResult(ResultOperation operation, u32 reg, u32 result); + void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result); /// Evaluates the branch condition and returns whether the branch should be taken or not. - bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; + bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const; /// Reads an opcode at the current program counter location. - Opcode GetOpcode(u32 offset) const; + Macro::Opcode GetOpcode() const; /// Returns the specified register's value. Register 0 is hardcoded to always return 0. u32 GetRegister(u32 register_id) const; @@ -89,13 +82,11 @@ private: /// Program counter to execute at after the delay slot is executed. std::optional<u32> delayed_pc; - static constexpr std::size_t NumMacroRegisters = 8; - /// General purpose macro registers. - std::array<u32, NumMacroRegisters> registers = {}; + std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {}; /// Method address to use for the next Send instruction. - MethodAddress method_address = {}; + Macro::MethodAddress method_address = {}; /// Input parameters of the current macro. std::unique_ptr<u32[]> parameters; @@ -105,5 +96,7 @@ private: u32 next_parameter_index = 0; bool carry_flag = false; + const std::vector<u32>& code; }; + } // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp new file mode 100644 index 000000000..11c1cc3be --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -0,0 +1,640 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
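// Minimal xbyak sketch (assumes xbyak is available as <xbyak/xbyak.h>; yuzu
// vendors it): the pattern macro_jit_x64 builds on, namely derive from
// Xbyak::CodeGenerator, emit code in the constructor, then fetch a callable.
#include <xbyak/xbyak.h>

struct AddFive : Xbyak::CodeGenerator {
    AddFive() {
#ifdef _WIN32
        mov(eax, ecx); // Windows x64 ABI: first integer argument in rcx
#else
        mov(eax, edi); // System V x64 ABI: first integer argument in rdi
#endif
        add(eax, 5);
        ret();
    }
};

int main() {
    AddFive gen;
    const auto fn = gen.getCode<int (*)(int)>();
    return fn(2) == 7 ? 0 : 1;
}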
+ +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "common/x64/xbyak_util.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47)); +MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); + +namespace Tegra { +static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; +static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; +static const Xbyak::Reg64 STATE = Xbyak::util::r11; +static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; +static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; +static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13; +static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; +static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; + +static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ + PARAMETERS, + REGISTERS, + STATE, + NEXT_PARAMETER, + RESULT, + METHOD_ADDRESS, + BRANCH_HOLDER, +}); + +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} + +std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { + return std::make_unique<MacroJITx64Impl>(maxwell3d, code); +} + +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code) + : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { + Compile(); +} + +MacroJITx64Impl::~MacroJITx64Impl() = default; + +void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) { + MICROPROFILE_SCOPE(MacroJitExecute); + ASSERT_OR_EXECUTE(program != nullptr, { return; }); + JITState state{}; + state.maxwell3d = &maxwell3d; + state.registers = {}; + state.parameters = parameters.data(); + program(&state); +} + +void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { + const bool is_a_zero = opcode.src_a == 0; + const bool is_b_zero = opcode.src_b == 0; + const bool valid_operation = !is_a_zero && !is_b_zero; + const bool is_move_operation = !is_a_zero && is_b_zero; + const bool has_zero_register = is_a_zero || is_b_zero; + + Xbyak::Reg64 src_a; + Xbyak::Reg32 src_b; + + if (!optimizer.zero_reg_skip) { + src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + src_b = Compile_GetRegister(opcode.src_b, ebx); + } else { + if (!is_a_zero) { + src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + } + if (!is_b_zero) { + src_b = Compile_GetRegister(opcode.src_b, ebx); + } + } + Xbyak::Label skip_carry{}; + + bool has_emitted = false; + + switch (opcode.alu_operation) { + case Macro::ALUOperation::Add: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + add(src_a, src_b); + } + } else { + add(src_a, src_b); + } + + if (!optimizer.can_skip_carry) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::AddWithCarry: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + adc(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Subtract: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + sub(src_a, src_b); + has_emitted = true; + } + } else { + sub(src_a, src_b); + has_emitted = true; + } + if (!optimizer.can_skip_carry && has_emitted) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::SubtractWithBorrow: + 
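// Sketch of the zero_reg_skip idea in plain C++ (names assumed): register 0
// is architecturally zero, so when a source is r0 the operation degenerates
// and the JIT can skip the register load or the whole instruction.
#include <cstdint>

uint32_t AddWithZeroSkip(const uint32_t regs[8], uint32_t src_a, uint32_t src_b) {
    if (src_b == 0) {
        return regs[src_a];           // x + r0 == x: no add emitted
    }
    if (src_a == 0) {
        return regs[src_b];           // r0 + x == x
    }
    return regs[src_a] + regs[src_b]; // only this path needs real codegen
}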
bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + sbb(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Xor: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + xor_(src_a, src_b); + } + } else { + xor_(src_a, src_b); + } + break; + case Macro::ALUOperation::Or: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + or_(src_a, src_b); + } + } else { + or_(src_a, src_b); + } + break; + case Macro::ALUOperation::And: + if (optimizer.zero_reg_skip) { + if (!has_zero_register) { + and_(src_a, src_b); + } + } else { + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::AndNot: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + not_(src_b); + and_(src_a, src_b); + } + } else { + not_(src_b); + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::Nand: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + and_(src_a, src_b); + not_(src_a); + } + } else { + and_(src_a, src_b); + not_(src_a); + } + break; + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", + static_cast<std::size_t>(opcode.alu_operation.Value())); + break; + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { + if (optimizer.skip_dummy_addimmediate) { + // Games tend to use this as an exit instruction placeholder. It encodes an instruction + // that does nothing. In our case we can just not emit anything. + if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { + return; + } + } + // Check for redundant moves + if (optimizer.optimize_for_method_move && + opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next_opcode.has_value()) { + const auto next = *next_opcode; + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + return; + } + } + } + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, RESULT); + auto src = Compile_GetRegister(opcode.src_b, eax); + + if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) { + shr(src, opcode.bf_src_bit); + } else if (opcode.bf_src_bit == 31) { + xor_(src, src); + } + // Don't bother masking the whole register since we're using a 32-bit register + if (opcode.bf_size != 31 && opcode.bf_size != 0) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + + const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + if (mask != 0xffffffff) { + and_(dst, mask); + } + or_(dst, src); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, eax); + auto src = Compile_GetRegister(opcode.src_b, RESULT); + + shr(src, al); + if (opcode.bf_size != 0 &&
opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + + if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, eax); + auto src = Compile_GetRegister(opcode.src_b, RESULT); + + if (opcode.bf_src_bit != 0) { + shr(src, opcode.bf_src_bit); + } + + if (opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } + shl(src, al); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) { + return maxwell3d->GetRegisterValue(method); +} + +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethodFromMME(method_address.address, value); +} + +void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, RESULT); + Common::X64::CallFarFunction(*this, &Read); + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(RESULT, Common::X64::ABI_RETURN.cvt32()); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); + mov(Common::X64::ABI_PARAM3, value); + Common::X64::CallFarFunction(*this, &Send); + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + + Xbyak::Label dont_process{}; + // Get increment + test(METHOD_ADDRESS, 0x3f000); + // If zero, method address doesn't update + je(dont_process); + + mov(ecx, METHOD_ADDRESS); + and_(METHOD_ADDRESS, 0xfff); + shr(ecx, 12); + and_(ecx, 0x3f); + lea(eax, ptr[rcx + METHOD_ADDRESS_64]); + sal(ecx, 12); + or_(eax, ecx); + + mov(METHOD_ADDRESS, eax); + + L(dont_process); +} + +void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + const s32 jump_address = + static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32)); + + Xbyak::Label end; + auto value = Compile_GetRegister(opcode.src_a, eax); + test(value, value); + if (optimizer.has_delayed_pc) { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + jne(end, T_NEAR); + break; + case Macro::BranchCondition::NotZero: + je(end, T_NEAR); + break; + } + + if (opcode.branch_annul) { + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } else { + Xbyak::Label handle_post_exit{}; + Xbyak::Label skip{}; + jmp(skip, T_NEAR); + if (opcode.is_exit) { + 
L(handle_post_exit); + // Execute 1 instruction + mov(BRANCH_HOLDER, end_of_code); + // Jump to next instruction to skip delay slot check + jmp(labels[jump_address], T_NEAR); + } else { + L(handle_post_exit); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } + L(skip); + mov(BRANCH_HOLDER, handle_post_exit); + jmp(delay_skip[pc], T_NEAR); + } + } else { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + je(labels[jump_address], T_NEAR); + break; + case Macro::BranchCondition::NotZero: + jne(labels[jump_address], T_NEAR); + break; + } + } + + L(end); +} + +void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() { + optimizer.can_skip_carry = true; + optimizer.has_delayed_pc = false; + for (auto raw_op : code) { + Macro::Opcode op{}; + op.raw = raw_op; + + if (op.operation == Macro::Operation::ALU) { + // Scan for any ALU operations which actually use the carry flag; if they don't exist in + // our current code we can skip emitting the carry flag handling operations + if (op.alu_operation == Macro::ALUOperation::AddWithCarry || + op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { + optimizer.can_skip_carry = false; + } + } + + if (op.operation == Macro::Operation::Branch) { + if (!op.branch_annul) { + optimizer.has_delayed_pc = true; + } + } + } +} + +void MacroJITx64Impl::Compile() { + MICROPROFILE_SCOPE(MacroJitCompile); + bool keep_executing = true; + labels.fill(Xbyak::Label()); + + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + // JIT state + mov(STATE, Common::X64::ABI_PARAM1); + mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + + static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]); + mov(REGISTERS, Common::X64::ABI_PARAM1); + add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers))); + xor_(RESULT, RESULT); + xor_(METHOD_ADDRESS, METHOD_ADDRESS); + xor_(NEXT_PARAMETER, NEXT_PARAMETER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + + mov(dword[REGISTERS + 4], Compile_FetchParameter()); + + // Track register fetches of the zero register and treat them as no-ops + optimizer.zero_reg_skip = true; + + // AddImmediate tends to be used as a NOP instruction; if we detect this we can + // completely skip the entire code path and not emit anything + optimizer.skip_dummy_addimmediate = true; + + // SMO tends to emit a lot of unnecessary method moves; we can mitigate this by only emitting + // one if our register isn't "dirty" + optimizer.optimize_for_method_move = true; + + // Check to see if we can skip emitting certain instructions + Optimizer_ScanFlags(); + + const u32 op_count = static_cast<u32>(code.size()); + for (u32 i = 0; i < op_count; i++) { + if (i < op_count - 1) { + pc = i + 1; + next_opcode = GetOpCode(); + } else { + next_opcode = {}; + } + pc = i; + Compile_NextInstruction(); + } + + L(end_of_code); + + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + ret(); + ready(); + program = getCode<ProgramType>(); +} + +bool MacroJITx64Impl::Compile_NextInstruction() { + const auto opcode = GetOpCode(); + if (labels[pc].getAddress()) { + return false; + } + + L(labels[pc]); + + switch (opcode.operation) { + case Macro::Operation::ALU: + Compile_ALU(opcode); + break; + case Macro::Operation::AddImmediate: + Compile_AddImmediate(opcode); + break; + case Macro::Operation::ExtractInsert: + Compile_ExtractInsert(opcode); + break; + case Macro::Operation::ExtractShiftLeftImmediate: + Compile_ExtractShiftLeftImmediate(opcode); + break; +
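// Hedged sketch (plain C++, not the emitted assembly): the BRANCH_HOLDER
// scheme models a MIPS-style delay slot: a taken branch records its target,
// exactly one more instruction runs, then the pc is redirected.
#include <cstdint>
#include <optional>

struct DelaySlotSketch {
    uint32_t pc = 0;
    std::optional<uint32_t> delayed_pc;

    void OnBranch(int32_t target_offset, bool taken) {
        if (taken) {
            delayed_pc = static_cast<uint32_t>(static_cast<int32_t>(pc) + target_offset);
        }
        pc += 4; // fall through into the delay-slot instruction first
    }
    void AfterDelaySlot() {
        if (delayed_pc) {
            pc = *delayed_pc; // the branch only takes effect here
            delayed_pc.reset();
        }
    }
};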
case Macro::Operation::ExtractShiftLeftRegister: + Compile_ExtractShiftLeftRegister(opcode); + break; + case Macro::Operation::Read: + Compile_Read(opcode); + break; + case Macro::Operation::Branch: + Compile_Branch(opcode); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); + break; + } + + if (optimizer.has_delayed_pc) { + if (opcode.is_exit) { + mov(rax, end_of_code); + test(BRANCH_HOLDER, BRANCH_HOLDER); + cmove(BRANCH_HOLDER, rax); + // Jump to next instruction to skip delay slot check + je(labels[pc + 1], T_NEAR); + } else { + // TODO(ogniK): Optimize delay slot branching + Xbyak::Label no_delay_slot{}; + test(BRANCH_HOLDER, BRANCH_HOLDER); + je(no_delay_slot, T_NEAR); + mov(rax, BRANCH_HOLDER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(rax); + L(no_delay_slot); + } + L(delay_skip[pc]); + if (opcode.is_exit) { + return false; + } + } else { + test(BRANCH_HOLDER, BRANCH_HOLDER); + jne(end_of_code, T_NEAR); + if (opcode.is_exit) { + inc(BRANCH_HOLDER); + return false; + } + } + return true; +} + +Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { + mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]); + inc(NEXT_PARAMETER); + return eax; +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[REGISTERS + index * sizeof(u32)]); + } + + return dst; +} + +Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[REGISTERS + index * sizeof(u32)]); + } + + return dst; +} + +void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { + Xbyak::Label zero{}, end{}; + xor_(ecx, ecx); + shr(dst, 32); + setne(cl); + mov(dword[STATE + offsetof(JITState, carry_flag)], ecx); +} + +void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { + auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { + // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero + // register. + if (reg == 0) { + return; + } + mov(dword[REGISTERS + reg * sizeof(u32)], result); + }; + auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; + + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + SetRegister(reg, Compile_FetchParameter()); + break; + case Macro::ResultOperation::Move: + SetRegister(reg, RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethod: + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, Compile_FetchParameter()); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, RESULT); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, Compile_FetchParameter()); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + Compile_Send(Compile_FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. 
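// Worked example (illustrative): MoveAndSetMethodSend sends bits 12:17 of the
// result; the shr/and_ pair emitted below computes (result >> 12) & 0b111111.
#include <cassert>
#include <cstdint>

int main() {
    const uint32_t result = 0x00026C00u; // bits 12..17 hold 0x26
    assert(((result >> 12) & 0x3Fu) == 0x26u);
    return 0;
}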
+ SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + shr(RESULT, 12); + and_(RESULT, 0b111111); + Compile_Send(RESULT); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation)); + } +} + +Macro::Opcode MacroJITx64Impl::GetOpCode() const { + ASSERT(pc < code.size()); + return {code[pc]}; +} + +std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { + return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h new file mode 100644 index 000000000..21ee157cf --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.h @@ -0,0 +1,100 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <bitset> +#include <optional> +#include <xbyak.h> +#include "common/bit_field.h" +#include "common/common_types.h" +#include "common/x64/xbyak_abi.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +/// MAX_CODE_SIZE is arbitrarily chosen, based on the games currently known to boot +constexpr size_t MAX_CODE_SIZE = 0x10000; + +class MacroJITx64 final : public MacroEngine { +public: + explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + +protected: + std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { +public: + MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + ~MacroJITx64Impl(); + + void Execute(const std::vector<u32>& parameters, u32 method) override; + + void Compile_ALU(Macro::Opcode opcode); + void Compile_AddImmediate(Macro::Opcode opcode); + void Compile_ExtractInsert(Macro::Opcode opcode); + void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); + void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); + void Compile_Read(Macro::Opcode opcode); + void Compile_Branch(Macro::Opcode opcode); + +private: + void Optimizer_ScanFlags(); + + void Compile(); + bool Compile_NextInstruction(); + + Xbyak::Reg32 Compile_FetchParameter(); + Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); + Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst); + void Compile_WriteCarry(Xbyak::Reg64 dst); + + void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); + void Compile_Send(Xbyak::Reg32 value); + + Macro::Opcode GetOpCode() const; + std::bitset<32> PersistentCallerSavedRegs() const; + + struct JITState { + Engines::Maxwell3D* maxwell3d{}; + std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{}; + const u32* parameters{}; + u32 carry_flag{}; + }; + static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); + using ProgramType = void (*)(JITState*); + + struct OptimizerState { + bool can_skip_carry{}; + bool has_delayed_pc{}; + bool zero_reg_skip{}; + bool skip_dummy_addimmediate{}; + bool optimize_for_method_move{}; + }; + OptimizerState optimizer{}; + + std::optional<Macro::Opcode> next_opcode{}; + ProgramType program{nullptr}; + + std::array<Xbyak::Label, MAX_CODE_SIZE> labels{}; + std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{}; + Xbyak::Label end_of_code{}; + + bool is_delay_slot{}; + u32 pc{}; + std::optional<u32> delayed_pc; + + const std::vector<u32>& code; + Engines::Maxwell3D& maxwell3d; +}; + +} // namespace Tegra diff --git
a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 8b424e2cb..890fc6c63 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -185,12 +185,20 @@ bool IsASTCSupported() { Device::Device() : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); - const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); + const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; + bool disable_fast_buffer_sub_data = false; + if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { + LOG_WARNING( + Render_OpenGL, + "Beta driver 443.24 is known to have issues. There might be performance issues."); + disable_fast_buffer_sub_data = true; + } + uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); @@ -204,7 +212,7 @@ Device::Device() has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_fast_buffer_sub_data = is_nvidia; + has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 502b95973..d6e30b321 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -626,7 +626,9 @@ private: break; } } - if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { + + if (stage != ShaderType::Geometry && + (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } @@ -655,6 +657,16 @@ private: --code.scope; code.AddLine("}};"); code.AddNewLine(); + + if (stage == ShaderType::Geometry) { + if (ir.UsesLayer()) { + code.AddLine("out int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("out int gl_ViewportIndex;"); + } + } + code.AddNewLine(); } void DeclareRegisters() { diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index e7952924a..6214fcbc3 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -751,11 +751,9 @@ void RendererOpenGL::RenderScreenshot() { } bool RendererOpenGL::Init() { - if (GLAD_GL_KHR_debug) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { glEnable(GL_DEBUG_OUTPUT); - if (Settings::values.renderer_debug) { - glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); - } + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); glDebugMessageCallback(DebugHandler, nullptr); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 8e1b46277..281bf9ac3 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ 
b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const { }; add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size()); - add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size()); VkDescriptorSetLayoutCreateInfo ci; diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index 890fd52cf..9259b618d 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() { {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60}, {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64}, {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64}, + {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64}, {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}}; VkDescriptorPoolCreateInfo ci; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 65a1c6245..b8ccf164f 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -45,6 +45,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; +constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ @@ -104,8 +105,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries, u32 binding = base_binding; AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers); AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers); - AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers); + AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels); AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers); + AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels); AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images); return binding; } @@ -377,16 +379,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3 return; } - if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) { - // Nvidia has a bug where updating multiple uniform texels at once causes the driver to - // crash. + if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER || + descriptor_type == STORAGE_TEXEL_BUFFER) { + // Nvidia has a bug where updating multiple texels at once causes the driver to crash. 
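// Sketch of the workaround being introduced here (types from the Vulkan
// headers; the helper name is assumed): emit one template entry with
// descriptorCount 1 per texel-buffer binding instead of a single batched
// entry, so the affected Nvidia drivers never update several texel buffers
// in one go.
#include <cstddef>
#include <vector>
#include <vulkan/vulkan.h>

void AddTexelEntriesOneByOne(std::vector<VkDescriptorUpdateTemplateEntry>& out,
                             uint32_t binding, uint32_t count, std::size_t offset,
                             std::size_t stride, VkDescriptorType type) {
    for (uint32_t i = 0; i < count; ++i) {
        VkDescriptorUpdateTemplateEntry entry{};
        entry.dstBinding = binding + i;
        entry.descriptorCount = 1; // never batched
        entry.descriptorType = type;
        entry.offset = offset + i * stride;
        entry.stride = stride;
        out.push_back(entry);
    }
}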
+ // Note: Fixed in driver Windows 443.24, Linux 440.66.15 for (u32 i = 0; i < count; ++i) { VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); entry.dstBinding = binding + i; entry.dstArrayElement = 0; entry.descriptorCount = 1; entry.descriptorType = descriptor_type; - entry.offset = offset + i * entry_size; + entry.offset = static_cast<std::size_t>(offset + i * entry_size); entry.stride = entry_size; } } else if (count > 0) { @@ -407,8 +410,9 @@ void FillDescriptorUpdateTemplateEntries( std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) { AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers); AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers); - AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers); + AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels); AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers); + AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels); AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a3d992ed3..d86c46412 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -468,8 +468,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { const auto& entries = pipeline.GetEntries(); SetupComputeConstBuffers(entries); SetupComputeGlobalBuffers(entries); - SetupComputeTexelBuffers(entries); + SetupComputeUniformTexels(entries); SetupComputeTextures(entries); + SetupComputeStorageTexels(entries); SetupComputeImages(entries); buffer_cache.Unmap(); @@ -787,8 +788,9 @@ void RasterizerVulkan::SetupShaderDescriptors( const auto& entries = shader->GetEntries(); SetupGraphicsConstBuffers(entries, stage); SetupGraphicsGlobalBuffers(entries, stage); - SetupGraphicsTexelBuffers(entries, stage); + SetupGraphicsUniformTexels(entries, stage); SetupGraphicsTextures(entries, stage); + SetupGraphicsStorageTexels(entries, stage); SetupGraphicsImages(entries, stage); } texture_cache.GuardSamplers(false); @@ -838,6 +840,10 @@ void RasterizerVulkan::BeginTransformFeedback() { if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); + return; + } UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || @@ -866,6 +872,9 @@ void RasterizerVulkan::EndTransformFeedback() { if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + return; + } scheduler.Record( [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); @@ -976,12 +985,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, } } -void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) { +void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().Maxwell3D(); - for (const auto& entry : entries.texel_buffers) { + for (const auto& entry : entries.uniform_texels) { const auto image = GetTextureInfo(gpu, entry, stage).tic; - 
SetupTexelBuffer(image, entry); + SetupUniformTexels(image, entry); } } @@ -996,6 +1005,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std:: } } +void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().Maxwell3D(); + for (const auto& entry : entries.storage_texels) { + const auto image = GetTextureInfo(gpu, entry, stage).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Images); const auto& gpu = system.GPU().Maxwell3D(); @@ -1028,12 +1046,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { } } -void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) { +void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().KeplerCompute(); - for (const auto& entry : entries.texel_buffers) { + for (const auto& entry : entries.uniform_texels) { const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; - SetupTexelBuffer(image, entry); + SetupUniformTexels(image, entry); } } @@ -1048,6 +1066,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { } } +void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().KeplerCompute(); + for (const auto& entry : entries.storage_texels) { + const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Images); const auto& gpu = system.GPU().KeplerCompute(); @@ -1097,8 +1124,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd update_descriptor_queue.AddBuffer(buffer, offset, size); } -void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic, - const TexelBufferEntry& entry) { +void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, + const UniformTexelEntry& entry) { const auto view = texture_cache.GetTextureSurface(tic, entry); ASSERT(view->IsBufferView()); @@ -1120,6 +1147,14 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu sampled_views.push_back(ImageView{std::move(view), image_layout}); } +void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic, + const StorageTexelEntry& entry) { + const auto view = texture_cache.GetImageSurface(tic, entry); + ASSERT(view->IsBufferView()); + + update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); +} + void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) { auto view = texture_cache.GetImageSurface(tic, entry); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 0ed0e48c6..04be37a5e 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -193,12 +193,15 @@ private: /// Setup global buffers in the graphics pipeline. void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage); - /// Setup texel buffers in the graphics pipeline. 
- void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage); + /// Setup uniform texels in the graphics pipeline. + void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); /// Setup textures in the graphics pipeline. void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage); + /// Setup storage texels in the graphics pipeline. + void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage); + /// Setup images in the graphics pipeline. void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); @@ -209,11 +212,14 @@ private: void SetupComputeGlobalBuffers(const ShaderEntries& entries); /// Setup texel buffers in the compute pipeline. - void SetupComputeTexelBuffers(const ShaderEntries& entries); + void SetupComputeUniformTexels(const ShaderEntries& entries); /// Setup textures in the compute pipeline. void SetupComputeTextures(const ShaderEntries& entries); + /// Setup storage texels in the compute pipeline. + void SetupComputeStorageTexels(const ShaderEntries& entries); + /// Setup images in the compute pipeline. void SetupComputeImages(const ShaderEntries& entries); @@ -222,10 +228,12 @@ private: void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); - void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry); + void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry); void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); + void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry); + void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index a13e8baa7..97429cc59 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -400,8 +400,9 @@ private: u32 binding = specialization.base_binding; binding = DeclareConstantBuffers(binding); binding = DeclareGlobalBuffers(binding); - binding = DeclareTexelBuffers(binding); + binding = DeclareUniformTexels(binding); binding = DeclareSamplers(binding); + binding = DeclareStorageTexels(binding); binding = DeclareImages(binding); const Id main = OpFunction(t_void, {}, TypeFunction(t_void)); @@ -889,7 +890,7 @@ private: return binding; } - u32 DeclareTexelBuffers(u32 binding) { + u32 DeclareUniformTexels(u32 binding) { for (const auto& sampler : ir.GetSamplers()) { if (!sampler.is_buffer) { continue; @@ -910,7 +911,7 @@ private: Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id}); + uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id}); } return binding; } @@ -945,31 +946,48 @@ private: return binding; } - u32 DeclareImages(u32 binding) { + u32 DeclareStorageTexels(u32 binding) { for (const auto& image : ir.GetImages()) { - const auto [dim, arrayed] = GetImageDim(image); - constexpr int depth = 0; - constexpr bool ms = false; - constexpr int sampled = 2; // This won't be accessed with a sampler - constexpr auto format = spv::ImageFormat::Unknown; - const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); - const Id pointer_type = 
TypePointer(spv::StorageClass::UniformConstant, image_type); - const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); - - Decorate(id, spv::Decoration::Binding, binding++); - Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - if (image.is_read && !image.is_written) { - Decorate(id, spv::Decoration::NonWritable); - } else if (image.is_written && !image.is_read) { - Decorate(id, spv::Decoration::NonReadable); + if (image.type != Tegra::Shader::ImageType::TextureBuffer) { + continue; } + DeclareImage(image, binding); + } + return binding; + } - images.emplace(image.index, StorageImage{image_type, id}); + u32 DeclareImages(u32 binding) { + for (const auto& image : ir.GetImages()) { + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + continue; + } + DeclareImage(image, binding); } return binding; } + void DeclareImage(const Image& image, u32& binding) { + const auto [dim, arrayed] = GetImageDim(image); + constexpr int depth = 0; + constexpr bool ms = false; + constexpr int sampled = 2; // This won't be accessed with a sampler + const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown; + const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); + const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); + const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); + AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); + + Decorate(id, spv::Decoration::Binding, binding++); + Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); + if (image.is_read && !image.is_written) { + Decorate(id, spv::Decoration::NonWritable); + } else if (image.is_written && !image.is_read) { + Decorate(id, spv::Decoration::NonReadable); + } + + images.emplace(image.index, StorageImage{image_type, id}); + } + bool IsRenderTargetEnabled(u32 rt) const { for (u32 component = 0; component < 4; ++component) { if (header.ps.IsColorComponentOutputEnabled(rt, component)) { @@ -1256,7 +1274,7 @@ private: } else { UNREACHABLE_MSG("Unmanaged offset node type"); } - pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index, + pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index, buffer_element); } return {OpLoad(t_float, pointer), Type::Float}; @@ -1611,7 +1629,7 @@ private: const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b); const Id carry = OpCompositeExtract(t_uint, result, 1); - return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool}; + return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool}; } Expression LogicalAssign(Operation operation) { @@ -1674,7 +1692,7 @@ private: const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 index = meta.sampler.index; if (meta.sampler.is_buffer) { - const auto& entry = texel_buffers.at(index); + const auto& entry = uniform_texels.at(index); return OpLoad(entry.image_type, entry.image); } else { const auto& entry = sampled_images.at(index); @@ -1951,39 +1969,20 @@ private: return {}; } - Expression AtomicImageAdd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageMin(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageMax(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageAnd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression 
AtomicImageOr(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + template <Id (Module::*func)(Id, Id, Id, Id, Id)> + Expression AtomicImage(Operation operation) { + const auto& meta{std::get<MetaImage>(operation.GetMeta())}; + ASSERT(meta.values.size() == 1); - Expression AtomicImageXor(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + const Id coordinate = GetCoordinates(operation, Type::Int); + const Id image = images.at(meta.image.index).image; + const Id sample = v_uint_zero; + const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample); - Expression AtomicImageExchange(Operation operation) { - UNIMPLEMENTED(); - return {}; + const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); + const Id semantics = v_uint_zero; + const Id value = AsUint(Visit(meta.values[0])); + return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; } template <Id (Module::*func)(Id, Id, Id, Id, Id)> @@ -1998,7 +1997,7 @@ private: return {v_float_zero, Type::Float}; } const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); - const Id semantics = Constant(t_uint, 0); + const Id semantics = v_uint_zero; const Id value = AsUint(Visit(operation[1])); return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; @@ -2622,11 +2621,11 @@ private: &SPIRVDecompiler::ImageLoad, &SPIRVDecompiler::ImageStore, - &SPIRVDecompiler::AtomicImageAdd, - &SPIRVDecompiler::AtomicImageAnd, - &SPIRVDecompiler::AtomicImageOr, - &SPIRVDecompiler::AtomicImageXor, - &SPIRVDecompiler::AtomicImageExchange, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>, @@ -2768,8 +2767,11 @@ private: Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); + const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint); + const Id v_float_zero = Constant(t_float, 0.0f); const Id v_float_one = Constant(t_float, 1.0f); + const Id v_uint_zero = Constant(t_uint, 0); // Nvidia uses these defaults for varyings (e.g. 
position and generic attributes) const Id v_varying_default = @@ -2794,15 +2796,16 @@ private: std::unordered_map<u8, GenericVaryingDescription> output_attributes; std::map<u32, Id> constant_buffers; std::map<GlobalMemoryBase, Id> global_buffers; - std::map<u32, TexelBuffer> texel_buffers; + std::map<u32, TexelBuffer> uniform_texels; std::map<u32, SampledImage> sampled_images; + std::map<u32, TexelBuffer> storage_texels; std::map<u32, StorageImage> images; + std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id instance_index{}; Id vertex_index{}; Id base_instance{}; Id base_vertex{}; - std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id frag_depth{}; Id frag_coord{}; Id front_facing{}; @@ -3058,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { } for (const auto& sampler : ir.GetSamplers()) { if (sampler.is_buffer) { - entries.texel_buffers.emplace_back(sampler); + entries.uniform_texels.emplace_back(sampler); } else { entries.samplers.emplace_back(sampler); } } for (const auto& image : ir.GetImages()) { - entries.images.emplace_back(image); + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + entries.storage_texels.emplace_back(image); + } else { + entries.images.emplace_back(image); + } } for (const auto& attribute : ir.GetInputAttributes()) { if (IsGenericAttribute(attribute)) { diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index b7af26388..2b0e90396 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -21,8 +21,9 @@ class VKDevice; namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using TexelBufferEntry = VideoCommon::Shader::Sampler; +using UniformTexelEntry = VideoCommon::Shader::Sampler; using SamplerEntry = VideoCommon::Shader::Sampler; +using StorageTexelEntry = VideoCommon::Shader::Image; using ImageEntry = VideoCommon::Shader::Image; constexpr u32 DESCRIPTOR_SET = 0; @@ -66,13 +67,15 @@ private: struct ShaderEntries { u32 NumBindings() const { return static_cast<u32>(const_buffers.size() + global_buffers.size() + - texel_buffers.size() + samplers.size() + images.size()); + uniform_texels.size() + samplers.size() + storage_texels.size() + + images.size()); } std::vector<ConstBufferEntry> const_buffers; std::vector<GlobalBufferEntry> global_buffers; - std::vector<TexelBufferEntry> texel_buffers; + std::vector<UniformTexelEntry> uniform_texels; std::vector<SamplerEntry> samplers; + std::vector<StorageTexelEntry> storage_texels; std::vector<ImageEntry> images; std::set<u32> attributes; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 2f1d5021d..ea487b770 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params, ci.pNext = nullptr; ci.flags = 0; ci.size = static_cast<VkDeviceSize>(host_memory_size); - ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; + ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; 
ci.queueFamilyIndexCount = 0; ci.pQueueFamilyIndices = nullptr; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 45e3ddd2c..6f63217a2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -655,45 +655,63 @@ private: **/ std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr) { + GPUVAddr gpu_addr) { if (params.target == SurfaceTarget::Texture3D) { - return {}; + return std::nullopt; } - bool modified = false; + const auto test_modified = [](TSurface& surface) { return surface->IsModified(); }; TSurface new_surface = GetUncachedSurface(gpu_addr, params); - u32 passed_tests = 0; + + if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) { + LoadSurface(new_surface); + for (const auto& surface : overlaps) { + Unregister(surface); + } + Register(new_surface); + return {{new_surface, new_surface->GetMainView()}}; + } + + std::size_t passed_tests = 0; for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.is_layered || src_params.num_levels > 1) { - // We send this cases to recycle as they are more complex to handle - return {}; - } - const std::size_t candidate_size = surface->GetSizeInBytes(); - auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; + const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; if (!mipmap_layer) { continue; } - const auto [layer, mipmap] = *mipmap_layer; - if (new_surface->GetMipmapSize(mipmap) != candidate_size) { + const auto [base_layer, base_mipmap] = *mipmap_layer; + if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) { continue; } - modified |= surface->IsModified(); - // Now we got all the data set up - const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); - const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); - const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1); - passed_tests++; - ImageCopy(surface, new_surface, copy_params); + ++passed_tests; + + // Copy all mipmaps and layers + const u32 block_width = params.GetDefaultBlockWidth(); + const u32 block_height = params.GetDefaultBlockHeight(); + for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) { + const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); + const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); + if (width < block_width || height < block_height) { + // Current APIs forbid copying small compressed textures, avoid errors + break; + } + const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, + src_params.depth); + ImageCopy(surface, new_surface, copy_params); + } } if (passed_tests == 0) { - return {}; + return std::nullopt; + } + if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { // In Accurate GPU all tests should pass, else we recycle - } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { - return {}; + return std::nullopt; } + + const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified); for (const auto& surface : overlaps) { Unregister(surface); } + new_surface->MarkAsModified(modified, Tick()); Register(new_surface); return {{new_surface, new_surface->GetMainView()}}; @@ -871,12 +889,9 @@ 
private: // two things either the candidate surface is a supertexture of the overlap // or they don't match in any known way. if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { - if (current_surface->GetGpuAddr() == gpu_addr) { - std::optional<std::pair<TSurface, TView>> view = - TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } + const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr); + if (view) { + return *view; } return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, MatchTopologyResult::FullMatch); diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index b08b87426..7e9073cc3 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() { Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool(); Settings::values.disable_cpu_opt = ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool(); + Settings::values.disable_macro_jit = + ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool(); qt_config->endGroup(); } @@ -1011,6 +1013,7 @@ void Config::SaveDebuggingValues() { WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false); WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false); WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false); + WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false); qt_config->endGroup(); } diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp index c2026763e..2c77441fd 100644 --- a/src/yuzu/configuration/configure_debug.cpp +++ b/src/yuzu/configuration/configure_debug.cpp @@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() { ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt); ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn()); ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); + ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn()); + ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit); } void ConfigureDebug::ApplyConfiguration() { @@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() { Settings::values.quest_flag = ui->quest_flag->isChecked(); Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked(); Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); + Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked(); Debugger::ToggleConsole(); Log::Filter filter; filter.ParseFilterString(Settings::values.log_filter); diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui index e0d4c4a44..46f0208c6 100644 --- a/src/yuzu/configuration/configure_debug.ui +++ b/src/yuzu/configuration/configure_debug.ui @@ -148,6 +148,19 @@ </property> </widget> </item> + <item> + <widget class="QCheckBox" name="disable_macro_jit"> + <property name="enabled"> + <bool>true</bool> + </property> + <property name="whatsThis"> + <string>When checked, it disables the macro Just In Time compiler. 
Enabling this makes games run slower</string> + </property> + <property name="text"> + <string>Disable Macro JIT</string> + </property> + </widget> + </item> </layout> </widget> </item> diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp index e4eb5594b..a05fa64ba 100644 --- a/src/yuzu/configuration/configure_input_player.cpp +++ b/src/yuzu/configuration/configure_input_player.cpp @@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() { SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]); } } + UpdateButtonLabels(); + ApplyConfiguration(); } void ConfigureInputPlayer::ClearAll() { @@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() { } UpdateButtonLabels(); + ApplyConfiguration(); } void ConfigureInputPlayer::UpdateButtonLabels() { diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index c20d48c42..7240270f5 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -432,6 +432,8 @@ void Config::ReadValues() { Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false); Settings::values.disable_cpu_opt = sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false); + Settings::values.disable_macro_jit = + sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false); const auto title_list = sdl2_config->Get("AddOns", "title_ids", ""); std::stringstream ss(title_list); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index abc6e6e65..6f53e9659 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -291,6 +291,8 @@ quest_flag = # Determines whether or not JIT CPU optimizations are enabled # false: Optimizations Enabled, true: Optimizations Disabled disable_cpu_opt = +# Enables/Disables the macro JIT compiler +disable_macro_jit = [WebService] # Whether or not to enable telemetry
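
The vk_pipeline_cache.cpp hunk above works around the Nvidia batching crash by fanning one descriptor update template entry out into count single-descriptor entries, one binding each, while the source payload stays contiguous (stride == entry_size). A minimal standalone sketch of that fan-out, assuming Vulkan headers are available; the function name and sample values are hypothetical:

    #include <cstddef>
    #include <cstdint>
    #include <vector>
    #include <vulkan/vulkan.h>

    // Sketch: emit one single-descriptor entry per texel buffer instead of one
    // entry with descriptorCount == count, so affected Nvidia drivers never
    // batch-update texel buffers. The descriptor payload remains packed back to
    // back, which is why each entry keeps the same stride and a shifted offset.
    std::vector<VkDescriptorUpdateTemplateEntry> FanOutTexelEntries(
        std::uint32_t binding, std::uint32_t count, std::size_t offset,
        std::size_t entry_size, VkDescriptorType type) {
        std::vector<VkDescriptorUpdateTemplateEntry> entries;
        entries.reserve(count);
        for (std::uint32_t i = 0; i < count; ++i) {
            VkDescriptorUpdateTemplateEntry& entry = entries.emplace_back();
            entry.dstBinding = binding + i; // consecutive bindings, one each
            entry.dstArrayElement = 0;
            entry.descriptorCount = 1;      // never more than one at a time
            entry.descriptorType = type;
            entry.offset = offset + i * entry_size;
            entry.stride = entry_size;
        }
        return entries;
    }

    int main() {
        const auto entries = FanOutTexelEntries(
            4, 3, 128, 16, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER);
        return entries.size() == 3 ? 0 : 1;
    }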
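The vk_shader_decompiler.cpp hunk above collapses five UNIMPLEMENTED atomic-image stubs into one AtomicImage template parameterized on a pointer to the SPIR-V emitter's member function, so the dispatch table selects the opcode at compile time. A reduced sketch of the same pattern; Module here is a hypothetical stand-in, not sirit's real interface:

    #include <array>
    #include <cstdio>

    // Hypothetical stand-in for the SPIR-V emitter: every atomic op shares one
    // signature, which is what lets a single template cover all of them.
    struct Module {
        int OpAtomicIAdd(int pointer, int value) { return pointer + value; }
        int OpAtomicAnd(int pointer, int value) { return pointer & value; }
        int OpAtomicOr(int pointer, int value) { return pointer | value; }
    };

    class Decompiler : public Module {
    public:
        // One body, instantiated per opcode, mirroring AtomicImage<&Module::Op...>.
        template <int (Module::*func)(int, int)>
        int AtomicImage(int pointer, int value) {
            // Shared setup (coordinates, texel pointer, scope) would live here.
            return (this->*func)(pointer, value);
        }

        using Handler = int (Decompiler::*)(int, int);
        // The dispatch table picks an instantiation per IR operation.
        static constexpr std::array<Handler, 3> table{
            &Decompiler::AtomicImage<&Module::OpAtomicIAdd>,
            &Decompiler::AtomicImage<&Module::OpAtomicAnd>,
            &Decompiler::AtomicImage<&Module::OpAtomicOr>,
        };
    };

    int main() {
        Decompiler dec;
        std::printf("%d\n", (dec.*Decompiler::table[0])(40, 2)); // prints 42
    }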
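Finally, the disable_macro_jit flag threaded through settings.h, the Qt and SDL config readers, and the ini template above is a debug escape hatch for the macro Just In Time compiler. A plausible shape for code consuming such a flag, as a hedged sketch only: every name below except the setting itself is hypothetical and is not the PR's actual factory:

    #include <memory>

    // Sketch of a backend selector honoring Debugging/disable_macro_jit.
    struct Settings {
        bool disable_macro_jit = false; // mirrors Settings::values.disable_macro_jit
    };

    struct MacroBackend {
        virtual ~MacroBackend() = default;
        virtual void Execute() = 0;
    };

    struct InterpreterBackend : MacroBackend {
        void Execute() override {} // step macro opcodes one by one (slower)
    };

    struct JitBackend : MacroBackend {
        void Execute() override {} // run natively recompiled macro code
    };

    std::unique_ptr<MacroBackend> MakeMacroBackend(const Settings& settings) {
        if (settings.disable_macro_jit) {
            return std::make_unique<InterpreterBackend>();
        }
        return std::make_unique<JitBackend>();
    }

    int main() {
        Settings settings;
        settings.disable_macro_jit = true; // what the new checkbox toggles
        MakeMacroBackend(settings)->Execute();
    }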